Skip to content

ENH: Add online operations for EWM.mean #41888

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 30 commits into from
Jun 12, 2021
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
30 commits
Select commit Hold shift + click to select a range
e195c58
Add scaffolding for online EWM
mroeschke May 31, 2021
3d95167
Merge remote-tracking branch 'upstream/master' into online/ewm
mroeschke May 31, 2021
9354bd0
Add online op and new methods and class
mroeschke Jun 7, 2021
0ce197d
Make signatures match
mroeschke Jun 7, 2021
8096cc6
Add some tests, rename some variables
mroeschke Jun 7, 2021
a5273b9
Add newline for readability
mroeschke Jun 7, 2021
bab78cc
Parameterize over adjust and ignore_na
mroeschke Jun 7, 2021
d72a03e
Test resetting in tests
mroeschke Jun 7, 2021
0b7e773
Add test with invalid update
mroeschke Jun 7, 2021
8444b42
Add docstring for mean
mroeschke Jun 7, 2021
7847373
Add docstring for online
mroeschke Jun 7, 2021
df13b55
Parameterize over dataframe and series
mroeschke Jun 7, 2021
57db06e
Generalize axis call for update_times
mroeschke Jun 7, 2021
329dbd2
Remove comments
mroeschke Jun 7, 2021
9594afe
Merge remote-tracking branch 'upstream/master' into online/ewm
mroeschke Jun 8, 2021
28be18a
Add more test and ensure constructions
mroeschke Jun 8, 2021
85025ff
Passing all the non-time tests
mroeschke Jun 8, 2021
3345271
Add whatsnew and window.rst; xfail update_times
mroeschke Jun 9, 2021
2186ea0
Merge remote-tracking branch 'upstream/master' into online/ewm
mroeschke Jun 9, 2021
8024a7b
mypy
mroeschke Jun 9, 2021
80c8b7f
Merge remote-tracking branch 'upstream/master' into online/ewm
mroeschke Jun 9, 2021
8a5b0b9
Address comments
mroeschke Jun 9, 2021
e790947
Fix doctest
mroeschke Jun 9, 2021
916e68b
Merge remote-tracking branch 'upstream/master' into online/ewm
mroeschke Jun 11, 2021
175c4ca
Fix doctest
mroeschke Jun 11, 2021
f799a0f
Merge remote-tracking branch 'upstream/master' into online/ewm
mroeschke Jun 11, 2021
c8b09b6
Merge remote-tracking branch 'upstream/master' into online/ewm
mroeschke Jun 11, 2021
2cb4019
Cannot parallelize a loop
mroeschke Jun 11, 2021
fea8b0b
Trigger CI
mroeschke Jun 11, 2021
04ea064
Merge remote-tracking branch 'upstream/master' into online/ewm
mroeschke Jun 12, 2021
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
Passing all the non-time tests
  • Loading branch information
mroeschke committed Jun 8, 2021
commit 85025ff8630bbe3c5f22bef9bd440994c99910b1
18 changes: 13 additions & 5 deletions pandas/core/window/ewm.py
Original file line number Diff line number Diff line change
Expand Up @@ -819,36 +819,44 @@ def mean(self, *args, update=None, update_times=None, **kwargs):
1 0.75 5.75
"""
result_kwargs = {}
is_frame = True if self._selected_obj.ndim == 2 else False
if update is not None:
if self._mean.last_ewm is None:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

what is this?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

A user needs to call mean() first; only then can they call mean(update=new_df).

This checks that mean() was called first.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

OK. Is there a test for this (with a good error message)?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

raise ValueError(
"Must call mean with update=None first before passing update"
)
result_from = 1
result_kwargs["index"] = update.index
if update.ndim == 2:
if is_frame:
last_value = self._mean.last_ewm[np.newaxis, :]
result_kwargs["columns"] = update.columns
else:
last_value = self._mean.last_ewm
result_kwargs["name"] = update.name
obj = np.concatenate((last_value, update.to_numpy()))
np_array = np.concatenate((last_value, update.to_numpy()))
else:
result_from = 0
result_kwargs["index"] = self._selected_obj.index
if self._selected_obj.ndim == 2:
if is_frame:
result_kwargs["columns"] = self._selected_obj.columns
else:
result_kwargs["name"] = self._selected_obj.name
obj = self._selected_obj.astype(np.float64).to_numpy()
np_array = self._selected_obj.astype(np.float64).to_numpy()
if update_times is None:
update_times = np.ones(
max(self._selected_obj.shape[self.axis - 1] - 1, 0), dtype=np.float64
)
else:
update_times = _calculate_deltas(update_times, self.halflife)
ewma_func = generate_online_numba_ewma_func(self.engine_kwargs)
result = self._mean.run_ewm(obj, update_times, self.min_periods, ewma_func)
result = self._mean.run_ewm(
np_array if is_frame else np_array[:, np.newaxis],
update_times,
self.min_periods,
ewma_func,
)
if not is_frame:
result = result.squeeze()
result = result[result_from:]
result = self._selected_obj._constructor(result, **result_kwargs)
return result
28 changes: 21 additions & 7 deletions pandas/tests/window/test_online.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,37 +39,51 @@ def test_online_vs_non_online_mean(
# Test resetting once
for _ in range(2):
result = online_ewm.mean()
tm.assert_frame_equal(result, expected.head(2))
tm.assert_equal(result, expected.head(2))

result = online_ewm.mean(update=obj.tail(3))
tm.assert_frame_equal(result, expected.tail(3))
tm.assert_equal(result, expected.tail(3))

online_ewm.reset()

@pytest.mark.parametrize(
"obj", [DataFrame({"a": range(5), "b": range(5)}), Series(range(5), name="foo")]
)
def test_update_times_mean(self, obj, nogil, parallel, nopython, adjust, ignore_na):
def test_update_times_mean(
self, obj, nogil, parallel, nopython, adjust, ignore_na, halflife_with_times
):
times = Series(
np.array(
["2020-01-01", "2020-01-02", "2020-01-04", "2020-01-17", "2020-01-21"],
dtype="datetime64",
)
)
expected = obj.ewm(0.5, adjust=adjust, ignore_na=ignore_na, times=times).mean()
expected = obj.ewm(
0.5,
adjust=adjust,
ignore_na=ignore_na,
times=times,
halflife=halflife_with_times,
).mean()

engine_kwargs = {"nogil": nogil, "parallel": parallel, "nopython": nopython}
online_ewm = (
obj.head(2)
.ewm(0.5, adjust=adjust, ignore_na=ignore_na, times=times.head(2))
.ewm(
0.5,
adjust=adjust,
ignore_na=ignore_na,
times=times.head(2),
halflife=halflife_with_times,
)
.online(engine_kwargs=engine_kwargs)
)
# Test resetting once
for _ in range(2):
result = online_ewm.mean()
tm.assert_frame_equal(result, expected.head(2))
tm.assert_equal(result, expected.head(2))

result = online_ewm.mean(update=obj.tail(3), update_times=times.tail(3))
tm.assert_frame_equal(result, expected.tail(3))
tm.assert_equal(result, expected.tail(3))

online_ewm.reset()