Skip to content

Commit e299560

Browse files
sinhrks authored and jreback committed
PERF: Improve replace perf
When .replace is called with a `dict`, replacements are done per value. The current implementation tries to soft-convert the dtype after every replacement, but it is enough to do this only after the final replacement. Author: sinhrks <sinhrks@gmail.com> Closes pandas-dev#12745 from sinhrks/replace_perf and squashes the following commits: ffc59b0 [sinhrks] PERF: Improve replace perf
1 parent 11ca57f commit e299560

File tree

4 files changed

+52
-13
lines changed

4 files changed

+52
-13
lines changed

asv_bench/benchmarks/replace.py

+24
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,30 @@ def time_replace_large_dict(self):
3232
self.s.replace(self.to_rep, inplace=True)
3333

3434

35+
class replace_convert(object):
36+
goal_time = 0.5
37+
38+
def setup(self):
39+
self.n = (10 ** 3)
40+
self.to_ts = dict(((i, pd.Timestamp(i)) for i in range(self.n)))
41+
self.to_td = dict(((i, pd.Timedelta(i)) for i in range(self.n)))
42+
self.s = Series(np.random.randint(self.n, size=(10 ** 3)))
43+
self.df = DataFrame({'A': np.random.randint(self.n, size=(10 ** 3)),
44+
'B': np.random.randint(self.n, size=(10 ** 3))})
45+
46+
def time_replace_series_timestamp(self):
47+
self.s.replace(self.to_ts)
48+
49+
def time_replace_series_timedelta(self):
50+
self.s.replace(self.to_td)
51+
52+
def time_replace_frame_timestamp(self):
53+
self.df.replace(self.to_ts)
54+
55+
def time_replace_frame_timedelta(self):
56+
self.df.replace(self.to_td)
57+
58+
3559
class replace_replacena(object):
3660
goal_time = 0.2
3761

doc/source/whatsnew/v0.19.2.txt

+1
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ Highlights include:
2121
Performance Improvements
2222
~~~~~~~~~~~~~~~~~~~~~~~~
2323

24+
- Improved performance of ``.replace()`` (:issue:`12745`)
2425

2526
.. _whatsnew_0192.bug_fixes:
2627

pandas/core/generic.py

+15-8
Original file line numberDiff line numberDiff line change
@@ -3477,20 +3477,27 @@ def replace(self, to_replace=None, value=None, inplace=False, limit=None,
34773477
res = self if inplace else self.copy()
34783478
for c, src in compat.iteritems(to_replace):
34793479
if c in value and c in self:
3480+
# object conversion is handled in
3481+
# series.replace which is called recursively
34803482
res[c] = res[c].replace(to_replace=src,
34813483
value=value[c],
3482-
inplace=False, regex=regex)
3484+
inplace=False,
3485+
regex=regex)
34833486
return None if inplace else res
34843487

34853488
# {'A': NA} -> 0
34863489
elif not is_list_like(value):
3487-
for k, src in compat.iteritems(to_replace):
3488-
if k in self:
3489-
new_data = new_data.replace(to_replace=src,
3490-
value=value,
3491-
filter=[k],
3492-
inplace=inplace,
3493-
regex=regex)
3490+
keys = [(k, src) for k, src in compat.iteritems(to_replace)
3491+
if k in self]
3492+
keys_len = len(keys) - 1
3493+
for i, (k, src) in enumerate(keys):
3494+
convert = i == keys_len
3495+
new_data = new_data.replace(to_replace=src,
3496+
value=value,
3497+
filter=[k],
3498+
inplace=inplace,
3499+
regex=regex,
3500+
convert=convert)
34943501
else:
34953502
raise TypeError('value argument must be scalar, dict, or '
34963503
'Series')

pandas/core/internals.py

+12-5
Original file line numberDiff line numberDiff line change
@@ -622,7 +622,6 @@ def replace(self, to_replace, value, inplace=False, filter=None,
622622

623623
original_to_replace = to_replace
624624
mask = isnull(self.values)
625-
626625
# try to replace, if we raise an error, convert to ObjectBlock and
627626
# retry
628627
try:
@@ -1795,13 +1794,14 @@ def should_store(self, value):
17951794
return issubclass(value.dtype.type, np.bool_)
17961795

17971796
def replace(self, to_replace, value, inplace=False, filter=None,
1798-
regex=False, mgr=None):
1797+
regex=False, convert=True, mgr=None):
17991798
to_replace_values = np.atleast_1d(to_replace)
18001799
if not np.can_cast(to_replace_values, bool):
18011800
return self
18021801
return super(BoolBlock, self).replace(to_replace, value,
18031802
inplace=inplace, filter=filter,
1804-
regex=regex, mgr=mgr)
1803+
regex=regex, convert=convert,
1804+
mgr=mgr)
18051805

18061806

18071807
class ObjectBlock(Block):
@@ -3214,6 +3214,7 @@ def comp(s):
32143214
masks = [comp(s) for i, s in enumerate(src_list)]
32153215

32163216
result_blocks = []
3217+
src_len = len(src_list) - 1
32173218
for blk in self.blocks:
32183219

32193220
# it's possible to get multiple result blocks here
@@ -3223,8 +3224,9 @@ def comp(s):
32233224
new_rb = []
32243225
for b in rb:
32253226
if b.dtype == np.object_:
3227+
convert = i == src_len
32263228
result = b.replace(s, d, inplace=inplace, regex=regex,
3227-
mgr=mgr)
3229+
mgr=mgr, convert=convert)
32283230
new_rb = _extend_blocks(result, new_rb)
32293231
else:
32303232
# get our mask for this element, sized to this
@@ -4788,7 +4790,12 @@ def _putmask_smart(v, m, n):
47884790

47894791
# change the dtype
47904792
dtype, _ = _maybe_promote(n.dtype)
4791-
nv = v.astype(dtype)
4793+
4794+
if is_extension_type(v.dtype) and is_object_dtype(dtype):
4795+
nv = v.get_values(dtype)
4796+
else:
4797+
nv = v.astype(dtype)
4798+
47924799
try:
47934800
nv[m] = n[m]
47944801
except ValueError:

0 commit comments

Comments
 (0)