Skip to content

Commit f8a0989

Browse files
jbrockmendel authored and TomAugspurger committed
BUG: hash_pandas_object fails on array containing tuple #28969 (#30508)
* BUG: hash_pandas_object fails on array containing tuple #28969
1 parent 19578e3 commit f8a0989

File tree

4 files changed

+30
-1
lines changed

4 files changed

+30
-1
lines changed

Diff for: doc/source/whatsnew/v1.0.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -973,6 +973,7 @@ Other
973973
- Bug in :meth:`Series.diff` where a boolean series would incorrectly raise a ``TypeError`` (:issue:`17294`)
974974
- :meth:`Series.append` will no longer raise a ``TypeError`` when passed a tuple of ``Series`` (:issue:`28410`)
975975
- Fix corrupted error message when calling ``pandas.libs._json.encode()`` on a 0d array (:issue:`18878`)
976+
- Bug in ``pd.core.util.hashing.hash_pandas_object`` where arrays containing tuples were incorrectly treated as non-hashable (:issue:`28969`)
976977
- Bug in :meth:`DataFrame.append` that raised ``IndexError`` when appending with empty list (:issue:`28769`)
977978
- Fix :class:`AbstractHolidayCalendar` to return correct results for
978979
years after 2030 (now goes up to 2200) (:issue:`27790`)

Diff for: pandas/_libs/hashing.pyx

+6
Original file line numberDiff line numberDiff line change
@@ -70,6 +70,12 @@ def hash_object_array(object[:] arr, object key, object encoding='utf8'):
7070
# null, stringify and encode
7171
data = <bytes>str(val).encode(encoding)
7272

73+
elif isinstance(val, tuple):
74+
# GH#28969 we could have a tuple, but need to ensure that
75+
# the tuple entries are themselves hashable before converting
76+
# to str
77+
hash(val)
78+
data = <bytes>str(val).encode(encoding)
7379
else:
7480
raise TypeError(f"{val} of type {type(val)} is not a valid type "
7581
"for hashing, must be string or null")

Diff for: pandas/core/util/hashing.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -85,11 +85,12 @@ def hash_pandas_object(
8585
if isinstance(obj, ABCMultiIndex):
8686
return Series(hash_tuples(obj, encoding, hash_key), dtype="uint64", copy=False)
8787

88-
if isinstance(obj, ABCIndexClass):
88+
elif isinstance(obj, ABCIndexClass):
8989
h = hash_array(obj.values, encoding, hash_key, categorize).astype(
9090
"uint64", copy=False
9191
)
9292
h = Series(h, index=obj, dtype="uint64", copy=False)
93+
9394
elif isinstance(obj, ABCSeries):
9495
h = hash_array(obj.values, encoding, hash_key, categorize).astype(
9596
"uint64", copy=False

Diff for: pandas/tests/util/test_hashing.py

+21
Original file line numberDiff line numberDiff line change
@@ -353,3 +353,24 @@ def test_hash_collisions():
353353

354354
result = hash_array(np.asarray(hashes, dtype=object), "utf8")
355355
tm.assert_numpy_array_equal(result, np.concatenate([expected1, expected2], axis=0))
356+
357+
358+
def test_hash_with_tuple():
359+
# GH#28969 array containing a tuple raises on call to arr.astype(str)
360+
# apparently a numpy bug github.com/numpy/numpy/issues/9441
361+
362+
df = pd.DataFrame({"data": [tuple("1"), tuple("2")]})
363+
result = hash_pandas_object(df)
364+
expected = pd.Series([10345501319357378243, 8331063931016360761], dtype=np.uint64)
365+
tm.assert_series_equal(result, expected)
366+
367+
df2 = pd.DataFrame({"data": [tuple([1]), tuple([2])]})
368+
result = hash_pandas_object(df2)
369+
expected = pd.Series([9408946347443669104, 3278256261030523334], dtype=np.uint64)
370+
tm.assert_series_equal(result, expected)
371+
372+
# require that the elements of such tuples are themselves hashable
373+
374+
df3 = pd.DataFrame({"data": [tuple([1, []]), tuple([2, {}])]})
375+
with pytest.raises(TypeError, match="unhashable type: 'list'"):
376+
hash_pandas_object(df3)

0 commit comments

Comments
 (0)