Skip to content
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.

Commit e68c42d

Browse files
committed Mar 10, 2021
Merge remote-tracking branch 'upstream/master' into numpy-types
2 parents 81966ec + 7b5957f commit e68c42d

File tree

12 files changed

+139
-142
lines changed

12 files changed

+139
-142
lines changed
 

‎doc/source/whatsnew/v1.3.0.rst

+1-1
Original file line numberDiff line numberDiff line change
@@ -591,7 +591,7 @@ Reshaping
591591
- Bug in :meth:`DataFrame.append` returning incorrect dtypes with combinations of ``datetime64`` and ``timedelta64`` dtypes (:issue:`39574`)
592592
- Bug in :meth:`DataFrame.pivot_table` returning a ``MultiIndex`` for a single value when operating on an empty ``DataFrame`` (:issue:`13483`)
593593
- Allow :class:`Index` to be passed to the :func:`numpy.all` function (:issue:`40180`)
594-
-
594+
- Bug in :meth:`DataFrame.stack` not preserving ``CategoricalDtype`` in a ``MultiIndex`` (:issue:`36991`)
595595

596596
Sparse
597597
^^^^^^

‎pandas/_libs/parsers.pyx

+16-113
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,7 @@ from cpython.ref cimport (
3636
from cpython.unicode cimport (
3737
PyUnicode_AsUTF8String,
3838
PyUnicode_Decode,
39+
PyUnicode_DecodeUTF8,
3940
)
4041

4142

@@ -321,7 +322,6 @@ cdef class TextReader:
321322
bint na_filter, keep_default_na, verbose, has_usecols, has_mi_columns
322323
uint64_t parser_start
323324
list clocks
324-
char *c_encoding
325325
const char *encoding_errors
326326
kh_str_starts_t *false_set
327327
kh_str_starts_t *true_set
@@ -381,7 +381,6 @@ cdef class TextReader:
381381
encoding_errors=b"strict"):
382382

383383
# set encoding for native Python and C library
384-
self.c_encoding = NULL
385384
if isinstance(encoding_errors, str):
386385
encoding_errors = encoding_errors.encode("utf-8")
387386
Py_INCREF(encoding_errors)
@@ -638,7 +637,6 @@ cdef class TextReader:
638637
char *word
639638
object name, old_name
640639
uint64_t hr, data_line = 0
641-
StringPath path = _string_path(self.c_encoding)
642640
list header = []
643641
set unnamed_cols = set()
644642

@@ -678,8 +676,8 @@ cdef class TextReader:
678676
for i in range(field_count):
679677
word = self.parser.words[start + i]
680678

681-
name = PyUnicode_Decode(word, strlen(word),
682-
self.c_encoding, self.encoding_errors)
679+
name = PyUnicode_DecodeUTF8(word, strlen(word),
680+
self.encoding_errors)
683681

684682
# We use this later when collecting placeholder names.
685683
old_name = name
@@ -987,8 +985,7 @@ cdef class TextReader:
987985
f"for column {name} - only the converter will "
988986
f"be used"), ParserWarning,
989987
stacklevel=5)
990-
results[i] = _apply_converter(conv, self.parser, i, start, end,
991-
self.c_encoding)
988+
results[i] = _apply_converter(conv, self.parser, i, start, end)
992989
continue
993990

994991
# Collect the list of NaN values associated with the column.
@@ -1102,8 +1099,7 @@ cdef class TextReader:
11021099
# TODO: I suspect that _categorical_convert could be
11031100
# optimized when dtype is an instance of CategoricalDtype
11041101
codes, cats, na_count = _categorical_convert(
1105-
self.parser, i, start, end, na_filter,
1106-
na_hashset, self.c_encoding)
1102+
self.parser, i, start, end, na_filter, na_hashset)
11071103

11081104
# Method accepts list of strings, not encoded ones.
11091105
true_values = [x.decode() for x in self.true_values]
@@ -1199,14 +1195,8 @@ cdef class TextReader:
11991195
cdef _string_convert(self, Py_ssize_t i, int64_t start, int64_t end,
12001196
bint na_filter, kh_str_starts_t *na_hashset):
12011197

1202-
cdef StringPath path = _string_path(self.c_encoding)
1203-
1204-
if path == UTF8:
1205-
return _string_box_utf8(self.parser, i, start, end, na_filter,
1206-
na_hashset, self.encoding_errors)
1207-
elif path == ENCODED:
1208-
return _string_box_decode(self.parser, i, start, end,
1209-
na_filter, na_hashset, self.c_encoding)
1198+
return _string_box_utf8(self.parser, i, start, end, na_filter,
1199+
na_hashset, self.encoding_errors)
12101200

12111201
def _get_converter(self, i, name):
12121202
if self.converters is None:
@@ -1336,18 +1326,6 @@ def _maybe_upcast(arr):
13361326
return arr
13371327

13381328

1339-
cdef enum StringPath:
1340-
UTF8
1341-
ENCODED
1342-
1343-
1344-
# factored out logic to pick string converter
1345-
cdef inline StringPath _string_path(char *encoding):
1346-
if encoding != NULL and encoding != b"utf-8":
1347-
return ENCODED
1348-
return UTF8
1349-
1350-
13511329
# ----------------------------------------------------------------------
13521330
# Type conversions / inference support code
13531331

@@ -1406,68 +1384,10 @@ cdef _string_box_utf8(parser_t *parser, int64_t col,
14061384
return result, na_count
14071385

14081386

1409-
cdef _string_box_decode(parser_t *parser, int64_t col,
1410-
int64_t line_start, int64_t line_end,
1411-
bint na_filter, kh_str_starts_t *na_hashset,
1412-
char *encoding):
1413-
cdef:
1414-
int na_count = 0
1415-
Py_ssize_t i, size, lines
1416-
coliter_t it
1417-
const char *word = NULL
1418-
ndarray[object] result
1419-
1420-
int ret = 0
1421-
kh_strbox_t *table
1422-
1423-
char *errors = "strict"
1424-
1425-
object pyval
1426-
1427-
object NA = na_values[np.object_]
1428-
khiter_t k
1429-
1430-
table = kh_init_strbox()
1431-
lines = line_end - line_start
1432-
result = np.empty(lines, dtype=np.object_)
1433-
coliter_setup(&it, parser, col, line_start)
1434-
1435-
for i in range(lines):
1436-
COLITER_NEXT(it, word)
1437-
1438-
if na_filter:
1439-
if kh_get_str_starts_item(na_hashset, word):
1440-
# in the hash table
1441-
na_count += 1
1442-
result[i] = NA
1443-
continue
1444-
1445-
k = kh_get_strbox(table, word)
1446-
1447-
# in the hash table
1448-
if k != table.n_buckets:
1449-
# this increments the refcount, but need to test
1450-
pyval = <object>table.vals[k]
1451-
else:
1452-
# box it. new ref?
1453-
size = strlen(word)
1454-
pyval = PyUnicode_Decode(word, size, encoding, errors)
1455-
1456-
k = kh_put_strbox(table, word, &ret)
1457-
table.vals[k] = <PyObject *>pyval
1458-
1459-
result[i] = pyval
1460-
1461-
kh_destroy_strbox(table)
1462-
1463-
return result, na_count
1464-
1465-
14661387
@cython.boundscheck(False)
14671388
cdef _categorical_convert(parser_t *parser, int64_t col,
14681389
int64_t line_start, int64_t line_end,
1469-
bint na_filter, kh_str_starts_t *na_hashset,
1470-
char *encoding):
1390+
bint na_filter, kh_str_starts_t *na_hashset):
14711391
"Convert column data into codes, categories"
14721392
cdef:
14731393
int na_count = 0
@@ -1480,7 +1400,6 @@ cdef _categorical_convert(parser_t *parser, int64_t col,
14801400
int64_t current_category = 0
14811401

14821402
char *errors = "strict"
1483-
StringPath path = _string_path(encoding)
14841403

14851404
int ret = 0
14861405
kh_str_t *table
@@ -1516,16 +1435,9 @@ cdef _categorical_convert(parser_t *parser, int64_t col,
15161435

15171436
# parse and box categories to python strings
15181437
result = np.empty(table.n_occupied, dtype=np.object_)
1519-
if path == ENCODED:
1520-
for k in range(table.n_buckets):
1521-
if kh_exist_str(table, k):
1522-
size = strlen(table.keys[k])
1523-
result[table.vals[k]] = PyUnicode_Decode(
1524-
table.keys[k], size, encoding, errors)
1525-
elif path == UTF8:
1526-
for k in range(table.n_buckets):
1527-
if kh_exist_str(table, k):
1528-
result[table.vals[k]] = PyUnicode_FromString(table.keys[k])
1438+
for k in range(table.n_buckets):
1439+
if kh_exist_str(table, k):
1440+
result[table.vals[k]] = PyUnicode_FromString(table.keys[k])
15291441

15301442
kh_destroy_str(table)
15311443
return np.asarray(codes), result, na_count
@@ -2064,13 +1976,11 @@ for k in list(na_values):
20641976

20651977

20661978
cdef _apply_converter(object f, parser_t *parser, int64_t col,
2067-
int64_t line_start, int64_t line_end,
2068-
char* c_encoding):
1979+
int64_t line_start, int64_t line_end):
20691980
cdef:
20701981
Py_ssize_t i, lines
20711982
coliter_t it
20721983
const char *word = NULL
2073-
char *errors = "strict"
20741984
ndarray[object] result
20751985
object val
20761986

@@ -2079,17 +1989,10 @@ cdef _apply_converter(object f, parser_t *parser, int64_t col,
20791989

20801990
coliter_setup(&it, parser, col, line_start)
20811991

2082-
if c_encoding == NULL or c_encoding == b'utf-8':
2083-
for i in range(lines):
2084-
COLITER_NEXT(it, word)
2085-
val = PyUnicode_FromString(word)
2086-
result[i] = f(val)
2087-
else:
2088-
for i in range(lines):
2089-
COLITER_NEXT(it, word)
2090-
val = PyUnicode_Decode(word, strlen(word),
2091-
c_encoding, errors)
2092-
result[i] = f(val)
1992+
for i in range(lines):
1993+
COLITER_NEXT(it, word)
1994+
val = PyUnicode_FromString(word)
1995+
result[i] = f(val)
20931996

20941997
return lib.maybe_convert_objects(result)
20951998

‎pandas/core/indexes/base.py

+7
Original file line numberDiff line numberDiff line change
@@ -4643,6 +4643,13 @@ def __getitem__(self, key):
46434643
else:
46444644
return result
46454645

4646+
def _getitem_slice(self: _IndexT, slobj: slice) -> _IndexT:
4647+
"""
4648+
Fastpath for __getitem__ when we know we have a slice.
4649+
"""
4650+
res = self._data[slobj]
4651+
return type(self)._simple_new(res, name=self._name)
4652+
46464653
@final
46474654
def _can_hold_identifiers_and_holds_name(self, name) -> bool:
46484655
"""

‎pandas/core/indexes/multi.py

+18
Original file line numberDiff line numberDiff line change
@@ -2105,6 +2105,24 @@ def __getitem__(self, key):
21052105
verify_integrity=False,
21062106
)
21072107

2108+
def _getitem_slice(self: MultiIndex, slobj: slice) -> MultiIndex:
2109+
"""
2110+
Fastpath for __getitem__ when we know we have a slice.
2111+
"""
2112+
sortorder = None
2113+
if slobj.step is None or slobj.step > 0:
2114+
sortorder = self.sortorder
2115+
2116+
new_codes = [level_codes[slobj] for level_codes in self.codes]
2117+
2118+
return type(self)(
2119+
levels=self.levels,
2120+
codes=new_codes,
2121+
names=self._names,
2122+
sortorder=sortorder,
2123+
verify_integrity=False,
2124+
)
2125+
21082126
@Appender(_index_shared_docs["take"] % _index_doc_kwargs)
21092127
def take(
21102128
self: MultiIndex, indices, axis=0, allow_fill=True, fill_value=None, **kwargs

‎pandas/core/indexes/range.py

+7
Original file line numberDiff line numberDiff line change
@@ -827,6 +827,13 @@ def __getitem__(self, key):
827827
# fall back to Int64Index
828828
return super().__getitem__(key)
829829

830+
def _getitem_slice(self: RangeIndex, slobj: slice) -> RangeIndex:
831+
"""
832+
Fastpath for __getitem__ when we know we have a slice.
833+
"""
834+
res = self._range[slobj]
835+
return type(self)._simple_new(res, name=self._name)
836+
830837
@unpack_zerodim_and_defer("__floordiv__")
831838
def __floordiv__(self, other):
832839

‎pandas/core/internals/array_manager.py

+2
Original file line numberDiff line numberDiff line change
@@ -806,6 +806,8 @@ def get_slice(self, slobj: slice, axis: int = 0) -> ArrayManager:
806806

807807
return type(self)(arrays, new_axes, verify_integrity=False)
808808

809+
getitem_mgr = get_slice
810+
809811
def fast_xs(self, loc: int) -> ArrayLike:
810812
"""
811813
Return the array corresponding to `frame.iloc[loc]`.

‎pandas/core/internals/blocks.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -374,7 +374,7 @@ def getitem_block(self, slicer, new_mgr_locs=None) -> Block:
374374
"""
375375
Perform __getitem__-like, return result as block.
376376
377-
As of now, only supports slices that preserve dimensionality.
377+
Only supports slices that preserve dimensionality.
378378
"""
379379
if new_mgr_locs is None:
380380
axis0_slicer = slicer[0] if isinstance(slicer, tuple) else slicer

0 commit comments

Comments
 (0)
Please sign in to comment.