@@ -36,6 +36,7 @@ from cpython.ref cimport (
36
36
from cpython.unicode cimport (
37
37
PyUnicode_AsUTF8String,
38
38
PyUnicode_Decode,
39
+ PyUnicode_DecodeUTF8,
39
40
)
40
41
41
42
@@ -321,7 +322,6 @@ cdef class TextReader:
321
322
bint na_filter, keep_default_na, verbose, has_usecols, has_mi_columns
322
323
uint64_t parser_start
323
324
list clocks
324
- char * c_encoding
325
325
const char * encoding_errors
326
326
kh_str_starts_t * false_set
327
327
kh_str_starts_t * true_set
@@ -381,7 +381,6 @@ cdef class TextReader:
381
381
encoding_errors = b" strict" ):
382
382
383
383
# set encoding for native Python and C library
384
- self .c_encoding = NULL
385
384
if isinstance (encoding_errors, str ):
386
385
encoding_errors = encoding_errors.encode(" utf-8" )
387
386
Py_INCREF(encoding_errors)
@@ -638,7 +637,6 @@ cdef class TextReader:
638
637
char * word
639
638
object name, old_name
640
639
uint64_t hr, data_line = 0
641
- StringPath path = _string_path(self .c_encoding)
642
640
list header = []
643
641
set unnamed_cols = set ()
644
642
@@ -678,8 +676,8 @@ cdef class TextReader:
678
676
for i in range (field_count):
679
677
word = self .parser.words[start + i]
680
678
681
- name = PyUnicode_Decode (word, strlen(word),
682
- self .c_encoding, self .encoding_errors)
679
+ name = PyUnicode_DecodeUTF8 (word, strlen(word),
680
+ self .encoding_errors)
683
681
684
682
# We use this later when collecting placeholder names.
685
683
old_name = name
@@ -987,8 +985,7 @@ cdef class TextReader:
987
985
f" for column {name} - only the converter will "
988
986
f" be used" ), ParserWarning,
989
987
stacklevel = 5 )
990
- results[i] = _apply_converter(conv, self .parser, i, start, end,
991
- self .c_encoding)
988
+ results[i] = _apply_converter(conv, self .parser, i, start, end)
992
989
continue
993
990
994
991
# Collect the list of NaN values associated with the column.
@@ -1102,8 +1099,7 @@ cdef class TextReader:
1102
1099
# TODO: I suspect that _categorical_convert could be
1103
1100
# optimized when dtype is an instance of CategoricalDtype
1104
1101
codes, cats, na_count = _categorical_convert(
1105
- self .parser, i, start, end, na_filter,
1106
- na_hashset, self .c_encoding)
1102
+ self .parser, i, start, end, na_filter, na_hashset)
1107
1103
1108
1104
# Method accepts list of strings, not encoded ones.
1109
1105
true_values = [x.decode() for x in self .true_values]
@@ -1199,14 +1195,8 @@ cdef class TextReader:
1199
1195
# _string_convert: convert column `i` of self.parser between `start` and `end`.
# The removed (`-`) body chose between _string_box_utf8 and _string_box_decode
# based on _string_path(self.c_encoding). The added (`+`) body always delegates
# to _string_box_utf8, forwarding na_filter, na_hashset and
# self.encoding_errors, and returns that call's result unchanged.
# NOTE(review): the removed if/elif had no trailing return for any other
# `path` value; the replacement has a single unconditional return.
cdef _string_convert(self , Py_ssize_t i, int64_t start, int64_t end,
1200
1196
bint na_filter, kh_str_starts_t * na_hashset):
1201
1197
1202
- cdef StringPath path = _string_path(self .c_encoding)
1203
-
1204
- if path == UTF8:
1205
- return _string_box_utf8(self .parser, i, start, end, na_filter,
1206
- na_hashset, self .encoding_errors)
1207
- elif path == ENCODED:
1208
- return _string_box_decode(self .parser, i, start, end,
1209
- na_filter, na_hashset, self .c_encoding)
1198
+ return _string_box_utf8(self .parser, i, start, end, na_filter,
1199
+ na_hashset, self .encoding_errors)
1210
1200
1211
1201
def _get_converter (self , i , name ):
1212
1202
if self .converters is None :
@@ -1336,18 +1326,6 @@ def _maybe_upcast(arr):
1336
1326
return arr
1337
1327
1338
1328
1339
# StringPath / _string_path (deleted by this patch).
# StringPath tags which boxing routine to use: UTF8 or ENCODED.
# _string_path returns ENCODED when `encoding` is non-NULL and compares
# unequal to the utf-8 bytes literal; otherwise it returns UTF8.
# NOTE(review): the second test compares a `char *` against a bytes literal;
# confirm whether Cython compares these by value or by pointer.
- cdef enum StringPath:
1340
- UTF8
1341
- ENCODED
1342
-
1343
-
1344
- # factored out logic to pick string converter
1345
- cdef inline StringPath _string_path(char * encoding):
1346
- if encoding != NULL and encoding != b" utf-8" :
1347
- return ENCODED
1348
- return UTF8
1349
-
1350
-
1351
1329
# ----------------------------------------------------------------------
1352
1330
# Type conversions / inference support code
1353
1331
@@ -1406,68 +1384,10 @@ cdef _string_box_utf8(parser_t *parser, int64_t col,
1406
1384
return result, na_count
1407
1385
1408
1386
1409
# _string_box_decode (deleted by this patch).
# Walks `line_end - line_start` words of column `col` with a coliter and fills
# an object ndarray:
#   * when na_filter is set and the word is found in na_hashset, the slot gets
#     na_values[np.object_] and na_count is incremented;
#   * otherwise the word is looked up in a temporary strbox hash table; a hit
#     reuses the stored object, a miss decodes the word with
#     PyUnicode_Decode(word, strlen(word), encoding, errors) and stores it.
# `errors` is a local hard-coded literal; this function takes no
# encoding_errors argument, unlike the _string_box_utf8 call sites, which
# pass self.encoding_errors.
# Returns the tuple (result, na_count) after destroying the hash table.
- cdef _string_box_decode(parser_t * parser, int64_t col,
1410
- int64_t line_start, int64_t line_end,
1411
- bint na_filter, kh_str_starts_t * na_hashset,
1412
- char * encoding):
1413
- cdef:
1414
- int na_count = 0
1415
- Py_ssize_t i, size, lines
1416
- coliter_t it
1417
- const char * word = NULL
1418
- ndarray[object ] result
1419
-
1420
- int ret = 0
1421
- kh_strbox_t * table
1422
-
1423
- char * errors = " strict"
1424
-
1425
- object pyval
1426
-
1427
- object NA = na_values[np.object_]
1428
- khiter_t k
1429
-
1430
- table = kh_init_strbox()
1431
- lines = line_end - line_start
1432
- result = np.empty(lines, dtype = np.object_)
1433
- coliter_setup(& it, parser, col, line_start)
1434
-
1435
- for i in range (lines):
1436
- COLITER_NEXT(it, word)
1437
-
1438
- if na_filter:
1439
- if kh_get_str_starts_item(na_hashset, word):
1440
- # in the hash table
1441
- na_count += 1
1442
- result[i] = NA
1443
- continue
1444
-
1445
- k = kh_get_strbox(table, word)
1446
-
1447
- # in the hash table
1448
- if k != table.n_buckets:
1449
- # this increments the refcount, but need to test
1450
- pyval = < object > table.vals[k]
1451
- else :
1452
- # box it. new ref?
1453
- size = strlen(word)
1454
- pyval = PyUnicode_Decode(word, size, encoding, errors)
1455
-
1456
- k = kh_put_strbox(table, word, & ret)
1457
- table.vals[k] = < PyObject * > pyval
1458
-
1459
- result[i] = pyval
1460
-
1461
- kh_destroy_strbox(table)
1462
-
1463
- return result, na_count
1464
-
1465
-
1466
1387
@ cython.boundscheck (False )
1467
1388
cdef _categorical_convert(parser_t * parser, int64_t col,
1468
1389
int64_t line_start, int64_t line_end,
1469
- bint na_filter, kh_str_starts_t * na_hashset,
1470
- char * encoding):
1390
+ bint na_filter, kh_str_starts_t * na_hashset):
1471
1391
" Convert column data into codes, categories"
1472
1392
cdef:
1473
1393
int na_count = 0
@@ -1480,7 +1400,6 @@ cdef _categorical_convert(parser_t *parser, int64_t col,
1480
1400
int64_t current_category = 0
1481
1401
1482
1402
char * errors = " strict"
1483
- StringPath path = _string_path(encoding)
1484
1403
1485
1404
int ret = 0
1486
1405
kh_str_t * table
@@ -1516,16 +1435,9 @@ cdef _categorical_convert(parser_t *parser, int64_t col,
1516
1435
1517
1436
# parse and box categories to python strings
1518
1437
result = np.empty(table.n_occupied, dtype = np.object_)
1519
- if path == ENCODED:
1520
- for k in range (table.n_buckets):
1521
- if kh_exist_str(table, k):
1522
- size = strlen(table.keys[k])
1523
- result[table.vals[k]] = PyUnicode_Decode(
1524
- table.keys[k], size, encoding, errors)
1525
- elif path == UTF8:
1526
- for k in range (table.n_buckets):
1527
- if kh_exist_str(table, k):
1528
- result[table.vals[k]] = PyUnicode_FromString(table.keys[k])
1438
+ for k in range (table.n_buckets):
1439
+ if kh_exist_str(table, k):
1440
+ result[table.vals[k]] = PyUnicode_FromString(table.keys[k])
1529
1441
1530
1442
kh_destroy_str(table)
1531
1443
return np.asarray(codes), result, na_count
@@ -2064,13 +1976,11 @@ for k in list(na_values):
2064
1976
2065
1977
2066
1978
cdef _apply_converter(object f, parser_t * parser, int64_t col,
2067
- int64_t line_start, int64_t line_end,
2068
- char * c_encoding):
1979
+ int64_t line_start, int64_t line_end):
2069
1980
cdef:
2070
1981
Py_ssize_t i, lines
2071
1982
coliter_t it
2072
1983
const char * word = NULL
2073
- char * errors = " strict"
2074
1984
ndarray[object ] result
2075
1985
object val
2076
1986
@@ -2079,17 +1989,10 @@ cdef _apply_converter(object f, parser_t *parser, int64_t col,
2079
1989
2080
1990
coliter_setup(& it, parser, col, line_start)
2081
1991
2082
- if c_encoding == NULL or c_encoding == b' utf-8' :
2083
- for i in range (lines):
2084
- COLITER_NEXT(it, word)
2085
- val = PyUnicode_FromString(word)
2086
- result[i] = f(val)
2087
- else :
2088
- for i in range (lines):
2089
- COLITER_NEXT(it, word)
2090
- val = PyUnicode_Decode(word, strlen(word),
2091
- c_encoding, errors)
2092
- result[i] = f(val)
1992
+ for i in range (lines):
1993
+ COLITER_NEXT(it, word)
1994
+ val = PyUnicode_FromString(word)
1995
+ result[i] = f(val)
2093
1996
2094
1997
return lib.maybe_convert_objects(result)
2095
1998
0 commit comments