
Commit 4eb0cec

jbrockmendel authored and jreback committed
Refactor out libwriters, fix references to Timestamp, Timedelta (pandas-dev#19413)
1 parent d7fa5b3 commit 4eb0cec

29 files changed: +262 -263 lines changed

pandas/_libs/lib.pyx

+2 -194
@@ -21,14 +21,7 @@ from cpython cimport (Py_INCREF, PyTuple_SET_ITEM,
                        PyBytes_Check,
                        PyUnicode_Check,
                        PyTuple_New,
-                       PyObject_RichCompareBool,
-                       PyBytes_GET_SIZE,
-                       PyUnicode_GET_SIZE)
-
-try:
-    from cpython cimport PyString_GET_SIZE
-except ImportError:
-    from cpython cimport PyUnicode_GET_SIZE as PyString_GET_SIZE
+                       PyObject_RichCompareBool)

 cimport cpython

@@ -38,7 +31,7 @@ from cpython.datetime cimport (PyDateTime_Check, PyDate_Check,
                                PyDateTime_IMPORT)
 PyDateTime_IMPORT

-from tslib import NaT, Timestamp, Timedelta, array_to_datetime
+from tslib import NaT, array_to_datetime
 from missing cimport checknull

@@ -127,28 +120,6 @@ def item_from_zerodim(object val):
     return util.unbox_if_zerodim(val)


-@cython.wraparound(False)
-@cython.boundscheck(False)
-def fast_unique(ndarray[object] values):
-    cdef:
-        Py_ssize_t i, n = len(values)
-        list uniques = []
-        dict table = {}
-        object val, stub = 0
-
-    for i from 0 <= i < n:
-        val = values[i]
-        if val not in table:
-            table[val] = stub
-            uniques.append(val)
-    try:
-        uniques.sort()
-    except Exception:
-        pass
-
-    return uniques
-
-
 @cython.wraparound(False)
 @cython.boundscheck(False)
 def fast_unique_multiple(list arrays):
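
For context only (not part of this diff): the fast_unique helper removed above collects first-seen values via a dict-backed membership test, then sorts them when the values are orderable. A minimal pure-Python sketch of that behaviour (hypothetical name, illustration only):

import numpy as np

def fast_unique_py(values):
    # dict-backed "seen" test, mirroring the removed Cython loop
    seen, uniques = {}, []
    for val in values:
        if val not in seen:
            seen[val] = 0
            uniques.append(val)
    # best-effort sort; unorderable mixes stay in first-seen order
    try:
        uniques.sort()
    except TypeError:
        pass
    return uniques

print(fast_unique_py(np.array(['b', 'a', 'b'], dtype=object)))  # ['a', 'b']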
@@ -368,30 +339,6 @@ def has_infs_f8(ndarray[float64_t] arr):
     return False


-def convert_timestamps(ndarray values):
-    cdef:
-        object val, f, result
-        dict cache = {}
-        Py_ssize_t i, n = len(values)
-        ndarray[object] out
-
-    # for HDFStore, a bit temporary but...
-
-    from datetime import datetime
-    f = datetime.fromtimestamp
-
-    out = np.empty(n, dtype='O')
-
-    for i in range(n):
-        val = util.get_value_1d(values, i)
-        if val in cache:
-            out[i] = cache[val]
-        else:
-            cache[val] = out[i] = f(val)
-
-    return out
-
-
 def maybe_indices_to_slice(ndarray[int64_t] indices, int max_len):
     cdef:
         Py_ssize_t i, n = len(indices)
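
Also for context: the removed convert_timestamps memoizes datetime.fromtimestamp per distinct value, presumably so that the HDFStore case noted in its comment, with many repeated epoch values, converts each value only once. A rough pure-Python sketch of that pattern (hypothetical name, illustration only):

from datetime import datetime
import numpy as np

def convert_timestamps_py(values):
    # convert each distinct epoch value once, reuse the cached datetime afterwards
    cache = {}
    out = np.empty(len(values), dtype=object)
    for i, val in enumerate(values):
        if val in cache:
            out[i] = cache[val]
        else:
            cache[val] = out[i] = datetime.fromtimestamp(val)
    return out

print(convert_timestamps_py(np.array([0, 0, 86400], dtype=np.int64)))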
@@ -731,145 +678,6 @@ def clean_index_list(list obj):
     return np.asarray(obj), 0


-ctypedef fused pandas_string:
-    str
-    unicode
-    bytes
-
-
-@cython.boundscheck(False)
-@cython.wraparound(False)
-cpdef Py_ssize_t max_len_string_array(pandas_string[:] arr):
-    """ return the maximum size of elements in a 1-dim string array """
-    cdef:
-        Py_ssize_t i, m = 0, l = 0, length = arr.shape[0]
-        pandas_string v
-
-    for i in range(length):
-        v = arr[i]
-        if PyString_Check(v):
-            l = PyString_GET_SIZE(v)
-        elif PyBytes_Check(v):
-            l = PyBytes_GET_SIZE(v)
-        elif PyUnicode_Check(v):
-            l = PyUnicode_GET_SIZE(v)
-
-        if l > m:
-            m = l
-
-    return m
-
-
-@cython.boundscheck(False)
-@cython.wraparound(False)
-def string_array_replace_from_nan_rep(
-        ndarray[object, ndim=1] arr, object nan_rep,
-        object replace=None):
-    """
-    Replace the values in the array with 'replacement' if
-    they are 'nan_rep'. Return the same array.
-    """
-
-    cdef int length = arr.shape[0], i = 0
-    if replace is None:
-        replace = np.nan
-
-    for i from 0 <= i < length:
-        if arr[i] == nan_rep:
-            arr[i] = replace
-
-    return arr
-
-
-@cython.boundscheck(False)
-@cython.wraparound(False)
-def convert_json_to_lines(object arr):
-    """
-    replace comma separated json with line feeds, paying special attention
-    to quotes & brackets
-    """
-    cdef:
-        Py_ssize_t i = 0, num_open_brackets_seen = 0, length
-        bint in_quotes = 0, is_escaping = 0
-        ndarray[uint8_t] narr
-        unsigned char v, comma, left_bracket, right_brack, newline
-
-    newline = ord('\n')
-    comma = ord(',')
-    left_bracket = ord('{')
-    right_bracket = ord('}')
-    quote = ord('"')
-    backslash = ord('\\')
-
-    narr = np.frombuffer(arr.encode('utf-8'), dtype='u1').copy()
-    length = narr.shape[0]
-    for i in range(length):
-        v = narr[i]
-        if v == quote and i > 0 and not is_escaping:
-            in_quotes = ~in_quotes
-        if v == backslash or is_escaping:
-            is_escaping = ~is_escaping
-        if v == comma:  # commas that should be \n
-            if num_open_brackets_seen == 0 and not in_quotes:
-                narr[i] = newline
-        elif v == left_bracket:
-            if not in_quotes:
-                num_open_brackets_seen += 1
-        elif v == right_bracket:
-            if not in_quotes:
-                num_open_brackets_seen -= 1
-
-    return narr.tostring().decode('utf-8')
-
-
-@cython.boundscheck(False)
-@cython.wraparound(False)
-def write_csv_rows(list data, ndarray data_index,
-                   int nlevels, ndarray cols, object writer):
-
-    cdef int N, j, i, ncols
-    cdef list rows
-    cdef object val
-
-    # In crude testing, N>100 yields little marginal improvement
-    N=100
-
-    # pre-allocate rows
-    ncols = len(cols)
-    rows = [[None] * (nlevels + ncols) for x in range(N)]
-
-    j = -1
-    if nlevels == 1:
-        for j in range(len(data_index)):
-            row = rows[j % N]
-            row[0] = data_index[j]
-            for i in range(ncols):
-                row[1 + i] = data[i][j]
-
-            if j >= N - 1 and j % N == N - 1:
-                writer.writerows(rows)
-    elif nlevels > 1:
-        for j in range(len(data_index)):
-            row = rows[j % N]
-            row[:nlevels] = list(data_index[j])
-            for i in range(ncols):
-                row[nlevels + i] = data[i][j]
-
-            if j >= N - 1 and j % N == N - 1:
-                writer.writerows(rows)
-    else:
-        for j in range(len(data_index)):
-            row = rows[j % N]
-            for i in range(ncols):
-                row[i] = data[i][j]
-
-            if j >= N - 1 and j % N == N - 1:
-                writer.writerows(rows)
-
-    if j >= 0 and (j < N - 1 or (j % N) != N - 1):
-        writer.writerows(rows[:((j + 1) % N)])
-
-
 # ------------------------------------------------------------------------------
 # Groupby-related functions
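
Per the commit title, write_csv_rows and the other helpers removed above are refactored out of lib.pyx, presumably into the new libwriters module it names. The CSV routine itself buffers rows in a fixed-size list, flushes a full batch of N rows at a time through writer.writerows, and flushes the remainder at the end. A pure-Python sketch of that buffering scheme for the single-level-index case (hypothetical names; a simplification for illustration, not the pandas implementation):

import csv
import io

def write_csv_rows_sketch(data, data_index, cols, writer, N=100):
    # pre-allocate a reusable buffer of N rows: one slot for the index label,
    # plus one slot per column value
    ncols = len(cols)
    rows = [[None] * (1 + ncols) for _ in range(N)]
    j = -1
    for j in range(len(data_index)):
        row = rows[j % N]
        row[0] = data_index[j]
        for i in range(ncols):
            row[1 + i] = data[i][j]
        if j % N == N - 1:           # buffer full: flush all N rows at once
            writer.writerows(rows)
    if j >= 0 and (j % N) != N - 1:  # flush whatever is left in the buffer
        writer.writerows(rows[:(j + 1) % N])

buf = io.StringIO()
write_csv_rows_sketch([[1, 2, 3], ['a', 'b', 'c']], ['r0', 'r1', 'r2'],
                      ['x', 'y'], csv.writer(buf))
print(buf.getvalue())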

pandas/_libs/parsers.pyx

+34
@@ -2225,3 +2225,37 @@ def _maybe_encode(values):
     if values is None:
         return []
     return [x.encode('utf-8') if isinstance(x, unicode) else x for x in values]
+
+
+def sanitize_objects(ndarray[object] values, set na_values,
+                     convert_empty=True):
+    """
+    Convert specified values, including the given set na_values and empty
+    strings if convert_empty is True, to np.nan.
+
+    Parameters
+    ----------
+    values : ndarray[object]
+    na_values : set
+    convert_empty : bool (default True)
+    """
+    cdef:
+        Py_ssize_t i, n
+        object val, onan
+        Py_ssize_t na_count = 0
+        dict memo = {}
+
+    n = len(values)
+    onan = np.nan
+
+    for i from 0 <= i < n:
+        val = values[i]
+        if (convert_empty and val == '') or (val in na_values):
+            values[i] = onan
+            na_count += 1
+        elif val in memo:
+            values[i] = memo[val]
+        else:
+            memo[val] = val
+
+    return na_count
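
The sanitize_objects helper added here (moved from inference.pyx, whose copy is deleted below, with a docstring added) rewrites matching entries to NaN in place and returns how many were replaced. A hedged pure-Python illustration of that contract (hypothetical name, not the Cython implementation):

import numpy as np

def sanitize_objects_py(values, na_values, convert_empty=True):
    # replace na_values (and empty strings when convert_empty) with NaN in place,
    # memoize the values that are kept, and return the number of replacements
    memo = {}
    na_count = 0
    for i, val in enumerate(values):
        if (convert_empty and val == '') or (val in na_values):
            values[i] = np.nan
            na_count += 1
        elif val in memo:
            values[i] = memo[val]
        else:
            memo[val] = val
    return na_count

arr = np.array(['a', '', 'NA', 'b'], dtype=object)
print(sanitize_objects_py(arr, {'NA'}))  # 2; arr is now ['a', nan, nan, 'b']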

pandas/_libs/src/inference.pyx

+1 -25
@@ -6,7 +6,7 @@ from tslibs.nattype import NaT
 from tslibs.conversion cimport convert_to_tsobject
 from tslibs.timedeltas cimport convert_to_timedelta64
 from tslibs.timezones cimport get_timezone, tz_compare
-from datetime import datetime, timedelta
+
 iNaT = util.get_nat()

 cdef bint PY2 = sys.version_info[0] == 2
@@ -1405,30 +1405,6 @@ def maybe_convert_objects(ndarray[object] objects, bint try_float=0,
     return objects


-def sanitize_objects(ndarray[object] values, set na_values,
-                     convert_empty=True):
-    cdef:
-        Py_ssize_t i, n
-        object val, onan
-        Py_ssize_t na_count = 0
-        dict memo = {}
-
-    n = len(values)
-    onan = np.nan
-
-    for i from 0 <= i < n:
-        val = values[i]
-        if (convert_empty and val == '') or (val in na_values):
-            values[i] = onan
-            na_count += 1
-        elif val in memo:
-            values[i] = memo[val]
-        else:
-            memo[val] = val
-
-    return na_count
-
-
 def maybe_convert_bool(ndarray[object] arr,
                        true_values=None, false_values=None):
     cdef:
