from collections import abc
from decimal import Decimal
from enum import Enum
from typing import (
Literal,
_GenericAlias,
)
cimport cython
from cpython.datetime cimport (
PyDate_Check,
PyDateTime_Check,
PyDelta_Check,
PyTime_Check,
import_datetime,
)
from cpython.iterator cimport PyIter_Check
from cpython.number cimport PyNumber_Check
from cpython.object cimport (
Py_EQ,
PyObject,
PyObject_RichCompareBool,
PyTypeObject,
)
from cpython.ref cimport Py_INCREF
from cpython.sequence cimport PySequence_Check
from cpython.tuple cimport (
PyTuple_New,
PyTuple_SET_ITEM,
)
from cython cimport (
Py_ssize_t,
floating,
)
from pandas._libs.missing import check_na_tuples_nonequal
import_datetime()
import numpy as np
cimport numpy as cnp
from numpy cimport (
NPY_OBJECT,
PyArray_Check,
PyArray_GETITEM,
PyArray_ITER_DATA,
PyArray_ITER_NEXT,
PyArray_IterNew,
complex128_t,
flatiter,
float64_t,
int32_t,
int64_t,
intp_t,
ndarray,
uint8_t,
uint64_t,
)
cnp.import_array()
cdef extern from "Python.h":
# Note: importing extern-style allows us to declare these as nogil
# functions, whereas `from cpython cimport` does not.
bint PyObject_TypeCheck(object obj, PyTypeObject* type) nogil
cdef extern from "numpy/arrayobject.h":
# cython's numpy.dtype specification is incorrect, which leads to
# errors in issubclass(self.dtype.type, np.bool_), so we directly
# include the correct version
# https://github.com/cython/cython/issues/2022
ctypedef class numpy.dtype [object PyArray_Descr]:
# Use PyDataType_* macros when possible, however there are no macros
# for accessing some of the fields, so some are defined. Please
# ask on cython-dev if you need more.
cdef:
int type_num
int itemsize "elsize"
char byteorder
object fields
tuple names
PyTypeObject PySignedIntegerArrType_Type
PyTypeObject PyUnsignedIntegerArrType_Type
cdef extern from "numpy/ndarrayobject.h":
bint PyArray_CheckScalar(obj) nogil
cdef extern from "pd_parser.h":
int floatify(object, float64_t *result, int *maybe_int) except -1
void PandasParser_IMPORT()
PandasParser_IMPORT
from pandas._libs cimport util
from pandas._libs.util cimport (
INT64_MAX,
INT64_MIN,
UINT64_MAX,
is_nan,
)
from pandas._libs.tslibs import (
OutOfBoundsDatetime,
OutOfBoundsTimedelta,
)
from pandas._libs.tslibs.period import Period
from pandas._libs.missing cimport (
C_NA,
checknull,
is_matching_na,
is_null_datetime64,
is_null_timedelta64,
)
from pandas._libs.tslibs.conversion cimport (
_TSObject,
convert_to_tsobject,
)
from pandas._libs.tslibs.nattype cimport (
NPY_NAT,
c_NaT as NaT,
checknull_with_nat,
)
from pandas._libs.tslibs.np_datetime cimport NPY_FR_ns
from pandas._libs.tslibs.offsets cimport is_offset_object
from pandas._libs.tslibs.period cimport is_period_object
from pandas._libs.tslibs.timedeltas cimport convert_to_timedelta64
from pandas._libs.tslibs.timezones cimport tz_compare
# constants that will be compared to potentially arbitrarily large
# python int
cdef:
object oINT64_MAX = <int64_t>INT64_MAX
object oINT64_MIN = <int64_t>INT64_MIN
object oUINT64_MAX = <uint64_t>UINT64_MAX
float64_t NaN = <float64_t>np.NaN
# python-visible
i8max = <int64_t>INT64_MAX
u8max = <uint64_t>UINT64_MAX
@cython.wraparound(False)
@cython.boundscheck(False)
def memory_usage_of_objects(arr: object[:]) -> int64_t:
"""
Return the memory usage of an object array in bytes.
Does not include the actual bytes of the pointers
"""
cdef:
Py_ssize_t i
Py_ssize_t n
int64_t size = 0
n = len(arr)
for i in range(n):
size += arr[i].__sizeof__()
return size
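# Illustrative usage (a sketch, not from the upstream file): for an object-dtype
# array arr = np.array([1, "a", 3.0], dtype=object), memory_usage_of_objects(arr)
# returns sum(x.__sizeof__() for x in arr), i.e. the per-element sizes without
# the pointer storage of the array itself.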
# ----------------------------------------------------------------------
def is_scalar(val: object) -> bool:
"""
Return True if given object is scalar.
Parameters
----------
val : object
This includes:
- numpy array scalar (e.g. np.int64)
- Python builtin numerics
- Python builtin byte arrays and strings
- None
- datetime.datetime
- datetime.timedelta
- Period
- decimal.Decimal
- Interval
- DateOffset
- Fraction
- Number.
Returns
-------
bool
Return True if given object is scalar.
Examples
--------
>>> import datetime
>>> dt = datetime.datetime(2018, 10, 3)
>>> pd.api.types.is_scalar(dt)
True
>>> pd.api.types.is_scalar([2, 3])
False
>>> pd.api.types.is_scalar({0: 1, 2: 3})
False
>>> pd.api.types.is_scalar((0, 2))
False
pandas supports PEP 3141 numbers:
>>> from fractions import Fraction
>>> pd.api.types.is_scalar(Fraction(3, 5))
True
"""
# Start with C-optimized checks
if (cnp.PyArray_IsAnyScalar(val)
# PyArray_IsAnyScalar is always False for bytearrays on Py3
or PyDate_Check(val)
or PyDelta_Check(val)
or PyTime_Check(val)
# We differ from numpy, which claims that None is not scalar;
# see np.isscalar
or val is C_NA
or val is None):
return True
# Next use C-optimized checks to exclude common non-scalars before falling
# back to non-optimized checks.
if PySequence_Check(val):
# e.g. list, tuple
# includes np.ndarray, Series which PyNumber_Check can return True for
return False
# Note: PyNumber_Check check includes Decimal, Fraction, numbers.Number
return (PyNumber_Check(val)
or is_period_object(val)
or is_interval(val)
or is_offset_object(val))
cdef int64_t get_itemsize(object val):
"""
Get the itemsize of a NumPy scalar, -1 if not a NumPy scalar.
Parameters
----------
val : object
Returns
-------
    itemsize : int
        The itemsize of ``val`` if it is a NumPy scalar, -1 otherwise.
"""
if PyArray_CheckScalar(val):
return cnp.PyArray_DescrFromScalar(val).itemsize
else:
return -1
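# Illustrative (derived from the logic above): called from Cython code,
# get_itemsize(np.float64(0)) returns 8 (the float64 descriptor's itemsize),
# while a non-NumPy scalar such as a Python int or a list returns -1.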
def is_iterator(obj: object) -> bool:
"""
Check if the object is an iterator.
This is intended for generators, not list-like objects.
Parameters
----------
obj : The object to check
Returns
-------
is_iter : bool
Whether `obj` is an iterator.
Examples
--------
>>> import datetime
>>> from pandas.api.types import is_iterator
>>> is_iterator((x for x in []))
True
>>> is_iterator([1, 2, 3])
False
>>> is_iterator(datetime.datetime(2017, 1, 1))
False
>>> is_iterator("foo")
False
>>> is_iterator(1)
False
"""
return PyIter_Check(obj)
def item_from_zerodim(val: object) -> object:
"""
If the value is a zerodim array, return the item it contains.
Parameters
----------
val : object
Returns
-------
object
Examples
--------
>>> item_from_zerodim(1)
1
>>> item_from_zerodim('foobar')
'foobar'
>>> item_from_zerodim(np.array(1))
1
>>> item_from_zerodim(np.array([1]))
array([1])
"""
if cnp.PyArray_IsZeroDim(val):
return cnp.PyArray_ToScalar(cnp.PyArray_DATA(val), val)
return val
@cython.wraparound(False)
@cython.boundscheck(False)
def fast_unique_multiple_list(lists: list, sort: bool | None = True) -> list:
cdef:
list buf
Py_ssize_t k = len(lists)
Py_ssize_t i, j, n
list uniques = []
dict table = {}
object val, stub = 0
for i in range(k):
buf = lists[i]
n = len(buf)
for j in range(n):
val = buf[j]
if val not in table:
table[val] = stub
uniques.append(val)
if sort:
try:
uniques.sort()
except TypeError:
pass
return uniques
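# Illustrative example (derived from the code above):
#   fast_unique_multiple_list([[3, 1], [1, 2]])             -> [1, 2, 3]
#   fast_unique_multiple_list([[3, 1], [1, 2]], sort=False) -> [3, 1, 2]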
@cython.wraparound(False)
@cython.boundscheck(False)
def fast_unique_multiple_list_gen(object gen, bint sort=True) -> list:
"""
Generate a list of unique values from a generator of lists.
Parameters
----------
gen : generator object
Generator of lists from which the unique list is created.
sort : bool
Whether or not to sort the resulting unique list.
Returns
-------
list of unique values
"""
cdef:
list buf
Py_ssize_t j, n
list uniques = []
dict table = {}
object val, stub = 0
for buf in gen:
n = len(buf)
for j in range(n):
val = buf[j]
if val not in table:
table[val] = stub
uniques.append(val)
if sort:
try:
uniques.sort()
except TypeError:
pass
return uniques
@cython.wraparound(False)
@cython.boundscheck(False)
def dicts_to_array(dicts: list, columns: list):
cdef:
Py_ssize_t i, j, k, n
ndarray[object, ndim=2] result
dict row
object col, onan = np.nan
k = len(columns)
n = len(dicts)
result = np.empty((n, k), dtype="O")
for i in range(n):
row = dicts[i]
for j in range(k):
col = columns[j]
if col in row:
result[i, j] = row[col]
else:
result[i, j] = onan
return result
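# Illustrative example (derived from the code above):
#   dicts_to_array([{"a": 1}, {"b": 2}], columns=["a", "b"])
# builds a (2, 2) object ndarray [[1, nan], [nan, 2]], filling keys missing
# from a row with np.nan.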
def fast_zip(list ndarrays) -> ndarray[object]:
"""
For zipping multiple ndarrays into an ndarray of tuples.
"""
cdef:
Py_ssize_t i, j, k, n
ndarray[object, ndim=1] result
flatiter it
object val, tup
k = len(ndarrays)
n = len(ndarrays[0])
result = np.empty(n, dtype=object)
# initialize tuples on first pass
arr = ndarrays[0]
it = <flatiter>PyArray_IterNew(arr)
for i in range(n):
val = PyArray_GETITEM(arr, PyArray_ITER_DATA(it))
tup = PyTuple_New(k)
PyTuple_SET_ITEM(tup, 0, val)
Py_INCREF(val)
result[i] = tup
PyArray_ITER_NEXT(it)
for j in range(1, k):
arr = ndarrays[j]
it = <flatiter>PyArray_IterNew(arr)
if len(arr) != n:
raise ValueError("all arrays must be same length")
for i in range(n):
val = PyArray_GETITEM(arr, PyArray_ITER_DATA(it))
PyTuple_SET_ITEM(result[i], j, val)
Py_INCREF(val)
PyArray_ITER_NEXT(it)
return result
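# Illustrative (derived from the code above): fast_zip([np.array([1, 2]),
# np.array(["a", "b"])]) returns a 1-d object ndarray of 2-tuples pairing the
# i-th elements, i.e. (1, 'a') and (2, 'b'); arrays of differing length raise
# ValueError.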
def get_reverse_indexer(const intp_t[:] indexer, Py_ssize_t length) -> ndarray:
"""
Reverse indexing operation.
Given `indexer`, make `indexer_inv` of it, such that::
indexer_inv[indexer[x]] = x
Parameters
----------
indexer : np.ndarray[np.intp]
length : int
Returns
-------
np.ndarray[np.intp]
Notes
-----
    If `indexer` contains duplicate values, the position of the last occurrence
    is the one that is kept.
"""
cdef:
Py_ssize_t i, n = len(indexer)
ndarray[intp_t, ndim=1] rev_indexer
intp_t idx
rev_indexer = np.empty(length, dtype=np.intp)
rev_indexer[:] = -1
for i in range(n):
idx = indexer[i]
if idx != -1:
rev_indexer[idx] = i
return rev_indexer
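# Illustrative (derived from the code above):
#   get_reverse_indexer(np.array([2, 0, 1], dtype=np.intp), 4) -> [1, 2, 0, -1]
# Positions never hit by the indexer stay -1.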
@cython.wraparound(False)
@cython.boundscheck(False)
# TODO(cython3): Can add const once cython#1772 is resolved
def has_infs(floating[:] arr) -> bool:
cdef:
Py_ssize_t i, n = len(arr)
floating inf, neginf, val
bint ret = False
inf = np.inf
neginf = -inf
with nogil:
for i in range(n):
val = arr[i]
if val == inf or val == neginf:
ret = True
break
return ret
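# Illustrative (derived from the code above): has_infs(np.array([0.0, np.inf]))
# is True, while has_infs(np.array([0.0, np.nan])) is False, since NaN compares
# unequal to both inf and -inf.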
def maybe_indices_to_slice(ndarray[intp_t, ndim=1] indices, int max_len):
cdef:
Py_ssize_t i, n = len(indices)
intp_t k, vstart, vlast, v
if n == 0:
return slice(0, 0)
vstart = indices[0]
if vstart < 0 or max_len <= vstart:
return indices
if n == 1:
return slice(vstart, <intp_t>(vstart + 1))
vlast = indices[n - 1]
if vlast < 0 or max_len <= vlast:
return indices
k = indices[1] - indices[0]
if k == 0:
return indices
else:
for i in range(2, n):
v = indices[i]
if v - indices[i - 1] != k:
return indices
if k > 0:
return slice(vstart, <intp_t>(vlast + 1), k)
else:
if vlast == 0:
return slice(vstart, None, k)
else:
return slice(vstart, <intp_t>(vlast - 1), k)
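# Illustrative examples (derived from the code above):
#   maybe_indices_to_slice(np.array([2, 3, 4, 5], dtype=np.intp), 10) -> slice(2, 6, 1)
#   maybe_indices_to_slice(np.array([2, 5, 3], dtype=np.intp), 10)    -> the array, unchanged
# Only equally-spaced, in-bounds indexers are converted to a slice.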
@cython.wraparound(False)
@cython.boundscheck(False)
def maybe_booleans_to_slice(ndarray[uint8_t, ndim=1] mask):
cdef:
Py_ssize_t i, n = len(mask)
Py_ssize_t start = 0, end = 0
bint started = False, finished = False
for i in range(n):
if mask[i]:
if finished:
return mask.view(np.bool_)
if not started:
started = True
start = i
else:
if finished:
continue
if started:
end = i
finished = True
if not started:
return slice(0, 0)
if not finished:
return slice(start, None)
else:
return slice(start, end)
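# Illustrative (derived from the code above): a mask with a single contiguous run
# of True values, e.g. [False, True, True, False], becomes slice(1, 3); a mask
# with more than one run of True values is returned as a bool ndarray instead.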
@cython.wraparound(False)
@cython.boundscheck(False)
def array_equivalent_object(ndarray left, ndarray right) -> bool:
"""
    Perform an element-by-element comparison of N-d object arrays,
    taking NaN positions into account.
"""
# left and right both have object dtype, but we cannot annotate that
# without limiting ndim.
cdef:
Py_ssize_t i, n = left.size
object x, y
cnp.broadcast mi = cnp.PyArray_MultiIterNew2(left, right)
# Caller is responsible for checking left.shape == right.shape
for i in range(n):
# Analogous to: x = left[i]
x = <object>(<PyObject**>cnp.PyArray_MultiIter_DATA(mi, 0))[0]
y = <object>(<PyObject**>cnp.PyArray_MultiIter_DATA(mi, 1))[0]
# we are either not equal or both nan
# I think None == None will be true here
try:
if PyArray_Check(x) and PyArray_Check(y):
if x.shape != y.shape:
return False
if x.dtype == y.dtype == object:
if not array_equivalent_object(x, y):
return False
else:
# Circular import isn't great, but so it goes.
# TODO: could use np.array_equal?
from pandas.core.dtypes.missing import array_equivalent
if not array_equivalent(x, y):
return False
elif (x is C_NA) ^ (y is C_NA):
return False
elif not (
PyObject_RichCompareBool(x, y, Py_EQ)
or is_matching_na(x, y, nan_matches_none=True)
):
return False
except (ValueError, TypeError):
# Avoid raising ValueError when comparing Numpy arrays to other types
if cnp.PyArray_IsAnyScalar(x) != cnp.PyArray_IsAnyScalar(y):
# Only compare scalars to scalars and non-scalars to non-scalars
return False
elif (not (cnp.PyArray_IsPythonScalar(x) or cnp.PyArray_IsPythonScalar(y))
and not (isinstance(x, type(y)) or isinstance(y, type(x)))):
# Check if non-scalars have the same type
return False
elif check_na_tuples_nonequal(x, y):
                # We have tuples where one side has an NA and the other side does not;
                # this is the only condition where we may end up with a TypeError
return False
raise
cnp.PyArray_MultiIter_NEXT(mi)
return True
ctypedef fused int6432_t:
int64_t
int32_t
@cython.wraparound(False)
@cython.boundscheck(False)
def is_range_indexer(ndarray[int6432_t, ndim=1] left, int n) -> bool:
"""
    Perform an element-by-element comparison of 1-d integer arrays, meant for
    indexer comparisons.
"""
cdef:
Py_ssize_t i
if left.size != n:
return False
for i in range(n):
if left[i] != i:
return False
return True
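# Illustrative (derived from the code above):
#   is_range_indexer(np.array([0, 1, 2], dtype=np.int64), 3) -> True
#   is_range_indexer(np.array([0, 2, 1], dtype=np.int64), 3) -> False
# A length mismatch with n also returns False.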
ctypedef fused ndarr_object:
ndarray[object, ndim=1]
ndarray[object, ndim=2]
# TODO: get rid of this in StringArray and modify it to go
# through ensure_string_array instead
@cython.wraparound(False)
@cython.boundscheck(False)
def convert_nans_to_NA(ndarr_object arr) -> ndarray:
"""
    Helper for StringArray that converts null values that are not pd.NA
    (e.g. np.nan, None) to pd.NA. Assumes the elements have already been
    validated as null.
"""
cdef:
        Py_ssize_t i, j, m, n
object val
ndarr_object result
result = np.asarray(arr, dtype="object")
if arr.ndim == 2:
m, n = arr.shape[0], arr.shape[1]
for i in range(m):
for j in range(n):
val = arr[i, j]
if not isinstance(val, str):
result[i, j] = <object>C_NA
else:
n = len(arr)
for i in range(n):
val = arr[i]
if not isinstance(val, str):
result[i] = <object>C_NA
return result
@cython.wraparound(False)
@cython.boundscheck(False)
cpdef ndarray[object] ensure_string_array(
arr,
object na_value=np.nan,
bint convert_na_value=True,
bint copy=True,
bint skipna=True,
):
"""
Returns a new numpy array with object dtype and only strings and na values.
Parameters
----------
arr : array-like
The values to be converted to str, if needed.
na_value : Any, default np.nan
The value to use for na. For example, np.nan or pd.NA.
convert_na_value : bool, default True
If False, existing na values will be used unchanged in the new array.
copy : bool, default True
Whether to ensure that a new array is returned.
skipna : bool, default True
Whether or not to coerce nulls to their stringified form
(e.g. if False, NaN becomes 'nan').
Returns
-------
np.ndarray[object]
        An array with the input array's elements cast to str or nan-like.
"""
cdef:
Py_ssize_t i = 0, n = len(arr)
bint already_copied = True
if hasattr(arr, "to_numpy"):
if hasattr(arr, "dtype") and arr.dtype.kind in ["m", "M"]:
# dtype check to exclude DataFrame
# GH#41409 TODO: not a great place for this
out = arr.astype(str).astype(object)
out[arr.isna()] = na_value
return out
arr = arr.to_numpy()
elif not util.is_array(arr):
arr = np.array(arr, dtype="object")
result = np.asarray(arr, dtype="object")
if copy and result is arr:
result = result.copy()
elif not copy and result is arr:
already_copied = False
if issubclass(arr.dtype.type, np.str_):
# short-circuit, all elements are str
return result
for i in range(n):
val = arr[i]
if isinstance(val, str):
continue
elif not already_copied:
result = result.copy()
already_copied = True
if not checknull(val):
if isinstance(val, bytes):
# GH#49658 discussion of desired behavior here
result[i] = val.decode()
elif not util.is_float_object(val):
# f"{val}" is faster than str(val)
result[i] = f"{val}"
else:
# f"{val}" is not always equivalent to str(val) for floats
result[i] = str(val)
else:
if convert_na_value:
val = na_value
if skipna:
result[i] = val
else:
result[i] = f"{val}"
return result
def is_all_arraylike(obj: list) -> bool:
"""
Should we treat these as levels of a MultiIndex, as opposed to Index items?
"""
cdef:
Py_ssize_t i, n = len(obj)
object val
bint all_arrays = True
for i in range(n):
val = obj[i]
if not (isinstance(val, list) or
util.is_array(val) or hasattr(val, "_data")):
# TODO: EA?
# exclude tuples, frozensets as they may be contained in an Index
all_arrays = False
break
return all_arrays
# ------------------------------------------------------------------------------
# Groupby-related functions
# TODO: could do even better if we know something about the data. E.g., if the
# index has 1-min data and the binner has 5-min data, then bins are just strides
# in the index. This is a general, O(max(len(values), len(binner))) method.
@cython.boundscheck(False)
@cython.wraparound(False)
def generate_bins_dt64(ndarray[int64_t, ndim=1] values, const int64_t[:] binner,
object closed="left", bint hasnans=False):
"""
    Int64 (datetime64) version of the generic Python version in ``groupby.py``.
"""
cdef:
Py_ssize_t lenidx, lenbin, i, j, bc
ndarray[int64_t, ndim=1] bins
int64_t r_bin, nat_count
bint right_closed = closed == "right"
nat_count = 0
if hasnans:
mask = values == NPY_NAT
nat_count = np.sum(mask)
values = values[~mask]
lenidx = len(values)
lenbin = len(binner)
if lenidx <= 0 or lenbin <= 0:
raise ValueError("Invalid length for values or for binner")
# check binner fits data
if values[0] < binner[0]:
raise ValueError("Values falls before first bin")
if values[lenidx - 1] > binner[lenbin - 1]:
raise ValueError("Values falls after last bin")
bins = np.empty(lenbin - 1, dtype=np.int64)
j = 0 # index into values
bc = 0 # bin count
# linear scan
if right_closed:
for i in range(0, lenbin - 1):
r_bin = binner[i + 1]
# count values in current bin, advance to next bin
while j < lenidx and values[j] <= r_bin:
j += 1
bins[bc] = j
bc += 1
else:
for i in range(0, lenbin - 1):
r_bin = binner[i + 1]
# count values in current bin, advance to next bin
while j < lenidx and values[j] < r_bin:
j += 1
bins[bc] = j
bc += 1
if nat_count > 0:
# shift bins by the number of NaT
bins = bins + nat_count
bins = np.insert(bins, 0, nat_count)
return bins
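# Illustrative (derived from the code above), with int64 values/binner:
#   values = [1, 2, 5, 6], binner = [0, 3, 7], closed="left" -> bins = [2, 4]
# i.e. two values fall in [0, 3) and the remaining two in [3, 7); with
# closed="right", values equal to a right edge count toward the earlier bin.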
@cython.boundscheck(False)
@cython.wraparound(False)
def get_level_sorter(
ndarray[int64_t, ndim=1] codes, const intp_t[:] starts
) -> ndarray:
"""
Argsort for a single level of a multi-index, keeping the order of higher
    levels unchanged. `starts` points to the starts of same-key indices w.r.t.
    the leading levels; equivalent to:
np.hstack([codes[starts[i]:starts[i+1]].argsort(kind='mergesort')
+ starts[i] for i in range(len(starts) - 1)])
Parameters
----------
codes : np.ndarray[int64_t, ndim=1]
starts : np.ndarray[intp, ndim=1]
Returns
-------
    np.ndarray[np.intp, ndim=1]
"""
cdef:
Py_ssize_t i, l, r
ndarray[intp_t, ndim=1] out = cnp.PyArray_EMPTY(1, codes.shape, cnp.NPY_INTP, 0)
for i in range(len(starts) - 1):
l, r = starts[i], starts[i + 1]
out[l:r] = l + codes[l:r].argsort(kind="mergesort")
return out
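# Illustrative (derived from the code above): with codes = [2, 0, 1, 0] and
# starts = [0, 2, 4], the blocks codes[0:2] and codes[2:4] are argsorted
# independently, giving [1, 0, 3, 2].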
@cython.boundscheck(False)
@cython.wraparound(False)
def count_level_2d(ndarray[uint8_t, ndim=2, cast=True] mask,
const intp_t[:] labels,
Py_ssize_t max_bin,
):
cdef:
Py_ssize_t i, j, k, n
ndarray[int64_t, ndim=2] counts
n, k = (<object>mask).shape
counts = np.zeros((n, max_bin), dtype="i8")
with nogil:
for i in range(n):
for j in range(k):
if mask[i, j]:
counts[i, labels[j]] += 1
return counts
@cython.wraparound(False)
@cython.boundscheck(False)
def generate_slices(const intp_t[:] labels, Py_ssize_t ngroups):
cdef:
Py_ssize_t i, group_size, n, start
intp_t lab
int64_t[::1] starts, ends
n = len(labels)
starts = np.zeros(ngroups, dtype=np.int64)
ends = np.zeros(ngroups, dtype=np.int64)
start = 0
group_size = 0
with nogil:
for i in range(n):
lab = labels[i]
if lab < 0:
start += 1
else:
group_size += 1
if i == n - 1 or lab != labels[i + 1]:
starts[lab] = start
ends[lab] = start + group_size
start += group_size
group_size = 0
return np.asarray(starts), np.asarray(ends)
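# Illustrative (derived from the code above): labels = [0, 0, 1, 1, 1] with
# ngroups=2 gives starts = [0, 2] and ends = [2, 5]; a label of -1 is not
# assigned to any group and only advances the running start position.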
def indices_fast(ndarray[intp_t, ndim=1] index, const int64_t[:] labels, list keys,
list sorted_labels) -> dict:
"""
Parameters
----------
index : ndarray[intp]
labels : ndarray[int64]
keys : list
sorted_labels : list[ndarray[int64]]
"""
cdef:
Py_ssize_t i, j, k, lab, cur, start, n = len(labels)
dict result = {}
object tup
k = len(keys)
# Start at the first non-null entry
j = 0
for j in range(0, n):
if labels[j] != -1:
break
else:
return result
cur = labels[j]