1
- import os
2
- import pytest
3
-
4
1
import numpy as np
5
- from pandas . compat import zip
2
+ import pytest
6
3
7
4
import pandas as pd
8
- from pandas import (DataFrame , Series , isna , to_datetime , DatetimeIndex , Index ,
9
- Timestamp , Interval , IntervalIndex , Categorical ,
10
- cut , qcut , date_range , timedelta_range , NaT ,
11
- TimedeltaIndex )
12
- from pandas .tseries .offsets import Nano , Day
13
- import pandas .util .testing as tm
5
+ from pandas import (
6
+ Categorical , DataFrame , DatetimeIndex , Index , Interval , IntervalIndex ,
7
+ Series , TimedeltaIndex , Timestamp , cut , date_range , isna , qcut ,
8
+ timedelta_range , to_datetime )
14
9
from pandas .api .types import CategoricalDtype as CDT
15
-
16
- from pandas .core .algorithms import quantile
17
10
import pandas .core .reshape .tile as tmod
11
+ import pandas .util .testing as tm
18
12
19
13
20
14
def test_simple ():
@@ -211,44 +205,6 @@ def test_inf_handling():
211
205
assert result_ser [0 ] == Interval (- np .inf , 2 )
212
206
213
207
214
- def test_qcut ():
215
- arr = np .random .randn (1000 )
216
-
217
- # We store the bins as Index that have been
218
- # rounded to comparisons are a bit tricky.
219
- labels , bins = qcut (arr , 4 , retbins = True )
220
- ex_bins = quantile (arr , [0 , .25 , .5 , .75 , 1. ])
221
-
222
- result = labels .categories .left .values
223
- assert np .allclose (result , ex_bins [:- 1 ], atol = 1e-2 )
224
-
225
- result = labels .categories .right .values
226
- assert np .allclose (result , ex_bins [1 :], atol = 1e-2 )
227
-
228
- ex_levels = cut (arr , ex_bins , include_lowest = True )
229
- tm .assert_categorical_equal (labels , ex_levels )
230
-
231
-
232
- def test_qcut_bounds ():
233
- arr = np .random .randn (1000 )
234
-
235
- factor = qcut (arr , 10 , labels = False )
236
- assert len (np .unique (factor )) == 10
237
-
238
-
239
- def test_qcut_specify_quantiles ():
240
- arr = np .random .randn (100 )
241
- factor = qcut (arr , [0 , .25 , .5 , .75 , 1. ])
242
-
243
- expected = qcut (arr , 4 )
244
- tm .assert_categorical_equal (factor , expected )
245
-
246
-
247
- def test_qcut_all_bins_same ():
248
- with pytest .raises (ValueError , match = "edges.*unique" ):
249
- qcut ([0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 ], 3 )
250
-
251
-
252
208
def test_cut_out_of_bounds ():
253
209
arr = np .random .randn (100 )
254
210
result = cut (arr , [- 1 , 0 , 1 ])
@@ -286,31 +242,6 @@ def test_cut_pass_labels_compat():
286
242
tm .assert_categorical_equal (result , exp )
287
243
288
244
289
- def test_qcut_include_lowest ():
290
- values = np .arange (10 )
291
- ii = qcut (values , 4 )
292
-
293
- ex_levels = IntervalIndex ([Interval (- 0.001 , 2.25 ), Interval (2.25 , 4.5 ),
294
- Interval (4.5 , 6.75 ), Interval (6.75 , 9 )])
295
- tm .assert_index_equal (ii .categories , ex_levels )
296
-
297
-
298
- def test_qcut_nas ():
299
- arr = np .random .randn (100 )
300
- arr [:20 ] = np .nan
301
-
302
- result = qcut (arr , 4 )
303
- assert isna (result [:20 ]).all ()
304
-
305
-
306
- def test_qcut_index ():
307
- result = qcut ([0 , 2 ], 2 )
308
- intervals = [Interval (- 0.001 , 1 ), Interval (1 , 2 )]
309
-
310
- expected = Categorical (intervals , ordered = True )
311
- tm .assert_categorical_equal (result , expected )
312
-
313
-
314
245
@pytest .mark .parametrize ("x" , [np .arange (11. ), np .arange (11. ) / 1e10 ])
315
246
def test_round_frac_just_works (x ):
316
247
# It works.
@@ -329,30 +260,6 @@ def test_round_frac(val, precision, expected):
329
260
assert result == expected
330
261
331
262
332
- def test_qcut_binning_issues (datapath ):
333
- # see gh-1978, gh-1979
334
- cut_file = datapath (os .path .join ("reshape" , "data" , "cut_data.csv" ))
335
- arr = np .loadtxt (cut_file )
336
- result = qcut (arr , 20 )
337
-
338
- starts = []
339
- ends = []
340
-
341
- for lev in np .unique (result ):
342
- s = lev .left
343
- e = lev .right
344
- assert s != e
345
-
346
- starts .append (float (s ))
347
- ends .append (float (e ))
348
-
349
- for (sp , sn ), (ep , en ) in zip (zip (starts [:- 1 ], starts [1 :]),
350
- zip (ends [:- 1 ], ends [1 :])):
351
- assert sp < sn
352
- assert ep < en
353
- assert ep <= sn
354
-
355
-
356
263
def test_cut_return_intervals ():
357
264
ser = Series ([0 , 1 , 2 , 3 , 4 , 5 , 6 , 7 , 8 ])
358
265
result = cut (ser , 3 )
@@ -365,17 +272,6 @@ def test_cut_return_intervals():
365
272
tm .assert_series_equal (result , expected )
366
273
367
274
368
- def test_qcut_return_intervals ():
369
- ser = Series ([0 , 1 , 2 , 3 , 4 , 5 , 6 , 7 , 8 ])
370
- res = qcut (ser , [0 , 0.333 , 0.666 , 1 ])
371
-
372
- exp_levels = np .array ([Interval (- 0.001 , 2.664 ),
373
- Interval (2.664 , 5.328 ), Interval (5.328 , 8 )])
374
- exp = Series (exp_levels .take ([0 , 0 , 0 , 1 , 1 , 1 , 2 , 2 , 2 ])).astype (
375
- CDT (ordered = True ))
376
- tm .assert_series_equal (res , exp )
377
-
378
-
379
275
def test_series_ret_bins ():
380
276
# see gh-8589
381
277
ser = Series (np .arange (4 ))
@@ -406,47 +302,6 @@ def test_cut_duplicates_bin(kwargs, msg):
406
302
tm .assert_series_equal (result , expected )
407
303
408
304
409
- @pytest .mark .parametrize ("kwargs,msg" , [
410
- (dict (duplicates = "drop" ), None ),
411
- (dict (), "Bin edges must be unique" ),
412
- (dict (duplicates = "raise" ), "Bin edges must be unique" ),
413
- (dict (duplicates = "foo" ), "invalid value for 'duplicates' parameter" )
414
- ])
415
- def test_qcut_duplicates_bin (kwargs , msg ):
416
- # see gh-7751
417
- values = [0 , 0 , 0 , 0 , 1 , 2 , 3 ]
418
-
419
- if msg is not None :
420
- with pytest .raises (ValueError , match = msg ):
421
- qcut (values , 3 , ** kwargs )
422
- else :
423
- result = qcut (values , 3 , ** kwargs )
424
- expected = IntervalIndex ([Interval (- 0.001 , 1 ), Interval (1 , 3 )])
425
- tm .assert_index_equal (result .categories , expected )
426
-
427
-
428
- @pytest .mark .parametrize ("data,start,end" , [
429
- (9.0 , 8.999 , 9.0 ),
430
- (0.0 , - 0.001 , 0.0 ),
431
- (- 9.0 , - 9.001 , - 9.0 ),
432
- ])
433
- @pytest .mark .parametrize ("length" , [1 , 2 ])
434
- @pytest .mark .parametrize ("labels" , [None , False ])
435
- def test_single_quantile (data , start , end , length , labels ):
436
- # see gh-15431
437
- ser = Series ([data ] * length )
438
- result = qcut (ser , 1 , labels = labels )
439
-
440
- if labels is None :
441
- intervals = IntervalIndex ([Interval (start , end )] *
442
- length , closed = "right" )
443
- expected = Series (intervals ).astype (CDT (ordered = True ))
444
- else :
445
- expected = Series ([0 ] * length )
446
-
447
- tm .assert_series_equal (result , expected )
448
-
449
-
450
305
@pytest .mark .parametrize ("data" , [9.0 , - 9.0 , 0.0 ])
451
306
@pytest .mark .parametrize ("length" , [1 , 2 ])
452
307
def test_single_bin (data , length ):
@@ -474,21 +329,6 @@ def test_cut_read_only(array_1_writeable, array_2_writeable):
474
329
cut (hundred_elements , array_2 ))
475
330
476
331
477
- @pytest .mark .parametrize ("ser" , [
478
- Series (DatetimeIndex (["20180101" , NaT , "20180103" ])),
479
- Series (TimedeltaIndex (["0 days" , NaT , "2 days" ]))],
480
- ids = lambda x : str (x .dtype ))
481
- def test_qcut_nat (ser ):
482
- # see gh-19768
483
- intervals = IntervalIndex .from_tuples ([
484
- (ser [0 ] - Nano (), ser [2 ] - Day ()),
485
- np .nan , (ser [2 ] - Day (), ser [2 ])])
486
- expected = Series (Categorical (intervals , ordered = True ))
487
-
488
- result = qcut (ser , 2 )
489
- tm .assert_series_equal (result , expected )
490
-
491
-
492
332
@pytest .mark .parametrize ("conv" , [
493
333
lambda v : Timestamp (v ),
494
334
lambda v : to_datetime (v ),
@@ -558,24 +398,6 @@ def test_datetime_tz_cut(bins, box):
558
398
tm .assert_series_equal (result , expected )
559
399
560
400
561
- @pytest .mark .parametrize ("bins" , [3 , np .linspace (0 , 1 , 4 )])
562
- def test_datetime_tz_qcut (bins ):
563
- # see gh-19872
564
- tz = "US/Eastern"
565
- ser = Series (date_range ("20130101" , periods = 3 , tz = tz ))
566
-
567
- result = qcut (ser , bins )
568
- expected = Series (IntervalIndex ([
569
- Interval (Timestamp ("2012-12-31 23:59:59.999999999" , tz = tz ),
570
- Timestamp ("2013-01-01 16:00:00" , tz = tz )),
571
- Interval (Timestamp ("2013-01-01 16:00:00" , tz = tz ),
572
- Timestamp ("2013-01-02 08:00:00" , tz = tz )),
573
- Interval (Timestamp ("2013-01-02 08:00:00" , tz = tz ),
574
- Timestamp ("2013-01-03 00:00:00" , tz = tz ))])).astype (
575
- CDT (ordered = True ))
576
- tm .assert_series_equal (result , expected )
577
-
578
-
579
401
def test_datetime_nan_error ():
580
402
msg = "bins must be of datetime64 dtype"
581
403
@@ -623,15 +445,3 @@ def test_timedelta_cut_roundtrip():
623
445
"2 days 00:00:00" ,
624
446
"3 days 00:00:00" ])
625
447
tm .assert_index_equal (result_bins , expected_bins )
626
-
627
-
628
- @pytest .mark .parametrize ("arg,expected_bins" , [
629
- [timedelta_range ("1day" , periods = 3 ),
630
- TimedeltaIndex (["1 days" , "2 days" , "3 days" ])],
631
- [date_range ("20180101" , periods = 3 ),
632
- DatetimeIndex (["2018-01-01" , "2018-01-02" , "2018-01-03" ])]])
633
- def test_date_like_qcut_bins (arg , expected_bins ):
634
- # see gh-19891
635
- ser = Series (arg )
636
- result , result_bins = qcut (ser , 2 , retbins = True )
637
- tm .assert_index_equal (result_bins , expected_bins )
0 commit comments