@@ -77,12 +77,11 @@ def return_df_in_original_format(df, received_ID_col=False, received_single_time
         pd.Dataframe
             original input format
     """
-    new_df = df.copy(deep=True)
     if not received_ID_col and received_single_time_series:
-        assert len(new_df["ID"].unique()) == 1
-        new_df.drop("ID", axis=1, inplace=True)
+        assert len(df["ID"].unique()) == 1
+        df.drop("ID", axis=1, inplace=True)
         log.info("Returning df with no ID column")
-    return new_df
+    return df


 def merge_dataframes(df: pd.DataFrame) -> pd.DataFrame:
@@ -102,7 +101,7 @@ def merge_dataframes(df: pd.DataFrame) -> pd.DataFrame:
         raise ValueError("Can not join other than pd.DataFrames")
     if "ID" not in df.columns:
         raise ValueError("df does not contain 'ID' column")
-    df_merged = df.copy(deep=True).drop("ID", axis=1)
+    df_merged = df.drop("ID", axis=1)
     df_merged = df_merged.sort_values("ds")
     df_merged = df_merged.drop_duplicates(subset=["ds"])
     df_merged = df_merged.reset_index(drop=True)
@@ -282,11 +281,8 @@ def init_data_params(
             ShiftScale entries containing ``shift`` and ``scale`` parameters for each column
     """
     # Compute Global data params
-    # df = df.copy(deep=True)
-    # df, _, _, _ = check_multiple_series_id(df)
-    df_merged = df.copy(deep=True).drop("ID", axis=1)
     global_data_params = data_params_definition(
-        df_merged, normalize, config_lagged_regressors, config_regressors, config_events, config_seasonality
+        df, normalize, config_lagged_regressors, config_regressors, config_events, config_seasonality
     )
     if global_normalization:
         log.debug(
@@ -296,7 +292,6 @@ def init_data_params(
     local_data_params = OrderedDict()
     local_run_despite_global = True if global_normalization else None
     for df_name, df_i in df.groupby("ID"):
-        df_i.drop("ID", axis=1, inplace=True)
         local_data_params[df_name] = data_params_definition(
             df=df_i,
             normalize=normalize,
@@ -378,7 +373,6 @@ def normalize(df, data_params):
         pd.DataFrame
             normalized dataframes
     """
-    df = df.copy(deep=True)
     for name in df.columns:
         if name == "ID":
             continue
@@ -428,8 +422,7 @@ def check_dataframe(
         pd.DataFrame or dict
             checked dataframe
     """
-    # df = df.copy(deep=True)
-    # df, _, _, _ = check_multiple_series_id(df)
+    # TODO: move call to check_multiple_series_id here
     if df.groupby("ID").size().min() < 1:
         raise ValueError("Dataframe has no rows.")
     if "ds" not in df:
@@ -542,7 +535,7 @@ def _crossvalidation_split_df(df, n_lags, n_forecasts, k, fold_pct, fold_overlap
     min_train = total_samples - samples_fold - (k - 1) * (samples_fold - samples_overlap)
     assert min_train >= samples_fold
     folds = []
-    df_fold = df.copy(deep=True)
+    df_fold = df
     for i in range(k, 0, -1):
         df_train, df_val = split_df(df_fold, n_lags, n_forecasts, valid_p=samples_fold, inputs_overbleed=True)
         folds.append((df_train, df_val))
@@ -635,33 +628,30 @@ def _crossvalidation_with_time_threshold(df, n_lags, n_forecasts, k, fold_pct, f

             validation data
     """
-    df_merged = merge_dataframes(df)
+    df_merged = merge_dataframes(df.copy(deep=True))
     total_samples = len(df_merged) - n_lags + 2 - (2 * n_forecasts)
     samples_fold = max(1, int(fold_pct * total_samples))
     samples_overlap = int(fold_overlap_pct * samples_fold)
     assert samples_overlap < samples_fold
     min_train = total_samples - samples_fold - (k - 1) * (samples_fold - samples_overlap)
     assert min_train >= samples_fold
     folds = []
-    df_fold = df
-    # df_fold = df.copy(deep=True)
-    # df_fold, _, _, _ = check_multiple_series_id(df_fold)
     for i in range(k, 0, -1):
-        threshold_time_stamp = find_time_threshold(df_fold, n_lags, n_forecasts, samples_fold, inputs_overbleed=True)
+        threshold_time_stamp = find_time_threshold(df, n_lags, n_forecasts, samples_fold, inputs_overbleed=True)
         df_train, df_val = split_considering_timestamp(
-            df_fold, n_lags, n_forecasts, inputs_overbleed=True, threshold_time_stamp=threshold_time_stamp
+            df, n_lags, n_forecasts, inputs_overbleed=True, threshold_time_stamp=threshold_time_stamp
         )
         folds.append((df_train, df_val))
         split_idx = len(df_merged) - samples_fold + samples_overlap
         df_merged = df_merged[:split_idx].reset_index(drop=True)
         threshold_time_stamp = df_merged["ds"].iloc[-1]
         df_fold_aux = pd.DataFrame()
-        for df_name, df_i in df_fold.groupby("ID"):
-            df_aux = (
-                df_i.copy(deep=True).iloc[: len(df_i[df_i["ds"] < threshold_time_stamp]) + 1].reset_index(drop=True)
-            )
+        for df_name, df_i in df.groupby("ID"):
+            # df_i = df_i.copy(deep=True)
+            df_aux = df_i.iloc[: len(df_i[df_i["ds"] < threshold_time_stamp]) + 1].reset_index(drop=True)
             df_fold_aux = pd.concat((df_fold_aux, df_aux), ignore_index=True)
-        df_fold = df_fold_aux.copy(deep=True)
+        df = df_fold_aux
+        # df = df.copy(deep=True)
     folds = folds[::-1]
     return folds

@@ -707,7 +697,6 @@ def crossvalidation_split_df(

             validation data
     """
-    # df = df.copy(deep=True)
     df, _, _, _ = check_multiple_series_id(df)
     folds = []
     if len(df["ID"].unique()) == 1:
@@ -733,7 +722,7 @@ def crossvalidation_split_df(
         start_date, end_date = find_valid_time_interval_for_cv(df)
         for df_name, df_i in df.groupby("ID"):
             mask = (df_i["ds"] >= start_date) & (df_i["ds"] <= end_date)
-            df_i = df_i[mask].copy(deep=True)
+            df_i = df_i[mask]
             folds_dict[df_name] = _crossvalidation_split_df(
                 df_i, n_lags, n_forecasts, k, fold_pct, fold_overlap_pct
             )
@@ -768,8 +757,6 @@ def double_crossvalidation_split_df(df, n_lags, n_forecasts, k, valid_pct, test_
         tuple of k tuples [(folds_val, folds_test), …]
             elements same as :meth:`crossvalidation_split_df` returns
     """
-    # df = df.copy(deep=True)
-    # df, _, _, _ = check_multiple_series_id(df)
     if len(df["ID"].unique()) > 1:
         raise NotImplementedError("double_crossvalidation_split_df not implemented for df with many time series")
     fold_pct_test = float(test_pct) / k
@@ -800,7 +787,7 @@ def find_time_threshold(df, n_lags, n_forecasts, valid_p, inputs_overbleed):
         str
             time stamp threshold defines the boundary for the train and validation sets split.
     """
-    df_merged = merge_dataframes(df)
+    df_merged = merge_dataframes(df.copy(deep=True))
     n_samples = len(df_merged) - n_lags + 2 - (2 * n_forecasts)
     n_samples = n_samples if inputs_overbleed else n_samples - n_lags
     if 0.0 < valid_p < 1.0:
@@ -842,11 +829,14 @@ def split_considering_timestamp(df, n_lags, n_forecasts, inputs_overbleed, thres
     df_val = pd.DataFrame()
     for df_name, df_i in df.groupby("ID"):
         if df[df["ID"] == df_name]["ds"].max() < threshold_time_stamp:
-            df_train = pd.concat((df_train, df_i.copy(deep=True)), ignore_index=True)
+            # df_i = df_i.copy(deep=True)
+            df_train = pd.concat((df_train, df_i), ignore_index=True)
         elif df[df["ID"] == df_name]["ds"].min() > threshold_time_stamp:
-            df_val = pd.concat((df_val, df_i.copy(deep=True)), ignore_index=True)
+            # df_i = df_i.copy(deep=True)
+            df_val = pd.concat((df_val, df_i), ignore_index=True)
         else:
-            df_aux = df_i.copy(deep=True)
+            df_aux = df_i
+            # df_i = df_i.copy(deep=True)
             n_train = len(df_aux[df_aux["ds"] < threshold_time_stamp])
             split_idx_train = n_train + n_lags + n_forecasts - 1
             split_idx_val = split_idx_train - n_lags if inputs_overbleed else split_idx_train
@@ -890,8 +880,6 @@ def split_df(
         pd.DataFrame, dict
             validation data
     """
-    # df = df.copy(deep=True)
-    # df, _, _, _ = check_multiple_series_id(df)
     df_train = pd.DataFrame()
     df_val = pd.DataFrame()
     if local_split:
@@ -1373,8 +1361,6 @@ def infer_frequency(df, freq, n_lags, min_freq_percentage=0.7):
             Valid frequency tag according to major frequency.

     """
-    # df = df.copy(deep=True)
-    # df, _, _, _ = check_multiple_series_id(df)
     freq_df = list()
     for df_name, df_i in df.groupby("ID"):
         freq_df.append(_infer_frequency(df_i, freq, min_freq_percentage))
@@ -1396,6 +1382,7 @@ def create_dict_for_events_or_regressors(
     df: pd.DataFrame,
     other_df: Optional[pd.DataFrame],
     other_df_name: str,
+    received_ID_col: bool,
 ) -> dict:  # Not sure about the naming of this function
     """Create a dict for events or regressors according to input df.

@@ -1417,12 +1404,10 @@ def create_dict_for_events_or_regressors(
     if other_df is None:
         # if other_df is None, create dictionary with None for each ID
        return {df_name: None for df_name in df_names}
-    other_df = other_df.copy(deep=True)
-    other_df, received_ID_col, _, _ = check_multiple_series_id(other_df)
     # if other_df does not contain ID, create dictionary with original ID with the same other_df for each ID
     if not received_ID_col:
         other_df = other_df.drop("ID", axis=1)
-        return {df_name: other_df.copy(deep=True) for df_name in df_names}
+        return {df_name: other_df for df_name in df_names}

     # else, other_df does contain ID, create dict with respective IDs
     df_unique_names, other_df_unique_names = list(df["ID"].unique()), list(other_df["ID"].unique())
@@ -1438,7 +1423,7 @@ def create_dict_for_events_or_regressors(
     df_other_dict = {}
     for df_name in df_unique_names:
         if df_name in other_df_unique_names:
-            df_aux = other_df[other_df["ID"] == df_name].reset_index(drop=True).copy(deep=True)
+            df_aux = other_df[other_df["ID"] == df_name].reset_index(drop=True)
             df_aux.drop("ID", axis=1, inplace=True)
         else:
             df_aux = None
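
Note on the caller-side effect of dropping these deep copies: functions such as return_df_in_original_format now mutate and return the dataframe they are given instead of working on a private copy, so callers that still need the original frame afterwards should copy it themselves before calling. A minimal sketch of that pattern follows (illustrative only; the sample dataframe, its values, and the import path are assumptions, not part of this diff):

import pandas as pd

from neuralprophet.df_utils import return_df_in_original_format  # assumed module path

# Hypothetical single-series frame that already carries the internal "ID" column.
df = pd.DataFrame(
    {
        "ds": pd.date_range("2024-01-01", periods=3, freq="D"),
        "y": [1.0, 2.0, 3.0],
        "ID": ["series_1"] * 3,
    }
)

# After this change the function drops "ID" in place and returns the same object,
# so pass an explicit copy if df itself must keep its "ID" column.
df_out = return_df_in_original_format(df.copy(), received_ID_col=False, received_single_time_series=True)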