Commit 5e6b231

[Minor] Reduce DataFrame copies (#1648)
* don't copy return df
* explicitly copy df before calling merge
* remove copy from normalize
* fix test
* remove cv fold copy
* do not drop ID when normalizing
* do not copy when init data params
* cleanup
* remove copy from global cv intersect
* remove copy from create_dict_for_events_or_regressors
* handle None events regressors
* fix regressors
* remove copy from _normalize
* remove copy from _make_future_dataframe
* remove copy from _prepare_dataframe_to_predict
* remove double copy in plotting func
* fix typo
* remove split deepcopy
* retain comments of former copies in split
1 parent 4904808 commit 5e6b231

6 files changed (+60 lines, -71 lines)
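Most of the removed copies follow one pattern: a per-series loop that deep-copied each group before handing it to `pd.concat`. Since `pd.concat` already assembles its output into a new DataFrame, the extra copy only added memory traffic. A minimal sketch of the before/after pattern (illustrative only, not NeuralProphet code):

```python
import pandas as pd

df = pd.DataFrame({
    "ID": ["a", "a", "b"],
    "ds": pd.date_range("2024-01-01", periods=3),
    "y": [1.0, 2.0, 3.0],
})

parts = []
for df_name, df_i in df.groupby("ID"):
    # before: parts.append(df_i.copy(deep=True).reset_index(drop=True))
    # after: the defensive copy is dropped; pd.concat below still returns a fresh frame
    parts.append(df_i.reset_index(drop=True))
result = pd.concat(parts, ignore_index=True)
```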

neuralprophet/data/process.py

+1 -7
```diff
@@ -251,7 +251,6 @@ def _prepare_dataframe_to_predict(model, df: pd.DataFrame, max_lags: int, freq:
     # Receives df with ID column
     df_prepared = pd.DataFrame()
     for df_name, df_i in df.groupby("ID"):
-        df_i = df_i.copy(deep=True)
         _ = df_utils.infer_frequency(df_i, n_lags=max_lags, freq=freq)
         # check if received pre-processed df
         if "y_scaled" in df_i.columns or "t" in df_i.columns:
@@ -283,7 +282,7 @@ def _prepare_dataframe_to_predict(model, df: pd.DataFrame, max_lags: int, freq:
             config_seasonality=model.config_seasonality,
             predicting=True,
         )
-        df_prepared = pd.concat((df_prepared, df_i.copy(deep=True).reset_index(drop=True)), ignore_index=True)
+        df_prepared = pd.concat((df_prepared, df_i.reset_index(drop=True)), ignore_index=True)
     return df_prepared
 
 
@@ -399,8 +398,6 @@ def _check_dataframe(
             "Dataframe has less than n_forecasts + n_lags rows. "
             "Forecasting not possible. Please either use a larger dataset, or adjust the model parameters."
         )
-    # df = df.copy(deep=True)
-    # df, _, _, _ = df_utils.check_multiple_series_id(df)
     df, regressors_to_remove, lag_regressors_to_remove = df_utils.check_dataframe(
         df=df,
         check_y=check_y,
@@ -475,9 +472,6 @@ def _handle_missing_data(
         The pre-processed DataFrame, including imputed missing data, if applicable.
 
     """
-    # df = df.copy(deep=True)
-    # df, _, _, _ = df_utils.check_multiple_series_id(df)
-
     if n_lags == 0 and not predicting:
         # drop rows with NaNs in y and count them
         df_na_dropped = df.dropna(subset=["y"])
```

neuralprophet/data/split.py

+5 -6
```diff
@@ -65,7 +65,7 @@ def _maybe_extend_df(
             future_df["ID"] = df_name
             df_i = pd.concat([df_i, future_df])
             df_i.reset_index(drop=True, inplace=True)
-        extended_df = pd.concat((extended_df, df_i.copy(deep=True)), ignore_index=True)
+        extended_df = pd.concat((extended_df, df_i), ignore_index=True)
     return extended_df, periods_add
 
 
@@ -126,8 +126,8 @@
 def _make_future_dataframe(
     model,
     df: pd.DataFrame,
-    events_df: pd.DataFrame,
-    regressors_df: pd.DataFrame,
+    events_df: Optional[pd.DataFrame],
+    regressors_df: Optional[pd.DataFrame],
     periods: Optional[int],
     n_historic_predictions: int,
     n_forecasts: int,
@@ -174,13 +174,12 @@ def _make_future_dataframe(
         log.warning(
             "Not extending df into future as no periods specified. You can skip this and predict directly instead."
         )
-    df = df.copy(deep=True)
     _ = df_utils.infer_frequency(df, n_lags=max_lags, freq=freq)
     last_date = pd.to_datetime(df["ds"].copy(deep=True).dropna()).sort_values().max()
     if events_df is not None:
-        events_df = events_df.copy(deep=True).reset_index(drop=True)
+        events_df = events_df.reset_index(drop=True)
     if regressors_df is not None:
-        regressors_df = regressors_df.copy(deep=True).reset_index(drop=True)
+        regressors_df = regressors_df.reset_index(drop=True)
     if periods is None:
         periods = 1 if max_lags == 0 else n_forecasts
     else:
```
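The signature change makes existing behaviour explicit: `events_df` and `regressors_df` may be `None`, and the `if ... is not None` guards in the hunk above skip the reset entirely in that case. A hedged sketch of the same None-handling pattern (illustrative names, not the actual NeuralProphet signature):

```python
from typing import Optional

import pandas as pd


def make_future(df: pd.DataFrame, events_df: Optional[pd.DataFrame] = None) -> pd.DataFrame:
    if events_df is not None:
        # the deep copy was dropped here: reset_index(drop=True) returns a new
        # DataFrame object and the frame is only read afterwards
        events_df = events_df.reset_index(drop=True)
    return df
```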

neuralprophet/data/transform.py

+1 -3
```diff
@@ -24,13 +24,11 @@ def _normalize(df: pd.DataFrame, config_normalization: Normalization) -> pd.Data
     -------
     df: pd.DataFrame, normalized
     """
-    # df = df.copy(deep=True)
-    # df, _, _, _ = df_utils.check_multiple_series_id(df)
     df_norm = pd.DataFrame()
     for df_name, df_i in df.groupby("ID"):
         data_params = config_normalization.get_data_params(df_name)
         df_i.drop("ID", axis=1, inplace=True)
-        df_aux = df_utils.normalize(df_i, data_params).copy(deep=True)
+        df_aux = df_utils.normalize(df_i, data_params)
         df_aux["ID"] = df_name
         df_norm = pd.concat((df_norm, df_aux), ignore_index=True)
     return df_norm
```
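`df_utils.normalize` itself no longer deep-copies its input (see the df_utils.py hunk further down), so `_normalize` now lets it write into the per-group frame obtained from `groupby` and concatenates the results. A rough stand-in for the shift/scale arithmetic involved, assuming a simplified `ShiftScale` structure in place of the library's own data params:

```python
from collections import namedtuple

import pandas as pd

ShiftScale = namedtuple("ShiftScale", ["shift", "scale"])


def normalize_group(df: pd.DataFrame, data_params: dict) -> pd.DataFrame:
    # simplified: apply (x - shift) / scale column by column, skipping ID,
    # without a defensive copy -- the caller now owns that decision
    for name in df.columns:
        if name == "ID":
            continue
        params = data_params[name]
        df[name] = (df[name] - params.shift) / params.scale
    return df


df_i = pd.DataFrame({"y": [10.0, 20.0, 30.0]})
print(normalize_group(df_i, {"y": ShiftScale(shift=10.0, scale=20.0)}))
```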

neuralprophet/df_utils.py

+26 -41
```diff
@@ -77,12 +77,11 @@ def return_df_in_original_format(df, received_ID_col=False, received_single_time
     pd.Dataframe
         original input format
     """
-    new_df = df.copy(deep=True)
     if not received_ID_col and received_single_time_series:
-        assert len(new_df["ID"].unique()) == 1
-        new_df.drop("ID", axis=1, inplace=True)
+        assert len(df["ID"].unique()) == 1
+        df.drop("ID", axis=1, inplace=True)
         log.info("Returning df with no ID column")
-    return new_df
+    return df
 
 
 def merge_dataframes(df: pd.DataFrame) -> pd.DataFrame:
@@ -102,7 +101,7 @@ def merge_dataframes(df: pd.DataFrame) -> pd.DataFrame:
         raise ValueError("Can not join other than pd.DataFrames")
     if "ID" not in df.columns:
         raise ValueError("df does not contain 'ID' column")
-    df_merged = df.copy(deep=True).drop("ID", axis=1)
+    df_merged = df.drop("ID", axis=1)
     df_merged = df_merged.sort_values("ds")
     df_merged = df_merged.drop_duplicates(subset=["ds"])
     df_merged = df_merged.reset_index(drop=True)
@@ -282,11 +281,8 @@ def init_data_params(
         ShiftScale entries containing ``shift`` and ``scale`` parameters for each column
     """
     # Compute Global data params
-    # df = df.copy(deep=True)
-    # df, _, _, _ = check_multiple_series_id(df)
-    df_merged = df.copy(deep=True).drop("ID", axis=1)
     global_data_params = data_params_definition(
-        df_merged, normalize, config_lagged_regressors, config_regressors, config_events, config_seasonality
+        df, normalize, config_lagged_regressors, config_regressors, config_events, config_seasonality
     )
     if global_normalization:
         log.debug(
@@ -296,7 +292,6 @@
     local_data_params = OrderedDict()
     local_run_despite_global = True if global_normalization else None
     for df_name, df_i in df.groupby("ID"):
-        df_i.drop("ID", axis=1, inplace=True)
         local_data_params[df_name] = data_params_definition(
             df=df_i,
             normalize=normalize,
@@ -378,7 +373,6 @@ def normalize(df, data_params):
     pd.DataFrame
         normalized dataframes
     """
-    df = df.copy(deep=True)
     for name in df.columns:
         if name == "ID":
             continue
@@ -428,8 +422,7 @@ def check_dataframe(
     pd.DataFrame or dict
         checked dataframe
     """
-    # df = df.copy(deep=True)
-    # df, _, _, _ = check_multiple_series_id(df)
+    # TODO: move call to check_multiple_series_id here
     if df.groupby("ID").size().min() < 1:
         raise ValueError("Dataframe has no rows.")
     if "ds" not in df:
@@ -542,7 +535,7 @@ def _crossvalidation_split_df(df, n_lags, n_forecasts, k, fold_pct, fold_overlap
     min_train = total_samples - samples_fold - (k - 1) * (samples_fold - samples_overlap)
     assert min_train >= samples_fold
     folds = []
-    df_fold = df.copy(deep=True)
+    df_fold = df
     for i in range(k, 0, -1):
         df_train, df_val = split_df(df_fold, n_lags, n_forecasts, valid_p=samples_fold, inputs_overbleed=True)
         folds.append((df_train, df_val))
@@ -635,33 +628,30 @@ def _crossvalidation_with_time_threshold(df, n_lags, n_forecasts, k, fold_pct, f
 
         validation data
     """
-    df_merged = merge_dataframes(df)
+    df_merged = merge_dataframes(df.copy(deep=True))
     total_samples = len(df_merged) - n_lags + 2 - (2 * n_forecasts)
     samples_fold = max(1, int(fold_pct * total_samples))
     samples_overlap = int(fold_overlap_pct * samples_fold)
     assert samples_overlap < samples_fold
     min_train = total_samples - samples_fold - (k - 1) * (samples_fold - samples_overlap)
     assert min_train >= samples_fold
     folds = []
-    df_fold = df
-    # df_fold = df.copy(deep=True)
-    # df_fold, _, _, _ = check_multiple_series_id(df_fold)
     for i in range(k, 0, -1):
-        threshold_time_stamp = find_time_threshold(df_fold, n_lags, n_forecasts, samples_fold, inputs_overbleed=True)
+        threshold_time_stamp = find_time_threshold(df, n_lags, n_forecasts, samples_fold, inputs_overbleed=True)
         df_train, df_val = split_considering_timestamp(
-            df_fold, n_lags, n_forecasts, inputs_overbleed=True, threshold_time_stamp=threshold_time_stamp
+            df, n_lags, n_forecasts, inputs_overbleed=True, threshold_time_stamp=threshold_time_stamp
         )
         folds.append((df_train, df_val))
         split_idx = len(df_merged) - samples_fold + samples_overlap
         df_merged = df_merged[:split_idx].reset_index(drop=True)
         threshold_time_stamp = df_merged["ds"].iloc[-1]
         df_fold_aux = pd.DataFrame()
-        for df_name, df_i in df_fold.groupby("ID"):
-            df_aux = (
-                df_i.copy(deep=True).iloc[: len(df_i[df_i["ds"] < threshold_time_stamp]) + 1].reset_index(drop=True)
-            )
+        for df_name, df_i in df.groupby("ID"):
+            # df_i = df_i.copy(deep=True)
+            df_aux = df_i.iloc[: len(df_i[df_i["ds"] < threshold_time_stamp]) + 1].reset_index(drop=True)
             df_fold_aux = pd.concat((df_fold_aux, df_aux), ignore_index=True)
-        df_fold = df_fold_aux.copy(deep=True)
+        df = df_fold_aux
+        # df = df.copy(deep=True)
     folds = folds[::-1]
     return folds
 
```
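Note the inversion here: rather than copying inside the helper, the caller now passes an explicit `df.copy(deep=True)` into `merge_dataframes` (the "explicitly copy df before calling merge" item in the commit message). A minimal illustration of that call-site-copy convention, with simplified bodies that are not the actual df_utils implementations:

```python
import pandas as pd


def merge_series(df: pd.DataFrame) -> pd.DataFrame:
    # no internal deep copy; each chained call already returns a new frame
    return (
        df.drop("ID", axis=1)
        .sort_values("ds")
        .drop_duplicates(subset=["ds"])
        .reset_index(drop=True)
    )


def cross_validate(df: pd.DataFrame) -> pd.DataFrame:
    # the caller pays for a copy only where it actually wants one
    return merge_series(df.copy(deep=True))
```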

```diff
@@ -707,7 +697,6 @@ def crossvalidation_split_df(
 
         validation data
     """
-    # df = df.copy(deep=True)
     df, _, _, _ = check_multiple_series_id(df)
     folds = []
     if len(df["ID"].unique()) == 1:
@@ -733,7 +722,7 @@
         start_date, end_date = find_valid_time_interval_for_cv(df)
         for df_name, df_i in df.groupby("ID"):
             mask = (df_i["ds"] >= start_date) & (df_i["ds"] <= end_date)
-            df_i = df_i[mask].copy(deep=True)
+            df_i = df_i[mask]
             folds_dict[df_name] = _crossvalidation_split_df(
                 df_i, n_lags, n_forecasts, k, fold_pct, fold_overlap_pct
             )
```
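Dropping `.copy(deep=True)` after the boolean mask is safe in plain pandas: boolean indexing materializes a new DataFrame rather than a view, so the selected fold does not alias the original. A quick illustrative check:

```python
import pandas as pd

df = pd.DataFrame({"ds": pd.date_range("2024-01-01", periods=5), "y": [0.0, 1.0, 2.0, 3.0, 4.0]})
mask = df["ds"] >= "2024-01-03"

subset = df[mask]        # boolean indexing copies the selected rows
df.loc[:, "y"] = -1.0    # mutate the original afterwards
print(subset["y"].tolist())  # [2.0, 3.0, 4.0] -- the subset kept its own data
```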
```diff
@@ -768,8 +757,6 @@ def double_crossvalidation_split_df(df, n_lags, n_forecasts, k, valid_pct, test_
         tuple of k tuples [(folds_val, folds_test), …]
         elements same as :meth:`crossvalidation_split_df` returns
     """
-    # df = df.copy(deep=True)
-    # df, _, _, _ = check_multiple_series_id(df)
     if len(df["ID"].unique()) > 1:
         raise NotImplementedError("double_crossvalidation_split_df not implemented for df with many time series")
     fold_pct_test = float(test_pct) / k
@@ -800,7 +787,7 @@ def find_time_threshold(df, n_lags, n_forecasts, valid_p, inputs_overbleed):
     str
         time stamp threshold defines the boundary for the train and validation sets split.
     """
-    df_merged = merge_dataframes(df)
+    df_merged = merge_dataframes(df.copy(deep=True))
     n_samples = len(df_merged) - n_lags + 2 - (2 * n_forecasts)
     n_samples = n_samples if inputs_overbleed else n_samples - n_lags
     if 0.0 < valid_p < 1.0:
@@ -842,11 +829,14 @@ def split_considering_timestamp(df, n_lags, n_forecasts, inputs_overbleed, thres
     df_val = pd.DataFrame()
     for df_name, df_i in df.groupby("ID"):
         if df[df["ID"] == df_name]["ds"].max() < threshold_time_stamp:
-            df_train = pd.concat((df_train, df_i.copy(deep=True)), ignore_index=True)
+            # df_i = df_i.copy(deep=True)
+            df_train = pd.concat((df_train, df_i), ignore_index=True)
         elif df[df["ID"] == df_name]["ds"].min() > threshold_time_stamp:
-            df_val = pd.concat((df_val, df_i.copy(deep=True)), ignore_index=True)
+            # df_i = df_i.copy(deep=True)
+            df_val = pd.concat((df_val, df_i), ignore_index=True)
         else:
-            df_aux = df_i.copy(deep=True)
+            df_aux = df_i
+            # df_i = df_i.copy(deep=True)
             n_train = len(df_aux[df_aux["ds"] < threshold_time_stamp])
             split_idx_train = n_train + n_lags + n_forecasts - 1
             split_idx_val = split_idx_train - n_lags if inputs_overbleed else split_idx_train
@@ -890,8 +880,6 @@ def split_df(
     pd.DataFrame, dict
         validation data
     """
-    # df = df.copy(deep=True)
-    # df, _, _, _ = check_multiple_series_id(df)
     df_train = pd.DataFrame()
     df_val = pd.DataFrame()
     if local_split:
@@ -1373,8 +1361,6 @@ def infer_frequency(df, freq, n_lags, min_freq_percentage=0.7):
         Valid frequency tag according to major frequency.
 
     """
-    # df = df.copy(deep=True)
-    # df, _, _, _ = check_multiple_series_id(df)
     freq_df = list()
     for df_name, df_i in df.groupby("ID"):
         freq_df.append(_infer_frequency(df_i, freq, min_freq_percentage))
@@ -1396,6 +1382,7 @@ def create_dict_for_events_or_regressors(
     df: pd.DataFrame,
     other_df: Optional[pd.DataFrame],
     other_df_name: str,
+    received_ID_col: bool,
 ) -> dict:  # Not sure about the naming of this function
     """Create a dict for events or regressors according to input df.
 
@@ -1417,12 +1404,10 @@
     if other_df is None:
         # if other_df is None, create dictionary with None for each ID
         return {df_name: None for df_name in df_names}
-    other_df = other_df.copy(deep=True)
-    other_df, received_ID_col, _, _ = check_multiple_series_id(other_df)
     # if other_df does not contain ID, create dictionary with original ID with the same other_df for each ID
     if not received_ID_col:
         other_df = other_df.drop("ID", axis=1)
-        return {df_name: other_df.copy(deep=True) for df_name in df_names}
+        return {df_name: other_df for df_name in df_names}
 
     # else, other_df does contain ID, create dict with respective IDs
     df_unique_names, other_df_unique_names = list(df["ID"].unique()), list(other_df["ID"].unique())
@@ -1438,7 +1423,7 @@
     df_other_dict = {}
     for df_name in df_unique_names:
         if df_name in other_df_unique_names:
-            df_aux = other_df[other_df["ID"] == df_name].reset_index(drop=True).copy(deep=True)
+            df_aux = other_df[other_df["ID"] == df_name].reset_index(drop=True)
             df_aux.drop("ID", axis=1, inplace=True)
         else:
             df_aux = None
```
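`create_dict_for_events_or_regressors` no longer copies `other_df` or re-runs `check_multiple_series_id` internally; the caller is expected to pass the `received_ID_col` flag it already determined. A hypothetical call site (the keyword names come from the diff; the frames and values around them are assumptions, not copied from the NeuralProphet sources):

```python
import pandas as pd

from neuralprophet import df_utils

df = pd.DataFrame({
    "ID": ["series_a", "series_a"],
    "ds": pd.date_range("2024-01-01", periods=2),
    "y": [1.0, 2.0],
})
# events frame as it might look after upstream preprocessing; the placeholder "ID"
# column stands in for what the caller's earlier check_multiple_series_id step adds
events_df = pd.DataFrame({
    "event": ["holiday"],
    "ds": [pd.Timestamp("2024-01-01")],
    "ID": ["__df__"],
})

events_dict = df_utils.create_dict_for_events_or_regressors(
    df=df,
    other_df=events_df,
    other_df_name="events",
    received_ID_col=False,  # caller supplies the flag instead of the helper inferring it
)
```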
