|
| 1 | +import os |
| 2 | +import shutil |
| 3 | + |
| 4 | +import networkx as nx |
| 5 | +import numpy as np |
| 6 | +import pandas as pd |
| 7 | +import spektral |
| 8 | +import tensorflow as tf |
| 9 | +from sklearn.model_selection import StratifiedKFold, train_test_split |
| 10 | + |
| 11 | +from model import MSDMT |
| 12 | + |
# ---------------------------------------------------------------------------
# Hyper-parameters shared by the data pipeline and the training loop.
# ---------------------------------------------------------------------------
seed_value = 2021  # random_state handed to the fold / validation splitters
lr = 1e-4          # learning rate given to the Adam optimizer
epochs = 500       # upper bound on training epochs passed to model.fit
alpha = 0.5        # loss weight of 'output_1' (binary cross-entropy head)
beta = 0.5         # loss weight of 'output_2' (mean-squared-error head)
timestep = 10      # rows per player in the portrait / behaviour tables
maxlen = 64        # behaviour sequences are padded / truncated to this length
# ---------------------------------------------------------------------------
| 22 | + |
| 23 | + |
def data_process(timestep=10, maxlen=64, data_dir='../data'):
    """Load the sample CSV files and turn them into model inputs and targets.

    Args:
        timestep: Number of consecutive rows that belong to one player in the
            portrait and behaviour-sequence tables.
        maxlen: Length every behaviour sequence is padded / truncated to.
        data_dir: Directory holding the four ``sample_data_*.csv`` files.
            Defaults to the location the script has always used.

    Returns:
        Tuple ``(U, B, A, y1, y2)``:
            U: portrait features reshaped to ``(-1, timestep, n_features)``.
            B: padded behaviour sequences reshaped to ``(-1, timestep, maxlen)``.
            A: float32 adjacency matrix after ``GCNConv.preprocess``; its
               rows/columns follow the player order of the portrait file.
            y1: ``churn_label`` as a column vector.
            y2: ``log(1 + payment_label)`` as a column vector.

    Raises:
        ValueError: If the number of distinct uids in the portrait file does
            not match the number of ``timestep``-sized groups, i.e. the table
            does not hold exactly ``timestep`` rows per player.
    """
    df_U = pd.read_csv(os.path.join(data_dir, 'sample_data_player_portrait.csv'))
    df_B = pd.read_csv(os.path.join(data_dir, 'sample_data_behavior_sequence.csv'))
    df_G = pd.read_csv(os.path.join(data_dir, 'sample_data_social_network.csv'))
    df_Y = pd.read_csv(os.path.join(data_dir, 'sample_data_label.csv'))

    # Player order as it appears in the portrait file. The reshape below
    # groups rows positionally, so this is the row order of U.
    uids = df_U['uid'].drop_duplicates().tolist()

    U = df_U.drop(['uid', 'ds'], axis=1).values
    U = U.reshape(-1, timestep, U.shape[-1])
    if len(uids) != U.shape[0]:
        raise ValueError(
            'portrait table has %d distinct uids but reshapes to %d players; '
            'expected exactly %d rows per uid' % (len(uids), U.shape[0], timestep))

    # A missing sequence becomes an empty list and is therefore all padding.
    B = df_B['seq'].apply(lambda x: x.split(',') if pd.notna(x) else []).values
    B = tf.keras.preprocessing.sequence.pad_sequences(sequences=B,
                                                      maxlen=maxlen,
                                                      padding='post')
    B = B.reshape(-1, timestep, maxlen)

    G = nx.from_pandas_edgelist(df=df_G,
                                source='src_uid',
                                target='dst_uid',
                                edge_attr=['weight'])
    # Without an explicit nodelist the adjacency rows follow the order in
    # which uids first appear in the edge list, which need not match the row
    # order of U / B / y. Players without any edge would also be missing.
    # Register every player and pin the ordering to `uids`. Graph nodes that
    # are not in the portrait file are left out of A.
    G.add_nodes_from(uids)
    A = nx.adjacency_matrix(G, nodelist=uids)
    A = spektral.layers.GCNConv.preprocess(A).astype('f4')

    y1 = df_Y['churn_label'].values.reshape(-1, 1)
    # log1p(x) == log(x + 1) but stays accurate for very small payments.
    y2 = np.log1p(df_Y['payment_label'].values).reshape(-1, 1)

    print('U:', U.shape)
    print('B:', B.shape)
    print('G:', A.shape)
    print('y1:', y1.shape, 'y2:', y2.shape)

    return U, B, A, y1, y2
| 53 | + |
| 54 | + |
# Load everything once. Each fold trains on the full graph in a single batch;
# folds differ only in which nodes the boolean masks select.
U, B, A, y1, y2 = data_process(timestep=timestep, maxlen=maxlen)
N = A.shape[0]

# seed_value was only reaching the sklearn splitters; seed NumPy and
# TensorFlow as well so weight initialisation is repeatable between runs.
np.random.seed(seed_value)
tf.random.set_seed(seed_value)

kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed_value)

# Loop-invariant: the checkpoint location is the same for every fold.
checkpoint_path = './model/checkpoint-{epoch:04d}.ckpt'
checkpoint_dir = os.path.dirname(checkpoint_path)

for fold, (train_index, test_index) in enumerate(kfold.split(U, y1), start=1):

    # Carve a validation subset (10%) out of the training indices.
    train_index, val_index = train_test_split(train_index,
                                              test_size=0.1,
                                              random_state=seed_value)

    mask_train = np.zeros(N, dtype=bool)
    mask_val = np.zeros(N, dtype=bool)
    mask_test = np.zeros(N, dtype=bool)
    mask_train[train_index] = True
    mask_val[val_index] = True
    mask_test[test_index] = True

    # Start every fold with an empty checkpoint directory.
    if os.path.exists(checkpoint_dir):
        shutil.rmtree(checkpoint_dir)

    # restore_best_weights: when training stops early, hand back the weights
    # of the best validation epoch rather than those of the last epoch, so
    # the evaluation below scores the best model.
    early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss',
                                                      patience=5,
                                                      mode='min',
                                                      restore_best_weights=True)

    best_checkpoint = tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_path,
                                                         monitor='val_loss',
                                                         verbose=1,
                                                         save_best_only=True,
                                                         save_weights_only=True,
                                                         mode='min')

    model = MSDMT(timestep=timestep, behavior_maxlen=maxlen)

    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=lr),
                  loss={'output_1': tf.keras.losses.BinaryCrossentropy(),
                        'output_2': tf.keras.losses.MeanSquaredError()},
                  loss_weights={'output_1': alpha, 'output_2': beta},
                  metrics={'output_1': tf.keras.metrics.AUC(),
                           'output_2': 'mae'})

    # The masks act as per-node sample weights: nodes outside the mask
    # contribute nothing to the loss.
    model.fit([U, B, A], [y1, y2],
              validation_data=([U, B, A], [y1, y2], mask_val),
              sample_weight=mask_train,
              batch_size=N,
              epochs=epochs,
              shuffle=False,
              callbacks=[early_stopping, best_checkpoint],
              verbose=1)

    # mask_test used to be built and then ignored, so the held-out fold was
    # never scored. Report loss / metrics on it.
    print('fold', fold, 'test evaluation:')
    model.evaluate([U, B, A], [y1, y2],
                   sample_weight=mask_test,
                   batch_size=N,
                   verbose=1)
0 commit comments