Commit ece3780

MattAlexMiracle (Alexander Mattick) and yk authored
Data analysis suite (LAION-AI#2159)
This adds in two scripts to compare rankings and votes. The rankings is relatively simple and computes the expected kendall-tau correlation of a user to the consensus vote. This is mostly for spotting bad actors. Currently it does not find statistically significant outliers that are also bad actors This doesn't mean that individual bad actors don't exist, but it does mean that either 1. there are so many of them, the ranking results are so far beyond repair that the merged rankings themselves make no sense 2. there are so few of them, that they overall don't affect the results sufficiently negatively to be noticable The truth is probably somewhere in the middle: since we only have ~3 rankers per message, if we had one that is consistently screwing with things, he would be hard to detect since by sheer luck he would agree enough with another ranker to overall vanish in the correlations. The vote aggregation script is much more sophisticated and computes optimal weightings for each power-user based on aggregations of non-power users: I now have an initial "smart" selection of messages based on a hierarchical consensus finding scheme: The idea is to split the voters into an "aggregation" and a "re-rating" group: The "aggregation" group is comprised of the set of users with less than X number of ratings. These users get aggregated into one mean (i.e. weighting 1:1) and act as the anchor. Users with more than X ratings are used to compute weightings, specifically we attempt to reweight them such that min ∑ (A u - (y + agg)/2)² subject to ∑ u = 1 with "A" being the matrix of votes done by people in the "re-rating" group and "agg" being the aggregated voter vector. y is simply the current weighting result y = A u_prev. I.e. "y" is the current target, assuming that the current u weightings are correct. After finding "better" uweightings in the least-squares optimization, I recompute the new target y and compute the next u until a fixed point is reached. The end result is a weighting of the "re-rating" group in u and a priority we can read from y. However, the choice between what people we put into the "aggregation" and "re-rating" group is pretty arbitrary based on a split with who has more than X ratings: - if we have low X then we have much much more people which can get re-weighted, but our "agg" anchor might be a lot more inaccurate - if we have high X we have a much more statistically stable "agg" anchor, but less weighting potential. The way I decided to solve this is by simply running many iterations 50≤X≤500 which gives us all (reasonable) configurations, and then giving a confidence score based on how often a message is in the top P% of messages. The method outputs a CSV file with - "message_id" - "message_tree_id" - "fracs" (what fraction of the ensemble the message_id is in) - "in_naive" (whether it would be in the naive "1 user; 1 vote" model) --------- Co-authored-by: Alexander Mattick <alex.mattick@fau.de> Co-authored-by: Yannic Kilcher <yk@users.noreply.github.com>
1 parent 89b59fd commit ece3780

File tree

3 files changed: +476 -6 lines changed

@@ -0,0 +1,323 @@
import logging
import warnings

import numpy as np
import pandas as pd
import psycopg2
from scipy.optimize import LinearConstraint, minimize
from scipy.sparse import coo_array, csr_array, csr_matrix, hstack
from scipy.special import softmax
from tqdm import trange
from tqdm.contrib.logging import logging_redirect_tqdm


def least_squares_fit(features, target, scaling=1):
    """Fit weights for every column except the last one (the aggregate anchor, which
    keeps a fixed weight of 1), constrained to sum to ``scaling``, by least squares
    against ``target``."""
    X = features  # (features - np.mean(features, 0)) / np.std(features, 0)
    # print("feature",X.shape)
    # get target
    y = target.reshape(-1)
    # Use simple imputer for mean to not change the importance of tree split
    # Create an instance of the ExtraTreesRegressor algorithm
    zX = X.toarray()
    summed_target = y  # (y+zX[:,-1])/2
    vote_matrix = csr_matrix(zX[:, :-1])
    constraint = LinearConstraint(np.ones(X.shape[-1] - 1), 1 * scaling, 1 * scaling)
    init = np.ones(X.shape[-1] - 1)  # lsqr(vote_matrix,summed_target)[0]
    init = init / np.linalg.norm(init)
    result = minimize(
        lambda x: np.sum((vote_matrix @ x - summed_target) ** 2),
        init,
        jac=lambda x: 2 * vote_matrix.T @ (vote_matrix @ x - summed_target),
        constraints=constraint,
        hess=lambda _: 2 * vote_matrix.T @ vote_matrix,
        method="trust-constr",
    )
    # result = least_squares(residual, np.ones(X.shape[-1]-1))
    # result = least_squares(zX[:,:-1], (y+zX[:,-1])/2,
    # print(result)
    return np.concatenate([result.x, np.ones(1)])


def get_df(study_label):
    """Read label votes from the database and return a sparse message x user matrix
    of ``study_label`` values, together with the message and user id lists."""
    conn = psycopg2.connect("host=0.0.0.0 port=5432 user=postgres password=postgres dbname=postgres")
    # Define the SQL query
    query = (
        "SELECT DISTINCT message_id, labels, message.user_id FROM text_labels JOIN message ON message_id = message.id;"
    )

    # Read the query results into a Pandas dataframe
    df = pd.read_sql(query, con=conn)
    print(df.head())
    # Close the database connection
    conn.close()
    users = set()
    messages = set()
    for row in df.itertuples(index=False):
        row = row._asdict()
        users.add(str(row["user_id"]))
        # for row in df.itertuples(index=False):
        #     row = row._asdict()
        messages.add(str(row["message_id"]))
    users = list(users)
    messages = list(messages)
    print("num users:", len(users), "num messages:", len(messages), "num in df", len(df))

    # arr = np.full((len(messages), len(users)), np.NaN, dtype=np.half)
    row_idx = []
    col_idx = []
    data = []

    def swap(x):
        return (x[1], x[0])

    dct = dict(map(swap, enumerate(messages)))
    print("converting messages...")
    df["message_id"] = df["message_id"].map(dct)
    print("converting users...")
    df["user_id"] = df["user_id"].map(dict(map(swap, enumerate(users))))
    print("converting labels...")
    df["labels"] = df["labels"].map(lambda x: float(x.get(study_label, 0)))
    row_idx = df["message_id"].to_numpy()
    col_idx = df["user_id"].to_numpy()
    data = df["labels"].to_numpy()
    print(data)
    print(row_idx)
    print(col_idx)
    """ for row in df.itertuples(index=False):
        row = row._asdict()
        labels = row["labels"]
        value = labels.get(study_label, None)
        if value is not None:
            # tmp=out[str(row["message_id"])]
            # tmp = np.array(tmp)
            # tmp[users.index(row["user_id"])] = value
            # out[str(row["message_id"])] = np.array(tmp)
            # print(out[str(row["message_id"])].density)
            row_idx.append(messages.index(str(row["message_id"])))
            col_idx.append(users.index(str(row["user_id"])))
            data.append(value)
            #arr[mid, uid] = value """
    arr = csr_array(coo_array((data, (row_idx, col_idx))))
    print("results", len(users), arr.shape)
    # df = pd.DataFrame.from_dict(out,orient="index")
    print("generated dataframe")
    return arr, messages, users


def reweight_features(features, weights, noise_scale=0.0):
    # X = df.drop(target_col, axis=1)
    # print("info",features.shape,weights.shape)
    # X = (features - np.mean(features, 0).reshape(1,-1)) / np.std(features, 0).reshape(1,-1)
    noise = np.random.randn(weights.shape[0]) * noise_scale
    weights = weights + noise
    # normalizer = (X.notna().astype(float) * weights).sum(skipna=True, axis=1)
    values = features @ weights
    # values = values / normalizer
    return values


def get_subframe(arr, columns_to_filter):
    # return np.delete(arr, columns_to_filter, axis=1)
    """
    Remove the columns denoted by ``columns_to_filter`` from the CSR sparse array ``arr``.
    """
    if not isinstance(arr, csr_array):
        raise ValueError("works only for CSR format -- use .tocsr() first")
    indices = list(columns_to_filter)
    mask = np.ones(arr.shape[1], dtype=bool)
    mask[indices] = False
    return arr[:, mask]


def sample_importance_weights(importance_weights, temperature=1.0):
    weights = softmax(
        abs(importance_weights) / temperature,
    )
    column = np.random.choice(len(importance_weights), p=weights)
    return column


def make_random_testframe(num_rows, num_cols, frac_missing):
    data = np.random.rand(num_rows, num_cols).astype(np.float16)
    mask = np.random.rand(num_rows, num_cols) < frac_missing
    data[mask] = np.nan
    return data


def combine_underrepresented_columns(arr, num_instances):
    """Merge all user columns with fewer than ``num_instances`` nonzero votes into a
    single mean column, appended as the last column (the "agg" anchor)."""
    # 1. get the mask
    mask = arr != 0
    to_combine = mask.sum(0) < num_instances
    # print("to combine", mask.sum(0))
    # print("combining", to_combine.astype(int).sum().tolist(), "many columns")
    if not any(to_combine):
        return arr
    # mean = np.zeros(arr.shape[0])
    # for i in to_combine.tolist():
    #     mean = np.nansum(np.stack(arr[:,i],mean),0)
    # mean = mean/len(to_combine)
    mean = np.mean(arr[:, to_combine], 1).reshape(-1, 1)
    # print("mean shape",mean.shape)
    dp = np.arange(len(to_combine))[to_combine]
    # print("removing unused columns")
    arr = get_subframe(arr, dp)
    # print("subframe shape",arr.shape)
    arr = hstack([arr, mean])
    # print("out arr", arr.shape)
    # print((mean==0).astype(int).sum())
    return arr


def importance_votes(arr, to_fit=10, init_weight=None):
    """Iterate the constrained least-squares reweighting until the weighted scores
    reach a fixed point; returns the final per-message scores and the column weights."""
    # arr = combine_underrepresented_columns(matrix,underrepresentation_thresh)
    filtered_columns = []
    weighter = None
    if init_weight is None:
        weighter = np.ones(arr.shape[1]) / arr.shape[1]  # pd.Series(1.0, index=df.drop(columns=target).columns)
    else:
        weighter = init_weight
    # print(arr.shape)
    index = np.arange(arr.shape[1])
    # subtract 1: the last one will always have maximal reduction!
    bar = trange(to_fit)
    target = np.ones(arr.shape[0])
    for i in bar:
        index = list(filter(lambda x: x not in filtered_columns, index))
        # 0. produce target column:
        # print("step 0")
        target_old = target
        target = reweight_features(arr, weighter)
        error = np.mean((target - target_old) ** 2)
        bar.set_description(f"expected error: {error}", refresh=True)
        if error < 1e-10:
            break
        # 1. get a subframe of interesting features
        # print("step 1")
        # subframe = get_subframe(arr, filtered_columns)
        # 2. compute feature importance
        # print("step 2")
        # importance_weights=None
        # importance_weights = compute_feature_importance(arr, target, index)
        weighter = least_squares_fit(arr, target)
        # 3. sample column
        # print("step 3")
        # new_column = sample_importance_weights(importance_weights["importance"], temperature)
        # new_column=index[new_column]
        # value = importance_weights["importance"][new_column]
        # print(weighter.shape, importance_weights["importance"].shape)
        # weighter += alpha[i] * importance_weights["importance"].to_numpy()
        # normalize to maintain the "1-voter one vote" total number of votes!
        # weighter = weighter / sum(abs(weighter))
        # stepsize = np.mean(abs(importance_weights["importance"].to_numpy()))
        # bar.set_description(f"expected stepsize: {stepsize}", refresh=True)
        # filtered_columns.append(new_column)
        # print("new weight values", weighter)
    return reweight_features(arr, weighter), weighter


def select_ids(arr, pick_frac, minima=(50, 500), folds=50, to_fit=200, frac=0.6):
    """
    Selects the top ``pick_frac`` fraction of messages from ``arr`` after merging all
    users with fewer than ``minima`` votes (the threshold increases linearly from the
    minimum to the maximum across the ``folds`` iterations).
    Returns all messages that appear in at least a ``frac`` fraction of these selections.
    """
    votes = []
    minima = np.linspace(*minima, num=folds, dtype=int)
    num_per_iter = int(arr.shape[0] * pick_frac)
    writer_num = 0
    tmp = None
    for i in trange(folds):
        tofit = combine_underrepresented_columns(arr, minima[i])
        if tofit.shape[1] == writer_num:
            print("already tested these writer counts, skipping and using cached value.....")
            votes.append(tmp)
            continue
        writer_num = tofit.shape[1]
        # print("arr shape", arr.shape)
        init_weight = np.ones(tofit.shape[1]) / tofit.shape[1]
        out, weight = importance_votes(tofit, init_weight=init_weight, to_fit=to_fit)
        # print(i, "final weight")
        # print(weight)
        # mask =(out>thresh)
        # out = np.arange(arr.shape[0])[mask]
        indices = np.argpartition(out, -num_per_iter)[-num_per_iter:]
        tmp = np.zeros((arr.shape[0]))
        tmp[indices] = 1
        votes.append(tmp)
        # votes.append(indices.tolist())
    # print(*[f"user_id: {users[idx]} {m}±{s}" for m, s, idx in zip(weights_mean, weights_std, range(len(weights_mean)))], sep="\n")
    out = []
    votes = np.stack(votes, axis=0)
    print("votespace", votes.shape)
    votes = np.mean(votes, 0)
    for idx, f in enumerate(votes):
        if f > frac:
            out.append((idx, f))
    return out


LOG = logging.getLogger(__name__)

if __name__ == "__main__":
    warnings.filterwarnings("ignore", category=FutureWarning)
    warnings.filterwarnings("ignore", category=DeprecationWarning)
    warnings.simplefilter("ignore")
    logging.captureWarnings(True)
    logging.basicConfig(level=logging.ERROR)
    # Generate some example data
    # df = make_random_testframe(100_000,5000,0.99)
    df, message_ids, users = get_df("quality")
    print("combining columns:")
    # df = combine_underrepresented_columns(df, 100)
    weights = np.ones(df.shape[-1])
    y = reweight_features(df, weights)
    num_per_iter = int(df.shape[0] * 0.5)
    naive = np.argpartition(y, -num_per_iter)[-num_per_iter:]

    print("after preprocessing")
    # print(df)
    # preproc input

    # Compute feature importances
    # y = reweight_features(df,np.ones(df.shape[1]))
    # importance_weights = compute_feature_importance(df, y, list(range(df.shape[1])))
    # Print the importance weights for each feature
    # print(importance_weights)

    print("STARTING RUN")

    # sampled_columns = sample_importance_weights(
    #     importance_weights["importance"],
    # )
    # print("sampled column", sampled_columns)
    # print("compute importance votes:")
    # weighted_votes, weightings = importance_votes(df)
    # print(weighted_votes)
    # print(weightings)
    with logging_redirect_tqdm():
        print("selected ids")
        ids = select_ids(df, 0.5, folds=500)

    # print(res, frac)
    conn = psycopg2.connect("host=0.0.0.0 port=5432 user=postgres password=postgres dbname=postgres")
    # Define the SQL query
    # , payload#>'{payload, text}' as text
    query = "SELECT DISTINCT id as message_id, message_tree_id FROM message;"
    print("selected", len(ids), "messages")
    # Read the query results into a Pandas dataframe
    df = pd.read_sql(query, con=conn)
    out = []
    fracs = []
    in_naive = []
    for i, frac in ids:
        res = message_ids[i]
        out.append((df.loc[df["message_id"] == res]))
        fracs.append(frac)
        in_naive.append(i in naive)
    df = pd.concat(out)
    df["fracs"] = fracs
    df["in_naive"] = in_naive
    print(df.shape)
    print("differences from naive", len(in_naive) - sum(in_naive))
    print(df)
    df.to_csv("output.csv")
