|
| 1 | +import json |
| 2 | +from pathlib import Path |
| 3 | + |
| 4 | +import polars as pl |
| 5 | +from tqdm import tqdm |
| 6 | + |
# Sets up paths: the raw parquet files live under <folder_path>/processed,
# and the final conversation trees are written to one .jsonl file.
# TODO: Source paths from env file
path_string = "PUT THE PATH HERE TO WHERE YOU STORED THE PARQUET FILES"
folder_path = Path(path_string)
processed_folder_path = folder_path / "processed"
output_path = folder_path / "twitter-conv-trees.jsonl"

# Get parq files; sorted() gives a deterministic processing order.
parq_files = sorted(processed_folder_path.rglob("*.parquet"))
| 16 | + |
# Subset of tweet columns to load from each parquet file; everything else is
# dropped at read time to keep memory usage down.
wanted_cols = [
    "timestamp_ms",
    "id",
    "text",
    "truncated",
    "in_reply_to_status_id",
    "in_reply_to_user_id",
    "is_quote_status",
    "quote_count",
    "reply_count",
    "retweet_count",
    "favorite_count",
    "filter_level",
    "lang",
    "possibly_sensitive",
    "hashtags",
    "user_id",
    "user_verified",
    "user_statuses_count",
    "user_followers_count",
][:19] if False else [
    "timestamp_ms",
    "id",
    "text",
    "truncated",
    "in_reply_to_status_id",
    "in_reply_to_user_id",
    "is_quote_status",
    "quote_count",
    "reply_count",
    "retweet_count",
    "favorite_count",
    "filter_level",
    "lang",
    "possibly_sensitive",
    "hashtags",
    "user_id",
    "user_verified",
    "user_followers_count",
    "user_statuses_count",
]
| 38 | + |
# Load parqs into a list. Using Polars for performance reasons.
# Idiom fix: list comprehension instead of a manual append loop (ruff PERF401).
df_list = [pl.read_parquet(p, columns=wanted_cols) for p in parq_files]

# Create major dataframe.
# This can be done incrementally if RAM is constrained by modifying the above code.
p_df = pl.concat(df_list)

# Drop the per-file frames so only the concatenated frame keeps the memory.
del df_list
| 50 | + |
# Get tweets that are replies to other tweets.
# API fix: is_not_null() replaces the deprecated is_null().is_not() chain.
p_df_replies_only = p_df.filter(pl.col("in_reply_to_status_id").is_not_null())

# Count replies per replied-to status id to see the most replied-to statuses.
# This can take some time.
# API fix: group_by/.agg(pl.len()) and sort(descending=...) replace the
# removed groupby()/.count()/sort(reverse=...) spellings; the aggregate is
# aliased to "count" so downstream column names are unchanged.
p_df_group_reply_to_status = (
    p_df_replies_only.group_by("in_reply_to_status_id")
    .agg(pl.len().alias("count"))
    .sort("count", descending=True)
)

# Save output of grouping the top replied-to statuses.
group_reply_parq = folder_path / "group_reply_parq.parquet"
p_df_group_reply_to_status.write_parquet(group_reply_parq)

# Inner-join the main dataframe with the reply counts: keeps only tweets whose
# id shows up as somebody's in_reply_to_status_id, i.e. tweets that have replies.
p_join = p_df.join(p_df_group_reply_to_status, left_on="id", right_on="in_reply_to_status_id", how="inner")

# Save output of tweets that have replies.
tweets_that_have_replies_path = folder_path / "tweets_that_have_replies.parquet"
p_join.write_parquet(tweets_that_have_replies_path)

# Save output of tweets that are replies to other tweets.
tweets_that_are_replies_path = folder_path / "tweets_that_are_replies.parquet"
p_df_replies_only.write_parquet(tweets_that_are_replies_path)

# Filter the tweets that have replies to ones that aren't replies to others.
# Also filter for only English for now.
# This gives the root tweets that have replies but are the start of a conversation.
origin_tweets = p_join.filter((pl.col("in_reply_to_status_id").is_null()) & (pl.col("lang") == "en"))
| 77 | + |
| 78 | +# Helper functions and classes below for the next steps |
| 79 | + |
| 80 | + |
def role_decide(user_id, prompt_user):
    """Return the conversation role for a tweet author.

    The user who started the thread is the "prompter"; everyone
    replying in the thread is treated as an "assistant".
    """
    return "prompter" if user_id == prompt_user else "assistant"
| 86 | + |
| 87 | + |
class ConversationTreeNode:
    """One tweet in a conversation tree, with its replies attached as children.

    The node's row dict (minus the tweet text, which is hoisted to ``self.text``)
    is kept in ``self.metadata``; ``self.children`` is either ``None`` (leaf) or
    a list of ConversationTreeNode built recursively from the replies frame.
    """

    def __init__(self, tweet_id, prompt_user, from_df, children_df, metadata=None):
        # Reuse the caller-supplied row dict when given; otherwise look the
        # tweet up by id in from_df.
        if metadata:
            self.metadata = metadata
        else:
            self.metadata = from_df.filter(pl.col("id") == tweet_id).to_dicts()[0]

        self.metadata["prompt_user"] = prompt_user
        self.role = role_decide(self.metadata["user_id"], prompt_user)
        self.children = None
        # Hoist the tweet text out of the metadata dict onto the node itself.
        self.text = self.metadata.pop("text")
        self.get_children(tweet_id=tweet_id, children_df=children_df)

    def get_children(self, tweet_id, children_df):
        """Recursively attach all direct replies to this tweet as child nodes."""
        reply_rows = children_df.filter(pl.col("in_reply_to_status_id") == tweet_id).to_dicts()
        if reply_rows:
            self.children = [
                ConversationTreeNode(
                    tweet_id=row["id"],
                    prompt_user=self.metadata["prompt_user"],
                    from_df=children_df,
                    children_df=children_df,
                    metadata=row,
                )
                for row in reply_rows
            ]
| 117 | + |
| 118 | + |
class ConversationTree:
    """A whole conversation: a root ConversationTreeNode plus tree-level metadata.

    ``metadata`` is reserved for tree-level information and starts out ``None``.
    """

    def __init__(self, tweet_id, prompt_user, from_df, children_df, r_metadata=None):
        self.metadata = None
        self.root = ConversationTreeNode(
            tweet_id=tweet_id,
            prompt_user=prompt_user,
            from_df=from_df,
            children_df=children_df,
            metadata=r_metadata,
        )
| 126 | + |
| 127 | + |
# Create one conversation tree per English root tweet. The node constructor
# recurses through the replies frame, so this loop can take a while.
conv_tree_list = [
    ConversationTree(
        tweet_id=row["id"],
        prompt_user=row["user_id"],
        from_df=origin_tweets,
        children_df=p_df_replies_only,
        r_metadata=row,
    )
    for row in tqdm(origin_tweets.to_dicts())
]

# Write conversation trees to jsonl file, one tree per line.
# Might need to clean up the last newline.
with open(output_path, "w") as output:
    for tree in tqdm(conv_tree_list):
        # default= lets json serialize the tree/node objects via their __dict__.
        json.dump(obj=tree, fp=output, default=lambda node: node.__dict__)
        output.write("\n")
0 commit comments