From 27dfe61e776d8de58203d966f9a4bebde788a3ac Mon Sep 17 00:00:00 2001
From: Jaivardhan Shukla <93859359+jaivsh@users.noreply.github.com>
Date: Thu, 10 Aug 2023 23:31:05 +0530
Subject: [PATCH 1/3] Create reddit.py

---
 Reddit Scraper/reddit.py | 410 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 410 insertions(+)
 create mode 100644 Reddit Scraper/reddit.py

diff --git a/Reddit Scraper/reddit.py b/Reddit Scraper/reddit.py
new file mode 100644
index 0000000000..f927066c06
--- /dev/null
+++ b/Reddit Scraper/reddit.py
@@ -0,0 +1,410 @@
+import requests
+from bs4 import BeautifulSoup
+
+
+class Reddit:
+    """
+    Create an instance of `Reddit` class.\n
+    ```python
+    posts = Reddit()
+    ```
+    | Methods          | Details |
+    | ---------------- | ------------------------------------------------------------------------------------------------------------------------------------ |
+    | `.getFeed()`     | Returns the posts with title, descriptions, subreddit, subreddit avatar, time, vote and comment count, image, category and link |
+    | `.get_best()`    | Returns the best posts with title, descriptions, subreddit, subreddit avatar, time, vote and comment count, image, category and link |
+    | `.get_hot()`     | Returns the hot posts with title, descriptions, subreddit, subreddit avatar, time, vote and comment count, image, category and link |
+    | `.get_top()`     | Returns the top posts with title, descriptions, subreddit, subreddit avatar, time, vote and comment count, image, category and link |
+    | `.search(topic)` | Returns the top posts with title, subreddit, subreddit avatar, date, vote and comment count and link for a searched topic |
+    """
+
+    def getFeed(self):
+        """
+        Class - `Reddit`
+        Example:
+        ```python
+        posts = Reddit()
+        posts.getFeed()
+        ```
+        Returns:
+        ```js
+        {
+            "title": Title of the post
+            "description": Description of the post
+            "subreddit": subreddit name,
+            "subreddit_avatar": subreddit avatar,
+            "time": Time the post was posted,
+            "vote_count": No. of votes of the post,
+            "comment_count": No. of comments of the post,
+            "img_url": URL of any image provided in the post,
+            "category": Category of the post,
+        }
+        ```
+        """
+        url = "https://www.reddit.com/"
+        try:
+            res = requests.get(url)
+            soup = BeautifulSoup(res.text, "html.parser")
+
+            posts_data = {"posts": []}
+
+            posts = soup.find_all("div", attrs={"data-testid": "post-container"})
+
+            for p in posts:
+                base = p.find("a", attrs={"data-click-id": "subreddit"})
+                subreddit = base["href"]
+                subreddit_avatar = base.find("img")["src"]
+                title = p.find("h3", class_="_eYtD2XCVieq6emjKBH3m").getText()
+                try:
+                    desc = p.find("p").getText()
+                except:
+                    desc = ""
+                votes = p.find(
+                    "div", class_="_1rZYMD_4xY3gRcSS3p8ODO _3a2ZHWaih05DgAOtvu6cIo"
+                ).getText()
+                time = p.find("span", class_="_2VF2J19pUIMSLJFky-7PEI").getText()
+                comment_count = p.find("span", class_="FHCV02u6Cp2zYL0fhQPsO").getText()
+                try:
+                    img = p.find("img", alt="Post image")["src"]
+                except:
+                    img = ""
+                try:
+                    category = p.find(
+                        "span",
+                        class_="_1jNPl3YUk6zbpLWdjaJT1r _2VqfzH0dZ9dIl3XWNxs42y aJrgrewN9C8x1Fusdx4hh _1Dl-kvSxyJMWO9nuoTof8N",
+                    ).getText()
+                except:
+                    category = ""
+                link = p.find(
+                    "a", class_="SQnoC3ObvgnGjWt90zD9Z _2INHSNB8V5eaWp4P0rY_mE"
+                )["href"]
+
+                posts_data["posts"].append(
+                    {
+                        "title": title,
+                        "description": desc,
+                        "subreddit": subreddit,
+                        "subreddit_avatar": subreddit_avatar,
+                        "time": time,
+                        "vote_count": votes,
+                        "comment_count": comment_count,
+                        "img_url": img,
+                        "category": category,
+                        "link": link,
+                    }
+                )
+            return posts_data
+        except:
+            return None
+
+    def get_best(self):
+        """
+        Class - `Reddit`
+        Example:
+        ```python
+        posts = Reddit()
+        posts.get_best()
+        ```
+        Returns:
+        ```js
+        [
+            {
+                "title":"This is Child abuse. Who gave them permission to procreate?",
+                "description":"",
+                "subreddit":"/r/facepalm/",
+                "subreddit_avatar":"https://styles.redditmedia.com/t5_2r5rp/styles/communityIcon_2c4ms7mggreb1.png",
+                "time":"13 hours ago",
+                "vote_count":"28.0k",
+                "comment_count":"2.5k comments",
+                "img_url":"https://preview.redd.it/75z7yw2hlyeb1.png?width=640&crop=smart&auto=webp&s=70556d4e2753676d9d4d23207321235ab1c0c28e",
+                "category":"",
+                "link":"/r/facepalm/comments/15d18ak/this_is_child_abuse_who_gave_them_permission_to/"
+            }
+            ...
+        ]
+        ```
+        """
+        url = "https://www.reddit.com/r/popular/best/?feedViewType=classicView"
+        try:
+            res = requests.get(url)
+            soup = BeautifulSoup(res.text, "html.parser")
+
+            posts_data = {"posts": []}
+
+            posts = soup.find_all("div", attrs={"data-testid": "post-container"})
+
+            for p in posts:
+                base = p.find("a", attrs={"data-click-id": "subreddit"})
+                subreddit = base["href"]
+                subreddit_avatar = base.find("img")["src"]
+                title = p.find("h3", class_="_eYtD2XCVieq6emjKBH3m").getText()
+                try:
+                    desc = p.find("p").getText()
+                except:
+                    desc = ""
+                votes = p.find(
+                    "div", class_="_1rZYMD_4xY3gRcSS3p8ODO _3a2ZHWaih05DgAOtvu6cIo"
+                ).getText()
+                time = p.find("span", class_="_2VF2J19pUIMSLJFky-7PEI").getText()
+                comment_count = p.find("span", class_="FHCV02u6Cp2zYL0fhQPsO").getText()
+                try:
+                    img = p.find("img", alt="Post image")["src"]
+                except:
+                    img = ""
+                try:
+                    category = p.find(
+                        "span",
+                        class_="_1jNPl3YUk6zbpLWdjaJT1r _2VqfzH0dZ9dIl3XWNxs42y aJrgrewN9C8x1Fusdx4hh _1Dl-kvSxyJMWO9nuoTof8N",
+                    ).getText()
+                except:
+                    category = ""
+                link = p.find(
+                    "a", class_="SQnoC3ObvgnGjWt90zD9Z _2INHSNB8V5eaWp4P0rY_mE"
+                )["href"]
+
+                posts_data["posts"].append(
+                    {
+                        "title": title,
+                        "description": desc,
+                        "subreddit": subreddit,
+                        "subreddit_avatar": subreddit_avatar,
+                        "time": time,
+                        "vote_count": votes,
+                        "comment_count": comment_count,
+                        "img_url": img,
+                        "category": category,
+                        "link": link,
+                    }
+                )
+            return posts_data["posts"]
+        except:
+            return None
+
+    def get_hot(self):
+        """
+        Class - `Reddit`
+        Example:
+        ```python
+        posts = Reddit()
+        posts.get_hot()
+        ```
+        Returns:
+        ```js
+        [
+            {
+                "title":"Catching a ball while smoking and looking cool",
+                "description":"",
+                "subreddit":"/r/nextfuckinglevel/",
+                "subreddit_avatar":"https://styles.redditmedia.com/t5_m0bnr/styles/communityIcon_qanlm185crr71.png",
+                "time":"7 hours ago",
+                "vote_count":"2.4k",
+                "comment_count":"80 comments",
+                "img_url":"",
+                "category":"",
+                "link":"/r/nextfuckinglevel/comments/15d9yg7/catching_a_ball_while_smoking_and_looking_cool/"
+            }
+            ...
+        ]
+        ```
+        """
+        url = "https://www.reddit.com/r/popular/hot/?feedViewType=cardView"
+        try:
+            res = requests.get(url)
+            soup = BeautifulSoup(res.text, "html.parser")
+
+            posts_data = {"posts": []}
+
+            posts = soup.find_all("div", attrs={"data-testid": "post-container"})
+
+            for p in posts:
+                base = p.find("a", attrs={"data-click-id": "subreddit"})
+                subreddit = base["href"]
+                subreddit_avatar = base.find("img")["src"]
+                title = p.find("h3", class_="_eYtD2XCVieq6emjKBH3m").getText()
+                try:
+                    desc = p.find("p").getText()
+                except:
+                    desc = ""
+                votes = p.find(
+                    "div", class_="_1rZYMD_4xY3gRcSS3p8ODO _3a2ZHWaih05DgAOtvu6cIo"
+                ).getText()
+                time = p.find("span", class_="_2VF2J19pUIMSLJFky-7PEI").getText()
+                comment_count = p.find("span", class_="FHCV02u6Cp2zYL0fhQPsO").getText()
+                try:
+                    img = p.find("img", alt="Post image")["src"]
+                except:
+                    img = ""
+                try:
+                    category = p.find(
+                        "span",
+                        class_="_1jNPl3YUk6zbpLWdjaJT1r _2VqfzH0dZ9dIl3XWNxs42y aJrgrewN9C8x1Fusdx4hh _1Dl-kvSxyJMWO9nuoTof8N",
+                    ).getText()
+                except:
+                    category = ""
+                link = p.find(
+                    "a", class_="SQnoC3ObvgnGjWt90zD9Z _2INHSNB8V5eaWp4P0rY_mE"
+                )["href"]
+
+                posts_data["posts"].append(
+                    {
+                        "title": title,
+                        "description": desc,
+                        "subreddit": subreddit,
+                        "subreddit_avatar": subreddit_avatar,
+                        "time": time,
+                        "vote_count": votes,
+                        "comment_count": comment_count,
+                        "img_url": img,
+                        "category": category,
+                        "link": link,
+                    }
+                )
+            return posts_data["posts"]
+        except:
+            return None
+
+    def get_top(self):
+        """
+        Class - `Reddit`
+        Example:
+        ```python
+        posts = Reddit()
+        posts.get_top()
+        ```
+        Returns:
+        ```js
+        {
+            "title": Title of the post
+            "description": Description of the post
+            "subreddit": subreddit name,
+            "subreddit_avatar": subreddit avatar,
+            "time": Time the post was posted,
+            "vote_count": No. of votes of the post,
+            "comment_count": No. of comments of the post,
+            "img_url": URL of any image provided in the post,
+            "category": Category of the post,
+        }
+        ```
+        """
+        url = "https://www.reddit.com/r/popular/top/"
+        try:
+            res = requests.get(url)
+            soup = BeautifulSoup(res.text, "html.parser")
+
+            posts_data = {"posts": []}
+
+            posts = soup.find_all("div", attrs={"data-testid": "post-container"})
+            for p in posts:
+                base = p.find("a", attrs={"data-click-id": "subreddit"})
+                subreddit = base["href"]
+                subreddit_avatar = base.find("img")["src"]
+                title = p.find("h3", class_="_eYtD2XCVieq6emjKBH3m").getText()
+                try:
+                    desc = p.find("p").getText()
+                except:
+                    desc = ""
+                votes = p.find(
+                    "div", class_="_1rZYMD_4xY3gRcSS3p8ODO _3a2ZHWaih05DgAOtvu6cIo"
+                ).getText()
+                time = p.find("span", class_="_2VF2J19pUIMSLJFky-7PEI").getText()
+                comment_count = p.find("span", class_="FHCV02u6Cp2zYL0fhQPsO").getText()
+                try:
+                    img = p.find("img", alt="Post image")["src"]
+                except:
+                    img = ""
+                try:
+                    category = p.find(
+                        "span",
+                        class_="_1jNPl3YUk6zbpLWdjaJT1r _2VqfzH0dZ9dIl3XWNxs42y aJrgrewN9C8x1Fusdx4hh _1Dl-kvSxyJMWO9nuoTof8N",
+                    ).getText()
+                except:
+                    category = ""
+                link = p.find(
+                    "a", class_="SQnoC3ObvgnGjWt90zD9Z _2INHSNB8V5eaWp4P0rY_mE"
+                )["href"]
+                posts_data["posts"].append(
+                    {
+                        "title": title,
+                        "description": desc,
+                        "subreddit": subreddit,
+                        "subreddit_avatar": subreddit_avatar,
+                        "time": time,
+                        "vote_count": votes,
+                        "comment_count": comment_count,
+                        "img_url": img,
+                        "category": category,
+                        "link": link,
+                    }
+                )
+            return posts_data["posts"]
+        except:
+            return None
+
+    def search(self, topic):
+        """
+        Class - `Reddit`
+        Example:
+        ```python
+        posts = Reddit()
+        posts.search("github")
+        ```
+        Returns:
+        ```js
+        [
+            {
+                "title":"What is the best self-hosted Github alternative?",
+                "subreddit":"r/selfhosted",
+                "subreddit_avatar":"https://styles.redditmedia.com/t5_32hch/styles/communityIcon_b2t5inv46z331.png",
+                "date":"2023-07-26",
+                "vote_count":"32",
+                "comment_count":"62",
+                "link":"https://www.reddit.com/r/selfhosted/comments/15a0lic/what_is_the_best_selfhosted_github_alternative/"
+            }
+            ...
+        ]
+        ```
+        """
+        url = "https://www.reddit.com/search/?q=" + topic
+        try:
+            res = requests.get(url)
+            soup = BeautifulSoup(res.text, "html.parser")
+
+            posts_data = {"posts": []}
+
+            posts = soup.find_all("div", class_="pb-xl")
+            for p in posts:
+                try:
+                    title = (
+                        p.find("a", attrs={"data-testid": "post-title"})
+                        .getText()
+                        .strip()
+                    )
+                    subreddit = p.find("a").getText().strip()
+                    date = p.find("faceplate-timeago")["ts"][0:10]
+                    base = p.find(
+                        "div", class_="text-neutral-content-weak text-12"
+                    ).find_all("span")
+                    upvotes = base[0].find("faceplate-number")["number"]
+                    comment_count = base[2].find("faceplate-number")["number"]
+                    try:
+                        subreddit_img = p.find("faceplate-img")["src"]
+                    except:
+                        subreddit_img = ""
+                    link = p.find("a", attrs={"data-testid": "post-title"})["href"]
+                except:
+                    # skip posts that fail to parse instead of appending stale/undefined values
+                    continue
+                posts_data["posts"].append(
+                    {
+                        "title": title,
+                        "subreddit": subreddit,
+                        "subreddit_avatar": subreddit_img,
+                        "date": date,
+                        "vote_count": upvotes,
+                        "comment_count": comment_count,
+                        "link": "https://www.reddit.com" + link,
+                    }
+                )
+            return posts_data["posts"]
+        except:
+            return None
+posts = Reddit()
+print(posts.search("github"))

From 0a836b4953998815c5e24515bf9ddded0da1c168 Mon Sep 17 00:00:00 2001
From: Jaivardhan Shukla <93859359+jaivsh@users.noreply.github.com>
Date: Thu, 10 Aug 2023 23:32:55 +0530
Subject: [PATCH 2/3] Create requirements.txt

---
 Reddit Scraper/requirements.txt | 3 +++
 1 file changed, 3 insertions(+)
 create mode 100644 Reddit Scraper/requirements.txt

diff --git a/Reddit Scraper/requirements.txt b/Reddit Scraper/requirements.txt
new file mode 100644
index 0000000000..5d3386da47
--- /dev/null
+++ b/Reddit Scraper/requirements.txt
@@ -0,0 +1,3 @@
+beautifulsoup4==4.9.1
+bs4==0.0.1
+requests==2.31.0

From f6c947af26271e41f52bda94ea34633fdc3d6e45 Mon Sep 17 00:00:00 2001
From: Jaivardhan Shukla <93859359+jaivsh@users.noreply.github.com>
Date: Thu, 10 Aug 2023 23:33:28 +0530
Subject: [PATCH 3/3] Create README.md

---
 Reddit Scraper/README.md | 3 +++
 1 file changed, 3 insertions(+)
 create mode 100644 Reddit Scraper/README.md

diff --git a/Reddit Scraper/README.md b/Reddit Scraper/README.md
new file mode 100644
index 0000000000..1c1b3099ed
--- /dev/null
+++ b/Reddit Scraper/README.md
@@ -0,0 +1,3 @@
+## Reddit Scraper
+
+This script scrapes the feeds for a particular query from the reddit website.
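
For context, a minimal usage sketch of the scraper these patches add; it is not part of the diff itself and assumes the interpreter is started inside the `Reddit Scraper` directory with the pinned `requirements.txt` dependencies installed:

```python
# Usage sketch for the Reddit scraper introduced in PATCH 1/3.
# Assumes reddit.py is importable (e.g. run from the "Reddit Scraper" folder)
# and that beautifulsoup4/requests from requirements.txt are installed.
from reddit import Reddit

posts = Reddit()

feed = posts.getFeed()            # {"posts": [...]} for the front page, or None on failure
best = posts.get_best()           # list of r/popular "best" posts, or None on failure
results = posts.search("github")  # list of search results, or None on failure

if results:
    for post in results[:5]:
        print(post["title"], "->", post["link"])
```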