From 4dc00a455b19c4ac7b354d7796e94f665fd88b69 Mon Sep 17 00:00:00 2001 From: Praveen Date: Thu, 13 Oct 2022 11:29:09 +0530 Subject: [PATCH 1/8] update crawl_google_results.py --- web_programming/crawl_google_results.py | 33 +++++++++++-------------- 1 file changed, 15 insertions(+), 18 deletions(-) diff --git a/web_programming/crawl_google_results.py b/web_programming/crawl_google_results.py index a33a3f3bbe5c..e54d0e817e8a 100644 --- a/web_programming/crawl_google_results.py +++ b/web_programming/crawl_google_results.py @@ -1,24 +1,21 @@ -import sys +from sys import argv +from urllib.parse import quote +from requests import get import webbrowser - -import requests from bs4 import BeautifulSoup -from fake_useragent import UserAgent if __name__ == "__main__": + if len(argv) > 1: + query = '%20'.join(argv[1:]) + else: + query = quote(str(input("Search: "))) + print("Googling.....") - url = "https://www.google.com/search?q=" + " ".join(sys.argv[1:]) - res = requests.get(url, headers={"UserAgent": UserAgent().random}) - # res.raise_for_status() - with open("project1a.html", "wb") as out_file: # only for knowing the class - for data in res.iter_content(10000): - out_file.write(data) - soup = BeautifulSoup(res.text, "html.parser") - links = list(soup.select(".eZt8xd"))[:5] - print(len(links)) - for link in links: - if link.text == "Maps": - webbrowser.open(link.get("href")) - else: - webbrowser.open(f"http://google.com{link.get('href')}") + url = f"https://www.google.com/search?q={query}&num=2" + + res = get(url, headers={"User-Agent" : "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:98.0) Gecko/20100101 Firefox/98.0"}) + + link = BeautifulSoup(res.text, "html.parser").find("div", attrs={"class" : "yuRUbf"}).find("a").get("href") + + webbrowser.open(link) \ No newline at end of file From cb9661cc4479757a822219ed6a9656cc1849480c Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 13 Oct 2022 06:12:21 +0000 Subject: [PATCH 2/8] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- web_programming/crawl_google_results.py | 23 +++++++++++++++++------ 1 file changed, 17 insertions(+), 6 deletions(-) diff --git a/web_programming/crawl_google_results.py b/web_programming/crawl_google_results.py index e54d0e817e8a..8893d0afcaa6 100644 --- a/web_programming/crawl_google_results.py +++ b/web_programming/crawl_google_results.py @@ -1,12 +1,13 @@ +import webbrowser from sys import argv from urllib.parse import quote -from requests import get -import webbrowser + from bs4 import BeautifulSoup +from requests import get if __name__ == "__main__": if len(argv) > 1: - query = '%20'.join(argv[1:]) + query = "%20".join(argv[1:]) else: query = quote(str(input("Search: "))) @@ -14,8 +15,18 @@ url = f"https://www.google.com/search?q={query}&num=2" - res = get(url, headers={"User-Agent" : "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:98.0) Gecko/20100101 Firefox/98.0"}) + res = get( + url, + headers={ + "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:98.0) Gecko/20100101 Firefox/98.0" + }, + ) - link = BeautifulSoup(res.text, "html.parser").find("div", attrs={"class" : "yuRUbf"}).find("a").get("href") + link = ( + BeautifulSoup(res.text, "html.parser") + .find("div", attrs={"class": "yuRUbf"}) + .find("a") + .get("href") + ) - webbrowser.open(link) \ No newline at end of file + webbrowser.open(link) From 43c5fd6a97e9986411d1930e34aa9a6059637ce0 Mon Sep 17 00:00:00 2001 From: Praveen Date: Thu, 13 Oct 2022 13:28:57 +0530 Subject: [PATCH 3/8] Update and rename crawl_google_results.py to open_google_results.py --- .../{crawl_google_results.py => open_google_results.py} | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) rename web_programming/{crawl_google_results.py => open_google_results.py} (94%) diff --git a/web_programming/crawl_google_results.py b/web_programming/open_google_results.py similarity index 94% rename from web_programming/crawl_google_results.py rename to web_programming/open_google_results.py index 8893d0afcaa6..59f0b8b7b145 100644 --- a/web_programming/crawl_google_results.py +++ b/web_programming/open_google_results.py @@ -3,7 +3,7 @@ from urllib.parse import quote from bs4 import BeautifulSoup -from requests import get +import requests if __name__ == "__main__": if len(argv) > 1: @@ -15,7 +15,7 @@ url = f"https://www.google.com/search?q={query}&num=2" - res = get( + res = requests.get( url, headers={ "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:98.0) Gecko/20100101 Firefox/98.0" From e49f2dc56cd72e39f21c1a4595b3fa875dc11bf9 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 13 Oct 2022 07:59:52 +0000 Subject: [PATCH 4/8] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- web_programming/open_google_results.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/web_programming/open_google_results.py b/web_programming/open_google_results.py index 59f0b8b7b145..44f341b69ce5 100644 --- a/web_programming/open_google_results.py +++ b/web_programming/open_google_results.py @@ -2,8 +2,8 @@ from sys import argv from urllib.parse import quote -from bs4 import BeautifulSoup import requests +from bs4 import BeautifulSoup if __name__ == "__main__": if len(argv) > 1: From 2db0177dec1980f750f2c79d702eea9042b226c6 Mon Sep 17 00:00:00 2001 From: Praveen Date: Thu, 13 Oct 2022 13:36:44 +0530 Subject: [PATCH 5/8] Create crawl_google_results.py --- web_programming/crawl_google_results.py | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) create mode 100644 web_programming/crawl_google_results.py diff --git a/web_programming/crawl_google_results.py b/web_programming/crawl_google_results.py new file mode 100644 index 000000000000..a33a3f3bbe5c --- /dev/null +++ b/web_programming/crawl_google_results.py @@ -0,0 +1,24 @@ +import sys +import webbrowser + +import requests +from bs4 import BeautifulSoup +from fake_useragent import UserAgent + +if __name__ == "__main__": + print("Googling.....") + url = "https://www.google.com/search?q=" + " ".join(sys.argv[1:]) + res = requests.get(url, headers={"UserAgent": UserAgent().random}) + # res.raise_for_status() + with open("project1a.html", "wb") as out_file: # only for knowing the class + for data in res.iter_content(10000): + out_file.write(data) + soup = BeautifulSoup(res.text, "html.parser") + links = list(soup.select(".eZt8xd"))[:5] + + print(len(links)) + for link in links: + if link.text == "Maps": + webbrowser.open(link.get("href")) + else: + webbrowser.open(f"http://google.com{link.get('href')}") From 5786bd16167815aec186d881436f77999c90ecaa Mon Sep 17 00:00:00 2001 From: Christian Clauss Date: Thu, 13 Oct 2022 17:21:45 +0200 Subject: [PATCH 6/8] Update web_programming/open_google_results.py --- web_programming/open_google_results.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/web_programming/open_google_results.py b/web_programming/open_google_results.py index 44f341b69ce5..f0006c2463d6 100644 --- a/web_programming/open_google_results.py +++ b/web_programming/open_google_results.py @@ -18,7 +18,7 @@ res = requests.get( url, headers={ - "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:98.0) Gecko/20100101 Firefox/98.0" + "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:98.0) " "Gecko/20100101 Firefox/98.0" }, ) From 296fe10bb8e39186dfe3f9d577c903d36459064c Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 13 Oct 2022 15:22:35 +0000 Subject: [PATCH 7/8] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- web_programming/open_google_results.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/web_programming/open_google_results.py b/web_programming/open_google_results.py index f0006c2463d6..fd9fda3f3808 100644 --- a/web_programming/open_google_results.py +++ b/web_programming/open_google_results.py @@ -18,7 +18,8 @@ res = requests.get( url, headers={ - "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:98.0) " "Gecko/20100101 Firefox/98.0" + "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:98.0) " + "Gecko/20100101 Firefox/98.0" }, ) From fb195cc568bdae2acbc348e98c1c7de0c47947b4 Mon Sep 17 00:00:00 2001 From: Praveen Date: Thu, 13 Oct 2022 21:04:02 +0530 Subject: [PATCH 8/8] Update open_google_results.py --- web_programming/open_google_results.py | 29 +++++++++++++++++--------- 1 file changed, 19 insertions(+), 10 deletions(-) diff --git a/web_programming/open_google_results.py b/web_programming/open_google_results.py index fd9fda3f3808..0e1dba8c5856 100644 --- a/web_programming/open_google_results.py +++ b/web_programming/open_google_results.py @@ -1,6 +1,7 @@ import webbrowser from sys import argv -from urllib.parse import quote +from urllib.parse import quote, parse_qs +from fake_useragent import UserAgent import requests from bs4 import BeautifulSoup @@ -13,21 +14,29 @@ print("Googling.....") - url = f"https://www.google.com/search?q={query}&num=2" + url = f"https://www.google.com/search?q={query}&num=100" res = requests.get( url, headers={ - "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:98.0) " - "Gecko/20100101 Firefox/98.0" + "User-Agent": str(UserAgent().random) }, ) - link = ( - BeautifulSoup(res.text, "html.parser") - .find("div", attrs={"class": "yuRUbf"}) - .find("a") - .get("href") - ) + try: + link = ( + BeautifulSoup(res.text, "html.parser") + .find("div", attrs={"class": "yuRUbf"}) + .find("a") + .get("href") + ) + + except AttributeError: + link = parse_qs( + BeautifulSoup(res.text, "html.parser") + .find("div", attrs={"class": "kCrYT"}) + .find("a") + .get("href") + )["url"][0] webbrowser.open(link)