Skip to content

Commit 5e60ea3

Browse files
Added Scraping
1 parent 456aecb commit 5e60ea3

File tree

1 file changed

+40
-0
lines changed

1 file changed

+40
-0
lines changed

scrap.py

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
# To scrape the Project Euler website for problem links on its archive page.
#
# Fetches https://projecteuler.net/archives, parses the HTML, locates the
# problems table, and collects/prints the absolute URL of every problem.

from urllib import request

# to handle HTTP requests (stdlib)

from bs4 import BeautifulSoup

# to parse the html data (third-party)

# link we need to visit: the archive page listing every published problem
link = "https://projecteuler.net/archives"

# Fetch the page inside a context manager so the HTTP response is always
# closed, even if read() raises — the original never closed the connection.
with request.urlopen(link) as site:
    # content contains the html code (raw bytes of the archive page)
    content = site.read()

# soup contains the html code in a parsed object;
# also see: print(soup.prettify()) to print indented html code
soup = BeautifulSoup(content, "html.parser")

# we don't need to use find_all() because, after examining the html code,
# there is only one table with this id — find() returns the first match
table = soup.find("table", {"id": "problems_table"})

# "bs4.element.ResultSet" -> list of all <a></a> tags found under table
listLinks = table.find_all("a")

problemLinks = []  # to store links to problems

# Each href on the archive page is relative (e.g. "problem=1" — TODO confirm),
# so prepend the site root to build an absolute URL.
for anchor in listLinks:
    p = "https://projecteuler.net/" + anchor.get("href")
    problemLinks.append(p)
    print(p)

0 commit comments

Comments
 (0)