
Commit 9fa64e2

simple-web-scraper
When run, the program will ask for a link and the number of threads it will use. The output of the scrape will be saved to a file named index.json.
1 parent 79b5940 commit 9fa64e2

File tree

1 file changed: +75 -0 lines changed


web-scraper.py

Lines changed: 75 additions & 0 deletions
@@ -0,0 +1,75 @@
# Remember to install Beautiful Soup first:
# pip install requests beautifulsoup4

import requests
from bs4 import BeautifulSoup
import json
from urllib.parse import urljoin


class WebCrawler:
    def __init__(self, base_url):
        self.base_url = base_url
        self.visited_links = set()
        self.links = []
        self.images = []

    def get_page_content(self, url):
        # Fetch a page and return its HTML, or None on any failure.
        try:
            response = requests.get(url)
            if response.status_code == 200:
                return response.text
            else:
                print(f"Failed to fetch {url}. Status code: {response.status_code}")
        except Exception as e:
            print(f"Error fetching {url}: {e}")
        return None

    def extract_links_and_images(self, html_content):
        # Collect every href and img src found on the page.
        if html_content:
            soup = BeautifulSoup(html_content, 'html.parser')
            for link in soup.find_all('a', href=True):
                self.links.append(link['href'])
            for img in soup.find_all('img', src=True):
                self.images.append(img['src'])

    def crawl_page(self, url):
        if url not in self.visited_links:
            print(f"Crawling: {url}")
            html_content = self.get_page_content(url)
            if html_content:
                self.extract_links_and_images(html_content)
            self.visited_links.add(url)

    def save_results_to_json(self):
        results = {
            "links": list(set(self.links)),
            "images": list(set(self.images))
        }
        with open("index.json", "w") as index_file:
            json.dump(results, index_file, indent=2)
        print("Results saved to index.json")

    def crawl_site(self, max_pages=10):
        queue = [self.base_url]

        while queue and len(self.visited_links) < max_pages:
            current_url = queue.pop(0)
            self.crawl_page(current_url)

            # Extract links from the current page and add them to the queue
            html_content = self.get_page_content(current_url)
            if html_content:
                soup = BeautifulSoup(html_content, 'html.parser')
                for link in soup.find_all('a', href=True):
                    # Resolve relative hrefs so same-site links are followed
                    absolute_url = urljoin(current_url, link['href'])
                    if absolute_url.startswith(self.base_url) and absolute_url not in self.visited_links:
                        queue.append(absolute_url)

        self.save_results_to_json()


if __name__ == '__main__':
    # Replace 'https://example.com' with the target website URL
    base_url = input("Enter the website URL: ")

    # Create an instance of WebCrawler and crawl the site (adjust max_pages as needed)
    web_crawler = WebCrawler(base_url)
    web_crawler.crawl_site(max_pages=10)

0 commit comments
