
Commit 75658a2

fixed the async scraper and added the sync-scraper as its own separate .py file
1 parent ed78ff0 commit 75658a2

File tree

2 files changed: +184 −57 lines


scraper.py

Lines changed: 70 additions & 57 deletions
@@ -14,7 +14,7 @@ async def fetch(session, url):
 
 async def download_image(session, url, path_to_file):
     try:
-        print(f"Downloading image from URL: {url}")
+        print(f"\tDownloading image from URL: {url}")
         async with session.get(url) as response:
             if response.status == 200:
                 with open(path_to_file, 'wb') as f:
@@ -23,78 +23,91 @@ async def download_image(session, url, path_to_file):
                     if not chunk:
                         break
                     f.write(chunk)
-                print(f"Image saved to: {path_to_file}")
+                print(f"\t\tImage saved to: {path_to_file}")
             else:
-                print(f"Failed to download {url}. Status code: {response.status}")
+                print(f"\t\tFailed to download {url}. Status code: {response.status}")
     except Exception as e:
-        print(f"An error occurred while downloading {url}: {e}")
+        print(f"\t\tAn error occurred while downloading {url}: {e}")
 
 async def parse_navbar(session, url, skins_dir):
-    print(f"Parsing navbar: {url}")
+    print(f"\tParsing navbar: {url}")
     html = await fetch(session, url)
     soup = BeautifulSoup(html, 'lxml')
 
     navbar = soup.find('nav', class_='main')
-    sections = navbar.find_all('li')[1:-1]  # Skip the first and last item
+    sections = navbar.find_all('li')[:-1]  # Skip the last item
 
     tasks = [parse_section(session, url, li.a['href'], skins_dir) for li in sections]
     await asyncio.gather(*tasks)
 
+async def get_num_pages(session, section_url):
+    html = await fetch(session, section_url)
+    soup = BeautifulSoup(html, 'lxml')
+
+    # Get the page counter
+    page_counter = soup.find('span', class_='count')
+    # Get the page counter's string
+    page_counter_span = page_counter.find('span')
+    page_counter_str = page_counter_span.text
+    # Get the start of the number representing the number of pages
+    page_count_start = page_counter_str.rfind(' ') + 1
+    # Return the number of pages
+    return int(page_counter_str[page_count_start:])
+
 async def parse_section(session, base_url, section, skins_dir):
     section_dir = os.path.join(skins_dir, os.path.basename(section))
     safe_mkdir(section_dir)
 
     section_url = base_url + section
-    html = await fetch(session, section_url)
-    soup = BeautifulSoup(html, 'lxml')
-
-    # Find all the skin blocks in the section.
-    skin_blocks = soup.find_all('div', class_='card')
-
-    # Loop over each skin block
-    for block in skin_blocks:
-        # Extract the relative URL of the skin
-        skin = block.find('a')['href']
-
-        # Create the section directories if they don't exist
-        skinDir = section_dir + skin
-        safe_mkdir(skinDir)
-
-        # Get the URL to the skin
-        skinURL = base_url + skin
-        skinResult = await fetch(session, skinURL)
-
-        # Get the name of the skin
-        skinName = skinResult[skinResult.find("<h2 class=\"card-title\">") + 23:]
-        skinName = skinName[:skinName.find('<')]
-
-        # Get the description for the skin
-        skinDescription = skinResult[skinResult.find("<p class=\"card-description\">") + 28:]
-        skinDescription = skinDescription[:skinDescription.find('<')]
-
-        # Create a text file containing the skin's name and description
-        with open(skinDir + "/meta.txt", 'w') as f:
-            f.write(f"Name: { skinName }\nDescription: { skinDescription }")
-
-        # Get the URL to the skin img
-        skinImgURL = skinURL + "/download"
-        path_to_file = skinDir + "/skin.png"
-        await download_image(session, skinImgURL, path_to_file)
-
-        # Create a text file containing the skin's name and description
-        with open(os.path.join(skinDir, "meta.txt"), 'w') as f:
-            f.write(f"Name: { skinName }\nDescription: { skinDescription }")
-
-        # Get the URL to the skin img
-        skinImgURL = skinURL + "/download"
-        path_to_file = os.path.join(skinDir, "skin.png")
-        await download_image(session, skinImgURL, path_to_file)
-
-    # Pagination: Continue to the next page if a 'next' button is present.
-    next_button = soup.find('a', string='Next')
-    if next_button and next_button.has_attr('href'):
-        next_page_url = base_url + next_button['href']
-        await parse_section(session, base_url, next_page_url, skins_dir)
+    num_pages = await get_num_pages(session, section_url)
+
+    # Loop over all pages
+    for i in range(1, num_pages + 1):
+        section_page_url = f"{ section_url }/{ i }"
+        html = await fetch(session, section_page_url)
+        soup = BeautifulSoup(html, 'lxml')
+
+        # Find all the skin blocks in the section.
+        skin_blocks = soup.find_all('div', class_='card')
+
+        # Loop over each skin block
+        for block in skin_blocks:
+            # Extract the relative URL of the skin
+            skin = block.find('a')['href']
+
+            # Create the section directories if they don't exist
+            skinDir = section_dir + skin
+            safe_mkdir(skinDir)
+
+            # Get the URL to the skin
+            skinURL = base_url + skin
+            skinResult = await fetch(session, skinURL)
+
+            # Get the name of the skin
+            skinName = skinResult[skinResult.find("<h2 class=\"card-title\">") + 23:]
+            skinName = skinName[:skinName.find('<')]
+
+            # Get the description for the skin
+            skinDescription = skinResult[skinResult.find("<p class=\"card-description\">") + 28:]
+            skinDescription = skinDescription[:skinDescription.find('<')]
+
+            # Create a text file containing the skin's name and description
+            with open(skinDir + "/meta.txt", 'w') as f:
+                f.write(f"Name: { skinName }\nDescription: { skinDescription }")
+
+            # Get the URL to the skin img
+            skinImgURL = skinURL + "/download"
+            path_to_file = skinDir + "/skin.png"
+            await download_image(session, skinImgURL, path_to_file)
+
+            # Create a text file containing the skin's name and description
+            with open(os.path.join(skinDir, "meta.txt"), 'w') as f:
+                f.write(f"Name: { skinName }\nDescription: { skinDescription }")
+
+            # Get the URL to the skin img
+            skinImgURL = skinURL + "/download"
+            path_to_file = os.path.join(skinDir, "skin.png")
+            await download_image(session, skinImgURL, path_to_file)
 
 async def main():
     print("Starting the script.")
@@ -110,4 +123,4 @@ async def main():
 try:
     asyncio.run(main())
 except Exception as e:
-    print(f"An error occurred: {e}")
+    print(f"An error occurred: {e}")

sync-scraper.py

Lines changed: 114 additions & 0 deletions
@@ -0,0 +1,114 @@
+import requests
+import shutil
+import os
+
+def removeWhitespace(string):
+    string = string.replace('\n', '');
+    string = string.replace('\t', '');
+    string = string.replace(' ', '');
+    return string
+
+def safeMkdir(directory):
+    if not os.path.isdir(directory):
+        os.mkdir(directory)
+
+# Creates the skins directory if it doesn't exist
+skinsDir = "skins"
+safeMkdir(skinsDir)
+
+print("Getting the navbar!")
+# The URL to the website
+url = "https://www.minecraftskins.net"
+# Get the HTML from the website
+result = requests.get(url).text
+# Remove whitespace and unnecessary characters
+result = removeWhitespace(result)
+
+# Get the navbar
+navbar = result[result.find("<navclass=\"main\">"):result.find("</nav>")]
+print("Got the navbar!")
+
+print("Getting the sections!")
+# Get the navbar's sections
+sections = navbar.split("<li>")
+# Remove the unnecessary sections at the start and end
+sections.pop(0)
+sections.pop()
+# Remove the HTML surrounding the hrefs
+sections = [section[8:] for section in sections]
+sections = [section[:section.find('"')] for section in sections]
+print("Got the sections!")
+
+# Loop over each section
+for section in sections:
+    print(f"Getting section { section[9:] }")
+    # Create the section directories if they don't exist
+    sectionDir = skinsDir + section[9:]
+    safeMkdir(sectionDir)
+
+    # Get the URL to the section
+    sectionURL = url + section
+    # Get the HTML from the section
+    sectionResult = requests.get(sectionURL).text
+    # Remove whitespace and unnecessary characters
+    sectionResult = removeWhitespace(sectionResult)
+
+    # Get the counter
+    sectionCounter = sectionResult[sectionResult.find("<spanclass=\"count\">"):]
+
+    # Get the number of pages
+    numPages = sectionCounter[sectionCounter.find("of") + 2:]
+    numPages = int(numPages[:numPages.find('<')])
+
+    # Loop over each page
+    for i in range(1, numPages + 1):
+        print(f"\tGetting page { i }")
+        # Get the URL to the page
+        pageURL = f"{ sectionURL }/{ i }"
+        # Get the HTML from the section
+        pageResult = requests.get(pageURL).text
+        # Remove whitespace and unnecessary characters
+        pageResult = removeWhitespace(pageResult)
+
+        # Get the skin section
+        pageSection = pageResult[pageResult.find("<divclass=\"rowgrid\">"):]
+
+        # Get the skins
+        skins = pageSection.split("<aclass=\"panel-link\"href=\"")
+        skins = [skin[:skin.find('"')] for skin in skins]
+        # Remove the unnecessary sections at the start
+        skins.pop(0)
+
+        # Loop over each skin
+        for skin in skins:
+            print(f"\t\tGetting skin { skin }")
+            # Create the section directories if they don't exist
+            skinDir = sectionDir + skin
+            safeMkdir(skinDir)
+
+            # Get the URL to the skin
+            skinURL = url + skin
+            skinResult = requests.get(skinURL).text
+
+            # Get the name of the skin
+            skinName = skinResult[skinResult.find("<h2 class=\"hero-title\">") + 23:]
+            skinName = skinName[:skinName.find('<')]
+
+            # Get the description for the skin
+            skinDescription = skinResult[skinResult.find("<p class=\"card-description\">") + 28:]
+            skinDescription = skinDescription[:skinDescription.find('<')]
+
+            # Create a text file containing the skin's name and description
+            with open(skinDir + "/meta.txt", 'w') as f:
+                f.write(f"Name: { skinName }\nDescription: { skinDescription }")
+
+            # Get the URL to the skin img
+            skinImgURL = skinURL + "/download"
+            skinImgResult = requests.get(skinImgURL, stream=True).raw
+            skinImgResult.decode_content = True
+            # Save the skin img
+            with open(skinDir + "/skin.png", "wb") as f:
+                shutil.copyfileobj(skinImgResult, f)
+            print(f"\t\tGot skin { skin }")
+        print(f"\tGot page { i }")
+    print(f"Got section { section[9:] }")
