Commit ed78ff0

Merge pull request #1 from Kiddooo/conflict-resolve
Adding async version from Kiddooo
2 parents: 140a5e5 + 5e27282

File tree: 3 files changed (+111, −108 lines)


.gitignore

Lines changed: 2 additions & 0 deletions
@@ -0,0 +1,2 @@
+scraper2.py
+skins/

requirements.txt

Lines changed: 2 additions & 0 deletions
@@ -1,2 +1,4 @@
 Pillow==10.0.0
 requests==2.28.1
+aiohttp==3.9.1
+lxml==4.9.3
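
The two new pins back the rewritten scraper.py: aiohttp supplies the asynchronous HTTP client and lxml the parser backend that BeautifulSoup is invoked with. Note that scraper.py also imports bs4, so beautifulsoup4 has to be installed even though it is not pinned here. A minimal sketch of the pattern the new dependencies enable (fetch_title is a hypothetical helper, not part of the commit):

import asyncio

import aiohttp
from bs4 import BeautifulSoup  # provided by the beautifulsoup4 package

async def fetch_title(url):
    # Fetch a page asynchronously, as the rewritten scraper does...
    async with aiohttp.ClientSession() as session:
        async with session.get(url) as response:
            html = await response.text()
    # ...and parse it with the lxml backend pinned above.
    soup = BeautifulSoup(html, 'lxml')
    return soup.title.string if soup.title else None

print(asyncio.run(fetch_title("https://www.minecraftskins.net")))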

scraper.py

Lines changed: 107 additions & 108 deletions
@@ -1,114 +1,113 @@
-import requests
-import shutil
+from bs4 import BeautifulSoup
+import aiohttp
+import asyncio
 import os
 
-def removeWhitespace(string):
-    string = string.replace('\n', '');
-    string = string.replace('\t', '');
-    string = string.replace(' ', '');
-    return string
-
-def safeMkdir(directory):
+def safe_mkdir(directory):
     if not os.path.isdir(directory):
         os.mkdir(directory)
 
-# Creates the skins directory if it doesn't exist
-skinsDir = "skins"
-safeMkdir(skinsDir)
-
-print("Getting the navbar!")
-# The URL to the website
-url = "https://www.minecraftskins.net"
-# Get the HTML from the website
-result = requests.get(url).text
-# Remove whitespace and unnecessary characters
-result = removeWhitespace(result)
-
-# Get the navbar
-navbar = result[result.find("<navclass=\"main\">"):result.find("</nav>")]
-print("Got the navbar!")
-
-print("Getting the sections!")
-# Get the navbar's sections
-sections = navbar.split("<li>")
-# Remove the unnecessary sections at the start and end
-sections.pop(0)
-sections.pop()
-# Remove the HTML surrounding the hrefs
-sections = [section[8:] for section in sections]
-sections = [section[:section.find('"')] for section in sections]
-print("Got the sections!")
-
-# Loop over each section
-for section in sections:
-    print(f"Getting section { section[9:] }")
-    # Create the section directories if they don't exist
-    sectionDir = skinsDir + section[9:]
-    safeMkdir(sectionDir)
-
-    # Get the URL to the section
-    sectionURL = url + section
-    # Get the HTML from the section
-    sectionResult = requests.get(sectionURL).text
-    # Remove whitespace and unnecessary characters
-    sectionResult = removeWhitespace(sectionResult)
-
-    # Get the counter
-    sectionCounter = sectionResult[sectionResult.find("<spanclass=\"count\">"):]
-
-    # Get the number of pages
-    numPages = sectionCounter[sectionCounter.find("of") + 2:]
-    numPages = int(numPages[:numPages.find('<')])
+async def fetch(session, url):
+    print(f"Fetching URL: {url}")
+    async with session.get(url) as response:
+        return await response.text()
+
+async def download_image(session, url, path_to_file):
+    try:
+        print(f"Downloading image from URL: {url}")
+        async with session.get(url) as response:
+            if response.status == 200:
+                with open(path_to_file, 'wb') as f:
+                    while True:
+                        chunk = await response.content.read(1024)
+                        if not chunk:
+                            break
+                        f.write(chunk)
+                print(f"Image saved to: {path_to_file}")
+            else:
+                print(f"Failed to download {url}. Status code: {response.status}")
+    except Exception as e:
+        print(f"An error occurred while downloading {url}: {e}")
+
+async def parse_navbar(session, url, skins_dir):
+    print(f"Parsing navbar: {url}")
+    html = await fetch(session, url)
+    soup = BeautifulSoup(html, 'lxml')
+
+    navbar = soup.find('nav', class_='main')
+    sections = navbar.find_all('li')[1:-1]  # Skip the first and last item
+
+    tasks = [parse_section(session, url, li.a['href'], skins_dir) for li in sections]
+    await asyncio.gather(*tasks)
+
+async def parse_section(session, base_url, section, skins_dir):
+    section_dir = os.path.join(skins_dir, os.path.basename(section))
+    safe_mkdir(section_dir)
+
+    section_url = base_url + section
+    html = await fetch(session, section_url)
+    soup = BeautifulSoup(html, 'lxml')
 
-    # Loop over each page
-    for i in range(1, numPages + 1):
-        print(f"\tGetting page { i }")
-        # Get the URL to the page
-        pageURL = f"{ sectionURL }/{ i }"
-        # Get the HTML from the section
-        pageResult = requests.get(pageURL).text
-        # Remove whitespace and unnecessary characters
-        pageResult = removeWhitespace(pageResult)
-
-        # Get the skin section
-        pageSection = pageResult[pageResult.find("<divclass=\"rowgrid\">"):]
-
-        # Get the skins
-        skins = pageSection.split("<aclass=\"panel-link\"href=\"")
-        skins = [skin[:skin.find('"')] for skin in skins]
-        # Remove the unnecessary sections at the start
-        skins.pop(0)
-
-        # Loop over each skin
-        for skin in skins:
-            print(f"\t\tGetting skin { skin }")
-            # Create the section directories if they don't exist
-            skinDir = sectionDir + skin
-            safeMkdir(skinDir)
-
-            # Get the URL to the skin
-            skinURL = url + skin
-            skinResult = requests.get(skinURL).text
-
-            # Get the name of the skin
-            skinName = skinResult[skinResult.find("<h2 class=\"hero-title\">") + 23:]
-            skinName = skinName[:skinName.find('<')]
-
-            # Get the description for the skin
-            skinDescription = skinResult[skinResult.find("<p class=\"card-description\">") + 28:]
-            skinDescription = skinDescription[:skinDescription.find('<')]
-
-            # Create a text file containing the skin's name and description
-            with open(skinDir + "/meta.txt", 'w') as f:
-                f.write(f"Name: { skinName }\nDescription: { skinDescription }")
-
-            # Get the URL to the skin img
-            skinImgURL = skinURL + "/download"
-            skinImgResult = requests.get(skinImgURL, stream=True).raw
-            skinImgResult.decode_content = True
-            # Save the skin img
-            with open(skinDir + "/skin.png", "wb") as f:
-                shutil.copyfileobj(skinImgResult, f)
-            print(f"\t\tGot skin { skin }")
-        print(f"\tGot page { i }")
-    print(f"Got section { section[9:] }")
+    # Find all the skin blocks in the section.
+    skin_blocks = soup.find_all('div', class_='card')
+
+    # Loop over each skin block
+    for block in skin_blocks:
+        # Extract the relative URL of the skin
+        skin = block.find('a')['href']
+
+        # Create the section directories if they don't exist
+        skinDir = section_dir + skin
+        safe_mkdir(skinDir)
+
+        # Get the URL to the skin
+        skinURL = base_url + skin
+        skinResult = await fetch(session, skinURL)
+
+        # Get the name of the skin
+        skinName = skinResult[skinResult.find("<h2 class=\"card-title\">") + 23:]
+        skinName = skinName[:skinName.find('<')]
+
+        # Get the description for the skin
+        skinDescription = skinResult[skinResult.find("<p class=\"card-description\">") + 28:]
+        skinDescription = skinDescription[:skinDescription.find('<')]
+
+        # Create a text file containing the skin's name and description
+        with open(skinDir + "/meta.txt", 'w') as f:
+            f.write(f"Name: { skinName }\nDescription: { skinDescription }")
+
+        # Get the URL to the skin img
+        skinImgURL = skinURL + "/download"
+        path_to_file = skinDir + "/skin.png"
+        await download_image(session, skinImgURL, path_to_file)
+
+        # Create a text file containing the skin's name and description
+        with open(os.path.join(skinDir, "meta.txt"), 'w') as f:
+            f.write(f"Name: { skinName }\nDescription: { skinDescription }")
+
+        # Get the URL to the skin img
+        skinImgURL = skinURL + "/download"
+        path_to_file = os.path.join(skinDir, "skin.png")
+        await download_image(session, skinImgURL, path_to_file)
+
+    # Pagination: Continue to the next page if a 'next' button is present.
+    next_button = soup.find('a', string='Next')
+    if next_button and next_button.has_attr('href'):
+        next_page_url = base_url + next_button['href']
+        await parse_section(session, base_url, next_page_url, skins_dir)
+
+async def main():
+    print("Starting the script.")
+    skins_dir = "skins"
+    safe_mkdir(skins_dir)
+
+    url = "https://www.minecraftskins.net"
+
+    async with aiohttp.ClientSession() as session:
+        await parse_navbar(session, url, skins_dir)
+
+# Run the main coroutine
+try:
+    asyncio.run(main())
+except Exception as e:
+    print(f"An error occurred: {e}")
