Commit ed78ff0

Merge pull request #1 from Kiddooo/conflict-resolve
Adding async version from Kiddooo
2 parents: 140a5e5 + 5e27282

File tree: 3 files changed (+111, −108 lines)


.gitignore

Lines changed: 2 additions & 0 deletions
@@ -0,0 +1,2 @@
+scraper2.py
+skins/

requirements.txt

Lines changed: 2 additions & 0 deletions
@@ -1,2 +1,4 @@
 Pillow==10.0.0
 requests==2.28.1
+aiohttp==3.9.1
+lxml==4.9.3
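
The two new pins back the rewritten scraper.py: aiohttp supplies the asynchronous HTTP client and lxml the parser backend that BeautifulSoup is invoked with. Note that scraper.py also imports bs4, so beautifulsoup4 has to be installed even though it is not pinned here. A minimal sketch of the pattern the new dependencies enable (fetch_title is a hypothetical helper, not part of the commit):

import asyncio

import aiohttp
from bs4 import BeautifulSoup  # provided by the beautifulsoup4 package

async def fetch_title(url):
    # Fetch a page asynchronously, as the rewritten scraper does...
    async with aiohttp.ClientSession() as session:
        async with session.get(url) as response:
            html = await response.text()
    # ...and parse it with the lxml backend pinned above.
    soup = BeautifulSoup(html, 'lxml')
    return soup.title.string if soup.title else None

print(asyncio.run(fetch_title("https://www.minecraftskins.net")))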

scraper.py

Lines changed: 107 additions & 108 deletions
@@ -1,114 +1,113 @@
-import requests
-import shutil
+from bs4 import BeautifulSoup
+import aiohttp
+import asyncio
 import os
 
-def removeWhitespace(string):
-    string = string.replace('\n', '');
-    string = string.replace('\t', '');
-    string = string.replace(' ', '');
-    return string
-
-def safeMkdir(directory):
+def safe_mkdir(directory):
     if not os.path.isdir(directory):
         os.mkdir(directory)
 
-# Creates the skins directory if it doesn't exist
-skinsDir = "skins"
-safeMkdir(skinsDir)
-
-print("Getting the navbar!")
-# The URL to the website
-url = "https://www.minecraftskins.net"
-# Get the HTML from the website
-result = requests.get(url).text
-# Remove whitespace and unnecessary characters
-result = removeWhitespace(result)
-
-# Get the navbar
-navbar = result[result.find("<navclass=\"main\">"):result.find("</nav>")]
-print("Got the navbar!")
-
-print("Getting the sections!")
-# Get the navbar's sections
-sections = navbar.split("<li>")
-# Remove the unnecessary sections at the start and end
-sections.pop(0)
-sections.pop()
-# Remove the HTML surrounding the hrefs
-sections = [section[8:] for section in sections]
-sections = [section[:section.find('"')] for section in sections]
-print("Got the sections!")
-
-# Loop over each section
-for section in sections:
-    print(f"Getting section { section[9:] }")
-    # Create the section directories if they don't exist
-    sectionDir = skinsDir + section[9:]
-    safeMkdir(sectionDir)
-
-    # Get the URL to the section
-    sectionURL = url + section
-    # Get the HTML from the section
-    sectionResult = requests.get(sectionURL).text
-    # Remove whitespace and unnecessary characters
-    sectionResult = removeWhitespace(sectionResult)
-
-    # Get the counter
-    sectionCounter = sectionResult[sectionResult.find("<spanclass=\"count\">"):]
-
-    # Get the number of pages
-    numPages = sectionCounter[sectionCounter.find("of") + 2:]
-    numPages = int(numPages[:numPages.find('<')])
+async def fetch(session, url):
+    print(f"Fetching URL: {url}")
+    async with session.get(url) as response:
+        return await response.text()
+
+async def download_image(session, url, path_to_file):
+    try:
+        print(f"Downloading image from URL: {url}")
+        async with session.get(url) as response:
+            if response.status == 200:
+                with open(path_to_file, 'wb') as f:
+                    while True:
+                        chunk = await response.content.read(1024)
+                        if not chunk:
+                            break
+                        f.write(chunk)
+                print(f"Image saved to: {path_to_file}")
+            else:
+                print(f"Failed to download {url}. Status code: {response.status}")
+    except Exception as e:
+        print(f"An error occurred while downloading {url}: {e}")
+
+async def parse_navbar(session, url, skins_dir):
+    print(f"Parsing navbar: {url}")
+    html = await fetch(session, url)
+    soup = BeautifulSoup(html, 'lxml')
+
+    navbar = soup.find('nav', class_='main')
+    sections = navbar.find_all('li')[1:-1]  # Skip the first and last item
+
+    tasks = [parse_section(session, url, li.a['href'], skins_dir) for li in sections]
+    await asyncio.gather(*tasks)
+
+async def parse_section(session, base_url, section, skins_dir):
+    section_dir = os.path.join(skins_dir, os.path.basename(section))
+    safe_mkdir(section_dir)
+
+    section_url = base_url + section
+    html = await fetch(session, section_url)
+    soup = BeautifulSoup(html, 'lxml')
 
-    # Loop over each page
-    for i in range(1, numPages + 1):
-        print(f"\tGetting page { i }")
-        # Get the URL to the page
-        pageURL = f"{ sectionURL }/{ i }"
-        # Get the HTML from the section
-        pageResult = requests.get(pageURL).text
-        # Remove whitespace and unnecessary characters
-        pageResult = removeWhitespace(pageResult)
-
-        # Get the skin section
-        pageSection = pageResult[pageResult.find("<divclass=\"rowgrid\">"):]
-
-        # Get the skins
-        skins = pageSection.split("<aclass=\"panel-link\"href=\"")
-        skins = [skin[:skin.find('"')] for skin in skins]
-        # Remove the unnecessary sections at the start
-        skins.pop(0)
-
-        # Loop over each skin
-        for skin in skins:
-            print(f"\t\tGetting skin { skin }")
-            # Create the section directories if they don't exist
-            skinDir = sectionDir + skin
-            safeMkdir(skinDir)
-
-            # Get the URL to the skin
-            skinURL = url + skin
-            skinResult = requests.get(skinURL).text
-
-            # Get the name of the skin
-            skinName = skinResult[skinResult.find("<h2 class=\"hero-title\">") + 23:]
-            skinName = skinName[:skinName.find('<')]
-
-            # Get the description for the skin
-            skinDescription = skinResult[skinResult.find("<p class=\"card-description\">") + 28:]
-            skinDescription = skinDescription[:skinDescription.find('<')]
-
-            # Create a text file containing the skin's name and description
-            with open(skinDir + "/meta.txt", 'w') as f:
-                f.write(f"Name: { skinName }\nDescription: { skinDescription }")
-
-            # Get the URL to the skin img
-            skinImgURL = skinURL + "/download"
-            skinImgResult = requests.get(skinImgURL, stream=True).raw
-            skinImgResult.decode_content = True
-            # Save the skin img
-            with open(skinDir + "/skin.png", "wb") as f:
-                shutil.copyfileobj(skinImgResult, f)
-            print(f"\t\tGot skin { skin }")
-        print(f"\tGot page { i }")
-    print(f"Got section { section[9:] }")
+    # Find all the skin blocks in the section.
+    skin_blocks = soup.find_all('div', class_='card')
+
+    # Loop over each skin block
+    for block in skin_blocks:
+        # Extract the relative URL of the skin
+        skin = block.find('a')['href']
+
+        # Create the section directories if they don't exist
+        skinDir = section_dir + skin
+        safe_mkdir(skinDir)
+
+        # Get the URL to the skin
+        skinURL = base_url + skin
+        skinResult = await fetch(session, skinURL)
+
+        # Get the name of the skin
+        skinName = skinResult[skinResult.find("<h2 class=\"card-title\">") + 23:]
+        skinName = skinName[:skinName.find('<')]
+
+        # Get the description for the skin
+        skinDescription = skinResult[skinResult.find("<p class=\"card-description\">") + 28:]
+        skinDescription = skinDescription[:skinDescription.find('<')]
+
+        # Create a text file containing the skin's name and description
+        with open(skinDir + "/meta.txt", 'w') as f:
+            f.write(f"Name: { skinName }\nDescription: { skinDescription }")
+
+        # Get the URL to the skin img
+        skinImgURL = skinURL + "/download"
+        path_to_file = skinDir + "/skin.png"
+        await download_image(session, skinImgURL, path_to_file)
+
+        # Create a text file containing the skin's name and description
+        with open(os.path.join(skinDir, "meta.txt"), 'w') as f:
+            f.write(f"Name: { skinName }\nDescription: { skinDescription }")
+
+        # Get the URL to the skin img
+        skinImgURL = skinURL + "/download"
+        path_to_file = os.path.join(skinDir, "skin.png")
+        await download_image(session, skinImgURL, path_to_file)
+
+    # Pagination: Continue to the next page if a 'next' button is present.
+    next_button = soup.find('a', string='Next')
+    if next_button and next_button.has_attr('href'):
+        next_page_url = base_url + next_button['href']
+        await parse_section(session, base_url, next_page_url, skins_dir)
+
+async def main():
+    print("Starting the script.")
+    skins_dir = "skins"
+    safe_mkdir(skins_dir)
+
+    url = "https://www.minecraftskins.net"
+
+    async with aiohttp.ClientSession() as session:
+        await parse_navbar(session, url, skins_dir)
+
+# Run the main coroutine
+try:
+    asyncio.run(main())
+except Exception as e:
+    print(f"An error occurred: {e}")
