
Commit 75658a2

fixed the async scraper and added the sync-scraper as its own separate .py file
1 parent ed78ff0 commit 75658a2

File tree

2 files changed: +184 −57 lines


scraper.py

Lines changed: 70 additions & 57 deletions
@@ -14,7 +14,7 @@ async def fetch(session, url):
 
 async def download_image(session, url, path_to_file):
     try:
-        print(f"Downloading image from URL: {url}")
+        print(f"\tDownloading image from URL: {url}")
         async with session.get(url) as response:
             if response.status == 200:
                 with open(path_to_file, 'wb') as f:
@@ -23,78 +23,91 @@ async def download_image(session, url, path_to_file):
                     if not chunk:
                         break
                     f.write(chunk)
-                print(f"Image saved to: {path_to_file}")
+                print(f"\t\tImage saved to: {path_to_file}")
             else:
-                print(f"Failed to download {url}. Status code: {response.status}")
+                print(f"\t\tFailed to download {url}. Status code: {response.status}")
     except Exception as e:
-        print(f"An error occurred while downloading {url}: {e}")
+        print(f"\t\tAn error occurred while downloading {url}: {e}")
 
 async def parse_navbar(session, url, skins_dir):
-    print(f"Parsing navbar: {url}")
+    print(f"\tParsing navbar: {url}")
     html = await fetch(session, url)
     soup = BeautifulSoup(html, 'lxml')
 
     navbar = soup.find('nav', class_='main')
-    sections = navbar.find_all('li')[1:-1]  # Skip the first and last item
+    sections = navbar.find_all('li')[:-1]  # Skip the last item
 
     tasks = [parse_section(session, url, li.a['href'], skins_dir) for li in sections]
     await asyncio.gather(*tasks)
 
+async def get_num_pages(session, section_url):
+    html = await fetch(session, section_url)
+    soup = BeautifulSoup(html, 'lxml')
+
+    # Get the page counter
+    page_counter = soup.find('span', class_='count')
+    # Get the page counter's string
+    page_counter_span = page_counter.find('span')
+    page_counter_str = page_counter_span.text
+    # Get the start of the number representing the number of pages
+    page_count_start = page_counter_str.rfind(' ') + 1
+    # Return the number of pages
+    return int(page_counter_str[page_count_start:])
+
 async def parse_section(session, base_url, section, skins_dir):
     section_dir = os.path.join(skins_dir, os.path.basename(section))
     safe_mkdir(section_dir)
 
     section_url = base_url + section
-    html = await fetch(session, section_url)
-    soup = BeautifulSoup(html, 'lxml')
-
-    # Find all the skin blocks in the section.
-    skin_blocks = soup.find_all('div', class_='card')
-
-    # Loop over each skin block
-    for block in skin_blocks:
-        # Extract the relative URL of the skin
-        skin = block.find('a')['href']
-
-        # Create the section directories if they don't exist
-        skinDir = section_dir + skin
-        safe_mkdir(skinDir)
-
-        # Get the URL to the skin
-        skinURL = base_url + skin
-        skinResult = await fetch(session, skinURL)
-
-        # Get the name of the skin
-        skinName = skinResult[skinResult.find("<h2 class=\"card-title\">") + 23:]
-        skinName = skinName[:skinName.find('<')]
-
-        # Get the description for the skin
-        skinDescription = skinResult[skinResult.find("<p class=\"card-description\">") + 28:]
-        skinDescription = skinDescription[:skinDescription.find('<')]
-
-        # Create a text file containing the skin's name and description
-        with open(skinDir + "/meta.txt", 'w') as f:
-            f.write(f"Name: { skinName }\nDescription: { skinDescription }")
-
-        # Get the URL to the skin img
-        skinImgURL = skinURL + "/download"
-        path_to_file = skinDir + "/skin.png"
-        await download_image(session, skinImgURL, path_to_file)
-
-        # Create a text file containing the skin's name and description
-        with open(os.path.join(skinDir, "meta.txt"), 'w') as f:
-            f.write(f"Name: { skinName }\nDescription: { skinDescription }")
-
-        # Get the URL to the skin img
-        skinImgURL = skinURL + "/download"
-        path_to_file = os.path.join(skinDir, "skin.png")
-        await download_image(session, skinImgURL, path_to_file)
-
-    # Pagination: Continue to the next page if a 'next' button is present.
-    next_button = soup.find('a', string='Next')
-    if next_button and next_button.has_attr('href'):
-        next_page_url = base_url + next_button['href']
-        await parse_section(session, base_url, next_page_url, skins_dir)
+    num_pages = await get_num_pages(session, section_url)
+
+    # Loop over all pages
+    for i in range(1, num_pages + 1):
+        section_page_url = f"{ section_url }/{ i }"
+        html = await fetch(session, section_page_url)
+        soup = BeautifulSoup(html, 'lxml')
+
+        # Find all the skin blocks in the section.
+        skin_blocks = soup.find_all('div', class_='card')
+
+        # Loop over each skin block
+        for block in skin_blocks:
+            # Extract the relative URL of the skin
+            skin = block.find('a')['href']
+
+            # Create the section directories if they don't exist
+            skinDir = section_dir + skin
+            safe_mkdir(skinDir)
+
+            # Get the URL to the skin
+            skinURL = base_url + skin
+            skinResult = await fetch(session, skinURL)
+
+            # Get the name of the skin
+            skinName = skinResult[skinResult.find("<h2 class=\"card-title\">") + 23:]
+            skinName = skinName[:skinName.find('<')]
+
+            # Get the description for the skin
+            skinDescription = skinResult[skinResult.find("<p class=\"card-description\">") + 28:]
+            skinDescription = skinDescription[:skinDescription.find('<')]
+
+            # Create a text file containing the skin's name and description
+            with open(skinDir + "/meta.txt", 'w') as f:
+                f.write(f"Name: { skinName }\nDescription: { skinDescription }")
+
+            # Get the URL to the skin img
+            skinImgURL = skinURL + "/download"
+            path_to_file = skinDir + "/skin.png"
+            await download_image(session, skinImgURL, path_to_file)
+
+            # Create a text file containing the skin's name and description
+            with open(os.path.join(skinDir, "meta.txt"), 'w') as f:
+                f.write(f"Name: { skinName }\nDescription: { skinDescription }")
+
+            # Get the URL to the skin img
+            skinImgURL = skinURL + "/download"
+            path_to_file = os.path.join(skinDir, "skin.png")
+            await download_image(session, skinImgURL, path_to_file)
 
 async def main():
     print("Starting the script.")
@@ -110,4 +123,4 @@ async def main():
 try:
     asyncio.run(main())
 except Exception as e:
-    print(f"An error occurred: {e}")
+    print(f"An error occurred: {e}")

sync-scraper.py

Lines changed: 114 additions & 0 deletions
@@ -0,0 +1,114 @@
+import requests
+import shutil
+import os
+
+def removeWhitespace(string):
+    string = string.replace('\n', '');
+    string = string.replace('\t', '');
+    string = string.replace(' ', '');
+    return string
+
+def safeMkdir(directory):
+    if not os.path.isdir(directory):
+        os.mkdir(directory)
+
+# Creates the skins directory if it doesn't exist
+skinsDir = "skins"
+safeMkdir(skinsDir)
+
+print("Getting the navbar!")
+# The URL to the website
+url = "https://www.minecraftskins.net"
+# Get the HTML from the website
+result = requests.get(url).text
+# Remove whitespace and unnecessary characters
+result = removeWhitespace(result)
+
+# Get the navbar
+navbar = result[result.find("<navclass=\"main\">"):result.find("</nav>")]
+print("Got the navbar!")
+
+print("Getting the sections!")
+# Get the navbar's sections
+sections = navbar.split("<li>")
+# Remove the unnecessary sections at the start and end
+sections.pop(0)
+sections.pop()
+# Remove the HTML surrounding the hrefs
+sections = [section[8:] for section in sections]
+sections = [section[:section.find('"')] for section in sections]
+print("Got the sections!")
+
+# Loop over each section
+for section in sections:
+    print(f"Getting section { section[9:] }")
+    # Create the section directories if they don't exist
+    sectionDir = skinsDir + section[9:]
+    safeMkdir(sectionDir)
+
+    # Get the URL to the section
+    sectionURL = url + section
+    # Get the HTML from the section
+    sectionResult = requests.get(sectionURL).text
+    # Remove whitespace and unnecessary characters
+    sectionResult = removeWhitespace(sectionResult)
+
+    # Get the counter
+    sectionCounter = sectionResult[sectionResult.find("<spanclass=\"count\">"):]
+
+    # Get the number of pages
+    numPages = sectionCounter[sectionCounter.find("of") + 2:]
+    numPages = int(numPages[:numPages.find('<')])
+
+    # Loop over each page
+    for i in range(1, numPages + 1):
+        print(f"\tGetting page { i }")
+        # Get the URL to the page
+        pageURL = f"{ sectionURL }/{ i }"
+        # Get the HTML from the section
+        pageResult = requests.get(pageURL).text
+        # Remove whitespace and unnecessary characters
+        pageResult = removeWhitespace(pageResult)
+
+        # Get the skin section
+        pageSection = pageResult[pageResult.find("<divclass=\"rowgrid\">"):]
+
+        # Get the skins
+        skins = pageSection.split("<aclass=\"panel-link\"href=\"")
+        skins = [skin[:skin.find('"')] for skin in skins]
+        # Remove the unnecessary sections at the start
+        skins.pop(0)
+
+        # Loop over each skin
+        for skin in skins:
+            print(f"\t\tGetting skin { skin }")
+            # Create the section directories if they don't exist
+            skinDir = sectionDir + skin
+            safeMkdir(skinDir)
+
+            # Get the URL to the skin
+            skinURL = url + skin
+            skinResult = requests.get(skinURL).text
+
+            # Get the name of the skin
+            skinName = skinResult[skinResult.find("<h2 class=\"hero-title\">") + 23:]
+            skinName = skinName[:skinName.find('<')]
+
+            # Get the description for the skin
+            skinDescription = skinResult[skinResult.find("<p class=\"card-description\">") + 28:]
+            skinDescription = skinDescription[:skinDescription.find('<')]
+
+            # Create a text file containing the skin's name and description
+            with open(skinDir + "/meta.txt", 'w') as f:
+                f.write(f"Name: { skinName }\nDescription: { skinDescription }")
+
+            # Get the URL to the skin img
+            skinImgURL = skinURL + "/download"
+            skinImgResult = requests.get(skinImgURL, stream=True).raw
+            skinImgResult.decode_content = True
+            # Save the skin img
+            with open(skinDir + "/skin.png", "wb") as f:
+                shutil.copyfileobj(skinImgResult, f)
+            print(f"\t\tGot skin { skin }")
+        print(f"\tGot page { i }")
+    print(f"Got section { section[9:] }")
