Python script to download LowRes NX ROMs (.nx files) and their thumbnail images from lowresnx.inutilis.com (just change the max topic ID, end_id in main(); a usage note follows the script).
import os
import requests
from bs4 import BeautifulSoup
import time
import re
import urllib.parse
from PIL import Image
from io import BytesIO

def download_and_convert_image(url, filename, folder="downloads"):
    """Download an image and convert it to PNG if it's not already in PNG format."""
    try:
        # Create folder if it doesn't exist
        if not os.path.exists(folder):
            os.makedirs(folder)

        # Construct the full file path
        filepath = os.path.join(folder, filename)

        # Check if file already exists
        if os.path.exists(filepath):
            print(f"File already exists: {filepath}")
            return True

        # Download the image
        response = requests.get(url, stream=True)
        response.raise_for_status()

        # Check if the image is already a PNG
        content_type = response.headers.get('content-type', '').lower()
        if 'image/png' in content_type or url.lower().endswith('.png'):
            # If it's already a PNG, just save it
            with open(filepath, 'wb') as f:
                for chunk in response.iter_content(chunk_size=8192):
                    f.write(chunk)
        else:
            # If it's not a PNG, convert it
            try:
                image = Image.open(BytesIO(response.content))
                # Save as PNG
                image.save(filepath, 'PNG')
            except Exception as e:
                print(f"Error converting image to PNG: {e}")
                # Fallback to saving the original image
                with open(filepath, 'wb') as f:
                    for chunk in response.iter_content(chunk_size=8192):
                        f.write(chunk)

        print(f"Successfully downloaded and processed: {filepath}")
        return True
    except Exception as e:
        print(f"Error downloading/processing {url} to {filename}: {e}")
        return False

def download_file(url, filename, folder="downloads"):
    """Download a file from the given URL and save it with the given filename."""
    try:
        # Create folder if it doesn't exist
        if not os.path.exists(folder):
            os.makedirs(folder)

        # Construct the full file path
        filepath = os.path.join(folder, filename)

        # Check if file already exists
        if os.path.exists(filepath):
            print(f"File already exists: {filepath}")
            return True

        # Download the file
        response = requests.get(url, stream=True)
        response.raise_for_status()  # Raise an exception for HTTP errors

        # Save the file
        with open(filepath, 'wb') as f:
            for chunk in response.iter_content(chunk_size=8192):
                f.write(chunk)

        print(f"Successfully downloaded: {filepath}")
        return True
    except Exception as e:
        print(f"Error downloading {url} to {filename}: {e}")
        return False

def scrape_topic_page(topic_id, base_url="https://lowresnx.inutilis.com", folder="downloads"):
    """Scrape a single topic page and download any .nx files and .png thumbnails found."""
    url = f"{base_url}/topic.php?id={topic_id}"
    try:
        # Send a GET request to the topic page
        response = requests.get(url)
        response.raise_for_status()

        # Parse the HTML content
        soup = BeautifulSoup(response.text, 'html.parser')

        # Look for the attachment-info div
        attachment_info = soup.find('div', class_='attachment-info')
        if attachment_info:
            # Find the download link
            download_link = attachment_info.find('a', href=re.compile(r'\.nx$'))
            if download_link:
                # Extract the .nx file URL and the download attribute
                nx_file_url = download_link['href']
                nx_filename = download_link.get('download', os.path.basename(nx_file_url))

                # Construct the full URL for the .nx file
                full_nx_url = f"{base_url}/{nx_file_url}" if not nx_file_url.startswith('http') else nx_file_url

                # Download the .nx file
                nx_downloaded = download_file(full_nx_url, nx_filename, folder)

                # If the .nx file was downloaded successfully, try to download the associated image
                if nx_downloaded:
                    # Look for the screenshot image
                    screenshot_img = soup.find('img', class_='screenshot')
                    if screenshot_img and 'src' in screenshot_img.attrs:
                        # Get the image URL from the src attribute
                        img_src = screenshot_img['src']

                        # Construct the full image URL
                        full_img_url = f"{base_url}/{img_src}" if not img_src.startswith('http') else img_src

                        # Create PNG filename based on the .nx filename
                        png_filename = nx_filename.replace('.nx', '.png')

                        # Download and potentially convert the image
                        download_and_convert_image(full_img_url, png_filename, folder)
                    else:
                        # Fallback: guess the thumbnail URL by replacing .nx with .png
                        png_file_url = nx_file_url.replace('.nx', '.png')
                        png_filename = nx_filename.replace('.nx', '.png')
                        full_png_url = f"{base_url}/{png_file_url}" if not png_file_url.startswith('http') else png_file_url

                        # Download the .png file
                        download_file(full_png_url, png_filename, folder)

                    return True

        print(f"No attachment found or download failed for topic ID: {topic_id}")
        return False
    except Exception as e:
        print(f"Error processing topic ID {topic_id}: {e}")
        return False

def main():
    # Create a folder for the downloads
    download_folder = "lowresnx_downloads"
    if not os.path.exists(download_folder):
        os.makedirs(download_folder)

    # Check if the Pillow library is installed
    # (note: the top-level "from PIL import Image" will already have failed if it is missing)
    try:
        import PIL
    except ImportError:
        print("PIL (Pillow) library is not installed. Installing it now...")
        # pip.main() was removed in pip 10+, so invoke pip through the current interpreter instead
        import subprocess
        import sys
        subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'pillow'])
        print("Pillow installed successfully.")

    # Set the range of topic IDs to scrape
    start_id = 1
    end_id = 3676

    total_topics = end_id - start_id + 1
    successful_downloads = 0

    print(f"Starting to scrape {total_topics} topic pages...")

    # Loop through each topic ID
    for topic_id in range(start_id, end_id + 1):
        print(f"Processing topic ID: {topic_id} ({topic_id - start_id + 1}/{total_topics})")

        # Scrape the topic page, saving downloads into the folder created above
        success = scrape_topic_page(topic_id, folder=download_folder)
        if success:
            successful_downloads += 1

        # Add a small delay to avoid hammering the server
        time.sleep(1)

    print(f"Scraping completed. Successfully downloaded files from {successful_downloads} out of {total_topics} topics.")


if __name__ == "__main__":
    main()
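
Usage note: the script depends on the third-party packages requests, beautifulsoup4, and Pillow (all installable from PyPI via pip); the remaining imports are standard-library modules. Run as-is, it walks topic IDs from start_id to end_id and saves the .nx files and .png thumbnails into the lowresnx_downloads folder. For a quick test of a single topic before scraping the whole range, the scraper function can also be called directly; a minimal sketch, assuming the script was saved as lowresnx_downloader.py (that filename, the topic ID, and the folder name are illustrative, not part of the original gist):

    # Import the scraper from the saved script; nothing runs on import thanks to the __main__ guard
    from lowresnx_downloader import scrape_topic_page

    # Fetch one topic page and download its .nx attachment and thumbnail, if present
    scrape_topic_page(42, folder="test_downloads")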