Skip to content

Instantly share code, notes, and snippets.

@mohsinkhann12
Created February 21, 2025 13:28
Show Gist options
  • Save mohsinkhann12/04c7390af8fc58c9c1d99bb6ace217d7 to your computer and use it in GitHub Desktop.
Export a public ScribeHow scribe as a Word or PDF file
import io
import json
import os
import sys
import tkinter as tk
from tkinter import simpledialog

import requests
from bs4 import BeautifulSoup
from docx import Document
from docx.shared import Inches, RGBColor, Pt
from fpdf import FPDF
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
def get_user_input():
    """Prompt the user (via tkinter dialogs) for a URL and an output format.

    Returns:
        tuple: (url, choice) where url is the string the user entered (or
        None if the URL dialog was cancelled) and choice is the chosen
        format, lower-cased and stripped ("" if that dialog was cancelled).
    """
    root = tk.Tk()
    root.withdraw()  # hide the empty root window; only the dialogs are shown
    url = simpledialog.askstring("Input", "Enter the URL of the ScribeHow page:")
    choice = simpledialog.askstring("Input", "Choose file format (word/pdf):")
    root.destroy()  # bug fix: release the Tk interpreter once the dialogs close
    # Bug fix: askstring returns None when the dialog is cancelled, which
    # previously crashed on .lower(). Treat a cancelled choice as "".
    return url, (choice or "").lower().strip()
def print_progress(message):
    """Print *message* immediately, flushing stdout so progress shows live."""
    print(message, flush=True)
def scrape_scribehow(url):
    """Scrape the title, description and steps from a public ScribeHow page.

    Parameters:
        url: URL of the ScribeHow scribe to fetch.

    Returns:
        dict with optional keys 'title', 'description' and 'steps' (a list
        of {'step': str, 'img': str-or-None}), or {'error': str} when the
        page could not be fetched.
    """
    content_data = {}
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"}
    # Retry transient server errors (5 attempts, exponential backoff).
    session = requests.Session()
    retries = Retry(total=5, backoff_factor=1, status_forcelist=[500, 502, 503, 504])
    session.mount('https://', HTTPAdapter(max_retries=retries))
    try:
        print_progress("Fetching webpage...")
        response = session.get(url, headers=headers, timeout=10)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        return {"error": f"Failed to fetch the webpage: {e}"}
    soup = BeautifulSoup(response.text, 'html.parser')
    heading = soup.find('h1')
    if heading:
        content_data['title'] = heading.text.strip()
    # NOTE(review): assumes the first <span> on the page is the description —
    # fragile; verify against the current ScribeHow page layout.
    desc = soup.find('span')
    if desc:
        content_data['description'] = desc.text.strip()
    main_div = soup.find('div', class_='flex w-full flex-col items-center')
    if main_div:
        content_data['steps'] = []
        for div in main_div.find_all('div', recursive=False):
            step = div.find('div', class_="bg-opacity-0")
            if not step:
                continue  # guard clause: skip wrapper divs with no step content
            step_no_elem = step.find('span', class_="text-lg")
            step_text_elem = step.find('span', class_="action-description")
            if step_no_elem and step_text_elem:
                step_no = step_no_elem.text.strip()
                step_text = step_text_elem.text.strip()
                image = div.find('img')
                # Bug fix: image['src'] raised KeyError for an <img> with no
                # src attribute; .get() yields None instead.
                image_src = image.get('src') if image else None
                content_data['steps'].append({
                    'step': f"Step {step_no}: {step_text}",
                    'img': image_src,
                })
    return content_data
def save_to_word(data, filename='scraped_data.docx'):
    """Write the scraped data to a Word document.

    Parameters:
        data: dict produced by scrape_scribehow; 'title', 'description'
              and 'steps' keys are all optional.
        filename: output .docx path.
    """
    print_progress("Saving data to Word file...")
    doc = Document()
    if 'title' in data:
        title = doc.add_heading(data['title'], level=1)
        title.runs[0].font.size = Pt(24)
        title.runs[0].font.underline = True
        title.runs[0].font.color.rgb = RGBColor(0, 0, 0)
    if 'description' in data:
        doc.add_paragraph(data['description'])
    if 'steps' in data:
        headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"}
        # Same retry policy as the page fetch, for the step images.
        session = requests.Session()
        retries = Retry(total=5, backoff_factor=1, status_forcelist=[500, 502, 503, 504])
        session.mount('https://', HTTPAdapter(max_retries=retries))
        for step in data['steps']:
            stepp = doc.add_heading(step['step'], level=2)
            stepp.runs[0].font.color.rgb = RGBColor(0, 0, 0)
            if step['img']:
                try:
                    response = session.get(step['img'], headers=headers, timeout=10)
                    response.raise_for_status()
                    # Bug fix: feed the image to python-docx straight from
                    # memory instead of leaving a stray temp_image.jpg on disk
                    # (add_picture accepts a file-like object).
                    doc.add_picture(io.BytesIO(response.content), width=Inches(6))
                except requests.exceptions.RequestException as e:
                    print_progress(f"Failed to download image: {e}")
    doc.save(filename)
    print_progress("Word file saved successfully.")
def save_to_pdf(data, filename='scraped_data.pdf'):
    """Write the scraped data to a PDF file.

    Parameters:
        data: dict produced by scrape_scribehow; 'title', 'description'
              and 'steps' keys are all optional.
        filename: output .pdf path.

    NOTE(review): classic FPDF only supports latin-1 text, so non-ASCII
    step text may raise — confirm fpdf2 is the installed package.
    """
    print_progress("Saving data to PDF file...")
    pdf = FPDF()
    pdf.set_auto_page_break(auto=True, margin=15)
    pdf.add_page()
    pdf.set_font("Arial", style='B', size=16)
    if 'title' in data:
        pdf.cell(200, 10, data['title'], ln=True, align='C')
        pdf.ln(10)
    pdf.set_font("Arial", size=12)
    if 'description' in data:
        pdf.multi_cell(0, 10, data['description'])
    if 'steps' in data:
        headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"}
        # Same retry policy as the page fetch, for the step images.
        session = requests.Session()
        retries = Retry(total=5, backoff_factor=1, status_forcelist=[500, 502, 503, 504])
        session.mount('https://', HTTPAdapter(max_retries=retries))
        image_path = "temp_image.jpg"  # scratch file reused for every image
        try:
            for step in data['steps']:
                pdf.ln(5)
                pdf.set_font("Arial", style='B', size=12)
                pdf.cell(0, 10, step['step'], ln=True)
                pdf.set_font("Arial", size=12)
                if step['img']:
                    try:
                        response = session.get(step['img'], headers=headers, timeout=10)
                        response.raise_for_status()
                        with open(image_path, 'wb') as img_file:
                            img_file.write(response.content)
                        pdf.image(image_path, x=10, w=100)
                    except requests.exceptions.RequestException as e:
                        print_progress(f"Failed to download image: {e}")
        finally:
            # Bug fix: the scratch image file was previously left behind.
            if os.path.exists(image_path):
                os.remove(image_path)
    pdf.output(filename)
    print_progress("PDF file saved successfully.")
def main():
    """Entry point: prompt the user, scrape the page, and export the file."""
    url, choice = get_user_input()
    print_progress("Scraping...")
    data = scrape_scribehow(url)
    # Flat if/elif chain replaces the original nested if/else.
    if "error" in data:
        print_progress(data["error"])
    elif choice == "word":
        save_to_word(data)
    elif choice == "pdf":
        save_to_pdf(data)
    else:
        print_progress("Invalid choice. Please choose either 'word' or 'pdf'.")


# Bug fix: run only when executed as a script, not on import.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment