Skip to content

Instantly share code, notes, and snippets.

@mohsinkhann12
Created February 21, 2025 13:28
Show Gist options
  • Save mohsinkhann12/04c7390af8fc58c9c1d99bb6ace217d7 to your computer and use it in GitHub Desktop.
Export a public ScribeHow scribe as a Word or PDF file
import io
import json
import os
import sys
import tkinter as tk
from tkinter import simpledialog

import requests
from bs4 import BeautifulSoup
from docx import Document
from docx.shared import Inches, RGBColor, Pt
from fpdf import FPDF
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
def get_user_input():
    """Prompt the user (via tkinter dialogs) for a URL and an output format.

    Returns:
        tuple: (url, choice) where url is the string the user entered (or
        None if the URL dialog was cancelled) and choice is the chosen
        format, lower-cased and stripped ("" if that dialog was cancelled).
    """
    root = tk.Tk()
    root.withdraw()  # hide the empty root window; only the dialogs are shown
    url = simpledialog.askstring("Input", "Enter the URL of the ScribeHow page:")
    choice = simpledialog.askstring("Input", "Choose file format (word/pdf):")
    root.destroy()  # bug fix: release the Tk interpreter once the dialogs close
    # Bug fix: askstring returns None when the dialog is cancelled, which
    # previously crashed on .lower(). Treat a cancelled choice as "".
    return url, (choice or "").lower().strip()
def print_progress(message):
    """Print *message* immediately, flushing stdout so progress shows live."""
    print(message, flush=True)
def scrape_scribehow(url):
    """Scrape the title, description and steps from a public ScribeHow page.

    Parameters:
        url: URL of the ScribeHow scribe to fetch.

    Returns:
        dict with optional keys 'title', 'description' and 'steps' (a list
        of {'step': str, 'img': str-or-None}), or {'error': str} when the
        page could not be fetched.
    """
    content_data = {}
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"}
    # Retry transient server errors (5 attempts, exponential backoff).
    session = requests.Session()
    retries = Retry(total=5, backoff_factor=1, status_forcelist=[500, 502, 503, 504])
    session.mount('https://', HTTPAdapter(max_retries=retries))
    try:
        print_progress("Fetching webpage...")
        response = session.get(url, headers=headers, timeout=10)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        return {"error": f"Failed to fetch the webpage: {e}"}
    soup = BeautifulSoup(response.text, 'html.parser')
    heading = soup.find('h1')
    if heading:
        content_data['title'] = heading.text.strip()
    # NOTE(review): assumes the first <span> on the page is the description —
    # fragile; verify against the current ScribeHow page layout.
    desc = soup.find('span')
    if desc:
        content_data['description'] = desc.text.strip()
    main_div = soup.find('div', class_='flex w-full flex-col items-center')
    if main_div:
        content_data['steps'] = []
        for div in main_div.find_all('div', recursive=False):
            step = div.find('div', class_="bg-opacity-0")
            if not step:
                continue  # guard clause: skip wrapper divs with no step content
            step_no_elem = step.find('span', class_="text-lg")
            step_text_elem = step.find('span', class_="action-description")
            if step_no_elem and step_text_elem:
                step_no = step_no_elem.text.strip()
                step_text = step_text_elem.text.strip()
                image = div.find('img')
                # Bug fix: image['src'] raised KeyError for an <img> with no
                # src attribute; .get() yields None instead.
                image_src = image.get('src') if image else None
                content_data['steps'].append({
                    'step': f"Step {step_no}: {step_text}",
                    'img': image_src,
                })
    return content_data
def save_to_word(data, filename='scraped_data.docx'):
    """Write the scraped data to a Word document.

    Parameters:
        data: dict produced by scrape_scribehow; 'title', 'description'
              and 'steps' keys are all optional.
        filename: output .docx path.
    """
    print_progress("Saving data to Word file...")
    doc = Document()
    if 'title' in data:
        title = doc.add_heading(data['title'], level=1)
        title.runs[0].font.size = Pt(24)
        title.runs[0].font.underline = True
        title.runs[0].font.color.rgb = RGBColor(0, 0, 0)
    if 'description' in data:
        doc.add_paragraph(data['description'])
    if 'steps' in data:
        headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"}
        # Same retry policy as the page fetch, for the step images.
        session = requests.Session()
        retries = Retry(total=5, backoff_factor=1, status_forcelist=[500, 502, 503, 504])
        session.mount('https://', HTTPAdapter(max_retries=retries))
        for step in data['steps']:
            stepp = doc.add_heading(step['step'], level=2)
            stepp.runs[0].font.color.rgb = RGBColor(0, 0, 0)
            if step['img']:
                try:
                    response = session.get(step['img'], headers=headers, timeout=10)
                    response.raise_for_status()
                    # Bug fix: feed the image to python-docx straight from
                    # memory instead of leaving a stray temp_image.jpg on disk
                    # (add_picture accepts a file-like object).
                    doc.add_picture(io.BytesIO(response.content), width=Inches(6))
                except requests.exceptions.RequestException as e:
                    print_progress(f"Failed to download image: {e}")
    doc.save(filename)
    print_progress("Word file saved successfully.")
def save_to_pdf(data, filename='scraped_data.pdf'):
    """Write the scraped data to a PDF file.

    Parameters:
        data: dict produced by scrape_scribehow; 'title', 'description'
              and 'steps' keys are all optional.
        filename: output .pdf path.

    NOTE(review): classic FPDF only supports latin-1 text, so non-ASCII
    step text may raise — confirm fpdf2 is the installed package.
    """
    print_progress("Saving data to PDF file...")
    pdf = FPDF()
    pdf.set_auto_page_break(auto=True, margin=15)
    pdf.add_page()
    pdf.set_font("Arial", style='B', size=16)
    if 'title' in data:
        pdf.cell(200, 10, data['title'], ln=True, align='C')
        pdf.ln(10)
    pdf.set_font("Arial", size=12)
    if 'description' in data:
        pdf.multi_cell(0, 10, data['description'])
    if 'steps' in data:
        headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"}
        # Same retry policy as the page fetch, for the step images.
        session = requests.Session()
        retries = Retry(total=5, backoff_factor=1, status_forcelist=[500, 502, 503, 504])
        session.mount('https://', HTTPAdapter(max_retries=retries))
        image_path = "temp_image.jpg"  # scratch file reused for every image
        try:
            for step in data['steps']:
                pdf.ln(5)
                pdf.set_font("Arial", style='B', size=12)
                pdf.cell(0, 10, step['step'], ln=True)
                pdf.set_font("Arial", size=12)
                if step['img']:
                    try:
                        response = session.get(step['img'], headers=headers, timeout=10)
                        response.raise_for_status()
                        with open(image_path, 'wb') as img_file:
                            img_file.write(response.content)
                        pdf.image(image_path, x=10, w=100)
                    except requests.exceptions.RequestException as e:
                        print_progress(f"Failed to download image: {e}")
        finally:
            # Bug fix: the scratch image file was previously left behind.
            if os.path.exists(image_path):
                os.remove(image_path)
    pdf.output(filename)
    print_progress("PDF file saved successfully.")
def main():
    """Entry point: prompt the user, scrape the page, and export the file."""
    url, choice = get_user_input()
    print_progress("Scraping...")
    data = scrape_scribehow(url)
    # Flat if/elif chain replaces the original nested if/else.
    if "error" in data:
        print_progress(data["error"])
    elif choice == "word":
        save_to_word(data)
    elif choice == "pdf":
        save_to_pdf(data)
    else:
        print_progress("Invalid choice. Please choose either 'word' or 'pdf'.")


# Bug fix: run only when executed as a script, not on import.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment