@amkisko
Created December 16, 2025 07:23
Extract Sentry API specs from docs.sentry.io as markdown files
#!/usr/bin/env ruby
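# Crawl the Sentry API documentation (https://docs.sentry.io/api/), collect every
# API page URL from the sidebar, and convert each page into a markdown spec file.
# Pipeline: Step 0 fetches /api/ and reads its sidebar, Step 0.5 fetches each
# category page for nested endpoint URLs, Step 1 downloads every page as raw HTML,
# Step 2 converts the HTML into markdown under ./api_spec, then the raw HTML cache
# is removed.
#
# Usage sketch (assumes a Ruby install with the nokogiri gem; curl is only used as
# a fallback fetcher; the filename below is just whatever you save this gist as):
#   gem install nokogiri
#   ruby extract_sentry_api_specs.rb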
require "fileutils"
require "nokogiri"
require "uri"
BASE_URL = "https://docs.sentry.io"
API_BASE_URL = "#{BASE_URL}/api"
OUTPUT_DIR = File.join(File.dirname(__FILE__), "api_spec")
TEMP_DIR = File.join(OUTPUT_DIR, "raw_html")
FileUtils.mkdir_p(OUTPUT_DIR)
FileUtils.mkdir_p(TEMP_DIR)
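# Example mappings (illustrative): "https://docs.sentry.io/api/" becomes "index",
# "https://docs.sentry.io/api/projects/list-your-projects/" becomes "projects_list-your-projects".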
def sanitize_filename(url)
  # Extract a safe filename from URL
  parts = url.gsub("https://docs.sentry.io/api/", "").gsub(/\/$/, "").split("/")
  filename = parts.join("_").gsub(/[^a-z0-9_-]/, "_")
  # Handle root/blank page - use "index"
  filename = "index" if filename.empty? || filename == "_"
  filename
end
def fetch_with_curl(url)
  filename = File.join(TEMP_DIR, "#{sanitize_filename(url)}.html")
  puts "Fetching: #{url}"
  begin
    require "open-uri"
    require "openssl"
    html = URI.open(url,
      "User-Agent" => "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36",
      :ssl_verify_mode => OpenSSL::SSL::VERIFY_NONE).read
    File.write(filename, html)
    puts " Saved raw HTML to: #{filename}"
    html
  rescue => e
    puts " Error fetching #{url}: #{e.message}"
    # Try with curl as fallback (with -k to ignore SSL)
    if system("curl", "-k", "-s", "-L", "-H", "User-Agent: Mozilla/5.0", url, "-o", filename)
      if File.exist?(filename) && File.size(filename) > 0
        puts " Saved raw HTML to: #{filename} (via curl)"
        File.read(filename)
      end
    end
  end
end
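# The selectors below assume the docs.sentry.io sidebar markup looks roughly like
# the sketch here (inferred from the attributes queried below, not an official contract):
#   <li data-sidebar-branch="true">
#     <a href="/api/">Sentry API</a>
#     <ul data-sidebar-tree>
#       <li data-sidebar-branch="true"><a href="/api/...">Category</a>...</li>
#     </ul>
#   </li>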
def extract_api_urls_from_sidebar(html)
  # Force UTF-8 encoding
  html = html.force_encoding("UTF-8")
  # Extract body HTML and remove script/style/svg
  body_match = html.match(/<body[^>]*>([\s\S]*)<\/body>/i)
  return [] unless body_match
  body_html = body_match[1]
  body_html = body_html.gsub(/<script[^>]*>[\s\S]*?<\/script>/i, "")
  body_html = body_html.gsub(/<style[^>]*>[\s\S]*?<\/style>/i, "")
  body_html = body_html.gsub(/<svg[^>]*>[\s\S]*?<\/svg>/i, "")
  body_html = body_html.gsub(/<svg[^>]*\/>/i, "")
  # Parse the cleaned body HTML
  doc = Nokogiri::HTML("<body>#{body_html}</body>")
  body = doc.css("body").first
  return [] unless body
  urls = []
  # Find the "Sentry API" sidebar section
  # Look for a link with href="/api/" that contains "Sentry API" text
  api_section = body.xpath(".//a[contains(@href, '/api/') and contains(., 'Sentry API')]").first ||
    body.xpath(".//a[@href='/api/']").first
  return [] unless api_section
  # Find the parent <li> with data-sidebar-branch
  api_li = api_section.ancestors("li[data-sidebar-branch='true']").first ||
    api_section.parent
  return [] unless api_li
  # Find the nested <ul> with data-sidebar-tree inside this section
  api_ul = api_li.css("ul[data-sidebar-tree]").first
  if api_ul
    # Recursively extract all links from this section
    extract_links_recursive(api_ul, urls)
  end
  # Also include the main /api/ link itself
  urls << "/api/" unless urls.include?("/api/")
  # Convert relative URLs to absolute and normalize
  urls.map do |url|
    # Normalize: ensure it starts with /api/ and ends with /
    url = url.gsub(/\/$/, "") + "/" unless url.end_with?("/")
    url = "/api/" + url unless url.start_with?("/")
    if url.start_with?("/")
      "#{BASE_URL}#{url}"
    elsif url.start_with?("http")
      url
    else
      "#{API_BASE_URL}/#{url}"
    end
  end.uniq.sort
end
def extract_links_recursive(node, urls)
  # Extract all <a> tags with href attributes in this node and its children
  node.css("a[href]").each do |link|
    href = link["href"]
    next unless href
    # Only include API links (starting with /api/)
    if href.start_with?("/api/")
      # Normalize: remove trailing slash then add it back for consistency
      href = href.gsub(/\/$/, "") + "/" unless href.end_with?("/")
      urls << href unless urls.include?(href)
    end
  end
  # Recursively process nested lists (including those in CollapsibleSidebarLink components)
  node.css("ul[data-sidebar-tree], ul").each do |ul|
    extract_links_recursive(ul, urls)
  end
  # Also check for links in CollapsibleSidebarLink components (they might have nested children)
  node.css("li[data-sidebar-branch]").each do |li|
    extract_links_recursive(li, urls)
  end
end
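# Note: extract_body below is a standalone helper and is not called by the pipeline
# at the bottom of this script; purify_content does its own body extraction.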
def extract_body(html)
  doc = Nokogiri::HTML(html)
  # Remove script, style, and svg tags from the entire document
  doc.css("script, style, svg").remove
  # Extract body content
  body = doc.css("body").first
  body ? body.to_html : html
end
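# Converts one documentation page to markdown: headings become #/##/###/####, <p>
# text is kept as paragraphs, <pre> blocks become fenced code blocks (with the
# language taken from a language-* class when present), tables become pipe tables,
# lists become "-" bullets, and Sentry's api-block / api-info-row divs are rendered
# as an "Endpoint" section and a parameter list. Returns nil when fewer than 50
# characters of usable content are found.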
def purify_content(html, url)
  # Force UTF-8 encoding
  html = html.force_encoding("UTF-8")
  # First, extract body HTML as string and remove script/style/svg with regex
  # This ensures we work with clean HTML before parsing
  body_match = html.match(/<body[^>]*>([\s\S]*)<\/body>/i)
  return nil unless body_match
  body_html = body_match[1]
  # Remove script, style, and svg tags using regex (before parsing)
  body_html = body_html.gsub(/<script[^>]*>[\s\S]*?<\/script>/i, "")
  body_html = body_html.gsub(/<style[^>]*>[\s\S]*?<\/style>/i, "")
  body_html = body_html.gsub(/<svg[^>]*>[\s\S]*?<\/svg>/i, "")
  body_html = body_html.gsub(/<svg[^>]*\/>/i, "") # Self-closing svg tags
  # Now parse the cleaned body HTML
  doc = Nokogiri::HTML("<body>#{body_html}</body>")
  body = doc.css("body").first
  return nil unless body
  # Find main content - try multiple selectors
  main_content = body.xpath(".//*[@id='doc-content']").first ||
    body.css("#doc-content").first ||
    body.css("main #main").first ||
    body.css("article, main, .content, #content, .documentation-content, .markdown-body").first
  unless main_content
    return nil
  end
  # Remove navigation, breadcrumbs, and other non-content elements
  main_content.css(".breadcrumbs, .not-prose, nav, aside, .grid, header, footer").remove
  # Get the content div
  content_div = if main_content.css("#main").any?
    main_content.css("#main").first
  else
    main_content
  end
  unless content_div
    return nil
  end
  spec = []
  # Extract title from h1 or hgroup
  title_elem = main_content.css("h1").first || main_content.css("hgroup h1").first
  if title_elem
    title = title_elem.text.strip
    spec << "# #{title}\n\n" unless title.empty?
  end
  # Get all content elements in document order from #main or content_div
  main_section = content_div.css("#main").first || content_div
  # Get all headings, paragraphs, code blocks, and tables in order
  all_elements = main_section.css("h1, h2, h3, h4, h5, h6, p, pre, table, ul, ol, div.api-block, div.api-info-row, dl.api-params")
  all_elements.each do |node|
    next if node["class"]&.include?("breadcrumb") || node["class"]&.include?("not-prose")
    case node.name
    when "h1"
      text = node.text.strip
      spec << "# #{text}\n\n" unless text.empty?
    when "h2"
      node.css("a, svg").remove
      text = node.text.strip
      spec << "## #{text}\n\n" unless text.empty?
    when "h3"
      node.css("a, svg").remove
      text = node.text.strip
      spec << "### #{text}\n\n" unless text.empty?
    when "h4"
      node.css("a, svg").remove
      text = node.text.strip
      spec << "#### #{text}\n\n" unless text.empty?
    when "p"
      text = node.text.strip
      spec << "#{text}\n\n" if text.length > 10
    when "pre"
      code_elem = node.css("code").first || node
      code_text = code_elem.text.strip
      if code_text.length > 0
        lang = code_elem["class"]&.match(/language-(\w+)/)&.[](1) || ""
        spec << "```#{lang}\n#{code_text}\n```\n\n"
      end
    when "table"
      rows = node.css("tr").map do |tr|
        tr.css("td, th").map { |cell| cell.text.strip.gsub(/\s+/, " ") }
      end
      if rows.any? && rows.first.any?
        spec << "| " + rows.first.join(" | ") + " |\n"
        spec << "| " + rows.first.map { "---" }.join(" | ") + " |\n"
        rows[1..-1].each do |row|
          spec << "| " + row.join(" | ") + " |\n" if row.any?
        end
        spec << "\n"
      end
    when "ul", "ol"
      node.css("li").each do |li|
        text = li.text.strip
        spec << "- #{text}\n" if text.length > 0
      end
      spec << "\n" if node.css("li").any?
    when "div"
      # Handle API blocks and info rows
      if node["class"]&.include?("api-block")
        # Extract HTTP method and endpoint
        verb = node.css(".api-request-block-verb, .api-block-header").first&.text&.strip
        endpoint = node.css("span").map(&:text).join(" ").strip
        if verb && endpoint
          spec << "## Endpoint\n\n```\n#{verb} #{endpoint}\n```\n\n"
        end
      elsif node["class"]&.include?("api-info-row")
        # Extract parameter information
        heading = node.css("h3").first&.text&.strip
        if heading
          spec << "### #{heading}\n\n"
          node.css("dl.api-params dt").each do |dt|
            param_name = dt.css("code").first&.text&.strip
            param_type = dt.css("em").first&.text&.strip
            required = dt.css(".required").first ? "REQUIRED" : "OPTIONAL"
            dd = dt.next_element
            description = dd&.css("p")&.first&.text&.strip || dd&.text&.strip
            if param_name
              spec << "- **#{param_name}** (#{param_type}) - #{required}\n"
              spec << " #{description}\n\n" if description && description.length > 0
            end
          end
        end
      end
    end
  end
  # Clean up excessive newlines
  result = spec.join.gsub(/\n{3,}/, "\n\n").strip
  (result.length > 50) ? result : nil
end
# Step 0: Fetch main API page and extract all URLs from sidebar
puts "Step 0: Fetching main API page and extracting URLs from sidebar..."
main_api_html = fetch_with_curl("#{API_BASE_URL}/")
unless main_api_html
  puts "Error: Could not fetch main API page. Exiting."
  exit 1
end
# Extract top-level category URLs
category_urls = extract_api_urls_from_sidebar(main_api_html)
puts "Found #{category_urls.count} top-level category URLs"
# Step 0.5: For each category page, fetch it and extract nested endpoint URLs
puts "\nStep 0.5: Fetching category pages to extract nested endpoint URLs..."
all_urls = Set.new(category_urls)
category_urls.each do |category_url|
  next if category_url == "#{API_BASE_URL}/" # Skip the main page, we already have it
  puts " Fetching category: #{category_url}"
  category_html = fetch_with_curl(category_url)
  next unless category_html
  nested_urls = extract_api_urls_from_sidebar(category_html)
  nested_urls.each { |url| all_urls.add(url) }
  puts " Found #{nested_urls.count} URLs in #{category_url}"
end
URLS = all_urls.to_a.sort
puts "\nTotal unique API URLs found: #{URLS.count}"
puts "Sample URLs:"
URLS.first(10).each { |url| puts " - #{url}" }
puts " ..." if URLS.count > 10
if URLS.empty?
  puts "Warning: No URLs found in sidebar. Exiting."
  exit 1
end
# Step 1: Fetch all HTML files with curl
puts "\nStep 1: Fetching HTML files..."
URLS.each do |url|
  fetch_with_curl(url)
end
# Step 2: Extract body and purify
puts "\nStep 2: Extracting and purifying content..."
URLS.each do |url|
  html_file = File.join(TEMP_DIR, "#{sanitize_filename(url)}.html")
  next unless File.exist?(html_file)
  html = File.read(html_file)
  spec = purify_content(html, url)
  if spec && !spec.strip.empty?
    filename = File.join(OUTPUT_DIR, "#{sanitize_filename(url)}.md")
    File.write(filename, spec)
    puts " Extracted spec: #{File.basename(filename)}"
  else
    puts " Warning: Could not extract spec from #{url}"
  end
end
puts "\nDone! Specs saved to #{OUTPUT_DIR}"
puts "Total URLs processed: #{URLS.count}"
# Clean up: Remove raw_html folder
puts "\nCleaning up raw HTML files..."
if File.directory?(TEMP_DIR)
  FileUtils.rm_rf(TEMP_DIR)
  puts "Removed #{TEMP_DIR}"
end