@amkisko
Created December 16, 2025 07:23
Extract Sentry API specs from docs.sentry.io as markdown files
#!/usr/bin/env ruby
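# Crawl the Sentry API documentation (https://docs.sentry.io/api/), collect every
# API page URL from the sidebar, and convert each page into a markdown spec file.
# Pipeline: Step 0 fetches /api/ and reads its sidebar, Step 0.5 fetches each
# category page for nested endpoint URLs, Step 1 downloads every page as raw HTML,
# Step 2 converts the HTML into markdown under ./api_spec, then the raw HTML cache
# is removed.
#
# Usage sketch (assumes a Ruby install with the nokogiri gem; curl is only used as
# a fallback fetcher; the filename below is just whatever you save this gist as):
#   gem install nokogiri
#   ruby extract_sentry_api_specs.rb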
require "fileutils"
require "nokogiri"
require "uri"
BASE_URL = "https://docs.sentry.io"
API_BASE_URL = "#{BASE_URL}/api"
OUTPUT_DIR = File.join(File.dirname(__FILE__), "api_spec")
TEMP_DIR = File.join(OUTPUT_DIR, "raw_html")
FileUtils.mkdir_p(OUTPUT_DIR)
FileUtils.mkdir_p(TEMP_DIR)
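# Example mappings (illustrative): "https://docs.sentry.io/api/" becomes "index",
# "https://docs.sentry.io/api/projects/list-your-projects/" becomes "projects_list-your-projects".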
def sanitize_filename(url)
  # Extract a safe filename from URL
  parts = url.gsub("https://docs.sentry.io/api/", "").gsub(/\/$/, "").split("/")
  filename = parts.join("_").gsub(/[^a-z0-9_-]/, "_")
  # Handle root/blank page - use "index"
  filename = "index" if filename.empty? || filename == "_"
  filename
end
def fetch_with_curl(url)
  filename = File.join(TEMP_DIR, "#{sanitize_filename(url)}.html")
  puts "Fetching: #{url}"
  begin
    require "open-uri"
    require "openssl"
    html = URI.open(url,
      "User-Agent" => "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36",
      :ssl_verify_mode => OpenSSL::SSL::VERIFY_NONE).read
    File.write(filename, html)
    puts " Saved raw HTML to: #{filename}"
    html
  rescue => e
    puts " Error fetching #{url}: #{e.message}"
    # Try with curl as fallback (with -k to ignore SSL)
    if system("curl", "-k", "-s", "-L", "-H", "User-Agent: Mozilla/5.0", url, "-o", filename)
      if File.exist?(filename) && File.size(filename) > 0
        puts " Saved raw HTML to: #{filename} (via curl)"
        File.read(filename)
      end
    end
  end
end
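# The selectors below assume the docs.sentry.io sidebar markup looks roughly like
# the sketch here (inferred from the attributes queried below, not an official contract):
#   <li data-sidebar-branch="true">
#     <a href="/api/">Sentry API</a>
#     <ul data-sidebar-tree>
#       <li data-sidebar-branch="true"><a href="/api/...">Category</a>...</li>
#     </ul>
#   </li>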
def extract_api_urls_from_sidebar(html)
  # Force UTF-8 encoding
  html = html.force_encoding("UTF-8")
  # Extract body HTML and remove script/style/svg
  body_match = html.match(/<body[^>]*>([\s\S]*)<\/body>/i)
  return [] unless body_match
  body_html = body_match[1]
  body_html = body_html.gsub(/<script[^>]*>[\s\S]*?<\/script>/i, "")
  body_html = body_html.gsub(/<style[^>]*>[\s\S]*?<\/style>/i, "")
  body_html = body_html.gsub(/<svg[^>]*>[\s\S]*?<\/svg>/i, "")
  body_html = body_html.gsub(/<svg[^>]*\/>/i, "")
  # Parse the cleaned body HTML
  doc = Nokogiri::HTML("<body>#{body_html}</body>")
  body = doc.css("body").first
  return [] unless body
  urls = []
  # Find the "Sentry API" sidebar section
  # Look for a link with href="/api/" that contains "Sentry API" text
  api_section = body.xpath(".//a[contains(@href, '/api/') and contains(., 'Sentry API')]").first ||
    body.xpath(".//a[@href='/api/']").first
  return [] unless api_section
  # Find the parent <li> with data-sidebar-branch
  api_li = api_section.ancestors("li[data-sidebar-branch='true']").first ||
    api_section.parent
  return [] unless api_li
  # Find the nested <ul> with data-sidebar-tree inside this section
  api_ul = api_li.css("ul[data-sidebar-tree]").first
  if api_ul
    # Recursively extract all links from this section
    extract_links_recursive(api_ul, urls)
  end
  # Also include the main /api/ link itself
  urls << "/api/" unless urls.include?("/api/")
  # Convert relative URLs to absolute and normalize
  urls.map do |url|
    # Normalize: ensure it starts with /api/ and ends with /
    url = url.gsub(/\/$/, "") + "/" unless url.end_with?("/")
    url = "/api/" + url unless url.start_with?("/")
    if url.start_with?("/")
      "#{BASE_URL}#{url}"
    elsif url.start_with?("http")
      url
    else
      "#{API_BASE_URL}/#{url}"
    end
  end.uniq.sort
end
def extract_links_recursive(node, urls)
  # Extract all <a> tags with href attributes in this node and its children
  node.css("a[href]").each do |link|
    href = link["href"]
    next unless href
    # Only include API links (starting with /api/)
    if href.start_with?("/api/")
      # Normalize: remove trailing slash then add it back for consistency
      href = href.gsub(/\/$/, "") + "/" unless href.end_with?("/")
      urls << href unless urls.include?(href)
    end
  end
  # Recursively process nested lists (including those in CollapsibleSidebarLink components)
  node.css("ul[data-sidebar-tree], ul").each do |ul|
    extract_links_recursive(ul, urls)
  end
  # Also check for links in CollapsibleSidebarLink components (they might have nested children)
  node.css("li[data-sidebar-branch]").each do |li|
    extract_links_recursive(li, urls)
  end
end
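# Note: extract_body below is a standalone helper and is not called by the pipeline
# at the bottom of this script; purify_content does its own body extraction.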
def extract_body(html)
  doc = Nokogiri::HTML(html)
  # Remove script, style, and svg tags from the entire document
  doc.css("script, style, svg").remove
  # Extract body content
  body = doc.css("body").first
  body ? body.to_html : html
end
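# Converts one documentation page to markdown: headings become #/##/###/####, <p>
# text is kept as paragraphs, <pre> blocks become fenced code blocks (with the
# language taken from a language-* class when present), tables become pipe tables,
# lists become "-" bullets, and Sentry's api-block / api-info-row divs are rendered
# as an "Endpoint" section and a parameter list. Returns nil when fewer than 50
# characters of usable content are found.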
def purify_content(html, url)
  # Force UTF-8 encoding
  html = html.force_encoding("UTF-8")
  # First, extract body HTML as string and remove script/style/svg with regex
  # This ensures we work with clean HTML before parsing
  body_match = html.match(/<body[^>]*>([\s\S]*)<\/body>/i)
  return nil unless body_match
  body_html = body_match[1]
  # Remove script, style, and svg tags using regex (before parsing)
  body_html = body_html.gsub(/<script[^>]*>[\s\S]*?<\/script>/i, "")
  body_html = body_html.gsub(/<style[^>]*>[\s\S]*?<\/style>/i, "")
  body_html = body_html.gsub(/<svg[^>]*>[\s\S]*?<\/svg>/i, "")
  body_html = body_html.gsub(/<svg[^>]*\/>/i, "") # Self-closing svg tags
  # Now parse the cleaned body HTML
  doc = Nokogiri::HTML("<body>#{body_html}</body>")
  body = doc.css("body").first
  return nil unless body
  # Find main content - try multiple selectors
  main_content = body.xpath(".//*[@id='doc-content']").first ||
    body.css("#doc-content").first ||
    body.css("main #main").first ||
    body.css("article, main, .content, #content, .documentation-content, .markdown-body").first
  unless main_content
    return nil
  end
  # Remove navigation, breadcrumbs, and other non-content elements
  main_content.css(".breadcrumbs, .not-prose, nav, aside, .grid, header, footer").remove
  # Get the content div
  content_div = if main_content.css("#main").any?
    main_content.css("#main").first
  else
    main_content
  end
  unless content_div
    return nil
  end
  spec = []
  # Extract title from h1 or hgroup
  title_elem = main_content.css("h1").first || main_content.css("hgroup h1").first
  if title_elem
    title = title_elem.text.strip
    spec << "# #{title}\n\n" unless title.empty?
  end
  # Get all content elements in document order from #main or content_div
  main_section = content_div.css("#main").first || content_div
  # Get all headings, paragraphs, code blocks, and tables in order
  all_elements = main_section.css("h1, h2, h3, h4, h5, h6, p, pre, table, ul, ol, div.api-block, div.api-info-row, dl.api-params")
  all_elements.each do |node|
    next if node["class"]&.include?("breadcrumb") || node["class"]&.include?("not-prose")
    case node.name
    when "h1"
      text = node.text.strip
      spec << "# #{text}\n\n" unless text.empty?
    when "h2"
      node.css("a, svg").remove
      text = node.text.strip
      spec << "## #{text}\n\n" unless text.empty?
    when "h3"
      node.css("a, svg").remove
      text = node.text.strip
      spec << "### #{text}\n\n" unless text.empty?
    when "h4"
      node.css("a, svg").remove
      text = node.text.strip
      spec << "#### #{text}\n\n" unless text.empty?
    when "p"
      text = node.text.strip
      spec << "#{text}\n\n" if text.length > 10
    when "pre"
      code_elem = node.css("code").first || node
      code_text = code_elem.text.strip
      if code_text.length > 0
        lang = code_elem["class"]&.match(/language-(\w+)/)&.[](1) || ""
        spec << "```#{lang}\n#{code_text}\n```\n\n"
      end
    when "table"
      rows = node.css("tr").map do |tr|
        tr.css("td, th").map { |cell| cell.text.strip.gsub(/\s+/, " ") }
      end
      if rows.any? && rows.first.any?
        spec << "| " + rows.first.join(" | ") + " |\n"
        spec << "| " + rows.first.map { "---" }.join(" | ") + " |\n"
        rows[1..-1].each do |row|
          spec << "| " + row.join(" | ") + " |\n" if row.any?
        end
        spec << "\n"
      end
    when "ul", "ol"
      node.css("li").each do |li|
        text = li.text.strip
        spec << "- #{text}\n" if text.length > 0
      end
      spec << "\n" if node.css("li").any?
    when "div"
      # Handle API blocks and info rows
      if node["class"]&.include?("api-block")
        # Extract HTTP method and endpoint
        verb = node.css(".api-request-block-verb, .api-block-header").first&.text&.strip
        endpoint = node.css("span").map(&:text).join(" ").strip
        if verb && endpoint
          spec << "## Endpoint\n\n```\n#{verb} #{endpoint}\n```\n\n"
        end
      elsif node["class"]&.include?("api-info-row")
        # Extract parameter information
        heading = node.css("h3").first&.text&.strip
        if heading
          spec << "### #{heading}\n\n"
          node.css("dl.api-params dt").each do |dt|
            param_name = dt.css("code").first&.text&.strip
            param_type = dt.css("em").first&.text&.strip
            required = dt.css(".required").first ? "REQUIRED" : "OPTIONAL"
            dd = dt.next_element
            description = dd&.css("p")&.first&.text&.strip || dd&.text&.strip
            if param_name
              spec << "- **#{param_name}** (#{param_type}) - #{required}\n"
              spec << " #{description}\n\n" if description && description.length > 0
            end
          end
        end
      end
    end
  end
  # Clean up excessive newlines
  result = spec.join.gsub(/\n{3,}/, "\n\n").strip
  (result.length > 50) ? result : nil
end
# Step 0: Fetch main API page and extract all URLs from sidebar
puts "Step 0: Fetching main API page and extracting URLs from sidebar..."
main_api_html = fetch_with_curl("#{API_BASE_URL}/")
unless main_api_html
  puts "Error: Could not fetch main API page. Exiting."
  exit 1
end
# Extract top-level category URLs
category_urls = extract_api_urls_from_sidebar(main_api_html)
puts "Found #{category_urls.count} top-level category URLs"
# Step 0.5: For each category page, fetch it and extract nested endpoint URLs
puts "\nStep 0.5: Fetching category pages to extract nested endpoint URLs..."
all_urls = Set.new(category_urls)
category_urls.each do |category_url|
  next if category_url == "#{API_BASE_URL}/" # Skip the main page, we already have it
  puts " Fetching category: #{category_url}"
  category_html = fetch_with_curl(category_url)
  next unless category_html
  nested_urls = extract_api_urls_from_sidebar(category_html)
  nested_urls.each { |url| all_urls.add(url) }
  puts " Found #{nested_urls.count} URLs in #{category_url}"
end
URLS = all_urls.to_a.sort
puts "\nTotal unique API URLs found: #{URLS.count}"
puts "Sample URLs:"
URLS.first(10).each { |url| puts " - #{url}" }
puts " ..." if URLS.count > 10
if URLS.empty?
  puts "Warning: No URLs found in sidebar. Exiting."
  exit 1
end
# Step 1: Fetch all HTML files with curl
puts "\nStep 1: Fetching HTML files..."
URLS.each do |url|
  fetch_with_curl(url)
end
# Step 2: Extract body and purify
puts "\nStep 2: Extracting and purifying content..."
URLS.each do |url|
  html_file = File.join(TEMP_DIR, "#{sanitize_filename(url)}.html")
  next unless File.exist?(html_file)
  html = File.read(html_file)
  spec = purify_content(html, url)
  if spec && !spec.strip.empty?
    filename = File.join(OUTPUT_DIR, "#{sanitize_filename(url)}.md")
    File.write(filename, spec)
    puts " Extracted spec: #{File.basename(filename)}"
  else
    puts " Warning: Could not extract spec from #{url}"
  end
end
puts "\nDone! Specs saved to #{OUTPUT_DIR}"
puts "Total URLs processed: #{URLS.count}"
# Clean up: Remove raw_html folder
puts "\nCleaning up raw HTML files..."
if File.directory?(TEMP_DIR)
  FileUtils.rm_rf(TEMP_DIR)
  puts "Removed #{TEMP_DIR}"
end