Extract Sentry API specs from docs.sentry.io as markdown files
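To run it locally (a sketch, assuming Ruby with the nokogiri gem installed; the script filename here is illustrative): gem install nokogiri, then ruby extract_sentry_api_specs.rb. Markdown specs are written to an api_spec/ directory next to the script, with raw HTML cached temporarily under api_spec/raw_html/.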
#!/usr/bin/env ruby
require "fileutils"
require "nokogiri"
require "open-uri"
require "openssl"
require "set"
require "uri"

BASE_URL = "https://docs.sentry.io"
API_BASE_URL = "#{BASE_URL}/api"
OUTPUT_DIR = File.join(File.dirname(__FILE__), "api_spec")
TEMP_DIR = File.join(OUTPUT_DIR, "raw_html")

FileUtils.mkdir_p(OUTPUT_DIR)
FileUtils.mkdir_p(TEMP_DIR)
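
# Map a docs URL to a filesystem-safe basename,
# e.g. "https://docs.sentry.io/api/events/" => "events".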
def sanitize_filename(url)
  parts = url.gsub("https://docs.sentry.io/api/", "").gsub(/\/$/, "").split("/")
  filename = parts.join("_").gsub(/[^a-z0-9_-]/, "_")
  # The root /api/ page yields an empty name - use "index"
  filename = "index" if filename.empty? || filename == "_"
  filename
end
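
# Fetch a page over HTTPS, caching the raw HTML under TEMP_DIR.
# Returns the HTML string, or nil if both open-uri and the curl fallback fail.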
def fetch_with_curl(url)
  filename = File.join(TEMP_DIR, "#{sanitize_filename(url)}.html")
  puts "Fetching: #{url}"
  begin
    html = URI.open(url,
      "User-Agent" => "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36",
      :ssl_verify_mode => OpenSSL::SSL::VERIFY_NONE).read
    File.write(filename, html)
    puts "  Saved raw HTML to: #{filename}"
    html
  rescue => e
    puts "  Error fetching #{url}: #{e.message}"
    # Fall back to the curl CLI (-k skips SSL verification, -L follows redirects)
    if system("curl", "-k", "-s", "-L", "-H", "User-Agent: Mozilla/5.0", url, "-o", filename)
      if File.exist?(filename) && File.size(filename) > 0
        puts "  Saved raw HTML to: #{filename} (via curl)"
        File.read(filename)
      end
    end
  end
end
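
# Parse the sidebar of a docs page and return absolute URLs for every
# entry under the "Sentry API" section (sorted, deduplicated).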
def extract_api_urls_from_sidebar(html)
  html = html.force_encoding("UTF-8")
  # Extract body HTML and strip script/style/svg before parsing
  body_match = html.match(/<body[^>]*>([\s\S]*)<\/body>/i)
  return [] unless body_match
  body_html = body_match[1]
  body_html = body_html.gsub(/<script[^>]*>[\s\S]*?<\/script>/i, "")
  body_html = body_html.gsub(/<style[^>]*>[\s\S]*?<\/style>/i, "")
  body_html = body_html.gsub(/<svg[^>]*>[\s\S]*?<\/svg>/i, "")
  body_html = body_html.gsub(/<svg[^>]*\/>/i, "")
  doc = Nokogiri::HTML("<body>#{body_html}</body>")
  body = doc.css("body").first
  return [] unless body

  urls = []
  # Find the "Sentry API" sidebar section: a link to /api/ whose text
  # contains "Sentry API", or any link with href="/api/" as a fallback
  api_section = body.xpath(".//a[contains(@href, '/api/') and contains(., 'Sentry API')]").first ||
    body.xpath(".//a[@href='/api/']").first
  return [] unless api_section
  # Find the enclosing <li data-sidebar-branch>
  api_li = api_section.ancestors("li[data-sidebar-branch='true']").first ||
    api_section.parent
  return [] unless api_li
  # Recursively extract all links from the nested <ul data-sidebar-tree>
  api_ul = api_li.css("ul[data-sidebar-tree]").first
  extract_links_recursive(api_ul, urls) if api_ul
  # Also include the main /api/ link itself
  urls << "/api/" unless urls.include?("/api/")
  # Convert to absolute URLs. Check the scheme before prefixing,
  # otherwise absolute http(s) links would be mangled into "/api/https://..."
  urls.map do |url|
    url += "/" unless url.end_with?("/") # normalize trailing slash
    if url.start_with?("http")
      url
    elsif url.start_with?("/")
      "#{BASE_URL}#{url}"
    else
      "#{API_BASE_URL}/#{url}"
    end
  end.uniq.sort
end
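
# Collect every /api/ href under `node` into `urls`,
# normalized to a trailing slash and deduplicated.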
def extract_links_recursive(node, urls)
  # Extract all <a href> links in this node and its descendants
  node.css("a[href]").each do |link|
    href = link["href"]
    next unless href
    # Only include API links (starting with /api/)
    if href.start_with?("/api/")
      href += "/" unless href.end_with?("/") # normalize trailing slash
      urls << href unless urls.include?(href)
    end
  end
  # Recurse into nested lists and branches (including those inside
  # CollapsibleSidebarLink components)
  node.css("ul[data-sidebar-tree], ul").each { |ul| extract_links_recursive(ul, urls) }
  node.css("li[data-sidebar-branch]").each { |li| extract_links_recursive(li, urls) }
end
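
# NOTE: extract_body is defined but never called in this script;
# it is a simpler whole-body extractor alongside purify_content.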
def extract_body(html)
  doc = Nokogiri::HTML(html)
  # Remove script, style, and svg tags from the entire document
  doc.css("script, style, svg").remove
  body = doc.css("body").first
  body ? body.to_html : html
end
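
# Convert a fetched docs page into a markdown spec string; returns nil
# when no meaningful content can be extracted. Illustrative shape of a
# generated spec (the endpoint shown is a made-up example; real content
# depends on each page):
#
#   # List a Project's Events
#
#   ## Endpoint
#
#   ```
#   GET /api/0/projects/{organization_slug}/{project_slug}/events/
#   ```
#
#   ### Path Parameters
#
#   - **organization_slug** (string) - REQUIRED
#     The slug of the organization.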
def purify_content(html, _url)
  html = html.force_encoding("UTF-8")
  # Extract body HTML as a string and strip script/style/svg with regexes,
  # so we parse clean HTML
  body_match = html.match(/<body[^>]*>([\s\S]*)<\/body>/i)
  return nil unless body_match
  body_html = body_match[1]
  body_html = body_html.gsub(/<script[^>]*>[\s\S]*?<\/script>/i, "")
  body_html = body_html.gsub(/<style[^>]*>[\s\S]*?<\/style>/i, "")
  body_html = body_html.gsub(/<svg[^>]*>[\s\S]*?<\/svg>/i, "")
  body_html = body_html.gsub(/<svg[^>]*\/>/i, "") # self-closing svg tags
  doc = Nokogiri::HTML("<body>#{body_html}</body>")
  body = doc.css("body").first
  return nil unless body

  # Find main content - try several selectors in order of specificity
  main_content = body.css("#doc-content").first ||
    body.css("main #main").first ||
    body.css("article, main, .content, #content, .documentation-content, .markdown-body").first
  return nil unless main_content

  # Remove navigation, breadcrumbs, and other non-content elements
  main_content.css(".breadcrumbs, .not-prose, nav, aside, .grid, header, footer").remove
  # Narrow to the content div
  content_div = main_content.css("#main").first || main_content
  return nil unless content_div
  spec = []
  # Extract the page title from the first h1
  title_elem = main_content.css("h1").first
  if title_elem
    title = title_elem.text.strip
    spec << "# #{title}\n\n" unless title.empty?
  end
  # Walk headings, paragraphs, code blocks, tables, lists, and API blocks
  # in document order
  main_section = content_div.css("#main").first || content_div
  all_elements = main_section.css("h1, h2, h3, h4, h5, h6, p, pre, table, ul, ol, div.api-block, div.api-info-row, dl.api-params")
  all_elements.each do |node|
    next if node == title_elem # already emitted as the title
    next if node["class"]&.include?("breadcrumb") || node["class"]&.include?("not-prose")
    case node.name
    when "h1"
      text = node.text.strip
      spec << "# #{text}\n\n" unless text.empty?
    when "h2", "h3", "h4", "h5", "h6"
      node.css("a, svg").remove # drop anchor links and icons from headings
      text = node.text.strip
      level = node.name[1].to_i
      spec << "#{"#" * level} #{text}\n\n" unless text.empty?
    when "p"
      text = node.text.strip
      spec << "#{text}\n\n" if text.length > 10
    when "pre"
      code_elem = node.css("code").first || node
      code_text = code_elem.text.strip
      if code_text.length > 0
        lang = code_elem["class"]&.match(/language-(\w+)/)&.[](1) || ""
        spec << "```#{lang}\n#{code_text}\n```\n\n"
      end
    when "table"
      rows = node.css("tr").map do |tr|
        tr.css("td, th").map { |cell| cell.text.strip.gsub(/\s+/, " ") }
      end
      if rows.any? && rows.first.any?
        spec << "| " + rows.first.join(" | ") + " |\n"
        spec << "| " + rows.first.map { "---" }.join(" | ") + " |\n"
        rows[1..-1].each do |row|
          spec << "| " + row.join(" | ") + " |\n" if row.any?
        end
        spec << "\n"
      end
    when "ul", "ol"
      node.css("li").each do |li|
        text = li.text.strip
        spec << "- #{text}\n" if text.length > 0
      end
      spec << "\n" if node.css("li").any?
    when "div"
      # Handle API endpoint blocks and parameter info rows
      if node["class"]&.include?("api-block")
        # Extract HTTP method and endpoint path
        verb = node.css(".api-request-block-verb, .api-block-header").first&.text&.strip
        endpoint = node.css("span").map(&:text).join(" ").strip
        if verb && endpoint
          spec << "## Endpoint\n\n```\n#{verb} #{endpoint}\n```\n\n"
        end
      elsif node["class"]&.include?("api-info-row")
        # Extract parameter name, type, requiredness, and description
        heading = node.css("h3").first&.text&.strip
        if heading
          spec << "### #{heading}\n\n"
          node.css("dl.api-params dt").each do |dt|
            param_name = dt.css("code").first&.text&.strip
            param_type = dt.css("em").first&.text&.strip
            required = dt.css(".required").first ? "REQUIRED" : "OPTIONAL"
            dd = dt.next_element
            description = dd&.css("p")&.first&.text&.strip || dd&.text&.strip
            if param_name
              spec << "- **#{param_name}** (#{param_type}) - #{required}\n"
              spec << "  #{description}\n\n" if description && description.length > 0
            end
          end
        end
      end
    end
  end
  # Collapse runs of blank lines; treat very short results as extraction failures
  result = spec.join.gsub(/\n{3,}/, "\n\n").strip
  (result.length > 50) ? result : nil
end
# Step 0: Fetch the main API page and extract all URLs from its sidebar
puts "Step 0: Fetching main API page and extracting URLs from sidebar..."
main_api_html = fetch_with_curl("#{API_BASE_URL}/")
unless main_api_html
  puts "Error: Could not fetch main API page. Exiting."
  exit 1
end

# Extract top-level category URLs
category_urls = extract_api_urls_from_sidebar(main_api_html)
puts "Found #{category_urls.count} top-level category URLs"

# Step 0.5: Fetch each category page and extract nested endpoint URLs
puts "\nStep 0.5: Fetching category pages to extract nested endpoint URLs..."
all_urls = Set.new(category_urls)
category_urls.each do |category_url|
  next if category_url == "#{API_BASE_URL}/" # skip the main page, already fetched
  puts "  Fetching category: #{category_url}"
  category_html = fetch_with_curl(category_url)
  next unless category_html
  nested_urls = extract_api_urls_from_sidebar(category_html)
  nested_urls.each { |url| all_urls.add(url) }
  puts "    Found #{nested_urls.count} URLs in #{category_url}"
end

URLS = all_urls.to_a.sort
puts "\nTotal unique API URLs found: #{URLS.count}"
puts "Sample URLs:"
URLS.first(10).each { |url| puts "  - #{url}" }
puts "  ..." if URLS.count > 10

if URLS.empty?
  puts "Warning: No URLs found in sidebar. Exiting."
  exit 1
end
# Step 1: Fetch all HTML files
puts "\nStep 1: Fetching HTML files..."
URLS.each { |url| fetch_with_curl(url) }

# Step 2: Extract and purify content into markdown specs
puts "\nStep 2: Extracting and purifying content..."
URLS.each do |url|
  html_file = File.join(TEMP_DIR, "#{sanitize_filename(url)}.html")
  next unless File.exist?(html_file)
  html = File.read(html_file)
  spec = purify_content(html, url)
  if spec && !spec.strip.empty?
    filename = File.join(OUTPUT_DIR, "#{sanitize_filename(url)}.md")
    File.write(filename, spec)
    puts "  Extracted spec: #{File.basename(filename)}"
  else
    puts "  Warning: Could not extract spec from #{url}"
  end
end

puts "\nDone! Specs saved to #{OUTPUT_DIR}"
puts "Total URLs processed: #{URLS.count}"

# Clean up: remove the raw HTML cache
puts "\nCleaning up raw HTML files..."
if File.directory?(TEMP_DIR)
  FileUtils.rm_rf(TEMP_DIR)
  puts "Removed #{TEMP_DIR}"
end