Created
March 19, 2026 09:39
-
-
Save ve3/21e828a4746fa3e3e584f36d52fa379d to your computer and use it in GitHub Desktop.
Scrape all HTML tags.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php
/**
 * Scrapes the MDN HTML elements reference and writes html_tags_array.php.
 *
 * The generated file returns three lists, in this order:
 *   [0] non-obsolete tags with angle brackets  (e.g. '<div>')
 *   [1] non-obsolete tag names without brackets (e.g. 'div')
 *   [2] obsolete / deprecated tags with angle brackets
 */

$url = 'https://developer.mozilla.org/en-US/docs/Web/HTML/Reference/Elements';

// 1. Fetch the HTML content
$ch = curl_init($url);
curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
curl_setopt($ch, CURLOPT_USERAGENT, 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)');
// Fail on HTTP >= 400 instead of parsing an error page and overwriting the
// output file with empty lists.
curl_setopt($ch, CURLOPT_FAILONERROR, true);
curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, 15);

$html = curl_exec($ch);
if ($html === false || curl_errno($ch)) {
    die('cURL Error: ' . curl_error($ch));
}
if (trim($html) === '') {
    // An empty body cannot yield any tags; stop before handing it to the parser.
    die('cURL Error: empty response body');
}
// curl_close() is intentionally not called: it has had no effect since PHP 8.0.

// 2. Parse the HTML (libxml warnings about real-world markup are collected and discarded)
libxml_use_internal_errors(true);
$dom = new DOMDocument();
$dom->loadHTML($html);
libxml_clear_errors();
$xpath = new DOMXPath($dom);

// Setup the three groups
$normalFullTags = [];
$normalTagNames = [];
$obsoleteFullTags = [];

// 3. Walk every table; the section heading and the row text decide the group
$tables = $xpath->query('//table');
foreach ($tables as $table) {
    // The last <h2> that precedes the table (document order) tells us whether
    // we are inside an "obsolete"/"deprecated" section.
    $precedingH2s = $xpath->query('preceding::h2', $table);
    $lastH2 = $precedingH2s->length > 0
        ? $precedingH2s->item($precedingH2s->length - 1)->textContent
        : '';
    $isObsoleteSection = (stripos($lastH2, 'obsolete') !== false || stripos($lastH2, 'deprecated') !== false);

    $codeNodes = $xpath->query('.//code', $table);
    foreach ($codeNodes as $node) {
        $text = trim($node->textContent);

        // Match only valid <tag> formats
        if (!preg_match('/^<([a-z0-9\-]+)>$/i', $text, $matches)) {
            continue;
        }
        $tagName = strtolower($matches[1]);
        $fullTag = '<' . $tagName . '>';

        // A row whose own text mentions deprecation marks the tag as obsolete
        // even when the surrounding section does not.
        $tr = $xpath->query('ancestor::tr', $node)->item(0);
        $isDeprecatedInline = false;
        if ($tr) {
            $rowText = $tr->textContent;
            $isDeprecatedInline = (stripos($rowText, 'deprecated') !== false || stripos($rowText, 'obsolete') !== false);
        }

        // Route to the correct arrays (strict comparison: all members are strings)
        if ($isObsoleteSection || $isDeprecatedInline) {
            if (!in_array($fullTag, $obsoleteFullTags, true)) {
                $obsoleteFullTags[] = $fullTag;
            }
        } elseif (!in_array($tagName, $normalTagNames, true)) {
            $normalFullTags[] = $fullTag;
            $normalTagNames[] = $tagName;
        }
    }
}

// 4. Sort all arrays ascending
sort($normalFullTags);
sort($normalTagNames);
sort($obsoleteFullTags);

// 5. Build the PHP file output: one nested list per group, in the documented order
$phpCode = "<?php\n\nreturn [\n";
foreach ([$normalFullTags, $normalTagNames, $obsoleteFullTags] as $group) {
    $phpCode .= " [\n";
    foreach ($group as $index => $value) {
        $phpCode .= " {$index} => '{$value}',\n";
    }
    $phpCode .= " ],\n";
}
$phpCode .= "];\n";

// 6. Save the file; only report success when the write actually happened
$outputFilename = 'html_tags_array.php';
if (file_put_contents($outputFilename, $phpCode) === false) {
    die("Error: Failed to write {$outputFilename}.<br>\n");
}
echo "Successfully extracted and grouped HTML tags into {$outputFilename}.<br>\n";
echo "- Normal tags: " . count($normalTagNames) . "<br>\n";
echo "- Obsolete tags: " . count($obsoleteFullTags) . "<br>\n";
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php
/**
 * Target: MDN ARIA Attributes Reference
 * Output: html_aria-attributes_array.php
 *
 * Collects every <code> element whose text is an aria-* attribute name and
 * writes the sorted, de-duplicated list as a PHP file returning an array.
 */

$url = 'https://developer.mozilla.org/en-US/docs/Web/Accessibility/ARIA/Reference/Attributes';

// 1. Fetch the page (curl_close() is not called; it has had no effect since PHP 8.0)
$ch = curl_init($url);
curl_setopt_array($ch, [
    CURLOPT_RETURNTRANSFER => true,
    CURLOPT_FOLLOWLOCATION => true,
    CURLOPT_USERAGENT      => 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) Aria-Scraper/1.0',
    // Fail on HTTP >= 400 so an error page is never parsed into an empty list.
    CURLOPT_FAILONERROR    => true,
    CURLOPT_CONNECTTIMEOUT => 15,
]);

$html = curl_exec($ch);
if ($html === false) {
    // Web-friendly error output
    die('cURL Error: ' . curl_error($ch) . "<br>\n");
}
if (trim($html) === '') {
    // Nothing to parse; stop before handing an empty document to the parser.
    die("cURL Error: empty response body<br>\n");
}

// 2. DOM & XPath setup (libxml warnings are collected and discarded)
libxml_use_internal_errors(true);
$dom = new DOMDocument();
$dom->loadHTML($html);
libxml_clear_errors();
$xpath = new DOMXPath($dom);

// 3. Extract aria-* names from <code> elements
$ariaAttributes = [];
foreach ($xpath->query('//code') as $node) {
    $text = trim($node->textContent);

    // Keep only text that is exactly an 'aria-…' identifier
    if (preg_match('/^aria-[a-z0-9\-]+$/i', $text) === 1) {
        $ariaAttributes[] = strtolower($text);
    }
}

// 4. De-duplicate (string comparison) and sort ascending
$ariaAttributes = array_values(array_unique($ariaAttributes));
sort($ariaAttributes);

// 5. Build the file string
$output = "<?php\n\nreturn [\n";
foreach ($ariaAttributes as $index => $attr) {
    $output .= " {$index} => '{$attr}',\n";
}
$output .= "];\n";

// 6. Save and Report
$filename = 'html_aria-attributes_array.php';
$bytes = file_put_contents($filename, $output);
if ($bytes !== false) {
    echo "<strong>Success:</strong> Extracted " . count($ariaAttributes) . " ARIA attributes.<br>\n";
    echo "File saved as: <code>{$filename}</code> ({$bytes} bytes).<br>\n";
} else {
    echo "<strong>Error:</strong> Failed to write file to disk.<br>\n";
}
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php
/**
 * Target: MDN Global HTML Attributes Reference
 * Output: html_global-attributes_array.php
 *
 * Collects attribute names found in <dt> <code> elements inside <main> or
 * <article>, drops names starting with "on", and writes the sorted,
 * de-duplicated list as a PHP file returning an array.
 */

$url = 'https://developer.mozilla.org/en-US/docs/Web/HTML/Reference/Global_attributes';

$ch = curl_init($url);
curl_setopt_array($ch, [
    CURLOPT_RETURNTRANSFER => true,
    CURLOPT_FOLLOWLOCATION => true,
    CURLOPT_USERAGENT      => 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) Global-Attr-Scraper/2.0',
    // Fail on HTTP >= 400 so an error page is never parsed into an empty list.
    CURLOPT_FAILONERROR    => true,
    CURLOPT_CONNECTTIMEOUT => 15,
]);

$html = curl_exec($ch);
if ($html === false) {
    die('<strong>Error:</strong> cURL failed - ' . curl_error($ch) . "<br>\n");
}
if (trim($html) === '') {
    // Nothing to parse; stop before handing an empty document to the parser.
    die("<strong>Error:</strong> cURL failed - empty response body<br>\n");
}

// libxml warnings about real-world markup are collected and discarded
libxml_use_internal_errors(true);
$dom = new DOMDocument();
$dom->loadHTML($html);
libxml_clear_errors();
$xpath = new DOMXPath($dom);

/**
 * Target ONLY the <main> or <article> area to avoid the sidebar navigation,
 * and ONLY <dt> (definition terms) to avoid grabbing values like 'false'
 * from code examples.
 */
$nodes = $xpath->query('//main//dt//code | //article//dt//code');

$globalAttributes = [];
foreach ($nodes as $node) {
    $text = trim($node->textContent);

    /**
     * Regex criteria:
     * 1. Starts with a letter.
     * 2. Contains letters, numbers, hyphens.
     * 3. Optionally ends with an asterisk (to catch "data-*" and "aria-*").
     */
    if (preg_match('/^[a-z][a-z0-9\-]*\*?$/i', $text) !== 1) {
        continue;
    }
    $attr = strtolower($text);

    // Exclude event handlers (anything starting with 'on')
    if (str_starts_with($attr, 'on')) {
        continue;
    }

    $globalAttributes[] = $attr;
}

// De-duplicate (string comparison) and sort ascending
$globalAttributes = array_values(array_unique($globalAttributes));
sort($globalAttributes);

// Generate the PHP file content
$output = "<?php\n\nreturn [\n";
foreach ($globalAttributes as $index => $attr) {
    $output .= " {$index} => '{$attr}',\n";
}
$output .= "];\n";

// Save and Output
$filename = 'html_global-attributes_array.php';
$bytes = file_put_contents($filename, $output);
if ($bytes !== false) {
    echo "<strong>Success:</strong> Extracted " . count($globalAttributes) . " exact Global attributes.<br>\n";
    echo "File saved: <code>{$filename}</code> ({$bytes} bytes).<br>\n";
} else {
    echo "<strong>Error:</strong> Failed to write file to disk.<br>\n";
}
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php
/**
 * Script: WHATWG Tag Attribute Scraper
 * Source: https://html.spec.whatwg.org/multipage/indices.html
 *
 * This section defines the script's settings and loads the list of tag names
 * offered in the selection form.
 */

// Allow up to ten minutes of execution time.
set_time_limit(10 * 60);

$tagsFile   = 'html_tags_array.php';
$indexUrl   = 'https://html.spec.whatwg.org/multipage/indices.html';
$indexCache = 'whatwg_index_cache.html';
$cacheLimit = 30 * 24 * 60 * 60; // 30 days, in seconds

// 1. Load tag names for the selector.
// The tags file is expected to return an array whose element [1] is the list
// of bare tag names; anything else leaves the selector empty.
$loadedGroups = file_exists($tagsFile) ? include($tagsFile) : null;
$candidate    = $loadedGroups[1] ?? null;
$tagNames     = is_array($candidate) ? $candidate : [];
// 2. Handle POST
//
// For every submitted tag name, writes html_per-tag/html_per-tag-<tag>-attributes_array.php
// containing the attributes listed for that element in the WHATWG index page.
// Reads $indexUrl, $indexCache and $cacheLimit defined earlier in this script.
if (($_SERVER['REQUEST_METHOD'] ?? '') === 'POST' && isset($_POST['tags'])) {
    echo "<h3>Scraping Progress:</h3><br>\n";
    $selectedTags = (array)$_POST['tags'];

    // Make sure the output directory exists before any work is done.
    if (!is_dir('html_per-tag') && !mkdir('html_per-tag', 0777, true) && !is_dir('html_per-tag')) {
        echo "<strong>[Fatal]</strong> Could not create directory <code>html_per-tag</code>. Aborting.<br>\n";
        exit;
    }

    // === STEP 1: Fetch / serve the WHATWG index page from cache ===
    $indexHtml = false;
    if (file_exists($indexCache) && (time() - filemtime($indexCache)) <= $cacheLimit) {
        $indexHtml = file_get_contents($indexCache);
        if ($indexHtml === '') {
            // A zero-byte cache file is useless; treat it as a cache miss.
            $indexHtml = false;
        }
        if ($indexHtml !== false) {
            echo "<em>[Cache]</em> Using cached WHATWG index (<code>{$indexCache}</code>).<br>\n";
        }
    }

    if ($indexHtml === false) {
        echo "<em>[Fetch]</em> Downloading WHATWG index page…<br>\n";
        $ch = curl_init($indexUrl);
        curl_setopt_array($ch, [
            CURLOPT_RETURNTRANSFER => true,
            CURLOPT_FOLLOWLOCATION => true,
            CURLOPT_USERAGENT      => 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) Asset-Scraper/2.0',
            CURLOPT_CONNECTTIMEOUT => 15,
            CURLOPT_FAILONERROR    => true,
        ]);
        $indexHtml = curl_exec($ch);
        $curlErr   = curl_error($ch);

        if ($indexHtml === false || $indexHtml === '') {
            // The error text comes from the network layer, so escape it for HTML output.
            $reason = htmlspecialchars(
                $curlErr !== '' ? $curlErr : 'empty response body',
                ENT_QUOTES | ENT_SUBSTITUTE,
                'UTF-8'
            );
            echo "<strong>[Fatal]</strong> Could not fetch WHATWG index: <em>{$reason}</em>. Aborting.<br>\n";
            exit;
        }

        // A failed cache write is not fatal: the download is still usable for this run.
        if (file_put_contents($indexCache, $indexHtml) !== false) {
            echo "<em>[Fetch]</em> Downloaded and cached to <code>{$indexCache}</code>.<br>\n";
        } else {
            echo "<em>[Fetch]</em> Downloaded, but could not write cache file <code>{$indexCache}</code>.<br>\n";
        }
    }

    // === STEP 2: Parse — build a complete tag → attributes[] map ===
    // libxml warnings are collected internally and discarded, so no @ suppression is needed.
    libxml_use_internal_errors(true);
    $dom = new DOMDocument();
    $dom->loadHTML($indexHtml);
    libxml_clear_errors();
    $xpath = new DOMXPath($dom);

    $tagAttrMap = [];

    // Only rows that have both a <th> and a <td> are considered; the first <th>
    // is read as the element-name cell.
    $rows = $xpath->query('//table//tr[th and td]');
    foreach ($rows as $row) {
        // ── Tag names: read from the first <th> cell ─────────────────────────
        // A link counts as an element name when its href fragment starts with
        // "the-" (e.g. #the-input-element) and its text is a plain identifier.
        $thCells = $xpath->query('th', $row);
        if ($thCells->length < 1) {
            continue;
        }

        $tagsInRow = [];
        foreach ($xpath->query('.//a', $thCells->item(0)) as $a) {
            $fragment = (string)(parse_url($a->getAttribute('href'), PHP_URL_FRAGMENT) ?? '');
            $name     = strtolower(trim($a->textContent));
            if (str_starts_with($fragment, 'the-') && preg_match('/^[a-z][a-z0-9]*$/', $name)) {
                $tagsInRow[] = $name;
            }
        }
        if (empty($tagsInRow)) {
            continue;
        }

        // ── Attributes: WHITELIST — only keep links whose href fragment starts with
        //
        //      attr-*     e.g. #attr-media-src, #attr-input-accept
        //      handler-*  e.g. #handler-window-onafterprint
        //
        // Everything else (flow-content-2, htmlelement, the-*, concept-*, …)
        // is silently ignored.
        $attrs = [];
        foreach ($xpath->query('.//a', $row) as $a) {
            $fragment = (string)(parse_url($a->getAttribute('href'), PHP_URL_FRAGMENT) ?? '');
            if (!str_starts_with($fragment, 'attr-') && !str_starts_with($fragment, 'handler-')) {
                continue;
            }
            $val = strtolower(trim($a->textContent));
            if (!preg_match('/^[a-z][a-z0-9\-]*$/', $val)) {
                continue;
            }
            $attrs[] = $val;
        }
        $attrs = array_values(array_unique($attrs));
        sort($attrs);

        // Rows shared by multiple tags (e.g. h1–h6) get the same list
        foreach ($tagsInRow as $t) {
            $tagAttrMap[$t] = $attrs;
        }
    }

    echo "<em>[Parse]</em> Built attribute map for <strong>"
        . count($tagAttrMap) . "</strong> element(s).<br><br>\n";

    // === STEP 3: Write per-tag files ===
    foreach ($selectedTags as $rawTag) {
        // The submitted values are untrusted: a nested array would make
        // strtolower() throw, and characters such as "/" or "." would let the
        // request steer the output path. Accept only the alphabet the tag
        // scraper itself produces.
        if (!is_string($rawTag)) {
            echo "<strong>[Skipped]</strong> Ignored a non-text tag value.<br>\n";
            continue;
        }
        $cleanTag = strtolower(str_replace(['<', '>'], '', $rawTag));
        if (preg_match('/^[a-z0-9\-]+\z/', $cleanTag) !== 1) {
            $shown = htmlspecialchars($rawTag, ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8');
            echo "<strong>[Skipped]</strong> Invalid tag name <code>{$shown}</code>.<br>\n";
            continue;
        }

        $filename = "html_per-tag/html_per-tag-{$cleanTag}-attributes_array.php";

        // Per-file cache: a recent file is left untouched.
        if (file_exists($filename) && (time() - filemtime($filename)) <= $cacheLimit) {
            echo "<strong>[Cached]</strong> <code>{$filename}</code> is up to date. Skipping…<br>\n";
            continue;
        }

        // Tags missing from the map still get a file, containing an empty array.
        $attributes = $tagAttrMap[$cleanTag] ?? [];

        $fileContent = "<?php\n\nreturn [\n";
        foreach ($attributes as $idx => $attr) {
            $fileContent .= " {$idx} => '{$attr}',\n";
        }
        $fileContent .= "];\n";

        if (file_put_contents($filename, $fileContent) !== false) {
            $count = count($attributes);
            $msg = $count > 0
                ? "Saved <strong>{$count}</strong> attributes"
                : "No element-specific attributes found; saved <strong>empty array</strong>";
            echo "<strong>[Success]</strong> {$msg} → <code>{$filename}</code>.<br>\n";
        } else {
            echo "<strong>[Error]</strong> Could not write <code>{$filename}</code>.<br>\n";
        }
    }

    echo "<br><hr><a href=''>← Start New Scrape</a><br>\n";
    exit;
}
?>
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>WHATWG Attribute Scraper</title>
    <style>
        body { font-family: sans-serif; background: #f0f2f5; padding: 20px; line-height: 1.5; }
        .card { max-width: 500px; margin: auto; background: #fff; padding: 25px; border-radius: 10px; box-shadow: 0 2px 10px rgba(0,0,0,0.1); }
        select { width: 100%; height: 250px; margin-top: 10px; border: 1px solid #ccc; border-radius: 5px; font-size: 16px; }
        button { width: 100%; padding: 12px; background: #007bff; color: #fff; border: none; border-radius: 5px; cursor: pointer; font-size: 16px; margin-top: 16px; }
        button:hover { background: #0056b3; }
        label { font-weight: bold; }
        .help { font-size: 12px; color: #666; margin-bottom: 5px; }
        .source { font-size: 12px; color: #888; margin-bottom: 16px; }
    </style>
</head>
<body>
<!-- Selection form: posts the chosen tag names back to this script as tags[]. -->
<div class="card">
    <h2>Tag Attribute Scraper</h2>
    <p class="source">
        Source: <a href="https://html.spec.whatwg.org/multipage/indices.html" target="_blank" rel="noopener noreferrer">
        WHATWG HTML Living Standard — Element Index</a><br>
        Index page fetched once, cached 30 days.
    </p>
    <form method="POST">
        <!-- for/id pair ties the label to the list for assistive technology -->
        <label for="tags">Select Tags (Multiple):</label>
        <div class="help">Files saved in <code>/html_per-tag/</code>. 30-day cache applies per file.</div>
        <select id="tags" name="tags[]" multiple required>
<?php foreach ($tagNames as $tag): ?>
<?php $safeTag = htmlspecialchars((string)$tag, ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8'); ?>
            <option value="<?= $safeTag ?>"><?= $safeTag ?></option>
<?php endforeach; ?>
        </select>
        <button type="submit">Check &amp; Generate Files</button>
    </form>
</div>
</body>
</html>
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment