Skip to content

Instantly share code, notes, and snippets.

@ve3
Created March 19, 2026 09:39
Show Gist options
  • Select an option

  • Save ve3/21e828a4746fa3e3e584f36d52fa379d to your computer and use it in GitHub Desktop.

Select an option

Save ve3/21e828a4746fa3e3e584f36d52fa379d to your computer and use it in GitHub Desktop.
Scrape all HTML tags.
<?php
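/**
* Example output of the MDN HTML tag scraper (html_tags_array.php).
*
* Index 0: current elements written as full tags, e.g. '<a>'.
* Index 1: the same elements written as bare tag names, e.g. 'a'.
* Index 2: tags the scraper classified as obsolete or deprecated, written as full tags.
*/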
return [
[
0 => '<a>',
1 => '<abbr>',
2 => '<address>',
3 => '<area>',
4 => '<article>',
5 => '<aside>',
6 => '<audio>',
7 => '<b>',
8 => '<base>',
9 => '<bdi>',
10 => '<bdo>',
11 => '<blockquote>',
12 => '<body>',
13 => '<br>',
14 => '<button>',
15 => '<canvas>',
16 => '<caption>',
17 => '<cite>',
18 => '<code>',
19 => '<col>',
20 => '<colgroup>',
21 => '<data>',
22 => '<datalist>',
23 => '<dd>',
24 => '<del>',
25 => '<details>',
26 => '<dfn>',
27 => '<dialog>',
28 => '<div>',
29 => '<dl>',
30 => '<dt>',
31 => '<em>',
32 => '<embed>',
33 => '<fencedframe>',
34 => '<fieldset>',
35 => '<figcaption>',
36 => '<figure>',
37 => '<footer>',
38 => '<form>',
39 => '<geolocation>',
40 => '<h1>',
41 => '<h2>',
42 => '<h3>',
43 => '<h4>',
44 => '<h5>',
45 => '<h6>',
46 => '<head>',
47 => '<header>',
48 => '<hgroup>',
49 => '<hr>',
50 => '<html>',
51 => '<i>',
52 => '<iframe>',
53 => '<img>',
54 => '<input>',
55 => '<ins>',
56 => '<kbd>',
57 => '<label>',
58 => '<legend>',
59 => '<li>',
60 => '<link>',
61 => '<main>',
62 => '<map>',
63 => '<mark>',
64 => '<math>',
65 => '<menu>',
66 => '<meta>',
67 => '<meter>',
68 => '<nav>',
69 => '<noscript>',
70 => '<object>',
71 => '<ol>',
72 => '<optgroup>',
73 => '<option>',
74 => '<output>',
75 => '<p>',
76 => '<picture>',
77 => '<pre>',
78 => '<progress>',
79 => '<q>',
80 => '<rp>',
81 => '<rt>',
82 => '<ruby>',
83 => '<s>',
84 => '<samp>',
85 => '<script>',
86 => '<search>',
87 => '<section>',
88 => '<select>',
89 => '<selectedcontent>',
90 => '<slot>',
91 => '<small>',
92 => '<source>',
93 => '<span>',
94 => '<strong>',
95 => '<style>',
96 => '<sub>',
97 => '<summary>',
98 => '<sup>',
99 => '<svg>',
100 => '<table>',
101 => '<tbody>',
102 => '<td>',
103 => '<template>',
104 => '<textarea>',
105 => '<tfoot>',
106 => '<th>',
107 => '<thead>',
108 => '<time>',
109 => '<title>',
110 => '<tr>',
111 => '<track>',
112 => '<u>',
113 => '<ul>',
114 => '<var>',
115 => '<video>',
116 => '<wbr>',
],
[
0 => 'a',
1 => 'abbr',
2 => 'address',
3 => 'area',
4 => 'article',
5 => 'aside',
6 => 'audio',
7 => 'b',
8 => 'base',
9 => 'bdi',
10 => 'bdo',
11 => 'blockquote',
12 => 'body',
13 => 'br',
14 => 'button',
15 => 'canvas',
16 => 'caption',
17 => 'cite',
18 => 'code',
19 => 'col',
20 => 'colgroup',
21 => 'data',
22 => 'datalist',
23 => 'dd',
24 => 'del',
25 => 'details',
26 => 'dfn',
27 => 'dialog',
28 => 'div',
29 => 'dl',
30 => 'dt',
31 => 'em',
32 => 'embed',
33 => 'fencedframe',
34 => 'fieldset',
35 => 'figcaption',
36 => 'figure',
37 => 'footer',
38 => 'form',
39 => 'geolocation',
40 => 'h1',
41 => 'h2',
42 => 'h3',
43 => 'h4',
44 => 'h5',
45 => 'h6',
46 => 'head',
47 => 'header',
48 => 'hgroup',
49 => 'hr',
50 => 'html',
51 => 'i',
52 => 'iframe',
53 => 'img',
54 => 'input',
55 => 'ins',
56 => 'kbd',
57 => 'label',
58 => 'legend',
59 => 'li',
60 => 'link',
61 => 'main',
62 => 'map',
63 => 'mark',
64 => 'math',
65 => 'menu',
66 => 'meta',
67 => 'meter',
68 => 'nav',
69 => 'noscript',
70 => 'object',
71 => 'ol',
72 => 'optgroup',
73 => 'option',
74 => 'output',
75 => 'p',
76 => 'picture',
77 => 'pre',
78 => 'progress',
79 => 'q',
80 => 'rp',
81 => 'rt',
82 => 'ruby',
83 => 's',
84 => 'samp',
85 => 'script',
86 => 'search',
87 => 'section',
88 => 'select',
89 => 'selectedcontent',
90 => 'slot',
91 => 'small',
92 => 'source',
93 => 'span',
94 => 'strong',
95 => 'style',
96 => 'sub',
97 => 'summary',
98 => 'sup',
99 => 'svg',
100 => 'table',
101 => 'tbody',
102 => 'td',
103 => 'template',
104 => 'textarea',
105 => 'tfoot',
106 => 'th',
107 => 'thead',
108 => 'time',
109 => 'title',
110 => 'tr',
111 => 'track',
112 => 'u',
113 => 'ul',
114 => 'var',
115 => 'video',
116 => 'wbr',
],
[
0 => '<acronym>',
1 => '<big>',
2 => '<center>',
3 => '<content>',
4 => '<dir>',
5 => '<font>',
6 => '<frame>',
7 => '<frameset>',
8 => '<image>',
9 => '<img>',
10 => '<marquee>',
11 => '<menuitem>',
12 => '<nobr>',
13 => '<noembed>',
14 => '<noframes>',
15 => '<object>',
16 => '<param>',
17 => '<plaintext>',
18 => '<rb>',
19 => '<rt>',
20 => '<rtc>',
21 => '<ruby>',
22 => '<shadow>',
23 => '<slot>',
24 => '<strike>',
25 => '<tt>',
26 => '<ul>',
27 => '<xmp>',
],
];
<?php
/**
 * Scrapes the MDN "HTML elements reference" page and writes html_tags_array.php.
 *
 * The generated file returns three lists:
 *   0 => current elements as full tags, e.g. '<a>'
 *   1 => the same elements as bare tag names, e.g. 'a' (index-aligned with list 0)
 *   2 => obsolete/deprecated elements as full tags, e.g. '<acronym>'
 */

$url = 'https://developer.mozilla.org/en-US/docs/Web/HTML/Reference/Elements';

// 1. Fetch the HTML content.
// curl_close() is intentionally not called: it has been a no-op since PHP 8.0.
$ch = curl_init($url);
curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
curl_setopt($ch, CURLOPT_USERAGENT, 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)');
// Treat HTTP 4xx/5xx as failures so an error page is never parsed as the reference.
curl_setopt($ch, CURLOPT_FAILONERROR, true);

$html = curl_exec($ch);
if ($html === false) {
    die('cURL Error: ' . curl_error($ch));
}
if ($html === '') {
    // DOMDocument::loadHTML() throws a ValueError on an empty string (PHP 8+).
    die('cURL Error: empty response body');
}

// 2. Parse the HTML (markup warnings are collected internally and discarded).
libxml_use_internal_errors(true);
$dom = new DOMDocument();
$dom->loadHTML($html);
libxml_clear_errors();
$xpath = new DOMXPath($dom);

// Collected groups. Full tags for the normal group are derived from the names
// after sorting, so lists 0 and 1 always stay index-aligned.
$normalTagNames = [];
$obsoleteFullTags = [];

// 3. Walk every table row by row.
foreach ($xpath->query('//table') as $table) {
    // The closest preceding <h2> tells us whether this table sits in the
    // "Obsolete and deprecated" section (query results are in document order).
    $precedingH2s = $xpath->query('preceding::h2', $table);
    $lastH2 = $precedingH2s->length > 0
        ? $precedingH2s->item($precedingH2s->length - 1)->textContent
        : '';
    $isObsoleteSection = stripos($lastH2, 'obsolete') !== false
        || stripos($lastH2, 'deprecated') !== false;

    foreach ($xpath->query('.//tr', $table) as $row) {
        // A row whose text explicitly mentions deprecation is treated as obsolete too.
        $rowText = $row->textContent;
        $isObsolete = $isObsoleteSection
            || stripos($rowText, 'deprecated') !== false
            || stripos($rowText, 'obsolete') !== false;

        // Only the FIRST cell names the element(s) the row documents. The
        // description cell frequently mentions other tags ("use <ul> instead"),
        // which must not be collected - otherwise current elements such as
        // <img> or <ul> end up in the obsolete list.
        foreach ($xpath->query('*[self::td or self::th][1]//code', $row) as $node) {
            $text = trim($node->textContent);

            // Match only valid <tag> formats.
            if (preg_match('/^<([a-z0-9\-]+)>$/i', $text, $matches) !== 1) {
                continue;
            }
            $tagName = strtolower($matches[1]);

            if ($isObsolete) {
                $fullTag = '<' . $tagName . '>';
                if (!in_array($fullTag, $obsoleteFullTags, true)) {
                    $obsoleteFullTags[] = $fullTag;
                }
            } elseif (!in_array($tagName, $normalTagNames, true)) {
                $normalTagNames[] = $tagName;
            }
        }
    }
}

// 4. Sort ascending (plain string comparison), then derive the full-tag list.
sort($normalTagNames, SORT_STRING);
sort($obsoleteFullTags, SORT_STRING);
$normalFullTags = array_map(
    static fn (string $name): string => '<' . $name . '>',
    $normalTagNames
);

// 5. Build the PHP file output.
$phpCode = "<?php\n\nreturn [\n";

// Group 1: Normal full tags
$phpCode .= " [\n";
foreach ($normalFullTags as $index => $tag) {
    $phpCode .= " {$index} => '{$tag}',\n";
}
$phpCode .= " ],\n";

// Group 2: Normal tag names only (without < >)
$phpCode .= " [\n";
foreach ($normalTagNames as $index => $name) {
    $phpCode .= " {$index} => '{$name}',\n";
}
$phpCode .= " ],\n";

// Group 3: Obsolete full tags
$phpCode .= " [\n";
foreach ($obsoleteFullTags as $index => $tag) {
    $phpCode .= " {$index} => '{$tag}',\n";
}
$phpCode .= " ],\n";
$phpCode .= "];\n";

// 6. Save the file and report.
$outputFilename = 'html_tags_array.php';
if (file_put_contents($outputFilename, $phpCode) === false) {
    die("<strong>Error:</strong> Failed to write file to disk.<br>\n");
}

echo "Successfully extracted and grouped HTML tags into {$outputFilename}.<br>\n";
echo "- Normal tags: " . count($normalTagNames) . "<br>\n";
echo "- Obsolete tags: " . count($obsoleteFullTags) . "<br>\n";
<?php
/**
 * Target: MDN ARIA Attributes Reference
 * Output: html_aria-attributes_array.php
 *
 * Collects every <code> element on the page whose whole text is an
 * `aria-*` attribute name, lower-cases and de-duplicates the names, and
 * writes them as a sorted PHP array file.
 */

$url = 'https://developer.mozilla.org/en-US/docs/Web/Accessibility/ARIA/Reference/Attributes';

// 1. Fetch the page.
// curl_close() is intentionally not called: it has been a no-op since PHP 8.0.
$ch = curl_init($url);
curl_setopt_array($ch, [
    CURLOPT_RETURNTRANSFER => true,
    CURLOPT_FOLLOWLOCATION => true,
    CURLOPT_USERAGENT => 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) Aria-Scraper/1.0',
    // Treat HTTP 4xx/5xx as failures so an error page never replaces a good output file.
    CURLOPT_FAILONERROR => true,
]);

$html = curl_exec($ch);
if ($html === false) {
    // Web-friendly error output
    die('cURL Error: ' . curl_error($ch) . "<br>\n");
}
if ($html === '') {
    // DOMDocument::loadHTML() throws a ValueError on an empty string (PHP 8+).
    die("cURL Error: empty response body<br>\n");
}

// 2. DOM & XPath setup (markup warnings are collected internally and discarded).
libxml_use_internal_errors(true);
$dom = new DOMDocument();
$dom->loadHTML($html);
libxml_clear_errors();
$xpath = new DOMXPath($dom);

// 3. Extract aria-* names from <code> elements.
// Names are stored as array keys, which de-duplicates them with an exact,
// constant-time lookup instead of a loose linear in_array() scan.
$seen = [];
foreach ($xpath->query('//code') as $node) {
    $text = trim($node->textContent);

    // Regex targets strings starting with 'aria-' only
    if (preg_match('/^aria-[a-z0-9\-]+$/i', $text) === 1) {
        $seen[strtolower($text)] = true;
    }
}

// 4. Sorting
$ariaAttributes = array_keys($seen);
sort($ariaAttributes, SORT_STRING);

// 5. Build the file string
$output = "<?php\n\nreturn [\n";
foreach ($ariaAttributes as $index => $attr) {
    $output .= " {$index} => '{$attr}',\n";
}
$output .= "];\n";

// 6. Save and Report
$filename = 'html_aria-attributes_array.php';
$bytes = file_put_contents($filename, $output);
if ($bytes !== false) {
    echo "<strong>Success:</strong> Extracted " . count($ariaAttributes) . " ARIA attributes.<br>\n";
    echo "File saved as: <code>{$filename}</code> ({$bytes} bytes).<br>\n";
} else {
    echo "<strong>Error:</strong> Failed to write file to disk.<br>\n";
}
<?php
/**
 * Target: MDN Global HTML Attributes Reference
 * Output: html_global-attributes_array.php
 *
 * Collects attribute names from <code> elements inside definition terms
 * (<dt>) of the page's main content, skips event-handler attributes
 * (names starting with "on"), and writes the sorted, de-duplicated names
 * as a PHP array file.
 */

$url = 'https://developer.mozilla.org/en-US/docs/Web/HTML/Reference/Global_attributes';

// curl_close() is intentionally not called: it has been a no-op since PHP 8.0.
$ch = curl_init($url);
curl_setopt_array($ch, [
    CURLOPT_RETURNTRANSFER => true,
    CURLOPT_FOLLOWLOCATION => true,
    CURLOPT_USERAGENT => 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) Global-Attr-Scraper/2.0',
    // Treat HTTP 4xx/5xx as failures so an error page never replaces a good output file.
    CURLOPT_FAILONERROR => true,
]);

$html = curl_exec($ch);
if ($html === false) {
    die('<strong>Error:</strong> cURL failed - ' . curl_error($ch) . "<br>\n");
}
if ($html === '') {
    // DOMDocument::loadHTML() throws a ValueError on an empty string (PHP 8+).
    die("<strong>Error:</strong> cURL failed - empty response body<br>\n");
}

// Markup warnings are collected internally and discarded.
libxml_use_internal_errors(true);
$dom = new DOMDocument();
$dom->loadHTML($html);
libxml_clear_errors();
$xpath = new DOMXPath($dom);

/**
 * Target ONLY the <main> or <article> area to avoid the sidebar navigation.
 * Target ONLY <dt> (Definition Terms) to avoid grabbing values like 'false' from code examples.
 */
$nodes = $xpath->query('//main//dt//code | //article//dt//code');

// Names are stored as array keys, which de-duplicates them with an exact,
// constant-time lookup. Every accepted name starts with a letter, so PHP
// never converts a key to an integer.
$seen = [];
foreach ($nodes as $node) {
    $text = trim($node->textContent);

    /**
     * Regex criteria:
     * 1. Starts with a letter.
     * 2. Contains letters, numbers, hyphens.
     * 3. Optionally ends with an asterisk (to catch "data-*" and "aria-*").
     */
    if (preg_match('/^[a-z][a-z0-9\-]*\*?$/i', $text) !== 1) {
        continue;
    }

    $attr = strtolower($text);

    // Exclude event handlers (anything starting with 'on')
    if (str_starts_with($attr, 'on')) {
        continue;
    }

    $seen[$attr] = true;
}

// Sort ascending
$globalAttributes = array_keys($seen);
sort($globalAttributes, SORT_STRING);

// Generate the PHP file content
$output = "<?php\n\nreturn [\n";
foreach ($globalAttributes as $index => $attr) {
    $output .= " {$index} => '{$attr}',\n";
}
$output .= "];\n";

// Save and Output
$filename = 'html_global-attributes_array.php';
$bytes = file_put_contents($filename, $output);
if ($bytes !== false) {
    echo "<strong>Success:</strong> Extracted " . count($globalAttributes) . " exact Global attributes.<br>\n";
    echo "File saved: <code>{$filename}</code> ({$bytes} bytes).<br>\n";
} else {
    echo "<strong>Error:</strong> Failed to write file to disk.<br>\n";
}
<?php
/**
 * Script: WHATWG Tag Attribute Scraper
 * Source: https://html.spec.whatwg.org/multipage/indices.html
 *
 * GET:  renders a multi-select of the tag names found at index 1 of
 *       html_tags_array.php.
 * POST: parses the WHATWG index page (served from a local cache file when it
 *       is at most 30 days old) and writes one attribute-list PHP file per
 *       selected tag into html_per-tag/.
 */
$tagsFile = 'html_tags_array.php';
$indexUrl = 'https://html.spec.whatwg.org/multipage/indices.html';
$indexCache = 'whatwg_index_cache.html';
$outputDir = 'html_per-tag';
$cacheLimit = 30 * 24 * 60 * 60; // 30 days, in seconds
set_time_limit(10 * 60);

// 1. Load tag names for the selector
$tagNames = [];
if (file_exists($tagsFile)) {
    $tagsData = include($tagsFile);
    if (isset($tagsData[1]) && is_array($tagsData[1])) {
        $tagNames = $tagsData[1];
    }
}

// 2. Handle POST
// REQUEST_METHOD is absent when the script is not run through a web server.
if (($_SERVER['REQUEST_METHOD'] ?? '') === 'POST' && isset($_POST['tags'])) {
    echo "<h3>Scraping Progress:</h3><br>\n";

    // The submitted tags are untrusted: they end up in a file path and in the
    // HTML output. Accept only plain element names (same pattern the parser
    // below uses for tag names) and drop everything else, including nested
    // arrays and anything containing path separators or markup.
    $selectedTags = [];
    foreach ((array) $_POST['tags'] as $rawTag) {
        if (!is_string($rawTag)) {
            echo "<strong>[Skipped]</strong> Ignored a non-string tag value.<br>\n";
            continue;
        }
        $cleanTag = strtolower(str_replace(['<', '>'], '', trim($rawTag)));
        if (preg_match('/^[a-z][a-z0-9]*$/', $cleanTag) !== 1) {
            echo "<strong>[Skipped]</strong> Invalid tag name: <code>"
                . htmlspecialchars($rawTag) . "</code>.<br>\n";
            continue;
        }
        // Keyed by name so a tag submitted twice is processed once.
        $selectedTags[$cleanTag] = true;
    }
    $selectedTags = array_keys($selectedTags);

    // The second is_dir() covers a directory created concurrently by another request.
    if (!is_dir($outputDir) && !mkdir($outputDir, 0777, true) && !is_dir($outputDir)) {
        echo "<strong>[Fatal]</strong> Could not create <code>{$outputDir}</code>. Aborting.<br>\n";
        exit;
    }

    // === STEP 1: Fetch / serve the WHATWG index page from cache ===
    $indexHtml = false;
    if (file_exists($indexCache) && (time() - filemtime($indexCache)) <= $cacheLimit) {
        $indexHtml = file_get_contents($indexCache);
        if ($indexHtml === '') {
            // An empty cache file is useless and would make loadHTML() throw; refetch.
            $indexHtml = false;
        }
        if ($indexHtml !== false) {
            echo "<em>[Cache]</em> Using cached WHATWG index (<code>{$indexCache}</code>).<br>\n";
        }
    }

    if ($indexHtml === false) {
        echo "<em>[Fetch]</em> Downloading WHATWG index page…<br>\n";
        $ch = curl_init($indexUrl);
        curl_setopt_array($ch, [
            CURLOPT_RETURNTRANSFER => true,
            CURLOPT_FOLLOWLOCATION => true,
            CURLOPT_USERAGENT => 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) Asset-Scraper/2.0',
            CURLOPT_CONNECTTIMEOUT => 15,
            CURLOPT_FAILONERROR => true,
        ]);
        $indexHtml = curl_exec($ch);
        $curlErr = curl_error($ch);

        if ($indexHtml === false || $indexHtml === '') {
            $reason = htmlspecialchars($curlErr !== '' ? $curlErr : 'empty response body');
            echo "<strong>[Fatal]</strong> Could not fetch WHATWG index: <em>{$reason}</em>. Aborting.<br>\n";
            exit;
        }

        // Caching is best-effort: a failed write only costs a re-download next time.
        if (file_put_contents($indexCache, $indexHtml) !== false) {
            echo "<em>[Fetch]</em> Downloaded and cached to <code>{$indexCache}</code>.<br>\n";
        } else {
            echo "<em>[Fetch]</em> Downloaded, but could not write cache file <code>{$indexCache}</code>.<br>\n";
        }
    }

    // === STEP 2: Parse — build a complete tag → attributes[] map ===
    // libxml_use_internal_errors() already keeps markup warnings quiet, so no
    // error-suppression operator is needed on loadHTML().
    libxml_use_internal_errors(true);
    $dom = new DOMDocument();
    $dom->loadHTML($indexHtml);
    libxml_clear_errors();
    $xpath = new DOMXPath($dom);

    $tagAttrMap = [];

    // Rows use <th> for the element name, <td> for all other columns
    $rows = $xpath->query('//table//tr[th and td]');
    foreach ($rows as $row) {
        // ── Tag names: read from the <th> cell ───────────────────────────────
        // Element links always point to anchors like #the-input-element
        $thCells = $xpath->query('th', $row);
        if ($thCells->length < 1) {
            continue;
        }

        $tagsInRow = [];
        foreach ($xpath->query('.//a', $thCells->item(0)) as $a) {
            $fragment = (string) (parse_url($a->getAttribute('href'), PHP_URL_FRAGMENT) ?? '');
            $name = strtolower(trim($a->textContent));
            if (str_starts_with($fragment, 'the-') && preg_match('/^[a-z][a-z0-9]*$/', $name)) {
                $tagsInRow[] = $name;
            }
        }
        if (empty($tagsInRow)) {
            continue;
        }

        // ── Attributes: WHITELIST — only keep anchors that match WHATWG's
        // attribute-anchor naming conventions:
        //
        //   attr-*    → almost all element attributes
        //               e.g. #attr-media-src, #attr-input-accept
        //
        //   handler-* → event-handler content attributes on <body>
        //               e.g. #handler-window-onafterprint
        //
        // Everything else (flow-content-2, htmlelement, the-*, concept-*, …)
        // is silently ignored.
        $attrs = [];
        foreach ($xpath->query('.//a', $row) as $a) {
            $fragment = (string) (parse_url($a->getAttribute('href'), PHP_URL_FRAGMENT) ?? '');
            if (!str_starts_with($fragment, 'attr-') &&
                !str_starts_with($fragment, 'handler-')) {
                continue;
            }
            $val = strtolower(trim($a->textContent));
            if (!preg_match('/^[a-z][a-z0-9\-]*$/', $val)) {
                continue;
            }
            $attrs[] = $val;
        }
        $attrs = array_values(array_unique($attrs));
        sort($attrs);

        // Rows shared by multiple tags (e.g. h1–h6) get the same list
        foreach ($tagsInRow as $t) {
            $tagAttrMap[$t] = $attrs;
        }
    }

    echo "<em>[Parse]</em> Built attribute map for <strong>"
        . count($tagAttrMap) . "</strong> element(s).<br><br>\n";

    // === STEP 3: Write per-tag files ===
    // $cleanTag was validated above, so it is safe inside the path and the HTML output.
    foreach ($selectedTags as $cleanTag) {
        $filename = "{$outputDir}/html_per-tag-{$cleanTag}-attributes_array.php";

        if (file_exists($filename) && (time() - filemtime($filename)) <= $cacheLimit) {
            echo "<strong>[Cached]</strong> <code>{$filename}</code> is up to date. Skipping…<br>\n";
            continue;
        }

        $attributes = $tagAttrMap[$cleanTag] ?? [];

        $fileContent = "<?php\n\nreturn [\n";
        foreach ($attributes as $idx => $attr) {
            $fileContent .= " {$idx} => '{$attr}',\n";
        }
        $fileContent .= "];\n";

        if (file_put_contents($filename, $fileContent) !== false) {
            $count = count($attributes);
            $msg = $count > 0
                ? "Saved <strong>{$count}</strong> attributes"
                : "No element-specific attributes found; saved <strong>empty array</strong>";
            echo "<strong>[Success]</strong> {$msg} → <code>{$filename}</code>.<br>\n";
        } else {
            echo "<strong>[Error]</strong> Could not write <code>{$filename}</code>.<br>\n";
        }
    }

    echo "<br><hr><a href=''>&larr; Start New Scrape</a><br>\n";
    exit;
}
?>
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>WHATWG Attribute Scraper</title>
<style>
body { font-family: sans-serif; background: #f0f2f5; padding: 20px; line-height: 1.5; }
.card { max-width: 500px; margin: auto; background: #fff; padding: 25px; border-radius: 10px; box-shadow: 0 2px 10px rgba(0,0,0,0.1); }
select { width: 100%; height: 250px; margin-top: 10px; border: 1px solid #ccc; border-radius: 5px; font-size: 16px; }
button { width: 100%; padding: 12px; background: #007bff; color: #fff; border: none; border-radius: 5px; cursor: pointer; font-size: 16px; margin-top: 16px; }
button:hover { background: #0056b3; }
label { font-weight: bold; }
.help { font-size: 12px; color: #666; margin-bottom: 5px; }
.source { font-size: 12px; color: #888; margin-bottom: 16px; }
</style>
</head>
<body>
<div class="card">
<h2>Tag Attribute Scraper</h2>
<p class="source">
Source: <a href="https://html.spec.whatwg.org/multipage/indices.html" target="_blank" rel="noopener noreferrer">
WHATWG HTML Living Standard — Element Index</a><br>
Index page fetched once, cached 30 days.
</p>
<form method="POST">
<label for="tags">Select Tags (Multiple):</label>
<div class="help">Files saved in <code>/html_per-tag/</code>. 30-day cache applies per file.</div>
<select id="tags" name="tags[]" multiple required>
<?php foreach ($tagNames as $tag): ?>
<option value="<?= htmlspecialchars($tag) ?>"><?= htmlspecialchars($tag) ?></option>
<?php endforeach; ?>
</select>
<button type="submit">Check &amp; Generate Files</button>
</form>
</div>
</body>
</html>
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment