ve3 · March 19, 2026 09:39
diff --git a/html_tags_array.php b/html_tags_array.php
 <?php
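 /**
 * Example output of scrape-all-html.php: HTML tags scraped from MDN.
 *
 * The returned array holds three lists:
 *   0 => current tags with angle brackets, 1 => the same tags as bare names,
 *   2 => obsolete/deprecated tags with angle brackets.
 */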

 return [
    // List 0: tags the scraper classified as current, written with angle brackets.
    // Implicit keys (0..116) are identical to the explicit ones they replace.
    [
        '<a>', '<abbr>', '<address>', '<area>', '<article>', '<aside>', '<audio>',
        '<b>', '<base>', '<bdi>', '<bdo>', '<blockquote>', '<body>', '<br>', '<button>',
        '<canvas>', '<caption>', '<cite>', '<code>', '<col>', '<colgroup>',
        '<data>', '<datalist>', '<dd>', '<del>', '<details>', '<dfn>', '<dialog>', '<div>', '<dl>', '<dt>',
        '<em>', '<embed>',
        '<fencedframe>', '<fieldset>', '<figcaption>', '<figure>', '<footer>', '<form>',
        '<geolocation>',
        '<h1>', '<h2>', '<h3>', '<h4>', '<h5>', '<h6>', '<head>', '<header>', '<hgroup>', '<hr>', '<html>',
        '<i>', '<iframe>', '<img>', '<input>', '<ins>',
        '<kbd>',
        '<label>', '<legend>', '<li>', '<link>',
        '<main>', '<map>', '<mark>', '<math>', '<menu>', '<meta>', '<meter>',
        '<nav>', '<noscript>',
        '<object>', '<ol>', '<optgroup>', '<option>', '<output>',
        '<p>', '<picture>', '<pre>', '<progress>',
        '<q>',
        '<rp>', '<rt>', '<ruby>',
        '<s>', '<samp>', '<script>', '<search>', '<section>', '<select>', '<selectedcontent>', '<slot>', '<small>',
        '<source>', '<span>', '<strong>', '<style>', '<sub>', '<summary>', '<sup>', '<svg>',
        '<table>', '<tbody>', '<td>', '<template>', '<textarea>', '<tfoot>', '<th>', '<thead>', '<time>', '<title>',
        '<tr>', '<track>',
        '<u>', '<ul>',
        '<var>', '<video>',
        '<wbr>',
    ],
    // List 1: the same current tags as bare names (no angle brackets).
    // Implicit keys (0..116) are identical to the explicit ones they replace.
    [
        'a', 'abbr', 'address', 'area', 'article', 'aside', 'audio',
        'b', 'base', 'bdi', 'bdo', 'blockquote', 'body', 'br', 'button',
        'canvas', 'caption', 'cite', 'code', 'col', 'colgroup',
        'data', 'datalist', 'dd', 'del', 'details', 'dfn', 'dialog', 'div', 'dl', 'dt',
        'em', 'embed',
        'fencedframe', 'fieldset', 'figcaption', 'figure', 'footer', 'form',
        'geolocation',
        'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'head', 'header', 'hgroup', 'hr', 'html',
        'i', 'iframe', 'img', 'input', 'ins',
        'kbd',
        'label', 'legend', 'li', 'link',
        'main', 'map', 'mark', 'math', 'menu', 'meta', 'meter',
        'nav', 'noscript',
        'object', 'ol', 'optgroup', 'option', 'output',
        'p', 'picture', 'pre', 'progress',
        'q',
        'rp', 'rt', 'ruby',
        's', 'samp', 'script', 'search', 'section', 'select', 'selectedcontent', 'slot', 'small',
        'source', 'span', 'strong', 'style', 'sub', 'summary', 'sup', 'svg',
        'table', 'tbody', 'td', 'template', 'textarea', 'tfoot', 'th', 'thead', 'time', 'title',
        'tr', 'track',
        'u', 'ul',
        'var', 'video',
        'wbr',
    ],
    // List 2: obsolete/deprecated tags, written with angle brackets.
    // Only the elements that are themselves obsolete belong here. Current
    // elements that are merely named in an obsolete element's description
    // (typically as the suggested replacement) are intentionally not listed.
    [
        0 => '<acronym>',
        1 => '<big>',
        2 => '<center>',
        3 => '<content>',
        4 => '<dir>',
        5 => '<font>',
        6 => '<frame>',
        7 => '<frameset>',
        8 => '<image>',
        9 => '<marquee>',
        10 => '<menuitem>',
        11 => '<nobr>',
        12 => '<noembed>',
        13 => '<noframes>',
        14 => '<param>',
        15 => '<plaintext>',
        16 => '<rb>',
        17 => '<rtc>',
        18 => '<shadow>',
        19 => '<strike>',
        20 => '<tt>',
        21 => '<xmp>',
    ],
 ];
diff --git a/scrape-all-html.php b/scrape-all-html.php
 <?php

 /**
 * Target: MDN HTML Elements Reference
 * Output: html_tags_array.php
 *
 * The generated file returns three lists:
 *   0 => current tags with angle brackets, e.g. '<a>'
 *   1 => the same tags as bare names, e.g. 'a' (index-aligned with list 0)
 *   2 => obsolete/deprecated tags with angle brackets
 */

 $url = 'https://developer.mozilla.org/en-US/docs/Web/HTML/Reference/Elements';
 $outputFilename = 'html_tags_array.php';

 // 1. Fetch the HTML content
 $ch = curl_init($url);
 if ($ch === false) {
    die('cURL Error: could not initialise a cURL handle');
 }
 curl_setopt_array($ch, [
    CURLOPT_RETURNTRANSFER => true,
    CURLOPT_FOLLOWLOCATION => true,
    CURLOPT_USERAGENT      => 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)',
    CURLOPT_CONNECTTIMEOUT => 15,
    CURLOPT_TIMEOUT        => 60,
    // Treat HTTP status >= 400 as a failure instead of parsing an error page.
    CURLOPT_FAILONERROR    => true,
 ]);
 $html = curl_exec($ch);

 if ($html === false) {
    die('cURL Error: ' . curl_error($ch));
 }
 if (trim($html) === '') {
    // DOMDocument::loadHTML() rejects an empty string, so stop before parsing.
    die('cURL Error: empty response body');
 }

 // 2. Parse the HTML. libxml complaints about the markup are collected
 //    internally and then discarded.
 libxml_use_internal_errors(true);
 $dom = new DOMDocument();
 $dom->loadHTML($html);
 libxml_clear_errors();

 $xpath = new DOMXPath($dom);

 // Sets keyed by bare tag name; the key doubles as the duplicate check.
 $normalSet = [];
 $obsoleteSet = [];

 // 3. Walk every table row by row
 foreach ($xpath->query('//table') as $table) {
    // The closest preceding <h2> decides whether the table sits in the
    // "Obsolete" section. The original code relies on the node list being in
    // document order (last item = nearest heading); that is kept as is.
    $precedingH2s = $xpath->query('preceding::h2', $table);
    $lastH2 = $precedingH2s->length > 0
        ? $precedingH2s->item($precedingH2s->length - 1)->textContent
        : '';
    $isObsoleteSection = stripos($lastH2, 'obsolete') !== false
        || stripos($lastH2, 'deprecated') !== false;

    foreach ($xpath->query('.//tr', $table) as $tr) {
        // Only the first cell names the element(s) a row is about. <code>
        // in the other cells mentions related tags (for example the
        // replacement of an obsolete element) and must not be classified
        // as if the row described them.
        $firstCell = $xpath->query('*[self::td or self::th][1]', $tr)->item(0);
        if ($firstCell === null) {
            continue;
        }

        // A row whose text mentions deprecation counts as obsolete as well.
        $rowText = $tr->textContent;
        $isObsolete = $isObsoleteSection
            || stripos($rowText, 'deprecated') !== false
            || stripos($rowText, 'obsolete') !== false;

        foreach ($xpath->query('.//code', $firstCell) as $node) {
            // Match only valid <tag> formats
            if (preg_match('/^<([a-z0-9\-]+)>$/i', trim($node->textContent), $matches) !== 1) {
                continue;
            }

            $tagName = strtolower($matches[1]);
            if ($isObsolete) {
                $obsoleteSet[$tagName] = true;
            } else {
                $normalSet[$tagName] = true;
            }
        }
    }
 }

 // Never replace an existing output file with empty lists.
 if ($normalSet === [] && $obsoleteSet === []) {
    die("Error: no tags found at {$url}; {$outputFilename} was not written.<br>\n");
 }

 // 4. Sort ascending. The bracketed lists are derived from the sorted names so
 //    that lists 0 and 1 always stay index-aligned.
 $wrapTag = static fn (string $name): string => "<{$name}>";

 $normalTagNames = array_map('strval', array_keys($normalSet));
 sort($normalTagNames, SORT_STRING);
 $normalFullTags = array_map($wrapTag, $normalTagNames);

 $obsoleteTagNames = array_map('strval', array_keys($obsoleteSet));
 sort($obsoleteTagNames, SORT_STRING);
 $obsoleteFullTags = array_map($wrapTag, $obsoleteTagNames);

 // 5. Build the PHP file output
 $renderList = static function (array $items): string {
    $code = "    [\n";
    foreach ($items as $index => $item) {
        $code .= "        {$index} => '{$item}',\n";
    }

    return $code . "    ],\n";
 };

 $phpCode = "<?php\n\nreturn [\n"
    . $renderList($normalFullTags)    // Group 1: normal full tags
    . $renderList($normalTagNames)    // Group 2: normal tag names only (without < >)
    . $renderList($obsoleteFullTags)  // Group 3: obsolete full tags
    . "];\n";

 // 6. Save the file and report
 if (file_put_contents($outputFilename, $phpCode) === false) {
    die("Error: failed to write {$outputFilename}.<br>\n");
 }

 echo "Successfully extracted and grouped HTML tags into {$outputFilename}.<br>\n";
 echo "- Normal tags: " . count($normalTagNames) . "<br>\n";
 echo "- Obsolete tags: " . count($obsoleteFullTags) . "<br>\n";
diff --git a/scrape-aria-attributes.php b/scrape-aria-attributes.php
 <?php

 /**
 * Target: MDN ARIA Attributes Reference
 * Output: html_aria-attributes_array.php
 *
 * Collects every <code> element whose text is an aria-* attribute name and
 * writes the sorted, de-duplicated names as a PHP array file.
 */

 $url = 'https://developer.mozilla.org/en-US/docs/Web/Accessibility/ARIA/Reference/Attributes';
 $filename = 'html_aria-attributes_array.php';

 // 1. Initialize handle (PHP 8.4+ compliant - no curl_close)
 $ch = curl_init($url);
 if ($ch === false) {
    die("cURL Error: could not initialise a cURL handle<br>\n");
 }
 curl_setopt_array($ch, [
    CURLOPT_RETURNTRANSFER => true,
    CURLOPT_FOLLOWLOCATION => true,
    CURLOPT_USERAGENT      => 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) Aria-Scraper/1.0',
    CURLOPT_CONNECTTIMEOUT => 15,
    CURLOPT_TIMEOUT        => 60,
    // Treat HTTP status >= 400 as a failure instead of parsing an error page.
    CURLOPT_FAILONERROR    => true,
 ]);

 $html = curl_exec($ch);

 if ($html === false) {
    // Web-friendly error output
    die('cURL Error: ' . curl_error($ch) . "<br>\n");
 }
 if (trim($html) === '') {
    // DOMDocument::loadHTML() rejects an empty string, so stop before parsing.
    die("cURL Error: empty response body<br>\n");
 }

 // 2. DOM & XPath setup (libxml complaints are collected, then discarded)
 libxml_use_internal_errors(true);
 $dom = new DOMDocument();
 $dom->loadHTML($html);
 libxml_clear_errors();

 $xpath = new DOMXPath($dom);

 // 3. Extracting aria-* from <code> tags.
 //    The names are used as array keys, which removes duplicates for free.
 $seen = [];

 foreach ($xpath->query('//code') as $node) {
    $text = trim($node->textContent);

    // Regex targets strings starting with 'aria-' only
    if (preg_match('/^aria-[a-z0-9\-]+$/i', $text) === 1) {
        $seen[strtolower($text)] = true;
    }
 }

 // Never replace an existing output file with an empty list.
 if ($seen === []) {
    die("<strong>Error:</strong> No ARIA attributes found; <code>{$filename}</code> was not written.<br>\n");
 }

 // 4. Sorting
 $ariaAttributes = array_keys($seen);
 sort($ariaAttributes, SORT_STRING);

 // 5. Build the file string
 $output = "<?php\n\nreturn [\n";
 foreach ($ariaAttributes as $index => $attr) {
    $output .= "    {$index} => '{$attr}',\n";
 }
 $output .= "];\n";

 // 6. Save and Report
 $bytes = file_put_contents($filename, $output);

 if ($bytes !== false) {
    echo "<strong>Success:</strong> Extracted " . count($ariaAttributes) . " ARIA attributes.<br>\n";
    echo "File saved as: <code>{$filename}</code> ({$bytes} bytes).<br>\n";
 } else {
    echo "<strong>Error:</strong> Failed to write file to disk.<br>\n";
 }
diff --git a/scrape-global-attributes.php b/scrape-global-attributes.php
 <?php

 /**
 * Target: MDN Global HTML Attributes Reference
 * Output: html_global-attributes_array.php
 *
 * Reads the attribute names from the <dt> terms inside the page's <main> or
 * <article> content and writes them, sorted and without duplicates, as a PHP
 * array file. Names starting with "on" (event handlers) are left out.
 */

 $url = 'https://developer.mozilla.org/en-US/docs/Web/HTML/Reference/Global_attributes';
 $filename = 'html_global-attributes_array.php';

 $ch = curl_init($url);
 if ($ch === false) {
    die("<strong>Error:</strong> cURL failed - could not initialise a cURL handle<br>\n");
 }
 curl_setopt_array($ch, [
    CURLOPT_RETURNTRANSFER => true,
    CURLOPT_FOLLOWLOCATION => true,
    CURLOPT_USERAGENT      => 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) Global-Attr-Scraper/2.0',
    CURLOPT_CONNECTTIMEOUT => 15,
    CURLOPT_TIMEOUT        => 60,
    // Treat HTTP status >= 400 as a failure instead of parsing an error page.
    CURLOPT_FAILONERROR    => true,
 ]);

 $html = curl_exec($ch);

 if ($html === false) {
    die('<strong>Error:</strong> cURL failed - ' . curl_error($ch) . "<br>\n");
 }
 if (trim($html) === '') {
    // DOMDocument::loadHTML() rejects an empty string, so stop before parsing.
    die("<strong>Error:</strong> cURL failed - empty response body<br>\n");
 }

 // libxml complaints about the markup are collected internally, then discarded.
 libxml_use_internal_errors(true);
 $dom = new DOMDocument();
 $dom->loadHTML($html);
 libxml_clear_errors();

 $xpath = new DOMXPath($dom);

 /**
 * Target ONLY the <main> or <article> area to avoid the sidebar navigation.
 * Target ONLY <dt> (Definition Terms) to avoid grabbing values like 'false' from code examples.
 */
 $nodes = $xpath->query('//main//dt//code | //article//dt//code');

 // The names are used as array keys, which removes duplicates for free.
 $seen = [];

 foreach ($nodes as $node) {
    $text = trim($node->textContent);

    /**
     * Regex criteria:
     * 1. Starts with a letter.
     * 2. Contains letters, numbers, hyphens.
     * 3. Optionally ends with an asterisk (to catch "data-*" and "aria-*").
     */
    if (preg_match('/^[a-z][a-z0-9\-]*\*?$/i', $text) !== 1) {
        continue;
    }

    $attr = strtolower($text);

    // Exclude event handlers (anything starting with 'on')
    if (str_starts_with($attr, 'on')) {
        continue;
    }

    $seen[$attr] = true;
 }

 // Never replace an existing output file with an empty list.
 if ($seen === []) {
    die("<strong>Error:</strong> No global attributes found; <code>{$filename}</code> was not written.<br>\n");
 }

 // Sort ascending
 $globalAttributes = array_keys($seen);
 sort($globalAttributes, SORT_STRING);

 // Generate the PHP file content
 $output = "<?php\n\nreturn [\n";
 foreach ($globalAttributes as $index => $attr) {
    $output .= "    {$index} => '{$attr}',\n";
 }
 $output .= "];\n";

 // Save and Output
 $bytes = file_put_contents($filename, $output);

 if ($bytes !== false) {
    echo "<strong>Success:</strong> Extracted " . count($globalAttributes) . " exact Global attributes.<br>\n";
    echo "File saved: <code>{$filename}</code> ({$bytes} bytes).<br>\n";
 } else {
    echo "<strong>Error:</strong> Failed to write file to disk.<br>\n";
 }
diff --git a/scrape-tag-attributes.php b/scrape-tag-attributes.php
 <?php

 /**
 * Script: WHATWG Tag Attribute Scraper
 * Source:  https://html.spec.whatwg.org/multipage/indices.html
 *
 * GET:  shows a form listing the tag names from html_tags_array.php (list 1).
 * POST: loads the WHATWG index page (cached for 30 days), builds a
 *       tag => attributes map from its tables and writes one PHP array file
 *       per selected tag into html_per-tag/ (each file is cached 30 days too).
 */

 $tagsFile   = 'html_tags_array.php';
 $indexUrl   = 'https://html.spec.whatwg.org/multipage/indices.html';
 $indexCache = 'whatwg_index_cache.html';
 $outputDir  = 'html_per-tag';
 $cacheLimit = 30 * 24 * 60 * 60; // 30 days, in seconds

 set_time_limit(10 * 60);

 // 1. Load tag names for the selector
 $tagNames = [];
 if (file_exists($tagsFile)) {
    $tagsData = include($tagsFile);
    if (isset($tagsData[1]) && is_array($tagsData[1])) {
        // Only strings can be rendered as <option> values below.
        $tagNames = array_values(array_filter($tagsData[1], 'is_string'));
    }
 }

 // 2. Handle POST (REQUEST_METHOD is not set when the script runs from the CLI)
 if (($_SERVER['REQUEST_METHOD'] ?? '') === 'POST' && isset($_POST['tags'])) {
    echo "<h3>Scraping Progress:</h3><br>\n";

    // The posted values are untrusted and end up in a file path and in HTML
    // output, so only plain tag names (letters, digits, hyphens) are accepted.
    // Using the name as key also drops duplicate selections.
    $selectedTags = [];
    foreach ((array)$_POST['tags'] as $rawTag) {
        if (!is_string($rawTag)) {
            echo "<strong>[Skipped]</strong> Ignored a non-text tag value.<br>\n";
            continue;
        }

        $cleanTag = strtolower(str_replace(['<', '>'], '', trim($rawTag)));
        if (preg_match('/^[a-z][a-z0-9\-]*\z/', $cleanTag) !== 1) {
            echo "<strong>[Skipped]</strong> Invalid tag name <code>"
                . htmlspecialchars($rawTag, ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8')
                . "</code>.<br>\n";
            continue;
        }

        $selectedTags[$cleanTag] = true;
    }

    if (!is_dir($outputDir) && !mkdir($outputDir, 0777, true) && !is_dir($outputDir)) {
        echo "<strong>[Fatal]</strong> Could not create directory <code>{$outputDir}</code>. Aborting.<br>\n";
        exit;
    }

    // === STEP 1: Fetch / serve the WHATWG index page from cache ===
    $indexHtml = false;

    if (file_exists($indexCache) && (time() - filemtime($indexCache)) <= $cacheLimit) {
        $cached = file_get_contents($indexCache);
        // An empty cache file cannot be parsed (DOMDocument::loadHTML() rejects
        // an empty string), so it is treated as a cache miss.
        if ($cached !== false && trim($cached) !== '') {
            $indexHtml = $cached;
            echo "<em>[Cache]</em> Using cached WHATWG index (<code>{$indexCache}</code>).<br>\n";
        }
    }

    if ($indexHtml === false) {
        echo "<em>[Fetch]</em> Downloading WHATWG index page…<br>\n";
        $ch = curl_init($indexUrl);
        if ($ch === false) {
            echo "<strong>[Fatal]</strong> Could not initialise cURL. Aborting.<br>\n";
            exit;
        }
        curl_setopt_array($ch, [
            CURLOPT_RETURNTRANSFER => true,
            CURLOPT_FOLLOWLOCATION => true,
            CURLOPT_USERAGENT      => 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) Asset-Scraper/2.0',
            CURLOPT_CONNECTTIMEOUT => 15,
            CURLOPT_FAILONERROR    => true,
        ]);
        $indexHtml = curl_exec($ch);

        if ($indexHtml === false || trim($indexHtml) === '') {
            $curlErr = htmlspecialchars(
                $indexHtml === false ? curl_error($ch) : 'empty response body',
                ENT_QUOTES | ENT_SUBSTITUTE,
                'UTF-8'
            );
            echo "<strong>[Fatal]</strong> Could not fetch WHATWG index: <em>{$curlErr}</em>. Aborting.<br>\n";
            exit;
        }

        if (file_put_contents($indexCache, $indexHtml) !== false) {
            echo "<em>[Fetch]</em> Downloaded and cached to <code>{$indexCache}</code>.<br>\n";
        } else {
            // Not fatal: the page is in memory, it just will be fetched again next time.
            echo "<em>[Fetch]</em> Downloaded, but could not write cache file <code>{$indexCache}</code>.<br>\n";
        }
    }

    // === STEP 2: Parse — build a complete tag → attributes[] map ===
    // libxml complaints about the markup are collected internally, then discarded.
    libxml_use_internal_errors(true);
    $dom = new DOMDocument();
    $dom->loadHTML($indexHtml);
    libxml_clear_errors();
    $xpath = new DOMXPath($dom);

    $tagAttrMap = [];

    // Rows use <th> for the element name, <td> for all other columns
    $rows = $xpath->query('//table//tr[th and td]');

    foreach ($rows as $row) {

        // ── Tag names: read from the <th> cell ───────────────────────────────
        // Element links always point to anchors like #the-input-element
        $thCells = $xpath->query('th', $row);
        if ($thCells->length < 1) continue;

        $tagsInRow = [];
        foreach ($xpath->query('.//a', $thCells->item(0)) as $a) {
            $fragment = (string)(parse_url($a->getAttribute('href'), PHP_URL_FRAGMENT) ?? '');
            $name     = strtolower(trim($a->textContent));
            if (str_starts_with($fragment, 'the-') && preg_match('/^[a-z][a-z0-9]*$/', $name)) {
                $tagsInRow[] = $name;
            }
        }
        if (empty($tagsInRow)) continue;

        // ── Attributes: WHITELIST — only keep anchors that match WHATWG's
        //    attribute-anchor naming conventions:
        //
        //      attr-*     → almost all element attributes
        //                   e.g. #attr-media-src, #attr-input-accept
        //
        //      handler-*  → event-handler content attributes on <body>
        //                   e.g. #handler-window-onafterprint
        //
        //    Everything else (flow-content-2, htmlelement, the-*, concept-*, …)
        //    is silently ignored.
        $attrs = [];
        foreach ($xpath->query('.//a', $row) as $a) {
            $fragment = (string)(parse_url($a->getAttribute('href'), PHP_URL_FRAGMENT) ?? '');

            if (!str_starts_with($fragment, 'attr-') &&
                !str_starts_with($fragment, 'handler-')) {
                continue;
            }

            $val = strtolower(trim($a->textContent));
            if (!preg_match('/^[a-z][a-z0-9\-]*$/', $val)) continue;

            $attrs[] = $val;
        }

        // Rows shared by multiple tags (e.g. h1–h6) get the same list.
        // Lists are merged rather than assigned, so a later row for the same
        // tag cannot wipe out attributes collected from an earlier row.
        foreach ($tagsInRow as $t) {
            $tagAttrMap[$t] = array_merge($tagAttrMap[$t] ?? [], $attrs);
        }
    }

    // Normalise every list once: unique values, ascending order, keys 0..n-1.
    foreach ($tagAttrMap as $t => $list) {
        $list = array_values(array_unique($list));
        sort($list);
        $tagAttrMap[$t] = $list;
    }

    echo "<em>[Parse]</em> Built attribute map for <strong>"
        . count($tagAttrMap) . "</strong> element(s).<br><br>\n";

    // === STEP 3: Write per-tag files ===
    foreach (array_keys($selectedTags) as $cleanTag) {
        $cleanTag = (string)$cleanTag;
        $filename = "{$outputDir}/html_per-tag-{$cleanTag}-attributes_array.php";

        if (file_exists($filename) && (time() - filemtime($filename)) <= $cacheLimit) {
            echo "<strong>[Cached]</strong> <code>{$filename}</code> is up to date. Skipping…<br>\n";
            continue;
        }

        $attributes = $tagAttrMap[$cleanTag] ?? [];

        $fileContent = "<?php\n\nreturn [\n";
        foreach ($attributes as $idx => $attr) {
            $fileContent .= "    {$idx} => '{$attr}',\n";
        }
        $fileContent .= "];\n";

        if (file_put_contents($filename, $fileContent) !== false) {
            $count = count($attributes);
            $msg   = $count > 0
                ? "Saved <strong>{$count}</strong> attributes"
                : "No element-specific attributes found; saved <strong>empty array</strong>";
            echo "<strong>[Success]</strong> {$msg} → <code>{$filename}</code>.<br>\n";
        } else {
            echo "<strong>[Error]</strong> Could not write <code>{$filename}</code>.<br>\n";
        }
    }

    echo "<br><hr><a href=''>&larr; Start New Scrape</a><br>\n";
    exit;
 }
 ?>

 <!DOCTYPE html>
 <html lang="en">
 <head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>WHATWG Attribute Scraper</title>
    <style>
        body { font-family: sans-serif; background: #f0f2f5; padding: 20px; line-height: 1.5; }
        .card { max-width: 500px; margin: auto; background: #fff; padding: 25px; border-radius: 10px; box-shadow: 0 2px 10px rgba(0,0,0,0.1); }
        select { width: 100%; height: 250px; margin-top: 10px; border: 1px solid #ccc; border-radius: 5px; font-size: 16px; }
        button { width: 100%; padding: 12px; background: #007bff; color: #fff; border: none; border-radius: 5px; cursor: pointer; font-size: 16px; margin-top: 16px; }
        button:hover { background: #0056b3; }
        label { font-weight: bold; }
        .help { font-size: 12px; color: #666; margin-bottom: 5px; }
        .source { font-size: 12px; color: #888; margin-bottom: 16px; }
    </style>
 </head>
 <body>
 <div class="card">
    <h2>Tag Attribute Scraper</h2>
    <p class="source">
        Source: <a href="https://html.spec.whatwg.org/multipage/indices.html" target="_blank" rel="noopener noreferrer">
        WHATWG HTML Living Standard — Element Index</a><br>
        Index page fetched once, cached 30 days.
    </p>
    <form method="POST">
        <label for="tags">Select Tags (Multiple):</label>
        <div class="help">Files saved in <code>/html_per-tag/</code>. 30-day cache applies per file.</div>
        <select id="tags" name="tags[]" multiple required>
            <?php foreach ($tagNames as $tag): ?>
                <option value="<?= htmlspecialchars($tag) ?>"><?= htmlspecialchars($tag) ?></option>
            <?php endforeach; ?>
        </select>
        <button type="submit">Check &amp; Generate Files</button>
    </form>
 </div>
 </body>
 </html>
	<?php
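	/**
	* Example output of scrape-all-html.php: HTML tags scraped from MDN.
	*
	* The returned array holds three lists:
	*   0 => current tags with angle brackets, 1 => the same tags as bare names,
	*   2 => obsolete/deprecated tags with angle brackets.
	*/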

	return [
	// List 0: tags the scraper classified as current, written with angle brackets.
	// Implicit keys (0..116) are identical to the explicit ones they replace.
	[
	    '<a>', '<abbr>', '<address>', '<area>', '<article>', '<aside>', '<audio>',
	    '<b>', '<base>', '<bdi>', '<bdo>', '<blockquote>', '<body>', '<br>', '<button>',
	    '<canvas>', '<caption>', '<cite>', '<code>', '<col>', '<colgroup>',
	    '<data>', '<datalist>', '<dd>', '<del>', '<details>', '<dfn>', '<dialog>', '<div>', '<dl>', '<dt>',
	    '<em>', '<embed>',
	    '<fencedframe>', '<fieldset>', '<figcaption>', '<figure>', '<footer>', '<form>',
	    '<geolocation>',
	    '<h1>', '<h2>', '<h3>', '<h4>', '<h5>', '<h6>', '<head>', '<header>', '<hgroup>', '<hr>', '<html>',
	    '<i>', '<iframe>', '<img>', '<input>', '<ins>',
	    '<kbd>',
	    '<label>', '<legend>', '<li>', '<link>',
	    '<main>', '<map>', '<mark>', '<math>', '<menu>', '<meta>', '<meter>',
	    '<nav>', '<noscript>',
	    '<object>', '<ol>', '<optgroup>', '<option>', '<output>',
	    '<p>', '<picture>', '<pre>', '<progress>',
	    '<q>',
	    '<rp>', '<rt>', '<ruby>',
	    '<s>', '<samp>', '<script>', '<search>', '<section>', '<select>', '<selectedcontent>', '<slot>', '<small>',
	    '<source>', '<span>', '<strong>', '<style>', '<sub>', '<summary>', '<sup>', '<svg>',
	    '<table>', '<tbody>', '<td>', '<template>', '<textarea>', '<tfoot>', '<th>', '<thead>', '<time>', '<title>',
	    '<tr>', '<track>',
	    '<u>', '<ul>',
	    '<var>', '<video>',
	    '<wbr>',
	],
	// List 1: the same current tags as bare names (no angle brackets).
	// Implicit keys (0..116) are identical to the explicit ones they replace.
	[
	    'a', 'abbr', 'address', 'area', 'article', 'aside', 'audio',
	    'b', 'base', 'bdi', 'bdo', 'blockquote', 'body', 'br', 'button',
	    'canvas', 'caption', 'cite', 'code', 'col', 'colgroup',
	    'data', 'datalist', 'dd', 'del', 'details', 'dfn', 'dialog', 'div', 'dl', 'dt',
	    'em', 'embed',
	    'fencedframe', 'fieldset', 'figcaption', 'figure', 'footer', 'form',
	    'geolocation',
	    'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'head', 'header', 'hgroup', 'hr', 'html',
	    'i', 'iframe', 'img', 'input', 'ins',
	    'kbd',
	    'label', 'legend', 'li', 'link',
	    'main', 'map', 'mark', 'math', 'menu', 'meta', 'meter',
	    'nav', 'noscript',
	    'object', 'ol', 'optgroup', 'option', 'output',
	    'p', 'picture', 'pre', 'progress',
	    'q',
	    'rp', 'rt', 'ruby',
	    's', 'samp', 'script', 'search', 'section', 'select', 'selectedcontent', 'slot', 'small',
	    'source', 'span', 'strong', 'style', 'sub', 'summary', 'sup', 'svg',
	    'table', 'tbody', 'td', 'template', 'textarea', 'tfoot', 'th', 'thead', 'time', 'title',
	    'tr', 'track',
	    'u', 'ul',
	    'var', 'video',
	    'wbr',
	],
	// List 2: obsolete/deprecated tags, written with angle brackets.
	// Only the elements that are themselves obsolete belong here. Current
	// elements that are merely named in an obsolete element's description
	// (typically as the suggested replacement) are intentionally not listed.
	[
	    0 => '<acronym>',
	    1 => '<big>',
	    2 => '<center>',
	    3 => '<content>',
	    4 => '<dir>',
	    5 => '<font>',
	    6 => '<frame>',
	    7 => '<frameset>',
	    8 => '<image>',
	    9 => '<marquee>',
	    10 => '<menuitem>',
	    11 => '<nobr>',
	    12 => '<noembed>',
	    13 => '<noframes>',
	    14 => '<param>',
	    15 => '<plaintext>',
	    16 => '<rb>',
	    17 => '<rtc>',
	    18 => '<shadow>',
	    19 => '<strike>',
	    20 => '<tt>',
	    21 => '<xmp>',
	],
	];
	<?php

	/**
	 * Target: MDN HTML Elements Reference
	 * Output: html_tags_array.php
	 *
	 * The generated file returns three lists:
	 *   0 => current tags with angle brackets, e.g. '<a>'
	 *   1 => the same tags as bare names, e.g. 'a' (index-aligned with list 0)
	 *   2 => obsolete/deprecated tags with angle brackets
	 */

	$url = 'https://developer.mozilla.org/en-US/docs/Web/HTML/Reference/Elements';
	$outputFilename = 'html_tags_array.php';

	// 1. Fetch the HTML content
	$ch = curl_init($url);
	if ($ch === false) {
	    die('cURL Error: could not initialise a cURL handle');
	}
	curl_setopt_array($ch, [
	    CURLOPT_RETURNTRANSFER => true,
	    CURLOPT_FOLLOWLOCATION => true,
	    CURLOPT_USERAGENT      => 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)',
	    CURLOPT_CONNECTTIMEOUT => 15,
	    CURLOPT_TIMEOUT        => 60,
	    // Treat HTTP status >= 400 as a failure instead of parsing an error page.
	    CURLOPT_FAILONERROR    => true,
	]);
	$html = curl_exec($ch);

	if ($html === false) {
	    die('cURL Error: ' . curl_error($ch));
	}
	if (trim($html) === '') {
	    // DOMDocument::loadHTML() rejects an empty string, so stop before parsing.
	    die('cURL Error: empty response body');
	}

	// 2. Parse the HTML. libxml complaints about the markup are collected
	//    internally and then discarded.
	libxml_use_internal_errors(true);
	$dom = new DOMDocument();
	$dom->loadHTML($html);
	libxml_clear_errors();

	$xpath = new DOMXPath($dom);

	// Sets keyed by bare tag name; the key doubles as the duplicate check.
	$normalSet = [];
	$obsoleteSet = [];

	// 3. Walk every table row by row
	foreach ($xpath->query('//table') as $table) {
	    // The closest preceding <h2> decides whether the table sits in the
	    // "Obsolete" section. The original code relies on the node list being in
	    // document order (last item = nearest heading); that is kept as is.
	    $precedingH2s = $xpath->query('preceding::h2', $table);
	    $lastH2 = $precedingH2s->length > 0
	        ? $precedingH2s->item($precedingH2s->length - 1)->textContent
	        : '';
	    $isObsoleteSection = stripos($lastH2, 'obsolete') !== false
	        || stripos($lastH2, 'deprecated') !== false;

	    foreach ($xpath->query('.//tr', $table) as $tr) {
	        // Only the first cell names the element(s) a row is about. <code>
	        // in the other cells mentions related tags (for example the
	        // replacement of an obsolete element) and must not be classified
	        // as if the row described them.
	        $firstCell = $xpath->query('*[self::td or self::th][1]', $tr)->item(0);
	        if ($firstCell === null) {
	            continue;
	        }

	        // A row whose text mentions deprecation counts as obsolete as well.
	        $rowText = $tr->textContent;
	        $isObsolete = $isObsoleteSection
	            || stripos($rowText, 'deprecated') !== false
	            || stripos($rowText, 'obsolete') !== false;

	        foreach ($xpath->query('.//code', $firstCell) as $node) {
	            // Match only valid <tag> formats
	            if (preg_match('/^<([a-z0-9\-]+)>$/i', trim($node->textContent), $matches) !== 1) {
	                continue;
	            }

	            $tagName = strtolower($matches[1]);
	            if ($isObsolete) {
	                $obsoleteSet[$tagName] = true;
	            } else {
	                $normalSet[$tagName] = true;
	            }
	        }
	    }
	}

	// Never replace an existing output file with empty lists.
	if ($normalSet === [] && $obsoleteSet === []) {
	    die("Error: no tags found at {$url}; {$outputFilename} was not written.<br>\n");
	}

	// 4. Sort ascending. The bracketed lists are derived from the sorted names so
	//    that lists 0 and 1 always stay index-aligned.
	$wrapTag = static fn (string $name): string => "<{$name}>";

	$normalTagNames = array_map('strval', array_keys($normalSet));
	sort($normalTagNames, SORT_STRING);
	$normalFullTags = array_map($wrapTag, $normalTagNames);

	$obsoleteTagNames = array_map('strval', array_keys($obsoleteSet));
	sort($obsoleteTagNames, SORT_STRING);
	$obsoleteFullTags = array_map($wrapTag, $obsoleteTagNames);

	// 5. Build the PHP file output (4/8-space indentation in the generated file)
	$renderList = static function (array $items): string {
	    $code = "    [\n";
	    foreach ($items as $index => $item) {
	        $code .= "        {$index} => '{$item}',\n";
	    }

	    return $code . "    ],\n";
	};

	$phpCode = "<?php\n\nreturn [\n"
	    . $renderList($normalFullTags)    // Group 1: normal full tags
	    . $renderList($normalTagNames)    // Group 2: normal tag names only (without < >)
	    . $renderList($obsoleteFullTags)  // Group 3: obsolete full tags
	    . "];\n";

	// 6. Save the file and report
	if (file_put_contents($outputFilename, $phpCode) === false) {
	    die("Error: failed to write {$outputFilename}.<br>\n");
	}

	echo "Successfully extracted and grouped HTML tags into {$outputFilename}.<br>\n";
	echo "- Normal tags: " . count($normalTagNames) . "<br>\n";
	echo "- Obsolete tags: " . count($obsoleteFullTags) . "<br>\n";
	<?php

	/**
	 * Target: MDN ARIA Attributes Reference
	 * Output: html_aria-attributes_array.php
	 *
	 * Collects every <code> element whose text is an aria-* attribute name and
	 * writes the sorted, de-duplicated names as a PHP array file.
	 */

	$url = 'https://developer.mozilla.org/en-US/docs/Web/Accessibility/ARIA/Reference/Attributes';
	$filename = 'html_aria-attributes_array.php';

	// 1. Initialize handle (PHP 8.4+ compliant - no curl_close)
	$ch = curl_init($url);
	if ($ch === false) {
	    die("cURL Error: could not initialise a cURL handle<br>\n");
	}
	curl_setopt_array($ch, [
	    CURLOPT_RETURNTRANSFER => true,
	    CURLOPT_FOLLOWLOCATION => true,
	    CURLOPT_USERAGENT      => 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) Aria-Scraper/1.0',
	    CURLOPT_CONNECTTIMEOUT => 15,
	    CURLOPT_TIMEOUT        => 60,
	    // Treat HTTP status >= 400 as a failure instead of parsing an error page.
	    CURLOPT_FAILONERROR    => true,
	]);

	$html = curl_exec($ch);

	if ($html === false) {
	    // Web-friendly error output
	    die('cURL Error: ' . curl_error($ch) . "<br>\n");
	}
	if (trim($html) === '') {
	    // DOMDocument::loadHTML() rejects an empty string, so stop before parsing.
	    die("cURL Error: empty response body<br>\n");
	}

	// 2. DOM & XPath setup (libxml complaints are collected, then discarded)
	libxml_use_internal_errors(true);
	$dom = new DOMDocument();
	$dom->loadHTML($html);
	libxml_clear_errors();

	$xpath = new DOMXPath($dom);

	// 3. Extracting aria-* from <code> tags.
	//    The names are used as array keys, which removes duplicates for free.
	$seen = [];

	foreach ($xpath->query('//code') as $node) {
	    $text = trim($node->textContent);

	    // Regex targets strings starting with 'aria-' only
	    if (preg_match('/^aria-[a-z0-9\-]+$/i', $text) === 1) {
	        $seen[strtolower($text)] = true;
	    }
	}

	// Never replace an existing output file with an empty list.
	if ($seen === []) {
	    die("<strong>Error:</strong> No ARIA attributes found; <code>{$filename}</code> was not written.<br>\n");
	}

	// 4. Sorting
	$ariaAttributes = array_keys($seen);
	sort($ariaAttributes, SORT_STRING);

	// 5. Build the file string (4-space indentation in the generated file)
	$output = "<?php\n\nreturn [\n";
	foreach ($ariaAttributes as $index => $attr) {
	    $output .= "    {$index} => '{$attr}',\n";
	}
	$output .= "];\n";

	// 6. Save and Report
	$bytes = file_put_contents($filename, $output);

	if ($bytes !== false) {
	    echo "<strong>Success:</strong> Extracted " . count($ariaAttributes) . " ARIA attributes.<br>\n";
	    echo "File saved as: <code>{$filename}</code> ({$bytes} bytes).<br>\n";
	} else {
	    echo "<strong>Error:</strong> Failed to write file to disk.<br>\n";
	}
	<?php

	/**
	* Target: MDN Global HTML Attributes Reference
	* Output: html_global-attributes_array.php
	*/

	$url = 'https://developer.mozilla.org/en-US/docs/Web/HTML/Reference/Global_attributes';

	$ch = curl_init($url);
	curl_setopt_array($ch, [
	CURLOPT_RETURNTRANSFER => true,
	CURLOPT_FOLLOWLOCATION => true,
	CURLOPT_USERAGENT => 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) Global-Attr-Scraper/2.0',
	]);

	$html = curl_exec($ch);

	if ($html === false) {
	die('<strong>Error:</strong> cURL failed - ' . curl_error($ch) . "<br>\n");
	}

	libxml_use_internal_errors(true);
	$dom = new DOMDocument();
	$dom->loadHTML($html);
	libxml_clear_errors();

	$xpath = new DOMXPath($dom);

	/**
	 * Collect global attribute names from the parsed page into $globalAttributes.
	 *
	 * Only <code> elements inside a <dt> (definition term) below <main> or <article>
	 * are inspected. This is meant to avoid the sidebar navigation and values such as
	 * 'false' that appear in code examples.
	 *
	 * The two location paths are joined with the XPath union operator "|"; a
	 * backslash in front of it is not valid XPath and makes query() return false.
	 */
	$nodes = $xpath->query('//main//dt//code | //article//dt//code');
	$seen = [];

	foreach ($nodes as $node) {
	    $text = trim($node->textContent);

	    /**
	     * Regex criteria:
	     * 1. Starts with a letter.
	     * 2. Continues with any number of letters, digits or hyphens.
	     * 3. Optionally ends with a literal asterisk (to catch "data-*" and "aria-*").
	     */
	    if (preg_match('/^[a-z][a-z0-9\-]*\*?$/i', $text) !== 1) {
	        continue;
	    }

	    $attr = strtolower($text);

	    // Exclude event handlers (anything starting with 'on')
	    if (strpos($attr, 'on') === 0) {
	        continue;
	    }

	    // Array keys give exact, constant-time de-duplication. Every name starts
	    // with a letter, so PHP never converts a key to an integer.
	    $seen[$attr] = true;
	}

	// Sort ascending
	$globalAttributes = array_keys($seen);
	sort($globalAttributes);

	// Render the collected names as the source of a PHP file that returns an
	// indexed array, one "index => 'name'" entry per line.
	$entries = [];
	foreach ($globalAttributes as $position => $name) {
	    $entries[] = " {$position} => '{$name}',\n";
	}
	$output = "<?php\n\nreturn [\n" . implode('', $entries) . "];\n";

	// Write the file and report the outcome as HTML
	$filename = 'html_global-attributes_array.php';
	$bytes = file_put_contents($filename, $output);

	if ($bytes === false) {
	    echo "<strong>Error:</strong> Failed to write file to disk.<br>\n";
	} else {
	    $total = count($globalAttributes);
	    echo "<strong>Success:</strong> Extracted " . $total . " exact Global attributes.<br>\n";
	    echo "File saved: <code>{$filename}</code> ({$bytes} bytes).<br>\n";
	}
	<?php

	/**
	 * Script: WHATWG Tag Attribute Scraper
	 * Source: https://html.spec.whatwg.org/multipage/indices.html
	 *
	 * Settings used by the rest of the script, plus the tag list offered in the form.
	 */

	// Raise the execution time limit to ten minutes.
	set_time_limit(10 * 60);

	$indexUrl = 'https://html.spec.whatwg.org/multipage/indices.html';
	$indexCache = 'whatwg_index_cache.html';
	$tagsFile = 'html_tags_array.php';
	$cacheLimit = 60 * 60 * 24 * 30; // 30 days, expressed in seconds

	// 1. Load tag names for the selector
	// The tags file is expected to return an array; its entry at index 1 is used as
	// the list of names. Anything else leaves the list empty.
	$tagNames = [];
	if (file_exists($tagsFile)) {
	    $loaded = include $tagsFile;
	    $secondEntry = $loaded[1] ?? null;
	    if (is_array($secondEntry)) {
	        $tagNames = $secondEntry;
	    }
	}

	// 2. Handle POST
	//
	// For every submitted tag this branch writes
	// html_per-tag/html_per-tag-<tag>-attributes_array.php, a PHP file returning the
	// attribute names listed for that element in the WHATWG index tables. The index
	// page is downloaded at most once per $cacheLimit seconds and kept in $indexCache;
	// per-tag files younger than $cacheLimit are left untouched. Progress is echoed
	// as HTML and the script exits when the branch is done.
	if (($_SERVER['REQUEST_METHOD'] ?? '') === 'POST' && isset($_POST['tags'])) {
	    echo "<h3>Scraping Progress:</h3><br>\n";

	    $selectedTags = (array)$_POST['tags'];
	    $outputDir = 'html_per-tag';

	    // Without the output directory no file can be written, so stop right away.
	    // The trailing is_dir() covers a directory created concurrently by another request.
	    if (!is_dir($outputDir) && !mkdir($outputDir, 0777, true) && !is_dir($outputDir)) {
	        echo "<strong>[Fatal]</strong> Could not create directory <code>{$outputDir}</code>. Aborting.<br>\n";
	        exit;
	    }

	    // === STEP 1: Fetch / serve the WHATWG index page from cache ===
	    $indexHtml = false;

	    if (file_exists($indexCache) && (time() - filemtime($indexCache)) <= $cacheLimit) {
	        $indexHtml = file_get_contents($indexCache);
	        if ($indexHtml === '') {
	            // A zero-byte cache file cannot be parsed; treat it as a cache miss.
	            $indexHtml = false;
	        }
	        if ($indexHtml !== false) {
	            echo "<em>[Cache]</em> Using cached WHATWG index (<code>{$indexCache}</code>).<br>\n";
	        }
	    }

	    if ($indexHtml === false) {
	        echo "<em>[Fetch]</em> Downloading WHATWG index page…<br>\n";
	        $ch = curl_init($indexUrl);
	        curl_setopt_array($ch, [
	            CURLOPT_RETURNTRANSFER => true,
	            CURLOPT_FOLLOWLOCATION => true,
	            CURLOPT_USERAGENT => 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) Asset-Scraper/2.0',
	            CURLOPT_CONNECTTIMEOUT => 15,
	            CURLOPT_FAILONERROR => true,
	        ]);
	        $indexHtml = curl_exec($ch);
	        $curlErr = curl_error($ch);

	        if ($indexHtml === false || $indexHtml === '') {
	            // The transport message is escaped because it is echoed into HTML.
	            $reason = $indexHtml === false ? htmlspecialchars($curlErr) : 'empty response';
	            echo "<strong>[Fatal]</strong> Could not fetch WHATWG index: <em>{$reason}</em>. Aborting.<br>\n";
	            exit;
	        }

	        // A failed cache write is not fatal: the downloaded page is still usable now.
	        if (file_put_contents($indexCache, $indexHtml) !== false) {
	            echo "<em>[Fetch]</em> Downloaded and cached to <code>{$indexCache}</code>.<br>\n";
	        } else {
	            echo "<em>[Fetch]</em> Downloaded, but could not write cache file <code>{$indexCache}</code>.<br>\n";
	        }
	    }

	    // === STEP 2: Parse — build a complete tag → attributes[] map ===
	    // libxml diagnostics are collected internally (and cleared afterwards), so no
	    // error suppression operator is needed on loadHTML().
	    libxml_use_internal_errors(true);
	    $dom = new DOMDocument();
	    $dom->loadHTML($indexHtml);
	    libxml_clear_errors();
	    $xpath = new DOMXPath($dom);

	    $tagAttrMap = [];

	    // Rows use <th> for the element name, <td> for all other columns
	    $rows = $xpath->query('//table//tr[th and td]');

	    foreach ($rows as $row) {
	        // ── Tag names: read from the first <th> cell ─────────────────────────
	        // Element links point to anchors like #the-input-element
	        $thCells = $xpath->query('th', $row);
	        if ($thCells->length < 1) {
	            continue;
	        }

	        $tagsInRow = [];
	        foreach ($xpath->query('.//a', $thCells->item(0)) as $a) {
	            $fragment = (string)(parse_url($a->getAttribute('href'), PHP_URL_FRAGMENT) ?? '');
	            $name = strtolower(trim($a->textContent));
	            if (str_starts_with($fragment, 'the-') && preg_match('/^[a-z][a-z0-9]*$/', $name)) {
	                $tagsInRow[] = $name;
	            }
	        }
	        if (empty($tagsInRow)) {
	            continue;
	        }

	        // ── Attributes: WHITELIST — only keep links whose fragment follows
	        //    WHATWG's attribute-anchor naming conventions:
	        //
	        //      attr-*    → almost all element attributes
	        //                  e.g. #attr-media-src, #attr-input-accept
	        //
	        //      handler-* → event-handler content attributes on <body>
	        //                  e.g. #handler-window-onafterprint
	        //
	        //    Everything else (flow-content-2, htmlelement, the-, concept-, …)
	        //    is silently ignored.
	        $attrs = [];
	        foreach ($xpath->query('.//a', $row) as $a) {
	            $fragment = (string)(parse_url($a->getAttribute('href'), PHP_URL_FRAGMENT) ?? '');

	            if (!str_starts_with($fragment, 'attr-') && !str_starts_with($fragment, 'handler-')) {
	                continue;
	            }

	            $val = strtolower(trim($a->textContent));
	            if (!preg_match('/^[a-z][a-z0-9\-]*$/', $val)) {
	                continue;
	            }

	            $attrs[] = $val;
	        }

	        // sort() re-indexes from 0, so the de-duplicated list ends up gap-free.
	        $attrs = array_unique($attrs);
	        sort($attrs);

	        // Rows shared by multiple tags (e.g. h1–h6) get the same list
	        foreach ($tagsInRow as $t) {
	            $tagAttrMap[$t] = $attrs;
	        }
	    }

	    echo "<em>[Parse]</em> Built attribute map for <strong>"
	        . count($tagAttrMap) . "</strong> element(s).<br><br>\n";

	    // === STEP 3: Write per-tag files ===
	    foreach ($selectedTags as $rawTag) {
	        // Posted values are untrusted and end up in a file path and in HTML output.
	        // Accept only plain element names; \z (unlike $) also rejects a trailing newline.
	        $cleanTag = is_string($rawTag)
	            ? strtolower(str_replace(['<', '>'], '', $rawTag))
	            : '';

	        if (preg_match('/^[a-z][a-z0-9]*\z/', $cleanTag) !== 1) {
	            $shown = htmlspecialchars(is_string($rawTag) ? $rawTag : gettype($rawTag));
	            echo "<strong>[Error]</strong> Skipping invalid tag name <code>{$shown}</code>.<br>\n";
	            continue;
	        }

	        $filename = "{$outputDir}/html_per-tag-{$cleanTag}-attributes_array.php";

	        if (file_exists($filename) && (time() - filemtime($filename)) <= $cacheLimit) {
	            echo "<strong>[Cached]</strong> <code>{$filename}</code> is up to date. Skipping…<br>\n";
	            continue;
	        }

	        // Tags missing from the parsed map get an empty list.
	        $attributes = $tagAttrMap[$cleanTag] ?? [];

	        $fileContent = "<?php\n\nreturn [\n";
	        foreach ($attributes as $idx => $attr) {
	            $fileContent .= " {$idx} => '{$attr}',\n";
	        }
	        $fileContent .= "];\n";

	        if (file_put_contents($filename, $fileContent) !== false) {
	            $count = count($attributes);
	            $msg = $count > 0
	                ? "Saved <strong>{$count}</strong> attributes"
	                : "No element-specific attributes found; saved <strong>empty array</strong>";
	            echo "<strong>[Success]</strong> {$msg} → <code>{$filename}</code>.<br>\n";
	        } else {
	            echo "<strong>[Error]</strong> Could not write <code>{$filename}</code>.<br>\n";
	        }
	    }

	    echo "<br><hr><a href=''>← Start New Scrape</a><br>\n";
	    exit;
	}
	// 3. Render the tag-selection form. The POST branch exits before reaching this
	//    point, so the markup below is only sent when that branch did not run.
	?>

	<!DOCTYPE html>
	<html lang="en">
	<head>
	    <meta charset="UTF-8">
	    <meta name="viewport" content="width=device-width, initial-scale=1.0">
	    <title>WHATWG Attribute Scraper</title>
	    <style>
	        body { font-family: sans-serif; background: #f0f2f5; padding: 20px; line-height: 1.5; }
	        .card { max-width: 500px; margin: auto; background: #fff; padding: 25px; border-radius: 10px; box-shadow: 0 2px 10px rgba(0,0,0,0.1); }
	        select { width: 100%; height: 250px; margin-top: 10px; border: 1px solid #ccc; border-radius: 5px; font-size: 16px; }
	        button { width: 100%; padding: 12px; background: #007bff; color: #fff; border: none; border-radius: 5px; cursor: pointer; font-size: 16px; margin-top: 16px; }
	        button:hover { background: #0056b3; }
	        label { font-weight: bold; }
	        .help { font-size: 12px; color: #666; margin-bottom: 5px; }
	        .source { font-size: 12px; color: #888; margin-bottom: 16px; }
	    </style>
	</head>
	<body>
	<div class="card">
	    <h2>Tag Attribute Scraper</h2>
	    <p class="source">
	        Source: <a href="https://html.spec.whatwg.org/multipage/indices.html" target="_blank" rel="noopener noreferrer">
	        WHATWG HTML Living Standard — Element Index</a><br>
	        Index page fetched once, cached 30 days.
	    </p>
	    <form method="POST">
	        <?php /* for/id tie the label to the list so assistive technology announces it. */ ?>
	        <label for="tags">Select Tags (Multiple):</label>
	        <div class="help">Files saved in <code>/html_per-tag/</code>. 30-day cache applies per file.</div>
	        <?php /* An empty list cannot satisfy "required", so explain why nothing can be chosen. */ ?>
	        <?php if (empty($tagNames)): ?>
	        <div class="help">No tag names could be loaded from <code><?= htmlspecialchars($tagsFile) ?></code>, so there is nothing to select.</div>
	        <?php endif; ?>
	        <select id="tags" name="tags[]" multiple required>
	            <?php foreach ($tagNames as $tag): ?>
	            <option value="<?= htmlspecialchars($tag) ?>"><?= htmlspecialchars($tag) ?></option>
	            <?php endforeach; ?>
	        </select>
	        <button type="submit">Check & Generate Files</button>
	    </form>
	</div>
	</body>
	</html>