Created
September 6, 2025 15:51
-
-
Save celsowm/adef48ea3e6bcb188a5c9370ca06a563 to your computer and use it in GitHub Desktop.
HtmlParser.php
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php | |
declare(strict_types=1); | |
/**
 * Thin convenience wrapper around \DOMDocument for querying an HTML fragment.
 *
 * The markup is parsed once in the constructor; libxml parse errors are
 * collected instead of being emitted as warnings and can be read back through
 * getErrors().
 */
class HtmlParser
{
    private \DOMDocument $dom;

    /** @var list<\LibXMLError> Errors reported by libxml while parsing. */
    private array $errors = [];

    /**
     * Parses the given HTML into an internal DOMDocument.
     *
     * @param string $html HTML markup (full document or fragment).
     */
    public function __construct(string $html)
    {
        $this->dom = new \DOMDocument();

        // Lenient parsing: tolerate and recover from malformed markup.
        $this->dom->preserveWhiteSpace = false;
        $this->dom->strictErrorChecking = false;
        $this->dom->recover = true;

        // Collect libxml errors internally; the previous setting is restored below.
        $previousSetting = libxml_use_internal_errors(true);

        // Prepend a charset declaration so special characters are handled correctly.
        // NOTE(review): the injected <meta> element stays in the parsed DOM, so it is
        // returned by wildcard/meta queries; with LIBXML_HTML_NOIMPLIED it may also end
        // up as the first/root node - verify if the exact tree structure matters.
        $html = '<meta charset="UTF-8">' . $html;

        $this->dom->loadHTML($html, LIBXML_HTML_NOIMPLIED | LIBXML_HTML_NODEFDTD);

        // Keep the parse errors for diagnostics, then reset libxml's global state.
        $this->errors = libxml_get_errors();
        libxml_clear_errors();
        libxml_use_internal_errors($previousSetting);
    }

    /**
     * Returns all elements with the given tag name.
     *
     * @param string $tagName Tag name to look up ('*' matches every element).
     *
     * @return list<\DOMElement>
     */
    public function getElementsByTagName(string $tagName): array
    {
        return $this->convertNodeListToArray($this->dom->getElementsByTagName($tagName));
    }

    /**
     * Returns all elements carrying the given class name(s).
     *
     * $className may hold several whitespace-separated class names; an element
     * matches only when it carries every one of them (order is irrelevant).
     * An empty or whitespace-only $className matches nothing.
     *
     * @param string $className One or more whitespace-separated class names.
     *
     * @return list<\DOMElement>
     */
    public function getElementsByClassName(string $className): array
    {
        $wanted = self::splitClassTokens($className);
        if ($wanted === []) {
            // Without this guard an empty query would match class-less elements.
            return [];
        }

        $matches = [];
        foreach ($this->dom->getElementsByTagName('*') as $element) {
            if (!$element instanceof \DOMElement || !$element->hasAttribute('class')) {
                continue;
            }

            $present = self::splitClassTokens($element->getAttribute('class'));
            if (array_diff($wanted, $present) === []) {
                $matches[] = $element;
            }
        }

        return $matches;
    }

    /**
     * Returns the element with the given ID, or null when there is none.
     *
     * @param string $id Value of the id attribute to look up.
     */
    public function getElementById(string $id): ?\DOMElement
    {
        return $this->dom->getElementById($id);
    }

    /**
     * Collects the raw (unparsed) inline styles of every element that has a
     * style attribute.
     *
     * @return list<array{
     *     element: string,
     *     id: ?string,
     *     class: ?string,
     *     selector: string,
     *     rawStyle: string,
     *     node: \DOMElement
     * }>
     */
    public function getRawInlineStyles(): array
    {
        $styles = [];
        foreach ($this->dom->getElementsByTagName('*') as $element) {
            if (!$element instanceof \DOMElement || !$element->hasAttribute('style')) {
                continue;
            }

            $styles[] = [
                'element' => $element->tagName,
                'id' => $element->hasAttribute('id') ? $element->getAttribute('id') : null,
                'class' => $element->hasAttribute('class') ? $element->getAttribute('class') : null,
                'selector' => $this->buildCssSelector($element),
                'rawStyle' => $element->getAttribute('style'),
                'node' => $element,
            ];
        }

        return $styles;
    }

    /**
     * Collects the raw (unparsed) contents of every <style> tag.
     *
     * 'media' defaults to 'all' and 'type' to 'text/css' when the attribute
     * is absent.
     *
     * @return list<array{content: string, media: string, type: string, node: \DOMElement}>
     */
    public function getRawStyleTags(): array
    {
        $styles = [];
        foreach ($this->dom->getElementsByTagName('style') as $style) {
            $styles[] = [
                'content' => $style->nodeValue,
                'media' => $style->hasAttribute('media') ? $style->getAttribute('media') : 'all',
                'type' => $style->hasAttribute('type') ? $style->getAttribute('type') : 'text/css',
                'node' => $style,
            ];
        }

        return $styles;
    }

    /**
     * Builds a representative CSS selector (tag, then #id, then .classes) for
     * the element. Empty id/class attributes contribute nothing, so the result
     * never ends in a dangling '#' or '.'.
     */
    private function buildCssSelector(\DOMElement $element): string
    {
        $selector = $element->tagName;

        $id = $element->getAttribute('id');
        if ($id !== '') {
            $selector .= '#' . $id;
        }

        foreach (self::splitClassTokens($element->getAttribute('class')) as $class) {
            $selector .= '.' . $class;
        }

        return $selector;
    }

    /**
     * Splits a class attribute value into its non-empty, whitespace-separated
     * tokens.
     *
     * @return list<string>
     */
    private static function splitClassTokens(string $value): array
    {
        // PREG_SPLIT_NO_EMPTY drops the '' tokens produced by empty input or
        // leading/trailing whitespace; preg_split() returns false on failure.
        $tokens = preg_split('/\s+/', $value, -1, PREG_SPLIT_NO_EMPTY);

        return $tokens === false ? [] : $tokens;
    }

    /**
     * Copies a DOMNodeList into a plain array.
     *
     * @return list<\DOMNode>
     */
    private function convertNodeListToArray(\DOMNodeList $nodeList): array
    {
        return iterator_to_array($nodeList, false);
    }

    /**
     * Returns the underlying DOMDocument.
     */
    public function getDom(): \DOMDocument
    {
        return $this->dom;
    }

    /**
     * Returns the libxml errors captured while parsing.
     *
     * @return list<\LibXMLError>
     */
    public function getErrors(): array
    {
        return $this->errors;
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment