Skip to content

Instantly share code, notes, and snippets.

@celsowm
Created September 6, 2025 15:51
Show Gist options
  • Save celsowm/adef48ea3e6bcb188a5c9370ca06a563 to your computer and use it in GitHub Desktop.
Save celsowm/adef48ea3e6bcb188a5c9370ca06a563 to your computer and use it in GitHub Desktop.
HtmlParser.php
<?php
declare(strict_types=1);
/**
 * Thin convenience wrapper around \DOMDocument for querying an HTML string.
 *
 * The HTML is parsed once in the constructor; libxml parse errors are captured
 * instead of being emitted as PHP warnings and can be read via getErrors().
 * The query helpers return plain PHP arrays rather than live \DOMNodeList objects.
 */
class HtmlParser
{
    private \DOMDocument $dom;

    /** @var array<int, \LibXMLError> libxml errors collected while parsing */
    private array $errors = [];

    /**
     * Parses the given HTML into an internal \DOMDocument.
     *
     * @param string $html HTML markup (full document or fragment); expected to be UTF-8
     */
    public function __construct(string $html)
    {
        $this->dom = new \DOMDocument();

        // Be lenient with real-world markup.
        $this->dom->preserveWhiteSpace = false;
        $this->dom->strictErrorChecking = false;
        $this->dom->recover = true;

        // Collect libxml errors internally instead of emitting warnings; remember the
        // previous global setting so it can be restored afterwards.
        $previousSetting = libxml_use_internal_errors(true);

        try {
            // Prepend a charset declaration so the input is decoded as UTF-8.
            // NOTE(review): the injected <meta> element stays in the parsed document, so
            // tag queries such as getElementsByTagName('meta') or '*' will include it.
            // With LIBXML_HTML_NOIMPLIED the first element may also end up as the document
            // root - verify how this affects the resulting tree before relying on its shape.
            $html = '<meta charset="UTF-8">' . $html;

            $this->dom->loadHTML($html, LIBXML_HTML_NOIMPLIED | LIBXML_HTML_NODEFDTD);

            // Keep the parse errors for later diagnostics.
            $this->errors = libxml_get_errors();
        } finally {
            // Always clear the buffer and restore the caller's libxml error mode,
            // even if loading throws, so global state does not leak.
            libxml_clear_errors();
            libxml_use_internal_errors($previousSetting);
        }
    }

    /**
     * Returns all elements with the given tag name, in document order.
     *
     * @param string $tagName tag name to look up ('*' matches every element)
     * @return array<int, \DOMElement>
     */
    public function getElementsByTagName(string $tagName): array
    {
        return $this->convertNodeListToArray($this->dom->getElementsByTagName($tagName));
    }

    /**
     * Returns all elements carrying the given class name(s), in document order.
     *
     * $className may hold several whitespace-separated class names; an element
     * matches only when its class attribute contains every one of them (order and
     * extra classes do not matter). An empty or whitespace-only $className matches
     * nothing.
     *
     * @param string $className one or more whitespace-separated class names
     * @return array<int, \DOMElement>
     */
    public function getElementsByClassName(string $className): array
    {
        $wanted = self::splitClassTokens($className);
        if ($wanted === []) {
            // Nothing to look for; in particular do not match class-less elements.
            return [];
        }

        $matches = [];
        foreach ($this->dom->getElementsByTagName('*') as $element) {
            if (!$element instanceof \DOMElement || !$element->hasAttribute('class')) {
                continue;
            }

            $present = self::splitClassTokens($element->getAttribute('class'));

            // Every requested token must be present on the element.
            if (array_diff($wanted, $present) === []) {
                $matches[] = $element;
            }
        }

        return $matches;
    }

    /**
     * Returns the element with the given id, or null when there is none.
     *
     * @param string $id value of the id attribute to look up
     */
    public function getElementById(string $id): ?\DOMElement
    {
        $found = $this->dom->getElementById($id);

        return $found instanceof \DOMElement ? $found : null;
    }

    /**
     * Collects the raw (unparsed) inline style of every element that has a
     * style attribute.
     *
     * Each entry holds the tag name, the id and class attribute values (null when
     * the attribute is absent), a representative selector, the verbatim style
     * attribute string and the element node itself.
     *
     * @return array<int, array{element: string, id: ?string, class: ?string, selector: string, rawStyle: string, node: \DOMElement}>
     */
    public function getRawInlineStyles(): array
    {
        $styles = [];
        foreach ($this->dom->getElementsByTagName('*') as $element) {
            if (!$element instanceof \DOMElement || !$element->hasAttribute('style')) {
                continue;
            }

            $styles[] = [
                'element' => $element->tagName,
                'id' => $element->hasAttribute('id') ? $element->getAttribute('id') : null,
                'class' => $element->hasAttribute('class') ? $element->getAttribute('class') : null,
                'selector' => $this->buildCssSelector($element),
                'rawStyle' => $element->getAttribute('style'),
                'node' => $element,
            ];
        }

        return $styles;
    }

    /**
     * Collects the raw (unparsed) content of every <style> tag.
     *
     * The media attribute defaults to 'all' and the type attribute to 'text/css'
     * when they are not present on the tag.
     *
     * @return array<int, array{content: ?string, media: string, type: string, node: \DOMElement}>
     */
    public function getRawStyleTags(): array
    {
        $styles = [];
        foreach ($this->dom->getElementsByTagName('style') as $style) {
            $styles[] = [
                'content' => $style->nodeValue,
                'media' => $style->hasAttribute('media') ? $style->getAttribute('media') : 'all',
                'type' => $style->hasAttribute('type') ? $style->getAttribute('type') : 'text/css',
                'node' => $style,
            ];
        }

        return $styles;
    }

    /**
     * Builds a representative selector for an element: tag name, followed by
     * "#id" when an id attribute exists and ".class" for each class name.
     *
     * Empty class tokens are skipped, so class="" does not yield a dangling ".".
     */
    private function buildCssSelector(\DOMElement $element): string
    {
        $selector = $element->tagName;

        if ($element->hasAttribute('id')) {
            $selector .= '#' . $element->getAttribute('id');
        }

        if ($element->hasAttribute('class')) {
            foreach (self::splitClassTokens($element->getAttribute('class')) as $class) {
                $selector .= '.' . $class;
            }
        }

        return $selector;
    }

    /**
     * Splits a class attribute value (or class query) on whitespace and drops
     * empty tokens.
     *
     * @return array<int, string>
     */
    private static function splitClassTokens(string $value): array
    {
        $tokens = preg_split('/\s+/', $value, -1, PREG_SPLIT_NO_EMPTY);

        // preg_split() returns false on a PCRE failure; treat that as "no tokens".
        return $tokens === false ? [] : $tokens;
    }

    /**
     * Copies the nodes of a \DOMNodeList into a plain array, preserving order.
     *
     * @return array<int, \DOMNode>
     */
    private function convertNodeListToArray(\DOMNodeList $nodeList): array
    {
        $nodes = [];
        foreach ($nodeList as $node) {
            $nodes[] = $node;
        }

        return $nodes;
    }

    /**
     * Returns the underlying \DOMDocument.
     */
    public function getDom(): \DOMDocument
    {
        return $this->dom;
    }

    /**
     * Returns the libxml errors captured while parsing the HTML.
     *
     * @return array<int, \LibXMLError>
     */
    public function getErrors(): array
    {
        return $this->errors;
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment