Last active
February 26, 2025 19:22
-
-
Save greg-randall/5b86156073ac4b475ca4e93835babc14 to your computer and use it in GitHub Desktop.
Remove invisible elements from HTML -- helps with web scraping by identifying and removing invisible elements from a webpage's DOM to focus on content visible to website users.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/**
 * Determines whether a DOM element is visible to the user.
 *
 * An element is reported as invisible when any of the following holds:
 * - an ancestor already carries the `data-is-invisible` marker
 * - its computed `display` is `none`
 * - its computed `visibility` is `hidden`
 * - its computed `opacity` is 0
 * - both `offsetWidth` and `offsetHeight` are 0, unless it is
 *   `display: contents` (such an element has no box of its own, but its
 *   children still render)
 *
 * The following are always reported as visible so they are preserved:
 * - <head>, <script>, <style>, <link> and <br> elements
 * - any element nested inside <head>
 *
 * Side effect: an element found to be invisible gets the attribute
 * `data-is-invisible="true"`, so later checks on its descendants can stop
 * at the ancestor walk without querying computed style.
 *
 * @param {HTMLElement} element - The DOM element to check for visibility
 * @returns {boolean} - True if the element is visible or should be preserved, false otherwise
 */
function isElementVisible(element) {
  // Kept local (not top-level) so re-running the script in the same page
  // cannot fail on a re-declared `const`.
  const preservedTags = new Set(['HEAD', 'SCRIPT', 'STYLE', 'LINK', 'BR']);
  if (preservedTags.has(element.tagName)) {
    return true;
  }

  // Single walk up the tree. Being inside <head> wins over an invisible
  // ancestor, so the walk must finish before the marker is acted upon.
  let hasInvisibleAncestor = false;
  for (let ancestor = element.parentElement; ancestor; ancestor = ancestor.parentElement) {
    if (ancestor.tagName === 'HEAD') {
      return true;
    }
    if (ancestor.hasAttribute('data-is-invisible')) {
      hasInvisibleAncestor = true;
    }
  }
  if (hasInvisibleAncestor) {
    element.setAttribute('data-is-invisible', 'true');
    return false;
  }

  const style = window.getComputedStyle(element);

  // `display: contents` elements report 0x0 because they generate no box,
  // yet their children are laid out normally - do not treat them as empty.
  const isZeroSized =
    style.display !== 'contents' &&
    element.offsetWidth === 0 &&
    element.offsetHeight === 0;

  const isHidden =
    style.display === 'none' ||
    style.visibility === 'hidden' ||
    Number.parseFloat(style.opacity) === 0 ||
    isZeroSized;

  if (isHidden) {
    // Mark this element as invisible for faster child checks
    element.setAttribute('data-is-invisible', 'true');
    return false;
  }
  return true;
}
/**
 * Builds a map from every element in the document to its depth in the tree.
 *
 * The root element (`document.documentElement`) has depth 0 and every child
 * is one level deeper than its parent. Entries are inserted in document
 * (pre-order) order. The traversal uses an explicit stack instead of
 * recursion, so deeply nested markup cannot exhaust the call stack.
 *
 * @returns {Map} - Map of elements to their depth in the DOM; empty when the
 *   document has no root element
 */
function calculateElementDepths() {
  const depthMap = new Map();
  const root = document.documentElement;
  if (!root) {
    return depthMap;
  }

  const pending = [{ node: root, depth: 0 }];
  while (pending.length > 0) {
    const { node, depth } = pending.pop();
    depthMap.set(node, depth);

    // `children` only contains elements. Push in reverse so the first child
    // is popped first, which keeps the map in document order.
    const kids = node.children;
    for (let i = kids.length - 1; i >= 0; i--) {
      pending.push({ node: kids[i], depth: depth + 1 });
    }
  }
  return depthMap;
}
/**
 * Removes all invisible elements (together with their subtrees) from the DOM.
 *
 * This function:
 * 1. Walks every element in document order, so an ancestor is always
 *    examined before its descendants
 * 2. Skips elements below an ancestor already marked for removal, because
 *    detaching the ancestor takes the whole subtree with it
 * 3. Checks each remaining element with isElementVisible()
 * 4. Marks invisible elements for removal using a temporary data attribute
 *    (critical elements - head, script, style, link, br, or anything inside
 *    head - are never marked)
 * 5. Detaches the marked elements from the DOM
 * 6. Strips the temporary data attributes, even if an earlier step threw
 * 7. Logs statistics about the operation
 *
 * @returns {void}
 */
function removeInvisibleElements() {
  const removalAttr = 'data-marked-for-removal';
  const invisibleAttr = 'data-is-invisible';
  const protectedTags = new Set(['HEAD', 'SCRIPT', 'STYLE', 'LINK', 'BR']);

  // Defensive guard: these elements must survive even if the visibility
  // check reports them as invisible.
  const isProtected = (element) => {
    if (protectedTags.has(element.tagName.toUpperCase())) {
      return true;
    }
    for (let ancestor = element.parentElement; ancestor; ancestor = ancestor.parentElement) {
      if (ancestor.tagName === 'HEAD') {
        return true;
      }
    }
    return false;
  };

  const hasAncestorMarkedForRemoval = (element) => {
    for (let ancestor = element.parentElement; ancestor; ancestor = ancestor.parentElement) {
      if (ancestor.hasAttribute(removalAttr)) {
        return true;
      }
    }
    return false;
  };

  // Performance measurement
  const startTime = performance.now();

  // Log the initial element count
  const initialCount = document.getElementsByTagName('*').length;
  console.log(`Total elements before processing: ${initialCount}`);

  const elementsToRemove = [];
  let preservedCount = 0;

  try {
    // Snapshot the live collection. It is in document order, so parents come
    // before children and the ancestor check below can prune whole subtrees.
    const allElements = Array.from(document.getElementsByTagName('*'));

    // First pass - mark the roots of invisible subtrees
    for (const element of allElements) {
      if (hasAncestorMarkedForRemoval(element)) {
        continue;
      }
      if (isElementVisible(element)) {
        continue;
      }
      if (isProtected(element)) {
        preservedCount++;
        continue;
      }
      element.setAttribute(removalAttr, 'true');
      elementsToRemove.push(element);
    }

    console.log(`Found ${elementsToRemove.length} invisible subtree roots (in ${(performance.now() - startTime).toFixed(2)}ms)`);
    console.log(`Preserved ${preservedCount} elements in head, scripts, and styles`);

    // Second pass - detach the marked elements
    for (const element of elementsToRemove) {
      if (element.parentNode) {
        element.parentNode.removeChild(element);
      }
    }
  } finally {
    // Clean up temporary attributes (in case we want to run this again)
    for (const attr of [invisibleAttr, removalAttr]) {
      document.querySelectorAll(`[${attr}]`).forEach((el) => {
        el.removeAttribute(attr);
      });
    }
  }

  // Log the final element count after removal
  const finalCount = document.getElementsByTagName('*').length;
  console.log(`Total elements after processing: ${finalCount}`);
  console.log(`Removed ${initialCount - finalCount} elements`);
  console.log(`Total operation time: ${(performance.now() - startTime).toFixed(2)}ms`);
}
// Start the removal process as soon as the script is evaluated
console.log('Removal operation begun');
removeInvisibleElements();
console.log('Removal operation completed');
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment