Skip to content

Instantly share code, notes, and snippets.

@greg-randall
Last active February 26, 2025 19:22
Show Gist options
  • Save greg-randall/5b86156073ac4b475ca4e93835babc14 to your computer and use it in GitHub Desktop.
Save greg-randall/5b86156073ac4b475ca4e93835babc14 to your computer and use it in GitHub Desktop.
Remove invisible elements from HTML — helps with web scraping by identifying and removing invisible elements from a webpage's DOM, leaving only the content that is visible to the site's users.
/**
 * Determines whether a DOM element is visible to the user.
 *
 * An element is reported as invisible when any of the following holds:
 * - an ancestor already carries the `data-is-invisible` marker
 * - its computed `display` is `none`
 * - its computed `visibility` is `hidden`
 * - its computed `opacity` is `0`
 * - both `offsetWidth` and `offsetHeight` are 0, unless its computed
 *   `display` is `contents` (such an element has no box of its own, so its
 *   offset dimensions are 0 even while its children are rendered)
 *
 * Side effect: every element reported as invisible is tagged with
 * `data-is-invisible="true"` so that later checks on its descendants can
 * short-circuit.
 *
 * Always preserved (reported as visible without inspecting styles):
 * - <head>, <script>, <style>, <link> and <br> elements
 * - any element nested inside <head>
 *
 * @param {HTMLElement} element - The DOM element to check for visibility
 * @returns {boolean} True if the element is visible or must be preserved,
 *   false otherwise
 */
function isElementVisible(element) {
  // Always preserve these tags, whatever their computed style says.
  const tagName = element.tagName;
  if (tagName === 'HEAD' ||
      tagName === 'SCRIPT' ||
      tagName === 'STYLE' ||
      tagName === 'LINK' ||
      tagName === 'BR') {
    return true;
  }

  // Single walk up the ancestor chain. Being inside <head> wins over an
  // invisible ancestor, so the walk always continues to the root before the
  // invisible-ancestor result is acted upon.
  let hasInvisibleAncestor = false;
  for (let ancestor = element.parentElement; ancestor; ancestor = ancestor.parentElement) {
    if (ancestor.tagName === 'HEAD') {
      return true;
    }
    if (!hasInvisibleAncestor && ancestor.hasAttribute('data-is-invisible')) {
      hasInvisibleAncestor = true;
    }
  }
  if (hasInvisibleAncestor) {
    element.setAttribute('data-is-invisible', 'true');
    return false;
  }

  const style = window.getComputedStyle(element);
  const hasZeroSize = element.offsetWidth === 0 && element.offsetHeight === 0;
  // A `display: contents` element never has a box, so zero dimensions say
  // nothing about whether its children are visible.
  const isCollapsed = hasZeroSize && style.display !== 'contents';

  if (style.display === 'none' ||
      style.visibility === 'hidden' ||
      style.opacity === '0' ||
      isCollapsed) {
    // Mark this element as invisible for faster descendant checks.
    element.setAttribute('data-is-invisible', 'true');
    return false;
  }
  return true;
}
/**
 * Builds a lookup from every element in the document to its nesting depth.
 *
 * The root element (`document.documentElement`) has depth 0, its children
 * depth 1, and so on. Entries are inserted in document (pre-order) order.
 *
 * @returns {Map} - Map of elements to their depth in the DOM
 */
function calculateElementDepths() {
  const depths = new Map();
  // Explicit work stack of [node, depth] pairs; children are pushed in
  // reverse so that popping visits them in document order.
  const stack = [[document.documentElement, 0]];

  while (stack.length > 0) {
    const [current, level] = stack.pop();
    if (current.nodeType !== Node.ELEMENT_NODE) {
      continue;
    }
    depths.set(current, level);

    const kids = Array.from(current.children);
    for (let i = kids.length - 1; i >= 0; i -= 1) {
      stack.push([kids[i], level + 1]);
    }
  }

  return depths;
}
/**
 * Removes all invisible elements from the DOM.
 *
 * This function:
 * 1. Walks the document top-down (parents before children), starting at
 *    `document.documentElement`.
 * 2. Never removes protected elements: <script>, <style>, <head>, <link>,
 *    <br>, and anything nested inside <head>. These are counted as preserved.
 * 3. Asks `isElementVisible` about every other element. The first invisible
 *    element found on a branch is queued for removal and its subtree is not
 *    inspected any further, because removing it removes every descendant.
 * 4. Removes the queued subtree roots once the whole walk has finished, so
 *    no visibility decision is made against a partially modified document.
 * 5. Clears the temporary `data-is-invisible` markers left on elements that
 *    are still attached, so the function can be run again.
 * 6. Logs statistics about the operation. "Found N invisible elements"
 *    counts the removed subtree roots; "Removed N elements" counts every
 *    element that left the document.
 *
 * @returns {void}
 */
function removeInvisibleElements() {
  // Tags that must survive even when they are not rendered.
  const PROTECTED_TAGS = new Set(['SCRIPT', 'STYLE', 'HEAD', 'LINK', 'BR']);

  // Performance measurement
  const startTime = performance.now();

  // Log the initial element count
  const initialCount = document.getElementsByTagName('*').length;
  console.log(`Total elements before processing: ${initialCount}`);

  const elementsToRemove = [];
  let preservedCount = 0;

  // First pass - collect the topmost invisible element of every hidden
  // subtree. An explicit stack is used instead of recursion; children are
  // pushed in reverse so elements are visited in document order.
  const pending = [];
  if (document.documentElement) {
    pending.push({ element: document.documentElement, insideHead: false });
  }
  while (pending.length > 0) {
    const { element, insideHead } = pending.pop();
    const tagName = element.tagName.toUpperCase();

    if (insideHead || PROTECTED_TAGS.has(tagName)) {
      preservedCount += 1;
    } else if (!isElementVisible(element)) {
      elementsToRemove.push(element);
      // Descendants leave the document together with this element, so there
      // is no need to inspect them.
      continue;
    }

    const childrenInsideHead = insideHead || tagName === 'HEAD';
    const { children } = element;
    for (let i = children.length - 1; i >= 0; i -= 1) {
      pending.push({ element: children[i], insideHead: childrenInsideHead });
    }
  }

  console.log(`Found ${elementsToRemove.length} invisible elements (in ${(performance.now() - startTime).toFixed(2)}ms)`);
  console.log(`Preserved ${preservedCount} elements in head, scripts, and styles`);

  // Second pass - remove the collected subtree roots
  for (const element of elementsToRemove) {
    if (element.parentNode) {
      element.parentNode.removeChild(element);
    }
  }

  // Clean up temporary attributes (in case we want to run this again)
  document.querySelectorAll('[data-is-invisible]').forEach((el) => {
    el.removeAttribute('data-is-invisible');
  });

  // Log the final element count after removal
  const finalCount = document.getElementsByTagName('*').length;
  console.log(`Total elements after processing: ${finalCount}`);
  console.log(`Removed ${initialCount - finalCount} elements`);
  console.log(`Total operation time: ${(performance.now() - startTime).toFixed(2)}ms`);
}
// Entry point: announce the run, strip invisible elements from the current
// document, then announce completion.
console.log('Removal operation begun');
removeInvisibleElements();
console.log('Removal operation completed');
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment