Last active
January 16, 2025 23:55
-
-
Save joshtynjala/e85b580f7cda22e618924fc783bc4133 to your computer and use it in GitHub Desktop.
A script to run in a web browser's JavaScript console to normalize the HTML in old HTML documentation from help.adobe.com. The resulting simplified HTML content can be converted more easily into other formats, like Markdown, using tools like pandoc. The help.adobe.com content is Creative Commons licensed.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/**
 * Collapses stray whitespace around inline tags in a serialized HTML string.
 *
 * Four passes run in order:
 *   1. whitespace right after a `<samp>` that is itself preceded by a space
 *      or newline (a `<samp>` glued to the preceding text is left alone);
 *   2. whitespace right before `</samp>`;
 *   3. whitespace between any end tag and a following `.`, `,`, `:` or `)`;
 *   4. whitespace right before `</a>`.
 *
 * @param {string} html - Serialized HTML markup.
 * @returns {string} The markup with the whitespace above removed.
 */
function tidyInlineWhitespace(html) {
  return html
    .replaceAll(/( |\n)<samp>\s+/g, "$1<samp>") // no whitespace after <samp>
    .replaceAll(/\s+<\/samp>/g, "</samp>") // no whitespace before </samp>
    .replaceAll(/(<\/\w+>)\s+(\.|,|:|\))/g, "$1$2") // no whitespace between end tag and punctuation
    .replaceAll(/\s+(<\/a>)/g, "$1"); // no whitespace before </a>
}

/**
 * Reduces the page held by `doc` to its heading and main article cell,
 * strips presentational attributes, and rewrites `doc.body` with the
 * simplified markup.
 *
 * @param {Document} doc - The document to normalize in place.
 * @throws {Error} If neither `table#inner_content_table` nor
 *   `table#page_content_table` has a first `td`; the body is not rewritten
 *   in that case.
 */
function normalizeHelpPage(doc) {
  // Kept inside the function (not at top level) so pasting the script into
  // the same console twice never trips over an already-declared binding.
  const strippedAttributes = [
    "class",
    "id",
    "style",
    "valign",
    "headers",
    "border",
    "cellpadding",
    "cellspacing",
    "align",
    "xmlns:adobe",
    "xmlns:fn",
    "xmlns:fo",
    "xmlns:xs",
  ];
  const removeAll = (root, selector) =>
    root.querySelectorAll(selector).forEach((element) => element.remove());
  const removeAttributeFromAll = (root, selector, attribute) =>
    root
      .querySelectorAll(selector)
      .forEach((element) => element.removeAttribute(attribute));

  doc.body.querySelector("#ahpod")?.remove();
  doc.body.querySelector("#mboxScriptContainer")?.remove();

  const heading = doc.querySelector("#content_wrapper h1:first-of-type");
  // some pages seem to have a different table id, for some reason
  const article =
    doc.querySelector("table#inner_content_table td:first-of-type") ??
    doc.querySelector("table#page_content_table td:first-of-type");
  if (!article) {
    throw new Error(
      "Could not find the article cell in table#inner_content_table or " +
        "table#page_content_table; the page was left unchanged."
    );
  }

  article.querySelector("div:first-of-type")?.remove();
  article.querySelector("#chcPromo")?.remove();
  removeAll(article, "script");
  article.querySelector("#userprefs")?.remove();
  article.querySelector("#footer")?.remove();
  article.querySelector("#minitoc")?.remove();
  removeAll(article, 'a[name^="WS"]');
  removeAll(article, 'a[href^="#top"]');
  removeAttributeFromAll(article, '*[width^="NaN%"]', "width");
  removeAttributeFromAll(article, "a", "target");

  // This pass removes every id and class, so all id/class-based lookups
  // have to happen above this point.
  doc.body.querySelectorAll("*").forEach((element) => {
    strippedAttributes.forEach((attribute) =>
      element.removeAttribute(attribute)
    );
  });
  removeAttributeFromAll(article, "a", "onclick");

  doc.body.innerHTML = "";
  // A page without a matching <h1> still gets its article; appending null
  // would throw after the body has already been emptied.
  if (heading) {
    doc.body.appendChild(heading);
  }
  doc.body.appendChild(article);
  doc.body.innerHTML = tidyInlineWhitespace(doc.body.innerHTML);
}

// Run immediately when a DOM is available (i.e. pasted into a browser console).
if (typeof document !== "undefined") {
  normalizeHelpPage(document);
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Here are some additional manual tasks that aren't easy to automate from the web browser console, or they may apply only after Markdown conversion with Pandoc.
An extra space may appear between Markdown formatting and punctuation.
(\*\*|_)( | \n)[.,!:;\)]
An empty comment may appear between lines:
<!-- -->
Hyperlinks may have a strange formatting where they duplicate the URL in quotes.
\]\([\w\-\/\.:]+ "[\w\-\/\.:]+"\)
Original relative URLs start with WS. They need to be replaced.
\]\(WS\w+
<a href="WS
The original formatting may include curly quotes that should be replaced with straight quotes (`"` and `'`):
“
”
‘
’
Check that code examples are formatted correctly.
^\s+(\{|\})
(curly brace after whitespace at beginning of line)
^\s+(<\w+)
(open HTML/XML tag after whitespace at beginning of line)
Replace `goo.gl` links with canonical URLs.
Replace `adobe.com/go/` links with canonical URLs:
http://www.adobe.com/go/learn_flcs5_as3lr_en
=> https://help.adobe.com/en_US/FlashPlatform/reference/actionscript/3/index.html
https://help.adobe.com/en_US/AS3LCR/Flash_10.0/
=> https://help.adobe.com/en_US/FlashPlatform/reference/actionscript/3/index.html
http://www.adobe.com/go/learn_flex45_platformref_en
=> https://help.adobe.com/en_US/FlashPlatform/reference/actionscript/3/index.html
http://www.adobe.com/go/learn_flex4_apiref_en
=> https://help.adobe.com/en_US/FlashPlatform/reference/actionscript/3/index.html
https://help.adobe.com/en_US/AS3LCR/Flex_4.0/
=> https://help.adobe.com/en_US/FlashPlatform/reference/actionscript/3/index.html
http://help.adobe.com:80/en_US/Flex/4.0/langref/
=> https://help.adobe.com/en_US/FlashPlatform/reference/actionscript/3/index.html
http://livedocs.adobe.com/flex/201/langref/
=> https://help.adobe.com/en_US/FlashPlatform/reference/actionscript/3/index.html
http://www.adobe.com/go/learn_cs5_as2lr_en
=> https://web.archive.org/web/20120114132936/http://help.adobe.com/en_US/FlashPlatform/reference/actionscript/2/help.html?content=Part2_AS2_LangRef_1.html
http://www.adobe.com/go/flash_devcenter
=> https://web.archive.org/web/20120504010203/http://www.adobe.com/devnet/flash.html
http://www.adobe.com/go/learn_fl_samples
=> https://web.archive.org/web/20120303062950/http://www.adobe.com/devnet/flash/samples.html
Replace dead external links with Wayback Machine archive links.
Replace `http:` URLs with `https:`, if possible.
Search for `helpexamples.com` URLs in example code and download the content (possibly from the Wayback Machine).
Nice to have: Replace fake URLs like `[yourDomain].com` with `example.com`.
Nice to have: Replace HTML tables with GFM table syntax. (Sometimes, pandoc doesn't convert them from HTML to Markdown.)
Nice to have: Convert code samples from simple indented code blocks that are not syntax highlighted to `as3` and `mxml` code fences.
Replace `<span class="dfn">` and `<span class="kbd">` with italic and code Markdown.