Created
February 7, 2012 22:18
-
-
Save dvdbng/1762527 to your computer and use it in GitHub Desktop.
HTML DOM to Markdown in javascript
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/*
You can use this to convert a DOM element or an HTML string to markdown.
Usage examples:
var markdown = toMarkdown(document.getElementById("content"));
// With jQuery you can easily convert HTML strings
var markdown = toMarkdown($("<ul><li>Hi!</li></ul>")[0]);
*/
/**
 * Prepares plain text for inclusion in Markdown output.
 *
 * Every run of whitespace is collapsed to a single space, then the
 * characters \ - * _ > # are prefixed with a backslash so they are not
 * interpreted as Markdown syntax.
 *
 * @param {*} text Text to escape. null/undefined (for example a missing
 *     attribute read through getAttribute) yields an empty string; any
 *     other non-string value is converted with String().
 * @returns {string} The escaped text.
 */
function markdownEscape(text){
    // getAttribute() returns null for absent attributes; calling .replace on
    // it would throw, so treat "no text" as empty text.
    if(text == null){
        return "";
    }
    return String(text).replace(/\s+/g," ").replace(/[\\\-*_>#]/g,"\\$&");
}
/**
 * Builds a string made of `str` concatenated `times` times.
 *
 * @param {string} str The piece to repeat.
 * @param {number} times How many copies to produce. Fractions are rounded
 *     down; zero, negative and NaN counts produce an empty string.
 * @returns {string} The repeated string.
 */
function repeat(str,times){
    var count = Math.floor(Number(times));
    // `new Array(n)` throws a RangeError for negative, fractional or NaN
    // lengths, so reject every count that is not a positive number up front.
    if(!(count > 0)){
        return "";
    }
    // Joining count+1 empty slots yields exactly `count` copies of the separator.
    return (new Array(count + 1)).join(str);
}
/**
 * Converts every child of `tree` to Markdown and concatenates the results
 * in document order.
 *
 * @param {Node} tree Node whose `childNodes` are converted.
 * @param {string} mode Conversion mode handed unchanged to nodeToMarkdown
 *     for each child.
 * @returns {string} The concatenated Markdown of all children.
 */
function childsToMarkdown(tree,mode){
    var res = "";
    var children = (tree && tree.childNodes) || [];
    for(var i=0, l=children.length; i<l; ++i){
        var piece = nodeToMarkdown(children[i], mode);
        // A child that converts to nothing (null/undefined) must not be
        // concatenated, or the literal text "undefined" ends up in the output.
        if(piece != null){
            res += piece;
        }
    }
    return res;
}
/**
 * Private helper: returns the raw text beneath `node`, without escaping or
 * whitespace collapsing. Only text nodes (nodeType 3) contribute.
 */
function nodeRawText(node){
    if(node.nodeType === 3){
        return node.nodeValue || "";
    }
    var text = "";
    var kids = node.childNodes || [];
    for(var i=0, l=kids.length; i<l; ++i){
        text += nodeRawText(kids[i]);
    }
    return text;
}

/**
 * Private helper: formats raw text as a Markdown indented code block by
 * prefixing every line with four spaces.
 */
function indentedCodeBlock(text){
    var lines = text.replace(/\r\n?/g, "\n").replace(/^\n+/, "").replace(/\s+$/, "").split("\n");
    for(var i=0, l=lines.length; i<l; ++i){
        // Blank lines are indented as well: this keeps them inside the code
        // block and prevents consecutive "\n" from being merged later on.
        lines[i] = "    " + lines[i];
    }
    return lines.join("\n");
}

/**
 * Converts a single DOM node (and its subtree) to Markdown.
 *
 * @param {Node} tree The node to convert.
 * @param {string} mode Conversion context:
 *     "block"  - block-level elements are recognised;
 *     "inline" - only inline elements are recognised;
 *     a string of "o"/"u" characters - directly inside an ordered/unordered
 *         list, one character per nesting level;
 *     the same string followed by "l" - inside a list item of that list.
 * @returns {string} The Markdown for the node. Nodes that are neither text
 *     nor elements (comments, for instance) produce an empty string.
 */
function nodeToMarkdown(tree,mode){
    var nl = "\n\n";
    if(tree.nodeType === 3){ // Text node
        return markdownEscape(tree.nodeValue);
    }
    if(tree.nodeType !== 1){
        // Comments, processing instructions, ...: nothing to render.
        return "";
    }

    var tag = tree.tagName.toLowerCase();

    if(mode === "block"){
        switch(tag){
            case "br":
                return nl;
            case "hr":
                return nl + "---" + nl;
            // Block container elements
            case "p":
            case "div":
            case "section":
            case "address":
            case "center":
                return nl + childsToMarkdown(tree, "block") + nl;
            case "ul":
                return nl + childsToMarkdown(tree, "u") + nl;
            case "ol":
                return nl + childsToMarkdown(tree, "o") + nl;
            case "pre":
                // Code is emitted verbatim: Markdown does not process
                // backslash escapes inside code blocks.
                return nl + indentedCodeBlock(nodeRawText(tree)) + nl;
            case "code":
                if(tree.childNodes.length === 1){
                    break; // use the inline format
                }
                return nl + indentedCodeBlock(nodeRawText(tree)) + nl;
            case "h1": case "h2": case "h3": case "h4": case "h5": case "h6": case "h7":
                // Markdown only knows six heading levels, so "h7" is clamped.
                return nl + repeat("#", Math.min(Number(tag.charAt(1)), 6)) + " " + childsToMarkdown(tree, "inline") + nl;
            case "blockquote":
                return nl + "> " + childsToMarkdown(tree, "inline") + nl;
        }
    }

    if(/^[ou]+$/.test(mode)){
        // Directly inside a list: only list items are expected here.
        if(tag === "li"){
            // Four spaces per nesting level so that sub-lists are parsed as
            // nested lists rather than as siblings of the parent item.
            return "\n" + repeat("    ", mode.length - 1) + (mode.charAt(mode.length - 1) === "o" ? "1. " : "- ") + childsToMarkdown(tree, mode + "l");
        }
        console.log("[toMarkdown] - invalid element at this point " + tree.tagName);
        return childsToMarkdown(tree, "inline");
    }else if(/^[ou]+l$/.test(mode)){
        // Inside a list item: a nested list adds one level to the mode string.
        if(tag === "ul"){
            return childsToMarkdown(tree, mode.slice(0, -1) + "u");
        }else if(tag === "ol"){
            return childsToMarkdown(tree, mode.slice(0, -1) + "o");
        }
    }

    // Inline tags
    switch(tag){
        case "strong":
        case "b":
            return "**" + childsToMarkdown(tree,"inline") + "**";
        case "em":
        case "i":
            return "_" + childsToMarkdown(tree,"inline") + "_";
        case "code": // Inline version of code
            // Raw text: escapes are shown literally inside code spans.
            return "`" + nodeRawText(tree).replace(/\s+/g, " ") + "`";
        case "a":
            // getAttribute returns null for a missing attribute; avoid "null" URLs.
            return "[" + childsToMarkdown(tree,"inline") + "](" + (tree.getAttribute("href") || "") + ")";
        case "img":
            return nl + "[_Image_: " + markdownEscape(tree.getAttribute("alt") || "") + "](" + (tree.getAttribute("src") || "") + ")" + nl;
        case "script":
        case "style":
        case "meta":
            return "";
        default:
            console.log("[toMarkdown] - undefined element " + tree.tagName);
            return childsToMarkdown(tree,mode);
    }
}
/**
 * Entry point: converts a DOM node to a Markdown string.
 *
 * The node is converted in "block" mode, after which runs of two or more
 * newlines are reduced to exactly one blank line and leading/trailing
 * newlines are removed.
 *
 * @param {Node} node Root node to convert.
 * @returns {string} The Markdown text; an empty string when `node` is
 *     null/undefined or converts to nothing.
 */
function toMarkdown(node){
    if(node == null){
        return "";
    }
    var raw = nodeToMarkdown(node,"block");
    // Guard against a conversion that produced no value, which would
    // otherwise throw on .replace.
    if(raw == null){
        return "";
    }
    return String(raw).replace(/\n{2,}/g,"\n\n").replace(/^\n+/,"").replace(/\n+$/,"");
}
Really useful. Thank you! I have made a minor edit to my local copy:
// Variant of markdownEscape that tolerates missing text: any falsy input
// (null, undefined, empty string) yields an empty string.
function markdownEscape(text) {
  if (!text) {
    return '';
  }
  // First squeeze whitespace runs, then backslash-escape Markdown metacharacters.
  var collapsed = text.replace(/\s+/g, " ");
  return collapsed.replace(/[\\\-*_>#]/g, "\\$&");
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Awesome script! Works better than most html to markdown converters. Thanks!