Created
October 25, 2023 19:44
-
-
Save LianSheng197/62af1ee6930d289a353993137c4283c6 to your computer and use it in GitHub Desktop.
一個僅支持 HTML 的 <html>, <head>, <body>, <title>, <h1>, <p> 六種標籤的極簡語法分析範例,不含屬性、自閉合標籤、錯誤恢復等常見 HTML 功能。以 JavaScript 編寫。
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/**
 * Splits a markup string into a flat list of tokens.
 *
 * A token is either one of the six supported tags in opening or closing form
 * (`html`, `head`, `body`, `title`, `h1`, `p`) or a run of characters that
 * contains no `<`. Whitespace between tags is kept as text tokens.
 *
 * @param {string} input - Markup to tokenize; other values are converted with `String()`.
 * @returns {string[]} Tokens in source order; an empty array for empty input.
 * @throws {SyntaxError} If a `<` does not begin one of the supported tags.
 */
function lexer(input) {
  // Sticky (`y`) matching anchors every attempt at `position`, so nothing can
  // be skipped silently. A global (`g`) search would jump over a stray `<` and
  // lose it from the token stream.
  const tokenPattern = /<\/?(?:title|h1|html|head|body|p)>|[^<]+/y;
  const source = String(input);
  const tokens = [];
  let position = 0;

  while (position < source.length) {
    tokenPattern.lastIndex = position;
    const match = tokenPattern.exec(source);
    if (match === null) {
      // Only a `<` that is not a supported tag can fail both alternatives.
      const snippet = source.slice(position, position + 20);
      throw new SyntaxError(
        `Unsupported markup at index ${position}: ${JSON.stringify(snippet)}`
      );
    }
    tokens.push(match[0]);
    position = tokenPattern.lastIndex;
  }

  return tokens;
}
/**
 * Builds a document tree from a flat token list by recursive descent.
 *
 * Tokens starting with `<` are treated as tags; every other token becomes a
 * `Text` node. An opening tag `<x>` collects children until a token starting
 * with `</x>` is reached.
 *
 * @param {string[]} tokens - Tokens such as those produced by a tag/text lexer.
 * @returns {{type: 'Document', children: object[]}} Root node whose children
 *   are `{type: 'Element', tagName, children}` or `{type: 'Text', value}` nodes.
 * @throws {SyntaxError} If an element is never closed, is closed by the wrong
 *   tag, or a closing tag appears with no matching opening tag.
 */
function parser(tokens) {
  let current = 0;

  // Parses exactly one node starting at `current` and advances past it.
  function walk() {
    const token = tokens[current];
    current++;

    if (!token.startsWith("<")) {
      return {
        type: 'Text',
        value: token,
      };
    }

    // A closing tag can only be consumed by the element it closes; reaching
    // one here means there is no matching opening tag.
    if (token.startsWith("</")) {
      throw new SyntaxError(`Unexpected closing tag ${token}`);
    }

    const tagName = token.slice(1, -1);
    const closingTag = `</${tagName}>`;
    const node = {
      type: 'Element',
      tagName,
      children: [],
    };

    for (;;) {
      // Guard against running off the end of the token list, which would
      // otherwise fail with an unhelpful TypeError on `undefined`.
      if (current >= tokens.length) {
        throw new SyntaxError(`Missing closing tag ${closingTag}`);
      }
      const next = tokens[current];
      if (next.startsWith(closingTag)) {
        break;
      }
      if (next.startsWith("</")) {
        throw new SyntaxError(`Expected ${closingTag} but found ${next}`);
      }
      node.children.push(walk());
    }

    current++; // consume the closing tag
    return node;
  }

  const ast = {
    type: 'Document',
    children: [],
  };

  while (current < tokens.length) {
    ast.children.push(walk());
  }

  return ast;
}
/**
 * Convenience entry point: tokenizes `input` with `lexer` and hands the
 * resulting tokens to `parser`.
 *
 * @param {string} input - Markup text to parse.
 * @returns {object} Whatever `parser` returns for the tokens of `input`.
 */
function parseHTML(input) {
  return parser(lexer(input));
}
// Demo: parse a small sample document and print the resulting tree as
// 2-space-indented JSON.
const exampleHTML = `
<html>
<head>
<title>Test</title>
</head>
<body>
<h1>Hello</h1>
<p>World</p>
</body>
</html>`;

const ast = parseHTML(exampleHTML);
const prettyAst = JSON.stringify(ast, null, 2);
console.log(prettyAst);
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
在實際應用中,詞法分析器 (lexer) 更常採用逐字元解析的做法,以追求更高的解析效率,並支援更複雜的語法設計。
上述範例使用 regex 僅僅是為了保留概念並簡化程式碼。