Created
November 22, 2025 03:21
-
-
Save bytemain/a81fdfabd8cbbceea42cb8a85303792e to your computer and use it in GitHub Desktop.
Find invalid encodings in Chinese documentation files
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import fs from "node:fs/promises"; | |
| import path from "node:path"; | |
| import process from "node:process"; | |
| import { TextDecoder } from "node:util"; | |
// Absolute path of the Chinese docs tree (<cwd>/site/content/zh/docs).
// path.resolve anchors the relative segments at the current working directory.
const docsDir = path.resolve("site", "content", "zh", "docs");
/**
 * Recursively gathers the paths of every regular file below a directory.
 *
 * Entries that report neither `isFile()` nor `isDirectory()` are skipped.
 * Sub-directories are walked one at a time, so the result is depth-first in
 * whatever order `readdir` yields the entries.
 *
 * @param {string} dir - Directory to walk.
 * @returns {Promise<string[]>} Paths formed by joining `dir` with each entry name.
 */
async function listFiles(dir) {
  const found = [];
  const dirents = await fs.readdir(dir, { withFileTypes: true });
  for (const dirent of dirents) {
    const fullPath = path.join(dir, dirent.name);
    if (dirent.isFile()) {
      found.push(fullPath);
      continue;
    }
    if (dirent.isDirectory()) {
      const nested = await listFiles(fullPath);
      found.push(...nested);
    }
  }
  return found;
}
/**
 * Scans already-decoded text line by line for signs of encoding damage.
 *
 * Three independent checks run on every line (split on LF or CRLF):
 *  - `replacementChar`: the line contains U+FFFD.
 *  - `controlChars`: the line contains a C0 control character other than
 *    tab, line feed or carriage return.
 *  - `mojibake`: after removing the whitelisted sequences "ØMQ" and "×",
 *    the line contains both a character in U+4E00–U+9FFF and one in
 *    U+00C0–U+00FF.
 *
 * @param {string} content - Full text of a file.
 * @returns {{replacementChar: number[], controlChars: number[], mojibake: number[]}}
 *   1-based line numbers that triggered each check.
 */
function analyze(content) {
  // Legitimate uses of Latin-1 characters that must not count as mojibake.
  const allowedLatin = [/ØMQ/g, /×/g];

  const hasControlChar = (text) => /[\x00-\x08\x0B\x0C\x0E-\x1F]/.test(text);
  const looksLikeMojibake = (text) => {
    const stripped = allowedLatin.reduce((acc, pattern) => acc.replace(pattern, ""), text);
    return /[\u4E00-\u9FFF]/.test(stripped) && /[\u00C0-\u00FF]/.test(stripped);
  };

  const replacementChar = [];
  const controlChars = [];
  const mojibake = [];

  content.split(/\r?\n/).forEach((text, index) => {
    const lineNo = index + 1;
    if (text.includes("\uFFFD")) {
      replacementChar.push(lineNo);
    }
    if (hasControlChar(text)) {
      controlChars.push(lineNo);
    }
    if (looksLikeMojibake(text)) {
      mojibake.push(lineNo);
    }
  });

  return { replacementChar, controlChars, mojibake };
}
/**
 * Checks every file under `docsDir` for encoding problems and prints a report.
 *
 * Each file is decoded with a strict UTF-8 decoder. If that throws, the file
 * is marked as invalid UTF-8 and decoded leniently so the line-level checks
 * in `analyze` can still run. For every file with at least one finding, a
 * summary line is printed followed by up to five line numbers per finding
 * kind. If no file has findings, a single all-clear message is printed.
 *
 * @returns {Promise<void>}
 */
async function run() {
  const strictDecoder = new TextDecoder("utf-8", { fatal: true });
  // Only the first five line numbers of each finding kind are shown.
  const preview = (lineNumbers) =>
    lineNumbers
      .slice(0, 5)
      .map((n) => `#${n}`)
      .join(", ");

  const allFiles = await listFiles(docsDir);
  let flaggedFiles = 0;

  for (const file of allFiles) {
    const bytes = await fs.readFile(file);
    let text;
    let validUtf8;
    try {
      text = strictDecoder.decode(bytes);
      validUtf8 = true;
    } catch {
      // Strict decoding failed; fall back to a lenient decode so the
      // line-level analysis below still has text to work with.
      validUtf8 = false;
      text = bytes.toString("utf8");
    }

    const { replacementChar, controlChars, mojibake } = analyze(text);
    // [summary tag, detail label, offending line numbers] in report order.
    const findings = [
      ["replacement_char", " lines with replacement char �:", replacementChar],
      ["control_chars", " lines with control chars:", controlChars],
      ["suspicious_mojibake", " lines suspicious of mojibake:", mojibake],
    ].filter(([, , lineNumbers]) => lineNumbers.length > 0);

    if (validUtf8 && findings.length === 0) {
      continue;
    }

    flaggedFiles += 1;
    const rel = path.relative(process.cwd(), file);
    const summary = [];
    if (!validUtf8) {
      summary.push("invalid_utf8");
    }
    for (const [tag, , lineNumbers] of findings) {
      summary.push(`${tag}:${lineNumbers.length}`);
    }
    console.log(`ISSUE ${rel} -> ${summary.join(", ")}`);

    if (!validUtf8) {
      console.log(" detail: invalid UTF-8 sequence detected by decoder");
    }
    for (const [, label, lineNumbers] of findings) {
      console.log(label, preview(lineNumbers));
    }
  }

  if (flaggedFiles === 0) {
    console.log("No encoding issues detected in zh/docs");
  }
}
// Script entry point: run the scan; on any unexpected failure, print the
// error and terminate with a non-zero exit status.
try {
  await run();
} catch (err) {
  console.error(err);
  process.exit(1);
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment