Skip to content

Instantly share code, notes, and snippets.

@bytemain
Created November 22, 2025 03:21
Show Gist options
  • Select an option

  • Save bytemain/a81fdfabd8cbbceea42cb8a85303792e to your computer and use it in GitHub Desktop.

Select an option

Save bytemain/a81fdfabd8cbbceea42cb8a85303792e to your computer and use it in GitHub Desktop.
Find invalid text encoding in Chinese documentation files
import fs from "node:fs/promises";
import path from "node:path";
import process from "node:process";
import { TextDecoder } from "node:util";
// Root directory that gets scanned: <current working directory>/site/content/zh/docs.
const docsSegments = ["site", "content", "zh", "docs"];
const docsDir = path.join(process.cwd(), ...docsSegments);
/**
 * Recursively collects the paths of all regular files below a directory.
 *
 * Entries that are neither regular files nor directories are ignored.
 * Sub-directories are walked one at a time, in the order `readdir` returns
 * them, and their files are appended at the position of the directory entry.
 *
 * @param {string} dir - Directory to walk.
 * @returns {Promise<string[]>} Paths built by joining `dir` with each entry name.
 */
async function listFiles(dir) {
  const dirents = await fs.readdir(dir, { withFileTypes: true });
  const collected = [];

  for (const dirent of dirents) {
    const fullPath = path.join(dir, dirent.name);

    if (dirent.isFile()) {
      collected.push(fullPath);
      continue;
    }

    if (dirent.isDirectory()) {
      const nested = await listFiles(fullPath);
      collected.push(...nested);
    }
  }

  return collected;
}
/**
 * Scans decoded text line by line for signs of encoding damage.
 *
 * The text is split on LF or CRLF and every reported line number is 1-based.
 * Three independent checks are applied to each line:
 *  - `replacementChar`: the line contains U+FFFD.
 *  - `controlChars`: the line contains a character in U+0000-U+001F other
 *    than tab, line feed or carriage return.
 *  - `mojibake`: after removing the whitelisted sequences "ØMQ" and "×", the
 *    line contains both a character in U+4E00-U+9FFF and a character in
 *    U+00C0-U+00FF.
 *
 * @param {string} content - Text to inspect.
 * @returns {{replacementChar: number[], controlChars: number[], mojibake: number[]}}
 *   Line numbers flagged by each check, in ascending order.
 */
function analyze(content) {
  // Latin-1 sequences that are allowed next to CJK text and must not trigger the mojibake check.
  const allowedLatin = [/ØMQ/g, /×/g];
  const controlPattern = /[\x00-\x08\x0B\x0C\x0E-\x1F]/;
  const cjkPattern = /[\u4E00-\u9FFF]/;
  const latin1Pattern = /[\u00C0-\u00FF]/;

  const replacementChar = [];
  const controlChars = [];
  const mojibake = [];

  content.split(/\r?\n/).forEach((text, index) => {
    const lineNumber = index + 1;

    if (text.includes("\uFFFD")) {
      replacementChar.push(lineNumber);
    }

    if (controlPattern.test(text)) {
      controlChars.push(lineNumber);
    }

    const stripped = allowedLatin.reduce(
      (remaining, pattern) => remaining.replace(pattern, ""),
      text,
    );
    if (cjkPattern.test(stripped) && latin1Pattern.test(stripped)) {
      mojibake.push(lineNumber);
    }
  });

  return { replacementChar, controlChars, mojibake };
}
/**
 * Checks every file under `docsDir` for encoding problems and prints a report.
 *
 * Each file is first decoded with a fatal UTF-8 decoder. If that throws, the
 * file is marked as invalid UTF-8 and decoded again leniently so the line
 * checks in `analyze` can still run. For every file with at least one finding
 * an "ISSUE" summary line is logged, followed by one detail line per finding
 * kind listing at most the first five affected line numbers. If no file has
 * any finding, a single confirmation message is logged instead.
 *
 * @returns {Promise<void>}
 */
async function run() {
  const strictDecoder = new TextDecoder("utf-8", { fatal: true });
  // Formats up to five line numbers as "#1, #2, ...".
  const preview = (lineNumbers) =>
    lineNumbers
      .slice(0, 5)
      .map((n) => `#${n}`)
      .join(", ");

  let flaggedFiles = 0;

  for (const filePath of await listFiles(docsDir)) {
    const bytes = await fs.readFile(filePath);

    let text;
    let decodedCleanly;
    try {
      text = strictDecoder.decode(bytes);
      decodedCleanly = true;
    } catch {
      // Strict decoding failed; decode leniently so the line checks can still run.
      decodedCleanly = false;
      text = bytes.toString("utf8");
    }

    const { replacementChar, controlChars, mojibake } = analyze(text);

    // `tags` feeds the summary line; `details` holds the arguments of each follow-up log call.
    const tags = [];
    const details = [];

    if (!decodedCleanly) {
      tags.push("invalid_utf8");
      details.push([" detail: invalid UTF-8 sequence detected by decoder"]);
    }
    if (replacementChar.length > 0) {
      tags.push(`replacement_char:${replacementChar.length}`);
      details.push([" lines with replacement char �:", preview(replacementChar)]);
    }
    if (controlChars.length > 0) {
      tags.push(`control_chars:${controlChars.length}`);
      details.push([" lines with control chars:", preview(controlChars)]);
    }
    if (mojibake.length > 0) {
      tags.push(`suspicious_mojibake:${mojibake.length}`);
      details.push([" lines suspicious of mojibake:", preview(mojibake)]);
    }

    if (tags.length === 0) {
      continue;
    }

    flaggedFiles += 1;
    const relativePath = path.relative(process.cwd(), filePath);
    console.log(`ISSUE ${relativePath} -> ${tags.join(", ")}`);
    for (const logArgs of details) {
      console.log(...logArgs);
    }
  }

  if (flaggedFiles === 0) {
    console.log("No encoding issues detected in zh/docs");
  }
}
/**
 * Logs an unexpected failure from `run` and ends the process with exit code 1.
 *
 * @param {unknown} failure - The rejection reason.
 */
function reportFatal(failure) {
  console.error(failure);
  process.exit(1);
}

run().catch(reportFatal);
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment