Skip to content

Instantly share code, notes, and snippets.

@erdesigns-eu
Created August 6, 2025 07:42
Show Gist options
  • Select an option

  • Save erdesigns-eu/fb9e205eef3178cff16a531166f18a69 to your computer and use it in GitHub Desktop.

Select an option

Save erdesigns-eu/fb9e205eef3178cff16a531166f18a69 to your computer and use it in GitHub Desktop.
JSON State Machine Parser for parsing JSON responses from LLMs. This parser fixes common issues in JSON and validates/repairs the result against a JSON Schema.
/**
* The subset of JSON Schema keywords understood by this file's validator.
* Every keyword is optional.
*/
export interface JSONSchema {
/**
* The expected type of the JSON value.
*/
type?: 'object' | 'array' | 'string' | 'number' | 'boolean' | 'null';
/**
* Schemas for named properties of the object, if type is 'object'.
*/
properties?: Record<string, JSONSchema>;
/**
* Schema applied to every item of the array, if type is 'array'.
*/
items?: JSONSchema;
/**
* Names of properties that must be present on the object.
*/
required?: string[];
/**
* Controls properties that are not otherwise declared.
* `false` causes such properties to be removed during repair.
*/
additionalProperties?: boolean | JSONSchema;
/**
* Schemas for properties selected by name: each key is a regular-expression
* source that is tested against the object's property names.
*/
patternProperties?: Record<string, JSONSchema>;
/**
* Minimum number of items, if type is 'array'.
*/
minItems?: number;
/**
* Maximum number of items, if type is 'array'.
*/
maxItems?: number;
/**
* Human-readable description of the schema.
*/
description?: string;
/**
* Alternative schemas for the value. When present, the value is checked
* against these alternatives rather than against the other keywords.
*/
oneOf?: JSONSchema[];
}
/**
* Outcome of validating (and repairing) a value against a schema.
*/
export interface ValidationResult {
/**
* Whether the value is valid according to the schema, i.e. no errors were recorded.
*/
valid: boolean;
/**
* Human-readable error messages collected during validation, each prefixed with the path of the offending value.
*/
errors: string[];
/**
* The value after coercion/repair. The validator fills this in even when
* errors were recorded, so check `valid` before trusting it.
*/
repaired?: any;
}
/**
* Options for the JSONStateMachineParser.
* Every option is optional; the parser's constructor supplies the defaults noted below.
*/
export interface JSONParserOptions {
/**
* Allow single-quoted strings (e.g., 'value').
* Defaults to true.
*/
allowSingleQuotedStrings?: boolean;
/**
* Allow trailing commas in objects and arrays.
* Defaults to true.
*/
allowTrailingCommas?: boolean;
/**
* Remove leading BOM (U+FEFF) if present.
* Defaults to true.
*/
removeBom?: boolean;
/**
* Normalize newline sequences (CRLF and lone CR) to LF.
* Defaults to true.
*/
normalizeNewlines?: boolean;
/**
* Remove control characters (code points below 0x20) except tab, LF, CR.
* Defaults to true.
*/
removeControlChars?: boolean;
/**
* Automatically convert date strings to Date objects.
* Defaults to false.
*/
parseDates?: boolean;
}
/**
* Reviver function type for transforming parsed values.
* Similar to the second parameter of JSON.parse: it is invoked with the
* containing object as `this`, children are visited before their parent,
* and returning `undefined` for a nested key removes that key.
*/
export type JSONReviver = (this: any, key: string, value: any) => any;
/**
 * A lenient, hand-written JSON parser aimed at LLM output, with reviver support and
 * optional JSON-Schema-driven validation/repair.
 *
 * Compared to `JSON.parse` it additionally tolerates: Markdown code fences around the
 * payload, a leading BOM, raw control characters, single-quoted strings, trailing commas,
 * missing commas between elements, and unescaped quotes inside string values.
 *
 * An instance keeps the text being parsed only for the duration of a `parse` call.
 */
export class JSONStateMachineParser {
  /** Matches strings shaped like an ISO 8601 date (`YYYY-MM-DD`) or date-time. */
  private static readonly ISO_DATE_PATTERN =
    /^\d{4}-\d{2}-\d{2}(?:T\d{2}:\d{2}(?::\d{2}(?:\.\d+)?)?(?:Z|[+-]\d{2}:?\d{2})?)?$/;

  /** Matches exactly four hexadecimal digits (the payload of a `\u` escape). */
  private static readonly HEX4_PATTERN = /^[0-9a-fA-F]{4}$/;

  /** Matches an opening-fence line that carries nothing but an optional language label. */
  private static readonly FENCE_LABEL_LINE = /^[\w+#.-]*[ \t\r]*$/;

  /** Matches a language label glued in front of a single-line fenced payload. */
  private static readonly INLINE_FENCE_LABEL = /^[A-Za-z][\w+#.-]*(?=\s*[{\["'])/;

  /**
   * The text currently being parsed.
   * Reset to the empty string when `parse` finishes or throws, so the input is not retained.
   */
  private input: string = '';

  /**
   * Read position inside `input`.
   * Reset to 0 when `parse` finishes or throws.
   */
  private index: number = 0;

  /**
   * Parser options with defaults applied (set once in the constructor).
   */
  private options: Required<JSONParserOptions>;

  /**
   * Cheap structural check for whether a string looks like JSON.
   *
   * This does NOT parse the text. After removing an optional ``` code fence and surrounding
   * whitespace, it returns true when the text starts with `{` and ends with `}`, starts with
   * `[` and ends with `]`, or is exactly `null`, `true` or `false`.
   *
   * @param raw - The string to check.
   * @returns True if the string has the outer shape of JSON, false otherwise.
   */
  static isStringJson(raw: string): boolean {
    if (typeof raw !== 'string' || raw.length === 0) {
      return false;
    }
    const text = JSONStateMachineParser.stripCodeFence(raw);
    let start = 0;
    let finish = text.length - 1;
    while (start <= finish && JSONStateMachineParser.isWhitespace(text.charAt(start))) {
      start++;
    }
    while (finish >= start && JSONStateMachineParser.isWhitespace(text.charAt(finish))) {
      finish--;
    }
    if (start > finish) {
      return false;
    }
    const first = text.charAt(start);
    const last = text.charAt(finish);
    if ((first === '{' && last === '}') || (first === '[' && last === ']')) {
      return true;
    }
    const literal = text.substring(start, finish + 1);
    return literal === 'null' || literal === 'true' || literal === 'false';
  }

  /**
   * Create a new JSONStateMachineParser instance with optional configuration.
   * @param options - Configuration options; omitted options fall back to their defaults
   * (everything enabled except `parseDates`).
   */
  constructor(options: JSONParserOptions = {}) {
    const {
      allowSingleQuotedStrings = true,
      allowTrailingCommas = true,
      removeBom = true,
      normalizeNewlines = true,
      removeControlChars = true,
      parseDates = false,
    } = options;
    this.options = {
      allowSingleQuotedStrings,
      allowTrailingCommas,
      removeBom,
      normalizeNewlines,
      removeControlChars,
      parseDates,
    };
  }

  /**
   * Parse a JSON or JSON-like string into type T.
   *
   * Steps: unwrap a ``` fence, apply the configured clean-ups, parse, run the reviver walk
   * (plus date conversion when enabled), then validate/repair against `schema` if given.
   *
   * @param raw - The JSON or JSON-like string, optionally wrapped in ``` backticks.
   * @param schema - Optional JSON Schema object to validate and repair the result against.
   * @param reviver - Optional function to transform values; applied bottom-up.
   * @returns Parsed (and, with a schema, repaired) value as type T.
   * @throws TypeError if `raw` is not a string.
   * @throws SyntaxError if the text cannot be parsed.
   * @throws Error if a schema is given and the value does not match it.
   */
  public parse<T = any>(raw: string, schema?: JSONSchema, reviver?: JSONReviver): T {
    if (typeof raw !== 'string') {
      throw new TypeError('JSON input must be a string');
    }
    let src = this.unwrapBackticks(raw);
    if (this.options.removeBom && src.charCodeAt(0) === 0xFEFF) {
      src = src.slice(1);
    }
    if (this.options.normalizeNewlines) {
      src = this.normalize(src);
    }
    if (this.options.removeControlChars) {
      src = this.filterControlCharacters(src);
    }

    let result: any;
    try {
      this.input = src;
      this.index = 0;
      result = this.parseValue();
      this.skipWhitespace();
      if (this.index < this.input.length) {
        throw new SyntaxError(`Unexpected token at ${this.index}`);
      }
    }
    finally {
      // Release the input even when parsing fails, so a failed parse retains nothing.
      this.input = '';
      this.index = 0;
    }

    // Bottom-up reviver walk and optional ISO date conversion.
    const rootHolder: any = { '': result };
    const revived = this.applyReviver(rootHolder, '', reviver);

    if (schema) {
      const validationResult = this.validateAndRepair(revived, schema);
      if (!validationResult.valid) {
        throw new Error(`JSON does not match schema: ${validationResult.errors.join(', ')}`);
      }
      return validationResult.repaired as T;
    }
    return revived as T;
  }

  /**
   * Recursively walk a value (children first) and apply the reviver, converting ISO date
   * strings to `Date` when `parseDates` is enabled.
   * @param holder - The object holding the value.
   * @param key - The key of the value to revive.
   * @param reviver - Optional reviver function.
   * @returns The revived value.
   */
  private applyReviver(holder: any, key: string, reviver?: JSONReviver): any {
    let value = holder[key];
    if (value && typeof value === 'object') {
      for (const k in value) {
        if (Object.prototype.hasOwnProperty.call(value, k)) {
          const v = this.applyReviver(value, k, reviver);
          if (v === undefined) {
            delete value[k];
          }
          else {
            value[k] = v;
          }
        }
      }
    }
    // Only strings that are shaped like ISO dates are converted; `new Date()` alone would
    // also accept inputs such as "1" or "March 7" and corrupt ordinary string values.
    if (
      this.options.parseDates
      && typeof value === 'string'
      && JSONStateMachineParser.ISO_DATE_PATTERN.test(value)
    ) {
      const d = new Date(value);
      if (!Number.isNaN(d.getTime())) {
        value = d;
      }
    }
    if (reviver) {
      // Reviver functions receive their parent as `this`.
      return reviver.call(holder, key, value);
    }
    return value;
  }

  /**
   * Remove ``` wrappers and optional language label (e.g., json).
   * @param str - The input string to unwrap.
   * @returns The unwrapped string, or `str` unchanged when it is not fenced.
   */
  private unwrapBackticks(str: string): string {
    return JSONStateMachineParser.stripCodeFence(str);
  }

  /**
   * Shared implementation of code-fence removal.
   *
   * Handles multi-line fences (label on the opening line) as well as single-line fences
   * such as ```` ```{"a":1}``` ```` or ```` ```json {"a":1}``` ````. Whitespace around the
   * fence is ignored; text after the closing fence is dropped.
   *
   * @param raw - The possibly fenced text.
   * @returns The fenced payload, or `raw` unchanged when there is no complete fence.
   */
  private static stripCodeFence(raw: string): string {
    const text = raw.trim();
    if (!text.startsWith('```')) {
      return raw;
    }
    const close = text.lastIndexOf('```');
    if (close < 3) {
      // The only fence found is (or overlaps) the opening one: nothing to unwrap.
      return raw;
    }
    const body = text.slice(3, close);
    const newline = body.indexOf('\n');
    if (newline !== -1 && JSONStateMachineParser.FENCE_LABEL_LINE.test(body.slice(0, newline))) {
      // The opening line holds only the language label; the payload starts on the next line.
      return body.slice(newline + 1);
    }
    // Payload shares the line with the opening fence; drop a label glued in front of it.
    return body.replace(JSONStateMachineParser.INLINE_FENCE_LABEL, '');
  }

  /**
   * Normalize CRLF and lone CR to LF.
   * @param str - The input string to normalize.
   * @returns The normalized string.
   */
  private normalize(str: string): string {
    return str.replace(/\r\n?/g, '\n');
  }

  /**
   * Filter out control characters (code points below 0x20) except \t, \n, \r.
   * @param str - The input string to filter.
   * @returns The filtered string.
   */
  private filterControlCharacters(str: string): string {
    return str.replace(/[\u0000-\u0008\u000B\u000C\u000E-\u001F]/g, '');
  }

  /**
   * Parse any JSON value at the current position.
   * @returns The parsed value.
   * @throws SyntaxError at end of input or on an unrecognised token.
   */
  private parseValue(): any {
    this.skipWhitespace();
    if (this.index >= this.input.length) {
      throw new SyntaxError('Unexpected end');
    }
    const c = this.input.charAt(this.index);
    if (c === '{') {
      return this.parseObject();
    }
    if (c === '[') {
      return this.parseArray();
    }
    if (c === '"') {
      return this.parseString('"');
    }
    if (c === '\'' && this.options.allowSingleQuotedStrings) {
      return this.parseString('\'');
    }
    if (c === '-' || this.isDigit(c)) {
      return this.parseNumber();
    }
    return this.parseLiteral();
  }

  /**
   * Parse an object with lenient comma handling: a missing comma between members is
   * tolerated, and a trailing comma is accepted when `allowTrailingCommas` is set.
   * @returns The parsed object.
   * @throws SyntaxError on truncated input, a missing `:`, or a disallowed trailing comma.
   */
  private parseObject(): Record<string, any> {
    this.index++; // consume '{'
    const obj: Record<string, any> = {};
    let first = true;
    for (;;) {
      this.skipWhitespace();
      if (this.index >= this.input.length) {
        throw new SyntaxError('Unexpected end');
      }
      if (this.input.charAt(this.index) === '}') {
        this.index++;
        break;
      }
      if (!first && this.input.charAt(this.index) === ',') {
        this.index++;
        this.skipWhitespace();
        if (this.input.charAt(this.index) === '}') {
          if (!this.options.allowTrailingCommas) {
            throw new SyntaxError(`Trailing comma at ${this.index}`);
          }
          this.index++;
          break;
        }
      }
      first = false;
      const key = this.parseKey();
      // Whitespace is allowed between the key and the colon.
      this.skipWhitespace();
      if (this.input.charAt(this.index) !== ':') {
        throw new SyntaxError(`Expected ':' at ${this.index}`);
      }
      this.index++;
      const val = this.parseValue();
      // A key that ends in a backslash has that backslash removed before it is stored.
      const name = key.endsWith('\\') ? key.slice(0, -1) : key;
      JSONStateMachineParser.setOwn(obj, name, val);
    }
    return obj;
  }

  /**
   * Parse an array with lenient comma handling: a missing comma between items is tolerated,
   * and a trailing comma is accepted when `allowTrailingCommas` is set.
   * @returns The parsed array.
   * @throws SyntaxError on truncated input or a disallowed trailing comma.
   */
  private parseArray(): any[] {
    this.index++; // consume '['
    const arr: any[] = [];
    let first = true;
    for (;;) {
      this.skipWhitespace();
      if (this.index >= this.input.length) {
        throw new SyntaxError('Unexpected end');
      }
      if (this.input.charAt(this.index) === ']') {
        this.index++;
        break;
      }
      if (!first && this.input.charAt(this.index) === ',') {
        this.index++;
        this.skipWhitespace();
        if (this.input.charAt(this.index) === ']') {
          if (!this.options.allowTrailingCommas) {
            throw new SyntaxError(`Trailing comma at ${this.index}`);
          }
          this.index++;
          break;
        }
      }
      first = false;
      arr.push(this.parseValue());
    }
    return arr;
  }

  /**
   * Parse a quoted string starting at the current position.
   *
   * Leniencies: a quote character only terminates the string when it is followed (ignoring
   * whitespace) by `,`, `:`, `}`, `]` or end of input — otherwise it is kept as content, which
   * repairs unescaped quotes inside values. Unknown escapes yield the escaped character
   * itself, and a string that runs to end of input is returned as-is.
   *
   * @param quote - The quote character that opened the string.
   * @returns The parsed string.
   */
  private parseString(quote: '"' | '\''): string {
    this.index++; // consume the opening quote
    const buf: string[] = [];
    while (this.index < this.input.length) {
      const ch = this.input.charAt(this.index++);
      if (ch === '\\') {
        const esc = this.input.charAt(this.index++);
        switch (esc) {
          case 'b': buf.push('\b'); break;
          case 'f': buf.push('\f'); break;
          case 'n': buf.push('\n'); break;
          case 'r': buf.push('\r'); break;
          case 't': buf.push('\t'); break;
          case 'u': {
            const hex = this.input.slice(this.index, this.index + 4);
            if (JSONStateMachineParser.HEX4_PATTERN.test(hex)) {
              buf.push(String.fromCharCode(parseInt(hex, 16)));
              this.index += 4;
            }
            else {
              // Not a real unicode escape (e.g. "C:\users"): treat it like any other
              // unknown escape instead of fabricating a character and eating four chars.
              buf.push(esc);
            }
            break;
          }
          default: buf.push(esc); break;
        }
      }
      else if (ch === quote) {
        const nxt = this.peekNextNonWhitespace();
        if (nxt === undefined || nxt === ',' || nxt === ':' || nxt === '}' || nxt === ']') {
          break;
        }
        buf.push(ch);
      }
      else {
        buf.push(ch);
      }
    }
    return buf.join('');
  }

  /**
   * Parse a number from the input string.
   * Handles integers, floats, and scientific notation.
   * @returns The parsed number.
   * @throws SyntaxError if the consumed text is not a number (e.g. a lone `-`).
   */
  private parseNumber(): number {
    const start = this.index;
    if (this.input.charAt(this.index) === '-') {
      this.index++;
    }
    this.skipDigits();
    if (this.input.charAt(this.index) === '.') {
      this.index++;
      this.skipDigits();
    }
    if (this.input.charAt(this.index).toLowerCase() === 'e') {
      this.index++;
      const sign = this.input.charAt(this.index);
      if (sign === '+' || sign === '-') {
        this.index++;
      }
      this.skipDigits();
    }
    const num = Number(this.input.slice(start, this.index));
    if (Number.isNaN(num)) {
      throw new SyntaxError('Invalid number');
    }
    return num;
  }

  /**
   * Advance the index past a run of decimal digits.
   */
  private skipDigits(): void {
    while (this.isDigit(this.input.charAt(this.index))) {
      this.index++;
    }
  }

  /**
   * Parse literal values: true, false, or null.
   * @returns The parsed literal value.
   * @throws SyntaxError if none of the literals starts at the current position.
   */
  private parseLiteral(): boolean | null {
    const rem = this.input.substring(this.index, this.index + 5);
    if (rem.startsWith('true')) {
      this.index += 4;
      return true;
    }
    if (rem.startsWith('false')) {
      this.index += 5;
      return false;
    }
    if (rem.startsWith('null')) {
      this.index += 4;
      return null;
    }
    throw new SyntaxError('Unexpected literal');
  }

  /**
   * Parse an object key, which must be a quoted string (single quotes only when
   * `allowSingleQuotedStrings` is set).
   * @returns The parsed key.
   * @throws SyntaxError if the next token is not an accepted opening quote.
   */
  private parseKey(): string {
    this.skipWhitespace();
    const opener = this.input.charAt(this.index);
    if (opener === '"' || (opener === '\'' && this.options.allowSingleQuotedStrings)) {
      return this.parseString(opener as '"' | '\'');
    }
    throw new SyntaxError(`Expected property name at ${this.index}`);
  }

  /**
   * Advance the index past spaces, tabs, newlines, and carriage returns.
   * @returns void
   */
  private skipWhitespace(): void {
    while (JSONStateMachineParser.isWhitespace(this.input.charAt(this.index))) {
      this.index++;
    }
  }

  /**
   * Peek the next non-whitespace character without advancing the index.
   * @returns The next non-whitespace character, or undefined at end of input.
   */
  private peekNextNonWhitespace(): string | undefined {
    for (let i = this.index; i < this.input.length; i++) {
      const c = this.input.charAt(i);
      if (!JSONStateMachineParser.isWhitespace(c)) {
        return c;
      }
    }
    return undefined;
  }

  /**
   * Check if a character is one of the four JSON whitespace characters.
   * @param c - The character to check (the empty string is not whitespace).
   * @returns True for space, tab, LF and CR.
   */
  private static isWhitespace(c: string): boolean {
    return c === ' ' || c === '\t' || c === '\n' || c === '\r';
  }

  /**
   * Check if a character is a digit (0-9).
   * @param c - The character to check.
   * @returns True if the character is a digit, false otherwise.
   */
  private isDigit(c: string): boolean {
    return c >= '0' && c <= '9';
  }

  /**
   * Create or overwrite an own, enumerable data property.
   * Unlike plain assignment this never triggers the `__proto__` setter, so a key named
   * `__proto__` in untrusted input becomes ordinary data instead of replacing the prototype.
   * @param target - The object to write to.
   * @param key - The property name.
   * @param value - The property value.
   */
  private static setOwn(target: Record<string, any>, key: string, value: any): void {
    Object.defineProperty(target, key, {
      value,
      writable: true,
      enumerable: true,
      configurable: true,
    });
  }

  /**
   * Name the JSON type of a value for error messages.
   * @param value - Any value.
   * @returns 'null', 'array', or the `typeof` result.
   */
  private static describeType(value: any): string {
    if (value === null) {
      return 'null';
    }
    return Array.isArray(value) ? 'array' : typeof value;
  }

  /**
   * Check whether a value already has the given schema type, without any coercion.
   * @param value - The value to inspect.
   * @param type - A schema type name.
   * @returns True if no coercion would be needed.
   */
  private static matchesType(value: any, type: string): boolean {
    switch (type) {
      case 'null': return value === null;
      case 'array': return Array.isArray(value);
      case 'object': return value !== null && typeof value === 'object' && !Array.isArray(value);
      default: return typeof value === type;
    }
  }

  /**
   * Validate and repair a value against a JSON schema.
   * When the schema has `oneOf`, only the alternatives are considered.
   * @param value - The value to validate.
   * @param schema - The JSON schema to validate against.
   * @param path - The current path for error reporting.
   * @returns Validation result with errors and repaired value.
   */
  private validateAndRepair(value: any, schema: JSONSchema, path: string = ''): ValidationResult {
    if (schema.oneOf) {
      return this.validateOneOf(value, schema.oneOf, path);
    }
    const errors: string[] = [];
    let repaired = value;
    // Type validation and coercion.
    if (schema.type) {
      const typeResult = this.validateType(value, schema.type, path);
      errors.push(...typeResult.errors);
      // Assign directly: `??` would discard a legitimate repair to `null`.
      repaired = typeResult.repaired;
    }
    // Object-specific validation.
    if (schema.type === 'object' && JSONStateMachineParser.matchesType(repaired, 'object')) {
      const objectResult = this.validateObject(repaired, schema, path);
      errors.push(...objectResult.errors);
      repaired = objectResult.repaired;
    }
    // Array-specific validation.
    if (schema.type === 'array' && Array.isArray(repaired)) {
      const arrayResult = this.validateArray(repaired, schema, path);
      errors.push(...arrayResult.errors);
      repaired = arrayResult.repaired;
    }
    return {
      valid: errors.length === 0,
      errors,
      repaired,
    };
  }

  /**
   * Pick the best alternative of a `oneOf`.
   *
   * Preference order: (1) the first alternative that validates and whose declared type the
   * value already has, so e.g. the number 42 is not turned into "42" just because a string
   * alternative is listed first; (2) the first alternative that validates after coercion;
   * (3) the alternative with the fewest errors.
   *
   * @param value - The value to validate.
   * @param alternatives - The `oneOf` schemas.
   * @param path - The current path for error reporting.
   * @returns The result of the chosen alternative, or an error result for an empty list.
   */
  private validateOneOf(value: any, alternatives: JSONSchema[], path: string): ValidationResult {
    let firstValid: ValidationResult | undefined;
    let fewestErrors: ValidationResult | undefined;
    for (const subSchema of alternatives) {
      const result = this.validateAndRepair(value, subSchema, path);
      if (result.valid) {
        if (subSchema.type && JSONStateMachineParser.matchesType(value, subSchema.type)) {
          return result;
        }
        if (firstValid === undefined) {
          firstValid = result;
        }
      }
      else if (fewestErrors === undefined || result.errors.length < fewestErrors.errors.length) {
        fewestErrors = result;
      }
    }
    if (firstValid !== undefined) {
      return firstValid;
    }
    if (fewestErrors !== undefined) {
      return fewestErrors;
    }
    return {
      valid: false,
      errors: [`${path}: value does not match any schema in oneOf`],
      repaired: value,
    };
  }

  /**
   * Validate and coerce a value to the expected type.
   *
   * Coercions to string, boolean and null never fail; number, object and array record an
   * error (and substitute 0, {} or []) when the value cannot be converted.
   *
   * @param value - The value to validate.
   * @param expectedType - The expected type.
   * @param path - The current path for error reporting.
   * @returns Validation result with type coercion.
   */
  private validateType(value: any, expectedType: string, path: string): ValidationResult {
    const errors: string[] = [];
    let repaired = value;
    switch (expectedType) {
      case 'string':
        if (typeof value !== 'string') {
          if (value === null || value === undefined) {
            repaired = '';
          }
          else if (value instanceof Date) {
            repaired = value.toISOString();
          }
          else if (typeof value === 'object') {
            // String(obj) would produce the useless "[object Object]".
            repaired = JSON.stringify(value);
          }
          else {
            repaired = String(value);
          }
        }
        break;
      case 'number':
        if (typeof value !== 'number') {
          const num = Number(value);
          if (Number.isNaN(num)) {
            errors.push(`${path}: expected number, got ${JSONStateMachineParser.describeType(value)}`);
            repaired = 0;
          }
          else {
            repaired = num;
          }
        }
        break;
      case 'boolean':
        if (typeof value !== 'boolean') {
          const text = typeof value === 'string' ? value.trim().toLowerCase() : undefined;
          if (text === 'true') {
            repaired = true;
          }
          else if (text === 'false') {
            // Boolean("false") is true, which would silently invert the value.
            repaired = false;
          }
          else {
            repaired = Boolean(value);
          }
        }
        break;
      case 'object':
        if (!JSONStateMachineParser.matchesType(value, 'object')) {
          if (value !== null && value !== undefined) {
            errors.push(`${path}: expected object, got ${JSONStateMachineParser.describeType(value)}`);
          }
          repaired = {};
        }
        break;
      case 'array':
        if (!Array.isArray(value)) {
          if (value !== null && value !== undefined) {
            errors.push(`${path}: expected array, got ${JSONStateMachineParser.describeType(value)}`);
          }
          repaired = [];
        }
        break;
      case 'null':
        if (value !== null) {
          repaired = null;
        }
        break;
    }
    return {
      valid: errors.length === 0,
      errors,
      repaired,
    };
  }

  /**
   * Validate an object against a schema, working on a shallow copy.
   *
   * Missing required properties are reported and filled with a default. Declared and
   * pattern-matched properties are validated recursively. Undeclared properties are removed
   * when `additionalProperties` is `false`, or validated against it when it is a schema.
   *
   * @param obj - The object to validate.
   * @param schema - The schema to validate against.
   * @param path - The current path for error reporting.
   * @returns Validation result.
   */
  private validateObject(obj: any, schema: JSONSchema, path: string): ValidationResult {
    const errors: string[] = [];
    const repaired: Record<string, any> = {};
    // Own-property checks only: `in` would treat inherited names such as "toString" as present.
    const owns = (target: object, key: string): boolean =>
      Object.prototype.hasOwnProperty.call(target, key);

    for (const key of Object.keys(obj)) {
      JSONStateMachineParser.setOwn(repaired, key, obj[key]);
    }

    // Required properties.
    if (schema.required) {
      for (const requiredProp of schema.required) {
        if (owns(repaired, requiredProp)) {
          continue;
        }
        errors.push(`${path}.${requiredProp}: required property missing`);
        const propSchema = schema.properties && owns(schema.properties, requiredProp)
          ? schema.properties[requiredProp]
          : undefined;
        JSONStateMachineParser.setOwn(
          repaired,
          requiredProp,
          propSchema ? this.getDefaultValue(propSchema) : null,
        );
      }
    }

    // Declared properties.
    if (schema.properties) {
      for (const [propName, propSchema] of Object.entries(schema.properties)) {
        if (!owns(repaired, propName)) {
          continue;
        }
        const result = this.validateAndRepair(repaired[propName], propSchema, `${path}.${propName}`);
        errors.push(...result.errors);
        JSONStateMachineParser.setOwn(repaired, propName, result.repaired);
      }
    }

    // Pattern properties; each pattern is compiled once and reused below.
    const patterns: Array<{ regex: RegExp; schema: JSONSchema }> = [];
    if (schema.patternProperties) {
      for (const [source, patternSchema] of Object.entries(schema.patternProperties)) {
        patterns.push({ regex: new RegExp(source), schema: patternSchema });
      }
    }
    for (const pattern of patterns) {
      for (const key of Object.keys(repaired)) {
        if (!pattern.regex.test(key)) {
          continue;
        }
        const result = this.validateAndRepair(repaired[key], pattern.schema, `${path}.${key}`);
        errors.push(...result.errors);
        JSONStateMachineParser.setOwn(repaired, key, result.repaired);
      }
    }

    // Properties that are neither declared nor matched by a pattern.
    const additional = schema.additionalProperties;
    if (additional === false || (typeof additional === 'object' && additional !== null)) {
      const declared = new Set(Object.keys(schema.properties || {}));
      for (const key of Object.keys(repaired)) {
        if (declared.has(key) || patterns.some((pattern) => pattern.regex.test(key))) {
          continue;
        }
        if (additional === false) {
          delete repaired[key];
        }
        else {
          const result = this.validateAndRepair(repaired[key], additional, `${path}.${key}`);
          errors.push(...result.errors);
          JSONStateMachineParser.setOwn(repaired, key, result.repaired);
        }
      }
    }

    return {
      valid: errors.length === 0,
      errors,
      repaired,
    };
  }

  /**
   * Validate an array against a schema, building a new array.
   *
   * Items beyond `maxItems` are dropped and an array shorter than `minItems` is padded with
   * default values; both situations are also reported as errors.
   *
   * @param arr - The array to validate.
   * @param schema - The schema to validate against.
   * @param path - The current path for error reporting.
   * @returns Validation result.
   */
  private validateArray(arr: any[], schema: JSONSchema, path: string): ValidationResult {
    const errors: string[] = [];
    const repaired: any[] = [];
    const { minItems, maxItems, items } = schema;

    if (minItems !== undefined && arr.length < minItems) {
      errors.push(`${path}: array has ${arr.length} items, minimum is ${minItems}`);
    }
    if (maxItems !== undefined && arr.length > maxItems) {
      errors.push(`${path}: array has ${arr.length} items, maximum is ${maxItems}`);
    }

    // Keep (and validate) the items that fit within maxItems.
    const keep = maxItems === undefined ? arr.length : Math.min(arr.length, maxItems);
    for (let i = 0; i < keep; i++) {
      if (items) {
        const result = this.validateAndRepair(arr[i], items, `${path}[${i}]`);
        errors.push(...result.errors);
        repaired.push(result.repaired);
      }
      else {
        repaired.push(arr[i]);
      }
    }

    // Pad a too-short array up to minItems with defaults.
    if (minItems !== undefined && arr.length < minItems) {
      while (repaired.length < minItems) {
        repaired.push(items ? this.getDefaultValue(items) : null);
      }
    }

    return {
      valid: errors.length === 0,
      errors,
      repaired,
    };
  }

  /**
   * Get a default value for a schema type.
   * @param schema - The schema to get default value for.
   * @returns '' / 0 / false / [] / {} for the matching type, otherwise null.
   */
  private getDefaultValue(schema: JSONSchema): any {
    switch (schema.type) {
      case 'string': return '';
      case 'number': return 0;
      case 'boolean': return false;
      case 'array': return [];
      case 'object': return {};
      case 'null': return null;
      default: return null;
    }
  }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment