Created
October 6, 2025 16:02
-
-
Save ClarkeRemy/4d58aaadd17986c89cc31af74467b8df to your computer and use it in GitHub Desktop.
Simple Sexpr tokenizer
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #include <stdio.h> | |
| #include <stdint.h> | |
| #include <stddef.h> | |
/* Minimal boolean constants (this file does not use <stdbool.h>). */
#define false 0
#define true  1
| char* input = "(a bc (def \n 5) 7)"; | |
/*
 * A symbol expressed as a slice of some character buffer: `len` characters
 * starting at `ptr`. The slice is not NUL-terminated on its own.
 */
struct Sym {
    char* ptr;      /* first character of the symbol */
    uintptr_t len;  /* number of characters in the symbol */
};
| // struct Sexpr | |
| // { | |
| // enum { NODE, NIL, SYM } tag; | |
| // union { | |
| // struct { struct Sexpr* val; struct Sexpr* next; } node; | |
| // struct { } nil; | |
| // struct Sym sym; | |
| // }; | |
| // }; | |
/*
 * Kinds of token produced by the tokenizer; the values are stored in
 * `struct Token.tag`.
 *
 * `TokenType` is the enum's tag. Written as `enum {...} TokenType;` it would
 * instead declare an unused global variable of an anonymous enum type.
 */
enum TokenType { LPAR, RPAR, SYM };
| struct Token { | |
| int tag; | |
| union { | |
| struct {} lpar; | |
| struct {} rpar; | |
| struct Sym sym; | |
| } data; | |
| }; | |
/*
 * Scans the symbol beginning at `start`, stores it as a SYM token in
 * tokens[*n_tokens] and increments *n_tokens.
 */
void parse_sym(struct Token* tokens, uintptr_t* n_tokens, char* start);

/* Prints a readable form of one token to stdout. */
void print_token(struct Token* token);

/* Capacity of the token array declared in main(). */
#define MAX_TOKENS 5000
| int main() { | |
| struct Token tokens[MAX_TOKENS] = {0}; | |
| uintptr_t n_tokens = 0; | |
| int dbg = false; | |
| char* start = input; | |
| while (*start != '\0') { | |
| switch (*start) | |
| { | |
| case ' ': | |
| case '\n': | |
| if (dbg) printf("WS\n"); | |
| break; | |
| case '(': | |
| if (dbg) printf("LPAR\n"); | |
| if (n_tokens == MAX_TOKENS) goto fail; | |
| tokens[n_tokens] = (struct Token){.tag = LPAR, .data.lpar = {}}; | |
| n_tokens+=1; | |
| break; | |
| case ')': | |
| if (dbg) printf("RPAR\n"); | |
| if (n_tokens == MAX_TOKENS) goto fail; | |
| tokens[n_tokens] = (struct Token){.tag = RPAR, .data.rpar = {}}; | |
| n_tokens+=1; | |
| break; | |
| default: // must be after ! | |
| if (dbg) printf("SYM\n"); | |
| if (n_tokens == MAX_TOKENS) goto fail; | |
| parse_sym(tokens, &n_tokens, start); | |
| break; | |
| } | |
| ++start; | |
| } | |
| putchar('['); | |
| for (uintptr_t i = 0; i != n_tokens; i++) { | |
| print_token(&tokens[i]); | |
| if (i+1 != n_tokens ) { | |
| putchar(','); | |
| putchar(' '); | |
| } | |
| } | |
| putchar(']'); | |
| putchar('\n'); | |
| return 0; | |
| fail:printf("RAN OUT OF TOKEN SPACE\n"); | |
| return 1; | |
| } | |
| void parse_sym | |
| ( struct Token* tokens, | |
| uintptr_t* n_tokens, | |
| char* start | |
| ) | |
| { | |
| char* next = start; | |
| uintptr_t len = 0; | |
| while (true) | |
| { | |
| next += 1; | |
| len += 1; | |
| int break_out = false; | |
| switch (*next) | |
| { | |
| case ' ' : | |
| case '\n': | |
| case '(' : | |
| case ')' : | |
| break_out = true; | |
| } | |
| if (break_out) { | |
| break; | |
| } | |
| } | |
| tokens[*n_tokens] = | |
| (struct Token) | |
| { .tag = SYM, | |
| .data.sym = { .ptr=start, .len = len} | |
| }; | |
| *n_tokens +=1; | |
| } | |
| void print_token(struct Token* token) { | |
| switch (token->tag) | |
| { | |
| case LPAR: | |
| printf("LPAR"); | |
| break; | |
| case RPAR: | |
| printf("RPAR"); | |
| break; | |
| case SYM: | |
| printf("SYM=\""); | |
| char* end = token->data.sym.ptr | |
| + token->data.sym.len; | |
| for ( char* cursor = token->data.sym.ptr | |
| ; cursor != end | |
| ; ++cursor | |
| ) | |
| { | |
| putchar(*cursor); | |
| } | |
| printf("\""); | |
| break; | |
| default: | |
| printf("INVALID TOKEN\n"); | |
| break; | |
| } | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment