Skip to content

Instantly share code, notes, and snippets.

@ClarkeRemy
Created October 6, 2025 16:02
Show Gist options
  • Save ClarkeRemy/4d58aaadd17986c89cc31af74467b8df to your computer and use it in GitHub Desktop.
Save ClarkeRemy/4d58aaadd17986c89cc31af74467b8df to your computer and use it in GitHub Desktop.
Simple Sexpr tokenizer
#include <stdio.h>
#include <stdint.h>
#include <stddef.h>
// Boolean constants (`true` / `false`) come from the standard header rather
// than hand-rolled macros, whose unparenthesized `!false` expansion is fragile.
#include <stdbool.h>
// Sample S-expression text that main() tokenizes. It contains nested
// parentheses, multi-character symbols and an embedded newline.
char* input =
    "(a bc (def \n 5) 7)";
// A symbol's text as a (pointer, length) slice. The tokenizer points `ptr`
// into the text being scanned, so the slice is not NUL-terminated on its own;
// always use `len` to find its end.
struct Sym {
    char* ptr;       // first character of the symbol
    uintptr_t len;   // number of characters in the symbol
};
// struct Sexpr
// {
// enum { NODE, NIL, SYM } tag;
// union {
// struct { struct Sexpr* val; struct Sexpr* next; } node;
// struct { } nil;
// struct Sym sym;
// };
// };
// Kinds of token the tokenizer produces; one of these values is stored in the
// `tag` field of a token.
//   LPAR - an opening parenthesis '('
//   RPAR - a closing parenthesis ')'
//   SYM  - a symbol (any run of non-delimiter characters)
// `TokenType` is the enum's tag (a type name), not a variable.
enum TokenType { LPAR, RPAR, SYM };
// One lexical token. `tag` holds LPAR, RPAR or SYM and tells which member of
// `data` is meaningful.
// NOTE: the member-less structs used for `lpar` / `rpar` are a compiler
// extension (accepted by GCC and Clang), not standard C.
struct Token {
    int tag;
    union {
        struct {} lpar;    // '(' carries no payload
        struct {} rpar;    // ')' carries no payload
        struct Sym sym;    // text of the symbol when tag == SYM
    } data;
};
// Forward declarations for the helpers defined after main().

// Scans the symbol that begins at `start` and stores it as a SYM token at
// tokens[*n_tokens], then increments *n_tokens.
void parse_sym(struct Token* tokens, uintptr_t* n_tokens, char* start);

// Writes a textual form of `token` to stdout.
void print_token(struct Token* token);

// Capacity of the token array that main() fills.
#define MAX_TOKENS 5000
// Tokenizes the global `input` string and prints the resulting token list.
//
// Spaces and newlines are skipped, '(' and ')' become LPAR / RPAR tokens, and
// any other character starts a symbol that is scanned by parse_sym. The tokens
// are then written to stdout as "[TOK, TOK, ...]" followed by a newline.
//
// Returns 0 on success, or 1 after printing "RAN OUT OF TOKEN SPACE" when the
// input needs more than MAX_TOKENS tokens.
int main(void) {
    struct Token tokens[MAX_TOKENS] = {0};
    uintptr_t n_tokens = 0;
    int dbg = false;  // set to a non-zero value to trace each character class

    for (char* start = input; *start != '\0'; ++start) {
        switch (*start)
        {
        case ' ':
        case '\n':
            if (dbg) printf("WS\n");
            break;
        case '(':
            if (dbg) printf("LPAR\n");
            if (n_tokens == MAX_TOKENS) goto fail;
            // Members not named here (the payload union) are zero-initialized.
            tokens[n_tokens] = (struct Token){ .tag = LPAR };
            n_tokens += 1;
            break;
        case ')':
            if (dbg) printf("RPAR\n");
            if (n_tokens == MAX_TOKENS) goto fail;
            tokens[n_tokens] = (struct Token){ .tag = RPAR };
            n_tokens += 1;
            break;
        default:
            // Any other character begins a symbol. (The original note on this
            // case read "must be after !"; its intent is not evident here.)
            if (dbg) printf("SYM\n");
            if (n_tokens == MAX_TOKENS) goto fail;
            parse_sym(tokens, &n_tokens, start);
            // parse_sym consumed the whole symbol (len >= 1). Move to its last
            // character so the loop's ++start lands on the delimiter after it;
            // otherwise every suffix of the symbol would be tokenized again.
            start += tokens[n_tokens - 1].data.sym.len - 1;
            break;
        }
    }

    putchar('[');
    for (uintptr_t i = 0; i != n_tokens; i++) {
        print_token(&tokens[i]);
        if (i + 1 != n_tokens) {
            // separator between tokens, but not after the last one
            putchar(',');
            putchar(' ');
        }
    }
    putchar(']');
    putchar('\n');
    return 0;

fail:
    printf("RAN OUT OF TOKEN SPACE\n");
    return 1;
}
// Scans one symbol and appends it to `tokens` as a SYM token.
//
// tokens   - token array to write into
// n_tokens - in/out: index of the next free slot; incremented by one on return
// start    - first character of the symbol; it is always taken as part of the
//            symbol, so the caller must not pass a delimiter or the terminator
//
// The symbol extends up to (not including) the next space, newline, '(' , ')'
// or the end of the string. The stored token points into the caller's text and
// records the length; nothing is copied.
//
// No capacity check is done here: the caller must ensure that
// tokens[*n_tokens] is a valid slot before calling.
void parse_sym
( struct Token* tokens,
  uintptr_t* n_tokens,
  char* start
)
{
    uintptr_t len = 1;  // start[0] belongs to the symbol unconditionally

    for (;;) {
        char c = start[len];
        // Stop at the terminator as well, so a symbol that ends the input
        // does not run past the end of the string.
        if (c == '\0' || c == ' ' || c == '\n' || c == '(' || c == ')') {
            break;
        }
        len += 1;
    }

    tokens[*n_tokens] =
        (struct Token)
        { .tag = SYM,
          .data.sym = { .ptr = start, .len = len }
        };
    *n_tokens += 1;
}
// Writes a textual form of `token` to stdout, without a trailing separator:
//   LPAR             -> LPAR
//   RPAR             -> RPAR
//   SYM              -> SYM="<text>"  (exactly `len` characters of the symbol)
//   any other tag    -> INVALID TOKEN followed by a newline
void print_token(struct Token* token) {
    int kind = token->tag;

    if (kind == LPAR) {
        printf("LPAR");
        return;
    }
    if (kind == RPAR) {
        printf("RPAR");
        return;
    }
    if (kind == SYM) {
        char* text = token->data.sym.ptr;
        uintptr_t count = token->data.sym.len;

        printf("SYM=\"");
        // The symbol is a slice, not a NUL-terminated string, so emit it
        // character by character using its recorded length.
        for (uintptr_t i = 0; i < count; i++) {
            putchar(text[i]);
        }
        printf("\"");
        return;
    }

    printf("INVALID TOKEN\n");
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment