/* radare - LGPL - Copyright 2007-2021 - pancake */
#if 0
Very simple code parser in C
============================
Takes a string representing the code and runs a callback every time a token is found
r_codetok ("string", callback, &userdata);
#endif
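/* Usage sketch (hypothetical names: the entry point implemented below is
 * tokenize(), and print_token is only an illustrative callback, not part of
 * r_util):
 *
 *   static bool print_token(RTokenizer *t) {
 *       eprintf ("%.*s\n", (int)(t->end - t->begin), t->buf + t->begin);
 *       return true;
 *   }
 *
 *   tokenize ("int a = 3;", print_token, NULL);
 */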
#include <r_util.h>
#define D if(0)
typedef enum {
	R_TOKEN_NONE,
	R_TOKEN_INT,
	R_TOKEN_FLOAT,
	R_TOKEN_WORD,
	R_TOKEN_HASH,
	R_TOKEN_STRING,
	R_TOKEN_COMMENT,
	R_TOKEN_MATH,
	R_TOKEN_GROUP,
	R_TOKEN_BEGIN,
	R_TOKEN_END
} TokenType;
static const char *tokentypes[] = {
	"none", "intn", "flot", "word", "hash", "strn", "cmnt", "math", "grup", "begin", "end", NULL
};
typedef bool (*RTokenizerCallback)(struct r_tokenizer_t *tok);
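/* Tokenizer state: buf is the input string; begin/end are byte offsets of the
 * current token inside buf; ch holds the token's first character (e.g. the
 * opening quote); indent tracks (){}[] nesting; hex/escape are per-token
 * scanning flags. The callback, when set, is invoked from end_token() once
 * per emitted token. */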
typedef struct r_tokenizer_t {
	bool hex;
	bool escape;
	const char *buf;
	char ch;
	size_t begin;
	int indent;
	size_t end;
	TokenType type;
	RTokenizerCallback cb;
	void *user;
} RTokenizer;
R_API RTokenizer *r_tokenizer_new(void) {
	RTokenizer *t = R_NEW0 (RTokenizer);
	return t;
}
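/* A token can start at any character that is not whitespace or a newline. */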
static bool is_token_begin(RTokenizer *tok, char ch) {
	return !(IS_WHITESPACE (ch) || ch == '\n');
}
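/* Emit the token delimited by tok->begin..tok->end: through the user callback
 * when one is registered, otherwise as a "[type] text" line on stderr. */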
static bool end_token(RTokenizer *tok) {
	const char* tt = tokentypes[tok->type];
	const int tok_len = (int)(tok->end - tok->begin);
	const char* tok_buf = tok->buf + tok->begin;
	const char *r = r_str_pad (' ', tok->indent * 4);
	if (tok->cb) {
		return tok->cb (tok);
	}
	eprintf ("[%s]%s%.*s%c", tt, r, tok_len, tok_buf, 10);
	return true;
}
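/* Classify a token by its first character. Returns true when that character
 * is a delimiter that is skipped and excluded from the token text (quotes,
 * '#'); group and math characters become one-character tokens on their own. */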
static bool start_token(RTokenizer *tok, char ch) {
	switch (ch) {
	case '\'':
	case '"':
		tok->type = R_TOKEN_STRING;
		tok->ch = ch;
		return true;
	case '/':
		tok->type = R_TOKEN_COMMENT;
		break;
	case '(':
	case '{':
	case '[':
		tok->indent++;
		tok->type = R_TOKEN_GROUP;
		return false;
	case ')':
	case '}':
	case ']':
		tok->indent--;
		tok->type = R_TOKEN_GROUP;
		return false;
	case '#':
		tok->type = R_TOKEN_HASH;
		return true;
	case '<':
	case '>':
	case '=':
	case '+':
	case '-':
	case '*':
	case '?':
	case '|':
	case '&':
	case '%':
	case '^':
	case ':':
	case ';':
	case ',':
	case '.':
		tok->type = R_TOKEN_MATH;
		return false;
	}
	if (isalpha (ch)) {
		tok->type = R_TOKEN_WORD;
	}
	if (ch >= '0' && ch <= '9') {
		tok->type = R_TOKEN_INT;
	}
	return false;
}
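/* Decide whether ch extends the current token. Strings honour backslash
 * escapes and close on the matching quote; integers may turn into hex (after
 * 'x') or float (on '.'); "//" comments run to the end of the line, while a
 * lone '/' is reclassified as math; group and math tokens never grow past one
 * character. */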
static bool is_token_char(RTokenizer *tok, char ch) {
	switch (tok->type) {
	case R_TOKEN_NONE:
		// ERROR
		return false;
	case R_TOKEN_HASH:
		return (isdigit (ch) || ch == '#' || ch == '_') || (isalpha (ch) && !IS_WHITESPACE (ch));
	case R_TOKEN_COMMENT:
		if (tok->end - tok->begin == 0) {
			if (ch != '/') {
				tok->type = R_TOKEN_MATH;
				return false;
			}
		}
		return (ch != '\n');
	case R_TOKEN_WORD:
		return (isdigit (ch) || ch == '#' || ch == '_') || (isalpha (ch) && !IS_WHITESPACE (ch));
	case R_TOKEN_INT:
		if (ch == 'x') {
			tok->hex = true;
			return true;
		}
		if (ch == '.') {
			tok->type = R_TOKEN_FLOAT;
			return true;
		}
		if (tok->hex) {
			if (ch >= 'a' && ch <= 'f') {
				return true;
			}
			if (ch >= 'A' && ch <= 'F') {
				return true;
			}
		}
		return ch >= '0' && ch <= '9';
	case R_TOKEN_FLOAT:
		return isdigit (ch) || ch == 'f'; // XXX 'f' is the last char
	case R_TOKEN_STRING:
		if (tok->escape) {
			tok->escape = false;
		} else {
			if (ch == tok->ch) {
				return false;
			}
			if (ch == '\\') {
				tok->escape = true;
			}
		}
		return true;
	case R_TOKEN_GROUP:
	case R_TOKEN_MATH:
		// those are one char tokens
		return false;
	}
	return false;
}
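/* Scan buf from start to end: skip whitespace, classify the next token with
 * start_token(), grow it with is_token_char(), then report it via end_token().
 * Synthetic BEGIN and END tokens bracket the whole stream. */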
static void tokenize(const char *buf, RTokenizerCallback cb, void *user) {
	eprintf ("tokenize(%s)%c", buf, 10);
	RTokenizer *tok = R_NEW0 (RTokenizer);
	tok->cb = cb;
	tok->user = user;
	size_t i = 0;
	size_t len = strlen (buf);
	tok->buf = buf;
	tok->type = R_TOKEN_BEGIN;
	end_token (tok);
	while (i < len) {
		tok->hex = false;
		tok->type = R_TOKEN_NONE;
		while (i < len && !is_token_begin (tok, buf[i])) {
			i++;
		}
		if (i == len) {
			break;
		}
		tok->ch = buf[i];
		tok->begin = i;
		tok->end = i;
		if (start_token (tok, buf[i])) {
			tok->begin++;
			i++;
		}
		while (i < len && is_token_char (tok, buf[i])) {
			i++;
			tok->end = i;
		}
		if (tok->type == R_TOKEN_GROUP) {
			tok->end = i;
			i++;
		} else if (tok->type == R_TOKEN_MATH) {
			i++;
			tok->end = i;
		} else {
			tok->end = i;
		}
		if (tok->type == R_TOKEN_STRING) {
			i++;
		}
		if (tok->type != R_TOKEN_NONE) {
			end_token (tok);
		} else {
			i++;
		}
	}
	tok->type = R_TOKEN_END;
	end_token (tok);
	free (tok); // release the tokenizer state
}
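/* Per-run state for the demo callback below: the last word seen, parenthesis
 * depth, flags for switch/case, assignments and return statements, the list
 * of collected call arguments, the expression text being accumulated, and an
 * optional PJ builder that mirrors the trace as JSON. */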
typedef struct {
	char* word;
	int parlevel;
	bool inswitch;
	bool incase;
	bool inassign;
	bool inreturn;
	RList *args;
	char *s;
	PJ *pj;
} Data;
static void indent(RTokenizer *tok) {
	Data *data = tok->user;
	int n = 2 * ((tok->type == R_TOKEN_GROUP)? (tok->indent - 1): tok->indent);
	if (data->incase) {
		n++;
	}
	eprintf ("%s", r_str_pad (' ', n));
}
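/* Demo callback: turns the token stream into a rough trace of the program
 * structure (DIRECTIVE, FUNC, CALL, ASSIGN, RETURN, CASE, ...) on stderr and,
 * when data->pj is set, mirrors it into a JSON document. */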
bool callback(RTokenizer *tok) {
	Data *data = tok->user;
	switch (tok->type) {
	case R_TOKEN_HASH:
		{
			char *h = r_str_ndup (tok->buf + tok->begin, tok->end - tok->begin);
			if (data->pj) {
			} else {
				eprintf ("DIRECTIVE (%s)%c", h, 10);
			}
			free (h);
		}
		break;
	case R_TOKEN_WORD:
		free (data->word);
		data->word = r_str_ndup (tok->buf + tok->begin, tok->end - tok->begin);
		// eprintf ("WORD (%s)%c", data->word, 10);
		if (data->incase) {
			// eprintf ("CASE WORD (%s)%c", data->word, 10);
			break;
		}
		if (!strcmp (data->word, "case")) {
			R_FREE (data->word);
			data->incase = true;
			break;
		}
		if (!strcmp (data->word, "default")) {
			break;
		}
		if (!strcmp (data->word, "return")) {
			if (data->pj) {
				pj_o (data->pj);
				pj_ks (data->pj, "node", "return");
			} else {
				indent (tok);
				eprintf ("RETURN%c", 10);
			}
			R_FREE (data->s);
			data->inreturn = true;
			return false;
		}
		if (!strcmp (data->word, "break")) {
			break;
		}
		if (data->s) {
			data->s = r_str_append (data->s, " ");
		}
		data->s = r_str_appendlen (data->s, tok->buf + tok->begin, tok->end - tok->begin);
		// eprintf ("WORD(%s)%c", data->word, 10);
		break;
	case R_TOKEN_STRING:
		{
			char *word = r_str_ndup (tok->buf + tok->begin, tok->end - tok->begin);
			// eprintf ("STRING(%s)%c", word, 10);
			free (word);
		}
		if (data->s) {
			data->s = r_str_append (data->s, " ");
		}
		data->s = r_str_appendlen (data->s, tok->buf + tok->begin, tok->end - tok->begin);
		break;
	case R_TOKEN_GROUP:
		if (data->inassign) {
			break;
		}
		switch (tok->ch) {
		case '}':
			R_FREE (data->s);
			if (data->pj) {
				pj_end (data->pj);
				pj_end (data->pj);
			}
			break;
		}
		if (tok->ch == ')') {
			data->parlevel--;
			data->s = r_str_appendlen (data->s, tok->buf + tok->begin, tok->end - tok->begin);
			if (data->args) {
				r_list_append (data->args, data->s);
				data->s = NULL;
				char *arg;
				RListIter *iter;
				r_list_foreach (data->args, iter, arg) {
					if (arg) {
						eprintf ("%s", r_str_pad (' ', (tok->indent + 1) * 2));
						eprintf (" - %s%c", arg, 10);
						if (data->pj) {
							char *lz = r_str_rchr (arg, NULL, ' ');
							if (lz) {
								*lz++ = 0;
								pj_o (data->pj);
								pj_ks (data->pj, "name", lz);
								pj_ks (data->pj, "type", arg);
								pj_end (data->pj);
							} else {
								pj_s (data->pj, arg);
								pj_s (data->pj, arg);
							}
						}
					}
				}
				r_list_free (data->args);
				data->args = NULL;
				if (data->pj) {
					pj_end (data->pj);
				}
			}
		} else if (tok->ch == '{') {
			if (data->word) {
				if (!strcmp (data->word, "else")) {
					indent (tok);
					eprintf ("ELSE %d%c", tok->indent, 10);
					r_list_free (data->args);
					data->args = NULL;
					R_FREE (data->s);
				}
			}
			if (data->pj) {
				pj_ka (data->pj, "body");
			}
		} else if (tok->ch == '(') {
			data->parlevel++;
			if (data->word) {
				if (!strcmp (data->word, "if")) {
					indent (tok);
					eprintf ("IF %d%c", tok->indent, 10);
				} else if (!strcmp (data->word, "switch")) {
					data->inswitch = true;
					indent (tok);
					eprintf ("SWITCH%c", 10);
					R_FREE (data->word);
				} else {
					if (tok->indent == 1) {
						if (data->pj) {
							pj_ko (data->pj, data->word);
							pj_ks (data->pj, "type", "symbol");
							pj_ks (data->pj, "name", data->word);
							pj_ka (data->pj, "args");
						} else {
							eprintf ("FUNC (%s)%c", data->word, 10);
						}
					} else {
						if (data->pj) {
							pj_o (data->pj);
							pj_ks (data->pj, "type", "call");
							pj_ks (data->pj, "name", data->word);
							pj_ka (data->pj, "args");
						} else {
							indent (tok);
							eprintf ("CALL (%s)%c", data->word, 10);
						}
					}
				}
			}
			R_FREE (data->s);
			if (data->word) {
				data->args = r_list_newf (free);
			}
			R_FREE (data->word);
		}
		break;
	case R_TOKEN_INT:
	case R_TOKEN_FLOAT:
		if (data->incase || data->inassign) {
			R_FREE (data->word);
			data->s = r_str_appendlen (data->s, tok->buf + tok->begin, tok->end - tok->begin);
			data->incase = false;
			// data->inassign = false;
			break;
		} else {
			if (!data->s) {
				data->s = r_str_ndup (tok->buf + tok->begin, tok->end - tok->begin);
			}
		}
		if (data->incase) {
			char *s = r_str_ndup (tok->buf + tok->begin, tok->end - tok->begin);
			// data->s = r_str_appendlen (data->s, tok->buf + tok->begin, tok->end - tok->begin);
			indent (tok);
			eprintf ("CASE (%s)%c", s, 10);
			data->incase = false;
			R_FREE (data->word);
		}
		// fallthru
	case R_TOKEN_MATH:
		if (data->incase) {
			data->incase = false;
			R_FREE (data->word);
		}
		switch (tok->ch) {
		case '=':
			// eprintf ("PAR %d %c", data->parlevel, 10);
			if (data->parlevel == 0) {
				indent (tok);
				eprintf ("ASSIGN (%s)%c", data->word, 10);
				data->inassign = true;
				if (data->pj) {
					pj_o (data->pj);
					pj_ks (data->pj, "node", "assign");
					pj_ks (data->pj, "var", data->word);
				}
				R_FREE (data->word);
				R_FREE (data->s);
			}
			break;
		case ':':
			if (data->word) {
				// eprintf ("CASE %s%c", data->word, 10);
				break;
			}
			// fallthrough
		case ';':
			if (data->inreturn) {
				indent (tok);
				eprintf ("-- ARG (%s)%c", data->s, 10);
				if (data->pj) {
					pj_ks (data->pj, "value", data->s);
					pj_end (data->pj);
				}
			}
			if (data->inassign) {
				indent (tok);
				eprintf ("-- ARG (%s)%c", data->s, 10);
				data->inassign = false;
				if (data->pj) {
					pj_ks (data->pj, "value", data->s);
					pj_end (data->pj);
				}
			}
			R_FREE (data->word);
			break;
		case '*':
		case '+':
		case '-':
		case '%':
		case '&':
		case '|':
		case '<':
		case '>':
			R_FREE (data->word);
			data->s = r_str_appendlen (data->s, tok->buf + tok->begin, tok->end - tok->begin);
			break;
		case ',':
			if (data->s) {
				r_list_append (data->args, data->s);
				data->s = NULL;
			}
			R_FREE (data->word);
			break;
		}
		// eprintf ("ARG%c%c", tok->ch, 10);
		break;
	case R_TOKEN_BEGIN:
	case R_TOKEN_END:
		// free the data
		eprintf ("DONE%c", 10);
		break;
	}
	return true;
}
//
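/* Demo driver: tokenize a few inline snippets with the default stderr
 * printer, then parse "a.c" with the JSON-building callback and print the
 * resulting document. */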
int main() {
	tokenize ("Hello World", NULL, NULL);
	tokenize ("hello('this', 33, true);", NULL, NULL);
	tokenize (
		" // hello world this is very new\n"
		" int main(int argc, char **argv) {\n"
		" printf (\"Hello %s\", \"world\");}\n"
		" }\n"
		, NULL, NULL
	);
	Data data = {0};
	char *s = r_file_slurp ("a.c", NULL);
	data.pj = pj_new ();
	pj_o (data.pj);
	tokenize (s, callback, &data);
	pj_end (data.pj);
	char *o = pj_drain (data.pj);
	printf ("%s%c", o, 10);
	free (o);
	free (s);
}
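/* Build sketch (an assumption, not part of the gist): with the radare2
 * development headers installed and its r_util pkg-config file available,
 * something like `gcc tok.c $(pkg-config --cflags --libs r_util)` should
 * compile this snippet; "tok.c" is a hypothetical filename. */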
Output:

tokenize(
int main(int64_t arg1, int64_t arg2)
{
int a = 33;
a = 123;
return a;
}
int jaja() {
return 3999;
}
)
DONE
FUNC (main)
- int64_t arg1
- int64_t arg2
ASSIGN (a)
-- ARG (33)
ASSIGN (a)
-- ARG (123)
RETURN
-- ARG (a)
FUNC (jaja)
RETURN
-- ARG (3999)
DONE
$