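// A tiny C lexer with #include expansion.
//
// Pipeline: read the file, normalize line endings to LF, merge
// backslash-continued lines, strip comments, tokenize, then recursively
// splice in the tokens of every #include'd file. Each token prints as
// `(file TYPE value start-end)`, e.g. `(test.c IDENTIFIER int 0-3)`.
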
#define _CRT_SECURE_NO_WARNINGS
#define _CRT_NONSTDC_NO_DEPRECATE

#include <ctype.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <direct.h> // chdir on Windows
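
// Reads the whole file into a malloc'd, NUL-terminated buffer. Returns NULL on
// failure; on success, optionally reports the byte count through file_size.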
char *read_entire_file_to_memory_and_nul_terminate(const char *filename, size_t *file_size)
{
    FILE *file = fopen(filename, "rb");
    if (file == NULL) {
        return NULL;
    }
    fseek(file, 0, SEEK_END);
    long len = ftell(file);
    if (len < 0) {
        fclose(file);
        return NULL;
    }
    size_t sz = (size_t)len;
    rewind(file);
    char *buf = (char *)malloc(sz + 1);
    if (buf == NULL) {
        perror("Failed to allocate memory");
        fclose(file);
        return NULL;
    }
    size_t bytes_read = fread(buf, sizeof(char), sz, file);
    if (bytes_read != sz) {
        perror("Failed to read file");
        free(buf);
        fclose(file);
        return NULL;
    }
    fclose(file);
    buf[bytes_read] = '\0';
    if (file_size) { *file_size = sz; }
    return buf;
}
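
// Collapses CRLF and lone CR line endings to a single LF, in place.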
void normalize_line_endings(char *buffer)
{
    char *src = buffer;
    char *dst = buffer;
    while (*src) {
        if (*src == '\r') {
            if (*(src + 1) == '\n') { src++; }
            *dst++ = '\n';
        } else {
            *dst++ = *src;
        }
        src++;
    }
    *dst = '\0';
}
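
// Splices physical lines that end in a backslash into one logical line, as the
// preprocessor does. Assumes line endings have already been normalized to LF.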
void merge_lines_ending_in_backslash(char *buffer)
{
    char *src = buffer;
    char *dst = buffer;
    while (*src) {
        if (*src == '\\' && (*(src + 1) == '\n')) {
            src += 2; // Move past the backslash and the LF
        } else {
            *dst++ = *src++;
        }
    }
    *dst = '\0';
}
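
// Replaces each // and /* */ comment with a single space, leaving string and
// character literals (and the header name of an #include directive) untouched.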
void remove_comments(char *buffer)
{
    char *src = buffer;
    char *dst = buffer;
    while (*src) {
        // Copy string and character literals verbatim, honoring escape sequences
        if (*src == '"' || *src == '\'') {
            char quote = *src;
            *dst++ = *src++;
            while (*src && *src != quote) {
                if (*src == '\\' && *(src + 1)) { *dst++ = *src++; } // Copy the backslash, then the escaped char
                *dst++ = *src++;
            }
            if (*src == quote) { *dst++ = *src++; }
        }
        // Copy #include directives verbatim so a '/' in a <...> header name survives
        else if (*src == '#') {
            *dst++ = *src++;
            while (*src && isspace((unsigned char)*src)) { *dst++ = *src++; } // Copy spaces after '#'
            if (strncmp(src, "include", 7) == 0) {
                memmove(dst, src, 7);
                dst += 7; src += 7;
                while (*src && isspace((unsigned char)*src)) { *dst++ = *src++; } // Copy spaces after 'include'
                if (*src == '<' || *src == '"') {
                    char end_char = *src == '<' ? '>' : '"';
                    *dst++ = *src++;
                    while (*src && *src != end_char) { *dst++ = *src++; } // Copy until '>' or '"'
                    if (*src == end_char) { *dst++ = *src++; }
                }
            }
        }
        // Multi-line comments
        else if (*src == '/' && *(src + 1) == '*') {
            src += 2; // Move past "/*"
            while (*src && !(*src == '*' && *(src + 1) == '/')) { src++; } // Skip the comment body
            if (*src) { src += 2; } // Move past "*/"
            *dst++ = ' ';
        }
        // Single-line comments
        else if (*src == '/' && *(src + 1) == '/') {
            src += 2; // Skip past "//"
            while (*src && *src != '\n') { src++; } // Skip to the end of the line
            *dst++ = ' ';
        }
        else {
            *dst++ = *src++;
        }
    }
    *dst = '\0';
}
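
// Token kinds and storage. Values longer than MAX_TOKEN_LENGTH - 1 bytes are
// truncated. TOKEN_EOF is declared but never emitted; tokenize() simply stops
// at the terminating NUL.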
#define MAX_TOKEN_LENGTH 256

typedef enum {
    TOKEN_IDENTIFIER,
    TOKEN_NUMBER,
    TOKEN_STRING,
    TOKEN_HEADER,
    TOKEN_PUNCTUATION,
    TOKEN_OTHER,
    TOKEN_EOF,
} token_type_t;

typedef struct token_t {
    token_type_t type;
    char file[MAX_TOKEN_LENGTH];
    char value[MAX_TOKEN_LENGTH];
    size_t start;
    size_t end;
} token_t;

typedef struct token_node_t {
    token_t token;
    struct token_node_t *prev;
    struct token_node_t *next;
} token_node_t;

typedef struct token_list_t {
    token_node_t *head;
    token_node_t *tail;
} token_list_t;
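
// Appends a zero-initialized node to the list and returns it, or NULL if the
// allocation fails.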
token_node_t *append_token(token_list_t *list)
{
    token_node_t *new_node = (token_node_t *)malloc(sizeof(token_node_t));
    if (new_node == NULL) {
        perror("Failed to allocate memory for token node");
        return NULL;
    }
    memset(new_node, 0, sizeof(token_node_t));
    if (list->tail == NULL) {
        list->head = new_node;
        list->tail = new_node;
    } else {
        list->tail->next = new_node;
        new_node->prev = list->tail;
        list->tail = new_node;
    }
    return new_node;
}
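
// Frees every node in the list. Token values are stored inline, so no other
// cleanup is needed.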
void destroy_token_list(token_list_t list)
{
    token_node_t *head = list.head;
    token_node_t *tmp;
    while (head != NULL) {
        tmp = head;
        head = head->next;
        free(tmp);
    }
}
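
// Splits a preprocessed (comment-free, LF-only) string into tokens. start/end
// are byte offsets into str.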
token_list_t tokenize(const char *str)
{
    token_list_t list;
    list.head = NULL;
    list.tail = NULL;
    // Within each group, longer operators precede their prefixes so the greedy
    // match below picks e.g. ">>=" over ">>" and ">"
    const char *punctuation[] = {
        "~", "}", "||", "|=", "|", "{", "^=", "^", "]", "[", "?", ">>=", ">>", ">=", ">", "==",
        "=", "<=", "<<=", "<<", "<", ";", "::", ":", "/=", "/", "...", ".", "->", "-=", "--",
        "-", ",", "+=", "++", "+", "*=", "*", ")", "(", "&=", "&&", "&", "%=", "%", "##", "#",
        "!=", "!" };
    const int punctuation_len = sizeof(punctuation) / sizeof(punctuation[0]);
    const char *c = str;
    for (;;)
    {
        // Skip whitespace
        while (*c && isspace((unsigned char)*c)) { c++; }
        // Handle EOF
        if (*c == '\0') { return list; }
        token_node_t *node = append_token(&list);
        if (node == NULL) { return list; } // Out of memory; return what we have so far
        token_t *token = &node->token;
        // Handle numbers (pp-number style: digits, letters, '_', and '.')
        if (isdigit((unsigned char)*c) || (*c == '.' && isdigit((unsigned char)*(c + 1)))) {
            const char *start = c;
            while (isalnum((unsigned char)*c) || *c == '_' || *c == '.') {
                c++;
            }
            // Consume a signed exponent such as 1e+10 or 0x1p-3
            if ((*c == '+' || *c == '-') && (*(c - 1) == 'e' || *(c - 1) == 'E' || *(c - 1) == 'p' || *(c - 1) == 'P')) {
                c++; // Just the sign; the digits are picked up below
                while (isalnum((unsigned char)*c) || *c == '_' || *c == '.') {
                    c++;
                }
            }
            size_t length = c - start;
            if (length >= MAX_TOKEN_LENGTH) { length = MAX_TOKEN_LENGTH - 1; } // Truncate to fit token->value
            token->type = TOKEN_NUMBER;
            strncpy(token->value, start, length);
            token->value[length] = '\0';
            token->start = start - str;
            token->end = c - str;
            continue;
        }
        // Handle identifiers
        if (isalpha((unsigned char)*c) || *c == '_') {
            const char *start = c;
            while (isalnum((unsigned char)*c) || *c == '_') {
                c++;
            }
            size_t length = c - start;
            if (length >= MAX_TOKEN_LENGTH) { length = MAX_TOKEN_LENGTH - 1; }
            token->type = TOKEN_IDENTIFIER;
            strncpy(token->value, start, length);
            token->value[length] = '\0';
            token->start = start - str;
            token->end = c - str;
            continue;
        }
        // Determine if it's a header name (only directly after '#' 'include')
        if (*c == '<' || *c == '"') {
            if (node->prev != NULL && node->prev->prev != NULL &&
                strcmp(node->prev->prev->token.value, "#") == 0 && strcmp(node->prev->token.value, "include") == 0) {
                const char *start = c;
                c++;
                char quote = *start == '<' ? '>' : '"';
                while (*c && *c != quote) { c++; }
                if (*c == quote) { c++; }
                size_t length = c - start;
                if (length >= MAX_TOKEN_LENGTH) { length = MAX_TOKEN_LENGTH - 1; }
                token->type = TOKEN_HEADER;
                strncpy(token->value, start, length);
                token->value[length] = '\0';
                token->start = start - str;
                token->end = c - str;
                continue;
            }
        }
        // Handle string and character literals
        if (*c == '"' || *c == '\'') {
            const char *start = c;
            c++;
            char quote = *start;
            while (*c && *c != quote) {
                if (*c == '\\' && *(c + 1)) {
                    c += 2; // Skip any escape sequence, including \" and \'
                } else {
                    c++;
                }
            }
            if (*c == quote) { c++; } // Don't run past the NUL of an unterminated literal
            size_t length = c - start;
            if (length >= MAX_TOKEN_LENGTH) { length = MAX_TOKEN_LENGTH - 1; }
            token->type = TOKEN_STRING;
            strncpy(token->value, start, length);
            token->value[length] = '\0';
            token->start = start - str;
            token->end = c - str;
            continue;
        }
        // Handle punctuation
        bool is_punctuation = false;
        for (int i = 0; i < punctuation_len; i++) {
            size_t length = strlen(punctuation[i]);
            if (strncmp(c, punctuation[i], length) == 0) {
                token->type = TOKEN_PUNCTUATION;
                strncpy(token->value, c, length);
                token->value[length] = '\0';
                token->start = c - str;
                token->end = c - str + length;
                c += length;
                is_punctuation = true;
                break;
            }
        }
        if (is_punctuation) { continue; }
        // Handle everything else as single-character tokens
        token->type = TOKEN_OTHER;
        token->value[0] = *c++;
        token->value[1] = '\0';
        token->start = c - str - 1;
        token->end = c - str;
    }
}
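
// Loads a file, runs the preprocessing passes, tokenizes the result, and tags
// every token with the file name. Returns an empty list if the file can't be read.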
token_list_t tokenize_file(const char *file)
{
    token_list_t list;
    list.head = list.tail = NULL;
    char *code = read_entire_file_to_memory_and_nul_terminate(file, NULL);
    if (code == NULL) {
        perror("Failed to load file");
    } else {
        normalize_line_endings(code);
        merge_lines_ending_in_backslash(code);
        remove_comments(code);
        list = tokenize(code);
        for (token_node_t *it = list.head; it != NULL; it = it->next) {
            snprintf(it->token.file, sizeof(it->token.file), "%s", file); // Guards against paths longer than the buffer
        }
        free(code);
    }
    return list;
}
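
// Prints one token as `(file TYPE value start-end)`.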
void print_token(token_t token)
{
    const char *type_str;
    switch (token.type) {
        case TOKEN_IDENTIFIER:  type_str = "IDENTIFIER";  break;
        case TOKEN_NUMBER:      type_str = "NUMBER";      break;
        case TOKEN_STRING:      type_str = "STRING";      break;
        case TOKEN_HEADER:      type_str = "HEADER";      break;
        case TOKEN_PUNCTUATION: type_str = "PUNCTUATION"; break;
        case TOKEN_OTHER:       type_str = "OTHER";       break;
        case TOKEN_EOF:         type_str = "EOF";         break;
        default:                type_str = "UNKNOWN";
    }
    printf("(%s %s %s %zu-%zu)\n", token.file, type_str, token.value, token.start, token.end);
}
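
// Tokenizes a file, then splices the tokens of every `# include` directive's
// target (resolved against the current working directory) in place of the
// directive itself, recursively, and prints the final token stream.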
void preprocess_file(const char *file)
{
    token_list_t token_list = tokenize_file(file);
    token_node_t *current = token_list.head;
    while (current) {
        token_node_t *next = current->next;
        if (current->token.type == TOKEN_PUNCTUATION && strcmp(current->token.value, "#") == 0 &&
            next && next->token.type == TOKEN_IDENTIFIER && strcmp(next->token.value, "include") == 0 &&
            next->next && next->next->token.type == TOKEN_HEADER) {
            token_node_t *header = next->next;
            char path[MAX_TOKEN_LENGTH];
            strcpy(path, header->token.value + 1); // Drop the opening '<' or '"'
            path[strlen(path) - 1] = '\0';         // Drop the closing '>' or '"'
            token_list_t new_tokens = tokenize_file(path);
            if (new_tokens.head) {
                // Splice the included tokens in place of the '#', 'include', and
                // header tokens, keeping prev links and the list head/tail consistent
                token_node_t *before = current->prev;
                token_node_t *after = header->next;
                if (before) { before->next = new_tokens.head; } else { token_list.head = new_tokens.head; }
                new_tokens.head->prev = before;
                new_tokens.tail->next = after;
                if (after) { after->prev = new_tokens.tail; } else { token_list.tail = new_tokens.tail; }
                free(header);
                free(next);
                free(current);
                next = new_tokens.head; // Rescan the spliced tokens for nested includes
            } else {
                next = header->next; // Couldn't read the header; leave the directive as-is
            }
        }
        current = next;
    }
    for (token_node_t *it = token_list.head; it != NULL; it = it->next) {
        print_token(it->token);
    }
    destroy_token_list(token_list);
}
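
// Note: the chdir("..") suggests this file sits one directory below the
// sources it exercises; test.c is assumed to live in that parent directory.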
int main(void)
{
    chdir("..");
    preprocess_file("test.c");
    return 0;
}