Skip to content

Instantly share code, notes, and snippets.

@waldnercharles
Last active October 15, 2024 06:15
Show Gist options
  • Save waldnercharles/8cfcb6e87f4a17244390cc53bebe5818 to your computer and use it in GitHub Desktop.
#define _CRT_SECURE_NO_WARNINGS
#define _CRT_NONSTDC_NO_DEPRECATE
#include <ctype.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <direct.h>
/* Read the whole file at `filename` into a freshly malloc'd, NUL-terminated
 * buffer. On success returns the buffer (caller frees) and, if `file_size`
 * is non-NULL, stores the byte count (excluding the NUL). Returns NULL on
 * any open/seek/read/allocation failure. */
char *read_entire_file_to_memory_and_nul_terminate(const char *filename, size_t *file_size)
{
    FILE *file = fopen(filename, "rb");
    if (file == NULL) {
        return NULL;
    }
    /* fix: fseek/ftell failures were unchecked; ftell returns -1L on error,
     * which the old code silently converted to a huge size_t. */
    if (fseek(file, 0, SEEK_END) != 0) {
        fclose(file);
        return NULL;
    }
    long pos = ftell(file);
    if (pos < 0) {
        fclose(file);
        return NULL;
    }
    size_t sz = (size_t)pos;
    rewind(file);
    char *buf = malloc(sz + 1); /* no cast needed in C */
    if (buf == NULL) {
        perror("Failed to allocate memory");
        fclose(file);
        return NULL;
    }
    size_t bytes_read = fread(buf, 1, sz, file);
    if (bytes_read != sz) {
        perror("Failed to read file");
        free(buf);
        fclose(file);
        return NULL;
    }
    buf[bytes_read] = '\0';
    fclose(file); /* fix: original leaked the FILE handle on the success path */
    if (file_size) { *file_size = sz; }
    return buf;
}
/* Rewrite `buffer` in place so that every line ending becomes a single LF:
 * CRLF pairs collapse to '\n' and lone CRs become '\n'. The buffer can only
 * shrink, so the compaction is done with read/write indices. */
void normalize_line_endings(char *buffer)
{
    size_t rd = 0;
    size_t wr = 0;
    while (buffer[rd] != '\0') {
        if (buffer[rd] == '\r') {
            buffer[wr++] = '\n';
            /* Consume the '\n' of a CRLF pair along with the '\r'. */
            rd += (buffer[rd + 1] == '\n') ? 2 : 1;
        } else {
            buffer[wr++] = buffer[rd++];
        }
    }
    buffer[wr] = '\0';
}
/* Splice physical lines into logical lines: every backslash immediately
 * followed by a newline is removed in place (classic line-continuation).
 * Assumes line endings were already normalized to LF. */
void merge_lines_ending_in_backslash(char *buffer)
{
    size_t rd = 0;
    size_t wr = 0;
    while (buffer[rd] != '\0') {
        if (buffer[rd] == '\\' && buffer[rd + 1] == '\n') {
            rd += 2; /* drop both the backslash and the newline */
        } else {
            buffer[wr++] = buffer[rd++];
        }
    }
    buffer[wr] = '\0';
}
/* Strip // and block comments from `buffer` in place, replacing each comment
 * with a single space. String literals, character literals, and the header
 * name of a #include directive are preserved verbatim.
 *
 * fix: the old code decided "inside a string" by looking one character back
 * for a backslash, so a string ending in an escaped backslash ("a\\") left
 * the scanner stuck in string mode; it also let a '"' inside a character
 * literal ('"') toggle string state. Both are handled with explicit
 * escape / char-literal tracking now. Assumes literals are well-formed. */
void remove_comments(char *buffer)
{
    char *src = buffer;
    char *dst = buffer;
    bool in_string = false; /* inside "..." */
    bool in_char = false;   /* inside '...' */
    bool escaped = false;   /* previous copied byte was an active backslash */
    while (*src) {
        if (in_string || in_char) {
            if (escaped) {
                escaped = false; /* this byte is escaped; copy it blindly */
            } else if (*src == '\\') {
                escaped = true;
            } else if (in_string && *src == '"') {
                in_string = false;
            } else if (in_char && *src == '\'') {
                in_char = false;
            }
            *dst++ = *src++;
            continue;
        }
        if (*src == '"') {
            in_string = true;
            *dst++ = *src++;
            continue;
        }
        if (*src == '\'') {
            in_char = true;
            *dst++ = *src++;
            continue;
        }
        /* Preserve #include <...> / "..." as-is so the header name is not
         * mistaken for a string literal or comment start. */
        if (*src == '#') {
            *dst++ = *src++;
            while (*src && isspace((unsigned char)*src)) { *dst++ = *src++; }
            if (strncmp(src, "include", 7) == 0) {
                memcpy(dst, src, 7);
                dst += 7; src += 7;
                while (*src && isspace((unsigned char)*src)) { *dst++ = *src++; }
                if (*src == '<' || *src == '"') {
                    char end_char = *src == '<' ? '>' : '"';
                    *dst++ = *src++;
                    while (*src && *src != end_char) { *dst++ = *src++; }
                    if (*src == end_char) { *dst++ = *src++; }
                }
            }
            continue;
        }
        /* Block comment: skip to the closing "* /" (or EOF), emit one space. */
        if (*src == '/' && *(src + 1) == '*') {
            src += 2;
            while (*src && !(*src == '*' && *(src + 1) == '/')) { src++; }
            if (*src) { src += 2; }
            *dst++ = ' ';
        }
        /* Line comment: skip to end of line (newline kept), emit one space. */
        else if (*src == '/' && *(src + 1) == '/') {
            src += 2;
            while (*src && *src != '\n') { src++; }
            *dst++ = ' ';
        }
        else {
            *dst++ = *src++;
        }
    }
    *dst = '\0';
}
/* Maximum bytes stored per token value and per filename, including the NUL. */
#define MAX_TOKEN_LENGTH 256
/* Lexical category assigned by tokenize(). */
typedef enum {
TOKEN_IDENTIFIER,
TOKEN_NUMBER,
TOKEN_STRING, /* string or character literal, quotes included */
TOKEN_HEADER, /* <...> or "..." header name directly after "# include" */
TOKEN_PUNCTUATION,
TOKEN_OTHER, /* any single character not matched by the other rules */
TOKEN_EOF, /* NOTE(review): declared but never produced by tokenize() */
} token_type_t;
/* One lexed token plus where it came from. */
typedef struct token_t {
token_type_t type;
char file[MAX_TOKEN_LENGTH]; /* source filename, filled in by tokenize_file() */
char value[MAX_TOKEN_LENGTH]; /* NUL-terminated token text */
size_t start; /* byte offset of the token in its preprocessed buffer */
size_t end; /* one past the token's last byte */
} token_t;
/* Node of a doubly-linked token list; owns its token by value. */
typedef struct token_node_t {
token_t token;
struct token_node_t* prev;
struct token_node_t* next;
} token_node_t;
/* Token list; head == tail == NULL when empty. */
typedef struct token_list_t {
token_node_t *head;
token_node_t *tail;
} token_list_t;
/* Append one zero-initialized node to the end of `list`. On allocation
 * failure a message is printed and the list is left unchanged (callers can
 * detect this by checking whether `list->tail` moved). */
void append_token(token_list_t *list)
{
    /* calloc gives us the zeroed node directly (vs. malloc + memset). */
    token_node_t *node = calloc(1, sizeof *node);
    if (node == NULL) {
        perror("Failed to allocate memory for token node");
        return;
    }
    if (list->tail != NULL) {
        list->tail->next = node;
        node->prev = list->tail;
        list->tail = node;
    } else {
        /* First node: it is both head and tail. */
        list->head = list->tail = node;
    }
}
/* Free every node of `list`. The list struct itself is passed by value and
 * is simply stale afterwards. */
void destroy_token_list(token_list_t list) {
    for (token_node_t *node = list.head; node != NULL; ) {
        token_node_t *next = node->next;
        free(node);
        node = next;
    }
}
token_list_t tokenize(const char *str) {
token_list_t list;
list.head = NULL;
list.tail = NULL;
const char *punctuation[] = {
"~", "}", "||", "|=", "|", "{", "^=", "^", "]", "[", "?", ">>=", ">>", ">=", ">", "==",
"=", "<=", "<<=", "<<", "<", ";", "::", ":", "/=", "/", "...", ".", "->", "-=", "--",
"-", ",", "+=", "++", "+", "*=", "*", ")", "(", "&=", "&&", "&", "%=", "%", "##", "#",
"!=", "!" };
const int punctuation_len = sizeof(punctuation) / sizeof(punctuation[0]);
const char *c = str;
for (;;)
{
// Skip whitespace
while (*c && isspace((unsigned char)*c)) { c++; }
// Handle EOF
if (*c == '\0') { return list; }
append_token(&list);
token_node_t *node = list.tail;
token_t *token = &node->token;
// Handle numbers
if (isdigit((unsigned char)*c) || (*c == '.' && isdigit((unsigned char)*(c + 1)))) {
const char *start = c;
while (isalnum((unsigned char)*c) || *c == '_' || *c == '.') {
c++;
}
if ((*(c) == '+' || *(c) == '-') && (*(c - 1) == 'e' || *(c - 1) == 'E' || *(c - 1) == 'p' || *(c - 1) == 'P')) {
c += 2;
while (isalnum((unsigned char)*c) || *c == '_' || *c == '.') {
c++;
}
}
size_t length = c - start;
token->type = TOKEN_NUMBER;
strncpy(token->value, start, length);
token->value[length] = '\0';
token->start = c - str - length;
token->end = c - str;
continue;
}
// Handle identifiers
if (isalpha(*c) || *c == '_') {
const char *start = c;
while (isalnum(*c) || *c == '_') {
c++;
}
size_t length = c - start;
token->type = TOKEN_IDENTIFIER;
strncpy(token->value, start, length);
token->value[length] = '\0';
token->start = c - str - length;
token->end = c - str;
continue;
}
// Determine if it's a header file name (for #include "..." or #include <...>)
if (*c == '<' || *c == '"') {
if (node->prev != NULL && node->prev->prev != NULL &&
strcmp(node->prev->prev->token.value, "#") == 0 && strcmp(node->prev->token.value, "include") == 0) {
const char *start = c;
c++;
char quote = *start == '<' ? '>' : '"';
while (*c && *c != quote) { c++; }
if (*c == quote) c++;
size_t length = c - start;
token->type = TOKEN_HEADER;
strncpy(token->value, start, length);
token->value[length] = '\0';
token->start = c - str - length;
token->end = c - str;
continue;
}
}
// Handle strings
if (*c == '"' || *c == '\'') {
const char *start = c;
c++;
char quote = *start;
while (*c && *c != quote) {
if (*c == '\\' && (*(c + 1) == '\\' || *(c + 1) == '\'' || *(c + 1) == '"')) {
c += 2;
} else {
c++;
}
}
c++;
size_t length = c - start;
token->type = TOKEN_STRING;
strncpy(token->value, start, length);
token->value[length] = '\0';
token->start = c - str - length;
token->end = c - str;
continue;
}
// Handle punctuation
bool is_punctuation = false;
for (int i = 0; i < punctuation_len; i++) {
size_t length = strlen(punctuation[i]);
if (strncmp(c, punctuation[i], length) == 0) {
token->type = TOKEN_PUNCTUATION;
strncpy(token->value, c, length);
token->value[length] = '\0';
token->start = c - str;
token->end = c - str + length;
c += length;
is_punctuation = true;
break;
}
}
if (is_punctuation) continue;
// Handle everything else
token->type = TOKEN_OTHER;
token->value[0] = *c++;
token->value[1] = '\0';
token->start = c - str - 1;
token->end = c - str;
}
}
/* Load `file`, normalize its line endings, splice continued lines, strip
 * comments, and lex it. Each resulting token is stamped with the filename.
 * Returns an empty list (and prints a message) if the file can't be read. */
token_list_t tokenize_file(const char *file)
{
    token_list_t list;
    list.head = list.tail = NULL;
    char *code = read_entire_file_to_memory_and_nul_terminate(file, NULL);
    if (code == NULL) {
        perror("Failed to load file");
        return list;
    }
    normalize_line_endings(code);
    merge_lines_ending_in_backslash(code);
    remove_comments(code);
    list = tokenize(code);
    for (token_node_t *it = list.head; it != NULL; it = it->next) {
        // fix: strcpy could overflow token.file[256] on a long path;
        // snprintf truncates and always NUL-terminates.
        snprintf(it->token.file, sizeof it->token.file, "%s", file);
    }
    free(code);
    return list;
}
/* Print one token as "(file TYPE value start-end)" on its own line. */
void print_token(token_t token) {
    /* Table lookup instead of a switch; out-of-range types fall back to
     * "UNKNOWN", matching the old default case. */
    static const char *const type_names[] = {
        [TOKEN_IDENTIFIER]  = "IDENTIFIER",
        [TOKEN_NUMBER]      = "NUMBER",
        [TOKEN_STRING]      = "STRING",
        [TOKEN_HEADER]      = "HEADER",
        [TOKEN_PUNCTUATION] = "PUNCTUATION",
        [TOKEN_OTHER]       = "OTHER",
        [TOKEN_EOF]         = "EOF",
    };
    const char *type_str = "UNKNOWN";
    if ((int)token.type >= 0 && token.type <= TOKEN_EOF) {
        type_str = type_names[token.type];
    }
    printf("(%s %s %s %zu-%zu)\n", token.file, type_str, token.value, token.start, token.end);
}
/* Tokenize `file`, splice the token streams of #include'd files in place of
 * each "# include <hdr>" / "# include \"hdr\"" triple, then print and free
 * the resulting list. Spliced-in tokens are rescanned, so nested includes
 * are expanded too (NOTE(review): no cycle detection — a self-including
 * file recurses forever, same as the original). If an included file can't
 * be read, the directive's tokens are left in place. */
void preprocess_file(const char *file) {
    token_list_t token_list = tokenize_file(file);
    token_node_t *current = token_list.head;
    while (current) {
        token_node_t *next = current->next;
        if (current->token.type == TOKEN_PUNCTUATION &&
            strcmp(current->token.value, "#") == 0) {
            token_node_t *kw = current->next;
            if (kw && kw->token.type == TOKEN_IDENTIFIER &&
                strcmp(kw->token.value, "include") == 0) {
                token_node_t *hdr = kw->next;
                next = hdr; /* resume after "include", as the original scan did */
                if (hdr && hdr->token.type == TOKEN_HEADER) {
                    // Strip the <> or "" delimiters off the header token.
                    char path[MAX_TOKEN_LENGTH];
                    size_t len = strlen(hdr->token.value);
                    if (len >= 2) {
                        memcpy(path, hdr->token.value + 1, len - 2);
                        path[len - 2] = '\0';
                    } else {
                        path[0] = '\0';
                    }
                    token_list_t new_tokens = tokenize_file(path);
                    if (new_tokens.head) {
                        token_node_t *after = hdr->next;
                        // Splice new_tokens in place of [current, kw, hdr].
                        if (current->prev) {
                            current->prev->next = new_tokens.head;
                        } else {
                            // fix: the old code dereferenced current->prev
                            // unconditionally, crashing when the '#' was the
                            // very first token of the file.
                            token_list.head = new_tokens.head;
                        }
                        // fix: keep the prev links and the list tail
                        // consistent; the old code only patched `next`.
                        new_tokens.head->prev = current->prev;
                        new_tokens.tail->next = after;
                        if (after) {
                            after->prev = new_tokens.tail;
                        } else {
                            token_list.tail = new_tokens.tail;
                        }
                        free(hdr);
                        free(kw);
                        free(current);
                        next = new_tokens.head; /* rescan the spliced tokens */
                    }
                }
            }
        }
        current = next;
    }
    for (token_node_t *it = token_list.head; it != NULL; it = it->next) {
        print_token(it->token);
    }
    destroy_token_list(token_list);
}
/* Entry point: step up out of the build directory, then preprocess test.c. */
int main(void) {
    // fix: chdir's result was ignored; warn (but continue, preserving the
    // original best-effort behavior) so a later "Failed to load file" from
    // preprocess_file is explainable.
    if (chdir("..") != 0) {
        perror("chdir");
    }
    preprocess_file("test.c");
    return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment