Skip to content

Instantly share code, notes, and snippets.

@waldnercharles
Last active October 15, 2024 06:15
Show Gist options
  • Save waldnercharles/8cfcb6e87f4a17244390cc53bebe5818 to your computer and use it in GitHub Desktop.
#define _CRT_SECURE_NO_WARNINGS
#define _CRT_NONSTDC_NO_DEPRECATE
#include <ctype.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <direct.h>
/* Read the whole file at `filename` into a freshly malloc'd, NUL-terminated
 * buffer. On success returns the buffer (caller frees) and, if `file_size`
 * is non-NULL, stores the byte count (excluding the NUL). Returns NULL on
 * any open/seek/read/allocation failure. */
char *read_entire_file_to_memory_and_nul_terminate(const char *filename, size_t *file_size)
{
    FILE *file = fopen(filename, "rb");
    if (file == NULL) {
        return NULL;
    }
    /* fix: fseek/ftell failures were unchecked; ftell returns -1L on error,
     * which the old code silently converted to a huge size_t. */
    if (fseek(file, 0, SEEK_END) != 0) {
        fclose(file);
        return NULL;
    }
    long pos = ftell(file);
    if (pos < 0) {
        fclose(file);
        return NULL;
    }
    size_t sz = (size_t)pos;
    rewind(file);
    char *buf = malloc(sz + 1); /* no cast needed in C */
    if (buf == NULL) {
        perror("Failed to allocate memory");
        fclose(file);
        return NULL;
    }
    size_t bytes_read = fread(buf, 1, sz, file);
    if (bytes_read != sz) {
        perror("Failed to read file");
        free(buf);
        fclose(file);
        return NULL;
    }
    buf[bytes_read] = '\0';
    fclose(file); /* fix: original leaked the FILE handle on the success path */
    if (file_size) { *file_size = sz; }
    return buf;
}
/* Rewrite `buffer` in place so that every line ending becomes a single LF:
 * CRLF pairs collapse to '\n' and lone CRs become '\n'. The buffer can only
 * shrink, so the compaction is done with read/write indices. */
void normalize_line_endings(char *buffer)
{
    size_t rd = 0;
    size_t wr = 0;
    while (buffer[rd] != '\0') {
        if (buffer[rd] == '\r') {
            buffer[wr++] = '\n';
            /* Consume the '\n' of a CRLF pair along with the '\r'. */
            rd += (buffer[rd + 1] == '\n') ? 2 : 1;
        } else {
            buffer[wr++] = buffer[rd++];
        }
    }
    buffer[wr] = '\0';
}
/* Splice physical lines into logical lines: every backslash immediately
 * followed by a newline is removed in place (classic line-continuation).
 * Assumes line endings were already normalized to LF. */
void merge_lines_ending_in_backslash(char *buffer)
{
    size_t rd = 0;
    size_t wr = 0;
    while (buffer[rd] != '\0') {
        if (buffer[rd] == '\\' && buffer[rd + 1] == '\n') {
            rd += 2; /* drop both the backslash and the newline */
        } else {
            buffer[wr++] = buffer[rd++];
        }
    }
    buffer[wr] = '\0';
}
/* Strip // and block comments from `buffer` in place, replacing each comment
 * with a single space. String literals, character literals, and the header
 * name of a #include directive are preserved verbatim.
 *
 * fix: the old code decided "inside a string" by looking one character back
 * for a backslash, so a string ending in an escaped backslash ("a\\") left
 * the scanner stuck in string mode; it also let a '"' inside a character
 * literal ('"') toggle string state. Both are handled with explicit
 * escape / char-literal tracking now. Assumes literals are well-formed. */
void remove_comments(char *buffer)
{
    char *src = buffer;
    char *dst = buffer;
    bool in_string = false; /* inside "..." */
    bool in_char = false;   /* inside '...' */
    bool escaped = false;   /* previous copied byte was an active backslash */
    while (*src) {
        if (in_string || in_char) {
            if (escaped) {
                escaped = false; /* this byte is escaped; copy it blindly */
            } else if (*src == '\\') {
                escaped = true;
            } else if (in_string && *src == '"') {
                in_string = false;
            } else if (in_char && *src == '\'') {
                in_char = false;
            }
            *dst++ = *src++;
            continue;
        }
        if (*src == '"') {
            in_string = true;
            *dst++ = *src++;
            continue;
        }
        if (*src == '\'') {
            in_char = true;
            *dst++ = *src++;
            continue;
        }
        /* Preserve #include <...> / "..." as-is so the header name is not
         * mistaken for a string literal or comment start. */
        if (*src == '#') {
            *dst++ = *src++;
            while (*src && isspace((unsigned char)*src)) { *dst++ = *src++; }
            if (strncmp(src, "include", 7) == 0) {
                memcpy(dst, src, 7);
                dst += 7; src += 7;
                while (*src && isspace((unsigned char)*src)) { *dst++ = *src++; }
                if (*src == '<' || *src == '"') {
                    char end_char = *src == '<' ? '>' : '"';
                    *dst++ = *src++;
                    while (*src && *src != end_char) { *dst++ = *src++; }
                    if (*src == end_char) { *dst++ = *src++; }
                }
            }
            continue;
        }
        /* Block comment: skip to the closing "* /" (or EOF), emit one space. */
        if (*src == '/' && *(src + 1) == '*') {
            src += 2;
            while (*src && !(*src == '*' && *(src + 1) == '/')) { src++; }
            if (*src) { src += 2; }
            *dst++ = ' ';
        }
        /* Line comment: skip to end of line (newline kept), emit one space. */
        else if (*src == '/' && *(src + 1) == '/') {
            src += 2;
            while (*src && *src != '\n') { src++; }
            *dst++ = ' ';
        }
        else {
            *dst++ = *src++;
        }
    }
    *dst = '\0';
}
/* Maximum bytes stored per token value and per filename, including the NUL. */
#define MAX_TOKEN_LENGTH 256
/* Lexical category assigned by tokenize(). */
typedef enum {
TOKEN_IDENTIFIER,
TOKEN_NUMBER,
TOKEN_STRING, /* string or character literal, quotes included */
TOKEN_HEADER, /* <...> or "..." header name directly after "# include" */
TOKEN_PUNCTUATION,
TOKEN_OTHER, /* any single character not matched by the other rules */
TOKEN_EOF, /* NOTE(review): declared but never produced by tokenize() */
} token_type_t;
/* One lexed token plus where it came from. */
typedef struct token_t {
token_type_t type;
char file[MAX_TOKEN_LENGTH]; /* source filename, filled in by tokenize_file() */
char value[MAX_TOKEN_LENGTH]; /* NUL-terminated token text */
size_t start; /* byte offset of the token in its preprocessed buffer */
size_t end; /* one past the token's last byte */
} token_t;
/* Node of a doubly-linked token list; owns its token by value. */
typedef struct token_node_t {
token_t token;
struct token_node_t* prev;
struct token_node_t* next;
} token_node_t;
/* Token list; head == tail == NULL when empty. */
typedef struct token_list_t {
token_node_t *head;
token_node_t *tail;
} token_list_t;
/* Append one zero-initialized node to the end of `list`. On allocation
 * failure a message is printed and the list is left unchanged (callers can
 * detect this by checking whether `list->tail` moved). */
void append_token(token_list_t *list)
{
    /* calloc gives us the zeroed node directly (vs. malloc + memset). */
    token_node_t *node = calloc(1, sizeof *node);
    if (node == NULL) {
        perror("Failed to allocate memory for token node");
        return;
    }
    if (list->tail != NULL) {
        list->tail->next = node;
        node->prev = list->tail;
        list->tail = node;
    } else {
        /* First node: it is both head and tail. */
        list->head = list->tail = node;
    }
}
/* Free every node of `list`. The list struct itself is passed by value and
 * is simply stale afterwards. */
void destroy_token_list(token_list_t list) {
    for (token_node_t *node = list.head; node != NULL; ) {
        token_node_t *next = node->next;
        free(node);
        node = next;
    }
}
token_list_t tokenize(const char *str) {
token_list_t list;
list.head = NULL;
list.tail = NULL;
const char *punctuation[] = {
"~", "}", "||", "|=", "|", "{", "^=", "^", "]", "[", "?", ">>=", ">>", ">=", ">", "==",
"=", "<=", "<<=", "<<", "<", ";", "::", ":", "/=", "/", "...", ".", "->", "-=", "--",
"-", ",", "+=", "++", "+", "*=", "*", ")", "(", "&=", "&&", "&", "%=", "%", "##", "#",
"!=", "!" };
const int punctuation_len = sizeof(punctuation) / sizeof(punctuation[0]);
const char *c = str;
for (;;)
{
// Skip whitespace
while (*c && isspace((unsigned char)*c)) { c++; }
// Handle EOF
if (*c == '\0') { return list; }
append_token(&list);
token_node_t *node = list.tail;
token_t *token = &node->token;
// Handle numbers
if (isdigit((unsigned char)*c) || (*c == '.' && isdigit((unsigned char)*(c + 1)))) {
const char *start = c;
while (isalnum((unsigned char)*c) || *c == '_' || *c == '.') {
c++;
}
if ((*(c) == '+' || *(c) == '-') && (*(c - 1) == 'e' || *(c - 1) == 'E' || *(c - 1) == 'p' || *(c - 1) == 'P')) {
c += 2;
while (isalnum((unsigned char)*c) || *c == '_' || *c == '.') {
c++;
}
}
size_t length = c - start;
token->type = TOKEN_NUMBER;
strncpy(token->value, start, length);
token->value[length] = '\0';
token->start = c - str - length;
token->end = c - str;
continue;
}
// Handle identifiers
if (isalpha(*c) || *c == '_') {
const char *start = c;
while (isalnum(*c) || *c == '_') {
c++;
}
size_t length = c - start;
token->type = TOKEN_IDENTIFIER;
strncpy(token->value, start, length);
token->value[length] = '\0';
token->start = c - str - length;
token->end = c - str;
continue;
}
// Determine if it's a header file name (for #include "..." or #include <...>)
if (*c == '<' || *c == '"') {
if (node->prev != NULL && node->prev->prev != NULL &&
strcmp(node->prev->prev->token.value, "#") == 0 && strcmp(node->prev->token.value, "include") == 0) {
const char *start = c;
c++;
char quote = *start == '<' ? '>' : '"';
while (*c && *c != quote) { c++; }
if (*c == quote) c++;
size_t length = c - start;
token->type = TOKEN_HEADER;
strncpy(token->value, start, length);
token->value[length] = '\0';
token->start = c - str - length;
token->end = c - str;
continue;
}
}
// Handle strings
if (*c == '"' || *c == '\'') {
const char *start = c;
c++;
char quote = *start;
while (*c && *c != quote) {
if (*c == '\\' && (*(c + 1) == '\\' || *(c + 1) == '\'' || *(c + 1) == '"')) {
c += 2;
} else {
c++;
}
}
c++;
size_t length = c - start;
token->type = TOKEN_STRING;
strncpy(token->value, start, length);
token->value[length] = '\0';
token->start = c - str - length;
token->end = c - str;
continue;
}
// Handle punctuation
bool is_punctuation = false;
for (int i = 0; i < punctuation_len; i++) {
size_t length = strlen(punctuation[i]);
if (strncmp(c, punctuation[i], length) == 0) {
token->type = TOKEN_PUNCTUATION;
strncpy(token->value, c, length);
token->value[length] = '\0';
token->start = c - str;
token->end = c - str + length;
c += length;
is_punctuation = true;
break;
}
}
if (is_punctuation) continue;
// Handle everything else
token->type = TOKEN_OTHER;
token->value[0] = *c++;
token->value[1] = '\0';
token->start = c - str - 1;
token->end = c - str;
}
}
/* Load `file`, normalize its line endings, splice continued lines, strip
 * comments, and lex it. Each resulting token is stamped with the filename.
 * Returns an empty list (and prints a message) if the file can't be read. */
token_list_t tokenize_file(const char *file)
{
    token_list_t list;
    list.head = list.tail = NULL;
    char *code = read_entire_file_to_memory_and_nul_terminate(file, NULL);
    if (code == NULL) {
        perror("Failed to load file");
        return list;
    }
    normalize_line_endings(code);
    merge_lines_ending_in_backslash(code);
    remove_comments(code);
    list = tokenize(code);
    for (token_node_t *it = list.head; it != NULL; it = it->next) {
        // fix: strcpy could overflow token.file[256] on a long path;
        // snprintf truncates and always NUL-terminates.
        snprintf(it->token.file, sizeof it->token.file, "%s", file);
    }
    free(code);
    return list;
}
/* Print one token as "(file TYPE value start-end)" on its own line. */
void print_token(token_t token) {
    /* Table lookup instead of a switch; out-of-range types fall back to
     * "UNKNOWN", matching the old default case. */
    static const char *const type_names[] = {
        [TOKEN_IDENTIFIER]  = "IDENTIFIER",
        [TOKEN_NUMBER]      = "NUMBER",
        [TOKEN_STRING]      = "STRING",
        [TOKEN_HEADER]      = "HEADER",
        [TOKEN_PUNCTUATION] = "PUNCTUATION",
        [TOKEN_OTHER]       = "OTHER",
        [TOKEN_EOF]         = "EOF",
    };
    const char *type_str = "UNKNOWN";
    if ((int)token.type >= 0 && token.type <= TOKEN_EOF) {
        type_str = type_names[token.type];
    }
    printf("(%s %s %s %zu-%zu)\n", token.file, type_str, token.value, token.start, token.end);
}
/* Tokenize `file`, splice the token streams of #include'd files in place of
 * each "# include <hdr>" / "# include \"hdr\"" triple, then print and free
 * the resulting list. Spliced-in tokens are rescanned, so nested includes
 * are expanded too (NOTE(review): no cycle detection — a self-including
 * file recurses forever, same as the original). If an included file can't
 * be read, the directive's tokens are left in place. */
void preprocess_file(const char *file) {
    token_list_t token_list = tokenize_file(file);
    token_node_t *current = token_list.head;
    while (current) {
        token_node_t *next = current->next;
        if (current->token.type == TOKEN_PUNCTUATION &&
            strcmp(current->token.value, "#") == 0) {
            token_node_t *kw = current->next;
            if (kw && kw->token.type == TOKEN_IDENTIFIER &&
                strcmp(kw->token.value, "include") == 0) {
                token_node_t *hdr = kw->next;
                next = hdr; /* resume after "include", as the original scan did */
                if (hdr && hdr->token.type == TOKEN_HEADER) {
                    // Strip the <> or "" delimiters off the header token.
                    char path[MAX_TOKEN_LENGTH];
                    size_t len = strlen(hdr->token.value);
                    if (len >= 2) {
                        memcpy(path, hdr->token.value + 1, len - 2);
                        path[len - 2] = '\0';
                    } else {
                        path[0] = '\0';
                    }
                    token_list_t new_tokens = tokenize_file(path);
                    if (new_tokens.head) {
                        token_node_t *after = hdr->next;
                        // Splice new_tokens in place of [current, kw, hdr].
                        if (current->prev) {
                            current->prev->next = new_tokens.head;
                        } else {
                            // fix: the old code dereferenced current->prev
                            // unconditionally, crashing when the '#' was the
                            // very first token of the file.
                            token_list.head = new_tokens.head;
                        }
                        // fix: keep the prev links and the list tail
                        // consistent; the old code only patched `next`.
                        new_tokens.head->prev = current->prev;
                        new_tokens.tail->next = after;
                        if (after) {
                            after->prev = new_tokens.tail;
                        } else {
                            token_list.tail = new_tokens.tail;
                        }
                        free(hdr);
                        free(kw);
                        free(current);
                        next = new_tokens.head; /* rescan the spliced tokens */
                    }
                }
            }
        }
        current = next;
    }
    for (token_node_t *it = token_list.head; it != NULL; it = it->next) {
        print_token(it->token);
    }
    destroy_token_list(token_list);
}
/* Entry point: step up out of the build directory, then preprocess test.c. */
int main(void) {
    // fix: chdir's result was ignored; warn (but continue, preserving the
    // original best-effort behavior) so a later "Failed to load file" from
    // preprocess_file is explainable.
    if (chdir("..") != 0) {
        perror("chdir");
    }
    preprocess_file("test.c");
    return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment