Created
April 1, 2024 23:20
-
-
Save gszauer/ae964b262786667ca6f1d4dcc88d5af0 to your computer and use it in GitHub Desktop.
Tokenizer.cs
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
namespace Compiler { | |
/// <summary>Sink for tokenizer error messages; when the tokenizer is given null instead, errors are thrown as exceptions.</summary>
public delegate void PrintErrorFunction(string message);
/// <summary>
/// Immutable source position: a file name plus 1-based line and column numbers.
/// </summary>
public class Location {
    public string File { get; protected set; }
    public int Line { get; protected set; }
    public int Column { get; protected set; }

    /// <summary>Captures a position; values are stored as given (no validation).</summary>
    public Location(string file, int line, int column) {
        this.File = file;
        this.Line = line;
        this.Column = column;
    }
}
/// <summary>
/// One lexical token: its raw text (lexeme), where it appeared, and its symbol classification.
/// </summary>
public class Token {
    public string Lexeme { get; protected set; }
    public Location Location { get; protected set; }
    public Symbol Symbol { get; protected set; }

    public Token(string lexeme, Location location, Symbol symbol) {
        Lexeme = lexeme;
        Location = location;
        Symbol = symbol;
    }

    /// <summary>Concatenates extra text onto the lexeme (used to merge adjacent string literals).</summary>
    public void AppendLexeme(string lexeme) {
        Lexeme = Lexeme + lexeme;
    }

    /// <summary>Replaces the lexeme wholesale (used to re-quote string/char literals).</summary>
    public void ReplaceLexeme(string lexem) {
        Lexeme = lexem;
    }

    /// <summary>Human-readable description for diagnostics.</summary>
    public string AsString => $"{Symbol} `{Lexeme}` on line: {Location.Line} column: {Location.Column}";
}
/// <summary>
/// Hand-written scanner that turns source text into a flat list of <see cref="Token"/>s.
/// All methods are static; per-run state lives in a <see cref="State"/> instance.
/// </summary>
public class Tokenizer {
    /// <summary>Mutable scanning state threaded through every static helper.</summary>
    protected class State {
        public PrintErrorFunction print;   // Error sink; when null, Error() throws instead.
        public string Source;              // Full text being tokenized.
        public string File;                // File name, used only for diagnostics.
        public int Current;                // Index of the next unconsumed character.
        public int Start;                  // Index where the current token began.
        public int Line;                   // 1-based line of the scan position.
        public int Column;                 // 1-based; points just past the last consumed char.
        public string[]? ForceIdentifiers; // Words kept as identifiers even when they are keywords.
        public bool HadError;              // Set by Error(); makes Tokenize return null.
        public string? Error;              // Last formatted error message.

        public State(string file, string source, PrintErrorFunction p, string[]? forceIdentifiers) {
            print = p;
            Source = source;
            File = file;
            Current = 0;
            Start = 0;
            Line = 1;
            Column = 1;
            ForceIdentifiers = forceIdentifiers;
            HadError = false;
            Error = null;
        }
    }

    // Keyword lexeme -> symbol map; built lazily by InitKeywords().
    protected static Dictionary<string, Symbol>? Keywords = null;

    /// <summary>
    /// Scans <paramref name="fileContent"/> into a token list bracketed by
    /// FILE_START / FILE_END markers. Whitespace and comments are dropped and
    /// adjacent string literals are merged. Returns null if any error occurred.
    /// </summary>
    /// <param name="fileName">Name recorded in token locations and diagnostics.</param>
    /// <param name="fileContent">The source text to tokenize.</param>
    /// <param name="print">Error sink; when null, errors are thrown instead.</param>
    /// <param name="forceIdentifiers">Optional words to treat as identifiers even if they are keywords.</param>
    public static List<Token>? Tokenize(string fileName, string fileContent, PrintErrorFunction print, string[]? forceIdentifiers = null) {
        InitKeywords();
        State state = new State(fileName, fileContent, print, forceIdentifiers);
        List<Token> result = new List<Token>();
        result.Add(new Token(fileName, new Location(fileName, 0, 0), Symbol.FILE_START));
        Token? lastToken = null;
        while (!IsAtEnd(state)) {
            state.Start = state.Current;
            Token? token = ScanToken(state);
            if (token == null) {
                Error(state, "Could not scan token");
                return null;
            }
            if (token.Symbol == Symbol.COMMENT) {
                continue; // Whitespace runs and comments are discarded.
            }
            // Adjacent string literals collapse into one token ("a" "b" -> "ab").
            if (token.Symbol == Symbol.LIT_STRING && lastToken != null && lastToken.Symbol == Symbol.LIT_STRING) {
                lastToken.AppendLexeme(token.Lexeme);
                continue;
            }
            result.Add(token);
            lastToken = token;
        }
        result.Add(new Token(fileName, new Location(fileName, 0, 0), Symbol.FILE_END));
        if (state.HadError) {
            return null;
        }
        // Re-quote literals so later phases see the original spelling.
        foreach (Token t in result) {
            if (t.Symbol == Symbol.LIT_STRING) {
                t.ReplaceLexeme("\"" + t.Lexeme + "\"");
            }
            else if (t.Symbol == Symbol.LIT_CHAR) {
                t.ReplaceLexeme("'" + t.Lexeme + "'");
            }
        }
        return result;
    }

    /// <summary>Builds the keyword table on first use; subsequent calls are no-ops.</summary>
    protected static void InitKeywords() {
        if (Keywords != null) {
            return;
        }
        Keywords = new Dictionary<string, Symbol>() {
            { "char", Symbol.TYPE_CHAR },
            { "int", Symbol.TYPE_INT },
            { "float", Symbol.TYPE_FLOAT },
            { "bool", Symbol.TYPE_BOOL },
            { "string", Symbol.TYPE_STRING },
            { "object", Symbol.TYPE_OBJECT },
            { "void", Symbol.TYPE_VOID },
            { "delegate", Symbol.DELEGATE },
            { "class", Symbol.CLASS },
            { "interface", Symbol.INTERFACE },
            { "extends", Symbol.EXTENDS },
            { "implements", Symbol.IMPLEMENTS },
            { "true", Symbol.LIT_BOOL },
            { "false", Symbol.LIT_BOOL },
            { "null", Symbol.LIT_NULL },
            { "new", Symbol.NEW },
            { "and", Symbol.AND },
            { "or", Symbol.OR },
            { "as", Symbol.AS },
            { "if", Symbol.IF },
            { "else", Symbol.ELSE },
            { "for", Symbol.FOR },
            { "while", Symbol.WHILE },
            { "public", Symbol.PUBLIC },
            { "protected", Symbol.PROTECTED },
            { "private", Symbol.PRIVATE },
            { "return", Symbol.RETURN },
            { "continue", Symbol.CONTINUE },
            { "break", Symbol.BREAK },
            { "assert", Symbol.ASSERT },
            { "this", Symbol.THIS },
            { "base", Symbol.BASE },
            { "set", Symbol.SET },
            { "get", Symbol.GET },
        };
    }

    /// <summary>
    /// Scans a single token starting at s.Start. Whitespace and comments come
    /// back as COMMENT tokens so the caller can discard them; null is returned
    /// only for a character matching no rule.
    /// </summary>
    protected static Token? ScanToken(State s) {
        char c = Advance(s);
        switch (c) {
            case '(': return MakeToken(s, Symbol.LPAREN);
            case ')': return MakeToken(s, Symbol.RPAREN);
            case '{': return MakeToken(s, Symbol.LBRACE);
            case '}': return MakeToken(s, Symbol.RBRACE);
            case '[': return MakeToken(s, Symbol.LBRACKET);
            case ']': return MakeToken(s, Symbol.RBRACKET);
            case ',': return MakeToken(s, Symbol.COMMA);
            case ';': return MakeToken(s, Symbol.SEMICOLON);
            case '`': return MakeToken(s, Symbol.TICK);
            case '!': return MakeToken(s, Match(s, '=') ? Symbol.NOT_EQUAL : Symbol.NOT);
            case '~': return MakeToken(s, Match(s, '=') ? Symbol.TILDE_EQUAL : Symbol.TILDE);
            case '*': return MakeToken(s, Match(s, '=') ? Symbol.STAR_EQUAL : Symbol.STAR);
            case '%': return MakeToken(s, Match(s, '=') ? Symbol.MOD_EQUAL : Symbol.MOD);
            case '^': return MakeToken(s, Match(s, '=') ? Symbol.POW_EQUAL : Symbol.POW);
            case '>': return MakeToken(s, Match(s, '=') ? Symbol.GREATER_EQUAL : Symbol.GREATER);
            case '<': return MakeToken(s, Match(s, '=') ? Symbol.LESS_EQUAL : Symbol.LESS);
            case ':': return MakeToken(s, Match(s, '=') ? Symbol.COLON_EQUAL : Symbol.COLON);
            case '?': return MakeToken(s, Match(s, '=') ? Symbol.QUESTION_EQUAL : Symbol.QUESTION);
            case '@': return MakeToken(s, Match(s, '=') ? Symbol.AT_EQUAL : Symbol.AT);
            case '#': return MakeToken(s, Match(s, '=') ? Symbol.HASH_EQUAL : Symbol.HASH);
            case '$': return MakeToken(s, Match(s, '=') ? Symbol.DOLLAR_EQUAL : Symbol.DOLLAR);
            case '&': return MakeToken(s, Match(s, '=') ? Symbol.AMPER_EQUAL : Symbol.AMPER);
            case '|': return MakeToken(s, Match(s, '=') ? Symbol.PIPE_EQUAL : Symbol.PIPE);
            case '.':
                if (Match(s, '.')) {
                    return MakeToken(s, Match(s, '.') ? Symbol.DOT_DOT_DOT : Symbol.DOT_DOT);
                }
                return MakeToken(s, Symbol.DOT);
            case '=':
                if (Match(s, '=')) {
                    return MakeToken(s, Match(s, '=') ? Symbol.EQUAL_EQUAL_EQUAL : Symbol.EQUAL_EQUAL);
                }
                return MakeToken(s, Symbol.EQUAL);
            case '+':
                if (Match(s, '=')) { return MakeToken(s, Symbol.PLUS_EQUAL); }
                if (Match(s, '+')) { return MakeToken(s, Symbol.PLUS_PLUS); }
                return MakeToken(s, Symbol.PLUS);
            case '-':
                if (Match(s, '=')) { return MakeToken(s, Symbol.MINUS_EQUAL); }
                if (Match(s, '-')) { return MakeToken(s, Symbol.MINUS_MINUS); }
                return MakeToken(s, Symbol.MINUS);
            case '/':
                return ScanSlash(s);
            case ' ':
            case '\t':
            case '\r':
            case '\n':
            case '\f':
                // Collapse the whole whitespace run into one throw-away COMMENT token.
                while (!IsAtEnd(s)) {
                    char w = Peek(s, 0);
                    if (w != ' ' && w != '\t' && w != '\r' && w != '\n' && w != '\f') {
                        break;
                    }
                    Advance(s);
                }
                return MakeToken(s, Symbol.COMMENT);
            case '\'':
                return ScanCharLiteral(s);
            case '"':
                return ScanStringLiteral(s);
        }
        if (IsNumber(c)) {
            return ScanNumber(s);
        }
        if (c == '_' || IsAlpha(c)) {
            return ScanIdentifierOrKeyword(s);
        }
        Error(s, "Encountered unexpected character: '" + c + "'");
        return null;
    }

    /// <summary>Handles '/': one of /=, a line comment, a block comment, or plain SLASH.</summary>
    protected static Token ScanSlash(State s) {
        if (Match(s, '=')) {
            return MakeToken(s, Symbol.SLASH_EQUAL);
        }
        if (Match(s, '/')) {
            // Line comment: runs up to (but not including) the newline.
            while (!IsAtEnd(s) && Peek(s, 0) != '\n') {
                Advance(s);
            }
            return MakeToken(s, Symbol.COMMENT);
        }
        if (Match(s, '*')) {
            // Block comment: runs to the first "*/" after the opener.
            // BUGFIX: the old scan compared Peek(0)/Peek(-1), so "/*/" closed on
            // its own opening star, and an unterminated comment looped forever
            // while Peek/Advance read past the end of the source.
            while (!IsAtEnd(s)) {
                if (Advance(s) == '*' && !IsAtEnd(s) && Peek(s, 0) == '/') {
                    Advance(s); // Eat the closing slash.
                    return MakeToken(s, Symbol.COMMENT);
                }
            }
            Error(s, "Unterminated comment");
            return MakeToken(s, Symbol.COMMENT);
        }
        return MakeToken(s, Symbol.SLASH);
    }

    /// <summary>Scans the rest of a 'c' literal; the opening quote is already consumed.</summary>
    protected static Token ScanCharLiteral(State s) {
        char literal = Advance(s);
        if (literal == '\\') {
            literal = Advance(s);
            if (literal != '0' && literal != 't' && literal != 'n' && literal != 'r' && literal != 'f' && literal != '\\' && literal != '\'') {
                Error(s, "Unexpected char escape sequence: '" + literal + "'");
            }
            // NOTE(review): the backslash is dropped from the stored lexeme, so
            // '\n' re-quotes as 'n' in the final pass — confirm downstream expects this.
        }
        if (IsAtEnd(s) || Peek(s, 0) != '\'') {
            Error(s, "Unterminated character literal");
        }
        else {
            Advance(s); // Eat the closing quote.
        }
        return MakeToken(s, Symbol.LIT_CHAR, literal.ToString());
    }

    /// <summary>Scans the rest of a "..." literal; the opening quote is already consumed.</summary>
    protected static Token ScanStringLiteral(State s) {
        string literal = "";
        while (!IsAtEnd(s)) {
            char next = Peek(s, 0);
            if (next == '"') {
                break;
            }
            if (next == '\n') {
                Error(s, "Newline is not supported inside string");
            }
            literal += Advance(s);
            // BUGFIX: consume escape pairs atomically. The old Peek(-1) == '\\'
            // test treated the closing quote of "...\\" as escaped and ran past
            // the end of the literal.
            if (next == '\\' && !IsAtEnd(s)) {
                literal += Advance(s);
            }
        }
        if (IsAtEnd(s) || Peek(s, 0) != '"') {
            Error(s, "Unterminated string");
        }
        else {
            Advance(s); // Eat the closing quote.
        }
        return MakeToken(s, Symbol.LIT_STRING, literal);
    }

    /// <summary>Scans an integer or float literal; the first digit is already consumed.</summary>
    protected static Token ScanNumber(State s) {
        while (MatchNumber(s)) { }
        // BUGFIX: a '.' only continues the number when a digit follows, so range
        // expressions like "1..5" tokenize as INT DOT_DOT INT instead of
        // LIT_FLOAT("1.") DOT LIT_INT.
        if (!IsAtEnd(s) && Peek(s, 0) == '.' && s.Current + 1 < s.Source.Length && IsNumber(s.Source[s.Current + 1])) {
            Advance(s); // Eat the decimal point.
            while (MatchNumber(s)) { }
            Match(s, 'f'); // Optional float suffix after the fraction, e.g. "1.5f".
            return MakeToken(s, Symbol.LIT_FLOAT);
        }
        if (Match(s, 'f')) {
            return MakeToken(s, Symbol.LIT_FLOAT); // Integer with float suffix, e.g. "12f".
        }
        return MakeToken(s, Symbol.LIT_INT);
    }

    /// <summary>Scans an identifier or keyword; the first character is already consumed.</summary>
    protected static Token ScanIdentifierOrKeyword(State s) {
        while (!IsAtEnd(s) && IsAlphaNumericWithUnderscore(Peek(s, 0))) {
            Advance(s);
        }
        string lexeme = GetLexeme(s);
        // BUGFIX: ForceIdentifiers was accepted by Tokenize but never consulted.
        // Words listed there now stay IDENTIFIER even when they match a keyword.
        bool forced = s.ForceIdentifiers != null && Array.IndexOf(s.ForceIdentifiers, lexeme) >= 0;
        if (!forced && Keywords != null && Keywords.TryGetValue(lexeme, out Symbol keyword)) {
            return MakeToken(s, keyword);
        }
        return MakeToken(s, Symbol.IDENTIFIER);
    }

    /// <summary>Text of the token currently being scanned (Start..Current).</summary>
    protected static string GetLexeme(State s) {
        return s.Source.Substring(s.Start, s.Current - s.Start);
    }

    protected static bool IsNumber(char c) {
        return c >= '0' && c <= '9';
    }

    protected static bool IsAlpha(char c) {
        return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z');
    }

    protected static bool IsAlphaNumericWithUnderscore(char c) {
        return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || (c >= '0' && c <= '9') || (c == '_');
    }

    /// <summary>Consumes the next character if it is a digit; false otherwise or at end.</summary>
    protected static bool MatchNumber(State s) {
        if (IsAtEnd(s)) {
            return false;
        }
        char peek = Peek(s, 0);
        if (peek < '0' || peek > '9') {
            return false;
        }
        Advance(s);
        return true;
    }

    /// <summary>Consumes the next character only if it equals <paramref name="c"/>.</summary>
    protected static bool Match(State s, char c) {
        if (IsAtEnd(s)) {
            return false;
        }
        if (Peek(s, 0) != c) {
            return false;
        }
        Advance(s);
        return true;
    }

    /// <summary>
    /// Looks at the character <paramref name="offset"/> positions from Current
    /// without consuming it. Out-of-range lookups report an error and yield '\0'.
    /// </summary>
    protected static char Peek(State s, int offset) {
        int location = s.Current + offset;
        if (location < 0) {
            Error(s, "Can't peek below zero");
            // BUGFIX: previously fell through and indexed out of range whenever
            // 'print' was non-null (Error only throws when print is null).
            return '\0';
        }
        if (location >= s.Source.Length) {
            Error(s, "Can't peek past end");
            return '\0';
        }
        return s.Source[location];
    }

    /// <summary>Consumes and returns the next character, updating line/column counters.</summary>
    protected static char Advance(State s) {
        if (IsAtEnd(s)) {
            // BUGFIX: previously fell through and indexed past the end of Source
            // when 'print' was non-null.
            Error(s, "Can't advance past end of token stream");
            return '\0';
        }
        s.Column += 1;
        if (s.Source[s.Current] == '\n') {
            s.Line += 1;
            s.Column = 1;
        }
        return s.Source[s.Current++];
    }

    /// <summary>
    /// Builds a token for the span Start..Current. <paramref name="optLex"/>
    /// overrides the raw lexeme (used by char/string literals to store content
    /// without quotes). The recorded column points at the end of the token.
    /// </summary>
    protected static Token MakeToken(State s, Symbol symbol, string? optLex = null) {
        string lexeme = s.Source.Substring(s.Start, s.Current - s.Start);
        Location location = new Location(s.File, s.Line, s.Column);
        return new Token(optLex == null ? lexeme : optLex, location, symbol);
    }

    protected static bool IsAtEnd(State s) {
        return s.Current >= s.Source.Length;
    }

    /// <summary>Records an error; routes it to the print callback, or throws when none was given.</summary>
    protected static void Error(State s, string error) {
        s.HadError = true;
        error = "Error in Tokenizer: \n" + error;
        error += "\nOn line: " + s.Line + ", column: " + s.Column + ", in file: " + s.File;
        s.Error = error;
        if (s.print != null) {
            s.print(error);
        }
        else {
            throw new Exception(error);
        }
    }
}
/// <summary>
/// Every token category the tokenizer can produce. Member order is preserved
/// exactly (ordinal values are part of the contract).
/// </summary>
public enum Symbol {
    // Synthetic markers bracketing the token stream.
    FILE_START, FILE_END,
    // Single-character operators.
    NOT, AT, HASH, DOLLAR, MOD, POW, AMPER, STAR, PLUS, MINUS, TILDE, SLASH, QUESTION, COLON, LESS, GREATER, EQUAL, PIPE,
    // Two-character operators ("x=" compound forms, increments, etc.).
    NOT_EQUAL, AT_EQUAL, HASH_EQUAL, DOLLAR_EQUAL, MOD_EQUAL, POW_EQUAL,LESS_EQUAL, GREATER_EQUAL, COLON_EQUAL,
    AMPER_EQUAL, STAR_EQUAL, PLUS_EQUAL, MINUS_EQUAL, TILDE_EQUAL, QUESTION_EQUAL, SLASH_EQUAL, EQUAL_EQUAL,
    PLUS_PLUS, MINUS_MINUS, PIPE_EQUAL, EQUAL_EQUAL_EQUAL,
    // Punctuation and dot sequences.
    SEMICOLON, COMMA, DOT, DOT_DOT, DOT_DOT_DOT, TICK,
    // Built-in type keywords.
    TYPE_CHAR, TYPE_INT, TYPE_FLOAT, TYPE_BOOL, TYPE_STRING, TYPE_OBJECT, TYPE_VOID,
    // Literals.
    LIT_CHAR, LIT_INT, LIT_FLOAT, LIT_BOOL, LIT_STRING, LIT_NULL,
    // Declaration keywords. NOTE(review): STATIC has no entry in the keyword
    // table in this file — confirm whether "static" should be a keyword.
    DELEGATE, CLASS, INTERFACE, EXTENDS, IMPLEMENTS, STATIC,
    // Expression / statement keywords.
    NEW, AND, OR, AS, IF, ELSE, FOR, WHILE, PUBLIC, PROTECTED, PRIVATE,
    RETURN, CONTINUE, BREAK, ASSERT, THIS, BASE, GET, SET,
    LBRACE, RBRACE, /* { } */
    LBRACKET, RBRACKET, /* [ ] */
    LPAREN, RPAREN, /* ( ) */
    // COMMENT covers both comments and whitespace runs; both are dropped by Tokenize.
    IDENTIFIER, COMMENT
}
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment