Last active
February 15, 2017 13:08
-
-
Save radex/20ccd14da08d56f47074 to your computer and use it in GitHub Desktop.
Wrote a little lexer/tokenizer for fun. (Warning: I have no idea what I'm doing)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import Foundation | |
/// A cursor into the input being tokenized.
/// `position` is the offset (in NSString UTF-16 units) of the next
/// unconsumed character.
struct Stream {
    /// The full input being lexed.
    let string: NSString
    /// Offset of the next character to match.
    var position: Int

    /// The not-yet-consumed tail of the input, as an NSRange suitable
    /// for handing to NSRegularExpression.
    var matchingRange: NSRange {
        let remaining = string.length - position
        return NSRange(location: position, length: remaining)
    }
}
/// A minimal LIFO stack backed by an array.
struct Stack<T> {
    /// Backing storage; the last element is the top of the stack.
    var array: [T] = []

    /// The top element, or nil when the stack is empty.
    var tip: T? { return array.last }

    /// Places `x` on top of the stack.
    mutating func push(x: T) { array.append(x) }

    /// Removes and returns the top element, or nil when empty.
    mutating func pop() -> T? {
        return array.isEmpty ? nil : array.removeLast()
    }
}
/// Attempts to match `pattern` at the current position of `stream`.
///
/// The pattern is anchored with a leading "^" so it can only match exactly
/// at `stream.position`, never further into the input.
///
/// :param: pattern A regular expression (unanchored; "^" is prepended).
/// :param: stream  The input to match against.
/// :returns: The matched substring, or nil when the pattern does not match
///           at the current position or fails to compile.
func matchRegexAt(#pattern: String, stream: Stream) -> String? {
    // Fix: the original force-unwrapped the regex initializer, crashing on
    // an invalid pattern. An uncompilable pattern now behaves as "no match".
    // NOTE: the regex is recompiled on every call; callers invoke this in a
    // loop, so caching compiled patterns is a natural future optimization.
    if let regex = NSRegularExpression(pattern: "^" + pattern, options: nil, error: nil) {
        let match = regex.firstMatchInString(stream.string, options: nil, range: stream.matchingRange)
        if let range = match?.rangeAtIndex(0) {
            return stream.string.substringWithRange(range)
        }
    }
    return nil
}
/// Tells the lexer what to do with its state stack after a rule fires.
enum NextState {
    /// Keep lexing in the current state.
    case Stay
    /// Leave the current state (e.g. at a closing string quote).
    case Pop
    /// Enter the named state (e.g. at an opening string quote).
    case Push(String)
}
/// A lexing rule: given the current stream, either consumes some input and
/// returns (advanced stream, optional token to emit, stack action), or
/// returns nil when the rule does not match at the stream's position.
typealias Rule = Stream -> (Stream, Token?, NextState)?
/// Builds a Rule that stays in the current state after matching.
/// Convenience overload of the three-argument rule(...) using `.Stay`.
func rule(regex: String, tokenizer: NSString -> Token?) -> Rule {
    return rule(regex, .Stay, tokenizer)
}
/// Builds a Rule from a regex, a stack action, and a token factory.
///
/// The returned closure tries `regex` at the stream's current position.
/// On a match it advances past the matched text, asks `tokenizer` to turn
/// the matched substring into a token (nil means "emit nothing", e.g. for
/// whitespace), and reports `next` so the lexer can adjust its state stack.
func rule(regex: String, next: NextState, tokenizer: NSString -> Token?) -> Rule {
    return { stream in
        if let matched: NSString = matchRegexAt(pattern: regex, stream) {
            var advanced = stream
            advanced.position += matched.length
            return (advanced, tokenizer(matched), next)
        }
        return nil
    }
}
/// A lexical token produced by the Lexer.
enum Token: Printable {
    /// A bare word, e.g. "blah".
    case Symbol(Swift.String)
    /// An integer value (written as decimal, hex, or binary in the input).
    case Number(Int)
    /// The contents of a double-quoted string literal.
    case String(Swift.String)

    /// Human-readable form, e.g. `NUMBER 255` or `STRING "abc"`.
    var description: Swift.String {
        switch self {
        case .Symbol(let text):
            return "SYMBOL \(text)"
        case .Number(let value):
            return "NUMBER \(value)"
        case .String(let text):
            return "STRING \"\(text)\""
        }
    }
}
/// A stack-based lexer: the state name on top of `stack` selects which
/// ruleset is tried against the input next (e.g. "root" vs. "string").
class Lexer {
    /// Current read position within the input.
    var stream: Stream
    /// Tokens produced so far, in input order.
    var tokens: [Token] = []
    /// Registered rulesets, keyed by state name.
    var states: [String: [Rule]] = [:]
    /// State stack; lexing always starts in "root".
    var stack = Stack<String>()

    init(string: String) {
        stream = Stream(string: string, position: 0)
        stack.push("root")
    }

    /// Registers `ruleset` under `name` so `.Push(name)` can enter it.
    func registerState(name: String, _ ruleset: [Rule]) {
        states[name] = ruleset
    }

    /// Runs the lexer until the input is exhausted.
    /// Traps with a diagnostic when the state stack is popped empty
    /// mid-input or the current state has no registered ruleset (the
    /// original crashed on bare force-unwraps with no explanation).
    func lex() {
        while true {
            if stack.tip == nil {
                fatalError("Lexer state stack is empty (unbalanced Pop?)")
            }
            let state = stack.tip!
            if states[state] == nil {
                fatalError("No ruleset registered for state '\(state)'")
            }
            switch matchRuleset(states[state]!) {
            case .Some(.Pop):
                stack.pop()
            case .Some(.Push(let state)):
                stack.push(state)
            default: return
            }
        }
    }

    /// Applies `ruleset` repeatedly until a rule requests a stack change
    /// (returns that NextState) or the input is exhausted (returns nil).
    /// Traps when no rule matches the remaining input.
    func matchRuleset(ruleset: [Rule]) -> NextState? {
        while stream.position < stream.string.length {
            if let nextState = matchRulesetOnce(ruleset) {
                switch nextState {
                case .Stay: continue
                default: return nextState
                }
            } else {
                // Improved diagnostic: report where lexing got stuck.
                fatalError("No rule matched at position \(stream.position)")
            }
        }
        return nil
    }

    /// Tries each rule in order; returns the first match's NextState,
    /// or nil when none of the rules match at the current position.
    func matchRulesetOnce(ruleset: [Rule]) -> NextState? {
        for rule in ruleset {
            if let nextState = matchRule(rule) {
                return nextState
            }
        }
        return nil
    }

    /// Runs a single rule. On a match, commits the advanced stream,
    /// appends the emitted token (if any), and returns the rule's
    /// NextState; otherwise returns nil and leaves the stream untouched.
    func matchRule(rule: Rule) -> NextState? {
        if let (outputStream, token, next) = rule(stream) {
            stream = outputStream
            if let token = token {
                tokens.append(token)
            }
            return next
        } else {
            return nil
        }
    }
}
// Sample input exercising every token kind, plus a comment and a string.
let lexer = Lexer(string: "blah 0 10 0xFF foo ; comment\n" + "blah 0b0101 blah \" some 0xFF string \" bla ")

// Rules for ordinary (non-string) input.
lexer.registerState("root", [
    // skip runs of whitespace
    rule("\\s+", { _ in nil }),
    // skip ";" comments through the end of the line
    rule(";.*?\n", { _ in nil }),
    // hexadecimal literals, e.g. 0xFF
    rule("0x[0-9a-fA-F]+", { match in
        var value: UInt32 = 0
        let scanner = NSScanner(string: match)
        scanner.scanHexInt(&value)
        return .Number(Int(value))
    }),
    // binary literals, e.g. 0b0101
    rule("0b[01]+", { match in
        let digits: NSString = match.substringFromIndex(2)
        return .Number(strtol(digits.UTF8String, nil, 2))
    }),
    // decimal literals
    rule("\\d+", { match in .Number(match.integerValue) }),
    // an opening quote switches to the "string" state
    rule("\"", .Push("string"), { _ in nil }),
    // bare words
    rule("[a-zA-Z]+", { match in .Symbol(match) })
])

// Rules that apply inside a double-quoted string.
lexer.registerState("string", [
    // everything up to the closing quote becomes one STRING token
    rule("[^\\\"]+", { match in .String(match) }),
    // the closing quote returns to the previous state
    rule("\"", .Pop, { _ in nil })
])

lexer.lex()

for token in lexer.tokens {
    println(token.description)
}

// Playground-style inspection of the final read position.
lexer.stream.position
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
This is really cool stuff — it even takes nested quotations into consideration 👍