Last active
September 3, 2020 15:17
-
-
Save chriseidhof/831c61250ccf3822d0d63f3b706aae84 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Three different lexer implementations. First a "consume everything" lexer, then a "lazy" lexer (that is, an iterator), then a lazy lexer without explicit state. | |
enum Token { | |
case heading(level: Int) | |
case star | |
case text(String) | |
} | |
import Foundation | |
enum LexState { | |
case heading(Int) | |
case other | |
} | |
extension Substring { | |
mutating func lex() -> [Token] { | |
var tokens: [Token] = [] | |
var state = LexState.other | |
// In a classical lexer, we need to keep track of state. | |
func flush() { | |
switch state { | |
case .heading(let x): | |
tokens.append(.heading(level: x)) | |
case .other: () | |
} | |
} | |
while let next = self.popFirst() { | |
switch (state, next) { | |
case (.other, "#"): | |
state = .heading(1) | |
case (.heading(let x), "#"): | |
state = .heading(x + 1) | |
case (_, "*"): | |
flush() | |
tokens.append(.star) | |
default: | |
fatalError() | |
} | |
} | |
return tokens | |
} | |
} | |
let source = "###**##" | |
var rem = source[...] | |
rem.lex() | |
// we can model the same lexer as an iterator. we store the state as an instance property. | |
struct Lexer: IteratorProtocol { | |
var state: LexState | |
var remainder: Substring | |
var buffer: [Token] = [] | |
mutating func flush(adding: Token?) -> Token? { | |
defer { state = .other } | |
switch state { | |
case let .heading(x): | |
if let y = adding { buffer.append(y) } | |
return .heading(level: x) | |
default: | |
return adding | |
} | |
} | |
mutating func next() -> Token? { | |
if !buffer.isEmpty { | |
return buffer.remove(at: 0) | |
} | |
while let n = remainder.popFirst() { | |
switch (state, n) { | |
case (.other, "#"): | |
state = .heading(1) | |
case (.heading(let x), "#"): | |
state = .heading(x + 1) | |
case (_, "*"): | |
return flush(adding: .star) | |
default: | |
fatalError() | |
} | |
} | |
return flush(adding: nil) | |
} | |
} | |
let result = Array(AnySequence { Lexer(state: .other, remainder: source[...], buffer: []) }) | |
result | |
// Instead of having an explicit LexState, we can also use functions to model our current state. | |
// I got the idea from Rob Pike's https://www.youtube.com/watch?v=HxaD_trXwRE, but as Swift doesn't have coroutines (yet?), we need continuations | |
struct Cont { | |
let run: (inout Substring) -> (Token, Cont)? | |
} | |
// We can now have a lazy token producer (the consumer asks for next tokens) without having to store explicit state: | |
struct Lexer2: IteratorProtocol { | |
var remainder: Substring | |
var continuation: Cont | |
init(_ remainder: Substring) { | |
self.remainder = remainder | |
self.continuation = Lexer2.startState() | |
} | |
mutating func next() -> Token? { | |
guard let (t, c) = continuation.run(&remainder) else { return nil } | |
self.continuation = c | |
return t | |
} | |
static func startState() -> Cont { | |
return Cont { str in | |
guard let n = str.popFirst() else { return nil } | |
return self.helper(n).run(&str) | |
} | |
} | |
static func helper(_ n: Character) -> Cont { | |
return Cont { str in | |
switch n { | |
case "#": | |
return self.heading().run(&str) | |
case "*": | |
return (.star, self.startState()) | |
default: | |
fatalError() // not supported by the language yet | |
} | |
} | |
} | |
static func heading() -> Cont { | |
return Cont { str in | |
var count = 1 // when we get to this function we've seen exactly one "#" | |
while let n = str.popFirst() { | |
switch n { | |
case "#": | |
count += 1 | |
default: | |
// here we can pass the already seen `n` to the helper method, which captures that "state" for us | |
return (.heading(level: count), Lexer2.helper(n)) | |
} | |
} | |
return (.heading(level: count), Lexer2.startState()) | |
} | |
} | |
} | |
var l = Array(AnySequence { Lexer2(source[...]) }) | |
l |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment