diff options
Diffstat (limited to 'src/tokenizer.zig')
-rw-r--r-- | src/tokenizer.zig | 136 |
1 files changed, 74 insertions, 62 deletions
diff --git a/src/tokenizer.zig b/src/tokenizer.zig index e125110..a955c65 100644 --- a/src/tokenizer.zig +++ b/src/tokenizer.zig @@ -4,53 +4,30 @@ const TokenizerError = error{ TokenizingError, }; -pub const TokenType = enum { +pub const TokenType = union(enum) { // Keywords - LET, - IF, - WHILE, - RETURN, - ARROW, - - // Identifiers - IDENTIFIER, - - // Literals - NUMBER, - BOOLEAN, - - // Operators - EQUALS, - PLUS, - MINUS, - MUL, - DIV, - BANG, - - // Punctuation - SEMICOLON, - COMMA, - LPAREN, - RPAREN, - LBRACE, - RBRACE, -}; - -pub const Token = union(TokenType) { LET: void, IF: void, WHILE: void, RETURN: void, ARROW: void, + + // Identifiers IDENTIFIER: []u8, + + // Literals NUMBER: i64, BOOLEAN: bool, + + // Operators EQUALS: void, PLUS: void, MINUS: void, MUL: void, DIV: void, BANG: void, + + // Punctuation SEMICOLON: void, COMMA: void, LPAREN: void, @@ -59,6 +36,17 @@ pub const Token = union(TokenType) { RBRACE: void, }; +const TokenLocation = struct { + col: u64, + row: u64, +}; + +pub const Token = struct { + location: TokenLocation, + offset: u64, + type: TokenType, +}; + pub const Tokenizer = struct { buf: []u8, offset: u64, @@ -68,48 +56,45 @@ pub const Tokenizer = struct { } pub fn next(self: *Tokenizer) TokenizerError!?Token { - defer self.offset += 1; self.skip_whitespace(); self.skip_comments(); self.skip_whitespace(); if (self.offset >= self.buf.len) return null; - const c = self.buf[self.offset]; - - if (self.accept_substr("let")) return Token{ .LET = void{} }; - if (self.accept_substr("if")) return Token{ .IF = void{} }; - if (self.accept_substr("while")) return Token{ .WHILE = void{} }; - if (self.accept_substr("return")) return Token{ .RETURN = void{} }; - if (self.accept_substr("true")) return Token{ .BOOLEAN = true }; - if (self.accept_substr("false")) return Token{ .BOOLEAN = false }; - - if (self.accept_substr("=>")) return Token{ .ARROW = void{} }; - if (c == ';') return Token{ .SEMICOLON = void{} }; - if (c == ',') return Token{ .COMMA = void{} }; - if (c == '(') return Token{ .LPAREN = void{} }; - if (c == ')') return Token{ .RPAREN = void{} }; - if (c == '{') return Token{ .LBRACE = void{} }; - if (c == '}') return Token{ .RBRACE = void{} }; - if (c == '=') return Token{ .EQUALS = void{} }; - if (c == '+') return Token{ .PLUS = void{} }; - if (c == '-') return Token{ .MINUS = void{} }; - if (c == '*') return Token{ .MUL = void{} }; - if (c == '/') return Token{ .DIV = void{} }; - if (c == '!') return Token{ .BANG = void{} }; + if (self.accept_string("let")) return self.create_token(.{ .LET = void{} }); + if (self.accept_string("if")) return self.create_token(.{ .IF = void{} }); + if (self.accept_string("while")) return self.create_token(.{ .WHILE = void{} }); + if (self.accept_string("return")) return self.create_token(.{ .RETURN = void{} }); + if (self.accept_string("true")) return self.create_token(.{ .BOOLEAN = true }); + if (self.accept_string("false")) return self.create_token(.{ .BOOLEAN = false }); + + if (self.accept_string("=>")) return self.create_token(.{ .ARROW = void{} }); + if (self.accept_string(";")) return self.create_token(.{ .SEMICOLON = void{} }); + if (self.accept_string(",")) return self.create_token(.{ .COMMA = void{} }); + if (self.accept_string("(")) return self.create_token(.{ .LPAREN = void{} }); + if (self.accept_string(")")) return self.create_token(.{ .RPAREN = void{} }); + if (self.accept_string("{")) return self.create_token(.{ .LBRACE = void{} }); + if (self.accept_string("}")) return self.create_token(.{ .RBRACE = void{} }); + if (self.accept_string("=")) return self.create_token(.{ .EQUALS = void{} }); + if (self.accept_string("+")) return self.create_token(.{ .PLUS = void{} }); + if (self.accept_string("-")) return self.create_token(.{ .MINUS = void{} }); + if (self.accept_string("*")) return self.create_token(.{ .MUL = void{} }); + if (self.accept_string("/")) return self.create_token(.{ .DIV = void{} }); + if (self.accept_string("!")) return self.create_token(.{ .BANG = void{} }); const string = self.consume_string(); if (string.len == 0) return TokenizerError.TokenizingError; - if (std.fmt.parseInt(i32, string, 10) catch null) |i| return Token{ .NUMBER = i }; + if (std.fmt.parseInt(i32, string, 10) catch null) |i| return self.create_token(.{ .NUMBER = i }); - return Token{ .IDENTIFIER = string }; + return self.create_token(.{ .IDENTIFIER = string }); } fn skip_comments(self: *Tokenizer) void { - if (!self.accept_substr("/*")) return; + if (!self.accept_string("/*")) return; - while (!self.accept_substr("*/")) { + while (!self.accept_string("*/")) { self.offset += 1; } } @@ -127,17 +112,16 @@ pub const Tokenizer = struct { defer self.offset = if (self.offset > 0) self.offset - 1 else self.offset; const start = self.offset; while (true) { + defer self.offset += 1; if (self.offset >= self.buf.len) return self.buf[start..self.offset]; const c = self.buf[self.offset]; if (!std.ascii.isAlphanumeric(c) and c != '_') return self.buf[start..self.offset]; - - self.offset += 1; } } - fn accept_substr(self: *Tokenizer, substr: []const u8) bool { + fn accept_string(self: *Tokenizer, substr: []const u8) bool { if (self.offset + substr.len > self.buf.len) return false; if (std.mem.eql(u8, self.buf[self.offset .. self.offset + substr.len], substr)) { self.offset += substr.len; @@ -145,6 +129,34 @@ pub const Tokenizer = struct { } return false; } + + fn create_token(self: *Tokenizer, token_type: TokenType) Token { + return Token{ + .location = self.compute_location(), + .offset = self.offset - 1, + .type = token_type, + }; + } + + fn compute_location(self: *Tokenizer) TokenLocation { + var location = TokenLocation{ .col = 1, .row = 1 }; + + var i: usize = 0; + while (i < self.offset) : (i += 1) { + if (self.buf[i] == '\n') { + location.row += 1; + location.col = 1; + } else { + location.col += 1; + } + } + + // We need to do this because we call this fn after we consume the token + location.row -= 1; + location.col -= 1; + + return location; + } }; test "simple" { |