| field | value |
|---|---|
| author | Baitinq <[email protected]>, 2025-05-31 23:24:03 +0200 |
| committer | Baitinq <[email protected]>, 2025-05-31 23:24:03 +0200 |
| commit | 369579adaeb67caa16dc8ec6f0352e9ea4ad6246 (patch) |
| tree | 867c1658db7930635d48d729d91993f0019468e0 |
| parent | Bootstrap: Tokenizer: Rewrite using struct instead of global values (diff) |
Bootstrap: Tokenizer: Start rewriting with token type
| mode | file | lines changed |
|---|---|---|
| -rw-r--r-- | src/bootstrap/tokenizer.src | 304 |
| -rw-r--r-- | src/codegen.zig | 22 |
| -rw-r--r-- | src/parser.zig | 2 |
3 files changed, 254 insertions, 74 deletions
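
In short, the commit replaces the flat `*i8` character buffer of tokens with a `token` struct carrying a numeric `type` tag (one of the `TOKEN_*` constants) and a `*void` `data` payload; `tokenizer_next` now allocates one such struct per token from the arena instead of formatting strings like `"int:%d"`. Below is a minimal sketch of that representation in the bootstrap language, reusing the constants and helpers (`arena_alloc`, `cast`, `sizeof`) exactly as they appear in the diff that follows; the `make_number_token` helper is illustrative only and not part of the commit.

```
/* Sketch: the token representation introduced by this commit. */
let token = struct {
    type: i64,   /* TOKEN_* tag, e.g. TOKEN_NUMBER */
    data: *void, /* optional payload: identifier text, number, char, ... */
};

/* Hypothetical helper showing how tokenizer_next builds a number token. */
let make_number_token = (a: *arena, value: *i64) => *token {
    let to = cast(*token, arena_alloc(a, sizeof(token)));
    (*to).type = TOKEN_NUMBER;
    (*to).data = cast(*void, value);
    return to;
};
```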
```diff
diff --git a/src/bootstrap/tokenizer.src b/src/bootstrap/tokenizer.src
index 19a2036..91b5c51 100644
--- a/src/bootstrap/tokenizer.src
+++ b/src/bootstrap/tokenizer.src
@@ -13,14 +13,62 @@ extern fclose = (*i8) => *i8;
 
 import "!stdlib.src";
 import "!mem.src";
 
+/* Keywords */
+let TOKEN_IMPORT = 1;
+let TOKEN_LET = 2;
+let TOKEN_EXTERN = 3;
+let TOKEN_IF = 4;
+let TOKEN_WHILE = 5;
+let TOKEN_RETURN = 6;
+let TOKEN_BREAK = 7;
+let TOKEN_CONTINUE = 8;
+let TOKEN_ARROW = 9;
+let TOKEN_STRUCT = 10;
+
+/* Identifiers */
+let TOKEN_IDENTIFIER = 11;
+
+/* Literals */
+let TOKEN_NUMBER = 12;
+let TOKEN_BOOLEAN = 13;
+let TOKEN_NULL = 14;
+let TOKEN_CHAR = 15;
+let TOKEN_STRING = 16;
+
+/* Operators */
+let TOKEN_EQUALS = 17;
+let TOKEN_PLUS = 18;
+let TOKEN_MINUS = 19;
+let TOKEN_MUL = 20;
+let TOKEN_DIV = 21;
+let TOKEN_MOD = 22;
+let TOKEN_BANG = 23;
+let TOKEN_LESS = 24;
+let TOKEN_GREATER = 25;
+let TOKEN_DOT = 26;
+
+/* Punctuation */
+let TOKEN_SEMICOLON = 27;
+let TOKEN_COMMA = 28;
+let TOKEN_COLON = 29;
+let TOKEN_LPAREN = 30;
+let TOKEN_RPAREN = 31;
+let TOKEN_LBRACE = 32;
+let TOKEN_RBRACE = 33;
+
+let token = struct {
+    type: i64,
+    data: *void,
+};
+
 let tokenizer = struct {
-    tokens: *i8,
-    tokens_len: i64,
     buf: *i8,
     file_size: i64,
     offset: i64,
     arena: *arena,
+    tokens: *token,
+    tokens_len: i64,
 };
 
 let read_file = (t: *tokenizer, filename: *i8) => void {
@@ -42,34 +90,119 @@ let read_file = (t: *tokenizer, filename: *i8) => void {
     return;
 };
 
-let add_token = (t: *tokenizer, token: *i8) => i64 {
-    println("Add token: %s", token);
-    let i = 0;
-    while true {
-        let c = (*(token + cast(*i8, i)));
-
-        (*((*t).tokens + cast(*i8, (*t).tokens_len))) = c;
+let add_token = (t: *tokenizer, to: *token) => void {
+    println("Add token: %d", (*to).type);
 
-        (*t).tokens_len = (*t).tokens_len + 1;
-        i = i + 1;
-
-        if c == '\0' {
-            return 0;
-        };
-    };
+    (*((*t).tokens + cast(*token, (*t).tokens_len))) = *to;
+    (*t).tokens_len = (*t).tokens_len + 1;
 
-    return 0;
+    return;
 };
 
 let print_tokens = (t: *tokenizer) => i64 {
     let i = 0;
     while i < (*t).tokens_len {
-        let c = (*((*t).tokens + cast(*i8, i)));
-        if c == '\0' {
-            c = '\n';
-        };
+        let to = (*((*t).tokens + cast(*token, i)));
 
-        printf("%c", c);
+        if (to.type == TOKEN_IMPORT) {
+            printf("Import\n");
+        };
+        if (to.type == TOKEN_LET) {
+            printf("Let\n");
+        };
+        if (to.type == TOKEN_EXTERN) {
+            printf("Extern\n");
+        };
+        if (to.type == TOKEN_IF) {
+            printf("If\n");
+        };
+        if (to.type == TOKEN_WHILE) {
+            printf("While\n");
+        };
+        if (to.type == TOKEN_RETURN) {
+            printf("Return\n");
+        };
+        if (to.type == TOKEN_BREAK) {
+            printf("Break\n");
+        };
+        if (to.type == TOKEN_CONTINUE) {
+            printf("Continue\n");
+        };
+        if (to.type == TOKEN_ARROW) {
+            printf("Arrow\n");
+        };
+        if (to.type == TOKEN_STRUCT) {
+            printf("Struct\n");
+        };
+        if (to.type == TOKEN_IDENTIFIER) {
+            printf("Identifier: %s\n", cast(*i8, to.data));
+        };
+        if (to.type == TOKEN_NUMBER) {
+            printf("Number: %d\n", cast(i64, to.data));
+        };
+        if (to.type == TOKEN_BOOLEAN) {
+            printf("Boolean: %d\n", cast(i1, to.data));
+        };
+        if (to.type == TOKEN_NULL) {
+            printf("Null\n");
+        };
+        if (to.type == TOKEN_CHAR) {
+            printf("Char: %c\n", cast(i8, to.data));
+        };
+        if (to.type == TOKEN_STRING) {
+            printf("String: %s\n", cast(*i8, to.data));
+        };
+        if (to.type == TOKEN_EQUALS) {
+            printf("Equals\n");
+        };
+        if (to.type == TOKEN_PLUS) {
+            printf("Plus\n");
+        };
+        if (to.type == TOKEN_MINUS) {
+            printf("Minus\n");
+        };
+        if (to.type == TOKEN_MUL) {
+            printf("Mul\n");
+        };
+        if (to.type == TOKEN_DIV) {
+            printf("Div\n");
+        };
+        if (to.type == TOKEN_MOD) {
+            printf("Mod\n");
+        };
+        if (to.type == TOKEN_BANG) {
+            printf("Bang\n");
+        };
+        if (to.type == TOKEN_LESS) {
+            printf("Less\n");
+        };
+        if (to.type == TOKEN_GREATER) {
+            printf("Greater\n");
+        };
+        if (to.type == TOKEN_DOT) {
+            printf("Dot\n");
+        };
+        if (to.type == TOKEN_SEMICOLON) {
+            printf("Semicolon\n");
+        };
+        if (to.type == TOKEN_COMMA) {
+            printf("Comma\n");
+        };
+        if (to.type == TOKEN_COLON) {
+            printf("Colon\n");
+        };
+        if (to.type == TOKEN_LPAREN) {
+            printf("LParen\n");
+        };
+        if (to.type == TOKEN_RPAREN) {
+            printf("RParen\n");
+        };
+        if (to.type == TOKEN_LBRACE) {
+            printf("LBrace\n");
+        };
+        if (to.type == TOKEN_RBRACE) {
+            printf("RBrace\n");
+        };
 
         i = i + 1;
     };
@@ -229,119 +362,151 @@ let tokenizer_skip_comments = (t: *tokenizer) => void {
     return;
 };
 
-let tokenizer_next = (t: *tokenizer) => *i8 {
+let tokenizer_next = (t: *tokenizer) => *token {
     tokenizer_skip_whitespace(t);
     tokenizer_skip_comments(t);
     tokenizer_skip_whitespace(t);
 
     if (*t).offset >= (*t).file_size {
-        return "EOF";
+        println("FILE EXCEED");
+        return cast(*token, null);
     };
+
+    let to = cast(*token, arena_alloc((*t).arena, sizeof(token)));
 
     if tokenizer_accept_string(t, "import") {
-        return "import";
+        (*to).type = TOKEN_IMPORT;
+        return to;
     };
     if tokenizer_accept_string(t, "let") {
-        return "let";
+        (*to).type = TOKEN_LET;
+        return to;
     };
     if tokenizer_accept_string(t, "extern") {
-        return "extern";
+        (*to).type = TOKEN_EXTERN;
+        return to;
     };
     if tokenizer_accept_string(t, "if") {
-        return "if";
+        (*to).type = TOKEN_IF;
+        return to;
     };
     if tokenizer_accept_string(t, "while") {
-        return "while";
+        (*to).type = TOKEN_WHILE;
+        return to;
     };
     if tokenizer_accept_string(t, "return") {
-        return "return";
+        (*to).type = TOKEN_RETURN;
+        return to;
     };
     if tokenizer_accept_string(t, "break") {
-        return "break";
+        (*to).type = TOKEN_BREAK;
+        return to;
     };
     if tokenizer_accept_string(t, "true") {
-        return "bool:true";
+        (*to).type = TOKEN_BOOLEAN;
+        let data = cast(*bool, arena_alloc((*t).arena, sizeof(bool)));
+        *data = true;
+        (*to).data = cast(*void, data);
+        return to;
     };
     if tokenizer_accept_string(t, "false") {
-        return "bool:false";
+        let data = cast(*bool, arena_alloc((*t).arena, sizeof(bool)));
+        *data = false;
+        (*to).data = cast(*void, data);
+        return to;
     };
     if tokenizer_accept_string(t, "=>") {
-        return "=>";
+        (*to).type = TOKEN_ARROW;
+        return to;
     };
     if tokenizer_accept_string(t, ";") {
-        return ";";
+        (*to).type = TOKEN_SEMICOLON;
+        return to;
     };
     if tokenizer_accept_string(t, ",") {
-        return ",";
+        (*to).type = TOKEN_COMMA;
+        return to;
     };
     if tokenizer_accept_string(t, ":") {
-        return ":";
+        (*to).type = TOKEN_COLON;
+        return to;
     };
     if tokenizer_accept_string(t, "(") {
-        return "(";
+        (*to).type = TOKEN_LPAREN;
+        return to;
     };
     if tokenizer_accept_string(t, ")") {
-        return ")";
+        (*to).type = TOKEN_RPAREN;
+        return to;
     };
     if tokenizer_accept_string(t, "{") {
-        return "{";
+        (*to).type = TOKEN_LBRACE;
+        return to;
     };
     if tokenizer_accept_string(t, "}") {
-        return "}";
+        (*to).type = TOKEN_RBRACE;
+        return to;
     };
     if tokenizer_accept_string(t, "=") {
-        return "=";
+        (*to).type = TOKEN_EQUALS;
+        return to;
     };
     if tokenizer_accept_string(t, "+") {
-        return "+";
+        (*to).type = TOKEN_PLUS;
+        return to;
     };
     if tokenizer_accept_string(t, "-") {
-        return "-";
+        (*to).type = TOKEN_MINUS;
+        return to;
    };
     if tokenizer_accept_string(t, "*") {
-        return "*";
+        (*to).type = TOKEN_MUL;
+        return to;
     };
     if tokenizer_accept_string(t, "/") {
-        return "/";
+        (*to).type = TOKEN_DIV;
+        return to;
     };
     if tokenizer_accept_string(t, "%") {
-        return "%";
+        (*to).type = TOKEN_MOD;
+        return to;
     };
     if tokenizer_accept_string(t, "!") {
-        return "!";
+        (*to).type = TOKEN_BANG;
+        return to;
     };
     if tokenizer_accept_string(t, "<") {
-        return "<";
+        (*to).type = TOKEN_LESS;
+        return to;
     };
     if tokenizer_accept_string(t, ">") {
-        return ">";
+        (*to).type = TOKEN_GREATER;
+        return to;
     };
     if tokenizer_accept_string(t, ".") {
-        return ".";
+        (*to).type = TOKEN_DOT;
+        return to;
     };
 
     let maybe_int = tokenizer_accept_int_type(t);
     if maybe_int != cast(*i64, null) {
-        let to = cast(*i8, arena_alloc((*t).arena, 1000));
-        sprintf(to, "int:%d", *maybe_int);
-
+        (*to).type = TOKEN_NUMBER;
+        (*to).data = cast(*void, maybe_int);
         return to;
     };
 
     let maybe_char = tokenizer_accept_char_type(t);
     if maybe_char != cast(*i8, null) {
-        let to = cast(*i8, arena_alloc((*t).arena, 1000));
-        sprintf(to, "char:%d", *maybe_char);
-
+        (*to).type = TOKEN_CHAR;
+        (*to).data = cast(*void, maybe_char);
         return to;
     };
 
     let maybe_string = tokenizer_accept_string_type(t);
     if maybe_string != cast(*i8, null) {
-        let to = cast(*i8, arena_alloc((*t).arena, 1000));
-        sprintf(to, "string:%s", maybe_string);
-
+        (*to).type = TOKEN_STRING;
+        (*to).data = cast(*void, maybe_string);
         return to;
     };
 
@@ -355,19 +520,22 @@ let tokenizer_next = (t: *tokenizer) => *i8 {
         return true;
     });
     if strlen(string) == 0 {
-        return cast(*i8, null);
+        println("NO IDENT!");
+        return cast(*token, null);
     };
 
-    let to = cast(*i8, arena_alloc((*t).arena, 100));
-    sprintf(to, "identifier:%s", string);
-
+    (*to).type = TOKEN_IDENTIFIER;
+    (*to).data = cast(*void, string);
+
     return to;
 };
 
 let tokenizer_init = (alloc: *arena, filename: *i8) => i64 {
     let t = cast(*tokenizer, arena_alloc(alloc, sizeof(tokenizer)));
 
     (*t).arena = alloc;
-    (*t).tokens = cast(*i8, arena_alloc((*t).arena, 100000));
+    (*t).tokens = cast(*i8, arena_alloc((*t).arena, 100));
+    (*t).tokens_len = 0;
+    (*t).offset = 0;
 
     read_file(t, filename);
@@ -375,14 +543,10 @@ let tokenizer_init = (alloc: *arena, filename: *i8) => i64 {
 
     println("%s", (*t).buf);
 
-
     while true {
         let tk = tokenizer_next(t);
-        if tk == cast(*i8, null) {
+        if tk == cast(*token, null) {
             println("NULL TOKEN!");
-            return 1;
-        };
-        if strcmp(tk, "EOF") {
             break;
         };
         add_token(t, tk);
diff --git a/src/codegen.zig b/src/codegen.zig
index 9be065e..152572f 100644
--- a/src/codegen.zig
+++ b/src/codegen.zig
@@ -521,6 +521,7 @@ pub const CodeGen = struct {
                 .type = try self.create_node(.{
                     .TYPE = .{ .SIMPLE_TYPE = .{
                         .name = "void",
+                        .underlying_type = null,
                     } },
                 }),
             },
@@ -532,6 +533,7 @@ pub const CodeGen = struct {
             .TYPE = .{
                 .SIMPLE_TYPE = .{
                     .name = "i64",
+                    .underlying_type = null,
                },
             },
         }));
@@ -546,6 +548,7 @@ pub const CodeGen = struct {
             .TYPE = .{
                 .SIMPLE_TYPE = .{
                     .name = "bool",
+                    .underlying_type = null,
                 },
             },
         }));
@@ -555,6 +558,7 @@ pub const CodeGen = struct {
             .TYPE = .{
                 .SIMPLE_TYPE = .{
                     .name = "i8",
+                    .underlying_type = null,
                 },
             },
         }));
@@ -573,6 +577,7 @@ pub const CodeGen = struct {
                 .type = try self.create_node(.{
                     .TYPE = .{ .SIMPLE_TYPE = .{
                         .name = "i8",
+                        .underlying_type = null,
                     } },
                 }),
             },
@@ -605,6 +610,7 @@ pub const CodeGen = struct {
         var result: llvm.LLVMValueRef = undefined;
         var node_type: *parser.Node = try self.create_node(.{ .TYPE = .{ .SIMPLE_TYPE = .{
             .name = "i64",
+            .underlying_type = null,
         } } });
 
         if (exp.addition) {
@@ -655,6 +661,7 @@ pub const CodeGen = struct {
             .TYPE = .{
                 .SIMPLE_TYPE = .{
                     .name = "bool",
+                    .underlying_type = null,
                 },
             },
         });
@@ -665,6 +672,7 @@ pub const CodeGen = struct {
             .TYPE = .{
                 .SIMPLE_TYPE = .{
                     .name = "i64",
+                    .underlying_type = null,
                 },
             },
         });
@@ -699,6 +707,7 @@ pub const CodeGen = struct {
             .TYPE = .{
                 .SIMPLE_TYPE = .{
                     .name = "bool",
+                    .underlying_type = null,
                 },
             },
         }));
@@ -725,6 +734,10 @@ pub const CodeGen = struct {
             });
         },
         .STRUCT_TYPE => |t| {
+            const simple_type_node = try self.create_node(.{ .TYPE = .{ .SIMPLE_TYPE = .{
+                .name = name.?,
+                .underlying_type = expression,
+            } } });
             const struct_type = llvm.LLVMStructCreateNamed(self.llvm_context, try std.fmt.allocPrintZ(self.arena, "{s}", .{name.?}));
 
             // Needed for recursive structs
@@ -734,7 +747,7 @@ pub const CodeGen = struct {
                     .type = struct_type,
                     .stack_level = null,
                     .node = expression,
-                    .node_type = expression,
+                    .node_type = simple_type_node,
                 }));
             }
 
@@ -749,7 +762,7 @@ pub const CodeGen = struct {
                 .type = struct_type,
                 .stack_level = null,
                 .node = expression,
-                .node_type = expression,
+                .node_type = simple_type_node,
             });
         },
         else => unreachable,
@@ -779,6 +792,7 @@ pub const CodeGen = struct {
             .TYPE = .{
                 .SIMPLE_TYPE = .{
                     .name = "i64",
+                    .underlying_type = null,
                 },
             },
         }),
@@ -847,7 +861,7 @@ pub const CodeGen = struct {
             unreachable;
         }
        var fieldIndex: ?usize = null;
-        for (0.., typ.TYPE.STRUCT_TYPE.fields) |i, field| {
+        for (0.., typ.TYPE.SIMPLE_TYPE.underlying_type.?.TYPE.STRUCT_TYPE.fields) |i, field| {
            if (std.mem.eql(u8, name, field.PRIMARY_EXPRESSION.IDENTIFIER.name)) {
                fieldIndex = i;
                break;
@@ -861,7 +875,7 @@ pub const CodeGen = struct {
 
        return .{
            .value = llvm.LLVMBuildGEP2(self.builder, try self.get_llvm_type(typ), ptr.value, indices, indices.len, try std.fmt.allocPrintZ(self.arena, "{s}", .{name})),
-            .type = typ.TYPE.STRUCT_TYPE.fields[fieldIndex.?].PRIMARY_EXPRESSION.IDENTIFIER.type.?,
+            .type = typ.TYPE.SIMPLE_TYPE.underlying_type.?.TYPE.STRUCT_TYPE.fields[fieldIndex.?].PRIMARY_EXPRESSION.IDENTIFIER.type.?,
        };
    }
diff --git a/src/parser.zig b/src/parser.zig
index 5db8fa8..e92ed51 100644
--- a/src/parser.zig
+++ b/src/parser.zig
@@ -92,6 +92,7 @@ pub const Node = union(enum) {
     TYPE: union(enum) {
         SIMPLE_TYPE: struct {
             name: []const u8,
+            underlying_type: ?*Node,
         },
         FUNCTION_TYPE: struct {
             parameters: []*Node,
@@ -863,6 +864,7 @@ pub const Parser = struct {
             .TYPE = .{
                 .SIMPLE_TYPE = .{
                     .name = try self.arena.dupe(u8, ident),
+                    .underlying_type = null,
                 },
             },
         });
```