extern strlen = (*i8) => i64;
extern memcpy = (*void, *void, i64) => void;
extern sprintf = (*i8, *i8, varargs) => void;
extern atoi = (*i8) => i64;

import "!stdlib.pry";
import "!mem.pry";

/* Keywords */
let TOKEN_IMPORT = 1;
let TOKEN_LET = 2;
let TOKEN_EXTERN = 3;
let TOKEN_IF = 4;
let TOKEN_WHILE = 5;
let TOKEN_RETURN = 6;
let TOKEN_BREAK = 7;
let TOKEN_CONTINUE = 8;
let TOKEN_ARROW = 9;
let TOKEN_STRUCT = 10;
let TOKEN_TYPE = 34;

/* Identifiers */
let TOKEN_IDENTIFIER = 11;

/* Literals */
let TOKEN_NUMBER = 12;
let TOKEN_BOOLEAN = 13;
let TOKEN_NULL = 14;
let TOKEN_CHAR = 15;
let TOKEN_STRING = 16;

/* Operators */
let TOKEN_EQUALS = 17;
let TOKEN_PLUS = 18;
let TOKEN_MINUS = 19;
let TOKEN_MUL = 20;
let TOKEN_DIV = 21;
let TOKEN_MOD = 22;
let TOKEN_BANG = 23;
let TOKEN_LESS = 24;
let TOKEN_GREATER = 25;
let TOKEN_DOT = 26;

/* Punctuation */
let TOKEN_SEMICOLON = 27;
let TOKEN_COMMA = 28;
let TOKEN_COLON = 29;
let TOKEN_LPAREN = 30;
let TOKEN_RPAREN = 31;
let TOKEN_LBRACE = 32;
let TOKEN_RBRACE = 33;

/* A single token: its kind plus an optional payload (identifier text, number value, etc.). */
let token = struct {
    type: i64,
    data: *void,
};

/* Tokenizer state: the source buffer, a cursor into it, and the arena that all
   tokens and token payloads are allocated from. */
let tokenizer = struct {
    buf: *i8,
    buf_len: i64,
    offset: i64,
    arena: *arena,
};

/* Debug helper: prints every token in the array, one per line. */
let print_tokens = (ts: *token, ts_len: i64) => i64 {
    let i = 0;
    while i < ts_len {
        let to = (*(ts + cast(*token, i)));
        if (to.type == TOKEN_IMPORT) { printf("Import\n"); };
        if (to.type == TOKEN_LET) { printf("Let\n"); };
        if (to.type == TOKEN_EXTERN) { printf("Extern\n"); };
        if (to.type == TOKEN_IF) { printf("If\n"); };
        if (to.type == TOKEN_WHILE) { printf("While\n"); };
        if (to.type == TOKEN_RETURN) { printf("Return\n"); };
        if (to.type == TOKEN_BREAK) { printf("Break\n"); };
        if (to.type == TOKEN_CONTINUE) { printf("Continue\n"); };
        if (to.type == TOKEN_ARROW) { printf("Arrow\n"); };
        if (to.type == TOKEN_STRUCT) { printf("Struct\n"); };
        if (to.type == TOKEN_TYPE) { printf("Type\n"); };
        if (to.type == TOKEN_IDENTIFIER) { printf("Identifier: %s\n", cast(*i8, to.data)); };
        if (to.type == TOKEN_NUMBER) { printf("Number: %d\n", *cast(*i64, to.data)); };
        if (to.type == TOKEN_BOOLEAN) { printf("Boolean: %d\n", *cast(*bool, to.data)); };
        if (to.type == TOKEN_NULL) { printf("Null\n"); };
        if (to.type == TOKEN_CHAR) { printf("Char: %c\n", *cast(*i8, to.data)); };
        if (to.type == TOKEN_STRING) { printf("String: %s\n", cast(*i8, to.data)); };
        if (to.type == TOKEN_EQUALS) { printf("Equals\n"); };
        if (to.type == TOKEN_PLUS) { printf("Plus\n"); };
        if (to.type == TOKEN_MINUS) { printf("Minus\n"); };
        if (to.type == TOKEN_MUL) { printf("Mul\n"); };
        if (to.type == TOKEN_DIV) { printf("Div\n"); };
        if (to.type == TOKEN_MOD) { printf("Mod\n"); };
        if (to.type == TOKEN_BANG) { printf("Bang\n"); };
        if (to.type == TOKEN_LESS) { printf("Less\n"); };
        if (to.type == TOKEN_GREATER) { printf("Greater\n"); };
        if (to.type == TOKEN_DOT) { printf("Dot\n"); };
        if (to.type == TOKEN_SEMICOLON) { printf("Semicolon\n"); };
        if (to.type == TOKEN_COMMA) { printf("Comma\n"); };
        if (to.type == TOKEN_COLON) { printf("Colon\n"); };
        if (to.type == TOKEN_LPAREN) { printf("LParen\n"); };
        if (to.type == TOKEN_RPAREN) { printf("RParen\n"); };
        if (to.type == TOKEN_LBRACE) { printf("LBrace\n"); };
        if (to.type == TOKEN_RBRACE) { printf("RBrace\n"); };
        i = i + 1;
    };
    return 0;
};

/* Advances the cursor past any whitespace. */
let tokenizer_skip_whitespace = (t: *tokenizer) => void {
    while true {
        if (*t).offset >= (*t).buf_len { return; };
        let c = (*((*t).buf + cast(*i8, (*t).offset)));
        if !iswhitespace(c) { return; };
        (*t).offset = (*t).offset + 1;
    };
    return;
};

/* If the buffer at the current offset starts with `str`, consumes it and returns
   true; otherwise leaves the offset untouched and returns false. */
let tokenizer_accept_string = (t: *tokenizer, str: *i8) => bool {
    let str_len = strlen(str);
    if (*t).offset + str_len > (*t).buf_len { return false; };
    let s = cast(*i8, arena_alloc((*t).arena, 1000));
    memcpy(cast(*void, s), cast(*void, (*t).buf + cast(*i8, (*t).offset)), str_len);
    if strcmp(s, str) {
        (*t).offset = (*t).offset + str_len;
        return true;
    };
    return false;
};

/* Copies characters into a fresh arena buffer until `condition` holds for the next
   character, translating backslash escapes (\n, \t, \r, \0, \\) along the way. */
let tokenizer_consume_until_condition = (t: *tokenizer, condition: (i8) => bool) => *i8 {
    let start = (*t).offset;
    let res = cast(*i8, arena_alloc((*t).arena, 1000));
    while true {
        if (*t).offset >= (*t).buf_len { return res; };
        let c = (*((*t).buf + cast(*i8, (*t).offset)));
        let offset = (*t).offset;
        if c == '\\' {
            let next_c = (*((*t).buf + cast(*i8, offset + 1)));
            let any = false;
            if next_c == 'n' { (*(res + cast(*i8, offset - start))) = '\n'; any = true; };
            if next_c == 't' { (*(res + cast(*i8, offset - start))) = '\t'; any = true; };
            if next_c == 'r' { (*(res + cast(*i8, offset - start))) = '\r'; any = true; };
            if next_c == '0' { (*(res + cast(*i8, offset - start))) = '\0'; any = true; };
            if next_c == '\\' { (*(res + cast(*i8, offset - start))) = '\\'; any = true; };
            if !any { (*(res + cast(*i8, offset - start))) = next_c; };
            offset = offset + 1;
            offset = offset + 1;
            (*t).offset = offset;
            continue;
        };
        if condition(c) { return res; };
        (*(res + cast(*i8, offset - start))) = c;
        (*(res + cast(*i8, offset - start + 1))) = '\0';
        offset = offset + 1;
        (*t).offset = offset;
    };
    return cast(*i8, null);
};

/* Tries to read an integer literal; returns null if there are no digits here. */
let tokenizer_accept_int_type = (t: *tokenizer) => *i64 {
    let string = tokenizer_consume_until_condition(t, (c: i8) => bool { return !isdigit(c); });
    if string == cast(*i8, null) { return cast(*i64, null); };
    if strlen(string) == 0 { return cast(*i64, null); };
    let x = cast(*i64, arena_alloc((*t).arena, sizeof(i64)));
    *x = atoi(string);
    return x;
};

/* Tries to read a character literal delimited by single quotes; restores the
   offset and returns null on failure. */
let tokenizer_accept_char_type = (t: *tokenizer) => *i8 {
    let prev_offset = (*t).offset;
    if !tokenizer_accept_string(t, "'") { (*t).offset = prev_offset; return cast(*i8, null); };
    let string = tokenizer_consume_until_condition(t, (c: i8) => bool { return c == '\''; });
    if !tokenizer_accept_string(t, "'") { (*t).offset = prev_offset; return cast(*i8, null); };
    return string;
};

/* Tries to read a string literal delimited by double quotes; restores the
   offset and returns null on failure. */
let tokenizer_accept_string_type = (t: *tokenizer) => *i8 {
    let prev_offset = (*t).offset;
    if !tokenizer_accept_string(t, "\"") { (*t).offset = prev_offset; return cast(*i8, null); };
    let string = tokenizer_consume_until_condition(t, (c: i8) => bool { return c == '"'; });
    if !tokenizer_accept_string(t, "\"") { (*t).offset = prev_offset; return cast(*i8, null); };
    return string;
};

/* Skips a block comment if one starts at the current offset. */
let tokenizer_skip_comments = (t: *tokenizer) => void {
    if !tokenizer_accept_string(t, "/*") { return; };
    while !tokenizer_accept_string(t, "*/") { (*t).offset = (*t).offset + 1; };
    return;
};

/* Produces the next token, or null at end of input. Keywords and operators are
   matched first, then number/char/string literals, then identifiers. */
let tokenizer_next = (t: *tokenizer) => *token {
    tokenizer_skip_whitespace(t);
    tokenizer_skip_comments(t);
    tokenizer_skip_whitespace(t);
    if (*t).offset >= (*t).buf_len { return cast(*token, null); };
    let to = cast(*token, arena_alloc((*t).arena, sizeof(token)));
    if tokenizer_accept_string(t, "import") { (*to).type = TOKEN_IMPORT; return to; };
    if tokenizer_accept_string(t, "let") { (*to).type = TOKEN_LET; return to; };
    if tokenizer_accept_string(t, "extern") { (*to).type = TOKEN_EXTERN; return to; };
    if tokenizer_accept_string(t, "if") { (*to).type = TOKEN_IF; return to; };
    if tokenizer_accept_string(t, "while") { (*to).type = TOKEN_WHILE; return to; };
    if tokenizer_accept_string(t, "return") { (*to).type = TOKEN_RETURN; return to; };
    if tokenizer_accept_string(t, "break") { (*to).type = TOKEN_BREAK; return to; };
    if tokenizer_accept_string(t, "continue") { (*to).type = TOKEN_CONTINUE; return to; };
    if tokenizer_accept_string(t, "true") {
        (*to).type = TOKEN_BOOLEAN;
        let data = cast(*bool, arena_alloc((*t).arena, sizeof(bool)));
        *data = true;
        (*to).data = cast(*void, data);
        return to;
    };
    if tokenizer_accept_string(t, "false") {
        (*to).type = TOKEN_BOOLEAN;
        let data = cast(*bool, arena_alloc((*t).arena, sizeof(bool)));
        *data = false;
        (*to).data = cast(*void, data);
        return to;
    };
    if tokenizer_accept_string(t, "null") { (*to).type = TOKEN_NULL; return to; };
    if tokenizer_accept_string(t, "struct") { (*to).type = TOKEN_STRUCT; return to; };
    if tokenizer_accept_string(t, "newtype") { (*to).type = TOKEN_TYPE; return to; };
    if tokenizer_accept_string(t, "=>") { (*to).type = TOKEN_ARROW; return to; };
    if tokenizer_accept_string(t, ";") { (*to).type = TOKEN_SEMICOLON; return to; };
    if tokenizer_accept_string(t, ",") { (*to).type = TOKEN_COMMA; return to; };
    if tokenizer_accept_string(t, ":") { (*to).type = TOKEN_COLON; return to; };
    if tokenizer_accept_string(t, "(") { (*to).type = TOKEN_LPAREN; return to; };
    if tokenizer_accept_string(t, ")") { (*to).type = TOKEN_RPAREN; return to; };
    if tokenizer_accept_string(t, "{") { (*to).type = TOKEN_LBRACE; return to; };
    if tokenizer_accept_string(t, "}") { (*to).type = TOKEN_RBRACE; return to; };
    if tokenizer_accept_string(t, "=") { (*to).type = TOKEN_EQUALS; return to; };
    if tokenizer_accept_string(t, "+") { (*to).type = TOKEN_PLUS; return to; };
    if tokenizer_accept_string(t, "-") { (*to).type = TOKEN_MINUS; return to; };
    if tokenizer_accept_string(t, "*") { (*to).type = TOKEN_MUL; return to; };
    if tokenizer_accept_string(t, "/") { (*to).type = TOKEN_DIV; return to; };
    if tokenizer_accept_string(t, "%") { (*to).type = TOKEN_MOD; return to; };
    if tokenizer_accept_string(t, "!") { (*to).type = TOKEN_BANG; return to; };
    if tokenizer_accept_string(t, "<") { (*to).type = TOKEN_LESS; return to; };
    if tokenizer_accept_string(t, ">") { (*to).type = TOKEN_GREATER; return to; };
    if tokenizer_accept_string(t, ".") { (*to).type = TOKEN_DOT; return to; };
    let maybe_int = tokenizer_accept_int_type(t);
    if maybe_int != cast(*i64, null) {
        (*to).type = TOKEN_NUMBER;
        (*to).data = cast(*void, maybe_int);
        return to;
    };
    let maybe_char = tokenizer_accept_char_type(t);
    if maybe_char != cast(*i8, null) {
        (*to).type = TOKEN_CHAR;
        (*to).data = cast(*void, maybe_char);
        return to;
    };
    let maybe_string = tokenizer_accept_string_type(t);
    if maybe_string != cast(*i8, null) {
        (*to).type = TOKEN_STRING;
        (*to).data = cast(*void, maybe_string);
        return to;
    };
    let string = tokenizer_consume_until_condition(t, (c: i8) => bool {
        if isalphanum(c) { return false; };
        if c == '_' { return false; };
        return true;
    });
    if strlen(string) == 0 { printf("NO IDENT!\n"); return cast(*token, null); };
    (*to).type = TOKEN_IDENTIFIER;
    (*to).data = cast(*void, string);
    return to;
};

/* Allocates a tokenizer over the given file contents. */
let tokenizer_init = (alloc: *arena, file: slice) => *tokenizer {
    let t = cast(*tokenizer, arena_alloc(alloc, sizeof(tokenizer)));
    (*t).arena = alloc;
    (*t).offset = 0;
    (*t).buf = cast(*i8, file.data);
    (*t).buf_len = file.data_len;
    printf("File size: %d\n", (*t).buf_len);
    printf("%s\n", (*t).buf);
    return t;
};

/* Runs the tokenizer to completion and returns the tokens as a slice. */
let tokenizer_tokenize = (t: *tokenizer) => slice {
    let tokens = cast(*token, arena_alloc((*t).arena, sizeof(token) * 40000)); /* why does it not care about type here */
    let tokens_len = 0;
    while true {
        let tk = tokenizer_next(t);
        if tk == cast(*token, null) { break; };
        printf("Add token: %d\n", (*tk).type);
        (*(tokens + cast(*token, tokens_len))) = *tk;
        tokens_len = tokens_len + 1;
    };
    printf("PRINT TOKENS: %d\n", tokens_len);
    print_tokens(tokens, tokens_len);
    let res = slice{};
    res.data = cast(*void, tokens);
    res.data_len = tokens_len;
    return res;
};
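
/*
 * Usage sketch, not part of the tokenizer itself: one way this module might be
 * driven by a caller that already has an arena from !mem.pry. The function name
 * `tokenizer_example` and the sample source string are hypothetical; every call
 * it makes is either defined above or declared extern at the top of this file.
 */
let tokenizer_example = (alloc: *arena) => i64 {
    /* Build a slice over an in-memory source string instead of a real file. */
    let src = "let x = 1 + 2;";
    let file = slice{};
    file.data = cast(*void, src);
    file.data_len = strlen(src);

    /* Tokenize and inspect the result; the returned slice wraps a *token array. */
    let t = tokenizer_init(alloc, file);
    let tokens = tokenizer_tokenize(t);
    printf("Got %d tokens\n", tokens.data_len);
    print_tokens(cast(*token, tokens.data), tokens.data_len);
    return 0;
};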