diff options
Diffstat (limited to 'src/bootstrap/tokenizer.pry')
| -rw-r--r-- | src/bootstrap/tokenizer.pry | 553 |
1 files changed, 553 insertions, 0 deletions
diff --git a/src/bootstrap/tokenizer.pry b/src/bootstrap/tokenizer.pry new file mode 100644 index 0000000..a2cafb4 --- /dev/null +++ b/src/bootstrap/tokenizer.pry @@ -0,0 +1,553 @@ +extern strlen = (*i8) => i64; +extern memcpy = (*void, *void, i64) => void; +extern sprintf = (*i8, *i8, varargs) => void; +extern atoi = (*i8) => i64; + +import "!stdlib.pry"; +import "!mem.pry"; + +/* Keywords */ +let TOKEN_IMPORT = 1; +let TOKEN_LET = 2; +let TOKEN_EXTERN = 3; +let TOKEN_IF = 4; +let TOKEN_WHILE = 5; +let TOKEN_RETURN = 6; +let TOKEN_BREAK = 7; +let TOKEN_CONTINUE = 8; +let TOKEN_ARROW = 9; +let TOKEN_STRUCT = 10; +let TOKEN_TYPE = 34; + +/* Identifiers */ +let TOKEN_IDENTIFIER = 11; + +/* Literals */ +let TOKEN_NUMBER = 12; +let TOKEN_BOOLEAN = 13; +let TOKEN_NULL = 14; +let TOKEN_CHAR = 15; +let TOKEN_STRING = 16; + +/* Operators */ +let TOKEN_EQUALS = 17; +let TOKEN_PLUS = 18; +let TOKEN_MINUS = 19; +let TOKEN_MUL = 20; +let TOKEN_DIV = 21; +let TOKEN_MOD = 22; +let TOKEN_BANG = 23; +let TOKEN_LESS = 24; +let TOKEN_GREATER = 25; +let TOKEN_DOT = 26; + +/* Punctuation */ +let TOKEN_SEMICOLON = 27; +let TOKEN_COMMA = 28; +let TOKEN_COLON = 29; +let TOKEN_LPAREN = 30; +let TOKEN_RPAREN = 31; +let TOKEN_LBRACE = 32; +let TOKEN_RBRACE = 33; + +let token = struct { + type: i64, + data: *void, +}; + +let tokenizer = struct { + buf: *i8, + buf_len: i64, + offset: i64, + + arena: *arena, +}; + +let print_tokens = (ts: *token, ts_len: i64) => i64 { + let i = 0; + while i < ts_len { + let to = (*(ts + cast(*token, i))); + + if (to.type == TOKEN_IMPORT) { + printf("Import\n"); + }; + if (to.type == TOKEN_LET) { + printf("Let\n"); + }; + if (to.type == TOKEN_EXTERN) { + printf("Extern\n"); + }; + if (to.type == TOKEN_IF) { + printf("If\n"); + }; + if (to.type == TOKEN_WHILE) { + printf("While\n"); + }; + if (to.type == TOKEN_RETURN) { + printf("Return\n"); + }; + if (to.type == TOKEN_BREAK) { + printf("Break\n"); + }; + if (to.type == TOKEN_CONTINUE) { + printf("Continue\n"); + }; + if (to.type == TOKEN_ARROW) { + printf("Arrow\n"); + }; + if (to.type == TOKEN_STRUCT) { + printf("Struct\n"); + }; + if (to.type == TOKEN_TYPE) { + printf("Type\n"); + }; + if (to.type == TOKEN_IDENTIFIER) { + printf("Identifier: %s\n", cast(*i8, to.data)); + }; + if (to.type == TOKEN_NUMBER) { + printf("Number: %d\n", *cast(*i64, to.data)); + }; + if (to.type == TOKEN_BOOLEAN) { + printf("Boolean: %d\n", *cast(*bool, to.data)); + }; + if (to.type == TOKEN_NULL) { + printf("Null\n"); + }; + if (to.type == TOKEN_CHAR) { + printf("Char: %c\n", *cast(*i8, to.data)); + }; + if (to.type == TOKEN_STRING) { + printf("String: %s\n", cast(*i8, to.data)); + }; + if (to.type == TOKEN_EQUALS) { + printf("Equals\n"); + }; + if (to.type == TOKEN_PLUS) { + printf("Plus\n"); + }; + if (to.type == TOKEN_MINUS) { + printf("Minus\n"); + }; + if (to.type == TOKEN_MUL) { + printf("Mul\n"); + }; + if (to.type == TOKEN_DIV) { + printf("Div\n"); + }; + if (to.type == TOKEN_MOD) { + printf("Mod\n"); + }; + if (to.type == TOKEN_BANG) { + printf("Bang\n"); + }; + if (to.type == TOKEN_LESS) { + printf("Less\n"); + }; + if (to.type == TOKEN_GREATER) { + printf("Greater\n"); + }; + if (to.type == TOKEN_DOT) { + printf("Dot\n"); + }; + if (to.type == TOKEN_SEMICOLON) { + printf("Semicolon\n"); + }; + if (to.type == TOKEN_COMMA) { + printf("Comma\n"); + }; + if (to.type == TOKEN_COLON) { + printf("Colon\n"); + }; + if (to.type == TOKEN_LPAREN) { + printf("LParen\n"); + }; + if (to.type == TOKEN_RPAREN) { + printf("RParen\n"); + }; + if (to.type == TOKEN_LBRACE) { + printf("LBrace\n"); + }; + if (to.type == TOKEN_RBRACE) { + printf("RBrace\n"); + }; + + i = i + 1; + }; + + return 0; +}; + +let tokenizer_skip_whitespace = (t: *tokenizer) => void { + while true { + if (*t).offset >= (*t).buf_len { return; }; + let c = (*((*t).buf + cast(*i8, (*t).offset))); + if !iswhitespace(c) { + return; + }; + (*t).offset = (*t).offset + 1; + }; + + return; +}; + +let tokenizer_accept_string = (t: *tokenizer, str: *i8) => bool { + let str_len = strlen(str); + if (*t).offset + str_len > (*t).buf_len { return false; }; + + let s = cast(*i8, arena_alloc((*t).arena, 1000)); + memcpy(cast(*void, s), cast(*void, (*t).buf + cast(*i8, (*t).offset)), str_len); + + if strcmp(s, str) { + (*t).offset = (*t).offset + str_len; + return true; + }; + + return false; +}; + +let tokenizer_consume_until_condition = (t: *tokenizer, condition: (i8) => bool) => *i8 { + let start = (*t).offset; + let res = cast(*i8, arena_alloc((*t).arena, 1000)); + + while true { + if (*t).offset >= (*t).buf_len { + return res; + }; + + let c = (*((*t).buf + cast(*i8, (*t).offset))); + + let offset = (*t).offset; + if c == '\\' { + let next_c = (*((*t).buf + cast(*i8, offset + 1))); + + let any = false; + if next_c == 'n' { + (*(res + cast(*i8, offset - start))) = '\n'; + any = true; + }; + if next_c == 't' { + (*(res + cast(*i8, offset - start))) = '\t'; + any = true; + }; + if next_c == 'r' { + (*(res + cast(*i8, offset - start))) = '\r'; + any = true; + }; + if next_c == '0' { + (*(res + cast(*i8, offset - start))) = '\0'; + any = true; + }; + if next_c == '\\' { + (*(res + cast(*i8, offset - start))) = '\\'; + any = true; + }; + if !any { + (*(res + cast(*i8, offset - start))) = next_c; + }; + + offset = offset + 1; + offset = offset + 1; + (*t).offset = offset; + + continue; + }; + + if condition(c) { + return res; + }; + + (*(res + cast(*i8, offset - start))) = c; + (*(res + cast(*i8, offset - start + 1))) = '\0'; + + offset = offset + 1; + (*t).offset = offset; + }; + + return cast(*i8, null); +}; + +let tokenizer_accept_int_type = (t: *tokenizer) => *i64 { + let string = tokenizer_consume_until_condition(t, (c: i8) => bool { + return !isdigit(c); + }); + if string == cast(*i8, null) { + return cast(*i64, null); + }; + if strlen(string) == 0 { + return cast(*i64, null); + }; + let x = cast(*i64, arena_alloc((*t).arena, sizeof(i64))); + *x = atoi(string); + return x; +}; + +let tokenizer_accept_char_type = (t: *tokenizer) => *i8 { + let prev_offset = (*t).offset; + if !tokenizer_accept_string(t, "'") { + (*t).offset = prev_offset; + return cast(*i8, null); + }; + + let string = tokenizer_consume_until_condition(t, (c: i8) => bool { + return c == '\''; + }); + + if !tokenizer_accept_string(t, "'") { + (*t).offset = prev_offset; + return cast(*i8, null); + }; + + return string; +}; + +let tokenizer_accept_string_type = (t: *tokenizer) => *i8 { + let prev_offset = (*t).offset; + if !tokenizer_accept_string(t, "\"") { + (*t).offset = prev_offset; + return cast(*i8, null); + }; + + let string = tokenizer_consume_until_condition(t, (c: i8) => bool { + return c == '"'; + }); + + if !tokenizer_accept_string(t, "\"") { + (*t).offset = prev_offset; + return cast(*i8, null); + }; + + return string; +}; + +let tokenizer_skip_comments = (t: *tokenizer) => void { + if !tokenizer_accept_string(t, "/*") { return; }; + + while !tokenizer_accept_string(t, "*/") { + (*t).offset = (*t).offset + 1; + }; + + return; +}; + +let tokenizer_next = (t: *tokenizer) => *token { + tokenizer_skip_whitespace(t); + tokenizer_skip_comments(t); + tokenizer_skip_whitespace(t); + + if (*t).offset >= (*t).buf_len { + return cast(*token, null); + }; + + let to = cast(*token, arena_alloc((*t).arena, sizeof(token))); + + if tokenizer_accept_string(t, "import") { + (*to).type = TOKEN_IMPORT; + return to; + }; + if tokenizer_accept_string(t, "let") { + (*to).type = TOKEN_LET; + return to; + }; + if tokenizer_accept_string(t, "extern") { + (*to).type = TOKEN_EXTERN; + return to; + }; + if tokenizer_accept_string(t, "if") { + (*to).type = TOKEN_IF; + return to; + }; + if tokenizer_accept_string(t, "while") { + (*to).type = TOKEN_WHILE; + return to; + }; + if tokenizer_accept_string(t, "return") { + (*to).type = TOKEN_RETURN; + return to; + }; + if tokenizer_accept_string(t, "break") { + (*to).type = TOKEN_BREAK; + return to; + }; + if tokenizer_accept_string(t, "continue") { + (*to).type = TOKEN_CONTINUE; + return to; + }; + if tokenizer_accept_string(t, "true") { + (*to).type = TOKEN_BOOLEAN; + let data = cast(*bool, arena_alloc((*t).arena, sizeof(bool))); + *data = true; + (*to).data = cast(*void, data); + return to; + }; + if tokenizer_accept_string(t, "false") { + (*to).type = TOKEN_BOOLEAN; + let data = cast(*bool, arena_alloc((*t).arena, sizeof(bool))); + *data = false; + (*to).data = cast(*void, data); + return to; + }; + if tokenizer_accept_string(t, "null") { + (*to).type = TOKEN_NULL; + return to; + }; + if tokenizer_accept_string(t, "struct") { + (*to).type = TOKEN_STRUCT; + return to; + }; + if tokenizer_accept_string(t, "newtype") { + (*to).type = TOKEN_TYPE; + return to; + }; + + if tokenizer_accept_string(t, "=>") { + (*to).type = TOKEN_ARROW; + return to; + }; + if tokenizer_accept_string(t, ";") { + (*to).type = TOKEN_SEMICOLON; + return to; + }; + if tokenizer_accept_string(t, ",") { + (*to).type = TOKEN_COMMA; + return to; + }; + if tokenizer_accept_string(t, ":") { + (*to).type = TOKEN_COLON; + return to; + }; + if tokenizer_accept_string(t, "(") { + (*to).type = TOKEN_LPAREN; + return to; + }; + if tokenizer_accept_string(t, ")") { + (*to).type = TOKEN_RPAREN; + return to; + }; + if tokenizer_accept_string(t, "{") { + (*to).type = TOKEN_LBRACE; + return to; + }; + if tokenizer_accept_string(t, "}") { + (*to).type = TOKEN_RBRACE; + return to; + }; + if tokenizer_accept_string(t, "=") { + (*to).type = TOKEN_EQUALS; + return to; + }; + if tokenizer_accept_string(t, "+") { + (*to).type = TOKEN_PLUS; + return to; + }; + if tokenizer_accept_string(t, "-") { + (*to).type = TOKEN_MINUS; + return to; + }; + if tokenizer_accept_string(t, "*") { + (*to).type = TOKEN_MUL; + return to; + }; + if tokenizer_accept_string(t, "/") { + (*to).type = TOKEN_DIV; + return to; + }; + if tokenizer_accept_string(t, "%") { + (*to).type = TOKEN_MOD; + return to; + }; + if tokenizer_accept_string(t, "!") { + (*to).type = TOKEN_BANG; + return to; + }; + if tokenizer_accept_string(t, "<") { + (*to).type = TOKEN_LESS; + return to; + }; + if tokenizer_accept_string(t, ">") { + (*to).type = TOKEN_GREATER; + return to; + }; + if tokenizer_accept_string(t, ".") { + (*to).type = TOKEN_DOT; + return to; + }; + + let maybe_int = tokenizer_accept_int_type(t); + if maybe_int != cast(*i64, null) { + (*to).type = TOKEN_NUMBER; + (*to).data = cast(*void, maybe_int); + return to; + }; + + let maybe_char = tokenizer_accept_char_type(t); + if maybe_char != cast(*i8, null) { + (*to).type = TOKEN_CHAR; + (*to).data = cast(*void, maybe_char); + return to; + }; + + let maybe_string = tokenizer_accept_string_type(t); + if maybe_string != cast(*i8, null) { + (*to).type = TOKEN_STRING; + (*to).data = cast(*void, maybe_string); + return to; + }; + + let string = tokenizer_consume_until_condition(t, (c: i8) => bool { + if isalphanum(c) { + return false; + }; + if c == '_' { + return false; + }; + return true; + }); + if strlen(string) == 0 { + printf("NO IDENT!\n"); + return cast(*token, null); + }; + + (*to).type = TOKEN_IDENTIFIER; + (*to).data = cast(*void, string); + + return to; +}; + +let tokenizer_init = (alloc: *arena, file: slice) => *tokenizer { + let t = cast(*tokenizer, arena_alloc(alloc, sizeof(tokenizer))); + (*t).arena = alloc; + (*t).offset = 0; + (*t).buf = cast(*i8, file.data); + (*t).buf_len = file.data_len; + + printf("File size: %d\n", (*t).buf_len); + + printf("%s\n", (*t).buf); + + return t; +}; + +let tokenizer_tokenize = (t: *tokenizer) => slice { + let tokens = cast(*token, arena_alloc((*t).arena, sizeof(token) * 1000)); /* why does it not care about type here */ + let tokens_len = 0; + + while true { + let tk = tokenizer_next(t); + if tk == cast(*token, null) { + break; + }; + printf("Add token: %d\n", (*tk).type); + + (*(tokens + cast(*token, tokens_len))) = *tk; + tokens_len = tokens_len + 1; + }; + + printf("PRINT TOKENS: %d\n", tokens_len); + + print_tokens(tokens, tokens_len); + + let res = slice{}; + res.data = cast(*void, tokens); + res.data_len = tokens_len; + return res; +}; |