about summary refs log tree commit diff
path: root/src/bootstrap/tokenizer.pry
diff options
context:
space:
mode:
Diffstat (limited to 'src/bootstrap/tokenizer.pry')
-rw-r--r-- src/bootstrap/tokenizer.pry 553
1 files changed, 553 insertions, 0 deletions
diff --git a/src/bootstrap/tokenizer.pry b/src/bootstrap/tokenizer.pry
new file mode 100644
index 0000000..a2cafb4
--- /dev/null
+++ b/src/bootstrap/tokenizer.pry
@@ -0,0 +1,553 @@
+extern strlen = (*i8) => i64;
+extern memcpy = (*void, *void, i64) => void;
+extern sprintf = (*i8, *i8, varargs) => void;
+extern atoi = (*i8) => i64;
+
+import "!stdlib.pry";
+import "!mem.pry";
+
+/* Token type tags stored in token.type. Plain i64 constants (the language
+ * has no enums); see print_tokens for the tag-to-name mapping. */
+
+/* Keywords */
+let TOKEN_IMPORT     = 1;
+let TOKEN_LET        = 2;
+let TOKEN_EXTERN     = 3;
+let TOKEN_IF         = 4;
+let TOKEN_WHILE      = 5;
+let TOKEN_RETURN     = 6;
+let TOKEN_BREAK      = 7;
+let TOKEN_CONTINUE   = 8;
+let TOKEN_ARROW      = 9;
+let TOKEN_STRUCT     = 10;
+/* 34, not 11: added after tags 11-33 below were already assigned */
+let TOKEN_TYPE       = 34;
+
+/* Identifiers */
+let TOKEN_IDENTIFIER = 11;
+
+/* Literals */
+let TOKEN_NUMBER     = 12;
+let TOKEN_BOOLEAN    = 13;
+let TOKEN_NULL       = 14;
+let TOKEN_CHAR       = 15;
+let TOKEN_STRING     = 16;
+
+/* Operators */
+let TOKEN_EQUALS     = 17;
+let TOKEN_PLUS       = 18;
+let TOKEN_MINUS      = 19;
+let TOKEN_MUL        = 20;
+let TOKEN_DIV        = 21;
+let TOKEN_MOD        = 22;
+let TOKEN_BANG       = 23;
+let TOKEN_LESS       = 24;
+let TOKEN_GREATER    = 25;
+let TOKEN_DOT        = 26;
+
+/* Punctuation */
+let TOKEN_SEMICOLON  = 27;
+let TOKEN_COMMA      = 28;
+let TOKEN_COLON      = 29;
+let TOKEN_LPAREN     = 30;
+let TOKEN_RPAREN     = 31;
+let TOKEN_LBRACE     = 32;
+let TOKEN_RBRACE     = 33;
+
+/* One lexed token: a type tag plus an optional payload. Per print_tokens,
+ * data points at a C string (*i8) for identifiers/strings/chars, an *i64
+ * for numbers, a *bool for booleans, and is unused for everything else. */
+let token = struct {
+	type: i64,
+	data: *void,
+};
+
+/* Tokenizer state: the whole input buffer (borrowed, not owned), a cursor
+ * into it, and the arena all tokens and payloads are allocated from. */
+let tokenizer = struct {
+	buf: *i8,
+	buf_len: i64,
+	offset: i64,
+
+	arena: *arena,
+};
+
+/* Debug dump: prints one human-readable line per token in ts (ts_len
+ * entries) to stdout. Payload-carrying tokens (identifier, number,
+ * boolean, char, string) print their data too. Always returns 0.
+ * The run of independent ifs is effectively a switch on to.type — the tag
+ * values are distinct, so at most one branch fires per token. */
+let print_tokens = (ts: *token, ts_len: i64) => i64 {
+	let i = 0;
+	while i < ts_len {
+		/* pointer + cast(*token, i) is this language's element indexing */
+		let to = (*(ts + cast(*token, i)));
+
+		if (to.type == TOKEN_IMPORT) {
+			printf("Import\n");
+		};
+		if (to.type == TOKEN_LET) {
+			printf("Let\n");
+		};
+		if (to.type == TOKEN_EXTERN) {
+			printf("Extern\n");
+		};
+		if (to.type == TOKEN_IF) {
+			printf("If\n");
+		};
+		if (to.type == TOKEN_WHILE) {
+			printf("While\n");
+		};
+		if (to.type == TOKEN_RETURN) {
+			printf("Return\n");
+		};
+		if (to.type == TOKEN_BREAK) {
+			printf("Break\n");
+		};
+		if (to.type == TOKEN_CONTINUE) {
+			printf("Continue\n");
+		};
+		if (to.type == TOKEN_ARROW) {
+			printf("Arrow\n");
+		};
+		if (to.type == TOKEN_STRUCT) {
+			printf("Struct\n");
+		};
+		if (to.type == TOKEN_TYPE) {
+			printf("Type\n");
+		};
+		/* payload-carrying tokens: data is *i8, *i64, *bool, or a char */
+		if (to.type == TOKEN_IDENTIFIER) {
+			printf("Identifier: %s\n", cast(*i8, to.data));
+		};
+		if (to.type == TOKEN_NUMBER) {
+			printf("Number: %d\n", *cast(*i64, to.data));
+		};
+		if (to.type == TOKEN_BOOLEAN) {
+			printf("Boolean: %d\n", *cast(*bool, to.data));
+		};
+		if (to.type == TOKEN_NULL) {
+			printf("Null\n");
+		};
+		if (to.type == TOKEN_CHAR) {
+			printf("Char: %c\n", *cast(*i8, to.data));
+		};
+		if (to.type == TOKEN_STRING) {
+			printf("String: %s\n", cast(*i8, to.data));
+		};
+		if (to.type == TOKEN_EQUALS) {
+			printf("Equals\n");
+		};
+		if (to.type == TOKEN_PLUS) {
+			printf("Plus\n");
+		};
+		if (to.type == TOKEN_MINUS) {
+			printf("Minus\n");
+		};
+		if (to.type == TOKEN_MUL) {
+			printf("Mul\n");
+		};
+		if (to.type == TOKEN_DIV) {
+			printf("Div\n");
+		};
+		if (to.type == TOKEN_MOD) {
+			printf("Mod\n");
+		};
+		if (to.type == TOKEN_BANG) {
+			printf("Bang\n");
+		};
+		if (to.type == TOKEN_LESS) {
+			printf("Less\n");
+		};
+		if (to.type == TOKEN_GREATER) {
+			printf("Greater\n");
+		};
+		if (to.type == TOKEN_DOT) {
+			printf("Dot\n");
+		};
+		if (to.type == TOKEN_SEMICOLON) {
+			printf("Semicolon\n");
+		};
+		if (to.type == TOKEN_COMMA) {
+			printf("Comma\n");
+		};
+		if (to.type == TOKEN_COLON) {
+			printf("Colon\n");
+		};
+		if (to.type == TOKEN_LPAREN) {
+			printf("LParen\n");
+		};
+		if (to.type == TOKEN_RPAREN) {
+			printf("RParen\n");
+		};
+		if (to.type == TOKEN_LBRACE) {
+			printf("LBrace\n");
+		};
+		if (to.type == TOKEN_RBRACE) {
+			printf("RBrace\n");
+		};
+
+		i = i + 1;
+	};
+
+	return 0;
+};
+
+/* Advances the cursor past consecutive whitespace characters (as decided
+ * by stdlib iswhitespace); stops at the first non-whitespace character or
+ * at end of buffer. */
+let tokenizer_skip_whitespace = (t: *tokenizer) => void {
+	while true {
+		if (*t).offset >= (*t).buf_len { return; };
+		let c = (*((*t).buf + cast(*i8, (*t).offset)));
+		if !iswhitespace(c) {
+			return;
+		};
+		(*t).offset = (*t).offset + 1;
+	};
+
+	/* unreachable: the loop only exits via return */
+	return;
+};
+
+/* Attempts to match the literal `str` at the current cursor. On a match
+ * the cursor is advanced past it and true is returned; otherwise the
+ * cursor is left untouched and false is returned.
+ * Fix: the copied candidate is now null-terminated before strcmp — the
+ * original copied str_len bytes into a fresh arena buffer and compared it
+ * without a terminator, so strcmp read trailing arena garbage. The
+ * allocation is also sized str_len + 1 instead of a fixed 1000 bytes. */
+let tokenizer_accept_string = (t: *tokenizer, str: *i8) => bool {
+	let str_len = strlen(str);
+	if (*t).offset + str_len > (*t).buf_len { return false; };
+
+	let s = cast(*i8, arena_alloc((*t).arena, str_len + 1));
+	memcpy(cast(*void, s), cast(*void, (*t).buf + cast(*i8, (*t).offset)), str_len);
+	(*(s + cast(*i8, str_len))) = '\0';
+
+	if strcmp(s, str) {
+		(*t).offset = (*t).offset + str_len;
+		return true;
+	};
+
+	return false;
+};
+
+/* Reads characters into a fresh, always-null-terminated arena buffer until
+ * condition(c) holds (that character is NOT consumed) or input ends.
+ * Backslash escapes (n, t, r, 0, backslash; anything else passes the next
+ * character through) decode to a single output byte.
+ * Fix: the original indexed the output by INPUT offset (offset - start),
+ * so each two-byte escape left a one-byte hole of arena garbage and an
+ * unterminated tail, and an immediate condition match returned a buffer
+ * with no terminator at all. The output now tracks its own write index.
+ * NOTE(review): the 1000-byte buffer is still unchecked, and an input
+ * ending in a lone backslash reads one byte past buf_len — confirm both
+ * are acceptable for the bootstrap. */
+let tokenizer_consume_until_condition = (t: *tokenizer, condition: (i8) => bool) => *i8 {
+	let res = cast(*i8, arena_alloc((*t).arena, 1000));
+	let out = 0;
+	/* valid empty string even when the condition matches immediately */
+	(*(res + cast(*i8, out))) = '\0';
+
+	while true {
+		if (*t).offset >= (*t).buf_len {
+			return res;
+		};
+
+		let c = (*((*t).buf + cast(*i8, (*t).offset)));
+
+		if c == '\\' {
+			let next_c = (*((*t).buf + cast(*i8, (*t).offset + 1)));
+
+			/* unknown escapes pass the escaped character through */
+			let decoded = next_c;
+			if next_c == 'n' {
+				decoded = '\n';
+			};
+			if next_c == 't' {
+				decoded = '\t';
+			};
+			if next_c == 'r' {
+				decoded = '\r';
+			};
+			if next_c == '0' {
+				decoded = '\0';
+			};
+			if next_c == '\\' {
+				decoded = '\\';
+			};
+
+			(*(res + cast(*i8, out))) = decoded;
+			out = out + 1;
+			(*(res + cast(*i8, out))) = '\0';
+
+			/* two input bytes consumed for one output byte */
+			(*t).offset = (*t).offset + 2;
+			continue;
+		};
+
+		if condition(c) {
+			return res;
+		};
+
+		(*(res + cast(*i8, out))) = c;
+		out = out + 1;
+		(*(res + cast(*i8, out))) = '\0';
+
+		(*t).offset = (*t).offset + 1;
+	};
+
+	/* unreachable: the loop only exits via return */
+	return cast(*i8, null);
+};
+
+/* Tries to read a run of decimal digits at the cursor. Returns an
+ * arena-allocated i64 holding the atoi() value, or null when the cursor is
+ * not on a digit (nothing is consumed in that case — the condition is
+ * tested before each character).
+ * NOTE(review): no sign or overflow handling; atoi semantics apply. Also
+ * relies on the consumed string being properly null-terminated. */
+let tokenizer_accept_int_type = (t: *tokenizer) => *i64 {
+	let string = tokenizer_consume_until_condition(t, (c: i8) => bool {
+		return !isdigit(c);
+	});
+	/* defensive: consume_until_condition never actually returns null */
+	if string == cast(*i8, null) {
+		return cast(*i64, null);
+	};
+	if strlen(string) == 0 {
+		return cast(*i64, null);
+	};
+	let x = cast(*i64, arena_alloc((*t).arena, sizeof(i64)));
+	*x = atoi(string);
+	return x;
+};
+
+/* Tries to read a single-quoted character literal at the cursor. On
+ * success returns the escape-decoded contents between the quotes; on
+ * failure rewinds the cursor and returns null.
+ * NOTE(review): nothing enforces exactly one character between the quotes
+ * — both '' and 'ab' are accepted here; verify the parser rejects them. */
+let tokenizer_accept_char_type = (t: *tokenizer) => *i8 {
+	let prev_offset = (*t).offset;
+	if !tokenizer_accept_string(t, "'") {
+		(*t).offset = prev_offset;
+		return cast(*i8, null);
+	};
+
+	let string = tokenizer_consume_until_condition(t, (c: i8) => bool {
+		return c == '\'';
+	});
+
+	if !tokenizer_accept_string(t, "'") {
+		(*t).offset = prev_offset;
+		return cast(*i8, null);
+	};
+
+	return string;
+};
+
+/* Tries to read a double-quoted string literal at the cursor. On success
+ * returns the escape-decoded contents between the quotes; on failure
+ * rewinds the cursor (arena bytes already consumed are not reclaimed) and
+ * returns null. */
+let tokenizer_accept_string_type = (t: *tokenizer) => *i8 {
+	let prev_offset = (*t).offset;
+	if !tokenizer_accept_string(t, "\"") {
+		(*t).offset = prev_offset;
+		return cast(*i8, null);
+	};
+
+	let string = tokenizer_consume_until_condition(t, (c: i8) => bool {
+		return c == '"';
+	});
+
+	if !tokenizer_accept_string(t, "\"") {
+		(*t).offset = prev_offset;
+		return cast(*i8, null);
+	};
+
+	return string;
+};
+
+/* Skips any run of block comments (C-style delimiters, with whitespace
+ * allowed between consecutive comments) at the current cursor. Does
+ * nothing when no comment starts here.
+ * Fixes: (1) an unterminated comment used to spin forever — once offset
+ * ran past buf_len the closing delimiter could never be accepted; it now
+ * stops at end of input. (2) only the first of several consecutive
+ * comments was skipped, so the next one lexed as Div/Mul tokens; all
+ * adjacent comments are now consumed. */
+let tokenizer_skip_comments = (t: *tokenizer) => void {
+	while tokenizer_accept_string(t, "/*") {
+		while !tokenizer_accept_string(t, "*/") {
+			/* unterminated comment: give up at end of input */
+			if (*t).offset >= (*t).buf_len { return; };
+			(*t).offset = (*t).offset + 1;
+		};
+		tokenizer_skip_whitespace(t);
+	};
+
+	return;
+};
+
+/* Produces the next token from the stream, or null at end of input. The
+ * token and any payload are allocated from the tokenizer's arena. Match
+ * order matters: longer matches come first ("=>" before "="), keywords
+ * before the identifier fallback, literals before identifiers.
+ * NOTE(review): keywords are matched by plain prefix with no word-boundary
+ * check, so an identifier that merely starts with a keyword (e.g.
+ * "letter") lexes as the keyword followed by a second identifier —
+ * confirm this is acceptable for the bootstrap grammar. */
+let tokenizer_next = (t: *tokenizer) => *token {
+	/* comments may be surrounded by whitespace on either side */
+	tokenizer_skip_whitespace(t);
+	tokenizer_skip_comments(t);
+	tokenizer_skip_whitespace(t);
+
+	if (*t).offset >= (*t).buf_len {
+		return cast(*token, null);
+	};
+	
+	let to = cast(*token, arena_alloc((*t).arena, sizeof(token)));
+
+	/* Keywords and keyword-like literals (true/false/null) */
+	if tokenizer_accept_string(t, "import") {
+	    (*to).type = TOKEN_IMPORT;
+	    return to;
+	};
+	if tokenizer_accept_string(t, "let") {
+	    (*to).type = TOKEN_LET;
+	    return to;
+	};
+	if tokenizer_accept_string(t, "extern") {
+	    (*to).type = TOKEN_EXTERN;
+	    return to;
+	};
+	if tokenizer_accept_string(t, "if") {
+	    (*to).type = TOKEN_IF;
+	    return to;
+	};
+	if tokenizer_accept_string(t, "while") {
+	    (*to).type = TOKEN_WHILE;
+	    return to;
+	};
+	if tokenizer_accept_string(t, "return") {
+	    (*to).type = TOKEN_RETURN;
+	    return to;
+	};
+	if tokenizer_accept_string(t, "break") {
+	    (*to).type = TOKEN_BREAK;
+	    return to;
+	};
+	if tokenizer_accept_string(t, "continue") {
+	    (*to).type = TOKEN_CONTINUE;
+	    return to;
+	};
+	if tokenizer_accept_string(t, "true") {
+	    (*to).type = TOKEN_BOOLEAN;
+	    let data = cast(*bool, arena_alloc((*t).arena, sizeof(bool)));
+	    *data = true;
+	    (*to).data = cast(*void, data);
+	    return to;
+	};
+	if tokenizer_accept_string(t, "false") {
+	    (*to).type = TOKEN_BOOLEAN;
+	    let data = cast(*bool, arena_alloc((*t).arena, sizeof(bool)));
+	    *data = false;
+	    (*to).data = cast(*void, data);
+	    return to;
+	};
+	if tokenizer_accept_string(t, "null") {
+	    (*to).type = TOKEN_NULL;
+	    return to;
+	};
+	if tokenizer_accept_string(t, "struct") {
+	    (*to).type = TOKEN_STRUCT;
+	    return to;
+	};
+	if tokenizer_accept_string(t, "newtype") {
+	    (*to).type = TOKEN_TYPE;
+	    return to;
+	};
+
+	/* Operators and punctuation: the two-char "=>" must precede "=" */
+	if tokenizer_accept_string(t, "=>") {
+	    (*to).type = TOKEN_ARROW;
+	    return to;
+	};
+	if tokenizer_accept_string(t, ";") {
+	    (*to).type = TOKEN_SEMICOLON;
+	    return to;
+	};
+	if tokenizer_accept_string(t, ",") {
+	    (*to).type = TOKEN_COMMA;
+	    return to;
+	};
+	if tokenizer_accept_string(t, ":") {
+	    (*to).type = TOKEN_COLON;
+	    return to;
+	};
+	if tokenizer_accept_string(t, "(") {
+	    (*to).type = TOKEN_LPAREN;
+	    return to;
+	};
+	if tokenizer_accept_string(t, ")") {
+	    (*to).type = TOKEN_RPAREN;
+	    return to;
+	};
+	if tokenizer_accept_string(t, "{") {
+	    (*to).type = TOKEN_LBRACE;
+	    return to;
+	};
+	if tokenizer_accept_string(t, "}") {
+	    (*to).type = TOKEN_RBRACE;
+	    return to;
+	};
+	if tokenizer_accept_string(t, "=") {
+	    (*to).type = TOKEN_EQUALS;
+	    return to;
+	};
+	if tokenizer_accept_string(t, "+") {
+	    (*to).type = TOKEN_PLUS;
+	    return to;
+	};
+	if tokenizer_accept_string(t, "-") {
+	    (*to).type = TOKEN_MINUS;
+	    return to;
+	};
+	if tokenizer_accept_string(t, "*") {
+	    (*to).type = TOKEN_MUL;
+	    return to;
+	};
+	if tokenizer_accept_string(t, "/") {
+	    (*to).type = TOKEN_DIV;
+	    return to;
+	};
+	if tokenizer_accept_string(t, "%") {
+	    (*to).type = TOKEN_MOD;
+	    return to;
+	};
+	if tokenizer_accept_string(t, "!") {
+	    (*to).type = TOKEN_BANG;
+	    return to;
+	};
+	if tokenizer_accept_string(t, "<") {
+	    (*to).type = TOKEN_LESS;
+	    return to;
+	};
+	if tokenizer_accept_string(t, ">") {
+	    (*to).type = TOKEN_GREATER;
+	    return to;
+	};
+	if tokenizer_accept_string(t, ".") {
+	    (*to).type = TOKEN_DOT;
+	    return to;
+	};
+	
+	/* Literals: number, then char, then string */
+	let maybe_int = tokenizer_accept_int_type(t);
+	if maybe_int != cast(*i64, null) {
+		(*to).type = TOKEN_NUMBER;
+		(*to).data = cast(*void, maybe_int);
+		return to;
+	};
+
+	let maybe_char = tokenizer_accept_char_type(t);
+	if maybe_char != cast(*i8, null) {
+		(*to).type = TOKEN_CHAR;
+		(*to).data = cast(*void, maybe_char);
+		return to;
+	};
+
+	let maybe_string = tokenizer_accept_string_type(t);
+	if maybe_string != cast(*i8, null) {
+		(*to).type = TOKEN_STRING;
+		(*to).data = cast(*void, maybe_string);
+		return to;
+	};
+
+	/* Fallback: identifier = non-empty run of alphanumerics/underscores */
+	let string = tokenizer_consume_until_condition(t, (c: i8) => bool {
+		if isalphanum(c) {
+			return false;
+		};
+		if c == '_' {
+			return false;
+		};
+		return true;
+	});
+	if strlen(string) == 0 {
+		/* nothing matched at all: unrecognized byte in the input */
+		printf("NO IDENT!\n");
+		return cast(*token, null);
+	};
+
+	(*to).type = TOKEN_IDENTIFIER;
+	(*to).data = cast(*void, string);
+
+	return to;
+};
+
+/* Allocates and initializes a tokenizer over the given file slice. The
+ * slice's data is borrowed, not copied, so it must outlive the tokenizer;
+ * alloc is retained for all later token/payload allocations. */
+let tokenizer_init = (alloc: *arena, file: slice) => *tokenizer {
+	let t = cast(*tokenizer, arena_alloc(alloc, sizeof(tokenizer)));
+	(*t).arena = alloc;
+	(*t).offset = 0;
+	(*t).buf = cast(*i8, file.data);
+	(*t).buf_len = file.data_len;
+
+	/* NOTE(review): debug output left in; drop once the bootstrap works */
+	printf("File size: %d\n", (*t).buf_len);
+
+	printf("%s\n", (*t).buf);
+
+	return t;
+};
+
+/* Drains the tokenizer into a slice of token values (data = *token array,
+ * data_len = count). Stops at end of input or on an unrecognized byte
+ * (tokenizer_next returning null).
+ * Fix: the fixed 1000-entry buffer is now guarded — the original kept
+ * writing past the arena allocation on inputs with more than 1000 tokens.
+ * NOTE(review): debug printfs and the print_tokens dump left in place. */
+let tokenizer_tokenize = (t: *tokenizer) => slice {
+	let tokens = cast(*token, arena_alloc((*t).arena, sizeof(token) * 1000));
+	let tokens_len = 0;
+
+	while true {
+		/* capacity guard for the fixed 1000-token buffer */
+		if tokens_len >= 1000 {
+			printf("Token buffer full (1000); input truncated\n");
+			break;
+		};
+		let tk = tokenizer_next(t);
+		if tk == cast(*token, null) {
+			break;
+		};
+		printf("Add token: %d\n", (*tk).type);
+
+		(*(tokens + cast(*token, tokens_len))) = *tk;
+		tokens_len = tokens_len + 1;
+	};
+
+	printf("PRINT TOKENS: %d\n", tokens_len);
+
+	print_tokens(tokens, tokens_len);
+
+	let res = slice{};
+	res.data = cast(*void, tokens);
+	res.data_len = tokens_len;
+	return res;
+};