about summary refs log tree commit diff
path: root/src/bootstrap/tokenizer.pry
diff options
context:
space:
mode:
authorBaitinq <[email protected]>2025-07-15 17:34:39 +0200
committerBaitinq <[email protected]>2025-07-15 18:00:31 +0200
commitcc56ed42486c2636af50bae451825ad90cfd4b6c (patch)
tree2307d6ced51f427405e4152b4ff1493e245a6b30 /src/bootstrap/tokenizer.pry
parentBoostrap: Support generating LLVM IR file (diff)
downloadpry-lang-cc56ed42486c2636af50bae451825ad90cfd4b6c.tar.gz
pry-lang-cc56ed42486c2636af50bae451825ad90cfd4b6c.tar.bz2
pry-lang-cc56ed42486c2636af50bae451825ad90cfd4b6c.zip
Finish bootstrapping :^)
Diffstat (limited to 'src/bootstrap/tokenizer.pry')
-rw-r--r--src/bootstrap/tokenizer.pry553
1 files changed, 0 insertions, 553 deletions
diff --git a/src/bootstrap/tokenizer.pry b/src/bootstrap/tokenizer.pry
deleted file mode 100644
index ddc2cef..0000000
--- a/src/bootstrap/tokenizer.pry
+++ /dev/null
@@ -1,553 +0,0 @@
-extern strlen = (*i8) => i64;
-extern memcpy = (*void, *void, i64) => void;
-extern sprintf = (*i8, *i8, varargs) => void;
-extern atoi = (*i8) => i64;
-
-import "!stdlib.pry";
-import "!mem.pry";
-
-/*
-   Token kinds. One of these values is stored in the `type` field of every
-   `token`.
-
-   Keywords. TOKEN_ARROW is produced for the "=>" symbol and TOKEN_TYPE for
-   the "newtype" keyword (see tokenizer_next). TOKEN_TYPE has the value 34,
-   i.e. it comes after TOKEN_RBRACE rather than after TOKEN_STRUCT.
-*/
-let TOKEN_IMPORT     = 1;
-let TOKEN_LET        = 2;
-let TOKEN_EXTERN     = 3;
-let TOKEN_IF         = 4;
-let TOKEN_WHILE      = 5;
-let TOKEN_RETURN     = 6;
-let TOKEN_BREAK      = 7;
-let TOKEN_CONTINUE   = 8;
-let TOKEN_ARROW      = 9;
-let TOKEN_STRUCT     = 10;
-let TOKEN_TYPE       = 34;
-
-/* Identifiers: `data` points to the identifier text (*i8). */
-let TOKEN_IDENTIFIER = 11;
-
-/*
-   Literals. `data` points to an i64 for TOKEN_NUMBER, a bool for
-   TOKEN_BOOLEAN and to the unescaped text (*i8) for TOKEN_CHAR and
-   TOKEN_STRING. TOKEN_NULL carries no payload.
-*/
-let TOKEN_NUMBER     = 12;
-let TOKEN_BOOLEAN    = 13;
-let TOKEN_NULL       = 14;
-let TOKEN_CHAR       = 15;
-let TOKEN_STRING     = 16;
-
-/* Operators: all single-character symbols, no payload. */
-let TOKEN_EQUALS     = 17;
-let TOKEN_PLUS       = 18;
-let TOKEN_MINUS      = 19;
-let TOKEN_MUL        = 20;
-let TOKEN_DIV        = 21;
-let TOKEN_MOD        = 22;
-let TOKEN_BANG       = 23;
-let TOKEN_LESS       = 24;
-let TOKEN_GREATER    = 25;
-let TOKEN_DOT        = 26;
-
-/* Punctuation: all single-character symbols, no payload. */
-let TOKEN_SEMICOLON  = 27;
-let TOKEN_COMMA      = 28;
-let TOKEN_COLON      = 29;
-let TOKEN_LPAREN     = 30;
-let TOKEN_RPAREN     = 31;
-let TOKEN_LBRACE     = 32;
-let TOKEN_RBRACE     = 33;
-
-/*
-   A single lexed token. `type` holds one of the TOKEN_* constants. `data`
-   is only assigned for tokens that carry a payload: text (*i8) for
-   identifiers, strings and chars, a *i64 for numbers and a *bool for
-   booleans (see tokenizer_next and print_tokens).
-*/
-let token = struct {
-	type: i64,
-	data: *void,
-};
-
-/*
-   Tokenizer state. `buf` is the input text and `buf_len` its length in
-   bytes; `offset` is the index of the next unread byte in `buf`. `arena`
-   is the allocator used for every token and payload the tokenizer creates.
-*/
-let tokenizer = struct {
-	buf: *i8,
-	buf_len: i64,
-	offset: i64,
-
-	arena: *arena,
-};
-
-/*
-   Prints a one-line description of the token `to` points at. Tokens with a
-   payload (identifier, number, boolean, char, string) print it too. A type
-   that matches none of the TOKEN_* constants prints nothing.
-*/
-let print_single_token = (to: *token) => void {
-	let kind = (*to).type;
-	let payload = (*to).data;
-
-	if kind == TOKEN_IMPORT { printf("Import\n"); return; };
-	if kind == TOKEN_LET { printf("Let\n"); return; };
-	if kind == TOKEN_EXTERN { printf("Extern\n"); return; };
-	if kind == TOKEN_IF { printf("If\n"); return; };
-	if kind == TOKEN_WHILE { printf("While\n"); return; };
-	if kind == TOKEN_RETURN { printf("Return\n"); return; };
-	if kind == TOKEN_BREAK { printf("Break\n"); return; };
-	if kind == TOKEN_CONTINUE { printf("Continue\n"); return; };
-	if kind == TOKEN_ARROW { printf("Arrow\n"); return; };
-	if kind == TOKEN_STRUCT { printf("Struct\n"); return; };
-	if kind == TOKEN_TYPE { printf("Type\n"); return; };
-	if kind == TOKEN_IDENTIFIER { printf("Identifier: %s\n", cast(*i8, payload)); return; };
-	if kind == TOKEN_NUMBER { printf("Number: %d\n", *cast(*i64, payload)); return; };
-	if kind == TOKEN_BOOLEAN { printf("Boolean: %d\n", *cast(*bool, payload)); return; };
-	if kind == TOKEN_NULL { printf("Null\n"); return; };
-	if kind == TOKEN_CHAR { printf("Char: %c\n", *cast(*i8, payload)); return; };
-	if kind == TOKEN_STRING { printf("String: %s\n", cast(*i8, payload)); return; };
-	if kind == TOKEN_EQUALS { printf("Equals\n"); return; };
-	if kind == TOKEN_PLUS { printf("Plus\n"); return; };
-	if kind == TOKEN_MINUS { printf("Minus\n"); return; };
-	if kind == TOKEN_MUL { printf("Mul\n"); return; };
-	if kind == TOKEN_DIV { printf("Div\n"); return; };
-	if kind == TOKEN_MOD { printf("Mod\n"); return; };
-	if kind == TOKEN_BANG { printf("Bang\n"); return; };
-	if kind == TOKEN_LESS { printf("Less\n"); return; };
-	if kind == TOKEN_GREATER { printf("Greater\n"); return; };
-	if kind == TOKEN_DOT { printf("Dot\n"); return; };
-	if kind == TOKEN_SEMICOLON { printf("Semicolon\n"); return; };
-	if kind == TOKEN_COMMA { printf("Comma\n"); return; };
-	if kind == TOKEN_COLON { printf("Colon\n"); return; };
-	if kind == TOKEN_LPAREN { printf("LParen\n"); return; };
-	if kind == TOKEN_RPAREN { printf("RParen\n"); return; };
-	if kind == TOKEN_LBRACE { printf("LBrace\n"); return; };
-	if kind == TOKEN_RBRACE { printf("RBrace\n"); return; };
-
-	return;
-};
-
-/*
-   Debug helper: prints the first `ts_len` tokens of the array `ts`, one per
-   line, in order. Always returns 0.
-*/
-let print_tokens = (ts: *token, ts_len: i64) => i64 {
-	let idx = 0;
-	while idx < ts_len {
-		print_single_token(ts + cast(*token, idx));
-		idx = idx + 1;
-	};
-
-	return 0;
-};
-
-/*
-   Advances the tokenizer offset past any run of whitespace (as classified
-   by iswhitespace). Stops at the first non-whitespace byte or at the end
-   of the buffer, whichever comes first.
-*/
-let tokenizer_skip_whitespace = (t: *tokenizer) => void {
-	while (*t).offset < (*t).buf_len {
-		let ch = (*((*t).buf + cast(*i8, (*t).offset)));
-		if !iswhitespace(ch) {
-			return;
-		};
-		(*t).offset = (*t).offset + 1;
-	};
-
-	return;
-};
-
-/*
-   Tries to match the NUL-terminated string `str` at the current offset.
-   On a match the offset is advanced past it and true is returned;
-   otherwise the offset is left untouched and false is returned.
-
-   The comparison is done in place, byte by byte. The previous version
-   copied the candidate into a fresh 1000-byte arena block on every call
-   and compared it with strcmp; that leaked arena memory per probe, relied
-   on the block already containing a terminator, and overflowed for
-   needles of 1000 bytes or more.
-*/
-let tokenizer_accept_string = (t: *tokenizer, str: *i8) => bool {
-	let str_len = strlen(str);
-	if (*t).offset + str_len > (*t).buf_len { return false; };
-
-	let i = 0;
-	while i < str_len {
-		let have = (*((*t).buf + cast(*i8, (*t).offset + i)));
-		let want = (*(str + cast(*i8, i)));
-		if have != want {
-			return false;
-		};
-		i = i + 1;
-	};
-
-	(*t).offset = (*t).offset + str_len;
-	return true;
-};
-
-/*
-   Consumes bytes from the current offset until `condition` returns true
-   for an unescaped byte or the input ends, and returns them as a
-   NUL-terminated string allocated from the tokenizer arena. The byte that
-   satisfied `condition` is not consumed.
-
-   A backslash escapes the following byte: \n, \t, \r and \0 are translated
-   to the corresponding control character, any other escaped byte (for
-   example a quote or a backslash) is taken literally. Escaped bytes are
-   never passed to `condition`.
-
-   Fixes over the previous version: output is written through its own index
-   (the old code indexed the output by source offset, leaving an unwritten
-   gap after every two-byte escape); the result is always terminated, even
-   when empty; a backslash that is the last byte of the input no longer
-   reads past the buffer (it is kept literally); and at most cap - 1 bytes
-   are stored, so over-long input is truncated instead of overrunning the
-   allocation (the input is still consumed in full).
-*/
-let tokenizer_consume_until_condition = (t: *tokenizer, condition: (i8) => bool) => *i8 {
-	let cap = 1000;
-	let res = cast(*i8, arena_alloc((*t).arena, cap));
-	let res_len = 0;
-	*res = '\0';
-
-	while true {
-		if (*t).offset >= (*t).buf_len {
-			return res;
-		};
-
-		let c = (*((*t).buf + cast(*i8, (*t).offset)));
-		let out = c;
-		let consumed = 1;
-		let escaped = false;
-
-		if c == '\\' {
-			escaped = true;
-			if (*t).offset + 1 < (*t).buf_len {
-				let next_c = (*((*t).buf + cast(*i8, (*t).offset + 1)));
-				out = next_c;
-				consumed = 2;
-				if next_c == 'n' { out = '\n'; };
-				if next_c == 't' { out = '\t'; };
-				if next_c == 'r' { out = '\r'; };
-				if next_c == '0' { out = '\0'; };
-			};
-		};
-
-		if !escaped {
-			if condition(c) {
-				return res;
-			};
-		};
-
-		if res_len < cap - 1 {
-			(*(res + cast(*i8, res_len))) = out;
-			res_len = res_len + 1;
-			(*(res + cast(*i8, res_len))) = '\0';
-		};
-
-		(*t).offset = (*t).offset + consumed;
-	};
-
-	return cast(*i8, null);
-};
-
-/*
-   Tries to read a run of decimal digits at the current offset. Returns a
-   pointer to an arena-allocated i64 holding the value converted with atoi,
-   or a null pointer when no digit was found.
-*/
-let tokenizer_accept_int_type = (t: *tokenizer) => *i64 {
-	let digits = tokenizer_consume_until_condition(t, (ch: i8) => bool {
-		if isdigit(ch) {
-			return false;
-		};
-		return true;
-	});
-
-	if digits == cast(*i8, null) {
-		return cast(*i64, null);
-	};
-	if strlen(digits) == 0 {
-		return cast(*i64, null);
-	};
-
-	let value = cast(*i64, arena_alloc((*t).arena, sizeof(i64)));
-	*value = atoi(digits);
-	return value;
-};
-
-/*
-   Tries to read a character literal: a single quote, the (possibly
-   escaped) contents, and a closing single quote. Returns the unescaped
-   contents on success. On failure the offset is restored to where it was
-   on entry and a null pointer is returned.
-*/
-let tokenizer_accept_char_type = (t: *tokenizer) => *i8 {
-	let saved = (*t).offset;
-
-	if tokenizer_accept_string(t, "'") {
-		let body = tokenizer_consume_until_condition(t, (ch: i8) => bool {
-			if ch == '\'' {
-				return true;
-			};
-			return false;
-		});
-
-		if tokenizer_accept_string(t, "'") {
-			return body;
-		};
-	};
-
-	(*t).offset = saved;
-	return cast(*i8, null);
-};
-
-/*
-   Tries to read a string literal: a double quote, the (possibly escaped)
-   contents, and a closing double quote. Returns the unescaped contents on
-   success. On failure the offset is restored to where it was on entry and
-   a null pointer is returned.
-*/
-let tokenizer_accept_string_type = (t: *tokenizer) => *i8 {
-	let saved = (*t).offset;
-
-	if tokenizer_accept_string(t, "\"") {
-		let body = tokenizer_consume_until_condition(t, (ch: i8) => bool {
-			if ch == '"' {
-				return true;
-			};
-			return false;
-		});
-
-		if tokenizer_accept_string(t, "\"") {
-			return body;
-		};
-	};
-
-	(*t).offset = saved;
-	return cast(*i8, null);
-};
-
-/*
-   Skips block comments starting at the current offset. Several comments in
-   a row, separated only by whitespace, are all skipped (trailing
-   whitespace after each comment is consumed as part of this). Comments do
-   not nest.
-
-   An unterminated comment consumes the rest of the input and returns; the
-   previous version kept incrementing the offset past the end of the buffer
-   forever in that case. The previous version also skipped only one
-   comment, so a second adjacent comment was lexed as code.
-*/
-let tokenizer_skip_comments = (t: *tokenizer) => void {
-	while tokenizer_accept_string(t, "/*") {
-		while !tokenizer_accept_string(t, "*/") {
-			if (*t).offset >= (*t).buf_len {
-				return;
-			};
-			(*t).offset = (*t).offset + 1;
-		};
-
-		tokenizer_skip_whitespace(t);
-	};
-
-	return;
-};
-
-/*
-   Accepts the word `word` at the current offset, but only when it is not
-   directly followed by an identifier character (alphanumeric or '_').
-   This keeps identifiers that merely start with a keyword, such as
-   "letter" or "iffy", from being split into a keyword plus a trailing
-   identifier. On rejection the offset is left where it was.
-*/
-let tokenizer_accept_keyword = (t: *tokenizer, word: *i8) => bool {
-	let prev_offset = (*t).offset;
-	if !tokenizer_accept_string(t, word) {
-		return false;
-	};
-	if (*t).offset >= (*t).buf_len {
-		return true;
-	};
-
-	let following = (*((*t).buf + cast(*i8, (*t).offset)));
-	if isalphanum(following) {
-		(*t).offset = prev_offset;
-		return false;
-	};
-	if following == '_' {
-		(*t).offset = prev_offset;
-		return false;
-	};
-
-	return true;
-};
-
-/*
-   Reads the next token from the input, skipping leading whitespace and
-   comments. Returns an arena-allocated token, or a null pointer at the end
-   of the input.
-
-   Candidates are tried in this order: keywords and word-like literals
-   (true, false, null), symbols ("=>" before "="), integer literals, char
-   literals, string literals, and finally identifiers made of alphanumeric
-   characters and '_'.
-
-   NOTE: when nothing matches (an unsupported character), "NO IDENT!" is
-   printed and a null pointer is returned, which tokenizer_tokenize treats
-   the same as the end of the input.
-
-   Changes over the previous version: keywords are matched on a word
-   boundary instead of as a bare prefix, and `data` is set to null up front
-   so tokens without a payload never expose uninitialized memory.
-*/
-let tokenizer_next = (t: *tokenizer) => *token {
-	tokenizer_skip_whitespace(t);
-	tokenizer_skip_comments(t);
-	tokenizer_skip_whitespace(t);
-
-	if (*t).offset >= (*t).buf_len {
-		return cast(*token, null);
-	};
-
-	let to = cast(*token, arena_alloc((*t).arena, sizeof(token)));
-	(*to).data = cast(*void, null);
-
-	if tokenizer_accept_keyword(t, "import") { (*to).type = TOKEN_IMPORT; return to; };
-	if tokenizer_accept_keyword(t, "let") { (*to).type = TOKEN_LET; return to; };
-	if tokenizer_accept_keyword(t, "extern") { (*to).type = TOKEN_EXTERN; return to; };
-	if tokenizer_accept_keyword(t, "if") { (*to).type = TOKEN_IF; return to; };
-	if tokenizer_accept_keyword(t, "while") { (*to).type = TOKEN_WHILE; return to; };
-	if tokenizer_accept_keyword(t, "return") { (*to).type = TOKEN_RETURN; return to; };
-	if tokenizer_accept_keyword(t, "break") { (*to).type = TOKEN_BREAK; return to; };
-	if tokenizer_accept_keyword(t, "continue") { (*to).type = TOKEN_CONTINUE; return to; };
-	if tokenizer_accept_keyword(t, "true") {
-		(*to).type = TOKEN_BOOLEAN;
-		let data = cast(*bool, arena_alloc((*t).arena, sizeof(bool)));
-		*data = true;
-		(*to).data = cast(*void, data);
-		return to;
-	};
-	if tokenizer_accept_keyword(t, "false") {
-		(*to).type = TOKEN_BOOLEAN;
-		let data = cast(*bool, arena_alloc((*t).arena, sizeof(bool)));
-		*data = false;
-		(*to).data = cast(*void, data);
-		return to;
-	};
-	if tokenizer_accept_keyword(t, "null") { (*to).type = TOKEN_NULL; return to; };
-	if tokenizer_accept_keyword(t, "struct") { (*to).type = TOKEN_STRUCT; return to; };
-	if tokenizer_accept_keyword(t, "newtype") { (*to).type = TOKEN_TYPE; return to; };
-
-	/* Symbols. "=>" has to be tried before "=" so the arrow is not split. */
-	if tokenizer_accept_string(t, "=>") { (*to).type = TOKEN_ARROW; return to; };
-	if tokenizer_accept_string(t, ";") { (*to).type = TOKEN_SEMICOLON; return to; };
-	if tokenizer_accept_string(t, ",") { (*to).type = TOKEN_COMMA; return to; };
-	if tokenizer_accept_string(t, ":") { (*to).type = TOKEN_COLON; return to; };
-	if tokenizer_accept_string(t, "(") { (*to).type = TOKEN_LPAREN; return to; };
-	if tokenizer_accept_string(t, ")") { (*to).type = TOKEN_RPAREN; return to; };
-	if tokenizer_accept_string(t, "{") { (*to).type = TOKEN_LBRACE; return to; };
-	if tokenizer_accept_string(t, "}") { (*to).type = TOKEN_RBRACE; return to; };
-	if tokenizer_accept_string(t, "=") { (*to).type = TOKEN_EQUALS; return to; };
-	if tokenizer_accept_string(t, "+") { (*to).type = TOKEN_PLUS; return to; };
-	if tokenizer_accept_string(t, "-") { (*to).type = TOKEN_MINUS; return to; };
-	if tokenizer_accept_string(t, "*") { (*to).type = TOKEN_MUL; return to; };
-	if tokenizer_accept_string(t, "/") { (*to).type = TOKEN_DIV; return to; };
-	if tokenizer_accept_string(t, "%") { (*to).type = TOKEN_MOD; return to; };
-	if tokenizer_accept_string(t, "!") { (*to).type = TOKEN_BANG; return to; };
-	if tokenizer_accept_string(t, "<") { (*to).type = TOKEN_LESS; return to; };
-	if tokenizer_accept_string(t, ">") { (*to).type = TOKEN_GREATER; return to; };
-	if tokenizer_accept_string(t, ".") { (*to).type = TOKEN_DOT; return to; };
-
-	let maybe_int = tokenizer_accept_int_type(t);
-	if maybe_int != cast(*i64, null) {
-		(*to).type = TOKEN_NUMBER;
-		(*to).data = cast(*void, maybe_int);
-		return to;
-	};
-
-	let maybe_char = tokenizer_accept_char_type(t);
-	if maybe_char != cast(*i8, null) {
-		(*to).type = TOKEN_CHAR;
-		(*to).data = cast(*void, maybe_char);
-		return to;
-	};
-
-	let maybe_string = tokenizer_accept_string_type(t);
-	if maybe_string != cast(*i8, null) {
-		(*to).type = TOKEN_STRING;
-		(*to).data = cast(*void, maybe_string);
-		return to;
-	};
-
-	let string = tokenizer_consume_until_condition(t, (c: i8) => bool {
-		if isalphanum(c) {
-			return false;
-		};
-		if c == '_' {
-			return false;
-		};
-		return true;
-	});
-	if strlen(string) == 0 {
-		printf("NO IDENT!\n");
-		return cast(*token, null);
-	};
-
-	(*to).type = TOKEN_IDENTIFIER;
-	(*to).data = cast(*void, string);
-
-	return to;
-};
-
-/*
-   Allocates a tokenizer from `alloc` and points it at the start of the
-   bytes described by `file`. The same arena is kept for all later token
-   allocations. Prints the input size and the input itself as debug output.
-   NOTE(review): the "%s" print presumably expects file.data to be
-   NUL-terminated; confirm against the code that builds the slice.
-*/
-let tokenizer_init = (alloc: *arena, file: slice) => *tokenizer {
-	let tok = cast(*tokenizer, arena_alloc(alloc, sizeof(tokenizer)));
-
-	(*tok).buf = cast(*i8, file.data);
-	(*tok).buf_len = file.data_len;
-	(*tok).offset = 0;
-	(*tok).arena = alloc;
-
-	printf("File size: %d\n", (*tok).buf_len);
-	printf("%s\n", (*tok).buf);
-
-	return tok;
-};
-
-/*
-   Runs the tokenizer to the end of its input and returns the tokens as a
-   slice: `data` points to an arena-allocated array of `token` values and
-   `data_len` is the number of tokens in it. Every token and the final list
-   are also printed as debug output.
-
-   The token array has a fixed capacity of max_tokens entries. Once it is
-   full a message is printed and tokenizing stops; the previous version had
-   no such check and wrote past the end of the allocation.
-*/
-let tokenizer_tokenize = (t: *tokenizer) => slice {
-	let max_tokens = 40000;
-	let tokens = cast(*token, arena_alloc((*t).arena, sizeof(token) * max_tokens)); /* original author's note: why does it not care about type here */
-	let tokens_len = 0;
-
-	while true {
-		let tk = tokenizer_next(t);
-		if tk == cast(*token, null) {
-			break;
-		};
-		if tokens_len >= max_tokens {
-			printf("Too many tokens, stopping after %d\n", max_tokens);
-			break;
-		};
-		printf("Add token: %d\n", (*tk).type);
-
-		(*(tokens + cast(*token, tokens_len))) = *tk;
-		tokens_len = tokens_len + 1;
-	};
-
-	printf("PRINT TOKENS: %d\n", tokens_len);
-
-	print_tokens(tokens, tokens_len);
-
-	let res = slice{};
-	res.data = cast(*void, tokens);
-	res.data_len = tokens_len;
-	return res;
-};