From f2bd233f1dddd91067a3a2c7b9f030356daee3d0 Mon Sep 17 00:00:00 2001 From: Stowy Date: Fri, 19 Jan 2024 00:11:20 +0100 Subject: [PATCH 1/3] Added a first version of rust syntax highlighting --- src/txti/txti.c | 236 ++++++++++++++++++++++++++++++++++++++++++++++++ src/txti/txti.h | 2 + 2 files changed, 238 insertions(+) diff --git a/src/txti/txti.c b/src/txti/txti.c index 62f17a7cc..488aa9364 100644 --- a/src/txti/txti.c +++ b/src/txti/txti.c @@ -69,6 +69,10 @@ txti_lang_kind_from_extension(String8 extension) { kind = TXTI_LangKind_CPlusPlus; } + else if (str8_match(extension, str8_lit("rs"), StringMatchFlag_CaseInsensitive)) + { + kind = TXTI_LangKind_Rust; + } return kind; } @@ -421,6 +425,234 @@ txti_token_array_from_string__cpp(Arena *arena, U64 *bytes_processed_counter, St return result; } +internal TXTI_TokenArray +txti_token_array_from_string__rust(Arena *arena, U64 *bytes_processed_counter, String8 string) +{ + Temp scratch = scratch_begin(&arena, 1); + + //- rjf: generate token list + TXTI_TokenChunkList tokens = {0}; + { + B32 comment_is_single_line = 0; + B32 string_is_char = 0; + TXTI_TokenKind active_token_kind = TXTI_TokenKind_Null; + U64 active_token_start_idx = 0; + B32 escaped = 0; + B32 next_escaped = 0; + U64 byte_process_start_idx = 0; + for(U64 idx = 0; idx <= string.size;) + { + U8 byte = (idx+0 < string.size) ? (string.str[idx+0]) : 0; + U8 next_byte = (idx+1 < string.size) ? (string.str[idx+1]) : 0; + + // rjf: update counter + if(bytes_processed_counter != 0 && ((idx-byte_process_start_idx) >= 1000 || idx == string.size)) + { + ins_atomic_u64_add_eval(bytes_processed_counter, (idx-byte_process_start_idx)); + byte_process_start_idx = idx; + } + + // rjf: escaping + if(escaped && (byte != '\r' && byte != '\n')) + { + next_escaped = 0; + } + else if(!escaped && byte == '\\') + { + next_escaped = 1; + } + + // rjf: take starter, determine active token kind + if(active_token_kind == TXTI_TokenKind_Null) + { + // rjf: use next bytes to start a new token + if(0){} + else if(char_is_space(byte)) { active_token_kind = TXTI_TokenKind_Whitespace; } + else if(byte == '_' || + byte == '$' || + char_is_alpha(byte)) { active_token_kind = TXTI_TokenKind_Identifier; } + else if(char_is_digit(byte, 10) || + (byte == '.' && + char_is_digit(next_byte, 10))) { active_token_kind = TXTI_TokenKind_Numeric; } + else if(byte == '"') { active_token_kind = TXTI_TokenKind_String; string_is_char = 0; } + else if(byte == '\'') { active_token_kind = TXTI_TokenKind_String; string_is_char = 1; } + else if(byte == '/' && next_byte == '/') { active_token_kind = TXTI_TokenKind_Comment; comment_is_single_line = 1; } + else if(byte == '/' && next_byte == '*') { active_token_kind = TXTI_TokenKind_Comment; comment_is_single_line = 0; } + else if(byte == '~' || byte == '!' || + byte == '%' || byte == '^' || + byte == '&' || byte == '*' || + byte == '(' || byte == ')' || + byte == '-' || byte == '=' || + byte == '+' || byte == '[' || + byte == ']' || byte == '{' || + byte == '}' || byte == ':' || + byte == ';' || byte == ',' || + byte == '.' || byte == '<' || + byte == '>' || byte == '/' || + byte == '?' || byte == '|') { active_token_kind = TXTI_TokenKind_Symbol; } + else if(byte == '#') { active_token_kind = TXTI_TokenKind_Meta; } + + // rjf: start new token + if(active_token_kind != TXTI_TokenKind_Null) + { + active_token_start_idx = idx; + } + + // rjf: invalid token kind -> emit error + else + { + TXTI_Token token = {TXTI_TokenKind_Error, r1u64(idx, idx+1)}; + txti_token_chunk_list_push(scratch.arena, &tokens, 4096, &token); + } + } + + // rjf: look for ender + U64 ender_pad = 0; + B32 ender_found = 0; + if(active_token_kind != TXTI_TokenKind_Null && idx>active_token_start_idx) + { + if(idx == string.size) + { + ender_pad = 0; + ender_found = 1; + } + else switch(active_token_kind) + { + default:break; + case TXTI_TokenKind_Whitespace: + { + ender_found = !char_is_space(byte); + }break; + case TXTI_TokenKind_Identifier: + { + ender_found = (!char_is_alpha(byte) && !char_is_digit(byte, 10) && byte != '_' && byte != '$'); + }break; + case TXTI_TokenKind_Numeric: + { + ender_found = (!char_is_alpha(byte) && !char_is_digit(byte, 10) && byte != '_' && byte != '.'); + }break; + case TXTI_TokenKind_String: + { + ender_found = (!escaped && ((!string_is_char && byte == '"') || (string_is_char && byte == '\''))); + ender_pad += 1; + }break; + case TXTI_TokenKind_Symbol: + { + ender_found = (byte != '~' && byte != '!' && + byte != '%' && byte != '^' && + byte != '&' && byte != '*' && + byte != '(' && byte != ')' && + byte != '-' && byte != '=' && + byte != '+' && byte != '[' && + byte != ']' && byte != '{' && + byte != '}' && byte != ':' && + byte != ';' && byte != ',' && + byte != '.' && byte != '<' && + byte != '>' && byte != '/' && + byte != '?' && byte != '|'); + }break; + case TXTI_TokenKind_Comment: + { + if(comment_is_single_line) + { + ender_found = (!escaped && (byte == '\r' || byte == '\n')); + } + else + { + ender_found = (active_token_start_idx+1 < idx && byte == '*' && next_byte == '/'); + ender_pad += 2; + } + }break; + case TXTI_TokenKind_Meta: + { + ender_found = (!escaped && (byte == '\r' || byte == '\n')); + }break; + } + } + + // rjf: next byte is ender => emit token + if(ender_found) + { + TXTI_Token token = {active_token_kind, r1u64(active_token_start_idx, idx+ender_pad)}; + active_token_kind = TXTI_TokenKind_Null; + + // rjf: identifier -> keyword in special cases + if(token.kind == TXTI_TokenKind_Identifier) + { + read_only local_persist String8 rust_keywords[] = + { + str8_lit_comp("as"), + str8_lit_comp("async"), + str8_lit_comp("await"), + str8_lit_comp("break"), + str8_lit_comp("const"), + str8_lit_comp("continue"), + str8_lit_comp("crate"), + str8_lit_comp("dyn"), + str8_lit_comp("else"), + str8_lit_comp("enum"), + str8_lit_comp("extern"), + str8_lit_comp("false"), + str8_lit_comp("fn"), + str8_lit_comp("for"), + str8_lit_comp("if"), + str8_lit_comp("impl"), + str8_lit_comp("in"), + str8_lit_comp("let"), + str8_lit_comp("loop"), + str8_lit_comp("match"), + str8_lit_comp("mod"), + str8_lit_comp("move"), + str8_lit_comp("mut"), + str8_lit_comp("pub"), + str8_lit_comp("ref"), + str8_lit_comp("return"), + str8_lit_comp("Self"), + str8_lit_comp("self"), + str8_lit_comp("static"), + str8_lit_comp("struct"), + str8_lit_comp("super"), + str8_lit_comp("trait"), + str8_lit_comp("true"), + str8_lit_comp("type"), + str8_lit_comp("unsafe"), + str8_lit_comp("use"), + str8_lit_comp("where"), + str8_lit_comp("while"), + }; + String8 token_string = str8_substr(string, r1u64(active_token_start_idx, idx+ender_pad)); + for(U64 keyword_idx = 0; keyword_idx < ArrayCount(rust_keywords); keyword_idx += 1) + { + if(str8_match(rust_keywords[keyword_idx], token_string, 0)) + { + token.kind = TXTI_TokenKind_Keyword; + break; + } + } + } + + // rjf: push + txti_token_chunk_list_push(scratch.arena, &tokens, 4096, &token); + + // rjf: increment by ender padding + idx += ender_pad; + } + + // rjf: advance by 1 byte if we haven't found an ender + if(!ender_found) + { + idx += 1; + } + escaped = next_escaped; + } + } + + //- rjf: token list -> token array + TXTI_TokenArray result = txti_token_array_from_chunk_list(arena, &tokens); + scratch_end(scratch); + return result; +} + //////////////////////////////// //~ rjf: Message Type Functions @@ -999,6 +1231,10 @@ txti_mut_thread_entry_point(void *p) { lex_function = txti_token_array_from_string__cpp; }break; + case TXTI_LangKind_Rust: + { + lex_function = txti_token_array_from_string__rust; + }break; } //- rjf: detect line end kind diff --git a/src/txti/txti.h b/src/txti/txti.h index c43b3b27d..dc4a3301c 100644 --- a/src/txti/txti.h +++ b/src/txti/txti.h @@ -146,6 +146,7 @@ typedef enum TXTI_LangKind TXTI_LangKind_Null, TXTI_LangKind_C, TXTI_LangKind_CPlusPlus, + TXTI_LangKind_Rust, TXTI_LangKind_COUNT } TXTI_LangKind; @@ -354,6 +355,7 @@ internal TXTI_TokenArray txti_token_array_from_list(Arena *arena, TXTI_TokenList //~ rjf: Lexing Functions internal TXTI_TokenArray txti_token_array_from_string__cpp(Arena *arena, U64 *bytes_processed_counter, String8 string); +internal TXTI_TokenArray txti_token_array_from_string__rust(Arena *arena, U64 *bytes_processed_counter, String8 string); //////////////////////////////// //~ rjf: Message Type Functions From b0fa880a906c1ddd3318b4506c06d6d3282f4b96 Mon Sep 17 00:00:00 2001 From: Stowy Date: Fri, 19 Jan 2024 12:23:58 +0100 Subject: [PATCH 2/3] Fixed 'static being taken for a string in rust highlighting --- src/txti/txti.c | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/src/txti/txti.c b/src/txti/txti.c index 488aa9364..fa0dfd82e 100644 --- a/src/txti/txti.c +++ b/src/txti/txti.c @@ -444,6 +444,7 @@ txti_token_array_from_string__rust(Arena *arena, U64 *bytes_processed_counter, S { U8 byte = (idx+0 < string.size) ? (string.str[idx+0]) : 0; U8 next_byte = (idx+1 < string.size) ? (string.str[idx+1]) : 0; + U8 third_byte = (idx+2 < string.size) ? (string.str[idx+2]) : 0; // rjf: update counter if(bytes_processed_counter != 0 && ((idx-byte_process_start_idx) >= 1000 || idx == string.size)) @@ -470,7 +471,10 @@ txti_token_array_from_string__rust(Arena *arena, U64 *bytes_processed_counter, S else if(char_is_space(byte)) { active_token_kind = TXTI_TokenKind_Whitespace; } else if(byte == '_' || byte == '$' || - char_is_alpha(byte)) { active_token_kind = TXTI_TokenKind_Identifier; } + char_is_alpha(byte) || + (byte == '\'' && + next_byte == 's' && + third_byte != '\'')) { active_token_kind = TXTI_TokenKind_Identifier; } else if(char_is_digit(byte, 10) || (byte == '.' && char_is_digit(next_byte, 10))) { active_token_kind = TXTI_TokenKind_Numeric; } @@ -619,6 +623,22 @@ txti_token_array_from_string__rust(Arena *arena, U64 *bytes_processed_counter, S str8_lit_comp("use"), str8_lit_comp("where"), str8_lit_comp("while"), + str8_lit_comp("abstract"), + str8_lit_comp("become"), + str8_lit_comp("box"), + str8_lit_comp("do"), + str8_lit_comp("final"), + str8_lit_comp("macro"), + str8_lit_comp("override"), + str8_lit_comp("priv"), + str8_lit_comp("typeof"), + str8_lit_comp("unsized"), + str8_lit_comp("virtual"), + str8_lit_comp("yield"), + str8_lit_comp("try"), + str8_lit_comp("macro_rules"), + str8_lit_comp("union"), + str8_lit_comp("'static"), }; String8 token_string = str8_substr(string, r1u64(active_token_start_idx, idx+ender_pad)); for(U64 keyword_idx = 0; keyword_idx < ArrayCount(rust_keywords); keyword_idx += 1) From ad3428d8f54438f686ccd199d869534b9533bacd Mon Sep 17 00:00:00 2001 From: Stowy Date: Fri, 19 Jan 2024 12:27:59 +0100 Subject: [PATCH 3/3] Fixed `0..size` being taken as a numeric token --- src/txti/txti.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/txti/txti.c b/src/txti/txti.c index fa0dfd82e..2376d1c2a 100644 --- a/src/txti/txti.c +++ b/src/txti/txti.c @@ -533,7 +533,7 @@ txti_token_array_from_string__rust(Arena *arena, U64 *bytes_processed_counter, S }break; case TXTI_TokenKind_Numeric: { - ender_found = (!char_is_alpha(byte) && !char_is_digit(byte, 10) && byte != '_' && byte != '.'); + ender_found = (!char_is_alpha(byte) && !char_is_digit(byte, 10) && byte != '_' && (byte != '.' || next_byte == '.')); }break; case TXTI_TokenKind_String: {