Improve highlighters

This commit is contained in:
2026-01-18 17:49:36 +00:00
parent c9324c13aa
commit b5c49f4277
5 changed files with 408 additions and 17 deletions

View File

@@ -1,5 +1,7 @@
ADD(Data)
ADD(Shebang)
ADD(Comment)
ADD(Error)
ADD(String)
ADD(Escape)
ADD(Interpolation)

View File

@@ -22,7 +22,7 @@ cjk_samples = [
]
# Ruby regex with unicode
$unicode_regex = /[一-龯ぁ-ん#{0x3000}
$unicode_regex_multiline = /[一-龯ぁ-ん#{0x3000}
\-ヶー
s wow
@@ -30,22 +30,29 @@ s wow
々〆〤]/
UNICORE = %r{
[s]
{#{}}
\C-s\u{10}
}
UNINITCORE = %{
{{#{}}}
test = "A:\x41 B:\101 C:\u0043 D:\u{44 45} NUL:\0 DEL:\c? CTRL_A:\cA META_X:\M-x CTRL_META_X:\C-\M-x MIX:\C-\M-z N:\N{UNICODE NAME}"
}
# Unicode identifiers (valid in Ruby)
= 0x5_4eddaee
π = 3.14_159e+2, ?\u0234, ?\,, ?\x0A, ?s, true, false
π = 3.14_159e+2, ?\u0234, ?\,, ?\x0A, ?s, true, false, 0
= -> { "こんに \n ちは" }
arr = Array.new()
not_arr = NotArray.new()
raise NameError or SystemExit or CustomError or Errno or ErrorNotAtAll
# Method using unicode variable names
def math_test
puts "π * 2 = #{π * 2}"
@@ -57,6 +64,7 @@ cjk_samples.each_with_index do |str, idx:|
symbol = :"
a
"
sym2 = :hello
end
# Test emoji width behaviors
@@ -98,10 +106,10 @@ mixed = [
two_docs = <<DOC1 , <<DOC2
stuff for doc2
DOC1
stuff for doc 2 with \#{interpolation} and more
stuff for doc 2 with \#{not interpolation} and more
DOC2
p = 0 <<22
p = 0 <<22 # not a heredoc
mixed.each { |m| puts m }
@@ -170,6 +178,8 @@ class TestObject
puts "#{@name}: #{@value}"
end
private
def double_value
@value * 2
end
@@ -297,7 +307,7 @@ TEXT
puts multi_line
# Symbols and strings
sym = :my_symbol
sym = :my_symbol == __dir__
str = "my string"
puts "Symbol: #{sym}, String: #{str}"
@@ -336,3 +346,9 @@ puts "Match 'fox'?" if sample_text =~ /fox/
# End of test script
puts "Ruby syntax highlighting test complete."
__END__
Anything here should be ignored >><<
{{{}}}[[[]]](((000)))

View File

@@ -70,3 +70,9 @@ std::shared_ptr<void> bash_parse(std::vector<Token> *tokens,
}
return state;
}
// String literals surrounded by ' are strict, with no escaping inside.
// Double-quoted strings " allow interpolation and escaping, with $var,
// ${var}, $((math)), $(command) and `command` expansions.
// ANSI-C quoted strings $'' allow backslash escapes, including \xHH,
// \uHHHH, \uHHHHHHHH and \cX.

View File

@@ -1,6 +1,182 @@
#include "syntax/decl.h"
#include "syntax/langs.h"
// TODO: in regex better highlighting of regex structures
// Names of core Ruby classes and modules. ruby_parse() loads this list into
// a trie and emits a TokenKind::Type token for every identifier that matches.
const static std::vector<std::string> types = {
    "BasicObject", "Object",     "NilClass",
    "TrueClass",   "FalseClass", "Integer",
    "Fixnum",      "Bignum",     "Float",
    "Rational",    "Complex",    "Numeric",
    "String",      "Symbol",     "Array",
    "Hash",        "Range",      "Regexp",
    // Spelled "Enumerator" (not "Enumarator") so the real class name matches.
    "Struct",      "Enumerator", "Enumerable",
    "Time",        "Date",       "IO",
    "File",        "Dir",        "Thread",
    "Proc",        "Method",     "Module",
    "Class",       "Mutex",      "ConditionVariable",
    "MatchData",   "Encoding",   "Fiber",
};
// Predefined constants and double-underscore names. ruby_parse() loads this
// list into a trie and emits a TokenKind::Constant token for each match.
const static std::vector<std::string> builtins = {
    // Argument, environment and standard stream constants.
    "ARGF", "ARGV", "ENV", "STDIN", "STDOUT", "STDERR", "DATA",
    "TOPLEVEL_BINDING",
    // Interpreter description constants.
    "RUBY_PLATFORM", "RUBY_VERSION", "RUBY_RELEASE_DATE", "RUBY_PATCHLEVEL",
    "RUBY_ENGINE",
    // Double-underscore keywords and methods.
    "__LINE__", "__FILE__", "__ENCODING__", "__dir__", "__callee__",
    "__method__", "__id__", "__send__",
};
// Commonly used Ruby method names. ruby_parse() loads this list into a trie
// and emits a TokenKind::Function token for every identifier that matches.
// Each name appears exactly once; the section comments are only a rough
// grouping for readers and have no effect on matching.
const static std::vector<std::string> methods = {
    // Kernel-style global functions.
    "abort",
    "at_exit",
    "binding",
    "block_given?",
    "caller",
    "catch",
    "chomp",
    "chomp!",
    "chop",
    "chop!",
    "eval",
    "exec",
    "exit",
    "exit!",
    "fail",
    "fork",
    "format",
    "gets",
    "global_variables",
    "gsub",
    "gsub!",
    "iterator?",
    "lambda",
    "load",
    "loop",
    "open",
    "print",
    "printf",
    "proc",
    "putc",
    "puts",
    "raise",
    "rand",
    "readline",
    "readlines",
    "require",
    "require_relative",
    "select", // also covers the Enumerable method of the same name
    "sleep",
    "spawn",
    "split",
    "sprintf",
    "srand",
    "sub",
    "sub!",
    "syscall",
    "system",
    "test",
    "throw",
    "trace_var",
    "trap",
    "untrace_var",
    // Class/module definition helpers.
    "attr",
    "attr_reader",
    "attr_writer",
    "attr_accessor",
    "class_variable_get",
    "class_variable_set",
    "define_method",
    "instance_variable_get",
    "instance_variable_set",
    "private",
    "protected",
    "public",
    "public_class_method",
    "module_function",
    "remove_method",
    "undef_method",
    // Reflection, mixins and object state.
    "method",
    "methods",
    "singleton_methods",
    "private_methods",
    "protected_methods",
    "public_methods",
    "send",
    "extend",
    "include",
    "prepend",
    "clone",
    "dup",
    "freeze",
    "taint",
    "untaint",
    "trust",
    "untrust",
    // The predicate forms are tainted?/untrusted?; "untaint?" and "trust?"
    // are not Ruby methods.
    "tainted?",
    "untrusted?",
    // Iteration and Enumerable.
    "each",
    "each_with_index",
    "each_with_object",
    "map",
    "collect",
    "reject",
    "reduce",
    "inject",
    "find",
    "detect",
    "all?",
    "any?",
    "none?",
    "one?",
    "count",
    "cycle",
    "drop",
    "drop_while",
    "take",
    "take_while",
    "chunk",
    "chunk_while",
    "group_by",
    "partition",
    "slice_before",
    "slice_after",
    // Generic object queries.
    "nil?",
    "is_a?",
    "kind_of?",
    "instance_of?",
    "respond_to?",
    "equal?",
    "object_id",
    "class",
    "singleton_class",
    "tap",
    "then",
};
// Exception-related names that do not end in "Error". ruby_parse() loads this
// list into a trie and emits a TokenKind::Error token for each match;
// capitalised identifiers ending in "Error" are recognised separately there.
const static std::vector<std::string> errors = {
    "Exception",
    "SignalException",
    "Interrupt",
    "StopIteration",
    "Errno",
    "SystemExit",
    "fatal",
};
// Keyword list that ruby_parse() loads into base_keywords_trie.
const static std::vector<std::string> base_keywords = {
    // Definition and block openers/closers.
    "class", "module", "begin", "end",
    // Branch and handler clauses.
    "else", "rescue", "ensure", "do", "when",
};
@@ -119,12 +295,20 @@ std::shared_ptr<void> ruby_parse(std::vector<Token> *tokens,
static Trie operator_keywords_trie;
static Trie expecting_operators_trie;
static Trie operator_trie;
static Trie types_trie;
static Trie builtins_trie;
static Trie methods_trie;
static Trie errors_trie;
if (!keywords_trie_init) {
base_keywords_trie.build(base_keywords);
expecting_keywords_trie.build(expecting_keywords);
operator_keywords_trie.build(operator_keywords);
expecting_operators_trie.build(expecting_operators);
operator_trie.build(operators);
types_trie.build(types);
builtins_trie.build(builtins);
methods_trie.build(methods);
errors_trie.build(errors);
keywords_trie_init = true;
}
tokens->clear();
@@ -174,8 +358,60 @@ std::shared_ptr<void> ruby_parse(std::vector<Token> *tokens,
tokens->push_back({start, i, TokenKind::String});
start = i;
i++;
if (i < len && text[i] == 'x') {
i++;
if (i < len && isxdigit(text[i]))
i++;
if (i < len && isxdigit(text[i]))
i++;
} else if (i < len && text[i] == 'u') {
i++;
if (i < len && text[i] == '{') {
i++;
while (i < len && text[i] != '}')
i++;
if (i < len)
i++;
} else {
if (i < len && isxdigit(text[i]))
i++;
if (i < len && isxdigit(text[i]))
i++;
if (i < len && isxdigit(text[i]))
i++;
if (i < len && isxdigit(text[i]))
i++;
}
} else if (i < len && text[i] >= '0' && text[i] <= '7') {
i++;
if (i < len && text[i] >= '0' && text[i] <= '7')
i++;
if (i < len && text[i] >= '0' && text[i] <= '7')
i++;
} else if (i < len && text[i] == 'c') {
i++;
if (i < len && text[i] != '\\')
i++;
} else if (i < len && (text[i] == 'M' || text[i] == 'C')) {
i++;
if (i < len && text[i] == '-') {
i++;
if (i < len && text[i] != '\\')
i++;
}
} else if (i < len && text[i] == 'N') {
i++;
if (i < len && text[i] == '{') {
i++;
while (i < len && text[i] != '}')
i++;
if (i < len)
i++;
}
} else {
if (i < len)
i++;
}
tokens->push_back({start, i, TokenKind::Escape});
continue;
}
@@ -202,11 +438,62 @@ std::shared_ptr<void> ruby_parse(std::vector<Token> *tokens,
tokens->push_back({start, i, TokenKind::String});
start = i;
i++;
if (i < len && text[i] == 'x') {
i++;
if (i < len && isxdigit(text[i]))
i++;
if (i < len && isxdigit(text[i]))
i++;
} else if (i < len && text[i] == 'u') {
i++;
if (i < len && text[i] == '{') {
i++;
while (i < len && text[i] != '}')
i++;
if (i < len)
i++;
} else {
if (i < len && isxdigit(text[i]))
i++;
if (i < len && isxdigit(text[i]))
i++;
if (i < len && isxdigit(text[i]))
i++;
if (i < len && isxdigit(text[i]))
i++;
}
} else if (i < len && text[i] >= '0' && text[i] <= '7') {
i++;
if (i < len && text[i] >= '0' && text[i] <= '7')
i++;
if (i < len && text[i] >= '0' && text[i] <= '7')
i++;
} else if (i < len && text[i] == 'c') {
i++;
if (i < len && text[i] != '\\')
i++;
} else if (i < len && (text[i] == 'M' || text[i] == 'C')) {
i++;
if (i < len && text[i] == '-') {
i++;
if (i < len && text[i] != '\\')
i++;
}
} else if (i < len && text[i] == 'N') {
i++;
if (i < len && text[i] == '{') {
i++;
while (i < len && text[i] != '}')
i++;
if (i < len)
i++;
}
} else {
if (i < len)
i++;
}
tokens->push_back({start, i, TokenKind::Escape});
continue;
continue;
}
if (state->full_state->lit.allow_interp && text[i] == '#' &&
i + 1 < len && text[i + 1] == '{') {
@@ -253,11 +540,62 @@ std::shared_ptr<void> ruby_parse(std::vector<Token> *tokens,
while (i < len) {
if (text[i] == '\\') {
tokens->push_back({start, i, TokenKind::Regexp});
;
start = i;
i++;
if (i < len && text[i] == 'x') {
i++;
if (i < len && isxdigit(text[i]))
i++;
if (i < len && isxdigit(text[i]))
i++;
} else if (i < len && text[i] == 'u') {
i++;
if (i < len && text[i] == '{') {
i++;
while (i < len && text[i] != '}')
i++;
if (i < len)
i++;
} else {
if (i < len && isxdigit(text[i]))
i++;
if (i < len && isxdigit(text[i]))
i++;
if (i < len && isxdigit(text[i]))
i++;
if (i < len && isxdigit(text[i]))
i++;
}
} else if (i < len && text[i] >= '0' && text[i] <= '7') {
i++;
if (i < len && text[i] >= '0' && text[i] <= '7')
i++;
if (i < len && text[i] >= '0' && text[i] <= '7')
i++;
} else if (i < len && text[i] == 'c') {
i++;
if (i < len && text[i] != '\\')
i++;
} else if (i < len && (text[i] == 'M' || text[i] == 'C')) {
i++;
if (i < len && text[i] == '-') {
i++;
if (i < len && text[i] != '\\')
i++;
}
} else if (i < len && text[i] == 'N') {
i++;
if (i < len && text[i] == '{') {
i++;
while (i < len && text[i] != '}')
i++;
if (i < len)
i++;
}
} else {
if (i < len)
i++;
}
tokens->push_back({start, i, TokenKind::Escape});
continue;
}
@@ -365,6 +703,11 @@ std::shared_ptr<void> ruby_parse(std::vector<Token> *tokens,
i++;
continue;
} else if (text[i] == '#') {
if (i == 0 && len > 4 && text[i + 1] == '!') {
state->full_state->expecting_expr = false;
tokens->push_back({0, len, TokenKind::Shebang});
return state;
}
tokens->push_back({i, len, TokenKind::Comment});
state->full_state->expecting_expr = false;
return state;
@@ -394,7 +737,7 @@ std::shared_ptr<void> ruby_parse(std::vector<Token> *tokens,
continue;
}
if (text[i] == '\'' || text[i] == '"') {
tokens->push_back({start, i, TokenKind::Operator});
tokens->push_back({start, i, TokenKind::Label});
state->full_state->expecting_expr = true;
continue;
}
@@ -793,9 +1136,31 @@ std::shared_ptr<void> ruby_parse(std::vector<Token> *tokens,
tokens->push_back({i, i + length, TokenKind::KeywordOperator});
i += length;
continue;
} else if ((length = types_trie.match(text, i, len, identifier_char))) {
tokens->push_back({i, i + length, TokenKind::Type});
i += length;
continue;
} else if ((length = methods_trie.match(text, i, len, identifier_char))) {
tokens->push_back({i, i + length, TokenKind::Function});
i += length;
continue;
} else if ((length =
builtins_trie.match(text, i, len, identifier_char))) {
tokens->push_back({i, i + length, TokenKind::Constant});
i += length;
continue;
} else if ((length = errors_trie.match(text, i, len, identifier_char))) {
tokens->push_back({i, i + length, TokenKind::Error});
i += length;
continue;
} else if (text[i] >= 'A' && text[i] <= 'Z') {
uint32_t start = i;
i += get_next_word(text, i, len);
if (i - start >= 5 && text[i - 5] == 'E' && text[i - 4] == 'r' &&
text[i - 3] == 'r' && text[i - 2] == 'o' && text[i - 1] == 'r') {
tokens->push_back({start, i, TokenKind::Error});
continue;
}
tokens->push_back({start, i, TokenKind::Constant});
continue;
} else {
@@ -899,7 +1264,3 @@ std::shared_ptr<void> ruby_parse(std::vector<Token> *tokens,
}
return state;
}
// TODO: Add tries for builtins and highlight them separately, like (Array /
// self etc)
// And in regex better highlighting of regex structures

View File

@@ -2,6 +2,12 @@
"Default": {
"fg": "#EEEEEE"
},
"Shebang": {
"fg": "#7dcfff"
},
"Error": {
"fg": "#EF5168"
},
"Comment": {
"fg": "#AAAAAA",
"italic": true