Improve highlighters

2026-01-18 17:49:36 +00:00
parent c9324c13aa
commit b5c49f4277
5 changed files with 408 additions and 17 deletions
--- a/include/syntax/tokens.def
+++ b/include/syntax/tokens.def
@@ -1,5 +1,7 @@
 ADD(Data)
+ADD(Shebang)
 ADD(Comment)
+ADD(Error)
 ADD(String)
 ADD(Escape)
 ADD(Interpolation)
--- a/samples/ruby.rb
+++ b/samples/ruby.rb
@@ -22,7 +22,7 @@ cjk_samples = [
 ]

 # Ruby regex with unicode
-$unicode_regex = /[一-龯ぁ-ん#{0x3000}ァ
+$unicode_regex_multiline = /[一-龯ぁ-ん#{0x3000}ァ
 \-ヶー

 s wow
@@ -30,22 +30,29 @@ s wow
 々〆〤]/

 UNICORE = %r{
-    
+    [s]
    {#{}}
-    
+    \C-s\u{10}
  }
  
 UNINITCORE = %{
    
    {{#{}}}
    
+    test = "A:\x41 B:\101 C:\u0043 D:\u{44 45} NUL:\0 DEL:\c? CTRL_A:\cA META_X:\M-x CTRL_META_X:\C-\M-x MIX:\C-\M-z N:\N{UNICODE NAME}"
+    
  }

 # Unicode identifiers (valid in Ruby)
 变量 = 0x5_4eddaee
-π = 3.14_159e+2, ?\u0234, ?\,, ?\x0A, ?s, true, false
+π = 3.14_159e+2, ?\u0234, ?\,, ?\x0A, ?s, true, false, 0
 挨拶 = -> { "こんに \n ちは" }

+arr = Array.new()
+not_arr = NotArray.new()
+
+raise NameError or SystemExit or CustomError or Errno or ErrorNotAtAll
+
 # Method using unicode variable names
 def math_test
  puts "π * 2 = #{π * 2}"
@@ -57,6 +64,7 @@ cjk_samples.each_with_index do |str, idx:|
  symbol = :"
  a
  "
+  sym2 = :hello
 end

 # Test emoji width behaviors
@@ -98,10 +106,10 @@ mixed = [
 two_docs = <<DOC1 , <<DOC2
 stuff for doc2
 DOC1
-stuff for doc 2 with \#{interpolation} and more
+stuff for doc 2 with \#{not interpolation} and more
 DOC2

-p = 0 <<22
+p = 0 <<22 # not a heredoc

 mixed.each { |m| puts m }

@@ -170,6 +178,8 @@ class TestObject
    puts "#{@name}: #{@value}"
  end

+  private
+
  def double_value
    @value * 2
  end
@@ -297,7 +307,7 @@ TEXT
 puts multi_line

 # Symbols and strings
-sym = :my_symbol
+sym = :my_symbol == __dir__
 str = "my string"
 puts "Symbol: #{sym}, String: #{str}"

@@ -336,3 +346,9 @@ puts "Match 'fox'?" if sample_text =~ /fox/

 # End of test script
 puts "Ruby syntax highlighting test complete."
+
+__END__
+
+
+Anything here should be ignored >><<
+{{{}}}[[[]]](((000)))
--- a/src/syntax/bash.cc
+++ b/src/syntax/bash.cc
@@ -70,3 +70,9 @@ std::shared_ptr<void> bash_parse(std::vector<Token> *tokens,
  }
  return state;
 }
+
+// Single-quoted strings '...' are strictly literal: no escaping inside.
+// Double-quoted strings "..." allow escaping and interpolation: $var, ${var},
+// $((math)), $(command) and `command` expansions.
+// ANSI-C quoted strings $'...' take backslash escapes, including \xHH, \uHHHH,
+// \UHHHHHHHH and \cX.
--- a/src/syntax/ruby.cc
+++ b/src/syntax/ruby.cc
@@ -1,6 +1,182 @@
 #include "syntax/decl.h"
 #include "syntax/langs.h"

+// TODO: better highlighting of regex structures inside regex literals
+
+const static std::vector<std::string> types = { // tagged TokenKind::Type
+    "BasicObject", "Object",     "NilClass",
+    "TrueClass",   "FalseClass", "Integer",
+    "Fixnum",      "Bignum",     "Float",
+    "Rational",    "Complex",    "Numeric",
+    "String",      "Symbol",     "Array",
+    "Hash",        "Range",      "Regexp",
+    "Struct",      "Enumerator", "Enumerable",
+    "Time",        "Date",       "IO",
+    "File",        "Dir",        "Thread",
+    "Proc",        "Method",     "Module",
+    "Class",       "Mutex",      "ConditionVariable",
+    "MatchData",   "Encoding",   "Fiber",
+};
+
+const static std::vector<std::string> builtins = { // tagged TokenKind::Constant
+    "ARGF",
+    "ARGV",
+    "ENV",
+    "STDIN",
+    "STDOUT",
+    "STDERR",
+    "DATA",
+    "TOPLEVEL_BINDING",
+    "RUBY_PLATFORM",
+    "RUBY_VERSION",
+    "RUBY_RELEASE_DATE",
+    "RUBY_PATCHLEVEL",
+    "RUBY_ENGINE",
+    "__LINE__",
+    "__FILE__",
+    "__ENCODING__",
+    "__dir__",
+    "__callee__",
+    "__method__",
+    "__id__",
+    "__send__",
+};
+
+const static std::vector<std::string> methods = { // tagged TokenKind::Function
+    "abort",
+    "at_exit",
+    "binding",
+    "block_given?",
+    "caller",
+    "catch",
+    "chomp",
+    "chomp!",
+    "chop",
+    "chop!",
+    "eval",
+    "exec",
+    "exit",
+    "exit!",
+    "fail",
+    "fork",
+    "format",
+    "gets",
+    "global_variables",
+    "gsub",
+    "gsub!",
+    "iterator?",
+    "lambda",
+    "load",
+    "loop",
+    "open",
+    "print",
+    "printf",
+    "proc",
+    "putc",
+    "puts",
+    "raise",
+    "rand",
+    "readline",
+    "readlines",
+    "require",
+    "require_relative",
+    "select",
+    "sleep",
+    "spawn",
+    "split",
+    "sprintf",
+    "srand",
+    "sub",
+    "sub!",
+    "syscall",
+    "system",
+    "test",
+    "throw",
+    "trace_var",
+    "trap",
+    "untrace_var",
+    "attr",
+    "attr_reader",
+    "attr_writer",
+    "attr_accessor",
+    "class_variable_get",
+    "class_variable_set",
+    "define_method",
+    "instance_variable_get",
+    "instance_variable_set",
+    "private",
+    "protected",
+    "public",
+    "public_class_method",
+    "module_function",
+    "remove_method",
+    "undef_method",
+    "method",
+    "methods",
+    "singleton_methods",
+    "private_methods",
+    "protected_methods",
+    "public_methods",
+    "send",
+    "extend",
+    "include",
+    "prepend",
+    "clone",
+    "dup",
+    "freeze",
+    "taint",
+    "untaint",
+    "trust",
+    "untrust",
+    "tainted?",
+    "untrusted?",
+    "each",
+    "each_with_index",
+    "each_with_object",
+    "map",
+    "collect",
+    "filter",
+    "reject",
+    "reduce",
+    "inject",
+    "find",
+    "detect",
+    "all?",
+    "any?",
+    "none?",
+    "one?",
+    "count",
+    "cycle",
+    "drop",
+    "drop_while",
+    "take",
+    "take_while",
+    "chunk",
+    "chunk_while",
+    "group_by",
+    "partition",
+    "slice_before",
+    "slice_after",
+    "nil?",
+    "is_a?",
+    "kind_of?",
+    "instance_of?",
+    "respond_to?",
+    "equal?",
+    "object_id",
+    "class",
+    "singleton_class",
+    "frozen?",
+    "itself",
+    "tap",
+    "then",
+};
+
+const static std::vector<std::string> errors = { // tagged TokenKind::Error
+    "Exception", "SignalException", "Interrupt", "StopIteration",
+    "Errno",     "SystemExit",      "fatal",
+}; // constants ending in "Error" get the same tag via a suffix check
+
 const static std::vector<std::string> base_keywords = {
    "class", "module", "begin", "end", "else", "rescue", "ensure", "do", "when",
 };
@@ -119,12 +295,20 @@ std::shared_ptr<void> ruby_parse(std::vector<Token> *tokens,
  static Trie operator_keywords_trie;
  static Trie expecting_operators_trie;
  static Trie operator_trie;
+  static Trie types_trie;
+  static Trie builtins_trie;
+  static Trie methods_trie;
+  static Trie errors_trie;
  if (!keywords_trie_init) {
    base_keywords_trie.build(base_keywords);
    expecting_keywords_trie.build(expecting_keywords);
    operator_keywords_trie.build(operator_keywords);
    expecting_operators_trie.build(expecting_operators);
    operator_trie.build(operators);
+    types_trie.build(types);
+    builtins_trie.build(builtins);
+    methods_trie.build(methods);
+    errors_trie.build(errors);
    keywords_trie_init = true;
  }
  tokens->clear();
@@ -174,8 +358,60 @@ std::shared_ptr<void> ruby_parse(std::vector<Token> *tokens,
            tokens->push_back({start, i, TokenKind::String});
            start = i;
            i++;
+            if (i < len && text[i] == 'x') {
+              i++;
+              if (i < len && isxdigit(text[i]))
+                i++;
+              if (i < len && isxdigit(text[i]))
+                i++;
+            } else if (i < len && text[i] == 'u') {
+              i++;
+              if (i < len && text[i] == '{') {
+                i++;
+                while (i < len && text[i] != '}')
+                  i++;
                if (i < len)
                  i++;
+              } else {
+                if (i < len && isxdigit(text[i]))
+                  i++;
+                if (i < len && isxdigit(text[i]))
+                  i++;
+                if (i < len && isxdigit(text[i]))
+                  i++;
+                if (i < len && isxdigit(text[i]))
+                  i++;
+              }
+            } else if (i < len && text[i] >= '0' && text[i] <= '7') {
+              i++;
+              if (i < len && text[i] >= '0' && text[i] <= '7')
+                i++;
+              if (i < len && text[i] >= '0' && text[i] <= '7')
+                i++;
+            } else if (i < len && text[i] == 'c') {
+              i++;
+              if (i < len && text[i] != '\\')
+                i++;
+            } else if (i < len && (text[i] == 'M' || text[i] == 'C')) {
+              i++;
+              if (i < len && text[i] == '-') {
+                i++;
+                if (i < len && text[i] != '\\')
+                  i++;
+              }
+            } else if (i < len && text[i] == 'N') {
+              i++;
+              if (i < len && text[i] == '{') {
+                i++;
+                while (i < len && text[i] != '}')
+                  i++;
+                if (i < len)
+                  i++;
+              }
+            } else {
+              if (i < len)
+                i++;
+            }
            tokens->push_back({start, i, TokenKind::Escape});
            continue;
          }
@@ -202,11 +438,62 @@ std::shared_ptr<void> ruby_parse(std::vector<Token> *tokens,
          tokens->push_back({start, i, TokenKind::String});
          start = i;
          i++;
+          if (i < len && text[i] == 'x') {
+            i++;
+            if (i < len && isxdigit(text[i]))
+              i++;
+            if (i < len && isxdigit(text[i]))
+              i++;
+          } else if (i < len && text[i] == 'u') {
+            i++;
+            if (i < len && text[i] == '{') {
+              i++;
+              while (i < len && text[i] != '}')
+                i++;
              if (i < len)
                i++;
+            } else {
+              if (i < len && isxdigit(text[i]))
+                i++;
+              if (i < len && isxdigit(text[i]))
+                i++;
+              if (i < len && isxdigit(text[i]))
+                i++;
+              if (i < len && isxdigit(text[i]))
+                i++;
+            }
+          } else if (i < len && text[i] >= '0' && text[i] <= '7') {
+            i++;
+            if (i < len && text[i] >= '0' && text[i] <= '7')
+              i++;
+            if (i < len && text[i] >= '0' && text[i] <= '7')
+              i++;
+          } else if (i < len && text[i] == 'c') {
+            i++;
+            if (i < len && text[i] != '\\')
+              i++;
+          } else if (i < len && (text[i] == 'M' || text[i] == 'C')) {
+            i++;
+            if (i < len && text[i] == '-') {
+              i++;
+              if (i < len && text[i] != '\\')
+                i++;
+            }
+          } else if (i < len && text[i] == 'N') {
+            i++;
+            if (i < len && text[i] == '{') {
+              i++;
+              while (i < len && text[i] != '}')
+                i++;
+              if (i < len)
+                i++;
+            }
+          } else {
+            if (i < len)
+              i++;
+          }
          tokens->push_back({start, i, TokenKind::Escape});
          continue;
-          continue;
        }
        if (state->full_state->lit.allow_interp && text[i] == '#' &&
            i + 1 < len && text[i + 1] == '{') {
@@ -253,11 +540,62 @@ std::shared_ptr<void> ruby_parse(std::vector<Token> *tokens,
      while (i < len) {
        if (text[i] == '\\') {
          tokens->push_back({start, i, TokenKind::Regexp});
-          ;
          start = i;
          i++;
+          if (i < len && text[i] == 'x') {
+            i++;
+            if (i < len && isxdigit(text[i]))
+              i++;
+            if (i < len && isxdigit(text[i]))
+              i++;
+          } else if (i < len && text[i] == 'u') {
+            i++;
+            if (i < len && text[i] == '{') {
+              i++;
+              while (i < len && text[i] != '}')
+                i++;
              if (i < len)
                i++;
+            } else {
+              if (i < len && isxdigit(text[i]))
+                i++;
+              if (i < len && isxdigit(text[i]))
+                i++;
+              if (i < len && isxdigit(text[i]))
+                i++;
+              if (i < len && isxdigit(text[i]))
+                i++;
+            }
+          } else if (i < len && text[i] >= '0' && text[i] <= '7') {
+            i++;
+            if (i < len && text[i] >= '0' && text[i] <= '7')
+              i++;
+            if (i < len && text[i] >= '0' && text[i] <= '7')
+              i++;
+          } else if (i < len && text[i] == 'c') {
+            i++;
+            if (i < len && text[i] != '\\')
+              i++;
+          } else if (i < len && (text[i] == 'M' || text[i] == 'C')) {
+            i++;
+            if (i < len && text[i] == '-') {
+              i++;
+              if (i < len && text[i] != '\\')
+                i++;
+            }
+          } else if (i < len && text[i] == 'N') {
+            i++;
+            if (i < len && text[i] == '{') {
+              i++;
+              while (i < len && text[i] != '}')
+                i++;
+              if (i < len)
+                i++;
+            }
+          } else {
+            if (i < len)
+              i++;
+          }
          tokens->push_back({start, i, TokenKind::Escape});
          continue;
        }
@@ -365,6 +703,11 @@ std::shared_ptr<void> ruby_parse(std::vector<Token> *tokens,
      i++;
      continue;
    } else if (text[i] == '#') {
+      if (i == 0 && len > 4 && text[i + 1] == '!') {
+        state->full_state->expecting_expr = false;
+        tokens->push_back({0, len, TokenKind::Shebang});
+        return state;
+      }
      tokens->push_back({i, len, TokenKind::Comment});
      state->full_state->expecting_expr = false;
      return state;
@@ -394,7 +737,7 @@ std::shared_ptr<void> ruby_parse(std::vector<Token> *tokens,
        continue;
      }
      if (text[i] == '\'' || text[i] == '"') {
-        tokens->push_back({start, i, TokenKind::Operator});
+        tokens->push_back({start, i, TokenKind::Label});
        state->full_state->expecting_expr = true;
        continue;
      }
@@ -793,9 +1136,31 @@ std::shared_ptr<void> ruby_parse(std::vector<Token> *tokens,
        tokens->push_back({i, i + length, TokenKind::KeywordOperator});
        i += length;
        continue;
+      } else if ((length = types_trie.match(text, i, len, identifier_char))) {
+        tokens->push_back({i, i + length, TokenKind::Type});
+        i += length;
+        continue;
+      } else if ((length = methods_trie.match(text, i, len, identifier_char))) {
+        tokens->push_back({i, i + length, TokenKind::Function});
+        i += length;
+        continue;
+      } else if ((length =
+                      builtins_trie.match(text, i, len, identifier_char))) {
+        tokens->push_back({i, i + length, TokenKind::Constant});
+        i += length;
+        continue;
+      } else if ((length = errors_trie.match(text, i, len, identifier_char))) {
+        tokens->push_back({i, i + length, TokenKind::Error});
+        i += length;
+        continue;
      } else if (text[i] >= 'A' && text[i] <= 'Z') {
        uint32_t start = i;
        i += get_next_word(text, i, len);
+        if (i - start >= 5 && text[i - 5] == 'E' && text[i - 4] == 'r' &&
+            text[i - 3] == 'r' && text[i - 2] == 'o' && text[i - 1] == 'r') {
+          tokens->push_back({start, i, TokenKind::Error});
+          continue;
+        }
        tokens->push_back({start, i, TokenKind::Constant});
        continue;
      } else {
@@ -899,7 +1264,3 @@ std::shared_ptr<void> ruby_parse(std::vector<Token> *tokens,
  }
  return state;
 }
-
-// TODO: Add trie's for builtins and highlight them separately liek (Array /
-// self etc)
-// And in regex better highlighting of regex structures
--- a/themes/default.json
+++ b/themes/default.json
@@ -2,6 +2,12 @@
  "Default": {
    "fg": "#EEEEEE"
  },
+  "Shebang": {
+    "fg": "#7dcfff"
+  },
+  "Error": {
+    "fg": "#EF5168"
+  },
  "Comment": {
    "fg": "#AAAAAA",
    "italic": true