crib/src/utils/unicode.cc

#include "utfcpp/source/utf8.h"
#include "utils/utils.h"

int display_width(const char *str, size_t len) {
  if (!str || !*str)
    return 0;
  if (str[0] == '\t')
    return 4;
  unicode_width_state_t state;
  unicode_width_init(&state);
  int width = 0;
  for (size_t j = 0; j < len; j++) {
    unsigned char c = str[j];
    if (c < 128) {
      int char_width = unicode_width_process(&state, c);
      if (char_width > 0)
        width += char_width;
    } else {
      uint_least32_t cp;
      size_t bytes = grapheme_decode_utf8(str + j, strlen(str) - j, &cp);
      if (bytes > 1) {
        int char_width = unicode_width_process(&state, cp);
        if (char_width > 0)
          width += char_width;
        j += bytes - 1;
      }
    }
  }
  return width;
}

uint8_t utf8_codepoint_width(unsigned char c) {
  if ((c & 0x80) == 0x00)
    return 1;
  if ((c & 0xE0) == 0xC0)
    return 2;
  if ((c & 0xF0) == 0xE0)
    return 3;
  if ((c & 0xF8) == 0xF0)
    return 4;
  return 1;
}

uint32_t get_visual_col_from_bytes(const char *line, uint32_t len,
                                   uint32_t byte_limit) {
  if (!line)
    return 0;
  uint32_t visual_col = 0;
  uint32_t current_byte = 0;
  if (len > 0 && line[len - 1] == '\n')
    len--;
  while (current_byte < byte_limit && current_byte < len) {
    uint32_t inc = grapheme_next_character_break_utf8(line + current_byte,
                                                      len - current_byte);
    if (current_byte + inc > byte_limit)
      break;
    int w = display_width(line + current_byte, inc);
    if (w < 0)
      w = 0;
    visual_col += (uint32_t)w;
    current_byte += inc;
  }
  return visual_col;
}

uint32_t get_bytes_from_visual_col(const char *line, uint32_t len,
                                   uint32_t target_visual_col) {
  if (!line)
    return 0;
  uint32_t current_byte = 0;
  uint32_t visual_col = 0;
  if (len > 0 && line[len - 1] == '\n')
    len--;
  while (current_byte < len && visual_col < target_visual_col) {
    uint32_t inc = grapheme_next_character_break_utf8(line + current_byte,
                                                      len - current_byte);
    int w = display_width(line + current_byte, inc);
    if (w < 0)
      w = 0;
    if (visual_col + (uint32_t)w > target_visual_col)
      return current_byte;
    visual_col += (uint32_t)w;
    current_byte += inc;
  }
  return current_byte;
}

uint32_t count_clusters(const char *line, size_t len, size_t from, size_t to) {
  uint32_t count = 0;
  size_t pos = from;
  while (pos < to && pos < len) {
    size_t next =
        pos + grapheme_next_character_break_utf8(line + pos, len - pos);
    if (next > to)
      break;
    pos = next;
    count++;
  }
  return count;
}

size_t utf8_offset_to_utf16(const char *utf8, size_t utf8_len,
                            size_t byte_offset) {
  if (byte_offset > utf8_len)
    return byte_offset;
  const char *start = utf8;
  const char *mid = utf8 + byte_offset;
  if (!utf8::is_valid(start, mid))
    assert(0 && "invalid utf8");
  size_t utf16_offset = 0;
  for (auto it = start; it < mid;) {
    uint32_t codepoint = utf8::next(it, mid);
    if (codepoint <= 0xFFFF)
      utf16_offset += 1;
    else
      utf16_offset += 2;
  }
  return utf16_offset;
}

size_t utf16_offset_to_utf8(const char *utf8, size_t utf8_len,
                            size_t utf16_offset) {
  const char *start = utf8;
  const char *end = utf8 + utf8_len;
  const char *it = start;
  size_t utf16_count = 0;
  while (it < end) {
    if (utf16_count >= utf16_offset)
      break;
    const char *prev = it;
    uint32_t codepoint = utf8::next(it, end);
    if (codepoint <= 0xFFFF)
      utf16_count += 1;
    else
      utf16_count += 2;
    if (utf16_count > utf16_offset)
      return prev - start;
  }
  return it - start;
}