#include "../include/knot.h" #include #include #include #include #include #include #include #include static void update(Knot *n) { if (!n) return; if (!n->depth || n->depth == 0) return; uint32_t left_chars = n->left ? n->left->char_count : 0; uint32_t right_chars = n->right ? n->right->char_count : 0; n->char_count = left_chars + right_chars; uint32_t left_lines = n->left ? n->left->line_count : 0; uint32_t right_lines = n->right ? n->right->line_count : 0; n->line_count = left_lines + right_lines; uint8_t left_depth = n->left ? n->left->depth : 0; uint8_t right_depth = n->right ? n->right->depth : 0; n->depth = MAX(left_depth, right_depth) + 1; n->chunk_size = n->left ? n->left->chunk_size : n->right->chunk_size; } uint32_t optimal_chunk_size(uint64_t length) { if (length <= MIN_CHUNK_SIZE) return MIN_CHUNK_SIZE; double target_exponent = MIN(std::log2((double)MAX_CHUNK_SIZE), 7.0 + (std::log2((double)length) - 10.0) * 0.25); uint32_t final_chunk_size = MAX((uint32_t)MIN_CHUNK_SIZE, (uint32_t)std::pow(2.0, target_exponent)); final_chunk_size = MIN(final_chunk_size, (uint32_t)MAX_CHUNK_SIZE); final_chunk_size = 1U << (32 - __builtin_clz(final_chunk_size - 1)); return final_chunk_size; } // str is not consumed and \0 is not handled // So if str is null terminated then len must be strlen(str) // and freed by caller Knot *load(char *str, uint32_t len, uint32_t chunk_size) { if (len > (uint32_t)(chunk_size - (chunk_size / 16))) { Knot *left = load(str, len / 2, chunk_size); Knot *right = load(str + len / 2, len - len / 2, chunk_size); Knot *node = (Knot *)malloc(sizeof(Knot)); if (!node) return nullptr; node->left = left; node->right = right; node->chunk_size = chunk_size; node->depth = MAX(left->depth, right->depth) + 1; node->char_count = left->char_count + right->char_count; node->line_count = left->line_count + right->line_count; return node; } else { Knot *node = (Knot *)malloc(sizeof(Knot) + chunk_size + 1); if (!node) return nullptr; node->left = nullptr; node->right = nullptr; node->chunk_size = chunk_size; node->depth = 0; node->char_count = len; uint32_t newline_count = 0; for (uint32_t i = 0; i < len; i++) { char c = str[i]; node->data[i] = c; if (c == '\n') newline_count++; } node->line_count = newline_count; return node; } } // leaf if consumed and freed (so dont use or free it after) // left and right are the new nodes static void split_leaf(Knot *leaf, uint32_t k, Knot **left, Knot **right) { Knot *left_node = (Knot *)malloc(sizeof(Knot) + leaf->chunk_size + 1); left_node->left = nullptr; left_node->right = nullptr; left_node->chunk_size = leaf->chunk_size; left_node->depth = 0; left_node->char_count = k; uint32_t newline_count = 0; for (uint32_t i = 0; i < k; i++) { char c = leaf->data[i]; left_node->data[i] = c; if (c == '\n') newline_count++; } left_node->line_count = newline_count; uint16_t right_line_count = leaf->line_count - newline_count; *left = left_node; Knot *right_node = (Knot *)malloc(sizeof(Knot) + leaf->chunk_size + 1); right_node->left = nullptr; right_node->right = nullptr; right_node->chunk_size = leaf->chunk_size; right_node->depth = 0; right_node->char_count = leaf->char_count - k; right_node->line_count = right_line_count; for (uint32_t i = k; i < leaf->char_count; i++) { char c = leaf->data[i]; right_node->data[i - k] = c; } *right = right_node; free(leaf); } // This makes node nonsensical, so dont use or free it after void split(Knot *node, uint32_t offset, Knot **left, Knot **right) { if (!node) { *left = nullptr; *right = nullptr; return; } if (node->depth == 0) { split_leaf(node, offset, left, right); return; } uint32_t left_size = node->left ? node->left->char_count : 0; if (offset < left_size) { Knot *L = nullptr, *R = nullptr; split(node->left, offset, &L, &R); node->left = R; update(node); *right = node; *left = L; } else { uint32_t new_offset = offset - left_size; Knot *L = nullptr, *R = nullptr; split(node->right, new_offset, &L, &R); node->right = L; update(node); *left = node; *right = R; } } static inline int get_balance_factor(Knot *n) { if (!n) return 0; return (int)DEPTH(n->left) - (int)DEPTH(n->right); } static inline Knot *rotate_right(Knot *y) { Knot *x = y->left; Knot *T2 = x->right; x->right = y; y->left = T2; update(y); update(x); return x; } static inline Knot *rotate_left(Knot *x) { Knot *y = x->right; Knot *T2 = y->left; y->left = x; x->right = T2; update(x); update(y); return y; } // Technically n can be used after calling // but use return value instead Knot *balance(Knot *n) { update(n); int bal = get_balance_factor(n); if (bal > 1) { if (get_balance_factor(n->left) < 0) n->left = rotate_left(n->left); return rotate_right(n); } if (bal < -1) { if (get_balance_factor(n->right) > 0) n->right = rotate_right(n->right); return rotate_left(n); } return n; } // Dont free left or right after calling (only free return value) // Assumes both ropes have equal chunk sizes Knot *concat(Knot *left, Knot *right) { if (!left) return right; if (!right) return left; if (!left || left->char_count == 0) { if (left) free_rope(left); return right; } if (!right || right->char_count == 0) { if (right) free_rope(right); return left; } if (left->depth == 0 && right->depth == 0) { if (left->char_count + right->char_count <= left->chunk_size) { Knot *node = (Knot *)malloc(sizeof(Knot) + left->chunk_size + 1); node->left = nullptr; node->right = nullptr; node->chunk_size = left->chunk_size; node->depth = 0; node->char_count = left->char_count + right->char_count; node->line_count = left->line_count + right->line_count; memcpy(node->data, left->data, left->char_count); memcpy(node->data + left->char_count, right->data, right->char_count); free(left); free(right); return node; } } uint16_t d_left = left->depth; uint16_t d_right = right->depth; if (d_left > d_right + 1) { left->right = concat(left->right, right); return balance(left); } if (d_right > d_left + 1) { right->left = concat(left, right->left); return balance(right); } Knot *node = (Knot *)malloc(sizeof(Knot)); if (!node) return nullptr; node->left = left; node->right = right; node->chunk_size = left->chunk_size; node->depth = MAX(d_left, d_right) + 1; update(node); return node; } // This makes node nonsensical, so dont use or free it after // Instead, free the return value or use it in node's place Knot *insert(Knot *node, uint32_t offset, char *str, uint32_t len) { if (!node) return nullptr; if (node->depth == 0 && node->char_count + len <= node->chunk_size) { if (offset < node->char_count) memmove(node->data + offset + len, node->data + offset, node->char_count - offset); memcpy(node->data + offset, str, len); node->char_count += len; for (uint32_t i = 0; i < len; i++) if (str[i] == '\n') node->line_count++; return node; } if (node->depth > 0) { uint32_t left_count = node->left ? node->left->char_count : 0; if (offset < left_count) { Knot *new_left = insert(node->left, offset, str, len); node->left = new_left; update(node); return balance(node); } else { Knot *new_right = insert(node->right, offset - left_count, str, len); node->right = new_right; update(node); return balance(node); } } Knot *left_part = nullptr; Knot *right_part = nullptr; split(node, offset, &left_part, &right_part); Knot *middle_part = load(str, len, left_part->chunk_size); return concat(concat(left_part, middle_part), right_part); } // This makes node nonsensical, so dont use or free it after // Instead, free the return value or use it in node's place Knot *erase(Knot *node, uint32_t offset, uint32_t len) { if (!node || len == 0 || offset >= node->char_count) return node; if (offset + len > node->char_count) len = node->char_count - offset; if (node->depth == 0) { uint32_t deleted_newlines = 0; for (uint32_t i = offset; i < offset + len; i++) if (node->data[i] == '\n') deleted_newlines++; node->line_count -= deleted_newlines; if (offset + len < node->char_count) memmove(node->data + offset, node->data + offset + len, node->char_count - (offset + len)); node->char_count -= len; return node; } uint32_t left_count = node->left ? node->left->char_count : 0; if (offset + len <= left_count) { node->left = erase(node->left, offset, len); } else if (offset >= left_count) { node->right = erase(node->right, offset - left_count, len); } else { Knot *left = nullptr, *middle = nullptr, *right = nullptr; split(node, offset, &left, &right); split(right, len, &middle, &right); free_rope(middle); return concat(left, right); } update(node); return balance(node); } static void _read_into(Knot *node, uint32_t offset, uint32_t len, char *dest) { if (!node || len == 0) return; if (node->depth == 0) { memcpy(dest, node->data + offset, len); return; } Knot *left = node->left; uint32_t left_count = left ? left->char_count : 0; if (offset < left_count) { uint32_t chunk_len = left_count - offset; if (chunk_len > len) chunk_len = len; _read_into(left, offset, chunk_len, dest); dest += chunk_len; len -= chunk_len; offset = 0; } else { offset -= left_count; } if (len > 0 && node->right) _read_into(node->right, offset, len, dest); } char *read(Knot *root, uint32_t offset, uint32_t len) { if (!root) return nullptr; if (offset >= root->char_count) { char *empty = (char *)malloc(1); if (empty) empty[0] = '\0'; return empty; } if (offset + len > root->char_count) { len = root->char_count - offset; } char *buffer = (char *)malloc((len + 1) * sizeof(char)); if (!buffer) return nullptr; _read_into(root, offset, len, buffer); buffer[len] = '\0'; return buffer; } // Hopefully free the tree only once at the end of its use using the pointer // from the last insert or concat or erase call. // (or use twice if last call was split - for both left and right). void free_rope(Knot *root) { if (!root) return; free_rope(root->left); free_rope(root->right); free(root); } static uint32_t find_nth_newline_offset(Knot *node, uint32_t n) { if (!node || n > node->line_count) return UINT32_MAX; if (node->depth == 0) { uint32_t count = 0; for (uint32_t i = 0; i < node->char_count; i++) { if (node->data[i] == '\n') { if (count == n) return i; count++; } } return UINT32_MAX; } uint32_t left_lines = node->left ? node->left->line_count : 0; if (n < left_lines) { return find_nth_newline_offset(node->left, n); } else { uint32_t right_offset = find_nth_newline_offset(node->right, n - left_lines); if (right_offset == UINT32_MAX) return UINT32_MAX; uint32_t left_chars = node->left ? node->left->char_count : 0; return left_chars + right_offset; } } uint32_t byte_to_line(Knot *node, uint32_t offset, uint32_t *out_col) { if (!node) { if (out_col) *out_col = 0; return 0; } if (offset >= node->char_count) { if (out_col) *out_col = 0; return node->line_count; } if (node->depth == 0) { uint32_t line = 0; uint32_t col = 0; for (uint32_t i = 0; i < offset; i++) { if (node->data[i] == '\n') { line++; col = 0; } else { col++; } } if (out_col) *out_col = col; return line; } uint32_t left_chars = node->left ? node->left->char_count : 0; if (offset < left_chars) { return byte_to_line(node->left, offset, out_col); } else { uint32_t sub_col = 0; uint32_t line = (node->left ? node->left->line_count : 0) + byte_to_line(node->right, offset - left_chars, &sub_col); if (out_col) *out_col = sub_col; return line; } } uint32_t line_to_byte(Knot *node, uint32_t line, uint32_t *out_len) { if (!node) { if (out_len) *out_len = 0; return 0; } uint32_t start_offset = 0; uint32_t end_offset = 0; if (line == 0) { start_offset = 0; } else { uint32_t prev_newline = find_nth_newline_offset(node, line - 1); if (prev_newline == UINT32_MAX) start_offset = node->char_count; else start_offset = prev_newline + 1; } uint32_t current_newline = find_nth_newline_offset(node, line); if (current_newline == UINT32_MAX) end_offset = node->char_count; else end_offset = current_newline + 1; if (out_len) { if (end_offset > start_offset) *out_len = end_offset - start_offset; else *out_len = 0; } return start_offset; } LineIterator *begin_l_iter(Knot *root, uint32_t start_line) { if (!root) return nullptr; if (start_line > root->line_count) return nullptr; LineIterator *it = (LineIterator *)malloc(sizeof(LineIterator)); if (!it) return nullptr; it->top = 0; it->line = start_line; it->node = nullptr; if (start_line == 0) { it->offset = 0; while (root->left) { it->stack[it->top++] = root; root = root->left; } if (!root->left && !root->right) it->node = root; it->stack[it->top++] = root; return it; } Knot *curr = root; uint32_t relative_line = --start_line; it->line = start_line; while (curr) { it->stack[it->top++] = curr; if (!curr->left && !curr->right) { it->node = curr; break; } uint32_t left_lines = (curr->left) ? curr->left->line_count : 0; if (relative_line < left_lines) { curr = curr->left; } else { relative_line -= left_lines; curr = curr->right; } } if (!it->node) { free(it); return nullptr; } it->offset = 0; if (relative_line > 0) { uint32_t found_newlines = 0; uint32_t i = 0; for (i = 0; i < it->node->char_count; i++) { if (it->node->data[i] == '\n') { found_newlines++; if (found_newlines == relative_line) { it->offset = i + 1; break; } } } } free(next_line(it)); return it; } static inline void iter_advance_leaf(LineIterator *it) { if (it->top == 0) { it->node = nullptr; return; } Knot *prev = it->stack[--it->top]; while (it->top > 0) { Knot *parent = it->stack[it->top - 1]; if (parent->left == prev && parent->right) { Knot *curr = parent->right; while (curr) { it->stack[it->top++] = curr; if (!curr->left && !curr->right) { it->node = curr; it->offset = 0; return; } curr = (curr->left) ? curr->left : curr->right; } } prev = it->stack[--it->top]; } it->node = nullptr; } char *next_line(LineIterator *it) { if (!it || !it->node) return nullptr; size_t capacity = 128; size_t len = 0; char *buffer = (char *)malloc(capacity); if (!buffer) return nullptr; while (it->node) { if (it->offset >= it->node->char_count) { iter_advance_leaf(it); if (!it->node) break; } char *start = it->node->data + it->offset; char *end = it->node->data + it->node->char_count; char *newline_ptr = (char *)memchr(start, '\n', end - start); size_t chunk_len; int found_newline = 0; if (newline_ptr) { chunk_len = (newline_ptr - start) + 1; found_newline = 1; } else { chunk_len = end - start; } if (len + chunk_len + 1 > capacity) { capacity = (capacity * 2) + chunk_len; char *new_buf = (char *)realloc(buffer, capacity); if (!new_buf) { free(buffer); return nullptr; } buffer = new_buf; } memcpy(buffer + len, start, chunk_len); len += chunk_len; it->offset += chunk_len; if (found_newline) { buffer[len] = '\0'; it->line++; return buffer; } } if (len > 0) { buffer[len] = '\0'; it->line++; return buffer; } free(buffer); return nullptr; } LeafIterator *begin_k_iter(Knot *root, uint32_t start_offset) { if (!root) return nullptr; LeafIterator *it = (LeafIterator *)malloc(sizeof(LeafIterator)); if (!it) return nullptr; it->top = 0; it->adjustment = 0; Knot *curr = root; while (curr) { it->stack[it->top++] = curr; if (!curr->left && !curr->right) { if (start_offset > curr->char_count) { free(it); return nullptr; } it->node = curr; it->adjustment = start_offset; return it; } uint32_t left_size = (curr->left) ? curr->left->char_count : 0; if (start_offset < left_size) { curr = curr->left; } else { start_offset -= left_size; curr = curr->right; } } free(it); return nullptr; } // Caller must never free the returned string char *next_leaf(LeafIterator *it, uint32_t *out_len) { if (!it || !it->node) return nullptr; char *data_to_return = it->node->data + it->adjustment; if (out_len) *out_len = it->node->char_count - it->adjustment; it->node->data[it->node->char_count] = '\0'; it->adjustment = 0; Knot *prev_leaf = it->node; Knot *parent = nullptr; while (it->top > 0) { parent = it->stack[--it->top]; if (parent->right && parent->right != prev_leaf) { Knot *curr = parent->right; while (curr) { it->stack[it->top++] = curr; if (!curr->left && !curr->right) { it->node = curr; return data_to_return; } curr = curr->left; if (!curr) curr = it->stack[it->top - 1]->right; } } prev_leaf = parent; } it->node = nullptr; return data_to_return; } ByteIterator *begin_b_iter(Knot *root) { ByteIterator *b_it = (ByteIterator *)malloc(sizeof(ByteIterator)); LeafIterator *l_it = begin_k_iter(root, 0); b_it->it = l_it; b_it->offset_g = 0; b_it->offset_l = 0; b_it->char_count = 0; b_it->data = nullptr; return b_it; } char next_byte(ByteIterator *it) { if (it->data && it->offset_l < it->char_count) { return it->data[it->offset_l++]; } else { it->offset_g += it->offset_l; it->offset_l = 1; char *data = next_leaf(it->it, &it->char_count); if (!data) return '\0'; while (it->char_count <= 0) { data = next_leaf(it->it, &it->char_count); if (!data) return '\0'; } it->data = data; return *it->data; } } // Caller must NOT free returned string. // Returns nullptr if offset is invalid or no leaf found. char *leaf_from_offset(Knot *root, uint32_t start_offset, uint32_t *out_len) { if (!root) return nullptr; Knot *curr = root; while (curr) { if (!curr->left && !curr->right) { if (start_offset > curr->char_count) return nullptr; char *result = curr->data + start_offset; if (out_len) *out_len = curr->char_count - start_offset; curr->data[curr->char_count] = '\0'; return result; } uint32_t left_size = curr->left ? curr->left->char_count : 0; if (start_offset < left_size) { curr = curr->left; } else { start_offset -= left_size; curr = curr->right; } } return nullptr; } std::vector> search_rope(Knot *root, const char *pattern) { std::vector> results; int errorcode; PCRE2_SIZE erroffset; pcre2_code *re = pcre2_compile((PCRE2_SPTR)pattern, PCRE2_ZERO_TERMINATED, 0, &errorcode, &erroffset, nullptr); if (!re) { fprintf(stderr, "PCRE2 compile error: %d\n", errorcode); return results; } pcre2_match_data *mdata = pcre2_match_data_create(128, nullptr); int workspace[PCRE_WORKSPACE_SIZE]; LeafIterator *it = begin_k_iter(root, 0); if (!it) { pcre2_code_free(re); pcre2_match_data_free(mdata); return results; } size_t chunk_abs_offset = 0; size_t saved_match_start = 0; bool match_in_progress = false; int flags = PCRE2_PARTIAL_SOFT; while (1) { const char *chunk_start = next_leaf(it, nullptr); if (!chunk_start) break; size_t chunk_len = strlen(chunk_start); const char *current_ptr = chunk_start; size_t remaining_len = chunk_len; while (remaining_len > 0) { int rc = pcre2_dfa_match(re, (PCRE2_SPTR)current_ptr, remaining_len, 0, flags, mdata, nullptr, workspace, PCRE_WORKSPACE_SIZE); if (rc >= 0) { PCRE2_SIZE *ov = pcre2_get_ovector_pointer(mdata); size_t match_start_abs; size_t match_end_abs; if (match_in_progress) { match_start_abs = saved_match_start; match_end_abs = chunk_abs_offset + (current_ptr - chunk_start) + ov[1]; } else { match_start_abs = chunk_abs_offset + (current_ptr - chunk_start) + ov[0]; match_end_abs = chunk_abs_offset + (current_ptr - chunk_start) + ov[1]; } size_t total_len = match_end_abs - match_start_abs; results.push_back(std::make_pair(match_start_abs, total_len)); size_t consumed = ov[1]; if (consumed == 0) consumed = 1; current_ptr += consumed; if (consumed > remaining_len) remaining_len = 0; else remaining_len -= consumed; match_in_progress = false; flags = PCRE2_PARTIAL_SOFT | PCRE2_NOTBOL; continue; } else if (rc == PCRE2_ERROR_PARTIAL) { PCRE2_SIZE *ov = pcre2_get_ovector_pointer(mdata); if (!match_in_progress) { saved_match_start = chunk_abs_offset + (current_ptr - chunk_start) + ov[0]; match_in_progress = true; } flags |= PCRE2_DFA_RESTART; flags |= PCRE2_NOTBOL; break; } else { if (match_in_progress) { match_in_progress = false; flags = PCRE2_PARTIAL_SOFT | PCRE2_NOTBOL; current_ptr++; remaining_len--; } else { break; } // if (rc != PCRE2_ERROR_NOMATCH) {} // handle error } } chunk_abs_offset += chunk_len; if (!match_in_progress) flags = PCRE2_PARTIAL_SOFT | PCRE2_NOTBOL; } pcre2_match_data_free(mdata); pcre2_code_free(re); free(it); return results; } // TODO: Optimize and make it actually utilize capture groups etc. // // static const size_t MAX_OVERLAP = 1024; // // std::vector> search_rope_new(Knot *root, // const char *pattern) { // std::vector> results; // int errorcode; // PCRE2_SIZE erroffset; // // // 1. Compile (Standard compilation) // pcre2_code *re = pcre2_compile((PCRE2_SPTR)pattern, PCRE2_ZERO_TERMINATED, // 0, // &errorcode, &erroffset, nullptr); // if (!re) { // fprintf(stderr, "PCRE2 compile error: %d\n", errorcode); // return results; // } // // pcre2_match_data *mdata = pcre2_match_data_create_from_pattern(re, // nullptr); // // LeafIterator *it = begin_k_iter(root, 0); // if (!it) { // pcre2_code_free(re); // pcre2_match_data_free(mdata); // return results; // } // // // Buffer to hold (Last X chars) + (Current Chunk) // std::string buffer; // // // Tracks where the *start* of the current buffer is located relative to // the // // whole rope // size_t buffer_abs_offset = 0; // // // Tracks the absolute offset up to which we have already "cleared" // matches. // // This prevents reporting a match twice if it sits inside the overlap // region. size_t processed_up_to_abs = 0; // // while (1) { // // 2. Get next chunk // const char *chunk_start = next_leaf(it, nullptr); // if (!chunk_start) // break; // // // 3. Update Buffer: Append new data // size_t chunk_len = strlen(chunk_start); // buffer.append(chunk_start, chunk_len); // // PCRE2_SPTR subject = (PCRE2_SPTR)buffer.c_str(); // size_t subject_len = buffer.length(); // size_t start_offset = 0; // // // 4. Run pcre2_match loop on the current window // while (true) { // int rc = pcre2_match(re, subject, subject_len, start_offset, // 0, // Default options // mdata, nullptr); // // if (rc < 0) { // // No match (or error) in the rest of this buffer // break; // } // // PCRE2_SIZE *ovector = pcre2_get_ovector_pointer(mdata); // size_t match_local_start = ovector[0]; // size_t match_local_end = ovector[1]; // // // Calculate Absolute Coordinates // size_t match_abs_start = buffer_abs_offset + match_local_start; // size_t match_len = match_local_end - match_local_start; // // // 5. Deduplication Check // // If we find a match that starts *before* where we finished processing // // the previous chunk, it means this match is entirely inside the // // overlap region and was reported in the previous iteration. // if (match_abs_start >= processed_up_to_abs) { // results.push_back(std::make_pair(match_abs_start, match_len)); // // Update processed marker so we don't report this again // // (Using start + 1 ensures we allow overlapping matches if regex // // allows, but strictly prevents the exact same start index being // // reported twice) // processed_up_to_abs = match_abs_start + 1; // } // // // Prepare for next match in this buffer // start_offset = match_local_end; // // // Handle empty matches (e.g. "a*" matching empty string) to prevent // // infinite loop // if (match_local_end == match_local_start) { // if (start_offset < subject_len) { // start_offset++; // } else { // break; // End of buffer // } // } // } // // // 6. Maintenance: Shrink buffer to keep only the last MAX_OVERLAP // // characters // if (buffer.length() > MAX_OVERLAP) { // size_t to_remove = buffer.length() - MAX_OVERLAP; // // // Remove from the beginning of the string // buffer.erase(0, to_remove); // // // The buffer's start has now moved forward in absolute terms // buffer_abs_offset += to_remove; // } // } // // // Cleanup // pcre2_match_data_free(mdata); // pcre2_code_free(re); // free(it); // Assuming iter needs free based on original code usage // // return results; // }