Initial commit

This commit is contained in:
2025-11-27 16:42:36 +00:00
commit a95e5c09b8
7 changed files with 2188 additions and 0 deletions

272
src/noose.cpp Normal file
View File

@@ -0,0 +1,272 @@
#include "../headers/noose.hpp"
#include "../headers/rope.hpp"
#include <assert.h>
#include <cstdint>
#include <cstdlib>
#include <cstring>
#include <stdio.h>
// VM - pass 2
bool test_ranges(char inp, Range *ranges, int len) {
for (int i = 0; i < len; i++) {
Range *r = ranges + i;
if (inp >= r->start && inp <= r->end)
return true;
}
return false;
}
// Use pike vm method
// One VM thread: the instruction it is parked on plus its own copy of the
// capture slots (thread() copies them whenever a new thread is built).
struct Thread {
Inst *pc; // instruction this thread executes next
// addstate() writes slot idx * 2 for SVS and slot idx * 2 + 1 for SVE, so the
// 40 slots hold start/end pairs for capture indices 0 through 19.
uint32_t saved[40];
};
// Builds a Thread parked at `pc` whose capture slots are a copy of the 40
// entries read from `saved`.
Thread thread(Inst *pc, uint32_t *saved) {
  Thread fresh;
  std::memcpy(fresh.saved, saved, sizeof(fresh.saved));
  fresh.pc = pc;
  return fresh;
}
// A fixed-capacity array of threads plus its current fill level.
// next_match() allocates `t` with one slot per program instruction and
// addstate() never stores two threads with the same pc.
struct ThreadList {
Thread *t; // backing storage (malloc'd by next_match, released by free_list)
int n; // number of threads currently stored in t
};
// Copies all 40 capture slots from `tsaved` into `saved`, front to back.
void handle_end(uint32_t *tsaved, uint32_t *saved) {
  const uint32_t *src = tsaved;
  uint32_t *dst = saved;
  for (int left = 40; left > 0; --left)
    *dst++ = *src++;
}
bool addstate(Inst *prog, ThreadList *list, Thread t, int count) {
if (t.pc->op == JMP) {
if (addstate(prog, list, thread(prog + t.pc->j.x, t.saved), count))
return true;
return false;
} else if (t.pc->op == FRK) {
if (addstate(prog, list, thread(prog + t.pc->j.x, t.saved), count))
return true;
if (addstate(prog, list, thread(prog + t.pc->j.y, t.saved), count))
return true;
return false;
} else if (t.pc->op == SVS) {
// Handle SVS: set the start offset and continue
t.saved[t.pc->idx * 2] = count; // 'count' must be passed or accessible
if (addstate(prog, list, thread(t.pc + 1, t.saved), count))
return true;
return false;
} else if (t.pc->op == SVE) {
// Handle SVE: set the end offset and continue
t.saved[t.pc->idx * 2 + 1] = count; // 'count' must be passed or accessible
if (addstate(prog, list, thread(t.pc + 1, t.saved), count))
return true;
return false;
} else if (t.pc->op == END) {
handle_end(t.saved, t.saved);
return true;
} else {
for (int i = 0; i < list->n; i++)
if (list->t[i].pc == t.pc)
return false;
list->t[list->n++] = t;
return false;
}
}
void inline swap(ThreadList *a, ThreadList *b) {
ThreadList t = *a;
*a = *b;
*b = t;
}
void inline clear(ThreadList *list) { list->n = 0; }
int proglen(Inst *prog) {
int len = 0;
while (prog[len].op != END)
len++;
return ++len;
}
void inline free_list(ThreadList *list) {
free(list->t);
free(list);
}
int next_match(Inst *prog, ByteIterator *it, uint32_t *saved) {
int len;
ThreadList *clist, *nlist;
Thread t;
len = proglen(prog);
clist = (ThreadList *)malloc(sizeof(ThreadList));
clist->t = (Thread *)malloc(+sizeof(Thread) * len);
clist->n = 0;
nlist = (ThreadList *)malloc(sizeof(ThreadList));
nlist->t = (Thread *)malloc(+sizeof(Thread) * len);
nlist->n = 0;
char sp;
int count = 0;
addstate(prog, clist, thread(prog, saved), count);
for (sp = next_byte(it); sp != '\0'; sp = next_byte(it)) {
printf("%c", sp);
for (int i = 0; i < clist->n; i++) {
t = clist->t[i];
switch (t.pc->op) {
case MCH:
if (!test_ranges(sp, t.pc->r.ranges, t.pc->r.len)) {
if (addstate(prog, nlist, thread(prog, saved), count))
return true;
break;
}
if (addstate(prog, nlist, thread(t.pc + 1, t.saved), count))
return true;
break;
case NMC:
if (test_ranges(sp, t.pc->r.ranges, t.pc->r.len)) {
if (addstate(prog, nlist, thread(prog, saved), count))
return true;
break;
}
if (addstate(prog, nlist, thread(t.pc + 1, t.saved), count))
return true;
break;
case ANY:
if (addstate(prog, nlist, thread(t.pc + 1, t.saved), count))
return true;
break;
case END:
case JMP:
case FRK:
case SVS:
case SVE:
break;
}
}
swap(clist, nlist);
clear(nlist);
count++;
}
free_list(clist);
free_list(nlist);
return false; // Reached EOF without a match
}
// Writes a numbered, human-readable listing of `program` to stdout, one
// instruction per line, stopping after END (or at the first unknown opcode).
void print_program(Inst *program) {
  // Prints the comma-separated range list of a MCH/NMC instruction followed
  // by the closing bracket and newline.
  auto dump_ranges = [](Inst *ins) {
    for (int k = 0; k < ins->r.len; k++) {
      Range cur = ins->r.ranges[k];
      const char *sep = (k + 1 < ins->r.len) ? ", " : "";
      if (cur.start == cur.end)
        printf("'%c'%s", cur.start, sep);
      else
        printf("'%c'-'%c'%s", cur.start, cur.end, sep);
    }
    printf("]\n");
  };

  int line_no = 0;
  for (Inst *ins = program;; ++ins, ++line_no) {
    printf("%3d: ", line_no);
    switch (ins->op) {
    case JMP:
      printf("JMP -> %d\n", (int)(ins->j.x));
      break;
    case FRK:
      printf("FRK -> %d , %d\n", (int)(ins->j.x), (int)(ins->j.y));
      break;
    case MCH:
      printf("MCH [");
      dump_ranges(ins);
      break;
    case NMC:
      printf("NMC [");
      dump_ranges(ins);
      break;
    case ANY:
      printf("ANY\n");
      break;
    case SVS:
      printf("SVS idx=%d\n", (unsigned char)ins->idx);
      break;
    case SVE:
      printf("SVE idx=%d\n", (unsigned char)ins->idx);
      break;
    case END:
      printf("END\n");
      return;
    default:
      printf("UNKNOWN op=%d\n", ins->op);
      return;
    }
  }
}
// Parses `pattern` into an expression tree and compiles that tree into a
// program. The tree is not released here.
Inst *compile_regex(std::string pattern) {
  Exp *ast = regex_to_ast(pattern);
  return compile_ast(ast);
}
int __main() {
// Maunally compiled program for testing
char *buffer = (char *)malloc(29);
strcpy(buffer, "abcdabcdabcdabcdf");
// This loads all (excluding \0 put in by strcpy)
Knot *root = load(buffer, 17, optimal_chunk_size(12));
ByteIterator *it = begin_b_iter(root);
uint32_t saved[40];
for (int i = 0; i < 40; i++)
saved[i] = 0;
std::string pattern = "(abcd)+";
Inst *program = compile_regex(pattern);
print_program(program);
int result;
while ((result = next_match(program, it, saved))) {
printf("\nRES: %d\n", result);
for (int i = 0; i < 40; i++)
printf("%d, ", saved[i]);
}
free(program);
free(buffer);
free(it->it);
free(it);
free(root);
return 0;
}

387
src/rexambler.cpp Normal file
View File

@@ -0,0 +1,387 @@
#include "../headers/noose.hpp"
#include <cassert>
#include <cctype>
#include <climits>
#include <string>
#include <vector>
Exp *parse_alternation(Parser *p);
Exp *parse_sequence(Parser *p);
Exp *parse_atom_with_modifiers(Parser *p);
Exp *parse_bracket_class(Parser *p);
// Allocates a non-capturing expression node of kind NONE (compile_exp emits
// no instructions for this kind).
Exp *make_none() {
  Exp *node = new Exp();
  node->kind = ExpKind::NONE;
  node->capture = false;
  return node;
}
// Allocates a non-capturing expression node of kind ANY.
Exp *make_any() {
  Exp *node = new Exp();
  node->kind = ExpKind::ANY;
  node->capture = false;
  return node;
}
// Allocates a non-capturing RANGE node holding a copy of `ranges`.
Exp *make_range(const std::vector<ExRange> &ranges) {
  Exp *node = new Exp();
  node->kind = ExpKind::RANGE;
  node->capture = false;
  // The vector member is constructed in place from a copy of `ranges`.
  new (&node->ranges) std::vector<ExRange>(ranges);
  return node;
}
// Builds a RANGE node containing the single one-character range [c, c],
// flagged with `neg`.
Exp *make_range_single(char c, bool neg = false) {
  const std::vector<ExRange> only(1, ExRange{neg, c, c});
  return make_range(only);
}
// Allocates a non-capturing OR node whose alternatives are `l` and `r`.
// The children are stored as given, not copied.
Exp *make_or(Exp *l, Exp *r) {
  OpOr *branches = new OpOr();
  branches->right = r;
  branches->left = l;

  Exp *node = new Exp();
  node->kind = ExpKind::OR;
  node->capture = false;
  node->opor = branches;
  return node;
}
// Allocates a non-capturing SEQ node meaning `l` followed by `r`.
// The children are stored as given, not copied.
Exp *make_seq(Exp *l, Exp *r) {
  OpSeq *pair = new OpSeq();
  pair->right = r;
  pair->left = l;

  Exp *node = new Exp();
  node->kind = ExpKind::SEQ;
  node->capture = false;
  node->opseq = pair;
  return node;
}
// Returns a freshly allocated deep copy of the tree rooted at `e`. A null
// input, a NONE node and any kind not listed below all yield a new NONE node.
// NOTE(review): every copy is built through the make_* helpers, so the
// `capture` flag of the source nodes is not carried over - confirm that this
// is intended.
Exp *clone_exp(Exp *e) {
  if (e == nullptr)
    return make_none();
  switch (e->kind) {
  case ExpKind::ANY:
    return make_any();
  case ExpKind::RANGE:
    return make_range(e->ranges);
  case ExpKind::OR: {
    Exp *lhs = clone_exp(e->opor->left);
    Exp *rhs = clone_exp(e->opor->right);
    return make_or(lhs, rhs);
  }
  case ExpKind::SEQ: {
    Exp *lhs = clone_exp(e->opseq->left);
    Exp *rhs = clone_exp(e->opseq->right);
    return make_seq(lhs, rhs);
  }
  default:
    return make_none();
  }
}
// Lookahead: the character at the parser position, or '\0' at end of input.
// Does not advance.
inline char peek(Parser *p) {
  if (p->i >= p->s.size())
    return '\0';
  return p->s[p->i];
}
// Returns the character at the parser position and advances past it, or
// returns '\0' without advancing at end of input.
inline char consume(Parser *p) {
  if (p->i >= p->s.size())
    return '\0';
  const char taken = p->s[p->i];
  p->i++;
  return taken;
}
// Parses `pattern` into an expression tree. Never returns null: a null parse
// result is replaced by a NONE node.
Exp *regex_to_ast(std::string pattern) {
  Parser parser(pattern);
  if (Exp *root = parse_alternation(&parser))
    return root;
  return make_none();
}
// Parses one or more sequences separated by '|' and combines them into a
// left-nested chain of OR nodes (a|b|c becomes OR(OR(a, b), c)). A single
// sequence is returned as is.
Exp *parse_alternation(Parser *p) {
  Exp *chain = parse_sequence(p);
  while (peek(p) == '|') {
    consume(p);
    Exp *alternative = parse_sequence(p);
    chain = make_or(chain, alternative);
  }
  return chain;
}
// Parses consecutive atoms (with their modifiers) until end of input, ')' or
// '|', and combines them into a left-nested chain of SEQ nodes. Returns a
// NONE node when no atom was parsed.
Exp *parse_sequence(Parser *p) {
  Exp *chain = nullptr;
  while (p->i < p->s.size()) {
    const char next = peek(p);
    if (next == ')' || next == '|')
      break;
    Exp *atom = parse_atom_with_modifiers(p);
    if (atom == nullptr)
      break;
    chain = (chain == nullptr) ? atom : make_seq(chain, atom);
  }
  return (chain != nullptr) ? chain : make_none();
}
// Parses a single atom at the parser position and returns its expression
// node, or nullptr at end of input. Recognised forms:
//   (...)   group; the root node of the group gets capture = true
//   [...]   bracket class (delegated to parse_bracket_class)
//   \d \D \w \W \s \S   character classes; any other escaped character is a
//                       literal
//   !       a node of kind ANY
//   .       every character except '\n'
//   other   a literal single character
// Negated classes are expressed by setting negate = true on the ranges;
// compile_range looks only at the first range's flag to choose between MCH
// and NMC.
Exp *parse_atom(Parser *p) {
if (p->i >= p->s.size())
return nullptr;
char c = peek(p);
if (c == '(') {
// Group: parse the inner alternation and mark its root as capturing.
consume(p); // '('
Exp *inner = parse_alternation(p);
// A missing ')' is tolerated: the group simply ends at end of input.
if (peek(p) == ')')
consume(p);
if (!inner)
inner = make_none();
inner->capture = true; // the whole group is one capture
return inner;
}
if (c == '[') {
// parse bracket class
return parse_bracket_class(p);
}
if (c == '\\') {
consume(p);
// A trailing backslash with nothing after it yields a NONE node.
if (p->i >= p->s.size())
return make_none();
char esc = consume(p);
// handle known escapes
switch (esc) {
case 'd':
return make_range({ExRange{false, '0', '9'}});
case 'D': {
// Negated digit class: the range '0'-'9' flagged negate = true.
std::vector<ExRange> v;
v.push_back(ExRange{true, '0', '9'});
return make_range(v);
}
case 'w': {
std::vector<ExRange> v;
v.push_back(ExRange{false, 'a', 'z'});
v.push_back(ExRange{false, 'A', 'Z'});
v.push_back(ExRange{false, '0', '9'});
v.push_back(ExRange{false, '_', '_'});
return make_range(v);
}
case 'W': {
std::vector<ExRange> v;
// The same ranges as \w, each flagged negate = true.
v.push_back(ExRange{true, 'a', 'z'});
v.push_back(ExRange{true, 'A', 'Z'});
v.push_back(ExRange{true, '0', '9'});
v.push_back(ExRange{true, '_', '_'});
return make_range(v);
}
case 's': {
std::vector<ExRange> v;
v.push_back(ExRange{false, ' ', ' '}); // space
v.push_back(ExRange{false, '\t', '\t'}); // tab
v.push_back(ExRange{false, '\r', '\r'}); // CR
v.push_back(ExRange{false, '\n', '\n'}); // LF
v.push_back(ExRange{false, '\v', '\v'}); // VT
v.push_back(ExRange{false, '\f', '\f'}); // FF
return make_range(v);
}
case 'S': {
// The \s characters flagged negate = true, plus byte 0 (which \s itself
// does not list).
std::vector<ExRange> v;
v.push_back(ExRange{true, 0, 0});
v.push_back(ExRange{true, ' ', ' '});
v.push_back(ExRange{true, '\t', '\t'});
v.push_back(ExRange{true, '\r', '\r'});
v.push_back(ExRange{true, '\n', '\n'});
v.push_back(ExRange{true, '\v', '\v'});
v.push_back(ExRange{true, '\f', '\f'});
return make_range(v);
}
case '.':
// escaped dot -> literal dot (same result as the default case)
return make_range_single('.', false);
default:
// escaped literal: any char becomes a single-char range
return make_range_single(esc, false);
}
}
if (c == '!') {
// In this syntax '!' produces the ANY node.
consume(p);
return make_any();
}
// An unescaped '.' is a wildcard for every character except newline.
if (c == '.') {
consume(p);
// dot == [^\n]
std::vector<ExRange> v;
v.push_back(ExRange{true, '\n', '\n'}); // negated: newline is excluded
return make_range(v);
}
// otherwise a normal literal single char -> single range
char lit = consume(p);
return make_range_single(lit, false);
}
// Parses a bracket class starting at '[' and returns a RANGE node.
// A leading '^' negates the class: every range pushed below then carries
// negate = true. Inside the brackets \d, \w and \s expand to their ranges,
// any other escaped character is a literal, "a-b" is a range and everything
// else is a single character. A missing ']' is tolerated.
// NOTE(review): an empty class ("[]" or "[^]") yields a RANGE node with no
// ranges, so the '^' flag is lost in that case.
Exp *parse_bracket_class(Parser *p) {
assert(peek(p) == '[');
consume(p); // '['
bool neg = false;
if (peek(p) == '^') {
neg = true;
consume(p);
}
std::vector<ExRange> ranges;
while (p->i < p->s.size() && peek(p) != ']') {
char a = consume(p);
if (a == '\\') {
// A backslash at the very end of the pattern ends the class.
if (p->i >= p->s.size())
break;
char esc = consume(p);
if (esc == 'd') {
ranges.push_back(ExRange{neg, '0', '9'});
} else if (esc == 'w') {
ranges.push_back(ExRange{neg, 'a', 'z'});
ranges.push_back(ExRange{neg, 'A', 'Z'});
ranges.push_back(ExRange{neg, '0', '9'});
ranges.push_back(ExRange{neg, '_', '_'});
} else if (esc == 's') {
ranges.push_back(ExRange{neg, ' ', ' '});
ranges.push_back(ExRange{neg, '\t', '\t'});
ranges.push_back(ExRange{neg, '\r', '\r'});
ranges.push_back(ExRange{neg, '\n', '\n'});
ranges.push_back(ExRange{neg, '\v', '\v'});
ranges.push_back(ExRange{neg, '\f', '\f'});
} else {
ranges.push_back(ExRange{neg, esc, esc});
}
} else if (peek(p) == '-' && p->i + 1 < p->s.size() &&
p->s[p->i + 1] != ']') {
// Range "a-b": `a` is already consumed and the parser sits on '-'. The
// condition above guarantees a character other than ']' follows the '-',
// so a '-' directly before ']' is treated as a literal instead.
consume(p); // '-'
if (p->i >= p->s.size())
break;
char b = consume(p);
ranges.push_back(ExRange{neg, a, b});
} else {
// single char
ranges.push_back(ExRange{neg, a, a});
}
}
if (peek(p) == ']')
consume(p);
// Negation is carried by the negate flag on each range pushed above.
return make_range(ranges);
}
// Parses a run of decimal digits at the parser position.
//
// p   - parser; advanced past every digit that was read
// out - receives the parsed value; untouched when no digit is present
//
// Returns true when at least one digit was consumed. Values too large for an
// int saturate at INT_MAX instead of overflowing (signed overflow is
// undefined behavior); the remaining digits are still consumed.
bool parse_integer_opt(Parser *p, int &out) {
  if (p->i >= p->s.size() || !std::isdigit((unsigned char)peek(p)))
    return false;
  int val = 0;
  while (p->i < p->s.size() && std::isdigit((unsigned char)peek(p))) {
    const int digit = consume(p) - '0';
    if (val > (INT_MAX - digit) / 10)
      val = INT_MAX; // would not fit: clamp and keep eating digits
    else
      val = val * 10 + digit;
  }
  out = val;
  return true;
}
// Parses one atom followed by any number of modifiers (?, *, +, {x,y}) and
// returns the resulting expression, or nullptr when no atom is available.
// Repetition is expanded into a finite tree rather than a loop: '*' and '+'
// allow at most 20 optional repetitions, and the upper bound of {x,y} is
// clamped to 20.
// NOTE(review): every modifier builds its result from clone_exp(atom) and
// then drops the previous `atom` pointer without freeing it; clone_exp also
// does not copy the capture flag, so a quantified group stops capturing -
// confirm both are intended.
Exp *parse_atom_with_modifiers(Parser *p) {
Exp *atom = parse_atom(p);
if (!atom)
return nullptr;
// apply possibly multiple modifiers in sequence
while (true) {
if (peek(p) == '?') {
consume(p);
// OpOr(atom, NONE)
atom = make_or(clone_exp(atom), make_none());
} else if (peek(p) == '*') {
consume(p);
// Expand to a SEQ chain of 20 optional copies, each OpOr(atom, NONE).
Exp *unit_or = nullptr;
for (int t = 0; t < 20; ++t) {
Exp *op = make_or(clone_exp(atom), make_none());
if (!unit_or)
unit_or = op;
else
unit_or = make_seq(unit_or, op);
}
atom = unit_or ? unit_or : make_none();
} else if (peek(p) == '+') {
consume(p);
// One mandatory copy followed by a SEQ chain of 20 optional copies.
Exp *rest = nullptr;
for (int t = 0; t < 20; ++t) {
Exp *op = make_or(clone_exp(atom), make_none());
if (!rest)
rest = op;
else
rest = make_seq(rest, op);
}
atom = rest ? make_seq(clone_exp(atom), rest) : clone_exp(atom);
} else if (peek(p) == '{') {
// parse {x,y}; both numbers are required ({x} and {x,} are rejected)
size_t save = p->i;
consume(p); // '{'
int x = 0, y = -1;
bool ok = parse_integer_opt(p, x);
if (!ok || peek(p) != ',') {
// Malformed: rewind to the '{' and stop applying modifiers, so the next
// parse_atom call reads '{' as a literal.
p->i = save;
break;
}
consume(p); // ','
ok = parse_integer_opt(p, y);
if (!ok || peek(p) != '}') {
p->i = save;
break;
}
consume(p); // '}'
if (y < x)
y = x;
if (y > 20)
y = 20; // upper bound is capped at 20
// NOTE(review): x is not capped, so a large x builds x copies of the atom.
// Build x mandatory copies, then (y - x) optional copies OpOr(atom, NONE).
// When x exceeds the capped y the second loop simply does not run.
Exp *prefix = nullptr;
for (int k = 0; k < x; ++k) {
if (!prefix)
prefix = clone_exp(atom);
else
prefix = make_seq(prefix, clone_exp(atom));
}
Exp *suffix = nullptr;
for (int k = 0; k < (y - x); ++k) {
Exp *op = make_or(clone_exp(atom), make_none());
if (!suffix)
suffix = op;
else
suffix = make_seq(suffix, op);
}
// x == 0 leaves no mandatory part; use a NONE node in its place.
if (!prefix)
prefix = make_none();
if (!suffix)
atom = prefix;
else
atom = make_seq(prefix, suffix);
} else {
break;
}
}
return atom;
}

149
src/rexpiler.cpp Normal file
View File

@@ -0,0 +1,149 @@
#include "../headers/noose.hpp"
#include <assert.h>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <stdio.h>
// Growable array of instructions used while compiling an expression tree.
struct InstList {
Inst *data; // instruction storage, grown with realloc by insert_inst
size_t len; // number of instructions stored so far
size_t cap; // number of instructions `data` has room for
int idx; // capture-index counter; compile_ast starts it at 1
};
static inline bool is_ranges_negated(std::vector<ExRange> r) {
if (r.empty())
return false;
return r[0].negate;
}
// Appends a copy of `*inst` to `list`, doubling the storage (starting at 32
// entries) when it is full, and then releases `inst`.
//
// `inst` must be a heap instruction obtained from make_inst(). Ownership
// passes to this function: every caller in this file drops the pointer right
// after the call, which previously leaked one allocation per instruction.
// Callers must not touch `inst` afterwards.
// Aborts when the storage cannot be grown, instead of writing through a null
// pointer and losing the old buffer.
static void insert_inst(InstList *list, Inst *inst) {
  if (list->len >= list->cap) {
    const size_t grown_cap = list->cap ? list->cap * 2 : 32;
    Inst *grown = (Inst *)realloc(list->data, grown_cap * sizeof(Inst));
    if (!grown) {
      assert(0);
      abort(); // still stop when assertions are compiled out
    }
    list->data = grown;
    list->cap = grown_cap;
  }
  list->data[list->len++] = *inst;
  free(inst);
}
// Allocates a zero-filled instruction and sets its opcode to `op`.
// Allocation failure trips an assertion.
Inst *make_inst(Op op) {
  Inst *fresh = static_cast<Inst *>(calloc(1, sizeof(Inst)));
  if (fresh == nullptr)
    assert(0);
  fresh->op = op;
  return fresh;
}
void compile_exp(Exp *e, InstList *list);
// Emits an ANY instruction for `e`. When `e` captures, the instruction is
// wrapped in an SVS/SVE pair sharing one capture index.
//
// The index is taken with a post-increment, exactly as compile_seq and
// compile_or do. The previous pre-increment skipped the counter's current
// value and handed out an index that the next capturing sequence or
// alternation would hand out again.
void compile_any(Exp *e, InstList *list) {
  Inst *any = make_inst(ANY);
  if (!e->capture) {
    insert_inst(list, any);
    return;
  }
  const int group = list->idx++;
  Inst *start = make_inst(SVS);
  start->idx = group;
  insert_inst(list, start);
  insert_inst(list, any);
  Inst *end = make_inst(SVE);
  end->idx = group;
  insert_inst(list, end);
}
// Emits a MCH instruction (or NMC when the range list is negated) carrying a
// malloc'd copy of the ranges of `e`. When `e` captures, the instruction is
// wrapped in an SVS/SVE pair sharing one capture index.
//
// Changes from the previous version:
//  - the capture index is taken with a post-increment, matching compile_seq
//    and compile_or, so indices are neither skipped nor handed out twice;
//  - the range vector is read through a reference instead of being copied;
//  - the range array allocation is checked (at least one element is
//    requested so an empty class does not look like a failed allocation).
void compile_range(Exp *e, InstList *list) {
  const std::vector<ExRange> &er = e->ranges;
  const bool neg = is_ranges_negated(er);
  const size_t cnt = er.size();
  Range *arr = (Range *)malloc(sizeof(Range) * (cnt ? cnt : 1));
  if (!arr) {
    assert(0);
    abort(); // still stop when assertions are compiled out
  }
  size_t w = 0;
  for (const ExRange &cur : er)
    arr[w++] = Range{cur.start, cur.end};
  Inst *match = make_inst(neg ? NMC : MCH);
  match->r.ranges = arr;
  match->r.len = (int)cnt;
  if (!e->capture) {
    insert_inst(list, match);
    return;
  }
  const int group = list->idx++;
  Inst *start = make_inst(SVS);
  start->idx = group;
  insert_inst(list, start);
  insert_inst(list, match);
  Inst *end = make_inst(SVE);
  end->idx = group;
  insert_inst(list, end);
}
// Emits the left operand followed by the right operand of a SEQ node. When
// `e` captures, the pair is wrapped in SVS/SVE.
//
// The capture index is remembered in a local. The previous code recomputed it
// as `list->idx - 1` after compiling the children, which names the wrong
// group whenever a child allocated a capture index of its own.
void compile_seq(Exp *e, InstList *list) {
  int group = -1;
  if (e->capture) {
    group = list->idx++;
    Inst *start = make_inst(SVS);
    start->idx = group;
    insert_inst(list, start);
  }
  compile_exp(e->opseq->left, list);
  compile_exp(e->opseq->right, list);
  if (e->capture) {
    Inst *end = make_inst(SVE);
    end->idx = group;
    insert_inst(list, end);
  }
}
// Emits an alternation:
//       FRK left, right
// left: <left operand>
//       JMP done
// right:<right operand>
// done:
// FRK and JMP targets are instruction indices; they are patched once the
// positions are known. When `e` captures, the whole construct is wrapped in
// SVS/SVE.
//
// The capture index is remembered in a local. The previous code recomputed it
// as `list->idx - 1` after compiling the operands, which names the wrong
// group whenever an operand allocated a capture index of its own.
void compile_or(Exp *e, InstList *list) {
  int group = -1;
  if (e->capture) {
    group = list->idx++;
    Inst *start = make_inst(SVS);
    start->idx = group;
    insert_inst(list, start);
  }
  Inst *frk = make_inst(FRK);
  frk->j.x = -1;
  frk->j.y = -1;
  const size_t frk_at = list->len;
  insert_inst(list, frk);
  const size_t left_start = list->len;
  compile_exp(e->opor->left, list);
  const size_t jmp_at = list->len;
  insert_inst(list, make_inst(JMP));
  const size_t right_start = list->len;
  compile_exp(e->opor->right, list);
  // Patch through list->data: the storage may have moved while the operands
  // were compiled.
  list->data[frk_at].j.x = static_cast<int>(left_start);
  list->data[frk_at].j.y = static_cast<int>(right_start);
  list->data[jmp_at].j.x = static_cast<int>(list->len);
  if (e->capture) {
    Inst *end = make_inst(SVE);
    end->idx = group;
    insert_inst(list, end);
  }
}
// Dispatches on the kind of `e` to the matching compile_* routine.
// A NONE node (and any kind not listed) emits nothing.
void compile_exp(Exp *e, InstList *list) {
  const auto kind = e->kind;
  if (kind == ExpKind::ANY)
    compile_any(e, list);
  else if (kind == ExpKind::RANGE)
    compile_range(e, list);
  else if (kind == ExpKind::SEQ)
    compile_seq(e, list);
  else if (kind == ExpKind::OR)
    compile_or(e, list);
}
// Compiles the tree rooted at `root` into an instruction array terminated by
// END and returns that array. The capture-index counter starts at 1.
Inst *compile_ast(Exp *root) {
  InstList out;
  out.data = nullptr;
  out.len = 0;
  out.cap = 0;
  out.idx = 1;
  compile_exp(root, &out);
  Inst *terminator = make_inst(END);
  insert_inst(&out, terminator);
  return out.data;
}

853
src/rope.cpp Normal file
View File

@@ -0,0 +1,853 @@
#include "../headers/rope.hpp"
#include "../headers/noose.hpp"
#include <assert.h>
#include <cmath>
#include <cstdint>
#include <cstdlib>
#include <cstring>
#include <pcre2.h>
#include <stdio.h>
#include <string.h>
// Recomputes the cached fields of an internal node (character count, line
// count, depth, chunk size) from its children. Null nodes and nodes of depth
// 0 are left untouched.
static void update(Knot *n) {
  if (n == nullptr || n->depth == 0)
    return;
  Knot *lhs = n->left;
  Knot *rhs = n->right;

  const uint32_t lhs_chars = lhs ? lhs->char_count : 0;
  const uint32_t rhs_chars = rhs ? rhs->char_count : 0;
  const uint32_t lhs_lines = lhs ? lhs->line_count : 0;
  const uint32_t rhs_lines = rhs ? rhs->line_count : 0;
  const uint8_t lhs_depth = lhs ? lhs->depth : 0;
  const uint8_t rhs_depth = rhs ? rhs->depth : 0;

  n->char_count = lhs_chars + rhs_chars;
  n->line_count = lhs_lines + rhs_lines;
  n->depth = MAX(lhs_depth, rhs_depth) + 1;
  // The chunk size is inherited from the left child when there is one.
  n->chunk_size = n->left ? n->left->chunk_size : n->right->chunk_size;
}
// Builds a rope from the first `len` bytes of `str`.
// str is not consumed and \0 is not handled, so if str is null terminated
// then len must be strlen(str); str is still owned (and freed) by the caller.
//
// Text longer than chunk_size - chunk_size / 16 is split in half
// recursively; shorter text becomes a single leaf.
//
// Returns the root, or nullptr when an allocation fails (anything already
// built is released first).
//
// Changes from the previous version:
//  - a failed child allocation is detected instead of being dereferenced;
//  - a 0- or 1-byte text is never split again, which used to recurse forever
//    when the split threshold was 0;
//  - leaves get one spare byte, because next_leaf() writes a '\0' at
//    data[char_count].
Knot *load(char *str, uint32_t len, uint32_t chunk_size) {
  if (len > 1 && len > (uint32_t)(chunk_size - (chunk_size / 16))) {
    const uint32_t half = len / 2;
    Knot *left = load(str, half, chunk_size);
    Knot *right = load(str + half, len - half, chunk_size);
    Knot *node = (left && right) ? (Knot *)malloc(sizeof(Knot)) : nullptr;
    if (!node) {
      free_rope(left); // free_rope accepts nullptr
      free_rope(right);
      return nullptr;
    }
    node->left = left;
    node->right = right;
    node->chunk_size = chunk_size;
    node->depth = MAX(left->depth, right->depth) + 1;
    node->char_count = left->char_count + right->char_count;
    node->line_count = left->line_count + right->line_count;
    return node;
  }
  // Leaf: room for the larger of chunk_size and len, plus the spare byte.
  const size_t capacity = (len > chunk_size) ? len : chunk_size;
  Knot *node = (Knot *)malloc(sizeof(Knot) + capacity + 1);
  if (!node)
    return nullptr;
  node->left = nullptr;
  node->right = nullptr;
  node->chunk_size = chunk_size;
  node->depth = 0;
  node->char_count = len;
  uint32_t newline_count = 0;
  for (uint32_t i = 0; i < len; i++) {
    const char c = str[i];
    node->data[i] = c;
    if (c == '\n')
      newline_count++;
  }
  node->line_count = newline_count;
  return node;
}
// Splits a leaf at byte offset `k` into two new leaves.
// leaf is consumed and freed (so dont use or free it after);
// *left receives bytes [0, k) and *right receives bytes [k, char_count).
//
// Changes from the previous version:
//  - `k` is clamped to the leaf's length instead of reading past it;
//  - the right leaf's line count is kept in 32 bits (it used to be truncated
//    through a uint16_t);
//  - allocation failure sets both outputs to nullptr instead of writing
//    through a null pointer;
//  - each leaf gets one spare byte, because next_leaf() writes a '\0' at
//    data[char_count].
static void split_leaf(Knot *leaf, uint32_t k, Knot **left, Knot **right) {
  if (k > leaf->char_count)
    k = leaf->char_count;
  const size_t bytes = sizeof(Knot) + (size_t)leaf->chunk_size + 1;
  Knot *left_node = (Knot *)malloc(bytes);
  Knot *right_node = (Knot *)malloc(bytes);
  if (!left_node || !right_node) {
    free(left_node);
    free(right_node);
    *left = nullptr;
    *right = nullptr;
    free(leaf);
    return;
  }

  uint32_t newline_count = 0;
  for (uint32_t i = 0; i < k; i++) {
    const char c = leaf->data[i];
    left_node->data[i] = c;
    if (c == '\n')
      newline_count++;
  }
  left_node->left = nullptr;
  left_node->right = nullptr;
  left_node->chunk_size = leaf->chunk_size;
  left_node->depth = 0;
  left_node->char_count = k;
  left_node->line_count = newline_count;

  const uint32_t tail_len = leaf->char_count - k;
  right_node->left = nullptr;
  right_node->right = nullptr;
  right_node->chunk_size = leaf->chunk_size;
  right_node->depth = 0;
  right_node->char_count = tail_len;
  // Whatever newlines are not in the left part are in the right part.
  right_node->line_count = leaf->line_count - newline_count;
  if (tail_len > 0)
    memcpy(right_node->data, leaf->data + k, tail_len);

  *left = left_node;
  *right = right_node;
  free(leaf);
}
// Splits the rope at byte `offset`: *left receives everything before it and
// *right everything from it onwards. A null rope yields two null halves.
// This makes node nonsensical, so dont use or free it after
void split(Knot *node, uint32_t offset, Knot **left, Knot **right) {
  if (node == nullptr) {
    *left = nullptr;
    *right = nullptr;
    return;
  }
  if (node->depth == 0) {
    split_leaf(node, offset, left, right);
    return;
  }
  const uint32_t lhs_chars = node->left ? node->left->char_count : 0;
  Knot *head = nullptr;
  Knot *tail = nullptr;
  if (offset >= lhs_chars) {
    // The cut is in the right subtree: this node keeps its left child and
    // the head of the split right child, and becomes the left half.
    split(node->right, offset - lhs_chars, &head, &tail);
    node->right = head;
    update(node);
    *left = node;
    *right = tail;
    return;
  }
  // The cut is in the left subtree: this node keeps the tail of the split
  // left child and its right child, and becomes the right half.
  split(node->left, offset, &head, &tail);
  node->left = tail;
  update(node);
  *right = node;
  *left = head;
}
// Depth of the left subtree minus depth of the right subtree; 0 for a null
// node.
static inline int get_balance_factor(Knot *n) {
  return n ? (int)DEPTH(n->left) - (int)DEPTH(n->right) : 0;
}
// Right rotation around `y`: its left child becomes the subtree root and `y`
// becomes that node's right child. Returns the new subtree root.
static inline Knot *rotate_right(Knot *y) {
  Knot *pivot = y->left;
  y->left = pivot->right;
  pivot->right = y;
  // Refresh the lower node first; the pivot's fields depend on it.
  update(y);
  update(pivot);
  return pivot;
}
// Left rotation around `x`: its right child becomes the subtree root and `x`
// becomes that node's left child. Returns the new subtree root.
static inline Knot *rotate_left(Knot *x) {
  Knot *pivot = x->right;
  x->right = pivot->left;
  pivot->left = x;
  // Refresh the lower node first; the pivot's fields depend on it.
  update(x);
  update(pivot);
  return pivot;
}
// Refreshes the cached fields of `n` and, when its subtrees differ in depth
// by more than one, rotates to reduce the difference.
// Technically n can be used after calling
// but use return value instead
Knot *balance(Knot *n) {
  update(n);
  const int tilt = get_balance_factor(n);
  if (tilt > 1) {
    // Left-heavy; straighten a left-right shape before rotating right.
    if (get_balance_factor(n->left) < 0)
      n->left = rotate_left(n->left);
    return rotate_right(n);
  }
  if (tilt < -1) {
    // Right-heavy; straighten a right-left shape before rotating left.
    if (get_balance_factor(n->right) > 0)
      n->right = rotate_right(n->right);
    return rotate_left(n);
  }
  return n;
}
// Joins two ropes into one and returns the result.
// Dont free left or right after calling (only free return value)
// Assumes both ropes have equal chunk sizes
//
// A null side returns the other side unchanged; an empty side is freed and
// the other side is returned. Two leaves that fit into one chunk are merged
// into a single new leaf. Otherwise the shallower rope is attached inside
// the deeper one (followed by rebalancing), or a new parent is created.
//
// Changes from the previous version:
//  - the merged-leaf allocation is checked; on failure the ropes are joined
//    under a parent node instead of writing through a null pointer;
//  - when the parent node cannot be allocated both inputs are released
//    before returning nullptr, since the caller may no longer free them;
//  - a merged leaf gets one spare byte, because next_leaf() writes a '\0' at
//    data[char_count];
//  - redundant null tests were removed.
Knot *concat(Knot *left, Knot *right) {
  if (!left)
    return right;
  if (!right)
    return left;
  if (left->char_count == 0) {
    free_rope(left);
    return right;
  }
  if (right->char_count == 0) {
    free_rope(right);
    return left;
  }
  if (left->depth == 0 && right->depth == 0 &&
      left->char_count + right->char_count <= left->chunk_size) {
    Knot *node =
        (Knot *)malloc(sizeof(Knot) + (size_t)left->chunk_size + 1);
    if (node) {
      node->left = nullptr;
      node->right = nullptr;
      node->chunk_size = left->chunk_size;
      node->depth = 0;
      node->char_count = left->char_count + right->char_count;
      node->line_count = left->line_count + right->line_count;
      memcpy(node->data, left->data, left->char_count);
      memcpy(node->data + left->char_count, right->data, right->char_count);
      free(left);
      free(right);
      return node;
    }
    // Allocation failed: fall through and try to join under a parent.
  }
  uint16_t d_left = left->depth;
  uint16_t d_right = right->depth;
  if (d_left > d_right + 1) {
    left->right = concat(left->right, right);
    return balance(left);
  }
  if (d_right > d_left + 1) {
    right->left = concat(left, right->left);
    return balance(right);
  }
  Knot *node = (Knot *)malloc(sizeof(Knot));
  if (!node) {
    free_rope(left);
    free_rope(right);
    return nullptr;
  }
  node->left = left;
  node->right = right;
  node->chunk_size = left->chunk_size;
  node->depth = MAX(d_left, d_right) + 1;
  update(node); // fills in the character and line counts
  return node;
}
// Inserts `len` bytes from `str` at byte `offset` and returns the new root.
// This makes node nonsensical, so dont use or free it after
// Instead, free the return value or use it in node's place
//
// Internal nodes forward the insertion to the child that contains `offset`
// and rebalance. A leaf absorbs the text in place when it fits in the chunk;
// otherwise the leaf is split at `offset` and the new text is loaded as its
// own rope between the two halves.
//
// Changes from the previous version:
//  - the chunk size is read before split(): split() frees a leaf, and the
//    old code then read node->chunk_size from the freed node;
//  - an offset past the end of a leaf is clamped to its end (append) instead
//    of writing beyond the stored text;
//  - the "does it fit" test can no longer overflow for a very large `len`.
Knot *insert(Knot *node, uint32_t offset, char *str, uint32_t len) {
  if (!node)
    return nullptr;
  if (node->depth > 0) {
    const uint32_t left_count = node->left ? node->left->char_count : 0;
    if (offset < left_count)
      node->left = insert(node->left, offset, str, len);
    else
      node->right = insert(node->right, offset - left_count, str, len);
    return balance(node); // balance() refreshes the node's fields first
  }
  if (offset > node->char_count)
    offset = node->char_count;
  if (node->char_count <= node->chunk_size &&
      len <= node->chunk_size - node->char_count) {
    if (offset < node->char_count)
      memmove(node->data + offset + len, node->data + offset,
              node->char_count - offset);
    memcpy(node->data + offset, str, len);
    node->char_count += len;
    for (uint32_t i = 0; i < len; i++)
      if (str[i] == '\n')
        node->line_count++;
    return node;
  }
  const uint32_t chunk_size = node->chunk_size; // node is freed by split()
  Knot *left_part = nullptr;
  Knot *right_part = nullptr;
  split(node, offset, &left_part, &right_part);
  Knot *middle_part = load(str, len, chunk_size);
  return concat(concat(left_part, middle_part), right_part);
}
// Removes up to `len` bytes starting at byte `offset` and returns the new
// root. A range running past the end is shortened; a null rope, a zero
// length or an offset at or past the end leaves the rope unchanged.
// This makes node nonsensical, so dont use or free it after
// Instead, free the return value or use it in node's place
//
// Change from the previous version: the length is clamped with a
// subtraction. The old `offset + len > char_count` test wrapped around for
// large `len` (for example "erase to the end" with UINT32_MAX) and let the
// unclamped length through.
Knot *erase(Knot *node, uint32_t offset, uint32_t len) {
  if (!node || len == 0 || offset >= node->char_count)
    return node;
  const uint32_t available = node->char_count - offset;
  if (len > available)
    len = available;
  if (node->depth == 0) {
    uint32_t deleted_newlines = 0;
    for (uint32_t i = offset; i < offset + len; i++)
      if (node->data[i] == '\n')
        deleted_newlines++;
    node->line_count -= deleted_newlines;
    if (offset + len < node->char_count)
      memmove(node->data + offset, node->data + offset + len,
              node->char_count - (offset + len));
    node->char_count -= len;
    return node;
  }
  const uint32_t left_count = node->left ? node->left->char_count : 0;
  if (offset + len <= left_count) {
    node->left = erase(node->left, offset, len);
  } else if (offset >= left_count) {
    node->right = erase(node->right, offset - left_count, len);
  } else {
    // The range straddles both children: cut it out and rejoin the rest.
    Knot *left = nullptr, *middle = nullptr, *right = nullptr;
    split(node, offset, &left, &right);
    split(right, len, &middle, &right);
    free_rope(middle);
    return concat(left, right);
  }
  update(node);
  return balance(node);
}
// Copies `len` bytes starting at byte `offset` of the rope under `node` into
// `dest`. The caller guarantees that the range lies inside the rope and that
// `dest` is large enough; no terminator is written.
static void _read_into(Knot *node, uint32_t offset, uint32_t len, char *dest) {
  if (node == nullptr || len == 0)
    return;
  if (node->depth == 0) {
    memcpy(dest, node->data + offset, len);
    return;
  }
  Knot *lhs = node->left;
  const uint32_t lhs_chars = lhs ? lhs->char_count : 0;
  uint32_t remaining = len;
  uint32_t rhs_offset;
  if (offset >= lhs_chars) {
    // The range starts inside the right subtree.
    rhs_offset = offset - lhs_chars;
  } else {
    // Take what the left subtree can supply, then continue at the start of
    // the right subtree.
    uint32_t take = lhs_chars - offset;
    if (take > remaining)
      take = remaining;
    _read_into(lhs, offset, take, dest);
    dest += take;
    remaining -= take;
    rhs_offset = 0;
  }
  if (remaining > 0 && node->right)
    _read_into(node->right, rhs_offset, remaining, dest);
}
// Returns a newly malloc'd, null-terminated copy of up to `len` bytes
// starting at byte `offset`. The range is shortened when it runs past the
// end; an offset at or past the end yields an empty string. Returns nullptr
// for a null rope or when allocation fails.
//
// Change from the previous version: the length is clamped with a
// subtraction, so a very large `len` can no longer wrap `offset + len`
// around and slip past the bounds check.
char *read(Knot *root, uint32_t offset, uint32_t len) {
  if (!root)
    return nullptr;
  if (offset >= root->char_count) {
    char *empty = (char *)malloc(1);
    if (empty)
      empty[0] = '\0';
    return empty;
  }
  const uint32_t available = root->char_count - offset;
  if (len > available)
    len = available;
  char *buffer = (char *)malloc((size_t)len + 1);
  if (!buffer)
    return nullptr;
  _read_into(root, offset, len, buffer);
  buffer[len] = '\0';
  return buffer;
}
// Releases every node of the rope rooted at `root`; a null root is ignored.
// Hopefully free the tree only once at the end of its use using the pointer
// from the last insert or concat or erase call.
// (or use twice if last call was split - for both left and right).
void free_rope(Knot *root) {
  if (root != nullptr) {
    Knot *lhs = root->left;
    Knot *rhs = root->right;
    free_rope(lhs);
    free_rope(rhs);
    free(root);
  }
}
// Byte offset, relative to the start of `node`, of the newline with
// zero-based index `n`. Returns UINT32_MAX when the rope is null or has no
// such newline.
static uint32_t find_nth_newline_offset(Knot *node, uint32_t n) {
  if (node == nullptr || n > node->line_count)
    return UINT32_MAX;
  if (node->depth == 0) {
    uint32_t seen = 0;
    for (uint32_t pos = 0; pos < node->char_count; pos++) {
      if (node->data[pos] != '\n')
        continue;
      if (seen == n)
        return pos;
      seen++;
    }
    return UINT32_MAX;
  }
  const uint32_t lhs_lines = node->left ? node->left->line_count : 0;
  if (n < lhs_lines)
    return find_nth_newline_offset(node->left, n);
  // Skip the newlines held by the left subtree and search the right one.
  const uint32_t within_rhs =
      find_nth_newline_offset(node->right, n - lhs_lines);
  if (within_rhs == UINT32_MAX)
    return UINT32_MAX;
  const uint32_t lhs_chars = node->left ? node->left->char_count : 0;
  return lhs_chars + within_rhs;
}
// Number of newlines located before byte `offset`, which is the zero-based
// line that byte is on. An offset at or past the end returns the rope's
// total newline count; a null rope returns 0.
uint32_t byte_to_line(Knot *node, uint32_t offset) {
  if (node == nullptr)
    return 0;
  if (offset >= node->char_count)
    return node->line_count;
  if (node->depth == 0) {
    // offset < char_count is guaranteed here, so scanning up to offset stays
    // inside the leaf.
    uint32_t newlines = 0;
    for (uint32_t pos = 0; pos < offset; pos++)
      newlines += (node->data[pos] == '\n') ? 1u : 0u;
    return newlines;
  }
  const uint32_t lhs_chars = node->left ? node->left->char_count : 0;
  if (offset < lhs_chars)
    return byte_to_line(node->left, offset);
  const uint32_t lhs_lines = node->left ? node->left->line_count : 0;
  return lhs_lines + byte_to_line(node->right, offset - lhs_chars);
}
// Returns the byte offset at which zero-based line `line` starts. When
// `out_len` is non-null it receives the line's length in bytes, including
// its terminating newline if there is one. A line past the end starts at the
// rope's character count with length 0; a null rope yields 0 / 0.
uint32_t line_to_byte(Knot *node, uint32_t line, uint32_t *out_len) {
  if (node == nullptr) {
    if (out_len)
      *out_len = 0;
    return 0;
  }
  // Offset just past newline number `index`, or the end of the rope when
  // that newline does not exist.
  auto just_past_newline = [node](uint32_t index) -> uint32_t {
    const uint32_t at = find_nth_newline_offset(node, index);
    return (at == UINT32_MAX) ? node->char_count : at + 1;
  };
  const uint32_t start_offset = (line == 0) ? 0 : just_past_newline(line - 1);
  const uint32_t end_offset = just_past_newline(line);
  if (out_len)
    *out_len = (end_offset > start_offset) ? end_offset - start_offset : 0;
  return start_offset;
}
// Creates a line iterator positioned at zero-based line `start_line`.
// Returns nullptr for a null rope, a line number greater than the rope's
// newline count, or when allocation fails. The iterator is malloc'd.
//
// Change from the previous version: for start_line == 0 the current leaf is
// set after the descent, so a rope that consists of a single leaf now works.
// The old code only set it while walking down from an internal node, leaving
// it->node null for a single-leaf rope, so next_line() returned nothing.
//
// NOTE(review): for start_line > 0 the descent goes right whenever the
// remaining line number equals the left subtree's newline count, so text
// that follows the last newline of the left subtree is skipped - confirm
// whether leaves are expected to end on line boundaries.
LineIterator *begin_l_iter(Knot *root, uint32_t start_line) {
  if (!root || start_line > root->line_count)
    return nullptr;
  LineIterator *it = (LineIterator *)malloc(sizeof(LineIterator));
  if (!it)
    return nullptr;
  it->top = 0;
  it->line = start_line;
  it->node = nullptr;
  it->offset = 0;
  if (start_line == 0) {
    // Walk down the left spine, recording the path on the stack.
    Knot *curr = root;
    while (curr->left) {
      it->stack[it->top++] = curr;
      curr = curr->left;
    }
    it->stack[it->top++] = curr;
    if (!curr->right)
      it->node = curr; // no children: this is the first leaf
    return it;
  }
  Knot *curr = root;
  uint32_t relative_line = start_line;
  while (curr) {
    it->stack[it->top++] = curr;
    if (!curr->left && !curr->right) {
      it->node = curr;
      break;
    }
    const uint32_t left_lines = (curr->left) ? curr->left->line_count : 0;
    if (relative_line < left_lines) {
      curr = curr->left;
    } else {
      relative_line -= left_lines;
      curr = curr->right;
    }
  }
  if (!it->node) {
    free(it);
    return nullptr;
  }
  if (relative_line > 0) {
    // Position just past the relative_line-th newline inside the leaf.
    uint32_t found_newlines = 0;
    for (uint32_t i = 0; i < it->node->char_count; i++) {
      if (it->node->data[i] == '\n') {
        found_newlines++;
        if (found_newlines == relative_line) {
          it->offset = i + 1;
          break;
        }
      }
    }
  }
  return it;
}
// Moves the iterator to the next leaf, using the stack of nodes recorded on
// the way down. On success it->node is the new leaf and it->offset is 0;
// when no further leaf exists it->node becomes nullptr.
static inline void iter_advance_leaf(LineIterator *it) {
if (it->top == 0) {
it->node = nullptr;
return;
}
// Pop the node we are leaving, then climb until an ancestor is found that
// we came out of through its left child and that also has a right child.
Knot *prev = it->stack[--it->top];
while (it->top > 0) {
Knot *parent = it->stack[it->top - 1];
if (parent->left == prev && parent->right) {
// Descend from the right child to a leaf, preferring left children and
// pushing every visited node.
Knot *curr = parent->right;
while (curr) {
it->stack[it->top++] = curr;
if (!curr->left && !curr->right) {
it->node = curr;
it->offset = 0;
return;
}
curr = (curr->left) ? curr->left : curr->right;
}
}
// Nothing to the right here: pop this ancestor and keep climbing.
prev = it->stack[--it->top];
}
it->node = nullptr;
}
// Returns the next line as a newly malloc'd, null-terminated string that
// includes the trailing '\n' when the line has one, and advances the
// iterator. A line may span several leaves; the pieces are appended to one
// growing buffer. Returns nullptr when the iterator is null or exhausted, or
// when memory cannot be allocated.
// Nothing in this function keeps the returned buffer, so the caller
// presumably has to free it.
char *next_line(LineIterator *it) {
if (!it || !it->node)
return nullptr;
size_t capacity = 128;
size_t len = 0;
char *buffer = (char *)malloc(capacity);
if (!buffer)
return nullptr;
while (it->node) {
// Current leaf is used up: move on to the next one, if any.
if (it->offset >= it->node->char_count) {
iter_advance_leaf(it);
if (!it->node)
break;
}
char *start = it->node->data + it->offset;
char *end = it->node->data + it->node->char_count;
char *newline_ptr = (char *)memchr(start, '\n', end - start);
size_t chunk_len;
int found_newline = 0;
if (newline_ptr) {
chunk_len = (newline_ptr - start) + 1; // include the newline itself
found_newline = 1;
} else {
chunk_len = end - start; // the rest of this leaf
}
// Grow the buffer so the piece and a terminator fit.
if (len + chunk_len + 1 > capacity) {
capacity = (capacity * 2) + chunk_len;
char *new_buf = (char *)realloc(buffer, capacity);
if (!new_buf) {
free(buffer);
return nullptr;
}
buffer = new_buf;
}
memcpy(buffer + len, start, chunk_len);
len += chunk_len;
it->offset += chunk_len;
if (found_newline) {
buffer[len] = '\0';
it->line++;
return buffer;
}
}
// Ran out of leaves: whatever was collected is a final line without '\n'.
if (len > 0) {
buffer[len] = '\0';
it->line++;
return buffer;
}
free(buffer);
return nullptr;
}
// Creates a leaf iterator positioned on the first leaf of the rope, with the
// path from the root recorded on its stack. Returns nullptr for a null rope
// or when allocation fails. The iterator is malloc'd.
LeafIterator *begin_k_iter(Knot *root) {
  if (root == nullptr)
    return nullptr;
  LeafIterator *iter = (LeafIterator *)malloc(sizeof(LeafIterator));
  if (iter == nullptr)
    return nullptr;
  iter->top = 0;
  for (Knot *cursor = root; cursor != nullptr;) {
    iter->stack[iter->top++] = cursor;
    if (!cursor->left && !cursor->right) {
      iter->node = cursor;
      return iter;
    }
    // Prefer the left child; a node without one is continued through its
    // right child.
    cursor = cursor->left ? cursor->left : cursor->right;
  }
  free(iter);
  return nullptr;
}
// Caller must never free the returned string
//
// Returns the data of the current leaf and advances the iterator to the
// next leaf (it->node becomes nullptr once the last leaf was returned).
// Returns nullptr when `it` is null or the iterator is exhausted.
//
// The returned pointer is the leaf's own buffer; a '\0' is written at
// data[char_count] before returning.
// NOTE(review): this assumes every leaf buffer has room for at least
// char_count + 1 bytes - confirm against the leaf allocation code.
//
// Stack invariant: it->stack holds the full path from the root to the
// current leaf. An ancestor stays on the stack while its right subtree is
// being visited, so on the way back up `parent->right == prev` reliably
// means "right subtree already done". Previously the ancestor was popped
// before descending, which made nodes reached through a right link (no
// left child) re-enter an already visited subtree and loop forever.
char *next_leaf(LeafIterator *it) {
  if (!it || !it->node)
    return nullptr;
  char *data_to_return = it->node->data;
  data_to_return[it->node->char_count] = '\0';
  // `prev` is always the child we just climbed out of.
  Knot *prev = it->node;
  while (it->top > 0) {
    Knot *parent = it->stack[--it->top];
    if (parent->right && parent->right != prev) {
      // Came up from the left side: keep the ancestor on the stack and
      // descend to the first leaf of its right subtree.
      it->stack[it->top++] = parent;
      Knot *curr = parent->right;
      while (curr) {
        it->stack[it->top++] = curr;
        if (!curr->left && !curr->right) {
          it->node = curr;
          return data_to_return;
        }
        curr = curr->left ? curr->left : curr->right;
      }
    }
    prev = parent;
  }
  // No unvisited right subtree left: iteration is finished.
  it->node = nullptr;
  return data_to_return;
}
// Creates a byte iterator over the rope rooted at `root`.
// The iterator wraps a leaf iterator obtained from begin_k_iter(); no leaf
// is loaded yet, so the first next_byte() call fetches the first leaf.
// Returns nullptr if the iterator itself cannot be allocated.
// Both the returned object and its inner `it` member are malloc'd.
ByteIterator *begin_b_iter(Knot *root) {
  ByteIterator *b_it = (ByteIterator *)malloc(sizeof(ByteIterator));
  if (!b_it)
    return nullptr;
  // May be nullptr (null root or allocation failure inside begin_k_iter).
  b_it->it = begin_k_iter(root);
  b_it->offset_g = 0;
  b_it->offset_l = 0;
  b_it->char_count = 0;
  b_it->data = nullptr;
  return b_it;
}
// Returns the next byte of the rope, or '\0' once the rope is exhausted
// (or when `it` is null).
// Bytes are served from the cached leaf; when it is used up, offset_l is
// folded into offset_g and the next leaf is fetched via next_leaf().
// The leaf length is taken with strlen(), so a '\0' inside the data is
// indistinguishable from the end of the leaf.
char next_byte(ByteIterator *it) {
  if (!it)
    return '\0';
  if (it->data && it->offset_l < it->char_count)
    return it->data[it->offset_l++];
  // Current leaf consumed: account for it and move to the next one.
  it->offset_g += it->offset_l;
  it->offset_l = 1;
  char *data = next_leaf(it->it);
  it->data = data;
  if (!data) {
    // next_leaf() returns nullptr at the end of the rope; calling
    // strlen() on it (as was done before) is undefined behaviour.
    it->char_count = 0;
    return '\0';
  }
  it->char_count = strlen(data);
  return *data;
}
// Searches the rope rooted at `root` for `pattern` using PCRE2's DFA
// matcher, feeding it one leaf at a time.
//
// Returns a list of (absolute start offset, match length) pairs. The list
// is empty when the pattern fails to compile (the error code is printed to
// stderr), when an allocation fails, or when the rope has no leaves.
//
// Matches that straddle leaves are handled with partial matching: when a
// leaf ends in a partial match, its absolute start is remembered and the
// next leaf is matched with PCRE2_DFA_RESTART so the matcher continues
// where it stopped.
// NOTE(review): if such a continuation fails, candidates that start inside
// the previous leaf after the remembered start are not retried.
std::vector<std::pair<size_t, size_t>> search_rope(Knot *root,
                                                   const char *pattern) {
  std::vector<std::pair<size_t, size_t>> results;
  int errorcode;
  PCRE2_SIZE erroffset;
  pcre2_code *re = pcre2_compile((PCRE2_SPTR)pattern, PCRE2_ZERO_TERMINATED, 0,
                                 &errorcode, &erroffset, nullptr);
  if (!re) {
    fprintf(stderr, "PCRE2 compile error: %d\n", errorcode);
    return results;
  }
  pcre2_match_data *mdata = pcre2_match_data_create(128, nullptr);
  if (!mdata) {
    // Allocation failure: nothing can be matched without match data.
    pcre2_code_free(re);
    return results;
  }
  // The same workspace and match data must be reused across restarts.
  int workspace[PCRE_WORKSPACE_SIZE];
  LeafIterator *it = begin_k_iter(root);
  if (!it) {
    pcre2_code_free(re);
    pcre2_match_data_free(mdata);
    return results;
  }
  size_t chunk_abs_offset = 0;  // absolute offset of the current leaf
  size_t saved_match_start = 0; // absolute start of a pending partial match
  bool match_in_progress = false;
  int flags = PCRE2_PARTIAL_SOFT;
  while (1) {
    const char *chunk_start = next_leaf(it);
    if (!chunk_start)
      break;
    size_t chunk_len = strlen(chunk_start);
    const char *current_ptr = chunk_start;
    size_t remaining_len = chunk_len;
    while (remaining_len > 0) {
      int rc =
          pcre2_dfa_match(re, (PCRE2_SPTR)current_ptr, remaining_len, 0, flags,
                          mdata, nullptr, workspace, PCRE_WORKSPACE_SIZE);
      if (rc >= 0) {
        // Complete match; ovector offsets are relative to current_ptr.
        PCRE2_SIZE *ov = pcre2_get_ovector_pointer(mdata);
        size_t base = chunk_abs_offset + (current_ptr - chunk_start);
        size_t match_end_abs = base + ov[1];
        // A continued match started in an earlier leaf.
        size_t match_start_abs =
            match_in_progress ? saved_match_start : base + ov[0];
        size_t total_len = match_end_abs - match_start_abs;
        results.push_back(std::make_pair(match_start_abs, total_len));
        // Always advance by at least one byte so empty matches cannot stall.
        size_t consumed = ov[1];
        if (consumed == 0)
          consumed = 1;
        current_ptr += consumed;
        if (consumed > remaining_len)
          remaining_len = 0;
        else
          remaining_len -= consumed;
        match_in_progress = false;
        flags = PCRE2_PARTIAL_SOFT | PCRE2_NOTBOL;
        continue;
      } else if (rc == PCRE2_ERROR_PARTIAL) {
        // The leaf ended in the middle of a possible match: remember where
        // it began (first time only) and continue with the next leaf.
        PCRE2_SIZE *ov = pcre2_get_ovector_pointer(mdata);
        if (!match_in_progress) {
          saved_match_start =
              chunk_abs_offset + (current_ptr - chunk_start) + ov[0];
          match_in_progress = true;
        }
        flags |= PCRE2_DFA_RESTART;
        flags |= PCRE2_NOTBOL;
        break;
      } else {
        if (match_in_progress) {
          // The continuation failed. Drop the pending match and retry this
          // leaf from the same position as a fresh search; skipping a byte
          // here would miss matches starting at the leaf's first byte.
          // The retry cannot loop: with match_in_progress cleared, every
          // outcome either consumes input or leaves the inner loop.
          match_in_progress = false;
          flags = PCRE2_PARTIAL_SOFT | PCRE2_NOTBOL;
        } else {
          // No (further) match in this leaf. Errors other than
          // PCRE2_ERROR_NOMATCH are currently treated the same way.
          break;
        }
      }
    }
    chunk_abs_offset += chunk_len;
    if (!match_in_progress)
      flags = PCRE2_PARTIAL_SOFT | PCRE2_NOTBOL;
  }
  pcre2_match_data_free(mdata);
  pcre2_code_free(re);
  free(it);
  return results;
}
// Picks a leaf chunk size for a text of `length` bytes.
// Inputs no larger than MIN_CHUNK_SIZE get MIN_CHUNK_SIZE. Otherwise the
// size is 2^e with e = 7 + (log2(length) - 10) / 4, where e is capped at
// log2(MAX_CHUNK_SIZE); the result is clamped to
// [MIN_CHUNK_SIZE, MAX_CHUNK_SIZE] and finally rounded up to a power of two.
// NOTE(review): the rounding step relies on the clamped size being >= 2
// (__builtin_clz(0) is undefined) - confirm MIN_CHUNK_SIZE >= 2.
uint32_t optimal_chunk_size(uint64_t length) {
  if (length <= MIN_CHUNK_SIZE)
    return MIN_CHUNK_SIZE;
  const double cap_exponent = std::log2((double)MAX_CHUNK_SIZE);
  const double grown_exponent =
      7.0 + (std::log2((double)length) - 10.0) * 0.25;
  const double exponent = MIN(cap_exponent, grown_exponent);
  uint32_t size = (uint32_t)std::pow(2.0, exponent);
  size = MAX((uint32_t)MIN_CHUNK_SIZE, size);
  size = MIN(size, (uint32_t)MAX_CHUNK_SIZE);
  // Smallest power of two that is >= size.
  return 1U << (32 - __builtin_clz(size - 1));
}
// Basic correctness test & usage example
// Walks through the public rope API (load, split, read, concat, insert,
// erase, line lookups, the iterators and both regex searches) on a small
// four-line sample text and prints the results to stdout.
// Returns 0 on success, 1 if the sample buffer cannot be allocated.
int _main() {
  // 4 lines of 44 chars + 3 newlines + NUL = 180 bytes; one byte spare.
  char *buffer = (char *)malloc(44 * 4 + 5);
  if (!buffer)
    return 1;
  strcpy(buffer, "The quick brown fox jumps over the lazy dog.\n"
                 "The quick brown fox jumps over the lazy dog.\n"
                 "The quick brown fox jumps over the lazy dog.\n"
                 "The quick brown fox jumps over the lazy dog.");
  // This loads all (excluding \0 put in by strcpy)
  Knot *root = load(buffer, 44 * 4 + 3, optimal_chunk_size(44 * 4 + 3));
  Knot *left = nullptr, *right = nullptr;
  // Splits root into left and right (root is no longer valid)
  split(root, 5, &left, &right);
  // simple read based on byte offset and length
  char *s1 = read(left, 0, 100);
  printf("%s\n:\n", s1);
  char *s2 = read(right, 0, 100);
  printf("%s\n;\n", s2);
  free(s1);
  free(s2);
  // Recombines left and right into root (both can
  // be valid or invalid in optimized cases)
  // they are to not be used after concat
  root = concat(left, right);
  // root should be set to return value from insert always
  root = insert(root, 5, buffer, 5);
  free(buffer);
  char *s3 = read(root, 0, 100);
  printf("%s\n,\n", s3);
  // Similar to insert but for erase
  root = erase(root, 5, 5);
  char *s4 = read(root, 0, 100);
  printf("%s\n.\n", s4);
  free(s3);
  free(s4);
  uint32_t byte_offset;
  uint32_t len;
  // Byte offset given relative to how it would
  // be in a file. offset + len includes the \n
  // at the end of the line (or nothing if EOF)
  byte_offset = line_to_byte(root, 2, &len);
  char *s5 = read(root, byte_offset, len);
  printf("%s\n'\n", s5);
  free(s5);
  // returns line number of which line that
  // byte position would be in.
  // the ending \n position is included in this
  uint32_t line = byte_to_line(root, byte_offset + len - 1);
  printf("%u\n:\n", line);
  // Iterate over the lines; presumably the second argument is the
  // 0-indexed line to start from (0 = first line) - TODO confirm.
  LineIterator *it = begin_l_iter(root, 0);
  char *c = nullptr;
  while ((c = next_line(it)) != nullptr) {
    printf("%s :wow:\n", c);
    free(c);
  }
  free(it);
  printf("\n/\n");
  // Starts at first byte (to be used for regex search)
  ByteIterator *it2 = begin_b_iter(root);
  uint32_t saved[40];
  for (int i = 0; i < 40; i++)
    saved[i] = 0;
  std::string pattern = "f.x";
  // NOTE(review): `program` is never released here; how it must be freed
  // depends on compile_regex - confirm.
  Inst *program = compile_regex(pattern);
  bool result;
  while ((result = next_match(program, it2, saved))) {
    printf("\nRES: %d\n", result);
    // saved[] holds uint32_t values, so print them as unsigned.
    for (int i = 0; i < 40; i++)
      printf("%u, ", saved[i]);
  }
  // char c2 = ' ';
  // while ((c2 = next_byte(it2)) != '\0')
  // printf("%c :wow!:\n", c2);
  // The byte iterator and its inner leaf iterator are both malloc'd by
  // begin_b_iter, so both have to be released.
  if (it2) {
    free(it2->it);
    free(it2);
  }
  // search // uses leaf iterator internally // PCRE2 based
  std::vector<std::pair<size_t, size_t>> matches = search_rope(root, "f.x");
  // size_t values need %zu (%lu is wrong where long is not 64 bit).
  for (size_t i = 0; i < matches.size(); i++)
    printf("\n%zu %zu", matches[i].first, matches[i].second);
  // A rope needs to be freed only once if last action on the rope is
  // insert or concat or erase.
  // for splits we need to free both left and right separately
  free_rope(root);
  return 0;
}

283
src/test.cpp Normal file
View File

@@ -0,0 +1,283 @@
#include "../headers/noose.hpp"
#include "../headers/rope.hpp"
#include <chrono>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <fstream>
#include <iostream>
#include <string>
// Reads the whole file at `path` into a malloc'd buffer.
// On success returns the buffer (caller must free() it) and, if `out_len`
// is not null, stores the file size in bytes there. The buffer is NOT
// NUL-terminated. An empty file yields a valid non-null buffer with
// length 0.
// On any failure (open, seek, allocation, short read) an error is printed
// to stderr, nullptr is returned and *out_len is left untouched.
char *load_file(const char *path, size_t *out_len) {
  FILE *f = fopen(path, "rb");
  if (!f) {
    perror("fopen");
    return nullptr;
  }
  if (fseek(f, 0, SEEK_END) != 0) {
    perror("fseek");
    fclose(f);
    return nullptr;
  }
  // ftell reports errors as -1; check before converting to size_t.
  long end_pos = ftell(f);
  if (end_pos < 0) {
    perror("ftell");
    fclose(f);
    return nullptr;
  }
  rewind(f);
  size_t len = (size_t)end_pos;
  // malloc(0) may legally return nullptr; always request at least 1 byte
  // so an empty file is not mistaken for an allocation failure.
  char *buf = (char *)malloc(len ? len : 1);
  if (!buf) {
    perror("malloc");
    fclose(f);
    return nullptr;
  }
  size_t got = fread(buf, 1, len, f);
  if (got != len) {
    // A short read would leave part of the buffer uninitialised.
    if (ferror(f))
      perror("fread");
    else
      fprintf(stderr, "fread: unexpected end of file\n");
    fclose(f);
    free(buf);
    return nullptr;
  }
  fclose(f);
  if (out_len)
    *out_len = len;
  return buf;
}
// Benchmark driver.
// Part 1 loads ./random.bin into a rope and times load, read, insert,
// erase, split, concat, the line lookups, leaf iteration and both regex
// searches. Part 2 repeats the basic operations on a std::string for
// comparison. All results are printed to stdout.
// Returns 1 if the input file cannot be read, 0 otherwise.
int main() {
  printf("My rope implementation benchmark\n");
  {
    size_t len;
    printf("Loading file into rope...\n");
    char *buf = load_file("./random.bin", &len);
    if (!buf)
      return 1; // load_file already reported the reason
    auto start = std::chrono::high_resolution_clock::now();
    Knot *root = load(buf, len, 2);
    auto end = std::chrono::high_resolution_clock::now();
    printf("Load time: %.3f s\n",
           std::chrono::duration<double>(end - start).count());
    free(buf);
    // READ TEST
    printf("Testing read...\n");
    start = std::chrono::high_resolution_clock::now();
    char *content = read(root, len / 2, 1024);
    end = std::chrono::high_resolution_clock::now();
    free(content);
    printf("Read 1 KB from middle: %.6f s\n",
           std::chrono::duration<double>(end - start).count());
    // INSERT TEST
    printf("Testing insert...\n");
    char insert_data[1024];
    memset(insert_data, 'X', 1024);
    start = std::chrono::high_resolution_clock::now();
    root = insert(root, len / 2, insert_data, 1024);
    end = std::chrono::high_resolution_clock::now();
    printf("Insert 1 KB in middle: %.6f s\n",
           std::chrono::duration<double>(end - start).count());
    // ERASE TEST (Delete the same 1 KB we just inserted)
    printf("Testing erase...\n");
    start = std::chrono::high_resolution_clock::now();
    root = erase(root, len / 2, 1024);
    end = std::chrono::high_resolution_clock::now();
    printf("Erase 1 KB in middle: %.6f s\n",
           std::chrono::duration<double>(end - start).count());
    // SPLIT TEST
    printf("Testing split...\n");
    Knot *left = nullptr, *right = nullptr;
    start = std::chrono::high_resolution_clock::now();
    split(root, len / 2, &left, &right);
    end = std::chrono::high_resolution_clock::now();
    printf("Split at middle: %.6f s\n",
           std::chrono::duration<double>(end - start).count());
    // CONCAT TEST
    printf("Testing concat...\n");
    start = std::chrono::high_resolution_clock::now();
    root = concat(left, right);
    end = std::chrono::high_resolution_clock::now();
    printf("Concat: %.6f s\n",
           std::chrono::duration<double>(end - start).count());
    // ---------------------------------------------------------
    // LINE OPERATIONS TESTS
    // ---------------------------------------------------------
    printf("Testing line operations...\n");
    // KNOWN CONSTANTS based on: yes "The quick brown fox jumps over the lazy
    // dog." String length: 44 + 1 newline = 45 bytes per line.
    const uint32_t BYTES_PER_LINE = 45;
    const uint32_t TEST_LINE_INDEX = 1000; // A line deep in the file
    // 1. Test byte_to_line
    // We pick a byte in the middle of TEST_LINE_INDEX.
    // Offset = (1000 * 45) + 10.
    uint32_t test_offset = (TEST_LINE_INDEX * BYTES_PER_LINE) + 10;
    start = std::chrono::high_resolution_clock::now();
    // Keep the full 32-bit result; a 16-bit variable would silently
    // truncate line numbers above 65535.
    uint32_t calculated_line = byte_to_line(root, test_offset);
    end = std::chrono::high_resolution_clock::now();
    printf("byte_to_line (%u -> %u): %.6f s ", test_offset, calculated_line,
           std::chrono::duration<double>(end - start).count());
    if (calculated_line == TEST_LINE_INDEX) {
      printf("[PASS]\n");
    } else {
      printf("[FAIL] Expected %u, got %u\n", TEST_LINE_INDEX, calculated_line);
    }
    // 2. Test line_to_byte
    // We ask for the start of TEST_LINE_INDEX. Should be exactly
    // TEST_LINE_INDEX * 45.
    uint32_t out_len = 0;
    uint32_t expected_start = TEST_LINE_INDEX * BYTES_PER_LINE;
    start = std::chrono::high_resolution_clock::now();
    uint32_t calculated_start = line_to_byte(root, TEST_LINE_INDEX, &out_len);
    end = std::chrono::high_resolution_clock::now();
    printf("line_to_byte (Line %u -> Offset %u): %.6f s ", TEST_LINE_INDEX,
           calculated_start,
           std::chrono::duration<double>(end - start).count());
    if (calculated_start == expected_start && out_len == BYTES_PER_LINE) {
      printf("[PASS]\n");
    } else {
      printf("[FAIL] Expected offset %u (len %u), got %u (len %u)\n",
             expected_start, BYTES_PER_LINE, calculated_start, out_len);
    }
    // ---------------------------------------------------------
    // ITERATOR SPEED TEST
    // ---------------------------------------------------------
    printf("Testing iterator speed...\n");
    // Number of next_leaf() calls to time. The messages below say "lines",
    // but each step returns one rope leaf.
    const uint32_t LINES_TO_ITERATE = 10000;
    // 1. Initialize the iterator (it starts at the first leaf of the rope)
    LeafIterator *it = begin_k_iter(root);
    if (!it) {
      printf("Iterator Test: [FAIL] begin_iterator returned NULL.\n");
    } else {
      char *line = NULL;
      uint32_t lines_read = 0;
      start = std::chrono::high_resolution_clock::now();
      // 2. Iterate and time the process
      // We use the clean C idiom: get the leaf, check for NULL, then
      // process.
      while (lines_read < LINES_TO_ITERATE && (line = next_leaf(it)) != NULL) {
        // Note: We deliberately skip printing to focus on the Rope operation
        // time.
        lines_read++;
      }
      end = std::chrono::high_resolution_clock::now();
      double elapsed_time = std::chrono::duration<double>(end - start).count();
      printf("Iterator speed (f:: %u): %.6f s (%.2f lines/s)\n", lines_read,
             elapsed_time, (double)lines_read / elapsed_time);
      if (lines_read == LINES_TO_ITERATE) {
        printf("Iterator Test: [PASS] Successfully iterated %u lines.\n",
               LINES_TO_ITERATE);
      } else {
        printf("Iterator Test: [FAIL] Expected %u lines, read %u.\n",
               LINES_TO_ITERATE, lines_read);
      }
      // 3. Clean up the iterator
      free(it);
    }
    // search test
    start = std::chrono::high_resolution_clock::now();
    std::vector<std::pair<size_t, size_t>> matches = search_rope(root, "f.x");
    end = std::chrono::high_resolution_clock::now();
    printf("Search Time: %.6f s\n",
           std::chrono::duration<double>(end - start).count());
    // size() is a size_t, which needs %zu.
    printf("Found %zu matches\n", matches.size());
    char *c = read(root, 0, 1000);
    printf("%s\n", c);
    free(c);
    // Dump the rope byte by byte through the byte iterator.
    ByteIterator *it1 = begin_b_iter(root);
    char ch;
    while ((ch = next_byte(it1)) != '\0') {
      printf("%c:", ch);
    }
    // begin_b_iter mallocs both the iterator and its inner leaf iterator.
    if (it1) {
      free(it1->it);
      free(it1);
    }
    ByteIterator *it2 = begin_b_iter(root);
    uint32_t saved[40];
    for (int i = 0; i < 40; i++)
      saved[i] = 0;
    std::string pattern = "f.x";
    // NOTE(review): `program` is never released; how it must be freed
    // depends on compile_regex - confirm.
    Inst *program = compile_regex(pattern);
    bool result;
    int count = 0;
    start = std::chrono::high_resolution_clock::now();
    while ((result = next_match(program, it2, saved))) {
      count++;
      printf("%d\n", count);
    }
    end = std::chrono::high_resolution_clock::now();
    printf("Search Time: %.6f s\n",
           std::chrono::duration<double>(end - start).count());
    printf("Found2 %d matches\n", count);
    if (it2) {
      free(it2->it);
      free(it2);
    }
    free_rope(root);
  }
  printf("Testing std::string...\n");
  {
    std::ifstream file("random.bin", std::ios::binary | std::ios::ate);
    if (!file) {
      perror("ifstream");
      return 1;
    }
    // Opened at the end (ios::ate), so tellg() is the file size.
    size_t len = file.tellg();
    file.seekg(0);
    std::string data(len, '\0');
    file.read(data.data(), len);
    std::string s = data;
    auto start = std::chrono::high_resolution_clock::now();
    // READ: middle 1 KB
    std::string read_chunk = s.substr(len / 2, 1024);
    auto end = std::chrono::high_resolution_clock::now();
    printf("std::string read 1 KB from middle: %.6f s\n",
           std::chrono::duration<double>(end - start).count());
    // INSERT: middle 1 KB
    std::string insert_data(1024, 'X');
    start = std::chrono::high_resolution_clock::now();
    s.insert(len / 2, insert_data);
    end = std::chrono::high_resolution_clock::now();
    printf("std::string insert 1 KB in middle: %.6f s\n",
           std::chrono::duration<double>(end - start).count());
    // ERASE: middle 1 KB
    start = std::chrono::high_resolution_clock::now();
    s.erase(len / 2, 1024);
    end = std::chrono::high_resolution_clock::now();
    printf("std::string erase 1 KB in middle: %.6f s\n",
           std::chrono::duration<double>(end - start).count());
    // SPLIT: middle
    start = std::chrono::high_resolution_clock::now();
    std::string left = s.substr(0, len / 2);
    std::string right = s.substr(len / 2);
    end = std::chrono::high_resolution_clock::now();
    printf("std::string split at middle: %.6f s\n",
           std::chrono::duration<double>(end - start).count());
    // CONCAT
    start = std::chrono::high_resolution_clock::now();
    s = left + right;
    end = std::chrono::high_resolution_clock::now();
    printf("std::string concat: %.6f s\n",
           std::chrono::duration<double>(end - start).count());
  }
  return 0;
}