Initial commit

This commit is contained in:
2025-11-27 16:42:36 +00:00
commit a95e5c09b8
7 changed files with 2188 additions and 0 deletions

87
api/noose.hpp Normal file
View File

@@ -0,0 +1,87 @@
#ifndef NOOSE_HPP
#define NOOSE_HPP
#include "../headers/rope.hpp"
#include <string>
// AST node type; defined further down, declared here so the operator payloads
// can point at it.
struct Exp;

// Tag telling which payload an Exp node carries.
enum class ExpKind {
  RANGE, // character class, stored in Exp::ranges
  OR,    // alternation, stored in Exp::opor
  SEQ,   // concatenation, stored in Exp::opseq
  ANY,   // any single character, no payload
  NONE   // empty expression, no payload
};
// One inclusive character interval of a character class.
// The member order is part of the contract: the parser creates these with
// ExRange{negate, start, end}.
struct ExRange {
  bool negate{false}; // true when the interval belongs to a negated class
  char start;         // first character of the interval
  char end;           // last character of the interval
};
// Payload of an ExpKind::OR node: the two alternatives of `left|right`.
struct OpOr {
// First alternative.
Exp *left;
// Second alternative.
Exp *right;
};
// Payload of an ExpKind::SEQ node: `left` followed by `right`.
struct OpSeq {
// Sub-expression that comes first.
Exp *left;
// Sub-expression that comes second.
Exp *right;
};
struct Exp {
bool capture;
ExpKind kind;
union {
OpOr *opor;
OpSeq *opseq;
std::vector<ExRange> ranges;
};
Exp() {
capture = false;
kind = ExpKind::NONE;
}
};
// Cursor over a pattern string, consumed by the recursive-descent parser.
struct Parser {
  std::string s; // private copy of the pattern being parsed
  size_t i = 0;  // index of the next unread character of `s`
  Parser(std::string str) : s(str) {}
};
// Opcodes of the regex VM. The numeric values are assigned explicitly.
enum Op {
// These jump around
JMP = 0, // Jump to j.x
FRK = 1, // Fork to j.x and j.y (with priority to x)
// These consume 1 char from the input; if they cannot, the thread fails
// (failure of the main thread means the match is not successful)
MCH = 2, // match with range object
NMC = 3, // not match with range object
ANY = 4, // Anything
// Used to save offsets
SVS = 5, // Start save for capture group idx
SVE = 6, // End save for capture group idx
// Match is successful if main thread reaches the end
END = 7
};
// Inclusive character interval tested by MCH/NMC instructions.
struct Range { // use start == end to match a particular char
// First character of the interval (inclusive).
char start;
// Last character of the interval (inclusive).
char end;
};
// One VM instruction. Which union member is meaningful depends on `op`:
// MCH/NMC read `r`, JMP/FRK read `j`; SVS/SVE read `idx`.
struct Inst {
Op op;
union {
// Operand of MCH/NMC: `len` ranges starting at `ranges`.
struct {
Range *ranges;
int len;
} r;
// Operand of JMP (x only) and FRK (x and y): indices into the program array.
struct {
int x, y;
} j;
};
// Capture group number used by SVS/SVE.
int idx;
};
// Parses `pattern` into an AST; never returns null (an empty pattern yields a
// NONE node).
Exp *regex_to_ast(std::string pattern);
// Compiles an AST into an END-terminated instruction array.
Inst *compile_ast(Exp *root);
// Convenience wrapper: regex_to_ast followed by compile_ast.
Inst *compile_regex(std::string pattern);
// Runs `prog` over the bytes produced by `it`; returns non-zero when a thread
// reaches END and 0 when the input is exhausted first.
// `saved` must point at 40 capture slots.
int next_match(Inst *prog, ByteIterator *it, uint32_t *saved);
#endif

157
api/rope.hpp Normal file
View File

@@ -0,0 +1,157 @@
#ifndef ROPE_HPP
#define ROPE_HPP
#include <cstdint>
#include <vector>
#define MIN_CHUNK_SIZE 64 // 64 Bytes
// Parenthesized so the product survives inside a larger expression
// (`x / MAX_CHUNK_SIZE` used to expand to `x / 1024 * 8`).
#define MAX_CHUNK_SIZE (1024 * 8) // 8192 Bytes (8 KiB)
// NOTE: MAX/MIN evaluate their winning argument twice; do not pass expressions
// with side effects.
#define MAX(a, b) ((a) > (b) ? (a) : (b))
#define MIN(a, b) ((a) < (b) ? (a) : (b))
// Depth of a possibly-null node; a null node counts as depth 0.
#define DEPTH(n) ((n) ? (n)->depth : 0)
// PCRE2 requires the code unit width to be defined before <pcre2.h> is
// included.
#define PCRE2_CODE_UNIT_WIDTH 8
#define PCRE_WORKSPACE_SIZE 512
// Rope node. Interior nodes join two sub-ropes; leaves (depth == 0) keep their
// text in the trailing `data` array.
struct Knot {
  Knot *left;          // left child, nullptr in a leaf
  Knot *right;         // right child, nullptr in a leaf
  uint8_t depth;       // 0 for a leaf, otherwise 1 + the deeper child's depth
  uint32_t chunk_size; // bytes reserved for a leaf's `data`
  uint32_t line_count; // number of '\n' bytes stored in this subtree
  uint32_t char_count; // number of bytes stored in this subtree
  char data[];         // leaf text, allocated together with the node
};
// State of a line-by-line walk over a rope (see begin_l_iter / next_line).
// NOTE(review): field meanings below are inferred from the names; confirm
// against the iterator implementation.
struct LineIterator {
  Knot *node;      // presumably the node currently being visited
  uint8_t top;     // presumably the number of entries in use in `stack`
  uint32_t offset; // presumably the byte position inside `node`
  uint32_t line;   // presumably the current line number
  Knot *stack[64]; // fixed-capacity array of node pointers
};
// State of a leaf-by-leaf walk over a rope (see begin_k_iter / next_leaf).
// NOTE(review): field meanings below are inferred from the names; confirm
// against the iterator implementation.
struct LeafIterator {
  Knot *node;      // presumably the node currently being visited
  uint8_t top;     // presumably the number of entries in use in `stack`
  uint32_t offset; // presumably a byte position inside the rope or leaf
  Knot *stack[64]; // fixed-capacity array of node pointers
};
// State of a byte-by-byte walk over a rope (see begin_b_iter / next_byte).
// NOTE(review): apart from `it`, field meanings are inferred from the names;
// confirm against the iterator implementation.
struct ByteIterator {
  LeafIterator *it;    // underlying leaf iterator
  uint32_t offset_l;   // presumably the offset inside the current leaf
  uint32_t offset_g;   // presumably the offset from the start of the rope
  uint32_t char_count; // presumably the byte count of the current leaf
  char *data;          // presumably the current leaf's bytes
};
// Rope operations

// Takes the length of the string that is going to be converted to a rope and
// returns a suitable chunk size.
// The rope itself should work with any positive chunk size.
uint32_t optimal_chunk_size(uint64_t length);
// Takes a string (no need for null termination) and returns a rope.
// len is the length of the string and chunk_size is the size of each chunk.
// load does not free or consume the string, so str can be freed once load
// has returned.
Knot *load(char *str, uint32_t len, uint32_t chunk_size);
// Balances the rope and returns the root.
// n is no longer valid afterwards / do not free it.
// The rope is already balanced by the other functions, so this is not meant
// to be called directly.
Knot *balance(Knot *n);
// Concatenates two ropes and returns the joined root.
// Balances the ropes too, if needed.
// left and right are no longer valid afterwards / do not free them.
// ! left and right should have the same chunk size !
Knot *concat(Knot *left, Knot *right);
// Inserts text into the rope.
// node (the rope being inserted into) is no longer valid after the call;
// use the return value as the new node instead.
// offset is the position of the insertion relative to the start of the rope.
// str is the string to be inserted (no need for null termination).
// len is the length of the string.
Knot *insert(Knot *node, uint32_t offset, char *str, uint32_t len);
// Counterpart of insert for deletion.
// node (the rope being deleted from) is no longer valid after the call;
// use the return value as the new node instead.
// offset is the position of the deletion relative to the start of the rope.
// len is the length of the deletion.
Knot *erase(Knot *node, uint32_t offset, uint32_t len);
// Reads a string from the rope.
// root is the rope to be read from.
// offset is the position of the read relative to the start of the rope.
// len is the length of the read.
// Returns a null terminated string that must be freed by the caller.
char *read(Knot *root, uint32_t offset, uint32_t len);
// Splits the rope into a left and a right rope.
// node is the rope to be split (it is no longer valid after the call / do not
// free it).
// offset is the position of the split relative to the start of the rope.
// left and right are set to the roots of the two sides of the split.
void split(Knot *node, uint32_t offset, Knot **left, Knot **right);
// Converts a byte offset to the number of the line that contains that byte.
uint32_t byte_to_line(Knot *node, uint32_t offset);
// Converts a line number to a byte offset (start of the line);
// also sets out_len to the length of the line.
uint32_t line_to_byte(Knot *node, uint32_t line, uint32_t *out_len);
// Starts a line iterator at line number start_line.
// root is the root of the rope.
// The returned iterator must be freed after iteration is done.
LineIterator *begin_l_iter(Knot *root, uint32_t start_line);
// Each call returns the next line as a null terminated string, or null when
// there are no more lines.
// `it` is the iterator returned from begin_l_iter.
// The iterator may be freed as soon as the necessary lines have been read
// (there is no need to iterate up to the end).
// Every returned string `must` be freed by the caller.
char *next_line(LineIterator *it);
// Starts an iterator over leaf data.
// root is the root of the rope.
// The caller must free the iterator after use.
LeafIterator *begin_k_iter(Knot *root);
// Returns the next leaf's data as a null terminated string.
// `it` is the iterator returned from begin_k_iter.
// ! Strings returned must never be freed by the caller !
// To mutate the string a copy must be made.
char *next_leaf(LeafIterator *it);
// Starts an iterator over byte data (one byte at a time).
// Uses a leaf iterator internally.
// root is the root of the rope; the caller must free the iterator after use.
ByteIterator *begin_b_iter(Knot *root);
// Returns the next byte from the iterator.
// Returns '\0' if there are no more bytes left.
// `it` is the iterator returned from begin_b_iter.
char next_byte(ByteIterator *it);
// Searches for a pattern in the rope.
// pattern is a null terminated string holding a regular expression. It has to
// be DFA compliant, i.e. some features such as backtracking are not supported.
// root is the root of the rope to be searched.
// Returns a vector of (start offset, length) pairs, both in bytes.
std::vector<std::pair<size_t, size_t>> search_rope(Knot *root,
const char *pattern);
// Frees the whole rope.
// root is the root of the rope; it is no longer valid after the call.
// This must be called only once, when the rope is no longer needed.
void free_rope(Knot *root);
#endif // ROPE_HPP

272
src/noose.cpp Normal file
View File

@@ -0,0 +1,272 @@
#include "../headers/noose.hpp"
#include "../headers/rope.hpp"
#include <assert.h>
#include <cstdint>
#include <cstdlib>
#include <cstring>
#include <stdio.h>
// VM - pass 2
bool test_ranges(char inp, Range *ranges, int len) {
for (int i = 0; i < len; i++) {
Range *r = ranges + i;
if (inp >= r->start && inp <= r->end)
return true;
}
return false;
}
// Use pike vm method
// One VM thread: an instruction pointer plus its own copy of the capture
// offsets.
struct Thread {
Inst *pc;
uint32_t saved[40]; /* start/end pairs: group idx uses [2 * idx] and [2 * idx + 1] */
};
// Builds a Thread positioned at `pc` that carries its own copy of the 40
// capture slots read from `saved`.
Thread thread(Inst *pc, uint32_t *saved) {
  Thread fresh;
  fresh.pc = pc;
  uint32_t *dst = fresh.saved;
  for (const uint32_t *src = saved; src != saved + 40; ++src, ++dst)
    *dst = *src;
  return fresh;
}
// Array-backed list of runnable threads for one input position.
struct ThreadList {
// Backing array; next_match allocates one slot per program instruction.
Thread *t;
// Number of entries of `t` currently in use.
int n;
};
// Copies the 40 capture slots of a finished thread (`tsaved`) into `saved`.
void handle_end(uint32_t *tsaved, uint32_t *saved) {
  const uint32_t *in = tsaved;
  uint32_t *out = saved;
  for (int left = 40; left > 0; --left)
    *out++ = *in++;
}
// Follows every non-consuming instruction reachable from thread `t` and queues
// the consuming ones (MCH/NMC/ANY) on `list`.
//
// prog  - start of the program; JMP/FRK targets are indices into it
// list  - destination list; a pc is queued at most once
// t     - thread to expand (taken by value, so capture writes stay private)
// count - current input offset, recorded by SVS/SVE
//
// Returns true as soon as an END instruction is reached, false otherwise.
bool addstate(Inst *prog, ThreadList *list, Thread t, int count) {
  // Number of slots in Thread::saved; each capture group owns two of them.
  const int slots = (int)(sizeof(t.saved) / sizeof(t.saved[0]));
  switch (t.pc->op) {
  case JMP:
    return addstate(prog, list, thread(prog + t.pc->j.x, t.saved), count);
  case FRK:
    // x is tried first, which gives it priority over y.
    return addstate(prog, list, thread(prog + t.pc->j.x, t.saved), count) ||
           addstate(prog, list, thread(prog + t.pc->j.y, t.saved), count);
  case SVS:
  case SVE: {
    // Start offsets go to the even slot, end offsets to the odd one. A group
    // number that does not fit the fixed-size array is skipped instead of
    // writing out of bounds.
    const int group = t.pc->idx;
    if (group >= 0 && group < slots / 2)
      t.saved[group * 2 + (t.pc->op == SVE ? 1 : 0)] = count;
    return addstate(prog, list, thread(t.pc + 1, t.saved), count);
  }
  case END:
    // NOTE(review): this copies the thread's captures onto themselves, so the
    // caller's capture array is never updated from here. Publishing the
    // offsets would need an extra output parameter.
    handle_end(t.saved, t.saved);
    return true;
  default:
    // Consuming instruction: queue it unless this pc is already queued.
    for (int i = 0; i < list->n; i++)
      if (list->t[i].pc == t.pc)
        return false;
    list->t[list->n++] = t;
    return false;
  }
}
// Exchanges the contents (array pointer and length) of two thread lists.
void inline swap(ThreadList *a, ThreadList *b) {
  const ThreadList held = *b;
  *b = *a;
  *a = held;
}
// Empties the list; the backing array is kept for reuse.
void inline clear(ThreadList *list) {
  list->n = 0;
}
// Returns the number of instructions in `prog`, counting the terminating END.
int proglen(Inst *prog) {
  const Inst *cur = prog;
  while (cur->op != END)
    ++cur;
  return (int)(cur - prog) + 1;
}
// Releases a heap-allocated thread list together with its backing array.
void inline free_list(ThreadList *list) {
  Thread *slots = list->t;
  free(list);
  free(slots);
}
// Runs `prog` over the bytes delivered by `it` until a thread reaches END.
//
// prog  - END-terminated program
// it    - byte source; reading stops at the first '\0' returned by next_byte
// saved - 40 capture slots used as the initial capture state of every thread
//         started at the program entry
//
// Returns 1 as soon as a thread reaches END, 0 when the input runs out first
// or when the thread lists cannot be allocated. Both thread lists are
// released on every path.
// NOTE(review): the winning thread's offsets are not written back to `saved`
// (addstate only has a private copy of them).
int next_match(Inst *prog, ByteIterator *it, uint32_t *saved) {
  const int len = proglen(prog);
  ThreadList *clist = (ThreadList *)malloc(sizeof(ThreadList));
  ThreadList *nlist = (ThreadList *)malloc(sizeof(ThreadList));
  if (clist) {
    // A pc is queued at most once, so one slot per instruction is enough.
    clist->t = (Thread *)malloc(sizeof(Thread) * len);
    clist->n = 0;
  }
  if (nlist) {
    nlist->t = (Thread *)malloc(sizeof(Thread) * len);
    nlist->n = 0;
  }
  if (!clist || !nlist || !clist->t || !nlist->t) {
    if (clist)
      free_list(clist);
    if (nlist)
      free_list(nlist);
    return false;
  }
  bool matched = false;
  int count = 0;
  addstate(prog, clist, thread(prog, saved), count);
  while (!matched) {
    const char sp = next_byte(it);
    if (sp == '\0')
      break;
    // NOTE(review): echoes every consumed byte to stdout; this looks like
    // debug output. Kept because callers may rely on it; confirm before
    // removing.
    printf("%c", sp);
    for (int i = 0; i < clist->n && !matched; i++) {
      Thread t = clist->t[i];
      switch (t.pc->op) {
      case MCH:
      case NMC: {
        const bool in_class = test_ranges(sp, t.pc->r.ranges, t.pc->r.len);
        const bool accepted = (t.pc->op == MCH) ? in_class : !in_class;
        // A rejected thread is replaced by a fresh one at the program entry.
        matched = accepted
                      ? addstate(prog, nlist, thread(t.pc + 1, t.saved), count)
                      : addstate(prog, nlist, thread(prog, saved), count);
        break;
      }
      case ANY:
        matched = addstate(prog, nlist, thread(t.pc + 1, t.saved), count);
        break;
      case END:
      case JMP:
      case FRK:
      case SVS:
      case SVE:
        break;
      }
    }
    if (matched)
      break; // stop before another byte is read from the iterator
    swap(clist, nlist);
    clear(nlist);
    count++;
  }
  free_list(clist);
  free_list(nlist);
  return matched; // false here means EOF was reached without a match
}
void print_program(Inst *program) {
Inst *p = program;
int i = 0;
while (1) {
printf("%3d: ", i);
switch (p->op) {
case JMP: {
int x = (int)(p->j.x);
printf("JMP -> %d\n", x);
break;
}
case FRK: {
int x = (int)(p->j.x);
int y = (int)(p->j.y);
printf("FRK -> %d , %d\n", x, y);
break;
}
case MCH: {
printf("MCH [");
for (int r = 0; r < p->r.len; r++) {
Range rr = p->r.ranges[r];
if (rr.start == rr.end)
printf("'%c'%s", rr.start, (r + 1 < p->r.len) ? ", " : "");
else
printf("'%c'-'%c'%s", rr.start, rr.end,
(r + 1 < p->r.len) ? ", " : "");
}
printf("]\n");
break;
}
case NMC: {
printf("NMC [");
for (int r = 0; r < p->r.len; r++) {
Range rr = p->r.ranges[r];
if (rr.start == rr.end)
printf("'%c'%s", rr.start, (r + 1 < p->r.len) ? ", " : "");
else
printf("'%c'-'%c'%s", rr.start, rr.end,
(r + 1 < p->r.len) ? ", " : "");
}
printf("]\n");
break;
}
case ANY:
printf("ANY\n");
break;
case SVS:
printf("SVS idx=%d\n", (unsigned char)p->idx);
break;
case SVE:
printf("SVE idx=%d\n", (unsigned char)p->idx);
break;
case END:
printf("END\n");
return;
default:
printf("UNKNOWN op=%d\n", p->op);
return;
}
p++;
i++;
}
}
// Parses `pattern` and compiles the resulting AST into a VM program.
// NOTE(review): the intermediate AST is not released here.
Inst *compile_regex(std::string pattern) {
  Exp *ast = regex_to_ast(pattern);
  Inst *program = compile_ast(ast);
  return program;
}
int __main() {
// Maunally compiled program for testing
char *buffer = (char *)malloc(29);
strcpy(buffer, "abcdabcdabcdabcdf");
// This loads all (excluding \0 put in by strcpy)
Knot *root = load(buffer, 17, optimal_chunk_size(12));
ByteIterator *it = begin_b_iter(root);
uint32_t saved[40];
for (int i = 0; i < 40; i++)
saved[i] = 0;
std::string pattern = "(abcd)+";
Inst *program = compile_regex(pattern);
print_program(program);
int result;
while ((result = next_match(program, it, saved))) {
printf("\nRES: %d\n", result);
for (int i = 0; i < 40; i++)
printf("%d, ", saved[i]);
}
free(program);
free(buffer);
free(it->it);
free(it);
free(root);
return 0;
}

387
src/rexambler.cpp Normal file
View File

@@ -0,0 +1,387 @@
#include "../headers/noose.hpp"
#include <cassert>
#include <cctype>
#include <climits>
#include <string>
#include <vector>
// Forward declarations for the mutually recursive parser routines below.
// Parses `seq ('|' seq)*`.
Exp *parse_alternation(Parser *p);
// Parses atoms until ')', '|' or the end of the pattern.
Exp *parse_sequence(Parser *p);
// Parses one atom followed by any number of ?, *, + or {x,y} modifiers.
Exp *parse_atom_with_modifiers(Parser *p);
// Parses a `[...]` class; the cursor must be on the '['.
Exp *parse_bracket_class(Parser *p);
// Allocates an empty (NONE), non-capturing node.
Exp *make_none() {
  Exp *node = new Exp();
  node->kind = ExpKind::NONE;
  node->capture = false;
  return node;
}
// Allocates a non-capturing node that matches any single character.
Exp *make_any() {
  Exp *node = new Exp();
  node->kind = ExpKind::ANY;
  node->capture = false;
  return node;
}
// Allocates a non-capturing character-class node holding a copy of `ranges`.
Exp *make_range(const std::vector<ExRange> &ranges) {
  Exp *node = new Exp();
  node->kind = ExpKind::RANGE;
  node->capture = false;
  // The vector lives inside a union, so it has to be constructed explicitly.
  ::new (static_cast<void *>(&node->ranges)) std::vector<ExRange>(ranges);
  return node;
}
// Allocates a character-class node made of the single interval [c, c].
Exp *make_range_single(char c, bool neg = false) {
  return make_range(std::vector<ExRange>(1, ExRange{neg, c, c}));
}
// Allocates a non-capturing alternation node `l | r`.
Exp *make_or(Exp *l, Exp *r) {
  Exp *node = new Exp();
  node->kind = ExpKind::OR;
  node->capture = false;
  node->opor = new OpOr();
  node->opor->left = l;
  node->opor->right = r;
  return node;
}
// Allocates a non-capturing concatenation node `l` followed by `r`.
Exp *make_seq(Exp *l, Exp *r) {
  Exp *node = new Exp();
  node->kind = ExpKind::SEQ;
  node->capture = false;
  node->opseq = new OpSeq();
  node->opseq->left = l;
  node->opseq->right = r;
  return node;
}
// Returns a deep copy of the tree rooted at `e`; a null input yields a NONE
// node. Every copied node is created non-capturing, so capture flags of the
// source tree are not carried over.
Exp *clone_exp(Exp *e) {
  if (e == nullptr)
    return make_none();
  switch (e->kind) {
  case ExpKind::ANY:
    return make_any();
  case ExpKind::RANGE:
    return make_range(e->ranges);
  case ExpKind::OR: {
    Exp *lhs = clone_exp(e->opor->left);
    Exp *rhs = clone_exp(e->opor->right);
    return make_or(lhs, rhs);
  }
  case ExpKind::SEQ: {
    Exp *lhs = clone_exp(e->opseq->left);
    Exp *rhs = clone_exp(e->opseq->right);
    return make_seq(lhs, rhs);
  }
  case ExpKind::NONE:
    break;
  }
  return make_none();
}
// Lookahead: returns the next unread character without consuming it, or '\0'
// at the end of the pattern.
inline char peek(Parser *p) {
  if (p->i >= p->s.size())
    return '\0';
  return p->s[p->i];
}
// Returns the next unread character and advances past it; returns '\0'
// without moving at the end of the pattern.
inline char consume(Parser *p) {
  if (p->i >= p->s.size())
    return '\0';
  const char c = p->s[p->i];
  p->i++;
  return c;
}
// Parses `pattern` into an AST. Parsing stops at the first character the
// grammar cannot consume; the result is never null.
Exp *regex_to_ast(std::string pattern) {
  Parser p(pattern);
  if (Exp *root = parse_alternation(&p))
    return root;
  return make_none();
}
// Parses `seq ('|' seq)*` and folds the alternatives into a left-leaning chain
// of OR nodes: ((a | b) | c).
Exp *parse_alternation(Parser *p) {
  Exp *tree = parse_sequence(p);
  while (peek(p) == '|') {
    consume(p); // '|'
    tree = make_or(tree, parse_sequence(p));
  }
  return tree;
}
// Parses consecutive atoms until ')', '|' or the end of the pattern and folds
// them into a left-leaning chain of SEQ nodes. An empty sequence yields NONE.
Exp *parse_sequence(Parser *p) {
  Exp *chain = nullptr;
  while (p->i < p->s.size()) {
    const char next = peek(p);
    if (next == ')' || next == '|')
      break;
    Exp *atom = parse_atom_with_modifiers(p);
    if (atom == nullptr)
      break;
    chain = chain ? make_seq(chain, atom) : atom;
  }
  return chain ? chain : make_none();
}
// Parses a single atom at the cursor:
//   (...)  capture group        [...]  bracket class
//   \x     escape (see below)   !      any character
//   .      anything but '\n'    other  the literal character
// Returns nullptr at the end of the pattern.
Exp *parse_atom(Parser *p) {
  if (p->i >= p->s.size())
    return nullptr;
  switch (peek(p)) {
  case '(': {
    consume(p); // '('
    Exp *group = parse_alternation(p);
    if (peek(p) == ')') // a missing ')' is tolerated
      consume(p);
    if (group == nullptr)
      group = make_none();
    group->capture = true; // the root of the group is the capturing node
    return group;
  }
  case '[':
    return parse_bracket_class(p);
  case '!':
    consume(p);
    return make_any();
  case '.':
    // '.' is the class [^\n]
    consume(p);
    return make_range({ExRange{true, '\n', '\n'}});
  case '\\':
    break; // handled below
  default:
    // plain literal -> single-character class
    return make_range_single(consume(p), false);
  }

  consume(p); // '\\'
  if (p->i >= p->s.size())
    return make_none(); // a trailing backslash matches nothing
  const char esc = consume(p);
  // Negated shorthands mark every interval with negate=true.
  switch (esc) {
  case 'd':
    return make_range({ExRange{false, '0', '9'}});
  case 'D':
    return make_range({ExRange{true, '0', '9'}});
  case 'w':
    return make_range({ExRange{false, 'a', 'z'}, ExRange{false, 'A', 'Z'},
                       ExRange{false, '0', '9'}, ExRange{false, '_', '_'}});
  case 'W':
    return make_range({ExRange{true, 'a', 'z'}, ExRange{true, 'A', 'Z'},
                       ExRange{true, '0', '9'}, ExRange{true, '_', '_'}});
  case 's':
    return make_range({ExRange{false, ' ', ' '}, ExRange{false, '\t', '\t'},
                       ExRange{false, '\r', '\r'}, ExRange{false, '\n', '\n'},
                       ExRange{false, '\v', '\v'}, ExRange{false, '\f', '\f'}});
  case 'S':
    return make_range({ExRange{true, 0, 0}, ExRange{true, ' ', ' '},
                       ExRange{true, '\t', '\t'}, ExRange{true, '\r', '\r'},
                       ExRange{true, '\n', '\n'}, ExRange{true, '\v', '\v'},
                       ExRange{true, '\f', '\f'}});
  default:
    // Any other escaped character (including "\.") is that literal character.
    return make_range_single(esc, false);
  }
}
// Parses a bracket class such as [a-z_], [^0-9] or [\w-]; the cursor must be
// on the '['. A leading '^' marks every collected interval as negated. Inside
// the class \d, \w and \s expand to their intervals and any other escaped
// character stands for itself. A missing ']' is tolerated.
Exp *parse_bracket_class(Parser *p) {
  assert(peek(p) == '[');
  consume(p); // '['
  const bool neg = (peek(p) == '^');
  if (neg)
    consume(p);
  std::vector<ExRange> ranges;
  const auto add = [&ranges, neg](char lo, char hi) {
    ranges.push_back(ExRange{neg, lo, hi});
  };
  for (;;) {
    if (p->i >= p->s.size() || peek(p) == ']')
      break;
    const char first = consume(p);
    if (first != '\\') {
      // "a-b" is a range unless the '-' is the last character before ']'.
      const bool dash_range = peek(p) == '-' && p->i + 1 < p->s.size() &&
                              p->s[p->i + 1] != ']';
      if (!dash_range) {
        add(first, first);
        continue;
      }
      consume(p); // '-'
      if (p->i >= p->s.size())
        break;
      const char last = consume(p);
      add(first, last);
      continue;
    }
    if (p->i >= p->s.size())
      break; // trailing backslash
    const char esc = consume(p);
    switch (esc) {
    case 'd':
      add('0', '9');
      break;
    case 'w':
      add('a', 'z');
      add('A', 'Z');
      add('0', '9');
      add('_', '_');
      break;
    case 's':
      add(' ', ' ');
      add('\t', '\t');
      add('\r', '\r');
      add('\n', '\n');
      add('\v', '\v');
      add('\f', '\f');
      break;
    default:
      add(esc, esc);
      break;
    }
  }
  if (peek(p) == ']')
    consume(p);
  return make_range(ranges);
}
// Reads an unsigned decimal number at the cursor.
//
// p   - parser state; every consecutive digit is consumed
// out - receives the value; untouched when no digit is present
//
// Returns true when at least one digit was read. Values that do not fit an
// int saturate at INT_MAX instead of overflowing (signed overflow is undefined
// behaviour).
bool parse_integer_opt(Parser *p, int &out) {
  const auto at_digit = [p]() {
    return p->i < p->s.size() && std::isdigit((unsigned char)peek(p)) != 0;
  };
  if (!at_digit())
    return false;
  int val = 0;
  while (at_digit()) {
    const int digit = consume(p) - '0';
    if (val > (INT_MAX - digit) / 10)
      val = INT_MAX; // would overflow: clamp, but keep consuming digits
    else
      val = val * 10 + digit;
  }
  out = val;
  return true;
}
// Builds `count` copies of (atom | NONE) joined left to right with SEQ nodes.
// Returns nullptr when `count` is not positive.
static Exp *optional_run(Exp *atom, int count) {
  Exp *run = nullptr;
  for (int k = 0; k < count; ++k) {
    Exp *maybe = make_or(clone_exp(atom), make_none());
    run = run ? make_seq(run, maybe) : maybe;
  }
  return run;
}

// Parses one atom and applies any modifiers that follow it, in order:
//   ?      -> (atom | NONE)
//   *      -> 20 optional copies
//   +      -> one copy followed by 20 optional copies
//   {x,y}  -> x copies followed by (y - x) optional copies, y clamped to 20
// A malformed "{...}" is left unconsumed and ends the modifier list.
// Returns nullptr when there is no atom to parse.
Exp *parse_atom_with_modifiers(Parser *p) {
  Exp *atom = parse_atom(p);
  if (atom == nullptr)
    return nullptr;
  for (;;) {
    const char mod = peek(p);
    if (mod == '?') {
      consume(p);
      atom = make_or(clone_exp(atom), make_none());
      continue;
    }
    if (mod == '*') {
      consume(p);
      Exp *run = optional_run(atom, 20);
      atom = run ? run : make_none();
      continue;
    }
    if (mod == '+') {
      consume(p);
      Exp *rest = optional_run(atom, 20);
      atom = rest ? make_seq(clone_exp(atom), rest) : clone_exp(atom);
      continue;
    }
    if (mod != '{')
      break;

    const size_t save = p->i; // rewind point for a malformed "{...}"
    consume(p);               // '{'
    int x = 0, y = -1;
    if (!parse_integer_opt(p, x) || peek(p) != ',') {
      p->i = save;
      break;
    }
    consume(p); // ','
    if (!parse_integer_opt(p, y) || peek(p) != '}') {
      p->i = save;
      break;
    }
    consume(p); // '}'
    if (y < x)
      y = x;
    if (y > 20)
      y = 20; // upper bound of the expansion
    Exp *prefix = nullptr;
    for (int k = 0; k < x; ++k) {
      Exp *copy = clone_exp(atom);
      prefix = prefix ? make_seq(prefix, copy) : copy;
    }
    Exp *suffix = optional_run(atom, y - x);
    if (prefix == nullptr)
      prefix = make_none();
    atom = suffix ? make_seq(prefix, suffix) : prefix;
  }
  return atom;
}

149
src/rexpiler.cpp Normal file
View File

@@ -0,0 +1,149 @@
#include "../headers/noose.hpp"
#include <assert.h>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <stdio.h>
// Growable instruction buffer used while compiling an AST.
struct InstList {
// Heap array of instructions (grown with realloc).
Inst *data;
// Number of instructions stored in `data`.
size_t len;
// Number of instructions `data` has room for.
size_t cap;
// Capture-group counter used when emitting SVS/SVE; compile_ast starts it at 1.
int idx;
};
static inline bool is_ranges_negated(std::vector<ExRange> r) {
if (r.empty())
return false;
return r[0].negate;
}
// Appends a copy of `*inst` to `list`, growing the buffer when it is full.
//
// Takes ownership of `inst`: it must be a heap instruction obtained from
// make_inst and must not be used after the call. Every caller in this file
// hands over a freshly made instruction, which used to be leaked.
// Aborts when the buffer cannot be grown.
static void insert_inst(InstList *list, Inst *inst) {
  if (list->len >= list->cap) {
    const size_t nc = list->cap ? list->cap * 2 : 32;
    // Keep the old pointer until realloc is known to have succeeded.
    Inst *grown = (Inst *)realloc(list->data, nc * sizeof(Inst));
    if (!grown) {
      assert(0);
      abort(); // still fatal when asserts are compiled out
    }
    list->data = grown;
    list->cap = nc;
  }
  list->data[list->len++] = *inst;
  free(inst);
}
// Allocates a zero-filled instruction with opcode `op`.
// An allocation failure trips an assertion.
Inst *make_inst(Op op) {
  Inst *fresh = static_cast<Inst *>(calloc(1, sizeof(Inst)));
  if (fresh == nullptr)
    assert(0);
  fresh->op = op;
  return fresh;
}
// Forward declaration: compile_seq and compile_or recurse through it.
void compile_exp(Exp *e, InstList *list);
// Emits an ANY instruction for `e`, wrapped in SVS/SVE when `e` captures.
//
// The group number is taken with a post-increment of list->idx, the same
// scheme compile_seq and compile_or use. The previous pre-increment made the
// numbering disagree with them, so groups could collide or skip a slot.
void compile_any(Exp *e, InstList *list) {
  if (!e->capture) {
    insert_inst(list, make_inst(ANY));
    return;
  }
  const int group = list->idx++;
  Inst *open = make_inst(SVS);
  open->idx = group;
  insert_inst(list, open);
  insert_inst(list, make_inst(ANY));
  Inst *close = make_inst(SVE);
  close->idx = group;
  insert_inst(list, close);
}
// Emits a MCH (or NMC for a negated class) instruction for the character class
// in `e`, wrapped in SVS/SVE when `e` captures.
//
// The intervals are copied into a malloc'ed Range array referenced by the
// instruction. The group number is taken with a post-increment of list->idx,
// the same scheme compile_seq and compile_or use. The previous pre-increment
// made the numbering disagree with them.
void compile_range(Exp *e, InstList *list) {
  const std::vector<ExRange> &er = e->ranges; // no copy needed, read-only
  const size_t cnt = er.size();
  Range *arr = nullptr;
  if (cnt > 0) {
    arr = (Range *)malloc(sizeof(Range) * cnt);
    if (!arr) {
      assert(0);
      abort(); // still fatal when asserts are compiled out
    }
  }
  size_t w = 0;
  for (const ExRange &cur : er)
    arr[w++] = Range{cur.start, cur.end};
  Inst *I = make_inst(is_ranges_negated(er) ? NMC : MCH);
  I->r.ranges = arr;
  I->r.len = (int)cnt;
  if (!e->capture) {
    insert_inst(list, I);
    return;
  }
  const int group = list->idx++;
  Inst *open = make_inst(SVS);
  open->idx = group;
  insert_inst(list, open);
  insert_inst(list, I);
  Inst *close = make_inst(SVE);
  close->idx = group;
  insert_inst(list, close);
}
// Emits the code for `left` followed by `right`, wrapped in SVS/SVE when `e`
// captures.
//
// The group number is remembered locally. Deriving it again from list->idx
// after the children were compiled was wrong whenever a nested group had
// advanced the counter in between.
void compile_seq(Exp *e, InstList *list) {
  int group = -1;
  if (e->capture) {
    group = list->idx++;
    Inst *open = make_inst(SVS);
    open->idx = group;
    insert_inst(list, open);
  }
  compile_exp(e->opseq->left, list);
  compile_exp(e->opseq->right, list);
  if (e->capture) {
    Inst *close = make_inst(SVE);
    close->idx = group;
    insert_inst(list, close);
  }
}
// Emits the code for an alternation, wrapped in SVS/SVE when `e` captures:
//
//        FRK L1, L2
//   L1:  <left>
//        JMP L3
//   L2:  <right>
//   L3:
//
// The group number is remembered locally. Deriving it again from list->idx
// after the branches were compiled was wrong whenever a nested group had
// advanced the counter in between.
void compile_or(Exp *e, InstList *list) {
  int group = -1;
  if (e->capture) {
    group = list->idx++;
    Inst *open = make_inst(SVS);
    open->idx = group;
    insert_inst(list, open);
  }
  // Targets are patched through list->data once they are known; the buffer
  // may be reallocated while the branches are compiled.
  Inst *frk = make_inst(FRK);
  frk->j.x = -1;
  frk->j.y = -1;
  const size_t frk_idx = list->len;
  insert_inst(list, frk);
  const size_t left_start = list->len;
  compile_exp(e->opor->left, list);
  const size_t jmp_idx = list->len;
  insert_inst(list, make_inst(JMP));
  const size_t right_start = list->len;
  compile_exp(e->opor->right, list);
  list->data[frk_idx].j.x = (int)left_start;
  list->data[frk_idx].j.y = (int)right_start;
  list->data[jmp_idx].j.x = (int)list->len;
  if (e->capture) {
    Inst *close = make_inst(SVE);
    close->idx = group;
    insert_inst(list, close);
  }
}
// Dispatches on the node kind and appends the matching instructions to `list`.
// A NONE node emits nothing.
void compile_exp(Exp *e, InstList *list) {
  const ExpKind kind = e->kind;
  if (kind == ExpKind::ANY)
    compile_any(e, list);
  else if (kind == ExpKind::RANGE)
    compile_range(e, list);
  else if (kind == ExpKind::SEQ)
    compile_seq(e, list);
  else if (kind == ExpKind::OR)
    compile_or(e, list);
}
// Compiles the AST rooted at `root` into a heap-allocated program terminated
// by an END instruction. Capture numbering starts at 1.
Inst *compile_ast(Exp *root) {
  InstList list;
  list.data = nullptr;
  list.len = 0;
  list.cap = 0;
  list.idx = 1;
  compile_exp(root, &list);
  Inst *terminator = make_inst(END);
  insert_inst(&list, terminator);
  return list.data;
}

853
src/rope.cpp Normal file
View File

@@ -0,0 +1,853 @@
#include "../headers/rope.hpp"
#include "../headers/noose.hpp"
#include <assert.h>
#include <cmath>
#include <cstdint>
#include <cstdlib>
#include <cstring>
#include <pcre2.h>
#include <stdio.h>
#include <string.h>
// Recomputes the cached byte count, line count, depth and chunk size of an
// interior node from its children. Null nodes and leaves (depth == 0) are left
// untouched.
static void update(Knot *n) {
  if (!n || n->depth == 0)
    return;
  const Knot *l = n->left;
  const Knot *r = n->right;
  n->char_count = (l ? l->char_count : 0) + (r ? r->char_count : 0);
  n->line_count = (l ? l->line_count : 0) + (r ? r->line_count : 0);
  const uint8_t left_depth = l ? l->depth : 0;
  const uint8_t right_depth = r ? r->depth : 0;
  n->depth = MAX(left_depth, right_depth) + 1;
  // Inherit the chunk size from whichever child exists; with no children the
  // previous value is kept instead of dereferencing a null pointer.
  if (l)
    n->chunk_size = l->chunk_size;
  else if (r)
    n->chunk_size = r->chunk_size;
}
// Builds a rope from `len` bytes of `str`.
//
// str is not consumed and '\0' is not treated specially, so for a null
// terminated string len must be strlen(str); the caller keeps ownership of
// str. chunk_size must be positive.
//
// Returns the root of the new rope, or nullptr when an allocation fails or
// when chunk_size is 0 while there is text to store (that combination could
// never terminate). Sub-ropes that were already built are released before
// nullptr is returned.
Knot *load(char *str, uint32_t len, uint32_t chunk_size) {
  if (chunk_size == 0 && len > 0)
    return nullptr;
  // Leaves are filled to at most 15/16 of the chunk so later inserts have room.
  if (len > (uint32_t)(chunk_size - (chunk_size / 16))) {
    const uint32_t half = len / 2;
    Knot *left = load(str, half, chunk_size);
    if (!left)
      return nullptr;
    Knot *right = load(str + half, len - half, chunk_size);
    if (!right) {
      free_rope(left);
      return nullptr;
    }
    Knot *node = (Knot *)malloc(sizeof(Knot));
    if (!node) {
      free_rope(left);
      free_rope(right);
      return nullptr;
    }
    node->left = left;
    node->right = right;
    node->chunk_size = chunk_size;
    node->depth = MAX(left->depth, right->depth) + 1;
    node->char_count = left->char_count + right->char_count;
    node->line_count = left->line_count + right->line_count;
    return node;
  }
  Knot *node = (Knot *)malloc(sizeof(Knot) + chunk_size);
  if (!node)
    return nullptr;
  node->left = nullptr;
  node->right = nullptr;
  node->chunk_size = chunk_size;
  node->depth = 0;
  node->char_count = len;
  uint32_t newline_count = 0;
  for (uint32_t i = 0; i < len; i++) {
    const char c = str[i];
    node->data[i] = c;
    if (c == '\n')
      newline_count++;
  }
  node->line_count = newline_count;
  return node;
}
// Splits a leaf into two new leaves at byte offset k.
//
// leaf is consumed and freed (do not use or free it afterwards).
// *left receives the first k bytes, *right the remaining ones; both keep the
// chunk size of the original leaf. k is clamped to the leaf's byte count so an
// oversized offset cannot read past the stored text or underflow the right
// side's length.
// NOTE(review): allocation failures are not handled here; the function has no
// way to report them to its callers.
static void split_leaf(Knot *leaf, uint32_t k, Knot **left, Knot **right) {
  if (k > leaf->char_count)
    k = leaf->char_count;

  Knot *left_node = (Knot *)malloc(sizeof(Knot) + leaf->chunk_size);
  left_node->left = nullptr;
  left_node->right = nullptr;
  left_node->chunk_size = leaf->chunk_size;
  left_node->depth = 0;
  left_node->char_count = k;
  uint32_t newline_count = 0;
  for (uint32_t i = 0; i < k; i++) {
    const char c = leaf->data[i];
    left_node->data[i] = c;
    if (c == '\n')
      newline_count++;
  }
  left_node->line_count = newline_count;
  *left = left_node;

  Knot *right_node = (Knot *)malloc(sizeof(Knot) + leaf->chunk_size);
  right_node->left = nullptr;
  right_node->right = nullptr;
  right_node->chunk_size = leaf->chunk_size;
  right_node->depth = 0;
  right_node->char_count = leaf->char_count - k;
  // Kept in 32 bits like Knot::line_count (a 16-bit temporary truncated it).
  right_node->line_count = leaf->line_count - newline_count;
  for (uint32_t i = k; i < leaf->char_count; i++)
    right_node->data[i - k] = leaf->data[i];
  *right = right_node;

  free(leaf);
}
// Splits the rope at byte `offset`. *left receives everything before the
// offset and *right everything from it on. node is restructured in place (or
// freed, for a leaf), so do not use or free it afterwards. A null node yields
// two null halves.
void split(Knot *node, uint32_t offset, Knot **left, Knot **right) {
  if (node == nullptr) {
    *left = nullptr;
    *right = nullptr;
    return;
  }
  if (node->depth == 0) {
    split_leaf(node, offset, left, right);
    return;
  }
  const uint32_t left_size = node->left ? node->left->char_count : 0;
  Knot *lo = nullptr;
  Knot *hi = nullptr;
  if (offset >= left_size) {
    // The cut falls in the right child: node keeps its left child plus the
    // front part of the right one.
    split(node->right, offset - left_size, &lo, &hi);
    node->right = lo;
    update(node);
    *left = node;
    *right = hi;
    return;
  }
  // The cut falls in the left child: node keeps the back part of the left
  // child plus its right child.
  split(node->left, offset, &lo, &hi);
  node->left = hi;
  update(node);
  *right = node;
  *left = lo;
}
// Depth of the left child minus depth of the right child; missing children
// and a null node count as depth 0.
static inline int get_balance_factor(Knot *n) {
  if (n == nullptr)
    return 0;
  const int left_depth = (int)DEPTH(n->left);
  const int right_depth = (int)DEPTH(n->right);
  return left_depth - right_depth;
}
// Right rotation: the left child of `y` becomes the new subtree root and `y`
// becomes its right child. Returns the new root.
static inline Knot *rotate_right(Knot *y) {
  Knot *pivot = y->left;
  Knot *moved = pivot->right; // subtree that changes parent
  y->left = moved;
  pivot->right = y;
  update(y); // the lower node first, its new parent depends on it
  update(pivot);
  return pivot;
}
// Left rotation: the right child of `x` becomes the new subtree root and `x`
// becomes its left child. Returns the new root.
static inline Knot *rotate_left(Knot *x) {
  Knot *pivot = x->right;
  Knot *moved = pivot->left; // subtree that changes parent
  x->right = moved;
  pivot->left = x;
  update(x); // the lower node first, its new parent depends on it
  update(pivot);
  return pivot;
}
// Refreshes the cached fields of `n` and restores the depth balance with at
// most two rotations. Returns the root of the balanced subtree; use it in
// place of `n`.
Knot *balance(Knot *n) {
  update(n);
  const int bal = get_balance_factor(n);
  if (bal >= -1 && bal <= 1)
    return n;
  if (bal > 1) {
    // Left-heavy; straighten a left-right shape first.
    if (get_balance_factor(n->left) < 0)
      n->left = rotate_left(n->left);
    return rotate_right(n);
  }
  // Right-heavy; straighten a right-left shape first.
  if (get_balance_factor(n->right) > 0)
    n->right = rotate_right(n->right);
  return rotate_left(n);
}
// Joins two ropes and returns the root of the result.
//
// Do not use or free left or right after the call (only the return value).
// Both ropes are assumed to have the same chunk size.
// - A null side returns the other side unchanged.
// - An empty side is freed and the other side is returned.
// - Two leaves that fit into one chunk are merged into a single new leaf.
// - Otherwise the shallower rope is attached at a matching depth inside the
//   deeper one, or a new parent node is created.
// Returns nullptr when the parent node cannot be allocated.
// NOTE(review): in that case left and right are not released.
Knot *concat(Knot *left, Knot *right) {
  if (!left)
    return right;
  if (!right)
    return left;
  if (left->char_count == 0) {
    free_rope(left);
    return right;
  }
  if (right->char_count == 0) {
    free_rope(right);
    return left;
  }
  if (left->depth == 0 && right->depth == 0 &&
      left->char_count + right->char_count <= left->chunk_size) {
    Knot *merged = (Knot *)malloc(sizeof(Knot) + left->chunk_size);
    if (merged) {
      merged->left = nullptr;
      merged->right = nullptr;
      merged->chunk_size = left->chunk_size;
      merged->depth = 0;
      merged->char_count = left->char_count + right->char_count;
      merged->line_count = left->line_count + right->line_count;
      memcpy(merged->data, left->data, left->char_count);
      memcpy(merged->data + left->char_count, right->data, right->char_count);
      free(left);
      free(right);
      return merged;
    }
    // Allocation failed: fall through and join the leaves under a parent.
  }
  const int d_left = left->depth;
  const int d_right = right->depth;
  if (d_left > d_right + 1) {
    left->right = concat(left->right, right);
    return balance(left);
  }
  if (d_right > d_left + 1) {
    right->left = concat(left, right->left);
    return balance(right);
  }
  Knot *node = (Knot *)malloc(sizeof(Knot));
  if (!node)
    return nullptr;
  node->left = left;
  node->right = right;
  node->chunk_size = left->chunk_size;
  // update() skips nodes whose depth is 0, so seed a non-zero depth first.
  node->depth = MAX(d_left, d_right) + 1;
  update(node);
  return node;
}
// Inserts `len` bytes of `str` at byte `offset` of the rope.
//
// node is restructured, so do not use or free it afterwards; use (and
// eventually free) the return value instead. Returns nullptr for a null node.
// Inside a leaf, an offset past the stored text is clamped to its end.
Knot *insert(Knot *node, uint32_t offset, char *str, uint32_t len) {
  if (!node)
    return nullptr;
  if (node->depth == 0) {
    if (offset > node->char_count)
      offset = node->char_count;
    // Written as a subtraction so a huge len cannot wrap the comparison.
    const bool fits = node->char_count <= node->chunk_size &&
                      len <= node->chunk_size - node->char_count;
    if (fits) {
      if (offset < node->char_count)
        memmove(node->data + offset + len, node->data + offset,
                node->char_count - offset);
      memcpy(node->data + offset, str, len);
      node->char_count += len;
      for (uint32_t i = 0; i < len; i++)
        if (str[i] == '\n')
          node->line_count++;
      return node;
    }
  }
  if (node->depth > 0) {
    const uint32_t left_count = node->left ? node->left->char_count : 0;
    if (offset < left_count)
      node->left = insert(node->left, offset, str, len);
    else
      node->right = insert(node->right, offset - left_count, str, len);
    return balance(node); // balance() refreshes the cached counts itself
  }
  // Leaf without enough room: cut it at the offset and splice a freshly
  // loaded rope in between. split() frees the leaf, so its chunk size has to
  // be read beforehand (it used to be read from the freed node).
  const uint32_t chunk_size = node->chunk_size;
  Knot *left_part = nullptr;
  Knot *right_part = nullptr;
  split(node, offset, &left_part, &right_part);
  Knot *middle_part = load(str, len, chunk_size);
  return concat(concat(left_part, middle_part), right_part);
}
// Removes `len` bytes starting at byte `offset` of the rope.
//
// node is restructured, so do not use or free it afterwards; use (and
// eventually free) the return value instead. A null node, a zero length or an
// offset at or past the end returns node unchanged; a range running past the
// end is shortened to end there.
Knot *erase(Knot *node, uint32_t offset, uint32_t len) {
  if (!node || len == 0 || offset >= node->char_count)
    return node;
  // offset < char_count holds here, so the subtraction cannot underflow,
  // whereas `offset + len` could wrap around for a very large len.
  if (len > node->char_count - offset)
    len = node->char_count - offset;
  if (node->depth == 0) {
    uint32_t deleted_newlines = 0;
    for (uint32_t i = offset; i < offset + len; i++)
      if (node->data[i] == '\n')
        deleted_newlines++;
    node->line_count -= deleted_newlines;
    if (offset + len < node->char_count)
      memmove(node->data + offset, node->data + offset + len,
              node->char_count - (offset + len));
    node->char_count -= len;
    return node;
  }
  const uint32_t left_count = node->left ? node->left->char_count : 0;
  if (offset + len <= left_count) {
    node->left = erase(node->left, offset, len);
  } else if (offset >= left_count) {
    node->right = erase(node->right, offset - left_count, len);
  } else {
    // The range straddles both children: cut it out and rejoin the rest.
    Knot *left = nullptr, *middle = nullptr, *right = nullptr;
    split(node, offset, &left, &right);
    split(right, len, &middle, &right);
    free_rope(middle);
    return concat(left, right);
  }
  update(node);
  return balance(node);
}
// Copies `len` bytes starting at byte `offset` of the subtree `node` into
// `dest`. No bounds checking is done here; read() clamps the range first.
static void _read_into(Knot *node, uint32_t offset, uint32_t len, char *dest) {
  // Recurse into left subtrees, iterate down the right spine.
  while (node != nullptr && len != 0) {
    if (node->depth == 0) {
      memcpy(dest, node->data + offset, len);
      return;
    }
    Knot *lhs = node->left;
    const uint32_t lhs_chars = lhs ? lhs->char_count : 0;
    if (offset >= lhs_chars) {
      // Range starts in the right child: rebase the offset.
      offset -= lhs_chars;
    } else {
      // Take whatever part of the range the left child covers.
      const uint32_t in_left = lhs_chars - offset;
      const uint32_t take = in_left > len ? len : in_left;
      _read_into(lhs, offset, take, dest);
      dest += take;
      len -= take;
      offset = 0;
    }
    node = node->right;
  }
}
// Returns a freshly malloc'd, NUL-terminated copy of up to `len` bytes
// starting at byte `offset`; the caller must free() it.
// `len` is clamped to the bytes available. An offset at or past the end
// yields an empty string. Returns nullptr when root is null or malloc fails.
char *read(Knot *root, uint32_t offset, uint32_t len) {
  if (!root)
    return nullptr;
  if (offset >= root->char_count) {
    char *empty = (char *)malloc(1);
    if (empty)
      empty[0] = '\0';
    return empty;
  }
  // offset < char_count here, so the subtraction cannot wrap; `offset + len`
  // can, which used to bypass the clamp and overrun the buffer.
  if (len > root->char_count - offset)
    len = root->char_count - offset;
  char *buffer = (char *)malloc((size_t)len + 1);
  if (!buffer)
    return nullptr;
  _read_into(root, offset, len, buffer);
  buffer[len] = '\0';
  return buffer;
}
// Releases every node of the tree rooted at `root` (null is a no-op).
// Call it once, at the end of the rope's life, on the pointer returned by the
// last insert / concat / erase call. After a split, call it for both the
// left and the right result.
void free_rope(Knot *root) {
  if (root == nullptr)
    return;
  Knot *lhs = root->left;
  Knot *rhs = root->right;
  free_rope(lhs);
  free_rope(rhs);
  free(root);
}
// Returns the byte offset (relative to `node`) of the newline with 0-based
// index `n`, or UINT32_MAX when the subtree has no such newline.
static uint32_t find_nth_newline_offset(Knot *node, uint32_t n) {
  uint32_t base = 0; // bytes skipped in left siblings on the way down
  while (node && n <= node->line_count) {
    if (node->depth == 0) {
      uint32_t to_skip = n;
      for (uint32_t pos = 0; pos < node->char_count; ++pos) {
        if (node->data[pos] != '\n')
          continue;
        if (to_skip == 0)
          return base + pos;
        --to_skip;
      }
      return UINT32_MAX;
    }
    Knot *lhs = node->left;
    const uint32_t lhs_lines = lhs ? lhs->line_count : 0;
    if (n < lhs_lines) {
      node = lhs;
    } else {
      // Skip the whole left subtree and continue in the right one.
      n -= lhs_lines;
      base += lhs ? lhs->char_count : 0;
      node = node->right;
    }
  }
  return UINT32_MAX;
}
// Returns the number of newlines located before byte `offset`, i.e. the
// 0-based line that byte belongs to. A null node gives 0; an offset at or
// past the end gives the subtree's total line_count.
uint32_t byte_to_line(Knot *node, uint32_t offset) {
  uint32_t lines = 0; // newlines in subtrees already skipped
  while (node) {
    if (offset >= node->char_count)
      return lines + node->line_count;
    if (node->depth == 0) {
      // offset < char_count here, so scanning [0, offset) stays in bounds.
      for (uint32_t pos = 0; pos < offset; ++pos)
        lines += (node->data[pos] == '\n') ? 1u : 0u;
      return lines;
    }
    Knot *lhs = node->left;
    const uint32_t lhs_chars = lhs ? lhs->char_count : 0;
    if (offset < lhs_chars) {
      node = lhs;
      continue;
    }
    lines += lhs ? lhs->line_count : 0;
    offset -= lhs_chars;
    node = node->right;
  }
  return lines;
}
// Returns the byte offset at which 0-based `line` starts. When `out_len` is
// non-null it receives the line's length in bytes, including its trailing
// '\n' if it has one. A line past the last one starts at char_count with
// length 0; a null node gives offset 0 and length 0.
uint32_t line_to_byte(Knot *node, uint32_t line, uint32_t *out_len) {
  if (node == nullptr) {
    if (out_len != nullptr)
      *out_len = 0;
    return 0;
  }
  // The line begins right after the previous line's newline.
  uint32_t first = 0;
  if (line != 0) {
    const uint32_t before = find_nth_newline_offset(node, line - 1);
    first = (before == UINT32_MAX) ? node->char_count : before + 1;
  }
  // It ends just past its own newline, or at the end of the rope.
  const uint32_t own = find_nth_newline_offset(node, line);
  const uint32_t past = (own == UINT32_MAX) ? node->char_count : own + 1;
  if (out_len != nullptr)
    *out_len = (past > first) ? past - first : 0;
  return first;
}
// Creates a line iterator positioned at the start of 0-based `start_line`.
// The iterator is malloc'd; the caller releases it with free().
// Returns nullptr when root is null, start_line > root->line_count,
// allocation fails, or no leaf could be reached.
// The iterator's stack holds the path from the root down to the current
// leaf, which iter_advance_leaf() relies on.
LineIterator *begin_l_iter(Knot *root, uint32_t start_line) {
  if (!root || start_line > root->line_count)
    return nullptr;
  LineIterator *it = (LineIterator *)malloc(sizeof(LineIterator));
  if (!it)
    return nullptr;
  it->top = 0;
  it->line = start_line;
  it->node = nullptr;
  it->offset = 0;
  // Line N (N > 0) starts right after the N-th newline (1-based), so descend
  // to the leaf that holds that newline. `remaining` is how many newlines
  // still have to be skipped inside the current subtree.
  uint32_t remaining = start_line;
  Knot *curr = root;
  while (curr) {
    it->stack[it->top++] = curr;
    if (!curr->left && !curr->right) {
      it->node = curr;
      break;
    }
    uint32_t left_lines = curr->left ? curr->left->line_count : 0;
    // `<=` (not `<`): when the wanted newline is the last one of the left
    // subtree the line still begins there. With remaining == 0 this walks
    // to the leftmost leaf, which also covers a root that is itself a leaf.
    if (curr->left && remaining <= left_lines) {
      curr = curr->left;
    } else {
      remaining -= left_lines;
      curr = curr->right;
    }
  }
  if (!it->node) {
    free(it);
    return nullptr;
  }
  if (remaining > 0) {
    // Position just past the `remaining`-th newline of this leaf. If that
    // is the end of the leaf, next_line() moves on to the following leaf.
    uint32_t found_newlines = 0;
    for (uint32_t i = 0; i < it->node->char_count; i++) {
      if (it->node->data[i] == '\n' && ++found_newlines == remaining) {
        it->offset = i + 1;
        break;
      }
    }
  }
  return it;
}
// Moves `it` to the next leaf in left-to-right order and resets its offset
// to 0. Sets it->node to nullptr when there is no further leaf.
static inline void iter_advance_leaf(LineIterator *it) {
  if (it->top == 0) {
    it->node = nullptr;
    return;
  }
  // Climb until a node is found that was entered through its left child
  // and still has a right child to visit.
  Knot *child = it->stack[--it->top];
  for (; it->top > 0; child = it->stack[--it->top]) {
    Knot *above = it->stack[it->top - 1];
    if (above->left != child || !above->right)
      continue;
    // Descend to the first leaf of that right subtree, preferring left.
    for (Knot *walk = above->right; walk;
         walk = walk->left ? walk->left : walk->right) {
      it->stack[it->top++] = walk;
      if (!walk->left && !walk->right) {
        it->node = walk;
        it->offset = 0;
        return;
      }
    }
  }
  it->node = nullptr;
}
// Returns the next line as a malloc'd, NUL-terminated string that includes
// the trailing '\n' when the line has one; the caller must free() it.
// Returns nullptr when the iterator is null or exhausted, or when an
// allocation fails. Each returned line increments it->line.
char *next_line(LineIterator *it) {
  if (it == nullptr || it->node == nullptr)
    return nullptr;
  size_t cap = 128;
  size_t used = 0;
  char *out = (char *)malloc(cap);
  if (out == nullptr)
    return nullptr;
  while (it->node) {
    // Current leaf fully consumed: step to the next one, if any.
    if (it->offset >= it->node->char_count) {
      iter_advance_leaf(it);
      if (it->node == nullptr)
        break;
    }
    char *from = it->node->data + it->offset;
    char *tail = it->node->data + it->node->char_count;
    char *nl = (char *)memchr(from, '\n', tail - from);
    const bool line_done = (nl != nullptr);
    // Take up to and including the newline, or the rest of the leaf.
    const size_t take =
        line_done ? (size_t)(nl - from) + 1 : (size_t)(tail - from);
    // Keep one spare byte for the terminating NUL.
    if (used + take + 1 > cap) {
      cap = (cap * 2) + take;
      char *grown = (char *)realloc(out, cap);
      if (grown == nullptr) {
        free(out);
        return nullptr;
      }
      out = grown;
    }
    memcpy(out + used, from, take);
    used += take;
    it->offset += take;
    if (line_done) {
      out[used] = '\0';
      it->line++;
      return out;
    }
  }
  // Ran out of leaves: hand back a final line without a newline, if any.
  if (used == 0) {
    free(out);
    return nullptr;
  }
  out[used] = '\0';
  it->line++;
  return out;
}
// Creates a leaf iterator positioned on the first leaf of `root`.
// The iterator is malloc'd; the caller releases it with free().
// The stack is left holding the path from the root down to that leaf.
// Returns nullptr when root is null or allocation fails.
LeafIterator *begin_k_iter(Knot *root) {
  if (!root)
    return nullptr;
  LeafIterator *it = (LeafIterator *)malloc(sizeof(LeafIterator));
  if (!it)
    return nullptr;
  it->top = 0;
  it->node = nullptr;
  // Walk down preferring the left child and falling back to the right one.
  // The previous pop/re-push sequence with its dead stores did exactly this.
  for (Knot *curr = root; curr; curr = curr->left ? curr->left : curr->right) {
    it->stack[it->top++] = curr;
    if (!curr->left && !curr->right) {
      it->node = curr;
      return it;
    }
  }
  free(it);
  return nullptr;
}
// Returns the current leaf's data, NUL-terminated in place, and advances the
// iterator to the next leaf. Returns nullptr when the iterator is null or
// has no leaf left.
// Caller must never free the returned string: it points into the rope.
// NOTE(review): the terminator is written at data[char_count]; this assumes
// every leaf buffer has one spare byte past its contents - confirm in load().
char *next_leaf(LeafIterator *it) {
  if (!it || !it->node)
    return nullptr;
  char *data_to_return = it->node->data;
  data_to_return[it->node->char_count] = '\0';
  while (it->top > 0) {
    Knot *popped = it->stack[--it->top];
    // Only a node with both children has an unvisited right subtree: it was
    // pushed on the way into its left child and is popped before its right
    // child is entered. A leaf has nothing below it, and a right-only node
    // was pushed while entering that very right subtree, so descending into
    // it again would replay leaves that were already returned.
    if (!popped->left || !popped->right)
      continue;
    for (Knot *curr = popped->right; curr;
         curr = curr->left ? curr->left : curr->right) {
      it->stack[it->top++] = curr;
      if (!curr->left && !curr->right) {
        it->node = curr;
        return data_to_return;
      }
    }
  }
  it->node = nullptr;
  return data_to_return;
}
// Creates a byte iterator over `root`, layered on a leaf iterator.
// Both objects are malloc'd; release them with free(b_it->it) and free(b_it).
// Returns nullptr when the ByteIterator itself cannot be allocated. When the
// leaf iterator cannot be created (e.g. null root) `it` is left null.
ByteIterator *begin_b_iter(Knot *root) {
  ByteIterator *b_it = (ByteIterator *)malloc(sizeof(ByteIterator));
  if (!b_it)
    return nullptr;
  b_it->it = begin_k_iter(root);
  b_it->offset_g = 0;
  b_it->offset_l = 0;
  b_it->char_count = 0;
  b_it->data = nullptr; // first leaf is fetched lazily by next_byte()
  return b_it;
}
// Returns the next byte of the rope, or '\0' once every leaf is consumed.
// offset_g counts the bytes of fully consumed leaves and offset_l the bytes
// consumed from the current leaf.
// Leaf length is taken with strlen(), so a NUL byte inside a leaf cuts that
// leaf short, and '\0' doubles as the end marker.
char next_byte(ByteIterator *it) {
  if (it->data && it->offset_l < it->char_count)
    return it->data[it->offset_l++];
  // Current leaf is used up: fold its length into the global offset.
  it->offset_g += it->offset_l;
  it->offset_l = 0;
  char *data;
  // next_leaf() returns nullptr at the end, so test before strlen().
  while ((data = next_leaf(it->it)) != nullptr) {
    it->char_count = strlen(data);
    if (it->char_count == 0)
      continue; // an empty leaf must not be mistaken for end-of-data
    it->data = data;
    it->offset_l = 1;
    return data[0];
  }
  it->data = nullptr;
  it->char_count = 0;
  return '\0';
}
// Runs the PCRE2 DFA matcher over the rope leaf by leaf and returns every
// match as an (absolute byte offset, length) pair.
// A match that runs off the end of a leaf is reported as partial and is
// continued in the next leaf with PCRE2_DFA_RESTART.
// Returns an empty vector when the pattern does not compile, when the match
// data or leaf iterator cannot be created, or when nothing matches.
// NOTE(review): when a restarted match fails, start positions inside the
// previous leaf that lie after the partial match start are not retried.
std::vector<std::pair<size_t, size_t>> search_rope(Knot *root,
                                                   const char *pattern) {
  std::vector<std::pair<size_t, size_t>> results;
  int errorcode;
  PCRE2_SIZE erroffset;
  pcre2_code *re = pcre2_compile((PCRE2_SPTR)pattern, PCRE2_ZERO_TERMINATED, 0,
                                 &errorcode, &erroffset, nullptr);
  if (!re) {
    fprintf(stderr, "PCRE2 compile error: %d\n", errorcode);
    return results;
  }
  pcre2_match_data *mdata = pcre2_match_data_create(128, nullptr);
  // The workspace must stay the same array across restarted calls.
  int workspace[PCRE_WORKSPACE_SIZE];
  LeafIterator *it = mdata ? begin_k_iter(root) : nullptr;
  if (!mdata || !it) {
    if (mdata)
      pcre2_match_data_free(mdata);
    pcre2_code_free(re);
    return results;
  }
  size_t chunk_abs_offset = 0;  // absolute offset of the current leaf
  size_t saved_match_start = 0; // absolute start of a match spanning leaves
  bool match_in_progress = false;
  int flags = PCRE2_PARTIAL_SOFT;
  while (1) {
    const char *chunk_start = next_leaf(it);
    if (!chunk_start)
      break;
    size_t chunk_len = strlen(chunk_start);
    const char *current_ptr = chunk_start;
    size_t remaining_len = chunk_len;
    while (remaining_len > 0) {
      int rc =
          pcre2_dfa_match(re, (PCRE2_SPTR)current_ptr, remaining_len, 0, flags,
                          mdata, nullptr, workspace, PCRE_WORKSPACE_SIZE);
      if (rc >= 0) {
        PCRE2_SIZE *ov = pcre2_get_ovector_pointer(mdata);
        size_t here = chunk_abs_offset + (current_ptr - chunk_start);
        // A continued match started in an earlier leaf.
        size_t match_start_abs =
            match_in_progress ? saved_match_start : here + ov[0];
        size_t match_end_abs = here + ov[1];
        results.push_back(
            std::make_pair(match_start_abs, match_end_abs - match_start_abs));
        // Always move forward, even after an empty match.
        size_t consumed = ov[1];
        if (consumed == 0)
          consumed = 1;
        current_ptr += consumed;
        if (consumed > remaining_len)
          remaining_len = 0;
        else
          remaining_len -= consumed;
        match_in_progress = false;
        flags = PCRE2_PARTIAL_SOFT | PCRE2_NOTBOL;
        continue;
      } else if (rc == PCRE2_ERROR_PARTIAL) {
        PCRE2_SIZE *ov = pcre2_get_ovector_pointer(mdata);
        if (!match_in_progress) {
          saved_match_start =
              chunk_abs_offset + (current_ptr - chunk_start) + ov[0];
          match_in_progress = true;
        }
        // Continue this match at the start of the next leaf.
        flags |= PCRE2_DFA_RESTART;
        flags |= PCRE2_NOTBOL;
        break;
      } else {
        // No match, or a matcher error, which is treated the same way.
        if (!match_in_progress)
          break;
        // A restarted call only continues the earlier partial match, so
        // nothing has been tried as a fresh start in this leaf yet. Retry
        // from the same position instead of skipping a byte.
        match_in_progress = false;
        flags = PCRE2_PARTIAL_SOFT | PCRE2_NOTBOL;
      }
    }
    chunk_abs_offset += chunk_len;
    if (!match_in_progress)
      flags = PCRE2_PARTIAL_SOFT | PCRE2_NOTBOL;
  }
  pcre2_match_data_free(mdata);
  pcre2_code_free(re);
  free(it);
  return results;
}
// Picks a power-of-two leaf size for a text of `length` bytes.
// The size exponent grows by a quarter of the text's log2 and the result is
// kept within [MIN_CHUNK_SIZE, MAX_CHUNK_SIZE].
uint32_t optimal_chunk_size(uint64_t length) {
  if (length <= MIN_CHUNK_SIZE)
    return MIN_CHUNK_SIZE;
  const double cap_exponent = std::log2((double)MAX_CHUNK_SIZE);
  const double grown_exponent =
      7.0 + (std::log2((double)length) - 10.0) * 0.25;
  const double exponent = MIN(cap_exponent, grown_exponent);
  uint32_t size = (uint32_t)std::pow(2.0, exponent);
  if (size < (uint32_t)MIN_CHUNK_SIZE)
    size = (uint32_t)MIN_CHUNK_SIZE;
  if (size > (uint32_t)MAX_CHUNK_SIZE)
    size = (uint32_t)MAX_CHUNK_SIZE;
  // Round up to the next power of two; size >= MIN_CHUNK_SIZE keeps the
  // clz argument non-zero.
  return 1U << (32 - __builtin_clz(size - 1));
}
// Basic correctness test & usage example.
// Walks through load / split / read / concat / insert / erase, the line
// helpers, the iterators and both regex searches, printing each result.
// Returns 0 on completion and 1 when the demo buffer cannot be allocated.
int _main() {
  char *buffer = (char *)malloc(44 * 4 + 5);
  if (!buffer)
    return 1;
  strcpy(buffer, "The quick brown fox jumps over the lazy dog.\n"
                 "The quick brown fox jumps over the lazy dog.\n"
                 "The quick brown fox jumps over the lazy dog.\n"
                 "The quick brown fox jumps over the lazy dog.");
  // This loads all (excluding \0 put in by strcpy)
  Knot *root = load(buffer, 44 * 4 + 3, optimal_chunk_size(44 * 4 + 3));
  Knot *left = nullptr, *right = nullptr;
  // Splits root into left and right (root is no longer valid)
  split(root, 5, &left, &right);
  // Simple read based on byte offset and length
  char *s1 = read(left, 0, 100);
  printf("%s\n:\n", s1);
  char *s2 = read(right, 0, 100);
  printf("%s\n;\n", s2);
  free(s1);
  free(s2);
  // Recombines left and right into root (both can
  // be valid or invalid in optimized cases);
  // they are not to be used after concat
  root = concat(left, right);
  // root should always be set to the return value of insert
  root = insert(root, 5, buffer, 5);
  free(buffer);
  char *s3 = read(root, 0, 100);
  printf("%s\n,\n", s3);
  // Similar to insert but for erase
  root = erase(root, 5, 5);
  char *s4 = read(root, 0, 100);
  printf("%s\n.\n", s4);
  free(s3);
  free(s4);
  uint32_t byte_offset;
  uint32_t len;
  // Byte offset is given relative to how it would
  // be in a file; offset + len includes the \n
  // at the end of the line (or nothing at EOF)
  byte_offset = line_to_byte(root, 2, &len);
  char *s5 = read(root, byte_offset, len);
  printf("%s\n'\n", s5);
  free(s5);
  // Returns the number of the line that
  // byte position would be in.
  // The ending \n position is included in this
  uint32_t line = byte_to_line(root, byte_offset + len - 1);
  printf("%u\n:\n", line);
  // From the first line onwards (0 indexed)
  LineIterator *it = begin_l_iter(root, 0);
  char *c = nullptr;
  while ((c = next_line(it)) != nullptr) {
    printf("%s :wow:\n", c);
    free(c);
  }
  free(it);
  printf("\n/\n");
  // Starts at first byte (to be used for regex search)
  ByteIterator *it2 = begin_b_iter(root);
  uint32_t saved[40];
  for (int i = 0; i < 40; i++)
    saved[i] = 0;
  std::string pattern = "f.x";
  // NOTE(review): `program` is never released; the matching deallocation
  // for compile_regex is not visible here - confirm and add it.
  Inst *program = compile_regex(pattern);
  bool result;
  while ((result = next_match(program, it2, saved))) {
    printf("\nRES: %d\n", result);
    for (int i = 0; i < 40; i++)
      printf("%u, ", saved[i]); // saved[] is unsigned
  }
  // The byte iterator and its inner leaf iterator are both malloc'd.
  if (it2) {
    free(it2->it);
    free(it2);
  }
  // search // uses leaf iterator internally // PCRE2 based
  std::vector<std::pair<size_t, size_t>> matches = search_rope(root, "f.x");
  for (size_t i = 0; i < matches.size(); i++)
    printf("\n%zu %zu", matches[i].first, matches[i].second);
  // A rope needs to be freed only once if the last action on the rope is
  // insert or concat or erase.
  // For splits we need to free both left and right separately
  free_rope(root);
  return 0;
}

283
src/test.cpp Normal file
View File

@@ -0,0 +1,283 @@
#include "../headers/noose.hpp"
#include "../headers/rope.hpp"
#include <chrono>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <fstream>
#include <iostream>
#include <string>
// Reads the whole file at `path` into a malloc'd buffer.
// On success returns the buffer (caller frees it) and stores its size in
// *out_len. The buffer is not NUL-terminated.
// On any failure reports the failing call with perror(), returns nullptr and
// leaves *out_len at 0. `out_len` may be null if the size is not needed.
char *load_file(const char *path, size_t *out_len) {
  if (out_len)
    *out_len = 0;
  FILE *f = fopen(path, "rb");
  if (!f) {
    perror("fopen");
    return nullptr;
  }
  // ftell() returns -1 on error; converting that to size_t would request a
  // gigantic allocation, so keep it signed until it has been checked.
  long size = -1;
  if (fseek(f, 0, SEEK_END) == 0)
    size = ftell(f);
  if (size < 0 || fseek(f, 0, SEEK_SET) != 0) {
    perror("fseek");
    fclose(f);
    return nullptr;
  }
  size_t len = (size_t)size;
  // malloc(0) may return nullptr, which would look like a failure for an
  // empty file; always ask for at least one byte.
  char *buf = (char *)malloc(len ? len : 1);
  if (!buf) {
    perror("malloc");
    fclose(f);
    return nullptr;
  }
  if (fread(buf, 1, len, f) != len) {
    perror("fread");
    free(buf);
    fclose(f);
    return nullptr;
  }
  fclose(f);
  if (out_len)
    *out_len = len;
  return buf;
}
// Benchmark driver: times the rope operations on ./random.bin, runs the line,
// iterator and search checks, then times the same edits on std::string.
// Returns 1 when the input file cannot be read, otherwise 0.
int main() {
  printf("My rope implementation benchmark\n");
  {
    size_t len;
    printf("Loading file into rope...\n");
    char *buf = load_file("./random.bin", &len);
    // load_file has already reported the error; buf and len are unusable.
    if (!buf)
      return 1;
    auto start = std::chrono::high_resolution_clock::now();
    Knot *root = load(buf, len, 2);
    auto end = std::chrono::high_resolution_clock::now();
    printf("Load time: %.3f s\n",
           std::chrono::duration<double>(end - start).count());
    free(buf);
    // READ TEST
    printf("Testing read...\n");
    start = std::chrono::high_resolution_clock::now();
    char *content = read(root, len / 2, 1024);
    end = std::chrono::high_resolution_clock::now();
    free(content);
    printf("Read 1 KB from middle: %.6f s\n",
           std::chrono::duration<double>(end - start).count());
    // INSERT TEST
    printf("Testing insert...\n");
    char insert_data[1024];
    memset(insert_data, 'X', 1024);
    start = std::chrono::high_resolution_clock::now();
    root = insert(root, len / 2, insert_data, 1024);
    end = std::chrono::high_resolution_clock::now();
    printf("Insert 1 KB in middle: %.6f s\n",
           std::chrono::duration<double>(end - start).count());
    // ERASE TEST (Delete the same 1 KB we just inserted)
    printf("Testing erase...\n");
    start = std::chrono::high_resolution_clock::now();
    root = erase(root, len / 2, 1024);
    end = std::chrono::high_resolution_clock::now();
    printf("Erase 1 KB in middle: %.6f s\n",
           std::chrono::duration<double>(end - start).count());
    // SPLIT TEST
    printf("Testing split...\n");
    Knot *left = nullptr, *right = nullptr;
    start = std::chrono::high_resolution_clock::now();
    split(root, len / 2, &left, &right);
    end = std::chrono::high_resolution_clock::now();
    printf("Split at middle: %.6f s\n",
           std::chrono::duration<double>(end - start).count());
    // CONCAT TEST
    printf("Testing concat...\n");
    start = std::chrono::high_resolution_clock::now();
    root = concat(left, right);
    end = std::chrono::high_resolution_clock::now();
    printf("Concat: %.6f s\n",
           std::chrono::duration<double>(end - start).count());
    // ---------------------------------------------------------
    // LINE OPERATIONS TESTS
    // ---------------------------------------------------------
    printf("Testing line operations...\n");
    // KNOWN CONSTANTS based on: yes "The quick brown fox jumps over the lazy
    // dog." String length: 44 + 1 newline = 45 bytes per line.
    const uint32_t BYTES_PER_LINE = 45;
    const uint32_t TEST_LINE_INDEX = 1000; // A line deep in the file
    // 1. Test byte_to_line
    // We pick a byte in the middle of TEST_LINE_INDEX.
    // Offset = (1000 * 45) + 10.
    uint32_t test_offset = (TEST_LINE_INDEX * BYTES_PER_LINE) + 10;
    start = std::chrono::high_resolution_clock::now();
    // Same width as byte_to_line's return type, so large indices survive.
    uint32_t calculated_line = byte_to_line(root, test_offset);
    end = std::chrono::high_resolution_clock::now();
    printf("byte_to_line (%u -> %u): %.6f s ", test_offset, calculated_line,
           std::chrono::duration<double>(end - start).count());
    if (calculated_line == TEST_LINE_INDEX) {
      printf("[PASS]\n");
    } else {
      printf("[FAIL] Expected %u, got %u\n", TEST_LINE_INDEX, calculated_line);
    }
    // 2. Test line_to_byte
    // We ask for the start of TEST_LINE_INDEX. Should be exactly
    // TEST_LINE_INDEX * 45.
    uint32_t out_len = 0;
    uint32_t expected_start = TEST_LINE_INDEX * BYTES_PER_LINE;
    start = std::chrono::high_resolution_clock::now();
    uint32_t calculated_start = line_to_byte(root, TEST_LINE_INDEX, &out_len);
    end = std::chrono::high_resolution_clock::now();
    printf("line_to_byte (Line %u -> Offset %u): %.6f s ", TEST_LINE_INDEX,
           calculated_start,
           std::chrono::duration<double>(end - start).count());
    if (calculated_start == expected_start && out_len == BYTES_PER_LINE) {
      printf("[PASS]\n");
    } else {
      printf("[FAIL] Expected offset %u (len %u), got %u (len %u)\n",
             expected_start, BYTES_PER_LINE, calculated_start, out_len);
    }
    // ---------------------------------------------------------
    // ITERATOR SPEED TEST
    // ---------------------------------------------------------
    printf("Testing iterator speed...\n");
    const uint32_t LINES_TO_ITERATE = 10000; // Iterate 10,000 leaves
    // 1. Initialize the leaf iterator at the first leaf
    LeafIterator *it = begin_k_iter(root);
    if (!it) {
      printf("Iterator Test: [FAIL] begin_iterator returned NULL.\n");
    } else {
      char *line = nullptr;
      uint32_t lines_read = 0;
      start = std::chrono::high_resolution_clock::now();
      // 2. Iterate and time the process
      // The returned chunks belong to the rope and must not be freed.
      while (lines_read < LINES_TO_ITERATE &&
             (line = next_leaf(it)) != nullptr) {
        // Note: We deliberately skip printing to focus on the Rope operation
        // time.
        lines_read++;
      }
      end = std::chrono::high_resolution_clock::now();
      double elapsed_time = std::chrono::duration<double>(end - start).count();
      printf("Iterator speed (f:: %u): %.6f s (%.2f lines/s)\n", lines_read,
             elapsed_time, (double)lines_read / elapsed_time);
      if (lines_read == LINES_TO_ITERATE) {
        printf("Iterator Test: [PASS] Successfully iterated %u lines.\n",
               LINES_TO_ITERATE);
      } else {
        printf("Iterator Test: [FAIL] Expected %u lines, read %u.\n",
               LINES_TO_ITERATE, lines_read);
      }
      // 3. Clean up the iterator
      free(it);
    }
    // search test
    start = std::chrono::high_resolution_clock::now();
    std::vector<std::pair<size_t, size_t>> matches = search_rope(root, "f.x");
    end = std::chrono::high_resolution_clock::now();
    printf("Search Time: %.6f s\n",
           std::chrono::duration<double>(end - start).count());
    printf("Found %zu matches\n", matches.size());
    char *c = read(root, 0, 1000);
    printf("%s\n", c);
    free(c);
    ByteIterator *it1 = begin_b_iter(root);
    char ch;
    while ((ch = next_byte(it1)) != '\0') {
      printf("%c:", ch);
    }
    // The byte iterator and its inner leaf iterator are both malloc'd.
    if (it1) {
      free(it1->it);
      free(it1);
    }
    ByteIterator *it2 = begin_b_iter(root);
    uint32_t saved[40];
    for (int i = 0; i < 40; i++)
      saved[i] = 0;
    std::string pattern = "f.x";
    // NOTE(review): `program` is never released; the matching deallocation
    // for compile_regex is not visible here - confirm and add it.
    Inst *program = compile_regex(pattern);
    bool result;
    int count = 0;
    start = std::chrono::high_resolution_clock::now();
    while ((result = next_match(program, it2, saved))) {
      count++;
      printf("%d\n", count);
    }
    end = std::chrono::high_resolution_clock::now();
    printf("Search Time: %.6f s\n",
           std::chrono::duration<double>(end - start).count());
    printf("Found2 %d matches\n", count);
    if (it2) {
      free(it2->it);
      free(it2);
    }
    free_rope(root);
  }
  printf("Testing std::string...\n");
  {
    // Open at the end so tellg() yields the file size.
    std::ifstream file("random.bin", std::ios::binary | std::ios::ate);
    if (!file) {
      perror("ifstream");
      return 1;
    }
    size_t len = file.tellg();
    file.seekg(0);
    std::string data(len, '\0');
    file.read(data.data(), len);
    std::string s = data;
    auto start = std::chrono::high_resolution_clock::now();
    // READ: middle 1 KB
    std::string read_chunk = s.substr(len / 2, 1024);
    auto end = std::chrono::high_resolution_clock::now();
    printf("std::string read 1 KB from middle: %.6f s\n",
           std::chrono::duration<double>(end - start).count());
    // INSERT: middle 1 KB
    std::string insert_data(1024, 'X');
    start = std::chrono::high_resolution_clock::now();
    s.insert(len / 2, insert_data);
    end = std::chrono::high_resolution_clock::now();
    printf("std::string insert 1 KB in middle: %.6f s\n",
           std::chrono::duration<double>(end - start).count());
    // ERASE: middle 1 KB
    start = std::chrono::high_resolution_clock::now();
    s.erase(len / 2, 1024);
    end = std::chrono::high_resolution_clock::now();
    printf("std::string erase 1 KB in middle: %.6f s\n",
           std::chrono::duration<double>(end - start).count());
    // SPLIT: middle
    start = std::chrono::high_resolution_clock::now();
    std::string left = s.substr(0, len / 2);
    std::string right = s.substr(len / 2);
    end = std::chrono::high_resolution_clock::now();
    printf("std::string split at middle: %.6f s\n",
           std::chrono::duration<double>(end - start).count());
    // CONCAT
    start = std::chrono::high_resolution_clock::now();
    s = left + right;
    end = std::chrono::high_resolution_clock::now();
    printf("std::string concat: %.6f s\n",
           std::chrono::duration<double>(end - start).count());
  }
}