commit a95e5c09b8c7521be0b0892a28c0dd1499a2a1c4 Author: Syed Daanish Date: Thu Nov 27 16:42:36 2025 +0000 Initial commit diff --git a/api/noose.hpp b/api/noose.hpp new file mode 100644 index 0000000..8b63f22 --- /dev/null +++ b/api/noose.hpp @@ -0,0 +1,87 @@ +#ifndef NOOSE_HPP +#define NOOSE_HPP +#include "../headers/rope.hpp" + +#include + +struct Exp; + +enum class ExpKind { RANGE, OR, SEQ, ANY, NONE }; + +struct ExRange { + bool negate = false; + char start; + char end; +}; + +struct OpOr { + Exp *left; + Exp *right; +}; + +struct OpSeq { + Exp *left; + Exp *right; +}; + +struct Exp { + bool capture; + ExpKind kind; + union { + OpOr *opor; + OpSeq *opseq; + std::vector ranges; + }; + Exp() { + capture = false; + kind = ExpKind::NONE; + } +}; + +struct Parser { + std::string s; + size_t i; + Parser(std::string str) : s(str), i(0) {} +}; + +enum Op { + // These jump around + JMP = 0, // Jump to j.x + FRK = 1, // Fork to j.x and j.y (with priority to x) + // These consume 1 char from the input, if not then fail thread + // (failuire of main thread is not successfull match) + MCH = 2, // match with range object + NMC = 3, // not match with range object + ANY = 4, // Anything + // Used to save offsets + SVS = 5, // Start save for i cap group + SVE = 6, // End save for i cap group + // Match is successful if main thread reaches the end + END = 7 +}; + +struct Range { // use start == end to match a particular char + char start; + char end; +}; + +struct Inst { + Op op; + union { + struct { + Range *ranges; + int len; + } r; + struct { + int x, y; + } j; + }; + int idx; +}; + +Exp *regex_to_ast(std::string pattern); +Inst *compile_ast(Exp *root); +Inst *compile_regex(std::string pattern); +int next_match(Inst *prog, ByteIterator *it, uint32_t *saved); + +#endif diff --git a/api/rope.hpp b/api/rope.hpp new file mode 100644 index 0000000..5f29da4 --- /dev/null +++ b/api/rope.hpp @@ -0,0 +1,157 @@ +#ifndef ROPE_HPP +#define ROPE_HPP + +#include +#include + +#define 
MIN_CHUNK_SIZE 64 // 64 Bytes +#define MAX_CHUNK_SIZE 1024 * 8 // 8192 Bytes (8 KiB) +#define MAX(a, b) ((a) > (b) ? (a) : (b)) +#define MIN(a, b) ((a) < (b) ? (a) : (b)) +#define DEPTH(n) ((n) ? (n)->depth : 0) + +#define PCRE2_CODE_UNIT_WIDTH 8 +#define PCRE_WORKSPACE_SIZE 512 + +// Rope node definition +typedef struct Knot { + Knot *left; + Knot *right; + uint8_t depth; + uint32_t chunk_size; + uint32_t line_count; + uint32_t char_count; + char data[]; +} Knot; + +typedef struct LineIterator { + Knot *node; + uint8_t top; + uint32_t offset; + uint32_t line; + Knot *stack[64]; +} LineIterator; + +typedef struct LeafIterator { + Knot *node; + uint8_t top; + uint32_t offset; + Knot *stack[64]; +} LeafIterator; + +typedef struct ByteIterator { + LeafIterator *it; + uint32_t offset_l; + uint32_t offset_g; + uint32_t char_count; + char *data; +} ByteIterator; + +// Rope operations + +// Takes lengt of string to be converted +// to rope and returns a suitable chunk size +// but rope should work with any positive chunk size +uint32_t optimal_chunk_size(uint64_t length); + +// Takes a string (no need for null termination) and returns a rope +// len is the length of the string, and chunk size is the size of each chunk +// load does not free or consume the string. +// and the str can be freed after load has been run. +Knot *load(char *str, uint32_t len, uint32_t chunk_size); + +// Balances the rope and returns the root +// n is no longer valid / do not free +// As rope is balanced by other functions +// this is not to be used directly +Knot *balance(Knot *n); + +// Concatenates two ropes and returns the joined root +// Balances the ropes too, if needed +// left and right are no longer valid / do not free +// ! left and right should have the same chunk size ! 
+Knot *concat(Knot *left, Knot *right); + +// Used to insert text into the rope +// node (the rope being inserted into) is no longer valid after call +// instead use return value as the new node +// offset is the position of the insertion relative to the start of the rope +// str is the string to be inserted (no need for null termination) +// len is the length of the string +Knot *insert(Knot *node, uint32_t offset, char *str, uint32_t len); + +// Similar to insert but for deletion +// node (the rope being deleted from) is no longer valid after call +// instead use return value as the new node +// offset is the position of the deletion relative to the start of the rope +// len is the length of the deletion +Knot *erase(Knot *node, uint32_t offset, uint32_t len); + +// Used to read a string from the rope +// root is the rope to be read from +// offset is the position of the read relative to the start of the rope +// len is the length of the read +// returns a null terminated string, should be freed by the caller +char *read(Knot *root, uint32_t offset, uint32_t len); + +// Used to split the rope into left and right ropes +// node is the rope to be split (it is no longer valid after call / do not free) +// offset is the position of the split relative to the start of the rope +// left and right are pointers set to the root of that side of the split +void split(Knot *node, uint32_t offset, Knot **left, Knot **right); + +// Used to convert a byte offset to a line number that contains that byte +uint32_t byte_to_line(Knot *node, uint32_t offset); + +// Used to convert a line number to a byte offset (start of the line) +// also sets out_len to the length of the line +uint32_t line_to_byte(Knot *node, uint32_t line, uint32_t *out_len); + +// Used to start a line iterator from the start_line number +// root is the root of the rope +// returned iterator must be freed after iteration is done +LineIterator *begin_l_iter(Knot *root, uint32_t start_line); + +// Each subsequent 
call returns the next line as a null terminated string +// `it` is the iterator returned from begin_l_iter +// After getting the necessary lines free the iterator (no need to go upto the +// end) returns null if there are no more lines All return strings `must` be +// freed by the caller +char *next_line(LineIterator *it); + +// Used to start an iterator over leaf data +// root is the root of the rope +// the caller must free the iterator after use +LeafIterator *begin_k_iter(Knot *root); + +// Returns the next leaf data as a null terminated string +// `it` is the iterator returned from begin_k_iter +// ! Strings returned must never be freed by the caller ! +// to mutate the string a copy must be made +char *next_leaf(LeafIterator *it); + +// Used to start an iterator over byte data (one byte at a time) +// Uses leaf iterator internally +// root is the root of the rope, the caller must free the iterator after use +ByteIterator *begin_b_iter(Knot *root); + +// Returns the next byte from the iterator +// Returns '\0' if there are no more bytes left +// `it` is the iterator returned from begin_b_iter +char next_byte(ByteIterator *it); + +// Used to search for a pattern in the rope +// Pattern is a null terminated string representing a regular expression (DFA +// compliant) I.e some forms of backtracking etc. 
are not supported +// root is the root of the rope to be searched +// Returns a vector of pairs of start and length offsets (in bytes) +std::vector> search_rope(Knot *root, + const char *pattern); + +// Helper function to free the rope +// root is the root of the rope +// the root is no longer valid after call +// This must be called only once when the rope is no longer needed +void free_rope(Knot *root); + +#endif // ROPE_HPP diff --git a/src/noose.cpp b/src/noose.cpp new file mode 100644 index 0000000..5028919 --- /dev/null +++ b/src/noose.cpp @@ -0,0 +1,272 @@ +#include "../headers/noose.hpp" +#include "../headers/rope.hpp" +#include +#include +#include +#include +#include + +// VM - pass 2 + +bool test_ranges(char inp, Range *ranges, int len) { + for (int i = 0; i < len; i++) { + Range *r = ranges + i; + if (inp >= r->start && inp <= r->end) + return true; + } + return false; +} + +// Use pike vm method + +struct Thread { + Inst *pc; + uint32_t saved[40]; /* $0 through $9 */ +}; + +Thread thread(Inst *pc, uint32_t *saved) { + Thread t; + t.pc = pc; + for (int i = 0; i < 40; i++) + t.saved[i] = saved[i]; + return t; +} + +struct ThreadList { + Thread *t; + int n; +}; + +void handle_end(uint32_t *tsaved, uint32_t *saved) { + for (int i = 0; i < 40; i++) + saved[i] = tsaved[i]; +} + +bool addstate(Inst *prog, ThreadList *list, Thread t, int count) { + if (t.pc->op == JMP) { + if (addstate(prog, list, thread(prog + t.pc->j.x, t.saved), count)) + return true; + return false; + } else if (t.pc->op == FRK) { + if (addstate(prog, list, thread(prog + t.pc->j.x, t.saved), count)) + return true; + if (addstate(prog, list, thread(prog + t.pc->j.y, t.saved), count)) + return true; + return false; + } else if (t.pc->op == SVS) { + // Handle SVS: set the start offset and continue + t.saved[t.pc->idx * 2] = count; // 'count' must be passed or accessible + if (addstate(prog, list, thread(t.pc + 1, t.saved), count)) + return true; + return false; + } else if (t.pc->op == SVE) { 
+ // Handle SVE: set the end offset and continue + t.saved[t.pc->idx * 2 + 1] = count; // 'count' must be passed or accessible + if (addstate(prog, list, thread(t.pc + 1, t.saved), count)) + return true; + return false; + } else if (t.pc->op == END) { + handle_end(t.saved, t.saved); + return true; + } else { + for (int i = 0; i < list->n; i++) + if (list->t[i].pc == t.pc) + return false; + list->t[list->n++] = t; + return false; + } +} + +void inline swap(ThreadList *a, ThreadList *b) { + ThreadList t = *a; + *a = *b; + *b = t; +} + +void inline clear(ThreadList *list) { list->n = 0; } + +int proglen(Inst *prog) { + int len = 0; + while (prog[len].op != END) + len++; + return ++len; +} + +void inline free_list(ThreadList *list) { + free(list->t); + free(list); +} + +int next_match(Inst *prog, ByteIterator *it, uint32_t *saved) { + int len; + ThreadList *clist, *nlist; + Thread t; + + len = proglen(prog); + clist = (ThreadList *)malloc(sizeof(ThreadList)); + clist->t = (Thread *)malloc(+sizeof(Thread) * len); + clist->n = 0; + nlist = (ThreadList *)malloc(sizeof(ThreadList)); + nlist->t = (Thread *)malloc(+sizeof(Thread) * len); + nlist->n = 0; + char sp; + int count = 0; + + addstate(prog, clist, thread(prog, saved), count); + for (sp = next_byte(it); sp != '\0'; sp = next_byte(it)) { + printf("%c", sp); + for (int i = 0; i < clist->n; i++) { + t = clist->t[i]; + switch (t.pc->op) { + case MCH: + if (!test_ranges(sp, t.pc->r.ranges, t.pc->r.len)) { + if (addstate(prog, nlist, thread(prog, saved), count)) + return true; + break; + } + if (addstate(prog, nlist, thread(t.pc + 1, t.saved), count)) + return true; + break; + case NMC: + if (test_ranges(sp, t.pc->r.ranges, t.pc->r.len)) { + if (addstate(prog, nlist, thread(prog, saved), count)) + return true; + break; + } + if (addstate(prog, nlist, thread(t.pc + 1, t.saved), count)) + return true; + break; + case ANY: + if (addstate(prog, nlist, thread(t.pc + 1, t.saved), count)) + return true; + break; + case END: + 
case JMP: + case FRK: + case SVS: + case SVE: + break; + } + } + swap(clist, nlist); + clear(nlist); + count++; + } + + free_list(clist); + free_list(nlist); + + return false; // Reached EOF without a match +} + +void print_program(Inst *program) { + Inst *p = program; + int i = 0; + + while (1) { + printf("%3d: ", i); + + switch (p->op) { + case JMP: { + int x = (int)(p->j.x); + printf("JMP -> %d\n", x); + break; + } + + case FRK: { + int x = (int)(p->j.x); + int y = (int)(p->j.y); + printf("FRK -> %d , %d\n", x, y); + break; + } + + case MCH: { + printf("MCH ["); + for (int r = 0; r < p->r.len; r++) { + Range rr = p->r.ranges[r]; + if (rr.start == rr.end) + printf("'%c'%s", rr.start, (r + 1 < p->r.len) ? ", " : ""); + else + printf("'%c'-'%c'%s", rr.start, rr.end, + (r + 1 < p->r.len) ? ", " : ""); + } + printf("]\n"); + break; + } + + case NMC: { + printf("NMC ["); + for (int r = 0; r < p->r.len; r++) { + Range rr = p->r.ranges[r]; + if (rr.start == rr.end) + printf("'%c'%s", rr.start, (r + 1 < p->r.len) ? ", " : ""); + else + printf("'%c'-'%c'%s", rr.start, rr.end, + (r + 1 < p->r.len) ? 
", " : ""); + } + printf("]\n"); + break; + } + + case ANY: + printf("ANY\n"); + break; + + case SVS: + printf("SVS idx=%d\n", (unsigned char)p->idx); + break; + + case SVE: + printf("SVE idx=%d\n", (unsigned char)p->idx); + break; + + case END: + printf("END\n"); + return; + + default: + printf("UNKNOWN op=%d\n", p->op); + return; + } + + p++; + i++; + } +} + +Inst *compile_regex(std::string pattern) { + return compile_ast(regex_to_ast(pattern)); +} + +int __main() { + // Maunally compiled program for testing + char *buffer = (char *)malloc(29); + strcpy(buffer, "abcdabcdabcdabcdf"); + // This loads all (excluding \0 put in by strcpy) + Knot *root = load(buffer, 17, optimal_chunk_size(12)); + ByteIterator *it = begin_b_iter(root); + uint32_t saved[40]; + + for (int i = 0; i < 40; i++) + saved[i] = 0; + + std::string pattern = "(abcd)+"; + + Inst *program = compile_regex(pattern); + + print_program(program); + + int result; + while ((result = next_match(program, it, saved))) { + printf("\nRES: %d\n", result); + for (int i = 0; i < 40; i++) + printf("%d, ", saved[i]); + } + + free(program); + free(buffer); + free(it->it); + free(it); + free(root); + return 0; +} diff --git a/src/rexambler.cpp b/src/rexambler.cpp new file mode 100644 index 0000000..8e0cc63 --- /dev/null +++ b/src/rexambler.cpp @@ -0,0 +1,387 @@ +#include "../headers/noose.hpp" +#include +#include +#include + +Exp *parse_alternation(Parser *p); +Exp *parse_sequence(Parser *p); +Exp *parse_atom_with_modifiers(Parser *p); +Exp *parse_bracket_class(Parser *p); + +Exp *make_none() { + Exp *e = new Exp(); + e->capture = false; + e->kind = ExpKind::NONE; + return e; +} + +Exp *make_any() { + Exp *e = new Exp(); + e->capture = false; + e->kind = ExpKind::ANY; + return e; +} + +Exp *make_range(const std::vector &ranges) { + Exp *e = new Exp(); + e->capture = false; + e->kind = ExpKind::RANGE; + new (&e->ranges) std::vector(ranges); + return e; +} + +Exp *make_range_single(char c, bool neg = false) { + 
std::vector r; + r.push_back(ExRange{neg, c, c}); + return make_range(r); +} + +Exp *make_or(Exp *l, Exp *r) { + OpOr *o = new OpOr(); + o->left = l; + o->right = r; + Exp *e = new Exp(); + e->capture = false; + e->kind = ExpKind::OR; + e->opor = o; + return e; +} + +Exp *make_seq(Exp *l, Exp *r) { + OpSeq *o = new OpSeq(); + o->left = l; + o->right = r; + Exp *e = new Exp(); + e->capture = false; + e->kind = ExpKind::SEQ; + e->opseq = o; + return e; +} + +Exp *clone_exp(Exp *e) { + if (!e) + return make_none(); + if (e->kind == ExpKind::NONE) + return make_none(); + if (e->kind == ExpKind::ANY) + return make_any(); + if (e->kind == ExpKind::RANGE) { + return make_range(e->ranges); + } + if (e->kind == ExpKind::OR) { + Exp *l = clone_exp(e->opor->left); + Exp *r = clone_exp(e->opor->right); + return make_or(l, r); + } + if (e->kind == ExpKind::SEQ) { + Exp *l = clone_exp(e->opseq->left); + Exp *r = clone_exp(e->opseq->right); + return make_seq(l, r); + } + return make_none(); +} + +inline char peek(Parser *p) { + return (p->i < p->s.size()) ? p->s[p->i] : '\0'; +} // lookahead + +inline char consume(Parser *p) { + return (p->i < p->s.size()) ? p->s[p->i++] : '\0'; +} // consume + +Exp *regex_to_ast(std::string pattern) { + Parser p(pattern); + Exp *res = parse_alternation(&p); + return res ? 
res : make_none(); +} + +Exp *parse_alternation(Parser *p) { + std::vector parts; + parts.push_back(parse_sequence(p)); + while (peek(p) == '|') { + consume(p); + parts.push_back(parse_sequence(p)); + } + if (parts.empty()) + return make_none(); + Exp *cur = parts[0]; + for (size_t p = 1; p < parts.size(); ++p) + cur = make_or(cur, parts[p]); + return cur; +} + +Exp *parse_sequence(Parser *p) { + std::vector atoms; + while (true) { + if (p->i >= p->s.size()) + break; + char c = peek(p); + if (c == ')' || c == '|') + break; + Exp *a = parse_atom_with_modifiers(p); + if (!a) + break; + atoms.push_back(a); + } + if (atoms.empty()) + return make_none(); + Exp *cur = atoms[0]; + for (size_t k = 1; k < atoms.size(); ++k) + cur = make_seq(cur, atoms[k]); + return cur; +} + +Exp *parse_atom(Parser *p) { + if (p->i >= p->s.size()) + return nullptr; + char c = peek(p); + if (c == '(') { + // grouping; recurse; set capture=true for group's root + consume(p); // '(' + Exp *inner = parse_alternation(p); + if (peek(p) == ')') + consume(p); + if (!inner) + inner = make_none(); + inner->capture = true; // as requested + return inner; + } + if (c == '[') { + // parse bracket class + return parse_bracket_class(p); + } + if (c == '\\') { + consume(p); + if (p->i >= p->s.size()) + return make_none(); + char esc = consume(p); + // handle known escapes + switch (esc) { + case 'd': + return make_range({ExRange{false, '0', '9'}}); + case 'D': { + // negated 0-9 : we represent as first sentinel negate=true then the + // included range + std::vector v; + v.push_back(ExRange{true, '0', '9'}); + return make_range(v); + } + case 'w': { + std::vector v; + v.push_back(ExRange{false, 'a', 'z'}); + v.push_back(ExRange{false, 'A', 'Z'}); + v.push_back(ExRange{false, '0', '9'}); + v.push_back(ExRange{false, '_', '_'}); + return make_range(v); + } + case 'W': { + std::vector v; + // provide the positive ranges that will be negated + v.push_back(ExRange{true, 'a', 'z'}); + v.push_back(ExRange{true, 
'A', 'Z'}); + v.push_back(ExRange{true, '0', '9'}); + v.push_back(ExRange{true, '_', '_'}); + return make_range(v); + } + case 's': { + std::vector v; + v.push_back(ExRange{false, ' ', ' '}); // space + v.push_back(ExRange{false, '\t', '\t'}); // tab + v.push_back(ExRange{false, '\r', '\r'}); // CR + v.push_back(ExRange{false, '\n', '\n'}); // LF + v.push_back(ExRange{false, '\v', '\v'}); // VT + v.push_back(ExRange{false, '\f', '\f'}); // FF + return make_range(v); + } + case 'S': { + std::vector v; + v.push_back(ExRange{true, 0, 0}); + v.push_back(ExRange{true, ' ', ' '}); + v.push_back(ExRange{true, '\t', '\t'}); + v.push_back(ExRange{true, '\r', '\r'}); + v.push_back(ExRange{true, '\n', '\n'}); + v.push_back(ExRange{true, '\v', '\v'}); + v.push_back(ExRange{true, '\f', '\f'}); + return make_range(v); + } + case '.': + // escaped dot -> literal dot + return make_range_single('.', false); + default: + // escaped literal: any char becomes a single-char range + return make_range_single(esc, false); + } + } + if (c == '!') { + consume(p); + return make_any(); + } + // literal char (including '.' when unescaped is special? In many syntaxes + // '.' is wildcard, but user said '.' maps to [^\n], so treat '.' 
as + // wildcard) + if (c == '.') { + consume(p); + // dot == [^\n] + std::vector v; + v.push_back(ExRange{true, '\n', '\n'}); // indicate newline excluded + return make_range(v); + } + // otherwise a normal literal single char -> single range + char lit = consume(p); + return make_range_single(lit, false); +} + +Exp *parse_bracket_class(Parser *p) { + assert(peek(p) == '['); + consume(p); // '[' + bool neg = false; + if (peek(p) == '^') { + neg = true; + consume(p); + } + std::vector ranges; + while (p->i < p->s.size() && peek(p) != ']') { + char a = consume(p); + if (a == '\\') { + if (p->i >= p->s.size()) + break; + char esc = consume(p); + if (esc == 'd') { + ranges.push_back(ExRange{neg, '0', '9'}); + } else if (esc == 'w') { + ranges.push_back(ExRange{neg, 'a', 'z'}); + ranges.push_back(ExRange{neg, 'A', 'Z'}); + ranges.push_back(ExRange{neg, '0', '9'}); + ranges.push_back(ExRange{neg, '_', '_'}); + } else if (esc == 's') { + ranges.push_back(ExRange{neg, ' ', ' '}); + ranges.push_back(ExRange{neg, '\t', '\t'}); + ranges.push_back(ExRange{neg, '\r', '\r'}); + ranges.push_back(ExRange{neg, '\n', '\n'}); + ranges.push_back(ExRange{neg, '\v', '\v'}); + ranges.push_back(ExRange{neg, '\f', '\f'}); + } else { + ranges.push_back(ExRange{neg, esc, esc}); + } + } else if (peek(p) == '-' && p->i + 1 < p->s.size() && + p->s[p->i + 1] != ']') { + // range: previous char '-' next char + // but we already consumed 'a' as a; ensure there's a start to range + // get next char + // Note: we already consumed 'a' into a variable; now current char is + // '-' because we peeked it so do: (We are at position of '-') consume + // '-' and then next char + consume(p); // '-' + if (p->i >= p->s.size()) + break; + char b = consume(p); + ranges.push_back(ExRange{neg, a, b}); + } else { + // single char + ranges.push_back(ExRange{neg, a, a}); + } + } + if (peek(p) == ']') + consume(p); + // If negated, represent with sentinel first element with negate=true + return make_range(ranges); +} 
+ +bool parse_integer_opt(Parser *p, int &out) { + if (p->i >= p->s.size() || !std::isdigit((unsigned char)peek(p))) + return false; + int val = 0; + while (p->i < p->s.size() && std::isdigit((unsigned char)peek(p))) + val = val * 10 + (consume(p) - '0'); + out = val; + return true; +} + +Exp *parse_atom_with_modifiers(Parser *p) { + Exp *atom = parse_atom(p); + if (!atom) + return nullptr; + + // apply possibly multiple modifiers in sequence + while (true) { + if (peek(p) == '?') { + consume(p); + // OpOr(atom, NONE) + atom = make_or(clone_exp(atom), make_none()); + } else if (peek(p) == '*') { + consume(p); + // Expand to 20 repeating OpOr(atom, NONE) chained by SEQ as literal + // tree + Exp *unit_or = nullptr; + for (int t = 0; t < 20; ++t) { + Exp *op = make_or(clone_exp(atom), make_none()); + if (!unit_or) + unit_or = op; + else + unit_or = make_seq(unit_or, op); + } + atom = unit_or ? unit_or : make_none(); + } else if (peek(p) == '+') { + consume(p); + // First the atom, then 20 OpOr(atom, NONE) sequence + Exp *rest = nullptr; + for (int t = 0; t < 20; ++t) { + Exp *op = make_or(clone_exp(atom), make_none()); + if (!rest) + rest = op; + else + rest = make_seq(rest, op); + } + atom = rest ? 
make_seq(clone_exp(atom), rest) : clone_exp(atom); + } else if (peek(p) == '{') { + // parse {x,y} + size_t save = p->i; + consume(p); // '{' + int x = 0, y = -1; + bool ok = parse_integer_opt(p, x); + if (!ok || peek(p) != ',') { + // malformed; roll back treat '{' as literal + p->i = save; + break; + } + consume(p); // ',' + ok = parse_integer_opt(p, y); + if (!ok || peek(p) != '}') { + p->i = save; + break; + } + consume(p); // '}' + if (y < x) + y = x; + if (y > 20) + y = 20; // clamp to 20 as requested + // Build x copies of atom concatenated, then (y-x) OpOr(atom, NONE) + // chained + Exp *prefix = nullptr; + for (int k = 0; k < x; ++k) { + if (!prefix) + prefix = clone_exp(atom); + else + prefix = make_seq(prefix, clone_exp(atom)); + } + Exp *suffix = nullptr; + for (int k = 0; k < (y - x); ++k) { + Exp *op = make_or(clone_exp(atom), make_none()); + if (!suffix) + suffix = op; + else + suffix = make_seq(suffix, op); + } + if (!prefix) + prefix = make_none(); + if (!suffix) + atom = prefix; + else + atom = make_seq(prefix, suffix); + } else { + break; + } + } + + return atom; +} diff --git a/src/rexpiler.cpp b/src/rexpiler.cpp new file mode 100644 index 0000000..6aca0ca --- /dev/null +++ b/src/rexpiler.cpp @@ -0,0 +1,149 @@ +#include "../headers/noose.hpp" +#include +#include +#include +#include +#include + +struct InstList { + Inst *data; + size_t len; + size_t cap; + int idx; +}; + +static inline bool is_ranges_negated(std::vector r) { + if (r.empty()) + return false; + return r[0].negate; +} + +static void insert_inst(InstList *list, Inst *inst) { + if (list->len >= list->cap) { + size_t nc = list->cap ? 
list->cap * 2 : 32; + list->data = (Inst *)realloc(list->data, nc * sizeof(Inst)); + list->cap = nc; + } + list->data[list->len++] = *inst; +} + +Inst *make_inst(Op op) { + Inst *I = (Inst *)calloc(1, sizeof(Inst)); + if (!I) + assert(0); + I->op = op; + return I; +} + +void compile_exp(Exp *e, InstList *list); + +void compile_any(Exp *e, InstList *list) { + Inst *I = make_inst(ANY); + if (e->capture) { + list->idx++; + Inst *sv = make_inst(SVS); + sv->idx = list->idx; + insert_inst(list, sv); + insert_inst(list, I); + Inst *se = make_inst(SVE); + se->idx = list->idx; + insert_inst(list, se); + } else { + insert_inst(list, I); + } +} + +void compile_range(Exp *e, InstList *list) { + std::vector er = e->ranges; + bool neg = is_ranges_negated(er); + size_t cnt = er.size(); + Range *arr = (Range *)malloc(sizeof(Range) * cnt); + size_t w = 0; + for (ExRange cur : er) + arr[w++] = Range{cur.start, cur.end}; + Inst *I = make_inst(neg ? NMC : MCH); + I->r.ranges = arr; + I->r.len = (int)cnt; + if (e->capture) { + list->idx++; + Inst *sv = make_inst(SVS); + sv->idx = list->idx; + insert_inst(list, sv); + insert_inst(list, I); + Inst *se = make_inst(SVE); + se->idx = list->idx; + insert_inst(list, se); + } else { + insert_inst(list, I); + } +} + +void compile_seq(Exp *e, InstList *list) { + if (e->capture) { + int idx = list->idx++; + Inst *sv = make_inst(SVS); + sv->idx = idx; + insert_inst(list, sv); + } + compile_exp(e->opseq->left, list); + compile_exp(e->opseq->right, list); + if (e->capture) { + Inst *se = make_inst(SVE); + se->idx = list->idx - 1; + insert_inst(list, se); + } +} + +void compile_or(Exp *e, InstList *list) { + if (e->capture) { + Inst *sv = make_inst(SVS); + sv->idx = list->idx++; + insert_inst(list, sv); + } + Inst *frk = make_inst(FRK); + frk->j.x = -1; + frk->j.y = -1; + int frk_idx = list->len; + insert_inst(list, frk); + int left_start = list->len; + compile_exp(e->opor->left, list); + Inst *jmp = make_inst(JMP); + insert_inst(list, jmp); + int 
jmp_idx = list->len - 1; + int right_start = list->len; + compile_exp(e->opor->right, list); + list->data[frk_idx].j.x = left_start; + list->data[frk_idx].j.y = right_start; + list->data[jmp_idx].j.x = list->len; + if (e->capture) { + Inst *se = make_inst(SVE); + se->idx = list->idx - 1; + insert_inst(list, se); + } +} + +void compile_exp(Exp *e, InstList *list) { + switch (e->kind) { + case ExpKind::NONE: + break; + case ExpKind::ANY: + compile_any(e, list); + break; + case ExpKind::RANGE: + compile_range(e, list); + break; + case ExpKind::SEQ: + compile_seq(e, list); + break; + case ExpKind::OR: + compile_or(e, list); + break; + } +} + +Inst *compile_ast(Exp *root) { + InstList list = {nullptr, 0, 0, 1}; + compile_exp(root, &list); + insert_inst(&list, make_inst(END)); + return list.data; +} diff --git a/src/rope.cpp b/src/rope.cpp new file mode 100644 index 0000000..34e7b0a --- /dev/null +++ b/src/rope.cpp @@ -0,0 +1,853 @@ +#include "../headers/rope.hpp" +#include "../headers/noose.hpp" +#include +#include +#include +#include +#include +#include +#include +#include + +static void update(Knot *n) { + if (!n) + return; + if (!n->depth || n->depth == 0) + return; + uint32_t left_chars = n->left ? n->left->char_count : 0; + uint32_t right_chars = n->right ? n->right->char_count : 0; + n->char_count = left_chars + right_chars; + uint32_t left_lines = n->left ? n->left->line_count : 0; + uint32_t right_lines = n->right ? n->right->line_count : 0; + n->line_count = left_lines + right_lines; + uint8_t left_depth = n->left ? n->left->depth : 0; + uint8_t right_depth = n->right ? n->right->depth : 0; + n->depth = MAX(left_depth, right_depth) + 1; + n->chunk_size = n->left ? 
n->left->chunk_size : n->right->chunk_size; +} + +// str is not consumed and \0 is not handled +// So if str is null terminated then len must be strlen(str) +// and freed by caller +Knot *load(char *str, uint32_t len, uint32_t chunk_size) { + if (len > (uint32_t)(chunk_size - (chunk_size / 16))) { + Knot *left = load(str, len / 2, chunk_size); + Knot *right = load(str + len / 2, len - len / 2, chunk_size); + Knot *node = (Knot *)malloc(sizeof(Knot)); + if (!node) + return nullptr; + node->left = left; + node->right = right; + node->chunk_size = chunk_size; + node->depth = MAX(left->depth, right->depth) + 1; + node->char_count = left->char_count + right->char_count; + node->line_count = left->line_count + right->line_count; + return node; + } else { + Knot *node = (Knot *)malloc(sizeof(Knot) + chunk_size); + if (!node) + return nullptr; + node->left = nullptr; + node->right = nullptr; + node->chunk_size = chunk_size; + node->depth = 0; + node->char_count = len; + uint32_t newline_count = 0; + for (uint32_t i = 0; i < len; i++) { + char c = str[i]; + node->data[i] = c; + if (c == '\n') + newline_count++; + } + node->line_count = newline_count; + return node; + } +} + +// leaf if consumed and freed (so dont use or free it after) +// left and right are the new nodes +static void split_leaf(Knot *leaf, uint32_t k, Knot **left, Knot **right) { + Knot *left_node = (Knot *)malloc(sizeof(Knot) + leaf->chunk_size); + left_node->left = nullptr; + left_node->right = nullptr; + left_node->chunk_size = leaf->chunk_size; + left_node->depth = 0; + left_node->char_count = k; + uint32_t newline_count = 0; + for (uint32_t i = 0; i < k; i++) { + char c = leaf->data[i]; + left_node->data[i] = c; + if (c == '\n') + newline_count++; + } + left_node->line_count = newline_count; + uint16_t right_line_count = leaf->line_count - newline_count; + *left = left_node; + Knot *right_node = (Knot *)malloc(sizeof(Knot) + leaf->chunk_size); + right_node->left = nullptr; + right_node->right = 
nullptr; + right_node->chunk_size = leaf->chunk_size; + right_node->depth = 0; + right_node->char_count = leaf->char_count - k; + right_node->line_count = right_line_count; + for (uint32_t i = k; i < leaf->char_count; i++) { + char c = leaf->data[i]; + right_node->data[i - k] = c; + } + *right = right_node; + free(leaf); +} + +// This makes node nonsensical, so dont use or free it after +void split(Knot *node, uint32_t offset, Knot **left, Knot **right) { + if (!node) { + *left = nullptr; + *right = nullptr; + return; + } + if (node->depth == 0) { + split_leaf(node, offset, left, right); + return; + } + uint32_t left_size = node->left ? node->left->char_count : 0; + if (offset < left_size) { + Knot *L = nullptr, *R = nullptr; + split(node->left, offset, &L, &R); + node->left = R; + update(node); + *right = node; + *left = L; + } else { + uint32_t new_offset = offset - left_size; + Knot *L = nullptr, *R = nullptr; + split(node->right, new_offset, &L, &R); + node->right = L; + update(node); + *left = node; + *right = R; + } +} + +static inline int get_balance_factor(Knot *n) { + if (!n) + return 0; + return (int)DEPTH(n->left) - (int)DEPTH(n->right); +} + +static inline Knot *rotate_right(Knot *y) { + Knot *x = y->left; + Knot *T2 = x->right; + x->right = y; + y->left = T2; + update(y); + update(x); + return x; +} + +static inline Knot *rotate_left(Knot *x) { + Knot *y = x->right; + Knot *T2 = y->left; + y->left = x; + x->right = T2; + update(x); + update(y); + return y; +} + +// Technically n can be used after calling +// but use return value instead +Knot *balance(Knot *n) { + update(n); + int bal = get_balance_factor(n); + if (bal > 1) { + if (get_balance_factor(n->left) < 0) + n->left = rotate_left(n->left); + return rotate_right(n); + } + if (bal < -1) { + if (get_balance_factor(n->right) > 0) + n->right = rotate_right(n->right); + return rotate_left(n); + } + return n; +} + +// Dont free left or right after calling (only free return value) +// Assumes both 
ropes have equal chunk sizes +Knot *concat(Knot *left, Knot *right) { + if (!left) + return right; + if (!right) + return left; + if (!left || left->char_count == 0) { + if (left) + free_rope(left); + return right; + } + if (!right || right->char_count == 0) { + if (right) + free_rope(right); + return left; + } + if (left->depth == 0 && right->depth == 0) { + if (left->char_count + right->char_count <= left->chunk_size) { + Knot *node = (Knot *)malloc(sizeof(Knot) + left->chunk_size); + node->left = nullptr; + node->right = nullptr; + node->chunk_size = left->chunk_size; + node->depth = 0; + node->char_count = left->char_count + right->char_count; + node->line_count = left->line_count + right->line_count; + memcpy(node->data, left->data, left->char_count); + memcpy(node->data + left->char_count, right->data, right->char_count); + free(left); + free(right); + return node; + } + } + uint16_t d_left = left->depth; + uint16_t d_right = right->depth; + if (d_left > d_right + 1) { + left->right = concat(left->right, right); + return balance(left); + } + if (d_right > d_left + 1) { + right->left = concat(left, right->left); + return balance(right); + } + Knot *node = (Knot *)malloc(sizeof(Knot)); + if (!node) + return nullptr; + node->left = left; + node->right = right; + node->chunk_size = left->chunk_size; + node->depth = MAX(d_left, d_right) + 1; + update(node); + return node; +} + +// This makes node nonsensical, so dont use or free it after +// Instead, free the return value or use it in node's place +Knot *insert(Knot *node, uint32_t offset, char *str, uint32_t len) { + if (!node) + return nullptr; + if (node->depth == 0 && node->char_count + len <= node->chunk_size) { + if (offset < node->char_count) + memmove(node->data + offset + len, node->data + offset, + node->char_count - offset); + memcpy(node->data + offset, str, len); + node->char_count += len; + for (uint32_t i = 0; i < len; i++) + if (str[i] == '\n') + node->line_count++; + return node; + } + if 
(node->depth > 0) { + uint32_t left_count = node->left ? node->left->char_count : 0; + if (offset < left_count) { + Knot *new_left = insert(node->left, offset, str, len); + node->left = new_left; + update(node); + return balance(node); + } else { + Knot *new_right = insert(node->right, offset - left_count, str, len); + node->right = new_right; + update(node); + return balance(node); + } + } + Knot *left_part = nullptr; + Knot *right_part = nullptr; + split(node, offset, &left_part, &right_part); + Knot *middle_part = load(str, len, node->chunk_size); + return concat(concat(left_part, middle_part), right_part); +} + +// This makes node nonsensical, so dont use or free it after +// Instead, free the return value or use it in node's place +Knot *erase(Knot *node, uint32_t offset, uint32_t len) { + if (!node || len == 0 || offset >= node->char_count) + return node; + if (offset + len > node->char_count) + len = node->char_count - offset; + if (node->depth == 0) { + uint32_t deleted_newlines = 0; + for (uint32_t i = offset; i < offset + len; i++) + if (node->data[i] == '\n') + deleted_newlines++; + node->line_count -= deleted_newlines; + if (offset + len < node->char_count) + memmove(node->data + offset, node->data + offset + len, + node->char_count - (offset + len)); + node->char_count -= len; + return node; + } + uint32_t left_count = node->left ? 
node->left->char_count : 0; + if (offset + len <= left_count) { + node->left = erase(node->left, offset, len); + } else if (offset >= left_count) { + node->right = erase(node->right, offset - left_count, len); + } else { + Knot *left = nullptr, *middle = nullptr, *right = nullptr; + split(node, offset, &left, &right); + split(right, len, &middle, &right); + free_rope(middle); + return concat(left, right); + } + update(node); + return balance(node); +} + +static void _read_into(Knot *node, uint32_t offset, uint32_t len, char *dest) { + if (!node || len == 0) + return; + if (node->depth == 0) { + memcpy(dest, node->data + offset, len); + return; + } + Knot *left = node->left; + uint32_t left_count = left ? left->char_count : 0; + if (offset < left_count) { + uint32_t chunk_len = left_count - offset; + if (chunk_len > len) + chunk_len = len; + _read_into(left, offset, chunk_len, dest); + dest += chunk_len; + len -= chunk_len; + offset = 0; + } else { + offset -= left_count; + } + if (len > 0 && node->right) + _read_into(node->right, offset, len, dest); +} + +char *read(Knot *root, uint32_t offset, uint32_t len) { + if (!root) + return nullptr; + if (offset >= root->char_count) { + char *empty = (char *)malloc(1); + if (empty) + empty[0] = '\0'; + return empty; + } + if (offset + len > root->char_count) { + len = root->char_count - offset; + } + char *buffer = (char *)malloc((len + 1) * sizeof(char)); + if (!buffer) + return nullptr; + _read_into(root, offset, len, buffer); + buffer[len] = '\0'; + return buffer; +} + +// Hopefully free the tree only once at the end of its use using the pointer +// from the last insert or concat or erase call. +// (or use twice if last call was split - for both left and right). 
+void free_rope(Knot *root) { + if (!root) + return; + free_rope(root->left); + free_rope(root->right); + free(root); +} + +static uint32_t find_nth_newline_offset(Knot *node, uint32_t n) { + if (!node || n > node->line_count) + return UINT32_MAX; + if (node->depth == 0) { + uint32_t count = 0; + for (uint32_t i = 0; i < node->char_count; i++) { + if (node->data[i] == '\n') { + if (count == n) + return i; + count++; + } + } + return UINT32_MAX; + } + uint32_t left_lines = node->left ? node->left->line_count : 0; + if (n < left_lines) { + return find_nth_newline_offset(node->left, n); + } else { + uint32_t right_offset = + find_nth_newline_offset(node->right, n - left_lines); + if (right_offset == UINT32_MAX) + return UINT32_MAX; + uint32_t left_chars = node->left ? node->left->char_count : 0; + return left_chars + right_offset; + } +} + +uint32_t byte_to_line(Knot *node, uint32_t offset) { + if (!node) + return 0; + if (offset >= node->char_count) + return node->line_count; + if (node->depth == 0) { + uint32_t lines_before = 0; + uint32_t limit = (offset < node->char_count) ? offset : node->char_count; + for (uint32_t i = 0; i < limit; i++) + if (node->data[i] == '\n') + lines_before++; + return lines_before; + } + uint32_t left_chars = node->left ? node->left->char_count : 0; + if (offset < left_chars) { + return byte_to_line(node->left, offset); + } else { + uint32_t left_lines = node->left ? 
node->left->line_count : 0; + return left_lines + byte_to_line(node->right, offset - left_chars); + } +} + +uint32_t line_to_byte(Knot *node, uint32_t line, uint32_t *out_len) { + if (!node) { + if (out_len) + *out_len = 0; + return 0; + } + uint32_t start_offset = 0; + uint32_t end_offset = 0; + if (line == 0) { + start_offset = 0; + } else { + uint32_t prev_newline = find_nth_newline_offset(node, line - 1); + if (prev_newline == UINT32_MAX) + start_offset = node->char_count; + else + start_offset = prev_newline + 1; + } + uint32_t current_newline = find_nth_newline_offset(node, line); + if (current_newline == UINT32_MAX) + end_offset = node->char_count; + else + end_offset = current_newline + 1; + if (out_len) { + if (end_offset > start_offset) + *out_len = end_offset - start_offset; + else + *out_len = 0; + } + return start_offset; +} + +LineIterator *begin_l_iter(Knot *root, uint32_t start_line) { + if (!root) + return nullptr; + if (start_line > root->line_count) + return nullptr; + LineIterator *it = (LineIterator *)malloc(sizeof(LineIterator)); + if (!it) + return nullptr; + it->top = 0; + it->line = start_line; + it->node = nullptr; + if (start_line == 0) { + it->offset = 0; + while (root->left) { + it->stack[it->top++] = root; + root = root->left; + if (!root->left && !root->right) + it->node = root; + } + it->stack[it->top++] = root; + return it; + } + Knot *curr = root; + uint32_t relative_line = start_line; + while (curr) { + it->stack[it->top++] = curr; + if (!curr->left && !curr->right) { + it->node = curr; + break; + } + uint32_t left_lines = (curr->left) ? 
curr->left->line_count : 0; + if (relative_line < left_lines) { + curr = curr->left; + } else { + relative_line -= left_lines; + curr = curr->right; + } + } + if (!it->node) { + free(it); + return nullptr; + } + it->offset = 0; + if (relative_line > 0) { + uint32_t found_newlines = 0; + uint32_t i = 0; + for (i = 0; i < it->node->char_count; i++) { + if (it->node->data[i] == '\n') { + found_newlines++; + if (found_newlines == relative_line) { + it->offset = i + 1; + break; + } + } + } + } + return it; +} + +static inline void iter_advance_leaf(LineIterator *it) { + if (it->top == 0) { + it->node = nullptr; + return; + } + Knot *prev = it->stack[--it->top]; + while (it->top > 0) { + Knot *parent = it->stack[it->top - 1]; + if (parent->left == prev && parent->right) { + Knot *curr = parent->right; + while (curr) { + it->stack[it->top++] = curr; + if (!curr->left && !curr->right) { + it->node = curr; + it->offset = 0; + return; + } + curr = (curr->left) ? curr->left : curr->right; + } + } + prev = it->stack[--it->top]; + } + it->node = nullptr; +} + +char *next_line(LineIterator *it) { + if (!it || !it->node) + return nullptr; + size_t capacity = 128; + size_t len = 0; + char *buffer = (char *)malloc(capacity); + if (!buffer) + return nullptr; + while (it->node) { + if (it->offset >= it->node->char_count) { + iter_advance_leaf(it); + if (!it->node) + break; + } + char *start = it->node->data + it->offset; + char *end = it->node->data + it->node->char_count; + char *newline_ptr = (char *)memchr(start, '\n', end - start); + size_t chunk_len; + int found_newline = 0; + if (newline_ptr) { + chunk_len = (newline_ptr - start) + 1; + found_newline = 1; + } else { + chunk_len = end - start; + } + if (len + chunk_len + 1 > capacity) { + capacity = (capacity * 2) + chunk_len; + char *new_buf = (char *)realloc(buffer, capacity); + if (!new_buf) { + free(buffer); + return nullptr; + } + buffer = new_buf; + } + memcpy(buffer + len, start, chunk_len); + len += chunk_len; + 
it->offset += chunk_len; + if (found_newline) { + buffer[len] = '\0'; + it->line++; + return buffer; + } + } + if (len > 0) { + buffer[len] = '\0'; + it->line++; + return buffer; + } + free(buffer); + return nullptr; +} + +LeafIterator *begin_k_iter(Knot *root) { + if (!root) + return nullptr; + LeafIterator *it = (LeafIterator *)malloc(sizeof(LeafIterator)); + if (!it) + return nullptr; + it->top = 0; + Knot *curr = root; + while (curr) { + it->stack[it->top++] = curr; + if (!curr->left && !curr->right) { + it->node = curr; + return it; + } + curr = curr->left; + if (!curr) { + curr = it->stack[--it->top]->right; + Knot *temp = it->stack[it->top]; + it->stack[it->top++] = temp; + curr = temp->left ? temp->left : temp->right; + Knot *parent = it->stack[it->top - 1]; + curr = parent->left; + if (!curr) { + curr = parent->right; + } + } + } + free(it); + return nullptr; +} + +// Caller must never free the returned string +char *next_leaf(LeafIterator *it) { + if (!it || !it->node) + return nullptr; + char *data_to_return = it->node->data; + data_to_return[it->node->char_count] = '\0'; + Knot *prev_leaf = it->node; + Knot *parent = nullptr; + while (it->top > 0) { + parent = it->stack[--it->top]; + if (parent->right && parent->right != prev_leaf) { + Knot *curr = parent->right; + while (curr) { + it->stack[it->top++] = curr; + if (!curr->left && !curr->right) { + it->node = curr; + return data_to_return; + } + curr = curr->left; + if (!curr) + curr = it->stack[it->top - 1]->right; + } + } + prev_leaf = parent; + } + it->node = nullptr; + return data_to_return; +} + +ByteIterator *begin_b_iter(Knot *root) { + ByteIterator *b_it = (ByteIterator *)malloc(sizeof(ByteIterator)); + LeafIterator *l_it = begin_k_iter(root); + b_it->it = l_it; + b_it->offset_g = 0; + b_it->offset_l = 0; + b_it->char_count = 0; + b_it->data = nullptr; + return b_it; +} + +char next_byte(ByteIterator *it) { + if (it->data && it->offset_l < it->char_count) { + return it->data[it->offset_l++]; + } 
else { + it->offset_g += it->offset_l; + it->offset_l = 1; + char *data = next_leaf(it->it); + it->char_count = strlen(data); + it->data = data; + if (it->data) + return *it->data; + else + return '\0'; + } +} + +std::vector> search_rope(Knot *root, + const char *pattern) { + std::vector> results; + int errorcode; + PCRE2_SIZE erroffset; + pcre2_code *re = pcre2_compile((PCRE2_SPTR)pattern, PCRE2_ZERO_TERMINATED, 0, + &errorcode, &erroffset, nullptr); + if (!re) { + fprintf(stderr, "PCRE2 compile error: %d\n", errorcode); + return results; + } + pcre2_match_data *mdata = pcre2_match_data_create(128, nullptr); + int workspace[PCRE_WORKSPACE_SIZE]; + LeafIterator *it = begin_k_iter(root); + if (!it) { + pcre2_code_free(re); + pcre2_match_data_free(mdata); + return results; + } + size_t chunk_abs_offset = 0; + size_t saved_match_start = 0; + bool match_in_progress = false; + int flags = PCRE2_PARTIAL_SOFT; + while (1) { + const char *chunk_start = next_leaf(it); + if (!chunk_start) + break; + size_t chunk_len = strlen(chunk_start); + const char *current_ptr = chunk_start; + size_t remaining_len = chunk_len; + while (remaining_len > 0) { + int rc = + pcre2_dfa_match(re, (PCRE2_SPTR)current_ptr, remaining_len, 0, flags, + mdata, nullptr, workspace, PCRE_WORKSPACE_SIZE); + if (rc >= 0) { + PCRE2_SIZE *ov = pcre2_get_ovector_pointer(mdata); + size_t match_start_abs; + size_t match_end_abs; + if (match_in_progress) { + match_start_abs = saved_match_start; + match_end_abs = + chunk_abs_offset + (current_ptr - chunk_start) + ov[1]; + } else { + match_start_abs = + chunk_abs_offset + (current_ptr - chunk_start) + ov[0]; + match_end_abs = + chunk_abs_offset + (current_ptr - chunk_start) + ov[1]; + } + size_t total_len = match_end_abs - match_start_abs; + results.push_back(std::make_pair(match_start_abs, total_len)); + size_t consumed = ov[1]; + if (consumed == 0) + consumed = 1; + current_ptr += consumed; + if (consumed > remaining_len) + remaining_len = 0; + else + 
remaining_len -= consumed; + match_in_progress = false; + flags = PCRE2_PARTIAL_SOFT | PCRE2_NOTBOL; + continue; + } else if (rc == PCRE2_ERROR_PARTIAL) { + PCRE2_SIZE *ov = pcre2_get_ovector_pointer(mdata); + if (!match_in_progress) { + saved_match_start = + chunk_abs_offset + (current_ptr - chunk_start) + ov[0]; + match_in_progress = true; + } + flags |= PCRE2_DFA_RESTART; + flags |= PCRE2_NOTBOL; + break; + } else { + if (match_in_progress) { + match_in_progress = false; + flags = PCRE2_PARTIAL_SOFT | PCRE2_NOTBOL; + current_ptr++; + remaining_len--; + } else { + break; + } + // if (rc != PCRE2_ERROR_NOMATCH) {} // handle error + } + } + chunk_abs_offset += chunk_len; + if (!match_in_progress) + flags = PCRE2_PARTIAL_SOFT | PCRE2_NOTBOL; + } + pcre2_match_data_free(mdata); + pcre2_code_free(re); + free(it); + return results; +} + +uint32_t optimal_chunk_size(uint64_t length) { + if (length <= MIN_CHUNK_SIZE) + return MIN_CHUNK_SIZE; + double target_exponent = MIN(std::log2((double)MAX_CHUNK_SIZE), + 7.0 + (std::log2((double)length) - 10.0) * 0.25); + uint32_t final_chunk_size = + MAX((uint32_t)MIN_CHUNK_SIZE, (uint32_t)std::pow(2.0, target_exponent)); + final_chunk_size = MIN(final_chunk_size, (uint32_t)MAX_CHUNK_SIZE); + final_chunk_size = 1U << (32 - __builtin_clz(final_chunk_size - 1)); + return final_chunk_size; +} + +// Basic correctness test & usage example +int _main() { + char *buffer = (char *)malloc(44 * 4 + 5); + strcpy(buffer, "The quick brown fox jumps over the lazy dog.\n\ +The quick brown fox jumps over the lazy dog.\n\ +The quick brown fox jumps over the lazy dog.\n\ +The quick brown fox jumps over the lazy dog."); + // This loads all (excluding \0 put in by strcpy) + Knot *root = load(buffer, 44 * 4 + 3, optimal_chunk_size(44 * 4 + 3)); + Knot *left = nullptr, *right = nullptr; + // Splits root into left and right (root is no longer valid) + split(root, 5, &left, &right); + // simple read based on byte offset and length + char *s1 = read(left, 
0, 100); + printf("%s\n:\n", s1); + char *s2 = read(right, 0, 100); + printf("%s\n;\n", s2); + free(s1); + free(s2); + // Recombines left and right into root (both can + // be valid or invalid in optimized cases) + // they are to not be used after concat + root = concat(left, right); + // root should be set to return value from insert always + root = insert(root, 5, buffer, 5); + free(buffer); + char *s3 = read(root, 0, 100); + printf("%s\n,\n", s3); + // Similar to insert but for erase + root = erase(root, 5, 5); + char *s4 = read(root, 0, 100); + printf("%s\n.\n", s4); + free(s3); + free(s4); + uint32_t byte_offset; + uint32_t len; + // Byte offset given reltive to how it would + // be in a file offset + len includes the \n + // at the end of the line (or nothing is EOF) + byte_offset = line_to_byte(root, 2, &len); + char *s5 = read(root, byte_offset, len); + printf("%s\n'\n", s5); + free(s5); + // returns line number of which line that + // byte position would be in. + // the ending \n position is included in this + uint32_t line = byte_to_line(root, byte_offset + len - 1); + printf("%u\n:\n", line); + // From second line onwards (0 indexed) + LineIterator *it = begin_l_iter(root, 0); + char *c = nullptr; + while ((c = next_line(it)) != nullptr) { + printf("%s :wow:\n", c); + free(c); + } + free(it); + printf("\n/\n"); + // Starts at first byte (to be used for regex search) + ByteIterator *it2 = begin_b_iter(root); + + uint32_t saved[40]; + + for (int i = 0; i < 40; i++) + saved[i] = 0; + + std::string pattern = "f.x"; + + Inst *program = compile_regex(pattern); + + bool result; + while ((result = next_match(program, it2, saved))) { + printf("\nRES: %d\n", result); + for (int i = 0; i < 40; i++) + printf("%d, ", saved[i]); + } + + // char c2 = ' '; + // while ((c2 = next_byte(it2)) != '\0') + // printf("%c :wow!:\n", c2); + // free(it2); + // search // uses leaf iterator internally // PCRE2 based + std::vector> matches = search_rope(root, "f.x"); + for (size_t i 
// Reads the whole file at `path` into a freshly malloc'ed buffer.
// On success returns the buffer (never nullptr, even for an empty file) and
// stores the number of bytes actually read in *out_len; the caller frees the
// buffer with free(). The data is not NUL-terminated.
// On failure prints a diagnostic with perror(), leaves *out_len untouched and
// returns nullptr.
char *load_file(const char *path, size_t *out_len) {
  FILE *f = fopen(path, "rb");
  if (!f) {
    perror("fopen");
    return nullptr;
  }
  // Determine the size; ftell() reports errors as -1, which must not be
  // turned into a huge size_t.
  long size = -1;
  if (fseek(f, 0, SEEK_END) == 0)
    size = ftell(f);
  if (size < 0 || fseek(f, 0, SEEK_SET) != 0) {
    perror("fseek/ftell");
    fclose(f);
    return nullptr;
  }
  const size_t len = (size_t)size;
  // malloc(0) may legitimately return nullptr, so always ask for >= 1 byte.
  char *buf = (char *)malloc(len > 0 ? len : 1);
  if (!buf) {
    perror("malloc");
    fclose(f);
    return nullptr;
  }
  const size_t got = fread(buf, 1, len, f);
  if (got != len && ferror(f)) {
    perror("fread");
    free(buf);
    fclose(f);
    return nullptr;
  }
  fclose(f);
  // Report what was really read so callers never touch uninitialised bytes.
  if (out_len)
    *out_len = got;
  return buf;
}
std::chrono::duration(end - start).count()); + + // ERASE TEST (Delete the same 1 KB we just inserted) + printf("Testing erase...\n"); + start = std::chrono::high_resolution_clock::now(); + root = erase(root, len / 2, 1024); + end = std::chrono::high_resolution_clock::now(); + printf("Erase 1 KB in middle: %.6f s\n", + std::chrono::duration(end - start).count()); + + // SPLIT TEST + printf("Testing split...\n"); + Knot *left = nullptr, *right = nullptr; + start = std::chrono::high_resolution_clock::now(); + split(root, len / 2, &left, &right); + end = std::chrono::high_resolution_clock::now(); + printf("Split at middle: %.6f s\n", + std::chrono::duration(end - start).count()); + + // CONCAT TEST + printf("Testing concat...\n"); + start = std::chrono::high_resolution_clock::now(); + root = concat(left, right); + end = std::chrono::high_resolution_clock::now(); + printf("Concat: %.6f s\n", + std::chrono::duration(end - start).count()); + + // --------------------------------------------------------- + // LINE OPERATIONS TESTS + // --------------------------------------------------------- + printf("Testing line operations...\n"); + + // KNOWN CONSTANTS based on: yes "The quick brown fox jumps over the lazy + // dog." String length: 44 + 1 newline = 45 bytes per line. + const uint32_t BYTES_PER_LINE = 45; + const uint32_t TEST_LINE_INDEX = 1000; // A line deep in the file + + // 1. Test byte_to_line + // We pick a byte in the middle of TEST_LINE_INDEX. + // Offset = (100000 * 45) + 10. 
+ uint32_t test_offset = (TEST_LINE_INDEX * BYTES_PER_LINE) + 10; + + start = std::chrono::high_resolution_clock::now(); + uint16_t calculated_line = byte_to_line(root, test_offset); + end = std::chrono::high_resolution_clock::now(); + + printf("byte_to_line (%u -> %u): %.6f s ", test_offset, calculated_line, + std::chrono::duration(end - start).count()); + + if (calculated_line == TEST_LINE_INDEX) { + printf("[PASS]\n"); + } else { + printf("[FAIL] Expected %u, got %u\n", TEST_LINE_INDEX, calculated_line); + } + + // 2. Test line_to_byte + // We ask for the start of TEST_LINE_INDEX. Should be exactly + // TEST_LINE_INDEX * 45. + uint32_t out_len = 0; + uint32_t expected_start = TEST_LINE_INDEX * BYTES_PER_LINE; + + start = std::chrono::high_resolution_clock::now(); + uint32_t calculated_start = line_to_byte(root, TEST_LINE_INDEX, &out_len); + end = std::chrono::high_resolution_clock::now(); + + printf("line_to_byte (Line %u -> Offset %u): %.6f s ", TEST_LINE_INDEX, + calculated_start, + std::chrono::duration(end - start).count()); + + if (calculated_start == expected_start && out_len == BYTES_PER_LINE) { + printf("[PASS]\n"); + } else { + printf("[FAIL] Expected offset %u (len %u), got %u (len %u)\n", + expected_start, BYTES_PER_LINE, calculated_start, out_len); + } + + // --------------------------------------------------------- + // ITERATOR SPEED TEST + // --------------------------------------------------------- + printf("Testing iterator speed...\n"); + + const uint32_t LINES_TO_ITERATE = 10000; // Iterate 10,000 lines + + // 1. Initialize the iterator at a deep line index + uint32_t start_line = TEST_LINE_INDEX + 10; + + LeafIterator *it = begin_k_iter(root); + if (!it) { + printf("Iterator Test: [FAIL] begin_iterator returned NULL.\n"); + } else { + char *line = NULL; + uint32_t lines_read = 0; + + start = std::chrono::high_resolution_clock::now(); + + // 2. 
Iterate and time the process + // We use the clean C idiom: get the line, check for NULL, then + // process. + while (lines_read < LINES_TO_ITERATE && (line = next_leaf(it)) != NULL) { + // Note: We deliberately skip printing to focus on the Rope operation + // time. + lines_read++; + } + + end = std::chrono::high_resolution_clock::now(); + + double elapsed_time = std::chrono::duration(end - start).count(); + + printf("Iterator speed (f:: %u): %.6f s (%.2f lines/s)\n", lines_read, + elapsed_time, (double)lines_read / elapsed_time); + + if (lines_read == LINES_TO_ITERATE) { + printf("Iterator Test: [PASS] Successfully iterated %u lines.\n", + LINES_TO_ITERATE); + } else { + printf("Iterator Test: [FAIL] Expected %u lines, read %u.\n", + LINES_TO_ITERATE, lines_read); + } + + // 3. Clean up the iterator + free(it); + } + + // search test + + start = std::chrono::high_resolution_clock::now(); + std::vector> matches = search_rope(root, "f.x"); + end = std::chrono::high_resolution_clock::now(); + printf("Search Time: %.6f s\n", + std::chrono::duration(end - start).count()); + printf("Found %lu matches\n", matches.size()); + + char *c = read(root, 0, 1000); + printf("%s\n", c); + free(c); + + ByteIterator *it1 = begin_b_iter(root); + char ch; + while ((ch = next_byte(it1)) != '\0') { + printf("%c:", ch); + } + + ByteIterator *it2 = begin_b_iter(root); + uint32_t saved[40]; + for (int i = 0; i < 40; i++) + saved[i] = 0; + std::string pattern = "f.x"; + Inst *program = compile_regex(pattern); + bool result; + int count = 0; + start = std::chrono::high_resolution_clock::now(); + while ((result = next_match(program, it2, saved))) { + count++; + printf("%d\n", count); + } + end = std::chrono::high_resolution_clock::now(); + printf("Search Time: %.6f s\n", + std::chrono::duration(end - start).count()); + printf("Found2 %d matches\n", count); + + free_rope(root); + } + + printf("Testing std::string...\n"); + + { + std::ifstream file("random.bin", std::ios::binary | 
std::ios::ate); + if (!file) { + perror("ifstream"); + return 1; + } + size_t len = file.tellg(); + file.seekg(0); + std::string data(len, '\0'); + file.read(data.data(), len); + + std::string s = data; + + auto start = std::chrono::high_resolution_clock::now(); + // READ: middle 1 KB + std::string read_chunk = s.substr(len / 2, 1024); + auto end = std::chrono::high_resolution_clock::now(); + printf("std::string read 1 KB from middle: %.6f s\n", + std::chrono::duration(end - start).count()); + + // INSERT: middle 1 KB + std::string insert_data(1024, 'X'); + start = std::chrono::high_resolution_clock::now(); + s.insert(len / 2, insert_data); + end = std::chrono::high_resolution_clock::now(); + printf("std::string insert 1 KB in middle: %.6f s\n", + std::chrono::duration(end - start).count()); + + // ERASE: middle 1 KB + start = std::chrono::high_resolution_clock::now(); + s.erase(len / 2, 1024); + end = std::chrono::high_resolution_clock::now(); + printf("std::string erase 1 KB in middle: %.6f s\n", + std::chrono::duration(end - start).count()); + + // SPLIT: middle + start = std::chrono::high_resolution_clock::now(); + std::string left = s.substr(0, len / 2); + std::string right = s.substr(len / 2); + end = std::chrono::high_resolution_clock::now(); + printf("std::string split at middle: %.6f s\n", + std::chrono::duration(end - start).count()); + + // CONCAT + start = std::chrono::high_resolution_clock::now(); + s = left + right; + end = std::chrono::high_resolution_clock::now(); + printf("std::string concat: %.6f s\n", + std::chrono::duration(end - start).count()); + } +}