Initial commit
This commit is contained in:
87
api/noose.hpp
Normal file
87
api/noose.hpp
Normal file
@@ -0,0 +1,87 @@
|
||||
#ifndef NOOSE_HPP
|
||||
#define NOOSE_HPP
|
||||
#include "../headers/rope.hpp"
|
||||
|
||||
#include <string>
|
||||
|
||||
struct Exp;
|
||||
|
||||
// Kinds of node that can appear in the regex syntax tree.
enum class ExpKind {
  RANGE, // character-class node
  OR,    // alternation node
  SEQ,   // concatenation node
  ANY,   // node that matches any single character
  NONE   // empty node
};
|
||||
|
||||
// One inclusive character interval [start, end] of a character class.
struct ExRange {
  bool negate = false; // true when the interval belongs to a negated class
  char start;          // first character of the interval
  char end;            // last character of the interval
};
|
||||
|
||||
// Operands of an alternation node: `left | right`.
struct OpOr {
  struct Exp *left;  // first alternative
  struct Exp *right; // second alternative
};
|
||||
|
||||
// Operands of a concatenation node: `left` followed by `right`.
struct OpSeq {
  struct Exp *left;  // expression matched first
  struct Exp *right; // expression matched second
};
|
||||
|
||||
struct Exp {
|
||||
bool capture;
|
||||
ExpKind kind;
|
||||
union {
|
||||
OpOr *opor;
|
||||
OpSeq *opseq;
|
||||
std::vector<ExRange> ranges;
|
||||
};
|
||||
Exp() {
|
||||
capture = false;
|
||||
kind = ExpKind::NONE;
|
||||
}
|
||||
};
|
||||
|
||||
// Cursor over a regex pattern: `s` is the pattern text and `i` is the index
// of the next character to read.
struct Parser {
  std::string s;
  size_t i = 0;

  Parser(std::string str) : s(str) {}
};
|
||||
|
||||
// Opcodes of the regex virtual machine.
enum Op {
  // Control flow
  JMP = 0, // jump to j.x
  FRK = 1, // fork to j.x and j.y (j.x has priority)

  // Input-consuming instructions: each one consumes one character of the
  // input, otherwise its thread fails (failure of the main thread means
  // the match was not successful)
  MCH = 2, // match with the range object
  NMC = 3, // do not match with the range object
  ANY = 4, // anything

  // Offset bookkeeping
  SVS = 5, // start save for capture group idx
  SVE = 6, // end save for capture group idx

  // Match is successful if the main thread reaches the end
  END = 7
};
|
||||
|
||||
// Inclusive character interval used by the MCH / NMC instructions.
// Use start == end to match one particular char.
struct Range {
  char start; // lowest character accepted
  char end;   // highest character accepted
};
|
||||
|
||||
struct Inst {
|
||||
Op op;
|
||||
union {
|
||||
struct {
|
||||
Range *ranges;
|
||||
int len;
|
||||
} r;
|
||||
struct {
|
||||
int x, y;
|
||||
} j;
|
||||
};
|
||||
int idx;
|
||||
};
|
||||
|
||||
// Parses `pattern` into a syntax tree. Never returns null: an empty
// expression comes back as a NONE node.
Exp *regex_to_ast(std::string pattern);
|
||||
// Compiles the tree rooted at `root` into an instruction array whose last
// instruction is END.
Inst *compile_ast(Exp *root);
|
||||
// Convenience wrapper: regex_to_ast followed by compile_ast.
Inst *compile_regex(std::string pattern);
|
||||
// Runs `prog` over the bytes produced by `it`. Returns non-zero as soon as a
// thread reaches END, and 0 when the input ends first.
int next_match(Inst *prog, ByteIterator *it, uint32_t *saved);
|
||||
|
||||
#endif
|
||||
157
api/rope.hpp
Normal file
157
api/rope.hpp
Normal file
@@ -0,0 +1,157 @@
|
||||
#ifndef ROPE_HPP
|
||||
#define ROPE_HPP
|
||||
|
||||
#include <cstdint>
|
||||
#include <vector>
|
||||
|
||||
#define MIN_CHUNK_SIZE 64 // 64 Bytes
// Parenthesised so the product keeps its value inside a larger expression
// (e.g. `x / MAX_CHUNK_SIZE`).
#define MAX_CHUNK_SIZE (1024 * 8) // 8192 Bytes (8 KiB)
// NOTE: MAX and MIN evaluate one of their arguments twice, so do not pass
// expressions with side effects.
#define MAX(a, b) ((a) > (b) ? (a) : (b))
#define MIN(a, b) ((a) < (b) ? (a) : (b))
// Depth of node `n`; a null pointer counts as depth 0.
#define DEPTH(n) ((n) ? (n)->depth : 0)

#define PCRE2_CODE_UNIT_WIDTH 8
#define PCRE_WORKSPACE_SIZE 512
|
||||
|
||||
// Rope node definition
|
||||
// A node of the rope. The text bytes are stored in the trailing `data`
// array, so a node that carries text is allocated with extra room after
// the struct.
struct Knot {
  Knot *left;          // left child, or null
  Knot *right;         // right child, or null
  uint8_t depth;       // 0 for a leaf
  uint32_t chunk_size; // chunk size the rope was built with
  uint32_t line_count; // line count of this subtree
  uint32_t char_count; // byte count of this subtree
  char data[];         // text bytes (flexible array member)
};
|
||||
|
||||
// State of a line-by-line walk over a rope (see begin_l_iter / next_line).
// NOTE(review): field meanings below are inferred from the names; confirm
// against rope.cpp.
struct LineIterator {
  struct Knot *node;      // presumably the node currently being read
  uint8_t top;            // presumably the number of entries used in `stack`
  uint32_t offset;        // presumably the read position inside `node`
  uint32_t line;          // presumably the current line number
  struct Knot *stack[64]; // fixed-size traversal stack
};
|
||||
|
||||
// State of a leaf-by-leaf walk over a rope (see begin_k_iter / next_leaf).
// NOTE(review): field meanings below are inferred from the names; confirm
// against rope.cpp.
struct LeafIterator {
  struct Knot *node;      // presumably the node currently being visited
  uint8_t top;            // presumably the number of entries used in `stack`
  uint32_t offset;        // presumably a byte offset into the rope
  struct Knot *stack[64]; // fixed-size traversal stack
};
|
||||
|
||||
// State of a byte-by-byte walk over a rope (see begin_b_iter / next_byte).
// NOTE(review): field meanings below are inferred from the names; confirm
// against rope.cpp.
struct ByteIterator {
  struct LeafIterator *it; // underlying leaf iterator
  uint32_t offset_l;       // presumably the offset inside the current leaf
  uint32_t offset_g;       // presumably the offset from the start of the rope
  uint32_t char_count;     // presumably the byte count of the current leaf
  char *data;              // presumably the bytes of the current leaf
};
|
||||
|
||||
// Rope operations
|
||||
|
||||
// Takes the length of the string to be converted
|
||||
// to rope and returns a suitable chunk size
|
||||
// but rope should work with any positive chunk size
|
||||
uint32_t optimal_chunk_size(uint64_t length);
|
||||
|
||||
// Takes a string (no need for null termination) and returns a rope
|
||||
// len is the length of the string, and chunk size is the size of each chunk
|
||||
// load does not free or consume the string.
|
||||
// and the str can be freed after load has been run.
|
||||
Knot *load(char *str, uint32_t len, uint32_t chunk_size);
|
||||
|
||||
// Balances the rope and returns the root
|
||||
// n is no longer valid / do not free
|
||||
// As rope is balanced by other functions
|
||||
// this is not to be used directly
|
||||
Knot *balance(Knot *n);
|
||||
|
||||
// Concatenates two ropes and returns the joined root
|
||||
// Balances the ropes too, if needed
|
||||
// left and right are no longer valid / do not free
|
||||
// ! left and right should have the same chunk size !
|
||||
Knot *concat(Knot *left, Knot *right);
|
||||
|
||||
// Used to insert text into the rope
|
||||
// node (the rope being inserted into) is no longer valid after call
|
||||
// instead use return value as the new node
|
||||
// offset is the position of the insertion relative to the start of the rope
|
||||
// str is the string to be inserted (no need for null termination)
|
||||
// len is the length of the string
|
||||
Knot *insert(Knot *node, uint32_t offset, char *str, uint32_t len);
|
||||
|
||||
// Similar to insert but for deletion
|
||||
// node (the rope being deleted from) is no longer valid after call
|
||||
// instead use return value as the new node
|
||||
// offset is the position of the deletion relative to the start of the rope
|
||||
// len is the length of the deletion
|
||||
Knot *erase(Knot *node, uint32_t offset, uint32_t len);
|
||||
|
||||
// Used to read a string from the rope
|
||||
// root is the rope to be read from
|
||||
// offset is the position of the read relative to the start of the rope
|
||||
// len is the length of the read
|
||||
// returns a null terminated string, should be freed by the caller
|
||||
char *read(Knot *root, uint32_t offset, uint32_t len);
|
||||
|
||||
// Used to split the rope into left and right ropes
|
||||
// node is the rope to be split (it is no longer valid after call / do not free)
|
||||
// offset is the position of the split relative to the start of the rope
|
||||
// left and right are pointers set to the root of that side of the split
|
||||
void split(Knot *node, uint32_t offset, Knot **left, Knot **right);
|
||||
|
||||
// Used to convert a byte offset to a line number that contains that byte
|
||||
uint32_t byte_to_line(Knot *node, uint32_t offset);
|
||||
|
||||
// Used to convert a line number to a byte offset (start of the line)
|
||||
// also sets out_len to the length of the line
|
||||
uint32_t line_to_byte(Knot *node, uint32_t line, uint32_t *out_len);
|
||||
|
||||
// Used to start a line iterator from the start_line number
|
||||
// root is the root of the rope
|
||||
// returned iterator must be freed after iteration is done
|
||||
LineIterator *begin_l_iter(Knot *root, uint32_t start_line);
|
||||
|
||||
// Each subsequent call returns the next line as a null terminated string
|
||||
// `it` is the iterator returned from begin_l_iter
|
||||
// After getting the necessary lines, free the iterator (there is no need to
// go up to the end). Returns null if there are no more lines.
// All returned strings `must` be freed by the caller.
|
||||
char *next_line(LineIterator *it);
|
||||
|
||||
// Used to start an iterator over leaf data
|
||||
// root is the root of the rope
|
||||
// the caller must free the iterator after use
|
||||
LeafIterator *begin_k_iter(Knot *root);
|
||||
|
||||
// Returns the next leaf data as a null terminated string
|
||||
// `it` is the iterator returned from begin_k_iter
|
||||
// ! Strings returned must never be freed by the caller !
|
||||
// to mutate the string a copy must be made
|
||||
char *next_leaf(LeafIterator *it);
|
||||
|
||||
// Used to start an iterator over byte data (one byte at a time)
|
||||
// Uses leaf iterator internally
|
||||
// root is the root of the rope, the caller must free the iterator after use
|
||||
ByteIterator *begin_b_iter(Knot *root);
|
||||
|
||||
// Returns the next byte from the iterator
|
||||
// Returns '\0' if there are no more bytes left
|
||||
// `it` is the iterator returned from begin_b_iter
|
||||
char next_byte(ByteIterator *it);
|
||||
|
||||
// Used to search for a pattern in the rope
|
||||
// Pattern is a null terminated string representing a regular expression (DFA
// compliant), i.e. some forms of backtracking etc. are not supported
|
||||
// root is the root of the rope to be searched
|
||||
// Returns a vector of pairs of start and length offsets (in bytes)
|
||||
std::vector<std::pair<size_t, size_t>> search_rope(Knot *root,
|
||||
const char *pattern);
|
||||
|
||||
// Helper function to free the rope
|
||||
// root is the root of the rope
|
||||
// the root is no longer valid after call
|
||||
// This must be called only once when the rope is no longer needed
|
||||
void free_rope(Knot *root);
|
||||
|
||||
#endif // ROPE_HPP
|
||||
272
src/noose.cpp
Normal file
272
src/noose.cpp
Normal file
@@ -0,0 +1,272 @@
|
||||
#include "../headers/noose.hpp"
|
||||
#include "../headers/rope.hpp"
|
||||
#include <assert.h>
|
||||
#include <cstdint>
|
||||
#include <cstdlib>
|
||||
#include <cstring>
|
||||
#include <stdio.h>
|
||||
|
||||
// VM - pass 2
|
||||
|
||||
bool test_ranges(char inp, Range *ranges, int len) {
|
||||
for (int i = 0; i < len; i++) {
|
||||
Range *r = ranges + i;
|
||||
if (inp >= r->start && inp <= r->end)
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
// Use pike vm method
|
||||
|
||||
// One thread of the VM: a program counter plus its own copy of the
// capture offsets.
struct Thread {
  struct Inst *pc; // instruction this thread is waiting on
  // Capture offsets: for group i, SVS writes slot 2*i and SVE slot 2*i + 1.
  uint32_t saved[40]; /* $0 through $9 */
};
|
||||
|
||||
// Builds a thread positioned at `pc` whose capture offsets are a copy of
// the 40 values found at `saved`.
Thread thread(Inst *pc, uint32_t *saved) {
  Thread fresh;
  fresh.pc = pc;
  std::memcpy(fresh.saved, saved, sizeof fresh.saved);
  return fresh;
}
|
||||
|
||||
// List of threads, stored in the order they were added.
struct ThreadList {
  struct Thread *t; // storage for the threads
  int n;            // number of threads currently stored
};
|
||||
|
||||
// Copies the 40 capture offsets of a finished thread (`tsaved`) into the
// result buffer `saved`, front to back.
void handle_end(uint32_t *tsaved, uint32_t *saved) {
  const uint32_t *src = tsaved;
  uint32_t *dst = saved;
  for (uint32_t *const stop = saved + 40; dst != stop; ++dst, ++src)
    *dst = *src;
}
|
||||
|
||||
bool addstate(Inst *prog, ThreadList *list, Thread t, int count) {
|
||||
if (t.pc->op == JMP) {
|
||||
if (addstate(prog, list, thread(prog + t.pc->j.x, t.saved), count))
|
||||
return true;
|
||||
return false;
|
||||
} else if (t.pc->op == FRK) {
|
||||
if (addstate(prog, list, thread(prog + t.pc->j.x, t.saved), count))
|
||||
return true;
|
||||
if (addstate(prog, list, thread(prog + t.pc->j.y, t.saved), count))
|
||||
return true;
|
||||
return false;
|
||||
} else if (t.pc->op == SVS) {
|
||||
// Handle SVS: set the start offset and continue
|
||||
t.saved[t.pc->idx * 2] = count; // 'count' must be passed or accessible
|
||||
if (addstate(prog, list, thread(t.pc + 1, t.saved), count))
|
||||
return true;
|
||||
return false;
|
||||
} else if (t.pc->op == SVE) {
|
||||
// Handle SVE: set the end offset and continue
|
||||
t.saved[t.pc->idx * 2 + 1] = count; // 'count' must be passed or accessible
|
||||
if (addstate(prog, list, thread(t.pc + 1, t.saved), count))
|
||||
return true;
|
||||
return false;
|
||||
} else if (t.pc->op == END) {
|
||||
handle_end(t.saved, t.saved);
|
||||
return true;
|
||||
} else {
|
||||
for (int i = 0; i < list->n; i++)
|
||||
if (list->t[i].pc == t.pc)
|
||||
return false;
|
||||
list->t[list->n++] = t;
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
void inline swap(ThreadList *a, ThreadList *b) {
|
||||
ThreadList t = *a;
|
||||
*a = *b;
|
||||
*b = t;
|
||||
}
|
||||
|
||||
void inline clear(ThreadList *list) { list->n = 0; }
|
||||
|
||||
int proglen(Inst *prog) {
|
||||
int len = 0;
|
||||
while (prog[len].op != END)
|
||||
len++;
|
||||
return ++len;
|
||||
}
|
||||
|
||||
void inline free_list(ThreadList *list) {
|
||||
free(list->t);
|
||||
free(list);
|
||||
}
|
||||
|
||||
int next_match(Inst *prog, ByteIterator *it, uint32_t *saved) {
|
||||
int len;
|
||||
ThreadList *clist, *nlist;
|
||||
Thread t;
|
||||
|
||||
len = proglen(prog);
|
||||
clist = (ThreadList *)malloc(sizeof(ThreadList));
|
||||
clist->t = (Thread *)malloc(+sizeof(Thread) * len);
|
||||
clist->n = 0;
|
||||
nlist = (ThreadList *)malloc(sizeof(ThreadList));
|
||||
nlist->t = (Thread *)malloc(+sizeof(Thread) * len);
|
||||
nlist->n = 0;
|
||||
char sp;
|
||||
int count = 0;
|
||||
|
||||
addstate(prog, clist, thread(prog, saved), count);
|
||||
for (sp = next_byte(it); sp != '\0'; sp = next_byte(it)) {
|
||||
printf("%c", sp);
|
||||
for (int i = 0; i < clist->n; i++) {
|
||||
t = clist->t[i];
|
||||
switch (t.pc->op) {
|
||||
case MCH:
|
||||
if (!test_ranges(sp, t.pc->r.ranges, t.pc->r.len)) {
|
||||
if (addstate(prog, nlist, thread(prog, saved), count))
|
||||
return true;
|
||||
break;
|
||||
}
|
||||
if (addstate(prog, nlist, thread(t.pc + 1, t.saved), count))
|
||||
return true;
|
||||
break;
|
||||
case NMC:
|
||||
if (test_ranges(sp, t.pc->r.ranges, t.pc->r.len)) {
|
||||
if (addstate(prog, nlist, thread(prog, saved), count))
|
||||
return true;
|
||||
break;
|
||||
}
|
||||
if (addstate(prog, nlist, thread(t.pc + 1, t.saved), count))
|
||||
return true;
|
||||
break;
|
||||
case ANY:
|
||||
if (addstate(prog, nlist, thread(t.pc + 1, t.saved), count))
|
||||
return true;
|
||||
break;
|
||||
case END:
|
||||
case JMP:
|
||||
case FRK:
|
||||
case SVS:
|
||||
case SVE:
|
||||
break;
|
||||
}
|
||||
}
|
||||
swap(clist, nlist);
|
||||
clear(nlist);
|
||||
count++;
|
||||
}
|
||||
|
||||
free_list(clist);
|
||||
free_list(nlist);
|
||||
|
||||
return false; // Reached EOF without a match
|
||||
}
|
||||
|
||||
// Prints a numbered listing of `program` to stdout, one instruction per
// line. Stops after END, or after the first unknown opcode.
void print_program(Inst *program) {
  // Shared tail of the MCH / NMC cases: the interval list followed by "]".
  auto dump_ranges = [](const Inst *inst) {
    const int total = inst->r.len;
    for (int k = 0; k < total; k++) {
      const Range cur = inst->r.ranges[k];
      const char *sep = (k + 1 < total) ? ", " : "";
      if (cur.start == cur.end)
        printf("'%c'%s", cur.start, sep);
      else
        printf("'%c'-'%c'%s", cur.start, cur.end, sep);
    }
    printf("]\n");
  };

  for (int line = 0;; line++) {
    const Inst *cur = program + line;
    printf("%3d: ", line);

    switch (cur->op) {
    case JMP:
      printf("JMP -> %d\n", (int)(cur->j.x));
      break;
    case FRK:
      printf("FRK -> %d , %d\n", (int)(cur->j.x), (int)(cur->j.y));
      break;
    case MCH:
      printf("MCH [");
      dump_ranges(cur);
      break;
    case NMC:
      printf("NMC [");
      dump_ranges(cur);
      break;
    case ANY:
      printf("ANY\n");
      break;
    case SVS:
      printf("SVS idx=%d\n", (unsigned char)cur->idx);
      break;
    case SVE:
      printf("SVE idx=%d\n", (unsigned char)cur->idx);
      break;
    case END:
      printf("END\n");
      return;
    default:
      printf("UNKNOWN op=%d\n", cur->op);
      return;
    }
  }
}
|
||||
|
||||
// Parses `pattern` and compiles the resulting tree into a program.
Inst *compile_regex(std::string pattern) {
  Exp *tree = regex_to_ast(pattern);
  return compile_ast(tree);
}
|
||||
|
||||
int __main() {
|
||||
// Maunally compiled program for testing
|
||||
char *buffer = (char *)malloc(29);
|
||||
strcpy(buffer, "abcdabcdabcdabcdf");
|
||||
// This loads all (excluding \0 put in by strcpy)
|
||||
Knot *root = load(buffer, 17, optimal_chunk_size(12));
|
||||
ByteIterator *it = begin_b_iter(root);
|
||||
uint32_t saved[40];
|
||||
|
||||
for (int i = 0; i < 40; i++)
|
||||
saved[i] = 0;
|
||||
|
||||
std::string pattern = "(abcd)+";
|
||||
|
||||
Inst *program = compile_regex(pattern);
|
||||
|
||||
print_program(program);
|
||||
|
||||
int result;
|
||||
while ((result = next_match(program, it, saved))) {
|
||||
printf("\nRES: %d\n", result);
|
||||
for (int i = 0; i < 40; i++)
|
||||
printf("%d, ", saved[i]);
|
||||
}
|
||||
|
||||
free(program);
|
||||
free(buffer);
|
||||
free(it->it);
|
||||
free(it);
|
||||
free(root);
|
||||
return 0;
|
||||
}
|
||||
387
src/rexambler.cpp
Normal file
387
src/rexambler.cpp
Normal file
@@ -0,0 +1,387 @@
|
||||
#include "../headers/noose.hpp"
#include <cassert>
#include <cctype>
#include <climits>
#include <string>
#include <vector>
|
||||
|
||||
Exp *parse_alternation(Parser *p);
|
||||
Exp *parse_sequence(Parser *p);
|
||||
Exp *parse_atom_with_modifiers(Parser *p);
|
||||
Exp *parse_bracket_class(Parser *p);
|
||||
|
||||
// Allocates an empty (NONE) node that is not a capture group.
Exp *make_none() {
  Exp *node = new Exp();
  node->kind = ExpKind::NONE;
  node->capture = false;
  return node;
}
|
||||
|
||||
// Allocates an ANY node that is not a capture group.
Exp *make_any() {
  Exp *node = new Exp();
  node->kind = ExpKind::ANY;
  node->capture = false;
  return node;
}
|
||||
|
||||
// Allocates a RANGE node holding a copy of `ranges`.
Exp *make_range(const std::vector<ExRange> &ranges) {
  Exp *node = new Exp();
  node->capture = false;
  node->kind = ExpKind::RANGE;
  // `ranges` is a union member, so the vector has to be constructed in place.
  new (&node->ranges) std::vector<ExRange>(ranges);
  return node;
}
|
||||
|
||||
// Allocates a RANGE node with the single interval [c, c]; `neg` becomes the
// interval's negate flag.
Exp *make_range_single(char c, bool neg = false) {
  const std::vector<ExRange> one(1, ExRange{neg, c, c});
  return make_range(one);
}
|
||||
|
||||
// Allocates an OR node with alternatives `l` and `r` (not a capture group).
Exp *make_or(Exp *l, Exp *r) {
  OpOr *operands = new OpOr{l, r};
  Exp *node = new Exp();
  node->capture = false;
  node->kind = ExpKind::OR;
  node->opor = operands;
  return node;
}
|
||||
|
||||
// Allocates a SEQ node matching `l` followed by `r` (not a capture group).
Exp *make_seq(Exp *l, Exp *r) {
  OpSeq *operands = new OpSeq{l, r};
  Exp *node = new Exp();
  node->capture = false;
  node->kind = ExpKind::SEQ;
  node->opseq = operands;
  return node;
}
|
||||
|
||||
// Returns a deep copy of the tree rooted at `e`; a null `e` yields a NONE
// node. The copy is built with the make_* helpers, so every copied node has
// capture == false, whatever the original's flag was.
Exp *clone_exp(Exp *e) {
  if (e == nullptr)
    return make_none();

  switch (e->kind) {
  case ExpKind::ANY:
    return make_any();
  case ExpKind::RANGE:
    return make_range(e->ranges);
  case ExpKind::OR: {
    Exp *lhs = clone_exp(e->opor->left);
    Exp *rhs = clone_exp(e->opor->right);
    return make_or(lhs, rhs);
  }
  case ExpKind::SEQ: {
    Exp *lhs = clone_exp(e->opseq->left);
    Exp *rhs = clone_exp(e->opseq->right);
    return make_seq(lhs, rhs);
  }
  case ExpKind::NONE:
  default:
    return make_none();
  }
}
|
||||
|
||||
// Lookahead: returns the next pattern character without consuming it, or
// '\0' at the end of the pattern.
inline char peek(Parser *p) {
  if (p->i >= p->s.size())
    return '\0';
  return p->s[p->i];
}
|
||||
|
||||
// Returns the next pattern character and advances past it; at the end of
// the pattern it returns '\0' and does not move.
inline char consume(Parser *p) {
  if (p->i >= p->s.size())
    return '\0';
  const char c = p->s[p->i];
  p->i++;
  return c;
}
|
||||
|
||||
// Parses `pattern` into a syntax tree. Never returns null: a null parse
// result is replaced by a NONE node.
Exp *regex_to_ast(std::string pattern) {
  Parser parser(pattern);
  if (Exp *tree = parse_alternation(&parser))
    return tree;
  return make_none();
}
|
||||
|
||||
// alternation := sequence ('|' sequence)*
// The alternatives are combined left to right: ((a | b) | c).
Exp *parse_alternation(Parser *p) {
  Exp *tree = parse_sequence(p);
  while (peek(p) == '|') {
    consume(p); // '|'
    Exp *alternative = parse_sequence(p);
    tree = make_or(tree, alternative);
  }
  return tree;
}
|
||||
|
||||
// sequence := atom-with-modifiers*
// Stops at the end of the pattern, at ')' or at '|'. The atoms are chained
// left to right with SEQ nodes; an empty sequence yields a NONE node.
Exp *parse_sequence(Parser *p) {
  Exp *chain = nullptr;
  while (p->i < p->s.size()) {
    const char c = peek(p);
    if (c == ')' || c == '|')
      break;
    Exp *atom = parse_atom_with_modifiers(p);
    if (atom == nullptr)
      break;
    chain = chain ? make_seq(chain, atom) : atom;
  }
  return chain ? chain : make_none();
}
|
||||
|
||||
// Parses one atom: a group, a bracket class, an escape, '!' (ANY), '.'
// (anything except newline) or a literal character.
// Returns null only when the pattern is already exhausted.
Exp *parse_atom(Parser *p) {
  if (p->i >= p->s.size())
    return nullptr;

  switch (peek(p)) {
  case '(': {
    // grouping; recurse; the group's root becomes a capture
    consume(p); // '('
    Exp *group = parse_alternation(p);
    if (peek(p) == ')')
      consume(p);
    if (group == nullptr)
      group = make_none();
    group->capture = true;
    return group;
  }
  case '[':
    return parse_bracket_class(p);
  case '\\':
    break; // escapes are handled below
  case '!':
    consume(p);
    return make_any();
  case '.':
    // unescaped dot == [^\n]
    consume(p);
    return make_range({ExRange{true, '\n', '\n'}});
  default:
    // any other character is a literal -> single-char range
    return make_range_single(consume(p), false);
  }

  consume(p); // '\\'
  if (p->i >= p->s.size())
    return make_none(); // trailing backslash
  const char esc = consume(p);

  // Interval lists shared by the lower-case class and its negated
  // upper-case form; only the negate flag differs.
  auto word_set = [](bool neg) {
    return std::vector<ExRange>{ExRange{neg, 'a', 'z'}, ExRange{neg, 'A', 'Z'},
                                ExRange{neg, '0', '9'},
                                ExRange{neg, '_', '_'}};
  };
  auto space_set = [](bool neg) {
    return std::vector<ExRange>{
        ExRange{neg, ' ', ' '},   // space
        ExRange{neg, '\t', '\t'}, // tab
        ExRange{neg, '\r', '\r'}, // CR
        ExRange{neg, '\n', '\n'}, // LF
        ExRange{neg, '\v', '\v'}, // VT
        ExRange{neg, '\f', '\f'}  // FF
    };
  };

  switch (esc) {
  case 'd':
    return make_range({ExRange{false, '0', '9'}});
  case 'D':
    return make_range({ExRange{true, '0', '9'}});
  case 'w':
    return make_range(word_set(false));
  case 'W':
    return make_range(word_set(true));
  case 's':
    return make_range(space_set(false));
  case 'S': {
    // the negated set starts with an extra [0, 0] entry
    std::vector<ExRange> set = space_set(true);
    set.insert(set.begin(), ExRange{true, 0, 0});
    return make_range(set);
  }
  default:
    // escaped literal (including "\."): a single-char range
    return make_range_single(esc, false);
  }
}
|
||||
|
||||
// Parses a bracket class such as [a-z_] or [^0-9]; the cursor must be on
// '['. A leading '^' sets the negate flag on every interval that is
// produced. Inside the class \d, \w and \s expand to their intervals, and
// any other escaped character stands for itself.
Exp *parse_bracket_class(Parser *p) {
  assert(peek(p) == '[');
  consume(p); // '['

  const bool neg = (peek(p) == '^');
  if (neg)
    consume(p);

  std::vector<ExRange> ranges;
  auto add = [&ranges, neg](char lo, char hi) {
    ranges.push_back(ExRange{neg, lo, hi});
  };

  while (p->i < p->s.size() && peek(p) != ']') {
    const char a = consume(p);

    if (a == '\\') {
      if (p->i >= p->s.size())
        break;
      const char esc = consume(p);
      switch (esc) {
      case 'd':
        add('0', '9');
        break;
      case 'w':
        add('a', 'z');
        add('A', 'Z');
        add('0', '9');
        add('_', '_');
        break;
      case 's':
        add(' ', ' ');
        add('\t', '\t');
        add('\r', '\r');
        add('\n', '\n');
        add('\v', '\v');
        add('\f', '\f');
        break;
      default:
        add(esc, esc);
        break;
      }
      continue;
    }

    // `a` starts an interval only when it is followed by '-' and a
    // character other than the closing ']'.
    const bool is_span = peek(p) == '-' && p->i + 1 < p->s.size() &&
                         p->s[p->i + 1] != ']';
    if (!is_span) {
      add(a, a); // single char
      continue;
    }
    consume(p); // '-'
    if (p->i >= p->s.size())
      break;
    const char b = consume(p);
    add(a, b);
  }

  if (peek(p) == ']')
    consume(p);
  return make_range(ranges);
}
|
||||
|
||||
// Reads a run of decimal digits at the cursor.
// Returns false and leaves `out` and the cursor untouched when the next
// character is not a digit. Otherwise consumes every digit, stores the
// value in `out` and returns true. A value too large for int is clamped to
// INT_MAX instead of overflowing (signed overflow is undefined behaviour).
bool parse_integer_opt(Parser *p, int &out) {
  if (p->i >= p->s.size() || !std::isdigit((unsigned char)peek(p)))
    return false;
  int val = 0;
  while (p->i < p->s.size() && std::isdigit((unsigned char)peek(p))) {
    const int digit = consume(p) - '0';
    if (val > (INT_MAX - digit) / 10)
      val = INT_MAX; // saturate, but keep consuming the remaining digits
    else
      val = val * 10 + digit;
  }
  out = val;
  return true;
}
|
||||
|
||||
// Parses an atom followed by any number of modifiers: '?', '*', '+' and
// '{x,y}'. Repetition is unrolled into the tree: '*' and '+' expand to at
// most 20 optional copies, and the upper bound of '{x,y}' is clamped to 20.
// Returns null when there is no atom left to parse.
Exp *parse_atom_with_modifiers(Parser *p) {
  Exp *atom = parse_atom(p);
  if (!atom)
    return nullptr;

  // Builds `n` copies of OpOr(unit, NONE) chained left to right by SEQ.
  // Returns null when n <= 0.
  auto optional_run = [](Exp *unit, int n) -> Exp * {
    Exp *chain = nullptr;
    for (int k = 0; k < n; ++k) {
      Exp *opt = make_or(clone_exp(unit), make_none());
      chain = chain ? make_seq(chain, opt) : opt;
    }
    return chain;
  };

  // modifiers may follow one another, so keep applying them
  for (;;) {
    const char mod = peek(p);

    if (mod == '?') {
      consume(p);
      atom = make_or(clone_exp(atom), make_none());
    } else if (mod == '*') {
      consume(p);
      atom = optional_run(atom, 20);
    } else if (mod == '+') {
      consume(p);
      // the optional tail is built first, then the mandatory copy
      Exp *tail = optional_run(atom, 20);
      atom = make_seq(clone_exp(atom), tail);
    } else if (mod == '{') {
      const size_t mark = p->i;
      consume(p); // '{'
      int x = 0, y = -1;
      if (!parse_integer_opt(p, x) || peek(p) != ',') {
        // malformed; roll back so '{' is parsed as a literal
        p->i = mark;
        break;
      }
      consume(p); // ','
      if (!parse_integer_opt(p, y) || peek(p) != '}') {
        p->i = mark;
        break;
      }
      consume(p); // '}'
      if (y < x)
        y = x;
      if (y > 20)
        y = 20;

      // x mandatory copies ...
      Exp *prefix = nullptr;
      for (int k = 0; k < x; ++k) {
        Exp *copy = clone_exp(atom);
        prefix = prefix ? make_seq(prefix, copy) : copy;
      }
      // ... followed by (y - x) optional ones
      Exp *suffix = optional_run(atom, y - x);

      if (!prefix)
        prefix = make_none();
      atom = suffix ? make_seq(prefix, suffix) : prefix;
    } else {
      break;
    }
  }

  return atom;
}
|
||||
149
src/rexpiler.cpp
Normal file
149
src/rexpiler.cpp
Normal file
@@ -0,0 +1,149 @@
|
||||
#include "../headers/noose.hpp"
|
||||
#include <assert.h>
|
||||
#include <cstdio>
|
||||
#include <cstdlib>
|
||||
#include <cstring>
|
||||
#include <stdio.h>
|
||||
|
||||
// Growable array of instructions plus the capture-group counter used while
// compiling.
struct InstList {
  struct Inst *data; // instruction buffer
  size_t len;        // number of instructions stored
  size_t cap;        // number of instructions the buffer can hold
  int idx;           // capture-group counter
};
|
||||
|
||||
static inline bool is_ranges_negated(std::vector<ExRange> r) {
|
||||
if (r.empty())
|
||||
return false;
|
||||
return r[0].negate;
|
||||
}
|
||||
|
||||
// Appends a copy of `*inst` to `list`. When the buffer is full it is
// doubled (the first allocation holds 32 instructions). `inst` is not
// kept or freed. Aborts when the buffer cannot grow.
static void insert_inst(InstList *list, Inst *inst) {
  if (list->len >= list->cap) {
    const size_t nc = list->cap ? list->cap * 2 : 32;
    // realloc leaves the old buffer untouched on failure, so its result
    // must not overwrite list->data before it has been checked.
    Inst *grown = (Inst *)realloc(list->data, nc * sizeof(Inst));
    if (!grown) {
      assert(0);
      abort(); // also stops builds where assert is compiled out
    }
    list->data = grown;
    list->cap = nc;
  }
  list->data[list->len++] = *inst;
}
|
||||
|
||||
// Allocates a zero-filled instruction with opcode `op`.
Inst *make_inst(Op op) {
  Inst *inst = static_cast<Inst *>(calloc(1, sizeof(Inst)));
  if (inst == nullptr)
    assert(0);
  inst->op = op;
  return inst;
}
|
||||
|
||||
void compile_exp(Exp *e, InstList *list);
|
||||
|
||||
// Emits ANY for `e`, wrapped in SVS / SVE when the node is a capture group.
// The group number is the current counter value, which is then advanced;
// this is the scheme compile_seq and compile_or use, so groups are numbered
// consistently and cannot collide.
// Instructions are built on the stack: insert_inst copies them, so nothing
// is left allocated.
void compile_any(Exp *e, InstList *list) {
  Inst any = {};
  any.op = ANY;

  if (!e->capture) {
    insert_inst(list, &any);
    return;
  }

  const int group = list->idx++;
  Inst mark = {};
  mark.idx = group;

  mark.op = SVS;
  insert_inst(list, &mark);
  insert_inst(list, &any);
  mark.op = SVE;
  insert_inst(list, &mark);
}
|
||||
|
||||
// Emits MCH (or NMC for a negated class) for the RANGE node `e`, wrapped in
// SVS / SVE when the node is a capture group.
// The interval table is a separate malloc'd array stored in the
// instruction. Group numbering follows compile_seq / compile_or: use the
// current counter value, then advance it.
void compile_range(Exp *e, InstList *list) {
  const std::vector<ExRange> &er = e->ranges; // no copy needed
  const size_t cnt = er.size();

  // At least one slot is requested so a null result always means failure.
  Range *arr = (Range *)malloc(sizeof(Range) * (cnt ? cnt : 1));
  if (!arr) {
    assert(0);
    abort();
  }
  for (size_t w = 0; w < cnt; ++w)
    arr[w] = Range{er[w].start, er[w].end};

  // Built on the stack: insert_inst stores a copy.
  Inst match = {};
  match.op = is_ranges_negated(er) ? NMC : MCH;
  match.r.ranges = arr;
  match.r.len = (int)cnt;

  if (!e->capture) {
    insert_inst(list, &match);
    return;
  }

  const int group = list->idx++;
  Inst mark = {};
  mark.idx = group;

  mark.op = SVS;
  insert_inst(list, &mark);
  insert_inst(list, &match);
  mark.op = SVE;
  insert_inst(list, &mark);
}
|
||||
|
||||
// Emits the left operand followed by the right operand of the SEQ node
// `e`, wrapped in SVS / SVE when the node is a capture group.
void compile_seq(Exp *e, InstList *list) {
  // The group number is fixed before the operands are compiled. Nested
  // groups advance list->idx, so recomputing it afterwards (as the
  // original did with idx - 1) would give SVE a different group than SVS.
  int group = -1;
  if (e->capture) {
    group = list->idx++;
    Inst sv = {}; // on the stack: insert_inst stores a copy
    sv.op = SVS;
    sv.idx = group;
    insert_inst(list, &sv);
  }

  compile_exp(e->opseq->left, list);
  compile_exp(e->opseq->right, list);

  if (e->capture) {
    Inst se = {};
    se.op = SVE;
    se.idx = group;
    insert_inst(list, &se);
  }
}
|
||||
|
||||
// Emits the OR node `e` as
//     FRK L, R
//  L: <left>
//     JMP out
//  R: <right>
//  out:
// wrapped in SVS / SVE when the node is a capture group.
void compile_or(Exp *e, InstList *list) {
  // The group number is fixed here; nested groups advance list->idx, so
  // deriving it again after the operands would mismatch SVS and SVE.
  int group = -1;
  if (e->capture) {
    group = list->idx++;
    Inst sv = {}; // on the stack: insert_inst stores a copy
    sv.op = SVS;
    sv.idx = group;
    insert_inst(list, &sv);
  }

  // FRK and JMP are emitted with placeholder targets and patched once the
  // positions of both branches are known.
  Inst frk = {};
  frk.op = FRK;
  frk.j.x = -1;
  frk.j.y = -1;
  const int frk_idx = (int)list->len;
  insert_inst(list, &frk);

  const int left_start = (int)list->len;
  compile_exp(e->opor->left, list);

  Inst jmp = {};
  jmp.op = JMP;
  const int jmp_idx = (int)list->len;
  insert_inst(list, &jmp);

  const int right_start = (int)list->len;
  compile_exp(e->opor->right, list);

  // Patch through list->data: the buffer may have moved while growing.
  list->data[frk_idx].j.x = left_start;
  list->data[frk_idx].j.y = right_start;
  list->data[jmp_idx].j.x = (int)list->len;

  if (e->capture) {
    Inst se = {};
    se.op = SVE;
    se.idx = group;
    insert_inst(list, &se);
  }
}
|
||||
|
||||
// Dispatches on the node kind and appends the node's code to `list`.
// A NONE node emits nothing.
void compile_exp(Exp *e, InstList *list) {
  const ExpKind kind = e->kind;
  if (kind == ExpKind::ANY)
    compile_any(e, list);
  else if (kind == ExpKind::RANGE)
    compile_range(e, list);
  else if (kind == ExpKind::SEQ)
    compile_seq(e, list);
  else if (kind == ExpKind::OR)
    compile_or(e, list);
}
|
||||
|
||||
// Compiles the tree rooted at `root` into an instruction array terminated
// by END and returns it. The array is allocated with realloc. The interval
// tables referenced by MCH / NMC instructions are separate allocations.
Inst *compile_ast(Exp *root) {
  // the capture-group counter starts at 1
  InstList list = {nullptr, 0, 0, 1};
  compile_exp(root, &list);

  // Built on the stack: insert_inst stores a copy, so the heap instruction
  // the original created here was never freed.
  Inst end = {};
  end.op = END;
  insert_inst(&list, &end);
  return list.data;
}
|
||||
853
src/rope.cpp
Normal file
853
src/rope.cpp
Normal file
@@ -0,0 +1,853 @@
|
||||
#include "../headers/rope.hpp"
|
||||
#include "../headers/noose.hpp"
|
||||
#include <assert.h>
|
||||
#include <cmath>
|
||||
#include <cstdint>
|
||||
#include <cstdlib>
|
||||
#include <cstring>
|
||||
#include <pcre2.h>
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
|
||||
// Recomputes the cached metadata of an internal node (char_count,
// line_count, depth, chunk_size) from its children.
// Null nodes and leaves (depth == 0) are left untouched: a leaf's counts
// describe its own data buffer and are maintained where the data changes.
static void update(Knot *n) {
  if (!n || n->depth == 0)
    return;
  Knot *l = n->left;
  Knot *r = n->right;
  uint32_t left_chars = l ? l->char_count : 0;
  uint32_t right_chars = r ? r->char_count : 0;
  n->char_count = left_chars + right_chars;
  uint32_t left_lines = l ? l->line_count : 0;
  uint32_t right_lines = r ? r->line_count : 0;
  n->line_count = left_lines + right_lines;
  uint8_t left_depth = l ? l->depth : 0;
  uint8_t right_depth = r ? r->depth : 0;
  n->depth = MAX(left_depth, right_depth) + 1;
  // Inherit the chunk size from whichever child exists. With no children at
  // all the previous value is kept instead of dereferencing a null pointer.
  if (l)
    n->chunk_size = l->chunk_size;
  else if (r)
    n->chunk_size = r->chunk_size;
}
|
||||
|
||||
// str is not consumed and \0 is not handled
|
||||
// So if str is null terminated then len must be strlen(str)
|
||||
// and freed by caller
|
||||
Knot *load(char *str, uint32_t len, uint32_t chunk_size) {
|
||||
if (len > (uint32_t)(chunk_size - (chunk_size / 16))) {
|
||||
Knot *left = load(str, len / 2, chunk_size);
|
||||
Knot *right = load(str + len / 2, len - len / 2, chunk_size);
|
||||
Knot *node = (Knot *)malloc(sizeof(Knot));
|
||||
if (!node)
|
||||
return nullptr;
|
||||
node->left = left;
|
||||
node->right = right;
|
||||
node->chunk_size = chunk_size;
|
||||
node->depth = MAX(left->depth, right->depth) + 1;
|
||||
node->char_count = left->char_count + right->char_count;
|
||||
node->line_count = left->line_count + right->line_count;
|
||||
return node;
|
||||
} else {
|
||||
Knot *node = (Knot *)malloc(sizeof(Knot) + chunk_size);
|
||||
if (!node)
|
||||
return nullptr;
|
||||
node->left = nullptr;
|
||||
node->right = nullptr;
|
||||
node->chunk_size = chunk_size;
|
||||
node->depth = 0;
|
||||
node->char_count = len;
|
||||
uint32_t newline_count = 0;
|
||||
for (uint32_t i = 0; i < len; i++) {
|
||||
char c = str[i];
|
||||
node->data[i] = c;
|
||||
if (c == '\n')
|
||||
newline_count++;
|
||||
}
|
||||
node->line_count = newline_count;
|
||||
return node;
|
||||
}
|
||||
}
|
||||
|
||||
// leaf if consumed and freed (so dont use or free it after)
|
||||
// left and right are the new nodes
|
||||
static void split_leaf(Knot *leaf, uint32_t k, Knot **left, Knot **right) {
|
||||
Knot *left_node = (Knot *)malloc(sizeof(Knot) + leaf->chunk_size);
|
||||
left_node->left = nullptr;
|
||||
left_node->right = nullptr;
|
||||
left_node->chunk_size = leaf->chunk_size;
|
||||
left_node->depth = 0;
|
||||
left_node->char_count = k;
|
||||
uint32_t newline_count = 0;
|
||||
for (uint32_t i = 0; i < k; i++) {
|
||||
char c = leaf->data[i];
|
||||
left_node->data[i] = c;
|
||||
if (c == '\n')
|
||||
newline_count++;
|
||||
}
|
||||
left_node->line_count = newline_count;
|
||||
uint16_t right_line_count = leaf->line_count - newline_count;
|
||||
*left = left_node;
|
||||
Knot *right_node = (Knot *)malloc(sizeof(Knot) + leaf->chunk_size);
|
||||
right_node->left = nullptr;
|
||||
right_node->right = nullptr;
|
||||
right_node->chunk_size = leaf->chunk_size;
|
||||
right_node->depth = 0;
|
||||
right_node->char_count = leaf->char_count - k;
|
||||
right_node->line_count = right_line_count;
|
||||
for (uint32_t i = k; i < leaf->char_count; i++) {
|
||||
char c = leaf->data[i];
|
||||
right_node->data[i - k] = c;
|
||||
}
|
||||
*right = right_node;
|
||||
free(leaf);
|
||||
}
|
||||
|
||||
// This makes node nonsensical, so dont use or free it after
|
||||
void split(Knot *node, uint32_t offset, Knot **left, Knot **right) {
|
||||
if (!node) {
|
||||
*left = nullptr;
|
||||
*right = nullptr;
|
||||
return;
|
||||
}
|
||||
if (node->depth == 0) {
|
||||
split_leaf(node, offset, left, right);
|
||||
return;
|
||||
}
|
||||
uint32_t left_size = node->left ? node->left->char_count : 0;
|
||||
if (offset < left_size) {
|
||||
Knot *L = nullptr, *R = nullptr;
|
||||
split(node->left, offset, &L, &R);
|
||||
node->left = R;
|
||||
update(node);
|
||||
*right = node;
|
||||
*left = L;
|
||||
} else {
|
||||
uint32_t new_offset = offset - left_size;
|
||||
Knot *L = nullptr, *R = nullptr;
|
||||
split(node->right, new_offset, &L, &R);
|
||||
node->right = L;
|
||||
update(node);
|
||||
*left = node;
|
||||
*right = R;
|
||||
}
|
||||
}
|
||||
|
||||
// Depth of the left child minus depth of the right child (0 for a null
// node). Positive means the node is left-heavy.
static inline int get_balance_factor(Knot *n) {
  return n ? (int)DEPTH(n->left) - (int)DEPTH(n->right) : 0;
}
|
||||
|
||||
// Right rotation: the left child of `y` becomes the root of this subtree,
// `y` becomes its right child, and the pivot's former right subtree is
// re-attached as the left child of `y`. Returns the new subtree root.
static inline Knot *rotate_right(Knot *y) {
  Knot *pivot = y->left;
  Knot *orphan = pivot->right;
  y->left = orphan;
  pivot->right = y;
  // `y` is now the lower node, so its metadata must be refreshed first.
  update(y);
  update(pivot);
  return pivot;
}
|
||||
|
||||
// Left rotation: the right child of `x` becomes the root of this subtree,
// `x` becomes its left child, and the pivot's former left subtree is
// re-attached as the right child of `x`. Returns the new subtree root.
static inline Knot *rotate_left(Knot *x) {
  Knot *pivot = x->right;
  Knot *orphan = pivot->left;
  x->right = orphan;
  pivot->left = x;
  // `x` is now the lower node, so its metadata must be refreshed first.
  update(x);
  update(pivot);
  return pivot;
}
|
||||
|
||||
// Technically n can be used after calling
|
||||
// but use return value instead
|
||||
Knot *balance(Knot *n) {
|
||||
update(n);
|
||||
int bal = get_balance_factor(n);
|
||||
if (bal > 1) {
|
||||
if (get_balance_factor(n->left) < 0)
|
||||
n->left = rotate_left(n->left);
|
||||
return rotate_right(n);
|
||||
}
|
||||
if (bal < -1) {
|
||||
if (get_balance_factor(n->right) > 0)
|
||||
n->right = rotate_right(n->right);
|
||||
return rotate_left(n);
|
||||
}
|
||||
return n;
|
||||
}
|
||||
|
||||
// Dont free left or right after calling (only free return value)
|
||||
// Assumes both ropes have equal chunk sizes
|
||||
Knot *concat(Knot *left, Knot *right) {
|
||||
if (!left)
|
||||
return right;
|
||||
if (!right)
|
||||
return left;
|
||||
if (!left || left->char_count == 0) {
|
||||
if (left)
|
||||
free_rope(left);
|
||||
return right;
|
||||
}
|
||||
if (!right || right->char_count == 0) {
|
||||
if (right)
|
||||
free_rope(right);
|
||||
return left;
|
||||
}
|
||||
if (left->depth == 0 && right->depth == 0) {
|
||||
if (left->char_count + right->char_count <= left->chunk_size) {
|
||||
Knot *node = (Knot *)malloc(sizeof(Knot) + left->chunk_size);
|
||||
node->left = nullptr;
|
||||
node->right = nullptr;
|
||||
node->chunk_size = left->chunk_size;
|
||||
node->depth = 0;
|
||||
node->char_count = left->char_count + right->char_count;
|
||||
node->line_count = left->line_count + right->line_count;
|
||||
memcpy(node->data, left->data, left->char_count);
|
||||
memcpy(node->data + left->char_count, right->data, right->char_count);
|
||||
free(left);
|
||||
free(right);
|
||||
return node;
|
||||
}
|
||||
}
|
||||
uint16_t d_left = left->depth;
|
||||
uint16_t d_right = right->depth;
|
||||
if (d_left > d_right + 1) {
|
||||
left->right = concat(left->right, right);
|
||||
return balance(left);
|
||||
}
|
||||
if (d_right > d_left + 1) {
|
||||
right->left = concat(left, right->left);
|
||||
return balance(right);
|
||||
}
|
||||
Knot *node = (Knot *)malloc(sizeof(Knot));
|
||||
if (!node)
|
||||
return nullptr;
|
||||
node->left = left;
|
||||
node->right = right;
|
||||
node->chunk_size = left->chunk_size;
|
||||
node->depth = MAX(d_left, d_right) + 1;
|
||||
update(node);
|
||||
return node;
|
||||
}
|
||||
|
||||
// This makes node nonsensical, so dont use or free it after
|
||||
// Instead, free the return value or use it in node's place
|
||||
Knot *insert(Knot *node, uint32_t offset, char *str, uint32_t len) {
|
||||
if (!node)
|
||||
return nullptr;
|
||||
if (node->depth == 0 && node->char_count + len <= node->chunk_size) {
|
||||
if (offset < node->char_count)
|
||||
memmove(node->data + offset + len, node->data + offset,
|
||||
node->char_count - offset);
|
||||
memcpy(node->data + offset, str, len);
|
||||
node->char_count += len;
|
||||
for (uint32_t i = 0; i < len; i++)
|
||||
if (str[i] == '\n')
|
||||
node->line_count++;
|
||||
return node;
|
||||
}
|
||||
if (node->depth > 0) {
|
||||
uint32_t left_count = node->left ? node->left->char_count : 0;
|
||||
if (offset < left_count) {
|
||||
Knot *new_left = insert(node->left, offset, str, len);
|
||||
node->left = new_left;
|
||||
update(node);
|
||||
return balance(node);
|
||||
} else {
|
||||
Knot *new_right = insert(node->right, offset - left_count, str, len);
|
||||
node->right = new_right;
|
||||
update(node);
|
||||
return balance(node);
|
||||
}
|
||||
}
|
||||
Knot *left_part = nullptr;
|
||||
Knot *right_part = nullptr;
|
||||
split(node, offset, &left_part, &right_part);
|
||||
Knot *middle_part = load(str, len, node->chunk_size);
|
||||
return concat(concat(left_part, middle_part), right_part);
|
||||
}
|
||||
|
||||
// This makes node nonsensical, so dont use or free it after
|
||||
// Instead, free the return value or use it in node's place
|
||||
Knot *erase(Knot *node, uint32_t offset, uint32_t len) {
|
||||
if (!node || len == 0 || offset >= node->char_count)
|
||||
return node;
|
||||
if (offset + len > node->char_count)
|
||||
len = node->char_count - offset;
|
||||
if (node->depth == 0) {
|
||||
uint32_t deleted_newlines = 0;
|
||||
for (uint32_t i = offset; i < offset + len; i++)
|
||||
if (node->data[i] == '\n')
|
||||
deleted_newlines++;
|
||||
node->line_count -= deleted_newlines;
|
||||
if (offset + len < node->char_count)
|
||||
memmove(node->data + offset, node->data + offset + len,
|
||||
node->char_count - (offset + len));
|
||||
node->char_count -= len;
|
||||
return node;
|
||||
}
|
||||
uint32_t left_count = node->left ? node->left->char_count : 0;
|
||||
if (offset + len <= left_count) {
|
||||
node->left = erase(node->left, offset, len);
|
||||
} else if (offset >= left_count) {
|
||||
node->right = erase(node->right, offset - left_count, len);
|
||||
} else {
|
||||
Knot *left = nullptr, *middle = nullptr, *right = nullptr;
|
||||
split(node, offset, &left, &right);
|
||||
split(right, len, &middle, &right);
|
||||
free_rope(middle);
|
||||
return concat(left, right);
|
||||
}
|
||||
update(node);
|
||||
return balance(node);
|
||||
}
|
||||
|
||||
// Copies `len` bytes starting at byte `offset` of the subtree into `dest`.
// No bounds checks are done here; the caller is expected to pass a range
// that lies inside the subtree (read() clamps it at the root).
static void _read_into(Knot *node, uint32_t offset, uint32_t len, char *dest) {
  if (!node || len == 0)
    return;
  if (node->depth == 0) {
    memcpy(dest, node->data + offset, len);
    return;
  }

  Knot *lhs = node->left;
  const uint32_t lhs_chars = lhs ? lhs->char_count : 0;

  if (offset >= lhs_chars) {
    // The range starts in the right child; rebase the offset onto it.
    offset -= lhs_chars;
  } else {
    // Take what the left child can provide, then continue at the start of
    // the right child with whatever is still missing.
    uint32_t take = lhs_chars - offset;
    if (take > len)
      take = len;
    _read_into(lhs, offset, take, dest);
    dest += take;
    len -= take;
    offset = 0;
  }

  if (len > 0 && node->right)
    _read_into(node->right, offset, len, dest);
}
|
||||
|
||||
// Returns a newly allocated, NUL-terminated copy of up to `len` bytes of
// the rope starting at byte `offset`. The caller frees the result.
//
// The range is clamped to the end of the rope; an offset at or past the end
// yields an empty string. Returns nullptr for a null rope or when the
// allocation fails.
char *read(Knot *root, uint32_t offset, uint32_t len) {
  if (!root)
    return nullptr;
  if (offset >= root->char_count) {
    char *empty = (char *)malloc(1);
    if (empty)
      empty[0] = '\0';
    return empty;
  }
  // Clamp by subtraction: "offset + len" could wrap around for a large len
  // (e.g. a caller asking for "everything") and slip past the check.
  const uint32_t available = root->char_count - offset;
  if (len > available)
    len = available;
  char *buffer = (char *)malloc((size_t)len + 1);
  if (!buffer)
    return nullptr;
  _read_into(root, offset, len, buffer);
  buffer[len] = '\0';
  return buffer;
}
|
||||
|
||||
// Hopefully free the tree only once at the end of its use using the pointer
|
||||
// from the last insert or concat or erase call.
|
||||
// (or use twice if last call was split - for both left and right).
|
||||
void free_rope(Knot *root) {
|
||||
if (!root)
|
||||
return;
|
||||
free_rope(root->left);
|
||||
free_rope(root->right);
|
||||
free(root);
|
||||
}
|
||||
|
||||
// Returns the byte offset, relative to the start of this subtree, of its
// n-th '\n' (n is 0-based), or UINT32_MAX if the subtree holds no such
// newline.
static uint32_t find_nth_newline_offset(Knot *node, uint32_t n) {
  // n is 0-based, so n == line_count is already out of range. Rejecting it
  // here avoids a pointless descent and leaf scan that cannot succeed.
  if (!node || n >= node->line_count)
    return UINT32_MAX;
  if (node->depth == 0) {
    uint32_t count = 0;
    for (uint32_t i = 0; i < node->char_count; i++) {
      if (node->data[i] == '\n') {
        if (count == n)
          return i;
        count++;
      }
    }
    return UINT32_MAX;
  }
  uint32_t left_lines = node->left ? node->left->line_count : 0;
  if (n < left_lines)
    return find_nth_newline_offset(node->left, n);
  // The newline is in the right child: look for it there and shift the
  // result by the size of the left child.
  uint32_t right_offset = find_nth_newline_offset(node->right, n - left_lines);
  if (right_offset == UINT32_MAX)
    return UINT32_MAX;
  uint32_t left_chars = node->left ? node->left->char_count : 0;
  return left_chars + right_offset;
}
|
||||
|
||||
// Returns the 0-based line number that byte `offset` falls on, i.e. the
// number of '\n' bytes located before that offset. A newline byte itself
// counts as part of the line it terminates. Offsets at or past the end
// return the total newline count; a null rope returns 0.
uint32_t byte_to_line(Knot *node, uint32_t offset) {
  uint32_t lines = 0; // newlines in everything already skipped on the left
  while (node) {
    if (offset >= node->char_count)
      return lines + node->line_count;

    if (node->depth == 0) {
      // offset < char_count is guaranteed here, so scanning up to offset
      // stays inside the leaf.
      for (uint32_t i = 0; i < offset; i++)
        if (node->data[i] == '\n')
          lines++;
      return lines;
    }

    Knot *lhs = node->left;
    const uint32_t lhs_chars = lhs ? lhs->char_count : 0;
    if (offset < lhs_chars) {
      node = lhs;
    } else {
      // Skip the whole left child: count its lines, rebase the offset.
      lines += lhs ? lhs->line_count : 0;
      offset -= lhs_chars;
      node = node->right;
    }
  }
  return lines;
}
|
||||
|
||||
// Returns the byte offset at which 0-based line `line` starts. If `out_len`
// is non-null it receives the length of that line including its trailing
// '\n' (the last line may have none).
// A line past the end of the rope starts at char_count with length 0; a
// null rope gives offset 0 and length 0.
uint32_t line_to_byte(Knot *node, uint32_t line, uint32_t *out_len) {
  if (!node) {
    if (out_len)
      *out_len = 0;
    return 0;
  }

  // A line begins right after the newline that ends the previous line.
  uint32_t begin = 0;
  if (line != 0) {
    const uint32_t prev_newline = find_nth_newline_offset(node, line - 1);
    begin = (prev_newline == UINT32_MAX) ? node->char_count : prev_newline + 1;
  }

  // It ends right after its own newline, or at the end of the rope.
  const uint32_t own_newline = find_nth_newline_offset(node, line);
  const uint32_t finish =
      (own_newline == UINT32_MAX) ? node->char_count : own_newline + 1;

  if (out_len)
    *out_len = (finish > begin) ? finish - begin : 0;
  return begin;
}
|
||||
|
||||
// Creates a line iterator positioned at the start of 0-based line
// `start_line`. Returns nullptr for a null rope, a line past the end, or an
// allocation failure. The caller frees the iterator with free().
//
// The iterator keeps the path from the root to the current leaf on its
// stack (the leaf itself on top) so that iter_advance_leaf() can move on to
// the following leaf.
LineIterator *begin_l_iter(Knot *root, uint32_t start_line) {
  if (!root || start_line > root->line_count)
    return nullptr;
  LineIterator *it = (LineIterator *)malloc(sizeof(LineIterator));
  if (!it)
    return nullptr;
  it->top = 0;
  it->line = start_line;
  it->node = nullptr;
  it->offset = 0;

  // Line k starts right after the k-th newline (counting from 1), so walk
  // down to the leaf that contains that newline. Line 0 has no preceding
  // newline and therefore resolves to the first leaf, which also covers a
  // rope that consists of a single leaf.
  Knot *curr = root;
  uint32_t relative_line = start_line;
  while (curr) {
    it->stack[it->top++] = curr;
    if (!curr->left && !curr->right) {
      it->node = curr;
      break;
    }
    uint32_t left_lines = curr->left ? curr->left->line_count : 0;
    // "<=" rather than "<": when the wanted newline is the last one of the
    // left subtree, the line begins inside (or right at the end of) that
    // subtree. Going right would skip the unterminated tail of the left
    // subtree.
    if (curr->left && relative_line <= left_lines) {
      curr = curr->left;
    } else {
      relative_line -= left_lines;
      curr = curr->right;
    }
  }
  if (!it->node) {
    free(it);
    return nullptr;
  }

  // Inside the leaf, step past the newline that ends the previous line.
  // If that newline is the last byte of the leaf, offset equals char_count
  // and next_line() moves on to the following leaf.
  if (relative_line > 0) {
    uint32_t found_newlines = 0;
    for (uint32_t i = 0; i < it->node->char_count; i++) {
      if (it->node->data[i] == '\n') {
        found_newlines++;
        if (found_newlines == relative_line) {
          it->offset = i + 1;
          break;
        }
      }
    }
  }
  return it;
}
|
||||
|
||||
// Moves the line iterator to the leaf that follows the current one in
// left-to-right order and resets its offset to 0. When there is no further
// leaf, it->node becomes nullptr.
static inline void iter_advance_leaf(LineIterator *it) {
  if (it->top == 0) {
    it->node = nullptr;
    return;
  }

  // Climb until an ancestor is reached through its left child and has a
  // right child that has not been visited yet.
  Knot *child = it->stack[--it->top];
  while (it->top > 0) {
    Knot *parent = it->stack[it->top - 1];
    if (parent->left == child && parent->right) {
      // Descend to the first leaf below that right child, preferring left
      // links and recording the path on the stack.
      for (Knot *walk = parent->right; walk;
           walk = walk->left ? walk->left : walk->right) {
        it->stack[it->top++] = walk;
        if (!walk->left && !walk->right) {
          it->node = walk;
          it->offset = 0;
          return;
        }
      }
    }
    child = it->stack[--it->top];
  }

  it->node = nullptr;
}
|
||||
|
||||
// Returns the next line as a newly allocated, NUL-terminated string that
// includes the trailing '\n' when the line has one; the caller frees it.
// Returns nullptr when the iterator is exhausted or an allocation fails.
// A line may span several leaves; its pieces are gathered into one buffer.
char *next_line(LineIterator *it) {
  if (!it || !it->node)
    return nullptr;

  size_t cap = 128;
  size_t used = 0;
  char *out = (char *)malloc(cap);
  if (!out)
    return nullptr;

  for (;;) {
    if (!it->node)
      break;
    if (it->offset >= it->node->char_count) {
      // Current leaf is used up; continue in the next one, if any.
      iter_advance_leaf(it);
      if (!it->node)
        break;
    }

    char *from = it->node->data + it->offset;
    char *limit = it->node->data + it->node->char_count;
    char *newline_at = (char *)memchr(from, '\n', limit - from);
    const bool line_done = newline_at != nullptr;
    // Take up to and including the newline, or the rest of the leaf.
    const size_t piece =
        line_done ? (size_t)(newline_at - from) + 1 : (size_t)(limit - from);

    // Keep one byte in reserve for the terminating '\0'.
    if (used + piece + 1 > cap) {
      cap = (cap * 2) + piece;
      char *grown = (char *)realloc(out, cap);
      if (!grown) {
        free(out);
        return nullptr;
      }
      out = grown;
    }

    memcpy(out + used, from, piece);
    used += piece;
    it->offset += piece;

    if (line_done) {
      out[used] = '\0';
      it->line++;
      return out;
    }
  }

  // The rope ended without a newline: hand out what was gathered, if any.
  if (used == 0) {
    free(out);
    return nullptr;
  }
  out[used] = '\0';
  it->line++;
  return out;
}
|
||||
|
||||
// Creates a leaf iterator positioned on the first leaf of the rope.
// The path from the root down to that leaf is recorded on the iterator's
// stack, with the leaf itself on top. Returns nullptr for a null rope or
// when the allocation fails; the caller frees the iterator with free().
LeafIterator *begin_k_iter(Knot *root) {
  if (!root)
    return nullptr;
  LeafIterator *it = (LeafIterator *)malloc(sizeof(LeafIterator));
  if (!it)
    return nullptr;
  it->top = 0;

  // Walk down, taking the left child whenever there is one and the right
  // child otherwise, until a node without children is reached.
  for (Knot *curr = root; curr; curr = curr->left ? curr->left : curr->right) {
    it->stack[it->top++] = curr;
    if (!curr->left && !curr->right) {
      it->node = curr;
      return it;
    }
  }

  free(it);
  return nullptr;
}
|
||||
|
||||
// Caller must never free the returned string
|
||||
char *next_leaf(LeafIterator *it) {
|
||||
if (!it || !it->node)
|
||||
return nullptr;
|
||||
char *data_to_return = it->node->data;
|
||||
data_to_return[it->node->char_count] = '\0';
|
||||
Knot *prev_leaf = it->node;
|
||||
Knot *parent = nullptr;
|
||||
while (it->top > 0) {
|
||||
parent = it->stack[--it->top];
|
||||
if (parent->right && parent->right != prev_leaf) {
|
||||
Knot *curr = parent->right;
|
||||
while (curr) {
|
||||
it->stack[it->top++] = curr;
|
||||
if (!curr->left && !curr->right) {
|
||||
it->node = curr;
|
||||
return data_to_return;
|
||||
}
|
||||
curr = curr->left;
|
||||
if (!curr)
|
||||
curr = it->stack[it->top - 1]->right;
|
||||
}
|
||||
}
|
||||
prev_leaf = parent;
|
||||
}
|
||||
it->node = nullptr;
|
||||
return data_to_return;
|
||||
}
|
||||
|
||||
// Creates a byte iterator over the rope, built on top of a leaf iterator.
// No leaf is loaded yet; the first next_byte() call fetches the first one.
// Returns nullptr when the iterator itself cannot be allocated. For a null
// rope the embedded leaf iterator is nullptr.
ByteIterator *begin_b_iter(Knot *root) {
  ByteIterator *b_it = (ByteIterator *)malloc(sizeof(ByteIterator));
  if (!b_it)
    return nullptr;
  b_it->it = begin_k_iter(root);
  b_it->offset_g = 0; // bytes consumed in all previous leaves
  b_it->offset_l = 0; // position inside the current leaf
  b_it->char_count = 0;
  b_it->data = nullptr;
  return b_it;
}
|
||||
|
||||
// Returns the next byte of the rope, or '\0' once the rope is exhausted
// (further calls keep returning '\0').
//
// offset_l is the read position inside the current leaf; offset_g is the
// number of bytes consumed in all previous leaves.
char next_byte(ByteIterator *it) {
  if (it->data && it->offset_l < it->char_count)
    return it->data[it->offset_l++];

  // The current leaf (if any) is used up: account for it and fetch the next
  // leaf that actually holds data.
  it->offset_g += it->offset_l;
  it->offset_l = 0;
  it->char_count = 0;
  it->data = nullptr;

  while (it->it && it->it->node) {
    // Read the length before next_leaf() moves the leaf iterator on. Using
    // char_count rather than strlen() also keeps the count right for data
    // that contains '\0' bytes.
    const uint32_t count = it->it->node->char_count;
    char *data = next_leaf(it->it);
    if (!data)
      break;
    if (count == 0)
      continue; // an empty leaf must not be mistaken for end of input
    it->data = data;
    it->char_count = count;
    it->offset_l = 1;
    return data[0];
  }

  // No leaf left. next_leaf() returns nullptr here, which must not be
  // passed to strlen().
  return '\0';
}
|
||||
|
||||
// Searches the rope for every match of the PCRE2 `pattern` and returns the
// matches as (absolute byte offset, length) pairs in the order found.
//
// The rope is fed to pcre2_dfa_match() one leaf at a time with partial
// matching enabled, so that a match may start in one leaf and finish in a
// later one. Returns an empty vector when the pattern does not compile or
// when the match data block or the leaf iterator cannot be created.
std::vector<std::pair<size_t, size_t>> search_rope(Knot *root,
                                                   const char *pattern) {
  std::vector<std::pair<size_t, size_t>> results;
  int errorcode;
  PCRE2_SIZE erroffset;
  pcre2_code *re = pcre2_compile((PCRE2_SPTR)pattern, PCRE2_ZERO_TERMINATED, 0,
                                 &errorcode, &erroffset, nullptr);
  if (!re) {
    fprintf(stderr, "PCRE2 compile error: %d\n", errorcode);
    return results;
  }
  pcre2_match_data *mdata = pcre2_match_data_create(128, nullptr);
  if (!mdata) {
    // Without a match data block the matcher cannot report offsets.
    pcre2_code_free(re);
    return results;
  }
  int workspace[PCRE_WORKSPACE_SIZE];
  LeafIterator *it = begin_k_iter(root);
  if (!it) {
    pcre2_code_free(re);
    pcre2_match_data_free(mdata);
    return results;
  }
  size_t chunk_abs_offset = 0;  // absolute offset of the current leaf
  size_t saved_match_start = 0; // start of a match begun in an earlier leaf
  bool match_in_progress = false;
  int flags = PCRE2_PARTIAL_SOFT;
  while (1) {
    const char *chunk_start = next_leaf(it);
    if (!chunk_start)
      break;
    size_t chunk_len = strlen(chunk_start);
    const char *current_ptr = chunk_start;
    size_t remaining_len = chunk_len;
    while (remaining_len > 0) {
      int rc =
          pcre2_dfa_match(re, (PCRE2_SPTR)current_ptr, remaining_len, 0, flags,
                          mdata, nullptr, workspace, PCRE_WORKSPACE_SIZE);
      if (rc >= 0) {
        // Complete match. ov[0] / ov[1] are relative to current_ptr.
        PCRE2_SIZE *ov = pcre2_get_ovector_pointer(mdata);
        size_t match_start_abs;
        size_t match_end_abs;
        if (match_in_progress) {
          // The match began in an earlier leaf; only its end is in this one.
          match_start_abs = saved_match_start;
          match_end_abs =
              chunk_abs_offset + (current_ptr - chunk_start) + ov[1];
        } else {
          match_start_abs =
              chunk_abs_offset + (current_ptr - chunk_start) + ov[0];
          match_end_abs =
              chunk_abs_offset + (current_ptr - chunk_start) + ov[1];
        }
        size_t total_len = match_end_abs - match_start_abs;
        results.push_back(std::make_pair(match_start_abs, total_len));
        // Resume after the match; always advance by at least one byte so
        // an empty match cannot stall the loop.
        size_t consumed = ov[1];
        if (consumed == 0)
          consumed = 1;
        current_ptr += consumed;
        if (consumed > remaining_len)
          remaining_len = 0;
        else
          remaining_len -= consumed;
        match_in_progress = false;
        flags = PCRE2_PARTIAL_SOFT | PCRE2_NOTBOL;
        continue;
      } else if (rc == PCRE2_ERROR_PARTIAL) {
        // The leaf ended in the middle of a possible match: remember where
        // it started and continue it with the next leaf.
        PCRE2_SIZE *ov = pcre2_get_ovector_pointer(mdata);
        if (!match_in_progress) {
          saved_match_start =
              chunk_abs_offset + (current_ptr - chunk_start) + ov[0];
          match_in_progress = true;
        }
        flags |= PCRE2_DFA_RESTART;
        flags |= PCRE2_NOTBOL;
        break;
      } else {
        if (match_in_progress) {
          // The carried-over partial match did not complete: drop it and
          // retry this leaf from one byte further on.
          match_in_progress = false;
          flags = PCRE2_PARTIAL_SOFT | PCRE2_NOTBOL;
          current_ptr++;
          remaining_len--;
        } else {
          // Nothing more to find in the rest of this leaf.
          break;
        }
        // if (rc != PCRE2_ERROR_NOMATCH) {} // handle error
      }
    }
    chunk_abs_offset += chunk_len;
    if (!match_in_progress)
      flags = PCRE2_PARTIAL_SOFT | PCRE2_NOTBOL;
  }
  pcre2_match_data_free(mdata);
  pcre2_code_free(re);
  free(it);
  return results;
}
|
||||
|
||||
uint32_t optimal_chunk_size(uint64_t length) {
|
||||
if (length <= MIN_CHUNK_SIZE)
|
||||
return MIN_CHUNK_SIZE;
|
||||
double target_exponent = MIN(std::log2((double)MAX_CHUNK_SIZE),
|
||||
7.0 + (std::log2((double)length) - 10.0) * 0.25);
|
||||
uint32_t final_chunk_size =
|
||||
MAX((uint32_t)MIN_CHUNK_SIZE, (uint32_t)std::pow(2.0, target_exponent));
|
||||
final_chunk_size = MIN(final_chunk_size, (uint32_t)MAX_CHUNK_SIZE);
|
||||
final_chunk_size = 1U << (32 - __builtin_clz(final_chunk_size - 1));
|
||||
return final_chunk_size;
|
||||
}
|
||||
|
||||
// Basic correctness test & usage example
|
||||
int _main() {
|
||||
char *buffer = (char *)malloc(44 * 4 + 5);
|
||||
strcpy(buffer, "The quick brown fox jumps over the lazy dog.\n\
|
||||
The quick brown fox jumps over the lazy dog.\n\
|
||||
The quick brown fox jumps over the lazy dog.\n\
|
||||
The quick brown fox jumps over the lazy dog.");
|
||||
// This loads all (excluding \0 put in by strcpy)
|
||||
Knot *root = load(buffer, 44 * 4 + 3, optimal_chunk_size(44 * 4 + 3));
|
||||
Knot *left = nullptr, *right = nullptr;
|
||||
// Splits root into left and right (root is no longer valid)
|
||||
split(root, 5, &left, &right);
|
||||
// simple read based on byte offset and length
|
||||
char *s1 = read(left, 0, 100);
|
||||
printf("%s\n:\n", s1);
|
||||
char *s2 = read(right, 0, 100);
|
||||
printf("%s\n;\n", s2);
|
||||
free(s1);
|
||||
free(s2);
|
||||
// Recombines left and right into root (both can
|
||||
// be valid or invalid in optimized cases)
|
||||
// they are to not be used after concat
|
||||
root = concat(left, right);
|
||||
// root should be set to return value from insert always
|
||||
root = insert(root, 5, buffer, 5);
|
||||
free(buffer);
|
||||
char *s3 = read(root, 0, 100);
|
||||
printf("%s\n,\n", s3);
|
||||
// Similar to insert but for erase
|
||||
root = erase(root, 5, 5);
|
||||
char *s4 = read(root, 0, 100);
|
||||
printf("%s\n.\n", s4);
|
||||
free(s3);
|
||||
free(s4);
|
||||
uint32_t byte_offset;
|
||||
uint32_t len;
|
||||
// Byte offset given reltive to how it would
|
||||
// be in a file offset + len includes the \n
|
||||
// at the end of the line (or nothing is EOF)
|
||||
byte_offset = line_to_byte(root, 2, &len);
|
||||
char *s5 = read(root, byte_offset, len);
|
||||
printf("%s\n'\n", s5);
|
||||
free(s5);
|
||||
// returns line number of which line that
|
||||
// byte position would be in.
|
||||
// the ending \n position is included in this
|
||||
uint32_t line = byte_to_line(root, byte_offset + len - 1);
|
||||
printf("%u\n:\n", line);
|
||||
// From second line onwards (0 indexed)
|
||||
LineIterator *it = begin_l_iter(root, 0);
|
||||
char *c = nullptr;
|
||||
while ((c = next_line(it)) != nullptr) {
|
||||
printf("%s :wow:\n", c);
|
||||
free(c);
|
||||
}
|
||||
free(it);
|
||||
printf("\n/\n");
|
||||
// Starts at first byte (to be used for regex search)
|
||||
ByteIterator *it2 = begin_b_iter(root);
|
||||
|
||||
uint32_t saved[40];
|
||||
|
||||
for (int i = 0; i < 40; i++)
|
||||
saved[i] = 0;
|
||||
|
||||
std::string pattern = "f.x";
|
||||
|
||||
Inst *program = compile_regex(pattern);
|
||||
|
||||
bool result;
|
||||
while ((result = next_match(program, it2, saved))) {
|
||||
printf("\nRES: %d\n", result);
|
||||
for (int i = 0; i < 40; i++)
|
||||
printf("%d, ", saved[i]);
|
||||
}
|
||||
|
||||
// char c2 = ' ';
|
||||
// while ((c2 = next_byte(it2)) != '\0')
|
||||
// printf("%c :wow!:\n", c2);
|
||||
// free(it2);
|
||||
// search // uses leaf iterator internally // PCRE2 based
|
||||
std::vector<std::pair<size_t, size_t>> matches = search_rope(root, "f.x");
|
||||
for (size_t i = 0; i < matches.size(); i++)
|
||||
printf("\n%lu %lu", matches[i].first, matches[i].second);
|
||||
// A rope needs to be freed only once if last action on the rope is
|
||||
// insert or concat or erase.
|
||||
// for splits we need to free both left and right separately
|
||||
free_rope(root);
|
||||
return 0;
|
||||
}
|
||||
283
src/test.cpp
Normal file
283
src/test.cpp
Normal file
@@ -0,0 +1,283 @@
|
||||
#include "../headers/noose.hpp"
|
||||
#include "../headers/rope.hpp"
|
||||
#include <chrono>
|
||||
#include <cstdio>
|
||||
#include <cstdlib>
|
||||
#include <cstring>
|
||||
#include <fstream>
|
||||
#include <iostream>
|
||||
#include <string>
|
||||
|
||||
// Reads the whole file at `path` into a newly allocated buffer.
//
// On success returns the buffer (not NUL-terminated; the caller frees it)
// and stores the number of bytes read in *out_len. An empty file yields a
// valid buffer with *out_len == 0. On failure prints a diagnostic with
// perror() and returns nullptr, leaving *out_len untouched.
char *load_file(const char *path, size_t *out_len) {
  FILE *f = fopen(path, "rb");
  if (!f) {
    perror("fopen");
    return nullptr;
  }

  // ftell() reports errors as -1, which must not end up in a size_t.
  long size = -1;
  if (fseek(f, 0, SEEK_END) == 0)
    size = ftell(f);
  if (size < 0) {
    perror("ftell");
    fclose(f);
    return nullptr;
  }
  rewind(f);
  size_t len = (size_t)size;

  // malloc(0) may legitimately return nullptr; ask for at least one byte
  // so that an empty file is not reported as an allocation failure.
  char *buf = (char *)malloc(len ? len : 1);
  if (!buf) {
    perror("malloc");
    fclose(f);
    return nullptr;
  }

  size_t got = fread(buf, 1, len, f);
  if (got != len && ferror(f)) {
    perror("fread");
    free(buf);
    fclose(f);
    return nullptr;
  }
  fclose(f);

  // Report what was actually read, in case the file shrank in between.
  *out_len = got;
  return buf;
}
|
||||
|
||||
int main() {
|
||||
|
||||
printf("My rope implementation benchmark\n");
|
||||
|
||||
{
|
||||
size_t len;
|
||||
printf("Loading file into rope...\n");
|
||||
char *buf = load_file("./random.bin", &len);
|
||||
auto start = std::chrono::high_resolution_clock::now();
|
||||
Knot *root = load(buf, len, 2);
|
||||
auto end = std::chrono::high_resolution_clock::now();
|
||||
printf("Load time: %.3f s\n",
|
||||
std::chrono::duration<double>(end - start).count());
|
||||
|
||||
free(buf);
|
||||
|
||||
// READ TEST
|
||||
printf("Testing read...\n");
|
||||
start = std::chrono::high_resolution_clock::now();
|
||||
char *content = read(root, len / 2, 1024);
|
||||
end = std::chrono::high_resolution_clock::now();
|
||||
free(content);
|
||||
printf("Read 1 KB from middle: %.6f s\n",
|
||||
std::chrono::duration<double>(end - start).count());
|
||||
|
||||
// INSERT TEST
|
||||
printf("Testing insert...\n");
|
||||
char insert_data[1024];
|
||||
memset(insert_data, 'X', 1024);
|
||||
start = std::chrono::high_resolution_clock::now();
|
||||
root = insert(root, len / 2, insert_data, 1024);
|
||||
end = std::chrono::high_resolution_clock::now();
|
||||
printf("Insert 1 KB in middle: %.6f s\n",
|
||||
std::chrono::duration<double>(end - start).count());
|
||||
|
||||
// ERASE TEST (Delete the same 1 KB we just inserted)
|
||||
printf("Testing erase...\n");
|
||||
start = std::chrono::high_resolution_clock::now();
|
||||
root = erase(root, len / 2, 1024);
|
||||
end = std::chrono::high_resolution_clock::now();
|
||||
printf("Erase 1 KB in middle: %.6f s\n",
|
||||
std::chrono::duration<double>(end - start).count());
|
||||
|
||||
// SPLIT TEST
|
||||
printf("Testing split...\n");
|
||||
Knot *left = nullptr, *right = nullptr;
|
||||
start = std::chrono::high_resolution_clock::now();
|
||||
split(root, len / 2, &left, &right);
|
||||
end = std::chrono::high_resolution_clock::now();
|
||||
printf("Split at middle: %.6f s\n",
|
||||
std::chrono::duration<double>(end - start).count());
|
||||
|
||||
// CONCAT TEST
|
||||
printf("Testing concat...\n");
|
||||
start = std::chrono::high_resolution_clock::now();
|
||||
root = concat(left, right);
|
||||
end = std::chrono::high_resolution_clock::now();
|
||||
printf("Concat: %.6f s\n",
|
||||
std::chrono::duration<double>(end - start).count());
|
||||
|
||||
// ---------------------------------------------------------
|
||||
// LINE OPERATIONS TESTS
|
||||
// ---------------------------------------------------------
|
||||
printf("Testing line operations...\n");
|
||||
|
||||
// KNOWN CONSTANTS based on: yes "The quick brown fox jumps over the lazy
|
||||
// dog." String length: 44 + 1 newline = 45 bytes per line.
|
||||
const uint32_t BYTES_PER_LINE = 45;
|
||||
const uint32_t TEST_LINE_INDEX = 1000; // A line deep in the file
|
||||
|
||||
// 1. Test byte_to_line
|
||||
// We pick a byte in the middle of TEST_LINE_INDEX.
|
||||
// Offset = (100000 * 45) + 10.
|
||||
uint32_t test_offset = (TEST_LINE_INDEX * BYTES_PER_LINE) + 10;
|
||||
|
||||
start = std::chrono::high_resolution_clock::now();
|
||||
uint16_t calculated_line = byte_to_line(root, test_offset);
|
||||
end = std::chrono::high_resolution_clock::now();
|
||||
|
||||
printf("byte_to_line (%u -> %u): %.6f s ", test_offset, calculated_line,
|
||||
std::chrono::duration<double>(end - start).count());
|
||||
|
||||
if (calculated_line == TEST_LINE_INDEX) {
|
||||
printf("[PASS]\n");
|
||||
} else {
|
||||
printf("[FAIL] Expected %u, got %u\n", TEST_LINE_INDEX, calculated_line);
|
||||
}
|
||||
|
||||
// 2. Test line_to_byte
|
||||
// We ask for the start of TEST_LINE_INDEX. Should be exactly
|
||||
// TEST_LINE_INDEX * 45.
|
||||
uint32_t out_len = 0;
|
||||
uint32_t expected_start = TEST_LINE_INDEX * BYTES_PER_LINE;
|
||||
|
||||
start = std::chrono::high_resolution_clock::now();
|
||||
uint32_t calculated_start = line_to_byte(root, TEST_LINE_INDEX, &out_len);
|
||||
end = std::chrono::high_resolution_clock::now();
|
||||
|
||||
printf("line_to_byte (Line %u -> Offset %u): %.6f s ", TEST_LINE_INDEX,
|
||||
calculated_start,
|
||||
std::chrono::duration<double>(end - start).count());
|
||||
|
||||
if (calculated_start == expected_start && out_len == BYTES_PER_LINE) {
|
||||
printf("[PASS]\n");
|
||||
} else {
|
||||
printf("[FAIL] Expected offset %u (len %u), got %u (len %u)\n",
|
||||
expected_start, BYTES_PER_LINE, calculated_start, out_len);
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------
|
||||
// ITERATOR SPEED TEST
|
||||
// ---------------------------------------------------------
|
||||
printf("Testing iterator speed...\n");
|
||||
|
||||
const uint32_t LINES_TO_ITERATE = 10000; // Iterate 10,000 lines
|
||||
|
||||
// 1. Initialize the iterator. NOTE(review): begin_k_iter() is given only the root, so start_line (computed just below) is not used to position it.
|
||||
uint32_t start_line = TEST_LINE_INDEX + 10;
|
||||
|
||||
LeafIterator *it = begin_k_iter(root);
|
||||
if (!it) {
|
||||
printf("Iterator Test: [FAIL] begin_iterator returned NULL.\n");
|
||||
} else {
|
||||
char *line = NULL;
|
||||
uint32_t lines_read = 0;
|
||||
|
||||
start = std::chrono::high_resolution_clock::now();
|
||||
|
||||
// 2. Iterate and time the process
|
||||
// We use the clean C idiom: get the line, check for NULL, then
|
||||
// process.
|
||||
while (lines_read < LINES_TO_ITERATE && (line = next_leaf(it)) != NULL) {
|
||||
// Note: We deliberately skip printing to focus on the Rope operation
|
||||
// time.
|
||||
lines_read++;
|
||||
}
|
||||
|
||||
end = std::chrono::high_resolution_clock::now();
|
||||
|
||||
double elapsed_time = std::chrono::duration<double>(end - start).count();
|
||||
|
||||
printf("Iterator speed (f:: %u): %.6f s (%.2f lines/s)\n", lines_read,
|
||||
elapsed_time, (double)lines_read / elapsed_time);
|
||||
|
||||
if (lines_read == LINES_TO_ITERATE) {
|
||||
printf("Iterator Test: [PASS] Successfully iterated %u lines.\n",
|
||||
LINES_TO_ITERATE);
|
||||
} else {
|
||||
printf("Iterator Test: [FAIL] Expected %u lines, read %u.\n",
|
||||
LINES_TO_ITERATE, lines_read);
|
||||
}
|
||||
|
||||
// 3. Clean up the iterator
|
||||
free(it);
|
||||
}
|
||||
|
||||
// search test
|
||||
|
||||
start = std::chrono::high_resolution_clock::now();
|
||||
std::vector<std::pair<size_t, size_t>> matches = search_rope(root, "f.x");
|
||||
end = std::chrono::high_resolution_clock::now();
|
||||
printf("Search Time: %.6f s\n",
|
||||
std::chrono::duration<double>(end - start).count());
|
||||
printf("Found %lu matches\n", matches.size());
|
||||
|
||||
char *c = read(root, 0, 1000);
|
||||
printf("%s\n", c);
|
||||
free(c);
|
||||
|
||||
ByteIterator *it1 = begin_b_iter(root);
|
||||
char ch;
|
||||
while ((ch = next_byte(it1)) != '\0') {
|
||||
printf("%c:", ch);
|
||||
}
|
||||
|
||||
ByteIterator *it2 = begin_b_iter(root);
|
||||
uint32_t saved[40];
|
||||
for (int i = 0; i < 40; i++)
|
||||
saved[i] = 0;
|
||||
std::string pattern = "f.x";
|
||||
Inst *program = compile_regex(pattern);
|
||||
bool result;
|
||||
int count = 0;
|
||||
start = std::chrono::high_resolution_clock::now();
|
||||
while ((result = next_match(program, it2, saved))) {
|
||||
count++;
|
||||
printf("%d\n", count);
|
||||
}
|
||||
end = std::chrono::high_resolution_clock::now();
|
||||
printf("Search Time: %.6f s\n",
|
||||
std::chrono::duration<double>(end - start).count());
|
||||
printf("Found2 %d matches\n", count);
|
||||
|
||||
free_rope(root);
|
||||
}
|
||||
|
||||
printf("Testing std::string...\n");
|
||||
|
||||
{
|
||||
std::ifstream file("random.bin", std::ios::binary | std::ios::ate);
|
||||
if (!file) {
|
||||
perror("ifstream");
|
||||
return 1;
|
||||
}
|
||||
size_t len = file.tellg();
|
||||
file.seekg(0);
|
||||
std::string data(len, '\0');
|
||||
file.read(data.data(), len);
|
||||
|
||||
std::string s = data;
|
||||
|
||||
auto start = std::chrono::high_resolution_clock::now();
|
||||
// READ: middle 1 KB
|
||||
std::string read_chunk = s.substr(len / 2, 1024);
|
||||
auto end = std::chrono::high_resolution_clock::now();
|
||||
printf("std::string read 1 KB from middle: %.6f s\n",
|
||||
std::chrono::duration<double>(end - start).count());
|
||||
|
||||
// INSERT: middle 1 KB
|
||||
std::string insert_data(1024, 'X');
|
||||
start = std::chrono::high_resolution_clock::now();
|
||||
s.insert(len / 2, insert_data);
|
||||
end = std::chrono::high_resolution_clock::now();
|
||||
printf("std::string insert 1 KB in middle: %.6f s\n",
|
||||
std::chrono::duration<double>(end - start).count());
|
||||
|
||||
// ERASE: middle 1 KB
|
||||
start = std::chrono::high_resolution_clock::now();
|
||||
s.erase(len / 2, 1024);
|
||||
end = std::chrono::high_resolution_clock::now();
|
||||
printf("std::string erase 1 KB in middle: %.6f s\n",
|
||||
std::chrono::duration<double>(end - start).count());
|
||||
|
||||
// SPLIT: middle
|
||||
start = std::chrono::high_resolution_clock::now();
|
||||
std::string left = s.substr(0, len / 2);
|
||||
std::string right = s.substr(len / 2);
|
||||
end = std::chrono::high_resolution_clock::now();
|
||||
printf("std::string split at middle: %.6f s\n",
|
||||
std::chrono::duration<double>(end - start).count());
|
||||
|
||||
// CONCAT
|
||||
start = std::chrono::high_resolution_clock::now();
|
||||
s = left + right;
|
||||
end = std::chrono::high_resolution_clock::now();
|
||||
printf("std::string concat: %.6f s\n",
|
||||
std::chrono::duration<double>(end - start).count());
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user