diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..ba8b160 --- /dev/null +++ b/.gitignore @@ -0,0 +1,3 @@ +tst +*.bin +*.vim diff --git a/api/noose.hpp b/api/noose.hpp index 10b5339..01730a7 100644 --- a/api/noose.hpp +++ b/api/noose.hpp @@ -79,9 +79,24 @@ struct Inst { int idx; }; +struct Thread { + Inst *pc; + uint32_t saved[40]; /* $0 through $9 */ +}; + +struct ThreadList { + Thread *t; + int n; +}; + Exp *regex_to_ast(std::string pattern); +void free_exp(Exp *exp); Inst *compile_ast(Exp *root); Inst *compile_regex(std::string pattern); -int next_match(Inst *prog, ByteIterator *it, uint32_t *saved); +int proglen(Inst *prog); +void free_program(Inst *instructions); +int next_match(Inst *prog, ByteIterator *it, uint32_t *saved, ThreadList *clist, + ThreadList *nlist); +void print_program(Inst *program); #endif diff --git a/src/noose.cpp b/src/noose.cpp index 7952097..072586b 100644 --- a/src/noose.cpp +++ b/src/noose.cpp @@ -3,7 +3,6 @@ #include #include #include -#include #include // VM - pass 2 @@ -19,11 +18,6 @@ bool test_ranges(char inp, Range *ranges, int len) { // Use pike vm method -struct Thread { - Inst *pc; - uint32_t saved[40]; /* $0 through $9 */ -}; - Thread thread(Inst *pc, uint32_t *saved) { Thread t; t.pc = pc; @@ -32,16 +26,6 @@ Thread thread(Inst *pc, uint32_t *saved) { return t; } -struct ThreadList { - Thread *t; - int n; -}; - -void handle_end(uint32_t *tsaved, uint32_t *saved) { - for (int i = 0; i < 40; i++) - saved[i] = tsaved[i]; -} - bool addstate(Inst *prog, ThreadList *list, Thread t, int count) { if (t.pc->op == JMP) { if (addstate(prog, list, thread(prog + t.pc->j.x, t.saved), count)) @@ -66,7 +50,6 @@ bool addstate(Inst *prog, ThreadList *list, Thread t, int count) { return true; return false; } else if (t.pc->op == END) { - handle_end(t.saved, t.saved); return true; } else { for (int i = 0; i < list->n; i++) @@ -97,24 +80,13 @@ void inline free_list(ThreadList *list) { free(list); } -int next_match(Inst *prog, ByteIterator *it, uint32_t *saved) { - int len; - ThreadList *clist, *nlist; +int next_match(Inst *prog, ByteIterator *it, uint32_t *saved, ThreadList *clist, + ThreadList *nlist) { Thread t; - - len = proglen(prog); - clist = (ThreadList *)malloc(sizeof(ThreadList)); - clist->t = (Thread *)malloc(+sizeof(Thread) * len); - clist->n = 0; - nlist = (ThreadList *)malloc(sizeof(ThreadList)); - nlist->t = (Thread *)malloc(+sizeof(Thread) * len); - nlist->n = 0; char sp; int count = 0; - addstate(prog, clist, thread(prog, saved), count); for (sp = next_byte(it); sp != '\0'; sp = next_byte(it)) { - printf("%c", sp); for (int i = 0; i < clist->n; i++) { t = clist->t[i]; switch (t.pc->op) { @@ -148,14 +120,11 @@ int next_match(Inst *prog, ByteIterator *it, uint32_t *saved) { break; } } + clear(clist); swap(clist, nlist); - clear(nlist); count++; } - free_list(clist); - free_list(nlist); - return false; // Reached EOF without a match } @@ -235,38 +204,41 @@ void print_program(Inst *program) { } Inst *compile_regex(std::string pattern) { - return compile_ast(regex_to_ast(pattern)); + Exp *ast = regex_to_ast(pattern); + Inst *program = compile_ast(ast); + free_exp(ast); + return program; } -int __main() { - // Maunally compiled program for testing - char *buffer = (char *)malloc(29); - strcpy(buffer, "abcdabcdabcdabcdf"); - // This loads all (excluding \0 put in by strcpy) - Knot *root = load(buffer, 17, optimal_chunk_size(12)); - ByteIterator *it = begin_b_iter(root); - uint32_t saved[40]; - - for (int i = 0; i < 40; i++) - saved[i] = 0; - - std::string pattern = "(abcd)+"; - - Inst *program = compile_regex(pattern); - - print_program(program); - - int result; - while ((result = next_match(program, it, saved))) { - printf("\nRES: %d\n", result); - for (int i = 0; i < 40; i++) - printf("%d, ", saved[i]); - } - - free(program); - free(buffer); - free(it->it); - free(it); - free(root); - return 0; -} +// int __main() { +// // Maunally compiled program for testing +// char *buffer = (char *)malloc(29); +// strcpy(buffer, "abcdabcdabcdabcdf"); +// // This loads all (excluding \0 put in by strcpy) +// Knot *root = load(buffer, 17, optimal_chunk_size(12)); +// ByteIterator *it = begin_b_iter(root); +// uint32_t saved[40]; +// +// for (int i = 0; i < 40; i++) +// saved[i] = 0; +// +// std::string pattern = "(abcd)+"; +// +// Inst *program = compile_regex(pattern); +// +// print_program(program); +// +// int result; +// while ((result = next_match(program, it, saved))) { +// printf("\nRES: %d\n", result); +// for (int i = 0; i < 40; i++) +// printf("%d, ", saved[i]); +// } +// +// free(program); +// free(buffer); +// free(it->it); +// free(it); +// free(root); +// return 0; +// } diff --git a/src/rexambler.cpp b/src/rexambler.cpp index 695c1e7..f79814c 100644 --- a/src/rexambler.cpp +++ b/src/rexambler.cpp @@ -9,21 +9,21 @@ Exp *parse_atom_with_modifiers(Parser *p); Exp *parse_bracket_class(Parser *p); Exp *make_none() { - Exp *e = new Exp(); + Exp *e = (Exp *)malloc(sizeof(Exp)); e->capture = false; e->kind = ExpKind::NONE; return e; } Exp *make_any() { - Exp *e = new Exp(); + Exp *e = (Exp *)malloc(sizeof(Exp)); e->capture = false; e->kind = ExpKind::ANY; return e; } Exp *make_range(const std::vector &ranges) { - Exp *e = new Exp(); + Exp *e = (Exp *)malloc(sizeof(Exp)); e->capture = false; e->kind = ExpKind::RANGE; new (&e->ranges) std::vector(ranges); @@ -37,10 +37,10 @@ Exp *make_range_single(char c, bool neg = false) { } Exp *make_or(Exp *l, Exp *r) { - OpOr *o = new OpOr(); + OpOr *o = (OpOr *)malloc(sizeof(OpOr)); o->left = l; o->right = r; - Exp *e = new Exp(); + Exp *e = (Exp *)malloc(sizeof(Exp)); e->capture = false; e->kind = ExpKind::OR; e->opor = o; @@ -48,10 +48,10 @@ Exp *make_or(Exp *l, Exp *r) { } Exp *make_seq(Exp *l, Exp *r) { - OpSeq *o = new OpSeq(); + OpSeq *o = (OpSeq *)malloc(sizeof(OpSeq)); o->left = l; o->right = r; - Exp *e = new Exp(); + Exp *e = (Exp *)malloc(sizeof(Exp)); e->capture = false; e->kind = ExpKind::SEQ; e->opseq = o; @@ -95,6 +95,23 @@ Exp *regex_to_ast(std::string pattern) { return res ? res : make_none(); } +void free_exp(Exp *exp) { + if (!exp) + return; + if (exp->kind == ExpKind::OR) { + free_exp(exp->opor->left); + free_exp(exp->opor->right); + free(exp->opor); + } else if (exp->kind == ExpKind::SEQ) { + free_exp(exp->opseq->left); + free_exp(exp->opseq->right); + free(exp->opseq); + } else if (exp->kind == ExpKind::RANGE) { + exp->ranges.~vector(); + } + free(exp); +} + Exp *parse_alternation(Parser *p) { std::vector parts; parts.push_back(parse_sequence(p)); @@ -302,37 +319,41 @@ Exp *parse_atom_with_modifiers(Parser *p) { if (!atom) return nullptr; - // apply possibly multiple modifiers in sequence while (true) { if (peek(p) == '?') { consume(p); - // OpOr(atom, NONE) - atom = make_or(clone_exp(atom), make_none()); + Exp *old = atom; + atom = make_or(clone_exp(old), make_none()); + free_exp(old); } else if (peek(p) == '*') { consume(p); - // Expand to 20 repeating OpOr(atom, NONE) chained by SEQ as literal - // tree + Exp *old = atom; + // Expand to 20 repeating OpOr(atom, NONE) chained by SEQ as literal tree Exp *unit_or = nullptr; for (int t = 0; t < 20; ++t) { - Exp *op = make_or(clone_exp(atom), make_none()); + Exp *op = make_or(clone_exp(old), make_none()); if (!unit_or) unit_or = op; else unit_or = make_seq(unit_or, op); } atom = unit_or ? unit_or : make_none(); + free_exp(old); } else if (peek(p) == '+') { consume(p); + Exp *old = atom; // First the atom, then 20 OpOr(atom, NONE) sequence Exp *rest = nullptr; for (int t = 0; t < 20; ++t) { - Exp *op = make_or(clone_exp(atom), make_none()); + Exp *op = make_or(clone_exp(old), make_none()); if (!rest) rest = op; else rest = make_seq(rest, op); } - atom = rest ? make_seq(clone_exp(atom), rest) : clone_exp(atom); + Exp *new_atom = rest ? make_seq(clone_exp(old), rest) : clone_exp(old); + atom = new_atom; + free_exp(old); } else if (peek(p) == '{') { // parse {x,y} size_t save = p->i; @@ -355,18 +376,19 @@ Exp *parse_atom_with_modifiers(Parser *p) { y = x; if (y > 20) y = 20; // clamp to 20 as requested + + Exp *old = atom; // Build x copies of atom concatenated, then (y-x) OpOr(atom, NONE) - // chained Exp *prefix = nullptr; for (int k = 0; k < x; ++k) { if (!prefix) - prefix = clone_exp(atom); + prefix = clone_exp(old); else - prefix = make_seq(prefix, clone_exp(atom)); + prefix = make_seq(prefix, clone_exp(old)); } Exp *suffix = nullptr; for (int k = 0; k < (y - x); ++k) { - Exp *op = make_or(clone_exp(atom), make_none()); + Exp *op = make_or(clone_exp(old), make_none()); if (!suffix) suffix = op; else @@ -374,10 +396,14 @@ Exp *parse_atom_with_modifiers(Parser *p) { } if (!prefix) prefix = make_none(); + Exp *new_atom = nullptr; if (!suffix) - atom = prefix; + new_atom = prefix; else - atom = make_seq(prefix, suffix); + new_atom = make_seq(prefix, suffix); + + atom = new_atom; + free_exp(old); } else { break; } diff --git a/src/rexpiler.cpp b/src/rexpiler.cpp index c64191b..9599422 100644 --- a/src/rexpiler.cpp +++ b/src/rexpiler.cpp @@ -25,6 +25,7 @@ static void insert_inst(InstList *list, Inst *inst) { list->cap = nc; } list->data[list->len++] = *inst; + free(inst); } Inst *make_inst(Op op) { @@ -122,6 +123,18 @@ void compile_or(Exp *e, InstList *list) { } } +void free_program(Inst *instructions) { + if (!instructions) + return; + Inst *current = instructions; + while (current->op != END) { + if (current->op == MCH || current->op == NMC) + free(current->r.ranges); + current++; // Assuming sequential memory layout + } + free(instructions); +} + void compile_exp(Exp *e, InstList *list) { switch (e->kind) { case ExpKind::NONE: diff --git a/src/rope.cpp b/src/rope.cpp index ea0c263..884343d 100644 --- a/src/rope.cpp +++ b/src/rope.cpp @@ -259,7 +259,7 @@ Knot *insert(Knot *node, uint32_t offset, char *str, uint32_t len) { Knot *left_part = nullptr; Knot *right_part = nullptr; split(node, offset, &left_part, &right_part); - Knot *middle_part = load(str, len, node->chunk_size); + Knot *middle_part = load(str, len, left_part->chunk_size); return concat(concat(left_part, middle_part), right_part); } @@ -647,12 +647,17 @@ char next_byte(ByteIterator *it) { it->offset_g += it->offset_l; it->offset_l = 1; char *data = next_leaf(it->it); - it->char_count = strlen(data); - it->data = data; - if (it->data) - return *it->data; - else + if (!data) return '\0'; + it->char_count = strlen(data); + while (it->char_count <= 0) { + data = next_leaf(it->it); + if (!data) + return '\0'; + it->char_count = strlen(data); + } + it->data = data; + return *it->data; } } @@ -828,15 +833,15 @@ The quick brown fox jumps over the lazy dog."); std::string pattern = "f.x"; - Inst *program = compile_regex(pattern); - - bool result; - while ((result = next_match(program, it2, saved))) { - printf("\nRES: %d\n", result); - for (int i = 0; i < 40; i++) - printf("%d, ", saved[i]); - } - + // Inst *program = compile_regex(pattern); + // + // bool result; + // while ((result = next_match(program, it2, saved))) { + // printf("\nRES: %d\n", result); + // for (int i = 0; i < 40; i++) + // printf("%d, ", saved[i]); + // } + // // char c2 = ' '; // while ((c2 = next_byte(it2)) != '\0') // printf("%c :wow!:\n", c2); diff --git a/src/test.cpp b/src/test.cpp index c87de40..3a4077e 100644 --- a/src/test.cpp +++ b/src/test.cpp @@ -41,7 +41,7 @@ int main() { printf("Loading file into rope...\n"); char *buf = load_file("./random.bin", &len); auto start = std::chrono::high_resolution_clock::now(); - Knot *root = load(buf, len, 2); + Knot *root = load(buf, len, optimal_chunk_size(len)); auto end = std::chrono::high_resolution_clock::now(); printf("Load time: %.3f s\n", std::chrono::duration(end - start).count()); @@ -191,40 +191,56 @@ int main() { // search test start = std::chrono::high_resolution_clock::now(); - std::vector> matches = search_rope(root, "f.x"); + std::vector> matches = + search_rope(root, "[A-Z][a-z]+"); end = std::chrono::high_resolution_clock::now(); printf("Search Time: %.6f s\n", std::chrono::duration(end - start).count()); printf("Found %lu matches\n", matches.size()); - char *c = read(root, 0, 1000); - printf("%s\n", c); - free(c); + // char *c = read(root, 0, 1000); + // printf("%s\n", c); + // free(c); - ByteIterator *it1 = begin_b_iter(root); - char ch; - while ((ch = next_byte(it1)) != '\0') { - printf("%c:", ch); - } + // ByteIterator *it1 = begin_b_iter(root); + // char ch; + // while ((ch = next_byte(it1)) != '\0') { + // printf("%c:", ch); + // } ByteIterator *it2 = begin_b_iter(root); uint32_t saved[40]; for (int i = 0; i < 40; i++) saved[i] = 0; - std::string pattern = "f.x"; + std::string pattern = "[A-Z][a-z]+"; Inst *program = compile_regex(pattern); + print_program(program); bool result; + int prolen = proglen(program); + ThreadList *clist = (ThreadList *)malloc(sizeof(ThreadList)); + clist->t = (Thread *)malloc(+sizeof(Thread) * prolen); + clist->n = 0; + ThreadList *nlist = (ThreadList *)malloc(sizeof(ThreadList)); + nlist->t = (Thread *)malloc(+sizeof(Thread) * prolen); + nlist->n = 0; int count = 0; start = std::chrono::high_resolution_clock::now(); - while ((result = next_match(program, it2, saved))) { + while ((result = next_match(program, it2, saved, clist, nlist))) { count++; - printf("%d\n", count); } end = std::chrono::high_resolution_clock::now(); printf("Search Time: %.6f s\n", std::chrono::duration(end - start).count()); printf("Found2 %d matches\n", count); + free_program(program); + free(it2->it); + free(it2); + free(clist->t); + free(nlist->t); + free(clist); + free(nlist); + free_rope(root); }