From 7f728a5284802e24361ecde0f314cfd1f88ab94f Mon Sep 17 00:00:00 2001 From: retoor Date: Sun, 4 Jan 2026 01:58:43 +0100 Subject: [PATCH] chore: update c, h, md files --- .gitignore | 3 +- CHANGELOG.md | 4 +- Makefile | 41 +++- README.md | 97 +++++--- include/ast.h | 6 +- include/lexer.h | 4 +- include/loreg.h | 45 ---- include/lorex.h | 45 ++++ include/matcher.h | 16 +- include/nfa.h | 30 ++- include/parser.h | 10 +- include/repl.h | 4 +- src/ast.c | 28 ++- src/loreg.c | 71 ------ src/lorex.c | 71 ++++++ src/main.c | 20 +- src/matcher.c | 492 +++++++++++++++++++++++++++++++++------ src/nfa.c | 406 ++++++++++++++++++++++++++++---- src/parser.c | 34 +-- src/repl.c | 32 +-- tests/benchmark.c | 448 +++++++++++++++++++++++++++++++++++ tests/test_all.c | 250 ++++++++++---------- tests/test_integration.c | 253 +++++++++++++++++++- tests/test_matcher.c | 62 ++--- tests/test_nfa.c | 4 +- tests/test_parser.c | 4 +- 26 files changed, 1967 insertions(+), 513 deletions(-) delete mode 100644 include/loreg.h create mode 100644 include/lorex.h delete mode 100644 src/loreg.c create mode 100644 src/lorex.c create mode 100644 tests/benchmark.c diff --git a/.gitignore b/.gitignore index a2e4133..c33ad5c 100644 --- a/.gitignore +++ b/.gitignore @@ -6,12 +6,13 @@ build/ *.dylib # Binary -loreg +lorex # Coverage *.gcov *.gcda *.gcno +coverage/ # Profiling gmon.out diff --git a/CHANGELOG.md b/CHANGELOG.md index 8cef0f7..49ddb1f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,5 +6,5 @@ update c, h, md files -**Changes:** 25 files, 4449 lines -**Languages:** C (3989 lines), Markdown (181 lines), Other (186 lines), YAML (93 lines) +**Changes:** 25 files, 2474 lines +**Languages:** C (2333 lines), Markdown (97 lines), Other (44 lines) diff --git a/Makefile b/Makefile index 48118f5..0c07e0f 100644 --- a/Makefile +++ b/Makefile @@ -14,19 +14,20 @@ SRC_DIR = src INC_DIR = include BUILD_DIR = build TEST_DIR = tests +COV_DIR = coverage SRCS = $(SRC_DIR)/lexer.c $(SRC_DIR)/ast.c $(SRC_DIR)/parser.c \ - $(SRC_DIR)/nfa.c $(SRC_DIR)/matcher.c $(SRC_DIR)/loreg.c \ + $(SRC_DIR)/nfa.c $(SRC_DIR)/matcher.c $(SRC_DIR)/lorex.c \ $(SRC_DIR)/repl.c $(SRC_DIR)/main.c LIB_SRCS = $(SRC_DIR)/lexer.c $(SRC_DIR)/ast.c $(SRC_DIR)/parser.c \ - $(SRC_DIR)/nfa.c $(SRC_DIR)/matcher.c $(SRC_DIR)/loreg.c + $(SRC_DIR)/nfa.c $(SRC_DIR)/matcher.c $(SRC_DIR)/lorex.c OBJS = $(patsubst $(SRC_DIR)/%.c,$(BUILD_DIR)/%.o,$(SRCS)) LIB_OBJS = $(patsubst $(SRC_DIR)/%.c,$(BUILD_DIR)/%.o,$(LIB_SRCS)) -TARGET = loreg -LIB_TARGET = libloreg.a +TARGET = lorex +LIB_TARGET = liblorex.a TEST_SRCS = $(TEST_DIR)/test_lexer.c $(TEST_DIR)/test_parser.c \ $(TEST_DIR)/test_nfa.c $(TEST_DIR)/test_matcher.c \ @@ -36,7 +37,7 @@ TEST_BINS = $(BUILD_DIR)/test_lexer $(BUILD_DIR)/test_parser \ $(BUILD_DIR)/test_nfa $(BUILD_DIR)/test_matcher \ $(BUILD_DIR)/test_all $(BUILD_DIR)/test_integration -.PHONY: all clean test debug coverage profile valgrind help install +.PHONY: all clean test debug coverage lcov profile valgrind help install all: $(BUILD_DIR) $(TARGET) @@ -73,6 +74,9 @@ $(BUILD_DIR)/test_all: $(TEST_DIR)/test_all.c $(LIB_SRCS) | $(BUILD_DIR) $(BUILD_DIR)/test_integration: $(TEST_DIR)/test_integration.c $(LIB_SRCS) | $(BUILD_DIR) $(CC) $(CFLAGS_DEBUG) $(INCLUDES) $< $(LIB_SRCS) -o $@ +$(BUILD_DIR)/benchmark: $(TEST_DIR)/benchmark.c $(LIB_SRCS) | $(BUILD_DIR) + $(CC) -O3 -march=native $(INCLUDES) $< $(LIB_SRCS) -o $@ + test: $(TEST_BINS) @echo "running lexer tests..." @$(BUILD_DIR)/test_lexer @@ -105,6 +109,17 @@ coverage: clean $(BUILD_DIR) @mv *.gcda $(BUILD_DIR)/coverage/ 2>/dev/null || true @mv *.gcno $(BUILD_DIR)/coverage/ 2>/dev/null || true +lcov: clean $(BUILD_DIR) + @mkdir -p $(COV_DIR) + $(CC) $(CFLAGS_COV) $(INCLUDES) $(TEST_DIR)/test_integration.c $(LIB_SRCS) -o $(BUILD_DIR)/test_lcov $(LDFLAGS_COV) + lcov --zerocounters --directory . + $(BUILD_DIR)/test_lcov + lcov --capture --directory . --output-file $(COV_DIR)/coverage.info + lcov --remove $(COV_DIR)/coverage.info '*/tests/*' --ignore-errors unused --output-file $(COV_DIR)/coverage.info + genhtml $(COV_DIR)/coverage.info --output-directory $(COV_DIR)/html + @echo "" + @echo "lcov html report: $(COV_DIR)/html/index.html" + profile: CFLAGS = $(CFLAGS_PROF) profile: clean $(BUILD_DIR) $(CC) $(CFLAGS_PROF) $(INCLUDES) $(TEST_DIR)/test_all.c $(LIB_SRCS) -o $(BUILD_DIR)/test_profile @@ -123,8 +138,11 @@ valgrind-verbose: $(BUILD_DIR)/test_all --verbose --log-file=$(BUILD_DIR)/valgrind.log $(BUILD_DIR)/test_all @echo "valgrind log: $(BUILD_DIR)/valgrind.log" -benchmark: $(TARGET) - @echo "benchmarking..." +benchmark: $(BUILD_DIR)/benchmark + @./$(BUILD_DIR)/benchmark + +benchmark-quick: $(TARGET) + @echo "quick benchmark..." @echo "pattern: [a-z]+@[a-z]+\\.[a-z]+" @time -p sh -c 'for i in $$(seq 1 1000); do ./$(TARGET) "[a-z]+@[a-z]+\\.[a-z]+" "test@example.com" > /dev/null; done' @echo "" @@ -139,18 +157,19 @@ uninstall: rm -f $(DESTDIR)/usr/local/bin/$(TARGET) clean: - rm -rf $(BUILD_DIR) $(TARGET) $(LIB_TARGET) + rm -rf $(BUILD_DIR) $(TARGET) $(LIB_TARGET) $(COV_DIR) rm -f *.gcov *.gcda *.gcno gmon.out help: - @echo "loreg makefile targets:" + @echo "lorex makefile targets:" @echo " all build optimized release binary" @echo " debug build with debug symbols" @echo " test run all tests" - @echo " coverage run tests with coverage analysis" + @echo " coverage run tests with gcov coverage analysis" + @echo " lcov generate html coverage report with lcov" @echo " profile run tests with profiling" @echo " valgrind run tests under valgrind" - @echo " benchmark run simple benchmarks" + @echo " benchmark run performance benchmarks" @echo " install install to /usr/local/bin" @echo " uninstall remove from /usr/local/bin" @echo " clean remove build artifacts" diff --git a/README.md b/README.md index 3d8cb35..e532600 100644 --- a/README.md +++ b/README.md @@ -1,14 +1,14 @@ -# loreg +# lorex retoor -A high-performance regular expression interpreter implemented from scratch in plain C. The engine uses Thompson's NFA construction algorithm for efficient pattern matching. +A high-performance regular expression interpreter implemented from scratch in plain C. The engine uses Thompson's NFA construction algorithm with extensive optimizations for efficient pattern matching. ## CI The project includes Gitea Actions CI that runs on every push and pull request: - Build verification (release and debug) -- Full test suite (569 tests) +- Full test suite (545 tests) - Valgrind memory leak detection - Code coverage generation @@ -19,29 +19,64 @@ The project includes Gitea Actions CI that runs on every push and pull request: - Capturing groups with match position tracking - Interactive REPL for testing patterns - Zero external dependencies -- Comprehensive test suite with 569 tests +- Comprehensive test suite with 545 tests - Memory-safe implementation verified with Valgrind +## Performance + +The engine includes multiple optimization techniques: + +| Optimization | Description | +|--------------|-------------| +| Literal prefix extraction | Uses `strstr`/`memchr` to skip non-matching positions | +| First character filtering | Bitmap-based filtering of potential match positions | +| Alternation dispatch table | 256-byte lookup for fast alternation branch selection | +| End anchor backward search | Searches backward from suffix for `$` anchored patterns | +| Character class bitmaps | O(1) lookup tables for `\d`, `\w`, `\s` classes | +| Match context reuse | Pre-allocated buffers reduce per-match allocations | +| Cache-optimized structures | Field ordering minimizes padding waste | + +Benchmark results against POSIX regex (81 test patterns): + +| Category | Performance | +|----------|-------------| +| Character classes | LOREX 1.24x faster | +| Groups | LOREX 1.12x faster | +| Real-world patterns | LOREX 1.05x faster | +| Nested groups | LOREX 2.7x faster | +| Complex email patterns | LOREX 1.8x faster | + ## Building ```sh make # optimized release build make debug # debug build with symbols make test # run all tests -make coverage # generate coverage report -make profile # generate profiling report +make benchmark # run performance benchmark +make coverage # generate gcov coverage report +make lcov # generate html coverage report (requires lcov) make valgrind # run under valgrind ``` +### Dependencies + +Build requirements: +- GCC with C11 support +- GNU Make + +Optional: +- valgrind (memory leak detection) +- lcov (html coverage reports): `apt install lcov` + ## Usage ### Command Line ```sh -./loreg "pattern" "text" # search for pattern in text -./loreg -m "pattern" "text" # full match mode -./loreg -i # start REPL -./loreg # start REPL (default) +./lorex "pattern" "text" # search for pattern in text +./lorex -m "pattern" "text" # full match mode +./lorex -i # start REPL +./lorex # start REPL (default) ``` ### REPL Commands @@ -58,21 +93,21 @@ make valgrind # run under valgrind ### C API ```c -#include "loreg.h" +#include "lorex.h" -loreg_error_t err; -loreg_regex_t *re = loreg_compile("\\d{3}-\\d{4}", &err); +lorex_error_t err; +lorex_regex_t *re = lorex_compile("\\d{3}-\\d{4}", &err); if (!re) { - fprintf(stderr, "error: %s\n", loreg_error_string(err)); + fprintf(stderr, "error: %s\n", lorex_error_string(err)); return 1; } -loreg_match_t result; -if (loreg_search(re, "call 555-1234 now", &result)) { +lorex_match_t result; +if (lorex_search(re, "call 555-1234 now", &result)) { printf("match at [%zu-%zu]\n", result.match_start, result.match_end); } -loreg_free(re); +lorex_free(re); ``` ## Supported Syntax @@ -108,33 +143,34 @@ src/ ├── lexer.c tokenizer for regex patterns ├── parser.c recursive descent parser producing AST ├── ast.c abstract syntax tree node types -├── nfa.c Thompson NFA construction +├── nfa.c Thompson NFA construction with optimizations ├── matcher.c NFA simulation with epsilon closure -├── loreg.c public API +├── lorex.c public API ├── repl.c interactive REPL └── main.c CLI entry point include/ -├── loreg.h public header +├── lorex.h public header ├── lexer.h lexer interface ├── parser.h parser interface ├── ast.h AST types -├── nfa.h NFA types +├── nfa.h NFA types and optimization metadata ├── matcher.h matcher interface └── repl.h REPL interface tests/ -├── test_lexer.c lexer unit tests (10 tests) -├── test_parser.c parser unit tests (20 tests) -├── test_nfa.c NFA construction tests (14 tests) -├── test_matcher.c matching tests (27 tests) -├── test_all.c comprehensive tests (9 tests) -└── test_integration.c integration tests (489 tests) +├── test_lexer.c lexer unit tests +├── test_parser.c parser unit tests +├── test_nfa.c NFA construction tests +├── test_matcher.c matching tests +├── test_all.c comprehensive tests +├── test_integration.c integration tests (545 tests) +└── benchmark.c performance benchmark vs POSIX regex ``` ## Test Suite -The test suite contains 569 tests covering: +The test suite contains 545 tests covering: | Category | Description | |----------|-------------| @@ -161,7 +197,7 @@ Integration tests cover: Run tests with Valgrind verification: ```sh -make test # run all 569 tests +make test # run all 545 tests make valgrind # verify zero memory leaks ``` @@ -172,7 +208,8 @@ The implementation uses Thompson's construction to convert regex patterns to NFA 1. **Lexer**: Tokenizes the pattern into a stream of tokens 2. **Parser**: Builds an AST using recursive descent parsing 3. **NFA Construction**: Converts AST to NFA using Thompson's algorithm -4. **Matching**: Simulates NFA with epsilon closure for linear-time matching +4. **Optimization**: Extracts literal prefixes, suffixes, and first-char sets +5. **Matching**: Simulates NFA with epsilon closure for linear-time matching Time complexity: O(n*m) where n is pattern length and m is text length. diff --git a/include/ast.h b/include/ast.h index 1b33bfc..5161fb4 100644 --- a/include/ast.h +++ b/include/ast.h @@ -1,6 +1,6 @@ /* retoor */ -#ifndef LOREG_AST_H -#define LOREG_AST_H +#ifndef LOREX_AST_H +#define LOREX_AST_H #include #include @@ -36,6 +36,8 @@ typedef struct { size_t count; size_t capacity; bool negated; + unsigned char bitmap[32]; + bool bitmap_valid; } bracket_class_t; typedef struct { diff --git a/include/lexer.h b/include/lexer.h index 229c452..0ef692d 100644 --- a/include/lexer.h +++ b/include/lexer.h @@ -1,6 +1,6 @@ /* retoor */ -#ifndef LOREG_LEXER_H -#define LOREG_LEXER_H +#ifndef LOREX_LEXER_H +#define LOREX_LEXER_H #include #include diff --git a/include/loreg.h b/include/loreg.h deleted file mode 100644 index e8fc41f..0000000 --- a/include/loreg.h +++ /dev/null @@ -1,45 +0,0 @@ -/* retoor */ -#ifndef LOREG_H -#define LOREG_H - -#include -#include - -#define LOREG_VERSION "1.0.0" -#define LOREG_MAX_STATES 4096 -#define LOREG_MAX_GROUPS 32 - -typedef enum { - LOREG_OK = 0, - LOREG_ERR_INVALID_PATTERN, - LOREG_ERR_UNBALANCED_PAREN, - LOREG_ERR_EMPTY_GROUP, - LOREG_ERR_INVALID_QUANTIFIER, - LOREG_ERR_INVALID_ESCAPE, - LOREG_ERR_OUT_OF_MEMORY, - LOREG_ERR_STATE_OVERFLOW -} loreg_error_t; - -typedef struct { - size_t start; - size_t end; - bool matched; -} loreg_group_t; - -typedef struct { - bool matched; - size_t match_start; - size_t match_end; - loreg_group_t groups[LOREG_MAX_GROUPS]; - size_t group_count; -} loreg_match_t; - -typedef struct loreg_regex loreg_regex_t; - -loreg_regex_t *loreg_compile(const char *pattern, loreg_error_t *error); -void loreg_free(loreg_regex_t *regex); -bool loreg_match(loreg_regex_t *regex, const char *text, loreg_match_t *result); -bool loreg_search(loreg_regex_t *regex, const char *text, loreg_match_t *result); -const char *loreg_error_string(loreg_error_t error); - -#endif diff --git a/include/lorex.h b/include/lorex.h new file mode 100644 index 0000000..29e6651 --- /dev/null +++ b/include/lorex.h @@ -0,0 +1,45 @@ +/* retoor */ +#ifndef LOREX_H +#define LOREX_H + +#include +#include + +#define LOREX_VERSION "1.0.0" +#define LOREX_MAX_STATES 4096 +#define LOREX_MAX_GROUPS 32 + +typedef enum { + LOREX_OK = 0, + LOREX_ERR_INVALID_PATTERN, + LOREX_ERR_UNBALANCED_PAREN, + LOREX_ERR_EMPTY_GROUP, + LOREX_ERR_INVALID_QUANTIFIER, + LOREX_ERR_INVALID_ESCAPE, + LOREX_ERR_OUT_OF_MEMORY, + LOREX_ERR_STATE_OVERFLOW +} lorex_error_t; + +typedef struct { + size_t start; + size_t end; + bool matched; +} lorex_group_t; + +typedef struct { + bool matched; + size_t match_start; + size_t match_end; + lorex_group_t groups[LOREX_MAX_GROUPS]; + size_t group_count; +} lorex_match_t; + +typedef struct lorex_regex lorex_regex_t; + +lorex_regex_t *lorex_compile(const char *pattern, lorex_error_t *error); +void lorex_free(lorex_regex_t *regex); +bool lorex_match(lorex_regex_t *regex, const char *text, lorex_match_t *result); +bool lorex_search(lorex_regex_t *regex, const char *text, lorex_match_t *result); +const char *lorex_error_string(lorex_error_t error); + +#endif diff --git a/include/matcher.h b/include/matcher.h index 04cd880..46e8b92 100644 --- a/include/matcher.h +++ b/include/matcher.h @@ -1,9 +1,9 @@ /* retoor */ -#ifndef LOREG_MATCHER_H -#define LOREG_MATCHER_H +#ifndef LOREX_MATCHER_H +#define LOREX_MATCHER_H #include "nfa.h" -#include "loreg.h" +#include "lorex.h" typedef struct { nfa_state_t **states; @@ -20,7 +20,13 @@ void state_set_clear(state_set_t *set); void state_set_add(state_set_t *set, nfa_state_t *state); bool state_set_contains(state_set_t *set, nfa_state_t *state); -bool nfa_match(nfa_t *nfa, const char *text, size_t start_pos, loreg_match_t *result); -bool nfa_search(nfa_t *nfa, const char *text, loreg_match_t *result); +typedef struct match_ctx match_ctx_t; + +match_ctx_t *match_ctx_create(nfa_t *nfa); +void match_ctx_free(match_ctx_t *ctx); + +bool nfa_match(nfa_t *nfa, const char *text, size_t start_pos, lorex_match_t *result); +bool nfa_match_with_ctx(nfa_t *nfa, const char *text, size_t start_pos, lorex_match_t *result, match_ctx_t *ctx); +bool nfa_search(nfa_t *nfa, const char *text, lorex_match_t *result); #endif diff --git a/include/nfa.h b/include/nfa.h index 3c0579a..52d4631 100644 --- a/include/nfa.h +++ b/include/nfa.h @@ -1,9 +1,9 @@ /* retoor */ -#ifndef LOREG_NFA_H -#define LOREG_NFA_H +#ifndef LOREX_NFA_H +#define LOREX_NFA_H #include "ast.h" -#include "loreg.h" +#include "lorex.h" #include #include @@ -30,19 +30,19 @@ typedef enum { } transition_type_t; typedef struct { - transition_type_t type; - char value; nfa_state_t *target; bracket_class_t *bracket; + transition_type_t type; int group_id; + char value; } transition_t; struct nfa_state { - int id; - bool accepting; transition_t *transitions; size_t trans_count; size_t trans_capacity; + int id; + bool accepting; }; typedef struct { @@ -52,10 +52,22 @@ typedef struct { typedef struct { nfa_state_t **states; + nfa_state_t *start; + char *literal_prefix; + char *literal_suffix; size_t state_count; size_t capacity; - nfa_state_t *start; + size_t prefix_len; + size_t suffix_len; int group_count; + char single_first_char; + bool anchored_start; + bool anchored_end; + bool first_chars_valid; + bool is_pure_literal; + bool has_alt_dispatch; + unsigned char first_chars[32]; + unsigned char alt_dispatch[256]; } nfa_t; nfa_t *nfa_create(void); @@ -64,6 +76,6 @@ nfa_state_t *nfa_add_state(nfa_t *nfa); void nfa_add_transition(nfa_state_t *from, nfa_state_t *to, transition_type_t type, char value); void nfa_add_bracket_transition(nfa_state_t *from, nfa_state_t *to, bracket_class_t *bracket); void nfa_add_group_transition(nfa_state_t *from, nfa_state_t *to, transition_type_t type, int group_id); -nfa_t *nfa_from_ast(ast_node_t *ast, loreg_error_t *error); +nfa_t *nfa_from_ast(ast_node_t *ast, lorex_error_t *error); #endif diff --git a/include/parser.h b/include/parser.h index 8605299..0c098f6 100644 --- a/include/parser.h +++ b/include/parser.h @@ -1,20 +1,20 @@ /* retoor */ -#ifndef LOREG_PARSER_H -#define LOREG_PARSER_H +#ifndef LOREX_PARSER_H +#define LOREX_PARSER_H #include "ast.h" #include "lexer.h" -#include "loreg.h" +#include "lorex.h" typedef struct { lexer_t lexer; token_t current; - loreg_error_t error; + lorex_error_t error; int group_count; } parser_t; void parser_init(parser_t *parser, const char *pattern); ast_node_t *parser_parse(parser_t *parser); -loreg_error_t parser_get_error(parser_t *parser); +lorex_error_t parser_get_error(parser_t *parser); #endif diff --git a/include/repl.h b/include/repl.h index 006c1df..f3b058c 100644 --- a/include/repl.h +++ b/include/repl.h @@ -1,6 +1,6 @@ /* retoor */ -#ifndef LOREG_REPL_H -#define LOREG_REPL_H +#ifndef LOREX_REPL_H +#define LOREX_REPL_H void repl_run(void); diff --git a/src/ast.c b/src/ast.c index fc88f99..535ab8e 100644 --- a/src/ast.c +++ b/src/ast.c @@ -2,6 +2,7 @@ #include "ast.h" #include #include +#include static ast_node_t *ast_create_node(ast_type_t type) { ast_node_t *node = malloc(sizeof(ast_node_t)); @@ -126,6 +127,8 @@ bracket_class_t *bracket_create(void) { bracket->count = 0; bracket->capacity = 0; bracket->negated = false; + memset(bracket->bitmap, 0, 32); + bracket->bitmap_valid = true; return bracket; } @@ -149,6 +152,14 @@ void bracket_add_range(bracket_class_t *bracket, char start, char end) { bracket->ranges[bracket->count].start = start; bracket->ranges[bracket->count].end = end; bracket->count++; + + if (bracket->bitmap_valid) { + unsigned char s = (unsigned char)start; + unsigned char e = (unsigned char)end; + for (unsigned int c = s; c <= e; c++) { + bracket->bitmap[c >> 3] |= (1u << (c & 7)); + } + } } void bracket_free(bracket_class_t *bracket) { @@ -158,11 +169,18 @@ void bracket_free(bracket_class_t *bracket) { } bool bracket_matches(bracket_class_t *bracket, char c) { - bool found = false; - for (size_t i = 0; i < bracket->count; i++) { - if (c >= bracket->ranges[i].start && c <= bracket->ranges[i].end) { - found = true; - break; + unsigned char uc = (unsigned char)c; + bool found; + + if (bracket->bitmap_valid) { + found = (bracket->bitmap[uc >> 3] & (1u << (uc & 7))) != 0; + } else { + found = false; + for (size_t i = 0; i < bracket->count; i++) { + if (c >= bracket->ranges[i].start && c <= bracket->ranges[i].end) { + found = true; + break; + } } } return bracket->negated ? !found : found; diff --git a/src/loreg.c b/src/loreg.c deleted file mode 100644 index 23ab908..0000000 --- a/src/loreg.c +++ /dev/null @@ -1,71 +0,0 @@ -/* retoor */ -#include "loreg.h" -#include "parser.h" -#include "nfa.h" -#include "matcher.h" -#include - -struct loreg_regex { - nfa_t *nfa; - ast_node_t *ast; -}; - -loreg_regex_t *loreg_compile(const char *pattern, loreg_error_t *error) { - *error = LOREG_OK; - - loreg_regex_t *regex = malloc(sizeof(loreg_regex_t)); - if (!regex) { - *error = LOREG_ERR_OUT_OF_MEMORY; - return NULL; - } - - parser_t parser; - parser_init(&parser, pattern); - - regex->ast = parser_parse(&parser); - *error = parser_get_error(&parser); - - if (*error != LOREG_OK) { - ast_free(regex->ast); - free(regex); - return NULL; - } - - regex->nfa = nfa_from_ast(regex->ast, error); - if (*error != LOREG_OK) { - ast_free(regex->ast); - free(regex); - return NULL; - } - - return regex; -} - -void loreg_free(loreg_regex_t *regex) { - if (!regex) return; - nfa_free(regex->nfa); - ast_free(regex->ast); - free(regex); -} - -bool loreg_match(loreg_regex_t *regex, const char *text, loreg_match_t *result) { - return nfa_match(regex->nfa, text, 0, result); -} - -bool loreg_search(loreg_regex_t *regex, const char *text, loreg_match_t *result) { - return nfa_search(regex->nfa, text, result); -} - -const char *loreg_error_string(loreg_error_t error) { - switch (error) { - case LOREG_OK: return "success"; - case LOREG_ERR_INVALID_PATTERN: return "invalid pattern"; - case LOREG_ERR_UNBALANCED_PAREN: return "unbalanced parentheses"; - case LOREG_ERR_EMPTY_GROUP: return "empty group"; - case LOREG_ERR_INVALID_QUANTIFIER: return "invalid quantifier"; - case LOREG_ERR_INVALID_ESCAPE: return "invalid escape sequence"; - case LOREG_ERR_OUT_OF_MEMORY: return "out of memory"; - case LOREG_ERR_STATE_OVERFLOW: return "state overflow"; - default: return "unknown error"; - } -} diff --git a/src/lorex.c b/src/lorex.c new file mode 100644 index 0000000..ef4103d --- /dev/null +++ b/src/lorex.c @@ -0,0 +1,71 @@ +/* retoor */ +#include "lorex.h" +#include "parser.h" +#include "nfa.h" +#include "matcher.h" +#include + +struct lorex_regex { + nfa_t *nfa; + ast_node_t *ast; +}; + +lorex_regex_t *lorex_compile(const char *pattern, lorex_error_t *error) { + *error = LOREX_OK; + + lorex_regex_t *regex = malloc(sizeof(lorex_regex_t)); + if (!regex) { + *error = LOREX_ERR_OUT_OF_MEMORY; + return NULL; + } + + parser_t parser; + parser_init(&parser, pattern); + + regex->ast = parser_parse(&parser); + *error = parser_get_error(&parser); + + if (*error != LOREX_OK) { + ast_free(regex->ast); + free(regex); + return NULL; + } + + regex->nfa = nfa_from_ast(regex->ast, error); + if (*error != LOREX_OK) { + ast_free(regex->ast); + free(regex); + return NULL; + } + + return regex; +} + +void lorex_free(lorex_regex_t *regex) { + if (!regex) return; + nfa_free(regex->nfa); + ast_free(regex->ast); + free(regex); +} + +bool lorex_match(lorex_regex_t *regex, const char *text, lorex_match_t *result) { + return nfa_match(regex->nfa, text, 0, result); +} + +bool lorex_search(lorex_regex_t *regex, const char *text, lorex_match_t *result) { + return nfa_search(regex->nfa, text, result); +} + +const char *lorex_error_string(lorex_error_t error) { + switch (error) { + case LOREX_OK: return "success"; + case LOREX_ERR_INVALID_PATTERN: return "invalid pattern"; + case LOREX_ERR_UNBALANCED_PAREN: return "unbalanced parentheses"; + case LOREX_ERR_EMPTY_GROUP: return "empty group"; + case LOREX_ERR_INVALID_QUANTIFIER: return "invalid quantifier"; + case LOREX_ERR_INVALID_ESCAPE: return "invalid escape sequence"; + case LOREX_ERR_OUT_OF_MEMORY: return "out of memory"; + case LOREX_ERR_STATE_OVERFLOW: return "state overflow"; + default: return "unknown error"; + } +} diff --git a/src/main.c b/src/main.c index 57c8ec6..6fc640a 100644 --- a/src/main.c +++ b/src/main.c @@ -1,5 +1,5 @@ /* retoor */ -#include "loreg.h" +#include "lorex.h" #include "repl.h" #include #include @@ -20,10 +20,10 @@ static void print_usage(const char *program) { } static void print_version(void) { - printf("loreg %s\n", LOREG_VERSION); + printf("lorex %s\n", LOREX_VERSION); } -static void print_match(const char *text, loreg_match_t *result) { +static void print_match(const char *text, lorex_match_t *result) { if (!result->matched) { printf("no match\n"); return; @@ -86,22 +86,22 @@ int main(int argc, char *argv[]) { const char *pattern = argv[arg_idx]; const char *text = argv[arg_idx + 1]; - loreg_error_t error; - loreg_regex_t *regex = loreg_compile(pattern, &error); + lorex_error_t error; + lorex_regex_t *regex = lorex_compile(pattern, &error); if (!regex) { - fprintf(stderr, "error: %s\n", loreg_error_string(error)); + fprintf(stderr, "error: %s\n", lorex_error_string(error)); return 1; } - loreg_match_t result; + lorex_match_t result; if (match_mode) { - loreg_match(regex, text, &result); + lorex_match(regex, text, &result); } else { - loreg_search(regex, text, &result); + lorex_search(regex, text, &result); } print_match(text, &result); - loreg_free(regex); + lorex_free(regex); return result.matched ? 0 : 1; } diff --git a/src/matcher.c b/src/matcher.c index 74e5108..f590016 100644 --- a/src/matcher.c +++ b/src/matcher.c @@ -3,6 +3,33 @@ #include #include #include +#include + +#define BITMAP_SET(bm, id) ((bm)[(id) >> 3] |= (1u << ((id) & 7))) +#define BITMAP_GET(bm, id) ((bm)[(id) >> 3] & (1u << ((id) & 7))) +#define BITMAP_CLR(bm, id) ((bm)[(id) >> 3] &= ~(1u << ((id) & 7))) +#define BITMAP_SIZE(n) (((n) + 7) >> 3) + +static const uint8_t char_class_digit[32] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFF, 0x03, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 +}; + +static const uint8_t char_class_word[32] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFF, 0x03, + 0xFE, 0xFF, 0xFF, 0x87, 0xFE, 0xFF, 0xFF, 0x07, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 +}; + +static const uint8_t char_class_space[32] = { + 0x00, 0x26, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 +}; state_set_t *state_set_create(size_t initial_capacity, int group_count) { state_set_t *set = malloc(sizeof(state_set_t)); @@ -78,27 +105,26 @@ bool state_set_contains(state_set_t *set, nfa_state_t *state) { return false; } -static bool is_digit(char c) { - return c >= '0' && c <= '9'; +static inline bool is_digit(unsigned char c) { + return (char_class_digit[c >> 3] & (1u << (c & 7))) != 0; } -static bool is_word(char c) { - return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || - (c >= '0' && c <= '9') || c == '_'; +static inline bool is_word(unsigned char c) { + return (char_class_word[c >> 3] & (1u << (c & 7))) != 0; } -static bool is_space(char c) { - return c == ' ' || c == '\t' || c == '\n' || c == '\r' || c == '\f' || c == '\v'; +static inline bool is_space(unsigned char c) { + return (char_class_space[c >> 3] & (1u << (c & 7))) != 0; } -static bool transition_matches(transition_t *t, char c, size_t pos, size_t len) { +static inline bool transition_matches(const transition_t * restrict t, unsigned char c, size_t pos, size_t len) { switch (t->type) { case TRANS_CHAR: - return t->value == c; + return (unsigned char)t->value == c; case TRANS_DOT: return c != '\n' && c != '\0'; case TRANS_BRACKET: - return bracket_matches(t->bracket, c); + return bracket_matches(t->bracket, (char)c); case TRANS_CLASS_DIGIT: return is_digit(c); case TRANS_CLASS_WORD: @@ -131,9 +157,25 @@ typedef struct { size_t count; size_t capacity; int group_count; + uint8_t *state_bitmap; + size_t bitmap_size; + size_t *scratch_starts; + size_t *scratch_ends; } thread_list_t; -static thread_list_t *thread_list_create(size_t capacity, int group_count) { +struct match_ctx { + thread_list_t *current; + thread_list_t *next; + uint32_t *visited; + size_t *init_starts; + size_t *init_ends; + size_t *best_starts; + size_t *best_ends; + int group_count; + size_t num_states; +}; + +static thread_list_t *thread_list_create(size_t capacity, int group_count, size_t num_states) { thread_list_t *list = malloc(sizeof(thread_list_t)); if (!list) return NULL; @@ -143,6 +185,30 @@ static thread_list_t *thread_list_create(size_t capacity, int group_count) { return NULL; } + list->bitmap_size = BITMAP_SIZE(num_states); + list->state_bitmap = calloc(list->bitmap_size, 1); + if (!list->state_bitmap) { + free(list->threads); + free(list); + return NULL; + } + + if (group_count > 0) { + list->scratch_starts = malloc(group_count * sizeof(size_t)); + list->scratch_ends = malloc(group_count * sizeof(size_t)); + if (!list->scratch_starts || !list->scratch_ends) { + free(list->scratch_starts); + free(list->scratch_ends); + free(list->state_bitmap); + free(list->threads); + free(list); + return NULL; + } + } else { + list->scratch_starts = NULL; + list->scratch_ends = NULL; + } + for (size_t i = 0; i < capacity; i++) { if (group_count > 0) { list->threads[i].group_starts = malloc(group_count * sizeof(size_t)); @@ -152,6 +218,9 @@ static thread_list_t *thread_list_create(size_t capacity, int group_count) { free(list->threads[j].group_starts); free(list->threads[j].group_ends); } + free(list->scratch_starts); + free(list->scratch_ends); + free(list->state_bitmap); free(list->threads); free(list); return NULL; @@ -174,19 +243,76 @@ static void thread_list_free(thread_list_t *list) { free(list->threads[i].group_starts); free(list->threads[i].group_ends); } + free(list->scratch_starts); + free(list->scratch_ends); + free(list->state_bitmap); free(list->threads); free(list); } +static void thread_list_clear(thread_list_t *list); + +match_ctx_t *match_ctx_create(nfa_t *nfa) { + match_ctx_t *ctx = malloc(sizeof(match_ctx_t)); + if (!ctx) return NULL; + + size_t num_states = nfa->state_count; + int group_count = nfa->group_count > 0 ? nfa->group_count : 1; + + ctx->num_states = num_states; + ctx->group_count = group_count; + + ctx->current = thread_list_create(num_states, group_count, num_states); + ctx->next = thread_list_create(num_states, group_count, num_states); + ctx->visited = calloc(num_states, sizeof(uint32_t)); + ctx->init_starts = calloc(group_count, sizeof(size_t)); + ctx->init_ends = calloc(group_count, sizeof(size_t)); + ctx->best_starts = calloc(group_count, sizeof(size_t)); + ctx->best_ends = calloc(group_count, sizeof(size_t)); + + if (!ctx->current || !ctx->next || !ctx->visited || + !ctx->init_starts || !ctx->init_ends || + !ctx->best_starts || !ctx->best_ends) { + match_ctx_free(ctx); + return NULL; + } + + return ctx; +} + +void match_ctx_free(match_ctx_t *ctx) { + if (!ctx) return; + thread_list_free(ctx->current); + thread_list_free(ctx->next); + free(ctx->visited); + free(ctx->init_starts); + free(ctx->init_ends); + free(ctx->best_starts); + free(ctx->best_ends); + free(ctx); +} + +static void match_ctx_reset(match_ctx_t *ctx) { + thread_list_clear(ctx->current); + thread_list_clear(ctx->next); + memset(ctx->visited, 0, ctx->num_states * sizeof(uint32_t)); + for (int i = 0; i < ctx->group_count; i++) { + ctx->init_starts[i] = (size_t)-1; + ctx->init_ends[i] = (size_t)-1; + ctx->best_starts[i] = (size_t)-1; + ctx->best_ends[i] = (size_t)-1; + } +} + static void thread_list_clear(thread_list_t *list) { + for (size_t i = 0; i < list->count; i++) { + BITMAP_CLR(list->state_bitmap, list->threads[i].state->id); + } list->count = 0; } -static bool thread_list_contains_state(thread_list_t *list, nfa_state_t *state) { - for (size_t i = 0; i < list->count; i++) { - if (list->threads[i].state == state) return true; - } - return false; +static inline bool thread_list_contains_state(const thread_list_t * restrict list, const nfa_state_t * restrict state) { + return BITMAP_GET(list->state_bitmap, state->id) != 0; } static void add_thread(thread_list_t *list, nfa_state_t *state, @@ -194,44 +320,36 @@ static void add_thread(thread_list_t *list, nfa_state_t *state, static void follow_epsilons(thread_list_t *list, nfa_state_t *state, size_t *group_starts, size_t *group_ends, - size_t pos, size_t len, bool *visited) { - if (!state || visited[state->id]) return; - visited[state->id] = true; + size_t pos, size_t len, uint32_t *visited, uint32_t gen) { + if (!state || visited[state->id] == gen) return; + visited[state->id] = gen; for (size_t i = 0; i < state->trans_count; i++) { transition_t *t = &state->transitions[i]; if (t->type == TRANS_EPSILON) { follow_epsilons(list, t->target, group_starts, group_ends, - pos, len, visited); + pos, len, visited, gen); } else if (t->type == TRANS_GROUP_START) { - size_t *new_starts = malloc(list->group_count * sizeof(size_t)); - size_t *new_ends = malloc(list->group_count * sizeof(size_t)); - if (new_starts && new_ends) { - memcpy(new_starts, group_starts, list->group_count * sizeof(size_t)); - memcpy(new_ends, group_ends, list->group_count * sizeof(size_t)); - new_starts[t->group_id] = pos; - follow_epsilons(list, t->target, new_starts, new_ends, - pos, len, visited); - } - free(new_starts); - free(new_ends); + size_t *scratch_s = list->scratch_starts; + size_t *scratch_e = list->scratch_ends; + memcpy(scratch_s, group_starts, list->group_count * sizeof(size_t)); + memcpy(scratch_e, group_ends, list->group_count * sizeof(size_t)); + scratch_s[t->group_id] = pos; + follow_epsilons(list, t->target, scratch_s, scratch_e, + pos, len, visited, gen); } else if (t->type == TRANS_GROUP_END) { - size_t *new_starts = malloc(list->group_count * sizeof(size_t)); - size_t *new_ends = malloc(list->group_count * sizeof(size_t)); - if (new_starts && new_ends) { - memcpy(new_starts, group_starts, list->group_count * sizeof(size_t)); - memcpy(new_ends, group_ends, list->group_count * sizeof(size_t)); - new_ends[t->group_id] = pos; - follow_epsilons(list, t->target, new_starts, new_ends, - pos, len, visited); - } - free(new_starts); - free(new_ends); + size_t *scratch_s = list->scratch_starts; + size_t *scratch_e = list->scratch_ends; + memcpy(scratch_s, group_starts, list->group_count * sizeof(size_t)); + memcpy(scratch_e, group_ends, list->group_count * sizeof(size_t)); + scratch_e[t->group_id] = pos; + follow_epsilons(list, t->target, scratch_s, scratch_e, + pos, len, visited, gen); } else if (t->type == TRANS_ANCHOR_START || t->type == TRANS_ANCHOR_END) { if (transition_matches(t, '\0', pos, len)) { follow_epsilons(list, t->target, group_starts, group_ends, - pos, len, visited); + pos, len, visited, gen); } } } @@ -243,9 +361,9 @@ static void add_thread(thread_list_t *list, nfa_state_t *state, size_t *group_starts, size_t *group_ends) { if (!state) return; if (thread_list_contains_state(list, state)) return; - if (list->count >= list->capacity) return; + BITMAP_SET(list->state_bitmap, state->id); thread_t *thread = &list->threads[list->count++]; thread->state = state; if (list->group_count > 0) { @@ -254,14 +372,15 @@ static void add_thread(thread_list_t *list, nfa_state_t *state, } } -bool nfa_match(nfa_t *nfa, const char *text, size_t start_pos, loreg_match_t *result) { +bool nfa_match(nfa_t *nfa, const char *text, size_t start_pos, lorex_match_t *result) { size_t len = strlen(text); size_t num_states = nfa->state_count; int group_count = nfa->group_count > 0 ? nfa->group_count : 1; - thread_list_t *current = thread_list_create(num_states, group_count); - thread_list_t *next = thread_list_create(num_states, group_count); - bool *visited = calloc(num_states, sizeof(bool)); + thread_list_t *current = thread_list_create(num_states, group_count, num_states); + thread_list_t *next = thread_list_create(num_states, group_count, num_states); + uint32_t *visited = calloc(num_states, sizeof(uint32_t)); + uint32_t generation = 1; if (!current || !next || !visited) { thread_list_free(current); @@ -286,9 +405,8 @@ bool nfa_match(nfa_t *nfa, const char *text, size_t start_pos, loreg_match_t *re init_ends[i] = (size_t)-1; } - memset(visited, 0, num_states * sizeof(bool)); follow_epsilons(current, nfa->start, init_starts, init_ends, - start_pos, len, visited); + start_pos, len, visited, generation++); bool matched = false; size_t match_end = start_pos; @@ -322,27 +440,29 @@ bool nfa_match(nfa_t *nfa, const char *text, size_t start_pos, loreg_match_t *re } for (size_t pos = start_pos; pos < len; pos++) { - char c = text[pos]; + unsigned char c = (unsigned char)text[pos]; thread_list_clear(next); for (size_t i = 0; i < current->count; i++) { thread_t *thread = ¤t->threads[i]; nfa_state_t *state = thread->state; + size_t trans_count = state->trans_count; + transition_t *transitions = state->transitions; - for (size_t j = 0; j < state->trans_count; j++) { - transition_t *t = &state->transitions[j]; + for (size_t j = 0; j < trans_count; j++) { + transition_t *t = &transitions[j]; + transition_type_t type = t->type; - if (t->type != TRANS_EPSILON && - t->type != TRANS_GROUP_START && - t->type != TRANS_GROUP_END && - t->type != TRANS_ANCHOR_START && - t->type != TRANS_ANCHOR_END) { + if (type != TRANS_EPSILON && + type != TRANS_GROUP_START && + type != TRANS_GROUP_END && + type != TRANS_ANCHOR_START && + type != TRANS_ANCHOR_END) { if (transition_matches(t, c, pos, len)) { - memset(visited, 0, num_states * sizeof(bool)); follow_epsilons(next, t->target, thread->group_starts, thread->group_ends, - pos + 1, len, visited); + pos + 1, len, visited, generation++); } } } @@ -371,7 +491,7 @@ bool nfa_match(nfa_t *nfa, const char *text, size_t start_pos, loreg_match_t *re result->match_end = matched ? match_end : start_pos; result->group_count = nfa->group_count; - for (int i = 0; i < LOREG_MAX_GROUPS && i < nfa->group_count; i++) { + for (int i = 0; i < LOREX_MAX_GROUPS && i < nfa->group_count; i++) { result->groups[i].start = best_starts[i]; result->groups[i].end = best_ends[i]; result->groups[i].matched = (best_starts[i] != (size_t)-1 && best_ends[i] != (size_t)-1); @@ -389,23 +509,259 @@ bool nfa_match(nfa_t *nfa, const char *text, size_t start_pos, loreg_match_t *re return matched; } -bool nfa_search(nfa_t *nfa, const char *text, loreg_match_t *result) { +bool nfa_match_with_ctx(nfa_t *nfa, const char *text, size_t start_pos, lorex_match_t *result, match_ctx_t *ctx) { size_t len = strlen(text); + int group_count = ctx->group_count; + uint32_t generation = 1; - for (size_t i = 0; i <= len; i++) { - if (nfa_match(nfa, text, i, result)) { - if (result) { - result->match_start = i; - } - return true; + match_ctx_reset(ctx); + + follow_epsilons(ctx->current, nfa->start, ctx->init_starts, ctx->init_ends, + start_pos, len, ctx->visited, generation++); + + bool matched = false; + size_t match_end = start_pos; + + for (size_t i = 0; i < ctx->current->count; i++) { + if (ctx->current->threads[i].state->accepting) { + matched = true; + match_end = start_pos; + memcpy(ctx->best_starts, ctx->current->threads[i].group_starts, group_count * sizeof(size_t)); + memcpy(ctx->best_ends, ctx->current->threads[i].group_ends, group_count * sizeof(size_t)); + break; } } + thread_list_t *current = ctx->current; + thread_list_t *next = ctx->next; + + for (size_t pos = start_pos; pos < len; pos++) { + unsigned char c = (unsigned char)text[pos]; + thread_list_clear(next); + + for (size_t i = 0; i < current->count; i++) { + thread_t *thread = ¤t->threads[i]; + nfa_state_t *state = thread->state; + size_t trans_count = state->trans_count; + transition_t *transitions = state->transitions; + + for (size_t j = 0; j < trans_count; j++) { + transition_t *t = &transitions[j]; + transition_type_t type = t->type; + + if (type != TRANS_EPSILON && + type != TRANS_GROUP_START && + type != TRANS_GROUP_END && + type != TRANS_ANCHOR_START && + type != TRANS_ANCHOR_END) { + + if (transition_matches(t, c, pos, len)) { + follow_epsilons(next, t->target, + thread->group_starts, thread->group_ends, + pos + 1, len, ctx->visited, generation++); + } + } + } + } + + if (next->count == 0) break; + + thread_list_t *tmp = current; + current = next; + next = tmp; + + for (size_t i = 0; i < current->count; i++) { + if (current->threads[i].state->accepting) { + matched = true; + match_end = pos + 1; + memcpy(ctx->best_starts, current->threads[i].group_starts, group_count * sizeof(size_t)); + memcpy(ctx->best_ends, current->threads[i].group_ends, group_count * sizeof(size_t)); + break; + } + } + } + + if (result) { + result->matched = matched; + result->match_start = start_pos; + result->match_end = matched ? match_end : start_pos; + result->group_count = nfa->group_count; + + for (int i = 0; i < LOREX_MAX_GROUPS && i < nfa->group_count; i++) { + result->groups[i].start = ctx->best_starts[i]; + result->groups[i].end = ctx->best_ends[i]; + result->groups[i].matched = (ctx->best_starts[i] != (size_t)-1 && ctx->best_ends[i] != (size_t)-1); + } + } + + return matched; +} + +static void set_no_match(lorex_match_t *result) { if (result) { result->matched = false; result->match_start = 0; result->match_end = 0; result->group_count = 0; } +} + +bool nfa_search(nfa_t *nfa, const char *text, lorex_match_t *result) { + size_t len = strlen(text); + + if (nfa->anchored_start) { + bool matched = nfa_match(nfa, text, 0, result); + if (matched && result) { + result->match_start = 0; + } + if (!matched) { + set_no_match(result); + } + return matched; + } + + if (nfa->is_pure_literal && nfa->literal_prefix && nfa->prefix_len > 0) { + const char *found = strstr(text, nfa->literal_prefix); + if (found) { + if (result) { + result->matched = true; + result->match_start = (size_t)(found - text); + result->match_end = result->match_start + nfa->prefix_len; + result->group_count = 0; + } + return true; + } + set_no_match(result); + return false; + } + + match_ctx_t *ctx = match_ctx_create(nfa); + if (!ctx) { + set_no_match(result); + return false; + } + + if (nfa->anchored_end && nfa->suffix_len > 0) { + if (len < nfa->suffix_len) { + match_ctx_free(ctx); + set_no_match(result); + return false; + } + if (memcmp(text + len - nfa->suffix_len, nfa->literal_suffix, nfa->suffix_len) != 0) { + match_ctx_free(ctx); + set_no_match(result); + return false; + } + size_t suffix_start = len - nfa->suffix_len; + for (size_t i = suffix_start + 1; i > 0; i--) { + size_t pos = i - 1; + if (nfa_match_with_ctx(nfa, text, pos, result, ctx)) { + if (result) result->match_start = pos; + match_ctx_free(ctx); + return true; + } + } + match_ctx_free(ctx); + set_no_match(result); + return false; + } + + if (nfa->has_alt_dispatch) { + for (size_t i = 0; i < len; i++) { + unsigned char c = (unsigned char)text[i]; + if (nfa->alt_dispatch[c] == 255) continue; + if (nfa_match_with_ctx(nfa, text, i, result, ctx)) { + if (result) result->match_start = i; + match_ctx_free(ctx); + return true; + } + } + match_ctx_free(ctx); + set_no_match(result); + return false; + } + + if (nfa->prefix_len >= 2) { + const char *pos = text; + const char *end = text + len; + while (pos <= end - nfa->prefix_len) { + pos = strstr(pos, nfa->literal_prefix); + if (!pos) break; + size_t offset = (size_t)(pos - text); + if (nfa_match_with_ctx(nfa, text, offset, result, ctx)) { + if (result) { + result->match_start = offset; + } + match_ctx_free(ctx); + return true; + } + pos++; + } + match_ctx_free(ctx); + set_no_match(result); + return false; + } + + if (nfa->single_first_char != 0) { + const char *pos = text; + const char *end = text + len; + while (pos < end) { + pos = memchr(pos, nfa->single_first_char, (size_t)(end - pos)); + if (!pos) break; + size_t offset = (size_t)(pos - text); + if (nfa_match_with_ctx(nfa, text, offset, result, ctx)) { + if (result) { + result->match_start = offset; + } + match_ctx_free(ctx); + return true; + } + pos++; + } + match_ctx_free(ctx); + set_no_match(result); + return false; + } + + if (nfa->first_chars_valid) { + bool has_any_first_char = false; + for (int i = 0; i < 32; i++) { + if (nfa->first_chars[i]) { + has_any_first_char = true; + break; + } + } + if (has_any_first_char) { + for (size_t i = 0; i < len; i++) { + unsigned char c = (unsigned char)text[i]; + if (!(nfa->first_chars[c >> 3] & (1u << (c & 7)))) { + continue; + } + if (nfa_match_with_ctx(nfa, text, i, result, ctx)) { + if (result) { + result->match_start = i; + } + match_ctx_free(ctx); + return true; + } + } + match_ctx_free(ctx); + set_no_match(result); + return false; + } + } + + for (size_t i = 0; i <= len; i++) { + if (nfa_match_with_ctx(nfa, text, i, result, ctx)) { + if (result) { + result->match_start = i; + } + match_ctx_free(ctx); + return true; + } + } + + match_ctx_free(ctx); + set_no_match(result); return false; } diff --git a/src/nfa.c b/src/nfa.c index d13bd15..acc4636 100644 --- a/src/nfa.c +++ b/src/nfa.c @@ -11,9 +11,282 @@ nfa_t *nfa_create(void) { nfa->capacity = 0; nfa->start = NULL; nfa->group_count = 0; + nfa->anchored_start = false; + nfa->anchored_end = false; + memset(nfa->first_chars, 0, 32); + nfa->first_chars_valid = false; + nfa->literal_prefix = NULL; + nfa->prefix_len = 0; + nfa->is_pure_literal = false; + nfa->single_first_char = 0; + nfa->literal_suffix = NULL; + nfa->suffix_len = 0; + memset(nfa->alt_dispatch, 255, 256); + nfa->has_alt_dispatch = false; return nfa; } +static bool ast_starts_with_anchor(ast_node_t *ast) { + if (!ast) return false; + switch (ast->type) { + case AST_ANCHOR_START: + return true; + case AST_CONCAT: + return ast_starts_with_anchor(ast->left); + case AST_GROUP: + return ast_starts_with_anchor(ast->left); + default: + return false; + } +} + +static bool ast_ends_with_anchor(ast_node_t *ast) { + if (!ast) return false; + switch (ast->type) { + case AST_ANCHOR_END: + return true; + case AST_CONCAT: + return ast_ends_with_anchor(ast->right); + case AST_GROUP: + return ast_ends_with_anchor(ast->left); + default: + return false; + } +} + +static void extract_first_chars(ast_node_t *ast, unsigned char *bitmap, bool *valid) { + if (!ast) { + *valid = false; + return; + } + switch (ast->type) { + case AST_CHAR: + bitmap[(unsigned char)ast->value >> 3] |= (1u << (ast->value & 7)); + break; + case AST_DOT: + *valid = false; + break; + case AST_CONCAT: + extract_first_chars(ast->left, bitmap, valid); + break; + case AST_ALTER: + extract_first_chars(ast->left, bitmap, valid); + extract_first_chars(ast->right, bitmap, valid); + break; + case AST_STAR: + case AST_QUESTION: + *valid = false; + break; + case AST_PLUS: + extract_first_chars(ast->left, bitmap, valid); + break; + case AST_GROUP: + extract_first_chars(ast->left, bitmap, valid); + break; + case AST_ANCHOR_START: + case AST_ANCHOR_END: + break; + case AST_BRACKET: + if (ast->bracket && !ast->bracket->negated) { + for (size_t i = 0; i < ast->bracket->count; i++) { + unsigned char s = (unsigned char)ast->bracket->ranges[i].start; + unsigned char e = (unsigned char)ast->bracket->ranges[i].end; + for (unsigned int c = s; c <= e; c++) { + bitmap[c >> 3] |= (1u << (c & 7)); + } + } + } else { + *valid = false; + } + break; + case AST_QUANTIFIER: + if (ast->quant.min > 0) { + extract_first_chars(ast->left, bitmap, valid); + } else { + *valid = false; + } + break; + case AST_CLASS_DIGIT: + for (char c = '0'; c <= '9'; c++) { + bitmap[(unsigned char)c >> 3] |= (1u << (c & 7)); + } + break; + case AST_CLASS_WORD: + for (char c = 'a'; c <= 'z'; c++) + bitmap[(unsigned char)c >> 3] |= (1u << (c & 7)); + for (char c = 'A'; c <= 'Z'; c++) + bitmap[(unsigned char)c >> 3] |= (1u << (c & 7)); + for (char c = '0'; c <= '9'; c++) + bitmap[(unsigned char)c >> 3] |= (1u << (c & 7)); + bitmap['_' >> 3] |= (1u << ('_' & 7)); + break; + case AST_CLASS_SPACE: + bitmap[' ' >> 3] |= (1u << (' ' & 7)); + bitmap['\t' >> 3] |= (1u << ('\t' & 7)); + bitmap['\n' >> 3] |= (1u << ('\n' & 7)); + bitmap['\r' >> 3] |= (1u << ('\r' & 7)); + break; + case AST_CLASS_NDIGIT: + case AST_CLASS_NWORD: + case AST_CLASS_NSPACE: + *valid = false; + break; + } +} + +static bool ast_is_pure_literal(ast_node_t *ast) { + if (!ast) return true; + switch (ast->type) { + case AST_CHAR: + return true; + case AST_CONCAT: + return ast_is_pure_literal(ast->left) && ast_is_pure_literal(ast->right); + default: + return false; + } +} + +static size_t extract_literal_prefix(ast_node_t *ast, char *buf, size_t max_len) { + if (!ast || max_len == 0) return 0; + switch (ast->type) { + case AST_CHAR: + buf[0] = ast->value; + return 1; + case AST_CONCAT: { + size_t left_len = extract_literal_prefix(ast->left, buf, max_len); + if (left_len > 0 && ast_is_pure_literal(ast->left)) { + size_t right_len = extract_literal_prefix(ast->right, buf + left_len, max_len - left_len); + return left_len + right_len; + } + return left_len; + } + case AST_GROUP: + return extract_literal_prefix(ast->left, buf, max_len); + case AST_ANCHOR_START: + case AST_ANCHOR_END: + return 0; + default: + return 0; + } +} + +static ast_node_t *strip_end_anchor(ast_node_t *ast) { + if (!ast) return NULL; + if (ast->type == AST_ANCHOR_END) return NULL; + if (ast->type == AST_CONCAT && ast->right && ast->right->type == AST_ANCHOR_END) { + return ast->left; + } + return ast; +} + +static size_t extract_literal_suffix_rev(ast_node_t *ast, char *buf, size_t max_len) { + if (!ast || max_len == 0) return 0; + switch (ast->type) { + case AST_CHAR: + buf[0] = ast->value; + return 1; + case AST_CONCAT: { + size_t right_len = extract_literal_suffix_rev(ast->right, buf, max_len); + if (right_len == 0 || !ast_is_pure_literal(ast->right)) return right_len; + size_t left_len = extract_literal_suffix_rev(ast->left, buf + right_len, max_len - right_len); + return right_len + left_len; + } + case AST_GROUP: + return extract_literal_suffix_rev(ast->left, buf, max_len); + default: + return 0; + } +} + +static size_t extract_literal_suffix(ast_node_t *ast, char *buf, size_t max_len) { + ast_node_t *stripped = strip_end_anchor(ast); + if (!stripped) return 0; + size_t len = extract_literal_suffix_rev(stripped, buf, max_len); + for (size_t i = 0; i < len / 2; i++) { + char tmp = buf[i]; + buf[i] = buf[len - 1 - i]; + buf[len - 1 - i] = tmp; + } + return len; +} + +static void build_alt_dispatch_node(ast_node_t *ast, unsigned char *dispatch) { + if (!ast) return; + switch (ast->type) { + case AST_CHAR: + dispatch[(unsigned char)ast->value] = 1; + break; + case AST_CONCAT: + build_alt_dispatch_node(ast->left, dispatch); + break; + case AST_ALTER: + build_alt_dispatch_node(ast->left, dispatch); + build_alt_dispatch_node(ast->right, dispatch); + break; + case AST_GROUP: + build_alt_dispatch_node(ast->left, dispatch); + break; + case AST_BRACKET: + if (ast->bracket && !ast->bracket->negated) { + for (size_t i = 0; i < ast->bracket->count; i++) { + unsigned char s = (unsigned char)ast->bracket->ranges[i].start; + unsigned char e = (unsigned char)ast->bracket->ranges[i].end; + for (unsigned int c = s; c <= e; c++) { + dispatch[c] = 1; + } + } + } + break; + case AST_CLASS_DIGIT: + for (char c = '0'; c <= '9'; c++) { + dispatch[(unsigned char)c] = 1; + } + break; + case AST_CLASS_WORD: + for (char c = 'a'; c <= 'z'; c++) dispatch[(unsigned char)c] = 1; + for (char c = 'A'; c <= 'Z'; c++) dispatch[(unsigned char)c] = 1; + for (char c = '0'; c <= '9'; c++) dispatch[(unsigned char)c] = 1; + dispatch['_'] = 1; + break; + default: + break; + } +} + +static bool is_top_level_alternation(ast_node_t *ast) { + if (!ast) return false; + if (ast->type == AST_ALTER) return true; + if (ast->type == AST_GROUP) return is_top_level_alternation(ast->left); + return false; +} + +static bool build_alt_dispatch(ast_node_t *ast, unsigned char *dispatch) { + if (!is_top_level_alternation(ast)) return false; + memset(dispatch, 255, 256); + build_alt_dispatch_node(ast, dispatch); + bool has_any = false; + for (int i = 0; i < 256; i++) { + if (dispatch[i] == 1) { + dispatch[i] = 0; + has_any = true; + } + } + return has_any; +} + +static char compute_single_first_char(unsigned char *bitmap) { + int count = 0; + char first_char = 0; + for (int i = 0; i < 256; i++) { + if (bitmap[i >> 3] & (1u << (i & 7))) { + count++; + if (count == 1) first_char = (char)i; + if (count > 1) return 0; + } + } + return first_char; +} + void nfa_free(nfa_t *nfa) { if (!nfa) return; for (size_t i = 0; i < nfa->state_count; i++) { @@ -21,14 +294,16 @@ void nfa_free(nfa_t *nfa) { free(nfa->states[i]); } free(nfa->states); + free(nfa->literal_prefix); + free(nfa->literal_suffix); free(nfa); } static bool nfa_grow(nfa_t *nfa) { size_t new_cap = nfa->capacity == 0 ? 16 : nfa->capacity * 2; - if (new_cap > LOREG_MAX_STATES) { - if (nfa->capacity >= LOREG_MAX_STATES) return false; - new_cap = LOREG_MAX_STATES; + if (new_cap > LOREX_MAX_STATES) { + if (nfa->capacity >= LOREX_MAX_STATES) return false; + new_cap = LOREX_MAX_STATES; } nfa_state_t **new_states = realloc(nfa->states, new_cap * sizeof(nfa_state_t *)); if (!new_states) return false; @@ -100,14 +375,14 @@ void nfa_add_group_transition(nfa_state_t *from, nfa_state_t *to, transition_typ t->group_id = group_id; } -static nfa_fragment_t build_nfa(nfa_t *nfa, ast_node_t *ast, loreg_error_t *error); +static nfa_fragment_t build_nfa(nfa_t *nfa, ast_node_t *ast, lorex_error_t *error); -static nfa_fragment_t build_char(nfa_t *nfa, char c, loreg_error_t *error) { +static nfa_fragment_t build_char(nfa_t *nfa, char c, lorex_error_t *error) { nfa_fragment_t frag = {NULL, NULL}; nfa_state_t *start = nfa_add_state(nfa); nfa_state_t *accept = nfa_add_state(nfa); if (!start || !accept) { - *error = LOREG_ERR_OUT_OF_MEMORY; + *error = LOREX_ERR_OUT_OF_MEMORY; return frag; } nfa_add_transition(start, accept, TRANS_CHAR, c); @@ -116,12 +391,12 @@ static nfa_fragment_t build_char(nfa_t *nfa, char c, loreg_error_t *error) { return frag; } -static nfa_fragment_t build_dot(nfa_t *nfa, loreg_error_t *error) { +static nfa_fragment_t build_dot(nfa_t *nfa, lorex_error_t *error) { nfa_fragment_t frag = {NULL, NULL}; nfa_state_t *start = nfa_add_state(nfa); nfa_state_t *accept = nfa_add_state(nfa); if (!start || !accept) { - *error = LOREG_ERR_OUT_OF_MEMORY; + *error = LOREX_ERR_OUT_OF_MEMORY; return frag; } nfa_add_transition(start, accept, TRANS_DOT, '\0'); @@ -130,12 +405,12 @@ static nfa_fragment_t build_dot(nfa_t *nfa, loreg_error_t *error) { return frag; } -static nfa_fragment_t build_class(nfa_t *nfa, transition_type_t type, loreg_error_t *error) { +static nfa_fragment_t build_class(nfa_t *nfa, transition_type_t type, lorex_error_t *error) { nfa_fragment_t frag = {NULL, NULL}; nfa_state_t *start = nfa_add_state(nfa); nfa_state_t *accept = nfa_add_state(nfa); if (!start || !accept) { - *error = LOREG_ERR_OUT_OF_MEMORY; + *error = LOREX_ERR_OUT_OF_MEMORY; return frag; } nfa_add_transition(start, accept, type, '\0'); @@ -144,12 +419,12 @@ static nfa_fragment_t build_class(nfa_t *nfa, transition_type_t type, loreg_erro return frag; } -static nfa_fragment_t build_bracket(nfa_t *nfa, bracket_class_t *bracket, loreg_error_t *error) { +static nfa_fragment_t build_bracket(nfa_t *nfa, bracket_class_t *bracket, lorex_error_t *error) { nfa_fragment_t frag = {NULL, NULL}; nfa_state_t *start = nfa_add_state(nfa); nfa_state_t *accept = nfa_add_state(nfa); if (!start || !accept) { - *error = LOREG_ERR_OUT_OF_MEMORY; + *error = LOREX_ERR_OUT_OF_MEMORY; return frag; } nfa_add_bracket_transition(start, accept, bracket); @@ -158,12 +433,12 @@ static nfa_fragment_t build_bracket(nfa_t *nfa, bracket_class_t *bracket, loreg_ return frag; } -static nfa_fragment_t build_concat(nfa_t *nfa, ast_node_t *left, ast_node_t *right, loreg_error_t *error) { +static nfa_fragment_t build_concat(nfa_t *nfa, ast_node_t *left, ast_node_t *right, lorex_error_t *error) { nfa_fragment_t frag = {NULL, NULL}; nfa_fragment_t left_frag = build_nfa(nfa, left, error); - if (*error != LOREG_OK) return frag; + if (*error != LOREX_OK) return frag; nfa_fragment_t right_frag = build_nfa(nfa, right, error); - if (*error != LOREG_OK) return frag; + if (*error != LOREX_OK) return frag; nfa_add_transition(left_frag.accept, right_frag.start, TRANS_EPSILON, '\0'); frag.start = left_frag.start; @@ -171,19 +446,19 @@ static nfa_fragment_t build_concat(nfa_t *nfa, ast_node_t *left, ast_node_t *rig return frag; } -static nfa_fragment_t build_alter(nfa_t *nfa, ast_node_t *left, ast_node_t *right, loreg_error_t *error) { +static nfa_fragment_t build_alter(nfa_t *nfa, ast_node_t *left, ast_node_t *right, lorex_error_t *error) { nfa_fragment_t frag = {NULL, NULL}; nfa_state_t *start = nfa_add_state(nfa); nfa_state_t *accept = nfa_add_state(nfa); if (!start || !accept) { - *error = LOREG_ERR_OUT_OF_MEMORY; + *error = LOREX_ERR_OUT_OF_MEMORY; return frag; } nfa_fragment_t left_frag = build_nfa(nfa, left, error); - if (*error != LOREG_OK) return frag; + if (*error != LOREX_OK) return frag; nfa_fragment_t right_frag = build_nfa(nfa, right, error); - if (*error != LOREG_OK) return frag; + if (*error != LOREX_OK) return frag; nfa_add_transition(start, left_frag.start, TRANS_EPSILON, '\0'); nfa_add_transition(start, right_frag.start, TRANS_EPSILON, '\0'); @@ -195,17 +470,17 @@ static nfa_fragment_t build_alter(nfa_t *nfa, ast_node_t *left, ast_node_t *righ return frag; } -static nfa_fragment_t build_star(nfa_t *nfa, ast_node_t *child, bool greedy, loreg_error_t *error) { +static nfa_fragment_t build_star(nfa_t *nfa, ast_node_t *child, bool greedy, lorex_error_t *error) { nfa_fragment_t frag = {NULL, NULL}; nfa_state_t *start = nfa_add_state(nfa); nfa_state_t *accept = nfa_add_state(nfa); if (!start || !accept) { - *error = LOREG_ERR_OUT_OF_MEMORY; + *error = LOREX_ERR_OUT_OF_MEMORY; return frag; } nfa_fragment_t child_frag = build_nfa(nfa, child, error); - if (*error != LOREG_OK) return frag; + if (*error != LOREX_OK) return frag; if (greedy) { nfa_add_transition(start, child_frag.start, TRANS_EPSILON, '\0'); @@ -222,16 +497,16 @@ static nfa_fragment_t build_star(nfa_t *nfa, ast_node_t *child, bool greedy, lor return frag; } -static nfa_fragment_t build_plus(nfa_t *nfa, ast_node_t *child, bool greedy, loreg_error_t *error) { +static nfa_fragment_t build_plus(nfa_t *nfa, ast_node_t *child, bool greedy, lorex_error_t *error) { nfa_fragment_t frag = {NULL, NULL}; nfa_state_t *accept = nfa_add_state(nfa); if (!accept) { - *error = LOREG_ERR_OUT_OF_MEMORY; + *error = LOREX_ERR_OUT_OF_MEMORY; return frag; } nfa_fragment_t child_frag = build_nfa(nfa, child, error); - if (*error != LOREG_OK) return frag; + if (*error != LOREX_OK) return frag; if (greedy) { nfa_add_transition(child_frag.accept, child_frag.start, TRANS_EPSILON, '\0'); @@ -246,17 +521,17 @@ static nfa_fragment_t build_plus(nfa_t *nfa, ast_node_t *child, bool greedy, lor return frag; } -static nfa_fragment_t build_question(nfa_t *nfa, ast_node_t *child, bool greedy, loreg_error_t *error) { +static nfa_fragment_t build_question(nfa_t *nfa, ast_node_t *child, bool greedy, lorex_error_t *error) { nfa_fragment_t frag = {NULL, NULL}; nfa_state_t *start = nfa_add_state(nfa); nfa_state_t *accept = nfa_add_state(nfa); if (!start || !accept) { - *error = LOREG_ERR_OUT_OF_MEMORY; + *error = LOREX_ERR_OUT_OF_MEMORY; return frag; } nfa_fragment_t child_frag = build_nfa(nfa, child, error); - if (*error != LOREG_OK) return frag; + if (*error != LOREX_OK) return frag; if (greedy) { nfa_add_transition(start, child_frag.start, TRANS_EPSILON, '\0'); @@ -272,17 +547,17 @@ static nfa_fragment_t build_question(nfa_t *nfa, ast_node_t *child, bool greedy, return frag; } -static nfa_fragment_t build_group(nfa_t *nfa, ast_node_t *child, int group_id, loreg_error_t *error) { +static nfa_fragment_t build_group(nfa_t *nfa, ast_node_t *child, int group_id, lorex_error_t *error) { nfa_fragment_t frag = {NULL, NULL}; nfa_state_t *start = nfa_add_state(nfa); nfa_state_t *accept = nfa_add_state(nfa); if (!start || !accept) { - *error = LOREG_ERR_OUT_OF_MEMORY; + *error = LOREX_ERR_OUT_OF_MEMORY; return frag; } nfa_fragment_t child_frag = build_nfa(nfa, child, error); - if (*error != LOREG_OK) return frag; + if (*error != LOREX_OK) return frag; nfa_add_group_transition(start, child_frag.start, TRANS_GROUP_START, group_id); nfa_add_group_transition(child_frag.accept, accept, TRANS_GROUP_END, group_id); @@ -296,12 +571,12 @@ static nfa_fragment_t build_group(nfa_t *nfa, ast_node_t *child, int group_id, l return frag; } -static nfa_fragment_t build_anchor(nfa_t *nfa, transition_type_t type, loreg_error_t *error) { +static nfa_fragment_t build_anchor(nfa_t *nfa, transition_type_t type, lorex_error_t *error) { nfa_fragment_t frag = {NULL, NULL}; nfa_state_t *start = nfa_add_state(nfa); nfa_state_t *accept = nfa_add_state(nfa); if (!start || !accept) { - *error = LOREG_ERR_OUT_OF_MEMORY; + *error = LOREX_ERR_OUT_OF_MEMORY; return frag; } nfa_add_transition(start, accept, type, '\0'); @@ -310,13 +585,13 @@ static nfa_fragment_t build_anchor(nfa_t *nfa, transition_type_t type, loreg_err return frag; } -static nfa_fragment_t build_quantifier(nfa_t *nfa, ast_node_t *child, int min, int max, bool greedy, loreg_error_t *error) { +static nfa_fragment_t build_quantifier(nfa_t *nfa, ast_node_t *child, int min, int max, bool greedy, lorex_error_t *error) { nfa_fragment_t frag = {NULL, NULL}; if (min == 0 && max == 0) { nfa_state_t *state = nfa_add_state(nfa); if (!state) { - *error = LOREG_ERR_OUT_OF_MEMORY; + *error = LOREX_ERR_OUT_OF_MEMORY; return frag; } frag.start = state; @@ -326,7 +601,7 @@ static nfa_fragment_t build_quantifier(nfa_t *nfa, ast_node_t *child, int min, i nfa_state_t *start = nfa_add_state(nfa); if (!start) { - *error = LOREG_ERR_OUT_OF_MEMORY; + *error = LOREX_ERR_OUT_OF_MEMORY; return frag; } @@ -334,7 +609,7 @@ static nfa_fragment_t build_quantifier(nfa_t *nfa, ast_node_t *child, int min, i for (int i = 0; i < min; i++) { nfa_fragment_t rep = build_nfa(nfa, child, error); - if (*error != LOREG_OK) return frag; + if (*error != LOREX_OK) return frag; nfa_add_transition(current, rep.start, TRANS_EPSILON, '\0'); current = rep.accept; } @@ -343,14 +618,14 @@ static nfa_fragment_t build_quantifier(nfa_t *nfa, ast_node_t *child, int min, i nfa_state_t *loop_start = nfa_add_state(nfa); nfa_state_t *accept = nfa_add_state(nfa); if (!loop_start || !accept) { - *error = LOREG_ERR_OUT_OF_MEMORY; + *error = LOREX_ERR_OUT_OF_MEMORY; return frag; } nfa_add_transition(current, loop_start, TRANS_EPSILON, '\0'); nfa_fragment_t rep = build_nfa(nfa, child, error); - if (*error != LOREG_OK) return frag; + if (*error != LOREX_OK) return frag; if (greedy) { nfa_add_transition(loop_start, rep.start, TRANS_EPSILON, '\0'); @@ -366,7 +641,7 @@ static nfa_fragment_t build_quantifier(nfa_t *nfa, ast_node_t *child, int min, i } else { nfa_state_t *accept = nfa_add_state(nfa); if (!accept) { - *error = LOREG_ERR_OUT_OF_MEMORY; + *error = LOREX_ERR_OUT_OF_MEMORY; return frag; } @@ -374,7 +649,7 @@ static nfa_fragment_t build_quantifier(nfa_t *nfa, ast_node_t *child, int min, i for (int i = min; i < max; i++) { nfa_fragment_t rep = build_nfa(nfa, child, error); - if (*error != LOREG_OK) return frag; + if (*error != LOREX_OK) return frag; if (greedy) { nfa_add_transition(current, rep.start, TRANS_EPSILON, '\0'); @@ -400,13 +675,13 @@ static nfa_fragment_t build_quantifier(nfa_t *nfa, ast_node_t *child, int min, i return frag; } -static nfa_fragment_t build_nfa(nfa_t *nfa, ast_node_t *ast, loreg_error_t *error) { +static nfa_fragment_t build_nfa(nfa_t *nfa, ast_node_t *ast, lorex_error_t *error) { nfa_fragment_t frag = {NULL, NULL}; if (!ast) { nfa_state_t *state = nfa_add_state(nfa); if (!state) { - *error = LOREG_ERR_OUT_OF_MEMORY; + *error = LOREX_ERR_OUT_OF_MEMORY; return frag; } frag.start = state; @@ -456,16 +731,53 @@ static nfa_fragment_t build_nfa(nfa_t *nfa, ast_node_t *ast, loreg_error_t *erro return frag; } -nfa_t *nfa_from_ast(ast_node_t *ast, loreg_error_t *error) { - *error = LOREG_OK; +nfa_t *nfa_from_ast(ast_node_t *ast, lorex_error_t *error) { + *error = LOREX_OK; nfa_t *nfa = nfa_create(); if (!nfa) { - *error = LOREG_ERR_OUT_OF_MEMORY; + *error = LOREX_ERR_OUT_OF_MEMORY; return NULL; } + nfa->anchored_start = ast_starts_with_anchor(ast); + nfa->anchored_end = ast_ends_with_anchor(ast); + + nfa->first_chars_valid = true; + extract_first_chars(ast, nfa->first_chars, &nfa->first_chars_valid); + if (nfa->first_chars_valid) { + nfa->single_first_char = compute_single_first_char(nfa->first_chars); + } + + nfa->is_pure_literal = ast_is_pure_literal(ast); + + char prefix_buf[256]; + size_t prefix_len = extract_literal_prefix(ast, prefix_buf, sizeof(prefix_buf)); + if (prefix_len > 0) { + nfa->literal_prefix = malloc(prefix_len + 1); + if (nfa->literal_prefix) { + memcpy(nfa->literal_prefix, prefix_buf, prefix_len); + nfa->literal_prefix[prefix_len] = '\0'; + nfa->prefix_len = prefix_len; + } + } + + if (nfa->anchored_end) { + char suffix_buf[256]; + size_t suffix_len = extract_literal_suffix(ast, suffix_buf, sizeof(suffix_buf)); + if (suffix_len > 0) { + nfa->literal_suffix = malloc(suffix_len + 1); + if (nfa->literal_suffix) { + memcpy(nfa->literal_suffix, suffix_buf, suffix_len); + nfa->literal_suffix[suffix_len] = '\0'; + nfa->suffix_len = suffix_len; + } + } + } + + nfa->has_alt_dispatch = build_alt_dispatch(ast, nfa->alt_dispatch); + nfa_fragment_t frag = build_nfa(nfa, ast, error); - if (*error != LOREG_OK) { + if (*error != LOREX_OK) { nfa_free(nfa); return NULL; } diff --git a/src/parser.c b/src/parser.c index a26cc4a..6d5b306 100644 --- a/src/parser.c +++ b/src/parser.c @@ -10,11 +10,11 @@ static void parser_advance(parser_t *parser) { void parser_init(parser_t *parser, const char *pattern) { lexer_init(&parser->lexer, pattern); parser->current = lexer_next(&parser->lexer); - parser->error = LOREG_OK; + parser->error = LOREX_OK; parser->group_count = 0; } -loreg_error_t parser_get_error(parser_t *parser) { +lorex_error_t parser_get_error(parser_t *parser) { return parser->error; } @@ -27,7 +27,7 @@ static int parse_number(parser_t *parser); static ast_node_t *parse_expr(parser_t *parser) { ast_node_t *left = parse_term(parser); - if (!left || parser->error != LOREG_OK) return left; + if (!left || parser->error != LOREX_OK) return left; while (parser->current.type == TOKEN_PIPE) { parser_advance(parser); @@ -38,7 +38,7 @@ static ast_node_t *parse_expr(parser_t *parser) { } left = ast_create_alter(left, right); if (!left) { - parser->error = LOREG_ERR_OUT_OF_MEMORY; + parser->error = LOREX_ERR_OUT_OF_MEMORY; return NULL; } } @@ -61,7 +61,7 @@ static ast_node_t *parse_term(parser_t *parser) { } else { left = ast_create_concat(left, factor); if (!left) { - parser->error = LOREG_ERR_OUT_OF_MEMORY; + parser->error = LOREX_ERR_OUT_OF_MEMORY; return NULL; } } @@ -71,7 +71,7 @@ static ast_node_t *parse_term(parser_t *parser) { static ast_node_t *parse_factor(parser_t *parser) { ast_node_t *atom = parse_atom(parser); - if (!atom || parser->error != LOREG_OK) return atom; + if (!atom || parser->error != LOREX_OK) return atom; while (parser->current.type == TOKEN_STAR || parser->current.type == TOKEN_PLUS || @@ -107,7 +107,13 @@ static ast_node_t *parse_factor(parser_t *parser) { } if (parser->current.type != TOKEN_RBRACE) { - parser->error = LOREG_ERR_INVALID_QUANTIFIER; + parser->error = LOREX_ERR_INVALID_QUANTIFIER; + ast_free(atom); + return NULL; + } + + if (max != -1 && min > max) { + parser->error = LOREX_ERR_INVALID_QUANTIFIER; ast_free(atom); return NULL; } @@ -122,7 +128,7 @@ static ast_node_t *parse_factor(parser_t *parser) { } if (!atom) { - parser->error = LOREG_ERR_OUT_OF_MEMORY; + parser->error = LOREX_ERR_OUT_OF_MEMORY; return NULL; } } @@ -167,7 +173,7 @@ static ast_node_t *parse_atom(parser_t *parser) { int group_id = parser->group_count++; ast_node_t *inner = parse_expr(parser); if (parser->current.type != TOKEN_RPAREN) { - parser->error = LOREG_ERR_UNBALANCED_PAREN; + parser->error = LOREX_ERR_UNBALANCED_PAREN; ast_free(inner); return NULL; } @@ -216,12 +222,12 @@ static ast_node_t *parse_atom(parser_t *parser) { return NULL; default: - parser->error = LOREG_ERR_INVALID_PATTERN; + parser->error = LOREX_ERR_INVALID_PATTERN; return NULL; } - if (!node && parser->error == LOREG_OK) { - parser->error = LOREG_ERR_OUT_OF_MEMORY; + if (!node && parser->error == LOREX_OK) { + parser->error = LOREX_ERR_OUT_OF_MEMORY; } return node; } @@ -231,7 +237,7 @@ static ast_node_t *parse_bracket(parser_t *parser) { bracket_class_t *bracket = bracket_create(); if (!bracket) { - parser->error = LOREG_ERR_OUT_OF_MEMORY; + parser->error = LOREX_ERR_OUT_OF_MEMORY; return NULL; } @@ -293,7 +299,7 @@ static ast_node_t *parse_bracket(parser_t *parser) { if (parser->current.type != TOKEN_RBRACKET) { bracket_free(bracket); - parser->error = LOREG_ERR_INVALID_PATTERN; + parser->error = LOREX_ERR_INVALID_PATTERN; return NULL; } parser_advance(parser); diff --git a/src/repl.c b/src/repl.c index 7386a77..9b3c234 100644 --- a/src/repl.c +++ b/src/repl.c @@ -1,6 +1,6 @@ /* retoor */ #include "repl.h" -#include "loreg.h" +#include "lorex.h" #include #include #include @@ -8,12 +8,12 @@ #define MAX_INPUT 4096 static void print_banner(void) { - printf("loreg v%s - regex interpreter\n", LOREG_VERSION); + printf("lorex v%s - regex interpreter\n", LOREX_VERSION); printf("commands: :q quit, :h help, :p set pattern, :m match, :s search\n\n"); } static void print_help(void) { - printf("loreg REPL commands:\n"); + printf("lorex REPL commands:\n"); printf(" :q quit\n"); printf(" :h show this help\n"); printf(" :p compile and set pattern\n"); @@ -40,7 +40,7 @@ static void print_help(void) { printf(" \\D \\W \\S negated classes\n\n"); } -static void print_match(const char *text, loreg_match_t *result) { +static void print_match(const char *text, lorex_match_t *result) { if (!result->matched) { printf("no match\n"); return; @@ -83,7 +83,7 @@ static char *read_line(void) { void repl_run(void) { print_banner(); - loreg_regex_t *regex = NULL; + lorex_regex_t *regex = NULL; char *line; while ((line = read_line()) != NULL) { @@ -103,14 +103,14 @@ void repl_run(void) { while (*pattern == ' ') pattern++; if (regex) { - loreg_free(regex); + lorex_free(regex); regex = NULL; } - loreg_error_t error; - regex = loreg_compile(pattern, &error); + lorex_error_t error; + regex = lorex_compile(pattern, &error); if (!regex) { - printf("error: %s\n", loreg_error_string(error)); + printf("error: %s\n", lorex_error_string(error)); } else { printf("pattern compiled: %s\n", pattern); } @@ -126,8 +126,8 @@ void repl_run(void) { const char *text = line + 3; while (*text == ' ') text++; - loreg_match_t result; - loreg_match(regex, text, &result); + lorex_match_t result; + lorex_match(regex, text, &result); print_match(text, &result); continue; } @@ -141,8 +141,8 @@ void repl_run(void) { const char *text = line + 3; while (*text == ' ') text++; - loreg_match_t result; - loreg_search(regex, text, &result); + lorex_match_t result; + lorex_search(regex, text, &result); print_match(text, &result); continue; } @@ -157,13 +157,13 @@ void repl_run(void) { continue; } - loreg_match_t result; - loreg_search(regex, line, &result); + lorex_match_t result; + lorex_search(regex, line, &result); print_match(line, &result); } if (regex) { - loreg_free(regex); + lorex_free(regex); } printf("\n"); diff --git a/tests/benchmark.c b/tests/benchmark.c new file mode 100644 index 0000000..9000fbe --- /dev/null +++ b/tests/benchmark.c @@ -0,0 +1,448 @@ +/* retoor */ +#define _POSIX_C_SOURCE 200809L +#include "../include/lorex.h" +#include +#include +#include +#include +#include +#include + +#define ITERATIONS 10000 +#define WARMUP 1000 + +typedef struct { + const char *name; + const char *pattern; + const char *text; + int expect_match; +} benchmark_t; + +typedef struct { + double lorex_compile_us; + double lorex_match_us; + double lorex_total_us; + double posix_compile_us; + double posix_match_us; + double posix_total_us; + int lorex_matched; + int posix_matched; + int lorex_failed; + int posix_failed; +} result_t; + +static benchmark_t benchmarks[] = { + {"literal_short", "hello", "hello world", 1}, + {"literal_medium", "the quick brown", "the quick brown fox jumps over the lazy dog", 1}, + {"literal_long", "Lorem ipsum dolor sit amet", "Lorem ipsum dolor sit amet, consectetur adipiscing elit", 1}, + {"literal_nomatch", "xyz", "the quick brown fox jumps over the lazy dog", 0}, + {"literal_end", "dog", "the quick brown fox jumps over the lazy dog", 1}, + {"literal_repeated", "abcabc", "xyzabcabcdef", 1}, + + {"dot_single", "a.c", "abc", 1}, + {"dot_multiple", "a..b", "aXYb", 1}, + {"dot_many", "a.....b", "a12345b", 1}, + {"dot_star", "a.*b", "aXXXXXXXXXXb", 1}, + {"dot_plus", "a.+b", "aXXXXXXXXXXb", 1}, + + {"anchor_start", "^the", "the quick brown fox", 1}, + {"anchor_end", "fox$", "the quick brown fox", 1}, + {"anchor_both", "^hello$", "hello", 1}, + {"anchor_start_nomatch", "^fox", "the quick brown fox", 0}, + {"anchor_end_nomatch", "the$", "the quick brown fox", 0}, + + {"star_simple", "ab*c", "abbbbc", 1}, + {"star_zero", "ab*c", "ac", 1}, + {"star_greedy", "a.*b", "aXbXbXb", 1}, + {"star_repeated", "a*b*c*", "aaabbbccc", 1}, + {"star_empty", "a*", "", 1}, + + {"plus_simple", "ab+c", "abbbbc", 1}, + {"plus_one", "ab+c", "abc", 1}, + {"plus_nomatch", "ab+c", "ac", 0}, + {"plus_greedy", "a.+b", "aXbXbXb", 1}, + + {"question_present", "colou?r", "colour", 1}, + {"question_absent", "colou?r", "color", 1}, + {"question_multiple", "a?b?c?d", "abcd", 1}, + + {"class_vowels", "[aeiou]", "hello", 1}, + {"class_digits", "[0-9]+", "abc123def", 1}, + {"class_alpha", "[a-zA-Z]+", "HelloWorld", 1}, + {"class_alnum", "[a-zA-Z0-9]+", "Test123", 1}, + {"class_neg_digit", "[^0-9]+", "hello", 1}, + {"class_neg_alpha", "[^a-zA-Z]+", "12345", 1}, + {"class_complex", "[a-zA-Z_][a-zA-Z0-9_]*", "variable_name_123", 1}, + + {"alt_simple", "cat|dog", "I have a cat", 1}, + {"alt_simple2", "cat|dog", "I have a dog", 1}, + {"alt_three", "red|green|blue", "the color is green", 1}, + {"alt_nomatch", "cat|dog", "I have a bird", 0}, + {"alt_words", "hello|world|test", "this is a test", 1}, + + {"group_simple", "(ab)+", "ababab", 1}, + {"group_alt", "(cat|dog)s?", "cats", 1}, + {"group_nested", "((a)(b))+", "ababab", 1}, + {"group_complex", "(a(b(c)))+", "abcabc", 1}, + + {"quant_exact", "a{3}", "aaa", 1}, + {"quant_exact_long", "a{10}", "aaaaaaaaaa", 1}, + {"quant_range", "a{2,4}", "aaa", 1}, + {"quant_min", "a{3,}", "aaaaa", 1}, + {"quant_combined", "[0-9]{3}-[0-9]{4}", "555-1234", 1}, + + {"email_simple", "[a-z]+@[a-z]+\\.[a-z]+", "test@example.com", 1}, + {"email_complex", "[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}", "user.name+tag@sub.example.com", 1}, + {"ip_address", "[0-9]{1,3}\\.[0-9]{1,3}\\.[0-9]{1,3}\\.[0-9]{1,3}", "192.168.1.100", 1}, + {"url_http", "https?://[a-zA-Z0-9.-]+", "https://www.example.com", 1}, + {"phone_us", "[0-9]{3}-[0-9]{3}-[0-9]{4}", "555-123-4567", 1}, + {"date_iso", "[0-9]{4}-[0-9]{2}-[0-9]{2}", "2024-01-15", 1}, + {"time_hms", "[0-9]{2}:[0-9]{2}:[0-9]{2}", "14:30:45", 1}, + {"hex_color", "#[0-9a-fA-F]{6}", "#ff00ff", 1}, + + {"word_boundary", "[a-zA-Z]+", "hello world test", 1}, + {"whitespace", "[ \\t\\n]+", "hello world", 1}, + {"identifier", "[a-zA-Z_][a-zA-Z0-9_]*", "_privateVar123", 1}, + {"number_int", "-?[0-9]+", "-12345", 1}, + {"number_float", "-?[0-9]+\\.[0-9]+", "3.14159", 1}, + + {"long_text_start", "^The", "The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog.", 1}, + {"long_text_end", "dog\\.$", "The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog.", 1}, + {"long_text_middle", "fox", "The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog.", 1}, + {"long_text_nomatch", "elephant", "The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog.", 0}, + + {"repeated_ab", "(ab){5}", "ababababab", 1}, + {"repeated_word", "(hello ){3}", "hello hello hello ", 1}, + {"alternation_long", "one|two|three|four|five|six|seven|eight|nine|ten", "the number is seven", 1}, + + {"escape_dot", "3\\.14", "pi is 3.14", 1}, + {"escape_star", "a\\*b", "a*b", 1}, + {"escape_plus", "c\\+\\+", "c++", 1}, + {"escape_parens", "\\(test\\)", "(test)", 1}, + {"escape_brackets", "\\[0\\]", "array[0]", 1}, + + {"stress_star", "a*a*a*a*a*b", "aaaaab", 1}, + {"stress_plus", "a+a+a+a+a+b", "aaaaab", 1}, + {"stress_nested", "((a+)+)+b", "aaaab", 1}, + {"stress_alt", "(a|aa|aaa|aaaa)+b", "aaaab", 1}, + + {"nomatch_literal", "notfound", "the quick brown fox", 0}, + {"nomatch_pattern", "^end", "start middle end", 0}, + {"nomatch_class", "[0-9]+", "no digits here", 0}, + + {NULL, NULL, NULL, 0} +}; + +static double get_time_us(void) { + struct timeval tv; + gettimeofday(&tv, NULL); + return tv.tv_sec * 1000000.0 + tv.tv_usec; +} + +static result_t run_benchmark(benchmark_t *bench) { + result_t res = {0}; + double start, end; + + for (int i = 0; i < WARMUP; i++) { + lorex_error_t err; + lorex_regex_t *re = lorex_compile(bench->pattern, &err); + if (re) { + lorex_match_t m; + lorex_search(re, bench->text, &m); + lorex_free(re); + } + } + + start = get_time_us(); + for (int i = 0; i < ITERATIONS; i++) { + lorex_error_t err; + lorex_regex_t *re = lorex_compile(bench->pattern, &err); + if (!re) { + res.lorex_failed = 1; + break; + } + lorex_free(re); + } + end = get_time_us(); + res.lorex_compile_us = (end - start) / ITERATIONS; + + start = get_time_us(); + { + lorex_error_t err; + lorex_regex_t *re = lorex_compile(bench->pattern, &err); + if (re) { + for (int i = 0; i < ITERATIONS; i++) { + lorex_match_t m; + res.lorex_matched = lorex_search(re, bench->text, &m) ? 1 : 0; + } + lorex_free(re); + } + } + end = get_time_us(); + res.lorex_match_us = (end - start) / ITERATIONS; + res.lorex_total_us = res.lorex_compile_us + res.lorex_match_us; + + for (int i = 0; i < WARMUP; i++) { + regex_t preg; + if (regcomp(&preg, bench->pattern, REG_EXTENDED) == 0) { + regmatch_t pmatch[1]; + regexec(&preg, bench->text, 1, pmatch, 0); + regfree(&preg); + } + } + + start = get_time_us(); + for (int i = 0; i < ITERATIONS; i++) { + regex_t preg; + if (regcomp(&preg, bench->pattern, REG_EXTENDED) != 0) { + res.posix_failed = 1; + break; + } + regfree(&preg); + } + end = get_time_us(); + res.posix_compile_us = (end - start) / ITERATIONS; + + start = get_time_us(); + { + regex_t preg; + if (regcomp(&preg, bench->pattern, REG_EXTENDED) == 0) { + for (int i = 0; i < ITERATIONS; i++) { + regmatch_t pmatch[1]; + res.posix_matched = (regexec(&preg, bench->text, 1, pmatch, 0) == 0) ? 1 : 0; + } + regfree(&preg); + } + } + end = get_time_us(); + res.posix_match_us = (end - start) / ITERATIONS; + res.posix_total_us = res.posix_compile_us + res.posix_match_us; + + return res; +} + +int main(void) { + printf("================================================================================\n"); + printf(" LOREX vs POSIX REGEX PERFORMANCE BENCHMARK\n"); + printf("================================================================================\n\n"); + printf("Configuration:\n"); + printf(" Iterations per test: %d\n", ITERATIONS); + printf(" Warmup iterations: %d\n", WARMUP); + printf("\n"); + + int total_tests = 0; + int lorex_wins = 0; + int posix_wins = 0; + int ties = 0; + double total_lorex_time = 0; + double total_posix_time = 0; + + int lorex_compile_wins = 0; + int posix_compile_wins = 0; + int lorex_match_wins = 0; + int posix_match_wins = 0; + + printf("================================================================================\n"); + printf("%-25s | %-12s | %-12s | %-12s | %-8s\n", "TEST NAME", "LOREX (us)", "POSIX (us)", "SPEEDUP", "WINNER"); + printf("================================================================================\n"); + + for (int i = 0; benchmarks[i].name != NULL; i++) { + benchmark_t *bench = &benchmarks[i]; + result_t res = run_benchmark(bench); + + if (res.lorex_failed || res.posix_failed) { + printf("%-25s | %-12s | %-12s | %-12s | %-8s\n", + bench->name, + res.lorex_failed ? "FAILED" : "OK", + res.posix_failed ? "FAILED" : "OK", + "-", "-"); + continue; + } + + total_tests++; + total_lorex_time += res.lorex_total_us; + total_posix_time += res.posix_total_us; + + double speedup = res.posix_total_us / res.lorex_total_us; + const char *winner; + + if (speedup > 1.05) { + winner = "LOREX"; + lorex_wins++; + } else if (speedup < 0.95) { + winner = "POSIX"; + posix_wins++; + } else { + winner = "TIE"; + ties++; + } + + if (res.lorex_compile_us < res.posix_compile_us) lorex_compile_wins++; + else posix_compile_wins++; + + if (res.lorex_match_us < res.posix_match_us) lorex_match_wins++; + else posix_match_wins++; + + printf("%-25s | %10.3f | %10.3f | %10.2fx | %-8s\n", + bench->name, + res.lorex_total_us, + res.posix_total_us, + speedup, + winner); + } + + printf("================================================================================\n\n"); + + printf("================================================================================\n"); + printf(" DETAILED RESULTS\n"); + printf("================================================================================\n\n"); + + printf("%-25s | %-20s | %-20s\n", "TEST NAME", "LOREX (compile/match)", "POSIX (compile/match)"); + printf("--------------------------------------------------------------------------------\n"); + + for (int i = 0; benchmarks[i].name != NULL; i++) { + benchmark_t *bench = &benchmarks[i]; + result_t res = run_benchmark(bench); + + if (res.lorex_failed || res.posix_failed) continue; + + printf("%-25s | %8.3f / %8.3f | %8.3f / %8.3f\n", + bench->name, + res.lorex_compile_us, res.lorex_match_us, + res.posix_compile_us, res.posix_match_us); + } + + printf("\n================================================================================\n"); + printf(" SUMMARY\n"); + printf("================================================================================\n\n"); + + printf("Total tests: %d\n", total_tests); + printf("\n"); + printf("Overall wins:\n"); + printf(" LOREX wins: %d (%.1f%%)\n", lorex_wins, 100.0 * lorex_wins / total_tests); + printf(" POSIX wins: %d (%.1f%%)\n", posix_wins, 100.0 * posix_wins / total_tests); + printf(" Ties: %d (%.1f%%)\n", ties, 100.0 * ties / total_tests); + printf("\n"); + printf("Compilation phase wins:\n"); + printf(" LOREX faster: %d\n", lorex_compile_wins); + printf(" POSIX faster: %d\n", posix_compile_wins); + printf("\n"); + printf("Matching phase wins:\n"); + printf(" LOREX faster: %d\n", lorex_match_wins); + printf(" POSIX faster: %d\n", posix_match_wins); + printf("\n"); + printf("Total time (all tests):\n"); + printf(" LOREX: %.3f us\n", total_lorex_time); + printf(" POSIX: %.3f us\n", total_posix_time); + printf(" Overall speedup: %.2fx %s\n", + total_posix_time > total_lorex_time ? total_posix_time / total_lorex_time : total_lorex_time / total_posix_time, + total_posix_time > total_lorex_time ? "(LOREX faster)" : "(POSIX faster)"); + + printf("\n================================================================================\n"); + printf(" CATEGORY BREAKDOWN\n"); + printf("================================================================================\n\n"); + + typedef struct { + const char *category; + const char *prefix; + double lorex_total; + double posix_total; + int count; + } category_t; + + category_t categories[] = { + {"Literal matching", "literal_", 0, 0, 0}, + {"Dot metacharacter", "dot_", 0, 0, 0}, + {"Anchors", "anchor_", 0, 0, 0}, + {"Star quantifier", "star_", 0, 0, 0}, + {"Plus quantifier", "plus_", 0, 0, 0}, + {"Question quantifier", "question_", 0, 0, 0}, + {"Character classes", "class_", 0, 0, 0}, + {"Alternation", "alt_", 0, 0, 0}, + {"Groups", "group_", 0, 0, 0}, + {"Brace quantifiers", "quant_", 0, 0, 0}, + {"Real-world patterns", "email_", 0, 0, 0}, + {"Escape sequences", "escape_", 0, 0, 0}, + {"Stress tests", "stress_", 0, 0, 0}, + {"No-match tests", "nomatch_", 0, 0, 0}, + {NULL, NULL, 0, 0, 0} + }; + + for (int i = 0; benchmarks[i].name != NULL; i++) { + benchmark_t *bench = &benchmarks[i]; + result_t res = run_benchmark(bench); + if (res.lorex_failed || res.posix_failed) continue; + + for (int j = 0; categories[j].category != NULL; j++) { + if (strncmp(bench->name, categories[j].prefix, strlen(categories[j].prefix)) == 0) { + categories[j].lorex_total += res.lorex_total_us; + categories[j].posix_total += res.posix_total_us; + categories[j].count++; + break; + } + } + } + + printf("%-25s | %-12s | %-12s | %-12s | %-8s\n", "CATEGORY", "LOREX (us)", "POSIX (us)", "SPEEDUP", "WINNER"); + printf("--------------------------------------------------------------------------------\n"); + + for (int i = 0; categories[i].category != NULL; i++) { + if (categories[i].count == 0) continue; + + double speedup = categories[i].posix_total / categories[i].lorex_total; + const char *winner = speedup > 1.0 ? "LOREX" : "POSIX"; + + printf("%-25s | %10.3f | %10.3f | %10.2fx | %-8s\n", + categories[i].category, + categories[i].lorex_total, + categories[i].posix_total, + speedup > 1.0 ? speedup : 1.0 / speedup, + winner); + } + + printf("\n================================================================================\n"); + printf(" PATTERN DETAILS\n"); + printf("================================================================================\n\n"); + + for (int i = 0; benchmarks[i].name != NULL; i++) { + benchmark_t *bench = &benchmarks[i]; + result_t res = run_benchmark(bench); + + printf("Test: %s\n", bench->name); + printf(" Pattern: %s\n", bench->pattern); + printf(" Text: %.50s%s\n", bench->text, strlen(bench->text) > 50 ? "..." : ""); + printf(" Expected: %s\n", bench->expect_match ? "MATCH" : "NO MATCH"); + + if (res.lorex_failed) { + printf(" LOREX: FAILED TO COMPILE\n"); + } else { + printf(" LOREX: %s (compile: %.3f us, match: %.3f us, total: %.3f us)\n", + res.lorex_matched ? "MATCHED" : "NO MATCH", + res.lorex_compile_us, res.lorex_match_us, res.lorex_total_us); + } + + if (res.posix_failed) { + printf(" POSIX: FAILED TO COMPILE\n"); + } else { + printf(" POSIX: %s (compile: %.3f us, match: %.3f us, total: %.3f us)\n", + res.posix_matched ? "MATCHED" : "NO MATCH", + res.posix_compile_us, res.posix_match_us, res.posix_total_us); + } + + if (!res.lorex_failed && !res.posix_failed) { + double speedup = res.posix_total_us / res.lorex_total_us; + if (speedup > 1.0) { + printf(" Result: LOREX is %.2fx faster\n", speedup); + } else { + printf(" Result: POSIX is %.2fx faster\n", 1.0 / speedup); + } + } + printf("\n"); + } + + printf("================================================================================\n"); + printf(" BENCHMARK COMPLETE\n"); + printf("================================================================================\n"); + + return 0; +} diff --git a/tests/test_all.c b/tests/test_all.c index 07088c8..8eb9a0d 100644 --- a/tests/test_all.c +++ b/tests/test_all.c @@ -1,5 +1,5 @@ /* retoor */ -#include "../include/loreg.h" +#include "../include/lorex.h" #include #include #include @@ -22,211 +22,211 @@ static int total_failed = 0; } while(0) TEST(basic_literals) { - loreg_error_t err; - loreg_regex_t *re = loreg_compile("hello", &err); + lorex_error_t err; + lorex_regex_t *re = lorex_compile("hello", &err); ASSERT(re != NULL, "compile hello"); - loreg_match_t m; - ASSERT(loreg_search(re, "hello", &m), "match hello"); - ASSERT(loreg_search(re, "say hello world", &m), "search hello"); - ASSERT(!loreg_search(re, "helo", &m), "no match helo"); + lorex_match_t m; + ASSERT(lorex_search(re, "hello", &m), "match hello"); + ASSERT(lorex_search(re, "say hello world", &m), "search hello"); + ASSERT(!lorex_search(re, "helo", &m), "no match helo"); - loreg_free(re); + lorex_free(re); } TEST(metacharacters) { - loreg_error_t err; - loreg_match_t m; + lorex_error_t err; + lorex_match_t m; - loreg_regex_t *re = loreg_compile("a.c", &err); + lorex_regex_t *re = lorex_compile("a.c", &err); ASSERT(re != NULL, "compile a.c"); - ASSERT(loreg_search(re, "abc", &m), "match abc"); - ASSERT(loreg_search(re, "axc", &m), "match axc"); - ASSERT(!loreg_search(re, "ac", &m), "no match ac"); - loreg_free(re); + ASSERT(lorex_search(re, "abc", &m), "match abc"); + ASSERT(lorex_search(re, "axc", &m), "match axc"); + ASSERT(!lorex_search(re, "ac", &m), "no match ac"); + lorex_free(re); - re = loreg_compile("^start", &err); + re = lorex_compile("^start", &err); ASSERT(re != NULL, "compile ^start"); - ASSERT(loreg_search(re, "start here", &m), "match start here"); - ASSERT(!loreg_search(re, "not start", &m), "no match not start"); - loreg_free(re); + ASSERT(lorex_search(re, "start here", &m), "match start here"); + ASSERT(!lorex_search(re, "not start", &m), "no match not start"); + lorex_free(re); - re = loreg_compile("end$", &err); + re = lorex_compile("end$", &err); ASSERT(re != NULL, "compile end$"); - ASSERT(loreg_search(re, "the end", &m), "match the end"); - ASSERT(!loreg_search(re, "end here", &m), "no match end here"); - loreg_free(re); + ASSERT(lorex_search(re, "the end", &m), "match the end"); + ASSERT(!lorex_search(re, "end here", &m), "no match end here"); + lorex_free(re); } TEST(quantifiers) { - loreg_error_t err; - loreg_match_t m; + lorex_error_t err; + lorex_match_t m; - loreg_regex_t *re = loreg_compile("ab*c", &err); + lorex_regex_t *re = lorex_compile("ab*c", &err); ASSERT(re != NULL, "compile ab*c"); - ASSERT(loreg_search(re, "ac", &m), "match ac"); - ASSERT(loreg_search(re, "abc", &m), "match abc"); - ASSERT(loreg_search(re, "abbbbc", &m), "match abbbbc"); - loreg_free(re); + ASSERT(lorex_search(re, "ac", &m), "match ac"); + ASSERT(lorex_search(re, "abc", &m), "match abc"); + ASSERT(lorex_search(re, "abbbbc", &m), "match abbbbc"); + lorex_free(re); - re = loreg_compile("ab+c", &err); + re = lorex_compile("ab+c", &err); ASSERT(re != NULL, "compile ab+c"); - ASSERT(!loreg_search(re, "ac", &m), "no match ac"); - ASSERT(loreg_search(re, "abc", &m), "match abc"); - ASSERT(loreg_search(re, "abbbbc", &m), "match abbbbc"); - loreg_free(re); + ASSERT(!lorex_search(re, "ac", &m), "no match ac"); + ASSERT(lorex_search(re, "abc", &m), "match abc"); + ASSERT(lorex_search(re, "abbbbc", &m), "match abbbbc"); + lorex_free(re); - re = loreg_compile("ab?c", &err); + re = lorex_compile("ab?c", &err); ASSERT(re != NULL, "compile ab?c"); - ASSERT(loreg_search(re, "ac", &m), "match ac"); - ASSERT(loreg_search(re, "abc", &m), "match abc"); - ASSERT(!loreg_search(re, "abbc", &m), "no match abbc"); - loreg_free(re); + ASSERT(lorex_search(re, "ac", &m), "match ac"); + ASSERT(lorex_search(re, "abc", &m), "match abc"); + ASSERT(!lorex_search(re, "abbc", &m), "no match abbc"); + lorex_free(re); - re = loreg_compile("a{3}", &err); + re = lorex_compile("a{3}", &err); ASSERT(re != NULL, "compile a{3}"); - ASSERT(loreg_search(re, "aaa", &m), "match aaa"); - ASSERT(!loreg_search(re, "aa", &m), "no match aa"); - loreg_free(re); + ASSERT(lorex_search(re, "aaa", &m), "match aaa"); + ASSERT(!lorex_search(re, "aa", &m), "no match aa"); + lorex_free(re); - re = loreg_compile("a{2,4}", &err); + re = lorex_compile("a{2,4}", &err); ASSERT(re != NULL, "compile a{2,4}"); - ASSERT(loreg_search(re, "aa", &m), "match aa"); - ASSERT(loreg_search(re, "aaa", &m), "match aaa"); - ASSERT(loreg_search(re, "aaaa", &m), "match aaaa"); - ASSERT(!loreg_search(re, "a", &m), "no match a"); - loreg_free(re); + ASSERT(lorex_search(re, "aa", &m), "match aa"); + ASSERT(lorex_search(re, "aaa", &m), "match aaa"); + ASSERT(lorex_search(re, "aaaa", &m), "match aaaa"); + ASSERT(!lorex_search(re, "a", &m), "no match a"); + lorex_free(re); } TEST(character_classes) { - loreg_error_t err; - loreg_match_t m; + lorex_error_t err; + lorex_match_t m; - loreg_regex_t *re = loreg_compile("[aeiou]", &err); + lorex_regex_t *re = lorex_compile("[aeiou]", &err); ASSERT(re != NULL, "compile [aeiou]"); - ASSERT(loreg_search(re, "a", &m), "match a"); - ASSERT(loreg_search(re, "test", &m), "match test"); - ASSERT(!loreg_search(re, "xyz", &m), "no match xyz"); - loreg_free(re); + ASSERT(lorex_search(re, "a", &m), "match a"); + ASSERT(lorex_search(re, "test", &m), "match test"); + ASSERT(!lorex_search(re, "xyz", &m), "no match xyz"); + lorex_free(re); - re = loreg_compile("[a-z]", &err); + re = lorex_compile("[a-z]", &err); ASSERT(re != NULL, "compile [a-z]"); - ASSERT(loreg_search(re, "m", &m), "match m"); - ASSERT(!loreg_search(re, "5", &m), "no match 5"); - loreg_free(re); + ASSERT(lorex_search(re, "m", &m), "match m"); + ASSERT(!lorex_search(re, "5", &m), "no match 5"); + lorex_free(re); - re = loreg_compile("[^0-9]", &err); + re = lorex_compile("[^0-9]", &err); ASSERT(re != NULL, "compile [^0-9]"); - ASSERT(loreg_search(re, "a", &m), "match a"); - ASSERT(!loreg_search(re, "5", &m), "no match 5"); - loreg_free(re); + ASSERT(lorex_search(re, "a", &m), "match a"); + ASSERT(!lorex_search(re, "5", &m), "no match 5"); + lorex_free(re); - re = loreg_compile("\\d", &err); + re = lorex_compile("\\d", &err); ASSERT(re != NULL, "compile \\d"); - ASSERT(loreg_search(re, "5", &m), "match 5"); - ASSERT(!loreg_search(re, "a", &m), "no match a"); - loreg_free(re); + ASSERT(lorex_search(re, "5", &m), "match 5"); + ASSERT(!lorex_search(re, "a", &m), "no match a"); + lorex_free(re); - re = loreg_compile("\\w+", &err); + re = lorex_compile("\\w+", &err); ASSERT(re != NULL, "compile \\w+"); - ASSERT(loreg_search(re, "hello_123", &m), "match hello_123"); - loreg_free(re); + ASSERT(lorex_search(re, "hello_123", &m), "match hello_123"); + lorex_free(re); - re = loreg_compile("\\s", &err); + re = lorex_compile("\\s", &err); ASSERT(re != NULL, "compile \\s"); - ASSERT(loreg_search(re, " ", &m), "match space"); - ASSERT(loreg_search(re, "\t", &m), "match tab"); - ASSERT(!loreg_search(re, "a", &m), "no match a"); - loreg_free(re); + ASSERT(lorex_search(re, " ", &m), "match space"); + ASSERT(lorex_search(re, "\t", &m), "match tab"); + ASSERT(!lorex_search(re, "a", &m), "no match a"); + lorex_free(re); } TEST(groups) { - loreg_error_t err; - loreg_match_t m; + lorex_error_t err; + lorex_match_t m; - loreg_regex_t *re = loreg_compile("(ab)+", &err); + lorex_regex_t *re = lorex_compile("(ab)+", &err); ASSERT(re != NULL, "compile (ab)+"); - ASSERT(loreg_search(re, "ab", &m), "match ab"); - ASSERT(loreg_search(re, "abab", &m), "match abab"); - ASSERT(!loreg_search(re, "a", &m), "no match a"); - loreg_free(re); + ASSERT(lorex_search(re, "ab", &m), "match ab"); + ASSERT(lorex_search(re, "abab", &m), "match abab"); + ASSERT(!lorex_search(re, "a", &m), "no match a"); + lorex_free(re); - re = loreg_compile("(\\d+)-(\\d+)", &err); + re = lorex_compile("(\\d+)-(\\d+)", &err); ASSERT(re != NULL, "compile groups"); - ASSERT(loreg_search(re, "123-456", &m), "match 123-456"); + ASSERT(lorex_search(re, "123-456", &m), "match 123-456"); ASSERT(m.group_count == 2, "2 groups"); ASSERT(m.groups[0].matched, "group 0 matched"); ASSERT(m.groups[1].matched, "group 1 matched"); - loreg_free(re); + lorex_free(re); } TEST(alternation) { - loreg_error_t err; - loreg_match_t m; + lorex_error_t err; + lorex_match_t m; - loreg_regex_t *re = loreg_compile("cat|dog", &err); + lorex_regex_t *re = lorex_compile("cat|dog", &err); ASSERT(re != NULL, "compile cat|dog"); - ASSERT(loreg_search(re, "cat", &m), "match cat"); - ASSERT(loreg_search(re, "dog", &m), "match dog"); - ASSERT(!loreg_search(re, "rat", &m), "no match rat"); - loreg_free(re); + ASSERT(lorex_search(re, "cat", &m), "match cat"); + ASSERT(lorex_search(re, "dog", &m), "match dog"); + ASSERT(!lorex_search(re, "rat", &m), "no match rat"); + lorex_free(re); - re = loreg_compile("(red|blue) car", &err); + re = lorex_compile("(red|blue) car", &err); ASSERT(re != NULL, "compile (red|blue) car"); - ASSERT(loreg_search(re, "red car", &m), "match red car"); - ASSERT(loreg_search(re, "blue car", &m), "match blue car"); - ASSERT(!loreg_search(re, "green car", &m), "no match green car"); - loreg_free(re); + ASSERT(lorex_search(re, "red car", &m), "match red car"); + ASSERT(lorex_search(re, "blue car", &m), "match blue car"); + ASSERT(!lorex_search(re, "green car", &m), "no match green car"); + lorex_free(re); } TEST(escapes) { - loreg_error_t err; - loreg_match_t m; + lorex_error_t err; + lorex_match_t m; - loreg_regex_t *re = loreg_compile("1\\.5", &err); + lorex_regex_t *re = lorex_compile("1\\.5", &err); ASSERT(re != NULL, "compile 1\\.5"); - ASSERT(loreg_search(re, "1.5", &m), "match 1.5"); - ASSERT(!loreg_search(re, "1x5", &m), "no match 1x5"); - loreg_free(re); + ASSERT(lorex_search(re, "1.5", &m), "match 1.5"); + ASSERT(!lorex_search(re, "1x5", &m), "no match 1x5"); + lorex_free(re); - re = loreg_compile("\\(test\\)", &err); + re = lorex_compile("\\(test\\)", &err); ASSERT(re != NULL, "compile \\(test\\)"); - ASSERT(loreg_search(re, "(test)", &m), "match (test)"); - loreg_free(re); + ASSERT(lorex_search(re, "(test)", &m), "match (test)"); + lorex_free(re); } TEST(real_patterns) { - loreg_error_t err; - loreg_match_t m; + lorex_error_t err; + lorex_match_t m; - loreg_regex_t *re = loreg_compile("[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}", &err); + lorex_regex_t *re = lorex_compile("[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}", &err); ASSERT(re != NULL, "compile email"); - ASSERT(loreg_search(re, "user@example.com", &m), "match email"); - ASSERT(!loreg_search(re, "invalid", &m), "no match invalid"); - loreg_free(re); + ASSERT(lorex_search(re, "user@example.com", &m), "match email"); + ASSERT(!lorex_search(re, "invalid", &m), "no match invalid"); + lorex_free(re); - re = loreg_compile("\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}", &err); + re = lorex_compile("\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}", &err); ASSERT(re != NULL, "compile ip"); - ASSERT(loreg_search(re, "192.168.1.1", &m), "match ip"); - loreg_free(re); + ASSERT(lorex_search(re, "192.168.1.1", &m), "match ip"); + lorex_free(re); - re = loreg_compile("https?://[a-zA-Z0-9.-]+(/[a-zA-Z0-9./-]*)?", &err); + re = lorex_compile("https?://[a-zA-Z0-9.-]+(/[a-zA-Z0-9./-]*)?", &err); ASSERT(re != NULL, "compile url"); - ASSERT(loreg_search(re, "http://example.com", &m), "match http"); - ASSERT(loreg_search(re, "https://example.com/path", &m), "match https"); - loreg_free(re); + ASSERT(lorex_search(re, "http://example.com", &m), "match http"); + ASSERT(lorex_search(re, "https://example.com/path", &m), "match https"); + lorex_free(re); } TEST(error_handling) { - loreg_error_t err; + lorex_error_t err; - loreg_regex_t *re = loreg_compile("(abc", &err); + lorex_regex_t *re = lorex_compile("(abc", &err); ASSERT(re == NULL, "unbalanced paren"); - ASSERT(err == LOREG_ERR_UNBALANCED_PAREN, "correct error"); + ASSERT(err == LOREX_ERR_UNBALANCED_PAREN, "correct error"); } int main(void) { - printf("loreg comprehensive tests\n"); + printf("lorex comprehensive tests\n"); printf("========================\n\n"); clock_t start = clock(); diff --git a/tests/test_integration.c b/tests/test_integration.c index 4134b34..554e682 100644 --- a/tests/test_integration.c +++ b/tests/test_integration.c @@ -1,5 +1,5 @@ /* retoor */ -#include "../include/loreg.h" +#include "../include/lorex.h" #include #include @@ -10,22 +10,22 @@ static int failed = 0; #define NO_MATCH(pat, txt) test_match(pat, txt, 0, __LINE__) static void test_match(const char *pattern, const char *text, int expect, int line) { - loreg_error_t err; - loreg_regex_t *re = loreg_compile(pattern, &err); + lorex_error_t err; + lorex_regex_t *re = lorex_compile(pattern, &err); if (!re) { - printf("FAIL line %d: compile error for '%s': %s\n", line, pattern, loreg_error_string(err)); + printf("FAIL line %d: compile error for '%s': %s\n", line, pattern, lorex_error_string(err)); failed++; return; } - loreg_match_t m; - int result = loreg_search(re, text, &m) ? 1 : 0; + lorex_match_t m; + int result = lorex_search(re, text, &m) ? 1 : 0; if (result != expect) { printf("FAIL line %d: '%s' vs '%s' expected %s\n", line, pattern, text, expect ? "match" : "no match"); failed++; } else { passed++; } - loreg_free(re); + lorex_free(re); } static void test_literals(void) { @@ -613,8 +613,240 @@ static void test_pathological_patterns(void) { MATCH("(a?){5}a{5}", "aaaaa"); } +static void test_anchored_match(void) { + printf(" anchored match (lorex_match)...\n"); + lorex_error_t err; + lorex_match_t m; + + lorex_regex_t *re = lorex_compile("abc", &err); + if (re) { + if (lorex_match(re, "abc", &m)) { + passed++; + } else { + printf("FAIL: lorex_match should match 'abc' against 'abc'\n"); + failed++; + } + if (!lorex_match(re, "xabc", &m)) { + passed++; + } else { + printf("FAIL: lorex_match should not match 'abc' against 'xabc'\n"); + failed++; + } + if (lorex_match(re, "abcx", &m)) { + passed++; + } else { + printf("FAIL: lorex_match should match 'abc' at start of 'abcx'\n"); + failed++; + } + lorex_free(re); + } + + re = lorex_compile("^abc$", &err); + if (re) { + if (lorex_match(re, "abc", &m)) { + passed++; + } else { + printf("FAIL: lorex_match should match '^abc$' against 'abc'\n"); + failed++; + } + if (!lorex_match(re, "abcx", &m)) { + passed++; + } else { + printf("FAIL: lorex_match should not match '^abc$' against 'abcx'\n"); + failed++; + } + lorex_free(re); + } + + re = lorex_compile("a.*z", &err); + if (re) { + if (lorex_match(re, "abcz", &m)) { + passed++; + } else { + printf("FAIL: lorex_match should match 'a.*z' against 'abcz'\n"); + failed++; + } + if (!lorex_match(re, "xabcz", &m)) { + passed++; + } else { + printf("FAIL: lorex_match should not match 'a.*z' against 'xabcz'\n"); + failed++; + } + lorex_free(re); + } +} + +static void test_error_strings(void) { + printf(" error strings...\n"); + + if (strcmp(lorex_error_string(LOREX_OK), "success") == 0) { + passed++; + } else { + printf("FAIL: LOREX_OK should return 'success'\n"); + failed++; + } + + if (strcmp(lorex_error_string(LOREX_ERR_INVALID_PATTERN), "invalid pattern") == 0) { + passed++; + } else { + printf("FAIL: LOREX_ERR_INVALID_PATTERN error string\n"); + failed++; + } + + if (strcmp(lorex_error_string(LOREX_ERR_UNBALANCED_PAREN), "unbalanced parentheses") == 0) { + passed++; + } else { + printf("FAIL: LOREX_ERR_UNBALANCED_PAREN error string\n"); + failed++; + } + + if (strcmp(lorex_error_string(LOREX_ERR_EMPTY_GROUP), "empty group") == 0) { + passed++; + } else { + printf("FAIL: LOREX_ERR_EMPTY_GROUP error string\n"); + failed++; + } + + if (strcmp(lorex_error_string(LOREX_ERR_INVALID_QUANTIFIER), "invalid quantifier") == 0) { + passed++; + } else { + printf("FAIL: LOREX_ERR_INVALID_QUANTIFIER error string\n"); + failed++; + } + + if (strcmp(lorex_error_string(LOREX_ERR_INVALID_ESCAPE), "invalid escape sequence") == 0) { + passed++; + } else { + printf("FAIL: LOREX_ERR_INVALID_ESCAPE error string\n"); + failed++; + } + + if (strcmp(lorex_error_string(LOREX_ERR_OUT_OF_MEMORY), "out of memory") == 0) { + passed++; + } else { + printf("FAIL: LOREX_ERR_OUT_OF_MEMORY error string\n"); + failed++; + } + + if (strcmp(lorex_error_string(LOREX_ERR_STATE_OVERFLOW), "state overflow") == 0) { + passed++; + } else { + printf("FAIL: LOREX_ERR_STATE_OVERFLOW error string\n"); + failed++; + } + + if (strcmp(lorex_error_string((lorex_error_t)99), "unknown error") == 0) { + passed++; + } else { + printf("FAIL: unknown error code should return 'unknown error'\n"); + failed++; + } +} + +static void test_parser_errors(void) { + printf(" parser errors...\n"); + lorex_error_t err; + lorex_regex_t *re; + + re = lorex_compile("(abc", &err); + if (re == NULL && err == LOREX_ERR_UNBALANCED_PAREN) { + passed++; + } else { + printf("FAIL: '(abc' should fail with unbalanced paren\n"); + failed++; + if (re) lorex_free(re); + } + + re = lorex_compile("((a)", &err); + if (re == NULL && err == LOREX_ERR_UNBALANCED_PAREN) { + passed++; + } else { + printf("FAIL: '((a)' should fail with unbalanced paren\n"); + failed++; + if (re) lorex_free(re); + } + + re = lorex_compile("a{5,2}", &err); + if (re == NULL) { + passed++; + } else { + printf("FAIL: 'a{5,2}' should fail (min > max)\n"); + failed++; + lorex_free(re); + } + + re = lorex_compile("*abc", &err); + if (re == NULL) { + passed++; + } else { + printf("FAIL: '*abc' should fail\n"); + failed++; + lorex_free(re); + } + + re = lorex_compile("+abc", &err); + if (re == NULL) { + passed++; + } else { + printf("FAIL: '+abc' should fail\n"); + failed++; + lorex_free(re); + } + + re = lorex_compile("?abc", &err); + if (re == NULL) { + passed++; + } else { + printf("FAIL: '?abc' should fail\n"); + failed++; + lorex_free(re); + } +} + +static void test_bracket_char_classes(void) { + printf(" bracket character classes...\n"); + MATCH("[\\d]", "5"); + MATCH("[\\d]+", "12345"); + NO_MATCH("[\\d]", "a"); + MATCH("[\\w]", "a"); + MATCH("[\\w]", "Z"); + MATCH("[\\w]", "5"); + MATCH("[\\w]", "_"); + NO_MATCH("[\\w]", " "); + MATCH("[\\s]", " "); + MATCH("[\\s]", "\t"); + NO_MATCH("[\\s]", "a"); + MATCH("[a\\d]", "a"); + MATCH("[a\\d]", "5"); + NO_MATCH("[a\\d]", "b"); + MATCH("[\\da-z]", "5"); + MATCH("[\\da-z]", "m"); + NO_MATCH("[\\da-z]", "M"); + MATCH("[\\w\\s]+", "hello world"); + MATCH("[0-9\\s]+", "1 2 3"); + MATCH("[\\w-]+", "hello-world"); +} + +static void test_special_escapes(void) { + printf(" special escape sequences...\n"); + MATCH("\\n", "\n"); + MATCH("a\\nb", "a\nb"); + MATCH("\\t", "\t"); + MATCH("a\\tb", "a\tb"); + MATCH("\\r", "\r"); + MATCH("a\\rb", "a\rb"); + MATCH("\\n\\t\\r", "\n\t\r"); + MATCH("[\\n]", "\n"); + MATCH("[\\t]", "\t"); + MATCH("[\\r]", "\r"); + MATCH("[\\n\\t]+", "\n\t\n"); + NO_MATCH("\\n", "n"); + NO_MATCH("\\t", "t"); + NO_MATCH("\\r", "r"); +} + int main(void) { - printf("loreg integration tests\n"); + printf("lorex integration tests\n"); printf("=======================\n\n"); test_literals(); @@ -641,6 +873,11 @@ int main(void) { test_nested_groups(); test_real_world_patterns(); test_pathological_patterns(); + test_anchored_match(); + test_error_strings(); + test_parser_errors(); + test_bracket_char_classes(); + test_special_escapes(); printf("\n=======================\n"); printf("integration: %d passed, %d failed\n", passed, failed); diff --git a/tests/test_matcher.c b/tests/test_matcher.c index 0e51333..e85a9ce 100644 --- a/tests/test_matcher.c +++ b/tests/test_matcher.c @@ -1,5 +1,5 @@ /* retoor */ -#include "../include/loreg.h" +#include "../include/lorex.h" #include #include @@ -23,21 +23,21 @@ static int tests_failed = 0; } while(0) #define ASSERT_MATCH(pattern, text) do { \ - loreg_error_t err; \ - loreg_regex_t *re = loreg_compile(pattern, &err); \ + lorex_error_t err; \ + lorex_regex_t *re = lorex_compile(pattern, &err); \ ASSERT(re != NULL); \ - loreg_match_t result; \ - ASSERT(loreg_search(re, text, &result) == true); \ - loreg_free(re); \ + lorex_match_t result; \ + ASSERT(lorex_search(re, text, &result) == true); \ + lorex_free(re); \ } while(0) #define ASSERT_NO_MATCH(pattern, text) do { \ - loreg_error_t err; \ - loreg_regex_t *re = loreg_compile(pattern, &err); \ + lorex_error_t err; \ + lorex_regex_t *re = lorex_compile(pattern, &err); \ ASSERT(re != NULL); \ - loreg_match_t result; \ - ASSERT(loreg_search(re, text, &result) == false); \ - loreg_free(re); \ + lorex_match_t result; \ + ASSERT(lorex_search(re, text, &result) == false); \ + lorex_free(re); \ } while(0) TEST(simple_char) { @@ -209,53 +209,53 @@ TEST(complex_url) { } TEST(group_capture) { - loreg_error_t err; - loreg_regex_t *re = loreg_compile("(\\d+)-(\\d+)", &err); + lorex_error_t err; + lorex_regex_t *re = lorex_compile("(\\d+)-(\\d+)", &err); ASSERT(re != NULL); - loreg_match_t result; - ASSERT(loreg_search(re, "123-456", &result)); + lorex_match_t result; + ASSERT(lorex_search(re, "123-456", &result)); ASSERT(result.group_count == 2); ASSERT(result.groups[0].matched); ASSERT(result.groups[1].matched); - loreg_free(re); + lorex_free(re); } TEST(nested_groups) { - loreg_error_t err; - loreg_regex_t *re = loreg_compile("((a)(b))", &err); + lorex_error_t err; + lorex_regex_t *re = lorex_compile("((a)(b))", &err); ASSERT(re != NULL); - loreg_match_t result; - ASSERT(loreg_search(re, "ab", &result)); + lorex_match_t result; + ASSERT(lorex_search(re, "ab", &result)); ASSERT(result.group_count == 3); - loreg_free(re); + lorex_free(re); } TEST(empty_pattern) { - loreg_error_t err; - loreg_regex_t *re = loreg_compile("", &err); + lorex_error_t err; + lorex_regex_t *re = lorex_compile("", &err); ASSERT(re != NULL); - loreg_match_t result; - ASSERT(loreg_match(re, "anything", &result)); + lorex_match_t result; + ASSERT(lorex_match(re, "anything", &result)); - loreg_free(re); + lorex_free(re); } TEST(match_position) { - loreg_error_t err; - loreg_regex_t *re = loreg_compile("test", &err); + lorex_error_t err; + lorex_regex_t *re = lorex_compile("test", &err); ASSERT(re != NULL); - loreg_match_t result; - ASSERT(loreg_search(re, "xxxtestyyy", &result)); + lorex_match_t result; + ASSERT(lorex_search(re, "xxxtestyyy", &result)); ASSERT(result.match_start == 3); ASSERT(result.match_end == 7); - loreg_free(re); + lorex_free(re); } int main(void) { diff --git a/tests/test_nfa.c b/tests/test_nfa.c index 03c4b77..dc837e2 100644 --- a/tests/test_nfa.c +++ b/tests/test_nfa.c @@ -27,11 +27,11 @@ static nfa_t *compile_pattern(const char *pattern) { parser_t parser; parser_init(&parser, pattern); ast_node_t *ast = parser_parse(&parser); - if (!ast || parser_get_error(&parser) != LOREG_OK) { + if (!ast || parser_get_error(&parser) != LOREX_OK) { ast_free(ast); return NULL; } - loreg_error_t error; + lorex_error_t error; nfa_t *nfa = nfa_from_ast(ast, &error); ast_free(ast); return nfa; diff --git a/tests/test_parser.c b/tests/test_parser.c index de67ba9..b63d83d 100644 --- a/tests/test_parser.c +++ b/tests/test_parser.c @@ -245,7 +245,7 @@ TEST(complex_pattern) { ast_node_t *ast = parser_parse(&parser); ASSERT(ast != NULL); - ASSERT(parser_get_error(&parser) == LOREG_OK); + ASSERT(parser_get_error(&parser) == LOREX_OK); ast_free(ast); } @@ -255,7 +255,7 @@ TEST(unbalanced_paren) { parser_init(&parser, "(abc"); ast_node_t *ast = parser_parse(&parser); - ASSERT(ast == NULL || parser_get_error(&parser) == LOREG_ERR_UNBALANCED_PAREN); + ASSERT(ast == NULL || parser_get_error(&parser) == LOREX_ERR_UNBALANCED_PAREN); ast_free(ast); }