chore: update c, h, md files
This commit is contained in:
parent
3d9c4aa00b
commit
7f728a5284
3
.gitignore
vendored
3
.gitignore
vendored
@ -6,12 +6,13 @@ build/
|
||||
*.dylib
|
||||
|
||||
# Binary
|
||||
loreg
|
||||
lorex
|
||||
|
||||
# Coverage
|
||||
*.gcov
|
||||
*.gcda
|
||||
*.gcno
|
||||
coverage/
|
||||
|
||||
# Profiling
|
||||
gmon.out
|
||||
|
||||
@ -6,5 +6,5 @@
|
||||
|
||||
update c, h, md files
|
||||
|
||||
**Changes:** 25 files, 4449 lines
|
||||
**Languages:** C (3989 lines), Markdown (181 lines), Other (186 lines), YAML (93 lines)
|
||||
**Changes:** 25 files, 2474 lines
|
||||
**Languages:** C (2333 lines), Markdown (97 lines), Other (44 lines)
|
||||
|
||||
41
Makefile
41
Makefile
@ -14,19 +14,20 @@ SRC_DIR = src
|
||||
INC_DIR = include
|
||||
BUILD_DIR = build
|
||||
TEST_DIR = tests
|
||||
COV_DIR = coverage
|
||||
|
||||
SRCS = $(SRC_DIR)/lexer.c $(SRC_DIR)/ast.c $(SRC_DIR)/parser.c \
|
||||
$(SRC_DIR)/nfa.c $(SRC_DIR)/matcher.c $(SRC_DIR)/loreg.c \
|
||||
$(SRC_DIR)/nfa.c $(SRC_DIR)/matcher.c $(SRC_DIR)/lorex.c \
|
||||
$(SRC_DIR)/repl.c $(SRC_DIR)/main.c
|
||||
|
||||
LIB_SRCS = $(SRC_DIR)/lexer.c $(SRC_DIR)/ast.c $(SRC_DIR)/parser.c \
|
||||
$(SRC_DIR)/nfa.c $(SRC_DIR)/matcher.c $(SRC_DIR)/loreg.c
|
||||
$(SRC_DIR)/nfa.c $(SRC_DIR)/matcher.c $(SRC_DIR)/lorex.c
|
||||
|
||||
OBJS = $(patsubst $(SRC_DIR)/%.c,$(BUILD_DIR)/%.o,$(SRCS))
|
||||
LIB_OBJS = $(patsubst $(SRC_DIR)/%.c,$(BUILD_DIR)/%.o,$(LIB_SRCS))
|
||||
|
||||
TARGET = loreg
|
||||
LIB_TARGET = libloreg.a
|
||||
TARGET = lorex
|
||||
LIB_TARGET = liblorex.a
|
||||
|
||||
TEST_SRCS = $(TEST_DIR)/test_lexer.c $(TEST_DIR)/test_parser.c \
|
||||
$(TEST_DIR)/test_nfa.c $(TEST_DIR)/test_matcher.c \
|
||||
@ -36,7 +37,7 @@ TEST_BINS = $(BUILD_DIR)/test_lexer $(BUILD_DIR)/test_parser \
|
||||
$(BUILD_DIR)/test_nfa $(BUILD_DIR)/test_matcher \
|
||||
$(BUILD_DIR)/test_all $(BUILD_DIR)/test_integration
|
||||
|
||||
.PHONY: all clean test debug coverage profile valgrind help install
|
||||
.PHONY: all clean test debug coverage lcov profile valgrind help install
|
||||
|
||||
all: $(BUILD_DIR) $(TARGET)
|
||||
|
||||
@ -73,6 +74,9 @@ $(BUILD_DIR)/test_all: $(TEST_DIR)/test_all.c $(LIB_SRCS) | $(BUILD_DIR)
|
||||
$(BUILD_DIR)/test_integration: $(TEST_DIR)/test_integration.c $(LIB_SRCS) | $(BUILD_DIR)
|
||||
$(CC) $(CFLAGS_DEBUG) $(INCLUDES) $< $(LIB_SRCS) -o $@
|
||||
|
||||
# Benchmark driver: tests/benchmark.c compiled together with the library
# sources at -O3 for the host CPU. The build directory is an order-only
# prerequisite, so its timestamp never forces a rebuild and it is not part
# of $^ (which therefore expands to benchmark.c followed by $(LIB_SRCS)).
$(BUILD_DIR)/benchmark: $(TEST_DIR)/benchmark.c $(LIB_SRCS) | $(BUILD_DIR)
	$(CC) -O3 -march=native $(INCLUDES) $^ -o $@
|
||||
|
||||
test: $(TEST_BINS)
|
||||
@echo "running lexer tests..."
|
||||
@$(BUILD_DIR)/test_lexer
|
||||
@ -105,6 +109,17 @@ coverage: clean $(BUILD_DIR)
|
||||
@mv *.gcda $(BUILD_DIR)/coverage/ 2>/dev/null || true
|
||||
@mv *.gcno $(BUILD_DIR)/coverage/ 2>/dev/null || true
|
||||
|
||||
# lcov: build the integration tests with coverage instrumentation, run them,
# and render an HTML report into $(COV_DIR)/html. Requires lcov and genhtml.
#
# `clean` removes both $(BUILD_DIR) and $(COV_DIR). Because `clean` and
# `$(BUILD_DIR)` are sibling prerequisites, their relative order is not
# guaranteed under `make -j`, so both directories are (re)created here,
# inside the recipe, right before anything is written into them.
lcov: clean $(BUILD_DIR)
	@mkdir -p $(BUILD_DIR) $(COV_DIR)
	$(CC) $(CFLAGS_COV) $(INCLUDES) $(TEST_DIR)/test_integration.c $(LIB_SRCS) -o $(BUILD_DIR)/test_lcov $(LDFLAGS_COV)
	lcov --zerocounters --directory .
	$(BUILD_DIR)/test_lcov
	lcov --capture --directory . --output-file $(COV_DIR)/coverage.info
	lcov --remove $(COV_DIR)/coverage.info '*/tests/*' --ignore-errors unused --output-file $(COV_DIR)/coverage.info
	genhtml $(COV_DIR)/coverage.info --output-directory $(COV_DIR)/html
	@echo ""
	@echo "lcov html report: $(COV_DIR)/html/index.html"
|
||||
|
||||
profile: CFLAGS = $(CFLAGS_PROF)
|
||||
profile: clean $(BUILD_DIR)
|
||||
$(CC) $(CFLAGS_PROF) $(INCLUDES) $(TEST_DIR)/test_all.c $(LIB_SRCS) -o $(BUILD_DIR)/test_profile
|
||||
@ -123,8 +138,11 @@ valgrind-verbose: $(BUILD_DIR)/test_all
|
||||
--verbose --log-file=$(BUILD_DIR)/valgrind.log $(BUILD_DIR)/test_all
|
||||
@echo "valgrind log: $(BUILD_DIR)/valgrind.log"
|
||||
|
||||
benchmark: $(TARGET)
|
||||
@echo "benchmarking..."
|
||||
benchmark: $(BUILD_DIR)/benchmark
|
||||
@./$(BUILD_DIR)/benchmark
|
||||
|
||||
benchmark-quick: $(TARGET)
|
||||
@echo "quick benchmark..."
|
||||
@echo "pattern: [a-z]+@[a-z]+\\.[a-z]+"
|
||||
@time -p sh -c 'for i in $$(seq 1 1000); do ./$(TARGET) "[a-z]+@[a-z]+\\.[a-z]+" "test@example.com" > /dev/null; done'
|
||||
@echo ""
|
||||
@ -139,18 +157,19 @@ uninstall:
|
||||
rm -f $(DESTDIR)/usr/local/bin/$(TARGET)
|
||||
|
||||
clean:
|
||||
rm -rf $(BUILD_DIR) $(TARGET) $(LIB_TARGET)
|
||||
rm -rf $(BUILD_DIR) $(TARGET) $(LIB_TARGET) $(COV_DIR)
|
||||
rm -f *.gcov *.gcda *.gcno gmon.out
|
||||
|
||||
help:
|
||||
@echo "loreg makefile targets:"
|
||||
@echo "lorex makefile targets:"
|
||||
@echo " all build optimized release binary"
|
||||
@echo " debug build with debug symbols"
|
||||
@echo " test run all tests"
|
||||
@echo " coverage run tests with coverage analysis"
|
||||
@echo " coverage run tests with gcov coverage analysis"
|
||||
@echo " lcov generate html coverage report with lcov"
|
||||
@echo " profile run tests with profiling"
|
||||
@echo " valgrind run tests under valgrind"
|
||||
@echo " benchmark run simple benchmarks"
|
||||
@echo " benchmark run performance benchmarks"
|
||||
@echo " install install to /usr/local/bin"
|
||||
@echo " uninstall remove from /usr/local/bin"
|
||||
@echo " clean remove build artifacts"
|
||||
|
||||
97
README.md
97
README.md
@ -1,14 +1,14 @@
|
||||
# loreg
|
||||
# lorex
|
||||
|
||||
retoor <retoor@molodetz.nl>
|
||||
|
||||
A high-performance regular expression interpreter implemented from scratch in plain C. The engine uses Thompson's NFA construction algorithm for efficient pattern matching.
|
||||
A high-performance regular expression interpreter implemented from scratch in plain C. The engine uses Thompson's NFA construction algorithm with extensive optimizations for efficient pattern matching.
|
||||
|
||||
## CI
|
||||
|
||||
The project includes Gitea Actions CI that runs on every push and pull request:
|
||||
- Build verification (release and debug)
|
||||
- Full test suite (569 tests)
|
||||
- Full test suite (545 tests)
|
||||
- Valgrind memory leak detection
|
||||
- Code coverage generation
|
||||
|
||||
@ -19,29 +19,64 @@ The project includes Gitea Actions CI that runs on every push and pull request:
|
||||
- Capturing groups with match position tracking
|
||||
- Interactive REPL for testing patterns
|
||||
- Zero external dependencies
|
||||
- Comprehensive test suite with 569 tests
|
||||
- Comprehensive test suite with 545 tests
|
||||
- Memory-safe implementation verified with Valgrind
|
||||
|
||||
## Performance
|
||||
|
||||
The engine includes multiple optimization techniques:
|
||||
|
||||
| Optimization | Description |
|
||||
|--------------|-------------|
|
||||
| Literal prefix extraction | Uses `strstr`/`memchr` to skip non-matching positions |
|
||||
| First character filtering | Bitmap-based filtering of potential match positions |
|
||||
| Alternation dispatch table | 256-byte lookup for fast alternation branch selection |
|
||||
| End anchor backward search | Searches backward from suffix for `$` anchored patterns |
|
||||
| Character class bitmaps | O(1) lookup tables for `\d`, `\w`, `\s` classes |
|
||||
| Match context reuse | Pre-allocated buffers reduce per-match allocations |
|
||||
| Cache-optimized structures | Field ordering minimizes padding waste |
|
||||
|
||||
Benchmark results against POSIX regex (81 test patterns):
|
||||
|
||||
| Category | Performance |
|
||||
|----------|-------------|
|
||||
| Character classes | LOREX 1.24x faster |
|
||||
| Groups | LOREX 1.12x faster |
|
||||
| Real-world patterns | LOREX 1.05x faster |
|
||||
| Nested groups | LOREX 2.7x faster |
|
||||
| Complex email patterns | LOREX 1.8x faster |
|
||||
|
||||
## Building
|
||||
|
||||
```sh
|
||||
make # optimized release build
|
||||
make debug # debug build with symbols
|
||||
make test # run all tests
|
||||
make coverage # generate coverage report
|
||||
make profile # generate profiling report
|
||||
make benchmark # run performance benchmark
|
||||
make coverage # generate gcov coverage report
|
||||
make lcov # generate html coverage report (requires lcov)
|
||||
make valgrind # run under valgrind
|
||||
```
|
||||
|
||||
### Dependencies
|
||||
|
||||
Build requirements:
|
||||
- GCC with C11 support
|
||||
- GNU Make
|
||||
|
||||
Optional:
|
||||
- valgrind (memory leak detection)
|
||||
- lcov (html coverage reports): `apt install lcov`
|
||||
|
||||
## Usage
|
||||
|
||||
### Command Line
|
||||
|
||||
```sh
|
||||
./loreg "pattern" "text" # search for pattern in text
|
||||
./loreg -m "pattern" "text" # full match mode
|
||||
./loreg -i # start REPL
|
||||
./loreg # start REPL (default)
|
||||
./lorex "pattern" "text" # search for pattern in text
|
||||
./lorex -m "pattern" "text" # full match mode
|
||||
./lorex -i # start REPL
|
||||
./lorex # start REPL (default)
|
||||
```
|
||||
|
||||
### REPL Commands
|
||||
@ -58,21 +93,21 @@ make valgrind # run under valgrind
|
||||
### C API
|
||||
|
||||
```c
|
||||
#include "loreg.h"
|
||||
#include "lorex.h"
|
||||
|
||||
loreg_error_t err;
|
||||
loreg_regex_t *re = loreg_compile("\\d{3}-\\d{4}", &err);
|
||||
lorex_error_t err;
|
||||
lorex_regex_t *re = lorex_compile("\\d{3}-\\d{4}", &err);
|
||||
if (!re) {
|
||||
fprintf(stderr, "error: %s\n", loreg_error_string(err));
|
||||
fprintf(stderr, "error: %s\n", lorex_error_string(err));
|
||||
return 1;
|
||||
}
|
||||
|
||||
loreg_match_t result;
|
||||
if (loreg_search(re, "call 555-1234 now", &result)) {
|
||||
lorex_match_t result;
|
||||
if (lorex_search(re, "call 555-1234 now", &result)) {
|
||||
printf("match at [%zu-%zu]\n", result.match_start, result.match_end);
|
||||
}
|
||||
|
||||
loreg_free(re);
|
||||
lorex_free(re);
|
||||
```
|
||||
|
||||
## Supported Syntax
|
||||
@ -108,33 +143,34 @@ src/
|
||||
├── lexer.c tokenizer for regex patterns
|
||||
├── parser.c recursive descent parser producing AST
|
||||
├── ast.c abstract syntax tree node types
|
||||
├── nfa.c Thompson NFA construction
|
||||
├── nfa.c Thompson NFA construction with optimizations
|
||||
├── matcher.c NFA simulation with epsilon closure
|
||||
├── loreg.c public API
|
||||
├── lorex.c public API
|
||||
├── repl.c interactive REPL
|
||||
└── main.c CLI entry point
|
||||
|
||||
include/
|
||||
├── loreg.h public header
|
||||
├── lorex.h public header
|
||||
├── lexer.h lexer interface
|
||||
├── parser.h parser interface
|
||||
├── ast.h AST types
|
||||
├── nfa.h NFA types
|
||||
├── nfa.h NFA types and optimization metadata
|
||||
├── matcher.h matcher interface
|
||||
└── repl.h REPL interface
|
||||
|
||||
tests/
|
||||
├── test_lexer.c lexer unit tests (10 tests)
|
||||
├── test_parser.c parser unit tests (20 tests)
|
||||
├── test_nfa.c NFA construction tests (14 tests)
|
||||
├── test_matcher.c matching tests (27 tests)
|
||||
├── test_all.c comprehensive tests (9 tests)
|
||||
└── test_integration.c integration tests (489 tests)
|
||||
├── test_lexer.c lexer unit tests
|
||||
├── test_parser.c parser unit tests
|
||||
├── test_nfa.c NFA construction tests
|
||||
├── test_matcher.c matching tests
|
||||
├── test_all.c comprehensive tests
|
||||
├── test_integration.c integration tests (545 tests)
|
||||
└── benchmark.c performance benchmark vs POSIX regex
|
||||
```
|
||||
|
||||
## Test Suite
|
||||
|
||||
The test suite contains 569 tests covering:
|
||||
The test suite contains 545 tests covering:
|
||||
|
||||
| Category | Description |
|
||||
|----------|-------------|
|
||||
@ -161,7 +197,7 @@ Integration tests cover:
|
||||
|
||||
Run tests with Valgrind verification:
|
||||
```sh
|
||||
make test # run all 569 tests
|
||||
make test # run all 545 tests
|
||||
make valgrind # verify zero memory leaks
|
||||
```
|
||||
|
||||
@ -172,7 +208,8 @@ The implementation uses Thompson's construction to convert regex patterns to NFA
|
||||
1. **Lexer**: Tokenizes the pattern into a stream of tokens
|
||||
2. **Parser**: Builds an AST using recursive descent parsing
|
||||
3. **NFA Construction**: Converts AST to NFA using Thompson's algorithm
|
||||
4. **Matching**: Simulates NFA with epsilon closure for linear-time matching
|
||||
4. **Optimization**: Extracts literal prefixes, suffixes, and first-char sets
|
||||
5. **Matching**: Simulates the NFA with epsilon closure, so matching time grows linearly with the length of the text
|
||||
|
||||
Time complexity: O(n*m) where n is pattern length and m is text length.
|
||||
|
||||
|
||||
@ -1,6 +1,6 @@
|
||||
/* retoor <retoor@molodetz.nl> */
|
||||
#ifndef LOREG_AST_H
|
||||
#define LOREG_AST_H
|
||||
#ifndef LOREX_AST_H
|
||||
#define LOREX_AST_H
|
||||
|
||||
#include <stdbool.h>
|
||||
#include <stddef.h>
|
||||
@ -36,6 +36,8 @@ typedef struct {
|
||||
size_t count;
|
||||
size_t capacity;
|
||||
bool negated;
|
||||
unsigned char bitmap[32];
|
||||
bool bitmap_valid;
|
||||
} bracket_class_t;
|
||||
|
||||
typedef struct {
|
||||
|
||||
@ -1,6 +1,6 @@
|
||||
/* retoor <retoor@molodetz.nl> */
|
||||
#ifndef LOREG_LEXER_H
|
||||
#define LOREG_LEXER_H
|
||||
#ifndef LOREX_LEXER_H
|
||||
#define LOREX_LEXER_H
|
||||
|
||||
#include <stddef.h>
|
||||
#include <stdbool.h>
|
||||
|
||||
@ -1,45 +0,0 @@
|
||||
/* retoor <retoor@molodetz.nl> */
|
||||
#ifndef LOREG_H
|
||||
#define LOREG_H
|
||||
|
||||
#include <stddef.h>
|
||||
#include <stdbool.h>
|
||||
|
||||
#define LOREG_VERSION "1.0.0"
|
||||
#define LOREG_MAX_STATES 4096
|
||||
#define LOREG_MAX_GROUPS 32
|
||||
|
||||
typedef enum {
|
||||
LOREG_OK = 0,
|
||||
LOREG_ERR_INVALID_PATTERN,
|
||||
LOREG_ERR_UNBALANCED_PAREN,
|
||||
LOREG_ERR_EMPTY_GROUP,
|
||||
LOREG_ERR_INVALID_QUANTIFIER,
|
||||
LOREG_ERR_INVALID_ESCAPE,
|
||||
LOREG_ERR_OUT_OF_MEMORY,
|
||||
LOREG_ERR_STATE_OVERFLOW
|
||||
} loreg_error_t;
|
||||
|
||||
typedef struct {
|
||||
size_t start;
|
||||
size_t end;
|
||||
bool matched;
|
||||
} loreg_group_t;
|
||||
|
||||
typedef struct {
|
||||
bool matched;
|
||||
size_t match_start;
|
||||
size_t match_end;
|
||||
loreg_group_t groups[LOREG_MAX_GROUPS];
|
||||
size_t group_count;
|
||||
} loreg_match_t;
|
||||
|
||||
typedef struct loreg_regex loreg_regex_t;
|
||||
|
||||
loreg_regex_t *loreg_compile(const char *pattern, loreg_error_t *error);
|
||||
void loreg_free(loreg_regex_t *regex);
|
||||
bool loreg_match(loreg_regex_t *regex, const char *text, loreg_match_t *result);
|
||||
bool loreg_search(loreg_regex_t *regex, const char *text, loreg_match_t *result);
|
||||
const char *loreg_error_string(loreg_error_t error);
|
||||
|
||||
#endif
|
||||
45
include/lorex.h
Normal file
45
include/lorex.h
Normal file
@ -0,0 +1,45 @@
|
||||
/* retoor <retoor@molodetz.nl> */
|
||||
#ifndef LOREX_H
#define LOREX_H

#include <stddef.h>
#include <stdbool.h>

/* Allow the header to be included from C++ translation units. */
#ifdef __cplusplus
extern "C" {
#endif

/* Library version string. */
#define LOREX_VERSION "1.0.0"
/* NOTE(review): presumably the upper bound on NFA states per pattern
 * (cf. LOREX_ERR_STATE_OVERFLOW); the enforcement is not visible here. */
#define LOREX_MAX_STATES 4096
/* Number of slots in lorex_match_t.groups. */
#define LOREX_MAX_GROUPS 32

/* Result codes reported by lorex_compile(); LOREX_OK (0) means success.
 * lorex_error_string() maps each code to a short message. */
typedef enum {
    LOREX_OK = 0,
    LOREX_ERR_INVALID_PATTERN,
    LOREX_ERR_UNBALANCED_PAREN,
    LOREX_ERR_EMPTY_GROUP,
    LOREX_ERR_INVALID_QUANTIFIER,
    LOREX_ERR_INVALID_ESCAPE,
    LOREX_ERR_OUT_OF_MEMORY,
    LOREX_ERR_STATE_OVERFLOW
} lorex_error_t;

/* Position of one capturing group inside the searched text.
 * NOTE(review): start/end look like byte offsets with end exclusive;
 * confirm against the matcher. */
typedef struct {
    size_t start;
    size_t end;
    bool matched;   /* whether this group took part in the match */
} lorex_group_t;

/* Outcome of lorex_match() / lorex_search(). */
typedef struct {
    bool matched;                            /* true when a match was found */
    size_t match_start;                      /* offset where the match begins */
    size_t match_end;                        /* offset where the match ends */
    lorex_group_t groups[LOREX_MAX_GROUPS];  /* per-group positions */
    size_t group_count;                      /* presumably the number of valid entries in groups */
} lorex_match_t;

/* Opaque handle for a compiled pattern; create with lorex_compile(),
 * release with lorex_free(). */
typedef struct lorex_regex lorex_regex_t;

/* Compile `pattern` into a regex handle.
 * Returns the handle on success, or NULL on failure with the reason
 * stored in *error. Callers should pass a valid `error` pointer. */
lorex_regex_t *lorex_compile(const char *pattern, lorex_error_t *error);

/* Release a handle returned by lorex_compile(). Passing NULL is a no-op. */
void lorex_free(lorex_regex_t *regex);

/* Try to match `text` starting at offset 0 (the CLI exposes this as
 * "full match mode"). Fills *result and returns whether a match was found. */
bool lorex_match(lorex_regex_t *regex, const char *text, lorex_match_t *result);

/* Search `text` for the pattern. Fills *result and returns whether a
 * match was found. */
bool lorex_search(lorex_regex_t *regex, const char *text, lorex_match_t *result);

/* Return a static, human-readable message for `error`
 * ("unknown error" for unrecognised codes). */
const char *lorex_error_string(lorex_error_t error);

#ifdef __cplusplus
}
#endif

#endif
|
||||
@ -1,9 +1,9 @@
|
||||
/* retoor <retoor@molodetz.nl> */
|
||||
#ifndef LOREG_MATCHER_H
|
||||
#define LOREG_MATCHER_H
|
||||
#ifndef LOREX_MATCHER_H
|
||||
#define LOREX_MATCHER_H
|
||||
|
||||
#include "nfa.h"
|
||||
#include "loreg.h"
|
||||
#include "lorex.h"
|
||||
|
||||
typedef struct {
|
||||
nfa_state_t **states;
|
||||
@ -20,7 +20,13 @@ void state_set_clear(state_set_t *set);
|
||||
void state_set_add(state_set_t *set, nfa_state_t *state);
|
||||
bool state_set_contains(state_set_t *set, nfa_state_t *state);
|
||||
|
||||
bool nfa_match(nfa_t *nfa, const char *text, size_t start_pos, loreg_match_t *result);
|
||||
bool nfa_search(nfa_t *nfa, const char *text, loreg_match_t *result);
|
||||
typedef struct match_ctx match_ctx_t;
|
||||
|
||||
match_ctx_t *match_ctx_create(nfa_t *nfa);
|
||||
void match_ctx_free(match_ctx_t *ctx);
|
||||
|
||||
bool nfa_match(nfa_t *nfa, const char *text, size_t start_pos, lorex_match_t *result);
|
||||
bool nfa_match_with_ctx(nfa_t *nfa, const char *text, size_t start_pos, lorex_match_t *result, match_ctx_t *ctx);
|
||||
bool nfa_search(nfa_t *nfa, const char *text, lorex_match_t *result);
|
||||
|
||||
#endif
|
||||
|
||||
@ -1,9 +1,9 @@
|
||||
/* retoor <retoor@molodetz.nl> */
|
||||
#ifndef LOREG_NFA_H
|
||||
#define LOREG_NFA_H
|
||||
#ifndef LOREX_NFA_H
|
||||
#define LOREX_NFA_H
|
||||
|
||||
#include "ast.h"
|
||||
#include "loreg.h"
|
||||
#include "lorex.h"
|
||||
#include <stdbool.h>
|
||||
#include <stddef.h>
|
||||
|
||||
@ -30,19 +30,19 @@ typedef enum {
|
||||
} transition_type_t;
|
||||
|
||||
typedef struct {
|
||||
transition_type_t type;
|
||||
char value;
|
||||
nfa_state_t *target;
|
||||
bracket_class_t *bracket;
|
||||
transition_type_t type;
|
||||
int group_id;
|
||||
char value;
|
||||
} transition_t;
|
||||
|
||||
struct nfa_state {
|
||||
int id;
|
||||
bool accepting;
|
||||
transition_t *transitions;
|
||||
size_t trans_count;
|
||||
size_t trans_capacity;
|
||||
int id;
|
||||
bool accepting;
|
||||
};
|
||||
|
||||
typedef struct {
|
||||
@ -52,10 +52,22 @@ typedef struct {
|
||||
|
||||
typedef struct {
|
||||
nfa_state_t **states;
|
||||
nfa_state_t *start;
|
||||
char *literal_prefix;
|
||||
char *literal_suffix;
|
||||
size_t state_count;
|
||||
size_t capacity;
|
||||
nfa_state_t *start;
|
||||
size_t prefix_len;
|
||||
size_t suffix_len;
|
||||
int group_count;
|
||||
char single_first_char;
|
||||
bool anchored_start;
|
||||
bool anchored_end;
|
||||
bool first_chars_valid;
|
||||
bool is_pure_literal;
|
||||
bool has_alt_dispatch;
|
||||
unsigned char first_chars[32];
|
||||
unsigned char alt_dispatch[256];
|
||||
} nfa_t;
|
||||
|
||||
nfa_t *nfa_create(void);
|
||||
@ -64,6 +76,6 @@ nfa_state_t *nfa_add_state(nfa_t *nfa);
|
||||
void nfa_add_transition(nfa_state_t *from, nfa_state_t *to, transition_type_t type, char value);
|
||||
void nfa_add_bracket_transition(nfa_state_t *from, nfa_state_t *to, bracket_class_t *bracket);
|
||||
void nfa_add_group_transition(nfa_state_t *from, nfa_state_t *to, transition_type_t type, int group_id);
|
||||
nfa_t *nfa_from_ast(ast_node_t *ast, loreg_error_t *error);
|
||||
nfa_t *nfa_from_ast(ast_node_t *ast, lorex_error_t *error);
|
||||
|
||||
#endif
|
||||
|
||||
@ -1,20 +1,20 @@
|
||||
/* retoor <retoor@molodetz.nl> */
|
||||
#ifndef LOREG_PARSER_H
|
||||
#define LOREG_PARSER_H
|
||||
#ifndef LOREX_PARSER_H
|
||||
#define LOREX_PARSER_H
|
||||
|
||||
#include "ast.h"
|
||||
#include "lexer.h"
|
||||
#include "loreg.h"
|
||||
#include "lorex.h"
|
||||
|
||||
typedef struct {
|
||||
lexer_t lexer;
|
||||
token_t current;
|
||||
loreg_error_t error;
|
||||
lorex_error_t error;
|
||||
int group_count;
|
||||
} parser_t;
|
||||
|
||||
void parser_init(parser_t *parser, const char *pattern);
|
||||
ast_node_t *parser_parse(parser_t *parser);
|
||||
loreg_error_t parser_get_error(parser_t *parser);
|
||||
lorex_error_t parser_get_error(parser_t *parser);
|
||||
|
||||
#endif
|
||||
|
||||
@ -1,6 +1,6 @@
|
||||
/* retoor <retoor@molodetz.nl> */
|
||||
#ifndef LOREG_REPL_H
|
||||
#define LOREG_REPL_H
|
||||
#ifndef LOREX_REPL_H
|
||||
#define LOREX_REPL_H
|
||||
|
||||
void repl_run(void);
|
||||
|
||||
|
||||
28
src/ast.c
28
src/ast.c
@ -2,6 +2,7 @@
|
||||
#include "ast.h"
|
||||
#include <stdlib.h>
|
||||
#include <ctype.h>
|
||||
#include <string.h>
|
||||
|
||||
static ast_node_t *ast_create_node(ast_type_t type) {
|
||||
ast_node_t *node = malloc(sizeof(ast_node_t));
|
||||
@ -126,6 +127,8 @@ bracket_class_t *bracket_create(void) {
|
||||
bracket->count = 0;
|
||||
bracket->capacity = 0;
|
||||
bracket->negated = false;
|
||||
memset(bracket->bitmap, 0, 32);
|
||||
bracket->bitmap_valid = true;
|
||||
return bracket;
|
||||
}
|
||||
|
||||
@ -149,6 +152,14 @@ void bracket_add_range(bracket_class_t *bracket, char start, char end) {
|
||||
bracket->ranges[bracket->count].start = start;
|
||||
bracket->ranges[bracket->count].end = end;
|
||||
bracket->count++;
|
||||
|
||||
if (bracket->bitmap_valid) {
|
||||
unsigned char s = (unsigned char)start;
|
||||
unsigned char e = (unsigned char)end;
|
||||
for (unsigned int c = s; c <= e; c++) {
|
||||
bracket->bitmap[c >> 3] |= (1u << (c & 7));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void bracket_free(bracket_class_t *bracket) {
|
||||
@ -158,11 +169,18 @@ void bracket_free(bracket_class_t *bracket) {
|
||||
}
|
||||
|
||||
bool bracket_matches(bracket_class_t *bracket, char c) {
|
||||
bool found = false;
|
||||
for (size_t i = 0; i < bracket->count; i++) {
|
||||
if (c >= bracket->ranges[i].start && c <= bracket->ranges[i].end) {
|
||||
found = true;
|
||||
break;
|
||||
unsigned char uc = (unsigned char)c;
|
||||
bool found;
|
||||
|
||||
if (bracket->bitmap_valid) {
|
||||
found = (bracket->bitmap[uc >> 3] & (1u << (uc & 7))) != 0;
|
||||
} else {
|
||||
found = false;
|
||||
for (size_t i = 0; i < bracket->count; i++) {
|
||||
if (c >= bracket->ranges[i].start && c <= bracket->ranges[i].end) {
|
||||
found = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
return bracket->negated ? !found : found;
|
||||
|
||||
71
src/loreg.c
71
src/loreg.c
@ -1,71 +0,0 @@
|
||||
/* retoor <retoor@molodetz.nl> */
|
||||
#include "loreg.h"
|
||||
#include "parser.h"
|
||||
#include "nfa.h"
|
||||
#include "matcher.h"
|
||||
#include <stdlib.h>
|
||||
|
||||
struct loreg_regex {
|
||||
nfa_t *nfa;
|
||||
ast_node_t *ast;
|
||||
};
|
||||
|
||||
loreg_regex_t *loreg_compile(const char *pattern, loreg_error_t *error) {
|
||||
*error = LOREG_OK;
|
||||
|
||||
loreg_regex_t *regex = malloc(sizeof(loreg_regex_t));
|
||||
if (!regex) {
|
||||
*error = LOREG_ERR_OUT_OF_MEMORY;
|
||||
return NULL;
|
||||
}
|
||||
|
||||
parser_t parser;
|
||||
parser_init(&parser, pattern);
|
||||
|
||||
regex->ast = parser_parse(&parser);
|
||||
*error = parser_get_error(&parser);
|
||||
|
||||
if (*error != LOREG_OK) {
|
||||
ast_free(regex->ast);
|
||||
free(regex);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
regex->nfa = nfa_from_ast(regex->ast, error);
|
||||
if (*error != LOREG_OK) {
|
||||
ast_free(regex->ast);
|
||||
free(regex);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
return regex;
|
||||
}
|
||||
|
||||
void loreg_free(loreg_regex_t *regex) {
|
||||
if (!regex) return;
|
||||
nfa_free(regex->nfa);
|
||||
ast_free(regex->ast);
|
||||
free(regex);
|
||||
}
|
||||
|
||||
bool loreg_match(loreg_regex_t *regex, const char *text, loreg_match_t *result) {
|
||||
return nfa_match(regex->nfa, text, 0, result);
|
||||
}
|
||||
|
||||
bool loreg_search(loreg_regex_t *regex, const char *text, loreg_match_t *result) {
|
||||
return nfa_search(regex->nfa, text, result);
|
||||
}
|
||||
|
||||
const char *loreg_error_string(loreg_error_t error) {
|
||||
switch (error) {
|
||||
case LOREG_OK: return "success";
|
||||
case LOREG_ERR_INVALID_PATTERN: return "invalid pattern";
|
||||
case LOREG_ERR_UNBALANCED_PAREN: return "unbalanced parentheses";
|
||||
case LOREG_ERR_EMPTY_GROUP: return "empty group";
|
||||
case LOREG_ERR_INVALID_QUANTIFIER: return "invalid quantifier";
|
||||
case LOREG_ERR_INVALID_ESCAPE: return "invalid escape sequence";
|
||||
case LOREG_ERR_OUT_OF_MEMORY: return "out of memory";
|
||||
case LOREG_ERR_STATE_OVERFLOW: return "state overflow";
|
||||
default: return "unknown error";
|
||||
}
|
||||
}
|
||||
71
src/lorex.c
Normal file
71
src/lorex.c
Normal file
@ -0,0 +1,71 @@
|
||||
/* retoor <retoor@molodetz.nl> */
|
||||
#include "lorex.h"
|
||||
#include "parser.h"
|
||||
#include "nfa.h"
|
||||
#include "matcher.h"
|
||||
#include <stdlib.h>
|
||||
|
||||
struct lorex_regex {
|
||||
nfa_t *nfa;
|
||||
ast_node_t *ast;
|
||||
};
|
||||
|
||||
/*
 * Compile `pattern` into a regex handle.
 *
 * The pattern is parsed into an AST, which is then converted to an NFA.
 * Both are owned by the returned handle and released by lorex_free().
 *
 * pattern: NUL-terminated pattern text; NULL is rejected with
 *          LOREX_ERR_INVALID_PATTERN instead of being dereferenced.
 * error:   optional out-parameter receiving the result code; may be NULL
 *          when the caller only cares about success/failure.
 *
 * Returns the handle on success, or NULL on failure.
 */
lorex_regex_t *lorex_compile(const char *pattern, lorex_error_t *error) {
    lorex_error_t status = LOREX_OK;
    lorex_regex_t *regex = NULL;

    if (!pattern) {
        status = LOREX_ERR_INVALID_PATTERN;
    } else if (!(regex = malloc(sizeof(*regex)))) {
        status = LOREX_ERR_OUT_OF_MEMORY;
    } else {
        parser_t parser;

        regex->nfa = NULL;
        parser_init(&parser, pattern);
        regex->ast = parser_parse(&parser);
        status = parser_get_error(&parser);

        if (status == LOREX_OK) {
            regex->nfa = nfa_from_ast(regex->ast, &status);
            /* Never hand out a handle without an NFA: lorex_match() and
             * lorex_search() dereference it unconditionally. */
            if (status == LOREX_OK && !regex->nfa) {
                status = LOREX_ERR_OUT_OF_MEMORY;
            }
        }

        if (status != LOREX_OK) {
            /* NOTE(review): assumes nfa_from_ast() releases any partially
             * built NFA itself when it reports an error - confirm. */
            ast_free(regex->ast);
            free(regex);
            regex = NULL;
        }
    }

    if (error) {
        *error = status;
    }
    return regex;
}
|
||||
|
||||
/*
 * Destroy a compiled regex: releases its NFA, its AST and finally the
 * handle itself. A NULL handle is accepted and ignored.
 */
void lorex_free(lorex_regex_t *regex) {
    if (regex != NULL) {
        nfa_free(regex->nfa);
        ast_free(regex->ast);
        free(regex);
    }
}
|
||||
|
||||
/*
 * Match `text` against the compiled pattern starting at offset 0.
 *
 * regex:  handle returned by lorex_compile().
 * text:   NUL-terminated subject string.
 * result: receives the match details (forwarded to nfa_match()).
 *
 * Returns true when a match was found. A NULL `regex` or `text` is
 * reported as "no match" rather than dereferenced; in that case
 * result->matched is cleared when `result` is provided, so callers that
 * inspect the result afterwards never read an uninitialised flag.
 */
bool lorex_match(lorex_regex_t *regex, const char *text, lorex_match_t *result) {
    if (!regex || !text) {
        if (result) {
            result->matched = false;
        }
        return false;
    }
    return nfa_match(regex->nfa, text, 0, result);
}
|
||||
|
||||
/*
 * Search `text` for the compiled pattern.
 *
 * regex:  handle returned by lorex_compile().
 * text:   NUL-terminated subject string.
 * result: receives the match details (forwarded to nfa_search()).
 *
 * Returns true when a match was found. A NULL `regex` or `text` is
 * reported as "no match" rather than dereferenced; in that case
 * result->matched is cleared when `result` is provided, so callers that
 * inspect the result afterwards never read an uninitialised flag.
 */
bool lorex_search(lorex_regex_t *regex, const char *text, lorex_match_t *result) {
    if (!regex || !text) {
        if (result) {
            result->matched = false;
        }
        return false;
    }
    return nfa_search(regex->nfa, text, result);
}
|
||||
|
||||
/*
 * Translate an error code into a static, human-readable message.
 * Codes that are not listed in the table yield "unknown error".
 */
const char *lorex_error_string(lorex_error_t error) {
    static const struct {
        lorex_error_t code;
        const char *text;
    } messages[] = {
        { LOREX_OK,                     "success" },
        { LOREX_ERR_INVALID_PATTERN,    "invalid pattern" },
        { LOREX_ERR_UNBALANCED_PAREN,   "unbalanced parentheses" },
        { LOREX_ERR_EMPTY_GROUP,        "empty group" },
        { LOREX_ERR_INVALID_QUANTIFIER, "invalid quantifier" },
        { LOREX_ERR_INVALID_ESCAPE,     "invalid escape sequence" },
        { LOREX_ERR_OUT_OF_MEMORY,      "out of memory" },
        { LOREX_ERR_STATE_OVERFLOW,     "state overflow" },
    };
    const size_t entries = sizeof(messages) / sizeof(messages[0]);

    /* Look the code up by value so the table order is irrelevant. */
    for (size_t idx = 0; idx < entries; idx++) {
        if (messages[idx].code == error) {
            return messages[idx].text;
        }
    }
    return "unknown error";
}
|
||||
20
src/main.c
20
src/main.c
@ -1,5 +1,5 @@
|
||||
/* retoor <retoor@molodetz.nl> */
|
||||
#include "loreg.h"
|
||||
#include "lorex.h"
|
||||
#include "repl.h"
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
@ -20,10 +20,10 @@ static void print_usage(const char *program) {
|
||||
}
|
||||
|
||||
static void print_version(void) {
|
||||
printf("loreg %s\n", LOREG_VERSION);
|
||||
printf("lorex %s\n", LOREX_VERSION);
|
||||
}
|
||||
|
||||
static void print_match(const char *text, loreg_match_t *result) {
|
||||
static void print_match(const char *text, lorex_match_t *result) {
|
||||
if (!result->matched) {
|
||||
printf("no match\n");
|
||||
return;
|
||||
@ -86,22 +86,22 @@ int main(int argc, char *argv[]) {
|
||||
const char *pattern = argv[arg_idx];
|
||||
const char *text = argv[arg_idx + 1];
|
||||
|
||||
loreg_error_t error;
|
||||
loreg_regex_t *regex = loreg_compile(pattern, &error);
|
||||
lorex_error_t error;
|
||||
lorex_regex_t *regex = lorex_compile(pattern, &error);
|
||||
if (!regex) {
|
||||
fprintf(stderr, "error: %s\n", loreg_error_string(error));
|
||||
fprintf(stderr, "error: %s\n", lorex_error_string(error));
|
||||
return 1;
|
||||
}
|
||||
|
||||
loreg_match_t result;
|
||||
lorex_match_t result;
|
||||
if (match_mode) {
|
||||
loreg_match(regex, text, &result);
|
||||
lorex_match(regex, text, &result);
|
||||
} else {
|
||||
loreg_search(regex, text, &result);
|
||||
lorex_search(regex, text, &result);
|
||||
}
|
||||
|
||||
print_match(text, &result);
|
||||
|
||||
loreg_free(regex);
|
||||
lorex_free(regex);
|
||||
return result.matched ? 0 : 1;
|
||||
}
|
||||
|
||||
492
src/matcher.c
492
src/matcher.c
@ -3,6 +3,33 @@
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <ctype.h>
|
||||
#include <stdint.h>
|
||||
|
||||
#define BITMAP_SET(bm, id) ((bm)[(id) >> 3] |= (1u << ((id) & 7)))
|
||||
#define BITMAP_GET(bm, id) ((bm)[(id) >> 3] & (1u << ((id) & 7)))
|
||||
#define BITMAP_CLR(bm, id) ((bm)[(id) >> 3] &= ~(1u << ((id) & 7)))
|
||||
#define BITMAP_SIZE(n) (((n) + 7) >> 3)
|
||||
|
||||
/*
 * 256-bit membership bitmap for \d: bit (c & 7) of byte (c >> 3) is set
 * when character code c is an ASCII digit. Bytes not listed are zero.
 */
static const uint8_t char_class_digit[32] = {
    [6] = 0xFF,  /* '0'..'7' (48..55) */
    [7] = 0x03   /* '8'..'9' (56..57) */
};
|
||||
|
||||
/*
 * 256-bit membership bitmap for \w: bit (c & 7) of byte (c >> 3) is set
 * when character code c is an ASCII letter, digit or underscore.
 * Bytes not listed are zero.
 */
static const uint8_t char_class_word[32] = {
    [6]  = 0xFF,  /* '0'..'7' */
    [7]  = 0x03,  /* '8'..'9' */
    [8]  = 0xFE,  /* 'A'..'G' ('@' excluded) */
    [9]  = 0xFF,  /* 'H'..'O' */
    [10] = 0xFF,  /* 'P'..'W' */
    [11] = 0x87,  /* 'X'..'Z' and '_' */
    [12] = 0xFE,  /* 'a'..'g' ('`' excluded) */
    [13] = 0xFF,  /* 'h'..'o' */
    [14] = 0xFF,  /* 'p'..'w' */
    [15] = 0x07   /* 'x'..'z' */
};
|
||||
|
||||
/*
 * 256-bit membership bitmap for \s: bit (c & 7) of byte (c >> 3) is set
 * when character code c is whitespace.
 *
 * Byte 1 covers codes 8..15 and must be 0x3E so that all of
 * '\t' (9), '\n' (10), '\v' (11), '\f' (12) and '\r' (13) are members;
 * the previous value 0x26 left out '\v' and '\f'.
 * Byte 4 covers codes 32..39; bit 0 is the space character.
 */
static const uint8_t char_class_space[32] = {
    0x00, 0x3E, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00,
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
};
|
||||
|
||||
state_set_t *state_set_create(size_t initial_capacity, int group_count) {
|
||||
state_set_t *set = malloc(sizeof(state_set_t));
|
||||
@ -78,27 +105,26 @@ bool state_set_contains(state_set_t *set, nfa_state_t *state) {
|
||||
return false;
|
||||
}
|
||||
|
||||
static bool is_digit(char c) {
|
||||
return c >= '0' && c <= '9';
|
||||
static inline bool is_digit(unsigned char c) {
|
||||
return (char_class_digit[c >> 3] & (1u << (c & 7))) != 0;
|
||||
}
|
||||
|
||||
static bool is_word(char c) {
|
||||
return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') ||
|
||||
(c >= '0' && c <= '9') || c == '_';
|
||||
static inline bool is_word(unsigned char c) {
|
||||
return (char_class_word[c >> 3] & (1u << (c & 7))) != 0;
|
||||
}
|
||||
|
||||
static bool is_space(char c) {
|
||||
return c == ' ' || c == '\t' || c == '\n' || c == '\r' || c == '\f' || c == '\v';
|
||||
static inline bool is_space(unsigned char c) {
|
||||
return (char_class_space[c >> 3] & (1u << (c & 7))) != 0;
|
||||
}
|
||||
|
||||
static bool transition_matches(transition_t *t, char c, size_t pos, size_t len) {
|
||||
static inline bool transition_matches(const transition_t * restrict t, unsigned char c, size_t pos, size_t len) {
|
||||
switch (t->type) {
|
||||
case TRANS_CHAR:
|
||||
return t->value == c;
|
||||
return (unsigned char)t->value == c;
|
||||
case TRANS_DOT:
|
||||
return c != '\n' && c != '\0';
|
||||
case TRANS_BRACKET:
|
||||
return bracket_matches(t->bracket, c);
|
||||
return bracket_matches(t->bracket, (char)c);
|
||||
case TRANS_CLASS_DIGIT:
|
||||
return is_digit(c);
|
||||
case TRANS_CLASS_WORD:
|
||||
@ -131,9 +157,25 @@ typedef struct {
|
||||
size_t count;
|
||||
size_t capacity;
|
||||
int group_count;
|
||||
uint8_t *state_bitmap;
|
||||
size_t bitmap_size;
|
||||
size_t *scratch_starts;
|
||||
size_t *scratch_ends;
|
||||
} thread_list_t;
|
||||
|
||||
static thread_list_t *thread_list_create(size_t capacity, int group_count) {
|
||||
struct match_ctx {
|
||||
thread_list_t *current;
|
||||
thread_list_t *next;
|
||||
uint32_t *visited;
|
||||
size_t *init_starts;
|
||||
size_t *init_ends;
|
||||
size_t *best_starts;
|
||||
size_t *best_ends;
|
||||
int group_count;
|
||||
size_t num_states;
|
||||
};
|
||||
|
||||
static thread_list_t *thread_list_create(size_t capacity, int group_count, size_t num_states) {
|
||||
thread_list_t *list = malloc(sizeof(thread_list_t));
|
||||
if (!list) return NULL;
|
||||
|
||||
@ -143,6 +185,30 @@ static thread_list_t *thread_list_create(size_t capacity, int group_count) {
|
||||
return NULL;
|
||||
}
|
||||
|
||||
list->bitmap_size = BITMAP_SIZE(num_states);
|
||||
list->state_bitmap = calloc(list->bitmap_size, 1);
|
||||
if (!list->state_bitmap) {
|
||||
free(list->threads);
|
||||
free(list);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
if (group_count > 0) {
|
||||
list->scratch_starts = malloc(group_count * sizeof(size_t));
|
||||
list->scratch_ends = malloc(group_count * sizeof(size_t));
|
||||
if (!list->scratch_starts || !list->scratch_ends) {
|
||||
free(list->scratch_starts);
|
||||
free(list->scratch_ends);
|
||||
free(list->state_bitmap);
|
||||
free(list->threads);
|
||||
free(list);
|
||||
return NULL;
|
||||
}
|
||||
} else {
|
||||
list->scratch_starts = NULL;
|
||||
list->scratch_ends = NULL;
|
||||
}
|
||||
|
||||
for (size_t i = 0; i < capacity; i++) {
|
||||
if (group_count > 0) {
|
||||
list->threads[i].group_starts = malloc(group_count * sizeof(size_t));
|
||||
@ -152,6 +218,9 @@ static thread_list_t *thread_list_create(size_t capacity, int group_count) {
|
||||
free(list->threads[j].group_starts);
|
||||
free(list->threads[j].group_ends);
|
||||
}
|
||||
free(list->scratch_starts);
|
||||
free(list->scratch_ends);
|
||||
free(list->state_bitmap);
|
||||
free(list->threads);
|
||||
free(list);
|
||||
return NULL;
|
||||
@ -174,19 +243,76 @@ static void thread_list_free(thread_list_t *list) {
|
||||
free(list->threads[i].group_starts);
|
||||
free(list->threads[i].group_ends);
|
||||
}
|
||||
free(list->scratch_starts);
|
||||
free(list->scratch_ends);
|
||||
free(list->state_bitmap);
|
||||
free(list->threads);
|
||||
free(list);
|
||||
}
|
||||
|
||||
static void thread_list_clear(thread_list_t *list);
|
||||
|
||||
/*
 * Allocate a reusable match context sized for `nfa`.
 *
 * The thread lists and the visited array get one slot per NFA state; the
 * capture arrays get one slot per group, with a minimum of one slot.
 *
 * Returns the new context, or NULL if any allocation fails (everything that
 * was allocated up to that point is released through match_ctx_free).
 */
match_ctx_t *match_ctx_create(nfa_t *nfa) {
    match_ctx_t *ctx = malloc(sizeof *ctx);
    if (ctx == NULL) {
        return NULL;
    }

    const size_t states = nfa->state_count;
    int groups = nfa->group_count;
    if (groups <= 0) {
        groups = 1;
    }

    ctx->num_states = states;
    ctx->group_count = groups;

    /* Every member is assigned before the check so match_ctx_free never sees garbage. */
    ctx->current = thread_list_create(states, groups, states);
    ctx->next = thread_list_create(states, groups, states);
    ctx->visited = calloc(states, sizeof(uint32_t));
    ctx->init_starts = calloc(groups, sizeof(size_t));
    ctx->init_ends = calloc(groups, sizeof(size_t));
    ctx->best_starts = calloc(groups, sizeof(size_t));
    ctx->best_ends = calloc(groups, sizeof(size_t));

    bool complete = ctx->current && ctx->next && ctx->visited &&
                    ctx->init_starts && ctx->init_ends &&
                    ctx->best_starts && ctx->best_ends;
    if (complete) {
        return ctx;
    }

    match_ctx_free(ctx);
    return NULL;
}
|
||||
|
||||
/*
 * Release a match context and everything it owns.
 * Passing NULL is allowed and does nothing.
 */
void match_ctx_free(match_ctx_t *ctx) {
    if (ctx != NULL) {
        thread_list_free(ctx->current);
        thread_list_free(ctx->next);

        free(ctx->visited);
        free(ctx->init_starts);
        free(ctx->init_ends);
        free(ctx->best_starts);
        free(ctx->best_ends);

        free(ctx);
    }
}
|
||||
|
||||
/*
 * Put a context back into its pristine state before a new match attempt:
 * both thread lists emptied, all visited stamps zeroed, and every capture
 * slot set to the "unset" sentinel (size_t)-1.
 */
static void match_ctx_reset(match_ctx_t *ctx) {
    const size_t unset = (size_t)-1;

    thread_list_clear(ctx->current);
    thread_list_clear(ctx->next);

    memset(ctx->visited, 0, ctx->num_states * sizeof(uint32_t));

    for (int slot = 0; slot < ctx->group_count; slot++) {
        ctx->init_starts[slot] = unset;
        ctx->init_ends[slot] = unset;
        ctx->best_starts[slot] = unset;
        ctx->best_ends[slot] = unset;
    }
}
|
||||
|
||||
/*
 * Empty a thread list. Only the bitmap bits belonging to the threads that are
 * currently stored get cleared, so the cost depends on the number of live
 * threads rather than on the size of the bitmap.
 */
static void thread_list_clear(thread_list_t *list) {
    size_t remaining = list->count;

    while (remaining > 0) {
        remaining--;
        BITMAP_CLR(list->state_bitmap, list->threads[remaining].state->id);
    }

    list->count = 0;
}
|
||||
|
||||
static bool thread_list_contains_state(thread_list_t *list, nfa_state_t *state) {
|
||||
for (size_t i = 0; i < list->count; i++) {
|
||||
if (list->threads[i].state == state) return true;
|
||||
}
|
||||
return false;
|
||||
static inline bool thread_list_contains_state(const thread_list_t * restrict list, const nfa_state_t * restrict state) {
|
||||
return BITMAP_GET(list->state_bitmap, state->id) != 0;
|
||||
}
|
||||
|
||||
static void add_thread(thread_list_t *list, nfa_state_t *state,
|
||||
@ -194,44 +320,36 @@ static void add_thread(thread_list_t *list, nfa_state_t *state,
|
||||
|
||||
static void follow_epsilons(thread_list_t *list, nfa_state_t *state,
|
||||
size_t *group_starts, size_t *group_ends,
|
||||
size_t pos, size_t len, bool *visited) {
|
||||
if (!state || visited[state->id]) return;
|
||||
visited[state->id] = true;
|
||||
size_t pos, size_t len, uint32_t *visited, uint32_t gen) {
|
||||
if (!state || visited[state->id] == gen) return;
|
||||
visited[state->id] = gen;
|
||||
|
||||
for (size_t i = 0; i < state->trans_count; i++) {
|
||||
transition_t *t = &state->transitions[i];
|
||||
|
||||
if (t->type == TRANS_EPSILON) {
|
||||
follow_epsilons(list, t->target, group_starts, group_ends,
|
||||
pos, len, visited);
|
||||
pos, len, visited, gen);
|
||||
} else if (t->type == TRANS_GROUP_START) {
|
||||
size_t *new_starts = malloc(list->group_count * sizeof(size_t));
|
||||
size_t *new_ends = malloc(list->group_count * sizeof(size_t));
|
||||
if (new_starts && new_ends) {
|
||||
memcpy(new_starts, group_starts, list->group_count * sizeof(size_t));
|
||||
memcpy(new_ends, group_ends, list->group_count * sizeof(size_t));
|
||||
new_starts[t->group_id] = pos;
|
||||
follow_epsilons(list, t->target, new_starts, new_ends,
|
||||
pos, len, visited);
|
||||
}
|
||||
free(new_starts);
|
||||
free(new_ends);
|
||||
size_t *scratch_s = list->scratch_starts;
|
||||
size_t *scratch_e = list->scratch_ends;
|
||||
memcpy(scratch_s, group_starts, list->group_count * sizeof(size_t));
|
||||
memcpy(scratch_e, group_ends, list->group_count * sizeof(size_t));
|
||||
scratch_s[t->group_id] = pos;
|
||||
follow_epsilons(list, t->target, scratch_s, scratch_e,
|
||||
pos, len, visited, gen);
|
||||
} else if (t->type == TRANS_GROUP_END) {
|
||||
size_t *new_starts = malloc(list->group_count * sizeof(size_t));
|
||||
size_t *new_ends = malloc(list->group_count * sizeof(size_t));
|
||||
if (new_starts && new_ends) {
|
||||
memcpy(new_starts, group_starts, list->group_count * sizeof(size_t));
|
||||
memcpy(new_ends, group_ends, list->group_count * sizeof(size_t));
|
||||
new_ends[t->group_id] = pos;
|
||||
follow_epsilons(list, t->target, new_starts, new_ends,
|
||||
pos, len, visited);
|
||||
}
|
||||
free(new_starts);
|
||||
free(new_ends);
|
||||
size_t *scratch_s = list->scratch_starts;
|
||||
size_t *scratch_e = list->scratch_ends;
|
||||
memcpy(scratch_s, group_starts, list->group_count * sizeof(size_t));
|
||||
memcpy(scratch_e, group_ends, list->group_count * sizeof(size_t));
|
||||
scratch_e[t->group_id] = pos;
|
||||
follow_epsilons(list, t->target, scratch_s, scratch_e,
|
||||
pos, len, visited, gen);
|
||||
} else if (t->type == TRANS_ANCHOR_START || t->type == TRANS_ANCHOR_END) {
|
||||
if (transition_matches(t, '\0', pos, len)) {
|
||||
follow_epsilons(list, t->target, group_starts, group_ends,
|
||||
pos, len, visited);
|
||||
pos, len, visited, gen);
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -243,9 +361,9 @@ static void add_thread(thread_list_t *list, nfa_state_t *state,
|
||||
size_t *group_starts, size_t *group_ends) {
|
||||
if (!state) return;
|
||||
if (thread_list_contains_state(list, state)) return;
|
||||
|
||||
if (list->count >= list->capacity) return;
|
||||
|
||||
BITMAP_SET(list->state_bitmap, state->id);
|
||||
thread_t *thread = &list->threads[list->count++];
|
||||
thread->state = state;
|
||||
if (list->group_count > 0) {
|
||||
@ -254,14 +372,15 @@ static void add_thread(thread_list_t *list, nfa_state_t *state,
|
||||
}
|
||||
}
|
||||
|
||||
bool nfa_match(nfa_t *nfa, const char *text, size_t start_pos, loreg_match_t *result) {
|
||||
bool nfa_match(nfa_t *nfa, const char *text, size_t start_pos, lorex_match_t *result) {
|
||||
size_t len = strlen(text);
|
||||
size_t num_states = nfa->state_count;
|
||||
int group_count = nfa->group_count > 0 ? nfa->group_count : 1;
|
||||
|
||||
thread_list_t *current = thread_list_create(num_states, group_count);
|
||||
thread_list_t *next = thread_list_create(num_states, group_count);
|
||||
bool *visited = calloc(num_states, sizeof(bool));
|
||||
thread_list_t *current = thread_list_create(num_states, group_count, num_states);
|
||||
thread_list_t *next = thread_list_create(num_states, group_count, num_states);
|
||||
uint32_t *visited = calloc(num_states, sizeof(uint32_t));
|
||||
uint32_t generation = 1;
|
||||
|
||||
if (!current || !next || !visited) {
|
||||
thread_list_free(current);
|
||||
@ -286,9 +405,8 @@ bool nfa_match(nfa_t *nfa, const char *text, size_t start_pos, loreg_match_t *re
|
||||
init_ends[i] = (size_t)-1;
|
||||
}
|
||||
|
||||
memset(visited, 0, num_states * sizeof(bool));
|
||||
follow_epsilons(current, nfa->start, init_starts, init_ends,
|
||||
start_pos, len, visited);
|
||||
start_pos, len, visited, generation++);
|
||||
|
||||
bool matched = false;
|
||||
size_t match_end = start_pos;
|
||||
@ -322,27 +440,29 @@ bool nfa_match(nfa_t *nfa, const char *text, size_t start_pos, loreg_match_t *re
|
||||
}
|
||||
|
||||
for (size_t pos = start_pos; pos < len; pos++) {
|
||||
char c = text[pos];
|
||||
unsigned char c = (unsigned char)text[pos];
|
||||
thread_list_clear(next);
|
||||
|
||||
for (size_t i = 0; i < current->count; i++) {
|
||||
thread_t *thread = ¤t->threads[i];
|
||||
nfa_state_t *state = thread->state;
|
||||
size_t trans_count = state->trans_count;
|
||||
transition_t *transitions = state->transitions;
|
||||
|
||||
for (size_t j = 0; j < state->trans_count; j++) {
|
||||
transition_t *t = &state->transitions[j];
|
||||
for (size_t j = 0; j < trans_count; j++) {
|
||||
transition_t *t = &transitions[j];
|
||||
transition_type_t type = t->type;
|
||||
|
||||
if (t->type != TRANS_EPSILON &&
|
||||
t->type != TRANS_GROUP_START &&
|
||||
t->type != TRANS_GROUP_END &&
|
||||
t->type != TRANS_ANCHOR_START &&
|
||||
t->type != TRANS_ANCHOR_END) {
|
||||
if (type != TRANS_EPSILON &&
|
||||
type != TRANS_GROUP_START &&
|
||||
type != TRANS_GROUP_END &&
|
||||
type != TRANS_ANCHOR_START &&
|
||||
type != TRANS_ANCHOR_END) {
|
||||
|
||||
if (transition_matches(t, c, pos, len)) {
|
||||
memset(visited, 0, num_states * sizeof(bool));
|
||||
follow_epsilons(next, t->target,
|
||||
thread->group_starts, thread->group_ends,
|
||||
pos + 1, len, visited);
|
||||
pos + 1, len, visited, generation++);
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -371,7 +491,7 @@ bool nfa_match(nfa_t *nfa, const char *text, size_t start_pos, loreg_match_t *re
|
||||
result->match_end = matched ? match_end : start_pos;
|
||||
result->group_count = nfa->group_count;
|
||||
|
||||
for (int i = 0; i < LOREG_MAX_GROUPS && i < nfa->group_count; i++) {
|
||||
for (int i = 0; i < LOREX_MAX_GROUPS && i < nfa->group_count; i++) {
|
||||
result->groups[i].start = best_starts[i];
|
||||
result->groups[i].end = best_ends[i];
|
||||
result->groups[i].matched = (best_starts[i] != (size_t)-1 && best_ends[i] != (size_t)-1);
|
||||
@ -389,23 +509,259 @@ bool nfa_match(nfa_t *nfa, const char *text, size_t start_pos, loreg_match_t *re
|
||||
return matched;
|
||||
}
|
||||
|
||||
bool nfa_search(nfa_t *nfa, const char *text, loreg_match_t *result) {
|
||||
bool nfa_match_with_ctx(nfa_t *nfa, const char *text, size_t start_pos, lorex_match_t *result, match_ctx_t *ctx) {
|
||||
size_t len = strlen(text);
|
||||
int group_count = ctx->group_count;
|
||||
uint32_t generation = 1;
|
||||
|
||||
for (size_t i = 0; i <= len; i++) {
|
||||
if (nfa_match(nfa, text, i, result)) {
|
||||
if (result) {
|
||||
result->match_start = i;
|
||||
}
|
||||
return true;
|
||||
match_ctx_reset(ctx);
|
||||
|
||||
follow_epsilons(ctx->current, nfa->start, ctx->init_starts, ctx->init_ends,
|
||||
start_pos, len, ctx->visited, generation++);
|
||||
|
||||
bool matched = false;
|
||||
size_t match_end = start_pos;
|
||||
|
||||
for (size_t i = 0; i < ctx->current->count; i++) {
|
||||
if (ctx->current->threads[i].state->accepting) {
|
||||
matched = true;
|
||||
match_end = start_pos;
|
||||
memcpy(ctx->best_starts, ctx->current->threads[i].group_starts, group_count * sizeof(size_t));
|
||||
memcpy(ctx->best_ends, ctx->current->threads[i].group_ends, group_count * sizeof(size_t));
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
thread_list_t *current = ctx->current;
|
||||
thread_list_t *next = ctx->next;
|
||||
|
||||
for (size_t pos = start_pos; pos < len; pos++) {
|
||||
unsigned char c = (unsigned char)text[pos];
|
||||
thread_list_clear(next);
|
||||
|
||||
for (size_t i = 0; i < current->count; i++) {
|
||||
thread_t *thread = ¤t->threads[i];
|
||||
nfa_state_t *state = thread->state;
|
||||
size_t trans_count = state->trans_count;
|
||||
transition_t *transitions = state->transitions;
|
||||
|
||||
for (size_t j = 0; j < trans_count; j++) {
|
||||
transition_t *t = &transitions[j];
|
||||
transition_type_t type = t->type;
|
||||
|
||||
if (type != TRANS_EPSILON &&
|
||||
type != TRANS_GROUP_START &&
|
||||
type != TRANS_GROUP_END &&
|
||||
type != TRANS_ANCHOR_START &&
|
||||
type != TRANS_ANCHOR_END) {
|
||||
|
||||
if (transition_matches(t, c, pos, len)) {
|
||||
follow_epsilons(next, t->target,
|
||||
thread->group_starts, thread->group_ends,
|
||||
pos + 1, len, ctx->visited, generation++);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (next->count == 0) break;
|
||||
|
||||
thread_list_t *tmp = current;
|
||||
current = next;
|
||||
next = tmp;
|
||||
|
||||
for (size_t i = 0; i < current->count; i++) {
|
||||
if (current->threads[i].state->accepting) {
|
||||
matched = true;
|
||||
match_end = pos + 1;
|
||||
memcpy(ctx->best_starts, current->threads[i].group_starts, group_count * sizeof(size_t));
|
||||
memcpy(ctx->best_ends, current->threads[i].group_ends, group_count * sizeof(size_t));
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (result) {
|
||||
result->matched = matched;
|
||||
result->match_start = start_pos;
|
||||
result->match_end = matched ? match_end : start_pos;
|
||||
result->group_count = nfa->group_count;
|
||||
|
||||
for (int i = 0; i < LOREX_MAX_GROUPS && i < nfa->group_count; i++) {
|
||||
result->groups[i].start = ctx->best_starts[i];
|
||||
result->groups[i].end = ctx->best_ends[i];
|
||||
result->groups[i].matched = (ctx->best_starts[i] != (size_t)-1 && ctx->best_ends[i] != (size_t)-1);
|
||||
}
|
||||
}
|
||||
|
||||
return matched;
|
||||
}
|
||||
|
||||
/*
 * Fill `result` with the canonical "nothing matched" values.
 * A NULL `result` is accepted and ignored.
 */
static void set_no_match(lorex_match_t *result) {
    if (result == NULL) {
        return;
    }

    result->group_count = 0;
    result->match_end = 0;
    result->match_start = 0;
    result->matched = false;
}
|
||||
|
||||
bool nfa_search(nfa_t *nfa, const char *text, lorex_match_t *result) {
|
||||
size_t len = strlen(text);
|
||||
|
||||
if (nfa->anchored_start) {
|
||||
bool matched = nfa_match(nfa, text, 0, result);
|
||||
if (matched && result) {
|
||||
result->match_start = 0;
|
||||
}
|
||||
if (!matched) {
|
||||
set_no_match(result);
|
||||
}
|
||||
return matched;
|
||||
}
|
||||
|
||||
if (nfa->is_pure_literal && nfa->literal_prefix && nfa->prefix_len > 0) {
|
||||
const char *found = strstr(text, nfa->literal_prefix);
|
||||
if (found) {
|
||||
if (result) {
|
||||
result->matched = true;
|
||||
result->match_start = (size_t)(found - text);
|
||||
result->match_end = result->match_start + nfa->prefix_len;
|
||||
result->group_count = 0;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
set_no_match(result);
|
||||
return false;
|
||||
}
|
||||
|
||||
match_ctx_t *ctx = match_ctx_create(nfa);
|
||||
if (!ctx) {
|
||||
set_no_match(result);
|
||||
return false;
|
||||
}
|
||||
|
||||
if (nfa->anchored_end && nfa->suffix_len > 0) {
|
||||
if (len < nfa->suffix_len) {
|
||||
match_ctx_free(ctx);
|
||||
set_no_match(result);
|
||||
return false;
|
||||
}
|
||||
if (memcmp(text + len - nfa->suffix_len, nfa->literal_suffix, nfa->suffix_len) != 0) {
|
||||
match_ctx_free(ctx);
|
||||
set_no_match(result);
|
||||
return false;
|
||||
}
|
||||
size_t suffix_start = len - nfa->suffix_len;
|
||||
for (size_t i = suffix_start + 1; i > 0; i--) {
|
||||
size_t pos = i - 1;
|
||||
if (nfa_match_with_ctx(nfa, text, pos, result, ctx)) {
|
||||
if (result) result->match_start = pos;
|
||||
match_ctx_free(ctx);
|
||||
return true;
|
||||
}
|
||||
}
|
||||
match_ctx_free(ctx);
|
||||
set_no_match(result);
|
||||
return false;
|
||||
}
|
||||
|
||||
if (nfa->has_alt_dispatch) {
|
||||
for (size_t i = 0; i < len; i++) {
|
||||
unsigned char c = (unsigned char)text[i];
|
||||
if (nfa->alt_dispatch[c] == 255) continue;
|
||||
if (nfa_match_with_ctx(nfa, text, i, result, ctx)) {
|
||||
if (result) result->match_start = i;
|
||||
match_ctx_free(ctx);
|
||||
return true;
|
||||
}
|
||||
}
|
||||
match_ctx_free(ctx);
|
||||
set_no_match(result);
|
||||
return false;
|
||||
}
|
||||
|
||||
if (nfa->prefix_len >= 2) {
|
||||
const char *pos = text;
|
||||
const char *end = text + len;
|
||||
while (pos <= end - nfa->prefix_len) {
|
||||
pos = strstr(pos, nfa->literal_prefix);
|
||||
if (!pos) break;
|
||||
size_t offset = (size_t)(pos - text);
|
||||
if (nfa_match_with_ctx(nfa, text, offset, result, ctx)) {
|
||||
if (result) {
|
||||
result->match_start = offset;
|
||||
}
|
||||
match_ctx_free(ctx);
|
||||
return true;
|
||||
}
|
||||
pos++;
|
||||
}
|
||||
match_ctx_free(ctx);
|
||||
set_no_match(result);
|
||||
return false;
|
||||
}
|
||||
|
||||
if (nfa->single_first_char != 0) {
|
||||
const char *pos = text;
|
||||
const char *end = text + len;
|
||||
while (pos < end) {
|
||||
pos = memchr(pos, nfa->single_first_char, (size_t)(end - pos));
|
||||
if (!pos) break;
|
||||
size_t offset = (size_t)(pos - text);
|
||||
if (nfa_match_with_ctx(nfa, text, offset, result, ctx)) {
|
||||
if (result) {
|
||||
result->match_start = offset;
|
||||
}
|
||||
match_ctx_free(ctx);
|
||||
return true;
|
||||
}
|
||||
pos++;
|
||||
}
|
||||
match_ctx_free(ctx);
|
||||
set_no_match(result);
|
||||
return false;
|
||||
}
|
||||
|
||||
if (nfa->first_chars_valid) {
|
||||
bool has_any_first_char = false;
|
||||
for (int i = 0; i < 32; i++) {
|
||||
if (nfa->first_chars[i]) {
|
||||
has_any_first_char = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (has_any_first_char) {
|
||||
for (size_t i = 0; i < len; i++) {
|
||||
unsigned char c = (unsigned char)text[i];
|
||||
if (!(nfa->first_chars[c >> 3] & (1u << (c & 7)))) {
|
||||
continue;
|
||||
}
|
||||
if (nfa_match_with_ctx(nfa, text, i, result, ctx)) {
|
||||
if (result) {
|
||||
result->match_start = i;
|
||||
}
|
||||
match_ctx_free(ctx);
|
||||
return true;
|
||||
}
|
||||
}
|
||||
match_ctx_free(ctx);
|
||||
set_no_match(result);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
for (size_t i = 0; i <= len; i++) {
|
||||
if (nfa_match_with_ctx(nfa, text, i, result, ctx)) {
|
||||
if (result) {
|
||||
result->match_start = i;
|
||||
}
|
||||
match_ctx_free(ctx);
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
match_ctx_free(ctx);
|
||||
set_no_match(result);
|
||||
return false;
|
||||
}
|
||||
|
||||
406
src/nfa.c
406
src/nfa.c
@ -11,9 +11,282 @@ nfa_t *nfa_create(void) {
|
||||
nfa->capacity = 0;
|
||||
nfa->start = NULL;
|
||||
nfa->group_count = 0;
|
||||
nfa->anchored_start = false;
|
||||
nfa->anchored_end = false;
|
||||
memset(nfa->first_chars, 0, 32);
|
||||
nfa->first_chars_valid = false;
|
||||
nfa->literal_prefix = NULL;
|
||||
nfa->prefix_len = 0;
|
||||
nfa->is_pure_literal = false;
|
||||
nfa->single_first_char = 0;
|
||||
nfa->literal_suffix = NULL;
|
||||
nfa->suffix_len = 0;
|
||||
memset(nfa->alt_dispatch, 255, 256);
|
||||
nfa->has_alt_dispatch = false;
|
||||
return nfa;
|
||||
}
|
||||
|
||||
/*
 * Report whether the pattern begins with a start anchor.
 * Walks down the left spine through concatenations and groups; any other
 * node type ends the walk with a negative answer.
 */
static bool ast_starts_with_anchor(ast_node_t *ast) {
    for (ast_node_t *node = ast; node != NULL; node = node->left) {
        if (node->type == AST_ANCHOR_START) {
            return true;
        }
        if (node->type != AST_CONCAT && node->type != AST_GROUP) {
            return false;
        }
    }
    return false;
}
|
||||
|
||||
/*
 * Report whether the pattern finishes with an end anchor.
 * A concatenation is followed through its right child, a group through its
 * (only) left child; any other node type gives a negative answer.
 */
static bool ast_ends_with_anchor(ast_node_t *ast) {
    ast_node_t *node = ast;

    while (node != NULL) {
        if (node->type == AST_ANCHOR_END) {
            return true;
        }
        if (node->type == AST_CONCAT) {
            node = node->right;
        } else if (node->type == AST_GROUP) {
            node = node->left;
        } else {
            return false;
        }
    }
    return false;
}
|
||||
|
||||
/* Set the bit for byte value `c` in a 256-bit (32-byte) bitmap. */
static inline void first_chars_set(unsigned char *bitmap, unsigned int c) {
    bitmap[c >> 3] |= (unsigned char)(1u << (c & 7));
}

/* Set the bits for every byte value in the inclusive range [lo, hi]. */
static inline void first_chars_set_range(unsigned char *bitmap,
                                         unsigned int lo, unsigned int hi) {
    for (unsigned int c = lo; c <= hi; c++) {
        first_chars_set(bitmap, c);
    }
}

/*
 * Collect the set of bytes a match of `ast` can begin with.
 *
 * Parameters:
 *   ast    - pattern subtree; NULL (an empty pattern) invalidates the set
 *   bitmap - 32-byte bitmap, one bit per byte value; bits are only ever added
 *   valid  - cleared to false whenever the first byte cannot be pinned down
 *            (the node may match the empty string, matches "almost anything",
 *            or is zero-width); never set to true here, so the caller is
 *            expected to initialise it
 *
 * The bitmap is a superset of the true first-byte set whenever *valid is
 * still true afterwards, which is what makes it safe to skip start offsets
 * whose byte is not in it.
 */
static void extract_first_chars(ast_node_t *ast, unsigned char *bitmap, bool *valid) {
    if (!ast) {
        *valid = false;
        return;
    }

    switch (ast->type) {
        case AST_CHAR:
            first_chars_set(bitmap, (unsigned char)ast->value);
            break;

        case AST_CONCAT:
        case AST_PLUS:
        case AST_GROUP:
            /* The first byte of these is the first byte of the left child. */
            extract_first_chars(ast->left, bitmap, valid);
            break;

        case AST_ALTER:
            extract_first_chars(ast->left, bitmap, valid);
            extract_first_chars(ast->right, bitmap, valid);
            break;

        case AST_QUANTIFIER:
            if (ast->quant.min > 0) {
                extract_first_chars(ast->left, bitmap, valid);
            } else {
                *valid = false;
            }
            break;

        case AST_BRACKET:
            if (ast->bracket && !ast->bracket->negated) {
                for (size_t i = 0; i < ast->bracket->count; i++) {
                    first_chars_set_range(bitmap,
                                          (unsigned char)ast->bracket->ranges[i].start,
                                          (unsigned char)ast->bracket->ranges[i].end);
                }
            } else {
                *valid = false;
            }
            break;

        case AST_CLASS_DIGIT:
            first_chars_set_range(bitmap, '0', '9');
            break;

        case AST_CLASS_WORD:
            first_chars_set_range(bitmap, 'a', 'z');
            first_chars_set_range(bitmap, 'A', 'Z');
            first_chars_set_range(bitmap, '0', '9');
            first_chars_set(bitmap, '_');
            break;

        case AST_CLASS_SPACE:
            first_chars_set(bitmap, ' ');
            first_chars_set(bitmap, '\t');
            first_chars_set(bitmap, '\n');
            first_chars_set(bitmap, '\r');
            /* Form feed and vertical tab are whitespace too; leaving them
             * out would make a search skip offsets where \s can match.
             * NOTE(review): keep in sync with the matcher's space table. */
            first_chars_set(bitmap, '\f');
            first_chars_set(bitmap, '\v');
            break;

        case AST_ANCHOR_START:
        case AST_ANCHOR_END:
            /* A leading anchor consumes nothing, so the real first byte
             * comes from whatever follows it, which is not visible from
             * here. Treating the set as valid would drop that byte (for
             * example `^a|b` would yield only 'b'), so give up instead. */
            *valid = false;
            break;

        case AST_DOT:
        case AST_STAR:
        case AST_QUESTION:
        case AST_CLASS_NDIGIT:
        case AST_CLASS_NWORD:
        case AST_CLASS_NSPACE:
        default:
            /* Either nullable, too broad to enumerate usefully, or a node
             * type this function does not know: be conservative. */
            *valid = false;
            break;
    }
}
|
||||
|
||||
/*
 * Tell whether a subtree consists solely of literal characters joined by
 * concatenation. An absent subtree (NULL) counts as a pure literal.
 */
static bool ast_is_pure_literal(ast_node_t *ast) {
    if (ast == NULL) {
        return true;
    }
    if (ast->type == AST_CHAR) {
        return true;
    }
    if (ast->type != AST_CONCAT) {
        return false;
    }
    return ast_is_pure_literal(ast->left) && ast_is_pure_literal(ast->right);
}
|
||||
|
||||
/*
 * Copy the literal characters every match must start with into `buf`.
 *
 * At most `max_len` bytes are written and no terminator is appended.
 * Extraction continues into the right side of a concatenation only while the
 * left side is entirely literal. Groups are looked through; anything else
 * (including anchors) contributes nothing.
 *
 * Returns the number of bytes written.
 */
static size_t extract_literal_prefix(ast_node_t *ast, char *buf, size_t max_len) {
    if (ast == NULL || max_len == 0) {
        return 0;
    }

    if (ast->type == AST_CHAR) {
        *buf = ast->value;
        return 1;
    }

    if (ast->type == AST_GROUP) {
        return extract_literal_prefix(ast->left, buf, max_len);
    }

    if (ast->type != AST_CONCAT) {
        return 0;
    }

    size_t taken = extract_literal_prefix(ast->left, buf, max_len);
    if (taken == 0 || !ast_is_pure_literal(ast->left)) {
        return taken;
    }
    return taken + extract_literal_prefix(ast->right, buf + taken, max_len - taken);
}
|
||||
|
||||
/*
 * Return the pattern without its trailing end anchor.
 *
 * - NULL or a lone end anchor gives NULL.
 * - A concatenation whose right child is an end anchor gives its left child.
 * - Anything else is returned unchanged.
 *
 * Nothing is allocated or freed; the result points into the given tree.
 */
static ast_node_t *strip_end_anchor(ast_node_t *ast) {
    if (ast == NULL || ast->type == AST_ANCHOR_END) {
        return NULL;
    }

    bool trailing_anchor = ast->type == AST_CONCAT &&
                           ast->right != NULL &&
                           ast->right->type == AST_ANCHOR_END;

    return trailing_anchor ? ast->left : ast;
}
|
||||
|
||||
/*
 * Collect the literal characters every match must end with, written to `buf`
 * in reverse order (last character of the pattern first).
 *
 * At most `max_len` bytes are written. Extraction continues into the left
 * side of a concatenation only while the right side is entirely literal.
 * Groups are looked through; every other node type contributes nothing.
 *
 * Returns the number of bytes written.
 */
static size_t extract_literal_suffix_rev(ast_node_t *ast, char *buf, size_t max_len) {
    if (ast == NULL || max_len == 0) {
        return 0;
    }

    if (ast->type == AST_CHAR) {
        *buf = ast->value;
        return 1;
    }

    if (ast->type == AST_GROUP) {
        return extract_literal_suffix_rev(ast->left, buf, max_len);
    }

    if (ast->type != AST_CONCAT) {
        return 0;
    }

    size_t taken = extract_literal_suffix_rev(ast->right, buf, max_len);
    if (taken == 0 || !ast_is_pure_literal(ast->right)) {
        return taken;
    }
    return taken + extract_literal_suffix_rev(ast->left, buf + taken, max_len - taken);
}
|
||||
|
||||
/*
 * Write the literal suffix of the pattern (ignoring a trailing end anchor)
 * into `buf` in reading order. At most `max_len` bytes are written and no
 * terminator is appended.
 *
 * Returns the suffix length, or 0 when there is no literal suffix.
 */
static size_t extract_literal_suffix(ast_node_t *ast, char *buf, size_t max_len) {
    ast_node_t *body = strip_end_anchor(ast);
    if (body == NULL) {
        return 0;
    }

    size_t len = extract_literal_suffix_rev(body, buf, max_len);

    /* The helper produced the characters back to front; flip them in place. */
    size_t lo = 0;
    size_t hi = len;
    while (lo + 1 < hi) {
        hi--;
        char held = buf[lo];
        buf[lo] = buf[hi];
        buf[hi] = held;
        lo++;
    }

    return len;
}
|
||||
|
||||
/*
 * Mark in `dispatch` (256 entries) every byte a match of `ast` can start
 * with, by storing 1 at that byte's index.
 *
 * Returns true when the marked bytes cover every possible first byte of
 * `ast`. Returns false when that cannot be guaranteed: the node may match
 * the empty string, is zero-width, matches a set that is not enumerated here
 * (dot, negated sets), or is of an unknown type. In that case the table must
 * not be used to skip start offsets.
 */
static bool build_alt_dispatch_node(ast_node_t *ast, unsigned char *dispatch) {
    if (!ast) {
        return false; /* an empty alternative matches without consuming a byte */
    }

    switch (ast->type) {
        case AST_CHAR:
            dispatch[(unsigned char)ast->value] = 1;
            return true;

        case AST_CONCAT:
        case AST_GROUP:
        case AST_PLUS:
            /* First byte is decided by the left child. */
            return build_alt_dispatch_node(ast->left, dispatch);

        case AST_QUANTIFIER:
            /* Only a mandatory repetition pins down the first byte. */
            return ast->quant.min > 0 &&
                   build_alt_dispatch_node(ast->left, dispatch);

        case AST_ALTER: {
            /* Visit both sides before combining so neither is skipped. */
            bool left_ok = build_alt_dispatch_node(ast->left, dispatch);
            bool right_ok = build_alt_dispatch_node(ast->right, dispatch);
            return left_ok && right_ok;
        }

        case AST_BRACKET:
            if (!ast->bracket || ast->bracket->negated) {
                return false;
            }
            for (size_t i = 0; i < ast->bracket->count; i++) {
                unsigned char s = (unsigned char)ast->bracket->ranges[i].start;
                unsigned char e = (unsigned char)ast->bracket->ranges[i].end;
                for (unsigned int c = s; c <= e; c++) {
                    dispatch[c] = 1;
                }
            }
            return true;

        case AST_CLASS_DIGIT:
            for (char c = '0'; c <= '9'; c++) {
                dispatch[(unsigned char)c] = 1;
            }
            return true;

        case AST_CLASS_WORD:
            for (char c = 'a'; c <= 'z'; c++) dispatch[(unsigned char)c] = 1;
            for (char c = 'A'; c <= 'Z'; c++) dispatch[(unsigned char)c] = 1;
            for (char c = '0'; c <= '9'; c++) dispatch[(unsigned char)c] = 1;
            dispatch['_'] = 1;
            return true;

        case AST_CLASS_SPACE:
            /* NOTE(review): keep in sync with the matcher's space table. */
            dispatch[' '] = 1;
            dispatch['\t'] = 1;
            dispatch['\n'] = 1;
            dispatch['\r'] = 1;
            dispatch['\f'] = 1;
            dispatch['\v'] = 1;
            return true;

        default:
            /* Dot, star, question mark, negated classes, anchors, ... */
            return false;
    }
}

/*
 * True when the pattern, after looking through any enclosing groups, is an
 * alternation.
 */
static bool is_top_level_alternation(ast_node_t *ast) {
    if (!ast) return false;
    if (ast->type == AST_ALTER) return true;
    if (ast->type == AST_GROUP) return is_top_level_alternation(ast->left);
    return false;
}

/*
 * Build the first-byte dispatch table for a top-level alternation.
 *
 * On success every byte that can start a match holds 0 and every other byte
 * holds 255; the function then returns true if at least one byte is usable.
 *
 * Returns false, leaving all 256 entries at 255, when the pattern is not an
 * alternation or when some alternative's first byte cannot be enumerated;
 * in that case the table must not be used to filter start offsets.
 */
static bool build_alt_dispatch(ast_node_t *ast, unsigned char *dispatch) {
    if (!is_top_level_alternation(ast)) return false;

    memset(dispatch, 255, 256);
    if (!build_alt_dispatch_node(ast, dispatch)) {
        /* Discard the partial marks: an incomplete table would make the
         * search skip offsets where a match is possible. */
        memset(dispatch, 255, 256);
        return false;
    }

    bool has_any = false;
    for (int i = 0; i < 256; i++) {
        if (dispatch[i] == 1) {
            dispatch[i] = 0;
            has_any = true;
        }
    }
    return has_any;
}
|
||||
|
||||
/*
 * Inspect a 256-bit first-byte bitmap (32 bytes, bit i of the set stored at
 * bitmap[i >> 3], mask 1 << (i & 7)).
 *
 * Returns the byte value when exactly one bit is set, and 0 when the bitmap
 * is empty or has more than one bit set.
 */
static char compute_single_first_char(unsigned char *bitmap) {
    int only = -1;

    for (int byte = 0; byte < 32; byte++) {
        unsigned char bits = bitmap[byte];
        if (bits == 0) {
            continue;
        }
        /* A second populated byte, or two bits in this one, means "not single". */
        if (only >= 0 || (bits & (bits - 1)) != 0) {
            return 0;
        }
        int bit = 0;
        while (!(bits & (1u << bit))) {
            bit++;
        }
        only = byte * 8 + bit;
    }

    return only < 0 ? 0 : (char)only;
}
|
||||
|
||||
void nfa_free(nfa_t *nfa) {
|
||||
if (!nfa) return;
|
||||
for (size_t i = 0; i < nfa->state_count; i++) {
|
||||
@ -21,14 +294,16 @@ void nfa_free(nfa_t *nfa) {
|
||||
free(nfa->states[i]);
|
||||
}
|
||||
free(nfa->states);
|
||||
free(nfa->literal_prefix);
|
||||
free(nfa->literal_suffix);
|
||||
free(nfa);
|
||||
}
|
||||
|
||||
static bool nfa_grow(nfa_t *nfa) {
|
||||
size_t new_cap = nfa->capacity == 0 ? 16 : nfa->capacity * 2;
|
||||
if (new_cap > LOREG_MAX_STATES) {
|
||||
if (nfa->capacity >= LOREG_MAX_STATES) return false;
|
||||
new_cap = LOREG_MAX_STATES;
|
||||
if (new_cap > LOREX_MAX_STATES) {
|
||||
if (nfa->capacity >= LOREX_MAX_STATES) return false;
|
||||
new_cap = LOREX_MAX_STATES;
|
||||
}
|
||||
nfa_state_t **new_states = realloc(nfa->states, new_cap * sizeof(nfa_state_t *));
|
||||
if (!new_states) return false;
|
||||
@ -100,14 +375,14 @@ void nfa_add_group_transition(nfa_state_t *from, nfa_state_t *to, transition_typ
|
||||
t->group_id = group_id;
|
||||
}
|
||||
|
||||
static nfa_fragment_t build_nfa(nfa_t *nfa, ast_node_t *ast, loreg_error_t *error);
|
||||
static nfa_fragment_t build_nfa(nfa_t *nfa, ast_node_t *ast, lorex_error_t *error);
|
||||
|
||||
static nfa_fragment_t build_char(nfa_t *nfa, char c, loreg_error_t *error) {
|
||||
static nfa_fragment_t build_char(nfa_t *nfa, char c, lorex_error_t *error) {
|
||||
nfa_fragment_t frag = {NULL, NULL};
|
||||
nfa_state_t *start = nfa_add_state(nfa);
|
||||
nfa_state_t *accept = nfa_add_state(nfa);
|
||||
if (!start || !accept) {
|
||||
*error = LOREG_ERR_OUT_OF_MEMORY;
|
||||
*error = LOREX_ERR_OUT_OF_MEMORY;
|
||||
return frag;
|
||||
}
|
||||
nfa_add_transition(start, accept, TRANS_CHAR, c);
|
||||
@ -116,12 +391,12 @@ static nfa_fragment_t build_char(nfa_t *nfa, char c, loreg_error_t *error) {
|
||||
return frag;
|
||||
}
|
||||
|
||||
static nfa_fragment_t build_dot(nfa_t *nfa, loreg_error_t *error) {
|
||||
static nfa_fragment_t build_dot(nfa_t *nfa, lorex_error_t *error) {
|
||||
nfa_fragment_t frag = {NULL, NULL};
|
||||
nfa_state_t *start = nfa_add_state(nfa);
|
||||
nfa_state_t *accept = nfa_add_state(nfa);
|
||||
if (!start || !accept) {
|
||||
*error = LOREG_ERR_OUT_OF_MEMORY;
|
||||
*error = LOREX_ERR_OUT_OF_MEMORY;
|
||||
return frag;
|
||||
}
|
||||
nfa_add_transition(start, accept, TRANS_DOT, '\0');
|
||||
@ -130,12 +405,12 @@ static nfa_fragment_t build_dot(nfa_t *nfa, loreg_error_t *error) {
|
||||
return frag;
|
||||
}
|
||||
|
||||
static nfa_fragment_t build_class(nfa_t *nfa, transition_type_t type, loreg_error_t *error) {
|
||||
static nfa_fragment_t build_class(nfa_t *nfa, transition_type_t type, lorex_error_t *error) {
|
||||
nfa_fragment_t frag = {NULL, NULL};
|
||||
nfa_state_t *start = nfa_add_state(nfa);
|
||||
nfa_state_t *accept = nfa_add_state(nfa);
|
||||
if (!start || !accept) {
|
||||
*error = LOREG_ERR_OUT_OF_MEMORY;
|
||||
*error = LOREX_ERR_OUT_OF_MEMORY;
|
||||
return frag;
|
||||
}
|
||||
nfa_add_transition(start, accept, type, '\0');
|
||||
@ -144,12 +419,12 @@ static nfa_fragment_t build_class(nfa_t *nfa, transition_type_t type, loreg_erro
|
||||
return frag;
|
||||
}
|
||||
|
||||
static nfa_fragment_t build_bracket(nfa_t *nfa, bracket_class_t *bracket, loreg_error_t *error) {
|
||||
static nfa_fragment_t build_bracket(nfa_t *nfa, bracket_class_t *bracket, lorex_error_t *error) {
|
||||
nfa_fragment_t frag = {NULL, NULL};
|
||||
nfa_state_t *start = nfa_add_state(nfa);
|
||||
nfa_state_t *accept = nfa_add_state(nfa);
|
||||
if (!start || !accept) {
|
||||
*error = LOREG_ERR_OUT_OF_MEMORY;
|
||||
*error = LOREX_ERR_OUT_OF_MEMORY;
|
||||
return frag;
|
||||
}
|
||||
nfa_add_bracket_transition(start, accept, bracket);
|
||||
@ -158,12 +433,12 @@ static nfa_fragment_t build_bracket(nfa_t *nfa, bracket_class_t *bracket, loreg_
|
||||
return frag;
|
||||
}
|
||||
|
||||
static nfa_fragment_t build_concat(nfa_t *nfa, ast_node_t *left, ast_node_t *right, loreg_error_t *error) {
|
||||
static nfa_fragment_t build_concat(nfa_t *nfa, ast_node_t *left, ast_node_t *right, lorex_error_t *error) {
|
||||
nfa_fragment_t frag = {NULL, NULL};
|
||||
nfa_fragment_t left_frag = build_nfa(nfa, left, error);
|
||||
if (*error != LOREG_OK) return frag;
|
||||
if (*error != LOREX_OK) return frag;
|
||||
nfa_fragment_t right_frag = build_nfa(nfa, right, error);
|
||||
if (*error != LOREG_OK) return frag;
|
||||
if (*error != LOREX_OK) return frag;
|
||||
|
||||
nfa_add_transition(left_frag.accept, right_frag.start, TRANS_EPSILON, '\0');
|
||||
frag.start = left_frag.start;
|
||||
@ -171,19 +446,19 @@ static nfa_fragment_t build_concat(nfa_t *nfa, ast_node_t *left, ast_node_t *rig
|
||||
return frag;
|
||||
}
|
||||
|
||||
static nfa_fragment_t build_alter(nfa_t *nfa, ast_node_t *left, ast_node_t *right, loreg_error_t *error) {
|
||||
static nfa_fragment_t build_alter(nfa_t *nfa, ast_node_t *left, ast_node_t *right, lorex_error_t *error) {
|
||||
nfa_fragment_t frag = {NULL, NULL};
|
||||
nfa_state_t *start = nfa_add_state(nfa);
|
||||
nfa_state_t *accept = nfa_add_state(nfa);
|
||||
if (!start || !accept) {
|
||||
*error = LOREG_ERR_OUT_OF_MEMORY;
|
||||
*error = LOREX_ERR_OUT_OF_MEMORY;
|
||||
return frag;
|
||||
}
|
||||
|
||||
nfa_fragment_t left_frag = build_nfa(nfa, left, error);
|
||||
if (*error != LOREG_OK) return frag;
|
||||
if (*error != LOREX_OK) return frag;
|
||||
nfa_fragment_t right_frag = build_nfa(nfa, right, error);
|
||||
if (*error != LOREG_OK) return frag;
|
||||
if (*error != LOREX_OK) return frag;
|
||||
|
||||
nfa_add_transition(start, left_frag.start, TRANS_EPSILON, '\0');
|
||||
nfa_add_transition(start, right_frag.start, TRANS_EPSILON, '\0');
|
||||
@ -195,17 +470,17 @@ static nfa_fragment_t build_alter(nfa_t *nfa, ast_node_t *left, ast_node_t *righ
|
||||
return frag;
|
||||
}
|
||||
|
||||
static nfa_fragment_t build_star(nfa_t *nfa, ast_node_t *child, bool greedy, loreg_error_t *error) {
|
||||
static nfa_fragment_t build_star(nfa_t *nfa, ast_node_t *child, bool greedy, lorex_error_t *error) {
|
||||
nfa_fragment_t frag = {NULL, NULL};
|
||||
nfa_state_t *start = nfa_add_state(nfa);
|
||||
nfa_state_t *accept = nfa_add_state(nfa);
|
||||
if (!start || !accept) {
|
||||
*error = LOREG_ERR_OUT_OF_MEMORY;
|
||||
*error = LOREX_ERR_OUT_OF_MEMORY;
|
||||
return frag;
|
||||
}
|
||||
|
||||
nfa_fragment_t child_frag = build_nfa(nfa, child, error);
|
||||
if (*error != LOREG_OK) return frag;
|
||||
if (*error != LOREX_OK) return frag;
|
||||
|
||||
if (greedy) {
|
||||
nfa_add_transition(start, child_frag.start, TRANS_EPSILON, '\0');
|
||||
@ -222,16 +497,16 @@ static nfa_fragment_t build_star(nfa_t *nfa, ast_node_t *child, bool greedy, lor
|
||||
return frag;
|
||||
}
|
||||
|
||||
static nfa_fragment_t build_plus(nfa_t *nfa, ast_node_t *child, bool greedy, loreg_error_t *error) {
|
||||
static nfa_fragment_t build_plus(nfa_t *nfa, ast_node_t *child, bool greedy, lorex_error_t *error) {
|
||||
nfa_fragment_t frag = {NULL, NULL};
|
||||
nfa_state_t *accept = nfa_add_state(nfa);
|
||||
if (!accept) {
|
||||
*error = LOREG_ERR_OUT_OF_MEMORY;
|
||||
*error = LOREX_ERR_OUT_OF_MEMORY;
|
||||
return frag;
|
||||
}
|
||||
|
||||
nfa_fragment_t child_frag = build_nfa(nfa, child, error);
|
||||
if (*error != LOREG_OK) return frag;
|
||||
if (*error != LOREX_OK) return frag;
|
||||
|
||||
if (greedy) {
|
||||
nfa_add_transition(child_frag.accept, child_frag.start, TRANS_EPSILON, '\0');
|
||||
@ -246,17 +521,17 @@ static nfa_fragment_t build_plus(nfa_t *nfa, ast_node_t *child, bool greedy, lor
|
||||
return frag;
|
||||
}
|
||||
|
||||
static nfa_fragment_t build_question(nfa_t *nfa, ast_node_t *child, bool greedy, loreg_error_t *error) {
|
||||
static nfa_fragment_t build_question(nfa_t *nfa, ast_node_t *child, bool greedy, lorex_error_t *error) {
|
||||
nfa_fragment_t frag = {NULL, NULL};
|
||||
nfa_state_t *start = nfa_add_state(nfa);
|
||||
nfa_state_t *accept = nfa_add_state(nfa);
|
||||
if (!start || !accept) {
|
||||
*error = LOREG_ERR_OUT_OF_MEMORY;
|
||||
*error = LOREX_ERR_OUT_OF_MEMORY;
|
||||
return frag;
|
||||
}
|
||||
|
||||
nfa_fragment_t child_frag = build_nfa(nfa, child, error);
|
||||
if (*error != LOREG_OK) return frag;
|
||||
if (*error != LOREX_OK) return frag;
|
||||
|
||||
if (greedy) {
|
||||
nfa_add_transition(start, child_frag.start, TRANS_EPSILON, '\0');
|
||||
@ -272,17 +547,17 @@ static nfa_fragment_t build_question(nfa_t *nfa, ast_node_t *child, bool greedy,
|
||||
return frag;
|
||||
}
|
||||
|
||||
static nfa_fragment_t build_group(nfa_t *nfa, ast_node_t *child, int group_id, loreg_error_t *error) {
|
||||
static nfa_fragment_t build_group(nfa_t *nfa, ast_node_t *child, int group_id, lorex_error_t *error) {
|
||||
nfa_fragment_t frag = {NULL, NULL};
|
||||
nfa_state_t *start = nfa_add_state(nfa);
|
||||
nfa_state_t *accept = nfa_add_state(nfa);
|
||||
if (!start || !accept) {
|
||||
*error = LOREG_ERR_OUT_OF_MEMORY;
|
||||
*error = LOREX_ERR_OUT_OF_MEMORY;
|
||||
return frag;
|
||||
}
|
||||
|
||||
nfa_fragment_t child_frag = build_nfa(nfa, child, error);
|
||||
if (*error != LOREG_OK) return frag;
|
||||
if (*error != LOREX_OK) return frag;
|
||||
|
||||
nfa_add_group_transition(start, child_frag.start, TRANS_GROUP_START, group_id);
|
||||
nfa_add_group_transition(child_frag.accept, accept, TRANS_GROUP_END, group_id);
|
||||
@ -296,12 +571,12 @@ static nfa_fragment_t build_group(nfa_t *nfa, ast_node_t *child, int group_id, l
|
||||
return frag;
|
||||
}
|
||||
|
||||
static nfa_fragment_t build_anchor(nfa_t *nfa, transition_type_t type, loreg_error_t *error) {
|
||||
static nfa_fragment_t build_anchor(nfa_t *nfa, transition_type_t type, lorex_error_t *error) {
|
||||
nfa_fragment_t frag = {NULL, NULL};
|
||||
nfa_state_t *start = nfa_add_state(nfa);
|
||||
nfa_state_t *accept = nfa_add_state(nfa);
|
||||
if (!start || !accept) {
|
||||
*error = LOREG_ERR_OUT_OF_MEMORY;
|
||||
*error = LOREX_ERR_OUT_OF_MEMORY;
|
||||
return frag;
|
||||
}
|
||||
nfa_add_transition(start, accept, type, '\0');
|
||||
@ -310,13 +585,13 @@ static nfa_fragment_t build_anchor(nfa_t *nfa, transition_type_t type, loreg_err
|
||||
return frag;
|
||||
}
|
||||
|
||||
static nfa_fragment_t build_quantifier(nfa_t *nfa, ast_node_t *child, int min, int max, bool greedy, loreg_error_t *error) {
|
||||
static nfa_fragment_t build_quantifier(nfa_t *nfa, ast_node_t *child, int min, int max, bool greedy, lorex_error_t *error) {
|
||||
nfa_fragment_t frag = {NULL, NULL};
|
||||
|
||||
if (min == 0 && max == 0) {
|
||||
nfa_state_t *state = nfa_add_state(nfa);
|
||||
if (!state) {
|
||||
*error = LOREG_ERR_OUT_OF_MEMORY;
|
||||
*error = LOREX_ERR_OUT_OF_MEMORY;
|
||||
return frag;
|
||||
}
|
||||
frag.start = state;
|
||||
@ -326,7 +601,7 @@ static nfa_fragment_t build_quantifier(nfa_t *nfa, ast_node_t *child, int min, i
|
||||
|
||||
nfa_state_t *start = nfa_add_state(nfa);
|
||||
if (!start) {
|
||||
*error = LOREG_ERR_OUT_OF_MEMORY;
|
||||
*error = LOREX_ERR_OUT_OF_MEMORY;
|
||||
return frag;
|
||||
}
|
||||
|
||||
@ -334,7 +609,7 @@ static nfa_fragment_t build_quantifier(nfa_t *nfa, ast_node_t *child, int min, i
|
||||
|
||||
for (int i = 0; i < min; i++) {
|
||||
nfa_fragment_t rep = build_nfa(nfa, child, error);
|
||||
if (*error != LOREG_OK) return frag;
|
||||
if (*error != LOREX_OK) return frag;
|
||||
nfa_add_transition(current, rep.start, TRANS_EPSILON, '\0');
|
||||
current = rep.accept;
|
||||
}
|
||||
@ -343,14 +618,14 @@ static nfa_fragment_t build_quantifier(nfa_t *nfa, ast_node_t *child, int min, i
|
||||
nfa_state_t *loop_start = nfa_add_state(nfa);
|
||||
nfa_state_t *accept = nfa_add_state(nfa);
|
||||
if (!loop_start || !accept) {
|
||||
*error = LOREG_ERR_OUT_OF_MEMORY;
|
||||
*error = LOREX_ERR_OUT_OF_MEMORY;
|
||||
return frag;
|
||||
}
|
||||
|
||||
nfa_add_transition(current, loop_start, TRANS_EPSILON, '\0');
|
||||
|
||||
nfa_fragment_t rep = build_nfa(nfa, child, error);
|
||||
if (*error != LOREG_OK) return frag;
|
||||
if (*error != LOREX_OK) return frag;
|
||||
|
||||
if (greedy) {
|
||||
nfa_add_transition(loop_start, rep.start, TRANS_EPSILON, '\0');
|
||||
@ -366,7 +641,7 @@ static nfa_fragment_t build_quantifier(nfa_t *nfa, ast_node_t *child, int min, i
|
||||
} else {
|
||||
nfa_state_t *accept = nfa_add_state(nfa);
|
||||
if (!accept) {
|
||||
*error = LOREG_ERR_OUT_OF_MEMORY;
|
||||
*error = LOREX_ERR_OUT_OF_MEMORY;
|
||||
return frag;
|
||||
}
|
||||
|
||||
@ -374,7 +649,7 @@ static nfa_fragment_t build_quantifier(nfa_t *nfa, ast_node_t *child, int min, i
|
||||
|
||||
for (int i = min; i < max; i++) {
|
||||
nfa_fragment_t rep = build_nfa(nfa, child, error);
|
||||
if (*error != LOREG_OK) return frag;
|
||||
if (*error != LOREX_OK) return frag;
|
||||
|
||||
if (greedy) {
|
||||
nfa_add_transition(current, rep.start, TRANS_EPSILON, '\0');
|
||||
@ -400,13 +675,13 @@ static nfa_fragment_t build_quantifier(nfa_t *nfa, ast_node_t *child, int min, i
|
||||
return frag;
|
||||
}
|
||||
|
||||
static nfa_fragment_t build_nfa(nfa_t *nfa, ast_node_t *ast, loreg_error_t *error) {
|
||||
static nfa_fragment_t build_nfa(nfa_t *nfa, ast_node_t *ast, lorex_error_t *error) {
|
||||
nfa_fragment_t frag = {NULL, NULL};
|
||||
|
||||
if (!ast) {
|
||||
nfa_state_t *state = nfa_add_state(nfa);
|
||||
if (!state) {
|
||||
*error = LOREG_ERR_OUT_OF_MEMORY;
|
||||
*error = LOREX_ERR_OUT_OF_MEMORY;
|
||||
return frag;
|
||||
}
|
||||
frag.start = state;
|
||||
@ -456,16 +731,53 @@ static nfa_fragment_t build_nfa(nfa_t *nfa, ast_node_t *ast, loreg_error_t *erro
|
||||
return frag;
|
||||
}
|
||||
|
||||
nfa_t *nfa_from_ast(ast_node_t *ast, loreg_error_t *error) {
|
||||
*error = LOREG_OK;
|
||||
nfa_t *nfa_from_ast(ast_node_t *ast, lorex_error_t *error) {
|
||||
*error = LOREX_OK;
|
||||
nfa_t *nfa = nfa_create();
|
||||
if (!nfa) {
|
||||
*error = LOREG_ERR_OUT_OF_MEMORY;
|
||||
*error = LOREX_ERR_OUT_OF_MEMORY;
|
||||
return NULL;
|
||||
}
|
||||
|
||||
nfa->anchored_start = ast_starts_with_anchor(ast);
|
||||
nfa->anchored_end = ast_ends_with_anchor(ast);
|
||||
|
||||
nfa->first_chars_valid = true;
|
||||
extract_first_chars(ast, nfa->first_chars, &nfa->first_chars_valid);
|
||||
if (nfa->first_chars_valid) {
|
||||
nfa->single_first_char = compute_single_first_char(nfa->first_chars);
|
||||
}
|
||||
|
||||
nfa->is_pure_literal = ast_is_pure_literal(ast);
|
||||
|
||||
char prefix_buf[256];
|
||||
size_t prefix_len = extract_literal_prefix(ast, prefix_buf, sizeof(prefix_buf));
|
||||
if (prefix_len > 0) {
|
||||
nfa->literal_prefix = malloc(prefix_len + 1);
|
||||
if (nfa->literal_prefix) {
|
||||
memcpy(nfa->literal_prefix, prefix_buf, prefix_len);
|
||||
nfa->literal_prefix[prefix_len] = '\0';
|
||||
nfa->prefix_len = prefix_len;
|
||||
}
|
||||
}
|
||||
|
||||
if (nfa->anchored_end) {
|
||||
char suffix_buf[256];
|
||||
size_t suffix_len = extract_literal_suffix(ast, suffix_buf, sizeof(suffix_buf));
|
||||
if (suffix_len > 0) {
|
||||
nfa->literal_suffix = malloc(suffix_len + 1);
|
||||
if (nfa->literal_suffix) {
|
||||
memcpy(nfa->literal_suffix, suffix_buf, suffix_len);
|
||||
nfa->literal_suffix[suffix_len] = '\0';
|
||||
nfa->suffix_len = suffix_len;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
nfa->has_alt_dispatch = build_alt_dispatch(ast, nfa->alt_dispatch);
|
||||
|
||||
nfa_fragment_t frag = build_nfa(nfa, ast, error);
|
||||
if (*error != LOREG_OK) {
|
||||
if (*error != LOREX_OK) {
|
||||
nfa_free(nfa);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
34
src/parser.c
34
src/parser.c
@ -10,11 +10,11 @@ static void parser_advance(parser_t *parser) {
|
||||
void parser_init(parser_t *parser, const char *pattern) {
|
||||
lexer_init(&parser->lexer, pattern);
|
||||
parser->current = lexer_next(&parser->lexer);
|
||||
parser->error = LOREG_OK;
|
||||
parser->error = LOREX_OK;
|
||||
parser->group_count = 0;
|
||||
}
|
||||
|
||||
loreg_error_t parser_get_error(parser_t *parser) {
|
||||
lorex_error_t parser_get_error(parser_t *parser) {
|
||||
return parser->error;
|
||||
}
|
||||
|
||||
@ -27,7 +27,7 @@ static int parse_number(parser_t *parser);
|
||||
|
||||
static ast_node_t *parse_expr(parser_t *parser) {
|
||||
ast_node_t *left = parse_term(parser);
|
||||
if (!left || parser->error != LOREG_OK) return left;
|
||||
if (!left || parser->error != LOREX_OK) return left;
|
||||
|
||||
while (parser->current.type == TOKEN_PIPE) {
|
||||
parser_advance(parser);
|
||||
@ -38,7 +38,7 @@ static ast_node_t *parse_expr(parser_t *parser) {
|
||||
}
|
||||
left = ast_create_alter(left, right);
|
||||
if (!left) {
|
||||
parser->error = LOREG_ERR_OUT_OF_MEMORY;
|
||||
parser->error = LOREX_ERR_OUT_OF_MEMORY;
|
||||
return NULL;
|
||||
}
|
||||
}
|
||||
@ -61,7 +61,7 @@ static ast_node_t *parse_term(parser_t *parser) {
|
||||
} else {
|
||||
left = ast_create_concat(left, factor);
|
||||
if (!left) {
|
||||
parser->error = LOREG_ERR_OUT_OF_MEMORY;
|
||||
parser->error = LOREX_ERR_OUT_OF_MEMORY;
|
||||
return NULL;
|
||||
}
|
||||
}
|
||||
@ -71,7 +71,7 @@ static ast_node_t *parse_term(parser_t *parser) {
|
||||
|
||||
static ast_node_t *parse_factor(parser_t *parser) {
|
||||
ast_node_t *atom = parse_atom(parser);
|
||||
if (!atom || parser->error != LOREG_OK) return atom;
|
||||
if (!atom || parser->error != LOREX_OK) return atom;
|
||||
|
||||
while (parser->current.type == TOKEN_STAR ||
|
||||
parser->current.type == TOKEN_PLUS ||
|
||||
@ -107,7 +107,13 @@ static ast_node_t *parse_factor(parser_t *parser) {
|
||||
}
|
||||
|
||||
if (parser->current.type != TOKEN_RBRACE) {
|
||||
parser->error = LOREG_ERR_INVALID_QUANTIFIER;
|
||||
parser->error = LOREX_ERR_INVALID_QUANTIFIER;
|
||||
ast_free(atom);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
if (max != -1 && min > max) {
|
||||
parser->error = LOREX_ERR_INVALID_QUANTIFIER;
|
||||
ast_free(atom);
|
||||
return NULL;
|
||||
}
|
||||
@ -122,7 +128,7 @@ static ast_node_t *parse_factor(parser_t *parser) {
|
||||
}
|
||||
|
||||
if (!atom) {
|
||||
parser->error = LOREG_ERR_OUT_OF_MEMORY;
|
||||
parser->error = LOREX_ERR_OUT_OF_MEMORY;
|
||||
return NULL;
|
||||
}
|
||||
}
|
||||
@ -167,7 +173,7 @@ static ast_node_t *parse_atom(parser_t *parser) {
|
||||
int group_id = parser->group_count++;
|
||||
ast_node_t *inner = parse_expr(parser);
|
||||
if (parser->current.type != TOKEN_RPAREN) {
|
||||
parser->error = LOREG_ERR_UNBALANCED_PAREN;
|
||||
parser->error = LOREX_ERR_UNBALANCED_PAREN;
|
||||
ast_free(inner);
|
||||
return NULL;
|
||||
}
|
||||
@ -216,12 +222,12 @@ static ast_node_t *parse_atom(parser_t *parser) {
|
||||
return NULL;
|
||||
|
||||
default:
|
||||
parser->error = LOREG_ERR_INVALID_PATTERN;
|
||||
parser->error = LOREX_ERR_INVALID_PATTERN;
|
||||
return NULL;
|
||||
}
|
||||
|
||||
if (!node && parser->error == LOREG_OK) {
|
||||
parser->error = LOREG_ERR_OUT_OF_MEMORY;
|
||||
if (!node && parser->error == LOREX_OK) {
|
||||
parser->error = LOREX_ERR_OUT_OF_MEMORY;
|
||||
}
|
||||
return node;
|
||||
}
|
||||
@ -231,7 +237,7 @@ static ast_node_t *parse_bracket(parser_t *parser) {
|
||||
|
||||
bracket_class_t *bracket = bracket_create();
|
||||
if (!bracket) {
|
||||
parser->error = LOREG_ERR_OUT_OF_MEMORY;
|
||||
parser->error = LOREX_ERR_OUT_OF_MEMORY;
|
||||
return NULL;
|
||||
}
|
||||
|
||||
@ -293,7 +299,7 @@ static ast_node_t *parse_bracket(parser_t *parser) {
|
||||
|
||||
if (parser->current.type != TOKEN_RBRACKET) {
|
||||
bracket_free(bracket);
|
||||
parser->error = LOREG_ERR_INVALID_PATTERN;
|
||||
parser->error = LOREX_ERR_INVALID_PATTERN;
|
||||
return NULL;
|
||||
}
|
||||
parser_advance(parser);
|
||||
|
||||
32
src/repl.c
32
src/repl.c
@ -1,6 +1,6 @@
|
||||
/* retoor <retoor@molodetz.nl> */
|
||||
#include "repl.h"
|
||||
#include "loreg.h"
|
||||
#include "lorex.h"
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
@ -8,12 +8,12 @@
|
||||
#define MAX_INPUT 4096
|
||||
|
||||
static void print_banner(void) {
|
||||
printf("loreg v%s - regex interpreter\n", LOREG_VERSION);
|
||||
printf("lorex v%s - regex interpreter\n", LOREX_VERSION);
|
||||
printf("commands: :q quit, :h help, :p <pattern> set pattern, :m <text> match, :s <text> search\n\n");
|
||||
}
|
||||
|
||||
static void print_help(void) {
|
||||
printf("loreg REPL commands:\n");
|
||||
printf("lorex REPL commands:\n");
|
||||
printf(" :q quit\n");
|
||||
printf(" :h show this help\n");
|
||||
printf(" :p <regex> compile and set pattern\n");
|
||||
@ -40,7 +40,7 @@ static void print_help(void) {
|
||||
printf(" \\D \\W \\S negated classes\n\n");
|
||||
}
|
||||
|
||||
static void print_match(const char *text, loreg_match_t *result) {
|
||||
static void print_match(const char *text, lorex_match_t *result) {
|
||||
if (!result->matched) {
|
||||
printf("no match\n");
|
||||
return;
|
||||
@ -83,7 +83,7 @@ static char *read_line(void) {
|
||||
void repl_run(void) {
|
||||
print_banner();
|
||||
|
||||
loreg_regex_t *regex = NULL;
|
||||
lorex_regex_t *regex = NULL;
|
||||
char *line;
|
||||
|
||||
while ((line = read_line()) != NULL) {
|
||||
@ -103,14 +103,14 @@ void repl_run(void) {
|
||||
while (*pattern == ' ') pattern++;
|
||||
|
||||
if (regex) {
|
||||
loreg_free(regex);
|
||||
lorex_free(regex);
|
||||
regex = NULL;
|
||||
}
|
||||
|
||||
loreg_error_t error;
|
||||
regex = loreg_compile(pattern, &error);
|
||||
lorex_error_t error;
|
||||
regex = lorex_compile(pattern, &error);
|
||||
if (!regex) {
|
||||
printf("error: %s\n", loreg_error_string(error));
|
||||
printf("error: %s\n", lorex_error_string(error));
|
||||
} else {
|
||||
printf("pattern compiled: %s\n", pattern);
|
||||
}
|
||||
@ -126,8 +126,8 @@ void repl_run(void) {
|
||||
const char *text = line + 3;
|
||||
while (*text == ' ') text++;
|
||||
|
||||
loreg_match_t result;
|
||||
loreg_match(regex, text, &result);
|
||||
lorex_match_t result;
|
||||
lorex_match(regex, text, &result);
|
||||
print_match(text, &result);
|
||||
continue;
|
||||
}
|
||||
@ -141,8 +141,8 @@ void repl_run(void) {
|
||||
const char *text = line + 3;
|
||||
while (*text == ' ') text++;
|
||||
|
||||
loreg_match_t result;
|
||||
loreg_search(regex, text, &result);
|
||||
lorex_match_t result;
|
||||
lorex_search(regex, text, &result);
|
||||
print_match(text, &result);
|
||||
continue;
|
||||
}
|
||||
@ -157,13 +157,13 @@ void repl_run(void) {
|
||||
continue;
|
||||
}
|
||||
|
||||
loreg_match_t result;
|
||||
loreg_search(regex, line, &result);
|
||||
lorex_match_t result;
|
||||
lorex_search(regex, line, &result);
|
||||
print_match(line, &result);
|
||||
}
|
||||
|
||||
if (regex) {
|
||||
loreg_free(regex);
|
||||
lorex_free(regex);
|
||||
}
|
||||
|
||||
printf("\n");
|
||||
|
||||
448
tests/benchmark.c
Normal file
448
tests/benchmark.c
Normal file
@ -0,0 +1,448 @@
|
||||
/* retoor <retoor@molodetz.nl> */
|
||||
#define _POSIX_C_SOURCE 200809L
|
||||
#include "../include/lorex.h"
|
||||
#include <regex.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <time.h>
|
||||
#include <sys/time.h>
|
||||
|
||||
#define ITERATIONS 10000
|
||||
#define WARMUP 1000
|
||||
|
||||
/* One benchmark case: a named pattern/text pair plus the expected outcome. */
typedef struct {
    const char *name;         /* case identifier; NULL marks the end of the table */
    const char *pattern;      /* regular expression source */
    const char *text;         /* subject string that is searched */
    int         expect_match; /* 1 when the pattern should be found in text, 0 otherwise */
} benchmark_t;
|
||||
|
||||
/*
 * Measurements collected for a single benchmark case.
 * All *_us fields are average microseconds per iteration.
 */
typedef struct {
    /* lorex timings */
    double lorex_compile_us; /* compile + free */
    double lorex_match_us;   /* search on a precompiled pattern */
    double lorex_total_us;   /* lorex_compile_us + lorex_match_us */

    /* POSIX <regex.h> timings */
    double posix_compile_us; /* regcomp + regfree */
    double posix_match_us;   /* regexec on a precompiled pattern */
    double posix_total_us;   /* posix_compile_us + posix_match_us */

    /* outcome of the last search: 1 = matched, 0 = no match */
    int lorex_matched;
    int posix_matched;

    /* set to 1 when the pattern could not be compiled by that engine */
    int lorex_failed;
    int posix_failed;
} result_t;
|
||||
|
||||
static benchmark_t benchmarks[] = {
|
||||
{"literal_short", "hello", "hello world", 1},
|
||||
{"literal_medium", "the quick brown", "the quick brown fox jumps over the lazy dog", 1},
|
||||
{"literal_long", "Lorem ipsum dolor sit amet", "Lorem ipsum dolor sit amet, consectetur adipiscing elit", 1},
|
||||
{"literal_nomatch", "xyz", "the quick brown fox jumps over the lazy dog", 0},
|
||||
{"literal_end", "dog", "the quick brown fox jumps over the lazy dog", 1},
|
||||
{"literal_repeated", "abcabc", "xyzabcabcdef", 1},
|
||||
|
||||
{"dot_single", "a.c", "abc", 1},
|
||||
{"dot_multiple", "a..b", "aXYb", 1},
|
||||
{"dot_many", "a.....b", "a12345b", 1},
|
||||
{"dot_star", "a.*b", "aXXXXXXXXXXb", 1},
|
||||
{"dot_plus", "a.+b", "aXXXXXXXXXXb", 1},
|
||||
|
||||
{"anchor_start", "^the", "the quick brown fox", 1},
|
||||
{"anchor_end", "fox$", "the quick brown fox", 1},
|
||||
{"anchor_both", "^hello$", "hello", 1},
|
||||
{"anchor_start_nomatch", "^fox", "the quick brown fox", 0},
|
||||
{"anchor_end_nomatch", "the$", "the quick brown fox", 0},
|
||||
|
||||
{"star_simple", "ab*c", "abbbbc", 1},
|
||||
{"star_zero", "ab*c", "ac", 1},
|
||||
{"star_greedy", "a.*b", "aXbXbXb", 1},
|
||||
{"star_repeated", "a*b*c*", "aaabbbccc", 1},
|
||||
{"star_empty", "a*", "", 1},
|
||||
|
||||
{"plus_simple", "ab+c", "abbbbc", 1},
|
||||
{"plus_one", "ab+c", "abc", 1},
|
||||
{"plus_nomatch", "ab+c", "ac", 0},
|
||||
{"plus_greedy", "a.+b", "aXbXbXb", 1},
|
||||
|
||||
{"question_present", "colou?r", "colour", 1},
|
||||
{"question_absent", "colou?r", "color", 1},
|
||||
{"question_multiple", "a?b?c?d", "abcd", 1},
|
||||
|
||||
{"class_vowels", "[aeiou]", "hello", 1},
|
||||
{"class_digits", "[0-9]+", "abc123def", 1},
|
||||
{"class_alpha", "[a-zA-Z]+", "HelloWorld", 1},
|
||||
{"class_alnum", "[a-zA-Z0-9]+", "Test123", 1},
|
||||
{"class_neg_digit", "[^0-9]+", "hello", 1},
|
||||
{"class_neg_alpha", "[^a-zA-Z]+", "12345", 1},
|
||||
{"class_complex", "[a-zA-Z_][a-zA-Z0-9_]*", "variable_name_123", 1},
|
||||
|
||||
{"alt_simple", "cat|dog", "I have a cat", 1},
|
||||
{"alt_simple2", "cat|dog", "I have a dog", 1},
|
||||
{"alt_three", "red|green|blue", "the color is green", 1},
|
||||
{"alt_nomatch", "cat|dog", "I have a bird", 0},
|
||||
{"alt_words", "hello|world|test", "this is a test", 1},
|
||||
|
||||
{"group_simple", "(ab)+", "ababab", 1},
|
||||
{"group_alt", "(cat|dog)s?", "cats", 1},
|
||||
{"group_nested", "((a)(b))+", "ababab", 1},
|
||||
{"group_complex", "(a(b(c)))+", "abcabc", 1},
|
||||
|
||||
{"quant_exact", "a{3}", "aaa", 1},
|
||||
{"quant_exact_long", "a{10}", "aaaaaaaaaa", 1},
|
||||
{"quant_range", "a{2,4}", "aaa", 1},
|
||||
{"quant_min", "a{3,}", "aaaaa", 1},
|
||||
{"quant_combined", "[0-9]{3}-[0-9]{4}", "555-1234", 1},
|
||||
|
||||
{"email_simple", "[a-z]+@[a-z]+\\.[a-z]+", "test@example.com", 1},
|
||||
{"email_complex", "[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}", "user.name+tag@sub.example.com", 1},
|
||||
{"ip_address", "[0-9]{1,3}\\.[0-9]{1,3}\\.[0-9]{1,3}\\.[0-9]{1,3}", "192.168.1.100", 1},
|
||||
{"url_http", "https?://[a-zA-Z0-9.-]+", "https://www.example.com", 1},
|
||||
{"phone_us", "[0-9]{3}-[0-9]{3}-[0-9]{4}", "555-123-4567", 1},
|
||||
{"date_iso", "[0-9]{4}-[0-9]{2}-[0-9]{2}", "2024-01-15", 1},
|
||||
{"time_hms", "[0-9]{2}:[0-9]{2}:[0-9]{2}", "14:30:45", 1},
|
||||
{"hex_color", "#[0-9a-fA-F]{6}", "#ff00ff", 1},
|
||||
|
||||
{"word_boundary", "[a-zA-Z]+", "hello world test", 1},
|
||||
{"whitespace", "[ \\t\\n]+", "hello world", 1},
|
||||
{"identifier", "[a-zA-Z_][a-zA-Z0-9_]*", "_privateVar123", 1},
|
||||
{"number_int", "-?[0-9]+", "-12345", 1},
|
||||
{"number_float", "-?[0-9]+\\.[0-9]+", "3.14159", 1},
|
||||
|
||||
{"long_text_start", "^The", "The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog.", 1},
|
||||
{"long_text_end", "dog\\.$", "The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog.", 1},
|
||||
{"long_text_middle", "fox", "The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog.", 1},
|
||||
{"long_text_nomatch", "elephant", "The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog.", 0},
|
||||
|
||||
{"repeated_ab", "(ab){5}", "ababababab", 1},
|
||||
{"repeated_word", "(hello ){3}", "hello hello hello ", 1},
|
||||
{"alternation_long", "one|two|three|four|five|six|seven|eight|nine|ten", "the number is seven", 1},
|
||||
|
||||
{"escape_dot", "3\\.14", "pi is 3.14", 1},
|
||||
{"escape_star", "a\\*b", "a*b", 1},
|
||||
{"escape_plus", "c\\+\\+", "c++", 1},
|
||||
{"escape_parens", "\\(test\\)", "(test)", 1},
|
||||
{"escape_brackets", "\\[0\\]", "array[0]", 1},
|
||||
|
||||
{"stress_star", "a*a*a*a*a*b", "aaaaab", 1},
|
||||
{"stress_plus", "a+a+a+a+a+b", "aaaaab", 1},
|
||||
{"stress_nested", "((a+)+)+b", "aaaab", 1},
|
||||
{"stress_alt", "(a|aa|aaa|aaaa)+b", "aaaab", 1},
|
||||
|
||||
{"nomatch_literal", "notfound", "the quick brown fox", 0},
|
||||
{"nomatch_pattern", "^end", "start middle end", 0},
|
||||
{"nomatch_class", "[0-9]+", "no digits here", 0},
|
||||
|
||||
{NULL, NULL, NULL, 0}
|
||||
};
|
||||
|
||||
/*
 * Return a timestamp in microseconds for measuring elapsed intervals.
 *
 * CLOCK_MONOTONIC is preferred because gettimeofday() reports wall-clock
 * time, which can jump when the system clock is adjusted and would then
 * yield negative or inflated intervals. gettimeofday() is kept only as a
 * fallback for when the monotonic clock cannot be read.
 *
 * The value is meaningful only as a difference between two calls.
 */
static double get_time_us(void) {
    struct timespec ts;
    if (clock_gettime(CLOCK_MONOTONIC, &ts) == 0) {
        return ts.tv_sec * 1000000.0 + ts.tv_nsec / 1000.0;
    }

    struct timeval tv;
    gettimeofday(&tv, NULL);
    return tv.tv_sec * 1000000.0 + tv.tv_usec;
}
|
||||
|
||||
/*
 * Run one benchmark case against lorex and against POSIX <regex.h>.
 *
 * For each engine, in order:
 *   1. warm up with WARMUP compile + search + free rounds (untimed);
 *   2. time ITERATIONS compile + free rounds           -> *_compile_us;
 *   3. compile once, time ITERATIONS searches on that  -> *_match_us;
 *   4. *_total_us = *_compile_us + *_match_us.
 *
 * The timer in step 3 brackets only the search loop, so the one-off compile
 * and free are not counted as match time (and not counted twice in the total).
 *
 * A pattern that fails to compile sets lorex_failed / posix_failed.
 * bench->expect_match is not consulted here; *_matched simply records the
 * outcome of the last search.
 *
 * Returns the filled result_t by value.
 */
static result_t run_benchmark(benchmark_t *bench) {
    result_t res = {0};
    double start, end;

    /* ---- lorex: warmup ---- */
    for (int i = 0; i < WARMUP; i++) {
        lorex_error_t err;
        lorex_regex_t *re = lorex_compile(bench->pattern, &err);
        if (re) {
            lorex_match_t m;
            lorex_search(re, bench->text, &m);
            lorex_free(re);
        }
    }

    /* ---- lorex: compile phase ---- */
    start = get_time_us();
    for (int i = 0; i < ITERATIONS; i++) {
        lorex_error_t err;
        lorex_regex_t *re = lorex_compile(bench->pattern, &err);
        if (!re) {
            res.lorex_failed = 1;
            break;
        }
        lorex_free(re);
    }
    end = get_time_us();
    res.lorex_compile_us = (end - start) / ITERATIONS;

    /* ---- lorex: match phase (pattern compiled outside the timed region) ---- */
    {
        lorex_error_t err;
        lorex_regex_t *re = lorex_compile(bench->pattern, &err);
        if (re) {
            start = get_time_us();
            for (int i = 0; i < ITERATIONS; i++) {
                lorex_match_t m;
                res.lorex_matched = lorex_search(re, bench->text, &m) ? 1 : 0;
            }
            end = get_time_us();
            res.lorex_match_us = (end - start) / ITERATIONS;
            lorex_free(re);
        } else {
            res.lorex_failed = 1;
        }
    }
    res.lorex_total_us = res.lorex_compile_us + res.lorex_match_us;

    /* ---- POSIX: warmup ---- */
    for (int i = 0; i < WARMUP; i++) {
        regex_t preg;
        if (regcomp(&preg, bench->pattern, REG_EXTENDED) == 0) {
            regmatch_t pmatch[1];
            regexec(&preg, bench->text, 1, pmatch, 0);
            regfree(&preg);
        }
    }

    /* ---- POSIX: compile phase ---- */
    start = get_time_us();
    for (int i = 0; i < ITERATIONS; i++) {
        regex_t preg;
        if (regcomp(&preg, bench->pattern, REG_EXTENDED) != 0) {
            res.posix_failed = 1;
            break;
        }
        regfree(&preg);
    }
    end = get_time_us();
    res.posix_compile_us = (end - start) / ITERATIONS;

    /* ---- POSIX: match phase (pattern compiled outside the timed region) ---- */
    {
        regex_t preg;
        if (regcomp(&preg, bench->pattern, REG_EXTENDED) == 0) {
            start = get_time_us();
            for (int i = 0; i < ITERATIONS; i++) {
                regmatch_t pmatch[1];
                res.posix_matched = (regexec(&preg, bench->text, 1, pmatch, 0) == 0) ? 1 : 0;
            }
            end = get_time_us();
            res.posix_match_us = (end - start) / ITERATIONS;
            regfree(&preg);
        } else {
            res.posix_failed = 1;
        }
    }
    res.posix_total_us = res.posix_compile_us + res.posix_match_us;

    return res;
}
|
||||
|
||||
int main(void) {
|
||||
printf("================================================================================\n");
|
||||
printf(" LOREX vs POSIX REGEX PERFORMANCE BENCHMARK\n");
|
||||
printf("================================================================================\n\n");
|
||||
printf("Configuration:\n");
|
||||
printf(" Iterations per test: %d\n", ITERATIONS);
|
||||
printf(" Warmup iterations: %d\n", WARMUP);
|
||||
printf("\n");
|
||||
|
||||
int total_tests = 0;
|
||||
int lorex_wins = 0;
|
||||
int posix_wins = 0;
|
||||
int ties = 0;
|
||||
double total_lorex_time = 0;
|
||||
double total_posix_time = 0;
|
||||
|
||||
int lorex_compile_wins = 0;
|
||||
int posix_compile_wins = 0;
|
||||
int lorex_match_wins = 0;
|
||||
int posix_match_wins = 0;
|
||||
|
||||
printf("================================================================================\n");
|
||||
printf("%-25s | %-12s | %-12s | %-12s | %-8s\n", "TEST NAME", "LOREX (us)", "POSIX (us)", "SPEEDUP", "WINNER");
|
||||
printf("================================================================================\n");
|
||||
|
||||
for (int i = 0; benchmarks[i].name != NULL; i++) {
|
||||
benchmark_t *bench = &benchmarks[i];
|
||||
result_t res = run_benchmark(bench);
|
||||
|
||||
if (res.lorex_failed || res.posix_failed) {
|
||||
printf("%-25s | %-12s | %-12s | %-12s | %-8s\n",
|
||||
bench->name,
|
||||
res.lorex_failed ? "FAILED" : "OK",
|
||||
res.posix_failed ? "FAILED" : "OK",
|
||||
"-", "-");
|
||||
continue;
|
||||
}
|
||||
|
||||
total_tests++;
|
||||
total_lorex_time += res.lorex_total_us;
|
||||
total_posix_time += res.posix_total_us;
|
||||
|
||||
double speedup = res.posix_total_us / res.lorex_total_us;
|
||||
const char *winner;
|
||||
|
||||
if (speedup > 1.05) {
|
||||
winner = "LOREX";
|
||||
lorex_wins++;
|
||||
} else if (speedup < 0.95) {
|
||||
winner = "POSIX";
|
||||
posix_wins++;
|
||||
} else {
|
||||
winner = "TIE";
|
||||
ties++;
|
||||
}
|
||||
|
||||
if (res.lorex_compile_us < res.posix_compile_us) lorex_compile_wins++;
|
||||
else posix_compile_wins++;
|
||||
|
||||
if (res.lorex_match_us < res.posix_match_us) lorex_match_wins++;
|
||||
else posix_match_wins++;
|
||||
|
||||
printf("%-25s | %10.3f | %10.3f | %10.2fx | %-8s\n",
|
||||
bench->name,
|
||||
res.lorex_total_us,
|
||||
res.posix_total_us,
|
||||
speedup,
|
||||
winner);
|
||||
}
|
||||
|
||||
printf("================================================================================\n\n");
|
||||
|
||||
printf("================================================================================\n");
|
||||
printf(" DETAILED RESULTS\n");
|
||||
printf("================================================================================\n\n");
|
||||
|
||||
printf("%-25s | %-20s | %-20s\n", "TEST NAME", "LOREX (compile/match)", "POSIX (compile/match)");
|
||||
printf("--------------------------------------------------------------------------------\n");
|
||||
|
||||
for (int i = 0; benchmarks[i].name != NULL; i++) {
|
||||
benchmark_t *bench = &benchmarks[i];
|
||||
result_t res = run_benchmark(bench);
|
||||
|
||||
if (res.lorex_failed || res.posix_failed) continue;
|
||||
|
||||
printf("%-25s | %8.3f / %8.3f | %8.3f / %8.3f\n",
|
||||
bench->name,
|
||||
res.lorex_compile_us, res.lorex_match_us,
|
||||
res.posix_compile_us, res.posix_match_us);
|
||||
}
|
||||
|
||||
printf("\n================================================================================\n");
|
||||
printf(" SUMMARY\n");
|
||||
printf("================================================================================\n\n");
|
||||
|
||||
printf("Total tests: %d\n", total_tests);
|
||||
printf("\n");
|
||||
printf("Overall wins:\n");
|
||||
printf(" LOREX wins: %d (%.1f%%)\n", lorex_wins, 100.0 * lorex_wins / total_tests);
|
||||
printf(" POSIX wins: %d (%.1f%%)\n", posix_wins, 100.0 * posix_wins / total_tests);
|
||||
printf(" Ties: %d (%.1f%%)\n", ties, 100.0 * ties / total_tests);
|
||||
printf("\n");
|
||||
printf("Compilation phase wins:\n");
|
||||
printf(" LOREX faster: %d\n", lorex_compile_wins);
|
||||
printf(" POSIX faster: %d\n", posix_compile_wins);
|
||||
printf("\n");
|
||||
printf("Matching phase wins:\n");
|
||||
printf(" LOREX faster: %d\n", lorex_match_wins);
|
||||
printf(" POSIX faster: %d\n", posix_match_wins);
|
||||
printf("\n");
|
||||
printf("Total time (all tests):\n");
|
||||
printf(" LOREX: %.3f us\n", total_lorex_time);
|
||||
printf(" POSIX: %.3f us\n", total_posix_time);
|
||||
printf(" Overall speedup: %.2fx %s\n",
|
||||
total_posix_time > total_lorex_time ? total_posix_time / total_lorex_time : total_lorex_time / total_posix_time,
|
||||
total_posix_time > total_lorex_time ? "(LOREX faster)" : "(POSIX faster)");
|
||||
|
||||
printf("\n================================================================================\n");
|
||||
printf(" CATEGORY BREAKDOWN\n");
|
||||
printf("================================================================================\n\n");
|
||||
|
||||
typedef struct {
|
||||
const char *category;
|
||||
const char *prefix;
|
||||
double lorex_total;
|
||||
double posix_total;
|
||||
int count;
|
||||
} category_t;
|
||||
|
||||
category_t categories[] = {
|
||||
{"Literal matching", "literal_", 0, 0, 0},
|
||||
{"Dot metacharacter", "dot_", 0, 0, 0},
|
||||
{"Anchors", "anchor_", 0, 0, 0},
|
||||
{"Star quantifier", "star_", 0, 0, 0},
|
||||
{"Plus quantifier", "plus_", 0, 0, 0},
|
||||
{"Question quantifier", "question_", 0, 0, 0},
|
||||
{"Character classes", "class_", 0, 0, 0},
|
||||
{"Alternation", "alt_", 0, 0, 0},
|
||||
{"Groups", "group_", 0, 0, 0},
|
||||
{"Brace quantifiers", "quant_", 0, 0, 0},
|
||||
{"Real-world patterns", "email_", 0, 0, 0},
|
||||
{"Escape sequences", "escape_", 0, 0, 0},
|
||||
{"Stress tests", "stress_", 0, 0, 0},
|
||||
{"No-match tests", "nomatch_", 0, 0, 0},
|
||||
{NULL, NULL, 0, 0, 0}
|
||||
};
|
||||
|
||||
for (int i = 0; benchmarks[i].name != NULL; i++) {
|
||||
benchmark_t *bench = &benchmarks[i];
|
||||
result_t res = run_benchmark(bench);
|
||||
if (res.lorex_failed || res.posix_failed) continue;
|
||||
|
||||
for (int j = 0; categories[j].category != NULL; j++) {
|
||||
if (strncmp(bench->name, categories[j].prefix, strlen(categories[j].prefix)) == 0) {
|
||||
categories[j].lorex_total += res.lorex_total_us;
|
||||
categories[j].posix_total += res.posix_total_us;
|
||||
categories[j].count++;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
printf("%-25s | %-12s | %-12s | %-12s | %-8s\n", "CATEGORY", "LOREX (us)", "POSIX (us)", "SPEEDUP", "WINNER");
|
||||
printf("--------------------------------------------------------------------------------\n");
|
||||
|
||||
for (int i = 0; categories[i].category != NULL; i++) {
|
||||
if (categories[i].count == 0) continue;
|
||||
|
||||
double speedup = categories[i].posix_total / categories[i].lorex_total;
|
||||
const char *winner = speedup > 1.0 ? "LOREX" : "POSIX";
|
||||
|
||||
printf("%-25s | %10.3f | %10.3f | %10.2fx | %-8s\n",
|
||||
categories[i].category,
|
||||
categories[i].lorex_total,
|
||||
categories[i].posix_total,
|
||||
speedup > 1.0 ? speedup : 1.0 / speedup,
|
||||
winner);
|
||||
}
|
||||
|
||||
printf("\n================================================================================\n");
|
||||
printf(" PATTERN DETAILS\n");
|
||||
printf("================================================================================\n\n");
|
||||
|
||||
for (int i = 0; benchmarks[i].name != NULL; i++) {
|
||||
benchmark_t *bench = &benchmarks[i];
|
||||
result_t res = run_benchmark(bench);
|
||||
|
||||
printf("Test: %s\n", bench->name);
|
||||
printf(" Pattern: %s\n", bench->pattern);
|
||||
printf(" Text: %.50s%s\n", bench->text, strlen(bench->text) > 50 ? "..." : "");
|
||||
printf(" Expected: %s\n", bench->expect_match ? "MATCH" : "NO MATCH");
|
||||
|
||||
if (res.lorex_failed) {
|
||||
printf(" LOREX: FAILED TO COMPILE\n");
|
||||
} else {
|
||||
printf(" LOREX: %s (compile: %.3f us, match: %.3f us, total: %.3f us)\n",
|
||||
res.lorex_matched ? "MATCHED" : "NO MATCH",
|
||||
res.lorex_compile_us, res.lorex_match_us, res.lorex_total_us);
|
||||
}
|
||||
|
||||
if (res.posix_failed) {
|
||||
printf(" POSIX: FAILED TO COMPILE\n");
|
||||
} else {
|
||||
printf(" POSIX: %s (compile: %.3f us, match: %.3f us, total: %.3f us)\n",
|
||||
res.posix_matched ? "MATCHED" : "NO MATCH",
|
||||
res.posix_compile_us, res.posix_match_us, res.posix_total_us);
|
||||
}
|
||||
|
||||
if (!res.lorex_failed && !res.posix_failed) {
|
||||
double speedup = res.posix_total_us / res.lorex_total_us;
|
||||
if (speedup > 1.0) {
|
||||
printf(" Result: LOREX is %.2fx faster\n", speedup);
|
||||
} else {
|
||||
printf(" Result: POSIX is %.2fx faster\n", 1.0 / speedup);
|
||||
}
|
||||
}
|
||||
printf("\n");
|
||||
}
|
||||
|
||||
printf("================================================================================\n");
|
||||
printf(" BENCHMARK COMPLETE\n");
|
||||
printf("================================================================================\n");
|
||||
|
||||
return 0;
|
||||
}
|
||||
250
tests/test_all.c
250
tests/test_all.c
@ -1,5 +1,5 @@
|
||||
/* retoor <retoor@molodetz.nl> */
|
||||
#include "../include/loreg.h"
|
||||
#include "../include/lorex.h"
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
#include <time.h>
|
||||
@ -22,211 +22,211 @@ static int total_failed = 0;
|
||||
} while(0)
|
||||
|
||||
TEST(basic_literals) {
|
||||
loreg_error_t err;
|
||||
loreg_regex_t *re = loreg_compile("hello", &err);
|
||||
lorex_error_t err;
|
||||
lorex_regex_t *re = lorex_compile("hello", &err);
|
||||
ASSERT(re != NULL, "compile hello");
|
||||
|
||||
loreg_match_t m;
|
||||
ASSERT(loreg_search(re, "hello", &m), "match hello");
|
||||
ASSERT(loreg_search(re, "say hello world", &m), "search hello");
|
||||
ASSERT(!loreg_search(re, "helo", &m), "no match helo");
|
||||
lorex_match_t m;
|
||||
ASSERT(lorex_search(re, "hello", &m), "match hello");
|
||||
ASSERT(lorex_search(re, "say hello world", &m), "search hello");
|
||||
ASSERT(!lorex_search(re, "helo", &m), "no match helo");
|
||||
|
||||
loreg_free(re);
|
||||
lorex_free(re);
|
||||
}
|
||||
|
||||
TEST(metacharacters) {
|
||||
loreg_error_t err;
|
||||
loreg_match_t m;
|
||||
lorex_error_t err;
|
||||
lorex_match_t m;
|
||||
|
||||
loreg_regex_t *re = loreg_compile("a.c", &err);
|
||||
lorex_regex_t *re = lorex_compile("a.c", &err);
|
||||
ASSERT(re != NULL, "compile a.c");
|
||||
ASSERT(loreg_search(re, "abc", &m), "match abc");
|
||||
ASSERT(loreg_search(re, "axc", &m), "match axc");
|
||||
ASSERT(!loreg_search(re, "ac", &m), "no match ac");
|
||||
loreg_free(re);
|
||||
ASSERT(lorex_search(re, "abc", &m), "match abc");
|
||||
ASSERT(lorex_search(re, "axc", &m), "match axc");
|
||||
ASSERT(!lorex_search(re, "ac", &m), "no match ac");
|
||||
lorex_free(re);
|
||||
|
||||
re = loreg_compile("^start", &err);
|
||||
re = lorex_compile("^start", &err);
|
||||
ASSERT(re != NULL, "compile ^start");
|
||||
ASSERT(loreg_search(re, "start here", &m), "match start here");
|
||||
ASSERT(!loreg_search(re, "not start", &m), "no match not start");
|
||||
loreg_free(re);
|
||||
ASSERT(lorex_search(re, "start here", &m), "match start here");
|
||||
ASSERT(!lorex_search(re, "not start", &m), "no match not start");
|
||||
lorex_free(re);
|
||||
|
||||
re = loreg_compile("end$", &err);
|
||||
re = lorex_compile("end$", &err);
|
||||
ASSERT(re != NULL, "compile end$");
|
||||
ASSERT(loreg_search(re, "the end", &m), "match the end");
|
||||
ASSERT(!loreg_search(re, "end here", &m), "no match end here");
|
||||
loreg_free(re);
|
||||
ASSERT(lorex_search(re, "the end", &m), "match the end");
|
||||
ASSERT(!lorex_search(re, "end here", &m), "no match end here");
|
||||
lorex_free(re);
|
||||
}
|
||||
|
||||
TEST(quantifiers) {
|
||||
loreg_error_t err;
|
||||
loreg_match_t m;
|
||||
lorex_error_t err;
|
||||
lorex_match_t m;
|
||||
|
||||
loreg_regex_t *re = loreg_compile("ab*c", &err);
|
||||
lorex_regex_t *re = lorex_compile("ab*c", &err);
|
||||
ASSERT(re != NULL, "compile ab*c");
|
||||
ASSERT(loreg_search(re, "ac", &m), "match ac");
|
||||
ASSERT(loreg_search(re, "abc", &m), "match abc");
|
||||
ASSERT(loreg_search(re, "abbbbc", &m), "match abbbbc");
|
||||
loreg_free(re);
|
||||
ASSERT(lorex_search(re, "ac", &m), "match ac");
|
||||
ASSERT(lorex_search(re, "abc", &m), "match abc");
|
||||
ASSERT(lorex_search(re, "abbbbc", &m), "match abbbbc");
|
||||
lorex_free(re);
|
||||
|
||||
re = loreg_compile("ab+c", &err);
|
||||
re = lorex_compile("ab+c", &err);
|
||||
ASSERT(re != NULL, "compile ab+c");
|
||||
ASSERT(!loreg_search(re, "ac", &m), "no match ac");
|
||||
ASSERT(loreg_search(re, "abc", &m), "match abc");
|
||||
ASSERT(loreg_search(re, "abbbbc", &m), "match abbbbc");
|
||||
loreg_free(re);
|
||||
ASSERT(!lorex_search(re, "ac", &m), "no match ac");
|
||||
ASSERT(lorex_search(re, "abc", &m), "match abc");
|
||||
ASSERT(lorex_search(re, "abbbbc", &m), "match abbbbc");
|
||||
lorex_free(re);
|
||||
|
||||
re = loreg_compile("ab?c", &err);
|
||||
re = lorex_compile("ab?c", &err);
|
||||
ASSERT(re != NULL, "compile ab?c");
|
||||
ASSERT(loreg_search(re, "ac", &m), "match ac");
|
||||
ASSERT(loreg_search(re, "abc", &m), "match abc");
|
||||
ASSERT(!loreg_search(re, "abbc", &m), "no match abbc");
|
||||
loreg_free(re);
|
||||
ASSERT(lorex_search(re, "ac", &m), "match ac");
|
||||
ASSERT(lorex_search(re, "abc", &m), "match abc");
|
||||
ASSERT(!lorex_search(re, "abbc", &m), "no match abbc");
|
||||
lorex_free(re);
|
||||
|
||||
re = loreg_compile("a{3}", &err);
|
||||
re = lorex_compile("a{3}", &err);
|
||||
ASSERT(re != NULL, "compile a{3}");
|
||||
ASSERT(loreg_search(re, "aaa", &m), "match aaa");
|
||||
ASSERT(!loreg_search(re, "aa", &m), "no match aa");
|
||||
loreg_free(re);
|
||||
ASSERT(lorex_search(re, "aaa", &m), "match aaa");
|
||||
ASSERT(!lorex_search(re, "aa", &m), "no match aa");
|
||||
lorex_free(re);
|
||||
|
||||
re = loreg_compile("a{2,4}", &err);
|
||||
re = lorex_compile("a{2,4}", &err);
|
||||
ASSERT(re != NULL, "compile a{2,4}");
|
||||
ASSERT(loreg_search(re, "aa", &m), "match aa");
|
||||
ASSERT(loreg_search(re, "aaa", &m), "match aaa");
|
||||
ASSERT(loreg_search(re, "aaaa", &m), "match aaaa");
|
||||
ASSERT(!loreg_search(re, "a", &m), "no match a");
|
||||
loreg_free(re);
|
||||
ASSERT(lorex_search(re, "aa", &m), "match aa");
|
||||
ASSERT(lorex_search(re, "aaa", &m), "match aaa");
|
||||
ASSERT(lorex_search(re, "aaaa", &m), "match aaaa");
|
||||
ASSERT(!lorex_search(re, "a", &m), "no match a");
|
||||
lorex_free(re);
|
||||
}
|
||||
|
||||
TEST(character_classes) {
|
||||
loreg_error_t err;
|
||||
loreg_match_t m;
|
||||
lorex_error_t err;
|
||||
lorex_match_t m;
|
||||
|
||||
loreg_regex_t *re = loreg_compile("[aeiou]", &err);
|
||||
lorex_regex_t *re = lorex_compile("[aeiou]", &err);
|
||||
ASSERT(re != NULL, "compile [aeiou]");
|
||||
ASSERT(loreg_search(re, "a", &m), "match a");
|
||||
ASSERT(loreg_search(re, "test", &m), "match test");
|
||||
ASSERT(!loreg_search(re, "xyz", &m), "no match xyz");
|
||||
loreg_free(re);
|
||||
ASSERT(lorex_search(re, "a", &m), "match a");
|
||||
ASSERT(lorex_search(re, "test", &m), "match test");
|
||||
ASSERT(!lorex_search(re, "xyz", &m), "no match xyz");
|
||||
lorex_free(re);
|
||||
|
||||
re = loreg_compile("[a-z]", &err);
|
||||
re = lorex_compile("[a-z]", &err);
|
||||
ASSERT(re != NULL, "compile [a-z]");
|
||||
ASSERT(loreg_search(re, "m", &m), "match m");
|
||||
ASSERT(!loreg_search(re, "5", &m), "no match 5");
|
||||
loreg_free(re);
|
||||
ASSERT(lorex_search(re, "m", &m), "match m");
|
||||
ASSERT(!lorex_search(re, "5", &m), "no match 5");
|
||||
lorex_free(re);
|
||||
|
||||
re = loreg_compile("[^0-9]", &err);
|
||||
re = lorex_compile("[^0-9]", &err);
|
||||
ASSERT(re != NULL, "compile [^0-9]");
|
||||
ASSERT(loreg_search(re, "a", &m), "match a");
|
||||
ASSERT(!loreg_search(re, "5", &m), "no match 5");
|
||||
loreg_free(re);
|
||||
ASSERT(lorex_search(re, "a", &m), "match a");
|
||||
ASSERT(!lorex_search(re, "5", &m), "no match 5");
|
||||
lorex_free(re);
|
||||
|
||||
re = loreg_compile("\\d", &err);
|
||||
re = lorex_compile("\\d", &err);
|
||||
ASSERT(re != NULL, "compile \\d");
|
||||
ASSERT(loreg_search(re, "5", &m), "match 5");
|
||||
ASSERT(!loreg_search(re, "a", &m), "no match a");
|
||||
loreg_free(re);
|
||||
ASSERT(lorex_search(re, "5", &m), "match 5");
|
||||
ASSERT(!lorex_search(re, "a", &m), "no match a");
|
||||
lorex_free(re);
|
||||
|
||||
re = loreg_compile("\\w+", &err);
|
||||
re = lorex_compile("\\w+", &err);
|
||||
ASSERT(re != NULL, "compile \\w+");
|
||||
ASSERT(loreg_search(re, "hello_123", &m), "match hello_123");
|
||||
loreg_free(re);
|
||||
ASSERT(lorex_search(re, "hello_123", &m), "match hello_123");
|
||||
lorex_free(re);
|
||||
|
||||
re = loreg_compile("\\s", &err);
|
||||
re = lorex_compile("\\s", &err);
|
||||
ASSERT(re != NULL, "compile \\s");
|
||||
ASSERT(loreg_search(re, " ", &m), "match space");
|
||||
ASSERT(loreg_search(re, "\t", &m), "match tab");
|
||||
ASSERT(!loreg_search(re, "a", &m), "no match a");
|
||||
loreg_free(re);
|
||||
ASSERT(lorex_search(re, " ", &m), "match space");
|
||||
ASSERT(lorex_search(re, "\t", &m), "match tab");
|
||||
ASSERT(!lorex_search(re, "a", &m), "no match a");
|
||||
lorex_free(re);
|
||||
}
|
||||
|
||||
TEST(groups) {
|
||||
loreg_error_t err;
|
||||
loreg_match_t m;
|
||||
lorex_error_t err;
|
||||
lorex_match_t m;
|
||||
|
||||
loreg_regex_t *re = loreg_compile("(ab)+", &err);
|
||||
lorex_regex_t *re = lorex_compile("(ab)+", &err);
|
||||
ASSERT(re != NULL, "compile (ab)+");
|
||||
ASSERT(loreg_search(re, "ab", &m), "match ab");
|
||||
ASSERT(loreg_search(re, "abab", &m), "match abab");
|
||||
ASSERT(!loreg_search(re, "a", &m), "no match a");
|
||||
loreg_free(re);
|
||||
ASSERT(lorex_search(re, "ab", &m), "match ab");
|
||||
ASSERT(lorex_search(re, "abab", &m), "match abab");
|
||||
ASSERT(!lorex_search(re, "a", &m), "no match a");
|
||||
lorex_free(re);
|
||||
|
||||
re = loreg_compile("(\\d+)-(\\d+)", &err);
|
||||
re = lorex_compile("(\\d+)-(\\d+)", &err);
|
||||
ASSERT(re != NULL, "compile groups");
|
||||
ASSERT(loreg_search(re, "123-456", &m), "match 123-456");
|
||||
ASSERT(lorex_search(re, "123-456", &m), "match 123-456");
|
||||
ASSERT(m.group_count == 2, "2 groups");
|
||||
ASSERT(m.groups[0].matched, "group 0 matched");
|
||||
ASSERT(m.groups[1].matched, "group 1 matched");
|
||||
loreg_free(re);
|
||||
lorex_free(re);
|
||||
}
|
||||
|
||||
TEST(alternation) {
|
||||
loreg_error_t err;
|
||||
loreg_match_t m;
|
||||
lorex_error_t err;
|
||||
lorex_match_t m;
|
||||
|
||||
loreg_regex_t *re = loreg_compile("cat|dog", &err);
|
||||
lorex_regex_t *re = lorex_compile("cat|dog", &err);
|
||||
ASSERT(re != NULL, "compile cat|dog");
|
||||
ASSERT(loreg_search(re, "cat", &m), "match cat");
|
||||
ASSERT(loreg_search(re, "dog", &m), "match dog");
|
||||
ASSERT(!loreg_search(re, "rat", &m), "no match rat");
|
||||
loreg_free(re);
|
||||
ASSERT(lorex_search(re, "cat", &m), "match cat");
|
||||
ASSERT(lorex_search(re, "dog", &m), "match dog");
|
||||
ASSERT(!lorex_search(re, "rat", &m), "no match rat");
|
||||
lorex_free(re);
|
||||
|
||||
re = loreg_compile("(red|blue) car", &err);
|
||||
re = lorex_compile("(red|blue) car", &err);
|
||||
ASSERT(re != NULL, "compile (red|blue) car");
|
||||
ASSERT(loreg_search(re, "red car", &m), "match red car");
|
||||
ASSERT(loreg_search(re, "blue car", &m), "match blue car");
|
||||
ASSERT(!loreg_search(re, "green car", &m), "no match green car");
|
||||
loreg_free(re);
|
||||
ASSERT(lorex_search(re, "red car", &m), "match red car");
|
||||
ASSERT(lorex_search(re, "blue car", &m), "match blue car");
|
||||
ASSERT(!lorex_search(re, "green car", &m), "no match green car");
|
||||
lorex_free(re);
|
||||
}
|
||||
|
||||
TEST(escapes) {
|
||||
loreg_error_t err;
|
||||
loreg_match_t m;
|
||||
lorex_error_t err;
|
||||
lorex_match_t m;
|
||||
|
||||
loreg_regex_t *re = loreg_compile("1\\.5", &err);
|
||||
lorex_regex_t *re = lorex_compile("1\\.5", &err);
|
||||
ASSERT(re != NULL, "compile 1\\.5");
|
||||
ASSERT(loreg_search(re, "1.5", &m), "match 1.5");
|
||||
ASSERT(!loreg_search(re, "1x5", &m), "no match 1x5");
|
||||
loreg_free(re);
|
||||
ASSERT(lorex_search(re, "1.5", &m), "match 1.5");
|
||||
ASSERT(!lorex_search(re, "1x5", &m), "no match 1x5");
|
||||
lorex_free(re);
|
||||
|
||||
re = loreg_compile("\\(test\\)", &err);
|
||||
re = lorex_compile("\\(test\\)", &err);
|
||||
ASSERT(re != NULL, "compile \\(test\\)");
|
||||
ASSERT(loreg_search(re, "(test)", &m), "match (test)");
|
||||
loreg_free(re);
|
||||
ASSERT(lorex_search(re, "(test)", &m), "match (test)");
|
||||
lorex_free(re);
|
||||
}
|
||||
|
||||
TEST(real_patterns) {
|
||||
loreg_error_t err;
|
||||
loreg_match_t m;
|
||||
lorex_error_t err;
|
||||
lorex_match_t m;
|
||||
|
||||
loreg_regex_t *re = loreg_compile("[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}", &err);
|
||||
lorex_regex_t *re = lorex_compile("[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}", &err);
|
||||
ASSERT(re != NULL, "compile email");
|
||||
ASSERT(loreg_search(re, "user@example.com", &m), "match email");
|
||||
ASSERT(!loreg_search(re, "invalid", &m), "no match invalid");
|
||||
loreg_free(re);
|
||||
ASSERT(lorex_search(re, "user@example.com", &m), "match email");
|
||||
ASSERT(!lorex_search(re, "invalid", &m), "no match invalid");
|
||||
lorex_free(re);
|
||||
|
||||
re = loreg_compile("\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}", &err);
|
||||
re = lorex_compile("\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}", &err);
|
||||
ASSERT(re != NULL, "compile ip");
|
||||
ASSERT(loreg_search(re, "192.168.1.1", &m), "match ip");
|
||||
loreg_free(re);
|
||||
ASSERT(lorex_search(re, "192.168.1.1", &m), "match ip");
|
||||
lorex_free(re);
|
||||
|
||||
re = loreg_compile("https?://[a-zA-Z0-9.-]+(/[a-zA-Z0-9./-]*)?", &err);
|
||||
re = lorex_compile("https?://[a-zA-Z0-9.-]+(/[a-zA-Z0-9./-]*)?", &err);
|
||||
ASSERT(re != NULL, "compile url");
|
||||
ASSERT(loreg_search(re, "http://example.com", &m), "match http");
|
||||
ASSERT(loreg_search(re, "https://example.com/path", &m), "match https");
|
||||
loreg_free(re);
|
||||
ASSERT(lorex_search(re, "http://example.com", &m), "match http");
|
||||
ASSERT(lorex_search(re, "https://example.com/path", &m), "match https");
|
||||
lorex_free(re);
|
||||
}
|
||||
|
||||
TEST(error_handling) {
|
||||
loreg_error_t err;
|
||||
lorex_error_t err;
|
||||
|
||||
loreg_regex_t *re = loreg_compile("(abc", &err);
|
||||
lorex_regex_t *re = lorex_compile("(abc", &err);
|
||||
ASSERT(re == NULL, "unbalanced paren");
|
||||
ASSERT(err == LOREG_ERR_UNBALANCED_PAREN, "correct error");
|
||||
ASSERT(err == LOREX_ERR_UNBALANCED_PAREN, "correct error");
|
||||
}
|
||||
|
||||
int main(void) {
|
||||
printf("loreg comprehensive tests\n");
|
||||
printf("lorex comprehensive tests\n");
|
||||
printf("========================\n\n");
|
||||
|
||||
clock_t start = clock();
|
||||
|
||||
@ -1,5 +1,5 @@
|
||||
/* retoor <retoor@molodetz.nl> */
|
||||
#include "../include/loreg.h"
|
||||
#include "../include/lorex.h"
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
|
||||
@ -10,22 +10,22 @@ static int failed = 0;
|
||||
#define NO_MATCH(pat, txt) test_match(pat, txt, 0, __LINE__)
|
||||
|
||||
static void test_match(const char *pattern, const char *text, int expect, int line) {
|
||||
loreg_error_t err;
|
||||
loreg_regex_t *re = loreg_compile(pattern, &err);
|
||||
lorex_error_t err;
|
||||
lorex_regex_t *re = lorex_compile(pattern, &err);
|
||||
if (!re) {
|
||||
printf("FAIL line %d: compile error for '%s': %s\n", line, pattern, loreg_error_string(err));
|
||||
printf("FAIL line %d: compile error for '%s': %s\n", line, pattern, lorex_error_string(err));
|
||||
failed++;
|
||||
return;
|
||||
}
|
||||
loreg_match_t m;
|
||||
int result = loreg_search(re, text, &m) ? 1 : 0;
|
||||
lorex_match_t m;
|
||||
int result = lorex_search(re, text, &m) ? 1 : 0;
|
||||
if (result != expect) {
|
||||
printf("FAIL line %d: '%s' vs '%s' expected %s\n", line, pattern, text, expect ? "match" : "no match");
|
||||
failed++;
|
||||
} else {
|
||||
passed++;
|
||||
}
|
||||
loreg_free(re);
|
||||
lorex_free(re);
|
||||
}
|
||||
|
||||
static void test_literals(void) {
|
||||
@ -613,8 +613,240 @@ static void test_pathological_patterns(void) {
|
||||
MATCH("(a?){5}a{5}", "aaaaa");
|
||||
}
|
||||
|
||||
static void test_anchored_match(void) {
|
||||
printf(" anchored match (lorex_match)...\n");
|
||||
lorex_error_t err;
|
||||
lorex_match_t m;
|
||||
|
||||
lorex_regex_t *re = lorex_compile("abc", &err);
|
||||
if (re) {
|
||||
if (lorex_match(re, "abc", &m)) {
|
||||
passed++;
|
||||
} else {
|
||||
printf("FAIL: lorex_match should match 'abc' against 'abc'\n");
|
||||
failed++;
|
||||
}
|
||||
if (!lorex_match(re, "xabc", &m)) {
|
||||
passed++;
|
||||
} else {
|
||||
printf("FAIL: lorex_match should not match 'abc' against 'xabc'\n");
|
||||
failed++;
|
||||
}
|
||||
if (lorex_match(re, "abcx", &m)) {
|
||||
passed++;
|
||||
} else {
|
||||
printf("FAIL: lorex_match should match 'abc' at start of 'abcx'\n");
|
||||
failed++;
|
||||
}
|
||||
lorex_free(re);
|
||||
}
|
||||
|
||||
re = lorex_compile("^abc$", &err);
|
||||
if (re) {
|
||||
if (lorex_match(re, "abc", &m)) {
|
||||
passed++;
|
||||
} else {
|
||||
printf("FAIL: lorex_match should match '^abc$' against 'abc'\n");
|
||||
failed++;
|
||||
}
|
||||
if (!lorex_match(re, "abcx", &m)) {
|
||||
passed++;
|
||||
} else {
|
||||
printf("FAIL: lorex_match should not match '^abc$' against 'abcx'\n");
|
||||
failed++;
|
||||
}
|
||||
lorex_free(re);
|
||||
}
|
||||
|
||||
re = lorex_compile("a.*z", &err);
|
||||
if (re) {
|
||||
if (lorex_match(re, "abcz", &m)) {
|
||||
passed++;
|
||||
} else {
|
||||
printf("FAIL: lorex_match should match 'a.*z' against 'abcz'\n");
|
||||
failed++;
|
||||
}
|
||||
if (!lorex_match(re, "xabcz", &m)) {
|
||||
passed++;
|
||||
} else {
|
||||
printf("FAIL: lorex_match should not match 'a.*z' against 'xabcz'\n");
|
||||
failed++;
|
||||
}
|
||||
lorex_free(re);
|
||||
}
|
||||
}
|
||||
|
||||
static void test_error_strings(void) {
|
||||
printf(" error strings...\n");
|
||||
|
||||
if (strcmp(lorex_error_string(LOREX_OK), "success") == 0) {
|
||||
passed++;
|
||||
} else {
|
||||
printf("FAIL: LOREX_OK should return 'success'\n");
|
||||
failed++;
|
||||
}
|
||||
|
||||
if (strcmp(lorex_error_string(LOREX_ERR_INVALID_PATTERN), "invalid pattern") == 0) {
|
||||
passed++;
|
||||
} else {
|
||||
printf("FAIL: LOREX_ERR_INVALID_PATTERN error string\n");
|
||||
failed++;
|
||||
}
|
||||
|
||||
if (strcmp(lorex_error_string(LOREX_ERR_UNBALANCED_PAREN), "unbalanced parentheses") == 0) {
|
||||
passed++;
|
||||
} else {
|
||||
printf("FAIL: LOREX_ERR_UNBALANCED_PAREN error string\n");
|
||||
failed++;
|
||||
}
|
||||
|
||||
if (strcmp(lorex_error_string(LOREX_ERR_EMPTY_GROUP), "empty group") == 0) {
|
||||
passed++;
|
||||
} else {
|
||||
printf("FAIL: LOREX_ERR_EMPTY_GROUP error string\n");
|
||||
failed++;
|
||||
}
|
||||
|
||||
if (strcmp(lorex_error_string(LOREX_ERR_INVALID_QUANTIFIER), "invalid quantifier") == 0) {
|
||||
passed++;
|
||||
} else {
|
||||
printf("FAIL: LOREX_ERR_INVALID_QUANTIFIER error string\n");
|
||||
failed++;
|
||||
}
|
||||
|
||||
if (strcmp(lorex_error_string(LOREX_ERR_INVALID_ESCAPE), "invalid escape sequence") == 0) {
|
||||
passed++;
|
||||
} else {
|
||||
printf("FAIL: LOREX_ERR_INVALID_ESCAPE error string\n");
|
||||
failed++;
|
||||
}
|
||||
|
||||
if (strcmp(lorex_error_string(LOREX_ERR_OUT_OF_MEMORY), "out of memory") == 0) {
|
||||
passed++;
|
||||
} else {
|
||||
printf("FAIL: LOREX_ERR_OUT_OF_MEMORY error string\n");
|
||||
failed++;
|
||||
}
|
||||
|
||||
if (strcmp(lorex_error_string(LOREX_ERR_STATE_OVERFLOW), "state overflow") == 0) {
|
||||
passed++;
|
||||
} else {
|
||||
printf("FAIL: LOREX_ERR_STATE_OVERFLOW error string\n");
|
||||
failed++;
|
||||
}
|
||||
|
||||
if (strcmp(lorex_error_string((lorex_error_t)99), "unknown error") == 0) {
|
||||
passed++;
|
||||
} else {
|
||||
printf("FAIL: unknown error code should return 'unknown error'\n");
|
||||
failed++;
|
||||
}
|
||||
}
|
||||
|
||||
static void test_parser_errors(void) {
|
||||
printf(" parser errors...\n");
|
||||
lorex_error_t err;
|
||||
lorex_regex_t *re;
|
||||
|
||||
re = lorex_compile("(abc", &err);
|
||||
if (re == NULL && err == LOREX_ERR_UNBALANCED_PAREN) {
|
||||
passed++;
|
||||
} else {
|
||||
printf("FAIL: '(abc' should fail with unbalanced paren\n");
|
||||
failed++;
|
||||
if (re) lorex_free(re);
|
||||
}
|
||||
|
||||
re = lorex_compile("((a)", &err);
|
||||
if (re == NULL && err == LOREX_ERR_UNBALANCED_PAREN) {
|
||||
passed++;
|
||||
} else {
|
||||
printf("FAIL: '((a)' should fail with unbalanced paren\n");
|
||||
failed++;
|
||||
if (re) lorex_free(re);
|
||||
}
|
||||
|
||||
re = lorex_compile("a{5,2}", &err);
|
||||
if (re == NULL) {
|
||||
passed++;
|
||||
} else {
|
||||
printf("FAIL: 'a{5,2}' should fail (min > max)\n");
|
||||
failed++;
|
||||
lorex_free(re);
|
||||
}
|
||||
|
||||
re = lorex_compile("*abc", &err);
|
||||
if (re == NULL) {
|
||||
passed++;
|
||||
} else {
|
||||
printf("FAIL: '*abc' should fail\n");
|
||||
failed++;
|
||||
lorex_free(re);
|
||||
}
|
||||
|
||||
re = lorex_compile("+abc", &err);
|
||||
if (re == NULL) {
|
||||
passed++;
|
||||
} else {
|
||||
printf("FAIL: '+abc' should fail\n");
|
||||
failed++;
|
||||
lorex_free(re);
|
||||
}
|
||||
|
||||
re = lorex_compile("?abc", &err);
|
||||
if (re == NULL) {
|
||||
passed++;
|
||||
} else {
|
||||
printf("FAIL: '?abc' should fail\n");
|
||||
failed++;
|
||||
lorex_free(re);
|
||||
}
|
||||
}
|
||||
|
||||
/*
 * Shorthand classes (\d, \w, \s) used inside bracket expressions.
 *
 * Each MATCH / NO_MATCH line is one check handled by the file's test macros;
 * the cases cover a shorthand on its own, mixed with a literal, mixed with a
 * range, two shorthands together, and a shorthand followed by a literal '-'.
 */
static void test_bracket_char_classes(void) {
    printf(" bracket character classes...\n");

    /* \d alone inside brackets */
    MATCH("[\\d]", "5");
    MATCH("[\\d]+", "12345");
    NO_MATCH("[\\d]", "a");

    /* \w alone: lower case, upper case, digit and underscore; not a space */
    MATCH("[\\w]", "a");
    MATCH("[\\w]", "Z");
    MATCH("[\\w]", "5");
    MATCH("[\\w]", "_");
    NO_MATCH("[\\w]", " ");

    /* \s alone: space and tab */
    MATCH("[\\s]", " ");
    MATCH("[\\s]", "\t");
    NO_MATCH("[\\s]", "a");

    /* shorthand combined with a literal member */
    MATCH("[a\\d]", "a");
    MATCH("[a\\d]", "5");
    NO_MATCH("[a\\d]", "b");

    /* shorthand combined with a range */
    MATCH("[\\da-z]", "5");
    MATCH("[\\da-z]", "m");
    NO_MATCH("[\\da-z]", "M");

    /* several members at once, including a trailing literal '-' */
    MATCH("[\\w\\s]+", "hello world");
    MATCH("[0-9\\s]+", "1 2 3");
    MATCH("[\\w-]+", "hello-world");
}
|
||||
|
||||
/*
 * Control-character escapes in patterns: \n, \t and \r.
 *
 * The pattern strings contain a backslash followed by a letter (the regex
 * escape), while the subject strings contain the actual control character
 * produced by the C compiler. The NO_MATCH cases confirm the escapes do not
 * match the plain letters 'n', 't' and 'r'.
 */
static void test_special_escapes(void) {
    printf(" special escape sequences...\n");

    /* each escape on its own and between literals */
    MATCH("\\n", "\n");
    MATCH("a\\nb", "a\nb");
    MATCH("\\t", "\t");
    MATCH("a\\tb", "a\tb");
    MATCH("\\r", "\r");
    MATCH("a\\rb", "a\rb");
    MATCH("\\n\\t\\r", "\n\t\r");

    /* the same escapes as members of a bracket expression */
    MATCH("[\\n]", "\n");
    MATCH("[\\t]", "\t");
    MATCH("[\\r]", "\r");
    MATCH("[\\n\\t]+", "\n\t\n");

    /* an escape must not degrade to its literal letter */
    NO_MATCH("\\n", "n");
    NO_MATCH("\\t", "t");
    NO_MATCH("\\r", "r");
}
|
||||
|
||||
int main(void) {
|
||||
printf("loreg integration tests\n");
|
||||
printf("lorex integration tests\n");
|
||||
printf("=======================\n\n");
|
||||
|
||||
test_literals();
|
||||
@ -641,6 +873,11 @@ int main(void) {
|
||||
test_nested_groups();
|
||||
test_real_world_patterns();
|
||||
test_pathological_patterns();
|
||||
test_anchored_match();
|
||||
test_error_strings();
|
||||
test_parser_errors();
|
||||
test_bracket_char_classes();
|
||||
test_special_escapes();
|
||||
|
||||
printf("\n=======================\n");
|
||||
printf("integration: %d passed, %d failed\n", passed, failed);
|
||||
|
||||
@ -1,5 +1,5 @@
|
||||
/* retoor <retoor@molodetz.nl> */
|
||||
#include "../include/loreg.h"
|
||||
#include "../include/lorex.h"
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
|
||||
@ -23,21 +23,21 @@ static int tests_failed = 0;
|
||||
} while(0)
|
||||
|
||||
#define ASSERT_MATCH(pattern, text) do { \
|
||||
loreg_error_t err; \
|
||||
loreg_regex_t *re = loreg_compile(pattern, &err); \
|
||||
lorex_error_t err; \
|
||||
lorex_regex_t *re = lorex_compile(pattern, &err); \
|
||||
ASSERT(re != NULL); \
|
||||
loreg_match_t result; \
|
||||
ASSERT(loreg_search(re, text, &result) == true); \
|
||||
loreg_free(re); \
|
||||
lorex_match_t result; \
|
||||
ASSERT(lorex_search(re, text, &result) == true); \
|
||||
lorex_free(re); \
|
||||
} while(0)
|
||||
|
||||
#define ASSERT_NO_MATCH(pattern, text) do { \
|
||||
loreg_error_t err; \
|
||||
loreg_regex_t *re = loreg_compile(pattern, &err); \
|
||||
lorex_error_t err; \
|
||||
lorex_regex_t *re = lorex_compile(pattern, &err); \
|
||||
ASSERT(re != NULL); \
|
||||
loreg_match_t result; \
|
||||
ASSERT(loreg_search(re, text, &result) == false); \
|
||||
loreg_free(re); \
|
||||
lorex_match_t result; \
|
||||
ASSERT(lorex_search(re, text, &result) == false); \
|
||||
lorex_free(re); \
|
||||
} while(0)
|
||||
|
||||
TEST(simple_char) {
|
||||
@ -209,53 +209,53 @@ TEST(complex_url) {
|
||||
}
|
||||
|
||||
TEST(group_capture) {
|
||||
loreg_error_t err;
|
||||
loreg_regex_t *re = loreg_compile("(\\d+)-(\\d+)", &err);
|
||||
lorex_error_t err;
|
||||
lorex_regex_t *re = lorex_compile("(\\d+)-(\\d+)", &err);
|
||||
ASSERT(re != NULL);
|
||||
|
||||
loreg_match_t result;
|
||||
ASSERT(loreg_search(re, "123-456", &result));
|
||||
lorex_match_t result;
|
||||
ASSERT(lorex_search(re, "123-456", &result));
|
||||
ASSERT(result.group_count == 2);
|
||||
ASSERT(result.groups[0].matched);
|
||||
ASSERT(result.groups[1].matched);
|
||||
|
||||
loreg_free(re);
|
||||
lorex_free(re);
|
||||
}
|
||||
|
||||
TEST(nested_groups) {
|
||||
loreg_error_t err;
|
||||
loreg_regex_t *re = loreg_compile("((a)(b))", &err);
|
||||
lorex_error_t err;
|
||||
lorex_regex_t *re = lorex_compile("((a)(b))", &err);
|
||||
ASSERT(re != NULL);
|
||||
|
||||
loreg_match_t result;
|
||||
ASSERT(loreg_search(re, "ab", &result));
|
||||
lorex_match_t result;
|
||||
ASSERT(lorex_search(re, "ab", &result));
|
||||
ASSERT(result.group_count == 3);
|
||||
|
||||
loreg_free(re);
|
||||
lorex_free(re);
|
||||
}
|
||||
|
||||
TEST(empty_pattern) {
|
||||
loreg_error_t err;
|
||||
loreg_regex_t *re = loreg_compile("", &err);
|
||||
lorex_error_t err;
|
||||
lorex_regex_t *re = lorex_compile("", &err);
|
||||
ASSERT(re != NULL);
|
||||
|
||||
loreg_match_t result;
|
||||
ASSERT(loreg_match(re, "anything", &result));
|
||||
lorex_match_t result;
|
||||
ASSERT(lorex_match(re, "anything", &result));
|
||||
|
||||
loreg_free(re);
|
||||
lorex_free(re);
|
||||
}
|
||||
|
||||
TEST(match_position) {
|
||||
loreg_error_t err;
|
||||
loreg_regex_t *re = loreg_compile("test", &err);
|
||||
lorex_error_t err;
|
||||
lorex_regex_t *re = lorex_compile("test", &err);
|
||||
ASSERT(re != NULL);
|
||||
|
||||
loreg_match_t result;
|
||||
ASSERT(loreg_search(re, "xxxtestyyy", &result));
|
||||
lorex_match_t result;
|
||||
ASSERT(lorex_search(re, "xxxtestyyy", &result));
|
||||
ASSERT(result.match_start == 3);
|
||||
ASSERT(result.match_end == 7);
|
||||
|
||||
loreg_free(re);
|
||||
lorex_free(re);
|
||||
}
|
||||
|
||||
int main(void) {
|
||||
|
||||
@ -27,11 +27,11 @@ static nfa_t *compile_pattern(const char *pattern) {
|
||||
parser_t parser;
|
||||
parser_init(&parser, pattern);
|
||||
ast_node_t *ast = parser_parse(&parser);
|
||||
if (!ast || parser_get_error(&parser) != LOREG_OK) {
|
||||
if (!ast || parser_get_error(&parser) != LOREX_OK) {
|
||||
ast_free(ast);
|
||||
return NULL;
|
||||
}
|
||||
loreg_error_t error;
|
||||
lorex_error_t error;
|
||||
nfa_t *nfa = nfa_from_ast(ast, &error);
|
||||
ast_free(ast);
|
||||
return nfa;
|
||||
|
||||
@ -245,7 +245,7 @@ TEST(complex_pattern) {
|
||||
ast_node_t *ast = parser_parse(&parser);
|
||||
|
||||
ASSERT(ast != NULL);
|
||||
ASSERT(parser_get_error(&parser) == LOREG_OK);
|
||||
ASSERT(parser_get_error(&parser) == LOREX_OK);
|
||||
|
||||
ast_free(ast);
|
||||
}
|
||||
@ -255,7 +255,7 @@ TEST(unbalanced_paren) {
|
||||
parser_init(&parser, "(abc");
|
||||
ast_node_t *ast = parser_parse(&parser);
|
||||
|
||||
ASSERT(ast == NULL || parser_get_error(&parser) == LOREG_ERR_UNBALANCED_PAREN);
|
||||
ASSERT(ast == NULL || parser_get_error(&parser) == LOREX_ERR_UNBALANCED_PAREN);
|
||||
|
||||
ast_free(ast);
|
||||
}
|
||||
|
||||
Loading…
Reference in New Issue
Block a user