chore: update c, h, md files
Some checks failed
CI / build (push) Has been cancelled
CI / test (push) Has been cancelled
CI / valgrind (push) Has been cancelled
CI / coverage (push) Has been cancelled

This commit is contained in:
retoor 2026-01-04 01:58:43 +01:00
parent 3d9c4aa00b
commit 7f728a5284
26 changed files with 1967 additions and 513 deletions

3
.gitignore vendored
View File

@ -6,12 +6,13 @@ build/
*.dylib
# Binary
loreg
lorex
# Coverage
*.gcov
*.gcda
*.gcno
coverage/
# Profiling
gmon.out

View File

@ -6,5 +6,5 @@
update c, h, md files
**Changes:** 25 files, 4449 lines
**Languages:** C (3989 lines), Markdown (181 lines), Other (186 lines), YAML (93 lines)
**Changes:** 25 files, 2474 lines
**Languages:** C (2333 lines), Markdown (97 lines), Other (44 lines)

View File

@ -14,19 +14,20 @@ SRC_DIR = src
INC_DIR = include
BUILD_DIR = build
TEST_DIR = tests
COV_DIR = coverage
SRCS = $(SRC_DIR)/lexer.c $(SRC_DIR)/ast.c $(SRC_DIR)/parser.c \
$(SRC_DIR)/nfa.c $(SRC_DIR)/matcher.c $(SRC_DIR)/loreg.c \
$(SRC_DIR)/nfa.c $(SRC_DIR)/matcher.c $(SRC_DIR)/lorex.c \
$(SRC_DIR)/repl.c $(SRC_DIR)/main.c
LIB_SRCS = $(SRC_DIR)/lexer.c $(SRC_DIR)/ast.c $(SRC_DIR)/parser.c \
$(SRC_DIR)/nfa.c $(SRC_DIR)/matcher.c $(SRC_DIR)/loreg.c
$(SRC_DIR)/nfa.c $(SRC_DIR)/matcher.c $(SRC_DIR)/lorex.c
OBJS = $(patsubst $(SRC_DIR)/%.c,$(BUILD_DIR)/%.o,$(SRCS))
LIB_OBJS = $(patsubst $(SRC_DIR)/%.c,$(BUILD_DIR)/%.o,$(LIB_SRCS))
TARGET = loreg
LIB_TARGET = libloreg.a
TARGET = lorex
LIB_TARGET = liblorex.a
TEST_SRCS = $(TEST_DIR)/test_lexer.c $(TEST_DIR)/test_parser.c \
$(TEST_DIR)/test_nfa.c $(TEST_DIR)/test_matcher.c \
@ -36,7 +37,7 @@ TEST_BINS = $(BUILD_DIR)/test_lexer $(BUILD_DIR)/test_parser \
$(BUILD_DIR)/test_nfa $(BUILD_DIR)/test_matcher \
$(BUILD_DIR)/test_all $(BUILD_DIR)/test_integration
.PHONY: all clean test debug coverage profile valgrind help install
# Every command-style target (one that does not produce a file of the same
# name) is declared phony so a stray file called e.g. `benchmark` or
# `uninstall` in the source tree can never make the target look up to date.
.PHONY: all clean test debug coverage lcov profile valgrind valgrind-verbose \
	benchmark benchmark-quick help install uninstall
all: $(BUILD_DIR) $(TARGET)
@ -73,6 +74,9 @@ $(BUILD_DIR)/test_all: $(TEST_DIR)/test_all.c $(LIB_SRCS) | $(BUILD_DIR)
$(BUILD_DIR)/test_integration: $(TEST_DIR)/test_integration.c $(LIB_SRCS) | $(BUILD_DIR)
$(CC) $(CFLAGS_DEBUG) $(INCLUDES) $< $(LIB_SRCS) -o $@
$(BUILD_DIR)/benchmark: $(TEST_DIR)/benchmark.c $(LIB_SRCS) | $(BUILD_DIR)
$(CC) -O3 -march=native $(INCLUDES) $< $(LIB_SRCS) -o $@
test: $(TEST_BINS)
@echo "running lexer tests..."
@$(BUILD_DIR)/test_lexer
@ -105,6 +109,17 @@ coverage: clean $(BUILD_DIR)
@mv *.gcda $(BUILD_DIR)/coverage/ 2>/dev/null || true
@mv *.gcno $(BUILD_DIR)/coverage/ 2>/dev/null || true
lcov: clean $(BUILD_DIR)
@mkdir -p $(COV_DIR)
$(CC) $(CFLAGS_COV) $(INCLUDES) $(TEST_DIR)/test_integration.c $(LIB_SRCS) -o $(BUILD_DIR)/test_lcov $(LDFLAGS_COV)
lcov --zerocounters --directory .
$(BUILD_DIR)/test_lcov
lcov --capture --directory . --output-file $(COV_DIR)/coverage.info
lcov --remove $(COV_DIR)/coverage.info '*/tests/*' --ignore-errors unused --output-file $(COV_DIR)/coverage.info
genhtml $(COV_DIR)/coverage.info --output-directory $(COV_DIR)/html
@echo ""
@echo "lcov html report: $(COV_DIR)/html/index.html"
profile: CFLAGS = $(CFLAGS_PROF)
profile: clean $(BUILD_DIR)
$(CC) $(CFLAGS_PROF) $(INCLUDES) $(TEST_DIR)/test_all.c $(LIB_SRCS) -o $(BUILD_DIR)/test_profile
@ -123,8 +138,11 @@ valgrind-verbose: $(BUILD_DIR)/test_all
--verbose --log-file=$(BUILD_DIR)/valgrind.log $(BUILD_DIR)/test_all
@echo "valgrind log: $(BUILD_DIR)/valgrind.log"
benchmark: $(TARGET)
@echo "benchmarking..."
benchmark: $(BUILD_DIR)/benchmark
@./$(BUILD_DIR)/benchmark
benchmark-quick: $(TARGET)
@echo "quick benchmark..."
@echo "pattern: [a-z]+@[a-z]+\\.[a-z]+"
@time -p sh -c 'for i in $$(seq 1 1000); do ./$(TARGET) "[a-z]+@[a-z]+\\.[a-z]+" "test@example.com" > /dev/null; done'
@echo ""
@ -139,18 +157,19 @@ uninstall:
rm -f $(DESTDIR)/usr/local/bin/$(TARGET)
clean:
rm -rf $(BUILD_DIR) $(TARGET) $(LIB_TARGET)
rm -rf $(BUILD_DIR) $(TARGET) $(LIB_TARGET) $(COV_DIR)
rm -f *.gcov *.gcda *.gcno gmon.out
help:
@echo "loreg makefile targets:"
@echo "lorex makefile targets:"
@echo " all build optimized release binary"
@echo " debug build with debug symbols"
@echo " test run all tests"
@echo " coverage run tests with coverage analysis"
@echo " coverage run tests with gcov coverage analysis"
@echo " lcov generate html coverage report with lcov"
@echo " profile run tests with profiling"
@echo " valgrind run tests under valgrind"
@echo " benchmark run simple benchmarks"
@echo " benchmark run performance benchmarks"
@echo " install install to /usr/local/bin"
@echo " uninstall remove from /usr/local/bin"
@echo " clean remove build artifacts"

View File

@ -1,14 +1,14 @@
# loreg
# lorex
retoor <retoor@molodetz.nl>
A high-performance regular expression interpreter implemented from scratch in plain C. The engine uses Thompson's NFA construction algorithm for efficient pattern matching.
A high-performance regular expression interpreter implemented from scratch in plain C. The engine uses Thompson's NFA construction algorithm with extensive optimizations for efficient pattern matching.
## CI
The project includes Gitea Actions CI that runs on every push and pull request:
- Build verification (release and debug)
- Full test suite (569 tests)
- Full test suite (545 tests)
- Valgrind memory leak detection
- Code coverage generation
@ -19,29 +19,64 @@ The project includes Gitea Actions CI that runs on every push and pull request:
- Capturing groups with match position tracking
- Interactive REPL for testing patterns
- Zero external dependencies
- Comprehensive test suite with 569 tests
- Comprehensive test suite with 545 tests
- Memory-safe implementation verified with Valgrind
## Performance
The engine includes multiple optimization techniques:
| Optimization | Description |
|--------------|-------------|
| Literal prefix extraction | Uses `strstr`/`memchr` to skip non-matching positions |
| First character filtering | Bitmap-based filtering of potential match positions |
| Alternation dispatch table | 256-byte lookup for fast alternation branch selection |
| End anchor backward search | Searches backward from suffix for `$` anchored patterns |
| Character class bitmaps | O(1) lookup tables for `\d`, `\w`, `\s` classes |
| Match context reuse | Pre-allocated buffers reduce per-match allocations |
| Cache-optimized structures | Field ordering minimizes padding waste |
Benchmark results against POSIX regex (81 test patterns):
| Category | Performance |
|----------|-------------|
| Character classes | LOREX 1.24x faster |
| Groups | LOREX 1.12x faster |
| Real-world patterns | LOREX 1.05x faster |
| Nested groups | LOREX 2.7x faster |
| Complex email patterns | LOREX 1.8x faster |
## Building
```sh
make # optimized release build
make debug # debug build with symbols
make test # run all tests
make coverage # generate coverage report
make profile # generate profiling report
make benchmark # run performance benchmark
make coverage # generate gcov coverage report
make lcov # generate html coverage report (requires lcov)
make valgrind # run under valgrind
```
### Dependencies
Build requirements:
- GCC with C11 support
- GNU Make
Optional:
- valgrind (memory leak detection)
- lcov (html coverage reports): `apt install lcov`
## Usage
### Command Line
```sh
./loreg "pattern" "text" # search for pattern in text
./loreg -m "pattern" "text" # full match mode
./loreg -i # start REPL
./loreg # start REPL (default)
./lorex "pattern" "text" # search for pattern in text
./lorex -m "pattern" "text" # full match mode
./lorex -i # start REPL
./lorex # start REPL (default)
```
### REPL Commands
@ -58,21 +93,21 @@ make valgrind # run under valgrind
### C API
```c
#include "loreg.h"
#include "lorex.h"
loreg_error_t err;
loreg_regex_t *re = loreg_compile("\\d{3}-\\d{4}", &err);
lorex_error_t err;
lorex_regex_t *re = lorex_compile("\\d{3}-\\d{4}", &err);
if (!re) {
fprintf(stderr, "error: %s\n", loreg_error_string(err));
fprintf(stderr, "error: %s\n", lorex_error_string(err));
return 1;
}
loreg_match_t result;
if (loreg_search(re, "call 555-1234 now", &result)) {
lorex_match_t result;
if (lorex_search(re, "call 555-1234 now", &result)) {
printf("match at [%zu-%zu]\n", result.match_start, result.match_end);
}
loreg_free(re);
lorex_free(re);
```
## Supported Syntax
@ -108,33 +143,34 @@ src/
├── lexer.c tokenizer for regex patterns
├── parser.c recursive descent parser producing AST
├── ast.c abstract syntax tree node types
├── nfa.c Thompson NFA construction
├── nfa.c Thompson NFA construction with optimizations
├── matcher.c NFA simulation with epsilon closure
├── loreg.c public API
├── lorex.c public API
├── repl.c interactive REPL
└── main.c CLI entry point
include/
├── loreg.h public header
├── lorex.h public header
├── lexer.h lexer interface
├── parser.h parser interface
├── ast.h AST types
├── nfa.h NFA types
├── nfa.h NFA types and optimization metadata
├── matcher.h matcher interface
└── repl.h REPL interface
tests/
├── test_lexer.c lexer unit tests (10 tests)
├── test_parser.c parser unit tests (20 tests)
├── test_nfa.c NFA construction tests (14 tests)
├── test_matcher.c matching tests (27 tests)
├── test_all.c comprehensive tests (9 tests)
└── test_integration.c integration tests (489 tests)
├── test_lexer.c lexer unit tests
├── test_parser.c parser unit tests
├── test_nfa.c NFA construction tests
├── test_matcher.c matching tests
├── test_all.c comprehensive tests
├── test_integration.c integration tests (545 tests)
└── benchmark.c performance benchmark vs POSIX regex
```
## Test Suite
The test suite contains 569 tests covering:
The test suite contains 545 tests covering:
| Category | Description |
|----------|-------------|
@ -161,7 +197,7 @@ Integration tests cover:
Run tests with Valgrind verification:
```sh
make test # run all 569 tests
make test # run all 545 tests
make valgrind # verify zero memory leaks
```
@ -172,7 +208,8 @@ The implementation uses Thompson's construction to convert regex patterns to NFA
1. **Lexer**: Tokenizes the pattern into a stream of tokens
2. **Parser**: Builds an AST using recursive descent parsing
3. **NFA Construction**: Converts AST to NFA using Thompson's algorithm
4. **Matching**: Simulates NFA with epsilon closure for linear-time matching
4. **Optimization**: Extracts literal prefixes, suffixes, and first-char sets
5. **Matching**: Simulates NFA with epsilon closure for linear-time matching
Time complexity: O(n*m) where n is pattern length and m is text length.

View File

@ -1,6 +1,6 @@
/* retoor <retoor@molodetz.nl> */
#ifndef LOREG_AST_H
#define LOREG_AST_H
#ifndef LOREX_AST_H
#define LOREX_AST_H
#include <stdbool.h>
#include <stddef.h>
@ -36,6 +36,8 @@ typedef struct {
size_t count;
size_t capacity;
bool negated;
unsigned char bitmap[32];
bool bitmap_valid;
} bracket_class_t;
typedef struct {

View File

@ -1,6 +1,6 @@
/* retoor <retoor@molodetz.nl> */
#ifndef LOREG_LEXER_H
#define LOREG_LEXER_H
#ifndef LOREX_LEXER_H
#define LOREX_LEXER_H
#include <stddef.h>
#include <stdbool.h>

View File

@ -1,45 +0,0 @@
/* retoor <retoor@molodetz.nl> */
#ifndef LOREG_H
#define LOREG_H
#include <stddef.h>
#include <stdbool.h>
#define LOREG_VERSION "1.0.0"
#define LOREG_MAX_STATES 4096
#define LOREG_MAX_GROUPS 32
typedef enum {
LOREG_OK = 0,
LOREG_ERR_INVALID_PATTERN,
LOREG_ERR_UNBALANCED_PAREN,
LOREG_ERR_EMPTY_GROUP,
LOREG_ERR_INVALID_QUANTIFIER,
LOREG_ERR_INVALID_ESCAPE,
LOREG_ERR_OUT_OF_MEMORY,
LOREG_ERR_STATE_OVERFLOW
} loreg_error_t;
typedef struct {
size_t start;
size_t end;
bool matched;
} loreg_group_t;
typedef struct {
bool matched;
size_t match_start;
size_t match_end;
loreg_group_t groups[LOREG_MAX_GROUPS];
size_t group_count;
} loreg_match_t;
typedef struct loreg_regex loreg_regex_t;
loreg_regex_t *loreg_compile(const char *pattern, loreg_error_t *error);
void loreg_free(loreg_regex_t *regex);
bool loreg_match(loreg_regex_t *regex, const char *text, loreg_match_t *result);
bool loreg_search(loreg_regex_t *regex, const char *text, loreg_match_t *result);
const char *loreg_error_string(loreg_error_t error);
#endif

45
include/lorex.h Normal file
View File

@ -0,0 +1,45 @@
/* retoor <retoor@molodetz.nl> */
/*
 * Public interface of the lorex regular-expression engine: compile a
 * pattern into an opaque handle, run it against NUL-terminated text, and
 * release the handle.
 */
#ifndef LOREX_H
#define LOREX_H
#include <stddef.h>
#include <stdbool.h>
/* Library version string. */
#define LOREX_VERSION "1.0.0"
/* NOTE(review): presumably the maximum number of NFA states per pattern
 * (see LOREX_ERR_STATE_OVERFLOW) -- confirm against the NFA builder. */
#define LOREX_MAX_STATES 4096
/* Number of capture-group slots carried in every lorex_match_t. */
#define LOREX_MAX_GROUPS 32
/*
 * Result codes reported by lorex_compile(). LOREX_OK means success; every
 * other value names a failure. lorex_error_string() turns a code into a
 * human-readable message.
 */
typedef enum {
LOREX_OK = 0,
LOREX_ERR_INVALID_PATTERN,
LOREX_ERR_UNBALANCED_PAREN,
LOREX_ERR_EMPTY_GROUP,
LOREX_ERR_INVALID_QUANTIFIER,
LOREX_ERR_INVALID_ESCAPE,
LOREX_ERR_OUT_OF_MEMORY,
LOREX_ERR_STATE_OVERFLOW
} lorex_error_t;
/*
 * Span of one capture group. start/end are only meaningful when matched is
 * true. NOTE(review): presumably offsets into the searched text with end
 * exclusive -- confirm against the matcher.
 */
typedef struct {
size_t start;
size_t end;
bool matched;
} lorex_group_t;
/*
 * Outcome of a match or search attempt: whether anything matched, the span
 * of the overall match, and the per-group spans.
 */
typedef struct {
bool matched;
size_t match_start;
size_t match_end;
lorex_group_t groups[LOREX_MAX_GROUPS];
size_t group_count;
} lorex_match_t;
/* Opaque compiled-pattern handle; create with lorex_compile(), destroy with
 * lorex_free(). */
typedef struct lorex_regex lorex_regex_t;
/* Compile `pattern`. Returns NULL on failure and stores the reason in
 * *error; stores LOREX_OK in *error on success. */
lorex_regex_t *lorex_compile(const char *pattern, lorex_error_t *error);
/* Release a handle returned by lorex_compile(). Passing NULL is a no-op. */
void lorex_free(lorex_regex_t *regex);
/* Attempt a match that starts at offset 0 of `text`; fills *result and
 * returns the matcher's verdict. */
bool lorex_match(lorex_regex_t *regex, const char *text, lorex_match_t *result);
/* Search `text` for the pattern; fills *result and returns the matcher's
 * verdict. */
bool lorex_search(lorex_regex_t *regex, const char *text, lorex_match_t *result);
/* Static, human-readable description of an error code ("unknown error" for
 * values outside the enum). */
const char *lorex_error_string(lorex_error_t error);
#endif

View File

@ -1,9 +1,9 @@
/* retoor <retoor@molodetz.nl> */
#ifndef LOREG_MATCHER_H
#define LOREG_MATCHER_H
#ifndef LOREX_MATCHER_H
#define LOREX_MATCHER_H
#include "nfa.h"
#include "loreg.h"
#include "lorex.h"
typedef struct {
nfa_state_t **states;
@ -20,7 +20,13 @@ void state_set_clear(state_set_t *set);
void state_set_add(state_set_t *set, nfa_state_t *state);
bool state_set_contains(state_set_t *set, nfa_state_t *state);
bool nfa_match(nfa_t *nfa, const char *text, size_t start_pos, loreg_match_t *result);
bool nfa_search(nfa_t *nfa, const char *text, loreg_match_t *result);
typedef struct match_ctx match_ctx_t;
match_ctx_t *match_ctx_create(nfa_t *nfa);
void match_ctx_free(match_ctx_t *ctx);
bool nfa_match(nfa_t *nfa, const char *text, size_t start_pos, lorex_match_t *result);
bool nfa_match_with_ctx(nfa_t *nfa, const char *text, size_t start_pos, lorex_match_t *result, match_ctx_t *ctx);
bool nfa_search(nfa_t *nfa, const char *text, lorex_match_t *result);
#endif

View File

@ -1,9 +1,9 @@
/* retoor <retoor@molodetz.nl> */
#ifndef LOREG_NFA_H
#define LOREG_NFA_H
#ifndef LOREX_NFA_H
#define LOREX_NFA_H
#include "ast.h"
#include "loreg.h"
#include "lorex.h"
#include <stdbool.h>
#include <stddef.h>
@ -30,19 +30,19 @@ typedef enum {
} transition_type_t;
typedef struct {
transition_type_t type;
char value;
nfa_state_t *target;
bracket_class_t *bracket;
transition_type_t type;
int group_id;
char value;
} transition_t;
struct nfa_state {
int id;
bool accepting;
transition_t *transitions;
size_t trans_count;
size_t trans_capacity;
int id;
bool accepting;
};
typedef struct {
@ -52,10 +52,22 @@ typedef struct {
typedef struct {
nfa_state_t **states;
nfa_state_t *start;
char *literal_prefix;
char *literal_suffix;
size_t state_count;
size_t capacity;
nfa_state_t *start;
size_t prefix_len;
size_t suffix_len;
int group_count;
char single_first_char;
bool anchored_start;
bool anchored_end;
bool first_chars_valid;
bool is_pure_literal;
bool has_alt_dispatch;
unsigned char first_chars[32];
unsigned char alt_dispatch[256];
} nfa_t;
nfa_t *nfa_create(void);
@ -64,6 +76,6 @@ nfa_state_t *nfa_add_state(nfa_t *nfa);
void nfa_add_transition(nfa_state_t *from, nfa_state_t *to, transition_type_t type, char value);
void nfa_add_bracket_transition(nfa_state_t *from, nfa_state_t *to, bracket_class_t *bracket);
void nfa_add_group_transition(nfa_state_t *from, nfa_state_t *to, transition_type_t type, int group_id);
nfa_t *nfa_from_ast(ast_node_t *ast, loreg_error_t *error);
nfa_t *nfa_from_ast(ast_node_t *ast, lorex_error_t *error);
#endif

View File

@ -1,20 +1,20 @@
/* retoor <retoor@molodetz.nl> */
#ifndef LOREG_PARSER_H
#define LOREG_PARSER_H
#ifndef LOREX_PARSER_H
#define LOREX_PARSER_H
#include "ast.h"
#include "lexer.h"
#include "loreg.h"
#include "lorex.h"
typedef struct {
lexer_t lexer;
token_t current;
loreg_error_t error;
lorex_error_t error;
int group_count;
} parser_t;
void parser_init(parser_t *parser, const char *pattern);
ast_node_t *parser_parse(parser_t *parser);
loreg_error_t parser_get_error(parser_t *parser);
lorex_error_t parser_get_error(parser_t *parser);
#endif

View File

@ -1,6 +1,6 @@
/* retoor <retoor@molodetz.nl> */
#ifndef LOREG_REPL_H
#define LOREG_REPL_H
#ifndef LOREX_REPL_H
#define LOREX_REPL_H
void repl_run(void);

View File

@ -2,6 +2,7 @@
#include "ast.h"
#include <stdlib.h>
#include <ctype.h>
#include <string.h>
static ast_node_t *ast_create_node(ast_type_t type) {
ast_node_t *node = malloc(sizeof(ast_node_t));
@ -126,6 +127,8 @@ bracket_class_t *bracket_create(void) {
bracket->count = 0;
bracket->capacity = 0;
bracket->negated = false;
memset(bracket->bitmap, 0, 32);
bracket->bitmap_valid = true;
return bracket;
}
@ -149,6 +152,14 @@ void bracket_add_range(bracket_class_t *bracket, char start, char end) {
bracket->ranges[bracket->count].start = start;
bracket->ranges[bracket->count].end = end;
bracket->count++;
if (bracket->bitmap_valid) {
unsigned char s = (unsigned char)start;
unsigned char e = (unsigned char)end;
for (unsigned int c = s; c <= e; c++) {
bracket->bitmap[c >> 3] |= (1u << (c & 7));
}
}
}
void bracket_free(bracket_class_t *bracket) {
@ -158,11 +169,18 @@ void bracket_free(bracket_class_t *bracket) {
}
bool bracket_matches(bracket_class_t *bracket, char c) {
bool found = false;
for (size_t i = 0; i < bracket->count; i++) {
if (c >= bracket->ranges[i].start && c <= bracket->ranges[i].end) {
found = true;
break;
unsigned char uc = (unsigned char)c;
bool found;
if (bracket->bitmap_valid) {
found = (bracket->bitmap[uc >> 3] & (1u << (uc & 7))) != 0;
} else {
found = false;
for (size_t i = 0; i < bracket->count; i++) {
if (c >= bracket->ranges[i].start && c <= bracket->ranges[i].end) {
found = true;
break;
}
}
}
return bracket->negated ? !found : found;

View File

@ -1,71 +0,0 @@
/* retoor <retoor@molodetz.nl> */
#include "loreg.h"
#include "parser.h"
#include "nfa.h"
#include "matcher.h"
#include <stdlib.h>
struct loreg_regex {
nfa_t *nfa;
ast_node_t *ast;
};
loreg_regex_t *loreg_compile(const char *pattern, loreg_error_t *error) {
*error = LOREG_OK;
loreg_regex_t *regex = malloc(sizeof(loreg_regex_t));
if (!regex) {
*error = LOREG_ERR_OUT_OF_MEMORY;
return NULL;
}
parser_t parser;
parser_init(&parser, pattern);
regex->ast = parser_parse(&parser);
*error = parser_get_error(&parser);
if (*error != LOREG_OK) {
ast_free(regex->ast);
free(regex);
return NULL;
}
regex->nfa = nfa_from_ast(regex->ast, error);
if (*error != LOREG_OK) {
ast_free(regex->ast);
free(regex);
return NULL;
}
return regex;
}
void loreg_free(loreg_regex_t *regex) {
if (!regex) return;
nfa_free(regex->nfa);
ast_free(regex->ast);
free(regex);
}
bool loreg_match(loreg_regex_t *regex, const char *text, loreg_match_t *result) {
return nfa_match(regex->nfa, text, 0, result);
}
bool loreg_search(loreg_regex_t *regex, const char *text, loreg_match_t *result) {
return nfa_search(regex->nfa, text, result);
}
const char *loreg_error_string(loreg_error_t error) {
switch (error) {
case LOREG_OK: return "success";
case LOREG_ERR_INVALID_PATTERN: return "invalid pattern";
case LOREG_ERR_UNBALANCED_PAREN: return "unbalanced parentheses";
case LOREG_ERR_EMPTY_GROUP: return "empty group";
case LOREG_ERR_INVALID_QUANTIFIER: return "invalid quantifier";
case LOREG_ERR_INVALID_ESCAPE: return "invalid escape sequence";
case LOREG_ERR_OUT_OF_MEMORY: return "out of memory";
case LOREG_ERR_STATE_OVERFLOW: return "state overflow";
default: return "unknown error";
}
}

71
src/lorex.c Normal file
View File

@ -0,0 +1,71 @@
/* retoor <retoor@molodetz.nl> */
#include "lorex.h"
#include "parser.h"
#include "nfa.h"
#include "matcher.h"
#include <stdlib.h>
/*
 * Concrete layout of the opaque lorex_regex_t handle. The handle owns both
 * members: lorex_free() releases the NFA, then the AST, then the struct.
 */
struct lorex_regex {
/* Automaton built from `ast` by nfa_from_ast(); what match/search run on. */
nfa_t *nfa;
/* Parse tree produced by parser_parse(); kept alive for the handle's
 * lifetime. NOTE(review): presumably because the NFA references AST-owned
 * data (e.g. bracket classes) -- confirm in nfa.c. */
ast_node_t *ast;
};
/*
 * Compile `pattern` into a regex handle.
 *
 * The pattern is parsed into an AST and the AST is converted into an NFA;
 * the returned handle owns both and must be released with lorex_free().
 *
 * pattern  NUL-terminated pattern text. NULL is rejected with
 *          LOREX_ERR_INVALID_PATTERN instead of being handed to the parser.
 * error    receives LOREX_OK on success or the failure reason otherwise.
 *          May be NULL when the caller does not need the code.
 *
 * Returns the new handle, or NULL on any failure.
 */
lorex_regex_t *lorex_compile(const char *pattern, lorex_error_t *error) {
    /* Let callers pass error == NULL: route all writes to a local sink. */
    lorex_error_t ignored;
    if (!error) {
        error = &ignored;
    }
    *error = LOREX_OK;

    if (!pattern) {
        *error = LOREX_ERR_INVALID_PATTERN;
        return NULL;
    }

    lorex_regex_t *regex = malloc(sizeof(lorex_regex_t));
    if (!regex) {
        *error = LOREX_ERR_OUT_OF_MEMORY;
        return NULL;
    }

    /* Stage 1: pattern text -> AST. */
    parser_t parser;
    parser_init(&parser, pattern);
    regex->ast = parser_parse(&parser);
    *error = parser_get_error(&parser);
    if (*error != LOREX_OK) {
        ast_free(regex->ast);
        free(regex);
        return NULL;
    }

    /* Stage 2: AST -> NFA. nfa_from_ast reports failure through *error. */
    regex->nfa = nfa_from_ast(regex->ast, error);
    if (*error != LOREX_OK) {
        ast_free(regex->ast);
        free(regex);
        return NULL;
    }

    return regex;
}
/*
 * Destroy a handle created by lorex_compile(): the NFA is released first,
 * then the AST, then the handle itself. A NULL handle is ignored.
 */
void lorex_free(lorex_regex_t *regex) {
    if (regex != NULL) {
        nfa_free(regex->nfa);
        ast_free(regex->ast);
        free(regex);
    }
}
/*
 * Attempt a match that begins at offset 0 of `text`.
 *
 * regex   handle from lorex_compile().
 * text    NUL-terminated subject string.
 * result  filled in by the matcher.
 *
 * Returns the matcher's verdict. A NULL `regex` or `text` is reported as
 * "no match" (and result->matched is cleared when `result` is non-NULL)
 * rather than being passed on: nfa_match() dereferences the NFA and calls
 * strlen(text) unconditionally.
 */
bool lorex_match(lorex_regex_t *regex, const char *text, lorex_match_t *result) {
    if (!regex || !text) {
        if (result) {
            result->matched = false;
        }
        return false;
    }
    return nfa_match(regex->nfa, text, 0, result);
}
/*
 * Search `text` for the compiled pattern by delegating to nfa_search().
 *
 * regex   handle from lorex_compile().
 * text    NUL-terminated subject string.
 * result  filled in by the matcher.
 *
 * Returns the matcher's verdict. A NULL `regex` or `text` is reported as
 * "no match" (and result->matched is cleared when `result` is non-NULL)
 * instead of dereferencing a NULL handle here or handing a NULL string to
 * the matcher.
 */
bool lorex_search(lorex_regex_t *regex, const char *text, lorex_match_t *result) {
    if (!regex || !text) {
        if (result) {
            result->matched = false;
        }
        return false;
    }
    return nfa_search(regex->nfa, text, result);
}
/*
 * Translate an error code into a static, human-readable message.
 * Any value that is not one of the lorex_error_t enumerators yields
 * "unknown error". The returned string must not be freed.
 */
const char *lorex_error_string(lorex_error_t error) {
    /* Message table indexed directly by enumerator value. */
    static const char *const messages[] = {
        [LOREX_OK]                     = "success",
        [LOREX_ERR_INVALID_PATTERN]    = "invalid pattern",
        [LOREX_ERR_UNBALANCED_PAREN]   = "unbalanced parentheses",
        [LOREX_ERR_EMPTY_GROUP]        = "empty group",
        [LOREX_ERR_INVALID_QUANTIFIER] = "invalid quantifier",
        [LOREX_ERR_INVALID_ESCAPE]     = "invalid escape sequence",
        [LOREX_ERR_OUT_OF_MEMORY]      = "out of memory",
        [LOREX_ERR_STATE_OVERFLOW]     = "state overflow",
    };
    const size_t known = sizeof(messages) / sizeof(messages[0]);
    /* Casting through size_t also sends negative values out of range. */
    const size_t index = (size_t)error;

    if (index < known && messages[index] != NULL) {
        return messages[index];
    }
    return "unknown error";
}

View File

@ -1,5 +1,5 @@
/* retoor <retoor@molodetz.nl> */
#include "loreg.h"
#include "lorex.h"
#include "repl.h"
#include <stdio.h>
#include <string.h>
@ -20,10 +20,10 @@ static void print_usage(const char *program) {
}
static void print_version(void) {
printf("loreg %s\n", LOREG_VERSION);
printf("lorex %s\n", LOREX_VERSION);
}
static void print_match(const char *text, loreg_match_t *result) {
static void print_match(const char *text, lorex_match_t *result) {
if (!result->matched) {
printf("no match\n");
return;
@ -86,22 +86,22 @@ int main(int argc, char *argv[]) {
const char *pattern = argv[arg_idx];
const char *text = argv[arg_idx + 1];
loreg_error_t error;
loreg_regex_t *regex = loreg_compile(pattern, &error);
lorex_error_t error;
lorex_regex_t *regex = lorex_compile(pattern, &error);
if (!regex) {
fprintf(stderr, "error: %s\n", loreg_error_string(error));
fprintf(stderr, "error: %s\n", lorex_error_string(error));
return 1;
}
loreg_match_t result;
lorex_match_t result;
if (match_mode) {
loreg_match(regex, text, &result);
lorex_match(regex, text, &result);
} else {
loreg_search(regex, text, &result);
lorex_search(regex, text, &result);
}
print_match(text, &result);
loreg_free(regex);
lorex_free(regex);
return result.matched ? 0 : 1;
}

View File

@ -3,6 +3,33 @@
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
#include <stdint.h>
/*
 * Helpers for a bit set stored as an array of bytes: bit `id` lives in
 * byte `id >> 3` at bit position `id & 7`. Arguments are evaluated more
 * than once, so pass expressions without side effects.
 */
/* Set bit `id` in bitmap `bm`. */
#define BITMAP_SET(bm, id) ((bm)[(id) >> 3] |= (1u << ((id) & 7)))
/* Non-zero when bit `id` is set in bitmap `bm` (the masked bit, not 0/1). */
#define BITMAP_GET(bm, id) ((bm)[(id) >> 3] & (1u << ((id) & 7)))
/* Clear bit `id` in bitmap `bm`. */
#define BITMAP_CLR(bm, id) ((bm)[(id) >> 3] &= ~(1u << ((id) & 7)))
/* Number of bytes needed to hold `n` bits (rounded up). */
#define BITMAP_SIZE(n) (((n) + 7) >> 3)
/*
 * 256-bit membership bitmap for the digit class: bit c (byte c >> 3, bit
 * c & 7) is set exactly for the byte values '0'..'9' (48..57). Entries not
 * listed are zero.
 */
static const uint8_t char_class_digit[32] = {
    [6] = 0xFF, /* '0'..'7' (48..55) */
    [7] = 0x03, /* '8', '9' (56, 57) */
};
/*
 * 256-bit membership bitmap for the word class: bit c (byte c >> 3, bit
 * c & 7) is set exactly for '0'..'9', 'A'..'Z', '_' and 'a'..'z'. Entries
 * not listed are zero.
 */
static const uint8_t char_class_word[32] = {
    [6]  = 0xFF, /* '0'..'7' */
    [7]  = 0x03, /* '8', '9' */
    [8]  = 0xFE, /* 'A'..'G' */
    [9]  = 0xFF, /* 'H'..'O' */
    [10] = 0xFF, /* 'P'..'W' */
    [11] = 0x87, /* 'X'..'Z' and '_' */
    [12] = 0xFE, /* 'a'..'g' */
    [13] = 0xFF, /* 'h'..'o' */
    [14] = 0xFF, /* 'p'..'w' */
    [15] = 0x07, /* 'x'..'z' */
};
/*
 * 256-bit membership bitmap for the whitespace class: bit c (byte c >> 3,
 * bit c & 7) is set exactly for the six whitespace bytes
 *   '\t' (9), '\n' (10), '\v' (11), '\f' (12), '\r' (13) and ' ' (32).
 *
 * Byte 1 covers values 8..15, so bits 1..5 must all be set: 0x3E. A value
 * of 0x26 would drop '\v' and '\f' and make the whitespace class narrower
 * than the comparison-based is_space() it stands in for.
 */
static const uint8_t char_class_space[32] = {
    0x00, 0x3E, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00,
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
};
state_set_t *state_set_create(size_t initial_capacity, int group_count) {
state_set_t *set = malloc(sizeof(state_set_t));
@ -78,27 +105,26 @@ bool state_set_contains(state_set_t *set, nfa_state_t *state) {
return false;
}
static bool is_digit(char c) {
return c >= '0' && c <= '9';
static inline bool is_digit(unsigned char c) {
return (char_class_digit[c >> 3] & (1u << (c & 7))) != 0;
}
static bool is_word(char c) {
return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') ||
(c >= '0' && c <= '9') || c == '_';
static inline bool is_word(unsigned char c) {
return (char_class_word[c >> 3] & (1u << (c & 7))) != 0;
}
static bool is_space(char c) {
return c == ' ' || c == '\t' || c == '\n' || c == '\r' || c == '\f' || c == '\v';
static inline bool is_space(unsigned char c) {
return (char_class_space[c >> 3] & (1u << (c & 7))) != 0;
}
static bool transition_matches(transition_t *t, char c, size_t pos, size_t len) {
static inline bool transition_matches(const transition_t * restrict t, unsigned char c, size_t pos, size_t len) {
switch (t->type) {
case TRANS_CHAR:
return t->value == c;
return (unsigned char)t->value == c;
case TRANS_DOT:
return c != '\n' && c != '\0';
case TRANS_BRACKET:
return bracket_matches(t->bracket, c);
return bracket_matches(t->bracket, (char)c);
case TRANS_CLASS_DIGIT:
return is_digit(c);
case TRANS_CLASS_WORD:
@ -131,9 +157,25 @@ typedef struct {
size_t count;
size_t capacity;
int group_count;
uint8_t *state_bitmap;
size_t bitmap_size;
size_t *scratch_starts;
size_t *scratch_ends;
} thread_list_t;
static thread_list_t *thread_list_create(size_t capacity, int group_count) {
/*
 * Reusable scratch state for matching, allocated once by match_ctx_create()
 * and returned to a clean state by match_ctx_reset(), so repeated match
 * attempts need not reallocate these buffers.
 */
struct match_ctx {
/* Two thread lists, each created with capacity num_states.
 * NOTE(review): presumably the active set for the current input position
 * and the set being built for the next one -- confirm in
 * nfa_match_with_ctx. */
thread_list_t *current;
thread_list_t *next;
/* One uint32_t per NFA state, zeroed on reset; follow_epsilons() compares
 * each entry against a generation number to detect already-visited states. */
uint32_t *visited;
/* Capture-offset arrays of group_count entries each; reset fills every
 * entry with (size_t)-1. */
size_t *init_starts;
size_t *init_ends;
size_t *best_starts;
size_t *best_ends;
/* Length of the four capture arrays: the NFA's group count, or 1 when the
 * NFA has no groups. */
int group_count;
/* Number of NFA states at creation time; length of `visited`. */
size_t num_states;
};
static thread_list_t *thread_list_create(size_t capacity, int group_count, size_t num_states) {
thread_list_t *list = malloc(sizeof(thread_list_t));
if (!list) return NULL;
@ -143,6 +185,30 @@ static thread_list_t *thread_list_create(size_t capacity, int group_count) {
return NULL;
}
list->bitmap_size = BITMAP_SIZE(num_states);
list->state_bitmap = calloc(list->bitmap_size, 1);
if (!list->state_bitmap) {
free(list->threads);
free(list);
return NULL;
}
if (group_count > 0) {
list->scratch_starts = malloc(group_count * sizeof(size_t));
list->scratch_ends = malloc(group_count * sizeof(size_t));
if (!list->scratch_starts || !list->scratch_ends) {
free(list->scratch_starts);
free(list->scratch_ends);
free(list->state_bitmap);
free(list->threads);
free(list);
return NULL;
}
} else {
list->scratch_starts = NULL;
list->scratch_ends = NULL;
}
for (size_t i = 0; i < capacity; i++) {
if (group_count > 0) {
list->threads[i].group_starts = malloc(group_count * sizeof(size_t));
@ -152,6 +218,9 @@ static thread_list_t *thread_list_create(size_t capacity, int group_count) {
free(list->threads[j].group_starts);
free(list->threads[j].group_ends);
}
free(list->scratch_starts);
free(list->scratch_ends);
free(list->state_bitmap);
free(list->threads);
free(list);
return NULL;
@ -174,19 +243,76 @@ static void thread_list_free(thread_list_t *list) {
free(list->threads[i].group_starts);
free(list->threads[i].group_ends);
}
free(list->scratch_starts);
free(list->scratch_ends);
free(list->state_bitmap);
free(list->threads);
free(list);
}
static void thread_list_clear(thread_list_t *list);
/*
 * Allocate a match context sized for `nfa`.
 *
 * Both thread lists get capacity for every NFA state, `visited` gets one
 * zeroed slot per state, and the four capture arrays get one zeroed slot
 * per group (at least one slot even when the NFA has no groups).
 *
 * Returns the new context, or NULL if any allocation fails; in that case
 * everything allocated so far is released through match_ctx_free().
 */
match_ctx_t *match_ctx_create(nfa_t *nfa) {
    match_ctx_t *ctx = malloc(sizeof(*ctx));
    if (ctx == NULL) {
        return NULL;
    }

    const size_t states = nfa->state_count;
    const int groups = (nfa->group_count > 0) ? nfa->group_count : 1;

    ctx->num_states = states;
    ctx->group_count = groups;

    /* Fill in every pointer first so the failure path below can hand the
     * whole struct to match_ctx_free() without touching garbage. */
    ctx->current     = thread_list_create(states, groups, states);
    ctx->next        = thread_list_create(states, groups, states);
    ctx->visited     = calloc(states, sizeof(uint32_t));
    ctx->init_starts = calloc(groups, sizeof(size_t));
    ctx->init_ends   = calloc(groups, sizeof(size_t));
    ctx->best_starts = calloc(groups, sizeof(size_t));
    ctx->best_ends   = calloc(groups, sizeof(size_t));

    if (!(ctx->current && ctx->next && ctx->visited &&
          ctx->init_starts && ctx->init_ends &&
          ctx->best_starts && ctx->best_ends)) {
        match_ctx_free(ctx);
        return NULL;
    }

    return ctx;
}
/*
 * Release a match context and every buffer it holds: both thread lists,
 * the visited array and the four capture arrays. A NULL context is ignored.
 */
void match_ctx_free(match_ctx_t *ctx) {
    if (ctx != NULL) {
        thread_list_free(ctx->current);
        thread_list_free(ctx->next);

        free(ctx->visited);

        free(ctx->init_starts);
        free(ctx->init_ends);
        free(ctx->best_starts);
        free(ctx->best_ends);

        free(ctx);
    }
}
/*
 * Return a context to its pristine state before a new match attempt:
 * empty both thread lists, zero the visited array, and mark every capture
 * slot as unset with the sentinel (size_t)-1.
 */
static void match_ctx_reset(match_ctx_t *ctx) {
    const size_t unset = (size_t)-1;
    int slot;

    thread_list_clear(ctx->current);
    thread_list_clear(ctx->next);

    memset(ctx->visited, 0, sizeof(ctx->visited[0]) * ctx->num_states);

    for (slot = 0; slot < ctx->group_count; ++slot) {
        ctx->init_starts[slot] = unset;
        ctx->init_ends[slot]   = unset;
        ctx->best_starts[slot] = unset;
        ctx->best_ends[slot]   = unset;
    }
}
/*
 * Empty a thread list. Only the bitmap bits belonging to states currently
 * in the list are cleared, so the cost follows the number of queued
 * threads rather than the size of the bitmap.
 */
static void thread_list_clear(thread_list_t *list) {
    size_t remaining = list->count;

    /* Walk the queued threads back to front; clearing order is irrelevant. */
    while (remaining > 0) {
        remaining--;
        BITMAP_CLR(list->state_bitmap, list->threads[remaining].state->id);
    }

    list->count = 0;
}
static bool thread_list_contains_state(thread_list_t *list, nfa_state_t *state) {
for (size_t i = 0; i < list->count; i++) {
if (list->threads[i].state == state) return true;
}
return false;
static inline bool thread_list_contains_state(const thread_list_t * restrict list, const nfa_state_t * restrict state) {
return BITMAP_GET(list->state_bitmap, state->id) != 0;
}
static void add_thread(thread_list_t *list, nfa_state_t *state,
@ -194,44 +320,36 @@ static void add_thread(thread_list_t *list, nfa_state_t *state,
static void follow_epsilons(thread_list_t *list, nfa_state_t *state,
size_t *group_starts, size_t *group_ends,
size_t pos, size_t len, bool *visited) {
if (!state || visited[state->id]) return;
visited[state->id] = true;
size_t pos, size_t len, uint32_t *visited, uint32_t gen) {
if (!state || visited[state->id] == gen) return;
visited[state->id] = gen;
for (size_t i = 0; i < state->trans_count; i++) {
transition_t *t = &state->transitions[i];
if (t->type == TRANS_EPSILON) {
follow_epsilons(list, t->target, group_starts, group_ends,
pos, len, visited);
pos, len, visited, gen);
} else if (t->type == TRANS_GROUP_START) {
size_t *new_starts = malloc(list->group_count * sizeof(size_t));
size_t *new_ends = malloc(list->group_count * sizeof(size_t));
if (new_starts && new_ends) {
memcpy(new_starts, group_starts, list->group_count * sizeof(size_t));
memcpy(new_ends, group_ends, list->group_count * sizeof(size_t));
new_starts[t->group_id] = pos;
follow_epsilons(list, t->target, new_starts, new_ends,
pos, len, visited);
}
free(new_starts);
free(new_ends);
size_t *scratch_s = list->scratch_starts;
size_t *scratch_e = list->scratch_ends;
memcpy(scratch_s, group_starts, list->group_count * sizeof(size_t));
memcpy(scratch_e, group_ends, list->group_count * sizeof(size_t));
scratch_s[t->group_id] = pos;
follow_epsilons(list, t->target, scratch_s, scratch_e,
pos, len, visited, gen);
} else if (t->type == TRANS_GROUP_END) {
size_t *new_starts = malloc(list->group_count * sizeof(size_t));
size_t *new_ends = malloc(list->group_count * sizeof(size_t));
if (new_starts && new_ends) {
memcpy(new_starts, group_starts, list->group_count * sizeof(size_t));
memcpy(new_ends, group_ends, list->group_count * sizeof(size_t));
new_ends[t->group_id] = pos;
follow_epsilons(list, t->target, new_starts, new_ends,
pos, len, visited);
}
free(new_starts);
free(new_ends);
size_t *scratch_s = list->scratch_starts;
size_t *scratch_e = list->scratch_ends;
memcpy(scratch_s, group_starts, list->group_count * sizeof(size_t));
memcpy(scratch_e, group_ends, list->group_count * sizeof(size_t));
scratch_e[t->group_id] = pos;
follow_epsilons(list, t->target, scratch_s, scratch_e,
pos, len, visited, gen);
} else if (t->type == TRANS_ANCHOR_START || t->type == TRANS_ANCHOR_END) {
if (transition_matches(t, '\0', pos, len)) {
follow_epsilons(list, t->target, group_starts, group_ends,
pos, len, visited);
pos, len, visited, gen);
}
}
}
@ -243,9 +361,9 @@ static void add_thread(thread_list_t *list, nfa_state_t *state,
size_t *group_starts, size_t *group_ends) {
if (!state) return;
if (thread_list_contains_state(list, state)) return;
if (list->count >= list->capacity) return;
BITMAP_SET(list->state_bitmap, state->id);
thread_t *thread = &list->threads[list->count++];
thread->state = state;
if (list->group_count > 0) {
@ -254,14 +372,15 @@ static void add_thread(thread_list_t *list, nfa_state_t *state,
}
}
bool nfa_match(nfa_t *nfa, const char *text, size_t start_pos, loreg_match_t *result) {
bool nfa_match(nfa_t *nfa, const char *text, size_t start_pos, lorex_match_t *result) {
size_t len = strlen(text);
size_t num_states = nfa->state_count;
int group_count = nfa->group_count > 0 ? nfa->group_count : 1;
thread_list_t *current = thread_list_create(num_states, group_count);
thread_list_t *next = thread_list_create(num_states, group_count);
bool *visited = calloc(num_states, sizeof(bool));
thread_list_t *current = thread_list_create(num_states, group_count, num_states);
thread_list_t *next = thread_list_create(num_states, group_count, num_states);
uint32_t *visited = calloc(num_states, sizeof(uint32_t));
uint32_t generation = 1;
if (!current || !next || !visited) {
thread_list_free(current);
@ -286,9 +405,8 @@ bool nfa_match(nfa_t *nfa, const char *text, size_t start_pos, loreg_match_t *re
init_ends[i] = (size_t)-1;
}
memset(visited, 0, num_states * sizeof(bool));
follow_epsilons(current, nfa->start, init_starts, init_ends,
start_pos, len, visited);
start_pos, len, visited, generation++);
bool matched = false;
size_t match_end = start_pos;
@ -322,27 +440,29 @@ bool nfa_match(nfa_t *nfa, const char *text, size_t start_pos, loreg_match_t *re
}
for (size_t pos = start_pos; pos < len; pos++) {
char c = text[pos];
unsigned char c = (unsigned char)text[pos];
thread_list_clear(next);
for (size_t i = 0; i < current->count; i++) {
thread_t *thread = &current->threads[i];
nfa_state_t *state = thread->state;
size_t trans_count = state->trans_count;
transition_t *transitions = state->transitions;
for (size_t j = 0; j < state->trans_count; j++) {
transition_t *t = &state->transitions[j];
for (size_t j = 0; j < trans_count; j++) {
transition_t *t = &transitions[j];
transition_type_t type = t->type;
if (t->type != TRANS_EPSILON &&
t->type != TRANS_GROUP_START &&
t->type != TRANS_GROUP_END &&
t->type != TRANS_ANCHOR_START &&
t->type != TRANS_ANCHOR_END) {
if (type != TRANS_EPSILON &&
type != TRANS_GROUP_START &&
type != TRANS_GROUP_END &&
type != TRANS_ANCHOR_START &&
type != TRANS_ANCHOR_END) {
if (transition_matches(t, c, pos, len)) {
memset(visited, 0, num_states * sizeof(bool));
follow_epsilons(next, t->target,
thread->group_starts, thread->group_ends,
pos + 1, len, visited);
pos + 1, len, visited, generation++);
}
}
}
@ -371,7 +491,7 @@ bool nfa_match(nfa_t *nfa, const char *text, size_t start_pos, loreg_match_t *re
result->match_end = matched ? match_end : start_pos;
result->group_count = nfa->group_count;
for (int i = 0; i < LOREG_MAX_GROUPS && i < nfa->group_count; i++) {
for (int i = 0; i < LOREX_MAX_GROUPS && i < nfa->group_count; i++) {
result->groups[i].start = best_starts[i];
result->groups[i].end = best_ends[i];
result->groups[i].matched = (best_starts[i] != (size_t)-1 && best_ends[i] != (size_t)-1);
@ -389,23 +509,259 @@ bool nfa_match(nfa_t *nfa, const char *text, size_t start_pos, loreg_match_t *re
return matched;
}
bool nfa_search(nfa_t *nfa, const char *text, loreg_match_t *result) {
bool nfa_match_with_ctx(nfa_t *nfa, const char *text, size_t start_pos, lorex_match_t *result, match_ctx_t *ctx) {
size_t len = strlen(text);
int group_count = ctx->group_count;
uint32_t generation = 1;
for (size_t i = 0; i <= len; i++) {
if (nfa_match(nfa, text, i, result)) {
if (result) {
result->match_start = i;
}
return true;
match_ctx_reset(ctx);
follow_epsilons(ctx->current, nfa->start, ctx->init_starts, ctx->init_ends,
start_pos, len, ctx->visited, generation++);
bool matched = false;
size_t match_end = start_pos;
for (size_t i = 0; i < ctx->current->count; i++) {
if (ctx->current->threads[i].state->accepting) {
matched = true;
match_end = start_pos;
memcpy(ctx->best_starts, ctx->current->threads[i].group_starts, group_count * sizeof(size_t));
memcpy(ctx->best_ends, ctx->current->threads[i].group_ends, group_count * sizeof(size_t));
break;
}
}
thread_list_t *current = ctx->current;
thread_list_t *next = ctx->next;
for (size_t pos = start_pos; pos < len; pos++) {
unsigned char c = (unsigned char)text[pos];
thread_list_clear(next);
for (size_t i = 0; i < current->count; i++) {
thread_t *thread = &current->threads[i];
nfa_state_t *state = thread->state;
size_t trans_count = state->trans_count;
transition_t *transitions = state->transitions;
for (size_t j = 0; j < trans_count; j++) {
transition_t *t = &transitions[j];
transition_type_t type = t->type;
if (type != TRANS_EPSILON &&
type != TRANS_GROUP_START &&
type != TRANS_GROUP_END &&
type != TRANS_ANCHOR_START &&
type != TRANS_ANCHOR_END) {
if (transition_matches(t, c, pos, len)) {
follow_epsilons(next, t->target,
thread->group_starts, thread->group_ends,
pos + 1, len, ctx->visited, generation++);
}
}
}
}
if (next->count == 0) break;
thread_list_t *tmp = current;
current = next;
next = tmp;
for (size_t i = 0; i < current->count; i++) {
if (current->threads[i].state->accepting) {
matched = true;
match_end = pos + 1;
memcpy(ctx->best_starts, current->threads[i].group_starts, group_count * sizeof(size_t));
memcpy(ctx->best_ends, current->threads[i].group_ends, group_count * sizeof(size_t));
break;
}
}
}
if (result) {
result->matched = matched;
result->match_start = start_pos;
result->match_end = matched ? match_end : start_pos;
result->group_count = nfa->group_count;
for (int i = 0; i < LOREX_MAX_GROUPS && i < nfa->group_count; i++) {
result->groups[i].start = ctx->best_starts[i];
result->groups[i].end = ctx->best_ends[i];
result->groups[i].matched = (ctx->best_starts[i] != (size_t)-1 && ctx->best_ends[i] != (size_t)-1);
}
}
return matched;
}
/* Puts *result into the "nothing matched" state; a NULL result is ignored. */
static void set_no_match(lorex_match_t *result) {
    if (result == NULL) {
        return;
    }

    result->group_count = 0;
    result->match_end = 0;
    result->match_start = 0;
    result->matched = false;
}
/*
 * Finds the first offset in text at which the pattern matches.
 *
 * nfa:    compiled automaton together with the search hints stored on it.
 * text:   NUL-terminated subject string.
 * result: optional; on success receives the match span, on failure is reset
 *         with set_no_match().
 *
 * Returns true when a match was found. A NULL nfa or text, or a failed
 * match-context allocation, is reported as "no match".
 *
 * Exactly one scan strategy is used, chosen in this order:
 *   1. start-anchored pattern: a single attempt at offset 0;
 *   2. pure literal: strstr on the literal;
 *   3. end-anchored pattern with a literal suffix: the text must end with
 *      the suffix, then start offsets are tried left to right;
 *   4. alternation dispatch table: only offsets whose byte is enabled;
 *   5. literal prefix of two or more bytes: strstr to find candidates;
 *   6. single possible first byte: memchr to find candidates;
 *   7. non-empty first-byte bitmap: only offsets whose byte is in the set;
 *   8. otherwise every offset from 0 to len inclusive.
 */
bool nfa_search(nfa_t *nfa, const char *text, lorex_match_t *result) {
    if (!nfa || !text) {
        set_no_match(result);
        return false;
    }

    size_t len = strlen(text);

    if (nfa->anchored_start) {
        if (nfa_match(nfa, text, 0, result)) {
            if (result) {
                result->match_start = 0;
            }
            return true;
        }
        set_no_match(result);
        return false;
    }

    if (nfa->is_pure_literal && nfa->literal_prefix && nfa->prefix_len > 0) {
        /*
         * NOTE(review): literal_prefix is filled through a fixed-size buffer
         * in nfa_from_ast; a pure literal longer than that buffer would be
         * truncated and this path would report a match on the truncated
         * text. Confirm the length limit is enforced at compile time.
         */
        const char *found = strstr(text, nfa->literal_prefix);
        if (!found) {
            set_no_match(result);
            return false;
        }
        if (result) {
            result->matched = true;
            result->match_start = (size_t)(found - text);
            result->match_end = result->match_start + nfa->prefix_len;
            result->group_count = 0;
        }
        return true;
    }

    /* Cheap rejection first: no allocation is needed to see that the text
     * cannot end with the required literal suffix. */
    bool use_suffix = nfa->anchored_end && nfa->suffix_len > 0;
    if (use_suffix) {
        if (len < nfa->suffix_len ||
            memcmp(text + (len - nfa->suffix_len), nfa->literal_suffix,
                   nfa->suffix_len) != 0) {
            set_no_match(result);
            return false;
        }
    }

    match_ctx_t *ctx = match_ctx_create(nfa);
    if (!ctx) {
        set_no_match(result);
        return false;
    }

    /* The bitmap strategy is only usable when at least one byte is set. */
    bool use_bitmap = false;
    if (nfa->first_chars_valid) {
        for (int i = 0; i < 32; i++) {
            if (nfa->first_chars[i]) {
                use_bitmap = true;
                break;
            }
        }
    }

    bool matched = false;
    size_t hit = 0;

    if (use_suffix) {
        /* A match must contain the suffix, so it cannot start after it.
         * Offsets are tried in increasing order so that the leftmost match
         * is reported, like every other strategy in this function. */
        size_t last_start = len - nfa->suffix_len;
        for (size_t i = 0; i <= last_start; i++) {
            if (nfa_match_with_ctx(nfa, text, i, result, ctx)) {
                matched = true;
                hit = i;
                break;
            }
        }
    } else if (nfa->has_alt_dispatch) {
        for (size_t i = 0; i < len; i++) {
            unsigned char c = (unsigned char)text[i];
            if (nfa->alt_dispatch[c] == 255) {
                continue; /* no alternative can start with this byte */
            }
            if (nfa_match_with_ctx(nfa, text, i, result, ctx)) {
                matched = true;
                hit = i;
                break;
            }
        }
    } else if (nfa->prefix_len >= 2 && nfa->literal_prefix) {
        /* Guard the subtraction: forming text + len - prefix_len when the
         * text is shorter than the prefix would point before the buffer. */
        if (len >= nfa->prefix_len) {
            const char *cursor = text;
            const char *last = text + (len - nfa->prefix_len);
            while (cursor <= last) {
                cursor = strstr(cursor, nfa->literal_prefix);
                if (!cursor) {
                    break;
                }
                size_t offset = (size_t)(cursor - text);
                if (nfa_match_with_ctx(nfa, text, offset, result, ctx)) {
                    matched = true;
                    hit = offset;
                    break;
                }
                cursor++;
            }
        }
    } else if (nfa->single_first_char != 0) {
        const char *cursor = text;
        const char *stop = text + len;
        while (cursor < stop) {
            cursor = memchr(cursor, nfa->single_first_char, (size_t)(stop - cursor));
            if (!cursor) {
                break;
            }
            size_t offset = (size_t)(cursor - text);
            if (nfa_match_with_ctx(nfa, text, offset, result, ctx)) {
                matched = true;
                hit = offset;
                break;
            }
            cursor++;
        }
    } else if (use_bitmap) {
        for (size_t i = 0; i < len; i++) {
            unsigned char c = (unsigned char)text[i];
            if (!(nfa->first_chars[c >> 3] & (1u << (c & 7)))) {
                continue;
            }
            if (nfa_match_with_ctx(nfa, text, i, result, ctx)) {
                matched = true;
                hit = i;
                break;
            }
        }
    } else {
        /* No usable hint: try every offset, including len itself so that a
         * pattern able to match the empty string can match at the end. */
        for (size_t i = 0; i <= len; i++) {
            if (nfa_match_with_ctx(nfa, text, i, result, ctx)) {
                matched = true;
                hit = i;
                break;
            }
        }
    }

    match_ctx_free(ctx);

    if (!matched) {
        set_no_match(result);
        return false;
    }
    if (result) {
        result->match_start = hit;
    }
    return true;
}

406
src/nfa.c
View File

@ -11,9 +11,282 @@ nfa_t *nfa_create(void) {
nfa->capacity = 0;
nfa->start = NULL;
nfa->group_count = 0;
nfa->anchored_start = false;
nfa->anchored_end = false;
memset(nfa->first_chars, 0, 32);
nfa->first_chars_valid = false;
nfa->literal_prefix = NULL;
nfa->prefix_len = 0;
nfa->is_pure_literal = false;
nfa->single_first_char = 0;
nfa->literal_suffix = NULL;
nfa->suffix_len = 0;
memset(nfa->alt_dispatch, 255, 256);
nfa->has_alt_dispatch = false;
return nfa;
}
/*
 * Reports whether the pattern begins with a start anchor, looking through
 * the left side of concatenations and the content of groups.
 */
static bool ast_starts_with_anchor(ast_node_t *ast) {
    ast_node_t *node = ast;

    while (node != NULL) {
        if (node->type == AST_ANCHOR_START) {
            return true;
        }
        if (node->type != AST_CONCAT && node->type != AST_GROUP) {
            return false;
        }
        /* Both node kinds keep the relevant child in ->left. */
        node = node->left;
    }
    return false;
}
/*
 * Reports whether the pattern finishes with an end anchor, looking through
 * the right side of concatenations and the content of groups.
 */
static bool ast_ends_with_anchor(ast_node_t *ast) {
    for (ast_node_t *node = ast; node != NULL; ) {
        if (node->type == AST_ANCHOR_END) {
            return true;
        }
        if (node->type == AST_CONCAT) {
            node = node->right;
        } else if (node->type == AST_GROUP) {
            node = node->left;
        } else {
            return false;
        }
    }
    return false;
}
/* Sets the bit for byte c in a 256-bit (32-byte) bitmap. */
static void first_chars_mark(unsigned char *bitmap, unsigned char c) {
    bitmap[c >> 3] |= (unsigned char)(1u << (c & 7));
}

/*
 * Collects the set of bytes with which a match of ast can begin.
 *
 * ast:    pattern subtree to inspect.
 * bitmap: 256-bit set (32 bytes); bits are only ever added.
 * valid:  cleared to false whenever the set cannot be determined, e.g. the
 *         subtree may match the empty string or accepts an open-ended range
 *         of bytes. It is never set back to true here.
 *
 * The set may be larger than strictly necessary but must never miss a byte,
 * because the search skips every offset whose byte is not in it.
 */
static void extract_first_chars(ast_node_t *ast, unsigned char *bitmap, bool *valid) {
    if (!ast) {
        *valid = false;
        return;
    }
    switch (ast->type) {
        case AST_CHAR:
            first_chars_mark(bitmap, (unsigned char)ast->value);
            break;
        case AST_DOT:
            *valid = false;
            break;
        case AST_CONCAT:
            /* An anchor consumes no input, so the first byte of the match
             * comes from whatever follows it. */
            if (ast->left &&
                (ast->left->type == AST_ANCHOR_START || ast->left->type == AST_ANCHOR_END)) {
                extract_first_chars(ast->right, bitmap, valid);
            } else {
                extract_first_chars(ast->left, bitmap, valid);
            }
            break;
        case AST_ALTER:
            extract_first_chars(ast->left, bitmap, valid);
            extract_first_chars(ast->right, bitmap, valid);
            break;
        case AST_STAR:
        case AST_QUESTION:
            /* May match nothing: the first byte belongs to what follows. */
            *valid = false;
            break;
        case AST_PLUS:
            extract_first_chars(ast->left, bitmap, valid);
            break;
        case AST_GROUP:
            extract_first_chars(ast->left, bitmap, valid);
            break;
        case AST_ANCHOR_START:
        case AST_ANCHOR_END:
            /* Reached as the leading element of a branch (for example the
             * "^" in "(^|a)b"): the following byte is unknown from here, so
             * leaving the set untouched would wrongly exclude it. */
            *valid = false;
            break;
        case AST_BRACKET:
            if (ast->bracket && !ast->bracket->negated) {
                for (size_t i = 0; i < ast->bracket->count; i++) {
                    unsigned char s = (unsigned char)ast->bracket->ranges[i].start;
                    unsigned char e = (unsigned char)ast->bracket->ranges[i].end;
                    for (unsigned int c = s; c <= e; c++) {
                        first_chars_mark(bitmap, (unsigned char)c);
                    }
                }
            } else {
                *valid = false;
            }
            break;
        case AST_QUANTIFIER:
            if (ast->quant.min > 0) {
                extract_first_chars(ast->left, bitmap, valid);
            } else {
                *valid = false;
            }
            break;
        case AST_CLASS_DIGIT:
            for (char c = '0'; c <= '9'; c++) {
                first_chars_mark(bitmap, (unsigned char)c);
            }
            break;
        case AST_CLASS_WORD:
            for (char c = 'a'; c <= 'z'; c++) {
                first_chars_mark(bitmap, (unsigned char)c);
            }
            for (char c = 'A'; c <= 'Z'; c++) {
                first_chars_mark(bitmap, (unsigned char)c);
            }
            for (char c = '0'; c <= '9'; c++) {
                first_chars_mark(bitmap, (unsigned char)c);
            }
            first_chars_mark(bitmap, (unsigned char)'_');
            break;
        case AST_CLASS_SPACE:
            first_chars_mark(bitmap, (unsigned char)' ');
            first_chars_mark(bitmap, (unsigned char)'\t');
            first_chars_mark(bitmap, (unsigned char)'\n');
            first_chars_mark(bitmap, (unsigned char)'\r');
            /* The matcher's definition of whitespace is not visible from
             * here; form feed and vertical tab are included because a
             * superset is harmless while a missing byte loses matches.
             * TODO confirm against the matcher. */
            first_chars_mark(bitmap, (unsigned char)'\f');
            first_chars_mark(bitmap, (unsigned char)'\v');
            break;
        case AST_CLASS_NDIGIT:
        case AST_CLASS_NWORD:
        case AST_CLASS_NSPACE:
            *valid = false;
            break;
    }
}
/*
 * Reports whether the subtree consists solely of literal characters joined
 * by concatenation. A NULL subtree counts as a pure literal.
 */
static bool ast_is_pure_literal(ast_node_t *ast) {
    ast_node_t *node = ast;

    /* Walk down the left spine of the concatenation chain, checking each
     * right-hand operand on the way. */
    while (node != NULL && node->type == AST_CONCAT) {
        if (!ast_is_pure_literal(node->right)) {
            return false;
        }
        node = node->left;
    }
    return node == NULL || node->type == AST_CHAR;
}
/*
 * Copies the literal characters every match must begin with into buf.
 *
 * ast:     pattern subtree.
 * buf:     destination; not NUL-terminated by this function.
 * max_len: capacity of buf in bytes.
 *
 * Returns the number of bytes written (0 when the subtree does not start
 * with a literal or when there is no room).
 */
static size_t extract_literal_prefix(ast_node_t *ast, char *buf, size_t max_len) {
    if (ast == NULL || max_len == 0) {
        return 0;
    }
    if (ast->type == AST_CHAR) {
        *buf = ast->value;
        return 1;
    }
    if (ast->type == AST_GROUP) {
        return extract_literal_prefix(ast->left, buf, max_len);
    }
    if (ast->type != AST_CONCAT) {
        /* Anchors and every other node kind contribute no literal text. */
        return 0;
    }

    size_t taken = extract_literal_prefix(ast->left, buf, max_len);
    /* Continue into the right operand only when the left one is entirely
     * literal; otherwise the literal run ends inside the left operand. */
    if (taken == 0 || !ast_is_pure_literal(ast->left)) {
        return taken;
    }
    return taken + extract_literal_prefix(ast->right, buf + taken, max_len - taken);
}
/*
 * Returns the pattern without a trailing end anchor: NULL when the pattern
 * is absent or is only the anchor, the left operand when the root is a
 * concatenation whose right operand is the anchor, otherwise the pattern
 * itself. Nothing is freed or modified.
 */
static ast_node_t *strip_end_anchor(ast_node_t *ast) {
    if (ast == NULL || ast->type == AST_ANCHOR_END) {
        return NULL;
    }

    bool trailing_anchor = ast->type == AST_CONCAT &&
                           ast->right != NULL &&
                           ast->right->type == AST_ANCHOR_END;

    return trailing_anchor ? ast->left : ast;
}
/*
 * Collects the literal characters every match must end with, written to buf
 * in reverse order (last character of the pattern first).
 *
 * Returns the number of bytes written, at most max_len.
 */
static size_t extract_literal_suffix_rev(ast_node_t *ast, char *buf, size_t max_len) {
    if (ast == NULL || max_len == 0) {
        return 0;
    }
    if (ast->type == AST_CHAR) {
        *buf = ast->value;
        return 1;
    }
    if (ast->type == AST_GROUP) {
        return extract_literal_suffix_rev(ast->left, buf, max_len);
    }
    if (ast->type == AST_CONCAT) {
        size_t tail = extract_literal_suffix_rev(ast->right, buf, max_len);
        /* Extend leftwards only when the right operand is entirely literal. */
        if (tail > 0 && ast_is_pure_literal(ast->right)) {
            tail += extract_literal_suffix_rev(ast->left, buf + tail, max_len - tail);
        }
        return tail;
    }
    return 0;
}
/*
 * Writes the literal suffix of the pattern (ignoring a trailing end anchor)
 * into buf in reading order and returns its length. buf is not
 * NUL-terminated by this function.
 */
static size_t extract_literal_suffix(ast_node_t *ast, char *buf, size_t max_len) {
    ast_node_t *body = strip_end_anchor(ast);
    if (body == NULL) {
        return 0;
    }

    size_t count = extract_literal_suffix_rev(body, buf, max_len);

    /* The helper emits the characters back to front; flip them in place. */
    if (count > 1) {
        char *lo = buf;
        char *hi = buf + count - 1;
        while (lo < hi) {
            char swap = *lo;
            *lo = *hi;
            *hi = swap;
            lo++;
            hi--;
        }
    }
    return count;
}
/*
 * Marks in dispatch (256 entries) every byte with which a match of ast can
 * begin, using the temporary value 1.
 *
 * Returns false when the set of first bytes cannot be enumerated: the
 * subtree may match the empty string, starts with a zero-width anchor, or
 * accepts an open-ended range of bytes. The caller must then discard the
 * table, because a table that misses a byte makes the search skip offsets
 * where the pattern really matches (for example "a+|b" on "a").
 */
static bool build_alt_dispatch_node(ast_node_t *ast, unsigned char *dispatch) {
    if (!ast) return false;
    switch (ast->type) {
        case AST_CHAR:
            dispatch[(unsigned char)ast->value] = 1;
            return true;
        case AST_CONCAT:
            return build_alt_dispatch_node(ast->left, dispatch);
        case AST_ALTER:
            return build_alt_dispatch_node(ast->left, dispatch) &&
                   build_alt_dispatch_node(ast->right, dispatch);
        case AST_GROUP:
        case AST_PLUS:
            /* At least one occurrence of the child is required, so the
             * first byte is the child's first byte. */
            return build_alt_dispatch_node(ast->left, dispatch);
        case AST_QUANTIFIER:
            if (ast->quant.min > 0) {
                return build_alt_dispatch_node(ast->left, dispatch);
            }
            return false;
        case AST_BRACKET:
            if (!ast->bracket || ast->bracket->negated) {
                return false;
            }
            for (size_t i = 0; i < ast->bracket->count; i++) {
                unsigned char s = (unsigned char)ast->bracket->ranges[i].start;
                unsigned char e = (unsigned char)ast->bracket->ranges[i].end;
                for (unsigned int c = s; c <= e; c++) {
                    dispatch[c] = 1;
                }
            }
            return true;
        case AST_CLASS_DIGIT:
            for (char c = '0'; c <= '9'; c++) {
                dispatch[(unsigned char)c] = 1;
            }
            return true;
        case AST_CLASS_WORD:
            for (char c = 'a'; c <= 'z'; c++) dispatch[(unsigned char)c] = 1;
            for (char c = 'A'; c <= 'Z'; c++) dispatch[(unsigned char)c] = 1;
            for (char c = '0'; c <= '9'; c++) dispatch[(unsigned char)c] = 1;
            dispatch['_'] = 1;
            return true;
        default:
            /* Star, question mark, dot, anchors, whitespace and negated
             * classes: the first byte is not enumerable from here. */
            return false;
    }
}

/* Reports whether the pattern is an alternation, possibly wrapped in groups. */
static bool is_top_level_alternation(ast_node_t *ast) {
    if (!ast) return false;
    if (ast->type == AST_ALTER) return true;
    if (ast->type == AST_GROUP) return is_top_level_alternation(ast->left);
    return false;
}

/*
 * Fills the 256-entry dispatch table for a top-level alternation.
 *
 * On success every byte that can start some alternative holds 0 and every
 * other byte holds 255; the function returns true when at least one byte is
 * enabled. When the pattern is not a top-level alternation the table is
 * left untouched and false is returned. When any alternative has a first
 * byte that cannot be enumerated the table is reset to all 255 and false is
 * returned, so the caller does not use it to skip offsets.
 */
static bool build_alt_dispatch(ast_node_t *ast, unsigned char *dispatch) {
    if (!is_top_level_alternation(ast)) return false;

    memset(dispatch, 255, 256);
    if (!build_alt_dispatch_node(ast, dispatch)) {
        memset(dispatch, 255, 256);
        return false;
    }

    bool has_any = false;
    for (int i = 0; i < 256; i++) {
        if (dispatch[i] == 1) {
            dispatch[i] = 0;
            has_any = true;
        }
    }
    return has_any;
}
/*
 * Returns the only byte present in the 256-bit set bitmap, or 0 when the
 * set is empty or holds more than one byte (a set containing only the NUL
 * byte therefore also yields 0).
 */
static char compute_single_first_char(unsigned char *bitmap) {
    int only = -1;

    for (int byte = 0; byte < 256; byte++) {
        if (((bitmap[byte / 8] >> (byte % 8)) & 1u) == 0) {
            continue;
        }
        if (only >= 0) {
            return 0; /* a second member: the set is not a singleton */
        }
        only = byte;
    }

    if (only < 0) {
        return 0;
    }
    return (char)only;
}
void nfa_free(nfa_t *nfa) {
if (!nfa) return;
for (size_t i = 0; i < nfa->state_count; i++) {
@ -21,14 +294,16 @@ void nfa_free(nfa_t *nfa) {
free(nfa->states[i]);
}
free(nfa->states);
free(nfa->literal_prefix);
free(nfa->literal_suffix);
free(nfa);
}
static bool nfa_grow(nfa_t *nfa) {
size_t new_cap = nfa->capacity == 0 ? 16 : nfa->capacity * 2;
if (new_cap > LOREG_MAX_STATES) {
if (nfa->capacity >= LOREG_MAX_STATES) return false;
new_cap = LOREG_MAX_STATES;
if (new_cap > LOREX_MAX_STATES) {
if (nfa->capacity >= LOREX_MAX_STATES) return false;
new_cap = LOREX_MAX_STATES;
}
nfa_state_t **new_states = realloc(nfa->states, new_cap * sizeof(nfa_state_t *));
if (!new_states) return false;
@ -100,14 +375,14 @@ void nfa_add_group_transition(nfa_state_t *from, nfa_state_t *to, transition_typ
t->group_id = group_id;
}
static nfa_fragment_t build_nfa(nfa_t *nfa, ast_node_t *ast, loreg_error_t *error);
static nfa_fragment_t build_nfa(nfa_t *nfa, ast_node_t *ast, lorex_error_t *error);
static nfa_fragment_t build_char(nfa_t *nfa, char c, loreg_error_t *error) {
static nfa_fragment_t build_char(nfa_t *nfa, char c, lorex_error_t *error) {
nfa_fragment_t frag = {NULL, NULL};
nfa_state_t *start = nfa_add_state(nfa);
nfa_state_t *accept = nfa_add_state(nfa);
if (!start || !accept) {
*error = LOREG_ERR_OUT_OF_MEMORY;
*error = LOREX_ERR_OUT_OF_MEMORY;
return frag;
}
nfa_add_transition(start, accept, TRANS_CHAR, c);
@ -116,12 +391,12 @@ static nfa_fragment_t build_char(nfa_t *nfa, char c, loreg_error_t *error) {
return frag;
}
static nfa_fragment_t build_dot(nfa_t *nfa, loreg_error_t *error) {
static nfa_fragment_t build_dot(nfa_t *nfa, lorex_error_t *error) {
nfa_fragment_t frag = {NULL, NULL};
nfa_state_t *start = nfa_add_state(nfa);
nfa_state_t *accept = nfa_add_state(nfa);
if (!start || !accept) {
*error = LOREG_ERR_OUT_OF_MEMORY;
*error = LOREX_ERR_OUT_OF_MEMORY;
return frag;
}
nfa_add_transition(start, accept, TRANS_DOT, '\0');
@ -130,12 +405,12 @@ static nfa_fragment_t build_dot(nfa_t *nfa, loreg_error_t *error) {
return frag;
}
static nfa_fragment_t build_class(nfa_t *nfa, transition_type_t type, loreg_error_t *error) {
static nfa_fragment_t build_class(nfa_t *nfa, transition_type_t type, lorex_error_t *error) {
nfa_fragment_t frag = {NULL, NULL};
nfa_state_t *start = nfa_add_state(nfa);
nfa_state_t *accept = nfa_add_state(nfa);
if (!start || !accept) {
*error = LOREG_ERR_OUT_OF_MEMORY;
*error = LOREX_ERR_OUT_OF_MEMORY;
return frag;
}
nfa_add_transition(start, accept, type, '\0');
@ -144,12 +419,12 @@ static nfa_fragment_t build_class(nfa_t *nfa, transition_type_t type, loreg_erro
return frag;
}
static nfa_fragment_t build_bracket(nfa_t *nfa, bracket_class_t *bracket, loreg_error_t *error) {
static nfa_fragment_t build_bracket(nfa_t *nfa, bracket_class_t *bracket, lorex_error_t *error) {
nfa_fragment_t frag = {NULL, NULL};
nfa_state_t *start = nfa_add_state(nfa);
nfa_state_t *accept = nfa_add_state(nfa);
if (!start || !accept) {
*error = LOREG_ERR_OUT_OF_MEMORY;
*error = LOREX_ERR_OUT_OF_MEMORY;
return frag;
}
nfa_add_bracket_transition(start, accept, bracket);
@ -158,12 +433,12 @@ static nfa_fragment_t build_bracket(nfa_t *nfa, bracket_class_t *bracket, loreg_
return frag;
}
static nfa_fragment_t build_concat(nfa_t *nfa, ast_node_t *left, ast_node_t *right, loreg_error_t *error) {
static nfa_fragment_t build_concat(nfa_t *nfa, ast_node_t *left, ast_node_t *right, lorex_error_t *error) {
nfa_fragment_t frag = {NULL, NULL};
nfa_fragment_t left_frag = build_nfa(nfa, left, error);
if (*error != LOREG_OK) return frag;
if (*error != LOREX_OK) return frag;
nfa_fragment_t right_frag = build_nfa(nfa, right, error);
if (*error != LOREG_OK) return frag;
if (*error != LOREX_OK) return frag;
nfa_add_transition(left_frag.accept, right_frag.start, TRANS_EPSILON, '\0');
frag.start = left_frag.start;
@ -171,19 +446,19 @@ static nfa_fragment_t build_concat(nfa_t *nfa, ast_node_t *left, ast_node_t *rig
return frag;
}
static nfa_fragment_t build_alter(nfa_t *nfa, ast_node_t *left, ast_node_t *right, loreg_error_t *error) {
static nfa_fragment_t build_alter(nfa_t *nfa, ast_node_t *left, ast_node_t *right, lorex_error_t *error) {
nfa_fragment_t frag = {NULL, NULL};
nfa_state_t *start = nfa_add_state(nfa);
nfa_state_t *accept = nfa_add_state(nfa);
if (!start || !accept) {
*error = LOREG_ERR_OUT_OF_MEMORY;
*error = LOREX_ERR_OUT_OF_MEMORY;
return frag;
}
nfa_fragment_t left_frag = build_nfa(nfa, left, error);
if (*error != LOREG_OK) return frag;
if (*error != LOREX_OK) return frag;
nfa_fragment_t right_frag = build_nfa(nfa, right, error);
if (*error != LOREG_OK) return frag;
if (*error != LOREX_OK) return frag;
nfa_add_transition(start, left_frag.start, TRANS_EPSILON, '\0');
nfa_add_transition(start, right_frag.start, TRANS_EPSILON, '\0');
@ -195,17 +470,17 @@ static nfa_fragment_t build_alter(nfa_t *nfa, ast_node_t *left, ast_node_t *righ
return frag;
}
static nfa_fragment_t build_star(nfa_t *nfa, ast_node_t *child, bool greedy, loreg_error_t *error) {
static nfa_fragment_t build_star(nfa_t *nfa, ast_node_t *child, bool greedy, lorex_error_t *error) {
nfa_fragment_t frag = {NULL, NULL};
nfa_state_t *start = nfa_add_state(nfa);
nfa_state_t *accept = nfa_add_state(nfa);
if (!start || !accept) {
*error = LOREG_ERR_OUT_OF_MEMORY;
*error = LOREX_ERR_OUT_OF_MEMORY;
return frag;
}
nfa_fragment_t child_frag = build_nfa(nfa, child, error);
if (*error != LOREG_OK) return frag;
if (*error != LOREX_OK) return frag;
if (greedy) {
nfa_add_transition(start, child_frag.start, TRANS_EPSILON, '\0');
@ -222,16 +497,16 @@ static nfa_fragment_t build_star(nfa_t *nfa, ast_node_t *child, bool greedy, lor
return frag;
}
static nfa_fragment_t build_plus(nfa_t *nfa, ast_node_t *child, bool greedy, loreg_error_t *error) {
static nfa_fragment_t build_plus(nfa_t *nfa, ast_node_t *child, bool greedy, lorex_error_t *error) {
nfa_fragment_t frag = {NULL, NULL};
nfa_state_t *accept = nfa_add_state(nfa);
if (!accept) {
*error = LOREG_ERR_OUT_OF_MEMORY;
*error = LOREX_ERR_OUT_OF_MEMORY;
return frag;
}
nfa_fragment_t child_frag = build_nfa(nfa, child, error);
if (*error != LOREG_OK) return frag;
if (*error != LOREX_OK) return frag;
if (greedy) {
nfa_add_transition(child_frag.accept, child_frag.start, TRANS_EPSILON, '\0');
@ -246,17 +521,17 @@ static nfa_fragment_t build_plus(nfa_t *nfa, ast_node_t *child, bool greedy, lor
return frag;
}
static nfa_fragment_t build_question(nfa_t *nfa, ast_node_t *child, bool greedy, loreg_error_t *error) {
static nfa_fragment_t build_question(nfa_t *nfa, ast_node_t *child, bool greedy, lorex_error_t *error) {
nfa_fragment_t frag = {NULL, NULL};
nfa_state_t *start = nfa_add_state(nfa);
nfa_state_t *accept = nfa_add_state(nfa);
if (!start || !accept) {
*error = LOREG_ERR_OUT_OF_MEMORY;
*error = LOREX_ERR_OUT_OF_MEMORY;
return frag;
}
nfa_fragment_t child_frag = build_nfa(nfa, child, error);
if (*error != LOREG_OK) return frag;
if (*error != LOREX_OK) return frag;
if (greedy) {
nfa_add_transition(start, child_frag.start, TRANS_EPSILON, '\0');
@ -272,17 +547,17 @@ static nfa_fragment_t build_question(nfa_t *nfa, ast_node_t *child, bool greedy,
return frag;
}
static nfa_fragment_t build_group(nfa_t *nfa, ast_node_t *child, int group_id, loreg_error_t *error) {
static nfa_fragment_t build_group(nfa_t *nfa, ast_node_t *child, int group_id, lorex_error_t *error) {
nfa_fragment_t frag = {NULL, NULL};
nfa_state_t *start = nfa_add_state(nfa);
nfa_state_t *accept = nfa_add_state(nfa);
if (!start || !accept) {
*error = LOREG_ERR_OUT_OF_MEMORY;
*error = LOREX_ERR_OUT_OF_MEMORY;
return frag;
}
nfa_fragment_t child_frag = build_nfa(nfa, child, error);
if (*error != LOREG_OK) return frag;
if (*error != LOREX_OK) return frag;
nfa_add_group_transition(start, child_frag.start, TRANS_GROUP_START, group_id);
nfa_add_group_transition(child_frag.accept, accept, TRANS_GROUP_END, group_id);
@ -296,12 +571,12 @@ static nfa_fragment_t build_group(nfa_t *nfa, ast_node_t *child, int group_id, l
return frag;
}
static nfa_fragment_t build_anchor(nfa_t *nfa, transition_type_t type, loreg_error_t *error) {
static nfa_fragment_t build_anchor(nfa_t *nfa, transition_type_t type, lorex_error_t *error) {
nfa_fragment_t frag = {NULL, NULL};
nfa_state_t *start = nfa_add_state(nfa);
nfa_state_t *accept = nfa_add_state(nfa);
if (!start || !accept) {
*error = LOREG_ERR_OUT_OF_MEMORY;
*error = LOREX_ERR_OUT_OF_MEMORY;
return frag;
}
nfa_add_transition(start, accept, type, '\0');
@ -310,13 +585,13 @@ static nfa_fragment_t build_anchor(nfa_t *nfa, transition_type_t type, loreg_err
return frag;
}
static nfa_fragment_t build_quantifier(nfa_t *nfa, ast_node_t *child, int min, int max, bool greedy, loreg_error_t *error) {
static nfa_fragment_t build_quantifier(nfa_t *nfa, ast_node_t *child, int min, int max, bool greedy, lorex_error_t *error) {
nfa_fragment_t frag = {NULL, NULL};
if (min == 0 && max == 0) {
nfa_state_t *state = nfa_add_state(nfa);
if (!state) {
*error = LOREG_ERR_OUT_OF_MEMORY;
*error = LOREX_ERR_OUT_OF_MEMORY;
return frag;
}
frag.start = state;
@ -326,7 +601,7 @@ static nfa_fragment_t build_quantifier(nfa_t *nfa, ast_node_t *child, int min, i
nfa_state_t *start = nfa_add_state(nfa);
if (!start) {
*error = LOREG_ERR_OUT_OF_MEMORY;
*error = LOREX_ERR_OUT_OF_MEMORY;
return frag;
}
@ -334,7 +609,7 @@ static nfa_fragment_t build_quantifier(nfa_t *nfa, ast_node_t *child, int min, i
for (int i = 0; i < min; i++) {
nfa_fragment_t rep = build_nfa(nfa, child, error);
if (*error != LOREG_OK) return frag;
if (*error != LOREX_OK) return frag;
nfa_add_transition(current, rep.start, TRANS_EPSILON, '\0');
current = rep.accept;
}
@ -343,14 +618,14 @@ static nfa_fragment_t build_quantifier(nfa_t *nfa, ast_node_t *child, int min, i
nfa_state_t *loop_start = nfa_add_state(nfa);
nfa_state_t *accept = nfa_add_state(nfa);
if (!loop_start || !accept) {
*error = LOREG_ERR_OUT_OF_MEMORY;
*error = LOREX_ERR_OUT_OF_MEMORY;
return frag;
}
nfa_add_transition(current, loop_start, TRANS_EPSILON, '\0');
nfa_fragment_t rep = build_nfa(nfa, child, error);
if (*error != LOREG_OK) return frag;
if (*error != LOREX_OK) return frag;
if (greedy) {
nfa_add_transition(loop_start, rep.start, TRANS_EPSILON, '\0');
@ -366,7 +641,7 @@ static nfa_fragment_t build_quantifier(nfa_t *nfa, ast_node_t *child, int min, i
} else {
nfa_state_t *accept = nfa_add_state(nfa);
if (!accept) {
*error = LOREG_ERR_OUT_OF_MEMORY;
*error = LOREX_ERR_OUT_OF_MEMORY;
return frag;
}
@ -374,7 +649,7 @@ static nfa_fragment_t build_quantifier(nfa_t *nfa, ast_node_t *child, int min, i
for (int i = min; i < max; i++) {
nfa_fragment_t rep = build_nfa(nfa, child, error);
if (*error != LOREG_OK) return frag;
if (*error != LOREX_OK) return frag;
if (greedy) {
nfa_add_transition(current, rep.start, TRANS_EPSILON, '\0');
@ -400,13 +675,13 @@ static nfa_fragment_t build_quantifier(nfa_t *nfa, ast_node_t *child, int min, i
return frag;
}
static nfa_fragment_t build_nfa(nfa_t *nfa, ast_node_t *ast, loreg_error_t *error) {
static nfa_fragment_t build_nfa(nfa_t *nfa, ast_node_t *ast, lorex_error_t *error) {
nfa_fragment_t frag = {NULL, NULL};
if (!ast) {
nfa_state_t *state = nfa_add_state(nfa);
if (!state) {
*error = LOREG_ERR_OUT_OF_MEMORY;
*error = LOREX_ERR_OUT_OF_MEMORY;
return frag;
}
frag.start = state;
@ -456,16 +731,53 @@ static nfa_fragment_t build_nfa(nfa_t *nfa, ast_node_t *ast, loreg_error_t *erro
return frag;
}
nfa_t *nfa_from_ast(ast_node_t *ast, loreg_error_t *error) {
*error = LOREG_OK;
nfa_t *nfa_from_ast(ast_node_t *ast, lorex_error_t *error) {
*error = LOREX_OK;
nfa_t *nfa = nfa_create();
if (!nfa) {
*error = LOREG_ERR_OUT_OF_MEMORY;
*error = LOREX_ERR_OUT_OF_MEMORY;
return NULL;
}
nfa->anchored_start = ast_starts_with_anchor(ast);
nfa->anchored_end = ast_ends_with_anchor(ast);
nfa->first_chars_valid = true;
extract_first_chars(ast, nfa->first_chars, &nfa->first_chars_valid);
if (nfa->first_chars_valid) {
nfa->single_first_char = compute_single_first_char(nfa->first_chars);
}
nfa->is_pure_literal = ast_is_pure_literal(ast);
char prefix_buf[256];
size_t prefix_len = extract_literal_prefix(ast, prefix_buf, sizeof(prefix_buf));
if (prefix_len > 0) {
nfa->literal_prefix = malloc(prefix_len + 1);
if (nfa->literal_prefix) {
memcpy(nfa->literal_prefix, prefix_buf, prefix_len);
nfa->literal_prefix[prefix_len] = '\0';
nfa->prefix_len = prefix_len;
}
}
if (nfa->anchored_end) {
char suffix_buf[256];
size_t suffix_len = extract_literal_suffix(ast, suffix_buf, sizeof(suffix_buf));
if (suffix_len > 0) {
nfa->literal_suffix = malloc(suffix_len + 1);
if (nfa->literal_suffix) {
memcpy(nfa->literal_suffix, suffix_buf, suffix_len);
nfa->literal_suffix[suffix_len] = '\0';
nfa->suffix_len = suffix_len;
}
}
}
nfa->has_alt_dispatch = build_alt_dispatch(ast, nfa->alt_dispatch);
nfa_fragment_t frag = build_nfa(nfa, ast, error);
if (*error != LOREG_OK) {
if (*error != LOREX_OK) {
nfa_free(nfa);
return NULL;
}

View File

@ -10,11 +10,11 @@ static void parser_advance(parser_t *parser) {
void parser_init(parser_t *parser, const char *pattern) {
lexer_init(&parser->lexer, pattern);
parser->current = lexer_next(&parser->lexer);
parser->error = LOREG_OK;
parser->error = LOREX_OK;
parser->group_count = 0;
}
loreg_error_t parser_get_error(parser_t *parser) {
lorex_error_t parser_get_error(parser_t *parser) {
return parser->error;
}
@ -27,7 +27,7 @@ static int parse_number(parser_t *parser);
static ast_node_t *parse_expr(parser_t *parser) {
ast_node_t *left = parse_term(parser);
if (!left || parser->error != LOREG_OK) return left;
if (!left || parser->error != LOREX_OK) return left;
while (parser->current.type == TOKEN_PIPE) {
parser_advance(parser);
@ -38,7 +38,7 @@ static ast_node_t *parse_expr(parser_t *parser) {
}
left = ast_create_alter(left, right);
if (!left) {
parser->error = LOREG_ERR_OUT_OF_MEMORY;
parser->error = LOREX_ERR_OUT_OF_MEMORY;
return NULL;
}
}
@ -61,7 +61,7 @@ static ast_node_t *parse_term(parser_t *parser) {
} else {
left = ast_create_concat(left, factor);
if (!left) {
parser->error = LOREG_ERR_OUT_OF_MEMORY;
parser->error = LOREX_ERR_OUT_OF_MEMORY;
return NULL;
}
}
@ -71,7 +71,7 @@ static ast_node_t *parse_term(parser_t *parser) {
static ast_node_t *parse_factor(parser_t *parser) {
ast_node_t *atom = parse_atom(parser);
if (!atom || parser->error != LOREG_OK) return atom;
if (!atom || parser->error != LOREX_OK) return atom;
while (parser->current.type == TOKEN_STAR ||
parser->current.type == TOKEN_PLUS ||
@ -107,7 +107,13 @@ static ast_node_t *parse_factor(parser_t *parser) {
}
if (parser->current.type != TOKEN_RBRACE) {
parser->error = LOREG_ERR_INVALID_QUANTIFIER;
parser->error = LOREX_ERR_INVALID_QUANTIFIER;
ast_free(atom);
return NULL;
}
if (max != -1 && min > max) {
parser->error = LOREX_ERR_INVALID_QUANTIFIER;
ast_free(atom);
return NULL;
}
@ -122,7 +128,7 @@ static ast_node_t *parse_factor(parser_t *parser) {
}
if (!atom) {
parser->error = LOREG_ERR_OUT_OF_MEMORY;
parser->error = LOREX_ERR_OUT_OF_MEMORY;
return NULL;
}
}
@ -167,7 +173,7 @@ static ast_node_t *parse_atom(parser_t *parser) {
int group_id = parser->group_count++;
ast_node_t *inner = parse_expr(parser);
if (parser->current.type != TOKEN_RPAREN) {
parser->error = LOREG_ERR_UNBALANCED_PAREN;
parser->error = LOREX_ERR_UNBALANCED_PAREN;
ast_free(inner);
return NULL;
}
@ -216,12 +222,12 @@ static ast_node_t *parse_atom(parser_t *parser) {
return NULL;
default:
parser->error = LOREG_ERR_INVALID_PATTERN;
parser->error = LOREX_ERR_INVALID_PATTERN;
return NULL;
}
if (!node && parser->error == LOREG_OK) {
parser->error = LOREG_ERR_OUT_OF_MEMORY;
if (!node && parser->error == LOREX_OK) {
parser->error = LOREX_ERR_OUT_OF_MEMORY;
}
return node;
}
@ -231,7 +237,7 @@ static ast_node_t *parse_bracket(parser_t *parser) {
bracket_class_t *bracket = bracket_create();
if (!bracket) {
parser->error = LOREG_ERR_OUT_OF_MEMORY;
parser->error = LOREX_ERR_OUT_OF_MEMORY;
return NULL;
}
@ -293,7 +299,7 @@ static ast_node_t *parse_bracket(parser_t *parser) {
if (parser->current.type != TOKEN_RBRACKET) {
bracket_free(bracket);
parser->error = LOREG_ERR_INVALID_PATTERN;
parser->error = LOREX_ERR_INVALID_PATTERN;
return NULL;
}
parser_advance(parser);

View File

@ -1,6 +1,6 @@
/* retoor <retoor@molodetz.nl> */
#include "repl.h"
#include "loreg.h"
#include "lorex.h"
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
@ -8,12 +8,12 @@
#define MAX_INPUT 4096
static void print_banner(void) {
printf("loreg v%s - regex interpreter\n", LOREG_VERSION);
printf("lorex v%s - regex interpreter\n", LOREX_VERSION);
printf("commands: :q quit, :h help, :p <pattern> set pattern, :m <text> match, :s <text> search\n\n");
}
static void print_help(void) {
printf("loreg REPL commands:\n");
printf("lorex REPL commands:\n");
printf(" :q quit\n");
printf(" :h show this help\n");
printf(" :p <regex> compile and set pattern\n");
@ -40,7 +40,7 @@ static void print_help(void) {
printf(" \\D \\W \\S negated classes\n\n");
}
static void print_match(const char *text, loreg_match_t *result) {
static void print_match(const char *text, lorex_match_t *result) {
if (!result->matched) {
printf("no match\n");
return;
@ -83,7 +83,7 @@ static char *read_line(void) {
void repl_run(void) {
print_banner();
loreg_regex_t *regex = NULL;
lorex_regex_t *regex = NULL;
char *line;
while ((line = read_line()) != NULL) {
@ -103,14 +103,14 @@ void repl_run(void) {
while (*pattern == ' ') pattern++;
if (regex) {
loreg_free(regex);
lorex_free(regex);
regex = NULL;
}
loreg_error_t error;
regex = loreg_compile(pattern, &error);
lorex_error_t error;
regex = lorex_compile(pattern, &error);
if (!regex) {
printf("error: %s\n", loreg_error_string(error));
printf("error: %s\n", lorex_error_string(error));
} else {
printf("pattern compiled: %s\n", pattern);
}
@ -126,8 +126,8 @@ void repl_run(void) {
const char *text = line + 3;
while (*text == ' ') text++;
loreg_match_t result;
loreg_match(regex, text, &result);
lorex_match_t result;
lorex_match(regex, text, &result);
print_match(text, &result);
continue;
}
@ -141,8 +141,8 @@ void repl_run(void) {
const char *text = line + 3;
while (*text == ' ') text++;
loreg_match_t result;
loreg_search(regex, text, &result);
lorex_match_t result;
lorex_search(regex, text, &result);
print_match(text, &result);
continue;
}
@ -157,13 +157,13 @@ void repl_run(void) {
continue;
}
loreg_match_t result;
loreg_search(regex, line, &result);
lorex_match_t result;
lorex_search(regex, line, &result);
print_match(line, &result);
}
if (regex) {
loreg_free(regex);
lorex_free(regex);
}
printf("\n");

448
tests/benchmark.c Normal file
View File

@ -0,0 +1,448 @@
/* retoor <retoor@molodetz.nl> */
#define _POSIX_C_SOURCE 200809L
#include "../include/lorex.h"
#include <regex.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <sys/time.h>
#define ITERATIONS 10000
#define WARMUP 1000
/* One benchmark case: a labelled pattern/text pair fed to both regex engines. */
typedef struct {
    const char *name, *pattern, *text; /* case label, regex source, subject string */
    int expect_match;                  /* 1 when the pattern is expected to be found in text, 0 otherwise */
} benchmark_t;
/* Measurements and outcomes for a single benchmark case, one set per engine. */
typedef struct {
    /* lorex timings in microseconds (total is compile + match) */
    double lorex_compile_us, lorex_match_us, lorex_total_us;
    /* POSIX regcomp/regexec timings in microseconds (total is compile + match) */
    double posix_compile_us, posix_match_us, posix_total_us;
    /* 1 when the engine reported a match for the case's text, else 0 */
    int lorex_matched, posix_matched;
    /* 1 when the engine could not compile the case's pattern, else 0 */
    int lorex_failed, posix_failed;
} result_t;
/*
 * Benchmark cases executed against both lorex and POSIX regcomp/regexec.
 *
 * Each entry is {name, pattern, text, expect_match}. The name prefix
 * ("literal_", "dot_", "anchor_", ...) is what the category breakdown in
 * main() keys on via strncmp, so renaming a case can move it between
 * categories. Patterns are passed verbatim to both engines; the POSIX side
 * compiles them with REG_EXTENDED.
 */
static benchmark_t benchmarks[] = {
{"literal_short", "hello", "hello world", 1},
{"literal_medium", "the quick brown", "the quick brown fox jumps over the lazy dog", 1},
{"literal_long", "Lorem ipsum dolor sit amet", "Lorem ipsum dolor sit amet, consectetur adipiscing elit", 1},
{"literal_nomatch", "xyz", "the quick brown fox jumps over the lazy dog", 0},
{"literal_end", "dog", "the quick brown fox jumps over the lazy dog", 1},
{"literal_repeated", "abcabc", "xyzabcabcdef", 1},
{"dot_single", "a.c", "abc", 1},
{"dot_multiple", "a..b", "aXYb", 1},
{"dot_many", "a.....b", "a12345b", 1},
{"dot_star", "a.*b", "aXXXXXXXXXXb", 1},
{"dot_plus", "a.+b", "aXXXXXXXXXXb", 1},
{"anchor_start", "^the", "the quick brown fox", 1},
{"anchor_end", "fox$", "the quick brown fox", 1},
{"anchor_both", "^hello$", "hello", 1},
{"anchor_start_nomatch", "^fox", "the quick brown fox", 0},
{"anchor_end_nomatch", "the$", "the quick brown fox", 0},
{"star_simple", "ab*c", "abbbbc", 1},
{"star_zero", "ab*c", "ac", 1},
{"star_greedy", "a.*b", "aXbXbXb", 1},
{"star_repeated", "a*b*c*", "aaabbbccc", 1},
{"star_empty", "a*", "", 1},
{"plus_simple", "ab+c", "abbbbc", 1},
{"plus_one", "ab+c", "abc", 1},
{"plus_nomatch", "ab+c", "ac", 0},
{"plus_greedy", "a.+b", "aXbXbXb", 1},
{"question_present", "colou?r", "colour", 1},
{"question_absent", "colou?r", "color", 1},
{"question_multiple", "a?b?c?d", "abcd", 1},
{"class_vowels", "[aeiou]", "hello", 1},
{"class_digits", "[0-9]+", "abc123def", 1},
{"class_alpha", "[a-zA-Z]+", "HelloWorld", 1},
{"class_alnum", "[a-zA-Z0-9]+", "Test123", 1},
{"class_neg_digit", "[^0-9]+", "hello", 1},
{"class_neg_alpha", "[^a-zA-Z]+", "12345", 1},
{"class_complex", "[a-zA-Z_][a-zA-Z0-9_]*", "variable_name_123", 1},
{"alt_simple", "cat|dog", "I have a cat", 1},
{"alt_simple2", "cat|dog", "I have a dog", 1},
{"alt_three", "red|green|blue", "the color is green", 1},
{"alt_nomatch", "cat|dog", "I have a bird", 0},
{"alt_words", "hello|world|test", "this is a test", 1},
{"group_simple", "(ab)+", "ababab", 1},
{"group_alt", "(cat|dog)s?", "cats", 1},
{"group_nested", "((a)(b))+", "ababab", 1},
{"group_complex", "(a(b(c)))+", "abcabc", 1},
{"quant_exact", "a{3}", "aaa", 1},
{"quant_exact_long", "a{10}", "aaaaaaaaaa", 1},
{"quant_range", "a{2,4}", "aaa", 1},
{"quant_min", "a{3,}", "aaaaa", 1},
{"quant_combined", "[0-9]{3}-[0-9]{4}", "555-1234", 1},
{"email_simple", "[a-z]+@[a-z]+\\.[a-z]+", "test@example.com", 1},
{"email_complex", "[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}", "user.name+tag@sub.example.com", 1},
{"ip_address", "[0-9]{1,3}\\.[0-9]{1,3}\\.[0-9]{1,3}\\.[0-9]{1,3}", "192.168.1.100", 1},
{"url_http", "https?://[a-zA-Z0-9.-]+", "https://www.example.com", 1},
{"phone_us", "[0-9]{3}-[0-9]{3}-[0-9]{4}", "555-123-4567", 1},
{"date_iso", "[0-9]{4}-[0-9]{2}-[0-9]{2}", "2024-01-15", 1},
{"time_hms", "[0-9]{2}:[0-9]{2}:[0-9]{2}", "14:30:45", 1},
{"hex_color", "#[0-9a-fA-F]{6}", "#ff00ff", 1},
{"word_boundary", "[a-zA-Z]+", "hello world test", 1},
{"whitespace", "[ \\t\\n]+", "hello world", 1},
{"identifier", "[a-zA-Z_][a-zA-Z0-9_]*", "_privateVar123", 1},
{"number_int", "-?[0-9]+", "-12345", 1},
{"number_float", "-?[0-9]+\\.[0-9]+", "3.14159", 1},
{"long_text_start", "^The", "The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog.", 1},
{"long_text_end", "dog\\.$", "The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog.", 1},
{"long_text_middle", "fox", "The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog.", 1},
{"long_text_nomatch", "elephant", "The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog.", 0},
{"repeated_ab", "(ab){5}", "ababababab", 1},
{"repeated_word", "(hello ){3}", "hello hello hello ", 1},
{"alternation_long", "one|two|three|four|five|six|seven|eight|nine|ten", "the number is seven", 1},
{"escape_dot", "3\\.14", "pi is 3.14", 1},
{"escape_star", "a\\*b", "a*b", 1},
{"escape_plus", "c\\+\\+", "c++", 1},
{"escape_parens", "\\(test\\)", "(test)", 1},
{"escape_brackets", "\\[0\\]", "array[0]", 1},
{"stress_star", "a*a*a*a*a*b", "aaaaab", 1},
{"stress_plus", "a+a+a+a+a+b", "aaaaab", 1},
{"stress_nested", "((a+)+)+b", "aaaab", 1},
{"stress_alt", "(a|aa|aaa|aaaa)+b", "aaaab", 1},
{"nomatch_literal", "notfound", "the quick brown fox", 0},
{"nomatch_pattern", "^end", "start middle end", 0},
{"nomatch_class", "[0-9]+", "no digits here", 0},
/* Sentinel: loops over this table stop at the first entry whose name is NULL. */
{NULL, NULL, NULL, 0}
};
/*
 * Return a timestamp in microseconds for measuring elapsed intervals.
 *
 * Reads CLOCK_MONOTONIC so that a wall-clock adjustment (NTP step, manual
 * change) in the middle of a run cannot skew or negate a measured interval,
 * which is possible with gettimeofday(). Only the difference between two
 * return values is meaningful. If the monotonic clock cannot be read, the
 * function falls back to gettimeofday().
 */
static double get_time_us(void) {
    struct timespec ts;

    if (clock_gettime(CLOCK_MONOTONIC, &ts) != 0) {
        /* Fallback path: wall-clock time, same unit. */
        struct timeval tv;
        gettimeofday(&tv, NULL);
        return tv.tv_sec * 1000000.0 + tv.tv_usec;
    }
    return ts.tv_sec * 1000000.0 + ts.tv_nsec / 1000.0;
}
/* Time lorex compile and search for one case and fill the lorex_* fields of res. */
static void bench_lorex(const benchmark_t *bench, result_t *res) {
    lorex_error_t err;
    lorex_regex_t *re;
    double start, end;

    /* Untimed warm-up rounds: compile, search once, free. */
    for (int i = 0; i < WARMUP; i++) {
        re = lorex_compile(bench->pattern, &err);
        if (re) {
            lorex_match_t m;
            lorex_search(re, bench->text, &m);
            lorex_free(re);
        }
    }

    /* Compile phase: each iteration is one compile + free. */
    start = get_time_us();
    for (int i = 0; i < ITERATIONS; i++) {
        re = lorex_compile(bench->pattern, &err);
        if (!re) {
            res->lorex_failed = 1;
            break;
        }
        lorex_free(re);
    }
    end = get_time_us();
    res->lorex_compile_us = (end - start) / ITERATIONS;

    /*
     * Match phase: compile once OUTSIDE the timed window so the reported
     * match time contains only lorex_search calls.
     */
    re = lorex_compile(bench->pattern, &err);
    if (re) {
        start = get_time_us();
        for (int i = 0; i < ITERATIONS; i++) {
            lorex_match_t m;
            res->lorex_matched = lorex_search(re, bench->text, &m) ? 1 : 0;
        }
        end = get_time_us();
        res->lorex_match_us = (end - start) / ITERATIONS;
        lorex_free(re);
    } else {
        /* No search could be timed; make sure callers do not use the zeros. */
        res->lorex_failed = 1;
    }

    res->lorex_total_us = res->lorex_compile_us + res->lorex_match_us;
}

/* Time POSIX regcomp and regexec for one case and fill the posix_* fields of res. */
static void bench_posix(const benchmark_t *bench, result_t *res) {
    regex_t preg;
    double start, end;

    /* Untimed warm-up rounds: compile, execute once, free. */
    for (int i = 0; i < WARMUP; i++) {
        if (regcomp(&preg, bench->pattern, REG_EXTENDED) == 0) {
            regmatch_t pmatch[1];
            regexec(&preg, bench->text, 1, pmatch, 0);
            regfree(&preg);
        }
    }

    /* Compile phase: each iteration is one regcomp + regfree. */
    start = get_time_us();
    for (int i = 0; i < ITERATIONS; i++) {
        if (regcomp(&preg, bench->pattern, REG_EXTENDED) != 0) {
            res->posix_failed = 1;
            break;
        }
        regfree(&preg);
    }
    end = get_time_us();
    res->posix_compile_us = (end - start) / ITERATIONS;

    /* Match phase: as for lorex, the compile stays outside the timed window. */
    if (regcomp(&preg, bench->pattern, REG_EXTENDED) == 0) {
        start = get_time_us();
        for (int i = 0; i < ITERATIONS; i++) {
            regmatch_t pmatch[1];
            res->posix_matched = (regexec(&preg, bench->text, 1, pmatch, 0) == 0) ? 1 : 0;
        }
        end = get_time_us();
        res->posix_match_us = (end - start) / ITERATIONS;
        regfree(&preg);
    } else {
        res->posix_failed = 1;
    }

    res->posix_total_us = res->posix_compile_us + res->posix_match_us;
}

/*
 * Run one benchmark case through both engines.
 *
 * For each engine this performs WARMUP untimed rounds, then times ITERATIONS
 * compile+free cycles, then times ITERATIONS searches against a single
 * pre-compiled pattern. All *_us fields are averages per iteration in
 * microseconds; *_total_us is compile + match.
 *
 * Returns a result_t whose *_failed flag is 1 when that engine could not
 * compile bench->pattern; in that case the engine's timing fields are not
 * meaningful. *_matched holds the outcome of the last search iteration.
 */
static result_t run_benchmark(benchmark_t *bench) {
    result_t res = {0};

    bench_lorex(bench, &res);
    bench_posix(bench, &res);
    return res;
}
/* Horizontal rules that frame the report sections and tables. */
static const char HEAVY_RULE[] = "================================================================================\n";
static const char LIGHT_RULE[] = "--------------------------------------------------------------------------------\n";

/* Print a section title framed by heavy rules, followed by one blank line. */
static void print_heading(const char *title) {
    fputs(HEAVY_RULE, stdout);
    printf("%s\n", title);
    fputs(HEAVY_RULE, stdout);
    fputs("\n", stdout);
}

/* Return num / den, or 0.0 when den is not positive, so the report never shows inf or NaN. */
static double safe_ratio(double num, double den) {
    return den > 0.0 ? num / den : 0.0;
}

/* Return part as a percentage of whole, or 0.0 when whole is zero. */
static double percent_of(int part, int whole) {
    return whole > 0 ? 100.0 * part / whole : 0.0;
}

/*
 * Benchmark driver: runs every case in benchmarks[] exactly once, caches the
 * results, and prints five report sections from that single set of
 * measurements (per-test table, detailed compile/match table, summary,
 * category breakdown, per-pattern details).
 *
 * Running each case once keeps all sections consistent with each other and
 * avoids repeating the full benchmark for every section.
 *
 * Always returns 0.
 */
int main(void) {
    /* One slot per table entry; the slot matching the NULL sentinel stays unused. */
    static result_t results[sizeof(benchmarks) / sizeof(benchmarks[0])];

    print_heading(" LOREX vs POSIX REGEX PERFORMANCE BENCHMARK");
    printf("Configuration:\n");
    printf(" Iterations per test: %d\n", ITERATIONS);
    printf(" Warmup iterations: %d\n", WARMUP);
    printf("\n");

    int total_tests = 0;
    int lorex_wins = 0;
    int posix_wins = 0;
    int ties = 0;
    double total_lorex_time = 0;
    double total_posix_time = 0;
    int lorex_compile_wins = 0;
    int posix_compile_wins = 0;
    int lorex_match_wins = 0;
    int posix_match_wins = 0;

    /* ---- Section 1: per-test totals (this is where the benchmarks actually run) ---- */
    fputs(HEAVY_RULE, stdout);
    printf("%-25s | %-12s | %-12s | %-12s | %-8s\n", "TEST NAME", "LOREX (us)", "POSIX (us)", "SPEEDUP", "WINNER");
    fputs(HEAVY_RULE, stdout);

    for (int i = 0; benchmarks[i].name != NULL; i++) {
        benchmark_t *bench = &benchmarks[i];
        results[i] = run_benchmark(bench);
        const result_t *res = &results[i];

        if (res->lorex_failed || res->posix_failed) {
            printf("%-25s | %-12s | %-12s | %-12s | %-8s\n",
                   bench->name,
                   res->lorex_failed ? "FAILED" : "OK",
                   res->posix_failed ? "FAILED" : "OK",
                   "-", "-");
            continue;
        }

        total_tests++;
        total_lorex_time += res->lorex_total_us;
        total_posix_time += res->posix_total_us;

        /* A speedup within 5% of 1.0 either way is reported as a tie. */
        double speedup = safe_ratio(res->posix_total_us, res->lorex_total_us);
        const char *winner;
        if (speedup > 1.05) {
            winner = "LOREX";
            lorex_wins++;
        } else if (speedup < 0.95) {
            winner = "POSIX";
            posix_wins++;
        } else {
            winner = "TIE";
            ties++;
        }

        if (res->lorex_compile_us < res->posix_compile_us) lorex_compile_wins++;
        else posix_compile_wins++;
        if (res->lorex_match_us < res->posix_match_us) lorex_match_wins++;
        else posix_match_wins++;

        printf("%-25s | %10.3f | %10.3f | %10.2fx | %-8s\n",
               bench->name,
               res->lorex_total_us,
               res->posix_total_us,
               speedup,
               winner);
    }
    fputs(HEAVY_RULE, stdout);
    fputs("\n", stdout);

    /* ---- Section 2: compile/match split per test ---- */
    print_heading(" DETAILED RESULTS");
    printf("%-25s | %-20s | %-20s\n", "TEST NAME", "LOREX (compile/match)", "POSIX (compile/match)");
    fputs(LIGHT_RULE, stdout);
    for (int i = 0; benchmarks[i].name != NULL; i++) {
        const result_t *res = &results[i];
        if (res->lorex_failed || res->posix_failed) continue;
        printf("%-25s | %8.3f / %8.3f | %8.3f / %8.3f\n",
               benchmarks[i].name,
               res->lorex_compile_us, res->lorex_match_us,
               res->posix_compile_us, res->posix_match_us);
    }

    /* ---- Section 3: summary ---- */
    fputs("\n", stdout);
    print_heading(" SUMMARY");
    printf("Total tests: %d\n", total_tests);
    printf("\n");
    printf("Overall wins:\n");
    printf(" LOREX wins: %d (%.1f%%)\n", lorex_wins, percent_of(lorex_wins, total_tests));
    printf(" POSIX wins: %d (%.1f%%)\n", posix_wins, percent_of(posix_wins, total_tests));
    printf(" Ties: %d (%.1f%%)\n", ties, percent_of(ties, total_tests));
    printf("\n");
    printf("Compilation phase wins:\n");
    printf(" LOREX faster: %d\n", lorex_compile_wins);
    printf(" POSIX faster: %d\n", posix_compile_wins);
    printf("\n");
    printf("Matching phase wins:\n");
    printf(" LOREX faster: %d\n", lorex_match_wins);
    printf(" POSIX faster: %d\n", posix_match_wins);
    printf("\n");
    printf("Total time (all tests):\n");
    printf(" LOREX: %.3f us\n", total_lorex_time);
    printf(" POSIX: %.3f us\n", total_posix_time);
    {
        /* Always report the ratio as "faster engine is N times faster". */
        int lorex_faster = total_posix_time > total_lorex_time;
        double overall = lorex_faster
            ? safe_ratio(total_posix_time, total_lorex_time)
            : safe_ratio(total_lorex_time, total_posix_time);
        printf(" Overall speedup: %.2fx %s\n",
               overall,
               lorex_faster ? "(LOREX faster)" : "(POSIX faster)");
    }

    /* ---- Section 4: totals grouped by test-name prefix ---- */
    fputs("\n", stdout);
    print_heading(" CATEGORY BREAKDOWN");

    typedef struct {
        const char *category;
        const char *prefix;
        double lorex_total;
        double posix_total;
        int count;
    } category_t;

    /*
     * NOTE(review): "Real-world patterns" only collects cases named "email_*";
     * cases such as ip_address, url_http or date_iso match no prefix here and
     * are therefore left out of the breakdown. Confirm whether that is intended.
     */
    category_t categories[] = {
        {"Literal matching", "literal_", 0, 0, 0},
        {"Dot metacharacter", "dot_", 0, 0, 0},
        {"Anchors", "anchor_", 0, 0, 0},
        {"Star quantifier", "star_", 0, 0, 0},
        {"Plus quantifier", "plus_", 0, 0, 0},
        {"Question quantifier", "question_", 0, 0, 0},
        {"Character classes", "class_", 0, 0, 0},
        {"Alternation", "alt_", 0, 0, 0},
        {"Groups", "group_", 0, 0, 0},
        {"Brace quantifiers", "quant_", 0, 0, 0},
        {"Real-world patterns", "email_", 0, 0, 0},
        {"Escape sequences", "escape_", 0, 0, 0},
        {"Stress tests", "stress_", 0, 0, 0},
        {"No-match tests", "nomatch_", 0, 0, 0},
        {NULL, NULL, 0, 0, 0}
    };

    for (int i = 0; benchmarks[i].name != NULL; i++) {
        const result_t *res = &results[i];
        if (res->lorex_failed || res->posix_failed) continue;
        /* A case is credited to the first category whose prefix it starts with. */
        for (int j = 0; categories[j].category != NULL; j++) {
            if (strncmp(benchmarks[i].name, categories[j].prefix, strlen(categories[j].prefix)) == 0) {
                categories[j].lorex_total += res->lorex_total_us;
                categories[j].posix_total += res->posix_total_us;
                categories[j].count++;
                break;
            }
        }
    }

    printf("%-25s | %-12s | %-12s | %-12s | %-8s\n", "CATEGORY", "LOREX (us)", "POSIX (us)", "SPEEDUP", "WINNER");
    fputs(LIGHT_RULE, stdout);
    for (int i = 0; categories[i].category != NULL; i++) {
        if (categories[i].count == 0) continue;
        double speedup = safe_ratio(categories[i].posix_total, categories[i].lorex_total);
        const char *winner = speedup > 1.0 ? "LOREX" : "POSIX";
        double shown = speedup > 1.0 ? speedup : (speedup > 0.0 ? 1.0 / speedup : 0.0);
        printf("%-25s | %10.3f | %10.3f | %10.2fx | %-8s\n",
               categories[i].category,
               categories[i].lorex_total,
               categories[i].posix_total,
               shown,
               winner);
    }

    /* ---- Section 5: one paragraph per case, including cases that failed to compile ---- */
    fputs("\n", stdout);
    print_heading(" PATTERN DETAILS");
    for (int i = 0; benchmarks[i].name != NULL; i++) {
        const benchmark_t *bench = &benchmarks[i];
        const result_t *res = &results[i];

        printf("Test: %s\n", bench->name);
        printf(" Pattern: %s\n", bench->pattern);
        /* Subject text is truncated to 50 characters for display. */
        printf(" Text: %.50s%s\n", bench->text, strlen(bench->text) > 50 ? "..." : "");
        printf(" Expected: %s\n", bench->expect_match ? "MATCH" : "NO MATCH");

        if (res->lorex_failed) {
            printf(" LOREX: FAILED TO COMPILE\n");
        } else {
            printf(" LOREX: %s (compile: %.3f us, match: %.3f us, total: %.3f us)\n",
                   res->lorex_matched ? "MATCHED" : "NO MATCH",
                   res->lorex_compile_us, res->lorex_match_us, res->lorex_total_us);
        }

        if (res->posix_failed) {
            printf(" POSIX: FAILED TO COMPILE\n");
        } else {
            printf(" POSIX: %s (compile: %.3f us, match: %.3f us, total: %.3f us)\n",
                   res->posix_matched ? "MATCHED" : "NO MATCH",
                   res->posix_compile_us, res->posix_match_us, res->posix_total_us);
        }

        if (!res->lorex_failed && !res->posix_failed) {
            double speedup = safe_ratio(res->posix_total_us, res->lorex_total_us);
            if (speedup > 1.0) {
                printf(" Result: LOREX is %.2fx faster\n", speedup);
            } else {
                printf(" Result: POSIX is %.2fx faster\n", speedup > 0.0 ? 1.0 / speedup : 0.0);
            }
        }
        printf("\n");
    }

    fputs(HEAVY_RULE, stdout);
    printf(" BENCHMARK COMPLETE\n");
    fputs(HEAVY_RULE, stdout);
    return 0;
}

View File

@ -1,5 +1,5 @@
/* retoor <retoor@molodetz.nl> */
#include "../include/loreg.h"
#include "../include/lorex.h"
#include <stdio.h>
#include <string.h>
#include <time.h>
@ -22,211 +22,211 @@ static int total_failed = 0;
} while(0)
TEST(basic_literals) {
loreg_error_t err;
loreg_regex_t *re = loreg_compile("hello", &err);
lorex_error_t err;
lorex_regex_t *re = lorex_compile("hello", &err);
ASSERT(re != NULL, "compile hello");
loreg_match_t m;
ASSERT(loreg_search(re, "hello", &m), "match hello");
ASSERT(loreg_search(re, "say hello world", &m), "search hello");
ASSERT(!loreg_search(re, "helo", &m), "no match helo");
lorex_match_t m;
ASSERT(lorex_search(re, "hello", &m), "match hello");
ASSERT(lorex_search(re, "say hello world", &m), "search hello");
ASSERT(!lorex_search(re, "helo", &m), "no match helo");
loreg_free(re);
lorex_free(re);
}
TEST(metacharacters) {
loreg_error_t err;
loreg_match_t m;
lorex_error_t err;
lorex_match_t m;
loreg_regex_t *re = loreg_compile("a.c", &err);
lorex_regex_t *re = lorex_compile("a.c", &err);
ASSERT(re != NULL, "compile a.c");
ASSERT(loreg_search(re, "abc", &m), "match abc");
ASSERT(loreg_search(re, "axc", &m), "match axc");
ASSERT(!loreg_search(re, "ac", &m), "no match ac");
loreg_free(re);
ASSERT(lorex_search(re, "abc", &m), "match abc");
ASSERT(lorex_search(re, "axc", &m), "match axc");
ASSERT(!lorex_search(re, "ac", &m), "no match ac");
lorex_free(re);
re = loreg_compile("^start", &err);
re = lorex_compile("^start", &err);
ASSERT(re != NULL, "compile ^start");
ASSERT(loreg_search(re, "start here", &m), "match start here");
ASSERT(!loreg_search(re, "not start", &m), "no match not start");
loreg_free(re);
ASSERT(lorex_search(re, "start here", &m), "match start here");
ASSERT(!lorex_search(re, "not start", &m), "no match not start");
lorex_free(re);
re = loreg_compile("end$", &err);
re = lorex_compile("end$", &err);
ASSERT(re != NULL, "compile end$");
ASSERT(loreg_search(re, "the end", &m), "match the end");
ASSERT(!loreg_search(re, "end here", &m), "no match end here");
loreg_free(re);
ASSERT(lorex_search(re, "the end", &m), "match the end");
ASSERT(!lorex_search(re, "end here", &m), "no match end here");
lorex_free(re);
}
TEST(quantifiers) {
loreg_error_t err;
loreg_match_t m;
lorex_error_t err;
lorex_match_t m;
loreg_regex_t *re = loreg_compile("ab*c", &err);
lorex_regex_t *re = lorex_compile("ab*c", &err);
ASSERT(re != NULL, "compile ab*c");
ASSERT(loreg_search(re, "ac", &m), "match ac");
ASSERT(loreg_search(re, "abc", &m), "match abc");
ASSERT(loreg_search(re, "abbbbc", &m), "match abbbbc");
loreg_free(re);
ASSERT(lorex_search(re, "ac", &m), "match ac");
ASSERT(lorex_search(re, "abc", &m), "match abc");
ASSERT(lorex_search(re, "abbbbc", &m), "match abbbbc");
lorex_free(re);
re = loreg_compile("ab+c", &err);
re = lorex_compile("ab+c", &err);
ASSERT(re != NULL, "compile ab+c");
ASSERT(!loreg_search(re, "ac", &m), "no match ac");
ASSERT(loreg_search(re, "abc", &m), "match abc");
ASSERT(loreg_search(re, "abbbbc", &m), "match abbbbc");
loreg_free(re);
ASSERT(!lorex_search(re, "ac", &m), "no match ac");
ASSERT(lorex_search(re, "abc", &m), "match abc");
ASSERT(lorex_search(re, "abbbbc", &m), "match abbbbc");
lorex_free(re);
re = loreg_compile("ab?c", &err);
re = lorex_compile("ab?c", &err);
ASSERT(re != NULL, "compile ab?c");
ASSERT(loreg_search(re, "ac", &m), "match ac");
ASSERT(loreg_search(re, "abc", &m), "match abc");
ASSERT(!loreg_search(re, "abbc", &m), "no match abbc");
loreg_free(re);
ASSERT(lorex_search(re, "ac", &m), "match ac");
ASSERT(lorex_search(re, "abc", &m), "match abc");
ASSERT(!lorex_search(re, "abbc", &m), "no match abbc");
lorex_free(re);
re = loreg_compile("a{3}", &err);
re = lorex_compile("a{3}", &err);
ASSERT(re != NULL, "compile a{3}");
ASSERT(loreg_search(re, "aaa", &m), "match aaa");
ASSERT(!loreg_search(re, "aa", &m), "no match aa");
loreg_free(re);
ASSERT(lorex_search(re, "aaa", &m), "match aaa");
ASSERT(!lorex_search(re, "aa", &m), "no match aa");
lorex_free(re);
re = loreg_compile("a{2,4}", &err);
re = lorex_compile("a{2,4}", &err);
ASSERT(re != NULL, "compile a{2,4}");
ASSERT(loreg_search(re, "aa", &m), "match aa");
ASSERT(loreg_search(re, "aaa", &m), "match aaa");
ASSERT(loreg_search(re, "aaaa", &m), "match aaaa");
ASSERT(!loreg_search(re, "a", &m), "no match a");
loreg_free(re);
ASSERT(lorex_search(re, "aa", &m), "match aa");
ASSERT(lorex_search(re, "aaa", &m), "match aaa");
ASSERT(lorex_search(re, "aaaa", &m), "match aaaa");
ASSERT(!lorex_search(re, "a", &m), "no match a");
lorex_free(re);
}
TEST(character_classes) {
loreg_error_t err;
loreg_match_t m;
lorex_error_t err;
lorex_match_t m;
loreg_regex_t *re = loreg_compile("[aeiou]", &err);
lorex_regex_t *re = lorex_compile("[aeiou]", &err);
ASSERT(re != NULL, "compile [aeiou]");
ASSERT(loreg_search(re, "a", &m), "match a");
ASSERT(loreg_search(re, "test", &m), "match test");
ASSERT(!loreg_search(re, "xyz", &m), "no match xyz");
loreg_free(re);
ASSERT(lorex_search(re, "a", &m), "match a");
ASSERT(lorex_search(re, "test", &m), "match test");
ASSERT(!lorex_search(re, "xyz", &m), "no match xyz");
lorex_free(re);
re = loreg_compile("[a-z]", &err);
re = lorex_compile("[a-z]", &err);
ASSERT(re != NULL, "compile [a-z]");
ASSERT(loreg_search(re, "m", &m), "match m");
ASSERT(!loreg_search(re, "5", &m), "no match 5");
loreg_free(re);
ASSERT(lorex_search(re, "m", &m), "match m");
ASSERT(!lorex_search(re, "5", &m), "no match 5");
lorex_free(re);
re = loreg_compile("[^0-9]", &err);
re = lorex_compile("[^0-9]", &err);
ASSERT(re != NULL, "compile [^0-9]");
ASSERT(loreg_search(re, "a", &m), "match a");
ASSERT(!loreg_search(re, "5", &m), "no match 5");
loreg_free(re);
ASSERT(lorex_search(re, "a", &m), "match a");
ASSERT(!lorex_search(re, "5", &m), "no match 5");
lorex_free(re);
re = loreg_compile("\\d", &err);
re = lorex_compile("\\d", &err);
ASSERT(re != NULL, "compile \\d");
ASSERT(loreg_search(re, "5", &m), "match 5");
ASSERT(!loreg_search(re, "a", &m), "no match a");
loreg_free(re);
ASSERT(lorex_search(re, "5", &m), "match 5");
ASSERT(!lorex_search(re, "a", &m), "no match a");
lorex_free(re);
re = loreg_compile("\\w+", &err);
re = lorex_compile("\\w+", &err);
ASSERT(re != NULL, "compile \\w+");
ASSERT(loreg_search(re, "hello_123", &m), "match hello_123");
loreg_free(re);
ASSERT(lorex_search(re, "hello_123", &m), "match hello_123");
lorex_free(re);
re = loreg_compile("\\s", &err);
re = lorex_compile("\\s", &err);
ASSERT(re != NULL, "compile \\s");
ASSERT(loreg_search(re, " ", &m), "match space");
ASSERT(loreg_search(re, "\t", &m), "match tab");
ASSERT(!loreg_search(re, "a", &m), "no match a");
loreg_free(re);
ASSERT(lorex_search(re, " ", &m), "match space");
ASSERT(lorex_search(re, "\t", &m), "match tab");
ASSERT(!lorex_search(re, "a", &m), "no match a");
lorex_free(re);
}
TEST(groups) {
loreg_error_t err;
loreg_match_t m;
lorex_error_t err;
lorex_match_t m;
loreg_regex_t *re = loreg_compile("(ab)+", &err);
lorex_regex_t *re = lorex_compile("(ab)+", &err);
ASSERT(re != NULL, "compile (ab)+");
ASSERT(loreg_search(re, "ab", &m), "match ab");
ASSERT(loreg_search(re, "abab", &m), "match abab");
ASSERT(!loreg_search(re, "a", &m), "no match a");
loreg_free(re);
ASSERT(lorex_search(re, "ab", &m), "match ab");
ASSERT(lorex_search(re, "abab", &m), "match abab");
ASSERT(!lorex_search(re, "a", &m), "no match a");
lorex_free(re);
re = loreg_compile("(\\d+)-(\\d+)", &err);
re = lorex_compile("(\\d+)-(\\d+)", &err);
ASSERT(re != NULL, "compile groups");
ASSERT(loreg_search(re, "123-456", &m), "match 123-456");
ASSERT(lorex_search(re, "123-456", &m), "match 123-456");
ASSERT(m.group_count == 2, "2 groups");
ASSERT(m.groups[0].matched, "group 0 matched");
ASSERT(m.groups[1].matched, "group 1 matched");
loreg_free(re);
lorex_free(re);
}
TEST(alternation) {
loreg_error_t err;
loreg_match_t m;
lorex_error_t err;
lorex_match_t m;
loreg_regex_t *re = loreg_compile("cat|dog", &err);
lorex_regex_t *re = lorex_compile("cat|dog", &err);
ASSERT(re != NULL, "compile cat|dog");
ASSERT(loreg_search(re, "cat", &m), "match cat");
ASSERT(loreg_search(re, "dog", &m), "match dog");
ASSERT(!loreg_search(re, "rat", &m), "no match rat");
loreg_free(re);
ASSERT(lorex_search(re, "cat", &m), "match cat");
ASSERT(lorex_search(re, "dog", &m), "match dog");
ASSERT(!lorex_search(re, "rat", &m), "no match rat");
lorex_free(re);
re = loreg_compile("(red|blue) car", &err);
re = lorex_compile("(red|blue) car", &err);
ASSERT(re != NULL, "compile (red|blue) car");
ASSERT(loreg_search(re, "red car", &m), "match red car");
ASSERT(loreg_search(re, "blue car", &m), "match blue car");
ASSERT(!loreg_search(re, "green car", &m), "no match green car");
loreg_free(re);
ASSERT(lorex_search(re, "red car", &m), "match red car");
ASSERT(lorex_search(re, "blue car", &m), "match blue car");
ASSERT(!lorex_search(re, "green car", &m), "no match green car");
lorex_free(re);
}
TEST(escapes) {
loreg_error_t err;
loreg_match_t m;
lorex_error_t err;
lorex_match_t m;
loreg_regex_t *re = loreg_compile("1\\.5", &err);
lorex_regex_t *re = lorex_compile("1\\.5", &err);
ASSERT(re != NULL, "compile 1\\.5");
ASSERT(loreg_search(re, "1.5", &m), "match 1.5");
ASSERT(!loreg_search(re, "1x5", &m), "no match 1x5");
loreg_free(re);
ASSERT(lorex_search(re, "1.5", &m), "match 1.5");
ASSERT(!lorex_search(re, "1x5", &m), "no match 1x5");
lorex_free(re);
re = loreg_compile("\\(test\\)", &err);
re = lorex_compile("\\(test\\)", &err);
ASSERT(re != NULL, "compile \\(test\\)");
ASSERT(loreg_search(re, "(test)", &m), "match (test)");
loreg_free(re);
ASSERT(lorex_search(re, "(test)", &m), "match (test)");
lorex_free(re);
}
TEST(real_patterns) {
loreg_error_t err;
loreg_match_t m;
lorex_error_t err;
lorex_match_t m;
loreg_regex_t *re = loreg_compile("[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}", &err);
lorex_regex_t *re = lorex_compile("[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}", &err);
ASSERT(re != NULL, "compile email");
ASSERT(loreg_search(re, "user@example.com", &m), "match email");
ASSERT(!loreg_search(re, "invalid", &m), "no match invalid");
loreg_free(re);
ASSERT(lorex_search(re, "user@example.com", &m), "match email");
ASSERT(!lorex_search(re, "invalid", &m), "no match invalid");
lorex_free(re);
re = loreg_compile("\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}", &err);
re = lorex_compile("\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}", &err);
ASSERT(re != NULL, "compile ip");
ASSERT(loreg_search(re, "192.168.1.1", &m), "match ip");
loreg_free(re);
ASSERT(lorex_search(re, "192.168.1.1", &m), "match ip");
lorex_free(re);
re = loreg_compile("https?://[a-zA-Z0-9.-]+(/[a-zA-Z0-9./-]*)?", &err);
re = lorex_compile("https?://[a-zA-Z0-9.-]+(/[a-zA-Z0-9./-]*)?", &err);
ASSERT(re != NULL, "compile url");
ASSERT(loreg_search(re, "http://example.com", &m), "match http");
ASSERT(loreg_search(re, "https://example.com/path", &m), "match https");
loreg_free(re);
ASSERT(lorex_search(re, "http://example.com", &m), "match http");
ASSERT(lorex_search(re, "https://example.com/path", &m), "match https");
lorex_free(re);
}
TEST(error_handling) {
loreg_error_t err;
lorex_error_t err;
loreg_regex_t *re = loreg_compile("(abc", &err);
lorex_regex_t *re = lorex_compile("(abc", &err);
ASSERT(re == NULL, "unbalanced paren");
ASSERT(err == LOREG_ERR_UNBALANCED_PAREN, "correct error");
ASSERT(err == LOREX_ERR_UNBALANCED_PAREN, "correct error");
}
int main(void) {
printf("loreg comprehensive tests\n");
printf("lorex comprehensive tests\n");
printf("========================\n\n");
clock_t start = clock();

View File

@ -1,5 +1,5 @@
/* retoor <retoor@molodetz.nl> */
#include "../include/loreg.h"
#include "../include/lorex.h"
#include <stdio.h>
#include <string.h>
@ -10,22 +10,22 @@ static int failed = 0;
#define NO_MATCH(pat, txt) test_match(pat, txt, 0, __LINE__)
static void test_match(const char *pattern, const char *text, int expect, int line) {
loreg_error_t err;
loreg_regex_t *re = loreg_compile(pattern, &err);
lorex_error_t err;
lorex_regex_t *re = lorex_compile(pattern, &err);
if (!re) {
printf("FAIL line %d: compile error for '%s': %s\n", line, pattern, loreg_error_string(err));
printf("FAIL line %d: compile error for '%s': %s\n", line, pattern, lorex_error_string(err));
failed++;
return;
}
loreg_match_t m;
int result = loreg_search(re, text, &m) ? 1 : 0;
lorex_match_t m;
int result = lorex_search(re, text, &m) ? 1 : 0;
if (result != expect) {
printf("FAIL line %d: '%s' vs '%s' expected %s\n", line, pattern, text, expect ? "match" : "no match");
failed++;
} else {
passed++;
}
loreg_free(re);
lorex_free(re);
}
static void test_literals(void) {
@ -613,8 +613,240 @@ static void test_pathological_patterns(void) {
MATCH("(a?){5}a{5}", "aaaaa");
}
/*
 * Call lorex_match(re, text) and compare its truthiness with want (1 = should
 * match, 0 = should not). Counts a pass on agreement; otherwise prints
 * fail_msg verbatim and counts a failure.
 */
static void check_anchored(lorex_regex_t *re, const char *text, int want, const char *fail_msg) {
    lorex_match_t m;
    int got = lorex_match(re, text, &m) ? 1 : 0;

    if (got == want) {
        passed++;
        return;
    }
    fputs(fail_msg, stdout);
    failed++;
}

/*
 * Checks lorex_match against three patterns. The expectations encoded here
 * are that a match must begin at the first character of the text (so "xabc"
 * is rejected for "abc"), while trailing text is allowed unless the pattern
 * itself ends with '$'. A pattern that fails to compile is skipped without
 * being counted.
 */
static void test_anchored_match(void) {
    lorex_error_t err;
    lorex_regex_t *re;

    printf(" anchored match (lorex_match)...\n");

    re = lorex_compile("abc", &err);
    if (re) {
        check_anchored(re, "abc", 1, "FAIL: lorex_match should match 'abc' against 'abc'\n");
        check_anchored(re, "xabc", 0, "FAIL: lorex_match should not match 'abc' against 'xabc'\n");
        check_anchored(re, "abcx", 1, "FAIL: lorex_match should match 'abc' at start of 'abcx'\n");
        lorex_free(re);
    }

    re = lorex_compile("^abc$", &err);
    if (re) {
        check_anchored(re, "abc", 1, "FAIL: lorex_match should match '^abc$' against 'abc'\n");
        check_anchored(re, "abcx", 0, "FAIL: lorex_match should not match '^abc$' against 'abcx'\n");
        lorex_free(re);
    }

    re = lorex_compile("a.*z", &err);
    if (re) {
        check_anchored(re, "abcz", 1, "FAIL: lorex_match should match 'a.*z' against 'abcz'\n");
        check_anchored(re, "xabcz", 0, "FAIL: lorex_match should not match 'a.*z' against 'xabcz'\n");
        lorex_free(re);
    }
}
static void test_error_strings(void) {
printf(" error strings...\n");
if (strcmp(lorex_error_string(LOREX_OK), "success") == 0) {
passed++;
} else {
printf("FAIL: LOREX_OK should return 'success'\n");
failed++;
}
if (strcmp(lorex_error_string(LOREX_ERR_INVALID_PATTERN), "invalid pattern") == 0) {
passed++;
} else {
printf("FAIL: LOREX_ERR_INVALID_PATTERN error string\n");
failed++;
}
if (strcmp(lorex_error_string(LOREX_ERR_UNBALANCED_PAREN), "unbalanced parentheses") == 0) {
passed++;
} else {
printf("FAIL: LOREX_ERR_UNBALANCED_PAREN error string\n");
failed++;
}
if (strcmp(lorex_error_string(LOREX_ERR_EMPTY_GROUP), "empty group") == 0) {
passed++;
} else {
printf("FAIL: LOREX_ERR_EMPTY_GROUP error string\n");
failed++;
}
if (strcmp(lorex_error_string(LOREX_ERR_INVALID_QUANTIFIER), "invalid quantifier") == 0) {
passed++;
} else {
printf("FAIL: LOREX_ERR_INVALID_QUANTIFIER error string\n");
failed++;
}
if (strcmp(lorex_error_string(LOREX_ERR_INVALID_ESCAPE), "invalid escape sequence") == 0) {
passed++;
} else {
printf("FAIL: LOREX_ERR_INVALID_ESCAPE error string\n");
failed++;
}
if (strcmp(lorex_error_string(LOREX_ERR_OUT_OF_MEMORY), "out of memory") == 0) {
passed++;
} else {
printf("FAIL: LOREX_ERR_OUT_OF_MEMORY error string\n");
failed++;
}
if (strcmp(lorex_error_string(LOREX_ERR_STATE_OVERFLOW), "state overflow") == 0) {
passed++;
} else {
printf("FAIL: LOREX_ERR_STATE_OVERFLOW error string\n");
failed++;
}
if (strcmp(lorex_error_string((lorex_error_t)99), "unknown error") == 0) {
passed++;
} else {
printf("FAIL: unknown error code should return 'unknown error'\n");
failed++;
}
}
/*
 * Feeds malformed patterns to lorex_compile() and checks that each one is
 * rejected (NULL return).
 *
 * Rows flagged `need_paren_err` additionally require the reported error to
 * be LOREX_ERR_UNBALANCED_PAREN; the remaining rows accept any error code.
 * Every row adds one to either `passed` or `failed`.
 */
static void test_parser_errors(void) {
    const struct {
        const char *pattern;
        int need_paren_err;
        const char *fail_msg;
    } cases[] = {
        { "(abc",   1, "FAIL: '(abc' should fail with unbalanced paren\n" },
        { "((a)",   1, "FAIL: '((a)' should fail with unbalanced paren\n" },
        { "a{5,2}", 0, "FAIL: 'a{5,2}' should fail (min > max)\n" },
        { "*abc",   0, "FAIL: '*abc' should fail\n" },
        { "+abc",   0, "FAIL: '+abc' should fail\n" },
        { "?abc",   0, "FAIL: '?abc' should fail\n" },
    };
    lorex_error_t err;
    size_t idx;

    printf(" parser errors...\n");

    for (idx = 0; idx < sizeof cases / sizeof cases[0]; idx++) {
        lorex_regex_t *re = lorex_compile(cases[idx].pattern, &err);
        int rejected = (re == NULL);

        /* The error code is only inspected once compilation has failed. */
        if (rejected && cases[idx].need_paren_err) {
            rejected = (err == LOREX_ERR_UNBALANCED_PAREN);
        }
        if (rejected) {
            passed++;
            continue;
        }

        printf("%s", cases[idx].fail_msg);
        failed++;
        /* The pattern may have compiled unexpectedly; release it if so. */
        if (re) lorex_free(re);
    }
}
/*
 * Shorthand escapes (\d, \w, \s) used inside bracket expressions.
 *
 * Each "\\" in the C literals below is a single backslash in the regex, so
 * "[\\d]" is the pattern [\d]. MATCH / NO_MATCH are macros defined earlier
 * in this file; presumably they record a pass when the pattern is / is not
 * found in the text -- confirm against their definitions.
 */
static void test_bracket_char_classes(void) {
printf(" bracket character classes...\n");
/* \d as the only member of a class */
MATCH("[\\d]", "5");
MATCH("[\\d]+", "12345");
NO_MATCH("[\\d]", "a");
/* \w as the only member: letter, uppercase letter, digit, underscore, then a space */
MATCH("[\\w]", "a");
MATCH("[\\w]", "Z");
MATCH("[\\w]", "5");
MATCH("[\\w]", "_");
NO_MATCH("[\\w]", " ");
/* \s as the only member: space and tab */
MATCH("[\\s]", " ");
MATCH("[\\s]", "\t");
NO_MATCH("[\\s]", "a");
/* shorthand combined with a literal character */
MATCH("[a\\d]", "a");
MATCH("[a\\d]", "5");
NO_MATCH("[a\\d]", "b");
/* shorthand combined with a range */
MATCH("[\\da-z]", "5");
MATCH("[\\da-z]", "m");
NO_MATCH("[\\da-z]", "M");
/* several shorthands / ranges in one class, and a class ending in '-' */
MATCH("[\\w\\s]+", "hello world");
MATCH("[0-9\\s]+", "1 2 3");
MATCH("[\\w-]+", "hello-world");
}
/*
 * Control-character escapes \n, \t and \r in patterns.
 *
 * In every case the pattern argument holds a two-character escape
 * (backslash + letter, written "\\n" in C) while the text argument holds
 * the real control character (written "\n" in C). MATCH / NO_MATCH are
 * macros defined earlier in this file; presumably they record a pass when
 * the pattern is / is not found in the text -- confirm against their
 * definitions.
 */
static void test_special_escapes(void) {
printf(" special escape sequences...\n");
/* each escape alone and between literals */
MATCH("\\n", "\n");
MATCH("a\\nb", "a\nb");
MATCH("\\t", "\t");
MATCH("a\\tb", "a\tb");
MATCH("\\r", "\r");
MATCH("a\\rb", "a\rb");
/* all three in sequence */
MATCH("\\n\\t\\r", "\n\t\r");
/* the same escapes inside bracket expressions */
MATCH("[\\n]", "\n");
MATCH("[\\t]", "\t");
MATCH("[\\r]", "\r");
MATCH("[\\n\\t]+", "\n\t\n");
/* the escape must not be satisfied by the plain letter after the backslash */
NO_MATCH("\\n", "n");
NO_MATCH("\\t", "t");
NO_MATCH("\\r", "r");
}
int main(void) {
printf("loreg integration tests\n");
printf("lorex integration tests\n");
printf("=======================\n\n");
test_literals();
@ -641,6 +873,11 @@ int main(void) {
test_nested_groups();
test_real_world_patterns();
test_pathological_patterns();
test_anchored_match();
test_error_strings();
test_parser_errors();
test_bracket_char_classes();
test_special_escapes();
printf("\n=======================\n");
printf("integration: %d passed, %d failed\n", passed, failed);

View File

@ -1,5 +1,5 @@
/* retoor <retoor@molodetz.nl> */
#include "../include/loreg.h"
#include "../include/lorex.h"
#include <stdio.h>
#include <string.h>
@ -23,21 +23,21 @@ static int tests_failed = 0;
} while(0)
#define ASSERT_MATCH(pattern, text) do { \
loreg_error_t err; \
loreg_regex_t *re = loreg_compile(pattern, &err); \
lorex_error_t err; \
lorex_regex_t *re = lorex_compile(pattern, &err); \
ASSERT(re != NULL); \
loreg_match_t result; \
ASSERT(loreg_search(re, text, &result) == true); \
loreg_free(re); \
lorex_match_t result; \
ASSERT(lorex_search(re, text, &result) == true); \
lorex_free(re); \
} while(0)
#define ASSERT_NO_MATCH(pattern, text) do { \
loreg_error_t err; \
loreg_regex_t *re = loreg_compile(pattern, &err); \
lorex_error_t err; \
lorex_regex_t *re = lorex_compile(pattern, &err); \
ASSERT(re != NULL); \
loreg_match_t result; \
ASSERT(loreg_search(re, text, &result) == false); \
loreg_free(re); \
lorex_match_t result; \
ASSERT(lorex_search(re, text, &result) == false); \
lorex_free(re); \
} while(0)
TEST(simple_char) {
@ -209,53 +209,53 @@ TEST(complex_url) {
}
TEST(group_capture) {
loreg_error_t err;
loreg_regex_t *re = loreg_compile("(\\d+)-(\\d+)", &err);
lorex_error_t err;
lorex_regex_t *re = lorex_compile("(\\d+)-(\\d+)", &err);
ASSERT(re != NULL);
loreg_match_t result;
ASSERT(loreg_search(re, "123-456", &result));
lorex_match_t result;
ASSERT(lorex_search(re, "123-456", &result));
ASSERT(result.group_count == 2);
ASSERT(result.groups[0].matched);
ASSERT(result.groups[1].matched);
loreg_free(re);
lorex_free(re);
}
TEST(nested_groups) {
loreg_error_t err;
loreg_regex_t *re = loreg_compile("((a)(b))", &err);
lorex_error_t err;
lorex_regex_t *re = lorex_compile("((a)(b))", &err);
ASSERT(re != NULL);
loreg_match_t result;
ASSERT(loreg_search(re, "ab", &result));
lorex_match_t result;
ASSERT(lorex_search(re, "ab", &result));
ASSERT(result.group_count == 3);
loreg_free(re);
lorex_free(re);
}
TEST(empty_pattern) {
loreg_error_t err;
loreg_regex_t *re = loreg_compile("", &err);
lorex_error_t err;
lorex_regex_t *re = lorex_compile("", &err);
ASSERT(re != NULL);
loreg_match_t result;
ASSERT(loreg_match(re, "anything", &result));
lorex_match_t result;
ASSERT(lorex_match(re, "anything", &result));
loreg_free(re);
lorex_free(re);
}
TEST(match_position) {
loreg_error_t err;
loreg_regex_t *re = loreg_compile("test", &err);
lorex_error_t err;
lorex_regex_t *re = lorex_compile("test", &err);
ASSERT(re != NULL);
loreg_match_t result;
ASSERT(loreg_search(re, "xxxtestyyy", &result));
lorex_match_t result;
ASSERT(lorex_search(re, "xxxtestyyy", &result));
ASSERT(result.match_start == 3);
ASSERT(result.match_end == 7);
loreg_free(re);
lorex_free(re);
}
int main(void) {

View File

@ -27,11 +27,11 @@ static nfa_t *compile_pattern(const char *pattern) {
parser_t parser;
parser_init(&parser, pattern);
ast_node_t *ast = parser_parse(&parser);
if (!ast || parser_get_error(&parser) != LOREG_OK) {
if (!ast || parser_get_error(&parser) != LOREX_OK) {
ast_free(ast);
return NULL;
}
loreg_error_t error;
lorex_error_t error;
nfa_t *nfa = nfa_from_ast(ast, &error);
ast_free(ast);
return nfa;

View File

@ -245,7 +245,7 @@ TEST(complex_pattern) {
ast_node_t *ast = parser_parse(&parser);
ASSERT(ast != NULL);
ASSERT(parser_get_error(&parser) == LOREG_OK);
ASSERT(parser_get_error(&parser) == LOREX_OK);
ast_free(ast);
}
@ -255,7 +255,7 @@ TEST(unbalanced_paren) {
parser_init(&parser, "(abc");
ast_node_t *ast = parser_parse(&parser);
ASSERT(ast == NULL || parser_get_error(&parser) == LOREG_ERR_UNBALANCED_PAREN);
ASSERT(ast == NULL || parser_get_error(&parser) == LOREX_ERR_UNBALANCED_PAREN);
ast_free(ast);
}