chore: update c, h, md files
Some checks are pending
CI / build (push) Waiting to run
CI / test (push) Blocked by required conditions
CI / valgrind (push) Blocked by required conditions
CI / coverage (push) Blocked by required conditions

This commit is contained in:
retoor 2026-01-04 00:04:48 +01:00
commit 3d9c4aa00b
26 changed files with 4459 additions and 0 deletions

93
.gitea/workflows/ci.yml Normal file
View File

@ -0,0 +1,93 @@
# retoor <retoor@molodetz.nl>
# Continuous integration: build -> (test, valgrind) -> coverage.
name: CI

# Run for pushes to, and pull requests against, the two mainline branches.
on:
  push:
    branches: [main, master]
  pull_request:
    branches: [main, master]

jobs:
  # Compile the optimized binary and then the debug flavour.
  build:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout
        uses: actions/checkout@v4
      - name: Install dependencies
        run: |
          sudo apt-get update
          sudo apt-get install -y gcc make valgrind
      - name: Build release
        run: make
      - name: Build debug
        run: make debug

  # Run the complete test suite once the build job has succeeded.
  test:
    runs-on: ubuntu-latest
    needs: build
    steps:
      - name: Checkout
        uses: actions/checkout@v4
      - name: Install dependencies
        run: |
          sudo apt-get update
          sudo apt-get install -y gcc make
      - name: Run tests
        run: make test

  # Memory checking: any valgrind error fails the job (--error-exitcode=1).
  valgrind:
    runs-on: ubuntu-latest
    needs: build
    steps:
      - name: Checkout
        uses: actions/checkout@v4
      - name: Install dependencies
        run: |
          sudo apt-get update
          sudo apt-get install -y gcc make valgrind
      - name: Build test binaries
        run: make build/test_integration build/test_all
      - name: Valgrind comprehensive tests
        run: |
          valgrind --leak-check=full --show-leak-kinds=all \
          --track-origins=yes --error-exitcode=1 \
          ./build/test_all
      - name: Valgrind integration tests
        run: |
          valgrind --leak-check=full --show-leak-kinds=all \
          --track-origins=yes --error-exitcode=1 \
          ./build/test_integration

  # Produce the coverage report and publish it as a build artifact.
  coverage:
    runs-on: ubuntu-latest
    needs: test
    steps:
      - name: Checkout
        uses: actions/checkout@v4
      - name: Install dependencies
        run: |
          sudo apt-get update
          sudo apt-get install -y gcc make gcovr
      - name: Generate coverage
        run: make coverage
      - name: Upload coverage artifacts
        uses: actions/upload-artifact@v4
        with:
          name: coverage-report
          path: build/coverage/

29
.gitignore vendored Normal file
View File

@ -0,0 +1,29 @@
# Build
build/
*.o
*.a
*.so
*.dylib
# Binary
loreg
# Coverage
*.gcov
*.gcda
*.gcno
# Profiling
gmon.out
*.prof
# Editor
*~
*.swp
*.swo
.vscode/
.idea/
# OS
.DS_Store
Thumbs.db

10
CHANGELOG.md Normal file
View File

@ -0,0 +1,10 @@
# Changelog
## Version 0.1.0 - 2026-01-04
update c, h, md files
**Changes:** 25 files, 4449 lines
**Languages:** C (3989 lines), Markdown (181 lines), Other (186 lines), YAML (93 lines)

157
Makefile Normal file
View File

@ -0,0 +1,157 @@
# retoor <retoor@molodetz.nl>
# Toolchain and per-flavour flag sets (release, debug, coverage, profiling).
CC           := gcc
CFLAGS       := -Wall -Wextra -Werror -pedantic -std=c11 -O3 -march=native -flto
CFLAGS_DEBUG := -Wall -Wextra -pedantic -std=c11 -g -O0 -DDEBUG
CFLAGS_COV   := -Wall -Wextra -pedantic -std=c11 -g -O0 --coverage -fprofile-arcs -ftest-coverage
CFLAGS_PROF  := -Wall -Wextra -pedantic -std=c11 -O2 -pg
INCLUDES     := -Iinclude
LDFLAGS      := -flto
LDFLAGS_COV  := --coverage

# Directory layout.
SRC_DIR   := src
INC_DIR   := include
BUILD_DIR := build
TEST_DIR  := tests

# Library sources are shared by the binary and by every test program; the
# command-line binary adds the REPL and the entry point on top of them.
LIB_SRCS := $(patsubst %,$(SRC_DIR)/%.c,lexer ast parser nfa matcher loreg)
SRCS     := $(LIB_SRCS) $(SRC_DIR)/repl.c $(SRC_DIR)/main.c
OBJS     := $(patsubst $(SRC_DIR)/%.c,$(BUILD_DIR)/%.o,$(SRCS))
LIB_OBJS := $(patsubst $(SRC_DIR)/%.c,$(BUILD_DIR)/%.o,$(LIB_SRCS))

# Build products.
TARGET     := loreg
LIB_TARGET := libloreg.a

# Test programs: one source in $(TEST_DIR) per binary in $(BUILD_DIR).
TEST_NAMES := test_lexer test_parser test_nfa test_matcher test_all test_integration
TEST_SRCS  := $(patsubst %,$(TEST_DIR)/%.c,$(TEST_NAMES))
TEST_BINS  := $(addprefix $(BUILD_DIR)/,$(TEST_NAMES))
# Every command-style target is declared phony so that a stray file with the
# same name (e.g. "benchmark" or "uninstall") can never mask it.
.PHONY: all clean test debug coverage profile valgrind valgrind-verbose \
	benchmark help install uninstall
# Default goal: the optimized command-line binary.
all: $(BUILD_DIR) $(TARGET)

# Output directory for objects, test binaries and reports.
$(BUILD_DIR):
	mkdir -p $@
# Compile one source file into its object. -MMD -MP makes the compiler emit
# $(BUILD_DIR)/<name>.d next to the object, listing every header it includes,
# so editing a header rebuilds exactly the objects that depend on it.
# $(BUILD_DIR) is order-only: it must exist, but its timestamp is ignored.
$(BUILD_DIR)/%.o: $(SRC_DIR)/%.c | $(BUILD_DIR)
	$(CC) $(CFLAGS) $(INCLUDES) -MMD -MP -c $< -o $@

# Pull in the generated dependency files; the leading '-' silences the
# "no such file" message on a fresh tree.
-include $(OBJS:.o=.d)
# Link the command-line binary from all object files.
$(TARGET): $(OBJS)
	$(CC) $^ -o $@ $(LDFLAGS)
# Archive the library objects (everything except the REPL and main) into a
# static library.
$(LIB_TARGET): $(LIB_OBJS)
	ar rcs $@ $^
# Rebuild the binary from scratch with the debug flag set.
# The two steps are separate sub-makes so that "clean" is guaranteed to finish
# before compilation starts; as sibling prerequisites they could run
# concurrently under "make -j" and delete freshly built objects.
debug:
	$(MAKE) clean
	$(MAKE) $(TARGET) CFLAGS='$(CFLAGS_DEBUG)'
# Build each test program from its own source plus the library sources, using
# the debug flags. One static pattern rule covers every entry of $(TEST_BINS).
# The public headers are listed as prerequisites so a header change rebuilds
# the tests; only $< and $(LIB_SRCS) are passed to the compiler.
$(TEST_BINS): $(BUILD_DIR)/test_%: $(TEST_DIR)/test_%.c $(LIB_SRCS) $(wildcard $(INC_DIR)/*.h) | $(BUILD_DIR)
	$(CC) $(CFLAGS_DEBUG) $(INCLUDES) $< $(LIB_SRCS) -o $@
# run-suite(label, binary): announce a test suite and execute it.
define run-suite
@echo "running $(1) tests..."
@$(BUILD_DIR)/$(2)
endef

# Run every test program in turn; the first failing suite aborts the target.
test: $(TEST_BINS)
	$(call run-suite,lexer,test_lexer)
	@echo ""
	$(call run-suite,parser,test_parser)
	@echo ""
	$(call run-suite,nfa,test_nfa)
	@echo ""
	$(call run-suite,matcher,test_matcher)
	@echo ""
	$(call run-suite,comprehensive,test_all)
	@echo ""
	$(call run-suite,integration,test_integration)
# Build the comprehensive test with coverage instrumentation, run it, and
# collect the gcov output under $(BUILD_DIR)/coverage.
# "clean" runs as a sub-make first so it cannot race with directory creation
# under "make -j".
coverage:
	$(MAKE) clean
	mkdir -p $(BUILD_DIR)/coverage
	$(CC) $(CFLAGS_COV) $(INCLUDES) $(TEST_DIR)/test_all.c $(LIB_SRCS) -o $(BUILD_DIR)/test_coverage $(LDFLAGS_COV)
	$(BUILD_DIR)/test_coverage
	gcov -b $(LIB_SRCS)
	@echo ""
	@echo "coverage report generated"
	@# Best effort: which of these files exist in the working directory
	@# depends on the compiler, so a missing pattern is not an error.
	@mv *.gcov $(BUILD_DIR)/coverage/ 2>/dev/null || true
	@mv *.gcda $(BUILD_DIR)/coverage/ 2>/dev/null || true
	@mv *.gcno $(BUILD_DIR)/coverage/ 2>/dev/null || true
# Build the comprehensive test with gprof instrumentation, run it, and write
# the flat profile to $(BUILD_DIR)/profile.txt.
# "clean" runs as a sub-make first so it cannot race with directory creation
# under "make -j".
profile:
	$(MAKE) clean
	mkdir -p $(BUILD_DIR)
	$(CC) $(CFLAGS_PROF) $(INCLUDES) $(TEST_DIR)/test_all.c $(LIB_SRCS) -o $(BUILD_DIR)/test_profile
	$(BUILD_DIR)/test_profile
	gprof $(BUILD_DIR)/test_profile gmon.out > $(BUILD_DIR)/profile.txt
	@echo ""
	@echo "profile report: $(BUILD_DIR)/profile.txt"
	@# Best effort: keep the raw profile data with the other build output.
	@mv gmon.out $(BUILD_DIR)/ 2>/dev/null || true
# Run the comprehensive test under valgrind; any reported error fails the
# target through --error-exitcode=1.
valgrind: $(BUILD_DIR)/test_all
	valgrind --leak-check=full --show-leak-kinds=all --track-origins=yes --error-exitcode=1 $<
# Same run as "valgrind", but verbose and logged to a file instead of failing
# the build on errors.
valgrind-verbose: $(BUILD_DIR)/test_all
	valgrind --leak-check=full --show-leak-kinds=all --track-origins=yes --verbose --log-file=$(BUILD_DIR)/valgrind.log $<
	@echo "valgrind log: $(BUILD_DIR)/valgrind.log"
# Number of times each benchmark pattern is executed.
BENCH_RUNS := 1000

# Time repeated invocations of the binary for two sample patterns.
# "$$" passes a literal "$" to the shell so that it, not make, expands $(seq).
benchmark: $(TARGET)
	@echo "benchmarking..."
	@echo "pattern: [a-z]+@[a-z]+\\.[a-z]+"
	@time -p sh -c 'for i in $$(seq 1 $(BENCH_RUNS)); do ./$(TARGET) "[a-z]+@[a-z]+\\.[a-z]+" "test@example.com" > /dev/null; done'
	@echo ""
	@echo "pattern: (a|b)*abb"
	@time -p sh -c 'for i in $$(seq 1 $(BENCH_RUNS)); do ./$(TARGET) "(a|b)*abb" "aabababb" > /dev/null; done'
# Installation prefix; override on the command line, e.g.
#   make install PREFIX=/opt/loreg
# DESTDIR is prepended for staged (packaging) installs.
PREFIX ?= /usr/local
BINDIR ?= $(PREFIX)/bin

# Install the binary into $(DESTDIR)$(BINDIR) (default: /usr/local/bin).
install: $(TARGET)
	install -d $(DESTDIR)$(BINDIR)
	install -m 755 $(TARGET) $(DESTDIR)$(BINDIR)/

# Remove the binary installed by "install".
uninstall:
	rm -f $(DESTDIR)$(BINDIR)/$(TARGET)
# Remove everything the build creates: the build directory, the binary, the
# static library and stray coverage/profiling data in the working directory.
clean:
	$(RM) -r $(BUILD_DIR) $(TARGET) $(LIB_TARGET)
	$(RM) *.gcov *.gcda *.gcno gmon.out
# Print a one-line description of every user-facing target.
help:
	@echo "loreg makefile targets:"
	@echo " all build optimized release binary"
	@echo " debug build with debug symbols"
	@echo " test run all tests"
	@echo " coverage run tests with coverage analysis"
	@echo " profile run tests with profiling"
	@echo " valgrind run tests under valgrind"
	@echo " valgrind-verbose run tests under valgrind with a verbose log"
	@echo " benchmark run simple benchmarks"
	@echo " install install to /usr/local/bin"
	@echo " uninstall remove from /usr/local/bin"
	@echo " clean remove build artifacts"
	@echo " help show this message"

181
README.md Normal file
View File

@ -0,0 +1,181 @@
# loreg
retoor <retoor@molodetz.nl>
A high-performance regular expression interpreter implemented from scratch in plain C. The engine uses Thompson's NFA construction algorithm for efficient pattern matching.
## CI
The project includes a Gitea Actions CI workflow that runs on every push and pull request targeting `main` or `master`:
- Build verification (release and debug)
- Full test suite (569 tests)
- Valgrind memory leak detection
- Code coverage generation
## Features
- Full regex syntax support: literals, metacharacters, quantifiers, character classes, groups, alternation, anchors
- NFA-based matching engine with Thompson construction
- Capturing groups with match position tracking
- Interactive REPL for testing patterns
- Zero external dependencies
- Comprehensive test suite with 569 tests
- Memory-safe implementation verified with Valgrind
## Building
```sh
make # optimized release build
make debug # debug build with symbols
make test # run all tests
make coverage # generate coverage report
make profile # generate profiling report
make valgrind # run under valgrind
```
## Usage
### Command Line
```sh
./loreg "pattern" "text" # search for pattern in text
./loreg -m "pattern" "text" # full match mode
./loreg -i # start REPL
./loreg # start REPL (default)
```
### REPL Commands
```
:p <pattern> compile and set pattern
:m <text> match text (anchored)
:s <text> search for pattern in text
<text> search (default)
:h help
:q quit
```
### C API
```c
#include "loreg.h"
loreg_error_t err;
loreg_regex_t *re = loreg_compile("\\d{3}-\\d{4}", &err);
if (!re) {
fprintf(stderr, "error: %s\n", loreg_error_string(err));
return 1;
}
loreg_match_t result;
if (loreg_search(re, "call 555-1234 now", &result)) {
printf("match at [%zu-%zu]\n", result.match_start, result.match_end);
}
loreg_free(re);
```
## Supported Syntax
| Pattern | Description |
|---------|-------------|
| `.` | any character except newline |
| `*` | zero or more |
| `+` | one or more |
| `?` | zero or one |
| `\|` | alternation |
| `()` | grouping and capture |
| `[]` | character class |
| `[^]` | negated character class |
| `[a-z]` | character range |
| `^` | start anchor |
| `$` | end anchor |
| `{n}` | exactly n |
| `{n,}` | n or more |
| `{n,m}` | n to m |
| `\d` | digit [0-9] |
| `\w` | word [a-zA-Z0-9_] |
| `\s` | whitespace |
| `\D` | non-digit |
| `\W` | non-word |
| `\S` | non-whitespace |
| `*?` `+?` `??` | non-greedy quantifiers |
## Architecture
```
src/
├── lexer.c tokenizer for regex patterns
├── parser.c recursive descent parser producing AST
├── ast.c abstract syntax tree node types
├── nfa.c Thompson NFA construction
├── matcher.c NFA simulation with epsilon closure
├── loreg.c public API
├── repl.c interactive REPL
└── main.c CLI entry point
include/
├── loreg.h public header
├── lexer.h lexer interface
├── parser.h parser interface
├── ast.h AST types
├── nfa.h NFA types
├── matcher.h matcher interface
└── repl.h REPL interface
tests/
├── test_lexer.c lexer unit tests (10 tests)
├── test_parser.c parser unit tests (20 tests)
├── test_nfa.c NFA construction tests (14 tests)
├── test_matcher.c matching tests (27 tests)
├── test_all.c comprehensive tests (9 tests)
└── test_integration.c integration tests (489 tests)
```
## Test Suite
The test suite contains 569 tests covering:
| Category | Description |
|----------|-------------|
| Lexer | Tokenization of patterns |
| Parser | AST construction and error handling |
| NFA | State machine construction |
| Matcher | Pattern matching correctness |
| Integration | Real-world regex patterns |
Integration tests cover:
- Literal matching and concatenation
- Dot metacharacter and wildcards
- Start/end anchors
- All quantifiers (*, +, ?, {n,m})
- Alternation and grouping
- Character classes and ranges
- Negated character classes
- Escape sequences
- Email, IP, URL, phone patterns
- Greedy vs non-greedy matching
- Nested groups and complex nesting
- Edge cases and boundary conditions
- Pathological/stress patterns
Run tests with Valgrind verification:
```sh
make test # run all 569 tests
make valgrind # verify zero memory leaks
```
## Algorithm
The implementation uses Thompson's construction to convert regex patterns to NFAs:
1. **Lexer**: Tokenizes the pattern into a stream of tokens
2. **Parser**: Builds an AST using recursive descent parsing
3. **NFA Construction**: Converts AST to NFA using Thompson's algorithm
4. **Matching**: Simulates the NFA with epsilon closure, so for a fixed pattern the matching time grows linearly with the text length
Time complexity: O(n*m) where n is pattern length and m is text length.
## License
MIT

80
include/ast.h Normal file
View File

@ -0,0 +1,80 @@
/* retoor <retoor@molodetz.nl> */
#ifndef LOREG_AST_H
#define LOREG_AST_H
#include <stdbool.h>
#include <stddef.h>
/* Kind of an AST node. */
typedef enum {
    AST_CHAR,           /* literal character, stored in ast_node.value */
    AST_DOT,
    AST_CONCAT,         /* binary: left followed by right */
    AST_ALTER,          /* binary: left or right */
    AST_STAR,           /* unary quantifiers: operand is in ast_node.left */
    AST_PLUS,
    AST_QUESTION,
    AST_GROUP,          /* group; id is in ast_node.group_id */
    AST_ANCHOR_START,
    AST_ANCHOR_END,
    AST_BRACKET,        /* bracket class; data is in ast_node.bracket */
    AST_QUANTIFIER,     /* counted repetition; bounds are in ast_node.quant */
    AST_CLASS_DIGIT,
    AST_CLASS_WORD,
    AST_CLASS_SPACE,
    AST_CLASS_NDIGIT,
    AST_CLASS_NWORD,
    AST_CLASS_NSPACE
} ast_type_t;

/* Inclusive character range; a single character has start == end. */
typedef struct {
    char start;
    char end;
} char_range_t;

/* Growable list of character ranges making up one bracket class. */
typedef struct {
    char_range_t *ranges;   /* heap array, NULL until the first range is added */
    size_t count;           /* ranges in use */
    size_t capacity;        /* ranges allocated */
    bool negated;           /* when true, bracket_matches() inverts its result */
} bracket_class_t;

/* Repetition bounds attached to quantifier nodes. */
typedef struct {
    int min;
    int max;                /* new nodes start at -1 -- presumably "unbounded"; confirm in the NFA builder */
    bool greedy;
} quantifier_t;

typedef struct ast_node ast_node_t;

/* One node of the regex syntax tree. Which fields are meaningful depends on
 * type; the remaining fields keep the defaults set when the node is created. */
struct ast_node {
    ast_type_t type;
    char value;                 /* literal for AST_CHAR, otherwise '\0' */
    ast_node_t *left;           /* first (or only) child, may be NULL */
    ast_node_t *right;          /* second child of binary nodes, may be NULL */
    int group_id;               /* -1 unless set by ast_create_group() */
    bracket_class_t *bracket;   /* released together with the node by ast_free() */
    quantifier_t quant;
};

/* Node constructors. Each returns a newly allocated node, or NULL when the
 * allocation fails. Child pointers are stored as given, not copied. */
ast_node_t *ast_create_char(char c);
ast_node_t *ast_create_dot(void);
ast_node_t *ast_create_concat(ast_node_t *left, ast_node_t *right);
ast_node_t *ast_create_alter(ast_node_t *left, ast_node_t *right);
ast_node_t *ast_create_star(ast_node_t *child, bool greedy);
ast_node_t *ast_create_plus(ast_node_t *child, bool greedy);
ast_node_t *ast_create_question(ast_node_t *child, bool greedy);
ast_node_t *ast_create_group(ast_node_t *child, int group_id);
ast_node_t *ast_create_anchor_start(void);
ast_node_t *ast_create_anchor_end(void);
ast_node_t *ast_create_bracket(bracket_class_t *bracket);
ast_node_t *ast_create_quantifier(ast_node_t *child, int min, int max, bool greedy);
ast_node_t *ast_create_class(ast_type_t type);

/* Recursively free a tree, including attached bracket classes; NULL is allowed. */
void ast_free(ast_node_t *node);

/* Bracket class helpers. bracket_add_* drop the entry silently when the
 * range array cannot be grown. */
bracket_class_t *bracket_create(void);
void bracket_add_char(bracket_class_t *bracket, char c);
void bracket_add_range(bracket_class_t *bracket, char start, char end);
void bracket_free(bracket_class_t *bracket);
bool bracket_matches(bracket_class_t *bracket, char c);
#endif

52
include/lexer.h Normal file
View File

@ -0,0 +1,52 @@
/* retoor <retoor@molodetz.nl> */
#ifndef LOREG_LEXER_H
#define LOREG_LEXER_H
#include <stddef.h>
#include <stdbool.h>
/* Token kinds produced by the lexer. */
typedef enum {
    TOKEN_CHAR,         /* literal character (including resolved escapes) */
    TOKEN_DOT,
    TOKEN_STAR,
    TOKEN_PLUS,
    TOKEN_QUESTION,
    TOKEN_PIPE,
    TOKEN_LPAREN,
    TOKEN_RPAREN,
    TOKEN_LBRACKET,
    TOKEN_RBRACKET,
    TOKEN_CARET,
    TOKEN_DOLLAR,
    TOKEN_LBRACE,
    TOKEN_RBRACE,
    TOKEN_BACKSLASH,
    TOKEN_DASH,         /* '-' seen inside a bracket expression */
    TOKEN_CLASS_DIGIT,
    TOKEN_CLASS_WORD,
    TOKEN_CLASS_SPACE,
    TOKEN_CLASS_NDIGIT,
    TOKEN_CLASS_NWORD,
    TOKEN_CLASS_NSPACE,
    TOKEN_EOF           /* end of pattern */
} token_type_t;

/* One lexical token. */
typedef struct {
    token_type_t type;
    char value;         /* character associated with the token */
    size_t position;    /* index in the pattern where the token starts */
} token_t;

/* Lexer state. The pattern is borrowed, not copied. */
typedef struct {
    const char *pattern;
    size_t length;      /* strlen(pattern), computed by lexer_init() */
    size_t position;    /* index of the next unread character */
    bool in_bracket;    /* true between '[' and the closing ']' */
} lexer_t;

/* Start lexing pattern from its first character. */
void lexer_init(lexer_t *lexer, const char *pattern);
/* Consume and return the next token; TOKEN_EOF once the pattern is exhausted. */
token_t lexer_next(lexer_t *lexer);
/* Return the next token without consuming it. */
token_t lexer_peek(lexer_t *lexer);
/* True when every character of the pattern has been consumed. */
bool lexer_eof(lexer_t *lexer);
#endif

45
include/loreg.h Normal file
View File

@ -0,0 +1,45 @@
/* retoor <retoor@molodetz.nl> */
#ifndef LOREG_H
#define LOREG_H
#include <stddef.h>
#include <stdbool.h>
/* Version string printed by the command-line tool. */
#define LOREG_VERSION "1.0.0"
/* NOTE(review): presumably the upper bound on NFA states per pattern; the
 * code enforcing it is not part of this header -- confirm. */
#define LOREG_MAX_STATES 4096
/* Size of the groups array in loreg_match_t. */
#define LOREG_MAX_GROUPS 32

/* Status codes reported by loreg_compile(); see loreg_error_string(). */
typedef enum {
    LOREG_OK = 0,
    LOREG_ERR_INVALID_PATTERN,
    LOREG_ERR_UNBALANCED_PAREN,
    LOREG_ERR_EMPTY_GROUP,
    LOREG_ERR_INVALID_QUANTIFIER,
    LOREG_ERR_INVALID_ESCAPE,
    LOREG_ERR_OUT_OF_MEMORY,
    LOREG_ERR_STATE_OVERFLOW
} loreg_error_t;

/* Position of one group within the searched text. */
typedef struct {
    size_t start;
    size_t end;         /* the CLI prints the characters in [start, end) */
    bool matched;       /* false when the group did not take part in the match */
} loreg_group_t;

/* Result of a match or search. */
typedef struct {
    bool matched;
    size_t match_start;
    size_t match_end;   /* the CLI prints the characters in [match_start, match_end) */
    loreg_group_t groups[LOREG_MAX_GROUPS];
    size_t group_count; /* number of valid entries in groups */
} loreg_match_t;

/* Opaque compiled pattern. */
typedef struct loreg_regex loreg_regex_t;

/* Compile pattern. Returns NULL and stores the reason in *error on failure. */
loreg_regex_t *loreg_compile(const char *pattern, loreg_error_t *error);
/* Release a compiled pattern; NULL is allowed. */
void loreg_free(loreg_regex_t *regex);
/* Match the pattern starting at the beginning of text. */
bool loreg_match(loreg_regex_t *regex, const char *text, loreg_match_t *result);
/* Search for the pattern within text. */
bool loreg_search(loreg_regex_t *regex, const char *text, loreg_match_t *result);
/* Static, human-readable description of an error code. */
const char *loreg_error_string(loreg_error_t error);
#endif

26
include/matcher.h Normal file
View File

@ -0,0 +1,26 @@
/* retoor <retoor@molodetz.nl> */
#ifndef LOREG_MATCHER_H
#define LOREG_MATCHER_H
#include "nfa.h"
#include "loreg.h"
/* Growable, duplicate-free list of NFA states plus per-group positions. */
typedef struct {
    nfa_state_t **states;   /* borrowed state pointers */
    size_t count;           /* states currently stored */
    size_t capacity;        /* slots allocated in states */
    size_t *group_starts;   /* group_count entries, (size_t)-1 when unset; NULL if no groups */
    size_t *group_ends;     /* same layout as group_starts */
    int group_count;
} state_set_t;

/* Allocate a set; returns NULL when any allocation fails. */
state_set_t *state_set_create(size_t initial_capacity, int group_count);
/* Free a set and its arrays; NULL is allowed. */
void state_set_free(state_set_t *set);
/* Empty the set without touching the group position arrays. */
void state_set_clear(state_set_t *set);
/* Append state unless it is already present. */
void state_set_add(state_set_t *set, nfa_state_t *state);
/* True when state is stored in the set (pointer comparison). */
bool state_set_contains(state_set_t *set, nfa_state_t *state);

/* Run the NFA against text beginning at start_pos. */
bool nfa_match(nfa_t *nfa, const char *text, size_t start_pos, loreg_match_t *result);
/* Look for a match anywhere in text. */
bool nfa_search(nfa_t *nfa, const char *text, loreg_match_t *result);
#endif

69
include/nfa.h Normal file
View File

@ -0,0 +1,69 @@
/* retoor <retoor@molodetz.nl> */
#ifndef LOREG_NFA_H
#define LOREG_NFA_H
#include "ast.h"
#include "loreg.h"
#include <stdbool.h>
#include <stddef.h>
/* Character value used for epsilon (empty) transitions. */
#define EPSILON '\0'
/* NOTE(review): presumably a per-state transition limit; its use is not
 * visible in this header -- confirm in nfa.c. */
#define NFA_MAX_TRANSITIONS 256

typedef struct nfa_state nfa_state_t;

/* Kind of edge between two NFA states. */
typedef enum {
    TRANS_CHAR,             /* compares against transition_t.value */
    TRANS_EPSILON,
    TRANS_DOT,
    TRANS_BRACKET,          /* consults transition_t.bracket */
    TRANS_CLASS_DIGIT,
    TRANS_CLASS_WORD,
    TRANS_CLASS_SPACE,
    TRANS_CLASS_NDIGIT,
    TRANS_CLASS_NWORD,
    TRANS_CLASS_NSPACE,
    TRANS_GROUP_START,      /* group markers carry transition_t.group_id */
    TRANS_GROUP_END,
    TRANS_ANCHOR_START,
    TRANS_ANCHOR_END
} transition_type_t;

/* One outgoing edge of a state. */
typedef struct {
    transition_type_t type;
    char value;
    nfa_state_t *target;
    bracket_class_t *bracket;
    int group_id;
} transition_t;

/* NFA state with a growable array of outgoing transitions. */
struct nfa_state {
    int id;
    bool accepting;
    transition_t *transitions;
    size_t trans_count;
    size_t trans_capacity;
};

/* Partial automaton with one entry state and one exit state. */
typedef struct {
    nfa_state_t *start;
    nfa_state_t *accept;
} nfa_fragment_t;

/* Complete automaton: state list, start state and number of groups. */
typedef struct {
    nfa_state_t **states;
    size_t state_count;
    size_t capacity;
    nfa_state_t *start;
    int group_count;
} nfa_t;

/* Lifetime management. */
nfa_t *nfa_create(void);
void nfa_free(nfa_t *nfa);

/* Construction helpers: add a state to nfa, or an edge from one state to another. */
nfa_state_t *nfa_add_state(nfa_t *nfa);
void nfa_add_transition(nfa_state_t *from, nfa_state_t *to, transition_type_t type, char value);
void nfa_add_bracket_transition(nfa_state_t *from, nfa_state_t *to, bracket_class_t *bracket);
void nfa_add_group_transition(nfa_state_t *from, nfa_state_t *to, transition_type_t type, int group_id);

/* Build an NFA from a syntax tree; the outcome is reported through *error. */
nfa_t *nfa_from_ast(ast_node_t *ast, loreg_error_t *error);
#endif

20
include/parser.h Normal file
View File

@ -0,0 +1,20 @@
/* retoor <retoor@molodetz.nl> */
#ifndef LOREG_PARSER_H
#define LOREG_PARSER_H
#include "ast.h"
#include "lexer.h"
#include "loreg.h"
/* Parser state: the embedded lexer, the current token, the recorded error
 * and a group counter. */
typedef struct {
    lexer_t lexer;
    token_t current;
    loreg_error_t error;
    int group_count;
} parser_t;

/* Prepare parser to parse pattern. */
void parser_init(parser_t *parser, const char *pattern);
/* Parse the pattern into a syntax tree; check parser_get_error() afterwards. */
ast_node_t *parser_parse(parser_t *parser);
/* Error recorded by the parser. */
loreg_error_t parser_get_error(parser_t *parser);
#endif

7
include/repl.h Normal file
View File

@ -0,0 +1,7 @@
/* retoor <retoor@molodetz.nl> */
#ifndef LOREG_REPL_H
#define LOREG_REPL_H
/* Run the interactive REPL; main() calls this when no pattern is given or
 * when -i is passed. */
void repl_run(void);
#endif

169
src/ast.c Normal file
View File

@ -0,0 +1,169 @@
/* retoor <retoor@molodetz.nl> */
#include "ast.h"
#include <stdlib.h>
#include <ctype.h>
/* Allocate a node of the given type with every other field at its neutral
 * default: no literal, no children, group id -1, no bracket class and a
 * greedy quantifier with min 0 / max -1.
 * Returns NULL when the allocation fails. */
static ast_node_t *ast_create_node(ast_type_t type) {
    ast_node_t *fresh = malloc(sizeof *fresh);
    if (fresh == NULL) {
        return NULL;
    }
    *fresh = (ast_node_t){
        .type = type,
        .value = '\0',
        .left = NULL,
        .right = NULL,
        .group_id = -1,
        .bracket = NULL,
        .quant = { .min = 0, .max = -1, .greedy = true },
    };
    return fresh;
}
/* Create a leaf node for the literal character c; NULL on allocation failure. */
ast_node_t *ast_create_char(char c) {
    ast_node_t *leaf = ast_create_node(AST_CHAR);
    if (leaf == NULL) {
        return NULL;
    }
    leaf->value = c;
    return leaf;
}
/* Create a leaf node for the '.' metacharacter; NULL on allocation failure. */
ast_node_t *ast_create_dot(void) {
    ast_node_t *dot = ast_create_node(AST_DOT);
    return dot;
}
/* Create a concatenation node with the given operands. The operands are
 * stored as given; on allocation failure NULL is returned and the operands
 * are left untouched. */
ast_node_t *ast_create_concat(ast_node_t *left, ast_node_t *right) {
    ast_node_t *seq = ast_create_node(AST_CONCAT);
    if (seq == NULL) {
        return NULL;
    }
    seq->left = left;
    seq->right = right;
    return seq;
}
/* Create an alternation node with the given branches. The branches are
 * stored as given; on allocation failure NULL is returned and the branches
 * are left untouched. */
ast_node_t *ast_create_alter(ast_node_t *left, ast_node_t *right) {
    ast_node_t *choice = ast_create_node(AST_ALTER);
    if (choice == NULL) {
        return NULL;
    }
    choice->left = left;
    choice->right = right;
    return choice;
}
/* Create a '*' node over child with the requested greediness; NULL on
 * allocation failure. */
ast_node_t *ast_create_star(ast_node_t *child, bool greedy) {
    ast_node_t *star = ast_create_node(AST_STAR);
    if (star == NULL) {
        return NULL;
    }
    star->left = child;
    star->quant.greedy = greedy;
    return star;
}
/* Create a '+' node over child with the requested greediness; NULL on
 * allocation failure. */
ast_node_t *ast_create_plus(ast_node_t *child, bool greedy) {
    ast_node_t *plus = ast_create_node(AST_PLUS);
    if (plus == NULL) {
        return NULL;
    }
    plus->left = child;
    plus->quant.greedy = greedy;
    return plus;
}
/* Create a '?' node over child with the requested greediness; NULL on
 * allocation failure. */
ast_node_t *ast_create_question(ast_node_t *child, bool greedy) {
    ast_node_t *optional = ast_create_node(AST_QUESTION);
    if (optional == NULL) {
        return NULL;
    }
    optional->left = child;
    optional->quant.greedy = greedy;
    return optional;
}
/* Create a group node wrapping child and tagged with group_id; NULL on
 * allocation failure. */
ast_node_t *ast_create_group(ast_node_t *child, int group_id) {
    ast_node_t *group = ast_create_node(AST_GROUP);
    if (group == NULL) {
        return NULL;
    }
    group->left = child;
    group->group_id = group_id;
    return group;
}
/* Create a start-anchor node; NULL on allocation failure. */
ast_node_t *ast_create_anchor_start(void) {
    ast_node_t *anchor = ast_create_node(AST_ANCHOR_START);
    return anchor;
}
/* Create an end-anchor node; NULL on allocation failure. */
ast_node_t *ast_create_anchor_end(void) {
    ast_node_t *anchor = ast_create_node(AST_ANCHOR_END);
    return anchor;
}
/* Create a bracket-class node that takes over bracket (it is released by
 * ast_free()). On allocation failure NULL is returned and bracket is left
 * untouched. */
ast_node_t *ast_create_bracket(bracket_class_t *bracket) {
    ast_node_t *cls = ast_create_node(AST_BRACKET);
    if (cls == NULL) {
        return NULL;
    }
    cls->bracket = bracket;
    return cls;
}
/* Create a counted-repetition node over child with the given bounds and
 * greediness; NULL on allocation failure. */
ast_node_t *ast_create_quantifier(ast_node_t *child, int min, int max, bool greedy) {
    ast_node_t *repeat = ast_create_node(AST_QUANTIFIER);
    if (repeat == NULL) {
        return NULL;
    }
    repeat->left = child;
    repeat->quant = (quantifier_t){ .min = min, .max = max, .greedy = greedy };
    return repeat;
}
/* Create a leaf node of the given type (used for the AST_CLASS_* kinds);
 * NULL on allocation failure. */
ast_node_t *ast_create_class(ast_type_t type) {
    ast_node_t *cls = ast_create_node(type);
    return cls;
}
/* Release node, both of its subtrees and its bracket class, if any.
 * Passing NULL is a no-op. */
void ast_free(ast_node_t *node) {
    if (node != NULL) {
        ast_free(node->left);
        ast_free(node->right);
        if (node->bracket != NULL) {
            bracket_free(node->bracket);
        }
        free(node);
    }
}
bracket_class_t *bracket_create(void) {
bracket_class_t *bracket = malloc(sizeof(bracket_class_t));
if (!bracket) return NULL;
bracket->ranges = NULL;
bracket->count = 0;
bracket->capacity = 0;
bracket->negated = false;
return bracket;
}
/* Enlarge the range array: 8 entries on first use, double afterwards.
 * Returns false and leaves the class unchanged when realloc fails. */
static bool bracket_grow(bracket_class_t *bracket) {
    size_t wanted = 8;
    if (bracket->capacity != 0) {
        wanted = bracket->capacity * 2;
    }

    char_range_t *resized = realloc(bracket->ranges, wanted * sizeof(char_range_t));
    if (resized == NULL) {
        return false;
    }

    bracket->capacity = wanted;
    bracket->ranges = resized;
    return true;
}
/* Add the single character c to the class, stored as the range c..c. */
void bracket_add_char(bracket_class_t *bracket, char c) {
    const char only = c;
    bracket_add_range(bracket, only, only);
}
/* Append the inclusive range start..end to the class, growing the array
 * when it is full. If growing fails the range is silently dropped. */
void bracket_add_range(bracket_class_t *bracket, char start, char end) {
    if (bracket->count >= bracket->capacity && !bracket_grow(bracket)) {
        return;
    }
    char_range_t *slot = &bracket->ranges[bracket->count];
    slot->start = start;
    slot->end = end;
    bracket->count += 1;
}
/* Release a bracket class and its range array; NULL is a no-op. */
void bracket_free(bracket_class_t *bracket) {
    if (bracket != NULL) {
        free(bracket->ranges);
        free(bracket);
    }
}
/* Test whether character c belongs to the bracket class.
 *
 * Returns true when c lies inside one of the stored ranges, inverted when
 * the class is negated. A NULL class matches nothing.
 *
 * Bounds are compared as unsigned char. Plain char may be signed, in which
 * case a range crossing 0x7F (for example 'a'..'\xe9') would have a negative
 * upper bound and could never match; comparing byte values gives the same
 * result on every platform. */
bool bracket_matches(bracket_class_t *bracket, char c) {
    if (!bracket) return false;

    const unsigned char byte = (unsigned char)c;
    bool found = false;
    for (size_t i = 0; i < bracket->count; i++) {
        const unsigned char lo = (unsigned char)bracket->ranges[i].start;
        const unsigned char hi = (unsigned char)bracket->ranges[i].end;
        if (byte >= lo && byte <= hi) {
            found = true;
            break;
        }
    }
    return bracket->negated ? !found : found;
}

125
src/lexer.c Normal file
View File

@ -0,0 +1,125 @@
/* retoor <retoor@molodetz.nl> */
#include "lexer.h"
#include <string.h>
/* Prepare lexer to tokenize pattern from its first character.
 *
 * The pattern is borrowed, not copied, so it must outlive the lexer.
 * A NULL pattern is treated as the empty pattern instead of being passed to
 * strlen(), which would crash. */
void lexer_init(lexer_t *lexer, const char *pattern) {
    if (pattern == NULL) {
        pattern = "";
    }
    lexer->pattern = pattern;
    lexer->length = strlen(pattern);
    lexer->position = 0;
    lexer->in_bracket = false;
}
/* Bundle a token type, its character and its pattern offset into a token. */
static token_t make_token(token_type_t type, char value, size_t pos) {
    return (token_t){ .type = type, .value = value, .position = pos };
}
token_t lexer_next(lexer_t *lexer) {
if (lexer->position >= lexer->length) {
return make_token(TOKEN_EOF, '\0', lexer->position);
}
char c = lexer->pattern[lexer->position];
size_t pos = lexer->position;
lexer->position++;
if (c == '[' && !lexer->in_bracket) {
lexer->in_bracket = true;
return make_token(TOKEN_LBRACKET, c, pos);
}
if (c == ']' && lexer->in_bracket) {
lexer->in_bracket = false;
return make_token(TOKEN_RBRACKET, c, pos);
}
if (lexer->in_bracket) {
if (c == '-') {
return make_token(TOKEN_DASH, c, pos);
}
if (c == '^' && pos > 0 && lexer->pattern[pos - 1] == '[') {
return make_token(TOKEN_CARET, c, pos);
}
if (c == '\\' && lexer->position < lexer->length) {
char next = lexer->pattern[lexer->position];
lexer->position++;
switch (next) {
case 'd': return make_token(TOKEN_CLASS_DIGIT, 'd', pos);
case 'w': return make_token(TOKEN_CLASS_WORD, 'w', pos);
case 's': return make_token(TOKEN_CLASS_SPACE, 's', pos);
case 'D': return make_token(TOKEN_CLASS_NDIGIT, 'D', pos);
case 'W': return make_token(TOKEN_CLASS_NWORD, 'W', pos);
case 'S': return make_token(TOKEN_CLASS_NSPACE, 'S', pos);
case 'n': return make_token(TOKEN_CHAR, '\n', pos);
case 't': return make_token(TOKEN_CHAR, '\t', pos);
case 'r': return make_token(TOKEN_CHAR, '\r', pos);
default: return make_token(TOKEN_CHAR, next, pos);
}
}
return make_token(TOKEN_CHAR, c, pos);
}
if (c == '\\' && lexer->position < lexer->length) {
char next = lexer->pattern[lexer->position];
lexer->position++;
switch (next) {
case 'd': return make_token(TOKEN_CLASS_DIGIT, 'd', pos);
case 'w': return make_token(TOKEN_CLASS_WORD, 'w', pos);
case 's': return make_token(TOKEN_CLASS_SPACE, 's', pos);
case 'D': return make_token(TOKEN_CLASS_NDIGIT, 'D', pos);
case 'W': return make_token(TOKEN_CLASS_NWORD, 'W', pos);
case 'S': return make_token(TOKEN_CLASS_NSPACE, 'S', pos);
case 'n': return make_token(TOKEN_CHAR, '\n', pos);
case 't': return make_token(TOKEN_CHAR, '\t', pos);
case 'r': return make_token(TOKEN_CHAR, '\r', pos);
case '.':
case '*':
case '+':
case '?':
case '|':
case '(':
case ')':
case '[':
case ']':
case '{':
case '}':
case '^':
case '$':
case '\\':
return make_token(TOKEN_CHAR, next, pos);
default:
return make_token(TOKEN_CHAR, next, pos);
}
}
switch (c) {
case '.': return make_token(TOKEN_DOT, c, pos);
case '*': return make_token(TOKEN_STAR, c, pos);
case '+': return make_token(TOKEN_PLUS, c, pos);
case '?': return make_token(TOKEN_QUESTION, c, pos);
case '|': return make_token(TOKEN_PIPE, c, pos);
case '(': return make_token(TOKEN_LPAREN, c, pos);
case ')': return make_token(TOKEN_RPAREN, c, pos);
case '^': return make_token(TOKEN_CARET, c, pos);
case '$': return make_token(TOKEN_DOLLAR, c, pos);
case '{': return make_token(TOKEN_LBRACE, c, pos);
case '}': return make_token(TOKEN_RBRACE, c, pos);
default: return make_token(TOKEN_CHAR, c, pos);
}
}
/* Return the token lexer_next() would produce, without consuming it.
 * The scan runs on a copy of the lexer state, so *lexer is left untouched. */
token_t lexer_peek(lexer_t *lexer) {
    lexer_t probe = *lexer;
    return lexer_next(&probe);
}
/* True once every character of the pattern has been consumed. */
bool lexer_eof(lexer_t *lexer) {
    const bool has_more = lexer->position < lexer->length;
    return !has_more;
}

71
src/loreg.c Normal file
View File

@ -0,0 +1,71 @@
/* retoor <retoor@molodetz.nl> */
#include "loreg.h"
#include "parser.h"
#include "nfa.h"
#include "matcher.h"
#include <stdlib.h>
/* Compiled pattern: the automaton used for matching together with the
 * syntax tree it was built from. loreg_free() releases both. */
struct loreg_regex {
    nfa_t *nfa;
    ast_node_t *ast;
};
/* Compile pattern into a regex object.
 *
 * pattern  NUL-terminated pattern; NULL is rejected with
 *          LOREG_ERR_INVALID_PATTERN.
 * error    receives LOREG_OK or the failure reason; may be NULL when the
 *          caller does not need the code.
 *
 * Returns the compiled regex, to be released with loreg_free(), or NULL on
 * failure. */
loreg_regex_t *loreg_compile(const char *pattern, loreg_error_t *error) {
    /* Let the rest of the function write through error unconditionally. */
    loreg_error_t discarded;
    if (!error) {
        error = &discarded;
    }
    *error = LOREG_OK;

    if (!pattern) {
        *error = LOREG_ERR_INVALID_PATTERN;
        return NULL;
    }

    loreg_regex_t *regex = malloc(sizeof(loreg_regex_t));
    if (!regex) {
        *error = LOREG_ERR_OUT_OF_MEMORY;
        return NULL;
    }
    regex->nfa = NULL;

    parser_t parser;
    parser_init(&parser, pattern);
    regex->ast = parser_parse(&parser);
    *error = parser_get_error(&parser);
    if (*error != LOREG_OK) {
        ast_free(regex->ast);
        free(regex);
        return NULL;
    }

    regex->nfa = nfa_from_ast(regex->ast, error);
    if (*error != LOREG_OK) {
        /* NOTE(review): assumes nfa_from_ast() releases any partial automaton
         * itself when it reports an error -- confirm in nfa.c. */
        ast_free(regex->ast);
        free(regex);
        return NULL;
    }
    return regex;
}
/* Release a compiled regex together with its automaton and syntax tree.
 * Passing NULL is a no-op. */
void loreg_free(loreg_regex_t *regex) {
    if (regex != NULL) {
        nfa_free(regex->nfa);
        ast_free(regex->ast);
        free(regex);
    }
}
/* Match the compiled pattern against text starting at offset 0; the outcome
 * is written to *result and also returned. */
bool loreg_match(loreg_regex_t *regex, const char *text, loreg_match_t *result) {
    const size_t from_beginning = 0;
    const bool matched = nfa_match(regex->nfa, text, from_beginning, result);
    return matched;
}
/* Search text for the compiled pattern; the outcome is written to *result
 * and also returned. */
bool loreg_search(loreg_regex_t *regex, const char *text, loreg_match_t *result) {
    const bool found = nfa_search(regex->nfa, text, result);
    return found;
}
/* Return a static, human-readable description of error. Values outside the
 * known error codes yield "unknown error". */
const char *loreg_error_string(loreg_error_t error) {
    static const char *const messages[] = {
        [LOREG_OK] = "success",
        [LOREG_ERR_INVALID_PATTERN] = "invalid pattern",
        [LOREG_ERR_UNBALANCED_PAREN] = "unbalanced parentheses",
        [LOREG_ERR_EMPTY_GROUP] = "empty group",
        [LOREG_ERR_INVALID_QUANTIFIER] = "invalid quantifier",
        [LOREG_ERR_INVALID_ESCAPE] = "invalid escape sequence",
        [LOREG_ERR_OUT_OF_MEMORY] = "out of memory",
        [LOREG_ERR_STATE_OVERFLOW] = "state overflow",
    };
    const size_t known = sizeof messages / sizeof messages[0];

    /* The cast turns any negative value into a large index, which the range
     * check below rejects. */
    const size_t index = (size_t)error;
    if (index >= known) {
        return "unknown error";
    }
    return messages[index];
}

107
src/main.c Normal file
View File

@ -0,0 +1,107 @@
/* retoor <retoor@molodetz.nl> */
#include "loreg.h"
#include "repl.h"
#include <stdio.h>
#include <string.h>
/* Print the command-line synopsis, the option list and usage examples to
 * stdout. program is the name shown in the examples (normally argv[0]). */
static void print_usage(const char *program) {
    printf("usage: %s [options] [pattern] [text]\n", program);
    fputs("options:\n"
          " -h, --help show this help\n"
          " -v, --version show version\n"
          " -m, --match full match mode (default is search)\n"
          " -i start interactive REPL\n"
          "\n"
          "examples:\n",
          stdout);
    printf(" %s start REPL\n"
           " %s -i start REPL\n"
           " %s \"a+b\" \"aaab\" search pattern in text\n"
           " %s -m \"a+b\" \"aaab\" match pattern against text\n",
           program, program, program, program);
}
/* Print the program name and library version to stdout. */
static void print_version(void) {
    const char *version = LOREG_VERSION;
    printf("loreg %s\n", version);
}
/* Write the characters text[from] .. text[to - 1] to stdout; nothing is
 * written when to <= from. */
static void print_slice(const char *text, size_t from, size_t to) {
    for (size_t at = from; at < to; at++) {
        putchar(text[at]);
    }
}

/* Print the outcome of a match: "no match", or the matched text with its
 * offsets followed by one line per group that took part in the match. */
static void print_match(const char *text, loreg_match_t *result) {
    if (!result->matched) {
        printf("no match\n");
        return;
    }

    printf("match: \"");
    print_slice(text, result->match_start, result->match_end);
    printf("\" [%zu-%zu]\n", result->match_start, result->match_end);

    for (size_t g = 0; g < result->group_count; g++) {
        const loreg_group_t *group = &result->groups[g];
        if (!group->matched) {
            continue;
        }
        printf(" group %zu: \"", g);
        print_slice(text, group->start, group->end);
        printf("\" [%zu-%zu]\n", group->start, group->end);
    }
}
/* Command-line entry point.
 *
 * Without arguments, or with -i, the interactive REPL is started. Otherwise
 * the first two non-option arguments are the pattern and the text. A "--"
 * argument ends option parsing, which allows patterns that begin with '-'
 * (for example: loreg -- "-?\d+" "x -42").
 *
 * Exit status: 0 on a match (or after help/version/REPL), 1 when there is
 * no match or on a usage or compile error. */
int main(int argc, char *argv[]) {
    if (argc == 1) {
        repl_run();
        return 0;
    }

    bool match_mode = false;
    int arg_idx = 1;
    while (arg_idx < argc && argv[arg_idx][0] == '-') {
        const char *opt = argv[arg_idx];

        if (strcmp(opt, "--") == 0) {
            /* Everything after "--" is positional, even if it starts with '-'. */
            arg_idx++;
            break;
        }
        if (strcmp(opt, "-h") == 0 || strcmp(opt, "--help") == 0) {
            print_usage(argv[0]);
            return 0;
        }
        if (strcmp(opt, "-v") == 0 || strcmp(opt, "--version") == 0) {
            print_version();
            return 0;
        }
        if (strcmp(opt, "-m") == 0 || strcmp(opt, "--match") == 0) {
            match_mode = true;
            arg_idx++;
            continue;
        }
        if (strcmp(opt, "-i") == 0) {
            repl_run();
            return 0;
        }

        fprintf(stderr, "unknown option: %s\n", opt);
        return 1;
    }

    if (argc - arg_idx < 2) {
        fprintf(stderr, "error: pattern and text required\n");
        print_usage(argv[0]);
        return 1;
    }

    const char *pattern = argv[arg_idx];
    const char *text = argv[arg_idx + 1];

    loreg_error_t error;
    loreg_regex_t *regex = loreg_compile(pattern, &error);
    if (!regex) {
        fprintf(stderr, "error: %s\n", loreg_error_string(error));
        return 1;
    }

    /* Start from a defined "no match" state so result.matched is never read
     * uninitialized, whatever the matcher writes. */
    loreg_match_t result;
    memset(&result, 0, sizeof result);

    if (match_mode) {
        loreg_match(regex, text, &result);
    } else {
        loreg_search(regex, text, &result);
    }

    print_match(text, &result);
    loreg_free(regex);
    return result.matched ? 0 : 1;
}

411
src/matcher.c Normal file
View File

@ -0,0 +1,411 @@
/* retoor <retoor@molodetz.nl> */
#include "matcher.h"
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
/* Allocate an empty state set.
 *
 * initial_capacity  number of state slots to reserve; 0 is raised to 1,
 *                   because calloc(0, ...) may return NULL and would then be
 *                   mistaken for an allocation failure.
 * group_count       number of groups to track; when positive, the start and
 *                   end arrays are allocated and filled with (size_t)-1
 *                   ("unset"), otherwise both stay NULL.
 *
 * Returns the new set, or NULL when any allocation fails (nothing leaks). */
state_set_t *state_set_create(size_t initial_capacity, int group_count) {
    if (initial_capacity == 0) {
        initial_capacity = 1;
    }

    state_set_t *set = malloc(sizeof(state_set_t));
    if (!set) return NULL;

    set->states = calloc(initial_capacity, sizeof(nfa_state_t *));
    if (!set->states) {
        free(set);
        return NULL;
    }
    set->count = 0;
    set->capacity = initial_capacity;
    set->group_count = group_count;
    set->group_starts = NULL;
    set->group_ends = NULL;

    if (group_count > 0) {
        set->group_starts = calloc((size_t)group_count, sizeof(size_t));
        set->group_ends = calloc((size_t)group_count, sizeof(size_t));
        if (!set->group_starts || !set->group_ends) {
            free(set->group_starts);
            free(set->group_ends);
            free(set->states);
            free(set);
            return NULL;
        }
        for (int i = 0; i < group_count; i++) {
            set->group_starts[i] = (size_t)-1;
            set->group_ends[i] = (size_t)-1;
        }
    }
    return set;
}
/* Release a state set and the arrays it owns; NULL is a no-op. The states
 * themselves are not freed. */
void state_set_free(state_set_t *set) {
    if (set != NULL) {
        free(set->states);
        free(set->group_starts);
        free(set->group_ends);
        free(set);
    }
}
/* Empty the set: zero every slot and reset the count. The capacity and the
 * group position arrays are left as they are. */
void state_set_clear(state_set_t *set) {
    const size_t bytes = set->capacity * sizeof(nfa_state_t *);
    set->count = 0;
    memset(set->states, 0, bytes);
}
/*
 * Double the capacity of `set`, zero-filling the newly added slots.
 *
 * A set whose capacity is 0 grows to a small fixed size instead of "doubling"
 * to 0: realloc(ptr, 0) may free the buffer and return NULL, which would have
 * left set->states dangling while this function reported failure.
 *
 * Returns true on success. On failure (overflow of the byte count or
 * allocation failure) the set is left untouched and false is returned.
 */
static bool state_set_grow(state_set_t *set) {
    size_t old_cap = set->capacity;
    size_t new_cap = old_cap == 0 ? 8 : old_cap * 2;

    /* Reject wrap-around of either the element count or the byte count. */
    if (new_cap <= old_cap) return false;
    if (new_cap > (size_t)-1 / sizeof(nfa_state_t *)) return false;

    nfa_state_t **new_states = realloc(set->states, new_cap * sizeof(nfa_state_t *));
    if (!new_states) return false;

    memset(new_states + old_cap, 0, (new_cap - old_cap) * sizeof(nfa_state_t *));
    set->states = new_states;
    set->capacity = new_cap;
    return true;
}
/*
 * Append `state` to the set unless it is already present. The set is grown
 * when full; if growing fails the state is silently not added.
 */
void state_set_add(state_set_t *set, nfa_state_t *state) {
    if (state_set_contains(set, state)) return;
    if (set->count >= set->capacity && !state_set_grow(set)) return;

    set->states[set->count] = state;
    set->count += 1;
}
/* Linear membership test: true when `state` is one of the stored pointers. */
bool state_set_contains(state_set_t *set, nfa_state_t *state) {
    size_t idx = 0;
    while (idx < set->count) {
        if (set->states[idx] == state) {
            return true;
        }
        idx++;
    }
    return false;
}
/* True for the ASCII decimal digits '0'..'9' (the \d class). */
static bool is_digit(char c) {
    if (c < '0') {
        return false;
    }
    return c <= '9';
}
/* True for ASCII letters, ASCII digits and underscore (the \w class). */
static bool is_word(char c) {
    if (c == '_') return true;
    if (c >= '0' && c <= '9') return true;
    if (c >= 'A' && c <= 'Z') return true;
    return c >= 'a' && c <= 'z';
}
/* True for space, tab, newline, carriage return, form feed and vertical tab (the \s class). */
static bool is_space(char c) {
    switch (c) {
    case ' ':
    case '\t':
    case '\n':
    case '\r':
    case '\f':
    case '\v':
        return true;
    default:
        return false;
    }
}
/*
 * Decide whether transition `t` can be taken.
 *
 * Character-consuming transitions are tested against `c`; the negated classes
 * and the dot never match the NUL character, and the dot also rejects '\n'.
 * The two anchors ignore `c` and compare `pos` with 0 or with `len`.
 * Any other transition type yields false.
 */
static bool transition_matches(transition_t *t, char c, size_t pos, size_t len) {
    const bool is_nul = (c == '\0');
    bool verdict = false;

    switch (t->type) {
    case TRANS_CHAR:
        verdict = (c == t->value);
        break;
    case TRANS_DOT:
        verdict = !(c == '\n' || is_nul);
        break;
    case TRANS_BRACKET:
        verdict = bracket_matches(t->bracket, c);
        break;
    case TRANS_CLASS_DIGIT:
        verdict = is_digit(c);
        break;
    case TRANS_CLASS_WORD:
        verdict = is_word(c);
        break;
    case TRANS_CLASS_SPACE:
        verdict = is_space(c);
        break;
    case TRANS_CLASS_NDIGIT:
        verdict = !is_nul && !is_digit(c);
        break;
    case TRANS_CLASS_NWORD:
        verdict = !is_nul && !is_word(c);
        break;
    case TRANS_CLASS_NSPACE:
        verdict = !is_nul && !is_space(c);
        break;
    case TRANS_ANCHOR_START:
        verdict = (pos == 0);
        break;
    case TRANS_ANCHOR_END:
        verdict = (pos == len);
        break;
    default:
        verdict = false;
        break;
    }
    return verdict;
}
/*
 * One simulation thread: the NFA state it currently sits in together with its
 * own copy of the capture boundaries (arrays of list->group_count entries,
 * allocated by thread_list_create and filled by add_thread).
 */
typedef struct {
nfa_state_t *state;
size_t *group_starts;
size_t *group_ends;
} thread_t;
/*
 * Fixed-capacity list of threads. `threads` has `capacity` slots whose capture
 * arrays are allocated up front; `count` is the number of slots in use and
 * `group_count` is the length of each slot's capture arrays.
 */
typedef struct {
thread_t *threads;
size_t count;
size_t capacity;
int group_count;
} thread_list_t;
static thread_list_t *thread_list_create(size_t capacity, int group_count) {
thread_list_t *list = malloc(sizeof(thread_list_t));
if (!list) return NULL;
list->threads = malloc(capacity * sizeof(thread_t));
if (!list->threads) {
free(list);
return NULL;
}
for (size_t i = 0; i < capacity; i++) {
if (group_count > 0) {
list->threads[i].group_starts = malloc(group_count * sizeof(size_t));
list->threads[i].group_ends = malloc(group_count * sizeof(size_t));
if (!list->threads[i].group_starts || !list->threads[i].group_ends) {
for (size_t j = 0; j <= i; j++) {
free(list->threads[j].group_starts);
free(list->threads[j].group_ends);
}
free(list->threads);
free(list);
return NULL;
}
} else {
list->threads[i].group_starts = NULL;
list->threads[i].group_ends = NULL;
}
}
list->count = 0;
list->capacity = capacity;
list->group_count = group_count;
return list;
}
/* Free a thread list, including the capture arrays of all `capacity` slots. NULL is allowed. */
static void thread_list_free(thread_list_t *list) {
    if (list == NULL) return;

    size_t slot = 0;
    while (slot < list->capacity) {
        thread_t *th = &list->threads[slot];
        free(th->group_starts);
        free(th->group_ends);
        slot++;
    }
    free(list->threads);
    free(list);
}
/* Mark the list as empty; slots and their capture arrays are kept for reuse. */
static void thread_list_clear(thread_list_t *list)
{
    list->count = 0;
}
/* True when some active thread in `list` already sits in `state`. */
static bool thread_list_contains_state(thread_list_t *list, nfa_state_t *state) {
    size_t idx = 0;
    while (idx < list->count) {
        if (list->threads[idx].state == state) {
            return true;
        }
        idx++;
    }
    return false;
}
/* Forward declaration: follow_epsilons calls add_thread, which is defined after it. */
static void add_thread(thread_list_t *list, nfa_state_t *state,
size_t *group_starts, size_t *group_ends);
/*
 * Depth-first walk over every transition of `state` that consumes no input:
 * epsilons, group markers, and anchors that hold at `pos`.
 *
 * `visited` is indexed by state id and stops a state from being expanded twice
 * within one walk. Group markers recurse with a private copy of the capture
 * arrays in which the start (or end) of t->group_id is set to `pos`; if the
 * copy cannot be allocated that branch is skipped.
 *
 * The state itself is offered to add_thread after its outgoing zero-width
 * transitions have been explored.
 */
static void follow_epsilons(thread_list_t *list, nfa_state_t *state,
                            size_t *group_starts, size_t *group_ends,
                            size_t pos, size_t len, bool *visited) {
    if (state == NULL) return;
    if (visited[state->id]) return;
    visited[state->id] = true;

    for (size_t k = 0; k < state->trans_count; k++) {
        transition_t *edge = &state->transitions[k];

        switch (edge->type) {
        case TRANS_EPSILON:
            follow_epsilons(list, edge->target, group_starts, group_ends,
                            pos, len, visited);
            break;

        case TRANS_GROUP_START:
        case TRANS_GROUP_END: {
            size_t *starts_copy = malloc(list->group_count * sizeof(size_t));
            size_t *ends_copy = malloc(list->group_count * sizeof(size_t));
            if (starts_copy != NULL && ends_copy != NULL) {
                memcpy(starts_copy, group_starts, list->group_count * sizeof(size_t));
                memcpy(ends_copy, group_ends, list->group_count * sizeof(size_t));
                if (edge->type == TRANS_GROUP_START) {
                    starts_copy[edge->group_id] = pos;
                } else {
                    ends_copy[edge->group_id] = pos;
                }
                follow_epsilons(list, edge->target, starts_copy, ends_copy,
                                pos, len, visited);
            }
            free(starts_copy);
            free(ends_copy);
            break;
        }

        case TRANS_ANCHOR_START:
        case TRANS_ANCHOR_END:
            /* Anchors ignore the character argument; only pos/len matter. */
            if (transition_matches(edge, '\0', pos, len)) {
                follow_epsilons(list, edge->target, group_starts, group_ends,
                                pos, len, visited);
            }
            break;

        default:
            /* Character-consuming transitions are handled by the stepping loop. */
            break;
        }
    }

    add_thread(list, state, group_starts, group_ends);
}
/*
 * Append a thread for `state` to `list`, copying the given capture arrays into
 * the slot's own storage. Nothing happens when `state` is NULL, when the list
 * already has a thread in that state, or when the list is full.
 */
static void add_thread(thread_list_t *list, nfa_state_t *state,
                       size_t *group_starts, size_t *group_ends) {
    if (state == NULL ||
        thread_list_contains_state(list, state) ||
        list->count >= list->capacity) {
        return;
    }

    thread_t *slot = &list->threads[list->count];
    list->count += 1;
    slot->state = state;

    if (list->group_count <= 0) return;
    memcpy(slot->group_starts, group_starts, list->group_count * sizeof(size_t));
    memcpy(slot->group_ends, group_ends, list->group_count * sizeof(size_t));
}
bool nfa_match(nfa_t *nfa, const char *text, size_t start_pos, loreg_match_t *result) {
size_t len = strlen(text);
size_t num_states = nfa->state_count;
int group_count = nfa->group_count > 0 ? nfa->group_count : 1;
thread_list_t *current = thread_list_create(num_states, group_count);
thread_list_t *next = thread_list_create(num_states, group_count);
bool *visited = calloc(num_states, sizeof(bool));
if (!current || !next || !visited) {
thread_list_free(current);
thread_list_free(next);
free(visited);
return false;
}
size_t *init_starts = calloc(group_count, sizeof(size_t));
size_t *init_ends = calloc(group_count, sizeof(size_t));
if (!init_starts || !init_ends) {
free(init_starts);
free(init_ends);
thread_list_free(current);
thread_list_free(next);
free(visited);
return false;
}
for (int i = 0; i < group_count; i++) {
init_starts[i] = (size_t)-1;
init_ends[i] = (size_t)-1;
}
memset(visited, 0, num_states * sizeof(bool));
follow_epsilons(current, nfa->start, init_starts, init_ends,
start_pos, len, visited);
bool matched = false;
size_t match_end = start_pos;
size_t *best_starts = calloc(group_count, sizeof(size_t));
size_t *best_ends = calloc(group_count, sizeof(size_t));
if (!best_starts || !best_ends) {
free(init_starts);
free(init_ends);
free(best_starts);
free(best_ends);
thread_list_free(current);
thread_list_free(next);
free(visited);
return false;
}
for (int i = 0; i < group_count; i++) {
best_starts[i] = (size_t)-1;
best_ends[i] = (size_t)-1;
}
for (size_t i = 0; i < current->count; i++) {
if (current->threads[i].state->accepting) {
matched = true;
match_end = start_pos;
memcpy(best_starts, current->threads[i].group_starts, group_count * sizeof(size_t));
memcpy(best_ends, current->threads[i].group_ends, group_count * sizeof(size_t));
break;
}
}
for (size_t pos = start_pos; pos < len; pos++) {
char c = text[pos];
thread_list_clear(next);
for (size_t i = 0; i < current->count; i++) {
thread_t *thread = &current->threads[i];
nfa_state_t *state = thread->state;
for (size_t j = 0; j < state->trans_count; j++) {
transition_t *t = &state->transitions[j];
if (t->type != TRANS_EPSILON &&
t->type != TRANS_GROUP_START &&
t->type != TRANS_GROUP_END &&
t->type != TRANS_ANCHOR_START &&
t->type != TRANS_ANCHOR_END) {
if (transition_matches(t, c, pos, len)) {
memset(visited, 0, num_states * sizeof(bool));
follow_epsilons(next, t->target,
thread->group_starts, thread->group_ends,
pos + 1, len, visited);
}
}
}
}
if (next->count == 0) break;
thread_list_t *tmp = current;
current = next;
next = tmp;
for (size_t i = 0; i < current->count; i++) {
if (current->threads[i].state->accepting) {
matched = true;
match_end = pos + 1;
memcpy(best_starts, current->threads[i].group_starts, group_count * sizeof(size_t));
memcpy(best_ends, current->threads[i].group_ends, group_count * sizeof(size_t));
break;
}
}
}
if (result) {
result->matched = matched;
result->match_start = start_pos;
result->match_end = matched ? match_end : start_pos;
result->group_count = nfa->group_count;
for (int i = 0; i < LOREG_MAX_GROUPS && i < nfa->group_count; i++) {
result->groups[i].start = best_starts[i];
result->groups[i].end = best_ends[i];
result->groups[i].matched = (best_starts[i] != (size_t)-1 && best_ends[i] != (size_t)-1);
}
}
free(init_starts);
free(init_ends);
free(best_starts);
free(best_ends);
thread_list_free(current);
thread_list_free(next);
free(visited);
return matched;
}
/*
 * Find the leftmost match of the NFA anywhere in `text` by attempting an
 * anchored nfa_match at every offset from 0 through strlen(text) inclusive.
 *
 * On success `result` (if non-NULL) describes the match and true is returned.
 * When no offset matches, `result` is reset to an empty non-match and false
 * is returned.
 */
bool nfa_search(nfa_t *nfa, const char *text, loreg_match_t *result) {
    const size_t text_len = strlen(text);
    size_t offset = 0;

    while (offset <= text_len) {
        if (nfa_match(nfa, text, offset, result)) {
            if (result != NULL) {
                result->match_start = offset;
            }
            return true;
        }
        offset++;
    }

    if (result != NULL) {
        result->matched = false;
        result->match_start = 0;
        result->match_end = 0;
        result->group_count = 0;
    }
    return false;
}

477
src/nfa.c Normal file
View File

@ -0,0 +1,477 @@
/* retoor <retoor@molodetz.nl> */
#include "nfa.h"
#include <stdlib.h>
#include <string.h>
nfa_t *nfa_create(void) {
nfa_t *nfa = malloc(sizeof(nfa_t));
if (!nfa) return NULL;
nfa->states = NULL;
nfa->state_count = 0;
nfa->capacity = 0;
nfa->start = NULL;
nfa->group_count = 0;
return nfa;
}
/*
 * Free an NFA together with every state and each state's transition array.
 * Bracket classes referenced by transitions are not freed here. NULL is allowed.
 */
void nfa_free(nfa_t *nfa) {
    if (nfa == NULL) return;

    size_t remaining = nfa->state_count;
    while (remaining > 0) {
        remaining--;
        nfa_state_t *state = nfa->states[remaining];
        free(state->transitions);
        free(state);
    }
    free(nfa->states);
    free(nfa);
}
/*
 * Enlarge the state-pointer array: 16 slots initially, doubling afterwards,
 * never beyond LOREG_MAX_STATES. Returns false when the limit has already
 * been reached or the reallocation fails; the NFA is unchanged in that case.
 */
static bool nfa_grow(nfa_t *nfa) {
    size_t wanted;
    if (nfa->capacity == 0) {
        wanted = 16;
    } else {
        wanted = nfa->capacity * 2;
    }

    if (wanted > LOREG_MAX_STATES) {
        if (nfa->capacity >= LOREG_MAX_STATES) {
            return false;
        }
        wanted = LOREG_MAX_STATES;
    }

    nfa_state_t **bigger = realloc(nfa->states, wanted * sizeof(nfa_state_t *));
    if (bigger == NULL) {
        return false;
    }
    nfa->capacity = wanted;
    nfa->states = bigger;
    return true;
}
/*
 * Create a new non-accepting state with no transitions, register it in `nfa`
 * and return it. The state's id is its index in nfa->states.
 * Returns NULL when the state table cannot grow or allocation fails.
 */
nfa_state_t *nfa_add_state(nfa_t *nfa) {
    if (nfa->state_count >= nfa->capacity && !nfa_grow(nfa)) {
        return NULL;
    }

    nfa_state_t *fresh = malloc(sizeof(nfa_state_t));
    if (fresh == NULL) {
        return NULL;
    }

    fresh->id = (int)nfa->state_count;
    fresh->accepting = false;
    fresh->transitions = NULL;
    fresh->trans_count = 0;
    fresh->trans_capacity = 0;

    nfa->states[nfa->state_count] = fresh;
    nfa->state_count += 1;
    return fresh;
}
/*
 * Enlarge a state's transition array: 4 entries initially, doubling afterwards.
 * Returns false (state unchanged) when reallocation fails.
 */
static bool transition_grow(nfa_state_t *state) {
    size_t wanted = 4;
    if (state->trans_capacity != 0) {
        wanted = state->trans_capacity * 2;
    }

    transition_t *bigger = realloc(state->transitions, wanted * sizeof(transition_t));
    if (bigger == NULL) {
        return false;
    }
    state->trans_capacity = wanted;
    state->transitions = bigger;
    return true;
}
/*
 * Append a transition of the given `type` from `from` to `to`, carrying the
 * literal `value`. The bracket pointer is cleared and group_id is set to -1.
 * If the transition array cannot grow, the transition is silently dropped.
 */
void nfa_add_transition(nfa_state_t *from, nfa_state_t *to, transition_type_t type, char value) {
    if (from->trans_count >= from->trans_capacity && !transition_grow(from)) {
        return;
    }

    transition_t *slot = &from->transitions[from->trans_count];
    from->trans_count += 1;

    slot->target = to;
    slot->type = type;
    slot->value = value;
    slot->group_id = -1;
    slot->bracket = NULL;
}
/*
 * Append a TRANS_BRACKET transition from `from` to `to` that tests characters
 * against `bracket` (the pointer is stored, not copied).
 * If the transition array cannot grow, the transition is silently dropped.
 */
void nfa_add_bracket_transition(nfa_state_t *from, nfa_state_t *to, bracket_class_t *bracket) {
    if (from->trans_count >= from->trans_capacity && !transition_grow(from)) {
        return;
    }

    transition_t *slot = &from->transitions[from->trans_count];
    from->trans_count += 1;

    slot->target = to;
    slot->type = TRANS_BRACKET;
    slot->bracket = bracket;
    slot->value = '\0';
    slot->group_id = -1;
}
/*
 * Append a group-marker transition (`type` is expected to be a group start or
 * end marker) from `from` to `to`, tagged with `group_id`.
 * If the transition array cannot grow, the transition is silently dropped.
 */
void nfa_add_group_transition(nfa_state_t *from, nfa_state_t *to, transition_type_t type, int group_id) {
    if (from->trans_count >= from->trans_capacity && !transition_grow(from)) {
        return;
    }

    transition_t *slot = &from->transitions[from->trans_count];
    from->trans_count += 1;

    slot->target = to;
    slot->type = type;
    slot->group_id = group_id;
    slot->value = '\0';
    slot->bracket = NULL;
}
/* Forward declaration: the build_* helpers below recurse into build_nfa for their children. */
static nfa_fragment_t build_nfa(nfa_t *nfa, ast_node_t *ast, loreg_error_t *error);
/*
 * Fragment for a literal character: two states joined by a TRANS_CHAR
 * transition on `c`. On allocation failure *error is set to
 * LOREG_ERR_OUT_OF_MEMORY and an empty fragment is returned.
 */
static nfa_fragment_t build_char(nfa_t *nfa, char c, loreg_error_t *error) {
    nfa_fragment_t out = {NULL, NULL};
    nfa_state_t *from = nfa_add_state(nfa);
    nfa_state_t *to = nfa_add_state(nfa);

    if (from == NULL || to == NULL) {
        *error = LOREG_ERR_OUT_OF_MEMORY;
    } else {
        nfa_add_transition(from, to, TRANS_CHAR, c);
        out.start = from;
        out.accept = to;
    }
    return out;
}
/*
 * Fragment for '.': two states joined by a TRANS_DOT transition.
 * On allocation failure *error is set to LOREG_ERR_OUT_OF_MEMORY and an empty
 * fragment is returned.
 */
static nfa_fragment_t build_dot(nfa_t *nfa, loreg_error_t *error) {
    nfa_fragment_t out = {NULL, NULL};
    nfa_state_t *from = nfa_add_state(nfa);
    nfa_state_t *to = nfa_add_state(nfa);

    if (from == NULL || to == NULL) {
        *error = LOREG_ERR_OUT_OF_MEMORY;
    } else {
        nfa_add_transition(from, to, TRANS_DOT, '\0');
        out.start = from;
        out.accept = to;
    }
    return out;
}
/*
 * Fragment for a shorthand character class: two states joined by a single
 * transition of the given class `type`. On allocation failure *error is set
 * to LOREG_ERR_OUT_OF_MEMORY and an empty fragment is returned.
 */
static nfa_fragment_t build_class(nfa_t *nfa, transition_type_t type, loreg_error_t *error) {
    nfa_fragment_t out = {NULL, NULL};
    nfa_state_t *from = nfa_add_state(nfa);
    nfa_state_t *to = nfa_add_state(nfa);

    if (from == NULL || to == NULL) {
        *error = LOREG_ERR_OUT_OF_MEMORY;
    } else {
        nfa_add_transition(from, to, type, '\0');
        out.start = from;
        out.accept = to;
    }
    return out;
}
/*
 * Fragment for a bracket expression: two states joined by a bracket
 * transition that refers to `bracket`. On allocation failure *error is set to
 * LOREG_ERR_OUT_OF_MEMORY and an empty fragment is returned.
 */
static nfa_fragment_t build_bracket(nfa_t *nfa, bracket_class_t *bracket, loreg_error_t *error) {
    nfa_fragment_t out = {NULL, NULL};
    nfa_state_t *from = nfa_add_state(nfa);
    nfa_state_t *to = nfa_add_state(nfa);

    if (from == NULL || to == NULL) {
        *error = LOREG_ERR_OUT_OF_MEMORY;
    } else {
        nfa_add_bracket_transition(from, to, bracket);
        out.start = from;
        out.accept = to;
    }
    return out;
}
/*
 * Fragment for `left` followed by `right`: both sub-fragments are built (left
 * first) and the left accept state is linked to the right start state by an
 * epsilon. Returns an empty fragment as soon as *error is set by a child.
 */
static nfa_fragment_t build_concat(nfa_t *nfa, ast_node_t *left, ast_node_t *right, loreg_error_t *error) {
    nfa_fragment_t joined = {NULL, NULL};

    nfa_fragment_t head = build_nfa(nfa, left, error);
    if (*error == LOREG_OK) {
        nfa_fragment_t tail = build_nfa(nfa, right, error);
        if (*error == LOREG_OK) {
            nfa_add_transition(head.accept, tail.start, TRANS_EPSILON, '\0');
            joined.start = head.start;
            joined.accept = tail.accept;
        }
    }
    return joined;
}
/*
 * Fragment for `left | right`: a fresh start state fans out by epsilon to both
 * alternatives (left first), and both alternatives' accept states lead by
 * epsilon to a fresh shared accept state.
 * Returns an empty fragment when allocation fails or a child sets *error.
 */
static nfa_fragment_t build_alter(nfa_t *nfa, ast_node_t *left, ast_node_t *right, loreg_error_t *error) {
    nfa_fragment_t none = {NULL, NULL};

    nfa_state_t *fork = nfa_add_state(nfa);
    nfa_state_t *join = nfa_add_state(nfa);
    if (fork == NULL || join == NULL) {
        *error = LOREG_ERR_OUT_OF_MEMORY;
        return none;
    }

    nfa_fragment_t first = build_nfa(nfa, left, error);
    if (*error != LOREG_OK) return none;
    nfa_fragment_t second = build_nfa(nfa, right, error);
    if (*error != LOREG_OK) return none;

    nfa_add_transition(fork, first.start, TRANS_EPSILON, '\0');
    nfa_add_transition(fork, second.start, TRANS_EPSILON, '\0');
    nfa_add_transition(first.accept, join, TRANS_EPSILON, '\0');
    nfa_add_transition(second.accept, join, TRANS_EPSILON, '\0');

    nfa_fragment_t both = {NULL, NULL};
    both.start = fork;
    both.accept = join;
    return both;
}
/*
 * Fragment for `child*`. A fresh start state has epsilons to the child and to
 * a fresh accept state; `greedy` decides which of the two is added first
 * (child first when greedy). The child's accept state loops back to the
 * child's start and also leads to the accept state.
 * Returns an empty fragment when allocation fails or the child sets *error.
 */
static nfa_fragment_t build_star(nfa_t *nfa, ast_node_t *child, bool greedy, loreg_error_t *error) {
    nfa_fragment_t none = {NULL, NULL};

    nfa_state_t *entry = nfa_add_state(nfa);
    nfa_state_t *leave = nfa_add_state(nfa);
    if (entry == NULL || leave == NULL) {
        *error = LOREG_ERR_OUT_OF_MEMORY;
        return none;
    }

    nfa_fragment_t body = build_nfa(nfa, child, error);
    if (*error != LOREG_OK) return none;

    /* Only the insertion order of the two entry epsilons depends on greediness. */
    nfa_state_t *first_choice = greedy ? body.start : leave;
    nfa_state_t *second_choice = greedy ? leave : body.start;
    nfa_add_transition(entry, first_choice, TRANS_EPSILON, '\0');
    nfa_add_transition(entry, second_choice, TRANS_EPSILON, '\0');

    nfa_add_transition(body.accept, body.start, TRANS_EPSILON, '\0');
    nfa_add_transition(body.accept, leave, TRANS_EPSILON, '\0');

    nfa_fragment_t out = {NULL, NULL};
    out.start = entry;
    out.accept = leave;
    return out;
}
/*
 * Fragment for `child+`. The fragment starts at the child's start state; the
 * child's accept state gets epsilons back to the child's start and on to a
 * fresh accept state, added loop-first when `greedy` and exit-first otherwise.
 * Returns an empty fragment when allocation fails or the child sets *error.
 */
static nfa_fragment_t build_plus(nfa_t *nfa, ast_node_t *child, bool greedy, loreg_error_t *error) {
    nfa_fragment_t none = {NULL, NULL};

    nfa_state_t *leave = nfa_add_state(nfa);
    if (leave == NULL) {
        *error = LOREG_ERR_OUT_OF_MEMORY;
        return none;
    }

    nfa_fragment_t body = build_nfa(nfa, child, error);
    if (*error != LOREG_OK) return none;

    nfa_state_t *first_choice = greedy ? body.start : leave;
    nfa_state_t *second_choice = greedy ? leave : body.start;
    nfa_add_transition(body.accept, first_choice, TRANS_EPSILON, '\0');
    nfa_add_transition(body.accept, second_choice, TRANS_EPSILON, '\0');

    nfa_fragment_t out = {NULL, NULL};
    out.start = body.start;
    out.accept = leave;
    return out;
}
/*
 * Fragment for `child?`. A fresh start state has epsilons to the child and to
 * a fresh accept state (child first when `greedy`, accept first otherwise);
 * the child's accept state leads to the accept state.
 * Returns an empty fragment when allocation fails or the child sets *error.
 */
static nfa_fragment_t build_question(nfa_t *nfa, ast_node_t *child, bool greedy, loreg_error_t *error) {
    nfa_fragment_t none = {NULL, NULL};

    nfa_state_t *entry = nfa_add_state(nfa);
    nfa_state_t *leave = nfa_add_state(nfa);
    if (entry == NULL || leave == NULL) {
        *error = LOREG_ERR_OUT_OF_MEMORY;
        return none;
    }

    nfa_fragment_t body = build_nfa(nfa, child, error);
    if (*error != LOREG_OK) return none;

    nfa_state_t *first_choice = greedy ? body.start : leave;
    nfa_state_t *second_choice = greedy ? leave : body.start;
    nfa_add_transition(entry, first_choice, TRANS_EPSILON, '\0');
    nfa_add_transition(entry, second_choice, TRANS_EPSILON, '\0');
    nfa_add_transition(body.accept, leave, TRANS_EPSILON, '\0');

    nfa_fragment_t out = {NULL, NULL};
    out.start = entry;
    out.accept = leave;
    return out;
}
/*
 * Fragment for a capturing group: the child is wrapped between a
 * TRANS_GROUP_START and a TRANS_GROUP_END marker tagged with `group_id`, and
 * nfa->group_count is raised to at least group_id + 1.
 * Returns an empty fragment when allocation fails or the child sets *error.
 */
static nfa_fragment_t build_group(nfa_t *nfa, ast_node_t *child, int group_id, loreg_error_t *error) {
    nfa_fragment_t none = {NULL, NULL};

    nfa_state_t *open = nfa_add_state(nfa);
    nfa_state_t *close = nfa_add_state(nfa);
    if (open == NULL || close == NULL) {
        *error = LOREG_ERR_OUT_OF_MEMORY;
        return none;
    }

    nfa_fragment_t body = build_nfa(nfa, child, error);
    if (*error != LOREG_OK) return none;

    nfa_add_group_transition(open, body.start, TRANS_GROUP_START, group_id);
    nfa_add_group_transition(body.accept, close, TRANS_GROUP_END, group_id);

    const int needed = group_id + 1;
    if (nfa->group_count < needed) {
        nfa->group_count = needed;
    }

    nfa_fragment_t out = {NULL, NULL};
    out.start = open;
    out.accept = close;
    return out;
}
/*
 * Fragment for an anchor: two states joined by a single transition of the
 * given anchor `type`. On allocation failure *error is set to
 * LOREG_ERR_OUT_OF_MEMORY and an empty fragment is returned.
 */
static nfa_fragment_t build_anchor(nfa_t *nfa, transition_type_t type, loreg_error_t *error) {
    nfa_fragment_t out = {NULL, NULL};
    nfa_state_t *from = nfa_add_state(nfa);
    nfa_state_t *to = nfa_add_state(nfa);

    if (from == NULL || to == NULL) {
        *error = LOREG_ERR_OUT_OF_MEMORY;
    } else {
        nfa_add_transition(from, to, type, '\0');
        out.start = from;
        out.accept = to;
    }
    return out;
}
/*
 * Fragment for a counted repetition `child{min,max}` (max < 0 means unbounded).
 *
 * - {0,0} yields a single state that is both start and accept.
 * - Otherwise `min` mandatory copies of the child are chained after a fresh
 *   start state.
 * - Unbounded: a loop state follows the mandatory copies; it can enter one
 *   more copy of the child (which returns to the loop state) or exit to the
 *   accept state. `greedy` decides which epsilon is added first.
 * - Bounded: one optional copy is built for each i in [min, max); every point
 *   between copies has an epsilon to the accept state.
 *
 * The child AST is re-built for each copy. Returns an empty fragment when
 * allocation fails or a child build sets *error.
 */
static nfa_fragment_t build_quantifier(nfa_t *nfa, ast_node_t *child, int min, int max, bool greedy, loreg_error_t *error) {
    nfa_fragment_t none = {NULL, NULL};
    nfa_fragment_t out = {NULL, NULL};

    if (min == 0 && max == 0) {
        nfa_state_t *only = nfa_add_state(nfa);
        if (only == NULL) {
            *error = LOREG_ERR_OUT_OF_MEMORY;
            return none;
        }
        out.start = only;
        out.accept = only;
        return out;
    }

    nfa_state_t *entry = nfa_add_state(nfa);
    if (entry == NULL) {
        *error = LOREG_ERR_OUT_OF_MEMORY;
        return none;
    }

    /* Mandatory part: `min` copies in sequence. */
    nfa_state_t *tail = entry;
    for (int built = 0; built < min; built++) {
        nfa_fragment_t copy = build_nfa(nfa, child, error);
        if (*error != LOREG_OK) return none;
        nfa_add_transition(tail, copy.start, TRANS_EPSILON, '\0');
        tail = copy.accept;
    }

    if (max < 0) {
        /* Unbounded tail: a star-like loop around one more copy. */
        nfa_state_t *hub = nfa_add_state(nfa);
        nfa_state_t *leave = nfa_add_state(nfa);
        if (hub == NULL || leave == NULL) {
            *error = LOREG_ERR_OUT_OF_MEMORY;
            return none;
        }
        nfa_add_transition(tail, hub, TRANS_EPSILON, '\0');

        nfa_fragment_t copy = build_nfa(nfa, child, error);
        if (*error != LOREG_OK) return none;

        nfa_state_t *first_choice = greedy ? copy.start : leave;
        nfa_state_t *second_choice = greedy ? leave : copy.start;
        nfa_add_transition(hub, first_choice, TRANS_EPSILON, '\0');
        nfa_add_transition(hub, second_choice, TRANS_EPSILON, '\0');
        nfa_add_transition(copy.accept, hub, TRANS_EPSILON, '\0');

        out.start = entry;
        out.accept = leave;
        return out;
    }

    /* Bounded tail: optional copies, each of which may be skipped. */
    nfa_state_t *leave = nfa_add_state(nfa);
    if (leave == NULL) {
        *error = LOREG_ERR_OUT_OF_MEMORY;
        return none;
    }
    nfa_add_transition(tail, leave, TRANS_EPSILON, '\0');

    for (int i = min; i < max; i++) {
        nfa_fragment_t copy = build_nfa(nfa, child, error);
        if (*error != LOREG_OK) return none;

        if (!greedy) {
            nfa_add_transition(tail, leave, TRANS_EPSILON, '\0');
        }
        nfa_add_transition(tail, copy.start, TRANS_EPSILON, '\0');
        if (greedy) {
            nfa_add_transition(copy.accept, leave, TRANS_EPSILON, '\0');
        }
        tail = copy.accept;
    }
    if (!greedy) {
        nfa_add_transition(tail, leave, TRANS_EPSILON, '\0');
    }

    out.start = entry;
    out.accept = leave;
    return out;
}
/*
 * Translate an AST node into an NFA fragment by dispatching on the node type.
 *
 * A NULL node stands for the empty expression and becomes a single state that
 * is both start and accept. A node type not listed in the switch yields an
 * empty {NULL, NULL} fragment without touching *error.
 */
static nfa_fragment_t build_nfa(nfa_t *nfa, ast_node_t *ast, loreg_error_t *error) {
    nfa_fragment_t out = {NULL, NULL};

    if (ast == NULL) {
        nfa_state_t *only = nfa_add_state(nfa);
        if (only == NULL) {
            *error = LOREG_ERR_OUT_OF_MEMORY;
        } else {
            out.start = only;
            out.accept = only;
        }
        return out;
    }

    switch (ast->type) {
    case AST_CHAR:
        out = build_char(nfa, ast->value, error);
        break;
    case AST_DOT:
        out = build_dot(nfa, error);
        break;
    case AST_CONCAT:
        out = build_concat(nfa, ast->left, ast->right, error);
        break;
    case AST_ALTER:
        out = build_alter(nfa, ast->left, ast->right, error);
        break;
    case AST_STAR:
        out = build_star(nfa, ast->left, ast->quant.greedy, error);
        break;
    case AST_PLUS:
        out = build_plus(nfa, ast->left, ast->quant.greedy, error);
        break;
    case AST_QUESTION:
        out = build_question(nfa, ast->left, ast->quant.greedy, error);
        break;
    case AST_GROUP:
        out = build_group(nfa, ast->left, ast->group_id, error);
        break;
    case AST_ANCHOR_START:
        out = build_anchor(nfa, TRANS_ANCHOR_START, error);
        break;
    case AST_ANCHOR_END:
        out = build_anchor(nfa, TRANS_ANCHOR_END, error);
        break;
    case AST_BRACKET:
        out = build_bracket(nfa, ast->bracket, error);
        break;
    case AST_QUANTIFIER:
        out = build_quantifier(nfa, ast->left, ast->quant.min, ast->quant.max,
                               ast->quant.greedy, error);
        break;
    case AST_CLASS_DIGIT:
        out = build_class(nfa, TRANS_CLASS_DIGIT, error);
        break;
    case AST_CLASS_WORD:
        out = build_class(nfa, TRANS_CLASS_WORD, error);
        break;
    case AST_CLASS_SPACE:
        out = build_class(nfa, TRANS_CLASS_SPACE, error);
        break;
    case AST_CLASS_NDIGIT:
        out = build_class(nfa, TRANS_CLASS_NDIGIT, error);
        break;
    case AST_CLASS_NWORD:
        out = build_class(nfa, TRANS_CLASS_NWORD, error);
        break;
    case AST_CLASS_NSPACE:
        out = build_class(nfa, TRANS_CLASS_NSPACE, error);
        break;
    }
    return out;
}
/*
 * Compile an AST into a complete NFA.
 *
 * ast   root of the parsed pattern; NULL is treated as the empty pattern
 * error receives LOREG_OK on success or the reason for failure
 *
 * Returns the new NFA (caller frees it with nfa_free), or NULL on failure.
 * An incomplete fragment (missing start or accept state, which build_nfa
 * returns for a node type it does not handle) is reported as
 * LOREG_ERR_INVALID_PATTERN instead of being dereferenced.
 */
nfa_t *nfa_from_ast(ast_node_t *ast, loreg_error_t *error) {
    *error = LOREG_OK;
    nfa_t *nfa = nfa_create();
    if (!nfa) {
        *error = LOREG_ERR_OUT_OF_MEMORY;
        return NULL;
    }

    nfa_fragment_t frag = build_nfa(nfa, ast, error);
    if (*error == LOREG_OK && (!frag.start || !frag.accept)) {
        *error = LOREG_ERR_INVALID_PATTERN;
    }
    if (*error != LOREG_OK) {
        nfa_free(nfa);
        return NULL;
    }

    nfa->start = frag.start;
    frag.accept->accepting = true;
    return nfa;
}

309
src/parser.c Normal file
View File

@ -0,0 +1,309 @@
/* retoor <retoor@molodetz.nl> */
#include "parser.h"
#include <ctype.h>
#include <limits.h>
#include <stdlib.h>
/* Replace the parser's current token with the next one from its lexer. */
static void parser_advance(parser_t *parser)
{
    parser->current = lexer_next(&parser->lexer);
}
/*
 * Prepare `parser` to parse `pattern`: clear the error and group counter,
 * initialise the lexer and load the first token.
 */
void parser_init(parser_t *parser, const char *pattern) {
    parser->group_count = 0;
    parser->error = LOREG_OK;
    lexer_init(&parser->lexer, pattern);
    parser->current = lexer_next(&parser->lexer);
}
/* Return the error code recorded by the parser (LOREG_OK when none). */
loreg_error_t parser_get_error(parser_t *parser)
{
    return parser->error;
}
/*
 * Forward declarations for the recursive-descent parser. Call structure, as
 * used below: parse_expr (alternation) -> parse_term (concatenation) ->
 * parse_factor (quantifiers) -> parse_atom, which recurses into parse_expr for
 * groups and into parse_bracket for bracket expressions.
 */
static ast_node_t *parse_expr(parser_t *parser);
static ast_node_t *parse_term(parser_t *parser);
static ast_node_t *parse_factor(parser_t *parser);
static ast_node_t *parse_atom(parser_t *parser);
static ast_node_t *parse_bracket(parser_t *parser);
static int parse_number(parser_t *parser);
/*
 * Parse an alternation: term ('|' term)*.
 *
 * Returns the AST for the expression. NULL is returned either for an empty
 * expression (parser->error stays LOREG_OK) or on failure (parser->error set).
 *
 * An empty alternative on either side of '|' (as in "a|", "|a" or "(a|)") is
 * rejected with LOREG_ERR_INVALID_PATTERN. Without that check the function
 * returned NULL with no error, so the alternatives already parsed were
 * silently thrown away.
 */
static ast_node_t *parse_expr(parser_t *parser) {
    ast_node_t *left = parse_term(parser);
    if (parser->error != LOREG_OK) return left;
    if (!left) {
        /* Nothing before a '|' means the first alternative is empty. */
        if (parser->current.type == TOKEN_PIPE) {
            parser->error = LOREG_ERR_INVALID_PATTERN;
        }
        return NULL;
    }

    while (parser->current.type == TOKEN_PIPE) {
        parser_advance(parser);
        ast_node_t *right = parse_term(parser);
        if (!right) {
            /* Either parse_term failed, or the alternative after '|' is empty. */
            if (parser->error == LOREG_OK) {
                parser->error = LOREG_ERR_INVALID_PATTERN;
            }
            ast_free(left);
            return NULL;
        }
        left = ast_create_alter(left, right);
        if (!left) {
            parser->error = LOREG_ERR_OUT_OF_MEMORY;
            return NULL;
        }
    }
    return left;
}
/*
 * Parse a concatenation: factors are read until end of input, '|' or ')' and
 * chained left to right. Returns NULL for an empty term, or when a factor
 * fails (the partial chain is freed) or a concat node cannot be allocated.
 */
static ast_node_t *parse_term(parser_t *parser) {
    ast_node_t *chain = NULL;

    for (;;) {
        token_type_t kind = parser->current.type;
        if (kind == TOKEN_EOF || kind == TOKEN_PIPE || kind == TOKEN_RPAREN) {
            break;
        }

        ast_node_t *piece = parse_factor(parser);
        if (piece == NULL) {
            ast_free(chain);
            return NULL;
        }

        if (chain == NULL) {
            chain = piece;
            continue;
        }

        chain = ast_create_concat(chain, piece);
        if (chain == NULL) {
            parser->error = LOREG_ERR_OUT_OF_MEMORY;
            return NULL;
        }
    }
    return chain;
}
/*
 * Parse an atom followed by any number of quantifiers: '*', '+', '?',
 * '{n}', '{n,}' or '{n,m}', each optionally followed by '?' to make it lazy.
 *
 * Returns the resulting AST, or NULL with parser->error set on failure.
 *
 * Two malformed forms are reported as LOREG_ERR_INVALID_QUANTIFIER:
 *   - a '?' directly after '{' (e.g. "a{?2}"), which used to be consumed as a
 *     laziness marker before the bounds were read;
 *   - "{n,m}" with m < n, which used to be accepted silently.
 */
static ast_node_t *parse_factor(parser_t *parser) {
    ast_node_t *atom = parse_atom(parser);
    if (!atom || parser->error != LOREG_OK) return atom;

    while (parser->current.type == TOKEN_STAR ||
           parser->current.type == TOKEN_PLUS ||
           parser->current.type == TOKEN_QUESTION ||
           parser->current.type == TOKEN_LBRACE) {
        token_type_t quant_type = parser->current.type;
        int min = 0;
        int max = 0;
        parser_advance(parser);

        if (quant_type == TOKEN_LBRACE) {
            min = parse_number(parser);
            max = min;
            if (parser->current.type == TOKEN_CHAR && parser->current.value == ',') {
                parser_advance(parser);
                if (parser->current.type == TOKEN_RBRACE) {
                    max = -1; /* "{n,}": no upper bound */
                } else {
                    max = parse_number(parser);
                }
            }
            if (parser->current.type != TOKEN_RBRACE || (max >= 0 && max < min)) {
                parser->error = LOREG_ERR_INVALID_QUANTIFIER;
                ast_free(atom);
                return NULL;
            }
            parser_advance(parser);
        }

        /* The laziness marker is only valid after the complete quantifier. */
        bool greedy = true;
        if (parser->current.type == TOKEN_QUESTION) {
            greedy = false;
            parser_advance(parser);
        }

        if (quant_type == TOKEN_STAR) {
            atom = ast_create_star(atom, greedy);
        } else if (quant_type == TOKEN_PLUS) {
            atom = ast_create_plus(atom, greedy);
        } else if (quant_type == TOKEN_QUESTION) {
            atom = ast_create_question(atom, greedy);
        } else {
            atom = ast_create_quantifier(atom, min, max, greedy);
        }

        if (!atom) {
            parser->error = LOREG_ERR_OUT_OF_MEMORY;
            return NULL;
        }
    }
    return atom;
}
/*
 * Read a run of decimal digit tokens and return their value.
 *
 * Returns 0 when the current token is not a digit. Values too large for an
 * int saturate at INT_MAX instead of overflowing, which is undefined behaviour
 * for signed integers. The character is converted to unsigned char before
 * being handed to isdigit(), as <ctype.h> requires for negative char values.
 */
static int parse_number(parser_t *parser) {
    int num = 0;
    while (parser->current.type == TOKEN_CHAR &&
           isdigit((unsigned char)parser->current.value)) {
        int digit = parser->current.value - '0';
        if (num > (INT_MAX - digit) / 10) {
            num = INT_MAX; /* saturate; further digits keep it at INT_MAX */
        } else {
            num = num * 10 + digit;
        }
        parser_advance(parser);
    }
    return num;
}
/*
 * Parse a single atom: literal character, '.', anchor, shorthand class,
 * bracket expression, or a parenthesised group.
 *
 * Returns the node, or NULL. NULL with parser->error == LOREG_OK means "no
 * atom here" (end of input, '|' or ')'). Any other token that cannot start an
 * atom sets LOREG_ERR_INVALID_PATTERN, and a failed node allocation sets
 * LOREG_ERR_OUT_OF_MEMORY.
 *
 * For groups, an error raised while parsing the group body is propagated
 * unchanged. It used to be overwritten with LOREG_ERR_UNBALANCED_PAREN, or, if
 * the failing token happened to be ')', a group node was built anyway.
 */
static ast_node_t *parse_atom(parser_t *parser) {
    ast_node_t *node = NULL;

    switch (parser->current.type) {
    case TOKEN_CHAR:
        node = ast_create_char(parser->current.value);
        parser_advance(parser);
        break;
    case TOKEN_DOT:
        node = ast_create_dot();
        parser_advance(parser);
        break;
    case TOKEN_CARET:
        node = ast_create_anchor_start();
        parser_advance(parser);
        break;
    case TOKEN_DOLLAR:
        node = ast_create_anchor_end();
        parser_advance(parser);
        break;
    case TOKEN_LPAREN: {
        parser_advance(parser);
        /* Group ids are handed out in order of the opening parenthesis. */
        int group_id = parser->group_count++;
        ast_node_t *inner = parse_expr(parser);
        if (parser->error != LOREG_OK) {
            /* Keep the more specific error reported by the group body. */
            ast_free(inner);
            return NULL;
        }
        if (parser->current.type != TOKEN_RPAREN) {
            parser->error = LOREG_ERR_UNBALANCED_PAREN;
            ast_free(inner);
            return NULL;
        }
        parser_advance(parser);
        node = ast_create_group(inner, group_id);
        break;
    }
    case TOKEN_LBRACKET:
        node = parse_bracket(parser);
        break;
    case TOKEN_CLASS_DIGIT:
        node = ast_create_class(AST_CLASS_DIGIT);
        parser_advance(parser);
        break;
    case TOKEN_CLASS_WORD:
        node = ast_create_class(AST_CLASS_WORD);
        parser_advance(parser);
        break;
    case TOKEN_CLASS_SPACE:
        node = ast_create_class(AST_CLASS_SPACE);
        parser_advance(parser);
        break;
    case TOKEN_CLASS_NDIGIT:
        node = ast_create_class(AST_CLASS_NDIGIT);
        parser_advance(parser);
        break;
    case TOKEN_CLASS_NWORD:
        node = ast_create_class(AST_CLASS_NWORD);
        parser_advance(parser);
        break;
    case TOKEN_CLASS_NSPACE:
        node = ast_create_class(AST_CLASS_NSPACE);
        parser_advance(parser);
        break;
    case TOKEN_EOF:
    case TOKEN_PIPE:
    case TOKEN_RPAREN:
        /* Not an atom; the caller decides whether that is acceptable. */
        return NULL;
    default:
        parser->error = LOREG_ERR_INVALID_PATTERN;
        return NULL;
    }

    /* A NULL node with no recorded error can only be a failed allocation. */
    if (!node && parser->error == LOREG_OK) {
        parser->error = LOREG_ERR_OUT_OF_MEMORY;
    }
    return node;
}
/*
 * Parse a bracket expression; the current token is the opening '['.
 *
 * A leading '^' marks the class as negated. Inside the brackets:
 *   - \d, \w and \s add their character sets;
 *   - \D, \W and \S are consumed but add nothing;
 *   - "x-y" adds a range, while a '-' followed by ']' or end of input adds the
 *     preceding character and a literal '-';
 *   - any other token contributes its `value` as a single character.
 *
 * Returns a bracket AST node, or NULL with parser->error set when the class
 * cannot be allocated (LOREG_ERR_OUT_OF_MEMORY) or the closing ']' is missing
 * (LOREG_ERR_INVALID_PATTERN).
 */
static ast_node_t *parse_bracket(parser_t *parser) {
    parser_advance(parser);

    bracket_class_t *set = bracket_create();
    if (set == NULL) {
        parser->error = LOREG_ERR_OUT_OF_MEMORY;
        return NULL;
    }

    if (parser->current.type == TOKEN_CARET) {
        set->negated = true;
        parser_advance(parser);
    }

    for (;;) {
        token_type_t kind = parser->current.type;
        if (kind == TOKEN_RBRACKET || kind == TOKEN_EOF) {
            break;
        }

        switch (kind) {
        case TOKEN_CLASS_DIGIT:
            bracket_add_range(set, '0', '9');
            parser_advance(parser);
            continue;
        case TOKEN_CLASS_WORD:
            bracket_add_range(set, 'a', 'z');
            bracket_add_range(set, 'A', 'Z');
            bracket_add_range(set, '0', '9');
            bracket_add_char(set, '_');
            parser_advance(parser);
            continue;
        case TOKEN_CLASS_SPACE:
            bracket_add_char(set, ' ');
            bracket_add_char(set, '\t');
            bracket_add_char(set, '\n');
            bracket_add_char(set, '\r');
            bracket_add_char(set, '\f');
            bracket_add_char(set, '\v');
            parser_advance(parser);
            continue;
        case TOKEN_CLASS_NDIGIT:
        case TOKEN_CLASS_NWORD:
        case TOKEN_CLASS_NSPACE:
            /* Negated shorthands are skipped without adding anything. */
            parser_advance(parser);
            continue;
        default:
            break;
        }

        char low = parser->current.value;
        parser_advance(parser);

        if (parser->current.type != TOKEN_DASH) {
            bracket_add_char(set, low);
            continue;
        }

        parser_advance(parser);
        if (parser->current.type == TOKEN_RBRACKET || parser->current.type == TOKEN_EOF) {
            /* Trailing '-' is a literal dash, not a range operator. */
            bracket_add_char(set, low);
            bracket_add_char(set, '-');
        } else {
            char high = parser->current.value;
            bracket_add_range(set, low, high);
            parser_advance(parser);
        }
    }

    if (parser->current.type != TOKEN_RBRACKET) {
        bracket_free(set);
        parser->error = LOREG_ERR_INVALID_PATTERN;
        return NULL;
    }
    parser_advance(parser);
    return ast_create_bracket(set);
}
/*
 * Parse the whole pattern and return its AST.
 *
 * Returns NULL with parser->error == LOREG_OK for an empty pattern, and NULL
 * with parser->error set on a syntax or allocation error.
 *
 * The expression must consume every token. Input left over after the
 * top-level expression, such as the ')' in "a)b", is reported as an error
 * (LOREG_ERR_UNBALANCED_PAREN for a stray ')', LOREG_ERR_INVALID_PATTERN
 * otherwise) instead of silently truncating the pattern.
 */
ast_node_t *parser_parse(parser_t *parser) {
    if (parser->current.type == TOKEN_EOF) {
        return NULL;
    }

    ast_node_t *ast = parse_expr(parser);
    if (parser->error == LOREG_OK && parser->current.type != TOKEN_EOF) {
        parser->error = parser->current.type == TOKEN_RPAREN
                            ? LOREG_ERR_UNBALANCED_PAREN
                            : LOREG_ERR_INVALID_PATTERN;
        ast_free(ast);
        return NULL;
    }
    return ast;
}

170
src/repl.c Normal file
View File

@ -0,0 +1,170 @@
/* retoor <retoor@molodetz.nl> */
#include "repl.h"
#include "loreg.h"
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#define MAX_INPUT 4096
/* Print the REPL start-up banner: version line plus a one-line command summary. */
static void print_banner(void) {
    printf("loreg v%s - regex interpreter\n", LOREG_VERSION);
    fputs("commands: :q quit, :h help, :p <pattern> set pattern, :m <text> match, :s <text> search\n\n",
          stdout);
}
/* Print the REPL command list followed by a summary of the supported regex syntax. */
static void print_help(void) {
    static const char *const help_lines[] = {
        "loreg REPL commands:\n",
        " :q quit\n",
        " :h show this help\n",
        " :p <regex> compile and set pattern\n",
        " :m <text> match text against pattern (anchored)\n",
        " :s <text> search for pattern in text\n",
        " <text> search for pattern in text\n\n",
        "regex syntax:\n",
        " . any character\n",
        " * zero or more\n",
        " + one or more\n",
        " ? zero or one\n",
        " | alternation\n",
        " () grouping\n",
        " [] character class\n",
        " [^] negated class\n",
        " ^ start anchor\n",
        " $ end anchor\n",
        " {n} exactly n\n",
        " {n,} n or more\n",
        " {n,m} n to m\n",
        " \\d digit\n",
        " \\w word character\n",
        " \\s whitespace\n",
        " \\D \\W \\S negated classes\n\n",
    };
    const size_t line_count = sizeof help_lines / sizeof help_lines[0];

    for (size_t i = 0; i < line_count; i++) {
        fputs(help_lines[i], stdout);
    }
}
/*
 * Print a match result for `text`: either "no match", or the matched substring
 * with its [start-end] offsets followed by one line per capture group that
 * participated in the match.
 */
static void print_match(const char *text, loreg_match_t *result) {
    if (!result->matched) {
        printf("no match\n");
        return;
    }

    printf("match: \"");
    for (size_t at = result->match_start; at < result->match_end; at++) {
        putchar(text[at]);
    }
    printf("\" [%zu-%zu]\n", result->match_start, result->match_end);

    for (size_t g = 0; g < result->group_count; g++) {
        if (!result->groups[g].matched) {
            continue;
        }
        printf(" group %zu: \"", g);
        for (size_t at = result->groups[g].start; at < result->groups[g].end; at++) {
            putchar(text[at]);
        }
        printf("\" [%zu-%zu]\n", result->groups[g].start, result->groups[g].end);
    }
}
/*
 * Show the "> " prompt and read one line from stdin into a static buffer of
 * MAX_INPUT bytes, stripping the trailing newline if present.
 * Returns the buffer (overwritten by the next call), or NULL when fgets fails
 * or reaches end of input.
 */
static char *read_line(void) {
    static char buffer[MAX_INPUT];

    fputs("> ", stdout);
    fflush(stdout);

    if (fgets(buffer, MAX_INPUT, stdin) == NULL) {
        return NULL;
    }

    /* fgets stops at the first newline, so if one exists it is the last character. */
    char *newline = strchr(buffer, '\n');
    if (newline != NULL) {
        *newline = '\0';
    }
    return buffer;
}
void repl_run(void) {
print_banner();
loreg_regex_t *regex = NULL;
char *line;
while ((line = read_line()) != NULL) {
if (strlen(line) == 0) continue;
if (strcmp(line, ":q") == 0 || strcmp(line, ":quit") == 0) {
break;
}
if (strcmp(line, ":h") == 0 || strcmp(line, ":help") == 0) {
print_help();
continue;
}
if (strncmp(line, ":p ", 3) == 0) {
const char *pattern = line + 3;
while (*pattern == ' ') pattern++;
if (regex) {
loreg_free(regex);
regex = NULL;
}
loreg_error_t error;
regex = loreg_compile(pattern, &error);
if (!regex) {
printf("error: %s\n", loreg_error_string(error));
} else {
printf("pattern compiled: %s\n", pattern);
}
continue;
}
if (strncmp(line, ":m ", 3) == 0) {
if (!regex) {
printf("error: no pattern set (use :p <pattern>)\n");
continue;
}
const char *text = line + 3;
while (*text == ' ') text++;
loreg_match_t result;
loreg_match(regex, text, &result);
print_match(text, &result);
continue;
}
if (strncmp(line, ":s ", 3) == 0) {
if (!regex) {
printf("error: no pattern set (use :p <pattern>)\n");
continue;
}
const char *text = line + 3;
while (*text == ' ') text++;
loreg_match_t result;
loreg_search(regex, text, &result);
print_match(text, &result);
continue;
}
if (line[0] == ':') {
printf("unknown command: %s\n", line);
continue;
}
if (!regex) {
printf("error: no pattern set (use :p <pattern>)\n");
continue;
}
loreg_match_t result;
loreg_search(regex, line, &result);
print_match(line, &result);
}
if (regex) {
loreg_free(regex);
}
printf("\n");
}

252
tests/test_all.c Normal file
View File

@ -0,0 +1,252 @@
/* retoor <retoor@molodetz.nl> */
#include "../include/loreg.h"
#include <stdio.h>
#include <string.h>
#include <time.h>
/* Running totals for the whole test program. */
static int total_passed = 0;
static int total_failed = 0;

/*
 * Check `cond` inside a TEST body. On failure: print `msg`, count one failure
 * and return from the enclosing test function immediately.
 */
#define ASSERT(cond, msg) do { \
    if (!(cond)) { \
        printf(" FAIL: %s\n", msg); \
        total_failed++; \
        return; \
    } \
} while(0)

/* Open the definition of the test function for `name`. */
#define TEST(name) static void test_##name(void)

/*
 * Run test `name` and count it as passed only when it recorded no new
 * failure. ASSERT reports a failure by bumping total_failed and returning,
 * so comparing the counter before and after the call is how a failed test is
 * detected; counting unconditionally would report a failing test as both
 * passed and failed.
 */
#define RUN(name) do { \
    int run_failed_before_ = total_failed; \
    test_##name(); \
    if (total_failed == run_failed_before_) { \
        total_passed++; \
    } \
} while(0)
TEST(basic_literals) {
loreg_error_t err;
loreg_regex_t *re = loreg_compile("hello", &err);
ASSERT(re != NULL, "compile hello");
loreg_match_t m;
ASSERT(loreg_search(re, "hello", &m), "match hello");
ASSERT(loreg_search(re, "say hello world", &m), "search hello");
ASSERT(!loreg_search(re, "helo", &m), "no match helo");
loreg_free(re);
}
TEST(metacharacters) {
loreg_error_t err;
loreg_match_t m;
loreg_regex_t *re = loreg_compile("a.c", &err);
ASSERT(re != NULL, "compile a.c");
ASSERT(loreg_search(re, "abc", &m), "match abc");
ASSERT(loreg_search(re, "axc", &m), "match axc");
ASSERT(!loreg_search(re, "ac", &m), "no match ac");
loreg_free(re);
re = loreg_compile("^start", &err);
ASSERT(re != NULL, "compile ^start");
ASSERT(loreg_search(re, "start here", &m), "match start here");
ASSERT(!loreg_search(re, "not start", &m), "no match not start");
loreg_free(re);
re = loreg_compile("end$", &err);
ASSERT(re != NULL, "compile end$");
ASSERT(loreg_search(re, "the end", &m), "match the end");
ASSERT(!loreg_search(re, "end here", &m), "no match end here");
loreg_free(re);
}
TEST(quantifiers) {
loreg_error_t err;
loreg_match_t m;
loreg_regex_t *re = loreg_compile("ab*c", &err);
ASSERT(re != NULL, "compile ab*c");
ASSERT(loreg_search(re, "ac", &m), "match ac");
ASSERT(loreg_search(re, "abc", &m), "match abc");
ASSERT(loreg_search(re, "abbbbc", &m), "match abbbbc");
loreg_free(re);
re = loreg_compile("ab+c", &err);
ASSERT(re != NULL, "compile ab+c");
ASSERT(!loreg_search(re, "ac", &m), "no match ac");
ASSERT(loreg_search(re, "abc", &m), "match abc");
ASSERT(loreg_search(re, "abbbbc", &m), "match abbbbc");
loreg_free(re);
re = loreg_compile("ab?c", &err);
ASSERT(re != NULL, "compile ab?c");
ASSERT(loreg_search(re, "ac", &m), "match ac");
ASSERT(loreg_search(re, "abc", &m), "match abc");
ASSERT(!loreg_search(re, "abbc", &m), "no match abbc");
loreg_free(re);
re = loreg_compile("a{3}", &err);
ASSERT(re != NULL, "compile a{3}");
ASSERT(loreg_search(re, "aaa", &m), "match aaa");
ASSERT(!loreg_search(re, "aa", &m), "no match aa");
loreg_free(re);
re = loreg_compile("a{2,4}", &err);
ASSERT(re != NULL, "compile a{2,4}");
ASSERT(loreg_search(re, "aa", &m), "match aa");
ASSERT(loreg_search(re, "aaa", &m), "match aaa");
ASSERT(loreg_search(re, "aaaa", &m), "match aaaa");
ASSERT(!loreg_search(re, "a", &m), "no match a");
loreg_free(re);
}
TEST(character_classes) {
loreg_error_t err;
loreg_match_t m;
loreg_regex_t *re = loreg_compile("[aeiou]", &err);
ASSERT(re != NULL, "compile [aeiou]");
ASSERT(loreg_search(re, "a", &m), "match a");
ASSERT(loreg_search(re, "test", &m), "match test");
ASSERT(!loreg_search(re, "xyz", &m), "no match xyz");
loreg_free(re);
re = loreg_compile("[a-z]", &err);
ASSERT(re != NULL, "compile [a-z]");
ASSERT(loreg_search(re, "m", &m), "match m");
ASSERT(!loreg_search(re, "5", &m), "no match 5");
loreg_free(re);
re = loreg_compile("[^0-9]", &err);
ASSERT(re != NULL, "compile [^0-9]");
ASSERT(loreg_search(re, "a", &m), "match a");
ASSERT(!loreg_search(re, "5", &m), "no match 5");
loreg_free(re);
re = loreg_compile("\\d", &err);
ASSERT(re != NULL, "compile \\d");
ASSERT(loreg_search(re, "5", &m), "match 5");
ASSERT(!loreg_search(re, "a", &m), "no match a");
loreg_free(re);
re = loreg_compile("\\w+", &err);
ASSERT(re != NULL, "compile \\w+");
ASSERT(loreg_search(re, "hello_123", &m), "match hello_123");
loreg_free(re);
re = loreg_compile("\\s", &err);
ASSERT(re != NULL, "compile \\s");
ASSERT(loreg_search(re, " ", &m), "match space");
ASSERT(loreg_search(re, "\t", &m), "match tab");
ASSERT(!loreg_search(re, "a", &m), "no match a");
loreg_free(re);
}
TEST(groups) {
loreg_error_t err;
loreg_match_t m;
loreg_regex_t *re = loreg_compile("(ab)+", &err);
ASSERT(re != NULL, "compile (ab)+");
ASSERT(loreg_search(re, "ab", &m), "match ab");
ASSERT(loreg_search(re, "abab", &m), "match abab");
ASSERT(!loreg_search(re, "a", &m), "no match a");
loreg_free(re);
re = loreg_compile("(\\d+)-(\\d+)", &err);
ASSERT(re != NULL, "compile groups");
ASSERT(loreg_search(re, "123-456", &m), "match 123-456");
ASSERT(m.group_count == 2, "2 groups");
ASSERT(m.groups[0].matched, "group 0 matched");
ASSERT(m.groups[1].matched, "group 1 matched");
loreg_free(re);
}
TEST(alternation) {
loreg_error_t err;
loreg_match_t m;
loreg_regex_t *re = loreg_compile("cat|dog", &err);
ASSERT(re != NULL, "compile cat|dog");
ASSERT(loreg_search(re, "cat", &m), "match cat");
ASSERT(loreg_search(re, "dog", &m), "match dog");
ASSERT(!loreg_search(re, "rat", &m), "no match rat");
loreg_free(re);
re = loreg_compile("(red|blue) car", &err);
ASSERT(re != NULL, "compile (red|blue) car");
ASSERT(loreg_search(re, "red car", &m), "match red car");
ASSERT(loreg_search(re, "blue car", &m), "match blue car");
ASSERT(!loreg_search(re, "green car", &m), "no match green car");
loreg_free(re);
}
TEST(escapes) {
loreg_error_t err;
loreg_match_t m;
loreg_regex_t *re = loreg_compile("1\\.5", &err);
ASSERT(re != NULL, "compile 1\\.5");
ASSERT(loreg_search(re, "1.5", &m), "match 1.5");
ASSERT(!loreg_search(re, "1x5", &m), "no match 1x5");
loreg_free(re);
re = loreg_compile("\\(test\\)", &err);
ASSERT(re != NULL, "compile \\(test\\)");
ASSERT(loreg_search(re, "(test)", &m), "match (test)");
loreg_free(re);
}
TEST(real_patterns) {
loreg_error_t err;
loreg_match_t m;
loreg_regex_t *re = loreg_compile("[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}", &err);
ASSERT(re != NULL, "compile email");
ASSERT(loreg_search(re, "user@example.com", &m), "match email");
ASSERT(!loreg_search(re, "invalid", &m), "no match invalid");
loreg_free(re);
re = loreg_compile("\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}", &err);
ASSERT(re != NULL, "compile ip");
ASSERT(loreg_search(re, "192.168.1.1", &m), "match ip");
loreg_free(re);
re = loreg_compile("https?://[a-zA-Z0-9.-]+(/[a-zA-Z0-9./-]*)?", &err);
ASSERT(re != NULL, "compile url");
ASSERT(loreg_search(re, "http://example.com", &m), "match http");
ASSERT(loreg_search(re, "https://example.com/path", &m), "match https");
loreg_free(re);
}
TEST(error_handling) {
loreg_error_t err;
loreg_regex_t *re = loreg_compile("(abc", &err);
ASSERT(re == NULL, "unbalanced paren");
ASSERT(err == LOREG_ERR_UNBALANCED_PAREN, "correct error");
}
/*
 * Run every test in this file, then print the pass/fail totals and the CPU
 * time measured with clock(). Exit status is 1 if any failure was recorded,
 * otherwise 0.
 */
int main(void) {
    printf("loreg comprehensive tests\n");
    printf("========================\n\n");

    const clock_t began = clock();

    RUN(basic_literals);
    RUN(metacharacters);
    RUN(quantifiers);
    RUN(character_classes);
    RUN(groups);
    RUN(alternation);
    RUN(escapes);
    RUN(real_patterns);
    RUN(error_handling);

    const clock_t finished = clock();
    double elapsed = (double)(finished - began) / CLOCKS_PER_SEC;

    printf("\n========================\n");
    printf("passed: %d, failed: %d\n", total_passed, total_failed);
    printf("time: %.3f seconds\n", elapsed);

    if (total_failed > 0) {
        return 1;
    }
    return 0;
}

650
tests/test_integration.c Normal file
View File

@ -0,0 +1,650 @@
/* retoor <retoor@molodetz.nl> */
#include "../include/loreg.h"
#include <stdio.h>
#include <string.h>
/* Totals over all individual checks; test_match increments exactly one of them per call. */
static int passed = 0;
static int failed = 0;
/* Expect `pat` to be found in `txt`; __LINE__ passes the call site for failure output. */
#define MATCH(pat, txt) test_match(pat, txt, 1, __LINE__)
/* Expect `pat` NOT to be found in `txt`; __LINE__ passes the call site for failure output. */
#define NO_MATCH(pat, txt) test_match(pat, txt, 0, __LINE__)
static void test_match(const char *pattern, const char *text, int expect, int line) {
loreg_error_t err;
loreg_regex_t *re = loreg_compile(pattern, &err);
if (!re) {
printf("FAIL line %d: compile error for '%s': %s\n", line, pattern, loreg_error_string(err));
failed++;
return;
}
loreg_match_t m;
int result = loreg_search(re, text, &m) ? 1 : 0;
if (result != expect) {
printf("FAIL line %d: '%s' vs '%s' expected %s\n", line, pattern, text, expect ? "match" : "no match");
failed++;
} else {
passed++;
}
loreg_free(re);
}
/* Literal patterns: substring hits and misses, the empty pattern, and repeated characters. */
static void test_literals(void) {
printf(" literals...\n");
MATCH("a", "a");
MATCH("a", "ba");
MATCH("a", "ab");
MATCH("abc", "abc");
MATCH("abc", "xabcy");
MATCH("hello", "hello world");
MATCH("world", "hello world");
MATCH("lo wo", "hello world");
NO_MATCH("abc", "ab");
NO_MATCH("abc", "abd");
NO_MATCH("xyz", "abc");
NO_MATCH("hello", "helo");
MATCH("", "anything");
MATCH("", "");
MATCH("a", "aaa");
MATCH("aa", "aaa");
MATCH("aaa", "aaa");
NO_MATCH("aaaa", "aaa");
}
/* The '.' metacharacter: each dot must consume exactly one character. */
static void test_dot(void) {
printf(" dot metacharacter...\n");
MATCH(".", "a");
MATCH(".", "x");
MATCH(".", "5");
MATCH(".", " ");
MATCH("..", "ab");
MATCH("...", "abc");
MATCH("a.c", "abc");
MATCH("a.c", "aXc");
MATCH("a.c", "a9c");
MATCH("a.c", "a c");
NO_MATCH("a.c", "ac");
NO_MATCH("a.c", "abbc");
MATCH("....", "test");
MATCH(".", "!");
MATCH(".", "@");
MATCH("a..b", "aXYb");
MATCH("a...b", "a123b");
NO_MATCH("a..b", "aXb");
}
/* The '^' and '$' anchors, alone and combined, including the empty text. */
static void test_anchors(void) {
printf(" anchors...\n");
MATCH("^a", "a");
MATCH("^a", "abc");
NO_MATCH("^a", "ba");
NO_MATCH("^a", " a");
MATCH("a$", "a");
MATCH("a$", "ba");
NO_MATCH("a$", "ab");
NO_MATCH("a$", "a ");
MATCH("^abc$", "abc");
NO_MATCH("^abc$", "xabc");
NO_MATCH("^abc$", "abcx");
NO_MATCH("^abc$", " abc");
NO_MATCH("^abc$", "abc ");
MATCH("^$", "");
NO_MATCH("^$", "a");
MATCH("^hello$", "hello");
MATCH("^hello world$", "hello world");
NO_MATCH("^hello world$", "hello world!");
MATCH("^a.*z$", "abcdefghijklmnopqrstuvwxyz");
MATCH("^.", "x");
MATCH(".$", "x");
}
/* The '*' quantifier: zero or more repetitions, alone and next to other atoms. */
static void test_star(void) {
printf(" star quantifier...\n");
MATCH("a*", "");
MATCH("a*", "a");
MATCH("a*", "aa");
MATCH("a*", "aaa");
MATCH("a*", "aaaaaaaaaa");
MATCH("a*", "b");
MATCH("a*b", "b");
MATCH("a*b", "ab");
MATCH("a*b", "aab");
MATCH("a*b", "aaaaaab");
NO_MATCH("a*b", "a");
MATCH("ba*", "b");
MATCH("ba*", "ba");
MATCH("ba*", "baaa");
MATCH(".*", "");
MATCH(".*", "anything at all");
MATCH("a.*b", "ab");
MATCH("a.*b", "aXb");
MATCH("a.*b", "aXXXXXb");
MATCH("a.*b", "a b");
MATCH("x*y*z*", "");
MATCH("x*y*z*", "xyz");
MATCH("x*y*z*", "xxxyyyzzz");
MATCH("ab*c", "ac");
MATCH("ab*c", "abc");
MATCH("ab*c", "abbbbc");
}
/* The '+' quantifier: at least one repetition is required. */
static void test_plus(void) {
printf(" plus quantifier...\n");
NO_MATCH("a+", "");
MATCH("a+", "a");
MATCH("a+", "aa");
MATCH("a+", "aaa");
MATCH("a+", "aaaaaaaaaa");
MATCH("a+", "ba");
MATCH("a+b", "ab");
MATCH("a+b", "aab");
MATCH("a+b", "aaaaaab");
NO_MATCH("a+b", "b");
NO_MATCH("a+b", "a");
MATCH("ba+", "ba");
MATCH("ba+", "baaa");
NO_MATCH("ba+", "b");
MATCH(".+", "a");
MATCH(".+", "anything");
NO_MATCH(".+", "");
MATCH("a.+b", "aXb");
MATCH("a.+b", "aXXXXXb");
NO_MATCH("a.+b", "ab");
MATCH("ab+c", "abc");
MATCH("ab+c", "abbbbc");
NO_MATCH("ab+c", "ac");
}
/* The '?' quantifier: the preceding atom is optional. */
static void test_question(void) {
printf(" question quantifier...\n");
MATCH("a?", "");
MATCH("a?", "a");
MATCH("a?", "aa");
MATCH("a?b", "b");
MATCH("a?b", "ab");
MATCH("a?b", "aab");
MATCH("colou?r", "color");
MATCH("colou?r", "colour");
NO_MATCH("colou?r", "colouur");
MATCH("ab?c", "ac");
MATCH("ab?c", "abc");
NO_MATCH("ab?c", "abbc");
MATCH("https?://", "http://");
MATCH("https?://", "https://");
MATCH(".?", "");
MATCH(".?", "x");
}
/* Alternation with '|': two or more branches, including branches that are prefixes of each other. */
static void test_alternation(void) {
printf(" alternation...\n");
MATCH("a|b", "a");
MATCH("a|b", "b");
NO_MATCH("a|b", "c");
MATCH("cat|dog", "cat");
MATCH("cat|dog", "dog");
NO_MATCH("cat|dog", "rat");
MATCH("cat|dog", "my cat");
MATCH("cat|dog", "my dog");
MATCH("a|b|c", "a");
MATCH("a|b|c", "b");
MATCH("a|b|c", "c");
NO_MATCH("a|b|c", "d");
MATCH("ab|cd", "ab");
MATCH("ab|cd", "cd");
NO_MATCH("ab|cd", "ac");
MATCH("abc|def|ghi", "abc");
MATCH("abc|def|ghi", "def");
MATCH("abc|def|ghi", "ghi");
MATCH("a|ab|abc", "abc");
MATCH("abc|ab|a", "abc");
MATCH("red|green|blue", "the red car");
MATCH("red|green|blue", "green light");
MATCH("red|green|blue", "blue sky");
}
/* Parenthesised groups: plain, quantified, containing alternation, and nested. */
static void test_groups(void) {
printf(" groups...\n");
MATCH("(a)", "a");
MATCH("(ab)", "ab");
MATCH("(abc)", "abc");
MATCH("(a)(b)", "ab");
MATCH("(a)(b)(c)", "abc");
MATCH("(ab)+", "ab");
MATCH("(ab)+", "abab");
MATCH("(ab)+", "ababab");
NO_MATCH("(ab)+", "a");
NO_MATCH("(ab)+", "ba");
MATCH("(ab)*", "");
MATCH("(ab)*", "ab");
MATCH("(ab)*", "abab");
MATCH("(ab)?", "");
MATCH("(ab)?", "ab");
MATCH("(a|b)+", "a");
MATCH("(a|b)+", "b");
MATCH("(a|b)+", "ab");
MATCH("(a|b)+", "ba");
MATCH("(a|b)+", "aabb");
MATCH("(a|b)+", "abba");
MATCH("((a))", "a");
MATCH("((ab))", "ab");
MATCH("(a(b)c)", "abc");
MATCH("(a(b(c)))", "abc");
MATCH("((a)(b))", "ab");
MATCH("(red|blue) car", "red car");
MATCH("(red|blue) car", "blue car");
NO_MATCH("(red|blue) car", "green car");
}
/* Bracket expressions listing individual characters, with and without quantifiers. */
static void test_bracket_simple(void) {
printf(" bracket expressions (simple)...\n");
MATCH("[a]", "a");
NO_MATCH("[a]", "b");
MATCH("[ab]", "a");
MATCH("[ab]", "b");
NO_MATCH("[ab]", "c");
MATCH("[abc]", "a");
MATCH("[abc]", "b");
MATCH("[abc]", "c");
NO_MATCH("[abc]", "d");
MATCH("[aeiou]", "a");
MATCH("[aeiou]", "e");
MATCH("[aeiou]", "i");
MATCH("[aeiou]", "o");
MATCH("[aeiou]", "u");
NO_MATCH("[aeiou]", "b");
MATCH("[abc]+", "aaa");
MATCH("[abc]+", "abc");
MATCH("[abc]+", "cba");
MATCH("[abc]+", "abcabc");
MATCH("[xyz]*", "");
MATCH("[xyz]*", "xyz");
}
/* Bracket expressions with ranges: single ranges, combined ranges, and range endpoints. */
static void test_bracket_ranges(void) {
printf(" bracket expressions (ranges)...\n");
MATCH("[a-z]", "a");
MATCH("[a-z]", "m");
MATCH("[a-z]", "z");
NO_MATCH("[a-z]", "A");
NO_MATCH("[a-z]", "0");
MATCH("[A-Z]", "A");
MATCH("[A-Z]", "M");
MATCH("[A-Z]", "Z");
NO_MATCH("[A-Z]", "a");
MATCH("[0-9]", "0");
MATCH("[0-9]", "5");
MATCH("[0-9]", "9");
NO_MATCH("[0-9]", "a");
MATCH("[a-zA-Z]", "a");
MATCH("[a-zA-Z]", "Z");
NO_MATCH("[a-zA-Z]", "5");
MATCH("[a-zA-Z0-9]", "a");
MATCH("[a-zA-Z0-9]", "Z");
MATCH("[a-zA-Z0-9]", "5");
NO_MATCH("[a-zA-Z0-9]", "!");
MATCH("[a-z]+", "hello");
MATCH("[A-Z]+", "HELLO");
MATCH("[0-9]+", "12345");
MATCH("[a-z0-9]+", "abc123");
}
/* Negated bracket expressions ([^...]) over single characters and ranges. */
static void test_bracket_negated(void) {
printf(" bracket expressions (negated)...\n");
NO_MATCH("[^a]", "a");
MATCH("[^a]", "b");
MATCH("[^a]", "x");
NO_MATCH("[^abc]", "a");
NO_MATCH("[^abc]", "b");
NO_MATCH("[^abc]", "c");
MATCH("[^abc]", "d");
MATCH("[^abc]", "x");
NO_MATCH("[^a-z]", "a");
NO_MATCH("[^a-z]", "m");
NO_MATCH("[^a-z]", "z");
MATCH("[^a-z]", "A");
MATCH("[^a-z]", "5");
MATCH("[^a-z]", "!");
NO_MATCH("[^0-9]", "5");
MATCH("[^0-9]", "a");
MATCH("[^0-9]+", "hello");
NO_MATCH("[^aeiou]+", "aaa");
MATCH("[^aeiou]+", "xyz");
}
/* Shorthand classes \d, \w, \s and their negations \D, \W, \S. */
static void test_character_classes(void) {
printf(" character classes...\n");
MATCH("\\d", "0");
MATCH("\\d", "5");
MATCH("\\d", "9");
NO_MATCH("\\d", "a");
NO_MATCH("\\d", " ");
MATCH("\\d+", "123");
MATCH("\\d+", "0");
MATCH("\\d+", "9876543210");
NO_MATCH("\\d+", "");
NO_MATCH("\\d+", "abc");
MATCH("\\D", "a");
MATCH("\\D", " ");
MATCH("\\D", "!");
NO_MATCH("\\D", "5");
MATCH("\\w", "a");
MATCH("\\w", "Z");
MATCH("\\w", "0");
MATCH("\\w", "_");
NO_MATCH("\\w", " ");
NO_MATCH("\\w", "!");
MATCH("\\w+", "hello");
MATCH("\\w+", "Hello123");
MATCH("\\w+", "var_name");
MATCH("\\W", " ");
MATCH("\\W", "!");
MATCH("\\W", "@");
NO_MATCH("\\W", "a");
NO_MATCH("\\W", "_");
MATCH("\\s", " ");
MATCH("\\s", "\t");
MATCH("\\s", "\n");
NO_MATCH("\\s", "a");
NO_MATCH("\\s", "5");
MATCH("\\s+", " ");
MATCH("\\s+", " \t\n");
MATCH("\\S", "a");
MATCH("\\S", "5");
MATCH("\\S", "!");
NO_MATCH("\\S", " ");
NO_MATCH("\\S", "\t");
}
/*
 * Brace quantifiers {n}, {n,m} and {n,}. Because test_match searches rather
 * than anchors, texts longer than the upper bound are still expected to match.
 */
static void test_quantifier_braces(void) {
printf(" brace quantifiers...\n");
MATCH("a{3}", "aaa");
MATCH("a{3}", "aaaa");
NO_MATCH("a{3}", "aa");
MATCH("a{1}", "a");
MATCH("a{1}", "aa");
NO_MATCH("a{1}", "");
MATCH("a{0}", "");
MATCH("a{0}", "b");
MATCH("a{2,4}", "aa");
MATCH("a{2,4}", "aaa");
MATCH("a{2,4}", "aaaa");
MATCH("a{2,4}", "aaaaa");
NO_MATCH("a{2,4}", "a");
MATCH("a{2,}", "aa");
MATCH("a{2,}", "aaa");
MATCH("a{2,}", "aaaaaaaaaa");
NO_MATCH("a{2,}", "a");
MATCH("a{0,2}", "");
MATCH("a{0,2}", "a");
MATCH("a{0,2}", "aa");
MATCH("a{0,2}", "aaa");
MATCH("[0-9]{3}", "123");
MATCH("[0-9]{3}", "000");
NO_MATCH("[0-9]{3}", "12");
MATCH("(ab){2}", "abab");
MATCH("(ab){2}", "ababab");
NO_MATCH("(ab){2}", "ab");
}
/* Backslash escapes: every metacharacter, when escaped, must match itself literally. */
static void test_escape_sequences(void) {
printf(" escape sequences...\n");
MATCH("\\.", ".");
NO_MATCH("\\.", "a");
MATCH("\\*", "*");
NO_MATCH("\\*", "a");
MATCH("\\+", "+");
MATCH("\\?", "?");
MATCH("\\|", "|");
MATCH("\\(", "(");
MATCH("\\)", ")");
MATCH("\\[", "[");
MATCH("\\]", "]");
MATCH("\\{", "{");
MATCH("\\}", "}");
MATCH("\\^", "^");
MATCH("\\$", "$");
MATCH("\\\\", "\\");
MATCH("a\\.b", "a.b");
NO_MATCH("a\\.b", "aXb");
MATCH("\\d\\.\\d", "1.5");
MATCH("c\\+\\+", "c++");
MATCH("\\(test\\)", "(test)");
MATCH("\\[0\\]", "[0]");
}
/* Composite patterns: e-mail, IPv4, URL, phone numbers, ID codes, dates and times. */
static void test_complex_patterns(void) {
printf(" complex patterns...\n");
MATCH("[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}", "user@example.com");
MATCH("[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}", "test.user@mail.example.org");
NO_MATCH("[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}", "invalid");
NO_MATCH("[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}", "@example.com");
MATCH("\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}", "192.168.1.1");
MATCH("\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}", "10.0.0.1");
MATCH("\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}", "255.255.255.255");
NO_MATCH("\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}", "1.2.3");
MATCH("https?://[a-zA-Z0-9.-]+(/[a-zA-Z0-9./-]*)?", "http://example.com");
MATCH("https?://[a-zA-Z0-9.-]+(/[a-zA-Z0-9./-]*)?", "https://example.com");
MATCH("https?://[a-zA-Z0-9.-]+(/[a-zA-Z0-9./-]*)?", "http://example.com/path");
MATCH("https?://[a-zA-Z0-9.-]+(/[a-zA-Z0-9./-]*)?", "https://example.com/path/to/page");
MATCH("\\d{3}-\\d{3}-\\d{4}", "123-456-7890");
MATCH("\\d{3}-\\d{3}-\\d{4}", "555-123-4567");
NO_MATCH("\\d{3}-\\d{3}-\\d{4}", "12-345-6789");
NO_MATCH("\\d{3}-\\d{3}-\\d{4}", "1234567890");
MATCH("\\(\\d{3}\\) \\d{3}-\\d{4}", "(123) 456-7890");
MATCH("[A-Z]{2}\\d{6}", "AB123456");
NO_MATCH("[A-Z]{2}\\d{6}", "A1234567");
MATCH("\\d{4}-\\d{2}-\\d{2}", "2024-01-15");
MATCH("\\d{2}/\\d{2}/\\d{4}", "01/15/2024");
MATCH("\\d{1,2}:\\d{2}(:\\d{2})?", "12:30");
MATCH("\\d{1,2}:\\d{2}(:\\d{2})?", "12:30:45");
MATCH("\\d{1,2}:\\d{2}(:\\d{2})?", "9:05");
}
/*
 * Word-shaped patterns built from \w, \s and identifier-style brackets.
 * Despite the function name, no word-boundary assertion is exercised here.
 */
static void test_word_boundaries(void) {
printf(" word patterns...\n");
MATCH("\\w+", "hello");
MATCH("\\w+", "hello123");
MATCH("\\w+", "test_var");
MATCH("[a-zA-Z_][a-zA-Z0-9_]*", "variable");
MATCH("[a-zA-Z_][a-zA-Z0-9_]*", "_private");
MATCH("[a-zA-Z_][a-zA-Z0-9_]*", "var123");
NO_MATCH("^[a-zA-Z_][a-zA-Z0-9_]*$", "123var");
MATCH("\\w+\\s+\\w+", "hello world");
MATCH("\\w+\\s+\\w+", "foo bar");
NO_MATCH("\\w+\\s+\\w+", "hello");
}
/*
 * Greedy quantifiers next to their lazy '?'-suffixed forms. Only whether a
 * match exists is checked; the length of the match is not compared.
 */
static void test_greedy_vs_nongreedy(void) {
printf(" greedy vs non-greedy...\n");
MATCH("a+", "aaa");
MATCH("a+?", "aaa");
MATCH("a*", "aaa");
MATCH("a*?", "aaa");
MATCH("a?", "a");
MATCH("a??", "a");
MATCH("a{2,4}", "aaaa");
MATCH("a{2,4}?", "aaaa");
MATCH(".*x", "abcx");
MATCH(".*?x", "abcx");
}
/* Empty patterns, empty texts, nested quantifiers that can match nothing, and bare anchors. */
static void test_empty_and_edge_cases(void) {
printf(" empty and edge cases...\n");
MATCH("", "");
MATCH("", "abc");
MATCH("a*", "");
MATCH("a?", "");
MATCH("(a*)*", "");
MATCH("(a*)+", "");
MATCH("(a+)*", "");
MATCH("(a|b)*", "");
MATCH("[a-z]*", "");
NO_MATCH("a+", "");
NO_MATCH(".+", "");
NO_MATCH("[a-z]+", "");
MATCH("^", "");
MATCH("$", "");
MATCH("^$", "");
NO_MATCH("^$", "a");
MATCH("a*b*c*", "");
MATCH("a*b*c*", "abc");
MATCH("a*b*c*", "aabbcc");
MATCH("a*b*c*", "c");
MATCH("a*b*c*", "b");
}
/*
 * Texts containing control characters and regex metacharacters. Includes the
 * expectation that '.' matches a tab but not a newline.
 */
static void test_special_characters_in_text(void) {
printf(" special characters in text...\n");
MATCH("a", "a\nb");
MATCH("b", "a\nb");
MATCH("a.b", "a\tb");
NO_MATCH("a.b", "a\nb");
MATCH("\\.", "3.14");
MATCH("\\+", "1+2");
MATCH("\\*", "2*3");
MATCH("\\?", "why?");
MATCH("\\(\\)", "func()");
MATCH("\\[\\]", "array[]");
MATCH("\\{\\}", "object{}");
MATCH("\\^", "x^2");
MATCH("\\$", "$100");
MATCH("\\|", "a|b");
}
/* Several quantifiers in sequence, and quantifiers nested inside quantified groups. */
static void test_repetition_combinations(void) {
printf(" repetition combinations...\n");
MATCH("a+b+", "ab");
MATCH("a+b+", "aabb");
MATCH("a+b+", "aaabbb");
NO_MATCH("a+b+", "a");
NO_MATCH("a+b+", "b");
MATCH("a*b+", "b");
MATCH("a*b+", "ab");
MATCH("a*b+", "aab");
MATCH("a+b*", "a");
MATCH("a+b*", "ab");
MATCH("a+b*", "abb");
MATCH("a*b*", "");
MATCH("a*b*", "a");
MATCH("a*b*", "b");
MATCH("a*b*", "ab");
MATCH("(ab)+c+", "abc");
MATCH("(ab)+c+", "ababcc");
MATCH("(a+b)+", "ab");
MATCH("(a+b)+", "aabaaab");
MATCH("((a+)+)+", "a");
MATCH("((a+)+)+", "aaa");
}
/* Alternation combined with groups, quantifiers and shorthand classes. */
static void test_alternation_combinations(void) {
printf(" alternation combinations...\n");
MATCH("a|b|c|d|e", "a");
MATCH("a|b|c|d|e", "e");
NO_MATCH("a|b|c|d|e", "f");
MATCH("(a|b)(c|d)", "ac");
MATCH("(a|b)(c|d)", "ad");
MATCH("(a|b)(c|d)", "bc");
MATCH("(a|b)(c|d)", "bd");
NO_MATCH("(a|b)(c|d)", "ab");
MATCH("(cat|dog)s?", "cat");
MATCH("(cat|dog)s?", "cats");
MATCH("(cat|dog)s?", "dog");
MATCH("(cat|dog)s?", "dogs");
MATCH("(red|green|blue)\\s+(car|truck)", "red car");
MATCH("(red|green|blue)\\s+(car|truck)", "green truck");
MATCH("(a|aa|aaa)", "aaa");
MATCH("(aaa|aa|a)", "aaa");
}
/* Groups nested up to three levels deep, with alternation and quantifiers inside. */
static void test_nested_groups(void) {
printf(" nested groups...\n");
MATCH("((a))", "a");
MATCH("(((a)))", "a");
MATCH("((a)(b))", "ab");
MATCH("((a(b))c)", "abc");
MATCH("(a(b(c)))", "abc");
MATCH("((a|b)(c|d))", "ac");
MATCH("(a(b|c)d)", "abd");
MATCH("(a(b|c)d)", "acd");
MATCH("((ab)+)", "abab");
MATCH("(a(bc)*d)", "ad");
MATCH("(a(bc)*d)", "abcd");
MATCH("(a(bc)*d)", "abcbcd");
MATCH("((a+)(b+))", "aabb");
MATCH("(((a|b)+)c)", "ababc");
}
/* Everyday patterns: words, signed numbers, hex and binary digits, quoted strings, colour codes. */
static void test_real_world_patterns(void) {
printf(" real world patterns...\n");
MATCH("[a-zA-Z]+", "Hello");
MATCH("[a-zA-Z]+", "WORLD");
MATCH("[a-zA-Z]+", "test");
MATCH("-?\\d+", "123");
MATCH("-?\\d+", "-456");
MATCH("-?\\d+", "0");
MATCH("-?\\d+\\.?\\d*", "3.14");
MATCH("-?\\d+\\.?\\d*", "-2.5");
MATCH("-?\\d+\\.?\\d*", "42");
MATCH("[a-fA-F0-9]+", "deadbeef");
MATCH("[a-fA-F0-9]+", "CAFEBABE");
MATCH("[a-fA-F0-9]+", "123abc");
MATCH("[01]+", "101010");
MATCH("[01]+", "11110000");
MATCH("[A-Z][a-z]+", "Hello");
MATCH("[A-Z][a-z]+", "World");
NO_MATCH("[A-Z][a-z]+", "hello");
MATCH("\"[^\"]*\"", "\"hello\"");
MATCH("\"[^\"]*\"", "\"hello world\"");
MATCH("\"[^\"]*\"", "\"\"");
MATCH("'[^']*'", "'test'");
MATCH("#[a-fA-F0-9]{6}", "#ff0000");
MATCH("#[a-fA-F0-9]{6}", "#00FF00");
MATCH("#[a-fA-F0-9]{3}", "#f00");
}
/*
 * Patterns with nested or stacked quantifiers on short inputs.
 * NOTE(review): these look like the shapes that cause exponential
 * backtracking in some engines; only the match result is checked here,
 * not running time.
 */
static void test_pathological_patterns(void) {
printf(" stress test patterns...\n");
MATCH("a?a?a?aaa", "aaa");
MATCH("(a+)+", "aaaa");
MATCH("(a*)*", "aaaa");
MATCH("(a|a)+", "aaaa");
MATCH("((a*)*)*", "aaaa");
MATCH("a*a*a*a*a*b", "aaaaab");
MATCH(".*.*.*.*.*", "test");
MATCH("(a?){5}a{5}", "aaaaa");
}
int main(void) {
printf("loreg integration tests\n");
printf("=======================\n\n");
test_literals();
test_dot();
test_anchors();
test_star();
test_plus();
test_question();
test_alternation();
test_groups();
test_bracket_simple();
test_bracket_ranges();
test_bracket_negated();
test_character_classes();
test_quantifier_braces();
test_escape_sequences();
test_complex_patterns();
test_word_boundaries();
test_greedy_vs_nongreedy();
test_empty_and_edge_cases();
test_special_characters_in_text();
test_repetition_combinations();
test_alternation_combinations();
test_nested_groups();
test_real_world_patterns();
test_pathological_patterns();
printf("\n=======================\n");
printf("integration: %d passed, %d failed\n", passed, failed);
printf("total tests: %d\n", passed + failed);
return failed > 0 ? 1 : 0;
}

195
tests/test_lexer.c Normal file
View File

@ -0,0 +1,195 @@
/* retoor <retoor@molodetz.nl> */
#include "../include/lexer.h"
#include <stdio.h>
#include <assert.h>
#include <string.h>
/* Totals reported by main at the end of the run. */
static int tests_passed = 0;
static int tests_failed = 0;

/* Open the definition of the test function for `name`. */
#define TEST(name) static void test_##name(void)

/*
 * Run test `name`: print its name, call it, and print "ok" and count a pass
 * only when it recorded no new failure. ASSERT reports a failure by bumping
 * tests_failed and returning, so comparing the counter before and after the
 * call is how a failed test is detected; without that check a failing test
 * would also be reported as "ok" and counted as passed.
 */
#define RUN_TEST(name) do { \
    int run_test_failed_before_ = tests_failed; \
    printf(" %s... ", #name); \
    test_##name(); \
    if (tests_failed == run_test_failed_before_) { \
        printf("ok\n"); \
        tests_passed++; \
    } \
} while(0)

/*
 * Check `cond` inside a TEST body. On failure: print the source line and the
 * expression text, count one failure and return from the enclosing test
 * function.
 */
#define ASSERT(cond) do { \
    if (!(cond)) { \
        printf("FAILED at line %d: %s\n", __LINE__, #cond); \
        tests_failed++; \
        return; \
    } \
} while(0)
/* Plain characters: "abc" must yield three TOKEN_CHAR tokens followed by TOKEN_EOF. */
TEST(simple_chars) {
lexer_t lexer;
lexer_init(&lexer, "abc");
token_t t = lexer_next(&lexer);
ASSERT(t.type == TOKEN_CHAR && t.value == 'a');
t = lexer_next(&lexer);
ASSERT(t.type == TOKEN_CHAR && t.value == 'b');
t = lexer_next(&lexer);
ASSERT(t.type == TOKEN_CHAR && t.value == 'c');
t = lexer_next(&lexer);
ASSERT(t.type == TOKEN_EOF);
}
/* Each metacharacter in ".*+?|()^$" must map to its own token type, in order. */
TEST(meta_chars) {
lexer_t lexer;
lexer_init(&lexer, ".*+?|()^$");
ASSERT(lexer_next(&lexer).type == TOKEN_DOT);
ASSERT(lexer_next(&lexer).type == TOKEN_STAR);
ASSERT(lexer_next(&lexer).type == TOKEN_PLUS);
ASSERT(lexer_next(&lexer).type == TOKEN_QUESTION);
ASSERT(lexer_next(&lexer).type == TOKEN_PIPE);
ASSERT(lexer_next(&lexer).type == TOKEN_LPAREN);
ASSERT(lexer_next(&lexer).type == TOKEN_RPAREN);
ASSERT(lexer_next(&lexer).type == TOKEN_CARET);
ASSERT(lexer_next(&lexer).type == TOKEN_DOLLAR);
ASSERT(lexer_next(&lexer).type == TOKEN_EOF);
}
/* Escaped metacharacters (\*, \+, \.) must come back as TOKEN_CHAR carrying the literal character. */
TEST(escaped_chars) {
lexer_t lexer;
lexer_init(&lexer, "\\*\\+\\.");
token_t t = lexer_next(&lexer);
ASSERT(t.type == TOKEN_CHAR && t.value == '*');
t = lexer_next(&lexer);
ASSERT(t.type == TOKEN_CHAR && t.value == '+');
t = lexer_next(&lexer);
ASSERT(t.type == TOKEN_CHAR && t.value == '.');
}
/* Shorthand classes \d \w \s \D \W \S must each produce their dedicated class token. */
TEST(character_classes) {
lexer_t lexer;
lexer_init(&lexer, "\\d\\w\\s\\D\\W\\S");
ASSERT(lexer_next(&lexer).type == TOKEN_CLASS_DIGIT);
ASSERT(lexer_next(&lexer).type == TOKEN_CLASS_WORD);
ASSERT(lexer_next(&lexer).type == TOKEN_CLASS_SPACE);
ASSERT(lexer_next(&lexer).type == TOKEN_CLASS_NDIGIT);
ASSERT(lexer_next(&lexer).type == TOKEN_CLASS_NWORD);
ASSERT(lexer_next(&lexer).type == TOKEN_CLASS_NSPACE);
}
/* "[abc]" must lex as LBRACKET, three TOKEN_CHAR tokens, then RBRACKET. */
TEST(bracket_expression) {
lexer_t lexer;
lexer_init(&lexer, "[abc]");
ASSERT(lexer_next(&lexer).type == TOKEN_LBRACKET);
token_t t = lexer_next(&lexer);
ASSERT(t.type == TOKEN_CHAR && t.value == 'a');
t = lexer_next(&lexer);
ASSERT(t.type == TOKEN_CHAR && t.value == 'b');
t = lexer_next(&lexer);
ASSERT(t.type == TOKEN_CHAR && t.value == 'c');
ASSERT(lexer_next(&lexer).type == TOKEN_RBRACKET);
}
/* "[a-z]": the '-' between the two characters must be reported as TOKEN_DASH. */
TEST(bracket_range) {
lexer_t lexer;
lexer_init(&lexer, "[a-z]");
ASSERT(lexer_next(&lexer).type == TOKEN_LBRACKET);
token_t t = lexer_next(&lexer);
ASSERT(t.type == TOKEN_CHAR && t.value == 'a');
ASSERT(lexer_next(&lexer).type == TOKEN_DASH);
t = lexer_next(&lexer);
ASSERT(t.type == TOKEN_CHAR && t.value == 'z');
ASSERT(lexer_next(&lexer).type == TOKEN_RBRACKET);
}
/* "[^a]": the '^' after the opening bracket must be reported as TOKEN_CARET. */
TEST(negated_bracket) {
lexer_t lexer;
lexer_init(&lexer, "[^a]");
ASSERT(lexer_next(&lexer).type == TOKEN_LBRACKET);
ASSERT(lexer_next(&lexer).type == TOKEN_CARET);
token_t t = lexer_next(&lexer);
ASSERT(t.type == TOKEN_CHAR && t.value == 'a');
ASSERT(lexer_next(&lexer).type == TOKEN_RBRACKET);
}
/* "a{3}": braces become LBRACE/RBRACE tokens and the digit inside is an ordinary TOKEN_CHAR. */
TEST(quantifier_braces) {
lexer_t lexer;
lexer_init(&lexer, "a{3}");
token_t t = lexer_next(&lexer);
ASSERT(t.type == TOKEN_CHAR && t.value == 'a');
ASSERT(lexer_next(&lexer).type == TOKEN_LBRACE);
t = lexer_next(&lexer);
ASSERT(t.type == TOKEN_CHAR && t.value == '3');
ASSERT(lexer_next(&lexer).type == TOKEN_RBRACE);
}
/* lexer_peek must return the upcoming token without consuming it, even when called twice in a row. */
TEST(peek) {
lexer_t lexer;
lexer_init(&lexer, "ab");
token_t t = lexer_peek(&lexer);
ASSERT(t.type == TOKEN_CHAR && t.value == 'a');
t = lexer_peek(&lexer);
ASSERT(t.type == TOKEN_CHAR && t.value == 'a');
t = lexer_next(&lexer);
ASSERT(t.type == TOKEN_CHAR && t.value == 'a');
t = lexer_peek(&lexer);
ASSERT(t.type == TOKEN_CHAR && t.value == 'b');
}
/* The escapes \n, \t and \r in a pattern must be translated to the corresponding control characters. */
TEST(escape_sequences) {
lexer_t lexer;
lexer_init(&lexer, "\\n\\t\\r");
token_t t = lexer_next(&lexer);
ASSERT(t.type == TOKEN_CHAR && t.value == '\n');
t = lexer_next(&lexer);
ASSERT(t.type == TOKEN_CHAR && t.value == '\t');
t = lexer_next(&lexer);
ASSERT(t.type == TOKEN_CHAR && t.value == '\r');
}
/*
 * Run all lexer tests in order and print the totals.
 * Exit status is 1 if any test recorded a failure, otherwise 0.
 */
int main(void) {
    printf("lexer tests:\n");

    RUN_TEST(simple_chars);
    RUN_TEST(meta_chars);
    RUN_TEST(escaped_chars);
    RUN_TEST(character_classes);
    RUN_TEST(bracket_expression);
    RUN_TEST(bracket_range);
    RUN_TEST(negated_bracket);
    RUN_TEST(quantifier_braces);
    RUN_TEST(peek);
    RUN_TEST(escape_sequences);

    printf("\nlexer: %d passed, %d failed\n", tests_passed, tests_failed);

    if (tests_failed > 0) {
        return 1;
    }
    return 0;
}

294
tests/test_matcher.c Normal file
View File

@ -0,0 +1,294 @@
/* retoor <retoor@molodetz.nl> */
#include "../include/loreg.h"
#include <stdio.h>
#include <string.h>
/* Totals reported by main at the end of the run. */
static int tests_passed = 0;
static int tests_failed = 0;

/* Open the definition of the test function for `name`. */
#define TEST(name) static void test_##name(void)

/*
 * Run test `name`: print its name, call it, and print "ok" and count a pass
 * only when it recorded no new failure. ASSERT reports a failure by bumping
 * tests_failed and returning, so comparing the counter before and after the
 * call is how a failed test is detected; without that check a failing test
 * would also be reported as "ok" and counted as passed.
 */
#define RUN_TEST(name) do { \
    int run_test_failed_before_ = tests_failed; \
    printf(" %s... ", #name); \
    test_##name(); \
    if (tests_failed == run_test_failed_before_) { \
        printf("ok\n"); \
        tests_passed++; \
    } \
} while(0)

/*
 * Check `cond` inside a TEST body. On failure: print the source line and the
 * expression text, count one failure and return from the enclosing test
 * function.
 */
#define ASSERT(cond) do { \
    if (!(cond)) { \
        printf("FAILED at line %d: %s\n", __LINE__, #cond); \
        tests_failed++; \
        return; \
    } \
} while(0)

/*
 * Compile `pattern` and require loreg_search to find it in `text`.
 * The regex is freed only on the success path: a failed ASSERT returns from
 * the enclosing test before loreg_free is reached.
 */
#define ASSERT_MATCH(pattern, text) do { \
    loreg_error_t err; \
    loreg_regex_t *re = loreg_compile(pattern, &err); \
    ASSERT(re != NULL); \
    loreg_match_t result; \
    ASSERT(loreg_search(re, text, &result) == true); \
    loreg_free(re); \
} while(0)

/*
 * Compile `pattern` and require loreg_search NOT to find it in `text`.
 * Same cleanup caveat as ASSERT_MATCH.
 */
#define ASSERT_NO_MATCH(pattern, text) do { \
    loreg_error_t err; \
    loreg_regex_t *re = loreg_compile(pattern, &err); \
    ASSERT(re != NULL); \
    loreg_match_t result; \
    ASSERT(loreg_search(re, text, &result) == false); \
    loreg_free(re); \
} while(0)
/* A single literal character is found anywhere in the text, or not at all. */
TEST(simple_char) {
ASSERT_MATCH("a", "a");
ASSERT_MATCH("a", "bab");
ASSERT_NO_MATCH("a", "bcd");
}
/* Concatenation: both characters must appear adjacent and in order. */
TEST(concat) {
ASSERT_MATCH("ab", "ab");
ASSERT_MATCH("ab", "xaby");
ASSERT_NO_MATCH("ab", "ba");
}
/* Alternation: either branch may match; a text matching neither branch must fail. */
TEST(alternation) {
ASSERT_MATCH("a|b", "a");
ASSERT_MATCH("a|b", "b");
ASSERT_MATCH("cat|dog", "cat");
ASSERT_MATCH("cat|dog", "dog");
ASSERT_NO_MATCH("cat|dog", "rat");
}
/* '*' accepts zero or more repetitions, including on the empty text. */
TEST(star) {
ASSERT_MATCH("a*", "");
ASSERT_MATCH("a*", "a");
ASSERT_MATCH("a*", "aaa");
ASSERT_MATCH("a*b", "b");
ASSERT_MATCH("a*b", "ab");
ASSERT_MATCH("a*b", "aaab");
}
/* '+' requires at least one repetition, so the empty text must not match. */
TEST(plus) {
ASSERT_NO_MATCH("a+", "");
ASSERT_MATCH("a+", "a");
ASSERT_MATCH("a+", "aaa");
ASSERT_MATCH("a+b", "ab");
ASSERT_MATCH("a+b", "aaab");
}
/* '?' makes the preceding character optional. */
TEST(question) {
ASSERT_MATCH("a?", "");
ASSERT_MATCH("a?", "a");
ASSERT_MATCH("a?b", "b");
ASSERT_MATCH("a?b", "ab");
}
TEST(dot) {
ASSERT_MATCH(".", "a");
ASSERT_MATCH(".", "x");
ASSERT_MATCH("a.b", "aab");
ASSERT_MATCH("a.b", "axb");
ASSERT_NO_MATCH("a.b", "ab");
}
/* A bracket expression matches any one of its listed characters and nothing else. */
TEST(bracket_simple) {
ASSERT_MATCH("[abc]", "a");
ASSERT_MATCH("[abc]", "b");
ASSERT_MATCH("[abc]", "c");
ASSERT_NO_MATCH("[abc]", "d");
}
/* A range includes both endpoints and excludes uppercase letters and digits. */
TEST(bracket_range) {
ASSERT_MATCH("[a-z]", "a");
ASSERT_MATCH("[a-z]", "m");
ASSERT_MATCH("[a-z]", "z");
ASSERT_NO_MATCH("[a-z]", "A");
ASSERT_NO_MATCH("[a-z]", "0");
}
/* A negated bracket expression matches only characters that are not listed. */
TEST(bracket_negated) {
ASSERT_NO_MATCH("[^abc]", "a");
ASSERT_NO_MATCH("[^abc]", "b");
ASSERT_MATCH("[^abc]", "d");
ASSERT_MATCH("[^abc]", "x");
}
/* Groups: plain, repeated with '+', and containing an alternation. */
TEST(group) {
ASSERT_MATCH("(ab)", "ab");
ASSERT_MATCH("(ab)+", "abab");
ASSERT_MATCH("(a|b)+", "abba");
}
/* '^' and '$' restrict where a match may start and end; together they require a whole-text match. */
TEST(anchors) {
ASSERT_MATCH("^a", "a");
ASSERT_MATCH("^a", "abc");
ASSERT_NO_MATCH("^a", "ba");
ASSERT_MATCH("a$", "a");
ASSERT_MATCH("a$", "ba");
ASSERT_NO_MATCH("a$", "ab");
ASSERT_MATCH("^abc$", "abc");
ASSERT_NO_MATCH("^abc$", "xabc");
ASSERT_NO_MATCH("^abc$", "abcx");
}
/* {n}: searching finds three 'a's inside a longer run, but two are not enough. */
TEST(quantifier_exact) {
ASSERT_MATCH("a{3}", "aaa");
ASSERT_MATCH("a{3}", "aaaa");
ASSERT_NO_MATCH("a{3}", "aa");
}
/* {n,m}: any count from the lower to the upper bound matches; fewer than the lower bound does not. */
TEST(quantifier_range) {
ASSERT_MATCH("a{2,4}", "aa");
ASSERT_MATCH("a{2,4}", "aaa");
ASSERT_MATCH("a{2,4}", "aaaa");
ASSERT_NO_MATCH("a{2,4}", "a");
}
/* {n,}: a lower bound with no upper bound. */
TEST(quantifier_open) {
ASSERT_MATCH("a{2,}", "aa");
ASSERT_MATCH("a{2,}", "aaaaa");
ASSERT_NO_MATCH("a{2,}", "a");
}
/* \d matches decimal digits and rejects a letter. */
TEST(class_digit) {
ASSERT_MATCH("\\d", "0");
ASSERT_MATCH("\\d", "9");
ASSERT_MATCH("\\d+", "123");
ASSERT_NO_MATCH("\\d", "a");
}
/* \w matches letters, digits and underscore; space and '-' are rejected. */
TEST(class_word) {
ASSERT_MATCH("\\w", "a");
ASSERT_MATCH("\\w", "Z");
ASSERT_MATCH("\\w", "0");
ASSERT_MATCH("\\w", "_");
ASSERT_NO_MATCH("\\w", " ");
ASSERT_NO_MATCH("\\w", "-");
}
/* \s matches space, tab and newline, and rejects a letter. */
TEST(class_space) {
ASSERT_MATCH("\\s", " ");
ASSERT_MATCH("\\s", "\t");
ASSERT_MATCH("\\s", "\n");
ASSERT_NO_MATCH("\\s", "a");
}
/* \D, \W and \S are the complements of \d, \w and \s. */
TEST(class_negated) {
ASSERT_NO_MATCH("\\D", "0");
ASSERT_MATCH("\\D", "a");
ASSERT_NO_MATCH("\\W", "a");
ASSERT_MATCH("\\W", " ");
ASSERT_NO_MATCH("\\S", " ");
ASSERT_MATCH("\\S", "a");
}
/* Escaped metacharacters match themselves literally. */
TEST(escape_sequences) {
ASSERT_MATCH("\\.", ".");
ASSERT_NO_MATCH("\\.", "a");
ASSERT_MATCH("\\*", "*");
ASSERT_MATCH("\\+", "+");
ASSERT_MATCH("\\?", "?");
}
/* Simplified e-mail pattern (lowercase letters only). */
TEST(complex_email) {
ASSERT_MATCH("[a-z]+@[a-z]+\\.[a-z]+", "test@example.com");
ASSERT_NO_MATCH("[a-z]+@[a-z]+\\.[a-z]+", "invalid");
}
/* Phone number pattern 3-3-4 digits; a final group of only three digits must fail. */
TEST(complex_phone) {
ASSERT_MATCH("\\d{3}-\\d{3}-\\d{4}", "123-456-7890");
ASSERT_NO_MATCH("\\d{3}-\\d{3}-\\d{4}", "123-456-789");
}
/* Simplified URL pattern with an optional 's' in the scheme. */
TEST(complex_url) {
ASSERT_MATCH("https?://[a-z]+\\.[a-z]+", "http://example.com");
ASSERT_MATCH("https?://[a-z]+\\.[a-z]+", "https://example.com");
}
/* Two capture groups: the result must report group_count == 2 with both groups flagged as matched. */
TEST(group_capture) {
loreg_error_t err;
loreg_regex_t *re = loreg_compile("(\\d+)-(\\d+)", &err);
ASSERT(re != NULL);
loreg_match_t result;
ASSERT(loreg_search(re, "123-456", &result));
ASSERT(result.group_count == 2);
ASSERT(result.groups[0].matched);
ASSERT(result.groups[1].matched);
loreg_free(re);
}
/* Nested groups: "((a)(b))" must report three groups (the outer one plus the two inner ones). */
TEST(nested_groups) {
loreg_error_t err;
loreg_regex_t *re = loreg_compile("((a)(b))", &err);
ASSERT(re != NULL);
loreg_match_t result;
ASSERT(loreg_search(re, "ab", &result));
ASSERT(result.group_count == 3);
loreg_free(re);
}
/* The empty pattern must compile, and loreg_match (not loreg_search) must succeed on a non-empty text. */
TEST(empty_pattern) {
loreg_error_t err;
loreg_regex_t *re = loreg_compile("", &err);
ASSERT(re != NULL);
loreg_match_t result;
ASSERT(loreg_match(re, "anything", &result));
loreg_free(re);
}
/* Match offsets: "test" inside "xxxtestyyy" must be reported as start 3, end 7 (end is exclusive). */
TEST(match_position) {
loreg_error_t err;
loreg_regex_t *re = loreg_compile("test", &err);
ASSERT(re != NULL);
loreg_match_t result;
ASSERT(loreg_search(re, "xxxtestyyy", &result));
ASSERT(result.match_start == 3);
ASSERT(result.match_end == 7);
loreg_free(re);
}
/* Entry point of the matcher test binary: runs every matcher test in a fixed
 * order, prints the pass/fail totals, and exits with 1 if any test failed
 * (0 otherwise). */
int main(void) {
    printf("matcher tests:\n");

    /* Basic operators. */
    RUN_TEST(simple_char);
    RUN_TEST(concat);
    RUN_TEST(alternation);
    RUN_TEST(star);
    RUN_TEST(plus);
    RUN_TEST(question);
    RUN_TEST(dot);

    /* Bracket expressions, groups and anchors. */
    RUN_TEST(bracket_simple);
    RUN_TEST(bracket_range);
    RUN_TEST(bracket_negated);
    RUN_TEST(group);
    RUN_TEST(anchors);

    /* Counted quantifiers. */
    RUN_TEST(quantifier_exact);
    RUN_TEST(quantifier_range);
    RUN_TEST(quantifier_open);

    /* Shorthand classes and escapes. */
    RUN_TEST(class_digit);
    RUN_TEST(class_word);
    RUN_TEST(class_space);
    RUN_TEST(class_negated);
    RUN_TEST(escape_sequences);

    /* Combined patterns. */
    RUN_TEST(complex_email);
    RUN_TEST(complex_phone);
    RUN_TEST(complex_url);

    /* Capture groups and match metadata. */
    RUN_TEST(group_capture);
    RUN_TEST(nested_groups);
    RUN_TEST(empty_pattern);
    RUN_TEST(match_position);

    printf("\nmatcher: %d passed, %d failed\n", tests_passed, tests_failed);

    if (tests_failed > 0) {
        return 1;
    }
    return 0;
}

159
tests/test_nfa.c Normal file
View File

@ -0,0 +1,159 @@
/* retoor <retoor@molodetz.nl> */
#include "../include/nfa.h"
#include "../include/parser.h"
#include <stdio.h>
#include <assert.h>
/* Running totals printed by main(): RUN_TEST bumps tests_passed, ASSERT bumps
 * tests_failed. */
static int tests_passed = 0;
static int tests_failed = 0;

/* Declares the body of test `name` as the file-local function test_<name>. */
#define TEST(name) static void test_##name(void)

/*
 * Runs test_<name> and reports its outcome.
 *
 * A test signals failure by incrementing tests_failed (see ASSERT) and then
 * returning normally, so the failure counter is sampled before the call and
 * compared afterwards. "ok" is printed and tests_passed is incremented only
 * when the test did not record a failure; otherwise the FAILED line printed
 * by ASSERT is the test's only report and it is not counted as passed.
 */
#define RUN_TEST(name) do { \
    int failed_before_ = tests_failed; \
    printf(" %s... ", #name); \
    test_##name(); \
    if (tests_failed == failed_before_) { \
        printf("ok\n"); \
        tests_passed++; \
    } \
} while(0)

/*
 * Checks `cond` inside a TEST body. On failure it prints the source line and
 * the text of the condition, increments tests_failed and returns from the
 * enclosing test function, so any statements after a failing ASSERT
 * (including cleanup calls) are skipped.
 */
#define ASSERT(cond) do { \
    if (!(cond)) { \
        printf("FAILED at line %d: %s\n", __LINE__, #cond); \
        tests_failed++; \
        return; \
    } \
} while(0)
/*
 * Test helper: parses `pattern` and builds an NFA from the resulting AST.
 *
 * Returns the NFA produced by nfa_from_ast, or NULL when the parser returned
 * no tree or reported an error other than LOREG_OK. The AST is released with
 * ast_free on every path (including when it is NULL); the error code written
 * by nfa_from_ast is not inspected.
 */
static nfa_t *compile_pattern(const char *pattern) {
    parser_t state;
    ast_node_t *tree;
    nfa_t *built = NULL;

    parser_init(&state, pattern);
    tree = parser_parse(&state);

    /* Only query the parser error when a tree was actually produced. */
    if (tree != NULL && parser_get_error(&state) == LOREG_OK) {
        loreg_error_t ignored;
        built = nfa_from_ast(tree, &ignored);
    }

    ast_free(tree);
    return built;
}
/* A single literal must compile to an NFA that has a start state and at
 * least two states. */
TEST(single_char) {
    const char *pattern = "a";
    nfa_t *nfa;

    nfa = compile_pattern(pattern);
    ASSERT(nfa != NULL);
    ASSERT(nfa->start != NULL);
    ASSERT(nfa->state_count >= 2);

    nfa_free(nfa);
}
/* Concatenation of two literals must compile to an NFA with a start state. */
TEST(concat) {
    const char *pattern = "ab";
    nfa_t *nfa;

    nfa = compile_pattern(pattern);
    ASSERT(nfa != NULL);
    ASSERT(nfa->start != NULL);

    nfa_free(nfa);
}
/* Alternation of two literals must compile to an NFA with a start state. */
TEST(alternation) {
    const char *pattern = "a|b";
    nfa_t *nfa;

    nfa = compile_pattern(pattern);
    ASSERT(nfa != NULL);
    ASSERT(nfa->start != NULL);

    nfa_free(nfa);
}
/* The `*` quantifier must compile to an NFA with a start state. */
TEST(star) {
    const char *pattern = "a*";
    nfa_t *nfa;

    nfa = compile_pattern(pattern);
    ASSERT(nfa != NULL);
    ASSERT(nfa->start != NULL);

    nfa_free(nfa);
}
/* The `+` quantifier must compile to an NFA with a start state. */
TEST(plus) {
    const char *pattern = "a+";
    nfa_t *nfa;

    nfa = compile_pattern(pattern);
    ASSERT(nfa != NULL);
    ASSERT(nfa->start != NULL);

    nfa_free(nfa);
}
/* The `?` quantifier must compile to an NFA with a start state. */
TEST(question) {
    const char *pattern = "a?";
    nfa_t *nfa;

    nfa = compile_pattern(pattern);
    ASSERT(nfa != NULL);
    ASSERT(nfa->start != NULL);

    nfa_free(nfa);
}
/* A single parenthesised group must be counted as exactly one group. */
TEST(group) {
    const char *pattern = "(ab)";
    nfa_t *nfa;

    nfa = compile_pattern(pattern);
    ASSERT(nfa != NULL);
    ASSERT(nfa->group_count == 1);

    nfa_free(nfa);
}
/* An outer group containing two inner groups must be counted as three. */
TEST(nested_groups) {
    const char *pattern = "((a)(b))";
    nfa_t *nfa;

    nfa = compile_pattern(pattern);
    ASSERT(nfa != NULL);
    ASSERT(nfa->group_count == 3);

    nfa_free(nfa);
}
/* A bracket expression must compile to an NFA with a start state. */
TEST(bracket) {
    const char *pattern = "[abc]";
    nfa_t *nfa;

    nfa = compile_pattern(pattern);
    ASSERT(nfa != NULL);
    ASSERT(nfa->start != NULL);

    nfa_free(nfa);
}
/* A bounded `{min,max}` quantifier must compile to an NFA with a start state. */
TEST(quantifier) {
    const char *pattern = "a{2,4}";
    nfa_t *nfa;

    nfa = compile_pattern(pattern);
    ASSERT(nfa != NULL);
    ASSERT(nfa->start != NULL);

    nfa_free(nfa);
}
/* An anchored, e-mail-shaped pattern with three parenthesised parts must
 * compile and report three groups. */
TEST(complex_pattern) {
    const char *pattern = "^([a-z]+)@([a-z]+)\\.([a-z]{2,})$";
    nfa_t *nfa;

    nfa = compile_pattern(pattern);
    ASSERT(nfa != NULL);
    ASSERT(nfa->group_count == 3);

    nfa_free(nfa);
}
/* A `.` between two literals must compile to an NFA with a start state. */
TEST(dot) {
    const char *pattern = "a.b";
    nfa_t *nfa;

    nfa = compile_pattern(pattern);
    ASSERT(nfa != NULL);
    ASSERT(nfa->start != NULL);

    nfa_free(nfa);
}
/* A pattern wrapped in `^` and `$` must compile to an NFA with a start state. */
TEST(anchors) {
    const char *pattern = "^abc$";
    nfa_t *nfa;

    nfa = compile_pattern(pattern);
    ASSERT(nfa != NULL);
    ASSERT(nfa->start != NULL);

    nfa_free(nfa);
}
/* The shorthand classes `\d`, `\w` and `\s` in sequence must compile to an
 * NFA with a start state. */
TEST(character_classes) {
    const char *pattern = "\\d\\w\\s";
    nfa_t *nfa;

    nfa = compile_pattern(pattern);
    ASSERT(nfa != NULL);
    ASSERT(nfa->start != NULL);

    nfa_free(nfa);
}
/* Entry point of the NFA test binary: runs every NFA construction test in a
 * fixed order, prints the pass/fail totals, and exits with 1 if any test
 * failed (0 otherwise). */
int main(void) {
    printf("nfa tests:\n");

    /* Basic operators. */
    RUN_TEST(single_char);
    RUN_TEST(concat);
    RUN_TEST(alternation);
    RUN_TEST(star);
    RUN_TEST(plus);
    RUN_TEST(question);

    /* Groups, brackets and counted quantifiers. */
    RUN_TEST(group);
    RUN_TEST(nested_groups);
    RUN_TEST(bracket);
    RUN_TEST(quantifier);

    /* Combined pattern, wildcard, anchors and shorthand classes. */
    RUN_TEST(complex_pattern);
    RUN_TEST(dot);
    RUN_TEST(anchors);
    RUN_TEST(character_classes);

    printf("\nnfa: %d passed, %d failed\n", tests_passed, tests_failed);

    if (tests_failed > 0) {
        return 1;
    }
    return 0;
}

301
tests/test_parser.c Normal file
View File

@ -0,0 +1,301 @@
/* retoor <retoor@molodetz.nl> */
#include "../include/parser.h"
#include <stdio.h>
#include <assert.h>
/* Running totals printed by main(): RUN_TEST bumps tests_passed, ASSERT bumps
 * tests_failed. */
static int tests_passed = 0;
static int tests_failed = 0;

/* Declares the body of test `name` as the file-local function test_<name>. */
#define TEST(name) static void test_##name(void)

/*
 * Runs test_<name> and reports its outcome.
 *
 * A test signals failure by incrementing tests_failed (see ASSERT) and then
 * returning normally, so the failure counter is sampled before the call and
 * compared afterwards. "ok" is printed and tests_passed is incremented only
 * when the test did not record a failure; otherwise the FAILED line printed
 * by ASSERT is the test's only report and it is not counted as passed.
 */
#define RUN_TEST(name) do { \
    int failed_before_ = tests_failed; \
    printf(" %s... ", #name); \
    test_##name(); \
    if (tests_failed == failed_before_) { \
        printf("ok\n"); \
        tests_passed++; \
    } \
} while(0)

/*
 * Checks `cond` inside a TEST body. On failure it prints the source line and
 * the text of the condition, increments tests_failed and returns from the
 * enclosing test function, so any statements after a failing ASSERT
 * (including cleanup calls) are skipped.
 */
#define ASSERT(cond) do { \
    if (!(cond)) { \
        printf("FAILED at line %d: %s\n", __LINE__, #cond); \
        tests_failed++; \
        return; \
    } \
} while(0)
/* A lone literal must parse to a single AST_CHAR node carrying that
 * character as its value. */
TEST(single_char) {
    const char *pattern = "a";
    parser_t parser;
    ast_node_t *ast;

    parser_init(&parser, pattern);
    ast = parser_parse(&parser);

    ASSERT(ast != NULL);
    ASSERT(ast->type == AST_CHAR);
    ASSERT(ast->value == 'a');

    ast_free(ast);
}
/* Two adjacent literals must parse to an AST_CONCAT root whose left child is
 * 'a' and whose right child is 'b'. */
TEST(concat) {
    const char *pattern = "ab";
    parser_t parser;
    ast_node_t *ast;

    parser_init(&parser, pattern);
    ast = parser_parse(&parser);

    ASSERT(ast != NULL);
    ASSERT(ast->type == AST_CONCAT);

    /* Left operand. */
    ASSERT(ast->left->type == AST_CHAR);
    ASSERT(ast->left->value == 'a');

    /* Right operand. */
    ASSERT(ast->right->type == AST_CHAR);
    ASSERT(ast->right->value == 'b');

    ast_free(ast);
}
/* `a|b` must parse to an AST_ALTER root whose left child is 'a' and whose
 * right child is 'b'. */
TEST(alternation) {
    const char *pattern = "a|b";
    parser_t parser;
    ast_node_t *ast;

    parser_init(&parser, pattern);
    ast = parser_parse(&parser);

    ASSERT(ast != NULL);
    ASSERT(ast->type == AST_ALTER);

    /* Left alternative. */
    ASSERT(ast->left->type == AST_CHAR);
    ASSERT(ast->left->value == 'a');

    /* Right alternative. */
    ASSERT(ast->right->type == AST_CHAR);
    ASSERT(ast->right->value == 'b');

    ast_free(ast);
}
/* `a*` must parse to an AST_STAR node whose operand (left child) is 'a'. */
TEST(star) {
    const char *pattern = "a*";
    parser_t parser;
    ast_node_t *ast;

    parser_init(&parser, pattern);
    ast = parser_parse(&parser);

    ASSERT(ast != NULL);
    ASSERT(ast->type == AST_STAR);
    ASSERT(ast->left->type == AST_CHAR);
    ASSERT(ast->left->value == 'a');

    ast_free(ast);
}
/* `a+` must parse to an AST_PLUS node whose operand (left child) is 'a'. */
TEST(plus) {
    const char *pattern = "a+";
    parser_t parser;
    ast_node_t *ast;

    parser_init(&parser, pattern);
    ast = parser_parse(&parser);

    ASSERT(ast != NULL);
    ASSERT(ast->type == AST_PLUS);
    ASSERT(ast->left->type == AST_CHAR);
    ASSERT(ast->left->value == 'a');

    ast_free(ast);
}
/* `a?` must parse to an AST_QUESTION node whose operand (left child) is 'a'. */
TEST(question) {
    const char *pattern = "a?";
    parser_t parser;
    ast_node_t *ast;

    parser_init(&parser, pattern);
    ast = parser_parse(&parser);

    ASSERT(ast != NULL);
    ASSERT(ast->type == AST_QUESTION);
    ASSERT(ast->left->type == AST_CHAR);
    ASSERT(ast->left->value == 'a');

    ast_free(ast);
}
/* `(ab)` must parse to an AST_GROUP node with group id 0 that wraps the
 * concatenation of its contents. */
TEST(group) {
    const char *pattern = "(ab)";
    parser_t parser;
    ast_node_t *ast;

    parser_init(&parser, pattern);
    ast = parser_parse(&parser);

    ASSERT(ast != NULL);
    ASSERT(ast->type == AST_GROUP);
    ASSERT(ast->group_id == 0);
    ASSERT(ast->left->type == AST_CONCAT);

    ast_free(ast);
}
/* A lone `.` must parse to an AST_DOT node. */
TEST(dot) {
    const char *pattern = ".";
    parser_t parser;
    ast_node_t *ast;

    parser_init(&parser, pattern);
    ast = parser_parse(&parser);

    ASSERT(ast != NULL);
    ASSERT(ast->type == AST_DOT);

    ast_free(ast);
}
/* `^a$` must parse successfully with an AST_CONCAT node at the root. */
TEST(anchors) {
    const char *pattern = "^a$";
    parser_t parser;
    ast_node_t *ast;

    parser_init(&parser, pattern);
    ast = parser_parse(&parser);

    ASSERT(ast != NULL);
    ASSERT(ast->type == AST_CONCAT);

    ast_free(ast);
}
/* `[abc]` must parse to an AST_BRACKET node whose bracket set holds three
 * entries. */
TEST(bracket_simple) {
    const char *pattern = "[abc]";
    parser_t parser;
    ast_node_t *ast;

    parser_init(&parser, pattern);
    ast = parser_parse(&parser);

    ASSERT(ast != NULL);
    ASSERT(ast->type == AST_BRACKET);
    ASSERT(ast->bracket != NULL);
    ASSERT(ast->bracket->count == 3);

    ast_free(ast);
}
/* `[a-z]` must parse to an AST_BRACKET node holding a single range that runs
 * from 'a' to 'z'. */
TEST(bracket_range) {
    const char *pattern = "[a-z]";
    parser_t parser;
    ast_node_t *ast;

    parser_init(&parser, pattern);
    ast = parser_parse(&parser);

    ASSERT(ast != NULL);
    ASSERT(ast->type == AST_BRACKET);
    ASSERT(ast->bracket != NULL);
    ASSERT(ast->bracket->count == 1);

    /* Bounds of the only range. */
    ASSERT(ast->bracket->ranges[0].start == 'a');
    ASSERT(ast->bracket->ranges[0].end == 'z');

    ast_free(ast);
}
/* `[^a]` must parse to an AST_BRACKET node whose bracket set has its negated
 * flag set. */
TEST(bracket_negated) {
    parser_t parser;
    parser_init(&parser, "[^a]");
    ast_node_t *ast = parser_parse(&parser);
    ASSERT(ast != NULL);
    ASSERT(ast->type == AST_BRACKET);
    /* Check the bracket set exists before dereferencing it, as bracket_simple
     * and bracket_range do, so a parser regression is reported as a test
     * failure instead of crashing the whole test binary. */
    ASSERT(ast->bracket != NULL);
    ASSERT(ast->bracket->negated == true);
    ast_free(ast);
}
/* `a{3}` must parse to an AST_QUANTIFIER node with min and max both 3. */
TEST(quantifier_exact) {
    const char *pattern = "a{3}";
    parser_t parser;
    ast_node_t *ast;

    parser_init(&parser, pattern);
    ast = parser_parse(&parser);

    ASSERT(ast != NULL);
    ASSERT(ast->type == AST_QUANTIFIER);
    ASSERT(ast->quant.min == 3);
    ASSERT(ast->quant.max == 3);

    ast_free(ast);
}
/* `a{2,5}` must parse to an AST_QUANTIFIER node with min 2 and max 5. */
TEST(quantifier_range) {
    const char *pattern = "a{2,5}";
    parser_t parser;
    ast_node_t *ast;

    parser_init(&parser, pattern);
    ast = parser_parse(&parser);

    ASSERT(ast != NULL);
    ASSERT(ast->type == AST_QUANTIFIER);
    ASSERT(ast->quant.min == 2);
    ASSERT(ast->quant.max == 5);

    ast_free(ast);
}
/* `a{2,}` must parse to an AST_QUANTIFIER node with min 2; the omitted upper
 * bound is expected to be stored as max == -1. */
TEST(quantifier_open) {
    const char *pattern = "a{2,}";
    parser_t parser;
    ast_node_t *ast;

    parser_init(&parser, pattern);
    ast = parser_parse(&parser);

    ASSERT(ast != NULL);
    ASSERT(ast->type == AST_QUANTIFIER);
    ASSERT(ast->quant.min == 2);
    ASSERT(ast->quant.max == -1);

    ast_free(ast);
}
/* `\d` must parse to an AST_CLASS_DIGIT node. */
TEST(character_class_digit) {
    const char *pattern = "\\d";
    parser_t parser;
    ast_node_t *ast;

    parser_init(&parser, pattern);
    ast = parser_parse(&parser);

    ASSERT(ast != NULL);
    ASSERT(ast->type == AST_CLASS_DIGIT);

    ast_free(ast);
}
/* `\w` must parse to an AST_CLASS_WORD node. */
TEST(character_class_word) {
    const char *pattern = "\\w";
    parser_t parser;
    ast_node_t *ast;

    parser_init(&parser, pattern);
    ast = parser_parse(&parser);

    ASSERT(ast != NULL);
    ASSERT(ast->type == AST_CLASS_WORD);

    ast_free(ast);
}
/* An anchored, e-mail-shaped pattern combining groups, ranges, an escape and
 * an open-ended quantifier must parse to a tree with no parser error. */
TEST(complex_pattern) {
    const char *pattern = "^([a-z]+)@([a-z]+)\\.([a-z]{2,})$";
    parser_t parser;
    ast_node_t *ast;

    parser_init(&parser, pattern);
    ast = parser_parse(&parser);

    ASSERT(ast != NULL);
    ASSERT(parser_get_error(&parser) == LOREG_OK);

    ast_free(ast);
}
/* A pattern with an unclosed '(' must be rejected: the test passes when the
 * parser returns no tree or reports LOREG_ERR_UNBALANCED_PAREN. Whatever was
 * returned (possibly NULL) is then handed to ast_free. */
TEST(unbalanced_paren) {
    const char *pattern = "(abc";
    parser_t parser;
    ast_node_t *ast;

    parser_init(&parser, pattern);
    ast = parser_parse(&parser);

    ASSERT(ast == NULL || parser_get_error(&parser) == LOREG_ERR_UNBALANCED_PAREN);

    ast_free(ast);
}
/* `a*?` must parse to an AST_STAR node whose quantifier is marked as not
 * greedy. */
TEST(non_greedy) {
    const char *pattern = "a*?";
    parser_t parser;
    ast_node_t *ast;

    parser_init(&parser, pattern);
    ast = parser_parse(&parser);

    ASSERT(ast != NULL);
    ASSERT(ast->type == AST_STAR);
    ASSERT(ast->quant.greedy == false);

    ast_free(ast);
}
/* Entry point of the parser test binary: runs every parser test in a fixed
 * order, prints the pass/fail totals, and exits with 1 if any test failed
 * (0 otherwise). */
int main(void) {
    printf("parser tests:\n");

    /* Basic operators. */
    RUN_TEST(single_char);
    RUN_TEST(concat);
    RUN_TEST(alternation);
    RUN_TEST(star);
    RUN_TEST(plus);
    RUN_TEST(question);
    RUN_TEST(group);
    RUN_TEST(dot);
    RUN_TEST(anchors);

    /* Bracket expressions. */
    RUN_TEST(bracket_simple);
    RUN_TEST(bracket_range);
    RUN_TEST(bracket_negated);

    /* Counted quantifiers. */
    RUN_TEST(quantifier_exact);
    RUN_TEST(quantifier_range);
    RUN_TEST(quantifier_open);

    /* Shorthand classes. */
    RUN_TEST(character_class_digit);
    RUN_TEST(character_class_word);

    /* Combined pattern, error handling and lazy quantifiers. */
    RUN_TEST(complex_pattern);
    RUN_TEST(unbalanced_paren);
    RUN_TEST(non_greedy);

    printf("\nparser: %d passed, %d failed\n", tests_passed, tests_failed);

    if (tests_failed > 0) {
        return 1;
    }
    return 0;
}