chore: update c, h, md files
This commit is contained in:
commit
3d9c4aa00b
93
.gitea/workflows/ci.yml
Normal file
93
.gitea/workflows/ci.yml
Normal file
@ -0,0 +1,93 @@
|
||||
# retoor <retoor@molodetz.nl>
|
||||
name: CI
|
||||
|
||||
on:
|
||||
push:
|
||||
branches:
|
||||
- main
|
||||
- master
|
||||
pull_request:
|
||||
branches:
|
||||
- main
|
||||
- master
|
||||
|
||||
jobs:
|
||||
build:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
sudo apt-get update
|
||||
sudo apt-get install -y gcc make valgrind
|
||||
|
||||
- name: Build release
|
||||
run: make
|
||||
|
||||
- name: Build debug
|
||||
run: make debug
|
||||
|
||||
test:
|
||||
runs-on: ubuntu-latest
|
||||
needs: build
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
sudo apt-get update
|
||||
sudo apt-get install -y gcc make
|
||||
|
||||
- name: Run tests
|
||||
run: make test
|
||||
|
||||
valgrind:
|
||||
runs-on: ubuntu-latest
|
||||
needs: build
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
sudo apt-get update
|
||||
sudo apt-get install -y gcc make valgrind
|
||||
|
||||
- name: Build test binaries
|
||||
run: make build/test_integration build/test_all
|
||||
|
||||
- name: Valgrind comprehensive tests
|
||||
run: |
|
||||
valgrind --leak-check=full --show-leak-kinds=all \
|
||||
--track-origins=yes --error-exitcode=1 \
|
||||
./build/test_all
|
||||
|
||||
- name: Valgrind integration tests
|
||||
run: |
|
||||
valgrind --leak-check=full --show-leak-kinds=all \
|
||||
--track-origins=yes --error-exitcode=1 \
|
||||
./build/test_integration
|
||||
|
||||
coverage:
|
||||
runs-on: ubuntu-latest
|
||||
needs: test
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
sudo apt-get update
|
||||
sudo apt-get install -y gcc make gcovr
|
||||
|
||||
- name: Generate coverage
|
||||
run: make coverage
|
||||
|
||||
- name: Upload coverage artifacts
|
||||
uses: actions/upload-artifact@v4
|
||||
with:
|
||||
name: coverage-report
|
||||
path: build/coverage/
|
||||
29
.gitignore
vendored
Normal file
29
.gitignore
vendored
Normal file
@ -0,0 +1,29 @@
|
||||
# Build
|
||||
build/
|
||||
*.o
|
||||
*.a
|
||||
*.so
|
||||
*.dylib
|
||||
|
||||
# Binary
|
||||
loreg
|
||||
|
||||
# Coverage
|
||||
*.gcov
|
||||
*.gcda
|
||||
*.gcno
|
||||
|
||||
# Profiling
|
||||
gmon.out
|
||||
*.prof
|
||||
|
||||
# Editor
|
||||
*~
|
||||
*.swp
|
||||
*.swo
|
||||
.vscode/
|
||||
.idea/
|
||||
|
||||
# OS
|
||||
.DS_Store
|
||||
Thumbs.db
|
||||
10
CHANGELOG.md
Normal file
10
CHANGELOG.md
Normal file
@ -0,0 +1,10 @@
|
||||
# Changelog
|
||||
|
||||
|
||||
|
||||
## Version 0.1.0 - 2026-01-04
|
||||
|
||||
update c, h, md files
|
||||
|
||||
**Changes:** 25 files, 4449 lines
|
||||
**Languages:** C (3989 lines), Markdown (181 lines), Other (186 lines), YAML (93 lines)
|
||||
157
Makefile
Normal file
157
Makefile
Normal file
@ -0,0 +1,157 @@
|
||||
# retoor <retoor@molodetz.nl>
|
||||
|
||||
CC = gcc
|
||||
CFLAGS = -Wall -Wextra -Werror -pedantic -std=c11 -O3 -march=native -flto
|
||||
CFLAGS_DEBUG = -Wall -Wextra -pedantic -std=c11 -g -O0 -DDEBUG
|
||||
CFLAGS_COV = -Wall -Wextra -pedantic -std=c11 -g -O0 --coverage -fprofile-arcs -ftest-coverage
|
||||
CFLAGS_PROF = -Wall -Wextra -pedantic -std=c11 -O2 -pg
|
||||
|
||||
INCLUDES = -Iinclude
|
||||
LDFLAGS = -flto
|
||||
LDFLAGS_COV = --coverage
|
||||
|
||||
SRC_DIR = src
|
||||
INC_DIR = include
|
||||
BUILD_DIR = build
|
||||
TEST_DIR = tests
|
||||
|
||||
SRCS = $(SRC_DIR)/lexer.c $(SRC_DIR)/ast.c $(SRC_DIR)/parser.c \
|
||||
$(SRC_DIR)/nfa.c $(SRC_DIR)/matcher.c $(SRC_DIR)/loreg.c \
|
||||
$(SRC_DIR)/repl.c $(SRC_DIR)/main.c
|
||||
|
||||
LIB_SRCS = $(SRC_DIR)/lexer.c $(SRC_DIR)/ast.c $(SRC_DIR)/parser.c \
|
||||
$(SRC_DIR)/nfa.c $(SRC_DIR)/matcher.c $(SRC_DIR)/loreg.c
|
||||
|
||||
OBJS = $(patsubst $(SRC_DIR)/%.c,$(BUILD_DIR)/%.o,$(SRCS))
|
||||
LIB_OBJS = $(patsubst $(SRC_DIR)/%.c,$(BUILD_DIR)/%.o,$(LIB_SRCS))
|
||||
|
||||
TARGET = loreg
|
||||
LIB_TARGET = libloreg.a
|
||||
|
||||
TEST_SRCS = $(TEST_DIR)/test_lexer.c $(TEST_DIR)/test_parser.c \
|
||||
$(TEST_DIR)/test_nfa.c $(TEST_DIR)/test_matcher.c \
|
||||
$(TEST_DIR)/test_all.c $(TEST_DIR)/test_integration.c
|
||||
|
||||
TEST_BINS = $(BUILD_DIR)/test_lexer $(BUILD_DIR)/test_parser \
|
||||
$(BUILD_DIR)/test_nfa $(BUILD_DIR)/test_matcher \
|
||||
$(BUILD_DIR)/test_all $(BUILD_DIR)/test_integration
|
||||
|
||||
# Command-style targets that never correspond to a file on disk. Every such
# target defined in this Makefile is listed, so a stray file named e.g.
# "benchmark" or "uninstall" cannot make the rule look up to date.
.PHONY: all clean test debug coverage profile valgrind valgrind-verbose \
        benchmark help install uninstall
|
||||
|
||||
all: $(BUILD_DIR) $(TARGET)
|
||||
|
||||
$(BUILD_DIR):
|
||||
mkdir -p $(BUILD_DIR)
|
||||
|
||||
$(BUILD_DIR)/%.o: $(SRC_DIR)/%.c | $(BUILD_DIR)
|
||||
$(CC) $(CFLAGS) $(INCLUDES) -c $< -o $@
|
||||
|
||||
$(TARGET): $(OBJS)
|
||||
$(CC) $(OBJS) -o $@ $(LDFLAGS)
|
||||
|
||||
$(LIB_TARGET): $(LIB_OBJS)
|
||||
ar rcs $@ $(LIB_OBJS)
|
||||
|
||||
# Rebuild the binary from scratch with the debug flag set.
# `clean` is the only prerequisite and the actual build runs in a sub-make from
# the recipe, so the wipe is guaranteed to finish before any object is compiled
# (listing `clean` and $(TARGET) side by side lets `make -j` run them at once).
debug: clean
	$(MAKE) $(TARGET) CFLAGS='$(CFLAGS_DEBUG)'
|
||||
|
||||
# One static pattern rule builds every test program listed in $(TEST_BINS):
# build/test_<name> is compiled from tests/test_<name>.c together with the
# library sources, using the debug flags. $(BUILD_DIR) is order-only so that
# changes to the directory's timestamp never force a rebuild.
$(TEST_BINS): $(BUILD_DIR)/test_%: $(TEST_DIR)/test_%.c $(LIB_SRCS) | $(BUILD_DIR)
	$(CC) $(CFLAGS_DEBUG) $(INCLUDES) $< $(LIB_SRCS) -o $@
|
||||
|
||||
test: $(TEST_BINS)
|
||||
@echo "running lexer tests..."
|
||||
@$(BUILD_DIR)/test_lexer
|
||||
@echo ""
|
||||
@echo "running parser tests..."
|
||||
@$(BUILD_DIR)/test_parser
|
||||
@echo ""
|
||||
@echo "running nfa tests..."
|
||||
@$(BUILD_DIR)/test_nfa
|
||||
@echo ""
|
||||
@echo "running matcher tests..."
|
||||
@$(BUILD_DIR)/test_matcher
|
||||
@echo ""
|
||||
@echo "running comprehensive tests..."
|
||||
@$(BUILD_DIR)/test_all
|
||||
@echo ""
|
||||
@echo "running integration tests..."
|
||||
@$(BUILD_DIR)/test_integration
|
||||
|
||||
# Build the comprehensive test program with coverage instrumentation, run it,
# and move the gcov output into $(BUILD_DIR)/coverage.
# `clean` is the only prerequisite and the build directory is created inside
# the recipe, so the directory cannot be removed while (or after) it is being
# created when make runs prerequisites in parallel.
# NOTE(review): failures of the `mv` steps are deliberately ignored; confirm
# that gcov actually finds its data files with the compiler version in use,
# otherwise $(BUILD_DIR)/coverage ends up empty without an error.
coverage: clean
	mkdir -p $(BUILD_DIR)
	$(CC) $(CFLAGS_COV) $(INCLUDES) $(TEST_DIR)/test_all.c $(LIB_SRCS) -o $(BUILD_DIR)/test_coverage $(LDFLAGS_COV)
	$(BUILD_DIR)/test_coverage
	gcov -b $(LIB_SRCS)
	@echo ""
	@echo "coverage report generated"
	@mkdir -p $(BUILD_DIR)/coverage
	@mv *.gcov $(BUILD_DIR)/coverage/ 2>/dev/null || true
	@mv *.gcda $(BUILD_DIR)/coverage/ 2>/dev/null || true
	@mv *.gcno $(BUILD_DIR)/coverage/ 2>/dev/null || true
|
||||
|
||||
# Build the comprehensive test program with gprof instrumentation, run it, and
# write the gprof report to $(BUILD_DIR)/profile.txt.
# As with `coverage`, the build directory is created in the recipe after the
# `clean` prerequisite has finished, which keeps the target safe under -j.
profile: clean
	mkdir -p $(BUILD_DIR)
	$(CC) $(CFLAGS_PROF) $(INCLUDES) $(TEST_DIR)/test_all.c $(LIB_SRCS) -o $(BUILD_DIR)/test_profile
	$(BUILD_DIR)/test_profile
	gprof $(BUILD_DIR)/test_profile gmon.out > $(BUILD_DIR)/profile.txt
	@echo ""
	@echo "profile report: $(BUILD_DIR)/profile.txt"
	@mv gmon.out $(BUILD_DIR)/ 2>/dev/null || true
|
||||
|
||||
valgrind: $(BUILD_DIR)/test_all
|
||||
valgrind --leak-check=full --show-leak-kinds=all --track-origins=yes \
|
||||
--error-exitcode=1 $(BUILD_DIR)/test_all
|
||||
|
||||
valgrind-verbose: $(BUILD_DIR)/test_all
|
||||
valgrind --leak-check=full --show-leak-kinds=all --track-origins=yes \
|
||||
--verbose --log-file=$(BUILD_DIR)/valgrind.log $(BUILD_DIR)/test_all
|
||||
@echo "valgrind log: $(BUILD_DIR)/valgrind.log"
|
||||
|
||||
benchmark: $(TARGET)
|
||||
@echo "benchmarking..."
|
||||
@echo "pattern: [a-z]+@[a-z]+\\.[a-z]+"
|
||||
@time -p sh -c 'for i in $$(seq 1 1000); do ./$(TARGET) "[a-z]+@[a-z]+\\.[a-z]+" "test@example.com" > /dev/null; done'
|
||||
@echo ""
|
||||
@echo "pattern: (a|b)*abb"
|
||||
@time -p sh -c 'for i in $$(seq 1 1000); do ./$(TARGET) "(a|b)*abb" "aabababb" > /dev/null; done'
|
||||
|
||||
# Installation locations. The defaults reproduce the original destination
# (/usr/local/bin); override with e.g. `make install PREFIX=/usr`.
# DESTDIR is prepended for staged installs and is normally empty.
PREFIX ?= /usr/local
BINDIR ?= $(PREFIX)/bin

# Copy the release binary into $(DESTDIR)$(BINDIR), creating the directory first.
install: $(TARGET)
	install -d $(DESTDIR)$(BINDIR)
	install -m 755 $(TARGET) $(DESTDIR)$(BINDIR)/

# Remove the installed binary; succeeds even when it is not installed.
uninstall:
	rm -f $(DESTDIR)$(BINDIR)/$(TARGET)
|
||||
|
||||
# Delete everything the build produces: the build directory, the binary, the
# static library, and any coverage/profiling data left in the top directory.
# $(RM) is make's built-in `rm -f`.
clean:
	$(RM) -r $(BUILD_DIR) $(TARGET) $(LIB_TARGET)
	$(RM) *.gcov *.gcda *.gcno gmon.out
|
||||
|
||||
help:
|
||||
@echo "loreg makefile targets:"
|
||||
@echo " all build optimized release binary"
|
||||
@echo " debug build with debug symbols"
|
||||
@echo " test run all tests"
|
||||
@echo " coverage run tests with coverage analysis"
|
||||
@echo " profile run tests with profiling"
|
||||
@echo " valgrind run tests under valgrind"
|
||||
@echo " benchmark run simple benchmarks"
|
||||
@echo " install install to /usr/local/bin"
|
||||
@echo " uninstall remove from /usr/local/bin"
|
||||
@echo " clean remove build artifacts"
|
||||
@echo " help show this message"
|
||||
181
README.md
Normal file
181
README.md
Normal file
@ -0,0 +1,181 @@
|
||||
# loreg
|
||||
|
||||
retoor <retoor@molodetz.nl>
|
||||
|
||||
A high-performance regular expression interpreter implemented from scratch in plain C. The engine uses Thompson's NFA construction algorithm for efficient pattern matching.
|
||||
|
||||
## CI
|
||||
|
||||
The project includes Gitea Actions CI that runs on every push and pull request:
|
||||
- Build verification (release and debug)
|
||||
- Full test suite (569 tests)
|
||||
- Valgrind memory leak detection
|
||||
- Code coverage generation
|
||||
|
||||
## Features
|
||||
|
||||
- Full regex syntax support: literals, metacharacters, quantifiers, character classes, groups, alternation, anchors
|
||||
- NFA-based matching engine with Thompson construction
|
||||
- Capturing groups with match position tracking
|
||||
- Interactive REPL for testing patterns
|
||||
- Zero external dependencies
|
||||
- Comprehensive test suite with 569 tests
|
||||
- Memory-safe implementation verified with Valgrind
|
||||
|
||||
## Building
|
||||
|
||||
```sh
|
||||
make # optimized release build
|
||||
make debug # debug build with symbols
|
||||
make test # run all tests
|
||||
make coverage # generate coverage report
|
||||
make profile # generate profiling report
|
||||
make valgrind # run under valgrind
|
||||
```
|
||||
|
||||
## Usage
|
||||
|
||||
### Command Line
|
||||
|
||||
```sh
|
||||
./loreg "pattern" "text" # search for pattern in text
|
||||
./loreg -m "pattern" "text" # full match mode
|
||||
./loreg -i # start REPL
|
||||
./loreg # start REPL (default)
|
||||
```
|
||||
|
||||
### REPL Commands
|
||||
|
||||
```
|
||||
:p <pattern> compile and set pattern
|
||||
:m <text> match text (anchored)
|
||||
:s <text> search for pattern in text
|
||||
<text> search (default)
|
||||
:h help
|
||||
:q quit
|
||||
```
|
||||
|
||||
### C API
|
||||
|
||||
```c
|
||||
#include "loreg.h"
|
||||
|
||||
loreg_error_t err;
|
||||
loreg_regex_t *re = loreg_compile("\\d{3}-\\d{4}", &err);
|
||||
if (!re) {
|
||||
fprintf(stderr, "error: %s\n", loreg_error_string(err));
|
||||
return 1;
|
||||
}
|
||||
|
||||
loreg_match_t result;
|
||||
if (loreg_search(re, "call 555-1234 now", &result)) {
|
||||
printf("match at [%zu-%zu]\n", result.match_start, result.match_end);
|
||||
}
|
||||
|
||||
loreg_free(re);
|
||||
```
|
||||
|
||||
## Supported Syntax
|
||||
|
||||
| Pattern | Description |
|
||||
|---------|-------------|
|
||||
| `.` | any character except newline |
|
||||
| `*` | zero or more |
|
||||
| `+` | one or more |
|
||||
| `?` | zero or one |
|
||||
| `\|` | alternation |
|
||||
| `()` | grouping and capture |
|
||||
| `[]` | character class |
|
||||
| `[^]` | negated character class |
|
||||
| `[a-z]` | character range |
|
||||
| `^` | start anchor |
|
||||
| `$` | end anchor |
|
||||
| `{n}` | exactly n |
|
||||
| `{n,}` | n or more |
|
||||
| `{n,m}` | n to m |
|
||||
| `\d` | digit [0-9] |
|
||||
| `\w` | word [a-zA-Z0-9_] |
|
||||
| `\s` | whitespace |
|
||||
| `\D` | non-digit |
|
||||
| `\W` | non-word |
|
||||
| `\S` | non-whitespace |
|
||||
| `*?` `+?` `??` | non-greedy quantifiers |
|
||||
|
||||
## Architecture
|
||||
|
||||
```
|
||||
src/
|
||||
├── lexer.c tokenizer for regex patterns
|
||||
├── parser.c recursive descent parser producing AST
|
||||
├── ast.c abstract syntax tree node types
|
||||
├── nfa.c Thompson NFA construction
|
||||
├── matcher.c NFA simulation with epsilon closure
|
||||
├── loreg.c public API
|
||||
├── repl.c interactive REPL
|
||||
└── main.c CLI entry point
|
||||
|
||||
include/
|
||||
├── loreg.h public header
|
||||
├── lexer.h lexer interface
|
||||
├── parser.h parser interface
|
||||
├── ast.h AST types
|
||||
├── nfa.h NFA types
|
||||
├── matcher.h matcher interface
|
||||
└── repl.h REPL interface
|
||||
|
||||
tests/
|
||||
├── test_lexer.c lexer unit tests (10 tests)
|
||||
├── test_parser.c parser unit tests (20 tests)
|
||||
├── test_nfa.c NFA construction tests (14 tests)
|
||||
├── test_matcher.c matching tests (27 tests)
|
||||
├── test_all.c comprehensive tests (9 tests)
|
||||
└── test_integration.c integration tests (489 tests)
|
||||
```
|
||||
|
||||
## Test Suite
|
||||
|
||||
The test suite contains 569 tests covering:
|
||||
|
||||
| Category | Description |
|
||||
|----------|-------------|
|
||||
| Lexer | Tokenization of patterns |
|
||||
| Parser | AST construction and error handling |
|
||||
| NFA | State machine construction |
|
||||
| Matcher | Pattern matching correctness |
|
||||
| Integration | Real-world regex patterns |
|
||||
|
||||
Integration tests cover:
|
||||
- Literal matching and concatenation
|
||||
- Dot metacharacter and wildcards
|
||||
- Start/end anchors
|
||||
- All quantifiers (*, +, ?, {n,m})
|
||||
- Alternation and grouping
|
||||
- Character classes and ranges
|
||||
- Negated character classes
|
||||
- Escape sequences
|
||||
- Email, IP, URL, phone patterns
|
||||
- Greedy vs non-greedy matching
|
||||
- Nested groups and complex nesting
|
||||
- Edge cases and boundary conditions
|
||||
- Pathological/stress patterns
|
||||
|
||||
Run tests with Valgrind verification:
|
||||
```sh
|
||||
make test # run all 569 tests
|
||||
make valgrind # verify zero memory leaks
|
||||
```
|
||||
|
||||
## Algorithm
|
||||
|
||||
The implementation uses Thompson's construction to convert regex patterns to NFAs:
|
||||
|
||||
1. **Lexer**: Tokenizes the pattern into a stream of tokens
|
||||
2. **Parser**: Builds an AST using recursive descent parsing
|
||||
3. **NFA Construction**: Converts AST to NFA using Thompson's algorithm
|
||||
4. **Matching**: Simulates the NFA with epsilon closure, so matching time grows linearly with the text length for a fixed pattern
|
||||
|
||||
Time complexity: O(n*m) where n is pattern length and m is text length.
|
||||
|
||||
## License
|
||||
|
||||
MIT
|
||||
80
include/ast.h
Normal file
80
include/ast.h
Normal file
@ -0,0 +1,80 @@
|
||||
/* retoor <retoor@molodetz.nl> */
|
||||
#ifndef LOREG_AST_H
|
||||
#define LOREG_AST_H
|
||||
|
||||
#include <stdbool.h>
|
||||
#include <stddef.h>
|
||||
|
||||
typedef enum {
|
||||
AST_CHAR,
|
||||
AST_DOT,
|
||||
AST_CONCAT,
|
||||
AST_ALTER,
|
||||
AST_STAR,
|
||||
AST_PLUS,
|
||||
AST_QUESTION,
|
||||
AST_GROUP,
|
||||
AST_ANCHOR_START,
|
||||
AST_ANCHOR_END,
|
||||
AST_BRACKET,
|
||||
AST_QUANTIFIER,
|
||||
AST_CLASS_DIGIT,
|
||||
AST_CLASS_WORD,
|
||||
AST_CLASS_SPACE,
|
||||
AST_CLASS_NDIGIT,
|
||||
AST_CLASS_NWORD,
|
||||
AST_CLASS_NSPACE
|
||||
} ast_type_t;
|
||||
|
||||
typedef struct {
|
||||
char start;
|
||||
char end;
|
||||
} char_range_t;
|
||||
|
||||
typedef struct {
|
||||
char_range_t *ranges;
|
||||
size_t count;
|
||||
size_t capacity;
|
||||
bool negated;
|
||||
} bracket_class_t;
|
||||
|
||||
typedef struct {
|
||||
int min;
|
||||
int max;
|
||||
bool greedy;
|
||||
} quantifier_t;
|
||||
|
||||
typedef struct ast_node ast_node_t;
|
||||
|
||||
struct ast_node {
|
||||
ast_type_t type;
|
||||
char value;
|
||||
ast_node_t *left;
|
||||
ast_node_t *right;
|
||||
int group_id;
|
||||
bracket_class_t *bracket;
|
||||
quantifier_t quant;
|
||||
};
|
||||
|
||||
ast_node_t *ast_create_char(char c);
|
||||
ast_node_t *ast_create_dot(void);
|
||||
ast_node_t *ast_create_concat(ast_node_t *left, ast_node_t *right);
|
||||
ast_node_t *ast_create_alter(ast_node_t *left, ast_node_t *right);
|
||||
ast_node_t *ast_create_star(ast_node_t *child, bool greedy);
|
||||
ast_node_t *ast_create_plus(ast_node_t *child, bool greedy);
|
||||
ast_node_t *ast_create_question(ast_node_t *child, bool greedy);
|
||||
ast_node_t *ast_create_group(ast_node_t *child, int group_id);
|
||||
ast_node_t *ast_create_anchor_start(void);
|
||||
ast_node_t *ast_create_anchor_end(void);
|
||||
ast_node_t *ast_create_bracket(bracket_class_t *bracket);
|
||||
ast_node_t *ast_create_quantifier(ast_node_t *child, int min, int max, bool greedy);
|
||||
ast_node_t *ast_create_class(ast_type_t type);
|
||||
void ast_free(ast_node_t *node);
|
||||
|
||||
bracket_class_t *bracket_create(void);
|
||||
void bracket_add_char(bracket_class_t *bracket, char c);
|
||||
void bracket_add_range(bracket_class_t *bracket, char start, char end);
|
||||
void bracket_free(bracket_class_t *bracket);
|
||||
bool bracket_matches(bracket_class_t *bracket, char c);
|
||||
|
||||
#endif
|
||||
52
include/lexer.h
Normal file
52
include/lexer.h
Normal file
@ -0,0 +1,52 @@
|
||||
/* retoor <retoor@molodetz.nl> */
|
||||
#ifndef LOREG_LEXER_H
|
||||
#define LOREG_LEXER_H
|
||||
|
||||
#include <stddef.h>
|
||||
#include <stdbool.h>
|
||||
|
||||
typedef enum {
|
||||
TOKEN_CHAR,
|
||||
TOKEN_DOT,
|
||||
TOKEN_STAR,
|
||||
TOKEN_PLUS,
|
||||
TOKEN_QUESTION,
|
||||
TOKEN_PIPE,
|
||||
TOKEN_LPAREN,
|
||||
TOKEN_RPAREN,
|
||||
TOKEN_LBRACKET,
|
||||
TOKEN_RBRACKET,
|
||||
TOKEN_CARET,
|
||||
TOKEN_DOLLAR,
|
||||
TOKEN_LBRACE,
|
||||
TOKEN_RBRACE,
|
||||
TOKEN_BACKSLASH,
|
||||
TOKEN_DASH,
|
||||
TOKEN_CLASS_DIGIT,
|
||||
TOKEN_CLASS_WORD,
|
||||
TOKEN_CLASS_SPACE,
|
||||
TOKEN_CLASS_NDIGIT,
|
||||
TOKEN_CLASS_NWORD,
|
||||
TOKEN_CLASS_NSPACE,
|
||||
TOKEN_EOF
|
||||
} token_type_t;
|
||||
|
||||
typedef struct {
|
||||
token_type_t type;
|
||||
char value;
|
||||
size_t position;
|
||||
} token_t;
|
||||
|
||||
typedef struct {
|
||||
const char *pattern;
|
||||
size_t length;
|
||||
size_t position;
|
||||
bool in_bracket;
|
||||
} lexer_t;
|
||||
|
||||
void lexer_init(lexer_t *lexer, const char *pattern);
|
||||
token_t lexer_next(lexer_t *lexer);
|
||||
token_t lexer_peek(lexer_t *lexer);
|
||||
bool lexer_eof(lexer_t *lexer);
|
||||
|
||||
#endif
|
||||
45
include/loreg.h
Normal file
45
include/loreg.h
Normal file
@ -0,0 +1,45 @@
|
||||
/* retoor <retoor@molodetz.nl> */
|
||||
#ifndef LOREG_H
|
||||
#define LOREG_H
|
||||
|
||||
#include <stddef.h>
|
||||
#include <stdbool.h>
|
||||
|
||||
#define LOREG_VERSION "1.0.0"
|
||||
#define LOREG_MAX_STATES 4096
|
||||
#define LOREG_MAX_GROUPS 32
|
||||
|
||||
typedef enum {
|
||||
LOREG_OK = 0,
|
||||
LOREG_ERR_INVALID_PATTERN,
|
||||
LOREG_ERR_UNBALANCED_PAREN,
|
||||
LOREG_ERR_EMPTY_GROUP,
|
||||
LOREG_ERR_INVALID_QUANTIFIER,
|
||||
LOREG_ERR_INVALID_ESCAPE,
|
||||
LOREG_ERR_OUT_OF_MEMORY,
|
||||
LOREG_ERR_STATE_OVERFLOW
|
||||
} loreg_error_t;
|
||||
|
||||
typedef struct {
|
||||
size_t start;
|
||||
size_t end;
|
||||
bool matched;
|
||||
} loreg_group_t;
|
||||
|
||||
typedef struct {
|
||||
bool matched;
|
||||
size_t match_start;
|
||||
size_t match_end;
|
||||
loreg_group_t groups[LOREG_MAX_GROUPS];
|
||||
size_t group_count;
|
||||
} loreg_match_t;
|
||||
|
||||
typedef struct loreg_regex loreg_regex_t;
|
||||
|
||||
loreg_regex_t *loreg_compile(const char *pattern, loreg_error_t *error);
|
||||
void loreg_free(loreg_regex_t *regex);
|
||||
bool loreg_match(loreg_regex_t *regex, const char *text, loreg_match_t *result);
|
||||
bool loreg_search(loreg_regex_t *regex, const char *text, loreg_match_t *result);
|
||||
const char *loreg_error_string(loreg_error_t error);
|
||||
|
||||
#endif
|
||||
26
include/matcher.h
Normal file
26
include/matcher.h
Normal file
@ -0,0 +1,26 @@
|
||||
/* retoor <retoor@molodetz.nl> */
|
||||
#ifndef LOREG_MATCHER_H
|
||||
#define LOREG_MATCHER_H
|
||||
|
||||
#include "nfa.h"
|
||||
#include "loreg.h"
|
||||
|
||||
typedef struct {
|
||||
nfa_state_t **states;
|
||||
size_t count;
|
||||
size_t capacity;
|
||||
size_t *group_starts;
|
||||
size_t *group_ends;
|
||||
int group_count;
|
||||
} state_set_t;
|
||||
|
||||
state_set_t *state_set_create(size_t initial_capacity, int group_count);
|
||||
void state_set_free(state_set_t *set);
|
||||
void state_set_clear(state_set_t *set);
|
||||
void state_set_add(state_set_t *set, nfa_state_t *state);
|
||||
bool state_set_contains(state_set_t *set, nfa_state_t *state);
|
||||
|
||||
bool nfa_match(nfa_t *nfa, const char *text, size_t start_pos, loreg_match_t *result);
|
||||
bool nfa_search(nfa_t *nfa, const char *text, loreg_match_t *result);
|
||||
|
||||
#endif
|
||||
69
include/nfa.h
Normal file
69
include/nfa.h
Normal file
@ -0,0 +1,69 @@
|
||||
/* retoor <retoor@molodetz.nl> */
|
||||
#ifndef LOREG_NFA_H
|
||||
#define LOREG_NFA_H
|
||||
|
||||
#include "ast.h"
|
||||
#include "loreg.h"
|
||||
#include <stdbool.h>
|
||||
#include <stddef.h>
|
||||
|
||||
#define EPSILON '\0'
|
||||
#define NFA_MAX_TRANSITIONS 256
|
||||
|
||||
typedef struct nfa_state nfa_state_t;
|
||||
|
||||
typedef enum {
|
||||
TRANS_CHAR,
|
||||
TRANS_EPSILON,
|
||||
TRANS_DOT,
|
||||
TRANS_BRACKET,
|
||||
TRANS_CLASS_DIGIT,
|
||||
TRANS_CLASS_WORD,
|
||||
TRANS_CLASS_SPACE,
|
||||
TRANS_CLASS_NDIGIT,
|
||||
TRANS_CLASS_NWORD,
|
||||
TRANS_CLASS_NSPACE,
|
||||
TRANS_GROUP_START,
|
||||
TRANS_GROUP_END,
|
||||
TRANS_ANCHOR_START,
|
||||
TRANS_ANCHOR_END
|
||||
} transition_type_t;
|
||||
|
||||
typedef struct {
|
||||
transition_type_t type;
|
||||
char value;
|
||||
nfa_state_t *target;
|
||||
bracket_class_t *bracket;
|
||||
int group_id;
|
||||
} transition_t;
|
||||
|
||||
struct nfa_state {
|
||||
int id;
|
||||
bool accepting;
|
||||
transition_t *transitions;
|
||||
size_t trans_count;
|
||||
size_t trans_capacity;
|
||||
};
|
||||
|
||||
typedef struct {
|
||||
nfa_state_t *start;
|
||||
nfa_state_t *accept;
|
||||
} nfa_fragment_t;
|
||||
|
||||
typedef struct {
|
||||
nfa_state_t **states;
|
||||
size_t state_count;
|
||||
size_t capacity;
|
||||
nfa_state_t *start;
|
||||
int group_count;
|
||||
} nfa_t;
|
||||
|
||||
nfa_t *nfa_create(void);
|
||||
void nfa_free(nfa_t *nfa);
|
||||
nfa_state_t *nfa_add_state(nfa_t *nfa);
|
||||
void nfa_add_transition(nfa_state_t *from, nfa_state_t *to, transition_type_t type, char value);
|
||||
void nfa_add_bracket_transition(nfa_state_t *from, nfa_state_t *to, bracket_class_t *bracket);
|
||||
void nfa_add_group_transition(nfa_state_t *from, nfa_state_t *to, transition_type_t type, int group_id);
|
||||
nfa_t *nfa_from_ast(ast_node_t *ast, loreg_error_t *error);
|
||||
|
||||
#endif
|
||||
20
include/parser.h
Normal file
20
include/parser.h
Normal file
@ -0,0 +1,20 @@
|
||||
/* retoor <retoor@molodetz.nl> */
|
||||
#ifndef LOREG_PARSER_H
|
||||
#define LOREG_PARSER_H
|
||||
|
||||
#include "ast.h"
|
||||
#include "lexer.h"
|
||||
#include "loreg.h"
|
||||
|
||||
typedef struct {
|
||||
lexer_t lexer;
|
||||
token_t current;
|
||||
loreg_error_t error;
|
||||
int group_count;
|
||||
} parser_t;
|
||||
|
||||
void parser_init(parser_t *parser, const char *pattern);
|
||||
ast_node_t *parser_parse(parser_t *parser);
|
||||
loreg_error_t parser_get_error(parser_t *parser);
|
||||
|
||||
#endif
|
||||
7
include/repl.h
Normal file
7
include/repl.h
Normal file
@ -0,0 +1,7 @@
|
||||
/* retoor <retoor@molodetz.nl> */
|
||||
#ifndef LOREG_REPL_H
|
||||
#define LOREG_REPL_H
|
||||
|
||||
void repl_run(void);
|
||||
|
||||
#endif
|
||||
169
src/ast.c
Normal file
169
src/ast.c
Normal file
@ -0,0 +1,169 @@
|
||||
/* retoor <retoor@molodetz.nl> */
|
||||
#include "ast.h"
|
||||
#include <stdlib.h>
|
||||
#include <ctype.h>
|
||||
|
||||
/*
 * Allocate an AST node of the given type with every other field set to its
 * default: no value, no children, group_id -1, no bracket class, and a
 * quantifier of {min 0, max -1, greedy}.
 * Returns NULL when the allocation fails.
 */
static ast_node_t *ast_create_node(ast_type_t type) {
    ast_node_t *fresh = malloc(sizeof *fresh);
    if (fresh == NULL) {
        return NULL;
    }

    *fresh = (ast_node_t){
        .type = type,
        .value = '\0',
        .left = NULL,
        .right = NULL,
        .group_id = -1,
        .bracket = NULL,
        .quant = { .min = 0, .max = -1, .greedy = true },
    };
    return fresh;
}
|
||||
|
||||
ast_node_t *ast_create_char(char c) {
|
||||
ast_node_t *node = ast_create_node(AST_CHAR);
|
||||
if (node) node->value = c;
|
||||
return node;
|
||||
}
|
||||
|
||||
ast_node_t *ast_create_dot(void) {
|
||||
return ast_create_node(AST_DOT);
|
||||
}
|
||||
|
||||
/*
 * Create an AST_CONCAT node with `left` and `right` as its children.
 * On allocation failure NULL is returned and neither child is touched, so the
 * caller is still responsible for them.
 */
ast_node_t *ast_create_concat(ast_node_t *left, ast_node_t *right) {
    ast_node_t *seq = ast_create_node(AST_CONCAT);
    if (seq == NULL) {
        return NULL;
    }
    seq->left = left;
    seq->right = right;
    return seq;
}
|
||||
|
||||
/*
 * Create an AST_ALTER node with `left` and `right` as its two branches.
 * On allocation failure NULL is returned and neither branch is touched, so the
 * caller is still responsible for them.
 */
ast_node_t *ast_create_alter(ast_node_t *left, ast_node_t *right) {
    ast_node_t *choice = ast_create_node(AST_ALTER);
    if (choice == NULL) {
        return NULL;
    }
    choice->left = left;
    choice->right = right;
    return choice;
}
|
||||
|
||||
/*
 * Create an AST_STAR node wrapping `child` (stored in `left`) and record the
 * requested greediness in quant.greedy.
 * Returns NULL, leaving `child` untouched, when the allocation fails.
 */
ast_node_t *ast_create_star(ast_node_t *child, bool greedy) {
    ast_node_t *star = ast_create_node(AST_STAR);
    if (star == NULL) {
        return NULL;
    }
    star->left = child;
    star->quant.greedy = greedy;
    return star;
}
|
||||
|
||||
/*
 * Create an AST_PLUS node wrapping `child` (stored in `left`) and record the
 * requested greediness in quant.greedy.
 * Returns NULL, leaving `child` untouched, when the allocation fails.
 */
ast_node_t *ast_create_plus(ast_node_t *child, bool greedy) {
    ast_node_t *plus = ast_create_node(AST_PLUS);
    if (plus == NULL) {
        return NULL;
    }
    plus->left = child;
    plus->quant.greedy = greedy;
    return plus;
}
|
||||
|
||||
/*
 * Create an AST_QUESTION node wrapping `child` (stored in `left`) and record
 * the requested greediness in quant.greedy.
 * Returns NULL, leaving `child` untouched, when the allocation fails.
 */
ast_node_t *ast_create_question(ast_node_t *child, bool greedy) {
    ast_node_t *optional = ast_create_node(AST_QUESTION);
    if (optional == NULL) {
        return NULL;
    }
    optional->left = child;
    optional->quant.greedy = greedy;
    return optional;
}
|
||||
|
||||
/*
 * Create an AST_GROUP node wrapping `child` (stored in `left`) and tag it
 * with `group_id`.
 * Returns NULL, leaving `child` untouched, when the allocation fails.
 */
ast_node_t *ast_create_group(ast_node_t *child, int group_id) {
    ast_node_t *group = ast_create_node(AST_GROUP);
    if (group == NULL) {
        return NULL;
    }
    group->left = child;
    group->group_id = group_id;
    return group;
}
|
||||
|
||||
ast_node_t *ast_create_anchor_start(void) {
|
||||
return ast_create_node(AST_ANCHOR_START);
|
||||
}
|
||||
|
||||
ast_node_t *ast_create_anchor_end(void) {
|
||||
return ast_create_node(AST_ANCHOR_END);
|
||||
}
|
||||
|
||||
/*
 * Create an AST_BRACKET node that refers to `bracket`. Once attached, the
 * bracket class is released together with the node by ast_free().
 * Returns NULL, leaving `bracket` untouched, when the allocation fails.
 */
ast_node_t *ast_create_bracket(bracket_class_t *bracket) {
    ast_node_t *holder = ast_create_node(AST_BRACKET);
    if (holder == NULL) {
        return NULL;
    }
    holder->bracket = bracket;
    return holder;
}
|
||||
|
||||
/*
 * Create an AST_QUANTIFIER node wrapping `child` (stored in `left`) with the
 * given repetition bounds and greediness copied into the node's `quant`.
 * Returns NULL, leaving `child` untouched, when the allocation fails.
 */
ast_node_t *ast_create_quantifier(ast_node_t *child, int min, int max, bool greedy) {
    ast_node_t *repeat = ast_create_node(AST_QUANTIFIER);
    if (repeat == NULL) {
        return NULL;
    }
    repeat->left = child;
    repeat->quant = (quantifier_t){ .min = min, .max = max, .greedy = greedy };
    return repeat;
}
|
||||
|
||||
/*
 * Create a leaf node of the given `type`. The type is passed through as-is
 * and is not validated here. Returns NULL when the allocation fails.
 */
ast_node_t *ast_create_class(ast_type_t type) {
    ast_node_t *leaf = ast_create_node(type);
    return leaf;
}
|
||||
|
||||
/*
 * Recursively release `node`, both of its subtrees, and the bracket class it
 * refers to (if any). Passing NULL is a no-op.
 */
void ast_free(ast_node_t *node) {
    if (node == NULL) {
        return;
    }

    ast_free(node->left);
    ast_free(node->right);
    bracket_free(node->bracket); /* bracket_free() ignores NULL */
    free(node);
}
|
||||
|
||||
bracket_class_t *bracket_create(void) {
|
||||
bracket_class_t *bracket = malloc(sizeof(bracket_class_t));
|
||||
if (!bracket) return NULL;
|
||||
bracket->ranges = NULL;
|
||||
bracket->count = 0;
|
||||
bracket->capacity = 0;
|
||||
bracket->negated = false;
|
||||
return bracket;
|
||||
}
|
||||
|
||||
/*
 * Enlarge the range array: 8 slots on first use, double the current capacity
 * afterwards. Returns false (leaving the class unchanged) when realloc fails.
 */
static bool bracket_grow(bracket_class_t *bracket) {
    size_t wanted = 8;
    if (bracket->capacity != 0) {
        wanted = bracket->capacity * 2;
    }

    char_range_t *resized = realloc(bracket->ranges, wanted * sizeof(char_range_t));
    if (resized == NULL) {
        return false;
    }

    bracket->capacity = wanted;
    bracket->ranges = resized;
    return true;
}
|
||||
|
||||
/* Add the single character `c` to the class, stored as the range c..c. */
void bracket_add_char(bracket_class_t *bracket, char c) {
    const char only = c;
    bracket_add_range(bracket, only, only);
}
|
||||
|
||||
/*
 * Append the inclusive range start..end to the class, growing the storage
 * when it is full. If growing fails the range is silently dropped, because the
 * function has no way to report the error.
 */
void bracket_add_range(bracket_class_t *bracket, char start, char end) {
    if (bracket->count >= bracket->capacity && !bracket_grow(bracket)) {
        return;
    }

    char_range_t *slot = &bracket->ranges[bracket->count];
    slot->start = start;
    slot->end = end;
    bracket->count += 1;
}
|
||||
|
||||
/* Release a bracket class and its range array. Passing NULL is a no-op. */
void bracket_free(bracket_class_t *bracket) {
    if (bracket != NULL) {
        free(bracket->ranges);
        free(bracket);
    }
}
|
||||
|
||||
/*
 * Report whether character `c` is accepted by the bracket class.
 *
 * `c` is accepted when it falls inside any stored range (inclusive); the
 * answer is inverted when the class is negated.
 *
 * All comparisons are done on unsigned byte values. Plain `char` may be signed
 * or unsigned depending on the platform, which would make ranges touching
 * bytes >= 0x80 behave differently from one target to another.
 */
bool bracket_matches(bracket_class_t *bracket, char c) {
    const unsigned char probe = (unsigned char)c;
    bool found = false;

    for (size_t i = 0; i < bracket->count; i++) {
        const unsigned char lo = (unsigned char)bracket->ranges[i].start;
        const unsigned char hi = (unsigned char)bracket->ranges[i].end;
        if (probe >= lo && probe <= hi) {
            found = true;
            break;
        }
    }

    return bracket->negated ? !found : found;
}
|
||||
125
src/lexer.c
Normal file
125
src/lexer.c
Normal file
@ -0,0 +1,125 @@
|
||||
/* retoor <retoor@molodetz.nl> */
|
||||
#include "lexer.h"
|
||||
#include <string.h>
|
||||
|
||||
/*
 * Prepare `lexer` to tokenize `pattern` from its first character, starting
 * outside any bracket expression.
 *
 * The pattern is not copied; the lexer keeps the pointer. A NULL pattern is
 * treated as the empty pattern instead of being handed to strlen().
 */
void lexer_init(lexer_t *lexer, const char *pattern) {
    const char *text = (pattern != NULL) ? pattern : "";

    lexer->pattern = text;
    lexer->length = strlen(text);
    lexer->position = 0;
    lexer->in_bracket = false;
}
|
||||
|
||||
/* Bundle a token type, its character value and its pattern offset into a token_t. */
static token_t make_token(token_type_t type, char value, size_t pos) {
    return (token_t){ .type = type, .value = value, .position = pos };
}
|
||||
|
||||
token_t lexer_next(lexer_t *lexer) {
|
||||
if (lexer->position >= lexer->length) {
|
||||
return make_token(TOKEN_EOF, '\0', lexer->position);
|
||||
}
|
||||
|
||||
char c = lexer->pattern[lexer->position];
|
||||
size_t pos = lexer->position;
|
||||
lexer->position++;
|
||||
|
||||
if (c == '[' && !lexer->in_bracket) {
|
||||
lexer->in_bracket = true;
|
||||
return make_token(TOKEN_LBRACKET, c, pos);
|
||||
}
|
||||
|
||||
if (c == ']' && lexer->in_bracket) {
|
||||
lexer->in_bracket = false;
|
||||
return make_token(TOKEN_RBRACKET, c, pos);
|
||||
}
|
||||
|
||||
if (lexer->in_bracket) {
|
||||
if (c == '-') {
|
||||
return make_token(TOKEN_DASH, c, pos);
|
||||
}
|
||||
if (c == '^' && pos > 0 && lexer->pattern[pos - 1] == '[') {
|
||||
return make_token(TOKEN_CARET, c, pos);
|
||||
}
|
||||
if (c == '\\' && lexer->position < lexer->length) {
|
||||
char next = lexer->pattern[lexer->position];
|
||||
lexer->position++;
|
||||
switch (next) {
|
||||
case 'd': return make_token(TOKEN_CLASS_DIGIT, 'd', pos);
|
||||
case 'w': return make_token(TOKEN_CLASS_WORD, 'w', pos);
|
||||
case 's': return make_token(TOKEN_CLASS_SPACE, 's', pos);
|
||||
case 'D': return make_token(TOKEN_CLASS_NDIGIT, 'D', pos);
|
||||
case 'W': return make_token(TOKEN_CLASS_NWORD, 'W', pos);
|
||||
case 'S': return make_token(TOKEN_CLASS_NSPACE, 'S', pos);
|
||||
case 'n': return make_token(TOKEN_CHAR, '\n', pos);
|
||||
case 't': return make_token(TOKEN_CHAR, '\t', pos);
|
||||
case 'r': return make_token(TOKEN_CHAR, '\r', pos);
|
||||
default: return make_token(TOKEN_CHAR, next, pos);
|
||||
}
|
||||
}
|
||||
return make_token(TOKEN_CHAR, c, pos);
|
||||
}
|
||||
|
||||
if (c == '\\' && lexer->position < lexer->length) {
|
||||
char next = lexer->pattern[lexer->position];
|
||||
lexer->position++;
|
||||
switch (next) {
|
||||
case 'd': return make_token(TOKEN_CLASS_DIGIT, 'd', pos);
|
||||
case 'w': return make_token(TOKEN_CLASS_WORD, 'w', pos);
|
||||
case 's': return make_token(TOKEN_CLASS_SPACE, 's', pos);
|
||||
case 'D': return make_token(TOKEN_CLASS_NDIGIT, 'D', pos);
|
||||
case 'W': return make_token(TOKEN_CLASS_NWORD, 'W', pos);
|
||||
case 'S': return make_token(TOKEN_CLASS_NSPACE, 'S', pos);
|
||||
case 'n': return make_token(TOKEN_CHAR, '\n', pos);
|
||||
case 't': return make_token(TOKEN_CHAR, '\t', pos);
|
||||
case 'r': return make_token(TOKEN_CHAR, '\r', pos);
|
||||
case '.':
|
||||
case '*':
|
||||
case '+':
|
||||
case '?':
|
||||
case '|':
|
||||
case '(':
|
||||
case ')':
|
||||
case '[':
|
||||
case ']':
|
||||
case '{':
|
||||
case '}':
|
||||
case '^':
|
||||
case '$':
|
||||
case '\\':
|
||||
return make_token(TOKEN_CHAR, next, pos);
|
||||
default:
|
||||
return make_token(TOKEN_CHAR, next, pos);
|
||||
}
|
||||
}
|
||||
|
||||
switch (c) {
|
||||
case '.': return make_token(TOKEN_DOT, c, pos);
|
||||
case '*': return make_token(TOKEN_STAR, c, pos);
|
||||
case '+': return make_token(TOKEN_PLUS, c, pos);
|
||||
case '?': return make_token(TOKEN_QUESTION, c, pos);
|
||||
case '|': return make_token(TOKEN_PIPE, c, pos);
|
||||
case '(': return make_token(TOKEN_LPAREN, c, pos);
|
||||
case ')': return make_token(TOKEN_RPAREN, c, pos);
|
||||
case '^': return make_token(TOKEN_CARET, c, pos);
|
||||
case '$': return make_token(TOKEN_DOLLAR, c, pos);
|
||||
case '{': return make_token(TOKEN_LBRACE, c, pos);
|
||||
case '}': return make_token(TOKEN_RBRACE, c, pos);
|
||||
default: return make_token(TOKEN_CHAR, c, pos);
|
||||
}
|
||||
}
|
||||
|
||||
/*
 * Return the token lexer_next() would produce, then roll the lexer's
 * position and bracket flag back so nothing is consumed.
 */
token_t lexer_peek(lexer_t *lexer) {
    const bool bracket_before = lexer->in_bracket;
    const size_t position_before = lexer->position;

    token_t upcoming = lexer_next(lexer);

    lexer->in_bracket = bracket_before;
    lexer->position = position_before;
    return upcoming;
}
|
||||
|
||||
/* True once the read position has reached (or passed) the pattern length. */
bool lexer_eof(lexer_t *lexer) {
    if (lexer->position < lexer->length) {
        return false;
    }
    return true;
}
|
||||
71
src/loreg.c
Normal file
71
src/loreg.c
Normal file
@ -0,0 +1,71 @@
|
||||
/* retoor <retoor@molodetz.nl> */
|
||||
#include "loreg.h"
|
||||
#include "parser.h"
|
||||
#include "nfa.h"
|
||||
#include "matcher.h"
|
||||
#include <stdlib.h>
|
||||
|
||||
struct loreg_regex {
|
||||
nfa_t *nfa;
|
||||
ast_node_t *ast;
|
||||
};
|
||||
|
||||
/*
 * Compile `pattern` into a regex handle.
 *
 * pattern: NUL-terminated pattern text; NULL is rejected with
 *          LOREG_ERR_INVALID_PATTERN.
 * error:   receives LOREG_OK or the failure reason; may be NULL when the
 *          caller only cares about success/failure.
 *
 * Returns a handle to release with loreg_free(), or NULL on failure.
 */
loreg_regex_t *loreg_compile(const char *pattern, loreg_error_t *error) {
    loreg_error_t ignored = LOREG_OK;

    /* Let callers pass NULL instead of crashing on the first store. */
    if (!error) {
        error = &ignored;
    }
    *error = LOREG_OK;

    if (!pattern) {
        *error = LOREG_ERR_INVALID_PATTERN;
        return NULL;
    }

    loreg_regex_t *regex = malloc(sizeof(*regex));
    if (!regex) {
        *error = LOREG_ERR_OUT_OF_MEMORY;
        return NULL;
    }
    regex->ast = NULL;
    regex->nfa = NULL;

    parser_t parser;
    parser_init(&parser, pattern);

    regex->ast = parser_parse(&parser);
    *error = parser_get_error(&parser);

    if (*error != LOREG_OK) {
        ast_free(regex->ast);
        free(regex);
        return NULL;
    }

    regex->nfa = nfa_from_ast(regex->ast, error);

    /*
     * A handle without an automaton cannot be matched against (the matcher
     * dereferences it), so never hand one out even if no error was flagged.
     */
    if (*error == LOREG_OK && !regex->nfa) {
        *error = LOREG_ERR_OUT_OF_MEMORY;
    }

    if (*error != LOREG_OK) {
        /*
         * NOTE(review): regex->nfa is not released here, matching the
         * previous behaviour; confirm nfa_from_ast() cleans up after itself
         * when it reports an error.
         */
        ast_free(regex->ast);
        free(regex);
        return NULL;
    }

    return regex;
}
|
||||
|
||||
/* Release a handle from loreg_compile(): NFA first, then AST. NULL is a no-op. */
void loreg_free(loreg_regex_t *regex) {
    if (regex != NULL) {
        nfa_free(regex->nfa);
        ast_free(regex->ast);
        free(regex);
    }
}
|
||||
|
||||
/*
 * Match the compiled pattern against `text`, anchored at offset 0.
 *
 * result: optional; receives the match span and capture groups.
 * Returns true on a match. A NULL regex or text yields "no match"
 * (with `result` cleared) instead of a crash.
 */
bool loreg_match(loreg_regex_t *regex, const char *text, loreg_match_t *result) {
    if (!regex || !regex->nfa || !text) {
        if (result) {
            result->matched = false;
            result->match_start = 0;
            result->match_end = 0;
            result->group_count = 0;
        }
        return false;
    }
    return nfa_match(regex->nfa, text, 0, result);
}
|
||||
|
||||
/*
 * Search `text` for the compiled pattern via nfa_search().
 *
 * result: optional; receives the match span and capture groups.
 * Returns true on a match. A NULL regex or text yields "no match"
 * (with `result` cleared) instead of a crash.
 */
bool loreg_search(loreg_regex_t *regex, const char *text, loreg_match_t *result) {
    if (!regex || !regex->nfa || !text) {
        if (result) {
            result->matched = false;
            result->match_start = 0;
            result->match_end = 0;
            result->group_count = 0;
        }
        return false;
    }
    return nfa_search(regex->nfa, text, result);
}
|
||||
|
||||
/*
 * Map an error code to a static, human-readable description.
 * Codes without a dedicated message yield "unknown error".
 */
const char *loreg_error_string(loreg_error_t error) {
    const char *text = "unknown error";

    switch (error) {
        case LOREG_OK:                     text = "success"; break;
        case LOREG_ERR_INVALID_PATTERN:    text = "invalid pattern"; break;
        case LOREG_ERR_UNBALANCED_PAREN:   text = "unbalanced parentheses"; break;
        case LOREG_ERR_EMPTY_GROUP:        text = "empty group"; break;
        case LOREG_ERR_INVALID_QUANTIFIER: text = "invalid quantifier"; break;
        case LOREG_ERR_INVALID_ESCAPE:     text = "invalid escape sequence"; break;
        case LOREG_ERR_OUT_OF_MEMORY:      text = "out of memory"; break;
        case LOREG_ERR_STATE_OVERFLOW:     text = "state overflow"; break;
        default:                           break;
    }

    return text;
}
|
||||
107
src/main.c
Normal file
107
src/main.c
Normal file
@ -0,0 +1,107 @@
|
||||
/* retoor <retoor@molodetz.nl> */
|
||||
#include "loreg.h"
|
||||
#include "repl.h"
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
|
||||
/* Write the command-line help to stdout; `program` is shown as the command name. */
static void print_usage(const char *program) {
    printf("usage: %s [options] [pattern] [text]\n", program);

    /* Fixed text needs no formatting, so it goes straight to stdout. */
    fputs("options:\n", stdout);
    fputs(" -h, --help show this help\n", stdout);
    fputs(" -v, --version show version\n", stdout);
    fputs(" -m, --match full match mode (default is search)\n", stdout);
    fputs(" -i start interactive REPL\n", stdout);
    fputs("\n", stdout);
    fputs("examples:\n", stdout);

    printf(" %s start REPL\n", program);
    printf(" %s -i start REPL\n", program);
    printf(" %s \"a+b\" \"aaab\" search pattern in text\n", program);
    printf(" %s -m \"a+b\" \"aaab\" match pattern against text\n", program);
}
|
||||
|
||||
static void print_version(void) {
|
||||
printf("loreg %s\n", LOREG_VERSION);
|
||||
}
|
||||
|
||||
/* Write text[from..to) to stdout one character at a time. */
static void print_text_range(const char *text, size_t from, size_t to) {
    while (from < to) {
        putchar(text[from]);
        from++;
    }
}

/*
 * Print a match result for `text`: either "no match", or the matched
 * substring with its [start-end] offsets followed by one line per capture
 * group that participated in the match.
 */
static void print_match(const char *text, loreg_match_t *result) {
    if (!result->matched) {
        printf("no match\n");
        return;
    }

    printf("match: \"");
    print_text_range(text, result->match_start, result->match_end);
    printf("\" [%zu-%zu]\n", result->match_start, result->match_end);

    for (size_t i = 0; i < result->group_count; i++) {
        /* Groups that did not take part in the match are not listed. */
        if (!result->groups[i].matched) {
            continue;
        }
        printf(" group %zu: \"", i);
        print_text_range(text, result->groups[i].start, result->groups[i].end);
        printf("\" [%zu-%zu]\n", result->groups[i].start, result->groups[i].end);
    }
}
|
||||
|
||||
int main(int argc, char *argv[]) {
|
||||
if (argc == 1) {
|
||||
repl_run();
|
||||
return 0;
|
||||
}
|
||||
|
||||
bool match_mode = false;
|
||||
int arg_idx = 1;
|
||||
|
||||
while (arg_idx < argc && argv[arg_idx][0] == '-') {
|
||||
if (strcmp(argv[arg_idx], "-h") == 0 || strcmp(argv[arg_idx], "--help") == 0) {
|
||||
print_usage(argv[0]);
|
||||
return 0;
|
||||
}
|
||||
if (strcmp(argv[arg_idx], "-v") == 0 || strcmp(argv[arg_idx], "--version") == 0) {
|
||||
print_version();
|
||||
return 0;
|
||||
}
|
||||
if (strcmp(argv[arg_idx], "-m") == 0 || strcmp(argv[arg_idx], "--match") == 0) {
|
||||
match_mode = true;
|
||||
arg_idx++;
|
||||
continue;
|
||||
}
|
||||
if (strcmp(argv[arg_idx], "-i") == 0) {
|
||||
repl_run();
|
||||
return 0;
|
||||
}
|
||||
fprintf(stderr, "unknown option: %s\n", argv[arg_idx]);
|
||||
return 1;
|
||||
}
|
||||
|
||||
if (argc - arg_idx < 2) {
|
||||
fprintf(stderr, "error: pattern and text required\n");
|
||||
print_usage(argv[0]);
|
||||
return 1;
|
||||
}
|
||||
|
||||
const char *pattern = argv[arg_idx];
|
||||
const char *text = argv[arg_idx + 1];
|
||||
|
||||
loreg_error_t error;
|
||||
loreg_regex_t *regex = loreg_compile(pattern, &error);
|
||||
if (!regex) {
|
||||
fprintf(stderr, "error: %s\n", loreg_error_string(error));
|
||||
return 1;
|
||||
}
|
||||
|
||||
loreg_match_t result;
|
||||
if (match_mode) {
|
||||
loreg_match(regex, text, &result);
|
||||
} else {
|
||||
loreg_search(regex, text, &result);
|
||||
}
|
||||
|
||||
print_match(text, &result);
|
||||
|
||||
loreg_free(regex);
|
||||
return result.matched ? 0 : 1;
|
||||
}
|
||||
411
src/matcher.c
Normal file
411
src/matcher.c
Normal file
@ -0,0 +1,411 @@
|
||||
/* retoor <retoor@molodetz.nl> */
|
||||
#include "matcher.h"
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <ctype.h>
|
||||
|
||||
state_set_t *state_set_create(size_t initial_capacity, int group_count) {
|
||||
state_set_t *set = malloc(sizeof(state_set_t));
|
||||
if (!set) return NULL;
|
||||
|
||||
set->states = calloc(initial_capacity, sizeof(nfa_state_t *));
|
||||
if (!set->states) {
|
||||
free(set);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
set->count = 0;
|
||||
set->capacity = initial_capacity;
|
||||
set->group_count = group_count;
|
||||
|
||||
if (group_count > 0) {
|
||||
set->group_starts = calloc(group_count, sizeof(size_t));
|
||||
set->group_ends = calloc(group_count, sizeof(size_t));
|
||||
if (!set->group_starts || !set->group_ends) {
|
||||
free(set->group_starts);
|
||||
free(set->group_ends);
|
||||
free(set->states);
|
||||
free(set);
|
||||
return NULL;
|
||||
}
|
||||
for (int i = 0; i < group_count; i++) {
|
||||
set->group_starts[i] = (size_t)-1;
|
||||
set->group_ends[i] = (size_t)-1;
|
||||
}
|
||||
} else {
|
||||
set->group_starts = NULL;
|
||||
set->group_ends = NULL;
|
||||
}
|
||||
|
||||
return set;
|
||||
}
|
||||
|
||||
/* Release a state set and the arrays it owns. NULL is a no-op. */
void state_set_free(state_set_t *set) {
    if (set != NULL) {
        free(set->group_ends);
        free(set->group_starts);
        free(set->states);
        free(set);
    }
}
|
||||
|
||||
/* Empty the set: reset the count and zero every slot; capacity is kept. */
void state_set_clear(state_set_t *set) {
    const size_t slot_bytes = set->capacity * sizeof(nfa_state_t *);

    set->count = 0;
    memset(set->states, 0, slot_bytes);
}
|
||||
|
||||
/*
 * Enlarge the slot array of `set` (doubling; a zero capacity becomes 4)
 * and zero the newly added slots.
 *
 * Returns false, leaving the set untouched, when the new size would
 * overflow or the reallocation fails.
 */
static bool state_set_grow(state_set_t *set) {
    const size_t old_cap = set->capacity;
    /* 0 * 2 == 0 would turn the realloc into a zero-size request. */
    const size_t new_cap = old_cap == 0 ? 4 : old_cap * 2;

    /* Reject wrap-around of either the element count or the byte count. */
    if (new_cap <= old_cap || new_cap > (size_t)-1 / sizeof(nfa_state_t *)) {
        return false;
    }

    nfa_state_t **new_states = realloc(set->states, new_cap * sizeof(nfa_state_t *));
    if (!new_states) return false;

    memset(new_states + old_cap, 0, (new_cap - old_cap) * sizeof(nfa_state_t *));
    set->states = new_states;
    set->capacity = new_cap;
    return true;
}
|
||||
|
||||
/*
 * Append `state` unless it is already present. If the set is full and
 * cannot be grown the state is silently dropped.
 */
void state_set_add(state_set_t *set, nfa_state_t *state) {
    if (state_set_contains(set, state)) {
        return;
    }

    if (set->count >= set->capacity && !state_set_grow(set)) {
        return;
    }

    set->states[set->count] = state;
    set->count++;
}
|
||||
|
||||
/* Linear scan: true when `state` (compared by pointer) is in the set. */
bool state_set_contains(state_set_t *set, nfa_state_t *state) {
    size_t idx = 0;

    while (idx < set->count) {
        if (set->states[idx] == state) {
            return true;
        }
        idx++;
    }
    return false;
}
|
||||
|
||||
/* True for the ASCII decimal digits '0'..'9'. */
static bool is_digit(char c) {
    if (c < '0' || c > '9') {
        return false;
    }
    return true;
}
|
||||
|
||||
/* True for ASCII letters, ASCII digits and underscore. */
static bool is_word(char c) {
    if (c == '_') return true;
    if (c >= '0' && c <= '9') return true;
    if (c >= 'A' && c <= 'Z') return true;
    return c >= 'a' && c <= 'z';
}
|
||||
|
||||
/* True for space, tab, newline, carriage return, form feed and vertical tab. */
static bool is_space(char c) {
    switch (c) {
        case ' ':
        case '\t':
        case '\n':
        case '\r':
        case '\f':
        case '\v':
            return true;
        default:
            return false;
    }
}
|
||||
|
||||
/*
 * Decide whether transition `t` may be taken.
 *
 * c:   the input character under consideration (used by consuming
 *      transitions).
 * pos: current offset in the text, len: text length (used by anchors).
 *
 * The negated classes and the dot never accept the NUL character.
 * Transition kinds not listed here never match.
 */
static bool transition_matches(transition_t *t, char c, size_t pos, size_t len) {
    bool taken = false;

    switch (t->type) {
        case TRANS_CHAR:
            taken = (c == t->value);
            break;
        case TRANS_DOT:
            taken = !(c == '\n' || c == '\0');
            break;
        case TRANS_BRACKET:
            taken = bracket_matches(t->bracket, c);
            break;
        case TRANS_CLASS_DIGIT:
            taken = is_digit(c);
            break;
        case TRANS_CLASS_WORD:
            taken = is_word(c);
            break;
        case TRANS_CLASS_SPACE:
            taken = is_space(c);
            break;
        case TRANS_CLASS_NDIGIT:
            taken = (c != '\0') && !is_digit(c);
            break;
        case TRANS_CLASS_NWORD:
            taken = (c != '\0') && !is_word(c);
            break;
        case TRANS_CLASS_NSPACE:
            taken = (c != '\0') && !is_space(c);
            break;
        case TRANS_ANCHOR_START:
            taken = (pos == 0);
            break;
        case TRANS_ANCHOR_END:
            taken = (pos == len);
            break;
        default:
            break;
    }

    return taken;
}
|
||||
|
||||
typedef struct {
|
||||
nfa_state_t *state;
|
||||
size_t *group_starts;
|
||||
size_t *group_ends;
|
||||
} thread_t;
|
||||
|
||||
typedef struct {
|
||||
thread_t *threads;
|
||||
size_t count;
|
||||
size_t capacity;
|
||||
int group_count;
|
||||
} thread_list_t;
|
||||
|
||||
static thread_list_t *thread_list_create(size_t capacity, int group_count) {
|
||||
thread_list_t *list = malloc(sizeof(thread_list_t));
|
||||
if (!list) return NULL;
|
||||
|
||||
list->threads = malloc(capacity * sizeof(thread_t));
|
||||
if (!list->threads) {
|
||||
free(list);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
for (size_t i = 0; i < capacity; i++) {
|
||||
if (group_count > 0) {
|
||||
list->threads[i].group_starts = malloc(group_count * sizeof(size_t));
|
||||
list->threads[i].group_ends = malloc(group_count * sizeof(size_t));
|
||||
if (!list->threads[i].group_starts || !list->threads[i].group_ends) {
|
||||
for (size_t j = 0; j <= i; j++) {
|
||||
free(list->threads[j].group_starts);
|
||||
free(list->threads[j].group_ends);
|
||||
}
|
||||
free(list->threads);
|
||||
free(list);
|
||||
return NULL;
|
||||
}
|
||||
} else {
|
||||
list->threads[i].group_starts = NULL;
|
||||
list->threads[i].group_ends = NULL;
|
||||
}
|
||||
}
|
||||
|
||||
list->count = 0;
|
||||
list->capacity = capacity;
|
||||
list->group_count = group_count;
|
||||
return list;
|
||||
}
|
||||
|
||||
/* Release a thread list including the group arrays of every slot. NULL is a no-op. */
static void thread_list_free(thread_list_t *list) {
    if (list == NULL) {
        return;
    }

    size_t slot = 0;
    while (slot < list->capacity) {
        free(list->threads[slot].group_starts);
        free(list->threads[slot].group_ends);
        slot++;
    }

    free(list->threads);
    free(list);
}
|
||||
|
||||
/* Mark every slot unused; the slots and their group arrays stay allocated. */
static void thread_list_clear(thread_list_t *list) {
    list->count = 0u;
}
|
||||
|
||||
/* True when one of the threads in use is already at `state`. */
static bool thread_list_contains_state(thread_list_t *list, nfa_state_t *state) {
    const thread_t *cursor = list->threads;
    size_t remaining = list->count;

    while (remaining > 0) {
        if (cursor->state == state) {
            return true;
        }
        cursor++;
        remaining--;
    }
    return false;
}
|
||||
|
||||
static void add_thread(thread_list_t *list, nfa_state_t *state,
|
||||
size_t *group_starts, size_t *group_ends);
|
||||
|
||||
/*
 * Depth-first walk over every non-consuming transition reachable from
 * `state`, adding a thread for each state visited.
 *
 * Epsilon edges are followed unconditionally, anchor edges only when the
 * anchor holds at `pos`, and group edges with a private copy of the group
 * arrays in which the group's start or end is set to `pos`. `visited`
 * (indexed by state id) stops a state from being expanded twice in one
 * walk. A state's outgoing edges are explored before the state itself is
 * added to `list`. If the copies for a group edge cannot be allocated,
 * that edge is skipped.
 */
static void follow_epsilons(thread_list_t *list, nfa_state_t *state,
                            size_t *group_starts, size_t *group_ends,
                            size_t pos, size_t len, bool *visited) {
    if (state == NULL) {
        return;
    }
    if (visited[state->id]) {
        return;
    }
    visited[state->id] = true;

    for (size_t idx = 0; idx < state->trans_count; idx++) {
        transition_t *edge = &state->transitions[idx];

        switch (edge->type) {
            case TRANS_EPSILON:
                follow_epsilons(list, edge->target, group_starts, group_ends,
                                pos, len, visited);
                break;

            case TRANS_GROUP_START:
            case TRANS_GROUP_END: {
                size_t *starts_copy = malloc(list->group_count * sizeof(size_t));
                size_t *ends_copy = malloc(list->group_count * sizeof(size_t));

                if (starts_copy != NULL && ends_copy != NULL) {
                    memcpy(starts_copy, group_starts, list->group_count * sizeof(size_t));
                    memcpy(ends_copy, group_ends, list->group_count * sizeof(size_t));

                    /* Record the boundary this edge stands for. */
                    if (edge->type == TRANS_GROUP_START) {
                        starts_copy[edge->group_id] = pos;
                    } else {
                        ends_copy[edge->group_id] = pos;
                    }

                    follow_epsilons(list, edge->target, starts_copy, ends_copy,
                                    pos, len, visited);
                }

                /* Threads copy the arrays on insertion, so these can go. */
                free(starts_copy);
                free(ends_copy);
                break;
            }

            case TRANS_ANCHOR_START:
            case TRANS_ANCHOR_END:
                if (transition_matches(edge, '\0', pos, len)) {
                    follow_epsilons(list, edge->target, group_starts, group_ends,
                                    pos, len, visited);
                }
                break;

            default:
                /* Consuming transitions are handled by the matcher loop. */
                break;
        }
    }

    add_thread(list, state, group_starts, group_ends);
}
|
||||
|
||||
/*
 * Append a thread for `state` carrying a copy of the given group arrays.
 * Nothing happens when `state` is NULL, when the list already holds a
 * thread at that state, or when the list is full.
 */
static void add_thread(thread_list_t *list, nfa_state_t *state,
                       size_t *group_starts, size_t *group_ends) {
    if (state == NULL || thread_list_contains_state(list, state)) {
        return;
    }
    if (list->count >= list->capacity) {
        return;
    }

    thread_t *slot = &list->threads[list->count];
    list->count++;

    slot->state = state;
    if (list->group_count > 0) {
        const size_t bytes = list->group_count * sizeof(size_t);
        memcpy(slot->group_starts, group_starts, bytes);
        memcpy(slot->group_ends, group_ends, bytes);
    }
}
|
||||
|
||||
bool nfa_match(nfa_t *nfa, const char *text, size_t start_pos, loreg_match_t *result) {
|
||||
size_t len = strlen(text);
|
||||
size_t num_states = nfa->state_count;
|
||||
int group_count = nfa->group_count > 0 ? nfa->group_count : 1;
|
||||
|
||||
thread_list_t *current = thread_list_create(num_states, group_count);
|
||||
thread_list_t *next = thread_list_create(num_states, group_count);
|
||||
bool *visited = calloc(num_states, sizeof(bool));
|
||||
|
||||
if (!current || !next || !visited) {
|
||||
thread_list_free(current);
|
||||
thread_list_free(next);
|
||||
free(visited);
|
||||
return false;
|
||||
}
|
||||
|
||||
size_t *init_starts = calloc(group_count, sizeof(size_t));
|
||||
size_t *init_ends = calloc(group_count, sizeof(size_t));
|
||||
if (!init_starts || !init_ends) {
|
||||
free(init_starts);
|
||||
free(init_ends);
|
||||
thread_list_free(current);
|
||||
thread_list_free(next);
|
||||
free(visited);
|
||||
return false;
|
||||
}
|
||||
|
||||
for (int i = 0; i < group_count; i++) {
|
||||
init_starts[i] = (size_t)-1;
|
||||
init_ends[i] = (size_t)-1;
|
||||
}
|
||||
|
||||
memset(visited, 0, num_states * sizeof(bool));
|
||||
follow_epsilons(current, nfa->start, init_starts, init_ends,
|
||||
start_pos, len, visited);
|
||||
|
||||
bool matched = false;
|
||||
size_t match_end = start_pos;
|
||||
size_t *best_starts = calloc(group_count, sizeof(size_t));
|
||||
size_t *best_ends = calloc(group_count, sizeof(size_t));
|
||||
|
||||
if (!best_starts || !best_ends) {
|
||||
free(init_starts);
|
||||
free(init_ends);
|
||||
free(best_starts);
|
||||
free(best_ends);
|
||||
thread_list_free(current);
|
||||
thread_list_free(next);
|
||||
free(visited);
|
||||
return false;
|
||||
}
|
||||
|
||||
for (int i = 0; i < group_count; i++) {
|
||||
best_starts[i] = (size_t)-1;
|
||||
best_ends[i] = (size_t)-1;
|
||||
}
|
||||
|
||||
for (size_t i = 0; i < current->count; i++) {
|
||||
if (current->threads[i].state->accepting) {
|
||||
matched = true;
|
||||
match_end = start_pos;
|
||||
memcpy(best_starts, current->threads[i].group_starts, group_count * sizeof(size_t));
|
||||
memcpy(best_ends, current->threads[i].group_ends, group_count * sizeof(size_t));
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
for (size_t pos = start_pos; pos < len; pos++) {
|
||||
char c = text[pos];
|
||||
thread_list_clear(next);
|
||||
|
||||
for (size_t i = 0; i < current->count; i++) {
|
||||
thread_t *thread = ¤t->threads[i];
|
||||
nfa_state_t *state = thread->state;
|
||||
|
||||
for (size_t j = 0; j < state->trans_count; j++) {
|
||||
transition_t *t = &state->transitions[j];
|
||||
|
||||
if (t->type != TRANS_EPSILON &&
|
||||
t->type != TRANS_GROUP_START &&
|
||||
t->type != TRANS_GROUP_END &&
|
||||
t->type != TRANS_ANCHOR_START &&
|
||||
t->type != TRANS_ANCHOR_END) {
|
||||
|
||||
if (transition_matches(t, c, pos, len)) {
|
||||
memset(visited, 0, num_states * sizeof(bool));
|
||||
follow_epsilons(next, t->target,
|
||||
thread->group_starts, thread->group_ends,
|
||||
pos + 1, len, visited);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (next->count == 0) break;
|
||||
|
||||
thread_list_t *tmp = current;
|
||||
current = next;
|
||||
next = tmp;
|
||||
|
||||
for (size_t i = 0; i < current->count; i++) {
|
||||
if (current->threads[i].state->accepting) {
|
||||
matched = true;
|
||||
match_end = pos + 1;
|
||||
memcpy(best_starts, current->threads[i].group_starts, group_count * sizeof(size_t));
|
||||
memcpy(best_ends, current->threads[i].group_ends, group_count * sizeof(size_t));
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (result) {
|
||||
result->matched = matched;
|
||||
result->match_start = start_pos;
|
||||
result->match_end = matched ? match_end : start_pos;
|
||||
result->group_count = nfa->group_count;
|
||||
|
||||
for (int i = 0; i < LOREG_MAX_GROUPS && i < nfa->group_count; i++) {
|
||||
result->groups[i].start = best_starts[i];
|
||||
result->groups[i].end = best_ends[i];
|
||||
result->groups[i].matched = (best_starts[i] != (size_t)-1 && best_ends[i] != (size_t)-1);
|
||||
}
|
||||
}
|
||||
|
||||
free(init_starts);
|
||||
free(init_ends);
|
||||
free(best_starts);
|
||||
free(best_ends);
|
||||
thread_list_free(current);
|
||||
thread_list_free(next);
|
||||
free(visited);
|
||||
|
||||
return matched;
|
||||
}
|
||||
|
||||
/*
 * Try nfa_match() at every offset of `text` from 0 up to and including its
 * length, and stop at the first offset that matches.
 *
 * On success `result` (if given) holds that match; on failure it is reset
 * to "no match" with zero offsets and zero groups.
 */
bool nfa_search(nfa_t *nfa, const char *text, loreg_match_t *result) {
    const size_t text_len = strlen(text);
    size_t offset = 0;

    /* The end-of-text offset is tried too, so empty matches there are found. */
    while (offset <= text_len) {
        if (nfa_match(nfa, text, offset, result)) {
            if (result != NULL) {
                result->match_start = offset;
            }
            return true;
        }
        offset++;
    }

    if (result != NULL) {
        result->matched = false;
        result->match_start = 0;
        result->match_end = 0;
        result->group_count = 0;
    }
    return false;
}
|
||||
477
src/nfa.c
Normal file
477
src/nfa.c
Normal file
@ -0,0 +1,477 @@
|
||||
/* retoor <retoor@molodetz.nl> */
|
||||
#include "nfa.h"
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
|
||||
nfa_t *nfa_create(void) {
|
||||
nfa_t *nfa = malloc(sizeof(nfa_t));
|
||||
if (!nfa) return NULL;
|
||||
nfa->states = NULL;
|
||||
nfa->state_count = 0;
|
||||
nfa->capacity = 0;
|
||||
nfa->start = NULL;
|
||||
nfa->group_count = 0;
|
||||
return nfa;
|
||||
}
|
||||
|
||||
/*
 * Release an NFA: each state's transition array, each state, the state
 * table and the NFA itself. Bracket classes referenced by transitions are
 * not freed here. NULL is a no-op.
 */
void nfa_free(nfa_t *nfa) {
    if (nfa == NULL) {
        return;
    }

    size_t idx = 0;
    while (idx < nfa->state_count) {
        nfa_state_t *doomed = nfa->states[idx];
        free(doomed->transitions);
        free(doomed);
        idx++;
    }

    free(nfa->states);
    free(nfa);
}
|
||||
|
||||
/*
 * Enlarge the state table: 16 slots initially, doubling afterwards, capped
 * at LOREG_MAX_STATES. Returns false when the cap has already been reached
 * or the reallocation fails.
 */
static bool nfa_grow(nfa_t *nfa) {
    size_t wanted;

    if (nfa->capacity == 0) {
        wanted = 16;
    } else {
        wanted = nfa->capacity * 2;
    }

    if (wanted > LOREG_MAX_STATES) {
        if (nfa->capacity >= LOREG_MAX_STATES) {
            return false;
        }
        wanted = LOREG_MAX_STATES;
    }

    nfa_state_t **resized = realloc(nfa->states, wanted * sizeof(nfa_state_t *));
    if (resized == NULL) {
        return false;
    }

    nfa->capacity = wanted;
    nfa->states = resized;
    return true;
}
|
||||
|
||||
/*
 * Create a non-accepting state without transitions, register it in `nfa`
 * and return it. Its id is its index in the state table. Returns NULL when
 * the table cannot grow or the state cannot be allocated.
 */
nfa_state_t *nfa_add_state(nfa_t *nfa) {
    if (nfa->state_count >= nfa->capacity && !nfa_grow(nfa)) {
        return NULL;
    }

    nfa_state_t *fresh = malloc(sizeof(nfa_state_t));
    if (fresh == NULL) {
        return NULL;
    }

    fresh->id = (int)nfa->state_count;
    fresh->accepting = false;
    fresh->transitions = NULL;
    fresh->trans_count = 0;
    fresh->trans_capacity = 0;

    nfa->states[nfa->state_count] = fresh;
    nfa->state_count++;
    return fresh;
}
|
||||
|
||||
/*
 * Enlarge a state's transition array: 4 slots initially, doubling
 * afterwards. Returns false if the reallocation fails.
 */
static bool transition_grow(nfa_state_t *state) {
    size_t wanted = 4;

    if (state->trans_capacity != 0) {
        wanted = state->trans_capacity * 2;
    }

    transition_t *resized = realloc(state->transitions, wanted * sizeof(transition_t));
    if (resized == NULL) {
        return false;
    }

    state->trans_capacity = wanted;
    state->transitions = resized;
    return true;
}
|
||||
|
||||
/*
 * Append a transition of kind `type` from `from` to `to`, carrying `value`
 * as its character payload. The bracket pointer is NULL and the group id
 * -1. If the transition array cannot grow the transition is silently not
 * added.
 */
void nfa_add_transition(nfa_state_t *from, nfa_state_t *to, transition_type_t type, char value) {
    if (from->trans_count >= from->trans_capacity && !transition_grow(from)) {
        return;
    }

    transition_t *edge = &from->transitions[from->trans_count];
    from->trans_count++;

    edge->target = to;
    edge->type = type;
    edge->value = value;
    edge->bracket = NULL;
    edge->group_id = -1;
}
|
||||
|
||||
/*
 * Append a TRANS_BRACKET transition from `from` to `to` that refers to
 * `bracket` (the pointer is stored, not copied). If the transition array
 * cannot grow the transition is silently not added.
 */
void nfa_add_bracket_transition(nfa_state_t *from, nfa_state_t *to, bracket_class_t *bracket) {
    if (from->trans_count >= from->trans_capacity && !transition_grow(from)) {
        return;
    }

    transition_t *edge = &from->transitions[from->trans_count];
    from->trans_count++;

    edge->target = to;
    edge->type = TRANS_BRACKET;
    edge->bracket = bracket;
    edge->value = '\0';
    edge->group_id = -1;
}
|
||||
|
||||
/*
 * Append a group-boundary transition of kind `type` from `from` to `to`,
 * tagged with `group_id`. If the transition array cannot grow the
 * transition is silently not added.
 */
void nfa_add_group_transition(nfa_state_t *from, nfa_state_t *to, transition_type_t type, int group_id) {
    if (from->trans_count >= from->trans_capacity && !transition_grow(from)) {
        return;
    }

    transition_t *edge = &from->transitions[from->trans_count];
    from->trans_count++;

    edge->target = to;
    edge->type = type;
    edge->group_id = group_id;
    edge->value = '\0';
    edge->bracket = NULL;
}
|
||||
|
||||
static nfa_fragment_t build_nfa(nfa_t *nfa, ast_node_t *ast, loreg_error_t *error);
|
||||
|
||||
/*
 * Fragment for a literal character: two fresh states joined by a
 * TRANS_CHAR edge on `c`. On allocation failure *error is set to
 * LOREG_ERR_OUT_OF_MEMORY and an empty fragment is returned.
 */
static nfa_fragment_t build_char(nfa_t *nfa, char c, loreg_error_t *error) {
    nfa_fragment_t frag = {NULL, NULL};
    nfa_state_t *entry = nfa_add_state(nfa);
    nfa_state_t *leave = nfa_add_state(nfa);

    if (entry == NULL || leave == NULL) {
        *error = LOREG_ERR_OUT_OF_MEMORY;
    } else {
        nfa_add_transition(entry, leave, TRANS_CHAR, c);
        frag.start = entry;
        frag.accept = leave;
    }
    return frag;
}
|
||||
|
||||
/*
 * Fragment for `.`: two fresh states joined by a TRANS_DOT edge. On
 * allocation failure *error is set to LOREG_ERR_OUT_OF_MEMORY and an empty
 * fragment is returned.
 */
static nfa_fragment_t build_dot(nfa_t *nfa, loreg_error_t *error) {
    nfa_fragment_t frag = {NULL, NULL};
    nfa_state_t *entry = nfa_add_state(nfa);
    nfa_state_t *leave = nfa_add_state(nfa);

    if (entry == NULL || leave == NULL) {
        *error = LOREG_ERR_OUT_OF_MEMORY;
    } else {
        nfa_add_transition(entry, leave, TRANS_DOT, '\0');
        frag.start = entry;
        frag.accept = leave;
    }
    return frag;
}
|
||||
|
||||
/*
 * Fragment for a character-class escape: two fresh states joined by an
 * edge of the given class `type`. On allocation failure *error is set to
 * LOREG_ERR_OUT_OF_MEMORY and an empty fragment is returned.
 */
static nfa_fragment_t build_class(nfa_t *nfa, transition_type_t type, loreg_error_t *error) {
    nfa_fragment_t frag = {NULL, NULL};
    nfa_state_t *entry = nfa_add_state(nfa);
    nfa_state_t *leave = nfa_add_state(nfa);

    if (entry == NULL || leave == NULL) {
        *error = LOREG_ERR_OUT_OF_MEMORY;
    } else {
        nfa_add_transition(entry, leave, type, '\0');
        frag.start = entry;
        frag.accept = leave;
    }
    return frag;
}
|
||||
|
||||
/*
 * Fragment for a bracket expression: two fresh states joined by a bracket
 * edge that refers to `bracket`. On allocation failure *error is set to
 * LOREG_ERR_OUT_OF_MEMORY and an empty fragment is returned.
 */
static nfa_fragment_t build_bracket(nfa_t *nfa, bracket_class_t *bracket, loreg_error_t *error) {
    nfa_fragment_t frag = {NULL, NULL};
    nfa_state_t *entry = nfa_add_state(nfa);
    nfa_state_t *leave = nfa_add_state(nfa);

    if (entry == NULL || leave == NULL) {
        *error = LOREG_ERR_OUT_OF_MEMORY;
    } else {
        nfa_add_bracket_transition(entry, leave, bracket);
        frag.start = entry;
        frag.accept = leave;
    }
    return frag;
}
|
||||
|
||||
/*
 * Fragment for `left` followed by `right`: both sub-fragments are built
 * (left first) and the left accept state is linked to the right start
 * state by an epsilon edge. Returns an empty fragment as soon as either
 * sub-build reports an error.
 */
static nfa_fragment_t build_concat(nfa_t *nfa, ast_node_t *left, ast_node_t *right, loreg_error_t *error) {
    nfa_fragment_t frag = {NULL, NULL};

    nfa_fragment_t head = build_nfa(nfa, left, error);
    if (*error != LOREG_OK) {
        return frag;
    }

    nfa_fragment_t tail = build_nfa(nfa, right, error);
    if (*error != LOREG_OK) {
        return frag;
    }

    nfa_add_transition(head.accept, tail.start, TRANS_EPSILON, '\0');

    frag.accept = tail.accept;
    frag.start = head.start;
    return frag;
}
|
||||
|
||||
/*
 * Fragment for `left | right`: a new start state with epsilon edges into
 * both alternatives (left first) and a new accept state both alternatives
 * lead to. Returns an empty fragment on allocation failure
 * (LOREG_ERR_OUT_OF_MEMORY) or when a sub-build reports an error.
 */
static nfa_fragment_t build_alter(nfa_t *nfa, ast_node_t *left, ast_node_t *right, loreg_error_t *error) {
    nfa_fragment_t frag = {NULL, NULL};
    nfa_state_t *fork = nfa_add_state(nfa);
    nfa_state_t *join = nfa_add_state(nfa);

    if (fork == NULL || join == NULL) {
        *error = LOREG_ERR_OUT_OF_MEMORY;
        return frag;
    }

    nfa_fragment_t first = build_nfa(nfa, left, error);
    if (*error != LOREG_OK) {
        return frag;
    }

    nfa_fragment_t second = build_nfa(nfa, right, error);
    if (*error != LOREG_OK) {
        return frag;
    }

    /* Edge order on `fork` fixes which alternative is explored first. */
    nfa_add_transition(fork, first.start, TRANS_EPSILON, '\0');
    nfa_add_transition(fork, second.start, TRANS_EPSILON, '\0');
    nfa_add_transition(first.accept, join, TRANS_EPSILON, '\0');
    nfa_add_transition(second.accept, join, TRANS_EPSILON, '\0');

    frag.start = fork;
    frag.accept = join;
    return frag;
}
|
||||
|
||||
/*
 * Fragment for `child*`: a new start state that can either enter the child
 * or skip to a new accept state, plus a loop from the child's accept state
 * back to its start. `greedy` decides whether the "enter" or the "skip"
 * edge is added first on the start state.
 *
 * NOTE(review): the two edges leaving the child's accept state are added
 * in the same order for greedy and lazy; build_plus swaps them -- confirm
 * this asymmetry is intended.
 *
 * Returns an empty fragment on allocation failure
 * (LOREG_ERR_OUT_OF_MEMORY) or when the child build reports an error.
 */
static nfa_fragment_t build_star(nfa_t *nfa, ast_node_t *child, bool greedy, loreg_error_t *error) {
    nfa_fragment_t frag = {NULL, NULL};
    nfa_state_t *entry = nfa_add_state(nfa);
    nfa_state_t *leave = nfa_add_state(nfa);

    if (entry == NULL || leave == NULL) {
        *error = LOREG_ERR_OUT_OF_MEMORY;
        return frag;
    }

    nfa_fragment_t body = build_nfa(nfa, child, error);
    if (*error != LOREG_OK) {
        return frag;
    }

    /* First edge added is the preferred one. */
    nfa_state_t *preferred = greedy ? body.start : leave;
    nfa_state_t *fallback = greedy ? leave : body.start;
    nfa_add_transition(entry, preferred, TRANS_EPSILON, '\0');
    nfa_add_transition(entry, fallback, TRANS_EPSILON, '\0');

    nfa_add_transition(body.accept, body.start, TRANS_EPSILON, '\0');
    nfa_add_transition(body.accept, leave, TRANS_EPSILON, '\0');

    frag.start = entry;
    frag.accept = leave;
    return frag;
}
|
||||
|
||||
/*
 * Fragment for `child+`: the child must be traversed once; from its accept
 * state one edge loops back to its start and one leads to a new accept
 * state. `greedy` decides whether the loop edge or the exit edge is added
 * first. The fragment starts at the child's own start state.
 *
 * Returns an empty fragment on allocation failure
 * (LOREG_ERR_OUT_OF_MEMORY) or when the child build reports an error.
 */
static nfa_fragment_t build_plus(nfa_t *nfa, ast_node_t *child, bool greedy, loreg_error_t *error) {
    nfa_fragment_t frag = {NULL, NULL};
    nfa_state_t *leave = nfa_add_state(nfa);

    if (leave == NULL) {
        *error = LOREG_ERR_OUT_OF_MEMORY;
        return frag;
    }

    nfa_fragment_t body = build_nfa(nfa, child, error);
    if (*error != LOREG_OK) {
        return frag;
    }

    /* First edge added is the preferred one. */
    nfa_state_t *preferred = greedy ? body.start : leave;
    nfa_state_t *fallback = greedy ? leave : body.start;
    nfa_add_transition(body.accept, preferred, TRANS_EPSILON, '\0');
    nfa_add_transition(body.accept, fallback, TRANS_EPSILON, '\0');

    frag.start = body.start;
    frag.accept = leave;
    return frag;
}
|
||||
|
||||
/*
 * Fragment for `child?`: a new start state that can either enter the child
 * or skip straight to a new accept state; the child's accept state also
 * leads to that accept state. `greedy` decides whether the "enter" or the
 * "skip" edge is added first.
 *
 * Returns an empty fragment on allocation failure
 * (LOREG_ERR_OUT_OF_MEMORY) or when the child build reports an error.
 */
static nfa_fragment_t build_question(nfa_t *nfa, ast_node_t *child, bool greedy, loreg_error_t *error) {
    nfa_fragment_t frag = {NULL, NULL};
    nfa_state_t *entry = nfa_add_state(nfa);
    nfa_state_t *leave = nfa_add_state(nfa);

    if (entry == NULL || leave == NULL) {
        *error = LOREG_ERR_OUT_OF_MEMORY;
        return frag;
    }

    nfa_fragment_t body = build_nfa(nfa, child, error);
    if (*error != LOREG_OK) {
        return frag;
    }

    /* First edge added is the preferred one. */
    nfa_state_t *preferred = greedy ? body.start : leave;
    nfa_state_t *fallback = greedy ? leave : body.start;
    nfa_add_transition(entry, preferred, TRANS_EPSILON, '\0');
    nfa_add_transition(entry, fallback, TRANS_EPSILON, '\0');

    nfa_add_transition(body.accept, leave, TRANS_EPSILON, '\0');

    frag.start = entry;
    frag.accept = leave;
    return frag;
}
|
||||
|
||||
/*
 * Fragment for a capture group: the child fragment is wrapped between a
 * TRANS_GROUP_START edge and a TRANS_GROUP_END edge, both tagged with
 * `group_id`. nfa->group_count is raised to group_id + 1 if it is smaller.
 *
 * Returns an empty fragment on allocation failure
 * (LOREG_ERR_OUT_OF_MEMORY) or when the child build reports an error.
 */
static nfa_fragment_t build_group(nfa_t *nfa, ast_node_t *child, int group_id, loreg_error_t *error) {
    nfa_fragment_t frag = {NULL, NULL};
    nfa_state_t *open = nfa_add_state(nfa);
    nfa_state_t *close = nfa_add_state(nfa);

    if (open == NULL || close == NULL) {
        *error = LOREG_ERR_OUT_OF_MEMORY;
        return frag;
    }

    nfa_fragment_t body = build_nfa(nfa, child, error);
    if (*error != LOREG_OK) {
        return frag;
    }

    nfa_add_group_transition(open, body.start, TRANS_GROUP_START, group_id);
    nfa_add_group_transition(body.accept, close, TRANS_GROUP_END, group_id);

    /* Keep the NFA's group count large enough to index this group. */
    const int needed = group_id + 1;
    if (needed > nfa->group_count) {
        nfa->group_count = needed;
    }

    frag.start = open;
    frag.accept = close;
    return frag;
}
|
||||
|
||||
/*
 * Builds a two-state fragment joined by a single transition of kind `type`
 * (called from build_nfa with TRANS_ANCHOR_START or TRANS_ANCHOR_END).
 *
 * On allocation failure *error is set to LOREG_ERR_OUT_OF_MEMORY and the
 * returned fragment is {NULL, NULL}.
 */
static nfa_fragment_t build_anchor(nfa_t *nfa, transition_type_t type, loreg_error_t *error) {
    nfa_fragment_t result = {NULL, NULL};
    nfa_state_t *before = nfa_add_state(nfa);
    nfa_state_t *after = nfa_add_state(nfa);

    if (before == NULL || after == NULL) {
        *error = LOREG_ERR_OUT_OF_MEMORY;
        return result;
    }

    nfa_add_transition(before, after, type, '\0');

    result.start = before;
    result.accept = after;
    return result;
}
|
||||
|
||||
/*
 * Builds the NFA fragment for a counted repetition of `child`.
 *
 *   min, max  repetition bounds; a negative `max` means "no upper bound".
 *             min == 0 && max == 0 yields a single state that is both start
 *             and accept (matches the empty string).
 *   greedy    decides the insertion order of the "repeat again" and "stop"
 *             epsilon edges: repeat first when greedy, stop first otherwise.
 *
 * The child is instantiated once per mandatory repetition and once more per
 * optional repetition (or once for the unbounded loop), because each copy
 * needs its own states.
 *
 * A non-negative `max` smaller than `min` is not rejected here; it behaves
 * like exactly `min` repetitions because the optional loop never runs.
 *
 * On allocation failure *error is set to LOREG_ERR_OUT_OF_MEMORY; on any
 * failure the returned fragment is {NULL, NULL}.
 */
static nfa_fragment_t build_quantifier(nfa_t *nfa, ast_node_t *child, int min, int max, bool greedy, loreg_error_t *error) {
    nfa_fragment_t frag = {NULL, NULL};

    if (min == 0 && max == 0) {
        nfa_state_t *state = nfa_add_state(nfa);
        if (!state) {
            *error = LOREG_ERR_OUT_OF_MEMORY;
            return frag;
        }
        frag.start = state;
        frag.accept = state;
        return frag;
    }

    nfa_state_t *start = nfa_add_state(nfa);
    if (!start) {
        *error = LOREG_ERR_OUT_OF_MEMORY;
        return frag;
    }

    /* Mandatory part: `min` copies of the child chained by epsilon edges. */
    nfa_state_t *current = start;
    for (int i = 0; i < min; i++) {
        nfa_fragment_t rep = build_nfa(nfa, child, error);
        if (*error != LOREG_OK) return frag;
        nfa_add_transition(current, rep.start, TRANS_EPSILON, '\0');
        current = rep.accept;
    }

    if (max < 0) {
        /* Unbounded tail: a loop state that either re-enters the child or exits. */
        nfa_state_t *loop_start = nfa_add_state(nfa);
        nfa_state_t *accept = nfa_add_state(nfa);
        if (!loop_start || !accept) {
            *error = LOREG_ERR_OUT_OF_MEMORY;
            return frag;
        }

        nfa_add_transition(current, loop_start, TRANS_EPSILON, '\0');

        nfa_fragment_t rep = build_nfa(nfa, child, error);
        if (*error != LOREG_OK) return frag;

        if (greedy) {
            nfa_add_transition(loop_start, rep.start, TRANS_EPSILON, '\0');
            nfa_add_transition(loop_start, accept, TRANS_EPSILON, '\0');
        } else {
            nfa_add_transition(loop_start, accept, TRANS_EPSILON, '\0');
            nfa_add_transition(loop_start, rep.start, TRANS_EPSILON, '\0');
        }
        nfa_add_transition(rep.accept, loop_start, TRANS_EPSILON, '\0');

        frag.start = start;
        frag.accept = accept;
    } else {
        /* Bounded tail: up to (max - min) further optional copies. */
        nfa_state_t *accept = nfa_add_state(nfa);
        if (!accept) {
            *error = LOREG_ERR_OUT_OF_MEMORY;
            return frag;
        }

        for (int i = min; i < max; i++) {
            nfa_fragment_t rep = build_nfa(nfa, child, error);
            if (*error != LOREG_OK) return frag;

            /*
             * Each decision point gets exactly two edges, ordered the same way
             * as in build_question and in the unbounded branch above: greedy
             * lists the extra repetition first, lazy lists the exit first.
             */
            if (greedy) {
                nfa_add_transition(current, rep.start, TRANS_EPSILON, '\0');
                nfa_add_transition(current, accept, TRANS_EPSILON, '\0');
            } else {
                nfa_add_transition(current, accept, TRANS_EPSILON, '\0');
                nfa_add_transition(current, rep.start, TRANS_EPSILON, '\0');
            }
            current = rep.accept;
        }

        /* After the last copy (or with no optional copies) the only way on is out. */
        nfa_add_transition(current, accept, TRANS_EPSILON, '\0');

        frag.start = start;
        frag.accept = accept;
    }

    return frag;
}
|
||||
|
||||
/*
 * Recursively converts an AST node into an NFA fragment by dispatching on
 * ast->type to the matching build_* helper.
 *
 * A NULL `ast` produces a single state that is both start and accept.
 * If ast->type matches none of the cases below, {NULL, NULL} is returned and
 * *error is left untouched.
 */
static nfa_fragment_t build_nfa(nfa_t *nfa, ast_node_t *ast, loreg_error_t *error) {
    nfa_fragment_t out = {NULL, NULL};

    if (ast == NULL) {
        nfa_state_t *only = nfa_add_state(nfa);
        if (only == NULL) {
            *error = LOREG_ERR_OUT_OF_MEMORY;
        } else {
            out.start = only;
            out.accept = only;
        }
        return out;
    }

    switch (ast->type) {
        /* single-character matchers */
        case AST_CHAR:
            out = build_char(nfa, ast->value, error);
            break;
        case AST_DOT:
            out = build_dot(nfa, error);
            break;
        case AST_BRACKET:
            out = build_bracket(nfa, ast->bracket, error);
            break;

        /* binary combinators */
        case AST_CONCAT:
            out = build_concat(nfa, ast->left, ast->right, error);
            break;
        case AST_ALTER:
            out = build_alter(nfa, ast->left, ast->right, error);
            break;

        /* repetition */
        case AST_STAR:
            out = build_star(nfa, ast->left, ast->quant.greedy, error);
            break;
        case AST_PLUS:
            out = build_plus(nfa, ast->left, ast->quant.greedy, error);
            break;
        case AST_QUESTION:
            out = build_question(nfa, ast->left, ast->quant.greedy, error);
            break;
        case AST_QUANTIFIER:
            out = build_quantifier(nfa, ast->left, ast->quant.min, ast->quant.max, ast->quant.greedy, error);
            break;

        /* grouping and anchors */
        case AST_GROUP:
            out = build_group(nfa, ast->left, ast->group_id, error);
            break;
        case AST_ANCHOR_START:
            out = build_anchor(nfa, TRANS_ANCHOR_START, error);
            break;
        case AST_ANCHOR_END:
            out = build_anchor(nfa, TRANS_ANCHOR_END, error);
            break;

        /* predefined character classes */
        case AST_CLASS_DIGIT:
            out = build_class(nfa, TRANS_CLASS_DIGIT, error);
            break;
        case AST_CLASS_WORD:
            out = build_class(nfa, TRANS_CLASS_WORD, error);
            break;
        case AST_CLASS_SPACE:
            out = build_class(nfa, TRANS_CLASS_SPACE, error);
            break;
        case AST_CLASS_NDIGIT:
            out = build_class(nfa, TRANS_CLASS_NDIGIT, error);
            break;
        case AST_CLASS_NWORD:
            out = build_class(nfa, TRANS_CLASS_NWORD, error);
            break;
        case AST_CLASS_NSPACE:
            out = build_class(nfa, TRANS_CLASS_NSPACE, error);
            break;
    }

    return out;
}
|
||||
|
||||
/*
 * Compiles an AST into a freshly created NFA.
 *
 *   ast    root of the parsed pattern; NULL is passed through to build_nfa,
 *          which turns it into a single start/accept state.
 *   error  out-parameter; set to LOREG_OK on entry and to the failure code
 *          when NULL is returned.
 *
 * Returns the new NFA with its start state set and the fragment's accept
 * state marked as accepting, or NULL on failure (the partially built NFA is
 * released with nfa_free).
 */
nfa_t *nfa_from_ast(ast_node_t *ast, loreg_error_t *error) {
    *error = LOREG_OK;

    nfa_t *nfa = nfa_create();
    if (!nfa) {
        *error = LOREG_ERR_OUT_OF_MEMORY;
        return NULL;
    }

    nfa_fragment_t frag = build_nfa(nfa, ast, error);

    /*
     * build_nfa returns {NULL, NULL} without touching *error when the node
     * type matches none of its cases; report that instead of dereferencing
     * a NULL accept state below.
     */
    if (*error == LOREG_OK && (!frag.start || !frag.accept)) {
        *error = LOREG_ERR_INVALID_PATTERN;
    }

    if (*error != LOREG_OK) {
        nfa_free(nfa);
        return NULL;
    }

    nfa->start = frag.start;
    frag.accept->accepting = true;

    return nfa;
}
|
||||
309
src/parser.c
Normal file
309
src/parser.c
Normal file
@ -0,0 +1,309 @@
|
||||
/* retoor <retoor@molodetz.nl> */
|
||||
#include "parser.h"
#include <ctype.h>
#include <limits.h>
#include <stdlib.h>
|
||||
|
||||
/*
 * Replaces the parser's current token with the next one produced by its
 * embedded lexer.
 */
static void parser_advance(parser_t *parser)
{
    parser->current = lexer_next(&parser->lexer);
}
|
||||
|
||||
/*
 * Prepares `parser` to parse `pattern`: clears the error and the group
 * counter, initialises the embedded lexer on the pattern text and loads the
 * first token into parser->current.
 */
void parser_init(parser_t *parser, const char *pattern)
{
    parser->error = LOREG_OK;
    parser->group_count = 0;

    lexer_init(&parser->lexer, pattern);
    parser_advance(parser);   /* prime parser->current with the first token */
}
|
||||
|
||||
/*
 * Returns the error code currently stored in the parser (LOREG_OK after
 * parser_init until a parse routine records a failure).
 */
loreg_error_t parser_get_error(parser_t *parser)
{
    const loreg_error_t status = parser->error;
    return status;
}
|
||||
|
||||
static ast_node_t *parse_expr(parser_t *parser);
|
||||
static ast_node_t *parse_term(parser_t *parser);
|
||||
static ast_node_t *parse_factor(parser_t *parser);
|
||||
static ast_node_t *parse_atom(parser_t *parser);
|
||||
static ast_node_t *parse_bracket(parser_t *parser);
|
||||
static int parse_number(parser_t *parser);
|
||||
|
||||
/*
 * expr := term ('|' term)*
 *
 * Parses a left-associative chain of alternatives and returns the resulting
 * AST, or NULL when there is nothing to parse or an error occurred
 * (parser->error tells the two apart).
 *
 * An empty alternative (a '|' with nothing before or after it) is reported
 * as LOREG_ERR_INVALID_PATTERN.  Previously it produced NULL with no error,
 * so a pattern such as "a|" discarded the already parsed left side and was
 * indistinguishable from an empty pattern.
 *
 * NOTE(review): when ast_create_alter fails, `left`/`right` are not freed
 * here because it is not visible whether ast_create_alter releases its
 * operands on failure - confirm against ast.c.
 */
static ast_node_t *parse_expr(parser_t *parser) {
    ast_node_t *left = parse_term(parser);
    if (parser->error != LOREG_OK) return left;

    if (!left) {
        /* Nothing before a '|' means the first alternative is empty. */
        if (parser->current.type == TOKEN_PIPE) {
            parser->error = LOREG_ERR_INVALID_PATTERN;
        }
        return NULL;
    }

    while (parser->current.type == TOKEN_PIPE) {
        parser_advance(parser);

        ast_node_t *right = parse_term(parser);
        if (!right) {
            /* parse_term returns NULL both on error and on an empty term. */
            if (parser->error == LOREG_OK) {
                parser->error = LOREG_ERR_INVALID_PATTERN;
            }
            ast_free(left);
            return NULL;
        }

        left = ast_create_alter(left, right);
        if (!left) {
            parser->error = LOREG_ERR_OUT_OF_MEMORY;
            return NULL;
        }
    }

    return left;
}
|
||||
|
||||
/*
 * term := factor*
 *
 * Parses consecutive factors until end of input, '|' or ')' and folds them
 * into a left-leaning chain of concatenation nodes.  Returns NULL when no
 * factor was parsed or when a factor failed (anything accumulated so far is
 * freed in that case).
 */
static ast_node_t *parse_term(parser_t *parser) {
    ast_node_t *sequence = NULL;

    for (;;) {
        const token_type_t kind = parser->current.type;
        if (kind == TOKEN_EOF || kind == TOKEN_PIPE || kind == TOKEN_RPAREN) {
            break;
        }

        ast_node_t *piece = parse_factor(parser);
        if (piece == NULL) {
            ast_free(sequence);
            return NULL;
        }

        if (sequence == NULL) {
            /* first factor: nothing to concatenate with yet */
            sequence = piece;
            continue;
        }

        sequence = ast_create_concat(sequence, piece);
        if (sequence == NULL) {
            parser->error = LOREG_ERR_OUT_OF_MEMORY;
            return NULL;
        }
    }

    return sequence;
}
|
||||
|
||||
/*
 * factor := atom quantifier*
 * quantifier := ('*' | '+' | '?' | '{' n [',' [m]] '}') ['?']
 *
 * Parses one atom followed by any number of stacked quantifiers.  A '?'
 * directly after a quantifier marks it as lazy (greedy == false).
 *
 * Brace quantifiers: "{n}" means exactly n, "{n,}" means n or more (max is
 * passed as -1) and "{n,m}" means n to m.  LOREG_ERR_INVALID_QUANTIFIER is
 * reported when the closing '}' is missing or when m < n.
 *
 * Fixes over the previous version:
 *   - the lazy '?' is only accepted after the complete quantifier; it used
 *     to be consumed right after '{' as well, so "a{?3}" parsed as a lazy
 *     "{3}";
 *   - "{n,m}" with m < n is rejected instead of behaving like "{n}";
 *   - an error recorded by parse_number aborts the factor.
 *
 * Returns the resulting node, or NULL on error (the atom is freed on the
 * quantifier-syntax error paths).
 */
static ast_node_t *parse_factor(parser_t *parser) {
    ast_node_t *atom = parse_atom(parser);
    if (!atom || parser->error != LOREG_OK) return atom;

    while (parser->current.type == TOKEN_STAR ||
           parser->current.type == TOKEN_PLUS ||
           parser->current.type == TOKEN_QUESTION ||
           parser->current.type == TOKEN_LBRACE) {

        token_type_t quant_type = parser->current.type;
        bool greedy = true;
        parser_advance(parser);

        if (quant_type == TOKEN_LBRACE) {
            int min = parse_number(parser);
            int max = min;

            /* The lexer delivers ',' as an ordinary character token. */
            if (parser->current.type == TOKEN_CHAR && parser->current.value == ',') {
                parser_advance(parser);
                if (parser->current.type == TOKEN_RBRACE) {
                    max = -1;   /* "{n,}": no upper bound */
                } else {
                    max = parse_number(parser);
                }
            }

            if (parser->error != LOREG_OK) {
                ast_free(atom);
                return NULL;
            }

            if (parser->current.type != TOKEN_RBRACE || (max >= 0 && max < min)) {
                parser->error = LOREG_ERR_INVALID_QUANTIFIER;
                ast_free(atom);
                return NULL;
            }
            parser_advance(parser);

            if (parser->current.type == TOKEN_QUESTION) {
                greedy = false;
                parser_advance(parser);
            }

            atom = ast_create_quantifier(atom, min, max, greedy);
        } else {
            if (parser->current.type == TOKEN_QUESTION) {
                greedy = false;
                parser_advance(parser);
            }

            if (quant_type == TOKEN_STAR) {
                atom = ast_create_star(atom, greedy);
            } else if (quant_type == TOKEN_PLUS) {
                atom = ast_create_plus(atom, greedy);
            } else {
                atom = ast_create_question(atom, greedy);
            }
        }

        if (!atom) {
            parser->error = LOREG_ERR_OUT_OF_MEMORY;
            return NULL;
        }
    }

    return atom;
}
|
||||
|
||||
/*
 * Consumes a run of decimal digit tokens and returns their value.
 * Returns 0 when the current token is not a digit (no token is consumed).
 *
 * The token value is converted to unsigned char before calling isdigit(),
 * because passing a negative char value to the <ctype.h> functions is
 * undefined behaviour.
 *
 * A value that does not fit in an int saturates at INT_MAX and records
 * LOREG_ERR_INVALID_QUANTIFIER in parser->error instead of overflowing a
 * signed integer; the remaining digits are still consumed.
 */
static int parse_number(parser_t *parser) {
    int num = 0;

    while (parser->current.type == TOKEN_CHAR &&
           isdigit((unsigned char)parser->current.value)) {
        int digit = parser->current.value - '0';

        if (num > (INT_MAX - digit) / 10) {
            /* num * 10 + digit would exceed INT_MAX */
            num = INT_MAX;
            parser->error = LOREG_ERR_INVALID_QUANTIFIER;
        } else {
            num = num * 10 + digit;
        }
        parser_advance(parser);
    }

    return num;
}
|
||||
|
||||
/*
 * atom := char | '.' | '^' | '$' | '(' expr ')' | bracket | class-escape
 *
 * Parses a single atom starting at the current token and returns its node.
 * Returns NULL without setting an error for end of input, '|' and ')';
 * any other unexpected token yields LOREG_ERR_INVALID_PATTERN.  A NULL node
 * from a constructor with no error recorded is reported as
 * LOREG_ERR_OUT_OF_MEMORY.
 *
 * Groups are numbered in the order their '(' is encountered.
 *
 * Fix over the previous version: an error raised while parsing the inside of
 * a group is now propagated unchanged.  Before, it was either overwritten
 * with LOREG_ERR_UNBALANCED_PAREN or, if the failing token happened to be
 * followed by ')', a group node was built despite the pending error.
 */
static ast_node_t *parse_atom(parser_t *parser) {
    ast_node_t *node = NULL;

    switch (parser->current.type) {
        case TOKEN_CHAR:
            node = ast_create_char(parser->current.value);
            parser_advance(parser);
            break;

        case TOKEN_DOT:
            node = ast_create_dot();
            parser_advance(parser);
            break;

        case TOKEN_CARET:
            node = ast_create_anchor_start();
            parser_advance(parser);
            break;

        case TOKEN_DOLLAR:
            node = ast_create_anchor_end();
            parser_advance(parser);
            break;

        case TOKEN_LPAREN: {
            parser_advance(parser);
            int group_id = parser->group_count++;
            ast_node_t *inner = parse_expr(parser);

            /* Keep the more specific error from inside the group. */
            if (parser->error != LOREG_OK) {
                ast_free(inner);
                return NULL;
            }

            if (parser->current.type != TOKEN_RPAREN) {
                parser->error = LOREG_ERR_UNBALANCED_PAREN;
                ast_free(inner);
                return NULL;
            }
            parser_advance(parser);

            /* `inner` may be NULL here for an empty group "()". */
            node = ast_create_group(inner, group_id);
            break;
        }

        case TOKEN_LBRACKET:
            node = parse_bracket(parser);
            break;

        case TOKEN_CLASS_DIGIT:
            node = ast_create_class(AST_CLASS_DIGIT);
            parser_advance(parser);
            break;

        case TOKEN_CLASS_WORD:
            node = ast_create_class(AST_CLASS_WORD);
            parser_advance(parser);
            break;

        case TOKEN_CLASS_SPACE:
            node = ast_create_class(AST_CLASS_SPACE);
            parser_advance(parser);
            break;

        case TOKEN_CLASS_NDIGIT:
            node = ast_create_class(AST_CLASS_NDIGIT);
            parser_advance(parser);
            break;

        case TOKEN_CLASS_NWORD:
            node = ast_create_class(AST_CLASS_NWORD);
            parser_advance(parser);
            break;

        case TOKEN_CLASS_NSPACE:
            node = ast_create_class(AST_CLASS_NSPACE);
            parser_advance(parser);
            break;

        case TOKEN_EOF:
        case TOKEN_PIPE:
        case TOKEN_RPAREN:
            return NULL;

        default:
            parser->error = LOREG_ERR_INVALID_PATTERN;
            return NULL;
    }

    if (!node && parser->error == LOREG_OK) {
        parser->error = LOREG_ERR_OUT_OF_MEMORY;
    }
    return node;
}
|
||||
|
||||
/*
 * Parses a bracket expression; the current token is the opening '['.
 *
 * Supported content: a leading '^' for negation, single characters, ranges
 * "a-z", a trailing "x-" (adds both 'x' and '-'), and the class escapes
 * \d, \w and \s, which are expanded into explicit ranges/characters.
 *
 * The negated escapes \D, \W and \S are rejected with
 * LOREG_ERR_INVALID_PATTERN.  They used to be consumed and silently ignored,
 * so "[\D]" compiled to a class that did not contain what was written.
 *
 * A missing ']' yields LOREG_ERR_INVALID_PATTERN; failure to allocate the
 * class yields LOREG_ERR_OUT_OF_MEMORY.  Returns the bracket AST node, or
 * NULL on error.
 *
 * NOTE(review): range endpoints are handed to bracket_add_range as-is; a
 * reversed range ("z-a") or a class escape used as an endpoint is not
 * validated here - confirm how bracket_add_range treats them.
 * NOTE(review): if ast_create_bracket fails, ownership of `bracket` is not
 * visible from here - confirm it is released.
 */
static ast_node_t *parse_bracket(parser_t *parser) {
    parser_advance(parser);   /* skip '[' */

    bracket_class_t *bracket = bracket_create();
    if (!bracket) {
        parser->error = LOREG_ERR_OUT_OF_MEMORY;
        return NULL;
    }

    if (parser->current.type == TOKEN_CARET) {
        bracket->negated = true;
        parser_advance(parser);
    }

    while (parser->current.type != TOKEN_RBRACKET && parser->current.type != TOKEN_EOF) {
        switch (parser->current.type) {
            case TOKEN_CLASS_DIGIT:
                bracket_add_range(bracket, '0', '9');
                parser_advance(parser);
                continue;

            case TOKEN_CLASS_WORD:
                bracket_add_range(bracket, 'a', 'z');
                bracket_add_range(bracket, 'A', 'Z');
                bracket_add_range(bracket, '0', '9');
                bracket_add_char(bracket, '_');
                parser_advance(parser);
                continue;

            case TOKEN_CLASS_SPACE:
                bracket_add_char(bracket, ' ');
                bracket_add_char(bracket, '\t');
                bracket_add_char(bracket, '\n');
                bracket_add_char(bracket, '\r');
                bracket_add_char(bracket, '\f');
                bracket_add_char(bracket, '\v');
                parser_advance(parser);
                continue;

            case TOKEN_CLASS_NDIGIT:
            case TOKEN_CLASS_NWORD:
            case TOKEN_CLASS_NSPACE:
                /* Not expressible with the expansions above: fail loudly. */
                bracket_free(bracket);
                parser->error = LOREG_ERR_INVALID_PATTERN;
                return NULL;

            default:
                break;
        }

        char start = parser->current.value;
        parser_advance(parser);

        if (parser->current.type == TOKEN_DASH) {
            parser_advance(parser);
            if (parser->current.type == TOKEN_RBRACKET || parser->current.type == TOKEN_EOF) {
                /* "x-]" : the dash is a literal */
                bracket_add_char(bracket, start);
                bracket_add_char(bracket, '-');
            } else {
                char end = parser->current.value;
                bracket_add_range(bracket, start, end);
                parser_advance(parser);
            }
        } else {
            bracket_add_char(bracket, start);
        }
    }

    if (parser->current.type != TOKEN_RBRACKET) {
        bracket_free(bracket);
        parser->error = LOREG_ERR_INVALID_PATTERN;
        return NULL;
    }
    parser_advance(parser);   /* skip ']' */

    return ast_create_bracket(bracket);
}
|
||||
|
||||
/*
 * Parses the whole pattern the parser was initialised with.
 *
 * Returns the AST root on success.  Returns NULL with parser->error still
 * LOREG_OK for an empty pattern, and NULL with parser->error set on failure.
 *
 * The whole input must be consumed: parse_expr stops at a ')' it does not
 * own, so a pattern such as "a)b" used to be accepted as just "a".  Left-over
 * input is now reported as LOREG_ERR_UNBALANCED_PAREN when it starts with
 * ')' and as LOREG_ERR_INVALID_PATTERN otherwise.  Any tree built before an
 * error is freed here so callers never receive a node together with an
 * error.
 */
ast_node_t *parser_parse(parser_t *parser) {
    if (parser->current.type == TOKEN_EOF) {
        return NULL;
    }

    ast_node_t *root = parse_expr(parser);

    if (parser->error == LOREG_OK && parser->current.type != TOKEN_EOF) {
        parser->error = (parser->current.type == TOKEN_RPAREN)
            ? LOREG_ERR_UNBALANCED_PAREN
            : LOREG_ERR_INVALID_PATTERN;
    }

    if (parser->error != LOREG_OK) {
        ast_free(root);
        return NULL;
    }

    return root;
}
|
||||
170
src/repl.c
Normal file
170
src/repl.c
Normal file
@ -0,0 +1,170 @@
|
||||
/* retoor <retoor@molodetz.nl> */
|
||||
#include "repl.h"
|
||||
#include "loreg.h"
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
|
||||
#define MAX_INPUT 4096
|
||||
|
||||
static void print_banner(void) {
|
||||
printf("loreg v%s - regex interpreter\n", LOREG_VERSION);
|
||||
printf("commands: :q quit, :h help, :p <pattern> set pattern, :m <text> match, :s <text> search\n\n");
|
||||
}
|
||||
|
||||
/*
 * Prints the REPL command reference followed by the supported regex syntax.
 * The text is kept in a table and written line by line to stdout.
 */
static void print_help(void)
{
    static const char *const help_lines[] = {
        "loreg REPL commands:\n",
        " :q quit\n",
        " :h show this help\n",
        " :p <regex> compile and set pattern\n",
        " :m <text> match text against pattern (anchored)\n",
        " :s <text> search for pattern in text\n",
        " <text> search for pattern in text\n\n",
        "regex syntax:\n",
        " . any character\n",
        " * zero or more\n",
        " + one or more\n",
        " ? zero or one\n",
        " | alternation\n",
        " () grouping\n",
        " [] character class\n",
        " [^] negated class\n",
        " ^ start anchor\n",
        " $ end anchor\n",
        " {n} exactly n\n",
        " {n,} n or more\n",
        " {n,m} n to m\n",
        " \\d digit\n",
        " \\w word character\n",
        " \\s whitespace\n",
        " \\D \\W \\S negated classes\n\n",
    };
    const size_t count = sizeof help_lines / sizeof help_lines[0];

    for (size_t idx = 0; idx < count; idx++) {
        fputs(help_lines[idx], stdout);
    }
}
|
||||
|
||||
/* Writes the characters text[from] .. text[to - 1] to stdout. */
static void repl_put_span(const char *text, size_t from, size_t to)
{
    for (size_t pos = from; pos < to; pos++) {
        putchar(text[pos]);
    }
}

/*
 * Prints the outcome of a match attempt on `text`.
 *
 * Without a match only "no match" is printed.  Otherwise the matched
 * substring and its [start-end] offsets are shown, followed by one line per
 * capture group that took part in the match.
 */
static void print_match(const char *text, loreg_match_t *result)
{
    if (!result->matched) {
        printf("no match\n");
        return;
    }

    printf("match: \"");
    repl_put_span(text, result->match_start, result->match_end);
    printf("\" [%zu-%zu]\n", result->match_start, result->match_end);

    for (size_t g = 0; g < result->group_count; g++) {
        if (!result->groups[g].matched) {
            continue;   /* group did not participate */
        }
        printf(" group %zu: \"", g);
        repl_put_span(text, result->groups[g].start, result->groups[g].end);
        printf("\" [%zu-%zu]\n", result->groups[g].start, result->groups[g].end);
    }
}
|
||||
|
||||
static char *read_line(void) {
|
||||
static char buffer[MAX_INPUT];
|
||||
printf("> ");
|
||||
fflush(stdout);
|
||||
|
||||
if (!fgets(buffer, MAX_INPUT, stdin)) {
|
||||
return NULL;
|
||||
}
|
||||
|
||||
size_t len = strlen(buffer);
|
||||
if (len > 0 && buffer[len - 1] == '\n') {
|
||||
buffer[len - 1] = '\0';
|
||||
}
|
||||
|
||||
return buffer;
|
||||
}
|
||||
|
||||
void repl_run(void) {
|
||||
print_banner();
|
||||
|
||||
loreg_regex_t *regex = NULL;
|
||||
char *line;
|
||||
|
||||
while ((line = read_line()) != NULL) {
|
||||
if (strlen(line) == 0) continue;
|
||||
|
||||
if (strcmp(line, ":q") == 0 || strcmp(line, ":quit") == 0) {
|
||||
break;
|
||||
}
|
||||
|
||||
if (strcmp(line, ":h") == 0 || strcmp(line, ":help") == 0) {
|
||||
print_help();
|
||||
continue;
|
||||
}
|
||||
|
||||
if (strncmp(line, ":p ", 3) == 0) {
|
||||
const char *pattern = line + 3;
|
||||
while (*pattern == ' ') pattern++;
|
||||
|
||||
if (regex) {
|
||||
loreg_free(regex);
|
||||
regex = NULL;
|
||||
}
|
||||
|
||||
loreg_error_t error;
|
||||
regex = loreg_compile(pattern, &error);
|
||||
if (!regex) {
|
||||
printf("error: %s\n", loreg_error_string(error));
|
||||
} else {
|
||||
printf("pattern compiled: %s\n", pattern);
|
||||
}
|
||||
continue;
|
||||
}
|
||||
|
||||
if (strncmp(line, ":m ", 3) == 0) {
|
||||
if (!regex) {
|
||||
printf("error: no pattern set (use :p <pattern>)\n");
|
||||
continue;
|
||||
}
|
||||
|
||||
const char *text = line + 3;
|
||||
while (*text == ' ') text++;
|
||||
|
||||
loreg_match_t result;
|
||||
loreg_match(regex, text, &result);
|
||||
print_match(text, &result);
|
||||
continue;
|
||||
}
|
||||
|
||||
if (strncmp(line, ":s ", 3) == 0) {
|
||||
if (!regex) {
|
||||
printf("error: no pattern set (use :p <pattern>)\n");
|
||||
continue;
|
||||
}
|
||||
|
||||
const char *text = line + 3;
|
||||
while (*text == ' ') text++;
|
||||
|
||||
loreg_match_t result;
|
||||
loreg_search(regex, text, &result);
|
||||
print_match(text, &result);
|
||||
continue;
|
||||
}
|
||||
|
||||
if (line[0] == ':') {
|
||||
printf("unknown command: %s\n", line);
|
||||
continue;
|
||||
}
|
||||
|
||||
if (!regex) {
|
||||
printf("error: no pattern set (use :p <pattern>)\n");
|
||||
continue;
|
||||
}
|
||||
|
||||
loreg_match_t result;
|
||||
loreg_search(regex, line, &result);
|
||||
print_match(line, &result);
|
||||
}
|
||||
|
||||
if (regex) {
|
||||
loreg_free(regex);
|
||||
}
|
||||
|
||||
printf("\n");
|
||||
}
|
||||
252
tests/test_all.c
Normal file
252
tests/test_all.c
Normal file
@ -0,0 +1,252 @@
|
||||
/* retoor <retoor@molodetz.nl> */
|
||||
#include "../include/loreg.h"
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
#include <time.h>
|
||||
|
||||
static int total_passed = 0;
|
||||
static int total_failed = 0;
|
||||
|
||||
/*
 * ASSERT(cond, msg): when `cond` is false, prints the failure message,
 * increments total_failed and returns from the enclosing test function.
 * Only usable inside functions returning void.
 */
#define ASSERT(cond, msg) do { \
    if (!(cond)) { \
        printf(" FAIL: %s\n", msg); \
        total_failed++; \
        return; \
    } \
} while(0)

/* TEST(name): opens the definition of the test function test_<name>. */
#define TEST(name) static void test_##name(void)

/*
 * RUN(name): executes test_<name> and counts it as passed only when it did
 * not record a failure.  The previous version incremented total_passed
 * unconditionally, so a failing test was counted as both failed and passed.
 */
#define RUN(name) do { \
    int failures_before_ = total_failed; \
    test_##name(); \
    if (total_failed == failures_before_) { \
        total_passed++; \
    } \
} while(0)
|
||||
|
||||
TEST(basic_literals) {
|
||||
loreg_error_t err;
|
||||
loreg_regex_t *re = loreg_compile("hello", &err);
|
||||
ASSERT(re != NULL, "compile hello");
|
||||
|
||||
loreg_match_t m;
|
||||
ASSERT(loreg_search(re, "hello", &m), "match hello");
|
||||
ASSERT(loreg_search(re, "say hello world", &m), "search hello");
|
||||
ASSERT(!loreg_search(re, "helo", &m), "no match helo");
|
||||
|
||||
loreg_free(re);
|
||||
}
|
||||
|
||||
TEST(metacharacters) {
|
||||
loreg_error_t err;
|
||||
loreg_match_t m;
|
||||
|
||||
loreg_regex_t *re = loreg_compile("a.c", &err);
|
||||
ASSERT(re != NULL, "compile a.c");
|
||||
ASSERT(loreg_search(re, "abc", &m), "match abc");
|
||||
ASSERT(loreg_search(re, "axc", &m), "match axc");
|
||||
ASSERT(!loreg_search(re, "ac", &m), "no match ac");
|
||||
loreg_free(re);
|
||||
|
||||
re = loreg_compile("^start", &err);
|
||||
ASSERT(re != NULL, "compile ^start");
|
||||
ASSERT(loreg_search(re, "start here", &m), "match start here");
|
||||
ASSERT(!loreg_search(re, "not start", &m), "no match not start");
|
||||
loreg_free(re);
|
||||
|
||||
re = loreg_compile("end$", &err);
|
||||
ASSERT(re != NULL, "compile end$");
|
||||
ASSERT(loreg_search(re, "the end", &m), "match the end");
|
||||
ASSERT(!loreg_search(re, "end here", &m), "no match end here");
|
||||
loreg_free(re);
|
||||
}
|
||||
|
||||
TEST(quantifiers) {
|
||||
loreg_error_t err;
|
||||
loreg_match_t m;
|
||||
|
||||
loreg_regex_t *re = loreg_compile("ab*c", &err);
|
||||
ASSERT(re != NULL, "compile ab*c");
|
||||
ASSERT(loreg_search(re, "ac", &m), "match ac");
|
||||
ASSERT(loreg_search(re, "abc", &m), "match abc");
|
||||
ASSERT(loreg_search(re, "abbbbc", &m), "match abbbbc");
|
||||
loreg_free(re);
|
||||
|
||||
re = loreg_compile("ab+c", &err);
|
||||
ASSERT(re != NULL, "compile ab+c");
|
||||
ASSERT(!loreg_search(re, "ac", &m), "no match ac");
|
||||
ASSERT(loreg_search(re, "abc", &m), "match abc");
|
||||
ASSERT(loreg_search(re, "abbbbc", &m), "match abbbbc");
|
||||
loreg_free(re);
|
||||
|
||||
re = loreg_compile("ab?c", &err);
|
||||
ASSERT(re != NULL, "compile ab?c");
|
||||
ASSERT(loreg_search(re, "ac", &m), "match ac");
|
||||
ASSERT(loreg_search(re, "abc", &m), "match abc");
|
||||
ASSERT(!loreg_search(re, "abbc", &m), "no match abbc");
|
||||
loreg_free(re);
|
||||
|
||||
re = loreg_compile("a{3}", &err);
|
||||
ASSERT(re != NULL, "compile a{3}");
|
||||
ASSERT(loreg_search(re, "aaa", &m), "match aaa");
|
||||
ASSERT(!loreg_search(re, "aa", &m), "no match aa");
|
||||
loreg_free(re);
|
||||
|
||||
re = loreg_compile("a{2,4}", &err);
|
||||
ASSERT(re != NULL, "compile a{2,4}");
|
||||
ASSERT(loreg_search(re, "aa", &m), "match aa");
|
||||
ASSERT(loreg_search(re, "aaa", &m), "match aaa");
|
||||
ASSERT(loreg_search(re, "aaaa", &m), "match aaaa");
|
||||
ASSERT(!loreg_search(re, "a", &m), "no match a");
|
||||
loreg_free(re);
|
||||
}
|
||||
|
||||
TEST(character_classes) {
|
||||
loreg_error_t err;
|
||||
loreg_match_t m;
|
||||
|
||||
loreg_regex_t *re = loreg_compile("[aeiou]", &err);
|
||||
ASSERT(re != NULL, "compile [aeiou]");
|
||||
ASSERT(loreg_search(re, "a", &m), "match a");
|
||||
ASSERT(loreg_search(re, "test", &m), "match test");
|
||||
ASSERT(!loreg_search(re, "xyz", &m), "no match xyz");
|
||||
loreg_free(re);
|
||||
|
||||
re = loreg_compile("[a-z]", &err);
|
||||
ASSERT(re != NULL, "compile [a-z]");
|
||||
ASSERT(loreg_search(re, "m", &m), "match m");
|
||||
ASSERT(!loreg_search(re, "5", &m), "no match 5");
|
||||
loreg_free(re);
|
||||
|
||||
re = loreg_compile("[^0-9]", &err);
|
||||
ASSERT(re != NULL, "compile [^0-9]");
|
||||
ASSERT(loreg_search(re, "a", &m), "match a");
|
||||
ASSERT(!loreg_search(re, "5", &m), "no match 5");
|
||||
loreg_free(re);
|
||||
|
||||
re = loreg_compile("\\d", &err);
|
||||
ASSERT(re != NULL, "compile \\d");
|
||||
ASSERT(loreg_search(re, "5", &m), "match 5");
|
||||
ASSERT(!loreg_search(re, "a", &m), "no match a");
|
||||
loreg_free(re);
|
||||
|
||||
re = loreg_compile("\\w+", &err);
|
||||
ASSERT(re != NULL, "compile \\w+");
|
||||
ASSERT(loreg_search(re, "hello_123", &m), "match hello_123");
|
||||
loreg_free(re);
|
||||
|
||||
re = loreg_compile("\\s", &err);
|
||||
ASSERT(re != NULL, "compile \\s");
|
||||
ASSERT(loreg_search(re, " ", &m), "match space");
|
||||
ASSERT(loreg_search(re, "\t", &m), "match tab");
|
||||
ASSERT(!loreg_search(re, "a", &m), "no match a");
|
||||
loreg_free(re);
|
||||
}
|
||||
|
||||
TEST(groups) {
|
||||
loreg_error_t err;
|
||||
loreg_match_t m;
|
||||
|
||||
loreg_regex_t *re = loreg_compile("(ab)+", &err);
|
||||
ASSERT(re != NULL, "compile (ab)+");
|
||||
ASSERT(loreg_search(re, "ab", &m), "match ab");
|
||||
ASSERT(loreg_search(re, "abab", &m), "match abab");
|
||||
ASSERT(!loreg_search(re, "a", &m), "no match a");
|
||||
loreg_free(re);
|
||||
|
||||
re = loreg_compile("(\\d+)-(\\d+)", &err);
|
||||
ASSERT(re != NULL, "compile groups");
|
||||
ASSERT(loreg_search(re, "123-456", &m), "match 123-456");
|
||||
ASSERT(m.group_count == 2, "2 groups");
|
||||
ASSERT(m.groups[0].matched, "group 0 matched");
|
||||
ASSERT(m.groups[1].matched, "group 1 matched");
|
||||
loreg_free(re);
|
||||
}
|
||||
|
||||
TEST(alternation) {
|
||||
loreg_error_t err;
|
||||
loreg_match_t m;
|
||||
|
||||
loreg_regex_t *re = loreg_compile("cat|dog", &err);
|
||||
ASSERT(re != NULL, "compile cat|dog");
|
||||
ASSERT(loreg_search(re, "cat", &m), "match cat");
|
||||
ASSERT(loreg_search(re, "dog", &m), "match dog");
|
||||
ASSERT(!loreg_search(re, "rat", &m), "no match rat");
|
||||
loreg_free(re);
|
||||
|
||||
re = loreg_compile("(red|blue) car", &err);
|
||||
ASSERT(re != NULL, "compile (red|blue) car");
|
||||
ASSERT(loreg_search(re, "red car", &m), "match red car");
|
||||
ASSERT(loreg_search(re, "blue car", &m), "match blue car");
|
||||
ASSERT(!loreg_search(re, "green car", &m), "no match green car");
|
||||
loreg_free(re);
|
||||
}
|
||||
|
||||
TEST(escapes) {
|
||||
loreg_error_t err;
|
||||
loreg_match_t m;
|
||||
|
||||
loreg_regex_t *re = loreg_compile("1\\.5", &err);
|
||||
ASSERT(re != NULL, "compile 1\\.5");
|
||||
ASSERT(loreg_search(re, "1.5", &m), "match 1.5");
|
||||
ASSERT(!loreg_search(re, "1x5", &m), "no match 1x5");
|
||||
loreg_free(re);
|
||||
|
||||
re = loreg_compile("\\(test\\)", &err);
|
||||
ASSERT(re != NULL, "compile \\(test\\)");
|
||||
ASSERT(loreg_search(re, "(test)", &m), "match (test)");
|
||||
loreg_free(re);
|
||||
}
|
||||
|
||||
TEST(real_patterns) {
|
||||
loreg_error_t err;
|
||||
loreg_match_t m;
|
||||
|
||||
loreg_regex_t *re = loreg_compile("[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}", &err);
|
||||
ASSERT(re != NULL, "compile email");
|
||||
ASSERT(loreg_search(re, "user@example.com", &m), "match email");
|
||||
ASSERT(!loreg_search(re, "invalid", &m), "no match invalid");
|
||||
loreg_free(re);
|
||||
|
||||
re = loreg_compile("\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}", &err);
|
||||
ASSERT(re != NULL, "compile ip");
|
||||
ASSERT(loreg_search(re, "192.168.1.1", &m), "match ip");
|
||||
loreg_free(re);
|
||||
|
||||
re = loreg_compile("https?://[a-zA-Z0-9.-]+(/[a-zA-Z0-9./-]*)?", &err);
|
||||
ASSERT(re != NULL, "compile url");
|
||||
ASSERT(loreg_search(re, "http://example.com", &m), "match http");
|
||||
ASSERT(loreg_search(re, "https://example.com/path", &m), "match https");
|
||||
loreg_free(re);
|
||||
}
|
||||
|
||||
TEST(error_handling) {
|
||||
loreg_error_t err;
|
||||
|
||||
loreg_regex_t *re = loreg_compile("(abc", &err);
|
||||
ASSERT(re == NULL, "unbalanced paren");
|
||||
ASSERT(err == LOREG_ERR_UNBALANCED_PAREN, "correct error");
|
||||
}
|
||||
|
||||
int main(void) {
|
||||
printf("loreg comprehensive tests\n");
|
||||
printf("========================\n\n");
|
||||
|
||||
clock_t start = clock();
|
||||
|
||||
RUN(basic_literals);
|
||||
RUN(metacharacters);
|
||||
RUN(quantifiers);
|
||||
RUN(character_classes);
|
||||
RUN(groups);
|
||||
RUN(alternation);
|
||||
RUN(escapes);
|
||||
RUN(real_patterns);
|
||||
RUN(error_handling);
|
||||
|
||||
clock_t end = clock();
|
||||
double elapsed = (double)(end - start) / CLOCKS_PER_SEC;
|
||||
|
||||
printf("\n========================\n");
|
||||
printf("passed: %d, failed: %d\n", total_passed, total_failed);
|
||||
printf("time: %.3f seconds\n", elapsed);
|
||||
|
||||
return total_failed > 0 ? 1 : 0;
|
||||
}
|
||||
650
tests/test_integration.c
Normal file
650
tests/test_integration.c
Normal file
@ -0,0 +1,650 @@
|
||||
/* retoor <retoor@molodetz.nl> */
|
||||
#include "../include/loreg.h"
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
|
||||
static int passed = 0;
|
||||
static int failed = 0;
|
||||
|
||||
#define MATCH(pat, txt) test_match(pat, txt, 1, __LINE__)
|
||||
#define NO_MATCH(pat, txt) test_match(pat, txt, 0, __LINE__)
|
||||
|
||||
static void test_match(const char *pattern, const char *text, int expect, int line) {
|
||||
loreg_error_t err;
|
||||
loreg_regex_t *re = loreg_compile(pattern, &err);
|
||||
if (!re) {
|
||||
printf("FAIL line %d: compile error for '%s': %s\n", line, pattern, loreg_error_string(err));
|
||||
failed++;
|
||||
return;
|
||||
}
|
||||
loreg_match_t m;
|
||||
int result = loreg_search(re, text, &m) ? 1 : 0;
|
||||
if (result != expect) {
|
||||
printf("FAIL line %d: '%s' vs '%s' expected %s\n", line, pattern, text, expect ? "match" : "no match");
|
||||
failed++;
|
||||
} else {
|
||||
passed++;
|
||||
}
|
||||
loreg_free(re);
|
||||
}
|
||||
|
||||
/*
 * Literal patterns.  Each check stays on its own line because MATCH and
 * NO_MATCH report __LINE__ on failure.
 */
static void test_literals(void) {
    printf(" literals...\n");

    /* found at the start, middle or end of the text */
    MATCH("a", "a");
    MATCH("a", "ba");
    MATCH("a", "ab");
    MATCH("abc", "abc");
    MATCH("abc", "xabcy");
    MATCH("hello", "hello world");
    MATCH("world", "hello world");
    MATCH("lo wo", "hello world");

    /* truncated or different text */
    NO_MATCH("abc", "ab");
    NO_MATCH("abc", "abd");
    NO_MATCH("xyz", "abc");
    NO_MATCH("hello", "helo");

    /* the empty pattern */
    MATCH("", "anything");
    MATCH("", "");

    /* repeated characters */
    MATCH("a", "aaa");
    MATCH("aa", "aaa");
    MATCH("aaa", "aaa");
    NO_MATCH("aaaa", "aaa");
}
|
||||
|
||||
/*
 * The '.' wildcard.  Each check stays on its own line because MATCH and
 * NO_MATCH report __LINE__ on failure.
 */
static void test_dot(void) {
    printf(" dot metacharacter...\n");

    /* a single dot against different kinds of characters */
    MATCH(".", "a");
    MATCH(".", "x");
    MATCH(".", "5");
    MATCH(".", " ");

    /* several dots in a row */
    MATCH("..", "ab");
    MATCH("...", "abc");

    /* a dot between literals consumes exactly one character */
    MATCH("a.c", "abc");
    MATCH("a.c", "aXc");
    MATCH("a.c", "a9c");
    MATCH("a.c", "a c");
    NO_MATCH("a.c", "ac");
    NO_MATCH("a.c", "abbc");

    MATCH("....", "test");
    MATCH(".", "!");
    MATCH(".", "@");

    /* dot runs between literals */
    MATCH("a..b", "aXYb");
    MATCH("a...b", "a123b");
    NO_MATCH("a..b", "aXb");
}
|
||||
|
||||
/*
 * The '^' and '$' anchors.  Each check stays on its own line because MATCH
 * and NO_MATCH report __LINE__ on failure.
 */
static void test_anchors(void) {
    printf(" anchors...\n");

    /* start anchor */
    MATCH("^a", "a");
    MATCH("^a", "abc");
    NO_MATCH("^a", "ba");
    NO_MATCH("^a", " a");

    /* end anchor */
    MATCH("a$", "a");
    MATCH("a$", "ba");
    NO_MATCH("a$", "ab");
    NO_MATCH("a$", "a ");

    /* both anchors: the whole text must match */
    MATCH("^abc$", "abc");
    NO_MATCH("^abc$", "xabc");
    NO_MATCH("^abc$", "abcx");
    NO_MATCH("^abc$", " abc");
    NO_MATCH("^abc$", "abc ");

    /* anchors around nothing */
    MATCH("^$", "");
    NO_MATCH("^$", "a");

    MATCH("^hello$", "hello");
    MATCH("^hello world$", "hello world");
    NO_MATCH("^hello world$", "hello world!");

    /* anchors combined with wildcards */
    MATCH("^a.*z$", "abcdefghijklmnopqrstuvwxyz");
    MATCH("^.", "x");
    MATCH(".$", "x");
}
|
||||
|
||||
/* Cases for the '*' (zero or more) quantifier. */
static void test_star(void)
{
    printf(" star quantifier...\n");

    /* a* on its own, including texts with no 'a' at all */
    MATCH("a*", "");
    MATCH("a*", "a");
    MATCH("a*", "aa");
    MATCH("a*", "aaa");
    MATCH("a*", "aaaaaaaaaa");
    MATCH("a*", "b");

    /* starred prefix followed by a required literal */
    MATCH("a*b", "b");
    MATCH("a*b", "ab");
    MATCH("a*b", "aab");
    MATCH("a*b", "aaaaaab");
    NO_MATCH("a*b", "a");

    /* required literal followed by a starred suffix */
    MATCH("ba*", "b");
    MATCH("ba*", "ba");
    MATCH("ba*", "baaa");

    /* dot-star */
    MATCH(".*", "");
    MATCH(".*", "anything at all");
    MATCH("a.*b", "ab");
    MATCH("a.*b", "aXb");
    MATCH("a.*b", "aXXXXXb");
    MATCH("a.*b", "a b");

    /* several starred atoms in sequence */
    MATCH("x*y*z*", "");
    MATCH("x*y*z*", "xyz");
    MATCH("x*y*z*", "xxxyyyzzz");

    /* starred atom in the middle */
    MATCH("ab*c", "ac");
    MATCH("ab*c", "abc");
    MATCH("ab*c", "abbbbc");
}
|
||||
|
||||
/* Cases for the '+' (one or more) quantifier. */
static void test_plus(void)
{
    printf(" plus quantifier...\n");

    /* a+ on its own: at least one 'a' is required */
    NO_MATCH("a+", "");
    MATCH("a+", "a");
    MATCH("a+", "aa");
    MATCH("a+", "aaa");
    MATCH("a+", "aaaaaaaaaa");
    MATCH("a+", "ba");

    /* plus-quantified prefix */
    MATCH("a+b", "ab");
    MATCH("a+b", "aab");
    MATCH("a+b", "aaaaaab");
    NO_MATCH("a+b", "b");
    NO_MATCH("a+b", "a");

    /* plus-quantified suffix */
    MATCH("ba+", "ba");
    MATCH("ba+", "baaa");
    NO_MATCH("ba+", "b");

    /* dot-plus */
    MATCH(".+", "a");
    MATCH(".+", "anything");
    NO_MATCH(".+", "");
    MATCH("a.+b", "aXb");
    MATCH("a.+b", "aXXXXXb");
    NO_MATCH("a.+b", "ab");

    /* plus-quantified atom in the middle */
    MATCH("ab+c", "abc");
    MATCH("ab+c", "abbbbc");
    NO_MATCH("ab+c", "ac");
}
|
||||
|
||||
/* Cases for the '?' (zero or one) quantifier. */
static void test_question(void)
{
    printf(" question quantifier...\n");

    /* a? on its own */
    MATCH("a?", "");
    MATCH("a?", "a");
    MATCH("a?", "aa");

    /* optional prefix */
    MATCH("a?b", "b");
    MATCH("a?b", "ab");
    MATCH("a?b", "aab");

    /* optional character inside a word */
    MATCH("colou?r", "color");
    MATCH("colou?r", "colour");
    NO_MATCH("colou?r", "colouur");

    MATCH("ab?c", "ac");
    MATCH("ab?c", "abc");
    NO_MATCH("ab?c", "abbc");

    /* optional 's' in a URL scheme */
    MATCH("https?://", "http://");
    MATCH("https?://", "https://");

    /* optional dot */
    MATCH(".?", "");
    MATCH(".?", "x");
}
|
||||
|
||||
/* Cases for the '|' alternation operator at the top level of a pattern. */
static void test_alternation(void)
{
    printf(" alternation...\n");

    /* two single-character branches */
    MATCH("a|b", "a");
    MATCH("a|b", "b");
    NO_MATCH("a|b", "c");

    /* two word branches, also embedded in longer text */
    MATCH("cat|dog", "cat");
    MATCH("cat|dog", "dog");
    NO_MATCH("cat|dog", "rat");
    MATCH("cat|dog", "my cat");
    MATCH("cat|dog", "my dog");

    /* three single-character branches */
    MATCH("a|b|c", "a");
    MATCH("a|b|c", "b");
    MATCH("a|b|c", "c");
    NO_MATCH("a|b|c", "d");

    /* multi-character branches */
    MATCH("ab|cd", "ab");
    MATCH("ab|cd", "cd");
    NO_MATCH("ab|cd", "ac");

    MATCH("abc|def|ghi", "abc");
    MATCH("abc|def|ghi", "def");
    MATCH("abc|def|ghi", "ghi");

    /* branches that are prefixes of one another, in both orders */
    MATCH("a|ab|abc", "abc");
    MATCH("abc|ab|a", "abc");

    /* branch found inside a sentence */
    MATCH("red|green|blue", "the red car");
    MATCH("red|green|blue", "green light");
    MATCH("red|green|blue", "blue sky");
}
|
||||
|
||||
/* Cases for parenthesised groups: plain, quantified, alternating and nested. */
static void test_groups(void)
{
    printf(" groups...\n");

    /* plain groups */
    MATCH("(a)", "a");
    MATCH("(ab)", "ab");
    MATCH("(abc)", "abc");
    MATCH("(a)(b)", "ab");
    MATCH("(a)(b)(c)", "abc");

    /* group with '+' */
    MATCH("(ab)+", "ab");
    MATCH("(ab)+", "abab");
    MATCH("(ab)+", "ababab");
    NO_MATCH("(ab)+", "a");
    NO_MATCH("(ab)+", "ba");

    /* group with '*' */
    MATCH("(ab)*", "");
    MATCH("(ab)*", "ab");
    MATCH("(ab)*", "abab");

    /* group with '?' */
    MATCH("(ab)?", "");
    MATCH("(ab)?", "ab");

    /* alternation inside a repeated group */
    MATCH("(a|b)+", "a");
    MATCH("(a|b)+", "b");
    MATCH("(a|b)+", "ab");
    MATCH("(a|b)+", "ba");
    MATCH("(a|b)+", "aabb");
    MATCH("(a|b)+", "abba");

    /* nested groups */
    MATCH("((a))", "a");
    MATCH("((ab))", "ab");
    MATCH("(a(b)c)", "abc");
    MATCH("(a(b(c)))", "abc");
    MATCH("((a)(b))", "ab");

    /* group followed by literal text */
    MATCH("(red|blue) car", "red car");
    MATCH("(red|blue) car", "blue car");
    NO_MATCH("(red|blue) car", "green car");
}
|
||||
|
||||
/* Cases for bracket expressions that list their members explicitly. */
static void test_bracket_simple(void)
{
    printf(" bracket expressions (simple)...\n");

    /* one member */
    MATCH("[a]", "a");
    NO_MATCH("[a]", "b");

    /* two members */
    MATCH("[ab]", "a");
    MATCH("[ab]", "b");
    NO_MATCH("[ab]", "c");

    /* three members */
    MATCH("[abc]", "a");
    MATCH("[abc]", "b");
    MATCH("[abc]", "c");
    NO_MATCH("[abc]", "d");

    /* vowels */
    MATCH("[aeiou]", "a");
    MATCH("[aeiou]", "e");
    MATCH("[aeiou]", "i");
    MATCH("[aeiou]", "o");
    MATCH("[aeiou]", "u");
    NO_MATCH("[aeiou]", "b");

    /* quantified sets */
    MATCH("[abc]+", "aaa");
    MATCH("[abc]+", "abc");
    MATCH("[abc]+", "cba");
    MATCH("[abc]+", "abcabc");
    MATCH("[xyz]*", "");
    MATCH("[xyz]*", "xyz");
}
|
||||
|
||||
/* Cases for bracket expressions built from character ranges. */
static void test_bracket_ranges(void)
{
    printf(" bracket expressions (ranges)...\n");

    /* lowercase range: both ends and a middle value */
    MATCH("[a-z]", "a");
    MATCH("[a-z]", "m");
    MATCH("[a-z]", "z");
    NO_MATCH("[a-z]", "A");
    NO_MATCH("[a-z]", "0");

    /* uppercase range */
    MATCH("[A-Z]", "A");
    MATCH("[A-Z]", "M");
    MATCH("[A-Z]", "Z");
    NO_MATCH("[A-Z]", "a");

    /* digit range */
    MATCH("[0-9]", "0");
    MATCH("[0-9]", "5");
    MATCH("[0-9]", "9");
    NO_MATCH("[0-9]", "a");

    /* several ranges in one set */
    MATCH("[a-zA-Z]", "a");
    MATCH("[a-zA-Z]", "Z");
    NO_MATCH("[a-zA-Z]", "5");
    MATCH("[a-zA-Z0-9]", "a");
    MATCH("[a-zA-Z0-9]", "Z");
    MATCH("[a-zA-Z0-9]", "5");
    NO_MATCH("[a-zA-Z0-9]", "!");

    /* quantified ranges */
    MATCH("[a-z]+", "hello");
    MATCH("[A-Z]+", "HELLO");
    MATCH("[0-9]+", "12345");
    MATCH("[a-z0-9]+", "abc123");
}
|
||||
|
||||
/* Cases for negated bracket expressions ([^...]). */
static void test_bracket_negated(void)
{
    printf(" bracket expressions (negated)...\n");

    /* single excluded character */
    NO_MATCH("[^a]", "a");
    MATCH("[^a]", "b");
    MATCH("[^a]", "x");

    /* several excluded characters */
    NO_MATCH("[^abc]", "a");
    NO_MATCH("[^abc]", "b");
    NO_MATCH("[^abc]", "c");
    MATCH("[^abc]", "d");
    MATCH("[^abc]", "x");

    /* excluded range */
    NO_MATCH("[^a-z]", "a");
    NO_MATCH("[^a-z]", "m");
    NO_MATCH("[^a-z]", "z");
    MATCH("[^a-z]", "A");
    MATCH("[^a-z]", "5");
    MATCH("[^a-z]", "!");

    NO_MATCH("[^0-9]", "5");
    MATCH("[^0-9]", "a");

    /* quantified negated sets */
    MATCH("[^0-9]+", "hello");
    NO_MATCH("[^aeiou]+", "aaa");
    MATCH("[^aeiou]+", "xyz");
}
|
||||
|
||||
/* Cases for the shorthand classes \d \D \w \W \s \S. */
static void test_character_classes(void)
{
    printf(" character classes...\n");

    /* \d */
    MATCH("\\d", "0");
    MATCH("\\d", "5");
    MATCH("\\d", "9");
    NO_MATCH("\\d", "a");
    NO_MATCH("\\d", " ");
    MATCH("\\d+", "123");
    MATCH("\\d+", "0");
    MATCH("\\d+", "9876543210");
    NO_MATCH("\\d+", "");
    NO_MATCH("\\d+", "abc");

    /* \D */
    MATCH("\\D", "a");
    MATCH("\\D", " ");
    MATCH("\\D", "!");
    NO_MATCH("\\D", "5");

    /* \w: letters, digits and underscore are expected to match */
    MATCH("\\w", "a");
    MATCH("\\w", "Z");
    MATCH("\\w", "0");
    MATCH("\\w", "_");
    NO_MATCH("\\w", " ");
    NO_MATCH("\\w", "!");
    MATCH("\\w+", "hello");
    MATCH("\\w+", "Hello123");
    MATCH("\\w+", "var_name");

    /* \W */
    MATCH("\\W", " ");
    MATCH("\\W", "!");
    MATCH("\\W", "@");
    NO_MATCH("\\W", "a");
    NO_MATCH("\\W", "_");

    /* \s: space, tab and newline are expected to match */
    MATCH("\\s", " ");
    MATCH("\\s", "\t");
    MATCH("\\s", "\n");
    NO_MATCH("\\s", "a");
    NO_MATCH("\\s", "5");
    MATCH("\\s+", " ");
    MATCH("\\s+", " \t\n");

    /* \S */
    MATCH("\\S", "a");
    MATCH("\\S", "5");
    MATCH("\\S", "!");
    NO_MATCH("\\S", " ");
    NO_MATCH("\\S", "\t");
}
|
||||
|
||||
/* Cases for brace quantifiers: {n}, {n,m}, {n,} applied to atoms, sets and groups. */
static void test_quantifier_braces(void)
{
    printf(" brace quantifiers...\n");

    /* exact count; extra repetitions in the text still allow a match */
    MATCH("a{3}", "aaa");
    MATCH("a{3}", "aaaa");
    NO_MATCH("a{3}", "aa");
    MATCH("a{1}", "a");
    MATCH("a{1}", "aa");
    NO_MATCH("a{1}", "");
    MATCH("a{0}", "");
    MATCH("a{0}", "b");

    /* bounded range */
    MATCH("a{2,4}", "aa");
    MATCH("a{2,4}", "aaa");
    MATCH("a{2,4}", "aaaa");
    MATCH("a{2,4}", "aaaaa");
    NO_MATCH("a{2,4}", "a");

    /* open-ended range */
    MATCH("a{2,}", "aa");
    MATCH("a{2,}", "aaa");
    MATCH("a{2,}", "aaaaaaaaaa");
    NO_MATCH("a{2,}", "a");

    /* range starting at zero */
    MATCH("a{0,2}", "");
    MATCH("a{0,2}", "a");
    MATCH("a{0,2}", "aa");
    MATCH("a{0,2}", "aaa");

    /* quantified bracket expression */
    MATCH("[0-9]{3}", "123");
    MATCH("[0-9]{3}", "000");
    NO_MATCH("[0-9]{3}", "12");

    /* quantified group */
    MATCH("(ab){2}", "abab");
    MATCH("(ab){2}", "ababab");
    NO_MATCH("(ab){2}", "ab");
}
|
||||
|
||||
/* Cases for backslash-escaped metacharacters used as literals. */
static void test_escape_sequences(void)
{
    printf(" escape sequences...\n");

    /* an escaped metacharacter matches only itself */
    MATCH("\\.", ".");
    NO_MATCH("\\.", "a");
    MATCH("\\*", "*");
    NO_MATCH("\\*", "a");

    /* every remaining metacharacter, escaped */
    MATCH("\\+", "+");
    MATCH("\\?", "?");
    MATCH("\\|", "|");
    MATCH("\\(", "(");
    MATCH("\\)", ")");
    MATCH("\\[", "[");
    MATCH("\\]", "]");
    MATCH("\\{", "{");
    MATCH("\\}", "}");
    MATCH("\\^", "^");
    MATCH("\\$", "$");
    MATCH("\\\\", "\\");

    /* escapes mixed with other pattern elements */
    MATCH("a\\.b", "a.b");
    NO_MATCH("a\\.b", "aXb");
    MATCH("\\d\\.\\d", "1.5");
    MATCH("c\\+\\+", "c++");
    MATCH("\\(test\\)", "(test)");
    MATCH("\\[0\\]", "[0]");
}
|
||||
|
||||
/* Composite patterns: e-mail, IPv4-like, URL, phone, id, date and time shapes. */
static void test_complex_patterns(void)
{
    printf(" complex patterns...\n");

    /* e-mail shaped */
    MATCH("[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}", "user@example.com");
    MATCH("[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}", "test.user@mail.example.org");
    NO_MATCH("[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}", "invalid");
    NO_MATCH("[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}", "@example.com");

    /* dotted quad */
    MATCH("\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}", "192.168.1.1");
    MATCH("\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}", "10.0.0.1");
    MATCH("\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}", "255.255.255.255");
    NO_MATCH("\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}", "1.2.3");

    /* URL with optional path */
    MATCH("https?://[a-zA-Z0-9.-]+(/[a-zA-Z0-9./-]*)?", "http://example.com");
    MATCH("https?://[a-zA-Z0-9.-]+(/[a-zA-Z0-9./-]*)?", "https://example.com");
    MATCH("https?://[a-zA-Z0-9.-]+(/[a-zA-Z0-9./-]*)?", "http://example.com/path");
    MATCH("https?://[a-zA-Z0-9.-]+(/[a-zA-Z0-9./-]*)?", "https://example.com/path/to/page");

    /* phone numbers */
    MATCH("\\d{3}-\\d{3}-\\d{4}", "123-456-7890");
    MATCH("\\d{3}-\\d{3}-\\d{4}", "555-123-4567");
    NO_MATCH("\\d{3}-\\d{3}-\\d{4}", "12-345-6789");
    NO_MATCH("\\d{3}-\\d{3}-\\d{4}", "1234567890");
    MATCH("\\(\\d{3}\\) \\d{3}-\\d{4}", "(123) 456-7890");

    /* two letters followed by six digits */
    MATCH("[A-Z]{2}\\d{6}", "AB123456");
    NO_MATCH("[A-Z]{2}\\d{6}", "A1234567");

    /* dates */
    MATCH("\\d{4}-\\d{2}-\\d{2}", "2024-01-15");
    MATCH("\\d{2}/\\d{2}/\\d{4}", "01/15/2024");

    /* time with optional seconds */
    MATCH("\\d{1,2}:\\d{2}(:\\d{2})?", "12:30");
    MATCH("\\d{1,2}:\\d{2}(:\\d{2})?", "12:30:45");
    MATCH("\\d{1,2}:\\d{2}(:\\d{2})?", "9:05");
}
|
||||
|
||||
/*
 * Word-shaped patterns built from \w, \s and identifier-style bracket sets.
 * Despite the function name, no boundary assertion is used in these cases.
 */
static void test_word_boundaries(void)
{
    printf(" word patterns...\n");

    /* runs of word characters */
    MATCH("\\w+", "hello");
    MATCH("\\w+", "hello123");
    MATCH("\\w+", "test_var");

    /* identifier shape: letter or underscore, then letters, digits, underscores */
    MATCH("[a-zA-Z_][a-zA-Z0-9_]*", "variable");
    MATCH("[a-zA-Z_][a-zA-Z0-9_]*", "_private");
    MATCH("[a-zA-Z_][a-zA-Z0-9_]*", "var123");
    NO_MATCH("^[a-zA-Z_][a-zA-Z0-9_]*$", "123var");

    /* two words separated by whitespace */
    MATCH("\\w+\\s+\\w+", "hello world");
    MATCH("\\w+\\s+\\w+", "foo bar");
    NO_MATCH("\\w+\\s+\\w+", "hello");
}
|
||||
|
||||
/*
 * Each quantifier is exercised in its plain form and with a trailing '?'.
 * Only match success is checked here, not how much text was consumed.
 */
static void test_greedy_vs_nongreedy(void)
{
    printf(" greedy vs non-greedy...\n");

    MATCH("a+", "aaa");
    MATCH("a+?", "aaa");

    MATCH("a*", "aaa");
    MATCH("a*?", "aaa");

    MATCH("a?", "a");
    MATCH("a??", "a");

    MATCH("a{2,4}", "aaaa");
    MATCH("a{2,4}?", "aaaa");

    MATCH(".*x", "abcx");
    MATCH(".*?x", "abcx");
}
|
||||
|
||||
/* Empty patterns, empty texts and patterns that can match nothing at all. */
static void test_empty_and_edge_cases(void)
{
    printf(" empty and edge cases...\n");

    /* empty pattern */
    MATCH("", "");
    MATCH("", "abc");

    /* patterns that accept the empty string, against empty text */
    MATCH("a*", "");
    MATCH("a?", "");
    MATCH("(a*)*", "");
    MATCH("(a*)+", "");
    MATCH("(a+)*", "");
    MATCH("(a|b)*", "");
    MATCH("[a-z]*", "");

    /* patterns that need at least one character, against empty text */
    NO_MATCH("a+", "");
    NO_MATCH(".+", "");
    NO_MATCH("[a-z]+", "");

    /* bare anchors */
    MATCH("^", "");
    MATCH("$", "");
    MATCH("^$", "");
    NO_MATCH("^$", "a");

    /* sequence of optional runs */
    MATCH("a*b*c*", "");
    MATCH("a*b*c*", "abc");
    MATCH("a*b*c*", "aabbcc");
    MATCH("a*b*c*", "c");
    MATCH("a*b*c*", "b");
}
|
||||
|
||||
/* Texts containing control characters or regex metacharacters. */
static void test_special_characters_in_text(void)
{
    printf(" special characters in text...\n");

    /* literals on either side of a newline */
    MATCH("a", "a\nb");
    MATCH("b", "a\nb");

    /* dot is expected to accept a tab but not a newline */
    MATCH("a.b", "a\tb");
    NO_MATCH("a.b", "a\nb");

    /* escaped metacharacters located inside ordinary text */
    MATCH("\\.", "3.14");
    MATCH("\\+", "1+2");
    MATCH("\\*", "2*3");
    MATCH("\\?", "why?");
    MATCH("\\(\\)", "func()");
    MATCH("\\[\\]", "array[]");
    MATCH("\\{\\}", "object{}");
    MATCH("\\^", "x^2");
    MATCH("\\$", "$100");
    MATCH("\\|", "a|b");
}
|
||||
|
||||
/* Several quantified atoms or groups combined in one pattern. */
static void test_repetition_combinations(void)
{
    printf(" repetition combinations...\n");

    /* plus then plus */
    MATCH("a+b+", "ab");
    MATCH("a+b+", "aabb");
    MATCH("a+b+", "aaabbb");
    NO_MATCH("a+b+", "a");
    NO_MATCH("a+b+", "b");

    /* star then plus */
    MATCH("a*b+", "b");
    MATCH("a*b+", "ab");
    MATCH("a*b+", "aab");

    /* plus then star */
    MATCH("a+b*", "a");
    MATCH("a+b*", "ab");
    MATCH("a+b*", "abb");

    /* star then star */
    MATCH("a*b*", "");
    MATCH("a*b*", "a");
    MATCH("a*b*", "b");
    MATCH("a*b*", "ab");

    /* repeated groups */
    MATCH("(ab)+c+", "abc");
    MATCH("(ab)+c+", "ababcc");
    MATCH("(a+b)+", "ab");
    MATCH("(a+b)+", "aabaaab");

    /* nested repetition */
    MATCH("((a+)+)+", "a");
    MATCH("((a+)+)+", "aaa");
}
|
||||
|
||||
/* Alternation combined with groups, quantifiers and classes. */
static void test_alternation_combinations(void)
{
    printf(" alternation combinations...\n");

    /* long flat alternation */
    MATCH("a|b|c|d|e", "a");
    MATCH("a|b|c|d|e", "e");
    NO_MATCH("a|b|c|d|e", "f");

    /* two alternating groups in sequence: all four combinations */
    MATCH("(a|b)(c|d)", "ac");
    MATCH("(a|b)(c|d)", "ad");
    MATCH("(a|b)(c|d)", "bc");
    MATCH("(a|b)(c|d)", "bd");
    NO_MATCH("(a|b)(c|d)", "ab");

    /* alternating group with an optional suffix */
    MATCH("(cat|dog)s?", "cat");
    MATCH("(cat|dog)s?", "cats");
    MATCH("(cat|dog)s?", "dog");
    MATCH("(cat|dog)s?", "dogs");

    /* two alternating groups separated by whitespace */
    MATCH("(red|green|blue)\\s+(car|truck)", "red car");
    MATCH("(red|green|blue)\\s+(car|truck)", "green truck");

    /* overlapping branches, shortest-first and longest-first */
    MATCH("(a|aa|aaa)", "aaa");
    MATCH("(aaa|aa|a)", "aaa");
}
|
||||
|
||||
/* Groups nested inside groups, with alternation and quantifiers mixed in. */
static void test_nested_groups(void)
{
    printf(" nested groups...\n");

    /* pure nesting */
    MATCH("((a))", "a");
    MATCH("(((a)))", "a");
    MATCH("((a)(b))", "ab");
    MATCH("((a(b))c)", "abc");
    MATCH("(a(b(c)))", "abc");

    /* alternation inside nesting */
    MATCH("((a|b)(c|d))", "ac");
    MATCH("(a(b|c)d)", "abd");
    MATCH("(a(b|c)d)", "acd");

    /* quantifiers inside nesting */
    MATCH("((ab)+)", "abab");
    MATCH("(a(bc)*d)", "ad");
    MATCH("(a(bc)*d)", "abcd");
    MATCH("(a(bc)*d)", "abcbcd");
    MATCH("((a+)(b+))", "aabb");
    MATCH("(((a|b)+)c)", "ababc");
}
|
||||
|
||||
/* Everyday pattern shapes: words, numbers, hex, binary, quoted strings, colours. */
static void test_real_world_patterns(void)
{
    printf(" real world patterns...\n");

    /* alphabetic words */
    MATCH("[a-zA-Z]+", "Hello");
    MATCH("[a-zA-Z]+", "WORLD");
    MATCH("[a-zA-Z]+", "test");

    /* optionally signed integers */
    MATCH("-?\\d+", "123");
    MATCH("-?\\d+", "-456");
    MATCH("-?\\d+", "0");

    /* optionally signed decimals */
    MATCH("-?\\d+\\.?\\d*", "3.14");
    MATCH("-?\\d+\\.?\\d*", "-2.5");
    MATCH("-?\\d+\\.?\\d*", "42");

    /* hexadecimal digits */
    MATCH("[a-fA-F0-9]+", "deadbeef");
    MATCH("[a-fA-F0-9]+", "CAFEBABE");
    MATCH("[a-fA-F0-9]+", "123abc");

    /* binary digits */
    MATCH("[01]+", "101010");
    MATCH("[01]+", "11110000");

    /* capitalised word */
    MATCH("[A-Z][a-z]+", "Hello");
    MATCH("[A-Z][a-z]+", "World");
    NO_MATCH("[A-Z][a-z]+", "hello");

    /* quoted strings */
    MATCH("\"[^\"]*\"", "\"hello\"");
    MATCH("\"[^\"]*\"", "\"hello world\"");
    MATCH("\"[^\"]*\"", "\"\"");
    MATCH("'[^']*'", "'test'");

    /* '#' followed by hex digits */
    MATCH("#[a-fA-F0-9]{6}", "#ff0000");
    MATCH("#[a-fA-F0-9]{6}", "#00FF00");
    MATCH("#[a-fA-F0-9]{3}", "#f00");
}
|
||||
|
||||
/*
 * Patterns with stacked optional or nested quantifiers, the shapes that are
 * commonly costly for backtracking matchers. Only match success is checked.
 */
static void test_pathological_patterns(void)
{
    printf(" stress test patterns...\n");

    MATCH("a?a?a?aaa", "aaa");

    /* nested repetition */
    MATCH("(a+)+", "aaaa");
    MATCH("(a*)*", "aaaa");
    MATCH("(a|a)+", "aaaa");
    MATCH("((a*)*)*", "aaaa");

    /* many adjacent stars */
    MATCH("a*a*a*a*a*b", "aaaaab");
    MATCH(".*.*.*.*.*", "test");

    /* counted optional group followed by a counted literal */
    MATCH("(a?){5}a{5}", "aaaaa");
}
|
||||
|
||||
int main(void) {
|
||||
printf("loreg integration tests\n");
|
||||
printf("=======================\n\n");
|
||||
|
||||
test_literals();
|
||||
test_dot();
|
||||
test_anchors();
|
||||
test_star();
|
||||
test_plus();
|
||||
test_question();
|
||||
test_alternation();
|
||||
test_groups();
|
||||
test_bracket_simple();
|
||||
test_bracket_ranges();
|
||||
test_bracket_negated();
|
||||
test_character_classes();
|
||||
test_quantifier_braces();
|
||||
test_escape_sequences();
|
||||
test_complex_patterns();
|
||||
test_word_boundaries();
|
||||
test_greedy_vs_nongreedy();
|
||||
test_empty_and_edge_cases();
|
||||
test_special_characters_in_text();
|
||||
test_repetition_combinations();
|
||||
test_alternation_combinations();
|
||||
test_nested_groups();
|
||||
test_real_world_patterns();
|
||||
test_pathological_patterns();
|
||||
|
||||
printf("\n=======================\n");
|
||||
printf("integration: %d passed, %d failed\n", passed, failed);
|
||||
printf("total tests: %d\n", passed + failed);
|
||||
|
||||
return failed > 0 ? 1 : 0;
|
||||
}
|
||||
195
tests/test_lexer.c
Normal file
195
tests/test_lexer.c
Normal file
@ -0,0 +1,195 @@
|
||||
/* retoor <retoor@molodetz.nl> */
|
||||
#include "../include/lexer.h"
|
||||
#include <stdio.h>
|
||||
#include <assert.h>
|
||||
#include <string.h>
|
||||
|
||||
/* Running totals for this test binary; updated by RUN_TEST and ASSERT. */
static int tests_passed = 0;
static int tests_failed = 0;

/* Opens the definition of a test case as the static function test_<name>. */
#define TEST(name) static void test_##name(void)

/*
 * Runs test_<name> and reports its outcome.
 *
 * A test body signals failure only by incrementing tests_failed (via ASSERT)
 * and returning early, so the counter is sampled before the call. "ok" is
 * printed and tests_passed incremented only when no failure was recorded;
 * previously a failing test was reported as "ok" and counted as passed too.
 */
#define RUN_TEST(name) do { \
    int failed_before_ = tests_failed; \
    printf(" %s... ", #name); \
    test_##name(); \
    if (tests_failed == failed_before_) { \
        printf("ok\n"); \
        tests_passed++; \
    } \
} while(0)

/*
 * Checks `cond` inside a TEST body. When it is false, prints the line number
 * and the condition text, increments tests_failed and returns from the
 * enclosing function, which therefore must return void.
 */
#define ASSERT(cond) do { \
    if (!(cond)) { \
        printf("FAILED at line %d: %s\n", __LINE__, #cond); \
        tests_failed++; \
        return; \
    } \
} while(0)
|
||||
|
||||
TEST(simple_chars) {
|
||||
lexer_t lexer;
|
||||
lexer_init(&lexer, "abc");
|
||||
|
||||
token_t t = lexer_next(&lexer);
|
||||
ASSERT(t.type == TOKEN_CHAR && t.value == 'a');
|
||||
|
||||
t = lexer_next(&lexer);
|
||||
ASSERT(t.type == TOKEN_CHAR && t.value == 'b');
|
||||
|
||||
t = lexer_next(&lexer);
|
||||
ASSERT(t.type == TOKEN_CHAR && t.value == 'c');
|
||||
|
||||
t = lexer_next(&lexer);
|
||||
ASSERT(t.type == TOKEN_EOF);
|
||||
}
|
||||
|
||||
TEST(meta_chars) {
|
||||
lexer_t lexer;
|
||||
lexer_init(&lexer, ".*+?|()^$");
|
||||
|
||||
ASSERT(lexer_next(&lexer).type == TOKEN_DOT);
|
||||
ASSERT(lexer_next(&lexer).type == TOKEN_STAR);
|
||||
ASSERT(lexer_next(&lexer).type == TOKEN_PLUS);
|
||||
ASSERT(lexer_next(&lexer).type == TOKEN_QUESTION);
|
||||
ASSERT(lexer_next(&lexer).type == TOKEN_PIPE);
|
||||
ASSERT(lexer_next(&lexer).type == TOKEN_LPAREN);
|
||||
ASSERT(lexer_next(&lexer).type == TOKEN_RPAREN);
|
||||
ASSERT(lexer_next(&lexer).type == TOKEN_CARET);
|
||||
ASSERT(lexer_next(&lexer).type == TOKEN_DOLLAR);
|
||||
ASSERT(lexer_next(&lexer).type == TOKEN_EOF);
|
||||
}
|
||||
|
||||
TEST(escaped_chars) {
|
||||
lexer_t lexer;
|
||||
lexer_init(&lexer, "\\*\\+\\.");
|
||||
|
||||
token_t t = lexer_next(&lexer);
|
||||
ASSERT(t.type == TOKEN_CHAR && t.value == '*');
|
||||
|
||||
t = lexer_next(&lexer);
|
||||
ASSERT(t.type == TOKEN_CHAR && t.value == '+');
|
||||
|
||||
t = lexer_next(&lexer);
|
||||
ASSERT(t.type == TOKEN_CHAR && t.value == '.');
|
||||
}
|
||||
|
||||
TEST(character_classes) {
|
||||
lexer_t lexer;
|
||||
lexer_init(&lexer, "\\d\\w\\s\\D\\W\\S");
|
||||
|
||||
ASSERT(lexer_next(&lexer).type == TOKEN_CLASS_DIGIT);
|
||||
ASSERT(lexer_next(&lexer).type == TOKEN_CLASS_WORD);
|
||||
ASSERT(lexer_next(&lexer).type == TOKEN_CLASS_SPACE);
|
||||
ASSERT(lexer_next(&lexer).type == TOKEN_CLASS_NDIGIT);
|
||||
ASSERT(lexer_next(&lexer).type == TOKEN_CLASS_NWORD);
|
||||
ASSERT(lexer_next(&lexer).type == TOKEN_CLASS_NSPACE);
|
||||
}
|
||||
|
||||
TEST(bracket_expression) {
|
||||
lexer_t lexer;
|
||||
lexer_init(&lexer, "[abc]");
|
||||
|
||||
ASSERT(lexer_next(&lexer).type == TOKEN_LBRACKET);
|
||||
|
||||
token_t t = lexer_next(&lexer);
|
||||
ASSERT(t.type == TOKEN_CHAR && t.value == 'a');
|
||||
|
||||
t = lexer_next(&lexer);
|
||||
ASSERT(t.type == TOKEN_CHAR && t.value == 'b');
|
||||
|
||||
t = lexer_next(&lexer);
|
||||
ASSERT(t.type == TOKEN_CHAR && t.value == 'c');
|
||||
|
||||
ASSERT(lexer_next(&lexer).type == TOKEN_RBRACKET);
|
||||
}
|
||||
|
||||
TEST(bracket_range) {
|
||||
lexer_t lexer;
|
||||
lexer_init(&lexer, "[a-z]");
|
||||
|
||||
ASSERT(lexer_next(&lexer).type == TOKEN_LBRACKET);
|
||||
|
||||
token_t t = lexer_next(&lexer);
|
||||
ASSERT(t.type == TOKEN_CHAR && t.value == 'a');
|
||||
|
||||
ASSERT(lexer_next(&lexer).type == TOKEN_DASH);
|
||||
|
||||
t = lexer_next(&lexer);
|
||||
ASSERT(t.type == TOKEN_CHAR && t.value == 'z');
|
||||
|
||||
ASSERT(lexer_next(&lexer).type == TOKEN_RBRACKET);
|
||||
}
|
||||
|
||||
TEST(negated_bracket) {
|
||||
lexer_t lexer;
|
||||
lexer_init(&lexer, "[^a]");
|
||||
|
||||
ASSERT(lexer_next(&lexer).type == TOKEN_LBRACKET);
|
||||
ASSERT(lexer_next(&lexer).type == TOKEN_CARET);
|
||||
|
||||
token_t t = lexer_next(&lexer);
|
||||
ASSERT(t.type == TOKEN_CHAR && t.value == 'a');
|
||||
|
||||
ASSERT(lexer_next(&lexer).type == TOKEN_RBRACKET);
|
||||
}
|
||||
|
||||
TEST(quantifier_braces) {
|
||||
lexer_t lexer;
|
||||
lexer_init(&lexer, "a{3}");
|
||||
|
||||
token_t t = lexer_next(&lexer);
|
||||
ASSERT(t.type == TOKEN_CHAR && t.value == 'a');
|
||||
|
||||
ASSERT(lexer_next(&lexer).type == TOKEN_LBRACE);
|
||||
|
||||
t = lexer_next(&lexer);
|
||||
ASSERT(t.type == TOKEN_CHAR && t.value == '3');
|
||||
|
||||
ASSERT(lexer_next(&lexer).type == TOKEN_RBRACE);
|
||||
}
|
||||
|
||||
TEST(peek) {
|
||||
lexer_t lexer;
|
||||
lexer_init(&lexer, "ab");
|
||||
|
||||
token_t t = lexer_peek(&lexer);
|
||||
ASSERT(t.type == TOKEN_CHAR && t.value == 'a');
|
||||
|
||||
t = lexer_peek(&lexer);
|
||||
ASSERT(t.type == TOKEN_CHAR && t.value == 'a');
|
||||
|
||||
t = lexer_next(&lexer);
|
||||
ASSERT(t.type == TOKEN_CHAR && t.value == 'a');
|
||||
|
||||
t = lexer_peek(&lexer);
|
||||
ASSERT(t.type == TOKEN_CHAR && t.value == 'b');
|
||||
}
|
||||
|
||||
TEST(escape_sequences) {
|
||||
lexer_t lexer;
|
||||
lexer_init(&lexer, "\\n\\t\\r");
|
||||
|
||||
token_t t = lexer_next(&lexer);
|
||||
ASSERT(t.type == TOKEN_CHAR && t.value == '\n');
|
||||
|
||||
t = lexer_next(&lexer);
|
||||
ASSERT(t.type == TOKEN_CHAR && t.value == '\t');
|
||||
|
||||
t = lexer_next(&lexer);
|
||||
ASSERT(t.type == TOKEN_CHAR && t.value == '\r');
|
||||
}
|
||||
|
||||
int main(void) {
|
||||
printf("lexer tests:\n");
|
||||
|
||||
RUN_TEST(simple_chars);
|
||||
RUN_TEST(meta_chars);
|
||||
RUN_TEST(escaped_chars);
|
||||
RUN_TEST(character_classes);
|
||||
RUN_TEST(bracket_expression);
|
||||
RUN_TEST(bracket_range);
|
||||
RUN_TEST(negated_bracket);
|
||||
RUN_TEST(quantifier_braces);
|
||||
RUN_TEST(peek);
|
||||
RUN_TEST(escape_sequences);
|
||||
|
||||
printf("\nlexer: %d passed, %d failed\n", tests_passed, tests_failed);
|
||||
return tests_failed > 0 ? 1 : 0;
|
||||
}
|
||||
294
tests/test_matcher.c
Normal file
294
tests/test_matcher.c
Normal file
@ -0,0 +1,294 @@
|
||||
/* retoor <retoor@molodetz.nl> */
|
||||
#include "../include/loreg.h"
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
|
||||
/* Running totals for this test binary; updated by RUN_TEST and ASSERT. */
static int tests_passed = 0;
static int tests_failed = 0;

/* Opens the definition of a test case as the static function test_<name>. */
#define TEST(name) static void test_##name(void)

/*
 * Runs test_<name> and reports its outcome.
 *
 * A test body signals failure only by incrementing tests_failed (via ASSERT)
 * and returning early, so the counter is sampled before the call. "ok" is
 * printed and tests_passed incremented only when no failure was recorded;
 * previously a failing test was reported as "ok" and counted as passed too.
 */
#define RUN_TEST(name) do { \
    int failed_before_ = tests_failed; \
    printf(" %s... ", #name); \
    test_##name(); \
    if (tests_failed == failed_before_) { \
        printf("ok\n"); \
        tests_passed++; \
    } \
} while(0)

/*
 * Checks `cond` inside a TEST body. When it is false, prints the line number
 * and the condition text, increments tests_failed and returns from the
 * enclosing function, which therefore must return void.
 */
#define ASSERT(cond) do { \
    if (!(cond)) { \
        printf("FAILED at line %d: %s\n", __LINE__, #cond); \
        tests_failed++; \
        return; \
    } \
} while(0)
|
||||
|
||||
/*
 * Compiles `pattern`, searches `text` with loreg_search and fails the
 * enclosing test unless compilation succeeds and the search reports a match.
 *
 * The search outcome is stored and the regex released before the outcome is
 * asserted: ASSERT returns from the test on failure, which would otherwise
 * skip loreg_free and leak the compiled regex.
 */
#define ASSERT_MATCH(pattern, text) do { \
    loreg_error_t err; \
    loreg_regex_t *re = loreg_compile(pattern, &err); \
    ASSERT(re != NULL); \
    loreg_match_t result; \
    bool found = loreg_search(re, text, &result); \
    loreg_free(re); \
    ASSERT(found == true); \
} while(0)
|
||||
|
||||
/*
 * Compiles `pattern`, searches `text` with loreg_search and fails the
 * enclosing test unless compilation succeeds and the search reports no match.
 *
 * The search outcome is stored and the regex released before the outcome is
 * asserted: ASSERT returns from the test on failure, which would otherwise
 * skip loreg_free and leak the compiled regex.
 */
#define ASSERT_NO_MATCH(pattern, text) do { \
    loreg_error_t err; \
    loreg_regex_t *re = loreg_compile(pattern, &err); \
    ASSERT(re != NULL); \
    loreg_match_t result; \
    bool found = loreg_search(re, text, &result); \
    loreg_free(re); \
    ASSERT(found == false); \
} while(0)
|
||||
|
||||
/* A one-character pattern: present alone, present mid-text, absent. */
TEST(simple_char)
{
    ASSERT_MATCH("a", "a");
    ASSERT_MATCH("a", "bab");

    ASSERT_NO_MATCH("a", "bcd");
}
|
||||
|
||||
/* Two concatenated literals must appear adjacent and in order. */
TEST(concat)
{
    ASSERT_MATCH("ab", "ab");
    ASSERT_MATCH("ab", "xaby");

    /* reversed order */
    ASSERT_NO_MATCH("ab", "ba");
}
|
||||
|
||||
/* Either branch of '|' is accepted; text matching neither is rejected. */
TEST(alternation)
{
    /* single-character branches */
    ASSERT_MATCH("a|b", "a");
    ASSERT_MATCH("a|b", "b");

    /* word branches */
    ASSERT_MATCH("cat|dog", "cat");
    ASSERT_MATCH("cat|dog", "dog");
    ASSERT_NO_MATCH("cat|dog", "rat");
}
|
||||
|
||||
/* '*' accepts zero or more repetitions. */
TEST(star)
{
    /* starred atom alone */
    ASSERT_MATCH("a*", "");
    ASSERT_MATCH("a*", "a");
    ASSERT_MATCH("a*", "aaa");

    /* starred atom before a required literal */
    ASSERT_MATCH("a*b", "b");
    ASSERT_MATCH("a*b", "ab");
    ASSERT_MATCH("a*b", "aaab");
}
|
||||
|
||||
/* '+' needs at least one repetition. */
TEST(plus)
{
    ASSERT_NO_MATCH("a+", "");

    ASSERT_MATCH("a+", "a");
    ASSERT_MATCH("a+", "aaa");

    /* followed by a required literal */
    ASSERT_MATCH("a+b", "ab");
    ASSERT_MATCH("a+b", "aaab");
}
|
||||
|
||||
/* '?' makes the preceding atom optional. */
TEST(question)
{
    ASSERT_MATCH("a?", "");
    ASSERT_MATCH("a?", "a");

    /* optional prefix before a required literal */
    ASSERT_MATCH("a?b", "b");
    ASSERT_MATCH("a?b", "ab");
}
|
||||
|
||||
/* '.' stands for exactly one character. */
TEST(dot)
{
    ASSERT_MATCH(".", "a");
    ASSERT_MATCH(".", "x");

    /* one character required between the literals */
    ASSERT_MATCH("a.b", "aab");
    ASSERT_MATCH("a.b", "axb");
    ASSERT_NO_MATCH("a.b", "ab");
}
|
||||
|
||||
/* A bracket set accepts each listed member and nothing else. */
TEST(bracket_simple)
{
    ASSERT_MATCH("[abc]", "a");
    ASSERT_MATCH("[abc]", "b");
    ASSERT_MATCH("[abc]", "c");

    ASSERT_NO_MATCH("[abc]", "d");
}
|
||||
|
||||
/* A bracket range accepts both endpoints and values in between. */
TEST(bracket_range)
{
    ASSERT_MATCH("[a-z]", "a");
    ASSERT_MATCH("[a-z]", "m");
    ASSERT_MATCH("[a-z]", "z");

    /* characters outside the range */
    ASSERT_NO_MATCH("[a-z]", "A");
    ASSERT_NO_MATCH("[a-z]", "0");
}
|
||||
|
||||
/* A negated set rejects its members and accepts other characters. */
TEST(bracket_negated)
{
    ASSERT_NO_MATCH("[^abc]", "a");
    ASSERT_NO_MATCH("[^abc]", "b");

    ASSERT_MATCH("[^abc]", "d");
    ASSERT_MATCH("[^abc]", "x");
}
|
||||
|
||||
/* Groups: plain, repeated, and repeated with alternation inside. */
TEST(group)
{
    ASSERT_MATCH("(ab)", "ab");
    ASSERT_MATCH("(ab)+", "abab");
    ASSERT_MATCH("(a|b)+", "abba");
}
|
||||
|
||||
/* '^' and '$' anchors, separately and together. */
TEST(anchors)
{
    /* start anchor */
    ASSERT_MATCH("^a", "a");
    ASSERT_MATCH("^a", "abc");
    ASSERT_NO_MATCH("^a", "ba");

    /* end anchor */
    ASSERT_MATCH("a$", "a");
    ASSERT_MATCH("a$", "ba");
    ASSERT_NO_MATCH("a$", "ab");

    /* both anchors */
    ASSERT_MATCH("^abc$", "abc");
    ASSERT_NO_MATCH("^abc$", "xabc");
    ASSERT_NO_MATCH("^abc$", "abcx");
}
|
||||
|
||||
/* {n}: n repetitions must be present; a longer run still contains a match. */
TEST(quantifier_exact)
{
    ASSERT_MATCH("a{3}", "aaa");
    ASSERT_MATCH("a{3}", "aaaa");

    ASSERT_NO_MATCH("a{3}", "aa");
}
|
||||
|
||||
/* {n,m}: each count from n to m is accepted; fewer than n is rejected. */
TEST(quantifier_range)
{
    ASSERT_MATCH("a{2,4}", "aa");
    ASSERT_MATCH("a{2,4}", "aaa");
    ASSERT_MATCH("a{2,4}", "aaaa");

    ASSERT_NO_MATCH("a{2,4}", "a");
}
|
||||
|
||||
/* {n,}: n or more repetitions are accepted; fewer than n is rejected. */
TEST(quantifier_open)
{
    ASSERT_MATCH("a{2,}", "aa");
    ASSERT_MATCH("a{2,}", "aaaaa");

    ASSERT_NO_MATCH("a{2,}", "a");
}
|
||||
|
||||
/* \d accepts digits and rejects a letter. */
TEST(class_digit)
{
    ASSERT_MATCH("\\d", "0");
    ASSERT_MATCH("\\d", "9");
    ASSERT_MATCH("\\d+", "123");

    ASSERT_NO_MATCH("\\d", "a");
}
|
||||
|
||||
/* \w accepts letters, digits and underscore; space and '-' are rejected. */
TEST(class_word)
{
    ASSERT_MATCH("\\w", "a");
    ASSERT_MATCH("\\w", "Z");
    ASSERT_MATCH("\\w", "0");
    ASSERT_MATCH("\\w", "_");

    ASSERT_NO_MATCH("\\w", " ");
    ASSERT_NO_MATCH("\\w", "-");
}
|
||||
|
||||
/* \s accepts space, tab and newline; a letter is rejected. */
TEST(class_space)
{
    ASSERT_MATCH("\\s", " ");
    ASSERT_MATCH("\\s", "\t");
    ASSERT_MATCH("\\s", "\n");

    ASSERT_NO_MATCH("\\s", "a");
}
|
||||
|
||||
/* \D, \W and \S each reject what the lowercase class accepts, and vice versa. */
TEST(class_negated)
{
    /* \D */
    ASSERT_NO_MATCH("\\D", "0");
    ASSERT_MATCH("\\D", "a");

    /* \W */
    ASSERT_NO_MATCH("\\W", "a");
    ASSERT_MATCH("\\W", " ");

    /* \S */
    ASSERT_NO_MATCH("\\S", " ");
    ASSERT_MATCH("\\S", "a");
}
|
||||
|
||||
/* Escaped metacharacters match themselves literally. */
TEST(escape_sequences)
{
    /* an escaped dot is no longer a wildcard */
    ASSERT_MATCH("\\.", ".");
    ASSERT_NO_MATCH("\\.", "a");

    ASSERT_MATCH("\\*", "*");
    ASSERT_MATCH("\\+", "+");
    ASSERT_MATCH("\\?", "?");
}
|
||||
|
||||
/* A simplified lowercase e-mail shape: name@host.tld. */
TEST(complex_email)
{
    ASSERT_MATCH("[a-z]+@[a-z]+\\.[a-z]+", "test@example.com");

    ASSERT_NO_MATCH("[a-z]+@[a-z]+\\.[a-z]+", "invalid");
}
|
||||
|
||||
/* ddd-ddd-dddd phone shape; a final group that is one digit short must fail. */
TEST(complex_phone)
{
    ASSERT_MATCH("\\d{3}-\\d{3}-\\d{4}", "123-456-7890");

    ASSERT_NO_MATCH("\\d{3}-\\d{3}-\\d{4}", "123-456-789");
}
|
||||
|
||||
/* URL shape with an optional 's' in the scheme. */
TEST(complex_url)
{
    ASSERT_MATCH("https?://[a-z]+\\.[a-z]+", "http://example.com");
    ASSERT_MATCH("https?://[a-z]+\\.[a-z]+", "https://example.com");
}
|
||||
|
||||
TEST(group_capture) {
|
||||
loreg_error_t err;
|
||||
loreg_regex_t *re = loreg_compile("(\\d+)-(\\d+)", &err);
|
||||
ASSERT(re != NULL);
|
||||
|
||||
loreg_match_t result;
|
||||
ASSERT(loreg_search(re, "123-456", &result));
|
||||
ASSERT(result.group_count == 2);
|
||||
ASSERT(result.groups[0].matched);
|
||||
ASSERT(result.groups[1].matched);
|
||||
|
||||
loreg_free(re);
|
||||
}
|
||||
|
||||
TEST(nested_groups) {
|
||||
loreg_error_t err;
|
||||
loreg_regex_t *re = loreg_compile("((a)(b))", &err);
|
||||
ASSERT(re != NULL);
|
||||
|
||||
loreg_match_t result;
|
||||
ASSERT(loreg_search(re, "ab", &result));
|
||||
ASSERT(result.group_count == 3);
|
||||
|
||||
loreg_free(re);
|
||||
}
|
||||
|
||||
TEST(empty_pattern) {
|
||||
loreg_error_t err;
|
||||
loreg_regex_t *re = loreg_compile("", &err);
|
||||
ASSERT(re != NULL);
|
||||
|
||||
loreg_match_t result;
|
||||
ASSERT(loreg_match(re, "anything", &result));
|
||||
|
||||
loreg_free(re);
|
||||
}
|
||||
|
||||
TEST(match_position) {
|
||||
loreg_error_t err;
|
||||
loreg_regex_t *re = loreg_compile("test", &err);
|
||||
ASSERT(re != NULL);
|
||||
|
||||
loreg_match_t result;
|
||||
ASSERT(loreg_search(re, "xxxtestyyy", &result));
|
||||
ASSERT(result.match_start == 3);
|
||||
ASSERT(result.match_end == 7);
|
||||
|
||||
loreg_free(re);
|
||||
}
|
||||
|
||||
int main(void) {
|
||||
printf("matcher tests:\n");
|
||||
|
||||
RUN_TEST(simple_char);
|
||||
RUN_TEST(concat);
|
||||
RUN_TEST(alternation);
|
||||
RUN_TEST(star);
|
||||
RUN_TEST(plus);
|
||||
RUN_TEST(question);
|
||||
RUN_TEST(dot);
|
||||
RUN_TEST(bracket_simple);
|
||||
RUN_TEST(bracket_range);
|
||||
RUN_TEST(bracket_negated);
|
||||
RUN_TEST(group);
|
||||
RUN_TEST(anchors);
|
||||
RUN_TEST(quantifier_exact);
|
||||
RUN_TEST(quantifier_range);
|
||||
RUN_TEST(quantifier_open);
|
||||
RUN_TEST(class_digit);
|
||||
RUN_TEST(class_word);
|
||||
RUN_TEST(class_space);
|
||||
RUN_TEST(class_negated);
|
||||
RUN_TEST(escape_sequences);
|
||||
RUN_TEST(complex_email);
|
||||
RUN_TEST(complex_phone);
|
||||
RUN_TEST(complex_url);
|
||||
RUN_TEST(group_capture);
|
||||
RUN_TEST(nested_groups);
|
||||
RUN_TEST(empty_pattern);
|
||||
RUN_TEST(match_position);
|
||||
|
||||
printf("\nmatcher: %d passed, %d failed\n", tests_passed, tests_failed);
|
||||
return tests_failed > 0 ? 1 : 0;
|
||||
}
|
||||
159
tests/test_nfa.c
Normal file
159
tests/test_nfa.c
Normal file
@ -0,0 +1,159 @@
|
||||
/* retoor <retoor@molodetz.nl> */
|
||||
#include "../include/nfa.h"
|
||||
#include "../include/parser.h"
|
||||
#include <stdio.h>
|
||||
#include <assert.h>
|
||||
|
||||
/* Counters updated by RUN_TEST and ASSERT; main() prints them and derives the exit code. */
static int tests_passed = 0;
static int tests_failed = 0;

/* Declares the test function test_<name>, which RUN_TEST(name) later invokes. */
#define TEST(name) static void test_##name(void)

/*
 * Runs test_<name>() and reports its outcome.
 *
 * ASSERT signals a failure by incrementing tests_failed and returning from
 * the test early, so the test passed only if tests_failed is unchanged after
 * the call. Only then is "ok" printed and tests_passed incremented; a failing
 * test is therefore never counted as passed as well.
 */
#define RUN_TEST(name) do { \
    int failed_before_ = tests_failed; \
    printf(" %s... ", #name); \
    test_##name(); \
    if (tests_failed == failed_before_) { \
        printf("ok\n"); \
        tests_passed++; \
    } \
} while(0)

/*
 * Checks cond inside a TEST body. On failure it prints the line number and
 * the stringified condition, increments tests_failed, and returns from the
 * enclosing test function; statements after the failed ASSERT (including any
 * cleanup calls) are skipped.
 */
#define ASSERT(cond) do { \
    if (!(cond)) { \
        printf("FAILED at line %d: %s\n", __LINE__, #cond); \
        tests_failed++; \
        return; \
    } \
} while(0)
|
||||
|
||||
static nfa_t *compile_pattern(const char *pattern) {
|
||||
parser_t parser;
|
||||
parser_init(&parser, pattern);
|
||||
ast_node_t *ast = parser_parse(&parser);
|
||||
if (!ast || parser_get_error(&parser) != LOREG_OK) {
|
||||
ast_free(ast);
|
||||
return NULL;
|
||||
}
|
||||
loreg_error_t error;
|
||||
nfa_t *nfa = nfa_from_ast(ast, &error);
|
||||
ast_free(ast);
|
||||
return nfa;
|
||||
}
|
||||
|
||||
TEST(single_char) {
|
||||
nfa_t *nfa = compile_pattern("a");
|
||||
ASSERT(nfa != NULL);
|
||||
ASSERT(nfa->start != NULL);
|
||||
ASSERT(nfa->state_count >= 2);
|
||||
nfa_free(nfa);
|
||||
}
|
||||
|
||||
TEST(concat) {
|
||||
nfa_t *nfa = compile_pattern("ab");
|
||||
ASSERT(nfa != NULL);
|
||||
ASSERT(nfa->start != NULL);
|
||||
nfa_free(nfa);
|
||||
}
|
||||
|
||||
TEST(alternation) {
|
||||
nfa_t *nfa = compile_pattern("a|b");
|
||||
ASSERT(nfa != NULL);
|
||||
ASSERT(nfa->start != NULL);
|
||||
nfa_free(nfa);
|
||||
}
|
||||
|
||||
TEST(star) {
|
||||
nfa_t *nfa = compile_pattern("a*");
|
||||
ASSERT(nfa != NULL);
|
||||
ASSERT(nfa->start != NULL);
|
||||
nfa_free(nfa);
|
||||
}
|
||||
|
||||
TEST(plus) {
|
||||
nfa_t *nfa = compile_pattern("a+");
|
||||
ASSERT(nfa != NULL);
|
||||
ASSERT(nfa->start != NULL);
|
||||
nfa_free(nfa);
|
||||
}
|
||||
|
||||
TEST(question) {
|
||||
nfa_t *nfa = compile_pattern("a?");
|
||||
ASSERT(nfa != NULL);
|
||||
ASSERT(nfa->start != NULL);
|
||||
nfa_free(nfa);
|
||||
}
|
||||
|
||||
TEST(group) {
|
||||
nfa_t *nfa = compile_pattern("(ab)");
|
||||
ASSERT(nfa != NULL);
|
||||
ASSERT(nfa->group_count == 1);
|
||||
nfa_free(nfa);
|
||||
}
|
||||
|
||||
TEST(nested_groups) {
|
||||
nfa_t *nfa = compile_pattern("((a)(b))");
|
||||
ASSERT(nfa != NULL);
|
||||
ASSERT(nfa->group_count == 3);
|
||||
nfa_free(nfa);
|
||||
}
|
||||
|
||||
TEST(bracket) {
|
||||
nfa_t *nfa = compile_pattern("[abc]");
|
||||
ASSERT(nfa != NULL);
|
||||
ASSERT(nfa->start != NULL);
|
||||
nfa_free(nfa);
|
||||
}
|
||||
|
||||
TEST(quantifier) {
|
||||
nfa_t *nfa = compile_pattern("a{2,4}");
|
||||
ASSERT(nfa != NULL);
|
||||
ASSERT(nfa->start != NULL);
|
||||
nfa_free(nfa);
|
||||
}
|
||||
|
||||
TEST(complex_pattern) {
|
||||
nfa_t *nfa = compile_pattern("^([a-z]+)@([a-z]+)\\.([a-z]{2,})$");
|
||||
ASSERT(nfa != NULL);
|
||||
ASSERT(nfa->group_count == 3);
|
||||
nfa_free(nfa);
|
||||
}
|
||||
|
||||
TEST(dot) {
|
||||
nfa_t *nfa = compile_pattern("a.b");
|
||||
ASSERT(nfa != NULL);
|
||||
ASSERT(nfa->start != NULL);
|
||||
nfa_free(nfa);
|
||||
}
|
||||
|
||||
TEST(anchors) {
|
||||
nfa_t *nfa = compile_pattern("^abc$");
|
||||
ASSERT(nfa != NULL);
|
||||
ASSERT(nfa->start != NULL);
|
||||
nfa_free(nfa);
|
||||
}
|
||||
|
||||
TEST(character_classes) {
|
||||
nfa_t *nfa = compile_pattern("\\d\\w\\s");
|
||||
ASSERT(nfa != NULL);
|
||||
ASSERT(nfa->start != NULL);
|
||||
nfa_free(nfa);
|
||||
}
|
||||
|
||||
int main(void) {
|
||||
printf("nfa tests:\n");
|
||||
|
||||
RUN_TEST(single_char);
|
||||
RUN_TEST(concat);
|
||||
RUN_TEST(alternation);
|
||||
RUN_TEST(star);
|
||||
RUN_TEST(plus);
|
||||
RUN_TEST(question);
|
||||
RUN_TEST(group);
|
||||
RUN_TEST(nested_groups);
|
||||
RUN_TEST(bracket);
|
||||
RUN_TEST(quantifier);
|
||||
RUN_TEST(complex_pattern);
|
||||
RUN_TEST(dot);
|
||||
RUN_TEST(anchors);
|
||||
RUN_TEST(character_classes);
|
||||
|
||||
printf("\nnfa: %d passed, %d failed\n", tests_passed, tests_failed);
|
||||
return tests_failed > 0 ? 1 : 0;
|
||||
}
|
||||
301
tests/test_parser.c
Normal file
301
tests/test_parser.c
Normal file
@ -0,0 +1,301 @@
|
||||
/* retoor <retoor@molodetz.nl> */
|
||||
#include "../include/parser.h"
|
||||
#include <stdio.h>
|
||||
#include <assert.h>
|
||||
|
||||
/* Counters updated by RUN_TEST and ASSERT; main() prints them and derives the exit code. */
static int tests_passed = 0;
static int tests_failed = 0;

/* Declares the test function test_<name>, which RUN_TEST(name) later invokes. */
#define TEST(name) static void test_##name(void)

/*
 * Runs test_<name>() and reports its outcome.
 *
 * ASSERT signals a failure by incrementing tests_failed and returning from
 * the test early, so the test passed only if tests_failed is unchanged after
 * the call. Only then is "ok" printed and tests_passed incremented; a failing
 * test is therefore never counted as passed as well.
 */
#define RUN_TEST(name) do { \
    int failed_before_ = tests_failed; \
    printf(" %s... ", #name); \
    test_##name(); \
    if (tests_failed == failed_before_) { \
        printf("ok\n"); \
        tests_passed++; \
    } \
} while(0)

/*
 * Checks cond inside a TEST body. On failure it prints the line number and
 * the stringified condition, increments tests_failed, and returns from the
 * enclosing test function; statements after the failed ASSERT (including any
 * cleanup calls) are skipped.
 */
#define ASSERT(cond) do { \
    if (!(cond)) { \
        printf("FAILED at line %d: %s\n", __LINE__, #cond); \
        tests_failed++; \
        return; \
    } \
} while(0)
|
||||
|
||||
TEST(single_char) {
|
||||
parser_t parser;
|
||||
parser_init(&parser, "a");
|
||||
ast_node_t *ast = parser_parse(&parser);
|
||||
|
||||
ASSERT(ast != NULL);
|
||||
ASSERT(ast->type == AST_CHAR);
|
||||
ASSERT(ast->value == 'a');
|
||||
|
||||
ast_free(ast);
|
||||
}
|
||||
|
||||
TEST(concat) {
|
||||
parser_t parser;
|
||||
parser_init(&parser, "ab");
|
||||
ast_node_t *ast = parser_parse(&parser);
|
||||
|
||||
ASSERT(ast != NULL);
|
||||
ASSERT(ast->type == AST_CONCAT);
|
||||
ASSERT(ast->left->type == AST_CHAR);
|
||||
ASSERT(ast->left->value == 'a');
|
||||
ASSERT(ast->right->type == AST_CHAR);
|
||||
ASSERT(ast->right->value == 'b');
|
||||
|
||||
ast_free(ast);
|
||||
}
|
||||
|
||||
TEST(alternation) {
|
||||
parser_t parser;
|
||||
parser_init(&parser, "a|b");
|
||||
ast_node_t *ast = parser_parse(&parser);
|
||||
|
||||
ASSERT(ast != NULL);
|
||||
ASSERT(ast->type == AST_ALTER);
|
||||
ASSERT(ast->left->type == AST_CHAR);
|
||||
ASSERT(ast->left->value == 'a');
|
||||
ASSERT(ast->right->type == AST_CHAR);
|
||||
ASSERT(ast->right->value == 'b');
|
||||
|
||||
ast_free(ast);
|
||||
}
|
||||
|
||||
TEST(star) {
|
||||
parser_t parser;
|
||||
parser_init(&parser, "a*");
|
||||
ast_node_t *ast = parser_parse(&parser);
|
||||
|
||||
ASSERT(ast != NULL);
|
||||
ASSERT(ast->type == AST_STAR);
|
||||
ASSERT(ast->left->type == AST_CHAR);
|
||||
ASSERT(ast->left->value == 'a');
|
||||
|
||||
ast_free(ast);
|
||||
}
|
||||
|
||||
TEST(plus) {
|
||||
parser_t parser;
|
||||
parser_init(&parser, "a+");
|
||||
ast_node_t *ast = parser_parse(&parser);
|
||||
|
||||
ASSERT(ast != NULL);
|
||||
ASSERT(ast->type == AST_PLUS);
|
||||
ASSERT(ast->left->type == AST_CHAR);
|
||||
ASSERT(ast->left->value == 'a');
|
||||
|
||||
ast_free(ast);
|
||||
}
|
||||
|
||||
TEST(question) {
|
||||
parser_t parser;
|
||||
parser_init(&parser, "a?");
|
||||
ast_node_t *ast = parser_parse(&parser);
|
||||
|
||||
ASSERT(ast != NULL);
|
||||
ASSERT(ast->type == AST_QUESTION);
|
||||
ASSERT(ast->left->type == AST_CHAR);
|
||||
ASSERT(ast->left->value == 'a');
|
||||
|
||||
ast_free(ast);
|
||||
}
|
||||
|
||||
TEST(group) {
|
||||
parser_t parser;
|
||||
parser_init(&parser, "(ab)");
|
||||
ast_node_t *ast = parser_parse(&parser);
|
||||
|
||||
ASSERT(ast != NULL);
|
||||
ASSERT(ast->type == AST_GROUP);
|
||||
ASSERT(ast->group_id == 0);
|
||||
ASSERT(ast->left->type == AST_CONCAT);
|
||||
|
||||
ast_free(ast);
|
||||
}
|
||||
|
||||
TEST(dot) {
|
||||
parser_t parser;
|
||||
parser_init(&parser, ".");
|
||||
ast_node_t *ast = parser_parse(&parser);
|
||||
|
||||
ASSERT(ast != NULL);
|
||||
ASSERT(ast->type == AST_DOT);
|
||||
|
||||
ast_free(ast);
|
||||
}
|
||||
|
||||
TEST(anchors) {
|
||||
parser_t parser;
|
||||
parser_init(&parser, "^a$");
|
||||
ast_node_t *ast = parser_parse(&parser);
|
||||
|
||||
ASSERT(ast != NULL);
|
||||
ASSERT(ast->type == AST_CONCAT);
|
||||
|
||||
ast_free(ast);
|
||||
}
|
||||
|
||||
TEST(bracket_simple) {
|
||||
parser_t parser;
|
||||
parser_init(&parser, "[abc]");
|
||||
ast_node_t *ast = parser_parse(&parser);
|
||||
|
||||
ASSERT(ast != NULL);
|
||||
ASSERT(ast->type == AST_BRACKET);
|
||||
ASSERT(ast->bracket != NULL);
|
||||
ASSERT(ast->bracket->count == 3);
|
||||
|
||||
ast_free(ast);
|
||||
}
|
||||
|
||||
TEST(bracket_range) {
|
||||
parser_t parser;
|
||||
parser_init(&parser, "[a-z]");
|
||||
ast_node_t *ast = parser_parse(&parser);
|
||||
|
||||
ASSERT(ast != NULL);
|
||||
ASSERT(ast->type == AST_BRACKET);
|
||||
ASSERT(ast->bracket != NULL);
|
||||
ASSERT(ast->bracket->count == 1);
|
||||
ASSERT(ast->bracket->ranges[0].start == 'a');
|
||||
ASSERT(ast->bracket->ranges[0].end == 'z');
|
||||
|
||||
ast_free(ast);
|
||||
}
|
||||
|
||||
TEST(bracket_negated) {
|
||||
parser_t parser;
|
||||
parser_init(&parser, "[^a]");
|
||||
ast_node_t *ast = parser_parse(&parser);
|
||||
|
||||
ASSERT(ast != NULL);
|
||||
ASSERT(ast->type == AST_BRACKET);
|
||||
ASSERT(ast->bracket->negated == true);
|
||||
|
||||
ast_free(ast);
|
||||
}
|
||||
|
||||
TEST(quantifier_exact) {
|
||||
parser_t parser;
|
||||
parser_init(&parser, "a{3}");
|
||||
ast_node_t *ast = parser_parse(&parser);
|
||||
|
||||
ASSERT(ast != NULL);
|
||||
ASSERT(ast->type == AST_QUANTIFIER);
|
||||
ASSERT(ast->quant.min == 3);
|
||||
ASSERT(ast->quant.max == 3);
|
||||
|
||||
ast_free(ast);
|
||||
}
|
||||
|
||||
TEST(quantifier_range) {
|
||||
parser_t parser;
|
||||
parser_init(&parser, "a{2,5}");
|
||||
ast_node_t *ast = parser_parse(&parser);
|
||||
|
||||
ASSERT(ast != NULL);
|
||||
ASSERT(ast->type == AST_QUANTIFIER);
|
||||
ASSERT(ast->quant.min == 2);
|
||||
ASSERT(ast->quant.max == 5);
|
||||
|
||||
ast_free(ast);
|
||||
}
|
||||
|
||||
TEST(quantifier_open) {
|
||||
parser_t parser;
|
||||
parser_init(&parser, "a{2,}");
|
||||
ast_node_t *ast = parser_parse(&parser);
|
||||
|
||||
ASSERT(ast != NULL);
|
||||
ASSERT(ast->type == AST_QUANTIFIER);
|
||||
ASSERT(ast->quant.min == 2);
|
||||
ASSERT(ast->quant.max == -1);
|
||||
|
||||
ast_free(ast);
|
||||
}
|
||||
|
||||
TEST(character_class_digit) {
|
||||
parser_t parser;
|
||||
parser_init(&parser, "\\d");
|
||||
ast_node_t *ast = parser_parse(&parser);
|
||||
|
||||
ASSERT(ast != NULL);
|
||||
ASSERT(ast->type == AST_CLASS_DIGIT);
|
||||
|
||||
ast_free(ast);
|
||||
}
|
||||
|
||||
TEST(character_class_word) {
|
||||
parser_t parser;
|
||||
parser_init(&parser, "\\w");
|
||||
ast_node_t *ast = parser_parse(&parser);
|
||||
|
||||
ASSERT(ast != NULL);
|
||||
ASSERT(ast->type == AST_CLASS_WORD);
|
||||
|
||||
ast_free(ast);
|
||||
}
|
||||
|
||||
TEST(complex_pattern) {
|
||||
parser_t parser;
|
||||
parser_init(&parser, "^([a-z]+)@([a-z]+)\\.([a-z]{2,})$");
|
||||
ast_node_t *ast = parser_parse(&parser);
|
||||
|
||||
ASSERT(ast != NULL);
|
||||
ASSERT(parser_get_error(&parser) == LOREG_OK);
|
||||
|
||||
ast_free(ast);
|
||||
}
|
||||
|
||||
TEST(unbalanced_paren) {
|
||||
parser_t parser;
|
||||
parser_init(&parser, "(abc");
|
||||
ast_node_t *ast = parser_parse(&parser);
|
||||
|
||||
ASSERT(ast == NULL || parser_get_error(&parser) == LOREG_ERR_UNBALANCED_PAREN);
|
||||
|
||||
ast_free(ast);
|
||||
}
|
||||
|
||||
TEST(non_greedy) {
|
||||
parser_t parser;
|
||||
parser_init(&parser, "a*?");
|
||||
ast_node_t *ast = parser_parse(&parser);
|
||||
|
||||
ASSERT(ast != NULL);
|
||||
ASSERT(ast->type == AST_STAR);
|
||||
ASSERT(ast->quant.greedy == false);
|
||||
|
||||
ast_free(ast);
|
||||
}
|
||||
|
||||
int main(void) {
|
||||
printf("parser tests:\n");
|
||||
|
||||
RUN_TEST(single_char);
|
||||
RUN_TEST(concat);
|
||||
RUN_TEST(alternation);
|
||||
RUN_TEST(star);
|
||||
RUN_TEST(plus);
|
||||
RUN_TEST(question);
|
||||
RUN_TEST(group);
|
||||
RUN_TEST(dot);
|
||||
RUN_TEST(anchors);
|
||||
RUN_TEST(bracket_simple);
|
||||
RUN_TEST(bracket_range);
|
||||
RUN_TEST(bracket_negated);
|
||||
RUN_TEST(quantifier_exact);
|
||||
RUN_TEST(quantifier_range);
|
||||
RUN_TEST(quantifier_open);
|
||||
RUN_TEST(character_class_digit);
|
||||
RUN_TEST(character_class_word);
|
||||
RUN_TEST(complex_pattern);
|
||||
RUN_TEST(unbalanced_paren);
|
||||
RUN_TEST(non_greedy);
|
||||
|
||||
printf("\nparser: %d passed, %d failed\n", tests_passed, tests_failed);
|
||||
return tests_failed > 0 ? 1 : 0;
|
||||
}
|
||||
Loading…
Reference in New Issue
Block a user