# retoor
#
# Gitea Actions workflow: build, test, valgrind and coverage jobs for loreg.
name: CI

# Run for pushes to, and pull requests against, the main/master branches.
on:
  push:
    branches:
      - main
      - master
  pull_request:
    branches:
      - main
      - master

jobs:
  # Compile the project twice: `make` and `make debug`.
  build:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout
        uses: actions/checkout@v4

      # NOTE(review): valgrind is installed here although neither build step
      # below invokes it — confirm whether it can be dropped from this job.
      - name: Install dependencies
        run: |
          sudo apt-get update
          sudo apt-get install -y gcc make valgrind

      - name: Build release
        run: make

      - name: Build debug
        run: make debug

  # Run the test suite through the Makefile's `test` target.
  test:
    runs-on: ubuntu-latest
    needs: build
    steps:
      - name: Checkout
        uses: actions/checkout@v4

      - name: Install dependencies
        run: |
          sudo apt-get update
          sudo apt-get install -y gcc make

      - name: Run tests
        run: make test

  # Run two of the test binaries under valgrind; --error-exitcode=1 makes any
  # reported error fail the step.
  valgrind:
    runs-on: ubuntu-latest
    needs: build
    steps:
      - name: Checkout
        uses: actions/checkout@v4

      - name: Install dependencies
        run: |
          sudo apt-get update
          sudo apt-get install -y gcc make valgrind

      # Only the two binaries exercised below are built.
      - name: Build test binaries
        run: make build/test_integration build/test_all

      - name: Valgrind comprehensive tests
        run: |
          valgrind --leak-check=full --show-leak-kinds=all \
            --track-origins=yes --error-exitcode=1 \
            ./build/test_all

      - name: Valgrind integration tests
        run: |
          valgrind --leak-check=full --show-leak-kinds=all \
            --track-origins=yes --error-exitcode=1 \
            ./build/test_integration

  # Produce coverage data with `make coverage` and publish build/coverage/.
  coverage:
    runs-on: ubuntu-latest
    needs: test
    steps:
      - name: Checkout
        uses: actions/checkout@v4

      # NOTE(review): gcovr is installed, but the Makefile's coverage target
      # calls `gcov` — confirm whether gcovr is actually needed.
      - name: Install dependencies
        run: |
          sudo apt-get update
          sudo apt-get install -y gcc make gcovr

      - name: Generate coverage
        run: make coverage

      - name: Upload coverage artifacts
        uses: actions/upload-artifact@v4
        with:
          name: coverage-report
          path: build/coverage/
# retoor
#
# Build, test and analysis entry points for loreg.
# Run `make help` for a summary of the available targets.

# Never leave a half-written target behind when its recipe fails.
.DELETE_ON_ERROR:

CC = gcc
CFLAGS = -Wall -Wextra -Werror -pedantic -std=c11 -O3 -march=native -flto
CFLAGS_DEBUG = -Wall -Wextra -pedantic -std=c11 -g -O0 -DDEBUG
CFLAGS_COV = -Wall -Wextra -pedantic -std=c11 -g -O0 --coverage -fprofile-arcs -ftest-coverage
CFLAGS_PROF = -Wall -Wextra -pedantic -std=c11 -O2 -pg

# Write a .d file next to each object so that editing a header rebuilds the
# objects that include it (-MP keeps the build working when a header is removed).
DEPFLAGS = -MMD -MP

SRC_DIR = src
INC_DIR = include
BUILD_DIR = build
TEST_DIR = tests

INCLUDES = -I$(INC_DIR)
LDFLAGS = -flto
LDFLAGS_COV = --coverage

# Installation prefix; DESTDIR is honoured for staged installs.
PREFIX ?= /usr/local

# Library sources: everything except the REPL and the CLI entry point.
LIB_SRCS = $(SRC_DIR)/lexer.c $(SRC_DIR)/ast.c $(SRC_DIR)/parser.c \
           $(SRC_DIR)/nfa.c $(SRC_DIR)/matcher.c $(SRC_DIR)/loreg.c

SRCS = $(LIB_SRCS) $(SRC_DIR)/repl.c $(SRC_DIR)/main.c

HEADERS := $(wildcard $(INC_DIR)/*.h)

OBJS = $(patsubst $(SRC_DIR)/%.c,$(BUILD_DIR)/%.o,$(SRCS))
LIB_OBJS = $(patsubst $(SRC_DIR)/%.c,$(BUILD_DIR)/%.o,$(LIB_SRCS))

TARGET = loreg
LIB_TARGET = libloreg.a

TEST_SRCS = $(TEST_DIR)/test_lexer.c $(TEST_DIR)/test_parser.c \
            $(TEST_DIR)/test_nfa.c $(TEST_DIR)/test_matcher.c \
            $(TEST_DIR)/test_all.c $(TEST_DIR)/test_integration.c

# One test binary per test source, e.g. tests/test_lexer.c -> build/test_lexer.
TEST_BINS = $(patsubst $(TEST_DIR)/%.c,$(BUILD_DIR)/%,$(TEST_SRCS))

.PHONY: all clean test debug coverage profile valgrind valgrind-verbose \
        benchmark help install uninstall

all: $(TARGET)

$(BUILD_DIR):
	mkdir -p $@

# The build directory is order-only: it must exist, but its timestamp must not
# trigger recompilation.
$(BUILD_DIR)/%.o: $(SRC_DIR)/%.c | $(BUILD_DIR)
	$(CC) $(CFLAGS) $(DEPFLAGS) $(INCLUDES) -c $< -o $@

$(TARGET): $(OBJS)
	$(CC) $^ -o $@ $(LDFLAGS)

$(LIB_TARGET): $(LIB_OBJS)
	$(AR) rcs $@ $^

# `clean` runs as a separate sub-make step so that it can never race with the
# rebuild under `make -j`.
debug:
	$(MAKE) clean
	$(MAKE) $(TARGET) CFLAGS='$(CFLAGS_DEBUG)'

# Every test binary is compiled from its own source plus the library sources,
# using the debug flags.
$(TEST_BINS): $(BUILD_DIR)/%: $(TEST_DIR)/%.c $(LIB_SRCS) $(HEADERS) | $(BUILD_DIR)
	$(CC) $(CFLAGS_DEBUG) $(INCLUDES) $< $(LIB_SRCS) -o $@

test: $(TEST_BINS)
	@echo "running lexer tests..."
	@$(BUILD_DIR)/test_lexer
	@echo ""
	@echo "running parser tests..."
	@$(BUILD_DIR)/test_parser
	@echo ""
	@echo "running nfa tests..."
	@$(BUILD_DIR)/test_nfa
	@echo ""
	@echo "running matcher tests..."
	@$(BUILD_DIR)/test_matcher
	@echo ""
	@echo "running comprehensive tests..."
	@$(BUILD_DIR)/test_all
	@echo ""
	@echo "running integration tests..."
	@$(BUILD_DIR)/test_integration

coverage:
	$(MAKE) clean
	@mkdir -p $(BUILD_DIR)/coverage
	$(CC) $(CFLAGS_COV) $(INCLUDES) $(TEST_DIR)/test_all.c $(LIB_SRCS) -o $(BUILD_DIR)/test_coverage $(LDFLAGS_COV)
	$(BUILD_DIR)/test_coverage
	gcov -b $(LIB_SRCS)
	@echo ""
	@echo "coverage report generated"
	@# Best effort: any of these file kinds may be absent from the current directory.
	@mv *.gcov $(BUILD_DIR)/coverage/ 2>/dev/null || true
	@mv *.gcda $(BUILD_DIR)/coverage/ 2>/dev/null || true
	@mv *.gcno $(BUILD_DIR)/coverage/ 2>/dev/null || true

profile:
	$(MAKE) clean
	@mkdir -p $(BUILD_DIR)
	$(CC) $(CFLAGS_PROF) $(INCLUDES) $(TEST_DIR)/test_all.c $(LIB_SRCS) -o $(BUILD_DIR)/test_profile
	$(BUILD_DIR)/test_profile
	gprof $(BUILD_DIR)/test_profile gmon.out > $(BUILD_DIR)/profile.txt
	@echo ""
	@echo "profile report: $(BUILD_DIR)/profile.txt"
	@mv gmon.out $(BUILD_DIR)/ 2>/dev/null || true

valgrind: $(BUILD_DIR)/test_all
	valgrind --leak-check=full --show-leak-kinds=all --track-origins=yes \
		--error-exitcode=1 $(BUILD_DIR)/test_all

valgrind-verbose: $(BUILD_DIR)/test_all
	valgrind --leak-check=full --show-leak-kinds=all --track-origins=yes \
		--verbose --log-file=$(BUILD_DIR)/valgrind.log $(BUILD_DIR)/test_all
	@echo "valgrind log: $(BUILD_DIR)/valgrind.log"

benchmark: $(TARGET)
	@echo "benchmarking..."
	@echo "pattern: [a-z]+@[a-z]+\\.[a-z]+"
	@time -p sh -c 'for i in $$(seq 1 1000); do ./$(TARGET) "[a-z]+@[a-z]+\\.[a-z]+" "test@example.com" > /dev/null; done'
	@echo ""
	@echo "pattern: (a|b)*abb"
	@time -p sh -c 'for i in $$(seq 1 1000); do ./$(TARGET) "(a|b)*abb" "aabababb" > /dev/null; done'

install: $(TARGET)
	install -d $(DESTDIR)$(PREFIX)/bin
	install -m 755 $(TARGET) $(DESTDIR)$(PREFIX)/bin/

uninstall:
	rm -f $(DESTDIR)$(PREFIX)/bin/$(TARGET)

clean:
	rm -rf $(BUILD_DIR) $(TARGET) $(LIB_TARGET)
	rm -f *.gcov *.gcda *.gcno gmon.out

help:
	@echo "loreg makefile targets:"
	@echo "  all               build optimized release binary"
	@echo "  debug             build with debug symbols"
	@echo "  test              run all tests"
	@echo "  coverage          run tests with coverage analysis"
	@echo "  profile           run tests with profiling"
	@echo "  valgrind          run tests under valgrind"
	@echo "  valgrind-verbose  run tests under valgrind, log to $(BUILD_DIR)/valgrind.log"
	@echo "  benchmark         run simple benchmarks"
	@echo "  install           install to /usr/local/bin"
	@echo "  uninstall         remove from /usr/local/bin"
	@echo "  clean             remove build artifacts"
	@echo "  help              show this message"

# Header dependencies generated by DEPFLAGS; absent on the first build.
-include $(OBJS:.o=.d)
+ +## CI + +The project includes Gitea Actions CI that runs on every push and pull request: +- Build verification (release and debug) +- Full test suite (569 tests) +- Valgrind memory leak detection +- Code coverage generation + +## Features + +- Full regex syntax support: literals, metacharacters, quantifiers, character classes, groups, alternation, anchors +- NFA-based matching engine with Thompson construction +- Capturing groups with match position tracking +- Interactive REPL for testing patterns +- Zero external dependencies +- Comprehensive test suite with 569 tests +- Memory-safe implementation verified with Valgrind + +## Building + +```sh +make # optimized release build +make debug # debug build with symbols +make test # run all tests +make coverage # generate coverage report +make profile # generate profiling report +make valgrind # run under valgrind +``` + +## Usage + +### Command Line + +```sh +./loreg "pattern" "text" # search for pattern in text +./loreg -m "pattern" "text" # full match mode +./loreg -i # start REPL +./loreg # start REPL (default) +``` + +### REPL Commands + +``` +:p compile and set pattern +:m match text (anchored) +:s search for pattern in text + search (default) +:h help +:q quit +``` + +### C API + +```c +#include "loreg.h" + +loreg_error_t err; +loreg_regex_t *re = loreg_compile("\\d{3}-\\d{4}", &err); +if (!re) { + fprintf(stderr, "error: %s\n", loreg_error_string(err)); + return 1; +} + +loreg_match_t result; +if (loreg_search(re, "call 555-1234 now", &result)) { + printf("match at [%zu-%zu]\n", result.match_start, result.match_end); +} + +loreg_free(re); +``` + +## Supported Syntax + +| Pattern | Description | +|---------|-------------| +| `.` | any character except newline | +| `*` | zero or more | +| `+` | one or more | +| `?` | zero or one | +| `\|` | alternation | +| `()` | grouping and capture | +| `[]` | character class | +| `[^]` | negated character class | +| `[a-z]` | character range | +| `^` | start anchor | +| `$` 
| end anchor | +| `{n}` | exactly n | +| `{n,}` | n or more | +| `{n,m}` | n to m | +| `\d` | digit [0-9] | +| `\w` | word [a-zA-Z0-9_] | +| `\s` | whitespace | +| `\D` | non-digit | +| `\W` | non-word | +| `\S` | non-whitespace | +| `*?` `+?` `??` | non-greedy quantifiers | + +## Architecture + +``` +src/ +├── lexer.c tokenizer for regex patterns +├── parser.c recursive descent parser producing AST +├── ast.c abstract syntax tree node types +├── nfa.c Thompson NFA construction +├── matcher.c NFA simulation with epsilon closure +├── loreg.c public API +├── repl.c interactive REPL +└── main.c CLI entry point + +include/ +├── loreg.h public header +├── lexer.h lexer interface +├── parser.h parser interface +├── ast.h AST types +├── nfa.h NFA types +├── matcher.h matcher interface +└── repl.h REPL interface + +tests/ +├── test_lexer.c lexer unit tests (10 tests) +├── test_parser.c parser unit tests (20 tests) +├── test_nfa.c NFA construction tests (14 tests) +├── test_matcher.c matching tests (27 tests) +├── test_all.c comprehensive tests (9 tests) +└── test_integration.c integration tests (489 tests) +``` + +## Test Suite + +The test suite contains 569 tests covering: + +| Category | Description | +|----------|-------------| +| Lexer | Tokenization of patterns | +| Parser | AST construction and error handling | +| NFA | State machine construction | +| Matcher | Pattern matching correctness | +| Integration | Real-world regex patterns | + +Integration tests cover: +- Literal matching and concatenation +- Dot metacharacter and wildcards +- Start/end anchors +- All quantifiers (*, +, ?, {n,m}) +- Alternation and grouping +- Character classes and ranges +- Negated character classes +- Escape sequences +- Email, IP, URL, phone patterns +- Greedy vs non-greedy matching +- Nested groups and complex nesting +- Edge cases and boundary conditions +- Pathological/stress patterns + +Run tests with Valgrind verification: +```sh +make test # run all 569 tests +make valgrind # 
verify zero memory leaks +``` + +## Algorithm + +The implementation uses Thompson's construction to convert regex patterns to NFAs: + +1. **Lexer**: Tokenizes the pattern into a stream of tokens +2. **Parser**: Builds an AST using recursive descent parsing +3. **NFA Construction**: Converts AST to NFA using Thompson's algorithm +4. **Matching**: Simulates NFA with epsilon closure for linear-time matching + +Time complexity: O(n*m) where n is pattern length and m is text length. + +## License + +MIT diff --git a/include/ast.h b/include/ast.h new file mode 100644 index 0000000..1b33bfc --- /dev/null +++ b/include/ast.h @@ -0,0 +1,80 @@ +/* retoor */ +#ifndef LOREG_AST_H +#define LOREG_AST_H + +#include +#include + +typedef enum { + AST_CHAR, + AST_DOT, + AST_CONCAT, + AST_ALTER, + AST_STAR, + AST_PLUS, + AST_QUESTION, + AST_GROUP, + AST_ANCHOR_START, + AST_ANCHOR_END, + AST_BRACKET, + AST_QUANTIFIER, + AST_CLASS_DIGIT, + AST_CLASS_WORD, + AST_CLASS_SPACE, + AST_CLASS_NDIGIT, + AST_CLASS_NWORD, + AST_CLASS_NSPACE +} ast_type_t; + +typedef struct { + char start; + char end; +} char_range_t; + +typedef struct { + char_range_t *ranges; + size_t count; + size_t capacity; + bool negated; +} bracket_class_t; + +typedef struct { + int min; + int max; + bool greedy; +} quantifier_t; + +typedef struct ast_node ast_node_t; + +struct ast_node { + ast_type_t type; + char value; + ast_node_t *left; + ast_node_t *right; + int group_id; + bracket_class_t *bracket; + quantifier_t quant; +}; + +ast_node_t *ast_create_char(char c); +ast_node_t *ast_create_dot(void); +ast_node_t *ast_create_concat(ast_node_t *left, ast_node_t *right); +ast_node_t *ast_create_alter(ast_node_t *left, ast_node_t *right); +ast_node_t *ast_create_star(ast_node_t *child, bool greedy); +ast_node_t *ast_create_plus(ast_node_t *child, bool greedy); +ast_node_t *ast_create_question(ast_node_t *child, bool greedy); +ast_node_t *ast_create_group(ast_node_t *child, int group_id); +ast_node_t 
/* retoor */
#ifndef LOREG_H
#define LOREG_H

#include <stdbool.h> /* bool */
#include <stddef.h>  /* size_t */

/*
 * Version string printed by the CLI's -v/--version option.
 * NOTE(review): CHANGELOG.md lists this release as 0.1.0 - confirm which
 * number is intended.
 */
#define LOREG_VERSION "1.0.0"
/* NOTE(review): presumably the upper bound on NFA states - confirm in nfa.c. */
#define LOREG_MAX_STATES 4096
/* Number of slots in loreg_match_t.groups. */
#define LOREG_MAX_GROUPS 32

/* Error codes; loreg_error_string() maps each one to a message. */
typedef enum {
    LOREG_OK = 0,
    LOREG_ERR_INVALID_PATTERN,
    LOREG_ERR_UNBALANCED_PAREN,
    LOREG_ERR_EMPTY_GROUP,
    LOREG_ERR_INVALID_QUANTIFIER,
    LOREG_ERR_INVALID_ESCAPE,
    LOREG_ERR_OUT_OF_MEMORY,
    LOREG_ERR_STATE_OVERFLOW
} loreg_error_t;

/* One capture group; the CLI prints text[start] .. text[end - 1] when matched. */
typedef struct {
    size_t start;
    size_t end;
    bool matched;
} loreg_group_t;

/* Result of loreg_match() / loreg_search(). */
typedef struct {
    bool matched;       /* true when the pattern matched */
    size_t match_start; /* index of the first matched character */
    size_t match_end;   /* index one past the last matched character */
    loreg_group_t groups[LOREG_MAX_GROUPS];
    size_t group_count; /* number of entries of `groups` the CLI iterates over */
} loreg_match_t;

/* Opaque compiled regular expression. */
typedef struct loreg_regex loreg_regex_t;

/*
 * Compile `pattern`. Returns NULL on failure and stores the reason in *error;
 * on success *error is LOREG_OK. Release the result with loreg_free().
 */
loreg_regex_t *loreg_compile(const char *pattern, loreg_error_t *error);

/* Release a compiled regex; a NULL argument is ignored. */
void loreg_free(loreg_regex_t *regex);

/* Match `text` starting at offset 0 (the CLI's "full match mode"). */
bool loreg_match(loreg_regex_t *regex, const char *text, loreg_match_t *result);

/* Search `text` for the pattern. */
bool loreg_search(loreg_regex_t *regex, const char *text, loreg_match_t *result);

/* Static, human-readable message for `error`. */
const char *loreg_error_string(loreg_error_t error);

#endif
/* retoor */
#ifndef LOREG_PARSER_H
#define LOREG_PARSER_H

#include "ast.h"
#include "lexer.h"
#include "loreg.h"

/* State carried through one parse of a pattern. */
typedef struct {
    lexer_t lexer;       /* tokenizer over the pattern */
    token_t current;     /* presumably the current lookahead token - TODO confirm in parser.c */
    loreg_error_t error; /* parse status; read back through parser_get_error() */
    int group_count;     /* presumably the number of capture groups seen - TODO confirm */
} parser_t;

/* Prepare `parser` for parsing `pattern`. */
void parser_init(parser_t *parser, const char *pattern);

/* Parse the pattern into an AST; loreg_compile() releases it with ast_free(). */
ast_node_t *parser_parse(parser_t *parser);

/* Status of the parse; loreg_compile() treats anything but LOREG_OK as failure. */
loreg_error_t parser_get_error(parser_t *parser);

#endif
retoor */ +#ifndef LOREG_REPL_H +#define LOREG_REPL_H + +void repl_run(void); + +#endif diff --git a/src/ast.c b/src/ast.c new file mode 100644 index 0000000..fc88f99 --- /dev/null +++ b/src/ast.c @@ -0,0 +1,169 @@ +/* retoor */ +#include "ast.h" +#include +#include + +static ast_node_t *ast_create_node(ast_type_t type) { + ast_node_t *node = malloc(sizeof(ast_node_t)); + if (!node) return NULL; + node->type = type; + node->value = '\0'; + node->left = NULL; + node->right = NULL; + node->group_id = -1; + node->bracket = NULL; + node->quant.min = 0; + node->quant.max = -1; + node->quant.greedy = true; + return node; +} + +ast_node_t *ast_create_char(char c) { + ast_node_t *node = ast_create_node(AST_CHAR); + if (node) node->value = c; + return node; +} + +ast_node_t *ast_create_dot(void) { + return ast_create_node(AST_DOT); +} + +ast_node_t *ast_create_concat(ast_node_t *left, ast_node_t *right) { + ast_node_t *node = ast_create_node(AST_CONCAT); + if (node) { + node->left = left; + node->right = right; + } + return node; +} + +ast_node_t *ast_create_alter(ast_node_t *left, ast_node_t *right) { + ast_node_t *node = ast_create_node(AST_ALTER); + if (node) { + node->left = left; + node->right = right; + } + return node; +} + +ast_node_t *ast_create_star(ast_node_t *child, bool greedy) { + ast_node_t *node = ast_create_node(AST_STAR); + if (node) { + node->left = child; + node->quant.greedy = greedy; + } + return node; +} + +ast_node_t *ast_create_plus(ast_node_t *child, bool greedy) { + ast_node_t *node = ast_create_node(AST_PLUS); + if (node) { + node->left = child; + node->quant.greedy = greedy; + } + return node; +} + +ast_node_t *ast_create_question(ast_node_t *child, bool greedy) { + ast_node_t *node = ast_create_node(AST_QUESTION); + if (node) { + node->left = child; + node->quant.greedy = greedy; + } + return node; +} + +ast_node_t *ast_create_group(ast_node_t *child, int group_id) { + ast_node_t *node = ast_create_node(AST_GROUP); + if (node) { + node->left = 
child; + node->group_id = group_id; + } + return node; +} + +ast_node_t *ast_create_anchor_start(void) { + return ast_create_node(AST_ANCHOR_START); +} + +ast_node_t *ast_create_anchor_end(void) { + return ast_create_node(AST_ANCHOR_END); +} + +ast_node_t *ast_create_bracket(bracket_class_t *bracket) { + ast_node_t *node = ast_create_node(AST_BRACKET); + if (node) node->bracket = bracket; + return node; +} + +ast_node_t *ast_create_quantifier(ast_node_t *child, int min, int max, bool greedy) { + ast_node_t *node = ast_create_node(AST_QUANTIFIER); + if (node) { + node->left = child; + node->quant.min = min; + node->quant.max = max; + node->quant.greedy = greedy; + } + return node; +} + +ast_node_t *ast_create_class(ast_type_t type) { + return ast_create_node(type); +} + +void ast_free(ast_node_t *node) { + if (!node) return; + ast_free(node->left); + ast_free(node->right); + if (node->bracket) bracket_free(node->bracket); + free(node); +} + +bracket_class_t *bracket_create(void) { + bracket_class_t *bracket = malloc(sizeof(bracket_class_t)); + if (!bracket) return NULL; + bracket->ranges = NULL; + bracket->count = 0; + bracket->capacity = 0; + bracket->negated = false; + return bracket; +} + +static bool bracket_grow(bracket_class_t *bracket) { + size_t new_cap = bracket->capacity == 0 ? 
8 : bracket->capacity * 2; + char_range_t *new_ranges = realloc(bracket->ranges, new_cap * sizeof(char_range_t)); + if (!new_ranges) return false; + bracket->ranges = new_ranges; + bracket->capacity = new_cap; + return true; +} + +void bracket_add_char(bracket_class_t *bracket, char c) { + bracket_add_range(bracket, c, c); +} + +void bracket_add_range(bracket_class_t *bracket, char start, char end) { + if (bracket->count >= bracket->capacity) { + if (!bracket_grow(bracket)) return; + } + bracket->ranges[bracket->count].start = start; + bracket->ranges[bracket->count].end = end; + bracket->count++; +} + +void bracket_free(bracket_class_t *bracket) { + if (!bracket) return; + free(bracket->ranges); + free(bracket); +} + +bool bracket_matches(bracket_class_t *bracket, char c) { + bool found = false; + for (size_t i = 0; i < bracket->count; i++) { + if (c >= bracket->ranges[i].start && c <= bracket->ranges[i].end) { + found = true; + break; + } + } + return bracket->negated ? !found : found; +} diff --git a/src/lexer.c b/src/lexer.c new file mode 100644 index 0000000..95fc092 --- /dev/null +++ b/src/lexer.c @@ -0,0 +1,125 @@ +/* retoor */ +#include "lexer.h" +#include + +void lexer_init(lexer_t *lexer, const char *pattern) { + lexer->pattern = pattern; + lexer->length = strlen(pattern); + lexer->position = 0; + lexer->in_bracket = false; +} + +static token_t make_token(token_type_t type, char value, size_t pos) { + token_t token; + token.type = type; + token.value = value; + token.position = pos; + return token; +} + +token_t lexer_next(lexer_t *lexer) { + if (lexer->position >= lexer->length) { + return make_token(TOKEN_EOF, '\0', lexer->position); + } + + char c = lexer->pattern[lexer->position]; + size_t pos = lexer->position; + lexer->position++; + + if (c == '[' && !lexer->in_bracket) { + lexer->in_bracket = true; + return make_token(TOKEN_LBRACKET, c, pos); + } + + if (c == ']' && lexer->in_bracket) { + lexer->in_bracket = false; + return 
make_token(TOKEN_RBRACKET, c, pos); + } + + if (lexer->in_bracket) { + if (c == '-') { + return make_token(TOKEN_DASH, c, pos); + } + if (c == '^' && pos > 0 && lexer->pattern[pos - 1] == '[') { + return make_token(TOKEN_CARET, c, pos); + } + if (c == '\\' && lexer->position < lexer->length) { + char next = lexer->pattern[lexer->position]; + lexer->position++; + switch (next) { + case 'd': return make_token(TOKEN_CLASS_DIGIT, 'd', pos); + case 'w': return make_token(TOKEN_CLASS_WORD, 'w', pos); + case 's': return make_token(TOKEN_CLASS_SPACE, 's', pos); + case 'D': return make_token(TOKEN_CLASS_NDIGIT, 'D', pos); + case 'W': return make_token(TOKEN_CLASS_NWORD, 'W', pos); + case 'S': return make_token(TOKEN_CLASS_NSPACE, 'S', pos); + case 'n': return make_token(TOKEN_CHAR, '\n', pos); + case 't': return make_token(TOKEN_CHAR, '\t', pos); + case 'r': return make_token(TOKEN_CHAR, '\r', pos); + default: return make_token(TOKEN_CHAR, next, pos); + } + } + return make_token(TOKEN_CHAR, c, pos); + } + + if (c == '\\' && lexer->position < lexer->length) { + char next = lexer->pattern[lexer->position]; + lexer->position++; + switch (next) { + case 'd': return make_token(TOKEN_CLASS_DIGIT, 'd', pos); + case 'w': return make_token(TOKEN_CLASS_WORD, 'w', pos); + case 's': return make_token(TOKEN_CLASS_SPACE, 's', pos); + case 'D': return make_token(TOKEN_CLASS_NDIGIT, 'D', pos); + case 'W': return make_token(TOKEN_CLASS_NWORD, 'W', pos); + case 'S': return make_token(TOKEN_CLASS_NSPACE, 'S', pos); + case 'n': return make_token(TOKEN_CHAR, '\n', pos); + case 't': return make_token(TOKEN_CHAR, '\t', pos); + case 'r': return make_token(TOKEN_CHAR, '\r', pos); + case '.': + case '*': + case '+': + case '?': + case '|': + case '(': + case ')': + case '[': + case ']': + case '{': + case '}': + case '^': + case '$': + case '\\': + return make_token(TOKEN_CHAR, next, pos); + default: + return make_token(TOKEN_CHAR, next, pos); + } + } + + switch (c) { + case '.': return 
make_token(TOKEN_DOT, c, pos); + case '*': return make_token(TOKEN_STAR, c, pos); + case '+': return make_token(TOKEN_PLUS, c, pos); + case '?': return make_token(TOKEN_QUESTION, c, pos); + case '|': return make_token(TOKEN_PIPE, c, pos); + case '(': return make_token(TOKEN_LPAREN, c, pos); + case ')': return make_token(TOKEN_RPAREN, c, pos); + case '^': return make_token(TOKEN_CARET, c, pos); + case '$': return make_token(TOKEN_DOLLAR, c, pos); + case '{': return make_token(TOKEN_LBRACE, c, pos); + case '}': return make_token(TOKEN_RBRACE, c, pos); + default: return make_token(TOKEN_CHAR, c, pos); + } +} + +token_t lexer_peek(lexer_t *lexer) { + size_t saved_pos = lexer->position; + bool saved_bracket = lexer->in_bracket; + token_t token = lexer_next(lexer); + lexer->position = saved_pos; + lexer->in_bracket = saved_bracket; + return token; +} + +bool lexer_eof(lexer_t *lexer) { + return lexer->position >= lexer->length; +} diff --git a/src/loreg.c b/src/loreg.c new file mode 100644 index 0000000..23ab908 --- /dev/null +++ b/src/loreg.c @@ -0,0 +1,71 @@ +/* retoor */ +#include "loreg.h" +#include "parser.h" +#include "nfa.h" +#include "matcher.h" +#include + +struct loreg_regex { + nfa_t *nfa; + ast_node_t *ast; +}; + +loreg_regex_t *loreg_compile(const char *pattern, loreg_error_t *error) { + *error = LOREG_OK; + + loreg_regex_t *regex = malloc(sizeof(loreg_regex_t)); + if (!regex) { + *error = LOREG_ERR_OUT_OF_MEMORY; + return NULL; + } + + parser_t parser; + parser_init(&parser, pattern); + + regex->ast = parser_parse(&parser); + *error = parser_get_error(&parser); + + if (*error != LOREG_OK) { + ast_free(regex->ast); + free(regex); + return NULL; + } + + regex->nfa = nfa_from_ast(regex->ast, error); + if (*error != LOREG_OK) { + ast_free(regex->ast); + free(regex); + return NULL; + } + + return regex; +} + +void loreg_free(loreg_regex_t *regex) { + if (!regex) return; + nfa_free(regex->nfa); + ast_free(regex->ast); + free(regex); +} + +bool 
loreg_match(loreg_regex_t *regex, const char *text, loreg_match_t *result) { + return nfa_match(regex->nfa, text, 0, result); +} + +bool loreg_search(loreg_regex_t *regex, const char *text, loreg_match_t *result) { + return nfa_search(regex->nfa, text, result); +} + +const char *loreg_error_string(loreg_error_t error) { + switch (error) { + case LOREG_OK: return "success"; + case LOREG_ERR_INVALID_PATTERN: return "invalid pattern"; + case LOREG_ERR_UNBALANCED_PAREN: return "unbalanced parentheses"; + case LOREG_ERR_EMPTY_GROUP: return "empty group"; + case LOREG_ERR_INVALID_QUANTIFIER: return "invalid quantifier"; + case LOREG_ERR_INVALID_ESCAPE: return "invalid escape sequence"; + case LOREG_ERR_OUT_OF_MEMORY: return "out of memory"; + case LOREG_ERR_STATE_OVERFLOW: return "state overflow"; + default: return "unknown error"; + } +} diff --git a/src/main.c b/src/main.c new file mode 100644 index 0000000..57c8ec6 --- /dev/null +++ b/src/main.c @@ -0,0 +1,107 @@ +/* retoor */ +#include "loreg.h" +#include "repl.h" +#include +#include + +static void print_usage(const char *program) { + printf("usage: %s [options] [pattern] [text]\n", program); + printf("options:\n"); + printf(" -h, --help show this help\n"); + printf(" -v, --version show version\n"); + printf(" -m, --match full match mode (default is search)\n"); + printf(" -i start interactive REPL\n"); + printf("\n"); + printf("examples:\n"); + printf(" %s start REPL\n", program); + printf(" %s -i start REPL\n", program); + printf(" %s \"a+b\" \"aaab\" search pattern in text\n", program); + printf(" %s -m \"a+b\" \"aaab\" match pattern against text\n", program); +} + +static void print_version(void) { + printf("loreg %s\n", LOREG_VERSION); +} + +static void print_match(const char *text, loreg_match_t *result) { + if (!result->matched) { + printf("no match\n"); + return; + } + + printf("match: \""); + for (size_t i = result->match_start; i < result->match_end; i++) { + printf("%c", text[i]); + } + printf("\" 
[%zu-%zu]\n", result->match_start, result->match_end); + + for (size_t i = 0; i < result->group_count; i++) { + if (result->groups[i].matched) { + printf(" group %zu: \"", i); + for (size_t j = result->groups[i].start; j < result->groups[i].end; j++) { + printf("%c", text[j]); + } + printf("\" [%zu-%zu]\n", result->groups[i].start, result->groups[i].end); + } + } +} + +int main(int argc, char *argv[]) { + if (argc == 1) { + repl_run(); + return 0; + } + + bool match_mode = false; + int arg_idx = 1; + + while (arg_idx < argc && argv[arg_idx][0] == '-') { + if (strcmp(argv[arg_idx], "-h") == 0 || strcmp(argv[arg_idx], "--help") == 0) { + print_usage(argv[0]); + return 0; + } + if (strcmp(argv[arg_idx], "-v") == 0 || strcmp(argv[arg_idx], "--version") == 0) { + print_version(); + return 0; + } + if (strcmp(argv[arg_idx], "-m") == 0 || strcmp(argv[arg_idx], "--match") == 0) { + match_mode = true; + arg_idx++; + continue; + } + if (strcmp(argv[arg_idx], "-i") == 0) { + repl_run(); + return 0; + } + fprintf(stderr, "unknown option: %s\n", argv[arg_idx]); + return 1; + } + + if (argc - arg_idx < 2) { + fprintf(stderr, "error: pattern and text required\n"); + print_usage(argv[0]); + return 1; + } + + const char *pattern = argv[arg_idx]; + const char *text = argv[arg_idx + 1]; + + loreg_error_t error; + loreg_regex_t *regex = loreg_compile(pattern, &error); + if (!regex) { + fprintf(stderr, "error: %s\n", loreg_error_string(error)); + return 1; + } + + loreg_match_t result; + if (match_mode) { + loreg_match(regex, text, &result); + } else { + loreg_search(regex, text, &result); + } + + print_match(text, &result); + + loreg_free(regex); + return result.matched ? 
0 : 1; +} diff --git a/src/matcher.c b/src/matcher.c new file mode 100644 index 0000000..74e5108 --- /dev/null +++ b/src/matcher.c @@ -0,0 +1,411 @@ +/* retoor */ +#include "matcher.h" +#include +#include +#include + +state_set_t *state_set_create(size_t initial_capacity, int group_count) { + state_set_t *set = malloc(sizeof(state_set_t)); + if (!set) return NULL; + + set->states = calloc(initial_capacity, sizeof(nfa_state_t *)); + if (!set->states) { + free(set); + return NULL; + } + + set->count = 0; + set->capacity = initial_capacity; + set->group_count = group_count; + + if (group_count > 0) { + set->group_starts = calloc(group_count, sizeof(size_t)); + set->group_ends = calloc(group_count, sizeof(size_t)); + if (!set->group_starts || !set->group_ends) { + free(set->group_starts); + free(set->group_ends); + free(set->states); + free(set); + return NULL; + } + for (int i = 0; i < group_count; i++) { + set->group_starts[i] = (size_t)-1; + set->group_ends[i] = (size_t)-1; + } + } else { + set->group_starts = NULL; + set->group_ends = NULL; + } + + return set; +} + +void state_set_free(state_set_t *set) { + if (!set) return; + free(set->states); + free(set->group_starts); + free(set->group_ends); + free(set); +} + +void state_set_clear(state_set_t *set) { + memset(set->states, 0, set->capacity * sizeof(nfa_state_t *)); + set->count = 0; +} + +static bool state_set_grow(state_set_t *set) { + size_t new_cap = set->capacity * 2; + nfa_state_t **new_states = realloc(set->states, new_cap * sizeof(nfa_state_t *)); + if (!new_states) return false; + memset(new_states + set->capacity, 0, set->capacity * sizeof(nfa_state_t *)); + set->states = new_states; + set->capacity = new_cap; + return true; +} + +void state_set_add(state_set_t *set, nfa_state_t *state) { + if (state_set_contains(set, state)) return; + if (set->count >= set->capacity) { + if (!state_set_grow(set)) return; + } + set->states[set->count++] = state; +} + +bool state_set_contains(state_set_t *set, 
nfa_state_t *state) { + for (size_t i = 0; i < set->count; i++) { + if (set->states[i] == state) return true; + } + return false; +} + +static bool is_digit(char c) { + return c >= '0' && c <= '9'; +} + +static bool is_word(char c) { + return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || + (c >= '0' && c <= '9') || c == '_'; +} + +static bool is_space(char c) { + return c == ' ' || c == '\t' || c == '\n' || c == '\r' || c == '\f' || c == '\v'; +} + +static bool transition_matches(transition_t *t, char c, size_t pos, size_t len) { + switch (t->type) { + case TRANS_CHAR: + return t->value == c; + case TRANS_DOT: + return c != '\n' && c != '\0'; + case TRANS_BRACKET: + return bracket_matches(t->bracket, c); + case TRANS_CLASS_DIGIT: + return is_digit(c); + case TRANS_CLASS_WORD: + return is_word(c); + case TRANS_CLASS_SPACE: + return is_space(c); + case TRANS_CLASS_NDIGIT: + return !is_digit(c) && c != '\0'; + case TRANS_CLASS_NWORD: + return !is_word(c) && c != '\0'; + case TRANS_CLASS_NSPACE: + return !is_space(c) && c != '\0'; + case TRANS_ANCHOR_START: + return pos == 0; + case TRANS_ANCHOR_END: + return pos == len; + default: + return false; + } +} + +typedef struct { + nfa_state_t *state; + size_t *group_starts; + size_t *group_ends; +} thread_t; + +typedef struct { + thread_t *threads; + size_t count; + size_t capacity; + int group_count; +} thread_list_t; + +static thread_list_t *thread_list_create(size_t capacity, int group_count) { + thread_list_t *list = malloc(sizeof(thread_list_t)); + if (!list) return NULL; + + list->threads = malloc(capacity * sizeof(thread_t)); + if (!list->threads) { + free(list); + return NULL; + } + + for (size_t i = 0; i < capacity; i++) { + if (group_count > 0) { + list->threads[i].group_starts = malloc(group_count * sizeof(size_t)); + list->threads[i].group_ends = malloc(group_count * sizeof(size_t)); + if (!list->threads[i].group_starts || !list->threads[i].group_ends) { + for (size_t j = 0; j <= i; j++) { + 
free(list->threads[j].group_starts); + free(list->threads[j].group_ends); + } + free(list->threads); + free(list); + return NULL; + } + } else { + list->threads[i].group_starts = NULL; + list->threads[i].group_ends = NULL; + } + } + + list->count = 0; + list->capacity = capacity; + list->group_count = group_count; + return list; +} + +static void thread_list_free(thread_list_t *list) { + if (!list) return; + for (size_t i = 0; i < list->capacity; i++) { + free(list->threads[i].group_starts); + free(list->threads[i].group_ends); + } + free(list->threads); + free(list); +} + +static void thread_list_clear(thread_list_t *list) { + list->count = 0; +} + +static bool thread_list_contains_state(thread_list_t *list, nfa_state_t *state) { + for (size_t i = 0; i < list->count; i++) { + if (list->threads[i].state == state) return true; + } + return false; +} + +static void add_thread(thread_list_t *list, nfa_state_t *state, + size_t *group_starts, size_t *group_ends); + +static void follow_epsilons(thread_list_t *list, nfa_state_t *state, + size_t *group_starts, size_t *group_ends, + size_t pos, size_t len, bool *visited) { + if (!state || visited[state->id]) return; + visited[state->id] = true; + + for (size_t i = 0; i < state->trans_count; i++) { + transition_t *t = &state->transitions[i]; + + if (t->type == TRANS_EPSILON) { + follow_epsilons(list, t->target, group_starts, group_ends, + pos, len, visited); + } else if (t->type == TRANS_GROUP_START) { + size_t *new_starts = malloc(list->group_count * sizeof(size_t)); + size_t *new_ends = malloc(list->group_count * sizeof(size_t)); + if (new_starts && new_ends) { + memcpy(new_starts, group_starts, list->group_count * sizeof(size_t)); + memcpy(new_ends, group_ends, list->group_count * sizeof(size_t)); + new_starts[t->group_id] = pos; + follow_epsilons(list, t->target, new_starts, new_ends, + pos, len, visited); + } + free(new_starts); + free(new_ends); + } else if (t->type == TRANS_GROUP_END) { + size_t *new_starts = 
malloc(list->group_count * sizeof(size_t)); + size_t *new_ends = malloc(list->group_count * sizeof(size_t)); + if (new_starts && new_ends) { + memcpy(new_starts, group_starts, list->group_count * sizeof(size_t)); + memcpy(new_ends, group_ends, list->group_count * sizeof(size_t)); + new_ends[t->group_id] = pos; + follow_epsilons(list, t->target, new_starts, new_ends, + pos, len, visited); + } + free(new_starts); + free(new_ends); + } else if (t->type == TRANS_ANCHOR_START || t->type == TRANS_ANCHOR_END) { + if (transition_matches(t, '\0', pos, len)) { + follow_epsilons(list, t->target, group_starts, group_ends, + pos, len, visited); + } + } + } + + add_thread(list, state, group_starts, group_ends); +} + +static void add_thread(thread_list_t *list, nfa_state_t *state, + size_t *group_starts, size_t *group_ends) { + if (!state) return; + if (thread_list_contains_state(list, state)) return; + + if (list->count >= list->capacity) return; + + thread_t *thread = &list->threads[list->count++]; + thread->state = state; + if (list->group_count > 0) { + memcpy(thread->group_starts, group_starts, list->group_count * sizeof(size_t)); + memcpy(thread->group_ends, group_ends, list->group_count * sizeof(size_t)); + } +} + +bool nfa_match(nfa_t *nfa, const char *text, size_t start_pos, loreg_match_t *result) { + size_t len = strlen(text); + size_t num_states = nfa->state_count; + int group_count = nfa->group_count > 0 ? 
nfa->group_count : 1; + + thread_list_t *current = thread_list_create(num_states, group_count); + thread_list_t *next = thread_list_create(num_states, group_count); + bool *visited = calloc(num_states, sizeof(bool)); + + if (!current || !next || !visited) { + thread_list_free(current); + thread_list_free(next); + free(visited); + return false; + } + + size_t *init_starts = calloc(group_count, sizeof(size_t)); + size_t *init_ends = calloc(group_count, sizeof(size_t)); + if (!init_starts || !init_ends) { + free(init_starts); + free(init_ends); + thread_list_free(current); + thread_list_free(next); + free(visited); + return false; + } + + for (int i = 0; i < group_count; i++) { + init_starts[i] = (size_t)-1; + init_ends[i] = (size_t)-1; + } + + memset(visited, 0, num_states * sizeof(bool)); + follow_epsilons(current, nfa->start, init_starts, init_ends, + start_pos, len, visited); + + bool matched = false; + size_t match_end = start_pos; + size_t *best_starts = calloc(group_count, sizeof(size_t)); + size_t *best_ends = calloc(group_count, sizeof(size_t)); + + if (!best_starts || !best_ends) { + free(init_starts); + free(init_ends); + free(best_starts); + free(best_ends); + thread_list_free(current); + thread_list_free(next); + free(visited); + return false; + } + + for (int i = 0; i < group_count; i++) { + best_starts[i] = (size_t)-1; + best_ends[i] = (size_t)-1; + } + + for (size_t i = 0; i < current->count; i++) { + if (current->threads[i].state->accepting) { + matched = true; + match_end = start_pos; + memcpy(best_starts, current->threads[i].group_starts, group_count * sizeof(size_t)); + memcpy(best_ends, current->threads[i].group_ends, group_count * sizeof(size_t)); + break; + } + } + + for (size_t pos = start_pos; pos < len; pos++) { + char c = text[pos]; + thread_list_clear(next); + + for (size_t i = 0; i < current->count; i++) { + thread_t *thread = ¤t->threads[i]; + nfa_state_t *state = thread->state; + + for (size_t j = 0; j < state->trans_count; j++) { + 
transition_t *t = &state->transitions[j]; + + if (t->type != TRANS_EPSILON && + t->type != TRANS_GROUP_START && + t->type != TRANS_GROUP_END && + t->type != TRANS_ANCHOR_START && + t->type != TRANS_ANCHOR_END) { + + if (transition_matches(t, c, pos, len)) { + memset(visited, 0, num_states * sizeof(bool)); + follow_epsilons(next, t->target, + thread->group_starts, thread->group_ends, + pos + 1, len, visited); + } + } + } + } + + if (next->count == 0) break; + + thread_list_t *tmp = current; + current = next; + next = tmp; + + for (size_t i = 0; i < current->count; i++) { + if (current->threads[i].state->accepting) { + matched = true; + match_end = pos + 1; + memcpy(best_starts, current->threads[i].group_starts, group_count * sizeof(size_t)); + memcpy(best_ends, current->threads[i].group_ends, group_count * sizeof(size_t)); + break; + } + } + } + + if (result) { + result->matched = matched; + result->match_start = start_pos; + result->match_end = matched ? match_end : start_pos; + result->group_count = nfa->group_count; + + for (int i = 0; i < LOREG_MAX_GROUPS && i < nfa->group_count; i++) { + result->groups[i].start = best_starts[i]; + result->groups[i].end = best_ends[i]; + result->groups[i].matched = (best_starts[i] != (size_t)-1 && best_ends[i] != (size_t)-1); + } + } + + free(init_starts); + free(init_ends); + free(best_starts); + free(best_ends); + thread_list_free(current); + thread_list_free(next); + free(visited); + + return matched; +} + +bool nfa_search(nfa_t *nfa, const char *text, loreg_match_t *result) { + size_t len = strlen(text); + + for (size_t i = 0; i <= len; i++) { + if (nfa_match(nfa, text, i, result)) { + if (result) { + result->match_start = i; + } + return true; + } + } + + if (result) { + result->matched = false; + result->match_start = 0; + result->match_end = 0; + result->group_count = 0; + } + return false; +} diff --git a/src/nfa.c b/src/nfa.c new file mode 100644 index 0000000..d13bd15 --- /dev/null +++ b/src/nfa.c @@ -0,0 +1,477 @@ +/* 
retoor */ +#include "nfa.h" +#include +#include + +nfa_t *nfa_create(void) { + nfa_t *nfa = malloc(sizeof(nfa_t)); + if (!nfa) return NULL; + nfa->states = NULL; + nfa->state_count = 0; + nfa->capacity = 0; + nfa->start = NULL; + nfa->group_count = 0; + return nfa; +} + +void nfa_free(nfa_t *nfa) { + if (!nfa) return; + for (size_t i = 0; i < nfa->state_count; i++) { + free(nfa->states[i]->transitions); + free(nfa->states[i]); + } + free(nfa->states); + free(nfa); +} + +static bool nfa_grow(nfa_t *nfa) { + size_t new_cap = nfa->capacity == 0 ? 16 : nfa->capacity * 2; + if (new_cap > LOREG_MAX_STATES) { + if (nfa->capacity >= LOREG_MAX_STATES) return false; + new_cap = LOREG_MAX_STATES; + } + nfa_state_t **new_states = realloc(nfa->states, new_cap * sizeof(nfa_state_t *)); + if (!new_states) return false; + nfa->states = new_states; + nfa->capacity = new_cap; + return true; +} + +nfa_state_t *nfa_add_state(nfa_t *nfa) { + if (nfa->state_count >= nfa->capacity) { + if (!nfa_grow(nfa)) return NULL; + } + + nfa_state_t *state = malloc(sizeof(nfa_state_t)); + if (!state) return NULL; + + state->id = (int)nfa->state_count; + state->accepting = false; + state->transitions = NULL; + state->trans_count = 0; + state->trans_capacity = 0; + + nfa->states[nfa->state_count++] = state; + return state; +} + +static bool transition_grow(nfa_state_t *state) { + size_t new_cap = state->trans_capacity == 0 ? 
4 : state->trans_capacity * 2; + transition_t *new_trans = realloc(state->transitions, new_cap * sizeof(transition_t)); + if (!new_trans) return false; + state->transitions = new_trans; + state->trans_capacity = new_cap; + return true; +} + +void nfa_add_transition(nfa_state_t *from, nfa_state_t *to, transition_type_t type, char value) { + if (from->trans_count >= from->trans_capacity) { + if (!transition_grow(from)) return; + } + transition_t *t = &from->transitions[from->trans_count++]; + t->type = type; + t->value = value; + t->target = to; + t->bracket = NULL; + t->group_id = -1; +} + +void nfa_add_bracket_transition(nfa_state_t *from, nfa_state_t *to, bracket_class_t *bracket) { + if (from->trans_count >= from->trans_capacity) { + if (!transition_grow(from)) return; + } + transition_t *t = &from->transitions[from->trans_count++]; + t->type = TRANS_BRACKET; + t->value = '\0'; + t->target = to; + t->bracket = bracket; + t->group_id = -1; +} + +void nfa_add_group_transition(nfa_state_t *from, nfa_state_t *to, transition_type_t type, int group_id) { + if (from->trans_count >= from->trans_capacity) { + if (!transition_grow(from)) return; + } + transition_t *t = &from->transitions[from->trans_count++]; + t->type = type; + t->value = '\0'; + t->target = to; + t->bracket = NULL; + t->group_id = group_id; +} + +static nfa_fragment_t build_nfa(nfa_t *nfa, ast_node_t *ast, loreg_error_t *error); + +static nfa_fragment_t build_char(nfa_t *nfa, char c, loreg_error_t *error) { + nfa_fragment_t frag = {NULL, NULL}; + nfa_state_t *start = nfa_add_state(nfa); + nfa_state_t *accept = nfa_add_state(nfa); + if (!start || !accept) { + *error = LOREG_ERR_OUT_OF_MEMORY; + return frag; + } + nfa_add_transition(start, accept, TRANS_CHAR, c); + frag.start = start; + frag.accept = accept; + return frag; +} + +static nfa_fragment_t build_dot(nfa_t *nfa, loreg_error_t *error) { + nfa_fragment_t frag = {NULL, NULL}; + nfa_state_t *start = nfa_add_state(nfa); + nfa_state_t *accept = 
nfa_add_state(nfa); + if (!start || !accept) { + *error = LOREG_ERR_OUT_OF_MEMORY; + return frag; + } + nfa_add_transition(start, accept, TRANS_DOT, '\0'); + frag.start = start; + frag.accept = accept; + return frag; +} + +static nfa_fragment_t build_class(nfa_t *nfa, transition_type_t type, loreg_error_t *error) { + nfa_fragment_t frag = {NULL, NULL}; + nfa_state_t *start = nfa_add_state(nfa); + nfa_state_t *accept = nfa_add_state(nfa); + if (!start || !accept) { + *error = LOREG_ERR_OUT_OF_MEMORY; + return frag; + } + nfa_add_transition(start, accept, type, '\0'); + frag.start = start; + frag.accept = accept; + return frag; +} + +static nfa_fragment_t build_bracket(nfa_t *nfa, bracket_class_t *bracket, loreg_error_t *error) { + nfa_fragment_t frag = {NULL, NULL}; + nfa_state_t *start = nfa_add_state(nfa); + nfa_state_t *accept = nfa_add_state(nfa); + if (!start || !accept) { + *error = LOREG_ERR_OUT_OF_MEMORY; + return frag; + } + nfa_add_bracket_transition(start, accept, bracket); + frag.start = start; + frag.accept = accept; + return frag; +} + +static nfa_fragment_t build_concat(nfa_t *nfa, ast_node_t *left, ast_node_t *right, loreg_error_t *error) { + nfa_fragment_t frag = {NULL, NULL}; + nfa_fragment_t left_frag = build_nfa(nfa, left, error); + if (*error != LOREG_OK) return frag; + nfa_fragment_t right_frag = build_nfa(nfa, right, error); + if (*error != LOREG_OK) return frag; + + nfa_add_transition(left_frag.accept, right_frag.start, TRANS_EPSILON, '\0'); + frag.start = left_frag.start; + frag.accept = right_frag.accept; + return frag; +} + +static nfa_fragment_t build_alter(nfa_t *nfa, ast_node_t *left, ast_node_t *right, loreg_error_t *error) { + nfa_fragment_t frag = {NULL, NULL}; + nfa_state_t *start = nfa_add_state(nfa); + nfa_state_t *accept = nfa_add_state(nfa); + if (!start || !accept) { + *error = LOREG_ERR_OUT_OF_MEMORY; + return frag; + } + + nfa_fragment_t left_frag = build_nfa(nfa, left, error); + if (*error != LOREG_OK) return frag; + 
nfa_fragment_t right_frag = build_nfa(nfa, right, error); + if (*error != LOREG_OK) return frag; + + nfa_add_transition(start, left_frag.start, TRANS_EPSILON, '\0'); + nfa_add_transition(start, right_frag.start, TRANS_EPSILON, '\0'); + nfa_add_transition(left_frag.accept, accept, TRANS_EPSILON, '\0'); + nfa_add_transition(right_frag.accept, accept, TRANS_EPSILON, '\0'); + + frag.start = start; + frag.accept = accept; + return frag; +} + +static nfa_fragment_t build_star(nfa_t *nfa, ast_node_t *child, bool greedy, loreg_error_t *error) { + nfa_fragment_t frag = {NULL, NULL}; + nfa_state_t *start = nfa_add_state(nfa); + nfa_state_t *accept = nfa_add_state(nfa); + if (!start || !accept) { + *error = LOREG_ERR_OUT_OF_MEMORY; + return frag; + } + + nfa_fragment_t child_frag = build_nfa(nfa, child, error); + if (*error != LOREG_OK) return frag; + + if (greedy) { + nfa_add_transition(start, child_frag.start, TRANS_EPSILON, '\0'); + nfa_add_transition(start, accept, TRANS_EPSILON, '\0'); + } else { + nfa_add_transition(start, accept, TRANS_EPSILON, '\0'); + nfa_add_transition(start, child_frag.start, TRANS_EPSILON, '\0'); + } + nfa_add_transition(child_frag.accept, child_frag.start, TRANS_EPSILON, '\0'); + nfa_add_transition(child_frag.accept, accept, TRANS_EPSILON, '\0'); + + frag.start = start; + frag.accept = accept; + return frag; +} + +static nfa_fragment_t build_plus(nfa_t *nfa, ast_node_t *child, bool greedy, loreg_error_t *error) { + nfa_fragment_t frag = {NULL, NULL}; + nfa_state_t *accept = nfa_add_state(nfa); + if (!accept) { + *error = LOREG_ERR_OUT_OF_MEMORY; + return frag; + } + + nfa_fragment_t child_frag = build_nfa(nfa, child, error); + if (*error != LOREG_OK) return frag; + + if (greedy) { + nfa_add_transition(child_frag.accept, child_frag.start, TRANS_EPSILON, '\0'); + nfa_add_transition(child_frag.accept, accept, TRANS_EPSILON, '\0'); + } else { + nfa_add_transition(child_frag.accept, accept, TRANS_EPSILON, '\0'); + nfa_add_transition(child_frag.accept, 
child_frag.start, TRANS_EPSILON, '\0'); + } + + frag.start = child_frag.start; + frag.accept = accept; + return frag; +} + +static nfa_fragment_t build_question(nfa_t *nfa, ast_node_t *child, bool greedy, loreg_error_t *error) { + nfa_fragment_t frag = {NULL, NULL}; + nfa_state_t *start = nfa_add_state(nfa); + nfa_state_t *accept = nfa_add_state(nfa); + if (!start || !accept) { + *error = LOREG_ERR_OUT_OF_MEMORY; + return frag; + } + + nfa_fragment_t child_frag = build_nfa(nfa, child, error); + if (*error != LOREG_OK) return frag; + + if (greedy) { + nfa_add_transition(start, child_frag.start, TRANS_EPSILON, '\0'); + nfa_add_transition(start, accept, TRANS_EPSILON, '\0'); + } else { + nfa_add_transition(start, accept, TRANS_EPSILON, '\0'); + nfa_add_transition(start, child_frag.start, TRANS_EPSILON, '\0'); + } + nfa_add_transition(child_frag.accept, accept, TRANS_EPSILON, '\0'); + + frag.start = start; + frag.accept = accept; + return frag; +} + +static nfa_fragment_t build_group(nfa_t *nfa, ast_node_t *child, int group_id, loreg_error_t *error) { + nfa_fragment_t frag = {NULL, NULL}; + nfa_state_t *start = nfa_add_state(nfa); + nfa_state_t *accept = nfa_add_state(nfa); + if (!start || !accept) { + *error = LOREG_ERR_OUT_OF_MEMORY; + return frag; + } + + nfa_fragment_t child_frag = build_nfa(nfa, child, error); + if (*error != LOREG_OK) return frag; + + nfa_add_group_transition(start, child_frag.start, TRANS_GROUP_START, group_id); + nfa_add_group_transition(child_frag.accept, accept, TRANS_GROUP_END, group_id); + + if (group_id + 1 > nfa->group_count) { + nfa->group_count = group_id + 1; + } + + frag.start = start; + frag.accept = accept; + return frag; +} + +static nfa_fragment_t build_anchor(nfa_t *nfa, transition_type_t type, loreg_error_t *error) { + nfa_fragment_t frag = {NULL, NULL}; + nfa_state_t *start = nfa_add_state(nfa); + nfa_state_t *accept = nfa_add_state(nfa); + if (!start || !accept) { + *error = LOREG_ERR_OUT_OF_MEMORY; + return frag; + } + 
nfa_add_transition(start, accept, type, '\0'); + frag.start = start; + frag.accept = accept; + return frag; +} + +static nfa_fragment_t build_quantifier(nfa_t *nfa, ast_node_t *child, int min, int max, bool greedy, loreg_error_t *error) { + nfa_fragment_t frag = {NULL, NULL}; + + if (min == 0 && max == 0) { + nfa_state_t *state = nfa_add_state(nfa); + if (!state) { + *error = LOREG_ERR_OUT_OF_MEMORY; + return frag; + } + frag.start = state; + frag.accept = state; + return frag; + } + + nfa_state_t *start = nfa_add_state(nfa); + if (!start) { + *error = LOREG_ERR_OUT_OF_MEMORY; + return frag; + } + + nfa_state_t *current = start; + + for (int i = 0; i < min; i++) { + nfa_fragment_t rep = build_nfa(nfa, child, error); + if (*error != LOREG_OK) return frag; + nfa_add_transition(current, rep.start, TRANS_EPSILON, '\0'); + current = rep.accept; + } + + if (max < 0) { + nfa_state_t *loop_start = nfa_add_state(nfa); + nfa_state_t *accept = nfa_add_state(nfa); + if (!loop_start || !accept) { + *error = LOREG_ERR_OUT_OF_MEMORY; + return frag; + } + + nfa_add_transition(current, loop_start, TRANS_EPSILON, '\0'); + + nfa_fragment_t rep = build_nfa(nfa, child, error); + if (*error != LOREG_OK) return frag; + + if (greedy) { + nfa_add_transition(loop_start, rep.start, TRANS_EPSILON, '\0'); + nfa_add_transition(loop_start, accept, TRANS_EPSILON, '\0'); + } else { + nfa_add_transition(loop_start, accept, TRANS_EPSILON, '\0'); + nfa_add_transition(loop_start, rep.start, TRANS_EPSILON, '\0'); + } + nfa_add_transition(rep.accept, loop_start, TRANS_EPSILON, '\0'); + + frag.start = start; + frag.accept = accept; + } else { + nfa_state_t *accept = nfa_add_state(nfa); + if (!accept) { + *error = LOREG_ERR_OUT_OF_MEMORY; + return frag; + } + + nfa_add_transition(current, accept, TRANS_EPSILON, '\0'); + + for (int i = min; i < max; i++) { + nfa_fragment_t rep = build_nfa(nfa, child, error); + if (*error != LOREG_OK) return frag; + + if (greedy) { + nfa_add_transition(current, rep.start, 
TRANS_EPSILON, '\0'); + } else { + nfa_add_transition(current, accept, TRANS_EPSILON, '\0'); + nfa_add_transition(current, rep.start, TRANS_EPSILON, '\0'); + } + + if (greedy) { + nfa_add_transition(rep.accept, accept, TRANS_EPSILON, '\0'); + } + current = rep.accept; + } + + if (!greedy) { + nfa_add_transition(current, accept, TRANS_EPSILON, '\0'); + } + + frag.start = start; + frag.accept = accept; + } + + return frag; +} + +static nfa_fragment_t build_nfa(nfa_t *nfa, ast_node_t *ast, loreg_error_t *error) { + nfa_fragment_t frag = {NULL, NULL}; + + if (!ast) { + nfa_state_t *state = nfa_add_state(nfa); + if (!state) { + *error = LOREG_ERR_OUT_OF_MEMORY; + return frag; + } + frag.start = state; + frag.accept = state; + return frag; + } + + switch (ast->type) { + case AST_CHAR: + return build_char(nfa, ast->value, error); + case AST_DOT: + return build_dot(nfa, error); + case AST_CONCAT: + return build_concat(nfa, ast->left, ast->right, error); + case AST_ALTER: + return build_alter(nfa, ast->left, ast->right, error); + case AST_STAR: + return build_star(nfa, ast->left, ast->quant.greedy, error); + case AST_PLUS: + return build_plus(nfa, ast->left, ast->quant.greedy, error); + case AST_QUESTION: + return build_question(nfa, ast->left, ast->quant.greedy, error); + case AST_GROUP: + return build_group(nfa, ast->left, ast->group_id, error); + case AST_ANCHOR_START: + return build_anchor(nfa, TRANS_ANCHOR_START, error); + case AST_ANCHOR_END: + return build_anchor(nfa, TRANS_ANCHOR_END, error); + case AST_BRACKET: + return build_bracket(nfa, ast->bracket, error); + case AST_QUANTIFIER: + return build_quantifier(nfa, ast->left, ast->quant.min, ast->quant.max, ast->quant.greedy, error); + case AST_CLASS_DIGIT: + return build_class(nfa, TRANS_CLASS_DIGIT, error); + case AST_CLASS_WORD: + return build_class(nfa, TRANS_CLASS_WORD, error); + case AST_CLASS_SPACE: + return build_class(nfa, TRANS_CLASS_SPACE, error); + case AST_CLASS_NDIGIT: + return build_class(nfa, 
TRANS_CLASS_NDIGIT, error); + case AST_CLASS_NWORD: + return build_class(nfa, TRANS_CLASS_NWORD, error); + case AST_CLASS_NSPACE: + return build_class(nfa, TRANS_CLASS_NSPACE, error); + } + + return frag; +} + +nfa_t *nfa_from_ast(ast_node_t *ast, loreg_error_t *error) { + *error = LOREG_OK; + nfa_t *nfa = nfa_create(); + if (!nfa) { + *error = LOREG_ERR_OUT_OF_MEMORY; + return NULL; + } + + nfa_fragment_t frag = build_nfa(nfa, ast, error); + if (*error != LOREG_OK) { + nfa_free(nfa); + return NULL; + } + + nfa->start = frag.start; + frag.accept->accepting = true; + + return nfa; +} diff --git a/src/parser.c b/src/parser.c new file mode 100644 index 0000000..a26cc4a --- /dev/null +++ b/src/parser.c @@ -0,0 +1,309 @@ +/* retoor */ +#include "parser.h" +#include +#include + +static void parser_advance(parser_t *parser) { + parser->current = lexer_next(&parser->lexer); +} + +void parser_init(parser_t *parser, const char *pattern) { + lexer_init(&parser->lexer, pattern); + parser->current = lexer_next(&parser->lexer); + parser->error = LOREG_OK; + parser->group_count = 0; +} + +loreg_error_t parser_get_error(parser_t *parser) { + return parser->error; +} + +static ast_node_t *parse_expr(parser_t *parser); +static ast_node_t *parse_term(parser_t *parser); +static ast_node_t *parse_factor(parser_t *parser); +static ast_node_t *parse_atom(parser_t *parser); +static ast_node_t *parse_bracket(parser_t *parser); +static int parse_number(parser_t *parser); + +static ast_node_t *parse_expr(parser_t *parser) { + ast_node_t *left = parse_term(parser); + if (!left || parser->error != LOREG_OK) return left; + + while (parser->current.type == TOKEN_PIPE) { + parser_advance(parser); + ast_node_t *right = parse_term(parser); + if (!right) { + ast_free(left); + return NULL; + } + left = ast_create_alter(left, right); + if (!left) { + parser->error = LOREG_ERR_OUT_OF_MEMORY; + return NULL; + } + } + return left; +} + +static ast_node_t *parse_term(parser_t *parser) { + ast_node_t *left 
= NULL;
+
+    while (parser->current.type != TOKEN_EOF &&
+           parser->current.type != TOKEN_PIPE &&
+           parser->current.type != TOKEN_RPAREN) {
+        ast_node_t *factor = parse_factor(parser);
+        if (!factor) {
+            ast_free(left);
+            return NULL;
+        }
+        if (left == NULL) {
+            left = factor;
+        } else {
+            left = ast_create_concat(left, factor);
+            if (!left) {
+                parser->error = LOREG_ERR_OUT_OF_MEMORY;
+                return NULL;
+            }
+        }
+    }
+    return left;
+}
+
+/*
+ * factor := atom quantifier*
+ * quantifier := ('*' | '+' | '?' | '{' n '}' | '{' n ',' '}' | '{' n ',' m '}') '?'?
+ *
+ * Wraps the atom in one node per quantifier that follows it. A trailing '?'
+ * after a quantifier selects the lazy (non-greedy) variant. For the brace
+ * form, an open upper bound is passed to ast_create_quantifier as max = -1.
+ *
+ * Returns the resulting node, or NULL with parser->error set to
+ * LOREG_ERR_INVALID_QUANTIFIER (malformed or inverted brace bounds) or
+ * LOREG_ERR_OUT_OF_MEMORY (node creation failed). A NULL or failed atom is
+ * passed through unchanged.
+ */
+static ast_node_t *parse_factor(parser_t *parser) {
+    ast_node_t *atom = parse_atom(parser);
+    if (!atom || parser->error != LOREG_OK) return atom;
+
+    while (parser->current.type == TOKEN_STAR ||
+           parser->current.type == TOKEN_PLUS ||
+           parser->current.type == TOKEN_QUESTION ||
+           parser->current.type == TOKEN_LBRACE) {
+
+        token_type_t quant_type = parser->current.type;
+        bool greedy = true;
+        parser_advance(parser);
+
+        if (quant_type == TOKEN_LBRACE) {
+            /* The lazy marker belongs after the closing brace; a '?' directly
+             * after '{' is not consumed here and is rejected below because
+             * the bounds are then not followed by '}'. */
+            int min = parse_number(parser);
+            int max = min;
+
+            if (parser->current.type == TOKEN_CHAR && parser->current.value == ',') {
+                parser_advance(parser);
+                if (parser->current.type == TOKEN_RBRACE) {
+                    max = -1;
+                } else {
+                    max = parse_number(parser);
+                }
+            }
+
+            /* Reject a missing '}' and inverted bounds such as {3,1}. */
+            if (parser->current.type != TOKEN_RBRACE || (max >= 0 && max < min)) {
+                parser->error = LOREG_ERR_INVALID_QUANTIFIER;
+                ast_free(atom);
+                return NULL;
+            }
+            parser_advance(parser);
+
+            if (parser->current.type == TOKEN_QUESTION) {
+                greedy = false;
+                parser_advance(parser);
+            }
+
+            atom = ast_create_quantifier(atom, min, max, greedy);
+        } else {
+            if (parser->current.type == TOKEN_QUESTION) {
+                greedy = false;
+                parser_advance(parser);
+            }
+
+            if (quant_type == TOKEN_STAR) {
+                atom = ast_create_star(atom, greedy);
+            } else if (quant_type == TOKEN_PLUS) {
+                atom = ast_create_plus(atom, greedy);
+            } else {
+                atom = ast_create_question(atom, greedy);
+            }
+        }
+
+        /* NOTE(review): the wrapped child is no longer referenced here when
+         * creation fails; whether it leaks depends on the ast_create_*
+         * ownership contract, which is not visible from this file. */
+        if (!atom) {
+            parser->error = LOREG_ERR_OUT_OF_MEMORY;
+            return NULL;
+        }
+    }
+    return atom;
+}
+
+static int parse_number(parser_t *parser) {
+    int num = 0;
+    while
(parser->current.type == TOKEN_CHAR && isdigit(parser->current.value)) { + num = num * 10 + (parser->current.value - '0'); + parser_advance(parser); + } + return num; +} + +static ast_node_t *parse_atom(parser_t *parser) { + ast_node_t *node = NULL; + + switch (parser->current.type) { + case TOKEN_CHAR: + node = ast_create_char(parser->current.value); + parser_advance(parser); + break; + + case TOKEN_DOT: + node = ast_create_dot(); + parser_advance(parser); + break; + + case TOKEN_CARET: + node = ast_create_anchor_start(); + parser_advance(parser); + break; + + case TOKEN_DOLLAR: + node = ast_create_anchor_end(); + parser_advance(parser); + break; + + case TOKEN_LPAREN: { + parser_advance(parser); + int group_id = parser->group_count++; + ast_node_t *inner = parse_expr(parser); + if (parser->current.type != TOKEN_RPAREN) { + parser->error = LOREG_ERR_UNBALANCED_PAREN; + ast_free(inner); + return NULL; + } + parser_advance(parser); + node = ast_create_group(inner, group_id); + break; + } + + case TOKEN_LBRACKET: + node = parse_bracket(parser); + break; + + case TOKEN_CLASS_DIGIT: + node = ast_create_class(AST_CLASS_DIGIT); + parser_advance(parser); + break; + + case TOKEN_CLASS_WORD: + node = ast_create_class(AST_CLASS_WORD); + parser_advance(parser); + break; + + case TOKEN_CLASS_SPACE: + node = ast_create_class(AST_CLASS_SPACE); + parser_advance(parser); + break; + + case TOKEN_CLASS_NDIGIT: + node = ast_create_class(AST_CLASS_NDIGIT); + parser_advance(parser); + break; + + case TOKEN_CLASS_NWORD: + node = ast_create_class(AST_CLASS_NWORD); + parser_advance(parser); + break; + + case TOKEN_CLASS_NSPACE: + node = ast_create_class(AST_CLASS_NSPACE); + parser_advance(parser); + break; + + case TOKEN_EOF: + case TOKEN_PIPE: + case TOKEN_RPAREN: + return NULL; + + default: + parser->error = LOREG_ERR_INVALID_PATTERN; + return NULL; + } + + if (!node && parser->error == LOREG_OK) { + parser->error = LOREG_ERR_OUT_OF_MEMORY; + } + return node; +} + +static ast_node_t 
*parse_bracket(parser_t *parser) { + parser_advance(parser); + + bracket_class_t *bracket = bracket_create(); + if (!bracket) { + parser->error = LOREG_ERR_OUT_OF_MEMORY; + return NULL; + } + + if (parser->current.type == TOKEN_CARET) { + bracket->negated = true; + parser_advance(parser); + } + + while (parser->current.type != TOKEN_RBRACKET && parser->current.type != TOKEN_EOF) { + if (parser->current.type == TOKEN_CLASS_DIGIT || + parser->current.type == TOKEN_CLASS_WORD || + parser->current.type == TOKEN_CLASS_SPACE || + parser->current.type == TOKEN_CLASS_NDIGIT || + parser->current.type == TOKEN_CLASS_NWORD || + parser->current.type == TOKEN_CLASS_NSPACE) { + + switch (parser->current.type) { + case TOKEN_CLASS_DIGIT: + bracket_add_range(bracket, '0', '9'); + break; + case TOKEN_CLASS_WORD: + bracket_add_range(bracket, 'a', 'z'); + bracket_add_range(bracket, 'A', 'Z'); + bracket_add_range(bracket, '0', '9'); + bracket_add_char(bracket, '_'); + break; + case TOKEN_CLASS_SPACE: + bracket_add_char(bracket, ' '); + bracket_add_char(bracket, '\t'); + bracket_add_char(bracket, '\n'); + bracket_add_char(bracket, '\r'); + bracket_add_char(bracket, '\f'); + bracket_add_char(bracket, '\v'); + break; + default: + break; + } + parser_advance(parser); + continue; + } + + char start = parser->current.value; + parser_advance(parser); + + if (parser->current.type == TOKEN_DASH) { + parser_advance(parser); + if (parser->current.type == TOKEN_RBRACKET || parser->current.type == TOKEN_EOF) { + bracket_add_char(bracket, start); + bracket_add_char(bracket, '-'); + } else { + char end = parser->current.value; + bracket_add_range(bracket, start, end); + parser_advance(parser); + } + } else { + bracket_add_char(bracket, start); + } + } + + if (parser->current.type != TOKEN_RBRACKET) { + bracket_free(bracket); + parser->error = LOREG_ERR_INVALID_PATTERN; + return NULL; + } + parser_advance(parser); + + return ast_create_bracket(bracket); +} + +ast_node_t *parser_parse(parser_t 
*parser) {
+    /* An empty pattern yields no AST; parser->error is not touched here. */
+    if (parser->current.type == TOKEN_EOF) {
+        return NULL;
+    }
+    return parse_expr(parser);
+}
diff --git a/src/repl.c b/src/repl.c
new file mode 100644
index 0000000..7386a77
--- /dev/null
+++ b/src/repl.c
@@ -0,0 +1,170 @@
+/* retoor */
+#include "repl.h"
+#include "loreg.h"
+/* NOTE(review): the three system header names were lost from this copy.
+ * <stdio.h> and <string.h> are required by the code below; <stdlib.h> is
+ * presumed to be the third - confirm against the repository. */
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#define MAX_INPUT 4096
+
+/* Print the one-time startup banner with the version and a command summary. */
+static void print_banner(void) {
+    printf("loreg v%s - regex interpreter\n", LOREG_VERSION);
+    printf("commands: :q quit, :h help, :p set pattern, :m match, :s search\n\n");
+}
+
+/* Print the REPL command list and the supported regex syntax. */
+static void print_help(void) {
+    printf("loreg REPL commands:\n");
+    printf(" :q quit\n");
+    printf(" :h show this help\n");
+    /* The <pattern>/<text> placeholders had been stripped from these lines,
+     * which left the last entry with no command shown at all. */
+    printf(" :p <pattern> compile and set pattern\n");
+    printf(" :m <text> match text against pattern (anchored)\n");
+    printf(" :s <text> search for pattern in text\n");
+    printf(" <text> search for pattern in text\n\n");
+    printf("regex syntax:\n");
+    printf(" . any character\n");
+    printf(" * zero or more\n");
+    printf(" + one or more\n");
+    printf(" ? zero or one\n");
+    printf(" | alternation\n");
+    printf(" () grouping\n");
+    printf(" [] character class\n");
+    printf(" [^] negated class\n");
+    printf(" ^ start anchor\n");
+    printf(" $ end anchor\n");
+    printf(" {n} exactly n\n");
+    printf(" {n,} n or more\n");
+    printf(" {n,m} n to m\n");
+    printf(" \\d digit\n");
+    printf(" \\w word character\n");
+    printf(" \\s whitespace\n");
+    printf(" \\D \\W \\S negated classes\n\n");
+}
+
+/* Write text[start..end) to stdout one character at a time; writes nothing
+ * when end <= start. */
+static void print_span(const char *text, size_t start, size_t end) {
+    for (size_t i = start; i < end; i++) {
+        putchar(text[i]);
+    }
+}
+
+/*
+ * Print the outcome of a match/search against `text`: "no match", or the
+ * matched substring with its [start-end] offsets followed by one line per
+ * capture group that participated in the match.
+ */
+static void print_match(const char *text, loreg_match_t *result) {
+    if (!result->matched) {
+        printf("no match\n");
+        return;
+    }
+
+    printf("match: \"");
+    print_span(text, result->match_start, result->match_end);
+    printf("\" [%zu-%zu]\n", result->match_start, result->match_end);
+
+    for (size_t i = 0; i < result->group_count; i++) {
+        if (result->groups[i].matched) {
+            printf(" group %zu: \"", i);
+            print_span(text, result->groups[i].start, result->groups[i].end);
+            printf("\" [%zu-%zu]\n",
result->groups[i].start, result->groups[i].end);
+        }
+    }
+}
+
+/*
+ * Prompt and read one line from stdin into a static buffer.
+ *
+ * Returns the buffer with the trailing newline removed, or NULL on EOF or
+ * read error. The buffer is reused by the next call. Input longer than
+ * MAX_INPUT - 1 characters is truncated and the rest of that line is
+ * discarded, so the overflow is not read back as the next command.
+ */
+static char *read_line(void) {
+    static char buffer[MAX_INPUT];
+    printf("> ");
+    fflush(stdout);
+
+    if (!fgets(buffer, MAX_INPUT, stdin)) {
+        return NULL;
+    }
+
+    size_t len = strlen(buffer);
+    if (len > 0 && buffer[len - 1] == '\n') {
+        buffer[len - 1] = '\0';
+    } else {
+        /* No newline was stored: either the line did not fit or input
+         * ended without one. Drain up to the end of the line. */
+        int c;
+        while ((c = getchar()) != EOF && c != '\n') {
+        }
+    }
+
+    return buffer;
+}
+
+/* Return the argument of a three-character ":x " command prefix, with any
+ * additional leading spaces skipped. */
+static const char *command_arg(const char *line) {
+    const char *arg = line + 3;
+    while (*arg == ' ') arg++;
+    return arg;
+}
+
+/* Run the compiled pattern over `text` (anchored match when `anchored` is
+ * non-zero, search otherwise) and print the result. Reports an error when
+ * no pattern has been set yet. */
+static void run_and_print(loreg_regex_t *regex, const char *text, int anchored) {
+    if (!regex) {
+        printf("error: no pattern set (use :p <pattern>)\n");
+        return;
+    }
+
+    loreg_match_t result;
+    if (anchored) {
+        loreg_match(regex, text, &result);
+    } else {
+        loreg_search(regex, text, &result);
+    }
+    print_match(text, &result);
+}
+
+/*
+ * Interactive loop: read commands until EOF or :q / :quit.
+ *
+ *   :h / :help   show help
+ *   :p <pattern> compile <pattern> and make it the current pattern
+ *   :m <text>    anchored match of <text>
+ *   :s <text>    search within <text>
+ *   <text>       any other non-command line is searched like :s
+ *
+ * The current pattern is freed when it is replaced and on exit.
+ */
+void repl_run(void) {
+    print_banner();
+
+    loreg_regex_t *regex = NULL;
+    char *line;
+
+    while ((line = read_line()) != NULL) {
+        if (line[0] == '\0') continue;
+
+        if (strcmp(line, ":q") == 0 || strcmp(line, ":quit") == 0) {
+            break;
+        }
+
+        if (strcmp(line, ":h") == 0 || strcmp(line, ":help") == 0) {
+            print_help();
+            continue;
+        }
+
+        if (strncmp(line, ":p ", 3) == 0) {
+            const char *pattern = command_arg(line);
+
+            /* Drop the old pattern first; on a compile error no pattern
+             * remains set. */
+            if (regex) {
+                loreg_free(regex);
+                regex = NULL;
+            }
+
+            /* Initialised so the message below is well-defined even if
+             * loreg_compile fails without writing the error. */
+            loreg_error_t error = LOREG_OK;
+            regex = loreg_compile(pattern, &error);
+            if (!regex) {
+                printf("error: %s\n", loreg_error_string(error));
+            } else {
+                printf("pattern compiled: %s\n", pattern);
+            }
+            continue;
+        }
+
+        if (strncmp(line, ":m ", 3) == 0) {
+            run_and_print(regex, command_arg(line), 1);
+            continue;
+        }
+
+        if (strncmp(line, ":s ", 3) == 0) {
+            run_and_print(regex, command_arg(line), 0);
+            continue;
+        }
+
+        if (line[0] == ':') {
+            printf("unknown command: %s\n", line);
+            continue;
+        }
+
+        /* Bare text: search it with the current pattern. */
+        run_and_print(regex, line, 0);
+    }
+
+    if (regex) {
+        loreg_free(regex);
+    }
+
+    printf("\n");
+}
diff --git
a/tests/test_all.c b/tests/test_all.c
new file mode 100644
index 0000000..07088c8
--- /dev/null
+++ b/tests/test_all.c
@@ -0,0 +1,252 @@
+/* retoor */
+#include "../include/loreg.h"
+/* NOTE(review): the three system header names were lost from this copy.
+ * <stdio.h> (printf) and <time.h> (clock) are required by this file;
+ * <string.h> is presumed to be the third - confirm against the repository. */
+#include <stdio.h>
+#include <string.h>
+#include <time.h>
+
+static int total_passed = 0;
+static int total_failed = 0;
+
+/* Fail the current test: print msg, count one failure and return from the
+ * test function. Code after a failed ASSERT (including loreg_free) does not
+ * run. */
+#define ASSERT(cond, msg) do { \
+    if (!(cond)) { \
+        printf(" FAIL: %s\n", msg); \
+        total_failed++; \
+        return; \
+    } \
+} while(0)
+
+#define TEST(name) static void test_##name(void)
+
+/* Run one test and count it as passed only if it recorded no failure.
+ * Previously total_passed was incremented unconditionally, so a failing
+ * test was counted as both failed and passed. */
+#define RUN(name) do { \
+    int failed_before = total_failed; \
+    test_##name(); \
+    if (total_failed == failed_before) { \
+        total_passed++; \
+    } \
+} while(0)
+
+/* Plain literal patterns: exact text, substring search and a near miss. */
+TEST(basic_literals) {
+    loreg_error_t err;
+    loreg_regex_t *re = loreg_compile("hello", &err);
+    ASSERT(re != NULL, "compile hello");
+
+    loreg_match_t m;
+    ASSERT(loreg_search(re, "hello", &m), "match hello");
+    ASSERT(loreg_search(re, "say hello world", &m), "search hello");
+    ASSERT(!loreg_search(re, "helo", &m), "no match helo");
+
+    loreg_free(re);
+}
+
+/* The '.' wildcard and the '^' / '$' anchors. */
+TEST(metacharacters) {
+    loreg_error_t err;
+    loreg_match_t m;
+
+    loreg_regex_t *re = loreg_compile("a.c", &err);
+    ASSERT(re != NULL, "compile a.c");
+    ASSERT(loreg_search(re, "abc", &m), "match abc");
+    ASSERT(loreg_search(re, "axc", &m), "match axc");
+    ASSERT(!loreg_search(re, "ac", &m), "no match ac");
+    loreg_free(re);
+
+    re = loreg_compile("^start", &err);
+    ASSERT(re != NULL, "compile ^start");
+    ASSERT(loreg_search(re, "start here", &m), "match start here");
+    ASSERT(!loreg_search(re, "not start", &m), "no match not start");
+    loreg_free(re);
+
+    re = loreg_compile("end$", &err);
+    ASSERT(re != NULL, "compile end$");
+    ASSERT(loreg_search(re, "the end", &m), "match the end");
+    ASSERT(!loreg_search(re, "end here", &m), "no match end here");
+    loreg_free(re);
+}
+
+/* The *, +, ? and {n} / {n,m} quantifiers. */
+TEST(quantifiers) {
+    loreg_error_t err;
+    loreg_match_t m;
+
+    loreg_regex_t *re = loreg_compile("ab*c", &err);
+    ASSERT(re != NULL, "compile ab*c");
+    ASSERT(loreg_search(re, "ac", &m), "match ac");
+    ASSERT(loreg_search(re, "abc", &m), "match abc");
+    ASSERT(loreg_search(re, "abbbbc", &m), "match
abbbbc"); + loreg_free(re); + + re = loreg_compile("ab+c", &err); + ASSERT(re != NULL, "compile ab+c"); + ASSERT(!loreg_search(re, "ac", &m), "no match ac"); + ASSERT(loreg_search(re, "abc", &m), "match abc"); + ASSERT(loreg_search(re, "abbbbc", &m), "match abbbbc"); + loreg_free(re); + + re = loreg_compile("ab?c", &err); + ASSERT(re != NULL, "compile ab?c"); + ASSERT(loreg_search(re, "ac", &m), "match ac"); + ASSERT(loreg_search(re, "abc", &m), "match abc"); + ASSERT(!loreg_search(re, "abbc", &m), "no match abbc"); + loreg_free(re); + + re = loreg_compile("a{3}", &err); + ASSERT(re != NULL, "compile a{3}"); + ASSERT(loreg_search(re, "aaa", &m), "match aaa"); + ASSERT(!loreg_search(re, "aa", &m), "no match aa"); + loreg_free(re); + + re = loreg_compile("a{2,4}", &err); + ASSERT(re != NULL, "compile a{2,4}"); + ASSERT(loreg_search(re, "aa", &m), "match aa"); + ASSERT(loreg_search(re, "aaa", &m), "match aaa"); + ASSERT(loreg_search(re, "aaaa", &m), "match aaaa"); + ASSERT(!loreg_search(re, "a", &m), "no match a"); + loreg_free(re); +} + +TEST(character_classes) { + loreg_error_t err; + loreg_match_t m; + + loreg_regex_t *re = loreg_compile("[aeiou]", &err); + ASSERT(re != NULL, "compile [aeiou]"); + ASSERT(loreg_search(re, "a", &m), "match a"); + ASSERT(loreg_search(re, "test", &m), "match test"); + ASSERT(!loreg_search(re, "xyz", &m), "no match xyz"); + loreg_free(re); + + re = loreg_compile("[a-z]", &err); + ASSERT(re != NULL, "compile [a-z]"); + ASSERT(loreg_search(re, "m", &m), "match m"); + ASSERT(!loreg_search(re, "5", &m), "no match 5"); + loreg_free(re); + + re = loreg_compile("[^0-9]", &err); + ASSERT(re != NULL, "compile [^0-9]"); + ASSERT(loreg_search(re, "a", &m), "match a"); + ASSERT(!loreg_search(re, "5", &m), "no match 5"); + loreg_free(re); + + re = loreg_compile("\\d", &err); + ASSERT(re != NULL, "compile \\d"); + ASSERT(loreg_search(re, "5", &m), "match 5"); + ASSERT(!loreg_search(re, "a", &m), "no match a"); + loreg_free(re); + + re = 
loreg_compile("\\w+", &err); + ASSERT(re != NULL, "compile \\w+"); + ASSERT(loreg_search(re, "hello_123", &m), "match hello_123"); + loreg_free(re); + + re = loreg_compile("\\s", &err); + ASSERT(re != NULL, "compile \\s"); + ASSERT(loreg_search(re, " ", &m), "match space"); + ASSERT(loreg_search(re, "\t", &m), "match tab"); + ASSERT(!loreg_search(re, "a", &m), "no match a"); + loreg_free(re); +} + +TEST(groups) { + loreg_error_t err; + loreg_match_t m; + + loreg_regex_t *re = loreg_compile("(ab)+", &err); + ASSERT(re != NULL, "compile (ab)+"); + ASSERT(loreg_search(re, "ab", &m), "match ab"); + ASSERT(loreg_search(re, "abab", &m), "match abab"); + ASSERT(!loreg_search(re, "a", &m), "no match a"); + loreg_free(re); + + re = loreg_compile("(\\d+)-(\\d+)", &err); + ASSERT(re != NULL, "compile groups"); + ASSERT(loreg_search(re, "123-456", &m), "match 123-456"); + ASSERT(m.group_count == 2, "2 groups"); + ASSERT(m.groups[0].matched, "group 0 matched"); + ASSERT(m.groups[1].matched, "group 1 matched"); + loreg_free(re); +} + +TEST(alternation) { + loreg_error_t err; + loreg_match_t m; + + loreg_regex_t *re = loreg_compile("cat|dog", &err); + ASSERT(re != NULL, "compile cat|dog"); + ASSERT(loreg_search(re, "cat", &m), "match cat"); + ASSERT(loreg_search(re, "dog", &m), "match dog"); + ASSERT(!loreg_search(re, "rat", &m), "no match rat"); + loreg_free(re); + + re = loreg_compile("(red|blue) car", &err); + ASSERT(re != NULL, "compile (red|blue) car"); + ASSERT(loreg_search(re, "red car", &m), "match red car"); + ASSERT(loreg_search(re, "blue car", &m), "match blue car"); + ASSERT(!loreg_search(re, "green car", &m), "no match green car"); + loreg_free(re); +} + +TEST(escapes) { + loreg_error_t err; + loreg_match_t m; + + loreg_regex_t *re = loreg_compile("1\\.5", &err); + ASSERT(re != NULL, "compile 1\\.5"); + ASSERT(loreg_search(re, "1.5", &m), "match 1.5"); + ASSERT(!loreg_search(re, "1x5", &m), "no match 1x5"); + loreg_free(re); + + re = loreg_compile("\\(test\\)", 
&err); + ASSERT(re != NULL, "compile \\(test\\)"); + ASSERT(loreg_search(re, "(test)", &m), "match (test)"); + loreg_free(re); +} + +TEST(real_patterns) { + loreg_error_t err; + loreg_match_t m; + + loreg_regex_t *re = loreg_compile("[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}", &err); + ASSERT(re != NULL, "compile email"); + ASSERT(loreg_search(re, "user@example.com", &m), "match email"); + ASSERT(!loreg_search(re, "invalid", &m), "no match invalid"); + loreg_free(re); + + re = loreg_compile("\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}", &err); + ASSERT(re != NULL, "compile ip"); + ASSERT(loreg_search(re, "192.168.1.1", &m), "match ip"); + loreg_free(re); + + re = loreg_compile("https?://[a-zA-Z0-9.-]+(/[a-zA-Z0-9./-]*)?", &err); + ASSERT(re != NULL, "compile url"); + ASSERT(loreg_search(re, "http://example.com", &m), "match http"); + ASSERT(loreg_search(re, "https://example.com/path", &m), "match https"); + loreg_free(re); +} + +TEST(error_handling) { + loreg_error_t err; + + loreg_regex_t *re = loreg_compile("(abc", &err); + ASSERT(re == NULL, "unbalanced paren"); + ASSERT(err == LOREG_ERR_UNBALANCED_PAREN, "correct error"); +} + +int main(void) { + printf("loreg comprehensive tests\n"); + printf("========================\n\n"); + + clock_t start = clock(); + + RUN(basic_literals); + RUN(metacharacters); + RUN(quantifiers); + RUN(character_classes); + RUN(groups); + RUN(alternation); + RUN(escapes); + RUN(real_patterns); + RUN(error_handling); + + clock_t end = clock(); + double elapsed = (double)(end - start) / CLOCKS_PER_SEC; + + printf("\n========================\n"); + printf("passed: %d, failed: %d\n", total_passed, total_failed); + printf("time: %.3f seconds\n", elapsed); + + return total_failed > 0 ? 
1 : 0; +} diff --git a/tests/test_integration.c b/tests/test_integration.c new file mode 100644 index 0000000..4134b34 --- /dev/null +++ b/tests/test_integration.c @@ -0,0 +1,650 @@ +/* retoor */ +#include "../include/loreg.h" +#include +#include + +static int passed = 0; +static int failed = 0; + +#define MATCH(pat, txt) test_match(pat, txt, 1, __LINE__) +#define NO_MATCH(pat, txt) test_match(pat, txt, 0, __LINE__) + +static void test_match(const char *pattern, const char *text, int expect, int line) { + loreg_error_t err; + loreg_regex_t *re = loreg_compile(pattern, &err); + if (!re) { + printf("FAIL line %d: compile error for '%s': %s\n", line, pattern, loreg_error_string(err)); + failed++; + return; + } + loreg_match_t m; + int result = loreg_search(re, text, &m) ? 1 : 0; + if (result != expect) { + printf("FAIL line %d: '%s' vs '%s' expected %s\n", line, pattern, text, expect ? "match" : "no match"); + failed++; + } else { + passed++; + } + loreg_free(re); +} + +static void test_literals(void) { + printf(" literals...\n"); + MATCH("a", "a"); + MATCH("a", "ba"); + MATCH("a", "ab"); + MATCH("abc", "abc"); + MATCH("abc", "xabcy"); + MATCH("hello", "hello world"); + MATCH("world", "hello world"); + MATCH("lo wo", "hello world"); + NO_MATCH("abc", "ab"); + NO_MATCH("abc", "abd"); + NO_MATCH("xyz", "abc"); + NO_MATCH("hello", "helo"); + MATCH("", "anything"); + MATCH("", ""); + MATCH("a", "aaa"); + MATCH("aa", "aaa"); + MATCH("aaa", "aaa"); + NO_MATCH("aaaa", "aaa"); +} + +static void test_dot(void) { + printf(" dot metacharacter...\n"); + MATCH(".", "a"); + MATCH(".", "x"); + MATCH(".", "5"); + MATCH(".", " "); + MATCH("..", "ab"); + MATCH("...", "abc"); + MATCH("a.c", "abc"); + MATCH("a.c", "aXc"); + MATCH("a.c", "a9c"); + MATCH("a.c", "a c"); + NO_MATCH("a.c", "ac"); + NO_MATCH("a.c", "abbc"); + MATCH("....", "test"); + MATCH(".", "!"); + MATCH(".", "@"); + MATCH("a..b", "aXYb"); + MATCH("a...b", "a123b"); + NO_MATCH("a..b", "aXb"); +} + +static void 
test_anchors(void) { + printf(" anchors...\n"); + MATCH("^a", "a"); + MATCH("^a", "abc"); + NO_MATCH("^a", "ba"); + NO_MATCH("^a", " a"); + MATCH("a$", "a"); + MATCH("a$", "ba"); + NO_MATCH("a$", "ab"); + NO_MATCH("a$", "a "); + MATCH("^abc$", "abc"); + NO_MATCH("^abc$", "xabc"); + NO_MATCH("^abc$", "abcx"); + NO_MATCH("^abc$", " abc"); + NO_MATCH("^abc$", "abc "); + MATCH("^$", ""); + NO_MATCH("^$", "a"); + MATCH("^hello$", "hello"); + MATCH("^hello world$", "hello world"); + NO_MATCH("^hello world$", "hello world!"); + MATCH("^a.*z$", "abcdefghijklmnopqrstuvwxyz"); + MATCH("^.", "x"); + MATCH(".$", "x"); +} + +static void test_star(void) { + printf(" star quantifier...\n"); + MATCH("a*", ""); + MATCH("a*", "a"); + MATCH("a*", "aa"); + MATCH("a*", "aaa"); + MATCH("a*", "aaaaaaaaaa"); + MATCH("a*", "b"); + MATCH("a*b", "b"); + MATCH("a*b", "ab"); + MATCH("a*b", "aab"); + MATCH("a*b", "aaaaaab"); + NO_MATCH("a*b", "a"); + MATCH("ba*", "b"); + MATCH("ba*", "ba"); + MATCH("ba*", "baaa"); + MATCH(".*", ""); + MATCH(".*", "anything at all"); + MATCH("a.*b", "ab"); + MATCH("a.*b", "aXb"); + MATCH("a.*b", "aXXXXXb"); + MATCH("a.*b", "a b"); + MATCH("x*y*z*", ""); + MATCH("x*y*z*", "xyz"); + MATCH("x*y*z*", "xxxyyyzzz"); + MATCH("ab*c", "ac"); + MATCH("ab*c", "abc"); + MATCH("ab*c", "abbbbc"); +} + +static void test_plus(void) { + printf(" plus quantifier...\n"); + NO_MATCH("a+", ""); + MATCH("a+", "a"); + MATCH("a+", "aa"); + MATCH("a+", "aaa"); + MATCH("a+", "aaaaaaaaaa"); + MATCH("a+", "ba"); + MATCH("a+b", "ab"); + MATCH("a+b", "aab"); + MATCH("a+b", "aaaaaab"); + NO_MATCH("a+b", "b"); + NO_MATCH("a+b", "a"); + MATCH("ba+", "ba"); + MATCH("ba+", "baaa"); + NO_MATCH("ba+", "b"); + MATCH(".+", "a"); + MATCH(".+", "anything"); + NO_MATCH(".+", ""); + MATCH("a.+b", "aXb"); + MATCH("a.+b", "aXXXXXb"); + NO_MATCH("a.+b", "ab"); + MATCH("ab+c", "abc"); + MATCH("ab+c", "abbbbc"); + NO_MATCH("ab+c", "ac"); +} + +static void test_question(void) { + printf(" question 
quantifier...\n"); + MATCH("a?", ""); + MATCH("a?", "a"); + MATCH("a?", "aa"); + MATCH("a?b", "b"); + MATCH("a?b", "ab"); + MATCH("a?b", "aab"); + MATCH("colou?r", "color"); + MATCH("colou?r", "colour"); + NO_MATCH("colou?r", "colouur"); + MATCH("ab?c", "ac"); + MATCH("ab?c", "abc"); + NO_MATCH("ab?c", "abbc"); + MATCH("https?://", "http://"); + MATCH("https?://", "https://"); + MATCH(".?", ""); + MATCH(".?", "x"); +} + +static void test_alternation(void) { + printf(" alternation...\n"); + MATCH("a|b", "a"); + MATCH("a|b", "b"); + NO_MATCH("a|b", "c"); + MATCH("cat|dog", "cat"); + MATCH("cat|dog", "dog"); + NO_MATCH("cat|dog", "rat"); + MATCH("cat|dog", "my cat"); + MATCH("cat|dog", "my dog"); + MATCH("a|b|c", "a"); + MATCH("a|b|c", "b"); + MATCH("a|b|c", "c"); + NO_MATCH("a|b|c", "d"); + MATCH("ab|cd", "ab"); + MATCH("ab|cd", "cd"); + NO_MATCH("ab|cd", "ac"); + MATCH("abc|def|ghi", "abc"); + MATCH("abc|def|ghi", "def"); + MATCH("abc|def|ghi", "ghi"); + MATCH("a|ab|abc", "abc"); + MATCH("abc|ab|a", "abc"); + MATCH("red|green|blue", "the red car"); + MATCH("red|green|blue", "green light"); + MATCH("red|green|blue", "blue sky"); +} + +static void test_groups(void) { + printf(" groups...\n"); + MATCH("(a)", "a"); + MATCH("(ab)", "ab"); + MATCH("(abc)", "abc"); + MATCH("(a)(b)", "ab"); + MATCH("(a)(b)(c)", "abc"); + MATCH("(ab)+", "ab"); + MATCH("(ab)+", "abab"); + MATCH("(ab)+", "ababab"); + NO_MATCH("(ab)+", "a"); + NO_MATCH("(ab)+", "ba"); + MATCH("(ab)*", ""); + MATCH("(ab)*", "ab"); + MATCH("(ab)*", "abab"); + MATCH("(ab)?", ""); + MATCH("(ab)?", "ab"); + MATCH("(a|b)+", "a"); + MATCH("(a|b)+", "b"); + MATCH("(a|b)+", "ab"); + MATCH("(a|b)+", "ba"); + MATCH("(a|b)+", "aabb"); + MATCH("(a|b)+", "abba"); + MATCH("((a))", "a"); + MATCH("((ab))", "ab"); + MATCH("(a(b)c)", "abc"); + MATCH("(a(b(c)))", "abc"); + MATCH("((a)(b))", "ab"); + MATCH("(red|blue) car", "red car"); + MATCH("(red|blue) car", "blue car"); + NO_MATCH("(red|blue) car", "green car"); +} + +static 
void test_bracket_simple(void) { + printf(" bracket expressions (simple)...\n"); + MATCH("[a]", "a"); + NO_MATCH("[a]", "b"); + MATCH("[ab]", "a"); + MATCH("[ab]", "b"); + NO_MATCH("[ab]", "c"); + MATCH("[abc]", "a"); + MATCH("[abc]", "b"); + MATCH("[abc]", "c"); + NO_MATCH("[abc]", "d"); + MATCH("[aeiou]", "a"); + MATCH("[aeiou]", "e"); + MATCH("[aeiou]", "i"); + MATCH("[aeiou]", "o"); + MATCH("[aeiou]", "u"); + NO_MATCH("[aeiou]", "b"); + MATCH("[abc]+", "aaa"); + MATCH("[abc]+", "abc"); + MATCH("[abc]+", "cba"); + MATCH("[abc]+", "abcabc"); + MATCH("[xyz]*", ""); + MATCH("[xyz]*", "xyz"); +} + +static void test_bracket_ranges(void) { + printf(" bracket expressions (ranges)...\n"); + MATCH("[a-z]", "a"); + MATCH("[a-z]", "m"); + MATCH("[a-z]", "z"); + NO_MATCH("[a-z]", "A"); + NO_MATCH("[a-z]", "0"); + MATCH("[A-Z]", "A"); + MATCH("[A-Z]", "M"); + MATCH("[A-Z]", "Z"); + NO_MATCH("[A-Z]", "a"); + MATCH("[0-9]", "0"); + MATCH("[0-9]", "5"); + MATCH("[0-9]", "9"); + NO_MATCH("[0-9]", "a"); + MATCH("[a-zA-Z]", "a"); + MATCH("[a-zA-Z]", "Z"); + NO_MATCH("[a-zA-Z]", "5"); + MATCH("[a-zA-Z0-9]", "a"); + MATCH("[a-zA-Z0-9]", "Z"); + MATCH("[a-zA-Z0-9]", "5"); + NO_MATCH("[a-zA-Z0-9]", "!"); + MATCH("[a-z]+", "hello"); + MATCH("[A-Z]+", "HELLO"); + MATCH("[0-9]+", "12345"); + MATCH("[a-z0-9]+", "abc123"); +} + +static void test_bracket_negated(void) { + printf(" bracket expressions (negated)...\n"); + NO_MATCH("[^a]", "a"); + MATCH("[^a]", "b"); + MATCH("[^a]", "x"); + NO_MATCH("[^abc]", "a"); + NO_MATCH("[^abc]", "b"); + NO_MATCH("[^abc]", "c"); + MATCH("[^abc]", "d"); + MATCH("[^abc]", "x"); + NO_MATCH("[^a-z]", "a"); + NO_MATCH("[^a-z]", "m"); + NO_MATCH("[^a-z]", "z"); + MATCH("[^a-z]", "A"); + MATCH("[^a-z]", "5"); + MATCH("[^a-z]", "!"); + NO_MATCH("[^0-9]", "5"); + MATCH("[^0-9]", "a"); + MATCH("[^0-9]+", "hello"); + NO_MATCH("[^aeiou]+", "aaa"); + MATCH("[^aeiou]+", "xyz"); +} + +static void test_character_classes(void) { + printf(" character classes...\n"); + 
MATCH("\\d", "0"); + MATCH("\\d", "5"); + MATCH("\\d", "9"); + NO_MATCH("\\d", "a"); + NO_MATCH("\\d", " "); + MATCH("\\d+", "123"); + MATCH("\\d+", "0"); + MATCH("\\d+", "9876543210"); + NO_MATCH("\\d+", ""); + NO_MATCH("\\d+", "abc"); + MATCH("\\D", "a"); + MATCH("\\D", " "); + MATCH("\\D", "!"); + NO_MATCH("\\D", "5"); + MATCH("\\w", "a"); + MATCH("\\w", "Z"); + MATCH("\\w", "0"); + MATCH("\\w", "_"); + NO_MATCH("\\w", " "); + NO_MATCH("\\w", "!"); + MATCH("\\w+", "hello"); + MATCH("\\w+", "Hello123"); + MATCH("\\w+", "var_name"); + MATCH("\\W", " "); + MATCH("\\W", "!"); + MATCH("\\W", "@"); + NO_MATCH("\\W", "a"); + NO_MATCH("\\W", "_"); + MATCH("\\s", " "); + MATCH("\\s", "\t"); + MATCH("\\s", "\n"); + NO_MATCH("\\s", "a"); + NO_MATCH("\\s", "5"); + MATCH("\\s+", " "); + MATCH("\\s+", " \t\n"); + MATCH("\\S", "a"); + MATCH("\\S", "5"); + MATCH("\\S", "!"); + NO_MATCH("\\S", " "); + NO_MATCH("\\S", "\t"); +} + +static void test_quantifier_braces(void) { + printf(" brace quantifiers...\n"); + MATCH("a{3}", "aaa"); + MATCH("a{3}", "aaaa"); + NO_MATCH("a{3}", "aa"); + MATCH("a{1}", "a"); + MATCH("a{1}", "aa"); + NO_MATCH("a{1}", ""); + MATCH("a{0}", ""); + MATCH("a{0}", "b"); + MATCH("a{2,4}", "aa"); + MATCH("a{2,4}", "aaa"); + MATCH("a{2,4}", "aaaa"); + MATCH("a{2,4}", "aaaaa"); + NO_MATCH("a{2,4}", "a"); + MATCH("a{2,}", "aa"); + MATCH("a{2,}", "aaa"); + MATCH("a{2,}", "aaaaaaaaaa"); + NO_MATCH("a{2,}", "a"); + MATCH("a{0,2}", ""); + MATCH("a{0,2}", "a"); + MATCH("a{0,2}", "aa"); + MATCH("a{0,2}", "aaa"); + MATCH("[0-9]{3}", "123"); + MATCH("[0-9]{3}", "000"); + NO_MATCH("[0-9]{3}", "12"); + MATCH("(ab){2}", "abab"); + MATCH("(ab){2}", "ababab"); + NO_MATCH("(ab){2}", "ab"); +} + +static void test_escape_sequences(void) { + printf(" escape sequences...\n"); + MATCH("\\.", "."); + NO_MATCH("\\.", "a"); + MATCH("\\*", "*"); + NO_MATCH("\\*", "a"); + MATCH("\\+", "+"); + MATCH("\\?", "?"); + MATCH("\\|", "|"); + MATCH("\\(", "("); + MATCH("\\)", ")"); + 
MATCH("\\[", "["); + MATCH("\\]", "]"); + MATCH("\\{", "{"); + MATCH("\\}", "}"); + MATCH("\\^", "^"); + MATCH("\\$", "$"); + MATCH("\\\\", "\\"); + MATCH("a\\.b", "a.b"); + NO_MATCH("a\\.b", "aXb"); + MATCH("\\d\\.\\d", "1.5"); + MATCH("c\\+\\+", "c++"); + MATCH("\\(test\\)", "(test)"); + MATCH("\\[0\\]", "[0]"); +} + +static void test_complex_patterns(void) { + printf(" complex patterns...\n"); + MATCH("[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}", "user@example.com"); + MATCH("[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}", "test.user@mail.example.org"); + NO_MATCH("[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}", "invalid"); + NO_MATCH("[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}", "@example.com"); + MATCH("\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}", "192.168.1.1"); + MATCH("\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}", "10.0.0.1"); + MATCH("\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}", "255.255.255.255"); + NO_MATCH("\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}", "1.2.3"); + MATCH("https?://[a-zA-Z0-9.-]+(/[a-zA-Z0-9./-]*)?", "http://example.com"); + MATCH("https?://[a-zA-Z0-9.-]+(/[a-zA-Z0-9./-]*)?", "https://example.com"); + MATCH("https?://[a-zA-Z0-9.-]+(/[a-zA-Z0-9./-]*)?", "http://example.com/path"); + MATCH("https?://[a-zA-Z0-9.-]+(/[a-zA-Z0-9./-]*)?", "https://example.com/path/to/page"); + MATCH("\\d{3}-\\d{3}-\\d{4}", "123-456-7890"); + MATCH("\\d{3}-\\d{3}-\\d{4}", "555-123-4567"); + NO_MATCH("\\d{3}-\\d{3}-\\d{4}", "12-345-6789"); + NO_MATCH("\\d{3}-\\d{3}-\\d{4}", "1234567890"); + MATCH("\\(\\d{3}\\) \\d{3}-\\d{4}", "(123) 456-7890"); + MATCH("[A-Z]{2}\\d{6}", "AB123456"); + NO_MATCH("[A-Z]{2}\\d{6}", "A1234567"); + MATCH("\\d{4}-\\d{2}-\\d{2}", "2024-01-15"); + MATCH("\\d{2}/\\d{2}/\\d{4}", "01/15/2024"); + MATCH("\\d{1,2}:\\d{2}(:\\d{2})?", "12:30"); + MATCH("\\d{1,2}:\\d{2}(:\\d{2})?", "12:30:45"); + MATCH("\\d{1,2}:\\d{2}(:\\d{2})?", "9:05"); +} + +static void test_word_boundaries(void) { + printf(" word patterns...\n"); + MATCH("\\w+", 
"hello"); + MATCH("\\w+", "hello123"); + MATCH("\\w+", "test_var"); + MATCH("[a-zA-Z_][a-zA-Z0-9_]*", "variable"); + MATCH("[a-zA-Z_][a-zA-Z0-9_]*", "_private"); + MATCH("[a-zA-Z_][a-zA-Z0-9_]*", "var123"); + NO_MATCH("^[a-zA-Z_][a-zA-Z0-9_]*$", "123var"); + MATCH("\\w+\\s+\\w+", "hello world"); + MATCH("\\w+\\s+\\w+", "foo bar"); + NO_MATCH("\\w+\\s+\\w+", "hello"); +} + +static void test_greedy_vs_nongreedy(void) { + printf(" greedy vs non-greedy...\n"); + MATCH("a+", "aaa"); + MATCH("a+?", "aaa"); + MATCH("a*", "aaa"); + MATCH("a*?", "aaa"); + MATCH("a?", "a"); + MATCH("a??", "a"); + MATCH("a{2,4}", "aaaa"); + MATCH("a{2,4}?", "aaaa"); + MATCH(".*x", "abcx"); + MATCH(".*?x", "abcx"); +} + +static void test_empty_and_edge_cases(void) { + printf(" empty and edge cases...\n"); + MATCH("", ""); + MATCH("", "abc"); + MATCH("a*", ""); + MATCH("a?", ""); + MATCH("(a*)*", ""); + MATCH("(a*)+", ""); + MATCH("(a+)*", ""); + MATCH("(a|b)*", ""); + MATCH("[a-z]*", ""); + NO_MATCH("a+", ""); + NO_MATCH(".+", ""); + NO_MATCH("[a-z]+", ""); + MATCH("^", ""); + MATCH("$", ""); + MATCH("^$", ""); + NO_MATCH("^$", "a"); + MATCH("a*b*c*", ""); + MATCH("a*b*c*", "abc"); + MATCH("a*b*c*", "aabbcc"); + MATCH("a*b*c*", "c"); + MATCH("a*b*c*", "b"); +} + +static void test_special_characters_in_text(void) { + printf(" special characters in text...\n"); + MATCH("a", "a\nb"); + MATCH("b", "a\nb"); + MATCH("a.b", "a\tb"); + NO_MATCH("a.b", "a\nb"); + MATCH("\\.", "3.14"); + MATCH("\\+", "1+2"); + MATCH("\\*", "2*3"); + MATCH("\\?", "why?"); + MATCH("\\(\\)", "func()"); + MATCH("\\[\\]", "array[]"); + MATCH("\\{\\}", "object{}"); + MATCH("\\^", "x^2"); + MATCH("\\$", "$100"); + MATCH("\\|", "a|b"); +} + +static void test_repetition_combinations(void) { + printf(" repetition combinations...\n"); + MATCH("a+b+", "ab"); + MATCH("a+b+", "aabb"); + MATCH("a+b+", "aaabbb"); + NO_MATCH("a+b+", "a"); + NO_MATCH("a+b+", "b"); + MATCH("a*b+", "b"); + MATCH("a*b+", "ab"); + MATCH("a*b+", "aab"); + 
MATCH("a+b*", "a"); + MATCH("a+b*", "ab"); + MATCH("a+b*", "abb"); + MATCH("a*b*", ""); + MATCH("a*b*", "a"); + MATCH("a*b*", "b"); + MATCH("a*b*", "ab"); + MATCH("(ab)+c+", "abc"); + MATCH("(ab)+c+", "ababcc"); + MATCH("(a+b)+", "ab"); + MATCH("(a+b)+", "aabaaab"); + MATCH("((a+)+)+", "a"); + MATCH("((a+)+)+", "aaa"); +} + +static void test_alternation_combinations(void) { + printf(" alternation combinations...\n"); + MATCH("a|b|c|d|e", "a"); + MATCH("a|b|c|d|e", "e"); + NO_MATCH("a|b|c|d|e", "f"); + MATCH("(a|b)(c|d)", "ac"); + MATCH("(a|b)(c|d)", "ad"); + MATCH("(a|b)(c|d)", "bc"); + MATCH("(a|b)(c|d)", "bd"); + NO_MATCH("(a|b)(c|d)", "ab"); + MATCH("(cat|dog)s?", "cat"); + MATCH("(cat|dog)s?", "cats"); + MATCH("(cat|dog)s?", "dog"); + MATCH("(cat|dog)s?", "dogs"); + MATCH("(red|green|blue)\\s+(car|truck)", "red car"); + MATCH("(red|green|blue)\\s+(car|truck)", "green truck"); + MATCH("(a|aa|aaa)", "aaa"); + MATCH("(aaa|aa|a)", "aaa"); +} + +static void test_nested_groups(void) { + printf(" nested groups...\n"); + MATCH("((a))", "a"); + MATCH("(((a)))", "a"); + MATCH("((a)(b))", "ab"); + MATCH("((a(b))c)", "abc"); + MATCH("(a(b(c)))", "abc"); + MATCH("((a|b)(c|d))", "ac"); + MATCH("(a(b|c)d)", "abd"); + MATCH("(a(b|c)d)", "acd"); + MATCH("((ab)+)", "abab"); + MATCH("(a(bc)*d)", "ad"); + MATCH("(a(bc)*d)", "abcd"); + MATCH("(a(bc)*d)", "abcbcd"); + MATCH("((a+)(b+))", "aabb"); + MATCH("(((a|b)+)c)", "ababc"); +} + +static void test_real_world_patterns(void) { + printf(" real world patterns...\n"); + MATCH("[a-zA-Z]+", "Hello"); + MATCH("[a-zA-Z]+", "WORLD"); + MATCH("[a-zA-Z]+", "test"); + MATCH("-?\\d+", "123"); + MATCH("-?\\d+", "-456"); + MATCH("-?\\d+", "0"); + MATCH("-?\\d+\\.?\\d*", "3.14"); + MATCH("-?\\d+\\.?\\d*", "-2.5"); + MATCH("-?\\d+\\.?\\d*", "42"); + MATCH("[a-fA-F0-9]+", "deadbeef"); + MATCH("[a-fA-F0-9]+", "CAFEBABE"); + MATCH("[a-fA-F0-9]+", "123abc"); + MATCH("[01]+", "101010"); + MATCH("[01]+", "11110000"); + MATCH("[A-Z][a-z]+", "Hello"); + 
MATCH("[A-Z][a-z]+", "World"); + NO_MATCH("[A-Z][a-z]+", "hello"); + MATCH("\"[^\"]*\"", "\"hello\""); + MATCH("\"[^\"]*\"", "\"hello world\""); + MATCH("\"[^\"]*\"", "\"\""); + MATCH("'[^']*'", "'test'"); + MATCH("#[a-fA-F0-9]{6}", "#ff0000"); + MATCH("#[a-fA-F0-9]{6}", "#00FF00"); + MATCH("#[a-fA-F0-9]{3}", "#f00"); +} + +static void test_pathological_patterns(void) { + printf(" stress test patterns...\n"); + MATCH("a?a?a?aaa", "aaa"); + MATCH("(a+)+", "aaaa"); + MATCH("(a*)*", "aaaa"); + MATCH("(a|a)+", "aaaa"); + MATCH("((a*)*)*", "aaaa"); + MATCH("a*a*a*a*a*b", "aaaaab"); + MATCH(".*.*.*.*.*", "test"); + MATCH("(a?){5}a{5}", "aaaaa"); +} + +int main(void) { + printf("loreg integration tests\n"); + printf("=======================\n\n"); + + test_literals(); + test_dot(); + test_anchors(); + test_star(); + test_plus(); + test_question(); + test_alternation(); + test_groups(); + test_bracket_simple(); + test_bracket_ranges(); + test_bracket_negated(); + test_character_classes(); + test_quantifier_braces(); + test_escape_sequences(); + test_complex_patterns(); + test_word_boundaries(); + test_greedy_vs_nongreedy(); + test_empty_and_edge_cases(); + test_special_characters_in_text(); + test_repetition_combinations(); + test_alternation_combinations(); + test_nested_groups(); + test_real_world_patterns(); + test_pathological_patterns(); + + printf("\n=======================\n"); + printf("integration: %d passed, %d failed\n", passed, failed); + printf("total tests: %d\n", passed + failed); + + return failed > 0 ? 1 : 0; +} diff --git a/tests/test_lexer.c b/tests/test_lexer.c new file mode 100644 index 0000000..87e59c2 --- /dev/null +++ b/tests/test_lexer.c @@ -0,0 +1,195 @@ +/* retoor */ +#include "../include/lexer.h" +#include +#include +#include + +static int tests_passed = 0; +static int tests_failed = 0; + +#define TEST(name) static void test_##name(void) +#define RUN_TEST(name) do { \ + printf(" %s... 
", #name); \ + test_##name(); \ + printf("ok\n"); \ + tests_passed++; \ +} while(0) + +#define ASSERT(cond) do { \ + if (!(cond)) { \ + printf("FAILED at line %d: %s\n", __LINE__, #cond); \ + tests_failed++; \ + return; \ + } \ +} while(0) + +TEST(simple_chars) { + lexer_t lexer; + lexer_init(&lexer, "abc"); + + token_t t = lexer_next(&lexer); + ASSERT(t.type == TOKEN_CHAR && t.value == 'a'); + + t = lexer_next(&lexer); + ASSERT(t.type == TOKEN_CHAR && t.value == 'b'); + + t = lexer_next(&lexer); + ASSERT(t.type == TOKEN_CHAR && t.value == 'c'); + + t = lexer_next(&lexer); + ASSERT(t.type == TOKEN_EOF); +} + +TEST(meta_chars) { + lexer_t lexer; + lexer_init(&lexer, ".*+?|()^$"); + + ASSERT(lexer_next(&lexer).type == TOKEN_DOT); + ASSERT(lexer_next(&lexer).type == TOKEN_STAR); + ASSERT(lexer_next(&lexer).type == TOKEN_PLUS); + ASSERT(lexer_next(&lexer).type == TOKEN_QUESTION); + ASSERT(lexer_next(&lexer).type == TOKEN_PIPE); + ASSERT(lexer_next(&lexer).type == TOKEN_LPAREN); + ASSERT(lexer_next(&lexer).type == TOKEN_RPAREN); + ASSERT(lexer_next(&lexer).type == TOKEN_CARET); + ASSERT(lexer_next(&lexer).type == TOKEN_DOLLAR); + ASSERT(lexer_next(&lexer).type == TOKEN_EOF); +} + +TEST(escaped_chars) { + lexer_t lexer; + lexer_init(&lexer, "\\*\\+\\."); + + token_t t = lexer_next(&lexer); + ASSERT(t.type == TOKEN_CHAR && t.value == '*'); + + t = lexer_next(&lexer); + ASSERT(t.type == TOKEN_CHAR && t.value == '+'); + + t = lexer_next(&lexer); + ASSERT(t.type == TOKEN_CHAR && t.value == '.'); +} + +TEST(character_classes) { + lexer_t lexer; + lexer_init(&lexer, "\\d\\w\\s\\D\\W\\S"); + + ASSERT(lexer_next(&lexer).type == TOKEN_CLASS_DIGIT); + ASSERT(lexer_next(&lexer).type == TOKEN_CLASS_WORD); + ASSERT(lexer_next(&lexer).type == TOKEN_CLASS_SPACE); + ASSERT(lexer_next(&lexer).type == TOKEN_CLASS_NDIGIT); + ASSERT(lexer_next(&lexer).type == TOKEN_CLASS_NWORD); + ASSERT(lexer_next(&lexer).type == TOKEN_CLASS_NSPACE); +} + +TEST(bracket_expression) { + lexer_t lexer; + 
lexer_init(&lexer, "[abc]"); + + ASSERT(lexer_next(&lexer).type == TOKEN_LBRACKET); + + token_t t = lexer_next(&lexer); + ASSERT(t.type == TOKEN_CHAR && t.value == 'a'); + + t = lexer_next(&lexer); + ASSERT(t.type == TOKEN_CHAR && t.value == 'b'); + + t = lexer_next(&lexer); + ASSERT(t.type == TOKEN_CHAR && t.value == 'c'); + + ASSERT(lexer_next(&lexer).type == TOKEN_RBRACKET); +} + +TEST(bracket_range) { + lexer_t lexer; + lexer_init(&lexer, "[a-z]"); + + ASSERT(lexer_next(&lexer).type == TOKEN_LBRACKET); + + token_t t = lexer_next(&lexer); + ASSERT(t.type == TOKEN_CHAR && t.value == 'a'); + + ASSERT(lexer_next(&lexer).type == TOKEN_DASH); + + t = lexer_next(&lexer); + ASSERT(t.type == TOKEN_CHAR && t.value == 'z'); + + ASSERT(lexer_next(&lexer).type == TOKEN_RBRACKET); +} + +TEST(negated_bracket) { + lexer_t lexer; + lexer_init(&lexer, "[^a]"); + + ASSERT(lexer_next(&lexer).type == TOKEN_LBRACKET); + ASSERT(lexer_next(&lexer).type == TOKEN_CARET); + + token_t t = lexer_next(&lexer); + ASSERT(t.type == TOKEN_CHAR && t.value == 'a'); + + ASSERT(lexer_next(&lexer).type == TOKEN_RBRACKET); +} + +TEST(quantifier_braces) { + lexer_t lexer; + lexer_init(&lexer, "a{3}"); + + token_t t = lexer_next(&lexer); + ASSERT(t.type == TOKEN_CHAR && t.value == 'a'); + + ASSERT(lexer_next(&lexer).type == TOKEN_LBRACE); + + t = lexer_next(&lexer); + ASSERT(t.type == TOKEN_CHAR && t.value == '3'); + + ASSERT(lexer_next(&lexer).type == TOKEN_RBRACE); +} + +TEST(peek) { + lexer_t lexer; + lexer_init(&lexer, "ab"); + + token_t t = lexer_peek(&lexer); + ASSERT(t.type == TOKEN_CHAR && t.value == 'a'); + + t = lexer_peek(&lexer); + ASSERT(t.type == TOKEN_CHAR && t.value == 'a'); + + t = lexer_next(&lexer); + ASSERT(t.type == TOKEN_CHAR && t.value == 'a'); + + t = lexer_peek(&lexer); + ASSERT(t.type == TOKEN_CHAR && t.value == 'b'); +} + +TEST(escape_sequences) { + lexer_t lexer; + lexer_init(&lexer, "\\n\\t\\r"); + + token_t t = lexer_next(&lexer); + ASSERT(t.type == TOKEN_CHAR && t.value 
== '\n'); + + t = lexer_next(&lexer); + ASSERT(t.type == TOKEN_CHAR && t.value == '\t'); + + t = lexer_next(&lexer); + ASSERT(t.type == TOKEN_CHAR && t.value == '\r'); +} +/* Runs every lexer test, prints the pass/fail totals, and exits with 1 if any test failed, 0 otherwise. */ +int main(void) { + printf("lexer tests:\n"); + + RUN_TEST(simple_chars); + RUN_TEST(meta_chars); + RUN_TEST(escaped_chars); + RUN_TEST(character_classes); + RUN_TEST(bracket_expression); + RUN_TEST(bracket_range); + RUN_TEST(negated_bracket); + RUN_TEST(quantifier_braces); + RUN_TEST(peek); + RUN_TEST(escape_sequences); + + printf("\nlexer: %d passed, %d failed\n", tests_passed, tests_failed); + return tests_failed > 0 ? 1 : 0; +} diff --git a/tests/test_matcher.c b/tests/test_matcher.c new file mode 100644 index 0000000..0e51333 --- /dev/null +++ b/tests/test_matcher.c @@ -0,0 +1,294 @@ +/* retoor */ +#include "../include/loreg.h" +#include +#include +/* NOTE(review): the two bare #include directives above appear to have lost their header names (angle-bracket text stripped); printf needs stdio.h - restore both from the original file. */ +static int tests_passed = 0; +static int tests_failed = 0; +/* Minimal test harness. TEST declares a static void test function. RUN_TEST prints the test name, runs the test, and prints ok and counts a pass only when the test added nothing to tests_failed. */ +#define TEST(name) static void test_##name(void) +#define RUN_TEST(name) do { \ + printf(" %s... ", #name); int failed_before_ = tests_failed; \ + test_##name(); \ + if (tests_failed == failed_before_) { printf("ok\n"); } \ + tests_passed += (tests_failed == failed_before_); \ +} while(0) +/* ASSERT: when cond is false, print the source line and the expression text, count one failure, and return from the enclosing test function. */ +#define ASSERT(cond) do { \ + if (!(cond)) { \ + printf("FAILED at line %d: %s\n", __LINE__, #cond); \ + tests_failed++; \ + return; \ + } \ +} while(0) +/* ASSERT_MATCH: compile pattern and require loreg_search to find it in text. The regex is freed before the result is asserted, so a failed expectation does not return with the regex still allocated. */ +#define ASSERT_MATCH(pattern, text) do { \ + loreg_error_t err; loreg_match_t result; \ + loreg_regex_t *re = loreg_compile(pattern, &err); \ + ASSERT(re != NULL); \ + int found_ = loreg_search(re, text, &result); \ + loreg_free(re); \ + ASSERT(found_ == true); \ +} while(0) +/* ASSERT_NO_MATCH: compile pattern and require loreg_search to find nothing in text. The regex is freed before the result is asserted, as in ASSERT_MATCH. */ +#define ASSERT_NO_MATCH(pattern, text) do { \ + loreg_error_t err; loreg_match_t result; \ + loreg_regex_t *re = loreg_compile(pattern, &err); \ + ASSERT(re != NULL); \ + int found_ = loreg_search(re, text, &result); \ + loreg_free(re); \ + ASSERT(found_ == false); \ +} while(0) +/* A single literal is found anywhere in the text and not found when absent. */ +TEST(simple_char) { + ASSERT_MATCH("a", "a"); + ASSERT_MATCH("a", "bab"); + ASSERT_NO_MATCH("a", "bcd"); +} +/* Concatenation is order-sensitive: "ab" is found in "xaby" but not in "ba". */ +TEST(concat) { + ASSERT_MATCH("ab", "ab"); + ASSERT_MATCH("ab", "xaby"); + ASSERT_NO_MATCH("ab", "ba"); +} +/* Alternation matches either branch and rejects text containing neither. */ +TEST(alternation) {
+ ASSERT_MATCH("a|b", "a"); + ASSERT_MATCH("a|b", "b"); + ASSERT_MATCH("cat|dog", "cat"); + ASSERT_MATCH("cat|dog", "dog"); + ASSERT_NO_MATCH("cat|dog", "rat"); +} +/* Star matches zero or more repetitions, including against empty text. */ +TEST(star) { + ASSERT_MATCH("a*", ""); + ASSERT_MATCH("a*", "a"); + ASSERT_MATCH("a*", "aaa"); + ASSERT_MATCH("a*b", "b"); + ASSERT_MATCH("a*b", "ab"); + ASSERT_MATCH("a*b", "aaab"); +} +/* Plus needs at least one repetition: "a+" does not match empty text. */ +TEST(plus) { + ASSERT_NO_MATCH("a+", ""); + ASSERT_MATCH("a+", "a"); + ASSERT_MATCH("a+", "aaa"); + ASSERT_MATCH("a+b", "ab"); + ASSERT_MATCH("a+b", "aaab"); +} +/* Question mark makes the preceding character optional. */ +TEST(question) { + ASSERT_MATCH("a?", ""); + ASSERT_MATCH("a?", "a"); + ASSERT_MATCH("a?b", "b"); + ASSERT_MATCH("a?b", "ab"); +} +/* Dot stands for exactly one character: "a.b" matches "axb" but not "ab". */ +TEST(dot) { + ASSERT_MATCH(".", "a"); + ASSERT_MATCH(".", "x"); + ASSERT_MATCH("a.b", "aab"); + ASSERT_MATCH("a.b", "axb"); + ASSERT_NO_MATCH("a.b", "ab"); +} +/* A bracket set matches each listed member and rejects a character outside the set. */ +TEST(bracket_simple) { + ASSERT_MATCH("[abc]", "a"); + ASSERT_MATCH("[abc]", "b"); + ASSERT_MATCH("[abc]", "c"); + ASSERT_NO_MATCH("[abc]", "d"); +} +/* The range a-z accepts lowercase letters and rejects "A" and "0". */ +TEST(bracket_range) { + ASSERT_MATCH("[a-z]", "a"); + ASSERT_MATCH("[a-z]", "m"); + ASSERT_MATCH("[a-z]", "z"); + ASSERT_NO_MATCH("[a-z]", "A"); + ASSERT_NO_MATCH("[a-z]", "0"); +} +/* A negated set rejects its members and accepts other characters. */ +TEST(bracket_negated) { + ASSERT_NO_MATCH("[^abc]", "a"); + ASSERT_NO_MATCH("[^abc]", "b"); + ASSERT_MATCH("[^abc]", "d"); + ASSERT_MATCH("[^abc]", "x"); +} +/* Groups match on their own and can be repeated as a unit. */ +TEST(group) { + ASSERT_MATCH("(ab)", "ab"); + ASSERT_MATCH("(ab)+", "abab"); + ASSERT_MATCH("(a|b)+", "abba"); +} +/* Caret pins the match to the start of the text and dollar to the end; with both, the whole text must match. */ +TEST(anchors) { + ASSERT_MATCH("^a", "a"); + ASSERT_MATCH("^a", "abc"); + ASSERT_NO_MATCH("^a", "ba"); + + ASSERT_MATCH("a$", "a"); + ASSERT_MATCH("a$", "ba"); + ASSERT_NO_MATCH("a$", "ab"); + + ASSERT_MATCH("^abc$", "abc"); + ASSERT_NO_MATCH("^abc$", "xabc"); + ASSERT_NO_MATCH("^abc$", "abcx"); +} +/* With loreg_search, "a{3}" is found in "aaa" and inside "aaaa", but not in "aa". */ +TEST(quantifier_exact) { + ASSERT_MATCH("a{3}", "aaa"); + ASSERT_MATCH("a{3}", "aaaa"); + ASSERT_NO_MATCH("a{3}", "aa"); +} +/* "a{2,4}" is found in runs of two, three and four repetitions and not in a single "a". */ +TEST(quantifier_range) { + ASSERT_MATCH("a{2,4}", "aa"); + ASSERT_MATCH("a{2,4}", "aaa"); + ASSERT_MATCH("a{2,4}", "aaaa"); +
ASSERT_NO_MATCH("a{2,4}", "a"); +} +/* "a{2,}" has no upper bound: two or more repetitions match, a single one does not. */ +TEST(quantifier_open) { + ASSERT_MATCH("a{2,}", "aa"); + ASSERT_MATCH("a{2,}", "aaaaa"); + ASSERT_NO_MATCH("a{2,}", "a"); +} +/* The digit class matches "0", "9" and a run of digits, and rejects a letter. */ +TEST(class_digit) { + ASSERT_MATCH("\\d", "0"); + ASSERT_MATCH("\\d", "9"); + ASSERT_MATCH("\\d+", "123"); + ASSERT_NO_MATCH("\\d", "a"); +} +/* The word class accepts letters, a digit and underscore; it rejects space and hyphen. */ +TEST(class_word) { + ASSERT_MATCH("\\w", "a"); + ASSERT_MATCH("\\w", "Z"); + ASSERT_MATCH("\\w", "0"); + ASSERT_MATCH("\\w", "_"); + ASSERT_NO_MATCH("\\w", " "); + ASSERT_NO_MATCH("\\w", "-"); +} +/* The space class accepts space, tab and newline and rejects a letter. */ +TEST(class_space) { + ASSERT_MATCH("\\s", " "); + ASSERT_MATCH("\\s", "\t"); + ASSERT_MATCH("\\s", "\n"); + ASSERT_NO_MATCH("\\s", "a"); +} +/* The uppercase classes D, W and S invert their lowercase counterparts. */ +TEST(class_negated) { + ASSERT_NO_MATCH("\\D", "0"); + ASSERT_MATCH("\\D", "a"); + ASSERT_NO_MATCH("\\W", "a"); + ASSERT_MATCH("\\W", " "); + ASSERT_NO_MATCH("\\S", " "); + ASSERT_MATCH("\\S", "a"); +} +/* Escaped metacharacters match the literal character; an escaped dot does not match "a". */ +TEST(escape_sequences) { + ASSERT_MATCH("\\.", "."); + ASSERT_NO_MATCH("\\.", "a"); + ASSERT_MATCH("\\*", "*"); + ASSERT_MATCH("\\+", "+"); + ASSERT_MATCH("\\?", "?"); +} +/* A simple lowercase e-mail shape is found in "test@example.com" and not in "invalid". */ +TEST(complex_email) { + ASSERT_MATCH("[a-z]+@[a-z]+\\.[a-z]+", "test@example.com"); + ASSERT_NO_MATCH("[a-z]+@[a-z]+\\.[a-z]+", "invalid"); +} +/* A 3-3-4 digit phone shape matches, and a final group of only three digits does not. */ +TEST(complex_phone) { + ASSERT_MATCH("\\d{3}-\\d{3}-\\d{4}", "123-456-7890"); + ASSERT_NO_MATCH("\\d{3}-\\d{3}-\\d{4}", "123-456-789"); +} +/* The optional "s" lets one pattern match both the http and the https form. */ +TEST(complex_url) { + ASSERT_MATCH("https?://[a-z]+\\.[a-z]+", "http://example.com"); + ASSERT_MATCH("https?://[a-z]+\\.[a-z]+", "https://example.com"); +} +/* Two capture groups: group_count is 2 and both groups report matched. NOTE(review): a failing ASSERT returns before loreg_free(re) is reached, so the regex is not released on that path. */ +TEST(group_capture) { + loreg_error_t err; + loreg_regex_t *re = loreg_compile("(\\d+)-(\\d+)", &err); + ASSERT(re != NULL); + + loreg_match_t result; + ASSERT(loreg_search(re, "123-456", &result)); + ASSERT(result.group_count == 2); + ASSERT(result.groups[0].matched); + ASSERT(result.groups[1].matched); + + loreg_free(re); +} +/* Nested parentheses: "((a)(b))" yields a group_count of 3. */ +TEST(nested_groups) { + loreg_error_t err; + loreg_regex_t *re = loreg_compile("((a)(b))", &err); + ASSERT(re != NULL); + + loreg_match_t result; + ASSERT(loreg_search(re, "ab", &result));
+ ASSERT(result.group_count == 3); + + loreg_free(re); +} +/* An empty pattern compiles, and loreg_match succeeds against "anything". */ +TEST(empty_pattern) { + loreg_error_t err; + loreg_regex_t *re = loreg_compile("", &err); + ASSERT(re != NULL); + + loreg_match_t result; + ASSERT(loreg_match(re, "anything", &result)); + + loreg_free(re); +} +/* loreg_search reports match_start 3 and match_end 7 for "test" inside "xxxtestyyy". */ +TEST(match_position) { + loreg_error_t err; + loreg_regex_t *re = loreg_compile("test", &err); + ASSERT(re != NULL); + + loreg_match_t result; + ASSERT(loreg_search(re, "xxxtestyyy", &result)); + ASSERT(result.match_start == 3); + ASSERT(result.match_end == 7); + + loreg_free(re); +} +/* Runs every matcher test, prints the pass/fail totals, and exits with 1 if any test failed, 0 otherwise. */ +int main(void) { + printf("matcher tests:\n"); + + RUN_TEST(simple_char); + RUN_TEST(concat); + RUN_TEST(alternation); + RUN_TEST(star); + RUN_TEST(plus); + RUN_TEST(question); + RUN_TEST(dot); + RUN_TEST(bracket_simple); + RUN_TEST(bracket_range); + RUN_TEST(bracket_negated); + RUN_TEST(group); + RUN_TEST(anchors); + RUN_TEST(quantifier_exact); + RUN_TEST(quantifier_range); + RUN_TEST(quantifier_open); + RUN_TEST(class_digit); + RUN_TEST(class_word); + RUN_TEST(class_space); + RUN_TEST(class_negated); + RUN_TEST(escape_sequences); + RUN_TEST(complex_email); + RUN_TEST(complex_phone); + RUN_TEST(complex_url); + RUN_TEST(group_capture); + RUN_TEST(nested_groups); + RUN_TEST(empty_pattern); + RUN_TEST(match_position); + + printf("\nmatcher: %d passed, %d failed\n", tests_passed, tests_failed); + return tests_failed > 0 ? 1 : 0; +} diff --git a/tests/test_nfa.c b/tests/test_nfa.c new file mode 100644 index 0000000..03c4b77 --- /dev/null +++ b/tests/test_nfa.c @@ -0,0 +1,159 @@ +/* retoor */ +#include "../include/nfa.h" +#include "../include/parser.h" +#include +#include +/* NOTE(review): the two bare #include directives above appear to have lost their header names (angle-bracket text stripped); printf needs stdio.h - restore both from the original file. */ +static int tests_passed = 0; +static int tests_failed = 0; +/* TEST declares a static void test function; RUN_TEST prints the test name and then calls that function. */ +#define TEST(name) static void test_##name(void) +#define RUN_TEST(name) do { \ + printf(" %s... 
", #name); \ + test_##name(); \ + printf("ok\n"); \ + tests_passed++; \ +} while(0) + +#define ASSERT(cond) do { \ + if (!(cond)) { \ + printf("FAILED at line %d: %s\n", __LINE__, #cond); \ + tests_failed++; \ + return; \ + } \ +} while(0) + +static nfa_t *compile_pattern(const char *pattern) { + parser_t parser; + parser_init(&parser, pattern); + ast_node_t *ast = parser_parse(&parser); + if (!ast || parser_get_error(&parser) != LOREG_OK) { + ast_free(ast); + return NULL; + } + loreg_error_t error; + nfa_t *nfa = nfa_from_ast(ast, &error); + ast_free(ast); + return nfa; +} + +TEST(single_char) { + nfa_t *nfa = compile_pattern("a"); + ASSERT(nfa != NULL); + ASSERT(nfa->start != NULL); + ASSERT(nfa->state_count >= 2); + nfa_free(nfa); +} + +TEST(concat) { + nfa_t *nfa = compile_pattern("ab"); + ASSERT(nfa != NULL); + ASSERT(nfa->start != NULL); + nfa_free(nfa); +} + +TEST(alternation) { + nfa_t *nfa = compile_pattern("a|b"); + ASSERT(nfa != NULL); + ASSERT(nfa->start != NULL); + nfa_free(nfa); +} + +TEST(star) { + nfa_t *nfa = compile_pattern("a*"); + ASSERT(nfa != NULL); + ASSERT(nfa->start != NULL); + nfa_free(nfa); +} + +TEST(plus) { + nfa_t *nfa = compile_pattern("a+"); + ASSERT(nfa != NULL); + ASSERT(nfa->start != NULL); + nfa_free(nfa); +} + +TEST(question) { + nfa_t *nfa = compile_pattern("a?"); + ASSERT(nfa != NULL); + ASSERT(nfa->start != NULL); + nfa_free(nfa); +} + +TEST(group) { + nfa_t *nfa = compile_pattern("(ab)"); + ASSERT(nfa != NULL); + ASSERT(nfa->group_count == 1); + nfa_free(nfa); +} + +TEST(nested_groups) { + nfa_t *nfa = compile_pattern("((a)(b))"); + ASSERT(nfa != NULL); + ASSERT(nfa->group_count == 3); + nfa_free(nfa); +} + +TEST(bracket) { + nfa_t *nfa = compile_pattern("[abc]"); + ASSERT(nfa != NULL); + ASSERT(nfa->start != NULL); + nfa_free(nfa); +} + +TEST(quantifier) { + nfa_t *nfa = compile_pattern("a{2,4}"); + ASSERT(nfa != NULL); + ASSERT(nfa->start != NULL); + nfa_free(nfa); +} + +TEST(complex_pattern) { + nfa_t *nfa = 
compile_pattern("^([a-z]+)@([a-z]+)\\.([a-z]{2,})$"); + ASSERT(nfa != NULL); + ASSERT(nfa->group_count == 3); + nfa_free(nfa); +} +/* A pattern containing dot compiles to an NFA with a non-NULL start state. */ +TEST(dot) { + nfa_t *nfa = compile_pattern("a.b"); + ASSERT(nfa != NULL); + ASSERT(nfa->start != NULL); + nfa_free(nfa); +} +/* A pattern with both anchors compiles to an NFA with a non-NULL start state. */ +TEST(anchors) { + nfa_t *nfa = compile_pattern("^abc$"); + ASSERT(nfa != NULL); + ASSERT(nfa->start != NULL); + nfa_free(nfa); +} +/* The shorthand classes d, w and s compile to an NFA with a non-NULL start state. */ +TEST(character_classes) { + nfa_t *nfa = compile_pattern("\\d\\w\\s"); + ASSERT(nfa != NULL); + ASSERT(nfa->start != NULL); + nfa_free(nfa); +} +/* Runs every NFA test, prints the pass/fail totals, and exits with 1 if any test failed, 0 otherwise. */ +int main(void) { + printf("nfa tests:\n"); + + RUN_TEST(single_char); + RUN_TEST(concat); + RUN_TEST(alternation); + RUN_TEST(star); + RUN_TEST(plus); + RUN_TEST(question); + RUN_TEST(group); + RUN_TEST(nested_groups); + RUN_TEST(bracket); + RUN_TEST(quantifier); + RUN_TEST(complex_pattern); + RUN_TEST(dot); + RUN_TEST(anchors); + RUN_TEST(character_classes); + + printf("\nnfa: %d passed, %d failed\n", tests_passed, tests_failed); + return tests_failed > 0 ? 1 : 0; +} diff --git a/tests/test_parser.c b/tests/test_parser.c new file mode 100644 index 0000000..de67ba9 --- /dev/null +++ b/tests/test_parser.c @@ -0,0 +1,301 @@ +/* retoor */ +#include "../include/parser.h" +#include +#include +/* NOTE(review): the two bare #include directives above appear to have lost their header names (angle-bracket text stripped); printf needs stdio.h - restore both from the original file. */ +static int tests_passed = 0; +static int tests_failed = 0; +/* TEST declares a static void test function; RUN_TEST prints the test name and then calls that function. */ +#define TEST(name) static void test_##name(void) +#define RUN_TEST(name) do { \ + printf(" %s... 
", #name); int failed_before_ = tests_failed; /* failure count before the test body runs */ \ + test_##name(); \ + if (tests_failed == failed_before_) { printf("ok\n"); } /* report ok only when no ASSERT fired */ \ + tests_passed += (tests_failed == failed_before_); /* a failed test is no longer also counted as passed */ \ +} while(0) +/* ASSERT: when cond is false, print the source line and the expression text, count one failure, and return from the enclosing test function. */ +#define ASSERT(cond) do { \ + if (!(cond)) { \ + printf("FAILED at line %d: %s\n", __LINE__, #cond); \ + tests_failed++; \ + return; \ + } \ +} while(0) +/* "a" parses to a single AST_CHAR node with value 'a'. NOTE(review): in these tests a failing ASSERT returns before ast_free(ast) is reached, so the AST is not released on that path. */ +TEST(single_char) { + parser_t parser; + parser_init(&parser, "a"); + ast_node_t *ast = parser_parse(&parser); + + ASSERT(ast != NULL); + ASSERT(ast->type == AST_CHAR); + ASSERT(ast->value == 'a'); + + ast_free(ast); +} +/* "ab" parses to AST_CONCAT with AST_CHAR children 'a' on the left and 'b' on the right. */ +TEST(concat) { + parser_t parser; + parser_init(&parser, "ab"); + ast_node_t *ast = parser_parse(&parser); + + ASSERT(ast != NULL); + ASSERT(ast->type == AST_CONCAT); + ASSERT(ast->left->type == AST_CHAR); + ASSERT(ast->left->value == 'a'); + ASSERT(ast->right->type == AST_CHAR); + ASSERT(ast->right->value == 'b'); + + ast_free(ast); +} +/* "a|b" parses to AST_ALTER with AST_CHAR children 'a' on the left and 'b' on the right. */ +TEST(alternation) { + parser_t parser; + parser_init(&parser, "a|b"); + ast_node_t *ast = parser_parse(&parser); + + ASSERT(ast != NULL); + ASSERT(ast->type == AST_ALTER); + ASSERT(ast->left->type == AST_CHAR); + ASSERT(ast->left->value == 'a'); + ASSERT(ast->right->type == AST_CHAR); + ASSERT(ast->right->value == 'b'); + + ast_free(ast); +} +/* A starred literal parses to AST_STAR whose left child is AST_CHAR 'a'. */ +TEST(star) { + parser_t parser; + parser_init(&parser, "a*"); + ast_node_t *ast = parser_parse(&parser); + + ASSERT(ast != NULL); + ASSERT(ast->type == AST_STAR); + ASSERT(ast->left->type == AST_CHAR); + ASSERT(ast->left->value == 'a'); + + ast_free(ast); +} +/* "a+" parses to AST_PLUS whose left child is AST_CHAR 'a'. */ +TEST(plus) { + parser_t parser; + parser_init(&parser, "a+"); + ast_node_t *ast = parser_parse(&parser); + + ASSERT(ast != NULL); + ASSERT(ast->type == AST_PLUS); + ASSERT(ast->left->type == AST_CHAR); + ASSERT(ast->left->value == 'a'); + + ast_free(ast); +} +/* "a?" parses to AST_QUESTION whose left child is AST_CHAR 'a'. */ +TEST(question) { + parser_t parser; + parser_init(&parser, "a?"); + ast_node_t *ast = parser_parse(&parser); + + ASSERT(ast != NULL); + ASSERT(ast->type == AST_QUESTION); + ASSERT(ast->left->type == AST_CHAR); + ASSERT(ast->left->value == 'a'); + + ast_free(ast); +} +/* "(ab)" parses to AST_GROUP with group_id 0 wrapping an AST_CONCAT. */ +TEST(group) { + parser_t parser; +
parser_init(&parser, "(ab)"); + ast_node_t *ast = parser_parse(&parser); + + ASSERT(ast != NULL); + ASSERT(ast->type == AST_GROUP); + ASSERT(ast->group_id == 0); + ASSERT(ast->left->type == AST_CONCAT); + + ast_free(ast); +} +/* "." parses to AST_DOT. */ +TEST(dot) { + parser_t parser; + parser_init(&parser, "."); + ast_node_t *ast = parser_parse(&parser); + + ASSERT(ast != NULL); + ASSERT(ast->type == AST_DOT); + + ast_free(ast); +} +/* "^a$" parses to a tree whose root is AST_CONCAT. */ +TEST(anchors) { + parser_t parser; + parser_init(&parser, "^a$"); + ast_node_t *ast = parser_parse(&parser); + + ASSERT(ast != NULL); + ASSERT(ast->type == AST_CONCAT); + + ast_free(ast); +} +/* "[abc]" parses to AST_BRACKET with a bracket count of 3. */ +TEST(bracket_simple) { + parser_t parser; + parser_init(&parser, "[abc]"); + ast_node_t *ast = parser_parse(&parser); + + ASSERT(ast != NULL); + ASSERT(ast->type == AST_BRACKET); + ASSERT(ast->bracket != NULL); + ASSERT(ast->bracket->count == 3); + + ast_free(ast); +} +/* "[a-z]" parses to AST_BRACKET holding one range that runs from 'a' to 'z'. */ +TEST(bracket_range) { + parser_t parser; + parser_init(&parser, "[a-z]"); + ast_node_t *ast = parser_parse(&parser); + + ASSERT(ast != NULL); + ASSERT(ast->type == AST_BRACKET); + ASSERT(ast->bracket != NULL); + ASSERT(ast->bracket->count == 1); + ASSERT(ast->bracket->ranges[0].start == 'a'); + ASSERT(ast->bracket->ranges[0].end == 'z'); + + ast_free(ast); +} +/* "[^a]" parses to AST_BRACKET with negated set to true. */ +TEST(bracket_negated) { + parser_t parser; + parser_init(&parser, "[^a]"); + ast_node_t *ast = parser_parse(&parser); + + ASSERT(ast != NULL); + ASSERT(ast->type == AST_BRACKET); + ASSERT(ast->bracket->negated == true); + + ast_free(ast); +} +/* "a{3}" parses to AST_QUANTIFIER with min and max both 3. */ +TEST(quantifier_exact) { + parser_t parser; + parser_init(&parser, "a{3}"); + ast_node_t *ast = parser_parse(&parser); + + ASSERT(ast != NULL); + ASSERT(ast->type == AST_QUANTIFIER); + ASSERT(ast->quant.min == 3); + ASSERT(ast->quant.max == 3); + + ast_free(ast); +} +/* "a{2,5}" parses to AST_QUANTIFIER with min 2 and max 5. */ +TEST(quantifier_range) { + parser_t parser; + parser_init(&parser, "a{2,5}"); + ast_node_t *ast = parser_parse(&parser); + + ASSERT(ast != NULL); + ASSERT(ast->type == AST_QUANTIFIER); + ASSERT(ast->quant.min == 2); + ASSERT(ast->quant.max
== 5); + + ast_free(ast); +} +/* "a{2,}" parses to AST_QUANTIFIER with min 2 and max -1. */ +TEST(quantifier_open) { + parser_t parser; + parser_init(&parser, "a{2,}"); + ast_node_t *ast = parser_parse(&parser); + + ASSERT(ast != NULL); + ASSERT(ast->type == AST_QUANTIFIER); + ASSERT(ast->quant.min == 2); + ASSERT(ast->quant.max == -1); + + ast_free(ast); +} +/* The escaped d parses to AST_CLASS_DIGIT. */ +TEST(character_class_digit) { + parser_t parser; + parser_init(&parser, "\\d"); + ast_node_t *ast = parser_parse(&parser); + + ASSERT(ast != NULL); + ASSERT(ast->type == AST_CLASS_DIGIT); + + ast_free(ast); +} +/* The escaped w parses to AST_CLASS_WORD. */ +TEST(character_class_word) { + parser_t parser; + parser_init(&parser, "\\w"); + ast_node_t *ast = parser_parse(&parser); + + ASSERT(ast != NULL); + ASSERT(ast->type == AST_CLASS_WORD); + + ast_free(ast); +} +/* An anchored e-mail shaped pattern parses to a non-NULL AST with parser error LOREG_OK. */ +TEST(complex_pattern) { + parser_t parser; + parser_init(&parser, "^([a-z]+)@([a-z]+)\\.([a-z]{2,})$"); + ast_node_t *ast = parser_parse(&parser); + + ASSERT(ast != NULL); + ASSERT(parser_get_error(&parser) == LOREG_OK); + + ast_free(ast); +} +/* Unclosed parenthesis: passes when the parser returns NULL or reports LOREG_ERR_UNBALANCED_PAREN. NOTE(review): a NULL result passes whatever the error code is, and ast_free may then receive NULL - assumes ast_free accepts NULL; confirm. */ +TEST(unbalanced_paren) { + parser_t parser; + parser_init(&parser, "(abc"); + ast_node_t *ast = parser_parse(&parser); + + ASSERT(ast == NULL || parser_get_error(&parser) == LOREG_ERR_UNBALANCED_PAREN); + + ast_free(ast); +} +/* A star followed by a question mark parses to AST_STAR with quant.greedy false. */ +TEST(non_greedy) { + parser_t parser; + parser_init(&parser, "a*?"); + ast_node_t *ast = parser_parse(&parser); + + ASSERT(ast != NULL); + ASSERT(ast->type == AST_STAR); + ASSERT(ast->quant.greedy == false); + + ast_free(ast); +} +/* Runs every parser test, prints the pass/fail totals, and exits with 1 if any test failed, 0 otherwise. */ +int main(void) { + printf("parser tests:\n"); + + RUN_TEST(single_char); + RUN_TEST(concat); + RUN_TEST(alternation); + RUN_TEST(star); + RUN_TEST(plus); + RUN_TEST(question); + RUN_TEST(group); + RUN_TEST(dot); + RUN_TEST(anchors); + RUN_TEST(bracket_simple); + RUN_TEST(bracket_range); + RUN_TEST(bracket_negated); + RUN_TEST(quantifier_exact); + RUN_TEST(quantifier_range); + RUN_TEST(quantifier_open); + RUN_TEST(character_class_digit); + RUN_TEST(character_class_word); + RUN_TEST(complex_pattern); + RUN_TEST(unbalanced_paren); + RUN_TEST(non_greedy); + +
printf("\nparser: %d passed, %d failed\n", tests_passed, tests_failed); /* summary line with both counters */ + return tests_failed > 0 ? 1 : 0; /* exit status 1 if any test failed, 0 otherwise */ +}