New repository.
All checks were successful
Build and run rrex3 / build (push) Successful in 13s
Build and run rrex2 / build (push) Successful in 43s
Build and run rrex4 / build (push) Successful in 19s

This commit is contained in:
retoor 2025-01-14 19:05:52 +01:00
commit 6a65176b15
22 changed files with 14145 additions and 0 deletions

192
.clang-format Normal file
View File

@ -0,0 +1,192 @@
---
Language: Cpp
# BasedOnStyle: LLVM
AccessModifierOffset: -2
AlignAfterOpenBracket: Align
AlignArrayOfStructures: None
AlignConsecutiveMacros: None
AlignConsecutiveAssignments: None
AlignConsecutiveBitFields: None
AlignConsecutiveDeclarations: None
AlignEscapedNewlines: Right
AlignOperands: Align
AlignTrailingComments: true
AllowAllArgumentsOnNextLine: true
AllowAllParametersOfDeclarationOnNextLine: true
AllowShortEnumsOnASingleLine: true
AllowShortBlocksOnASingleLine: Never
AllowShortCaseLabelsOnASingleLine: false
AllowShortFunctionsOnASingleLine: All
AllowShortLambdasOnASingleLine: All
AllowShortIfStatementsOnASingleLine: Never
AllowShortLoopsOnASingleLine: false
AlwaysBreakAfterDefinitionReturnType: None
AlwaysBreakAfterReturnType: None
AlwaysBreakBeforeMultilineStrings: false
AlwaysBreakTemplateDeclarations: MultiLine
AttributeMacros:
- __capability
BinPackArguments: true
BinPackParameters: true
BraceWrapping:
AfterCaseLabel: false
AfterClass: false
AfterControlStatement: Never
AfterEnum: false
AfterFunction: false
AfterNamespace: false
AfterObjCDeclaration: false
AfterStruct: false
AfterUnion: false
AfterExternBlock: false
BeforeCatch: false
BeforeElse: false
BeforeLambdaBody: false
BeforeWhile: false
IndentBraces: false
SplitEmptyFunction: true
SplitEmptyRecord: true
SplitEmptyNamespace: true
BreakBeforeBinaryOperators: None
BreakBeforeConceptDeclarations: true
BreakBeforeBraces: Attach
BreakBeforeInheritanceComma: false
BreakInheritanceList: BeforeColon
BreakBeforeTernaryOperators: true
BreakConstructorInitializersBeforeComma: false
BreakConstructorInitializers: BeforeColon
BreakAfterJavaFieldAnnotations: false
BreakStringLiterals: true
ColumnLimit: 80
CommentPragmas: '^ IWYU pragma:'
QualifierAlignment: Leave
CompactNamespaces: false
ConstructorInitializerIndentWidth: 4
ContinuationIndentWidth: 4
Cpp11BracedListStyle: true
DeriveLineEnding: true
DerivePointerAlignment: false
DisableFormat: false
EmptyLineAfterAccessModifier: Never
EmptyLineBeforeAccessModifier: LogicalBlock
ExperimentalAutoDetectBinPacking: false
PackConstructorInitializers: BinPack
BasedOnStyle: ''
ConstructorInitializerAllOnOneLineOrOnePerLine: false
AllowAllConstructorInitializersOnNextLine: true
FixNamespaceComments: true
ForEachMacros:
- foreach
- Q_FOREACH
- BOOST_FOREACH
IfMacros:
- KJ_IF_MAYBE
IncludeBlocks: Preserve
IncludeCategories:
- Regex: '^"(llvm|llvm-c|clang|clang-c)/'
Priority: 2
SortPriority: 0
CaseSensitive: false
- Regex: '^(<|"(gtest|gmock|isl|json)/)'
Priority: 3
SortPriority: 0
CaseSensitive: false
- Regex: '.*'
Priority: 1
SortPriority: 0
CaseSensitive: false
IncludeIsMainRegex: '(Test)?$'
IncludeIsMainSourceRegex: ''
IndentAccessModifiers: false
IndentCaseLabels: false
IndentCaseBlocks: false
IndentGotoLabels: true
IndentPPDirectives: None
IndentExternBlock: AfterExternBlock
IndentRequires: false
IndentWidth: 4
IndentWrappedFunctionNames: false
InsertTrailingCommas: None
JavaScriptQuotes: Leave
JavaScriptWrapImports: true
KeepEmptyLinesAtTheStartOfBlocks: true
LambdaBodyIndentation: Signature
MacroBlockBegin: ''
MacroBlockEnd: ''
MaxEmptyLinesToKeep: 1
NamespaceIndentation: None
ObjCBinPackProtocolList: Auto
ObjCBlockIndentWidth: 2
ObjCBreakBeforeNestedBlockParam: true
ObjCSpaceAfterProperty: false
ObjCSpaceBeforeProtocolList: true
PenaltyBreakAssignment: 2
PenaltyBreakBeforeFirstCallParameter: 19
PenaltyBreakComment: 300
PenaltyBreakFirstLessLess: 120
PenaltyBreakOpenParenthesis: 0
PenaltyBreakString: 1000
PenaltyBreakTemplateDeclaration: 10
PenaltyExcessCharacter: 1000000
PenaltyReturnTypeOnItsOwnLine: 60
PenaltyIndentedWhitespace: 0
PointerAlignment: Right
PPIndentWidth: -1
ReferenceAlignment: Pointer
ReflowComments: true
RemoveBracesLLVM: false
SeparateDefinitionBlocks: Leave
ShortNamespaceLines: 1
SortIncludes: CaseSensitive
SortJavaStaticImport: Before
SortUsingDeclarations: true
SpaceAfterCStyleCast: false
SpaceAfterLogicalNot: false
SpaceAfterTemplateKeyword: true
SpaceBeforeAssignmentOperators: true
SpaceBeforeCaseColon: false
SpaceBeforeCpp11BracedList: false
SpaceBeforeCtorInitializerColon: true
SpaceBeforeInheritanceColon: true
SpaceBeforeParens: ControlStatements
SpaceBeforeParensOptions:
AfterControlStatements: true
AfterForeachMacros: true
AfterFunctionDefinitionName: false
AfterFunctionDeclarationName: false
AfterIfMacros: true
AfterOverloadedOperator: false
BeforeNonEmptyParentheses: false
SpaceAroundPointerQualifiers: Default
SpaceBeforeRangeBasedForLoopColon: true
SpaceInEmptyBlock: false
SpaceInEmptyParentheses: false
SpacesBeforeTrailingComments: 1
SpacesInAngles: Never
SpacesInConditionalStatement: false
SpacesInContainerLiterals: true
SpacesInCStyleCastParentheses: false
SpacesInLineCommentPrefix:
Minimum: 1
Maximum: -1
SpacesInParentheses: false
SpacesInSquareBrackets: false
SpaceBeforeSquareBrackets: false
BitFieldColonSpacing: Both
Standard: Latest
StatementAttributeLikeMacros:
- Q_EMIT
StatementMacros:
- Q_UNUSED
- QT_REQUIRE_VERSION
TabWidth: 8
UseCRLF: false
UseTab: Never
WhitespaceSensitiveMacros:
- STRINGIZE
- PP_STRINGIZE
- BOOST_PP_STRINGIZE
- NS_SWIFT_NAME
- CF_SWIFT_NAME
...

View File

@ -0,0 +1,28 @@
name: Build and run rrex2
on:
push:
branches: [ "main" ]
pull_request:
branches: [ "main" ]
env:
BUILD_TYPE: Release
jobs:
build:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- name: Build
working-directory: ${{github.workspace}}
run: make build
- name: Test
working-directory: ${{github.workspace}}
run: make run

View File

@ -0,0 +1,28 @@
name: Build and run rrex3
on:
push:
branches: [ "main" ]
pull_request:
branches: [ "main" ]
env:
BUILD_TYPE: Release
jobs:
build:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- name: Build
working-directory: ${{github.workspace}}
run: make build_rrex3
- name: Test
working-directory: ${{github.workspace}}
run: make run_rrex3

View File

@ -0,0 +1,28 @@
name: Build and run rrex4
on:
push:
branches: [ "main" ]
pull_request:
branches: [ "main" ]
env:
BUILD_TYPE: Release
jobs:
build:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- name: Build
working-directory: ${{github.workspace}}
run: make rrex4
- name: Test
working-directory: ${{github.workspace}}
run: make debug_rrex4

14
.gitignore vendored Normal file
View File

@ -0,0 +1,14 @@
.vscode
.bzr
*.o
rrex2
rrex2full.c
rrex2full
rrex4
rrex.coverage
rrex3.coverage
rrex4.coverage
*.cast
*.tty
regex.py
rrex3alle.c

113
Makefile Normal file
View File

@ -0,0 +1,113 @@
all: format_all build run
update_rlib:
cp ../rlib/rlib.c ./rlib.h
format_all:
clang-format *.c *.h -i
build:
gcc rrex2.c -o rrex2 -O2 -Wall -Wextra -static
run:
./rrex2
test:
$(MAKE) build
./rrex2 test
cli: build
./rrex2 cli
one-file:
rmerge rrex2.c > rrex2full.c
clang-format rrex2full.c -i
gcc rrex2full.c -o rrexfull.o -O3 -static -Wall -Wextra
@echo "g++ rrex2full.c -o rrex2full.o -O2"
compiler:
gcc compiler.c -o compiler.o -O3
./compiler.o
coverage:
@rm -f *.gcda 2>/dev/null
@rm -f *.gcno 2>/dev/null
@rm -f rrex.coverage.info 2>/dev/null
gcc -pg -fprofile-arcs -ftest-coverage -g -o rrex_coverage.o rrex2.c
./rrex_coverage.o test
lcov --capture --directory . --output-file rrex.coverage.info
genhtml rrex.coverage.info --output-directory rrex.coverage
@rm -f *.gcda 2>/dev/null
@rm -f *.gcno 2>/dev/null
@rm -f rrex.coverage.info 2>/dev/null
@rm -f rrex_coverage.o
@rm -f gmon.out
google-chrome rrex.coverage/index.html
build_and_run_rrex3: build_rrex3 run_rrex3
build_rrex3:
gcc rrex3.c -o rrex3 -Wall -Wextra -Ofast
-@rmerge rrex3.c > rrex3all.c
-@gcc -E rrex3.c -o rrex3alle.c -Wall -Wextra -Ofast
run_rrex3:
./rrex3
r4: rrex4 run_rrex4
rrex4: rrex4.c rrex4.h
gcc rrex4.c -o rrex4 -Wall -Wextra -Ofast
run_rrex4:
./rrex4
debug_rrex4:
./rrex4 --debug
coverage_rrex4:
@rm -f *.gcda 2>/dev/null
@rm -f *.gcno 2>/dev/null
@rm -f rrex4.coverage.info 2>/dev/null
gcc -pg -fprofile-arcs -ftest-coverage -g -o rrex4_coverage.o rrex4.c
./rrex4_coverage.o test --debug
lcov --capture --directory . --output-file rrex4.coverage.info
genhtml rrex4.coverage.info --output-directory rrex4.coverage
@rm -f *.gcda 2>/dev/null
@rm -f *.gcno 2>/dev/null
@rm -f rrex4.coverage.info 2>/dev/null
@rm -f rrex4_coverage.o
@rm -f gmon.out
google-chrome rrex4.coverage/index.html
build_and_run_re: build_re run_re
build_re:
gcc re.c -o re -Wall -Wextra -O2
run_re:
./re "/home/projects/retoor/rlib" " (.*) ";
coverage_rrex3:
@rm -f *.gcda 2>/dev/null
@rm -f *.gcno 2>/dev/null
@rm -f rrex3.coverage.info 2>/dev/null
gcc -pg -fprofile-arcs -ftest-coverage -g -o rrex3_coverage.o rrex3.c
./rrex3_coverage.o test
lcov --capture --directory . --output-file rrex3.coverage.info
genhtml rrex3.coverage.info --output-directory rrex3.coverage
@rm -f *.gcda 2>/dev/null
@rm -f *.gcno 2>/dev/null
@rm -f rrex3.coverage.info 2>/dev/null
@rm -f rrex3_coverage.o
@rm -f gmon.out
google-chrome rrex3.coverage/index.html
publish:
brz add
brz commit
brz push lp:rrex2

54
README.md Normal file
View File

@ -0,0 +1,54 @@
# RREX
## Regular expression interpreter / validator
This regular expression validator was built with the goal of being faster than the glibc regular expression validator, and it succeeds: in 23 of 25 tests it scores better than the original glibc validator. My bytecode compiler is much faster than the one provided by glibc, and my executor often is as well. For a single validation, my validator is always the better choice.
## Benchmark and test
Benchmark vs. glibc regex. (Animated gif, takes some time to load)
![Gif of build process](build.gif)
## Todo / issues
- Fix the segmentation fault that occurs if the expression ends with \\d?
- rassert(!rrex("123", "[123]+b")); doesn't work
- The string `abc` is reported as valid for the expression `abc[gg]d`; it should not match
## Make
### `all:`
Runs the following tasks sequentially:
- **`one-file:`** Merges, formats, and compiles a single C file.
- **`format_all:`** Formats all `.c` and `.h` files using `clang-format`.
- **`build:`** Compiles the main program (`rrex2.c`) with optimization and static linking.
- **`run:`** Executes the compiled program (`rrex2`).
### `format_all:`
Formats all `.c` and `.h` files in the directory using `clang-format`.
### `build:`
Compiles `rrex2.c` into an executable named `rrex2` with optimization (`-O2`), all warnings enabled (`-Wall`), and extra warnings enabled (`-Wextra`). The executable is statically linked.
### `run:`
Runs the compiled program (`rrex2`).
### `test:`
Rebuilds the project by running the `build` target and then runs the program with `test` as an argument.
### `cli:`
Runs the `build` target and then executes the program in CLI mode.
### `compiler:`
Compiles `compiler.c` into an optimized executable `compiler.o` and then runs it.
### `backup:`
Creates a compressed archive (`rrex.rzip`) of all `.c`, `.h`, Makefile, and markdown files in the directory.
### `coverage:`
Generates code coverage information:
- Removes existing coverage data files.
- Compiles `rrex2.c` with profiling and test coverage flags.
- Executes the compiled coverage binary with `test` as an argument.
- Captures coverage data with `lcov` and generates an HTML report using `genhtml`.
- Opens the coverage report in Google Chrome.
- Cleans up intermediate coverage files and the binary.

BIN
build.gif Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 11 MiB

31
compiler.c Normal file
View File

@ -0,0 +1,31 @@
#include "compiler.h"
#include "rlib.h"
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
/**
 * Interactive loop: reads a regular expression from the terminal, compiles it
 * to bytecode and prints the human readable form of that bytecode.
 * The loop has no exit condition, so this function never returns.
 */
void rrex_compiler_repl() {
    rclear();
    printf("Type expression to convert bytecode to human readable format.\n");
    while (true) {
        rprintb("> ");
        char line[8096];
        line[0] = 0; // defined content even if nothing gets read
        rreadline(line, sizeof(line), true);
        // `line` is an array, so the former `!line` test could never be true.
        // Check the content instead so empty input is actually skipped.
        if (!line[0])
            continue;
        char bdata[sizeof(line) * 2];
        rprint("\\t");
        rrex_compile(line, bdata);
        rprinty("< ");
        print_bc(bdata);
        rprint("\n");
    }
}
/**
 * Entry point of the compiler tool: runs the compiler self tests first and
 * then drops into the interactive bytecode REPL.
 */
int main(void) {
    rrex_compiler_tests();
    puts("Executed all compiler tests at boot of this application.");
    rrex_compiler_repl();
    return 0;
}

314
compiler.h Normal file
View File

@ -0,0 +1,314 @@
#include "rlib.h"
#include "rrex.h"
#include <assert.h>
// State shared between successive compile_one() calls while one expression
// is being compiled.
typedef struct rrex_compiler_t {
// Initialised to 0 by rexx_init_compiler(); not read by the code in this file.
int previous_method;
// Position in the expression source where the most recently compiled element
// started. The '?' and '{n}' handlers re-read the source from here.
char *previous_method_start;
// Output buffer that receives the bytecode.
char *bdata;
// Expression source text.
char *rdata;
} rrex_compiler_t;
void compile_one(rrex_compiler_t *compiler, char **content, char **compiled,
int *indexp);
void rrex_compile(char *content, char *compiled);
int test_compiler();
int convert_bt(size_t i);
char *format_bc(char *code);
void print_bc(char *code);
int test_compile(char *s, char *r);
/**
 * Resets compiler state `c` for a new compilation.
 *
 * @param c     state object to initialise
 * @param rdata expression source text; also becomes the initial
 *              "previous element" position
 * @param bdata buffer that will receive the bytecode
 */
void rexx_init_compiler(rrex_compiler_t *c, char *rdata, char *bdata) {
    *c = (rrex_compiler_t){
        .previous_method = 0,
        .previous_method_start = rdata,
        .bdata = bdata,
        .rdata = rdata,
    };
}
// Bytecode opcodes emitted by compile_one(). Values start at 1 so that 0 can
// terminate a bytecode string. The order must stay in sync with the character
// table in convert_bt(), which maps opcode N to the N-th character.
typedef enum reg_new_t {
RN_LITERAL = 1,
RN_DRANGE = 2,
RN_ARANGE = 3,
RN_IGNORE = 4,
RN_REPEAT = 5,
RN_FUNCTION = 6,
RN_DOT,
RN_ROOF,
RN_CHOICE_START,
RN_CHOICE_END,
RN_WHITESPACE,
RN_SLASH_CD,
RN_SLASH_CW,
RN_PLUS,
RN_DOLLAR,
RN_ASTERISK,
RN_GROUP_START,
RN_GROUP_END,
RN_PIPE,
RN_QUESTION,
RN_DIGIT,
RN_ALPHA
} reg_new_t;
/**
 * Maps a bytecode opcode to the character used for it in the human readable
 * dump.
 *
 * @param i opcode value (1-based, see reg_new_t) or any other byte value
 * @return the display character for opcodes 1..22; every other value
 *         (including 0) is returned unchanged
 */
int convert_bt(size_t i) {
    static const char chars[] = "lRRirf.^[]wDW+$*()|?da";
    const size_t count = sizeof(chars) - 1; // exclude the terminator
    // Opcodes are 1-based. The explicit lower bound keeps i == 0 from
    // wrapping around in `i - 1` and reading outside the table.
    if (i >= 1 && i <= count)
        return chars[i - 1];
    return (int)i;
}
char *format_bc(char *code) {
static char result[50000];
result[0] = 0;
char value;
int type = 0;
for (size_t i = 0; i < strlen(code); i++) {
type = 0;
value = code[i];
if (i && (code[i - 1] == 1)) {
type = 1; // no byte
} else if (code[i - 1] == RN_REPEAT) {
type = 2; // int
} else {
type = 0; // byte
}
char chunk[10];
chunk[0] = 0;
if (type == 0) {
sprintf(chunk, "%c", convert_bt(value));
} else if (type == 2)
sprintf(chunk, "%d", value);
else
sprintf(chunk, "(%c)", value);
strcat(result, chunk);
}
return result;
}
/** Writes the human readable form of bytecode `code` to standard output. */
void print_bc(char *code) {
    fputs(format_bc(code), stdout);
}
// Compiles exactly one element of the expression at *content and appends its
// bytecode to *compiled starting at offset *indexp.
//
// compiler: shared state; previous_method_start is updated so that a
//           following '?' or '{n}' knows where the previous element began.
// content:  in/out cursor into the expression source.
// compiled: in/out pointer to the bytecode buffer (written back unchanged).
// indexp:   in/out write offset into the bytecode buffer.
//
// The output is always NUL-terminated at the new offset. No bounds checking
// is done on the output buffer.
void compile_one(rrex_compiler_t *compiler, char **content, char **compiled,
int *indexp) {
char *r = *content;
char *c = *compiled;
int index = *indexp;
if (*r == '*') {
compiler->previous_method_start = r;
c[index] = RN_ASTERISK;
index++;
r++;
} else if (*r == '\\') {
// Escape sequences: known classes become opcodes, anything else is emitted
// as the escaped character itself.
r++;
if (*r == 'd') {
compiler->previous_method_start = r - 1;
c[index] = RN_DIGIT;
index++;
r++;
} else if (*r == 'w') {
compiler->previous_method_start = r - 1;
c[index] = RN_ALPHA;
index++;
r++;
} else if (*r == 's') {
compiler->previous_method_start = r - 1;
c[index] = RN_WHITESPACE;
index++;
r++;
} else if (*r == 'D') {
compiler->previous_method_start = r - 1;
c[index] = RN_SLASH_CD;
index++;
r++;
} else if (*r == 'W') {
compiler->previous_method_start = r - 1;
c[index] = RN_SLASH_CW;
index++;
r++;
} else {
compiler->previous_method_start = r - 1;
c[index] = *r;
index++;
r++;
}
} else if (*r == '$') {
compiler->previous_method_start = r;
c[index] = RN_DOLLAR;
index++;
r++;
} else if (*r == '(') {
// Group: compile the members recursively until the closing parenthesis.
// NOTE(review): the loop does not test for the end of the string, so an
// unterminated group keeps reading past the expression.
char *choice_start = r;
r++;
c[index] = RN_GROUP_START;
index++;
while (*r != ')') {
compile_one(compiler, &r, &c, &index);
}
compiler->previous_method_start = choice_start;
c[index] = RN_GROUP_END;
index++;
r++;
} else if (*r == '|') {
compiler->previous_method_start = r;
c[index] = RN_PIPE;
index++;
r++;
} else if (*r == '?') {
// Optional element: the bytecode of the previous element is rewritten in
// prefix form, i.e. RN_QUESTION followed by that element's bytecode.
r++;
if (index) {
// Copy the source text of the previous element into a scratch buffer.
char buff_r[1024] = {0};
char *br = buff_r;
char *br_start = br;
char *first_position = compiler->previous_method_start;
char *rindex = first_position;
while (rindex != r - 1) {
*br = *rindex;
br++;
*br = 0;
rindex++;
}
br = br_start;
// Recompile that element on its own to learn its bytecode and length.
char buff_b[1024] = {0};
char *bc = buff_b;
char *bc_start = buff_b;
int indexb = 0;
compile_one(compiler, &br, &bc, &indexb);
bc = bc_start;
// Rewind over the already emitted element, then emit marker + element.
index -= strlen(bc);
c[index] = RN_QUESTION;
index++;
while (*bc) {
c[index] = *bc;
index++;
bc++;
}
compiler->previous_method_start = r - 1;
}
} else if (isalpharange(r) || isdigitrange(r)) {
// Range such as a-z or 0-9: opcode followed by the two boundary characters.
compiler->previous_method_start = r;
c[index] = isalpha(*r) ? RN_ARANGE : RN_DRANGE;
index++;
c[index] = *r;
index++;
r += 2;
c[index] = *r;
index++;
r++;
} else if (*r == '.') {
compiler->previous_method_start = r;
c[index] = RN_DOT;
index++;
r++;
} else if (*r == '^') {
compiler->previous_method_start = r;
c[index] = RN_ROOF;
index++;
r++;
} else if (*r == '[') {
// Character class: members are compiled recursively until ']'.
// NOTE(review): same missing end-of-string check as the '(' branch.
char *choice_start = r;
r++;
c[index] = RN_CHOICE_START;
index++;
while (*r != ']') {
compile_one(compiler, &r, &c, &index);
}
compiler->previous_method_start = choice_start;
c[index] = RN_CHOICE_END;
index++;
r++;
} else if (*r == '+') {
compiler->previous_method_start = r;
r++;
c[index] = RN_PLUS;
index++;
} else if (*r == '{') {
// Repetition {n}: no opcode is emitted. The previous element is expanded
// inline by recompiling its source text n - 1 more times.
r++;
char *to_repeat = compiler->previous_method_start; // r - 2;
compiler->previous_method_start = r;
// Last source character of the element to repeat (the one before '{').
char *to_repeat_end = r - 2;
if (isgrouping(to_repeat)) {
// Walk back to the bracket that opens the group closed right before '{'.
char begin_chr = groupcreverse(*(r - 2));
while (*to_repeat != begin_chr)
to_repeat--;
to_repeat--;
} else {
to_repeat--;
}
// Parse the decimal repeat count; only a single number is read here.
int times = *r - '0';
r++;
while (isdigit(*r)) {
times *= 10;
times += *r - '0';
r++;
}
for (int i = 0; i < times - 1; i++) {
char *repeat_index = to_repeat + 1;
while (repeat_index <= to_repeat_end) {
compile_one(compiler, &repeat_index, &c, &index);
}
}
// Skip the character after the count (the closing '}' in the {n} form).
r++;
} else {
// Any other character is emitted as itself.
compiler->previous_method_start = r;
c[index] = *r;
index++;
r++;
}
// Keep the output terminated and hand the cursors back to the caller.
c[index] = 0;
*indexp = index;
*content = r;
*compiled = c;
}
/**
 * Compiles the expression `content` into NUL-terminated bytecode.
 *
 * @param content  expression source text
 * @param compiled caller-provided output buffer; must be large enough for
 *                 the expanded bytecode, no bounds checking is performed
 */
void rrex_compile(char *content, char *compiled) {
    rrex_compiler_t state;
    rexx_init_compiler(&state, content, compiled);
    int written = 0;
    // compile_one() advances the cursor past every element it consumes.
    for (char *cursor = content; *cursor;) {
        compile_one(&state, &cursor, &compiled, &written);
    }
    compiled[written] = 0;
}
/**
 * Compiles expression `s` and asserts that the human readable form of the
 * resulting bytecode equals `r`.
 *
 * @return non-zero when the output matched
 */
int test_compile(char *s, char *r) {
    char compiled[50000] = {0};
    rrex_compile(s, compiled);
    bool result = strcmp(r, format_bc(compiled)) == 0;
    rassert(result);
    return result;
}
// Self tests for the compiler: each case compiles an expression and compares
// the human readable bytecode dump with the expected text. The {n} cases
// show that repetition is expanded inline rather than emitted as an opcode.
void rrex_compiler_tests() {
rtest_banner("rrex compiler");
test_compile("\\W\\w\\d\\D", "WadD");
test_compile("0-9", "R09");
test_compile("a-z", "Raz");
test_compile("0-9a-z", "R09Raz");
test_compile("0-9A-Z", "R09RAZ");
test_compile("^12^3", "^12^3");
test_compile("3{1}", "3");
test_compile("3{2}", "33");
test_compile("[123]{1}", "[123]");
test_compile("[123]{2}", "[123][123]");
test_compile("[123]{3}$", "[123][123][123]$");
test_compile("(123){3}$", "(123)(123)(123)$");
}

BIN
re Executable file

Binary file not shown.

53
re.c Normal file
View File

@ -0,0 +1,53 @@
#define RREX3_DEBUG 0
#include "../rlib/rlib.h"
#include "rrex3.h"
rrex3_t *rrex;
char *expr;
/**
 * Callback invoked for every path found by rforfile().
 *
 * Only regular `.c` / `.h` files smaller than 1024 bytes are examined, and
 * `rlib.h` is skipped. The file is loaded into a stack buffer and the global
 * expression `expr` is applied once; on a match the remaining string and the
 * first capture are printed. A newline is printed for every examined file.
 */
void file_found(char *path) {
    if (rfile_size(path) >= 1024)
        return;
    if (risdir(path))
        return;
    if (!(rstrendswith(path, ".c") || rstrendswith(path, ".h")))
        return;
    if (rstrendswith(path, "rlib.h"))
        return;

    size_t size = rfile_size(path);
    char file_data[size + 1 + 1024 * 1024];
    rfile_readb(path, file_data, size);
    file_data[size] = 0;
    rrex->str = file_data;

    // Only the first match is reported.
    if (rrex3(rrex, file_data, expr)) {
        printf("%s\n", rrex->str);
        printf(">%s<\n", rrex->matches[0]);
        printf("J,");
    }
    printf("\n");
}
/**
 * Usage: re <path> <expr>
 *
 * Walks <path> and runs file_found() on every entry.
 * NOTE(review): the expression is currently hard-coded to "int (.*)[; ]?";
 * argv[2] is only used to size the buffer.
 */
int main(int argc, char *argv[]) {
    rrex = rrex3_new();
    if (argc != 3) {
        puts("Usage: <path> <expr>");
        return 1;
    }
    char fixed[strlen(argv[2]) + 20];
    strcpy(fixed, "int (.*)[; ]?");
    expr = fixed;
    rforfile(argv[1], file_found);
    return 0;
}

4843
rlib.h Normal file

File diff suppressed because it is too large Load Diff

339
rrex.h Normal file
View File

@ -0,0 +1,339 @@
#include "rlib.h"
#include <ctype.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
/**
 * Tells whether string `s` is at least `l` characters long, without
 * scanning more than `l` characters.
 */
bool latleast(char *s, unsigned int l) {
    for (unsigned int seen = 0; seen < l; seen++) {
        if (s[seen] == '\0')
            return false; // string ended before l characters were seen
    }
    return true;
}
// NOTE(review): as written this function increments the *bytes* that s and n
// point to, not the pointers themselves. It therefore modifies both buffers
// in place (and would fault on string literals) and never walks the strings.
// It loops until the first byte of n wraps to 0 (returns false) or the first
// byte of s wraps to 0 (returns true).
// Presumably a length comparison was intended; confirm the intended contract
// before relying on or fixing this function.
bool long_enough(char *s, char *n) {
while (++(*n)) {
if (!(++(*s)))
return true;
}
return false;
}
/**
 * "Starts with": returns non-zero when `s1` begins with `s2`.
 * An empty `s2` is a prefix of every string.
 */
int swith(char *s1, char *s2) {
    for (; *s2; s1++, s2++) {
        // Also covers s1 ending early: its terminator differs from *s2.
        if (*s1 != *s2)
            return false;
    }
    return true;
}
/**
 * Copies `len` characters of `s`, starting at offset `start`, into `rdata`
 * and NUL-terminates the result.
 *
 * @param s     source string
 * @param start offset of the first character to copy
 * @param len   number of characters to copy
 * @param rdata destination, must hold at least len + 1 bytes; may alias `s`
 * @return number of characters copied; 0 (false) when the requested range
 *         is negative or runs past the end of `s`, in which case `rdata` is
 *         set to the empty string
 */
int substr(char *s, int start, int len, char *rdata) {
    // Negative values would index before the buffers.
    if (start < 0 || len < 0) {
        rdata[0] = '\0';
        return false;
    }
    // Make sure `start` itself lies inside the string before reading there.
    for (int i = 0; i < start; i++) {
        if (s[i] == '\0') {
            rdata[0] = '\0';
            return false;
        }
    }
    for (int i = 0; i < len; i++) {
        if (s[start + i] == '\0') {
            rdata[0] = '\0'; // never leave an unterminated partial copy
            return false;
        }
        rdata[i] = s[start + i];
    }
    rdata[len] = '\0';
    return len;
}
/**
 * Returns the counterpart of a bracket character ('{' <-> '}', '(' <-> ')',
 * '[' <-> ']'), or 0 when `c` is not a bracket.
 */
char groupcreverse(char c) {
    switch (c) {
    case '{':
        return '}';
    case '}':
        return '{';
    case '(':
        return ')';
    case ')':
        return '(';
    case '[':
        return ']';
    case ']':
        return '[';
    default:
        return 0;
    }
}
/** Returns 1 when `c` is one of the bracket characters, otherwise 0. */
char isgroupingc(char c) {
    return groupcreverse(c) ? 1 : 0;
}
/** Tells whether string `s` starts with a bracket character. */
bool isgrouping(char *s) {
    return isgroupingc(*s) != 0;
}
// Checks isgrouping(): every bracket character, opening or closing, counts
// as grouping; only the first character of the string is considered.
void test_isgrouping() {
rassert(isgrouping("{"));
rassert(isgrouping("{test"));
rassert(isgrouping("}"));
rassert(isgrouping("("));
rassert(isgrouping(")"));
rassert(isgrouping("["));
rassert(isgrouping("]"));
rassert(!isgrouping("!"));
}
// Extracts the text between a leading s_open token and its matching s_close
// token, honouring nesting, and stores it in rdata.
//
// s:       input; must begin with s_open
// s_open:  opening token (may be longer than one character)
// s_close: closing token
// rdata:   receives the extracted content
//
// Returns the offset of the matching s_close within s on success.
// Returns false (0) when substr() reports an empty extraction.
// Returns -1, with rdata set to "", when s does not begin with s_open or no
// matching close token is found.
int sextract(char *s, char *s_open, char *s_close, char *rdata) {
// Nesting depth; the extraction ends when it drops back to zero.
unsigned int indent = 0;
char *s_original = s;
char *sptr = s;
// Offset where the content begins; -1 until the opening token was seen.
int start = -1;
int pos = 0;
unsigned int s_open_len = strlen(s_open);
while (*sptr) {
pos = sptr - s_original;
if (start == -1 && !swith(sptr, s_open)) {
// Input does not begin with the opening token.
break;
} else if (start == -1) {
start = s_open_len;
indent++;
} else if (swith(sptr, s_open)) {
indent++;
} else if (swith(sptr, s_close)) {
indent--;
if (indent == 0) {
if (substr(s_original, start, pos - start, rdata))
return pos;
else
return false;
}
}
// The scan advances one character at a time, also across multi-character
// tokens.
sptr++;
}
rdata[0] = 0;
return -1;
}
/**
 * Copies the first token of expression `expr` into `ex` (NUL-terminated)
 * and returns the token length.
 *
 * Token kinds, in order of precedence:
 *   - escape sequence: backslash plus the following character (length 2)
 *   - bracketed group: from a bracket up to and including its matching
 *     counterpart, nesting honoured; returns 0 if the group is not closed
 *   - range such as "a-z" or "0-9" (length 3)
 *   - any other single character (length 1)
 *
 * @param expr expression source
 * @param ex   output buffer, must be able to hold the token plus terminator
 * @return token length, or 0 when there is no (complete) token
 */
int exprtok(char *expr, char *ex) {
    if (expr[0] == '\\' && expr[1] != 0) {
        // Store both characters; the former post-increment writes clobbered
        // the backslash and terminated at the wrong offset.
        ex[0] = expr[0];
        ex[1] = expr[1];
        ex[2] = 0;
        return 2;
    }
    char close_chr = groupcreverse(*expr);
    if (close_chr) {
        char open_chr = *expr;
        int length = 0;
        int indent = 0;
        while (expr[length]) {
            char c = expr[length];
            ex[length] = c;
            length++;
            if (c == open_chr) {
                indent++;
            } else if (c == close_chr) {
                indent--;
            }
            if (indent == 0) {
                break;
            }
        }
        // Terminate after the last copied character so the closing bracket
        // stays part of the token.
        ex[length] = 0;
        return indent == 0 ? length : 0;
    }
    if (isalpharange(expr) || isdigitrange(expr)) {
        for (int i = 0; i < 3; i++) {
            ex[i] = expr[i];
        }
        ex[3] = 0;
        return 3;
    }
    if (*expr) {
        ex[0] = *expr;
        ex[1] = 0;
        return 1;
    }
    ex[0] = 0;
    return 0;
}

/**
 * Test helper: tokenizes `expr` and asserts that the token has length `len`
 * and equals `texpected`. The produced token (not the input) is compared.
 */
void _test_exprtok(char *expr, char *texpected, int len) {
    char tok[4096];
    tok[0] = 0;
    int toklen = exprtok(expr, tok);
    if (toklen != len) {
        printf("%d:%d\n", toklen, len);
        printf("Assert error length of expected token %s\n", texpected);
        rassert(toklen == len);
    }
    if (strcmp(tok, texpected)) {
        printf(
            "Compare error of exprtok with expected token %s does not starts "
            "with %s\n",
            tok, texpected);
        rassert(false);
    }
}
// A single expression token: its text and its length.
// Not referenced by the code visible in this file section.
typedef struct rreg_token_t {
char content[4096];
int len;
} rreg_token_t;
// Exercises exprtok() with one case per token kind: bracket group, digit and
// alpha ranges, escape sequence, lone backslash and plain character.
void test_exprtok() {
_test_exprtok("[abc]def", "[abc]", 5);
_test_exprtok("0-9", "0-9", 3);
_test_exprtok("a-z", "a-z", 3);
_test_exprtok("A-Z", "A-Z", 3);
_test_exprtok("\\w", "\\w", 2);
_test_exprtok("\\", "\\", 1);
_test_exprtok("a", "a", 1);
}
/**
 * Strips nested bracket pairs from the outside in: "[{(a)}]" becomes "a".
 *
 * @param s     input string
 * @param rdata receives the innermost content (or the last extraction)
 * @return number of bracket levels that were removed successfully
 */
int sexpand(char *s, char *rdata) {
    int levels = 0;
    // After each round the extracted content becomes the next input.
    for (; isgrouping(s); s = rdata) {
        char opener[2] = {s[0], '\0'};
        char closer[2] = {groupcreverse(opener[0]), '\0'};
        if (sextract(s, opener, closer, rdata) > 0)
            levels++;
    }
    return levels;
}
// Checks sexpand(): matching pairs are stripped and counted, mismatched
// pairs yield 0 levels and an empty result.
void test_sexpand() {
char rdata[1024];
rassert(sexpand("[a]", rdata) == 1);
rassert(!strcmp(rdata, "a"));
rassert(sexpand("(a)", rdata) == 1);
rassert(!strcmp(rdata, "a"));
rassert(sexpand("[a)", rdata) == 0);
rassert(!strcmp(rdata, ""));
rassert(sexpand("(a]", rdata) == 0);
rassert(!strcmp(rdata, ""));
rassert(sexpand("[{(a)}]", rdata) == 3);
rassert(!strcmp(rdata, "a"));
}
// Checks isalpharange(): accepts "<letter>-<letter>" in any case
// combination and rejects incomplete ranges and digit ranges.
void test_isalpharange() {
rassert(isalpharange("a-z"));
rassert(isalpharange("a-a"));
rassert(isalpharange("z-z"));
rassert(isalpharange("a-Z"));
rassert(isalpharange("Z-a"));
rassert(isalpharange("Z-Z"));
rassert(!isalpharange("-a"));
rassert(!isalpharange("a-"));
rassert(!isalpharange("a"));
rassert(!isalpharange("-"));
rassert(!isalpharange("z"));
rassert(!isalpharange("-A"));
rassert(!isalpharange("A-"));
rassert(!isalpharange("A"));
rassert(!isalpharange("-"));
rassert(!isalpharange("Z"));
rassert(!isalpharange("0-9"));
}
// Checks isdigitrange(): accepts "<digit>-<digit>" and rejects incomplete
// ranges and letter ranges.
void test_isdigitrange() {
rassert(isdigitrange("0-9"));
rassert(isdigitrange("0-0"));
rassert(isdigitrange("9-9"));
rassert(!isdigitrange("-0"));
rassert(!isdigitrange("0-"));
rassert(!isdigitrange("0"));
rassert(!isdigitrange("-"));
rassert(!isdigitrange("9"));
rassert(!isdigitrange("a-a"));
}
// Checks swith(): the first argument must begin with the second one.
void test_swith() {
rassert(swith("r", "r"));
rassert(!swith("r", "re"));
rassert(swith("retoor", "r"));
rassert(swith("retoor", "re"));
rassert(swith("retoor", "retoor"));
rassert(!swith("retoor", "retoori"));
rassert(!swith("retoor", "retoorii"));
rassert(!swith("<value>", "<v>"));
rassert(!swith("<v>", "<value>"));
}
// Checks substr(): the return value is the copied length and the output
// holds exactly the requested slice.
void test_substr() {
int r;
char str[1024];
r = substr("[-]", 1, 1, str);
rassert(r == 1);
rassert(!strcmp(str, "-"));
r = substr("[-]", 0, 1, str);
rassert(r == 1);
rassert(!strcmp(str, "["));
r = substr("[-]", 2, 1, str);
rassert(r == 1);
rassert(!strcmp(str, "]"));
r = substr("[-]", 0, 3, str);
rassert(r == 3);
rassert(!strcmp(str, "[-]"));
r = substr("[-]", 0, 2, str);
rassert(r == 2);
rassert(!strcmp(str, "[-"));
}
// Checks sextract() with single-character brackets, nested and unbalanced
// input, and multi-character open/close tokens. On success the return value
// is the offset of the closing token; -1 signals no extraction.
void test_sextract() {
char rdata[1024];
int pos = 0;
rassert((pos = sextract("(valid)", "(", ")", rdata)) == 6);
rassert(!strcmp("valid", rdata));
rassert((pos = sextract("{valid}", "{", "}", rdata)) == 6);
rassert(!strcmp("valid", rdata));
rassert((pos = sextract("{{valid}", "{", "}", rdata)) == -1);
rassert(!strcmp("", rdata));
rassert((pos = sextract("{valid}}", "{", "}", rdata)) == 6);
rassert(!strcmp("valid", rdata));
rassert((pos = sextract("{{valid}}", "{", "}", rdata)) == 8);
rassert(!strcmp("{valid}", rdata));
rassert((pos = sextract("{[({valid}}", "{", "}", rdata)) == 10);
rassert(!strcmp("[({valid}", rdata));
rassert((pos = sextract("/*valid*/", "/*", "*/", rdata)) > 0);
rassert(!strcmp("valid", rdata));
rassert((pos = sextract("/**valid*/", "/**", "*/", rdata)) > 0);
rassert(!strcmp("valid", rdata));
rassert((pos = sextract("/*valid**/", "/*", "**/", rdata)) > 0);
rassert(!strcmp("valid", rdata));
rassert((pos = sextract("<test>valid</test>", "<test>", "</test>", rdata)) >
0);
rassert(!strcmp("valid", rdata));
rassert((pos = sextract("<t>valid</t>", "<test>", "</test>", rdata)) == -1);
rassert(!strcmp("", rdata));
}
// Checks latleast(): true when the string has at least the given length.
void test_latleast() {
rassert(latleast("", 0));
rassert(latleast("a", 1));
rassert(latleast("aa", 1));
rassert(latleast("aaa", 1));
rassert(latleast("aa", 2));
rassert(latleast("aaa", 2));
rassert(!latleast("a", 2));
}
/** Tells whether `c` is a space, tab, carriage return or line feed. */
bool iswhitespace(char c) {
    switch (c) {
    case ' ':
    case '\t':
    case '\r':
    case '\n':
        return true;
    default:
        return false;
    }
}
/**
 * Runs the self tests of all string helper functions in this header.
 */
void rrex_functions_test() {
    // Terminated explicitly, like the rtest_banner() call in
    // rrex_compiler_tests(); the statement no longer relies on what the
    // macro expands to.
    rtest_banner("rrex functions");
    test_isalpharange();
    test_isdigitrange();
    test_swith();
    test_substr();
    test_sextract();
    test_isgrouping();
    test_sexpand();
    test_latleast();
    test_exprtok();
}

242
rrex2.c Normal file
View File

@ -0,0 +1,242 @@
#include "rrex2.h"
#include <math.h>
#include <regex.h>
/**
 * Benchmark body for the C library regex engine.
 *
 * The compiled regex lives in the benchmark session (`rbf->data`): it is
 * allocated and compiled on the first iteration, executed on every
 * iteration, and released on the last one.
 *
 * @param s string to match
 * @param r POSIX extended regular expression
 */
void cregex_repeat(char *s, char *r) {
    // Get object from shared data object. rbf is session variable of current
    // bench function.
    regex_t *regex = (regex_t *)rbf->data;
    // Only gets executed at beginning of the benchmark. Executed once.
    if (rbf->first) {
        // Set session data
        regex = (regex_t *)malloc(sizeof(regex_t));
        rbf->data = regex;
        regcomp(regex, r, REG_EXTENDED);
    }
    // The code to benchmark
    rassert(!regexec(regex, s, 0, NULL, 0));
    // Is executed only once at end of benchmark
    if (rbf->last) {
        regfree(regex);
        // regfree() only releases the compiled pattern; the regex_t itself
        // was malloc'd above and must be freed as well.
        free(regex);
        rbf->data = NULL;
    }
}
/**
 * Benchmark body for the rrex engine.
 *
 * The bytecode is compiled once on the first iteration and kept in the
 * benchmark session (`rbf->data`); every iteration matches `s` against it
 * and the last iteration frees the buffer.
 *
 * @param s string to match
 * @param r expression to compile
 */
void rrex_repeat(char *s, char *r) {
    if (rbf->first) {
        char *fresh = (char *)malloc(4096);
        rrex_compile(r, fresh);
        rbf->data = fresh;
    }
    char *bdata = (char *)rbf->data;
    rassert(rrex_match(s, bdata));
    if (rbf->last)
        free(rbf->data);
}
// Benchmark totals, updated by benchmark() and reported by
// rrex_benchmark_tests().
// Number of benchmarks where execute2() reported winner == 1.
int wins_rrex = 0;
// Number of benchmarks with any other winner.
int loss_rrex = 0;
// Sum of the execution times reported by all benchmark runs.
nsecs_t total_execution_time = 0;
// Total number of executions (two functions per benchmark).
long total_times = 0;
/**
 * Validates a Dutch zip code in plain C: four digits, one whitespace
 * character and two letters, e.g. "7245 SR".
 *
 * @param code NUL-terminated candidate
 * @return true when `code` has exactly that shape
 */
bool validate_dutch_zipcode_c(char *code) {
    if (strlen(code) != 7)
        return false;
    for (int i = 0; i < 4; i++) {
        if (!isdigit((unsigned char)code[i]))
            return false;
    }
    if (!iswhitespace(code[4])) {
        return false;
    }
    // Both letters (positions 5 and 6) must be checked; the loop used to
    // start at 6 and accepted codes such as "7245 3R".
    for (int i = 5; i < 7; i++) {
        if (!isalpha((unsigned char)code[i]))
            return false;
    }
    return true;
}
/**
 * Validates a Dutch zip code (four digits, whitespace, two letters) with
 * one explicit check per position instead of loops.
 */
bool validate_dutch_zipcode_c_literal(char *code) {
    if (strlen(code) != 7)
        return false;
    if (!isdigit(code[0]) || !isdigit(code[1]) || !isdigit(code[2]) ||
        !isdigit(code[3]))
        return false;
    if (!iswhitespace(code[4]))
        return false;
    return isalpha(code[5]) && isalpha(code[6]);
}
/**
 * Benchmark body: compiles and executes the zip code pattern with the C
 * library regex engine on every call.
 *
 * A non-matching input is a normal outcome and is not reported; only a
 * failing regcomp() or a real regexec() error aborts the program.
 *
 * @param s candidate zip code
 */
void validate_dutch_zipcode_creg(char *s) {
    regex_t regex;
    // POSIX extended syntax has no \d, so the digit class is spelled out.
    const char *pattern = "[0-9]{4} [a-zA-Z]{2}";
    int ret = regcomp(&regex, pattern, REG_EXTENDED);
    if (ret) {
        printf("cregex comp error\n");
        exit(EXIT_FAILURE);
    }
    ret = regexec(&regex, s, 0, NULL, 0);
    // regexec() returns 0 on a match and REG_NOMATCH on no match; anything
    // else is an actual failure.
    if (ret != 0 && ret != REG_NOMATCH) {
        printf("cregex exec error\n");
        regfree(&regex);
        exit(EXIT_FAILURE);
    }
    regfree(&regex);
}
/**
 * Benchmark body: executes the zip code pattern with the C library regex
 * engine, compiling it only once.
 *
 * The compiled regex is stored in the benchmark session (`rbf->data`) on the
 * first iteration and released on the last one. A non-matching input is a
 * normal outcome; only compile or execution errors abort the program.
 *
 * @param s candidate zip code
 */
void validate_dutch_zipcode_creg_precompiled(char *s) {
    regex_t *regex = (regex_t *)rbf->data;
    // Only gets executed at beginning of the benchmark. Executed once.
    if (rbf->first) {
        // Set session data
        regex = (regex_t *)malloc(sizeof(regex_t));
        rbf->data = regex;
        // POSIX extended syntax has no \d, so the digit class is spelled out.
        const char *pattern = "[0-9]{4} [a-zA-Z]{2}";
        if (regex == NULL || regcomp(regex, pattern, REG_EXTENDED)) {
            printf("cregex comp error\n");
            exit(EXIT_FAILURE);
        }
    }
    // The code to benchmark
    int ret = regexec(regex, s, 0, NULL, 0);
    // 0 means match, REG_NOMATCH means no match; both are valid results.
    if (ret != 0 && ret != REG_NOMATCH) {
        printf("cregex exec error\n");
        exit(EXIT_FAILURE);
    }
    // Is executed only once at end of benchmark
    if (rbf->last) {
        regfree(regex);
        // regfree() does not release the malloc'd regex_t itself.
        free(regex);
        rbf->data = NULL;
    }
}
/**
 * Benchmark body: matches `s` against the zip code pattern using rrex
 * bytecode that is compiled once on the first iteration, kept in the
 * benchmark session (`rbf->data`) and freed on the last iteration.
 *
 * @param s candidate zip code
 */
void validate_dutch_zipcode_rrex_precompiled(char *s) {
    char *bcode = (char *)rbf->data;
    if (rbf->first) {
        // The compiler expands {n} inline, so this pattern produces 22 bytes
        // of bytecode including the terminator. The former 20 byte buffer
        // was overrun; use the same size as rrex() and rrex_repeat().
        bcode = (char *)malloc(4096);
        rrex_compile("\\d{4} [a-zA-Z]{2}", bcode);
        rbf->data = bcode;
    }
    rrex_match(s, bcode);
    if (rbf->last) {
        free(bcode);
    }
}
/**
 * Validates a zip code with rrex, compiling the pattern on every call.
 *
 * @return the result of rrex() for `s` and the zip code expression
 */
bool validate_dutch_zipcode_rrex(char *s) {
    char *zipcode_expr = "\\d{4} [a-zA-Z]{2}";
    return rrex(s, zipcode_expr);
}
// Benchmarks all zip code validators (rrex, rrex precompiled, C regex,
// C regex precompiled and two hand written C versions) against input `s`,
// running each of them `times` times.
void benchmark_dutch_zipcode(long times, char *s) {
rbench_t *r = rbench_new();
r->show_progress = false;
r->stdout = false;
r->add_function(r, "rrex", "zipcode", (void *)validate_dutch_zipcode_rrex);
r->add_function(r, "rrex compiled", "zipcode",
(void *)validate_dutch_zipcode_rrex_precompiled);
r->add_function(r, "creg", "zipcode", (void *)validate_dutch_zipcode_creg);
r->add_function(r, "creg compiled", "zipcode",
(void *)validate_dutch_zipcode_creg_precompiled);
r->add_function(r, "native c", "zipcode c",
(void *)validate_dutch_zipcode_c);
r->add_function(r, "native c literal", "zipcode c",
(void *)validate_dutch_zipcode_c_literal);
printf("Benchmarking validation of %s with rrex and native c code.\n", s);
// execute1: every registered function is called with the single argument s.
r->execute1(r, times, s);
rbench_free(r);
}
// Runs rrex against the C library regex engine for string `s` and expression
// `e`, `times` executions each, and updates the global win/loss and timing
// totals.
void benchmark(long times, char *s, char *e) {
rprint("Benchmark \\l string:<%s> expr:<%s>\t\n", s, e);
rbench_t *r;
r = rbench_new();
r->show_progress = false;
r->stdout = false;
r->add_function(r, "executor", "rrex", (void *)rrex_repeat);
r->add_function(r, "executor", "clib", (void *)cregex_repeat);
// winner == 1 is counted as a win for rrex, presumably because rrex is the
// first registered function - confirm against rbench.
if (r->execute2(r, times, s, e)->winner == 1) {
wins_rrex++;
} else {
loss_rrex++;
}
total_execution_time += r->execution_time;
// Two functions were executed `times` times each.
total_times += times * 2;
rbench_free(r);
}
// Runs the complete benchmark suite: the zip code validators first (at a
// tenth of the iterations), then rrex versus the C library engine for a list
// of string/expression pairs, and finally prints the totals.
void rrex_benchmark_tests(long times) {
benchmark_dutch_zipcode(times / 10, "7245 SR");
benchmark_dutch_zipcode(times / 10, "A245 SR");
benchmark_dutch_zipcode(times / 10, "7245 S3");
benchmark(times, "abababc", "^(ab)+c$");
// c regex does not support:
// benchmark(times,"123a33","\\d+a\\d+$");
benchmark(times, "9-3", "([3-9]-[3-9])");
benchmark(times, "1234A", "[1-4]{4}A");
benchmark(times, "abcdef", "abcd?ef");
benchmark(times, "ce", "(a|b|c|d)e");
benchmark(times, "a", "(a)");
benchmark(times, "aa", "(a){2}");
// benchmark(times, "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaq", "[^xyzv]+q$");
benchmark(times, "abcabcabcabcabcabc", "[acb][acb]{4}");
benchmark(times, "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
"[1A-Z0-9a12345]{33}");
benchmark(times, "abcd", "abcd");
benchmark(times, "aaaaaaaaa", "a{9}");
benchmark(times, "a", "[abc]");
benchmark(times, "aa", "[abc]{2}");
benchmark(times, "ab", "[abc]{2}");
benchmark(times, "ac", "[abc]{2}");
benchmark(times, "c", "[abc]");
benchmark(times, "123", "[0-9][0-9][0-9]");
benchmark(times, "ab*", "[a-z]b.");
benchmark(times, "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", ".{33}");
// benchmark(times, "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaq", "[dbac]+q$");
benchmark(times, "#include \"test.h\"", "^#include *\"[a-z\\.]+\"$");
benchmark(times, "abcdefgh", "^.*gh$");
benchmark(times, "randomtextbeforeabcdefgh", "^random.*gh$");
benchmark(times, "abcdefg", "a?bcdf?ef?g");
printf("Times won: %d / %d\n", wins_rrex, wins_rrex + loss_rrex);
printf("Total execution time: %s\n", format_time(total_execution_time));
printf("Total times: %s\n", rformat_number(total_times));
}
/*
 * Interactive loop: repeatedly reads a subject string and a regular
 * expression from the user and reports whether rrex() accepts the pair.
 * Never returns; the process has to be interrupted to leave the loop.
 */
void repl() {
    while (true) {
        char s[4096];
        char e[4096];
        rprint("%s", "Write a string to parse:\n");
        rreadline(s, 1024, true);
        rprint("Write a regular expression:\n");
        rreadline(e, 1024, true);
        rprint("\\t");
        // The branch already decides the outcome, so each arm prints its
        // own fixed verdict (green helper for a match, red helper otherwise).
        if (rrex(s, e)) {
            rprintgf(stdout, "\\T %s", "valid\n");
        } else {
            rprintrf(stdout, "\\T %s", "invalid\n");
        }
    }
}
/*
 * Runs the three rrex test groups (helper functions, compiler, executor),
 * closes the test session, prints a confirmation and pauses for a second.
 */
void rrex_tests() {
    rrex_functions_test();
    rrex_compiler_tests();
    rrex_executor_tests();

    // The summary value of the test session is deliberately not inspected.
    int res = rtest_end("");
    (void)res;

    rprintg("Tests passed.\n\n");
    sleep(1);
}
/*
 * Entry point.
 *   "cli"  -> start the interactive REPL and exit afterwards.
 *   "test" -> run tests plus a very short benchmark (20 iterations).
 *   other  -> run tests plus the full benchmark.
 */
int main(int argc, char *argv[]) {
    /*
    20000000 for 140s (1 billion times)
    16000000 for 100s
    32000000 for 200s
    4000000 for 30s
    2000000 for 15s -> this is minimum to get consistent result
    1000000 for 7.5s
    */
    long times = 2000000;
    const char *mode = argc > 1 ? argv[1] : NULL;

    if (mode && strcmp(mode, "cli") == 0) {
        repl();
        return 0;
    }
    if (mode && strcmp(mode, "test") == 0) {
        times = 20;
    }

    rrex_tests();
    rrex_benchmark_tests(times);
    return 0;
}

447
rrex2.h Normal file
View File

@ -0,0 +1,447 @@
#include "compiler.h"
#include "rlib.h"
#include <regex.h>
// ifwhile(cond, action) <statement>
// Expands to: declare `_did_doit` and set it to false, run `action` while
// `cond` holds, then guard the statement that follows the macro use with
// `if (_did_doit)`.
// NOTE(review): the macro itself never sets `_did_doit` to true, so `action`
// has to do that. Because it declares a variable, it can be used at most once
// per scope.
#define ifwhile(cond, action) \
bool _did_doit; \
_did_doit = false; \
while (cond) { \
action \
}; \
if (_did_doit)
// bool is_valid = expr ? 1 : 0;
// repeat:
// bool valid = expr != NULL *expr > 0;
// bool _expr_true = false;
// if(res){
// _expr_true = true;
//}
// bool ifwhile(bool res){
//
//}
struct rrex_executor_t;

/* Handler for one bytecode instruction; returns true when it matched. */
typedef bool (*rrex_function)(struct rrex_executor_t *);

/* State of a single match run over compiled bytecode. */
typedef struct rrex_executor_t {
    char *previous_position; /* bytecode position of the last executed op */
    char previous;           /* opcode byte of the last executed op */
    char *bdata;             /* current position in the bytecode */
    char *_bdata;            /* start of the bytecode */
    char *sdata;             /* current position in the subject string */
    char *_sdata;            /* start of the subject string */
    long current;            /* dispatch index of the op being executed */
    bool valid;
    /*
     * Dispatch table indexed by opcode. rrex_execute_one() treats every
     * byte value above 31 as a literal, so indices 0..31 must all be
     * addressable: the table needs 32 slots (it used to hold only 30).
     */
    rrex_function functions[32];
} rrex_executor_t;
bool rrex_match(char *sdata, char *bdata);
bool rrex_execute_one(rrex_executor_t *t);
bool rrex(char *s, char *r);
/*
 * Convenience entry point: compiles expression `r` into a local bytecode
 * buffer and matches subject `s` against it.
 * NOTE(review): the buffer is a fixed 4096 bytes and its size is not passed
 * to rrex_compile() - confirm the compiler cannot emit more than that.
 */
bool rrex(char *s, char *r) {
    char bytecode[4096];
    rrex_compile(r, bytecode);
    bool matched = rrex_match(s, bytecode);
    return matched;
}
/*
 * Start-of-line anchor: succeeds only while the subject cursor is still at
 * the very beginning of the subject. Consumes no subject characters.
 */
bool rrex_match_sol(rrex_executor_t *executor) {
    executor->previous = RN_ROOF;
    executor->previous_position = executor->bdata;
    if (executor->sdata != executor->_sdata) {
        return false;
    }
    executor->bdata++;
    return true;
}
/*
 * '.' - matches any single character except a newline.
 *
 * The end of the subject (NUL terminator) is rejected as well: accepting it
 * would move the subject cursor past the end of the string and make
 * repetitions such as ".*" read out of bounds.
 */
bool rrex_match_dot(rrex_executor_t *executor) {
    executor->previous = RN_DOT;
    executor->previous_position = executor->bdata;
    const char c = *executor->sdata;
    if (c == '\0' || c == '\n') {
        return false;
    }
    executor->sdata++;
    executor->bdata++;
    return true;
}
/*
 * \d - matches one decimal digit and advances both cursors.
 * The character is converted to unsigned char first: passing a negative
 * plain char to isdigit() is undefined behavior.
 */
bool rrex_match_digit(rrex_executor_t *executor) {
    if (!isdigit((unsigned char)*executor->sdata)) {
        return false;
    }
    executor->sdata++;
    executor->bdata++;
    return true;
}
/*
 * \s - matches one space, tab, newline or carriage return and advances both
 * cursors; any other character (including end of subject) fails.
 */
bool rrex_match_whitespace(rrex_executor_t *executor) {
    switch (*executor->sdata) {
    case ' ':
    case '\t':
    case '\n':
    case '\r':
        executor->sdata++;
        executor->bdata++;
        return true;
    default:
        return false;
    }
}
/*
 * \w - matches one alphabetic character or underscore (digits are not
 * accepted) and advances both cursors.
 * The character is converted to unsigned char first: passing a negative
 * plain char to isalpha() is undefined behavior.
 */
bool rrex_match_word(rrex_executor_t *executor) {
    const unsigned char c = (unsigned char)*executor->sdata;
    if (!(isalpha(c) || c == '_')) {
        return false;
    }
    executor->sdata++;
    executor->bdata++;
    return true;
}
/*
 * \W - matches one character that is neither alphabetic nor an underscore.
 *
 * The end of the subject (NUL terminator) never matches: it is not a
 * character, and consuming it would move the cursor past the string.
 * The character is converted to unsigned char before calling isalpha().
 */
bool rrex_match_not_word(rrex_executor_t *executor) {
    const unsigned char c = (unsigned char)*executor->sdata;
    if (c == '\0' || isalpha(c) || c == '_') {
        return false;
    }
    executor->sdata++;
    executor->bdata++;
    return true;
}
/*
 * \D - matches one character that is not a decimal digit.
 *
 * The end of the subject (NUL terminator) never matches: it is not a
 * character, and consuming it would move the cursor past the string.
 * The character is converted to unsigned char before calling isdigit().
 */
bool rrex_match_not_digit(rrex_executor_t *executor) {
    const unsigned char c = (unsigned char)*executor->sdata;
    if (c == '\0' || isdigit(c)) {
        return false;
    }
    executor->sdata++;
    executor->bdata++;
    return true;
}
/*
 * '$' - end-of-subject anchor: succeeds only when the subject cursor sits on
 * the NUL terminator. Consumes no subject characters.
 */
bool rrex_match_dollar(rrex_executor_t *executor) {
    const bool at_end = (*executor->sdata == '\0');
    if (at_end) {
        executor->bdata++;
    }
    return at_end;
}
/*
 * Literal - the current bytecode byte must equal the current subject
 * character; on success both cursors advance by one.
 */
bool rrex_match_literal(rrex_executor_t *executor) {
    const char wanted = *executor->bdata;
    const char actual = *executor->sdata;
    if (wanted != actual) {
        return false;
    }
    executor->bdata++;
    executor->sdata++;
    return true;
}
// Group "( ... )" with optional RN_PIPE-separated alternatives.
// Executes the instructions inside the group one by one. When one fails, the
// bytecode cursor is moved forward to just after the next RN_PIPE (if any)
// and execution continues with that alternative. When an alternative has
// completed successfully (the cursor reached an RN_PIPE) the group is done.
// On return the bytecode cursor is positioned after RN_GROUP_END; on failure
// the subject cursor is restored to where the group started.
// NOTE(review): the subject cursor is not rewound between alternatives, only
// on final failure - confirm that is intended.
bool rrex_match_group(rrex_executor_t *executor) {
bool v = true;
// Step over the group-start opcode.
executor->bdata++;
char *sdata_before_fail = executor->sdata;
while (v && *executor->bdata != RN_GROUP_END) {
v = rrex_execute_one(executor);
if (!v) {
// Current alternative failed: look for the next '|' inside this group.
while (*executor->bdata != RN_GROUP_END) {
if (*executor->bdata == RN_PIPE) {
v = true;
executor->bdata++;
break;
}
executor->bdata++;
}
} else if (*executor->bdata == RN_PIPE) {
// Reached the end of an alternative that matched completely.
break;
}
}
// Skip whatever is left of the group, then the group-end opcode itself.
while (*executor->bdata != RN_GROUP_END) {
executor->bdata++;
}
executor->bdata++;
if (!v) {
executor->sdata = sdata_before_fail;
}
return v;
}
/*
 * Character class "[ ... ]", optionally negated with a leading RN_ROOF.
 *
 * Tries the members one after another until one decides the outcome, then
 * moves the bytecode cursor past RN_CHOICE_END.
 *
 * The result starts out as false so that an empty class fails
 * deterministically; it used to return an uninitialized value in that case.
 */
bool rrex_match_choice(rrex_executor_t *executor) {
    bool v = false;
    executor->bdata++;
    bool reverse = *executor->bdata == RN_ROOF;
    if (reverse)
        executor->bdata++;
    while (*executor->bdata != RN_CHOICE_END) {
        v = rrex_execute_one(executor);
        if (reverse) {
            // Negated class: a member that did NOT match counts as success
            // and consumes the subject character itself.
            v = !v;
            if (v)
                executor->sdata++;
        }
        if (v) {
            break;
        } else {
            if (!reverse)
                executor->bdata++;
        }
    }
    // Skip the remaining members and the closing opcode.
    while (*executor->bdata != RN_CHOICE_END) {
        executor->bdata++;
    }
    executor->bdata++;
    return v;
}
/*
 * Optional operand: runs the instruction that follows the RN_QUESTION
 * opcode once. When it does not match, the bytecode cursor is moved past the
 * operand instead (past the matching closer for a class or group, otherwise
 * past a single byte). Always reports success.
 */
bool rrex_match_optional(rrex_executor_t *executor) {
    executor->bdata++;
    char *operand = executor->bdata;

    if (rrex_execute_one(executor)) {
        return true;
    }

    // Operand did not match: rewind to it and skip over it.
    executor->bdata = operand;
    char closer = 0;
    if (*operand == RN_CHOICE_START) {
        closer = RN_CHOICE_END;
    }
    if (*operand == RN_GROUP_START) {
        closer = RN_GROUP_END;
    }
    if (closer) {
        while (*executor->bdata != closer) {
            executor->bdata++;
        }
    }
    executor->bdata++;
    return true;
}
/*
 * Repetition handler registered for RN_ASTERISK.
 *
 * Alternates between re-running the previous instruction and trying the
 * instruction that follows the repetition opcode; stops as soon as the
 * following instruction matches or the repeated one fails.
 *
 * Returns true when either of them matched at least once. The flag starts
 * out as false; it used to be read uninitialized when nothing matched.
 */
bool rrex_match_at_least_one(rrex_executor_t *executor) {
    bool v = true;
    bool once_valid = false;
    executor->bdata++;
    char *method_position = executor->previous_position;
    char *next = executor->bdata;
    while (v) {
        executor->bdata = method_position;
        v = rrex_execute_one(executor);
        if (v)
            once_valid = true;
        executor->bdata = next;
        bool v_right = rrex_execute_one(executor);
        if (v_right) {
            once_valid = true;
            break;
        }
    }
    return once_valid;
}
/*
 * Range "x-y": the opcode is followed by two parameter bytes holding the
 * bounds (in either order). Matches one subject character lying between
 * them, inclusive.
 *
 * On success the bytecode cursor ends up after the second parameter and the
 * subject cursor advances; on failure the bytecode cursor is put back on the
 * range opcode.
 */
bool rrex_match_range(rrex_executor_t *executor) {
    char *opcode = executor->bdata;
    char low = opcode[1];
    char high = opcode[2];

    // Accept reversed bounds such as "9-4".
    if (low > high) {
        char swap = high;
        high = low;
        low = swap;
    }

    const char c = *executor->sdata;
    if (c < low || c > high) {
        executor->bdata = opcode;
        return false;
    }

    executor->bdata = opcode + 3;
    executor->sdata++;
    return true;
}
// '+' repetition: re-runs the previous instruction for as long as it keeps
// matching and the subject is not exhausted. When a repetition fails, the
// subject cursor is restored and the instruction after the '+' is tried once;
// if that matches, the loop ends with the bytecode cursor already past it.
// Returns true when the repeated instruction matched at least once inside
// this handler.
bool rrex_match_plus(rrex_executor_t *executor) {
char *plus_position = executor->bdata;
char *next = plus_position + 1;
// Instruction to repeat: the one executed just before the '+'.
char *to_repeat = executor->previous_position;
// Return value
bool valid = true;
bool matched_once = false;
char *sdata_before_fail;
while (valid) {
// Stop when the end of the subject string is reached
if (!*executor->sdata) {
break;
}
executor->bdata = to_repeat;
sdata_before_fail = executor->sdata;
valid = rrex_execute_one(executor);
if (valid) {
matched_once = true;
} else {
// Undo whatever the failed repetition consumed.
executor->sdata = sdata_before_fail;
}
// After a failed repetition, try the instruction following the '+'
// (only when there is one: *next is 0 at the end of the bytecode).
if (!valid && *(executor->bdata = next) && rrex_execute_one(executor)) {
// if(!valid)
break;
}
}
if (matched_once && executor->bdata == plus_position) {
// Move pointer to after RN_PLUS sign.
executor->bdata++;
}
return matched_once;
}
/*
 * Executes the single instruction at the bytecode cursor via the dispatch
 * table and records it as the "previous" instruction afterwards.
 *
 * Opcode bytes are read as unsigned char so that bytes >= 0x80 can never
 * become a negative table index; every value that is not a control opcode
 * (above 31, or outside the table) is treated as a literal. A slot without a
 * handler counts as "no match" instead of being called.
 *
 * Returns true when the instruction matched.
 */
bool rrex_execute_one(rrex_executor_t *executor) {
    const size_t handler_count =
        sizeof executor->functions / sizeof executor->functions[0];
    char *previous_position = executor->bdata;
    const char opcode = *executor->bdata;

    executor->current = (unsigned char)opcode;
    if (executor->current > 31 ||
        (size_t)executor->current >= handler_count) {
        executor->current = RN_LITERAL;
    }

    rrex_function handler = executor->functions[executor->current];
    bool valid = handler ? handler(executor) : false;

    // Recorded after the call so that handlers (e.g. '+') still see the
    // instruction that ran before this one while they execute.
    executor->previous = opcode;
    executor->previous_position = previous_position;
    return valid;
}
bool rrex_match(char *sdata, char *bdata) {
rrex_executor_t executor;
executor.bdata = bdata;
executor._bdata = bdata;
executor.sdata = sdata;
executor._sdata = sdata;
executor.previous_position = executor.bdata;
executor.functions[RN_ARANGE] = rrex_match_range;
executor.functions[RN_CHOICE_START] = rrex_match_choice;
executor.functions[RN_DOLLAR] = rrex_match_dollar;
executor.functions[RN_DOT] = rrex_match_dot;
executor.functions[RN_DRANGE] = rrex_match_range;
executor.functions[RN_LITERAL] = rrex_match_literal;
executor.functions[RN_SLASH_CD] = rrex_match_not_digit;
executor.functions[RN_SLASH_CW] = rrex_match_not_word;
executor.functions[RN_PLUS] = rrex_match_plus;
executor.functions[RN_ASTERISK] = rrex_match_at_least_one;
executor.functions[RN_WHITESPACE] = rrex_match_whitespace;
executor.functions[RN_GROUP_START] = rrex_match_group;
executor.functions[RN_QUESTION] = rrex_match_optional;
executor.functions[RN_ROOF] = rrex_match_sol;
executor.functions[RN_DIGIT] = rrex_match_digit;
executor.functions[RN_ALPHA] = rrex_match_word;
rrex_executor_t *ex = &executor;
char *s_padding = ex->sdata;
bool valid = true;
while (valid && *ex->bdata) {
valid = rrex_execute_one(&executor);
if (!valid && *ex->sdata) {
if (*ex->_bdata == RN_ROOF) {
break;
}
s_padding++;
ex->sdata = s_padding;
ex->bdata = ex->_bdata;
if (*ex->bdata && *ex->sdata)
valid = true;
}
}
return valid;
}
/*
 * Regression tests for the rrex executor: each rassert() states whether a
 * subject string must (or must not) be accepted by an expression.
 */
void rrex_executor_tests() {
    rtest_banner("rrex regular expressions");
    // rassert(rrex("aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaq", "[^qxyzv]+q$"));
    rassert(rrex("abababa", "^(ab)+a$"));
    rassert(rrex(" a ", "\\sa\\s"));
    rassert(!rrex("a", "\\s"));
    rassert(rrex("abc", "ab[def]?c"));
    rassert(rrex("abc", "ab(d|e|f)?c"));
    rassert(rrex("1990-01-13",
                 "^(19|20)\\d\\d-(0[1-9]|1[0-2])-(0[1-9]|[12]\\d|3[01])$"));
    rassert(rrex("1990-01-13", "(19|20)\\d\\d-[0?1]\\d-[0123]\\d"));
    // rassert(rrex("1990-1-3", "(19|20)\\d\\d-[0?1]\\d-[0123]\\d"));
    // rassert(rrex("1990-1-3", "(19|20)\\d\\d-[01]?\\d-[0123]\\d"));
    rassert(
        rrex("1990-13-25", "(19|20)\\d\\d-([01]\\d?||\\d)-([0123]\\d|\\d)$"));
    // This statement used to lack its terminating semicolon.
    rassert(
        !rrex("1990-13-45", "(19|20)\\d\\d-([01]\\d?||\\d)-([0123]\\d|\\d)$"));
    //(19|20)\d\d-[01]?\d-[0123]\d
    rassert(rrex("a", "[zsa]"));
    rassert(rrex("abcdefg", "abcd?efg"));
    rassert(rrex("abcefg", "abcd?efg"));
    rassert(rrex("ce", "(a|b|c|d)e"));
    rassert(rrex("A", "A-Z"));
    rassert(rrex("a", "a-Z"));
    rassert(rrex("abcab", "[abc][acb]{4}$"));
    rassert(rrex("aa", "\\w{2}$"));
    rassert(rrex("a", "[ca]"));
    rassert(rrex("1-4", "1\\-4"));
    rassert(rrex("a", "[ba]"));
    rassert(rrex("5", "4-9"));
    rassert(rrex("4", "4-9"));
    rassert(rrex("9", "4-9"));
    rassert(rrex("123A", "1-41-41-4A"));
    rassert(!rrex("123B", "1-41-41-4A"));
    rassert(!rrex("1", "4-9"));
    rassert(rrex("abca", "[abc][abc][abc]a$"));
    rassert(rrex("abca", "[a-z][abc][abc]a"));
    rassert(rrex("abca", "[\\w][abc][abc]a"));
    rassert(rrex("a5a5g!a", "a0-9a-z\\d\\D\\Wa"));
    rassert(!rrex("1", "\\D"));
    rassert(!rrex("a", "\\W"));
    rassert(!rrex("1", "\\w"));
    rassert(!rrex("a", "\\d"));
    rassert(!rrex("\n", "."));
    rassert(rrex("a", "a$"));
    rassert(rrex("a1ba1ba1b", "[a-z\\db]{3}"));
    rassert(rrex("abbc", "a{1}[a-z]{2}c{1}"));
    rassert(rrex("aA", "[a-zA-Z]{2}"));
    rassert(!rrex("123", "\\d+a"));
    rassert(rrex("123a", "[123]+a"));
    printf("JSSS\n");
    rassert(rrex("123", "[123]+"));
    rassert(!rrex("123b", "[123]+a"));
    // rassert(!rrex("123", "[123]+b")); NOT READY YET
    rassert(rrex("abababc", "^(ab)+c$"));
    rassert(!rrex("abababb", "^(ab)+a$"));
    rassert(!rrex("abababa", "^(ab)+b$"));
    rassert(!rrex("abdabdabda", "^(abc)+a$"));
    rassert(!rrex("abababa", "^(abc)+a$"));
    rassert(rrex("123a33", "\\d+a\\d+"));
    rassert(!rrex("123ab", "\\d+$"));
    rassert(rrex("567", "[^1234]"));
    rassert(rrex("400", "[^5]"));
    rassert(!rrex("132213gh", ".*gd"));
    rassert(!rrex("132213gd", ".*gh"));
    rassert(rrex("#include \"test.h\"x", "#include *\"[a-z\\.]*\"x"));
    // rassert(rrex("#include \"test.h\"x", "#include.*\".*\"x"));
    rassert(!rrex("#include \"test.h\"y", ".*#include.*\".*\"x"));
    rassert(rrex("123test", "^123"));
    rassert(rrex("test123", "123"));
    rassert(!rrex("test123", "^123"));
    rassert(rrex("test123", "123$"));
    rassert(rrex("test123test", "123"));
    rassert(!rrex("test123test", "123$"));
}

BIN
rrex3 Executable file

Binary file not shown.

79
rrex3.c Normal file
View File

@ -0,0 +1,79 @@
#define RREX3_DEBUG 1
#include "rrex3.h"
#include "rlib.h"
#include <regex.h>
/*
 * Benchmarks one subject/expression pair: first with the C library's POSIX
 * regex (compiled once, executed `times` times), then with rrex3.
 *
 * Exits with a failure status when the expression does not compile with the
 * C library or when rrex3 fails to match, so that a broken benchmark is
 * visible to the caller (the rrex3 failure used to exit with status 0).
 */
void benchmark(int times, char *str, char *expr) {
    printf("Matching \"%s\" with \"%s\".\n", str, expr);
    regex_t regex;
    if (regcomp(&regex, expr, REG_EXTENDED)) {
        printf("Creg: error in regular expression.\n");
        exit(1);
    }
    printf("creg: ");
    RBENCH(times, {
        // No submatches are requested (nmatch == 0), so no match array is
        // needed.
        if (regexec(&regex, str, 0, NULL, 0)) {
            printf("Creg: error executing regular expression.\n");
        }
    });
    regfree(&regex);

    rrex3_t *rrex = rrex3_compile(NULL, expr);
    printf("rrex3 (%s): ", rrex->compiled);
    RBENCH(times, {
        if (!rrex3(rrex, str, NULL)) {
            printf("Rrex3: error\n");
            exit(1);
        }
    });
    rrex3_free(rrex);
    printf("\n");
}
/*
 * Runs the rrex3 self test, then benchmarks a fixed list of
 * subject/expression pairs (one iteration each) in table order.
 */
int main() {
    static const struct {
        char *subject;
        char *expression;
    } cases[] = {
        {"\"stdio.h\"\"string.h\"\"sys/time.h\"", "\".*\"\".*\"\".*\""},
        {"abcdefghijklmnopqrstuvwxyz", "abcdefghijklmnopqrstuvwxyz$"},
        {"aaaaaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaaaaa$"},
        {"abcdefghijklmnopqrstuvwxyz", "..........................$"},
        // [abcm] failed
        {"abcdefghijklmnopqrstuvwxyz", ".*z"},
        {"abcde", ".*e"},
        {"abcdef", ".*f"},
        {"abcdefghijklmnopqrstuvwxyz",
         "[a]b*c+d\\w[f-g][g][h-i][i][^a][abcdefgk][l][m][n][o][p][a-z][r]"
         "[s][t][u][v][w].*z$"},
        {"zzz", "[abcdefghijklmnopqrstuvwxyz][abcdefghijklmnopqrstuvwxyz]["
                "abcdefghijklmnopqrstuvwxyz]$"},
        {"7245 Sr", "[0-9][0-9][0-9][0-9] ?\\w\\w$"},
        {"abcdefghijklmnopqrstuvwxyabcdefghijklmnopqrstuvwxyabcdefghijklmn"
         "opqrstuvwxyzesting",
         "[z-z][e-e]"},
        {"abcdefghijklmnopqrstuvwxyabcdefghijklmnopqrstuvwxyabcdefghijklmn"
         "opqrstuvwxyzesting",
         "zesting"},
        {"\"stdio.h\"\"string.h\"\"sys/time.h\"", "\"(.*)\"\"(.*)\"\"(.*)\""},
        {" \"stdio.h\"\"string.h\"\"sys/time.h\"", "\".+\"\".+\"\".+\""},
        {" \"stdio.h\"\"string.h\"\"sys/time.h\"", "\"(.+)\"\"(.+)\"\"(.+)\""},
    };

    rrex3_test();
    int times = 1;
    for (size_t c = 0; c < sizeof cases / sizeof cases[0]; c++) {
        benchmark(times, cases[c].subject, cases[c].expression);
    }
}

1277
rrex3.h Normal file

File diff suppressed because it is too large Load Diff

4931
rrex3all.c Normal file

File diff suppressed because it is too large Load Diff

371
rrex4.c Normal file
View File

@ -0,0 +1,371 @@
#define R4_DEBUG_a
#include "rrex4.h"
#include "rlib.h"
#include <regex.h>
/*
 * Times `times` complete r4 runs (match plus release of the result) of
 * `expr` against `str`. Terminates the process with status 1 as soon as a
 * run does not match. Always returns true otherwise.
 */
bool bench_r4(unsigned int times, char *str, char *expr) {
    RBENCH(times, {
        r4_t *r4_result = r4(str, expr);
        if (!r4_result->valid) {
            printf("Bench r4 error\n");
            exit(1);
        }
        r4_free(r4_result);
    });
    return true;
}
/*
 * Times `times` executions of the C library's POSIX extended regex `expr`
 * against `str`. The expression is compiled once up front. Terminates the
 * process with status 1 when compilation fails or a run does not match.
 */
void bench_c(unsigned int times, char *str, char *expr) {
    regex_t compiled;
    const int compile_status = regcomp(&compiled, expr, REG_EXTENDED);
    if (compile_status != 0) {
        printf("Creg: error in regular expression.\n");
        exit(1);
    }
    RBENCH(times, {
        const int exec_status = regexec(&compiled, str, 0, NULL, 0);
        if (exec_status != 0) {
            printf("Creg: error executing regular expression.\n");
            exit(1);
        }
    });
    regfree(&compiled);
}
/*
 * Prints a header for the pair and benchmarks it with the C library ("c:")
 * and with r4 ("r:"). Always returns true; failures terminate the process
 * inside the individual benchmarks.
 */
bool bench(unsigned int times, char *str, char *expr) {
    // `times` is unsigned, so it is printed with %u rather than %d.
    printf("%u:(%s)<%s>\n", times, str, expr);
    printf("c:");
    bench_c(times, str, expr);
    printf("r:");
    bench_r4(times, str, expr);
    return true;
}
/*
 * Checks r4_next(): repeated calls walk through consecutive four-letter
 * captures of the subject, and calls past the last capture keep returning a
 * valid result with zero matches instead of crashing.
 */
void test_r4_next() {
    char *str = "abcdefghijklmnop";
    char *reg = "(\\w\\w\\w\\w)";
    // The result of r4() is used directly. An object from r4_new() used to
    // be created first and was leaked when `r` was overwritten.
    r4_t *r = r4(str, reg);
    assert(r->valid);
    assert(r->match_count == 1);
    assert(!strcmp(r->matches[0], "abcd"));
    // Again with same regex as parameter
    r = r4_next(r, reg);
    assert(r->valid);
    assert(r->match_count == 1);
    assert(!strcmp(r->matches[0], "efgh"));
    // Again with same regex as parameter
    r = r4_next(r, reg);
    assert(r->valid);
    assert(r->match_count == 1);
    assert(!strcmp(r->matches[0], "ijkl"));
    // Reuse expression, NULL parameter
    r = r4_next(r, NULL);
    assert(r->valid);
    assert(r->match_count == 1);
    assert(!strcmp(r->matches[0], "mnop"));
    // No results using r4_next
    r = r4_next(r, NULL);
    assert(r->valid);
    assert(r->match_count == 0);
    // Again no results using r4_next, Shouldn't crash
    r = r4_next(r, NULL);
    assert(r->valid);
    assert(r->match_count == 0);
    r4_free(r);
}
/*
 * Runs every benchmark pair `times` times, in table order.
 *
 * bench() is called outside of assert(): an assert() argument is not
 * evaluated at all when NDEBUG is defined, which would silently skip the
 * benchmarks.
 */
void bench_all(unsigned int times) {
    static const struct {
        char *subject;
        char *expression;
    } cases[] = {
        {"suvw", "[abcdefghijklmnopqrstuvw][abcdefghijklmnopqrstuvw]["
                 "abcdefghijklmnopqrstuvw][abcdefghijklmnopqrstuvw]"},
        {"ponyyy", "^p+o.*yyy$$$$"},
        {" ponyyzd", "p+o.*yyzd$$$$"},
        {"abc", "def|gek|abc"},
        {"abc", "def|a?b?c|def"},
        {"NL18RABO0322309700", "([A-Z]{2})([0-9]{2})([A-Z]{4}[0-9])([0-9]+)$"},
        {"a 1 b 2 c 3 d 4 ", "([A-Z0-9 ]+)"},
    };

    for (size_t c = 0; c < sizeof cases / sizeof cases[0]; c++) {
        bool ok = bench(times, cases[c].subject, cases[c].expression);
        assert(ok);
        (void)ok; // keeps NDEBUG builds free of unused-variable warnings
    }
}
/*
 * Matches `expr` against `str`, prints the validation count, the overall
 * match (when valid) and every captured group, releases the result and
 * returns whether the expression matched.
 */
bool r4_match_stats(char *str, char *expr) {
    r4_t *r = r4(str, expr);
    bool result = r->valid;
    // validation_count and the group index are unsigned: print with %u.
    printf("%u:(%s)<%s>\n", r->validation_count, r->_str, r->_expr);
    if (result) {
        printf(" - match(0)\t: \"%s\"\n", r->match);
    }
    for (unsigned i = 0; i < r->match_count; i++) {
        printf(" - match(%u)\t: \"%s\"\n", i + 1, r->matches[i]);
    }
    r4_free(r);
    return result;
}
/*
 * Regression test for a former capture bug in r4: a repeated group must
 * produce exactly one capture when it matches and none when it does not.
 */
void test_r4_bug_check_capture_overflow() {
    // This is a former bug in r4.
    // Case one
    r4_t *plain = r4("test", "(test)+");
    assert(plain->match_count == 1);
    r4_free(plain);

    // Case two
    r4_t *escaped = r4("tester", "(t\\est\\e\\r)+");
    assert(escaped->match_count == 1);
    printf("%s\n", escaped->matches[0]);
    r4_free(escaped);

    // Case three
    r4_t *too_short = r4("test", "(t\\est\\e\\r)+");
    assert(too_short->match_count == 0);
    r4_free(too_short);
}
/*
 * Checks that "(test)+test$" on four repetitions of "test" yields three
 * captures, each equal to "test".
 */
void test_r4_capture_main_group() {
    // Case 1
    r4_t *r = r4("testtesttesttest", "(test)+test$");
    // printf("%s\n",r->match);
    // assert(!strcmp(r->match,"testtesttesttest"));
    assert(r->match_count == 3);
    for (unsigned capture = 0; capture < 3; capture++) {
        assert(!strcmp(r->matches[capture], "test"));
    }
    r4_free(r);
    // Case 2 (with search)
    /*
    r = r4(" testtesttesttest","(test)+test$");
    printf("%s\n",r->match);
    assert(!strcmp(r->match,"testtesttesttest"));
    assert(r->match_count == 3);
    assert(!strcmp(r->matches[0], "test"));
    assert(!strcmp(r->matches[1], "test"));
    assert(!strcmp(r->matches[2], "test"));
    r4_free(r); */
}
/*
 * Checks that the number of captures grows with the number of repetitions:
 * "(test)+test$" on four repetitions of "test" yields three captures.
 * Returns true (the result is not used by the caller in this file).
 *
 * TODO: a nested-repetition case is known to fail and is therefore not
 * asserted yet: "([tes]+)+test$" on "testtesttesttest" should yield exactly
 * one capture, "testtesttest". It used to sit here as unreachable code after
 * the return statement.
 */
char test_r4_capture_dynamic_amount() {
    r4_t *r = r4("testtesttesttest", "(test)+test$");
    assert(r->match_count == 3);
    assert(!strcmp(r->matches[0], "test"));
    assert(!strcmp(r->matches[1], "test"));
    assert(!strcmp(r->matches[2], "test"));
    r4_free(r);
    return true;
}
// Test driver for r4.
// Passing "--debug" anywhere on the command line enables r4 debug output.
// Runs the matcher checks below in order, then the r4_next / capture
// regression tests, one benchmark round, and a final batch of checks wrapped
// in RBENCH.
// NOTE(review): nearly all checks call r4_match_stats() inside assert(); in a
// build with NDEBUG defined those calls are not evaluated at all.
int main(int argc, char *argv[]) {
for (int i = 0; i < argc; i++) {
if (!strcmp(argv[i], "--debug")) {
r4_enable_debug();
}
}
// Has to be fixed
// (result intentionally not asserted)
r4_match_stats("r4@r4.net",
"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]*$");
// r4_match_stats("r4@r4.net", "^[^@\\s]+@[^@\\s]+\\.[^@\\s]+$");
// exit(0);
test_r4_capture_main_group();
assert(r4_match_stats("testtesttesttest", "(test)+test$"));
assert(r4_match_stats("testtest", "test"));
// Group testing
assert(r4_match_stats("aaadddd", "(a+)(d+)$"));
assert(r4_match_stats("aaa", "(a+)$"));
assert(r4_match_stats("aaadddd", "(d+)$"));
assert(r4_match_stats("aaadddd", "(d+)"));
assert(r4_match_stats("aaa\"dddd\"", "\"(d+)\""));
assert(r4_match_stats("aaadddd", "(a*)(d+)$"));
assert(r4_match_stats("aaa", "(a*)$"));
assert(r4_match_stats("aaadddd", "(d*)$"));
assert(r4_match_stats("aaadddd", "(d*)"));
assert(r4_match_stats("aaa\"dddd\" ", "\"(d*)\"\\s*"));
// Words
assert(r4_match_stats("a", "\\w"));
assert(!r4_match_stats("1", "\\w"));
assert(r4_match_stats("1", "\\W"));
assert(!r4_match_stats("a", "\\W"));
assert(r4_match_stats("aa", "\\w{2}"));
assert(r4_match_stats("11", "\\W{2}"));
assert(r4_match_stats("1", "[\\W]"));
// Digits
assert(r4_match_stats("1", "\\d"));
assert(!r4_match_stats("a", "\\d"));
assert(r4_match_stats("a", "\\D"));
assert(!r4_match_stats("1", "\\D"));
assert(r4_match_stats("11", "\\d{2}$"));
assert(r4_match_stats("aa", "\\D{2}$"));
assert(r4_match_stats("a", "[\\D]"));
// Whitespace
assert(r4_match_stats(" ", "\\s"));
assert(r4_match_stats(" a", "\\s"));
assert(!r4_match_stats("a", "[\\s]"));
assert(r4_match_stats("a ", "[\\s]"));
assert(r4_match_stats("a", "\\S"));
assert(!r4_match_stats(" ", "\\S"));
assert(!r4_match_stats(" ", "[\\S]"));
assert(r4_match_stats("b ", "[\\S]"));
assert(r4_match_stats(" b", "[\\S]"));
// Boundaries
assert(r4_match_stats("a", "\\b"));
assert(r4_match_stats("a", "\\ba$"));
assert(r4_match_stats("a", "^\\ba$"));
assert(r4_match_stats("aa", "\\b"));
assert(!r4_match_stats("aa", "\\b$"));
assert(r4_match_stats("aa", "[\\b]"));
assert(r4_match_stats("a", "\\B"));
assert(r4_match_stats("a", "\\Ba$"));
assert(r4_match_stats("a", "^\\Ba$"));
assert(r4_match_stats("aa", "\\B"));
assert(!r4_match_stats("aa", "^\\B"));
assert(!r4_match_stats("a1", "a[\\B]$"));
// Optional
assert(!r4_match_stats("a", "?"));
assert(r4_match_stats("a", "a?"));
assert(r4_match_stats("a", "b?"));
assert(r4_match_stats("a", "^b?"));
assert(r4_match_stats("a", "a?$"));
assert(!r4_match_stats("a", "b?$"));
assert(r4_match_stats("a", "[def]?a$"));
// Range
assert(r4_match_stats("a", "a{1}"));
assert(r4_match_stats("ab", "a{1}"));
assert(r4_match_stats("aa", "a{2}"));
assert(!r4_match_stats("aab", "a{3}"));
assert(!r4_match_stats("a1", "a{2}"));
assert(r4_match_stats("ab", "a{1,2}"));
assert(r4_match_stats("aa", "a{2,}"));
// Group (Custom function set)
// (result intentionally not asserted)
r4_match_stats("*?+$^.|\\[{()}]@ ", "[*?+$^.|\\\\[{()}]]+$@\\s");
// Miscellaneous tests
// Toggle the global debug flag both ways, then restore its original value.
bool debug_mode_original = _r4_debug;
_r4_debug = false;
r4_enable_debug();
assert(_r4_debug);
r4_disable_debug();
assert(!_r4_debug);
_r4_debug = debug_mode_original;
assert(r4_match("a", "a"));
assert(!r4_match("b", "a"));
// The API is called with NULL here and is expected to tolerate it.
r4_init(NULL);
r4_free(NULL);
r4_free_matches(NULL);
// Next tests
test_r4_next();
// Check if former known bugs are still fixed
test_r4_bug_check_capture_overflow();
// Check if capture amount is dynamic
test_r4_capture_dynamic_amount();
char *c_function_regex =
"(\\w[\\w\\d]*[\\s\\*]*)\\s*\\w[\\w\\d]*\\s*\\((.*)\\)\\s*\\{";
r4_match_stats("int **main() {}", c_function_regex);
r4_match_stats("int main(int argc, char *argv[],(void *)aaa) {}",
c_function_regex);
assert(r4_match_stats("NL18RABO0322309700",
"(\\w{2})(\\d{2})(\\w{4}\\d)(\\d{10})"));
// exit(0);
unsigned int times = 1;
bench_all(times);
// Final batch of checks, executed once inside RBENCH.
RBENCH(1, {
assert(r4_match_stats("#define DEFINETEST 1",
"#define\\s(+[\\w\\d_]+)\\s+[\\w\\d_]+"));
// assert(r4_match_stats("#define DEFINETEST 1\n",
// s "#define\\s+\\w[\\d\\w_]+\\s+[\\w\\d_]\\s*"));
assert(!r4_match_stats("aa", "aaaa"));
assert(r4_match_stats("ponyyy", "^p+o.*yyy$$$$"));
assert(!r4_match_stats("ponyyy", "p%+o.*yyy$$$$"));
assert(!r4_match_stats("ponyyyd", "^p+o.*yyz$$$$"));
assert(r4_match_stats("123", "[0-2][2-2][1-3]$"));
assert(r4_match_stats("aaaabC5", "(a)(\\w)a*(a)\\w[A-Z][0-9]$"));
assert(r4_match_stats("abcdeeeeee", "ab(cdeee)e"));
assert(r4_match_stats("1234567", "12(.*)67$"));
assert(r4_match_stats("12111678993", "12(.*)67(.*)3$"));
assert(r4_match_stats("NL18RABO0322309700", "NL(.*)R(.*)0(.*)0(.*)$"));
assert(r4_match_stats("NL18RABO0322309700",
"(\\w{2})(\\d{2})(\\w{4}\\d)(\\d+)$"));
assert(r4_match_stats("NL18RABO0322309700garbage",
"(\\w{2})(\\d{2})(\\w{4}\\d)(\\d+)"));
assert(r4_match_stats("NL18RABO0322309700",
"(\\w{2})(\\d{2})(\\w{4}\\d)(\\d+)$"));
assert(r4_match_stats(" NL18RABO0322309700",
"(\\w{2})(\\d{2})(\\w{4}\\d)(\\d+)$"));
assert(r4_match_stats(" NL18RABO0322309700",
"(\\w{2})(\\d{2})(\\w{4}\\d)(\\d+)$"));
assert(
r4_match_stats("NL18RABO0", "(\\w\\w)(\\d\\d)(\\w\\w\\w\\w\\d)$"));
assert(r4_match_stats("q", "\\q$"));
assert(r4_match_stats("ab123", "[a-z0-9]+$"));
assert(r4_match_stats("ppppony", "p*pppony"));
assert(r4_match_stats("aa", "a{2}$"));
assert(r4_match_stats("A23", "[0-2A-z][2-2][1-3]$"));
assert(r4_match_stats("z23", "[0-2A-z][2-2][1-3]$"));
assert(r4_match_stats("r23", "[0-2Ar][2-2][1-3]$"));
assert(r4_match_stats("test", "\\w\\w\\w\\w$"));
assert(!r4_match_stats("test", "\\W\\w\\w\\w$"));
assert(r4_match_stats("1est", "\\W\\w\\w\\w$"));
assert(r4_match_stats("1est", "\\d\\w\\w\\w$"));
assert(r4_match_stats("Aest", "\\D\\w\\w\\w$"));
assert(r4_match_stats("abc", "[ab]+"));
assert(!r4_match_stats("abc", "[ab]+$"));
assert(r4_match_stats("abc", "[abc]+$"));
assert(!r4_match_stats("a", "[^ba]"));
assert(!r4_match_stats("a", "[^ab]"));
assert(r4_match_stats(" ponyyzd", "p+o.*yyzd$$$$"));
assert(r4_match_stats("abc", "def|gek|abc"));
assert(!r4_match_stats("abc", "def|gek|abd"));
assert(r4_match_stats("abc", "def|abc|def"));
assert(r4_match_stats(
"suwv", "[abcdesfghijklmnopqrtuvw][abcdefghijklmnopqrstuvw]["
"abcdefghijklmnopqrstuvw][abcdefghijklmnopqrstuvw]"));
assert(r4_match_stats("123", "(.*)(.*)(.*)"));
assert(r4_match_stats("1234", "(.*)(.*)(.*)"));
assert(r4_match_stats("#include \"test.c\"", "#include\\s+\"(.*)\""));
assert(r4_match_stats("#define TEST_JE VALUE",
"#define\\s+([A-Za-z_0-9]+)\\s+([A-Za-z_0-9]+)"));
//
assert(r4_match_stats("bbb", "a*(bbb)"));
// Tests added for coverage
assert(!r4_match_stats("1", "[\\D]"));
assert(!r4_match_stats("11", "\\D{2}"));
assert(!r4_match_stats("ab", "ba"));
assert(r4_match_stats("2", "[4-2]"));
});
return 0;
}

761
rrex4.h Normal file
View File

@ -0,0 +1,761 @@
#ifndef RREX4_H
#define RREX4_H
#include <assert.h>
#include <ctype.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#define R4_DEBUG_a
#ifdef R4_DEBUG
static int _r4_debug = 1;
#else
static int _r4_debug = 0;
#endif
/*
 * Shortens a validator function name for debug output by dropping the
 * "r4_validate" prefix and one following underscore, e.g.
 * "r4_validate_literal" -> "literal".
 *
 * Returns " -" when nothing is left after stripping. A name too short to
 * carry the prefix is returned unstripped (the 11 characters used to be
 * skipped without any length check). The result lives in a static buffer
 * that is overwritten by the next call; names longer than the buffer are
 * truncated instead of overflowing it.
 */
static char *_format_function_name(const char *name) {
    static char result[100];
    const size_t prefix_length = sizeof("r4_validate") - 1;
    const char *short_name = name;

    result[0] = 0;
    if (strlen(name) >= prefix_length) {
        short_name = name + prefix_length;
    }
    if (short_name[0] == '_') {
        short_name += 1;
    }
    if (short_name[0] == '\0') {
        return " -";
    }
    snprintf(result, sizeof result, "%s", short_name);
    return result;
}
// Debug trace placed at the top of every validator. When the global
// `_r4_debug` flag or the per-object `debug` flag is set, prints the
// shortened name of the enclosing function, the current `valid` state and
// the remaining expression and subject.
// Requires a variable named `r4` (pointer to r4_t) in scope. The expansion
// is a complete if-statement including its terminating semicolon.
#define DEBUG_VALIDATE_FUNCTION \
if (_r4_debug || r4->debug) \
printf("DEBUG: %s %s <%s> \"%s\"\n", _format_function_name(__func__), \
r4->valid ? "valid" : "INVALID", r4->expr, r4->str);
struct r4_t;
/*
 * Turns on debug tracing for all r4 objects in this translation unit.
 * Declared static with a (void) prototype, like the other functions defined
 * in this header, so that including the header in more than one translation
 * unit does not produce duplicate definitions.
 */
static void r4_enable_debug(void) { _r4_debug = true; }
/*
 * Turns off global debug tracing (per-object `debug` flags still apply).
 * Declared static with a (void) prototype, like the other functions defined
 * in this header, so that including the header in more than one translation
 * unit does not produce duplicate definitions.
 */
static void r4_disable_debug(void) { _r4_debug = false; }
// Signature shared by all validator functions.
typedef bool (*r4_function)(struct r4_t *);
// State of one r4 match run. Comments describe how the visible validators
// use each field; fields without a comment are not used in the part of the
// file reviewed here.
typedef struct r4_t {
// Per-object switch for the DEBUG_VALIDATE_FUNCTION trace.
bool debug;
// Running result: whether the expression still matches so far.
bool valid;
// Set while the members of a "[...]" block are being evaluated.
bool in_block;
// When false, single-character validators return right after their own
// check instead of continuing with the rest of the expression.
bool is_greedy;
// Set while a "{n}" repetition re-runs the preceding element.
bool in_range;
unsigned int backtracking;
unsigned int loop_count;
// Group nesting depth; incremented when a group is opened.
unsigned int in_group;
// Number of entries in `matches`.
unsigned int match_count;
unsigned int validation_count;
unsigned int start;
unsigned int end;
unsigned int length;
bool (*functions[254])(struct r4_t *);
bool (*slash_functions[254])(struct r4_t *);
// Start of the subject; '^' compares the cursor against it.
char *_str;
char *_expr;
// Released with free() by r4_free_matches().
char *match;
// Cursor into the subject string.
char *str;
// Cursor into the expression.
char *expr;
char *str_previous;
// Position of the element a following quantifier ('+', '*', "{n}") repeats.
char *expr_previous;
// Array of `match_count` strings; each entry and the array itself are
// released with free() by r4_free_matches().
char **matches;
} r4_t;
// NOTE(review): presumably records whether the dispatch tables below have
// been filled in - the code that uses it is outside the reviewed part.
static bool v4_initiated = false;
// Validator signature used by the dispatch tables.
typedef bool (*v4_function_map)(r4_t *);
// Dispatch tables with 256 slots each.
// NOTE(review): judging by the names they are indexed by expression
// character for the normal, backslash-escaped and "[...]" contexts - confirm
// against the initialisation code. Unlike most definitions in this header
// they are not static.
v4_function_map v4_function_map_global[256];
v4_function_map v4_function_map_slash[256];
v4_function_map v4_function_map_block[256];
/*
 * Releases the overall match string and every captured group of `r` and
 * resets the corresponding fields. Accepts NULL. When no groups were
 * captured, the `matches` array itself is left untouched.
 */
static void r4_free_matches(r4_t *r) {
    if (r == NULL) {
        return;
    }
    if (r->match != NULL) {
        free(r->match);
        r->match = NULL;
    }
    if (r->match_count == 0) {
        return;
    }
    for (unsigned idx = 0; idx != r->match_count; ++idx) {
        free(r->matches[idx]);
    }
    free(r->matches);
    r->matches = NULL;
    r->match_count = 0;
}
/*
 * Releases an r4 object together with everything it captured.
 * Accepts NULL.
 */
static void r4_free(r4_t *r) {
    if (r != NULL) {
        r4_free_matches(r);
        free(r);
    }
}
static bool r4_backtrack(r4_t *r4);
static bool r4_validate(r4_t *r4);
static void r4_match_add(r4_t *r4, char *extracted);
/*
 * Literal character: the current expression character must equal the
 * current subject character. The expression cursor always advances; the
 * subject cursor only on a match. Inside a block or range, or in non-greedy
 * mode, the result is returned right away; otherwise validation continues
 * with the rest of the expression.
 */
static bool r4_validate_literal(r4_t *r4) {
    DEBUG_VALIDATE_FUNCTION
    if (!r4->valid)
        return false;

    const bool same = (*r4->str == *r4->expr);
    r4->expr++;
    if (same) {
        r4->str++;
    } else {
        r4->valid = false;
    }

    const bool single_step = r4->in_block || r4->in_range || !r4->is_greedy;
    return single_step ? r4->valid : r4_validate(r4);
}
/*
 * '?': whatever the preceding element reported, the state is reset to valid
 * and validation continues after the question mark.
 */
static bool r4_validate_question_mark(r4_t *r4) {
    DEBUG_VALIDATE_FUNCTION
    r4->expr++;
    r4->valid = true;
    return r4_validate(r4);
}
// '+' quantifier: repeats the preceding element (r4->expr_previous).
// Before each further repetition the remainder of the expression is tried
// via r4_backtrack(); when that succeeds, validation continues from there.
// Otherwise the preceding element is run once more in non-greedy mode, until
// it stops matching.
static bool r4_validate_plus(r4_t *r4) {
DEBUG_VALIDATE_FUNCTION
r4->expr++;
// The preceding element did not match: nothing to repeat.
if (r4->valid == false) {
return r4_validate(r4);
}
char *expr_left = r4->expr_previous;
char *expr_right = r4->expr;
char *str = r4->str;
char *return_expr = NULL;
// When the '+' is directly followed by ')', remember the ')' position and
// look at what comes after it for the backtracking attempt.
if (*expr_right == ')') {
return_expr = expr_right;
expr_right++;
}
r4->is_greedy = false;
r4->expr = expr_left;
while (r4->valid) {
if (*expr_right) {
r4->expr = expr_right;
r4->is_greedy = true;
if (r4_backtrack(r4)) {
if (return_expr) {
r4->str = str;
r4->expr = return_expr;
}
return r4_validate(r4);
} else {
r4->is_greedy = false;
}
}
// Remainder did not match yet: consume one more repetition.
r4->valid = true;
r4->expr = expr_left;
r4->str = str;
r4_validate(r4);
str = r4->str;
}
// The repeated element stopped matching: continue after the quantifier.
r4->is_greedy = true;
r4->valid = true;
r4->expr = return_expr ? return_expr : expr_right;
return r4_validate(r4);
}
/*
 * '$': valid only when the subject cursor is at the end of the subject.
 * Validation then continues with the rest of the expression.
 */
static bool r4_validate_dollar(r4_t *r4) {
    DEBUG_VALIDATE_FUNCTION
    r4->valid = (*r4->str == '\0');
    r4->expr++;
    return r4_validate(r4);
}
/*
 * '^': fails unless the subject cursor is still at the start of the
 * subject; otherwise validation continues after the anchor.
 */
static bool r4_validate_roof(r4_t *r4) {
    DEBUG_VALIDATE_FUNCTION
    const bool at_start = (r4->str == r4->_str);
    if (!at_start) {
        return false;
    }
    r4->expr++;
    return r4_validate(r4);
}
/*
 * '.': fails at the end of the subject without touching any state.
 * Otherwise consumes one subject character, which is valid unless it is a
 * newline. Inside a block or range, or in non-greedy mode, the result is
 * returned right away; otherwise validation continues.
 */
static bool r4_validate_dot(r4_t *r4) {
    DEBUG_VALIDATE_FUNCTION
    const char current = *r4->str;
    if (current == 0) {
        return false;
    }
    r4->expr++;
    r4->valid = (current != '\n');
    r4->str++;

    const bool single_step = r4->in_block || r4->in_range || !r4->is_greedy;
    return single_step ? r4->valid : r4_validate(r4);
}
// '*' quantifier: like '+', except that a preceding element that did not
// match is acceptable - the state is reset to valid and true is returned.
// Otherwise the preceding element (r4->expr_previous) is repeated; before
// each further repetition the remainder of the expression is tried via
// r4_backtrack().
static bool r4_validate_asterisk(r4_t *r4) {
DEBUG_VALIDATE_FUNCTION
r4->expr++;
// Zero occurrences: the failure of the preceding element is forgiven.
if (r4->valid == false) {
r4->valid = true;
return r4->valid;
// return r4_validate(r4);
}
char *expr_left = r4->expr_previous;
char *expr_right = r4->expr;
char *str = r4->str;
char *return_expr = NULL;
// When the '*' is directly followed by ')', remember the ')' position and
// look at what comes after it for the backtracking attempt.
if (*expr_right == ')') {
return_expr = expr_right;
expr_right++;
}
r4->is_greedy = false;
r4->expr = expr_left;
while (r4->valid) {
if (*expr_right) {
r4->expr = expr_right;
r4->is_greedy = true;
if (r4_backtrack(r4)) {
if (return_expr) {
r4->str = str;
r4->expr = return_expr;
}
return r4_validate(r4);
} else {
r4->is_greedy = false;
}
}
// Remainder did not match yet: consume one more repetition.
r4->valid = true;
r4->expr = expr_left;
r4->str = str;
r4_validate(r4);
str = r4->str;
}
// The repeated element stopped matching: continue after the quantifier.
r4->is_greedy = true;
r4->valid = true;
r4->expr = return_expr ? return_expr : expr_right;
return r4_validate(r4);
}
/*
 * '|': when the alternative to the left matched, the alternation is
 * satisfied and true is returned. Otherwise the state is reset and the
 * alternative to the right is validated.
 */
static bool r4_validate_pipe(r4_t *r4) {
    DEBUG_VALIDATE_FUNCTION
    r4->expr++;
    if (r4->valid) {
        return true;
    }
    r4->valid = true;
    return r4_validate(r4);
}
/*
 * \d: the current subject character must be a decimal digit. The expression
 * cursor always advances; the subject cursor only on a match. Inside a block
 * or range, or in non-greedy mode, the result is returned right away;
 * otherwise validation continues.
 *
 * The character is converted to unsigned char first: passing a negative
 * plain char to isdigit() is undefined behavior.
 */
static bool r4_validate_digit(r4_t *r4) {
    DEBUG_VALIDATE_FUNCTION
    if (!isdigit((unsigned char)*r4->str)) {
        r4->valid = false;
    } else {
        r4->str++;
    }
    r4->expr++;
    if (r4->in_block || r4->in_range || !r4->is_greedy) {
        return r4->valid;
    }
    return r4_validate(r4);
}
/*
 * \D: the current subject character must not be a decimal digit. The
 * expression cursor always advances; the subject cursor only on a match.
 * Inside a block or range, or in non-greedy mode, the result is returned
 * right away; otherwise validation continues.
 *
 * The character is converted to unsigned char first: passing a negative
 * plain char to isdigit() is undefined behavior.
 */
static bool r4_validate_not_digit(r4_t *r4) {
    DEBUG_VALIDATE_FUNCTION
    if (isdigit((unsigned char)*r4->str)) {
        r4->valid = false;
    } else {
        r4->str++;
    }
    r4->expr++;
    if (r4->in_block || r4->in_range || !r4->is_greedy) {
        return r4->valid;
    }
    return r4_validate(r4);
}
/*
 * \w: the current subject character must be alphabetic (digits and
 * underscore are not accepted). The expression cursor always advances; the
 * subject cursor only on a match. Inside a block or range, or in non-greedy
 * mode, the result is returned right away; otherwise validation continues.
 *
 * The character is converted to unsigned char first: passing a negative
 * plain char to isalpha() is undefined behavior.
 */
static bool r4_validate_word(r4_t *r4) {
    DEBUG_VALIDATE_FUNCTION
    if (!isalpha((unsigned char)*r4->str)) {
        r4->valid = false;
    } else {
        r4->str++;
    }
    r4->expr++;
    if (r4->in_block || r4->in_range || !r4->is_greedy) {
        return r4->valid;
    }
    return r4_validate(r4);
}
/*
 * \W: the current subject character must not be alphabetic. The expression
 * cursor always advances; the subject cursor only on a match. Inside a block
 * or range, or in non-greedy mode, the result is returned right away;
 * otherwise validation continues.
 *
 * The character is converted to unsigned char first: passing a negative
 * plain char to isalpha() is undefined behavior.
 */
static bool r4_validate_not_word(r4_t *r4) {
    DEBUG_VALIDATE_FUNCTION
    if (isalpha((unsigned char)*r4->str)) {
        r4->valid = false;
    } else {
        r4->str++;
    }
    r4->expr++;
    if (r4->in_block || r4->in_range || !r4->is_greedy) {
        return r4->valid;
    }
    return r4_validate(r4);
}
/*
 * Tells whether `s` starts with a character range of the form "x-y", where
 * x and y are both alphanumeric (e.g. "a-z", "0-9").
 *
 * Characters are converted to unsigned char before the isalnum() calls:
 * passing a negative plain char is undefined behavior. Reads at most three
 * characters and never reads past the terminating NUL.
 */
static bool r4_isrange(char *s) {
    if (!isalnum((unsigned char)s[0])) {
        return false;
    }
    if (s[1] != '-') {
        return false;
    }
    return isalnum((unsigned char)s[2]) != 0;
}
// '[': character block, optionally negated with a leading '^'.
// Walks the members up to the closing ']'. A member is either an "x-y" range
// (checked here) or anything else, which is handed to r4_validate() with
// in_block set so that only that single member is evaluated. The block is
// valid when one member matched (or, for a negated block, when none did) and
// a closing ']' exists. Afterwards expr_previous is set to the '[' so that a
// following quantifier repeats the whole block.
static bool r4_validate_block_open(r4_t *r4) {
DEBUG_VALIDATE_FUNCTION
if (r4->valid == false) {
return false;
}
char *expr_self = r4->expr;
r4->expr++;
bool reversed = *r4->expr == '^';
if (reversed) {
r4->expr++;
}
bool valid_once = false;
r4->in_block = true;
while (*r4->expr != ']') {
r4->valid = true;
if (r4_isrange(r4->expr)) {
char s = *r4->expr;
char e = *(r4->expr + 2);
r4->expr += 2;
// Accept reversed bounds such as "4-2".
if (s > e) {
char tempc = s;
s = e;
e = tempc;
}
if (*r4->str >= s && *r4->str <= e) {
// A negated block must not consume the character it rejects.
if (!reversed) {
r4->str++;
}
valid_once = true;
break;
} else {
r4->expr++;
}
} else if (r4_validate(r4)) {
valid_once = true;
// Undo the character consumed by the member validator.
if (reversed)
r4->str--;
break;
}
}
// Jump to the closing ']' (if there is one) and step over it.
char *expr_end = strchr(r4->expr, ']');
r4->expr = expr_end ? expr_end : r4->expr;
r4->in_block = false;
r4->valid = expr_end && (!reversed ? valid_once : !valid_once);
r4->expr++;
r4->expr_previous = expr_self;
if (r4->in_range || !r4->is_greedy) {
return r4->valid;
}
return r4_validate(r4);
}
/*
 * \s: the current subject character must be a carriage return, tab, space
 * or newline. The expression cursor always advances; the subject cursor only
 * on a match. Inside a block or range, or in non-greedy mode, the result is
 * returned right away; otherwise validation continues.
 *
 * The end of the subject is rejected explicitly: strchr() also finds the
 * terminating NUL of the set string, which used to make "\s" match at the
 * end of the subject and move the cursor past it.
 */
static bool r4_validate_whitespace(r4_t *r4) {
    DEBUG_VALIDATE_FUNCTION
    const char current = *r4->str;
    r4->valid = current != '\0' && strchr("\r\t \n", current) != NULL;
    r4->expr++;
    if (r4->valid) {
        r4->str++;
    }
    if (r4->in_range || r4->in_block || !r4->is_greedy) {
        return r4->valid;
    }
    return r4_validate(r4);
}
/*
 * \S: the current subject character must not be a carriage return, tab,
 * space or newline; the end of the subject does not match either. The
 * expression cursor always advances; the subject cursor only on a match.
 * Inside a block or range, or in non-greedy mode, the result is returned
 * right away; otherwise validation continues.
 */
static bool r4_validate_not_whitespace(r4_t *r4) {
    DEBUG_VALIDATE_FUNCTION
    const char *hit = strchr("\r\t \n", *r4->str);
    r4->valid = (hit == NULL);
    r4->expr++;
    if (r4->valid)
        r4->str++;

    const bool single_step = r4->in_range || r4->in_block || !r4->is_greedy;
    return single_step ? r4->valid : r4_validate(r4);
}
/**
 * Handler for `{n}`, `{n,}` and `{n,m}` (registered for `{`).
 *
 * Re-runs the element stored in `expr_previous` until `n` occurrences have
 * been matched in total. The element in front of `{` has already been
 * validated once when this handler is reached, so `n - 1` further runs are
 * performed. While repeating, `in_range` is set so that the element handlers
 * return after a single step.
 *
 * Only the lower bound is enforced: the part after the comma is skipped over
 * and has no effect on matching.
 *
 * Returns false when the match is already invalid, when a repetition fails,
 * or when the subject ends before all required repetitions were matched;
 * otherwise continues with the expression that follows the closing `}`.
 */
static bool r4_validate_range(r4_t *r4) {
    DEBUG_VALIDATE_FUNCTION;
    if (r4->valid == false) {
        r4->expr++;
        return false;
    }
    char *previous = r4->expr_previous;
    r4->in_range = true;
    r4->expr++;
    // Parse the lower bound. The cast keeps isdigit() defined for any byte.
    unsigned int start = 0;
    while (isdigit((unsigned char)*r4->expr)) {
        start = 10 * start + (unsigned int)(*r4->expr - '0');
        r4->expr++;
    }
    // One occurrence was matched before `{` was reached.
    if (start != 0)
        start--;
    // Skip the optional `,upper` part; it is not acted upon.
    if (*r4->expr == ',') {
        r4->expr++;
    }
    while (isdigit((unsigned char)*r4->expr)) {
        r4->expr++;
    }
    // Step over the closing `}`, but never over the terminator.
    if (*r4->expr != '\0') {
        r4->expr++;
    }
    char *expr_right = r4->expr;
    bool valid = true;
    unsigned int repeated = 0;
    while (repeated < start) {
        r4->expr = previous;
        valid = r4_validate(r4);
        if (!valid) {
            break;
        }
        repeated++;
        // Nothing left to match against.
        if (!*r4->str)
            break;
    }
    r4->expr = expr_right;
    r4->in_range = false;
    // The subject ended before the required number of repetitions.
    if (valid && repeated < start) {
        r4->valid = false;
    }
    if (!r4->valid)
        return false;
    return r4_validate(r4);
}
/**
 * Handler for `)` (registered in v4_function_map_global).
 *
 * Does not touch the expression or subject pointers; it only hands the
 * current validity back to the caller, leaving `expr` on the `)`.
 */
static bool r4_validate_group_close(r4_t *r4) {
    DEBUG_VALIDATE_FUNCTION
    const bool current_state = r4->valid;
    return current_state;
}
/**
 * Handler for `(` (registered in v4_function_map_global).
 *
 * Validates the expression following `(`. When that succeeds and the
 * expression pointer ends up on the matching `)`, the subject text consumed
 * in between is copied and stored with r4_match_add(), but only for
 * outermost groups (`in_group` was 0 on entry). Matching then continues
 * after the `)`.
 *
 * If the capture buffer cannot be allocated the match is marked as failed
 * and false is returned.
 */
static bool r4_validate_group_open(r4_t *r4) {
    DEBUG_VALIDATE_FUNCTION
    char *expr_previous = r4->expr_previous;
    r4->expr++;
    bool save_match = r4->in_group == 0;
    r4->in_group++;
    char *str_extract_start = r4->str;
    bool valid = r4_validate(r4);
    if (!valid || *r4->expr != ')') {
        // This is a valid case if not everything between () matches.
        r4->in_group--;
        if (save_match == false) {
            r4->valid = true;
        }
        // NOTE(review): the original author was unsure whether this should
        // return directly instead of validating again.
        return r4_validate(r4);
    }
    if (save_match) {
        size_t extracted_length = (size_t)(r4->str - str_extract_start);
        // calloc() zero-fills, so the copy below is always terminated.
        char *str_extracted = (char *)calloc(extracted_length + 1, sizeof(char));
        if (str_extracted == NULL) {
            // Dropping the capture silently would shift the index of every
            // later group, so report the whole match as failed instead.
            r4->in_group--;
            r4->valid = false;
            return false;
        }
        strncpy(str_extracted, str_extract_start, extracted_length);
        r4_match_add(r4, str_extracted);
    }
    assert(*r4->expr == ')');
    r4->expr++;
    r4->in_group--;
    // A following quantifier must see the element in front of the group.
    r4->expr_previous = expr_previous;
    return r4_validate(r4);
}
/**
 * Handler for `\` (registered in the global and in the block table).
 *
 * Steps over the backslash and dispatches on the following expression
 * character through v4_function_map_slash. `expr_previous` is written back
 * unchanged before the dispatch.
 */
static bool r4_validate_slash(r4_t *r4) {
    DEBUG_VALIDATE_FUNCTION
    char *expr_previous = r4->expr_previous;
    r4->expr++;
    // Index with the unsigned byte value: a plain char may be negative for
    // bytes >= 0x80, which would read in front of the table.
    r4_function f = v4_function_map_slash[(unsigned char)*r4->expr];
    r4->expr_previous = expr_previous;
    return f(r4);
}
/**
 * Appends `extracted` to the `matches` array of `r4`, growing the array by
 * one slot.
 *
 * The pointer itself is stored, not a copy. If the array cannot be grown the
 * existing captures are left untouched and `extracted` is freed.
 * NOTE(review): this assumes every caller hands over ownership of a heap
 * buffer, as r4_validate_group_open does - confirm for other callers.
 */
static void r4_match_add(r4_t *r4, char *extracted) {
    // Keep the old pointer until realloc() has succeeded; assigning the
    // result directly would leak the array and store NULL on failure.
    char **grown =
        (char **)realloc(r4->matches, (r4->match_count + 1) * sizeof(char *));
    if (grown == NULL) {
        free(extracted);
        return;
    }
    r4->matches = grown;
    r4->matches[r4->match_count] = extracted;
    r4->match_count++;
}
/**
 * Handler for `\b` (registered as v4_function_map_slash['b']).
 *
 * Succeeds when the current subject character is alphabetic and is either
 * the first character of the subject or preceded by a non-alphabetic
 * character. No subject character is consumed; the expression pointer is
 * advanced by one.
 *
 * Returns immediately when the match is already invalid, and returns the
 * validity directly inside a range repetition, inside a block or in
 * non-greedy mode; otherwise continues with the rest of the expression.
 */
static bool r4_validate_word_boundary_start(r4_t *r4) {
    DEBUG_VALIDATE_FUNCTION
    r4->expr++;
    if (!r4->valid) {
        return r4->valid;
    }
    // Casts keep isalpha() defined for bytes >= 0x80 on signed-char targets.
    r4->valid = isalpha((unsigned char)*r4->str) &&
                (r4->str == r4->_str ||
                 !isalpha((unsigned char)*(r4->str - 1)));
    if (r4->in_range || r4->in_block || !r4->is_greedy) {
        return r4->valid;
    }
    return r4_validate(r4);
}
/**
 * Handler for `\B` (registered as v4_function_map_slash['B']).
 *
 * Succeeds when the current subject character is alphabetic and the
 * character after it is the terminator or non-alphabetic. No subject
 * character is consumed; the expression pointer is advanced by one.
 *
 * Returns immediately when the match is already invalid, and returns the
 * validity directly inside a range repetition, inside a block or in
 * non-greedy mode; otherwise continues with the rest of the expression.
 */
static bool r4_validate_word_boundary_end(r4_t *r4) {
    DEBUG_VALIDATE_FUNCTION
    r4->expr++;
    if (!r4->valid) {
        return r4->valid;
    }
    // Casts keep isalpha() defined for bytes >= 0x80 on signed-char targets.
    // The look-ahead is only evaluated when the current character is
    // alphabetic, i.e. not the terminator, so r4->str + 1 is readable.
    r4->valid = isalpha((unsigned char)*r4->str) &&
                (*(r4->str + 1) == 0 ||
                 !isalpha((unsigned char)*(r4->str + 1)));
    if (r4->in_range || r4->in_block || !r4->is_greedy) {
        return r4->valid;
    }
    return r4_validate(r4);
}
/**
 * Fills the three handler tables once per process.
 *
 * Every slot of the global, slash and block tables first receives
 * r4_validate_literal; afterwards the slots of the special characters are
 * overwritten with their dedicated handlers. Subsequent calls return
 * immediately because `v4_initiated` is set.
 */
static void v4_init_function_maps() {
    if (v4_initiated)
        return;
    v4_initiated = true;
    // Derive the slot counts from the tables themselves so that every entry,
    // including the last one, gets a default handler.
    const size_t global_slots =
        sizeof(v4_function_map_global) / sizeof(v4_function_map_global[0]);
    const size_t slash_slots =
        sizeof(v4_function_map_slash) / sizeof(v4_function_map_slash[0]);
    const size_t block_slots =
        sizeof(v4_function_map_block) / sizeof(v4_function_map_block[0]);
    for (size_t i = 0; i < global_slots; i++) {
        v4_function_map_global[i] = r4_validate_literal;
    }
    for (size_t i = 0; i < slash_slots; i++) {
        v4_function_map_slash[i] = r4_validate_literal;
    }
    for (size_t i = 0; i < block_slots; i++) {
        v4_function_map_block[i] = r4_validate_literal;
    }
    // Operators recognised outside of a character class.
    v4_function_map_global['*'] = r4_validate_asterisk;
    v4_function_map_global['?'] = r4_validate_question_mark;
    v4_function_map_global['+'] = r4_validate_plus;
    v4_function_map_global['$'] = r4_validate_dollar;
    v4_function_map_global['^'] = r4_validate_roof;
    v4_function_map_global['.'] = r4_validate_dot;
    v4_function_map_global['|'] = r4_validate_pipe;
    v4_function_map_global['\\'] = r4_validate_slash;
    v4_function_map_global['['] = r4_validate_block_open;
    v4_function_map_global['{'] = r4_validate_range;
    v4_function_map_global['('] = r4_validate_group_open;
    v4_function_map_global[')'] = r4_validate_group_close;
    // Escape sequences, looked up by the character after the backslash.
    v4_function_map_slash['b'] = r4_validate_word_boundary_start;
    v4_function_map_slash['B'] = r4_validate_word_boundary_end;
    v4_function_map_slash['d'] = r4_validate_digit;
    v4_function_map_slash['w'] = r4_validate_word;
    v4_function_map_slash['D'] = r4_validate_not_digit;
    v4_function_map_slash['W'] = r4_validate_not_word;
    v4_function_map_slash['s'] = r4_validate_whitespace;
    v4_function_map_slash['S'] = r4_validate_not_whitespace;
    // Inside a character class only escapes and `{` keep a special handler.
    v4_function_map_block['\\'] = r4_validate_slash;
    v4_function_map_block['{'] = r4_validate_range;
}
/**
 * Puts the result-related fields of `r4` into their initial state: valid,
 * no captures, zero counters and offsets, and the debug flag copied from
 * `_r4_debug`.
 *
 * The handler tables are set up first, so that happens even when `r4` is
 * NULL; in that case nothing else is done.
 */
void r4_init(r4_t *r4) {
    v4_init_function_maps();
    if (r4 != NULL) {
        r4->matches = NULL;
        r4->match_count = 0;
        r4->validation_count = 0;
        r4->start = 0;
        r4->end = 0;
        r4->length = 0;
        r4->valid = true;
        r4->debug = _r4_debug;
    }
}
// Reports whether `c` is one of the operators that act on the element in
// front of them: '?', '*', '+' or '{'.
// The terminator is rejected explicitly because strchr() considers the
// terminating NUL part of the searched string and would report it as found.
static bool r4_looks_behind(char c) {
    return c != '\0' && strchr("?*+{", c) != NULL;
}
r4_t *r4_new() {
r4_t *r4 = (r4_t *)malloc(sizeof(r4_t));
r4_init(r4);
return r4;
}
/**
 * Looks for the next `|` at or after the current expression position.
 *
 * When one is found, the expression pointer is placed right behind it, the
 * match is marked valid again and true is returned. Otherwise nothing is
 * changed and false is returned.
 */
static bool r4_pipe_next(r4_t *r4) {
    char *bar = strchr(r4->expr, '|');
    if (bar == NULL) {
        return false;
    }
    r4->expr = bar + 1;
    r4->valid = true;
    return true;
}
/**
 * Tries to validate the remainder of the expression from the current
 * position and rolls the subject and expression pointers back when that
 * attempt fails.
 *
 * `backtracking` is incremented for the duration of the attempt. With
 * `_r4_debug` set, a trace line is printed before and after the attempt.
 *
 * Returns the result of the attempt.
 */
static bool r4_backtrack(r4_t *r4) {
    if (_r4_debug)
        printf("\033[36mDEBUG: backtrack start (%d)\n", r4->backtracking);
    char *const saved_str = r4->str;
    char *const saved_expr = r4->expr;
    r4->backtracking++;
    const bool matched = r4_validate(r4);
    r4->backtracking--;
    if (!matched) {
        r4->str = saved_str;
        r4->expr = saved_expr;
    }
    if (_r4_debug) {
        // The colour is only reset once the outermost attempt has finished.
        const char *colour_reset = r4->backtracking == 0 ? "\033[0m" : "";
        printf("DEBUG: backtrack end (%d) result: %d %s\n", r4->backtracking,
               matched, colour_reset);
    }
    return matched;
}
/**
 * Central dispatcher: runs the handler that belongs to the current
 * expression character and stores its result in `r4->valid`.
 *
 * - At the end of the expression the current validity is returned.
 * - For every character that is not a look-behind operator (see
 *   r4_looks_behind) `expr_previous` is updated, so that a following
 *   quantifier knows which element to repeat.
 * - A look-behind operator as the very first expression character is
 *   rejected.
 * - When the match is already invalid, the next `|` alternative is looked
 *   up with r4_pipe_next(); without one, false is returned.
 *
 * The handler is taken from the block table while inside a character class
 * and from the global table otherwise.
 */
static bool r4_validate(r4_t *r4) {
    DEBUG_VALIDATE_FUNCTION
    r4->validation_count++;
    char c_val = *r4->expr;
    if (c_val == 0) {
        return r4->valid;
    }
    if (!r4_looks_behind(c_val)) {
        r4->expr_previous = r4->expr;
    } else if (r4->expr == r4->_expr) {
        // A regex may not start with a look-behind operator.
        return false;
    }
    if (!r4->valid && !r4_looks_behind(*r4->expr)) {
        if (!r4_pipe_next(r4)) {
            return false;
        }
        // NOTE(review): r4_pipe_next() moves r4->expr, but the handler below
        // is still selected by the character read before the move - confirm
        // that this is intended.
    }
    // Index with the unsigned byte value: a plain char may be negative for
    // bytes >= 0x80, which would read in front of the tables.
    const unsigned char slot = (unsigned char)c_val;
    r4_function f;
    if (r4->in_block) {
        f = v4_function_map_block[slot];
    } else {
        f = v4_function_map_global[slot];
    }
    r4->valid = f(r4);
    return r4->valid;
}
/**
 * Returns a newly allocated, terminated copy of the matched part of the
 * subject: `length` characters starting at offset `start`.
 *
 * The buffer comes from malloc(). Returns NULL when it cannot be allocated.
 */
char *r4_get_match(r4_t *r) {
    char *match = (char *)malloc(r->length + 1);
    if (match == NULL) {
        return NULL;
    }
    // strncpy() stops at the subject's terminator, so the copy never reads
    // past the end of the subject; the explicit terminator below covers the
    // case where exactly `length` characters were copied.
    strncpy(match, r->_str + r->start, r->length);
    match[r->length] = 0;
    return match;
}
/**
 * Searches the subject for the first position at which the expression
 * validates, starting at the current subject position.
 *
 * After every failed attempt the subject position moves on by one character,
 * the expression is rewound to its beginning, and `start` is incremented
 * (unless `backtracking` is non-zero). On success `end`, `length` and
 * `match` (a copy made by r4_get_match()) are filled in.
 *
 * When no input is left, the expression is still validated once against the
 * empty remainder, so that only expressions accepting empty input succeed.
 *
 * Returns the final validity, which is also stored in `r->valid`.
 */
static bool r4_search(r4_t *r) {
    bool valid = true;
    char *str_next = r->str;
    if (*r->str == '\0') {
        // Without this attempt an empty subject would be reported as a match
        // for every expression.
        valid = r4_validate(r);
    } else {
        while (*r->str) {
            valid = r4_validate(r);
            if (valid) {
                break;
            }
            // Move on until a match is found.
            if (!r->backtracking) {
                r->start++;
            }
            str_next++;
            r->str = str_next;
            r->expr = r->_expr;
            r->valid = true;
        }
    }
    r->valid = valid;
    if (r->valid) {
        // Offset of the first unconsumed subject character.
        r->end = (size_t)(r->str - r->_str);
        r->length = r->end - r->start;
        r->match = r4_get_match(r);
    }
    return r->valid;
}
r4_t *r4(const char *str, const char *expr) {
r4_t *r = r4_new();
r->_str = (char *)str;
r->_expr = (char *)expr;
r->match = NULL;
r->str = r->_str;
r->expr = r->_expr;
r->str_previous = r->_str;
r->expr_previous = r->expr;
r->in_block = false;
r->is_greedy = true;
r->in_group = 0;
r->loop_count = 0;
r->backtracking = 0;
r->in_range = false;
r4_search(r);
return r;
}
/**
 * Continues searching with an existing matcher from its current subject
 * position.
 *
 * When `expr` is non-NULL it replaces the stored expression. The parser
 * state flags are reset, the captures of the previous search are released
 * with r4_free_matches(), and r4_search() is run again.
 *
 * NOTE(review): `start` is not rebased to the current subject position
 * before the new search - verify the `start`/`length` of follow-up matches.
 *
 * Returns `r`.
 */
r4_t *r4_next(r4_t *r, char *expr) {
    if (expr != NULL) {
        r->_expr = expr;
    }
    r->expr = r->_expr;
    r->backtracking = 0;
    r->in_group = 0;
    r->in_block = false;
    r->in_range = false;
    r->is_greedy = true;
    r4_free_matches(r);
    r4_search(r);
    return r;
}
/**
 * Convenience wrapper: reports whether `expr` matches somewhere in `str`.
 *
 * A temporary matcher is created with r4() and released with r4_free().
 * Returns false when the matcher could not be created.
 */
bool r4_match(char *str, char *expr) {
    r4_t *r = r4(str, expr);
    if (r == NULL) {
        return false;
    }
    bool result = r->valid;
    r4_free(r);
    return result;
}
#endif