448 lines
13 KiB
C
448 lines
13 KiB
C
|
#include "compiler.h"
|
||
|
#include "rlib.h"
|
||
|
#include <regex.h>
|
||
|
|
||
|
#define ifwhile(cond, action) \
|
||
|
bool _did_doit; \
|
||
|
_did_doit = false; \
|
||
|
while (cond) { \
|
||
|
action \
|
||
|
}; \
|
||
|
if (_did_doit)
|
||
|
|
||
|
// bool is_valid = expr ? 1 : 0;
|
||
|
// repeat:
|
||
|
|
||
|
// bool valid = expr != NULL *expr > 0;
|
||
|
|
||
|
// bool _expr_true = false;
|
||
|
// if(res){
|
||
|
// _expr_true = true;
|
||
|
//}
|
||
|
// bool ifwhile(bool res){
|
||
|
//
|
||
|
//}
|
||
|
struct rrex_executor_t;
|
||
|
|
||
|
typedef bool (*rrex_function)(struct rrex_executor_t *);
|
||
|
|
||
|
typedef struct rrex_executor_t {
|
||
|
char *previous_position;
|
||
|
char previous;
|
||
|
char *bdata;
|
||
|
char *_bdata;
|
||
|
char *sdata;
|
||
|
char *_sdata;
|
||
|
|
||
|
long current;
|
||
|
bool valid;
|
||
|
rrex_function functions[30];
|
||
|
|
||
|
} rrex_executor_t;
|
||
|
|
||
|
bool rrex_match(char *sdata, char *bdata);
|
||
|
bool rrex_execute_one(rrex_executor_t *t);
|
||
|
bool rrex(char *s, char *r);
|
||
|
|
||
|
bool rrex(char *s, char *r) {
|
||
|
char b[4096];
|
||
|
rrex_compile(r, b);
|
||
|
return rrex_match(s, b);
|
||
|
}
|
||
|
|
||
|
bool rrex_match_sol(rrex_executor_t *executor) {
|
||
|
executor->previous = RN_ROOF;
|
||
|
executor->previous_position = executor->bdata;
|
||
|
bool valid = executor->sdata == executor->_sdata;
|
||
|
if (valid) {
|
||
|
executor->bdata++;
|
||
|
}
|
||
|
return valid;
|
||
|
}
|
||
|
bool rrex_match_dot(rrex_executor_t *executor) {
|
||
|
executor->previous = RN_DOT;
|
||
|
executor->previous_position = executor->bdata;
|
||
|
if ((executor->sdata)[0] != '\n') {
|
||
|
executor->sdata++;
|
||
|
executor->bdata++;
|
||
|
return true;
|
||
|
}
|
||
|
return false;
|
||
|
}
|
||
|
|
||
|
bool rrex_match_digit(rrex_executor_t *executor) {
|
||
|
if (isdigit(*executor->sdata)) {
|
||
|
executor->sdata++;
|
||
|
executor->bdata++;
|
||
|
return true;
|
||
|
}
|
||
|
return false;
|
||
|
}
|
||
|
|
||
|
bool rrex_match_whitespace(rrex_executor_t *executor) {
|
||
|
if (*executor->sdata == ' ' || *executor->sdata == '\t' ||
|
||
|
*executor->sdata == '\n' || *executor->sdata == '\r') {
|
||
|
executor->sdata++;
|
||
|
executor->bdata++;
|
||
|
return true;
|
||
|
}
|
||
|
return false;
|
||
|
}
|
||
|
|
||
|
bool rrex_match_word(rrex_executor_t *executor) {
|
||
|
if (isalpha((executor->sdata)[0]) || (executor->sdata)[0] == '_') {
|
||
|
executor->sdata++;
|
||
|
executor->bdata++;
|
||
|
return true;
|
||
|
}
|
||
|
return false;
|
||
|
}
|
||
|
|
||
|
bool rrex_match_not_word(rrex_executor_t *executor) {
|
||
|
if (!(isalpha(*executor->sdata) || *executor->sdata == '_')) {
|
||
|
executor->sdata++;
|
||
|
executor->bdata++;
|
||
|
return true;
|
||
|
}
|
||
|
return false;
|
||
|
}
|
||
|
|
||
|
bool rrex_match_not_digit(rrex_executor_t *executor) {
|
||
|
if (!(isdigit(*executor->sdata))) {
|
||
|
executor->sdata++;
|
||
|
executor->bdata++;
|
||
|
return true;
|
||
|
}
|
||
|
return false;
|
||
|
}
|
||
|
bool rrex_match_dollar(rrex_executor_t *executor) {
|
||
|
if (*executor->sdata == '\0') {
|
||
|
executor->bdata++;
|
||
|
return true;
|
||
|
}
|
||
|
return false;
|
||
|
}
|
||
|
|
||
|
bool rrex_match_literal(rrex_executor_t *executor) {
|
||
|
if (*executor->bdata == *executor->sdata) {
|
||
|
executor->bdata++;
|
||
|
executor->sdata++;
|
||
|
return true;
|
||
|
}
|
||
|
return false;
|
||
|
}
|
||
|
|
||
|
bool rrex_match_group(rrex_executor_t *executor) {
|
||
|
bool v = true;
|
||
|
executor->bdata++;
|
||
|
char *sdata_before_fail = executor->sdata;
|
||
|
while (v && *executor->bdata != RN_GROUP_END) {
|
||
|
v = rrex_execute_one(executor);
|
||
|
if (!v) {
|
||
|
while (*executor->bdata != RN_GROUP_END) {
|
||
|
if (*executor->bdata == RN_PIPE) {
|
||
|
v = true;
|
||
|
executor->bdata++;
|
||
|
break;
|
||
|
}
|
||
|
executor->bdata++;
|
||
|
}
|
||
|
} else if (*executor->bdata == RN_PIPE) {
|
||
|
break;
|
||
|
}
|
||
|
}
|
||
|
while (*executor->bdata != RN_GROUP_END) {
|
||
|
executor->bdata++;
|
||
|
}
|
||
|
executor->bdata++;
|
||
|
if (!v) {
|
||
|
executor->sdata = sdata_before_fail;
|
||
|
}
|
||
|
return v;
|
||
|
}
|
||
|
|
||
|
bool rrex_match_choice(rrex_executor_t *executor) {
|
||
|
bool v;
|
||
|
|
||
|
executor->bdata++;
|
||
|
bool reverse = *executor->bdata == RN_ROOF;
|
||
|
if (reverse)
|
||
|
executor->bdata++;
|
||
|
|
||
|
while (*executor->bdata != RN_CHOICE_END) {
|
||
|
v = rrex_execute_one(executor);
|
||
|
if (reverse) {
|
||
|
|
||
|
v = !v;
|
||
|
if (v)
|
||
|
executor->sdata++;
|
||
|
}
|
||
|
if (v) {
|
||
|
break;
|
||
|
} else {
|
||
|
|
||
|
if (!reverse)
|
||
|
executor->bdata++;
|
||
|
}
|
||
|
}
|
||
|
while (*executor->bdata != RN_CHOICE_END) {
|
||
|
executor->bdata++;
|
||
|
}
|
||
|
|
||
|
executor->bdata++;
|
||
|
|
||
|
return v;
|
||
|
}
|
||
|
|
||
|
bool rrex_match_optional(rrex_executor_t *executor) {
|
||
|
executor->bdata++;
|
||
|
char *optional_start = executor->bdata;
|
||
|
bool v = rrex_execute_one(executor);
|
||
|
if (!v) {
|
||
|
executor->bdata = optional_start;
|
||
|
char closer = 0;
|
||
|
if (*executor->bdata == RN_CHOICE_START) {
|
||
|
closer = RN_CHOICE_END;
|
||
|
}
|
||
|
if (*executor->bdata == RN_GROUP_START) {
|
||
|
closer = RN_GROUP_END;
|
||
|
}
|
||
|
if (closer) {
|
||
|
while (*executor->bdata != closer) {
|
||
|
executor->bdata++;
|
||
|
}
|
||
|
}
|
||
|
executor->bdata++;
|
||
|
}
|
||
|
return true;
|
||
|
}
|
||
|
|
||
|
bool rrex_match_at_least_one(rrex_executor_t *executor) {
|
||
|
bool v = true;
|
||
|
bool once_valid;
|
||
|
executor->bdata++;
|
||
|
char *method_position = executor->previous_position;
|
||
|
char *next = executor->bdata;
|
||
|
while (v) {
|
||
|
executor->bdata = method_position;
|
||
|
v = rrex_execute_one(executor);
|
||
|
|
||
|
if (v)
|
||
|
once_valid = true;
|
||
|
|
||
|
executor->bdata = next;
|
||
|
bool v_right = rrex_execute_one(executor);
|
||
|
if (v_right) {
|
||
|
once_valid = true;
|
||
|
break;
|
||
|
}
|
||
|
}
|
||
|
return once_valid;
|
||
|
}
|
||
|
|
||
|
bool rrex_match_range(rrex_executor_t *executor) {
|
||
|
// Go to first parameter and remember
|
||
|
executor->bdata++;
|
||
|
char char_start = *executor->bdata;
|
||
|
// Go to second parameter and remember
|
||
|
executor->bdata++;
|
||
|
char char_end = *executor->bdata;
|
||
|
// Swap parameters if first one is higher than second one
|
||
|
if (char_start > char_end) {
|
||
|
char temp = char_end;
|
||
|
char_end = char_start;
|
||
|
char_start = temp;
|
||
|
}
|
||
|
// Compare if current char in sdata is between parameters
|
||
|
if (*executor->sdata >= char_start && *executor->sdata <= char_end) {
|
||
|
executor->bdata++;
|
||
|
executor->sdata++;
|
||
|
return true;
|
||
|
}
|
||
|
// Set pointer before parameters. Back to R.
|
||
|
executor->bdata--;
|
||
|
executor->bdata--;
|
||
|
return false;
|
||
|
}
|
||
|
|
||
|
bool rrex_match_plus(rrex_executor_t *executor) {
|
||
|
char *plus_position = executor->bdata;
|
||
|
char *next = plus_position + 1;
|
||
|
char *to_repeat = executor->previous_position;
|
||
|
// Return value
|
||
|
bool valid = true;
|
||
|
bool matched_once = false;
|
||
|
char *sdata_before_fail;
|
||
|
while (valid) {
|
||
|
// Check if EOF is reached
|
||
|
if (!*executor->sdata) {
|
||
|
break;
|
||
|
}
|
||
|
executor->bdata = to_repeat;
|
||
|
sdata_before_fail = executor->sdata;
|
||
|
valid = rrex_execute_one(executor);
|
||
|
if (valid) {
|
||
|
matched_once = true;
|
||
|
} else {
|
||
|
// should other function do
|
||
|
executor->sdata = sdata_before_fail;
|
||
|
}
|
||
|
if (!valid && *(executor->bdata = next) && rrex_execute_one(executor)) {
|
||
|
// if(!valid)
|
||
|
break;
|
||
|
}
|
||
|
}
|
||
|
if (matched_once && executor->bdata == plus_position) {
|
||
|
// Move pointer to after RN_PLUS sign.
|
||
|
executor->bdata++;
|
||
|
}
|
||
|
|
||
|
return matched_once;
|
||
|
}
|
||
|
|
||
|
bool rrex_execute_one(rrex_executor_t *executor) {
|
||
|
bool valid;
|
||
|
executor->current = *executor->bdata;
|
||
|
int previous = executor->current;
|
||
|
|
||
|
char *previous_position = executor->bdata;
|
||
|
if (executor->current > 31)
|
||
|
executor->current = RN_LITERAL;
|
||
|
valid = executor->functions[executor->current](executor);
|
||
|
// executor->current = *executor->bdata;
|
||
|
executor->previous = previous;
|
||
|
executor->previous_position = previous_position;
|
||
|
return valid;
|
||
|
}
|
||
|
|
||
|
bool rrex_match(char *sdata, char *bdata) {
|
||
|
rrex_executor_t executor;
|
||
|
executor.bdata = bdata;
|
||
|
executor._bdata = bdata;
|
||
|
executor.sdata = sdata;
|
||
|
executor._sdata = sdata;
|
||
|
executor.previous_position = executor.bdata;
|
||
|
executor.functions[RN_ARANGE] = rrex_match_range;
|
||
|
executor.functions[RN_CHOICE_START] = rrex_match_choice;
|
||
|
executor.functions[RN_DOLLAR] = rrex_match_dollar;
|
||
|
executor.functions[RN_DOT] = rrex_match_dot;
|
||
|
executor.functions[RN_DRANGE] = rrex_match_range;
|
||
|
executor.functions[RN_LITERAL] = rrex_match_literal;
|
||
|
executor.functions[RN_SLASH_CD] = rrex_match_not_digit;
|
||
|
executor.functions[RN_SLASH_CW] = rrex_match_not_word;
|
||
|
executor.functions[RN_PLUS] = rrex_match_plus;
|
||
|
executor.functions[RN_ASTERISK] = rrex_match_at_least_one;
|
||
|
executor.functions[RN_WHITESPACE] = rrex_match_whitespace;
|
||
|
executor.functions[RN_GROUP_START] = rrex_match_group;
|
||
|
executor.functions[RN_QUESTION] = rrex_match_optional;
|
||
|
executor.functions[RN_ROOF] = rrex_match_sol;
|
||
|
executor.functions[RN_DIGIT] = rrex_match_digit;
|
||
|
executor.functions[RN_ALPHA] = rrex_match_word;
|
||
|
rrex_executor_t *ex = &executor;
|
||
|
char *s_padding = ex->sdata;
|
||
|
bool valid = true;
|
||
|
while (valid && *ex->bdata) {
|
||
|
valid = rrex_execute_one(&executor);
|
||
|
if (!valid && *ex->sdata) {
|
||
|
if (*ex->_bdata == RN_ROOF) {
|
||
|
break;
|
||
|
}
|
||
|
s_padding++;
|
||
|
ex->sdata = s_padding;
|
||
|
ex->bdata = ex->_bdata;
|
||
|
if (*ex->bdata && *ex->sdata)
|
||
|
valid = true;
|
||
|
}
|
||
|
}
|
||
|
return valid;
|
||
|
}
|
||
|
|
||
|
void rrex_executor_tests() {
|
||
|
rtest_banner("rrex regular expressions");
|
||
|
|
||
|
// rassert(rrex("aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaq", "[^qxyzv]+q$"));
|
||
|
|
||
|
rassert(rrex("abababa", "^(ab)+a$"));
|
||
|
|
||
|
rassert(rrex(" a ", "\\sa\\s"));
|
||
|
rassert(!rrex("a", "\\s"));
|
||
|
rassert(rrex("abc", "ab[def]?c"));
|
||
|
rassert(rrex("abc", "ab(d|e|f)?c"));
|
||
|
rassert(rrex("1990-01-13",
|
||
|
"^(19|20)\\d\\d-(0[1-9]|1[0-2])-(0[1-9]|[12]\\d|3[01])$"));
|
||
|
rassert(rrex("1990-01-13", "(19|20)\\d\\d-[0?1]\\d-[0123]\\d"));
|
||
|
// rassert(rrex("1990-1-3", "(19|20)\\d\\d-[0?1]\\d-[0123]\\d"));
|
||
|
// rassert(rrex("1990-1-3", "(19|20)\\d\\d-[01]?\\d-[0123]\\d"));
|
||
|
rassert(
|
||
|
rrex("1990-13-25", "(19|20)\\d\\d-([01]\\d?||\\d)-([0123]\\d|\\d)$"));
|
||
|
rassert(
|
||
|
!rrex("1990-13-45", "(19|20)\\d\\d-([01]\\d?||\\d)-([0123]\\d|\\d)$"))
|
||
|
//(19|20)\d\d-[01]?\d-[0123]\d
|
||
|
rassert(rrex("a", "[zsa]"));
|
||
|
rassert(rrex("abcdefg", "abcd?efg"));
|
||
|
rassert(rrex("abcefg", "abcd?efg"));
|
||
|
rassert(rrex("ce", "(a|b|c|d)e"));
|
||
|
rassert(rrex("A", "A-Z"));
|
||
|
rassert(rrex("a", "a-Z"));
|
||
|
rassert(rrex("abcab", "[abc][acb]{4}$"));
|
||
|
rassert(rrex("aa", "\\w{2}$"));
|
||
|
|
||
|
rassert(rrex("a", "[ca]"));
|
||
|
rassert(rrex("1-4", "1\\-4"));
|
||
|
|
||
|
rassert(rrex("a", "[ba]"));
|
||
|
rassert(rrex("5", "4-9"));
|
||
|
rassert(rrex("4", "4-9"));
|
||
|
rassert(rrex("9", "4-9"));
|
||
|
rassert(rrex("123A", "1-41-41-4A"));
|
||
|
rassert(!rrex("123B", "1-41-41-4A"));
|
||
|
rassert(!rrex("1", "4-9"));
|
||
|
rassert(rrex("abca", "[abc][abc][abc]a$"));
|
||
|
|
||
|
rassert(rrex("abca", "[a-z][abc][abc]a"));
|
||
|
rassert(rrex("abca", "[\\w][abc][abc]a"));
|
||
|
rassert(rrex("a5a5g!a", "a0-9a-z\\d\\D\\Wa"));
|
||
|
rassert(!rrex("1", "\\D"));
|
||
|
rassert(!rrex("a", "\\W"));
|
||
|
rassert(!rrex("1", "\\w"));
|
||
|
rassert(!rrex("a", "\\d"));
|
||
|
rassert(!rrex("\n", "."));
|
||
|
rassert(rrex("a", "a$"));
|
||
|
rassert(rrex("a1ba1ba1b", "[a-z\\db]{3}"));
|
||
|
rassert(rrex("abbc", "a{1}[a-z]{2}c{1}"));
|
||
|
rassert(rrex("aA", "[a-zA-Z]{2}"));
|
||
|
|
||
|
rassert(!rrex("123", "\\d+a"));
|
||
|
rassert(rrex("123a", "[123]+a"));
|
||
|
|
||
|
printf("JSSS\n");
|
||
|
rassert(rrex("123", "[123]+"));
|
||
|
rassert(!rrex("123b", "[123]+a"));
|
||
|
// rassert(!rrex("123", "[123]+b")); NOT READY YET
|
||
|
|
||
|
rassert(rrex("abababc", "^(ab)+c$"));
|
||
|
rassert(!rrex("abababb", "^(ab)+a$"));
|
||
|
rassert(!rrex("abababa", "^(ab)+b$"));
|
||
|
rassert(!rrex("abdabdabda", "^(abc)+a$"));
|
||
|
rassert(!rrex("abababa", "^(abc)+a$"));
|
||
|
|
||
|
rassert(rrex("123a33", "\\d+a\\d+"));
|
||
|
rassert(!rrex("123ab", "\\d+$"));
|
||
|
|
||
|
rassert(rrex("567", "[^1234]"));
|
||
|
rassert(rrex("400", "[^5]"));
|
||
|
rassert(!rrex("132213gh", ".*gd"));
|
||
|
rassert(!rrex("132213gd", ".*gh"));
|
||
|
rassert(rrex("#include \"test.h\"x", "#include *\"[a-z\\.]*\"x"));
|
||
|
|
||
|
// rassert(rrex("#include \"test.h\"x", "#include.*\".*\"x"));
|
||
|
rassert(!rrex("#include \"test.h\"y", ".*#include.*\".*\"x"));
|
||
|
|
||
|
rassert(rrex("123test", "^123"));
|
||
|
rassert(rrex("test123", "123"));
|
||
|
rassert(!rrex("test123", "^123"));
|
||
|
rassert(rrex("test123", "123$"));
|
||
|
rassert(rrex("test123test", "123"));
|
||
|
rassert(!rrex("test123test", "123$"));
|
||
|
}
|