2025-12-02 06:54:32 +01:00
|
|
|
#include "lexer.h"
|
2025-12-04 06:14:22 +01:00
|
|
|
#include "../utils/safe_alloc.h"
|
2025-12-02 06:54:32 +01:00
|
|
|
#include <stdlib.h>
|
|
|
|
|
#include <string.h>
|
|
|
|
|
#include <ctype.h>
|
|
|
|
|
#include <stdio.h>
|
|
|
|
|
|
|
|
|
|
extern RavaTokenType_e rava_lexer_lookup_keyword(const char *identifier);
|
|
|
|
|
extern RavaToken_t* rava_lexer_parse_string(RavaLexer_t *lexer);
|
|
|
|
|
extern RavaToken_t* rava_lexer_parse_character(RavaLexer_t *lexer);
|
|
|
|
|
extern RavaToken_t* rava_lexer_parse_number(RavaLexer_t *lexer);
|
|
|
|
|
|
|
|
|
|
RavaLexer_t* rava_lexer_create(const char *source) {
|
|
|
|
|
RavaLexer_t *lexer = malloc(sizeof(RavaLexer_t));
|
2025-12-04 06:14:22 +01:00
|
|
|
if (!lexer) return NULL;
|
2025-12-02 06:54:32 +01:00
|
|
|
lexer->source = source;
|
|
|
|
|
lexer->source_length = strlen(source);
|
|
|
|
|
lexer->current = 0;
|
|
|
|
|
lexer->start = 0;
|
|
|
|
|
lexer->line = 1;
|
|
|
|
|
lexer->column = 1;
|
|
|
|
|
lexer->start_column = 1;
|
|
|
|
|
lexer->error_message = NULL;
|
|
|
|
|
return lexer;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/**
 * Release a lexer together with the error message it holds, if any.
 *
 * Passing NULL is a no-op. The source text the lexer points at is not freed.
 *
 * @param lexer Lexer to release; may be NULL.
 */
void rava_lexer_destroy(RavaLexer_t *lexer) {
    if (lexer == NULL) {
        return;
    }

    /* free(NULL) does nothing, so an unset message needs no special case. */
    free(lexer->error_message);
    free(lexer);
}
|
|
|
|
|
|
|
|
|
|
/**
 * Release a token, its lexeme text and, for string literal tokens, the
 * string payload stored in `value.string_value`.
 *
 * Passing NULL is a no-op.
 *
 * @param token Token to release; may be NULL.
 */
void rava_token_destroy(RavaToken_t *token) {
    if (token == NULL) {
        return;
    }

    /* free(NULL) does nothing, so unset pointers need no special case. */
    free(token->lexeme);

    /* Only string literal tokens have their string_value payload released. */
    if (token->type == RAVA_TOKEN_LITERAL_STRING) {
        free(token->value.string_value);
    }

    free(token);
}
|
|
|
|
|
|
|
|
|
|
/**
 * Report whether the read cursor has reached (or passed) the end of the
 * source text.
 *
 * @param lexer Lexer to inspect; must not be NULL.
 * @return true when no characters remain to be read.
 */
bool _rava_lexer_is_at_end(RavaLexer_t *lexer) {
    return !(lexer->current < lexer->source_length);
}
|
|
|
|
|
|
|
|
|
|
/**
 * Look at the character under the cursor without consuming it.
 *
 * @param lexer Lexer to inspect; must not be NULL.
 * @return The current character, or '\0' when the cursor is at the end.
 */
char _rava_lexer_peek(RavaLexer_t *lexer) {
    return _rava_lexer_is_at_end(lexer) ? '\0' : lexer->source[lexer->current];
}
|
|
|
|
|
|
|
|
|
|
/**
 * Look one character past the cursor without consuming anything.
 *
 * @param lexer Lexer to inspect; must not be NULL.
 * @return The character after the current one, or '\0' when that position
 *         lies at or beyond the end of the source.
 */
char _rava_lexer_peek_next(RavaLexer_t *lexer) {
    if (lexer->current + 1 < lexer->source_length) {
        return lexer->source[lexer->current + 1];
    }
    return '\0';
}
|
|
|
|
|
|
|
|
|
|
/**
 * Consume the character under the cursor and update position tracking.
 *
 * A newline increments the line counter and resets the column to 1; any
 * other character increments the column.
 *
 * @param lexer Lexer to advance; must not be NULL.
 * @return The consumed character, or '\0' (with no state change) when the
 *         cursor is already at the end.
 */
char _rava_lexer_advance(RavaLexer_t *lexer) {
    if (_rava_lexer_is_at_end(lexer)) {
        return '\0';
    }

    const char consumed = lexer->source[lexer->current];
    lexer->current++;

    if (consumed != '\n') {
        lexer->column++;
    } else {
        lexer->line++;
        lexer->column = 1;
    }

    return consumed;
}
|
|
|
|
|
|
|
|
|
|
/**
 * Consume the current character only if it equals `expected`.
 *
 * On a match the cursor and the column both move forward by one. The line
 * counter is never touched here, even if `expected` is a newline.
 *
 * @param lexer    Lexer to advance; must not be NULL.
 * @param expected Character required at the cursor.
 * @return true when the character matched and was consumed, false otherwise
 *         (including when the cursor is at the end).
 */
bool _rava_lexer_match(RavaLexer_t *lexer, char expected) {
    if (_rava_lexer_is_at_end(lexer) || lexer->source[lexer->current] != expected) {
        return false;
    }

    lexer->current++;
    lexer->column++;
    return true;
}
|
|
|
|
|
|
|
|
|
|
/**
 * Skip whitespace and comments in front of the next token.
 *
 * Consumes spaces, tabs, carriage returns, newlines and form feeds, line
 * comments (from "//" up to, but not including, the newline) and block
 * comments (from the opening slash-star through the closing star-slash).
 * A block comment that is never closed is consumed to the end of the source.
 * Stops at the first character that starts none of these, including a '/'
 * that does not open a comment.
 *
 * @param lexer Lexer to advance; must not be NULL.
 */
void _rava_lexer_skip_whitespace(RavaLexer_t *lexer) {
    for (;;) {
        if (_rava_lexer_is_at_end(lexer)) {
            return;
        }

        const char head = _rava_lexer_peek(lexer);

        if (head == ' ' || head == '\t' || head == '\r' || head == '\n' || head == '\f') {
            _rava_lexer_advance(lexer);
            continue;
        }

        /* Anything other than a slash cannot start a comment. */
        if (head != '/') {
            return;
        }

        const char after = _rava_lexer_peek_next(lexer);

        if (after == '/') {
            /* Line comment: the newline is left for the next loop iteration. */
            while (!_rava_lexer_is_at_end(lexer) && _rava_lexer_peek(lexer) != '\n') {
                _rava_lexer_advance(lexer);
            }
        } else if (after == '*') {
            /* Step over the two-character opener before scanning the body. */
            _rava_lexer_advance(lexer);
            _rava_lexer_advance(lexer);

            while (!_rava_lexer_is_at_end(lexer)) {
                if (_rava_lexer_peek(lexer) == '*' && _rava_lexer_peek_next(lexer) == '/') {
                    _rava_lexer_advance(lexer);
                    _rava_lexer_advance(lexer);
                    break;
                }
                _rava_lexer_advance(lexer);
            }
        } else {
            /* A lone slash is an operator; leave it for the tokenizer. */
            return;
        }
    }
}
|
|
|
|
|
|
|
|
|
|
/**
 * Build a token of the given type from the source span [start, current).
 *
 * The span is copied into a freshly allocated, NUL-terminated lexeme. The
 * token records the lexer's current line and the column at which the span
 * started; its integer value is initialised to 0.
 *
 * @param lexer Lexer whose start/current markers delimit the lexeme.
 * @param type  Token type to assign.
 * @return The new token (release it with rava_token_destroy()), or NULL if
 *         either allocation fails.
 */
RavaToken_t* _rava_lexer_create_token(RavaLexer_t *lexer, RavaTokenType_e type) {
    RavaToken_t *token = malloc(sizeof *token);
    if (token == NULL) {
        return NULL;
    }

    const size_t span = lexer->current - lexer->start;

    token->lexeme = malloc(span + 1);
    if (token->lexeme == NULL) {
        /* Do not hand back a half-built token. */
        free(token);
        return NULL;
    }

    memcpy(token->lexeme, lexer->source + lexer->start, span);
    token->lexeme[span] = '\0';

    token->type = type;
    token->line = lexer->line;
    token->column = lexer->start_column;
    token->value.int_value = 0;

    return token;
}
|
|
|
|
|
|
|
|
|
|
/**
 * Test whether `c` may start an identifier: an ASCII letter, '_' or '$'.
 *
 * @param c Character to classify.
 * @return true for 'a'..'z', 'A'..'Z', '_' and '$'; false otherwise.
 */
static bool _rava_is_alpha(char c) {
    if (c == '_' || c == '$') {
        return true;
    }
    if ('a' <= c && c <= 'z') {
        return true;
    }
    return 'A' <= c && c <= 'Z';
}
|
|
|
|
|
|
|
|
|
|
/**
 * Test whether `c` may continue an identifier: an ASCII digit or any
 * character accepted by _rava_is_alpha().
 *
 * @param c Character to classify.
 * @return true when `c` is '0'..'9' or passes _rava_is_alpha().
 */
static bool _rava_is_alnum(char c) {
    if (c >= '0' && c <= '9') {
        return true;
    }
    return _rava_is_alpha(c);
}
|
|
|
|
|
|
|
|
|
|
/**
 * Finish scanning an identifier or keyword whose first character has
 * already been consumed.
 *
 * Consumes the remaining identifier characters, builds a token from the
 * span, then replaces its type with whatever rava_lexer_lookup_keyword()
 * returns for the lexeme. For the `true` literal type the token's integer
 * value is set to 1; for the `false` literal type it is set to 0.
 *
 * @param lexer Lexer positioned just after the identifier's first character.
 * @return The new token, or NULL if the token could not be allocated.
 */
static RavaToken_t* _rava_lexer_identifier(RavaLexer_t *lexer) {
    while (_rava_is_alnum(_rava_lexer_peek(lexer))) {
        _rava_lexer_advance(lexer);
    }

    RavaToken_t *token = _rava_lexer_create_token(lexer, RAVA_TOKEN_IDENTIFIER);
    /* Token creation fails on out-of-memory; do not dereference NULL below. */
    if (!token) return NULL;

    RavaTokenType_e keyword_type = rava_lexer_lookup_keyword(token->lexeme);
    token->type = keyword_type;

    if (keyword_type == RAVA_TOKEN_LITERAL_TRUE) {
        token->value.int_value = 1;
    } else if (keyword_type == RAVA_TOKEN_LITERAL_FALSE) {
        token->value.int_value = 0;
    }

    return token;
}
|
|
|
|
|
|
|
|
|
|
/**
 * Scan and return the next token from the lexer's source.
 *
 * Leading whitespace and comments are skipped first. At the end of input an
 * EOF token is returned. Otherwise one character is consumed and dispatched:
 * identifier starts go to the identifier scanner, digits to
 * rava_lexer_parse_number(), quotes to the string/character parsers, and
 * punctuation is matched longest-first (e.g. ">>>=" before ">>>" before
 * ">>" before ">").
 *
 * For a character that starts no known token, an "Unexpected character"
 * message is stored in lexer->error_message (when its allocation succeeds),
 * replacing any earlier message, and an ERROR token is returned.
 *
 * @param lexer Lexer to read from; must not be NULL.
 * @return The next token, or NULL if token allocation fails. Tokens built
 *         here are released with rava_token_destroy().
 */
RavaToken_t* rava_lexer_next_token(RavaLexer_t *lexer) {
    _rava_lexer_skip_whitespace(lexer);

    /* Mark where the upcoming token begins. */
    lexer->start = lexer->current;
    lexer->start_column = lexer->column;

    if (_rava_lexer_is_at_end(lexer)) {
        return _rava_lexer_create_token(lexer, RAVA_TOKEN_EOF);
    }

    char c = _rava_lexer_advance(lexer);

    if (_rava_is_alpha(c)) {
        return _rava_lexer_identifier(lexer);
    }

    /* <ctype.h> functions require a value representable as unsigned char;
     * a plain char may be negative for bytes >= 0x80. */
    if (isdigit((unsigned char)c)) {
        return rava_lexer_parse_number(lexer);
    }

    switch (c) {
        case '(': return _rava_lexer_create_token(lexer, RAVA_TOKEN_LPAREN);
        case ')': return _rava_lexer_create_token(lexer, RAVA_TOKEN_RPAREN);
        case '{': return _rava_lexer_create_token(lexer, RAVA_TOKEN_LBRACE);
        case '}': return _rava_lexer_create_token(lexer, RAVA_TOKEN_RBRACE);
        case '[': return _rava_lexer_create_token(lexer, RAVA_TOKEN_LBRACKET);
        case ']': return _rava_lexer_create_token(lexer, RAVA_TOKEN_RBRACKET);
        case ';': return _rava_lexer_create_token(lexer, RAVA_TOKEN_SEMICOLON);
        case ',': return _rava_lexer_create_token(lexer, RAVA_TOKEN_COMMA);
        case '~': return _rava_lexer_create_token(lexer, RAVA_TOKEN_TILDE);
        case '?': return _rava_lexer_create_token(lexer, RAVA_TOKEN_QUESTION);
        case '@': return _rava_lexer_create_token(lexer, RAVA_TOKEN_AT);

        case '.':
            /* "..." needs two more dots; otherwise this is a single dot. */
            if (_rava_lexer_peek(lexer) == '.' && _rava_lexer_peek_next(lexer) == '.') {
                _rava_lexer_advance(lexer);
                _rava_lexer_advance(lexer);
                return _rava_lexer_create_token(lexer, RAVA_TOKEN_ELLIPSIS);
            }
            return _rava_lexer_create_token(lexer, RAVA_TOKEN_DOT);

        case ':':
            if (_rava_lexer_match(lexer, ':')) {
                return _rava_lexer_create_token(lexer, RAVA_TOKEN_COLONCOLON);
            }
            return _rava_lexer_create_token(lexer, RAVA_TOKEN_COLON);

        case '+':
            if (_rava_lexer_match(lexer, '+')) {
                return _rava_lexer_create_token(lexer, RAVA_TOKEN_INC);
            }
            if (_rava_lexer_match(lexer, '=')) {
                return _rava_lexer_create_token(lexer, RAVA_TOKEN_PLUSASSIGN);
            }
            return _rava_lexer_create_token(lexer, RAVA_TOKEN_PLUS);

        case '-':
            if (_rava_lexer_match(lexer, '-')) {
                return _rava_lexer_create_token(lexer, RAVA_TOKEN_DEC);
            }
            if (_rava_lexer_match(lexer, '=')) {
                return _rava_lexer_create_token(lexer, RAVA_TOKEN_MINUSASSIGN);
            }
            if (_rava_lexer_match(lexer, '>')) {
                return _rava_lexer_create_token(lexer, RAVA_TOKEN_ARROW);
            }
            return _rava_lexer_create_token(lexer, RAVA_TOKEN_MINUS);

        case '*':
            if (_rava_lexer_match(lexer, '=')) {
                return _rava_lexer_create_token(lexer, RAVA_TOKEN_STARASSIGN);
            }
            return _rava_lexer_create_token(lexer, RAVA_TOKEN_STAR);

        case '/':
            /* Comment openers were already consumed by the whitespace skip. */
            if (_rava_lexer_match(lexer, '=')) {
                return _rava_lexer_create_token(lexer, RAVA_TOKEN_SLASHASSIGN);
            }
            return _rava_lexer_create_token(lexer, RAVA_TOKEN_SLASH);

        case '%':
            if (_rava_lexer_match(lexer, '=')) {
                return _rava_lexer_create_token(lexer, RAVA_TOKEN_PERCENTASSIGN);
            }
            return _rava_lexer_create_token(lexer, RAVA_TOKEN_PERCENT);

        case '&':
            if (_rava_lexer_match(lexer, '&')) {
                return _rava_lexer_create_token(lexer, RAVA_TOKEN_AND);
            }
            if (_rava_lexer_match(lexer, '=')) {
                return _rava_lexer_create_token(lexer, RAVA_TOKEN_ANDASSIGN);
            }
            return _rava_lexer_create_token(lexer, RAVA_TOKEN_AMP);

        case '|':
            if (_rava_lexer_match(lexer, '|')) {
                return _rava_lexer_create_token(lexer, RAVA_TOKEN_OR);
            }
            if (_rava_lexer_match(lexer, '=')) {
                return _rava_lexer_create_token(lexer, RAVA_TOKEN_ORASSIGN);
            }
            return _rava_lexer_create_token(lexer, RAVA_TOKEN_PIPE);

        case '^':
            if (_rava_lexer_match(lexer, '=')) {
                return _rava_lexer_create_token(lexer, RAVA_TOKEN_CARETASSIGN);
            }
            return _rava_lexer_create_token(lexer, RAVA_TOKEN_CARET);

        case '!':
            if (_rava_lexer_match(lexer, '=')) {
                return _rava_lexer_create_token(lexer, RAVA_TOKEN_NE);
            }
            return _rava_lexer_create_token(lexer, RAVA_TOKEN_BANG);

        case '=':
            if (_rava_lexer_match(lexer, '=')) {
                return _rava_lexer_create_token(lexer, RAVA_TOKEN_EQUAL);
            }
            return _rava_lexer_create_token(lexer, RAVA_TOKEN_ASSIGN);

        case '<':
            if (_rava_lexer_match(lexer, '<')) {
                if (_rava_lexer_match(lexer, '=')) {
                    return _rava_lexer_create_token(lexer, RAVA_TOKEN_LSHIFTASSIGN);
                }
                return _rava_lexer_create_token(lexer, RAVA_TOKEN_LSHIFT);
            }
            if (_rava_lexer_match(lexer, '=')) {
                return _rava_lexer_create_token(lexer, RAVA_TOKEN_LE);
            }
            return _rava_lexer_create_token(lexer, RAVA_TOKEN_LT);

        case '>':
            if (_rava_lexer_match(lexer, '>')) {
                if (_rava_lexer_match(lexer, '>')) {
                    if (_rava_lexer_match(lexer, '=')) {
                        return _rava_lexer_create_token(lexer, RAVA_TOKEN_URSHIFTASSIGN);
                    }
                    return _rava_lexer_create_token(lexer, RAVA_TOKEN_URSHIFT);
                }
                if (_rava_lexer_match(lexer, '=')) {
                    return _rava_lexer_create_token(lexer, RAVA_TOKEN_RSHIFTASSIGN);
                }
                return _rava_lexer_create_token(lexer, RAVA_TOKEN_RSHIFT);
            }
            if (_rava_lexer_match(lexer, '=')) {
                return _rava_lexer_create_token(lexer, RAVA_TOKEN_GE);
            }
            return _rava_lexer_create_token(lexer, RAVA_TOKEN_GT);

        case '"':
            return rava_lexer_parse_string(lexer);

        case '\'':
            return rava_lexer_parse_character(lexer);

        default:
            /* Release any message from an earlier error so it is not leaked
             * when the pointer is overwritten; free(NULL) is a no-op. */
            free(lexer->error_message);
            lexer->error_message = malloc(RAVA_ERROR_BUFFER_SIZE);
            if (lexer->error_message) {
                snprintf(lexer->error_message, RAVA_ERROR_BUFFER_SIZE, "Unexpected character '%c'", c);
            }
            return _rava_lexer_create_token(lexer, RAVA_TOKEN_ERROR);
    }
}
|