#include "lexer.h"
#include "../utils/safe_alloc.h"
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
#include <stdio.h>
/* Helpers used by this file but not defined in the portion shown here;
 * presumably implemented in sibling lexer sources (TODO confirm). */

/* Classifies identifier text; its result replaces the token type of every
 * scanned identifier in _rava_lexer_identifier. */
extern RavaTokenType_e rava_lexer_lookup_keyword(const char *identifier);
/* Invoked by rava_lexer_next_token after the opening '"' has been consumed. */
extern RavaToken_t* rava_lexer_parse_string(RavaLexer_t *lexer);
/* Invoked by rava_lexer_next_token after the opening '\'' has been consumed. */
extern RavaToken_t* rava_lexer_parse_character(RavaLexer_t *lexer);
/* Invoked by rava_lexer_next_token after the first digit has been consumed. */
extern RavaToken_t* rava_lexer_parse_number(RavaLexer_t *lexer);
/*
 * Allocate a lexer positioned at the start of `source`.
 *
 * The source pointer is stored, not copied, so the buffer must stay valid
 * for as long as the lexer is in use.
 *
 * Returns NULL when `source` is NULL or when allocation fails. Release the
 * result with rava_lexer_destroy().
 */
RavaLexer_t* rava_lexer_create(const char *source) {
    /* strlen(NULL) is undefined behaviour; reject it before doing any work. */
    if (source == NULL) {
        return NULL;
    }

    RavaLexer_t *lexer = malloc(sizeof *lexer);
    if (lexer == NULL) {
        return NULL;
    }

    lexer->source = source;
    lexer->source_length = strlen(source);
    lexer->current = 0;
    lexer->start = 0;
    /* Lines and columns are reported 1-based. */
    lexer->line = 1;
    lexer->column = 1;
    lexer->start_column = 1;
    lexer->error_message = NULL;
    return lexer;
}
/*
 * Release a lexer together with the error message buffer it holds.
 * Passing NULL is allowed and does nothing. The source text is not freed.
 */
void rava_lexer_destroy(RavaLexer_t *lexer) {
    if (lexer == NULL) {
        return;
    }
    /* free(NULL) is a no-op, so an absent message needs no separate guard. */
    free(lexer->error_message);
    free(lexer);
}
/*
 * Release a token, its lexeme copy and, for string literal tokens only,
 * the string payload stored in the value union.
 * Passing NULL is allowed and does nothing.
 */
void rava_token_destroy(RavaToken_t *token) {
    if (token == NULL) {
        return;
    }

    free(token->lexeme);

    /* Only string literals store a pointer in the value union. */
    if (token->type == RAVA_TOKEN_LITERAL_STRING) {
        free(token->value.string_value);
    }

    free(token);
}
/* True once the read position has reached (or passed) the end of the source. */
bool _rava_lexer_is_at_end(RavaLexer_t *lexer) {
    const bool has_more_input = lexer->current < lexer->source_length;
    return !has_more_input;
}
/* Return the character at the read position without consuming it, or '\0' at end of input. */
char _rava_lexer_peek(RavaLexer_t *lexer) {
    return _rava_lexer_is_at_end(lexer) ? '\0' : lexer->source[lexer->current];
}
/*
 * Return the character one past the read position without consuming
 * anything, or '\0' when that position lies beyond the source.
 */
char _rava_lexer_peek_next(RavaLexer_t *lexer) {
    if (lexer->current + 1 < lexer->source_length) {
        return lexer->source[lexer->current + 1];
    }
    return '\0';
}
/*
 * Consume one character and return it, keeping line/column bookkeeping in
 * step. At end of input nothing is consumed and '\0' is returned.
 */
char _rava_lexer_advance(RavaLexer_t *lexer) {
    if (_rava_lexer_is_at_end(lexer)) {
        return '\0';
    }

    const char consumed = lexer->source[lexer->current];
    lexer->current += 1;

    if (consumed != '\n') {
        lexer->column += 1;
    } else {
        /* A newline starts the next line at column 1. */
        lexer->line += 1;
        lexer->column = 1;
    }
    return consumed;
}
/*
 * Consume the next character only if it equals `expected`.
 * Returns true when a character was consumed. The column is bumped but the
 * line counter is never touched here.
 */
bool _rava_lexer_match(RavaLexer_t *lexer, char expected) {
    const bool matched = !_rava_lexer_is_at_end(lexer)
                         && lexer->source[lexer->current] == expected;
    if (matched) {
        lexer->current++;
        lexer->column++;
    }
    return matched;
}
/*
 * Advance past whitespace, line comments and block comments, stopping at
 * the first character that can begin a token (or at end of input).
 *
 * A block comment that is never closed is consumed through to the end of
 * the source without reporting an error.
 */
void _rava_lexer_skip_whitespace(RavaLexer_t *lexer) {
    for (;;) {
        if (_rava_lexer_is_at_end(lexer)) {
            return;
        }

        const char c = _rava_lexer_peek(lexer);

        if (c == ' ' || c == '\t' || c == '\r' || c == '\n' || c == '\f') {
            _rava_lexer_advance(lexer);
            continue;
        }

        /* Anything other than a potential comment opener starts a token. */
        if (c != '/') {
            return;
        }

        const char after_slash = _rava_lexer_peek_next(lexer);

        if (after_slash == '/') {
            /* Line comment: stop in front of the newline; the outer loop eats it. */
            while (!_rava_lexer_is_at_end(lexer) && _rava_lexer_peek(lexer) != '\n') {
                _rava_lexer_advance(lexer);
            }
        } else if (after_slash == '*') {
            /* Step over the opening slash-star before searching for the terminator. */
            _rava_lexer_advance(lexer);
            _rava_lexer_advance(lexer);

            bool closed = false;
            while (!closed && !_rava_lexer_is_at_end(lexer)) {
                if (_rava_lexer_peek(lexer) == '*' && _rava_lexer_peek_next(lexer) == '/') {
                    /* Consume the '*' here; the advance below consumes the '/'. */
                    _rava_lexer_advance(lexer);
                    closed = true;
                }
                _rava_lexer_advance(lexer);
            }
        } else {
            /* A lone '/' is an operator, not a comment. */
            return;
        }
    }
}
/*
 * Build a token of the given type whose lexeme is a NUL-terminated copy of
 * the source text between lexer->start and lexer->current.
 *
 * The token records the lexer's line at the moment of creation and the
 * column where the token started; its integer value is zeroed.
 * Returns NULL if either allocation fails.
 */
RavaToken_t* _rava_lexer_create_token(RavaLexer_t *lexer, RavaTokenType_e type) {
    RavaToken_t *token = malloc(sizeof *token);
    if (token == NULL) {
        return NULL;
    }

    const size_t length = lexer->current - lexer->start;
    char *text = malloc(length + 1);
    if (text == NULL) {
        free(token);
        return NULL;
    }
    memcpy(text, lexer->source + lexer->start, length);
    text[length] = '\0';

    token->type = type;
    token->lexeme = text;
    token->line = lexer->line;
    token->column = lexer->start_column;
    token->value.int_value = 0;
    return token;
}
/* True for characters that may start an identifier: ASCII letters, '_' and '$'. */
static bool _rava_is_alpha(char c) {
    if (c == '_' || c == '$') {
        return true;
    }
    const bool is_lower = ('a' <= c && c <= 'z');
    const bool is_upper = ('A' <= c && c <= 'Z');
    return is_lower || is_upper;
}
/* True for characters that may continue an identifier: identifier-start characters or ASCII digits. */
static bool _rava_is_alnum(char c) {
    if (_rava_is_alpha(c)) {
        return true;
    }
    return '0' <= c && c <= '9';
}
/*
 * Scan the remainder of an identifier whose first character has already
 * been consumed, then classify it through rava_lexer_lookup_keyword().
 *
 * The lookup result always replaces the token type, so the lookup is
 * presumably expected to return RAVA_TOKEN_IDENTIFIER for non-keywords
 * (TODO confirm against its definition). The boolean literal `true` gets
 * int_value 1; every other token keeps int_value 0.
 *
 * Returns NULL if the token could not be allocated.
 */
static RavaToken_t* _rava_lexer_identifier(RavaLexer_t *lexer) {
    while (_rava_is_alnum(_rava_lexer_peek(lexer))) {
        _rava_lexer_advance(lexer);
    }

    RavaToken_t *token = _rava_lexer_create_token(lexer, RAVA_TOKEN_IDENTIFIER);
    if (token == NULL) {
        /* Allocation failed: propagate instead of dereferencing NULL below. */
        return NULL;
    }

    RavaTokenType_e keyword_type = rava_lexer_lookup_keyword(token->lexeme);
    token->type = keyword_type;
    token->value.int_value = (keyword_type == RAVA_TOKEN_LITERAL_TRUE) ? 1 : 0;
    return token;
}
/*
 * Scan and return the next token from the lexer's source.
 *
 * Leading whitespace and comments are skipped first. At end of input a
 * RAVA_TOKEN_EOF token with an empty lexeme is returned. Identifiers,
 * numbers, strings and character literals are delegated to their helpers;
 * operators and punctuation are matched here, longest form first.
 *
 * On an unrecognised character a RAVA_TOKEN_ERROR token is returned and
 * lexer->error_message is replaced with a description of the character
 * (any previous message is freed first).
 *
 * The result may be NULL when token allocation fails. Release tokens with
 * rava_token_destroy().
 */
RavaToken_t* rava_lexer_next_token(RavaLexer_t *lexer) {
    _rava_lexer_skip_whitespace(lexer);
    lexer->start = lexer->current;
    lexer->start_column = lexer->column;

    if (_rava_lexer_is_at_end(lexer)) {
        return _rava_lexer_create_token(lexer, RAVA_TOKEN_EOF);
    }

    char c = _rava_lexer_advance(lexer);

    if (_rava_is_alpha(c)) {
        return _rava_lexer_identifier(lexer);
    }
    /* <ctype.h> functions require a value representable as unsigned char;
     * a plain (possibly negative) char for bytes >= 0x80 would be undefined. */
    if (isdigit((unsigned char)c)) {
        return rava_lexer_parse_number(lexer);
    }

    switch (c) {
    case '(': return _rava_lexer_create_token(lexer, RAVA_TOKEN_LPAREN);
    case ')': return _rava_lexer_create_token(lexer, RAVA_TOKEN_RPAREN);
    case '{': return _rava_lexer_create_token(lexer, RAVA_TOKEN_LBRACE);
    case '}': return _rava_lexer_create_token(lexer, RAVA_TOKEN_RBRACE);
    case '[': return _rava_lexer_create_token(lexer, RAVA_TOKEN_LBRACKET);
    case ']': return _rava_lexer_create_token(lexer, RAVA_TOKEN_RBRACKET);
    case ';': return _rava_lexer_create_token(lexer, RAVA_TOKEN_SEMICOLON);
    case ',': return _rava_lexer_create_token(lexer, RAVA_TOKEN_COMMA);
    case '~': return _rava_lexer_create_token(lexer, RAVA_TOKEN_TILDE);
    case '?': return _rava_lexer_create_token(lexer, RAVA_TOKEN_QUESTION);
    case '@': return _rava_lexer_create_token(lexer, RAVA_TOKEN_AT);
    case '.':
        /* "..." needs two more dots; a ".." pair lexes as two DOT tokens. */
        if (_rava_lexer_peek(lexer) == '.' && _rava_lexer_peek_next(lexer) == '.') {
            _rava_lexer_advance(lexer);
            _rava_lexer_advance(lexer);
            return _rava_lexer_create_token(lexer, RAVA_TOKEN_ELLIPSIS);
        }
        return _rava_lexer_create_token(lexer, RAVA_TOKEN_DOT);
    case ':':
        if (_rava_lexer_match(lexer, ':')) {
            return _rava_lexer_create_token(lexer, RAVA_TOKEN_COLONCOLON);
        }
        return _rava_lexer_create_token(lexer, RAVA_TOKEN_COLON);
    case '+':
        if (_rava_lexer_match(lexer, '+')) {
            return _rava_lexer_create_token(lexer, RAVA_TOKEN_INC);
        }
        if (_rava_lexer_match(lexer, '=')) {
            return _rava_lexer_create_token(lexer, RAVA_TOKEN_PLUSASSIGN);
        }
        return _rava_lexer_create_token(lexer, RAVA_TOKEN_PLUS);
    case '-':
        if (_rava_lexer_match(lexer, '-')) {
            return _rava_lexer_create_token(lexer, RAVA_TOKEN_DEC);
        }
        if (_rava_lexer_match(lexer, '=')) {
            return _rava_lexer_create_token(lexer, RAVA_TOKEN_MINUSASSIGN);
        }
        if (_rava_lexer_match(lexer, '>')) {
            return _rava_lexer_create_token(lexer, RAVA_TOKEN_ARROW);
        }
        return _rava_lexer_create_token(lexer, RAVA_TOKEN_MINUS);
    case '*':
        if (_rava_lexer_match(lexer, '=')) {
            return _rava_lexer_create_token(lexer, RAVA_TOKEN_STARASSIGN);
        }
        return _rava_lexer_create_token(lexer, RAVA_TOKEN_STAR);
    case '/':
        /* Comments were already removed by the whitespace skipper. */
        if (_rava_lexer_match(lexer, '=')) {
            return _rava_lexer_create_token(lexer, RAVA_TOKEN_SLASHASSIGN);
        }
        return _rava_lexer_create_token(lexer, RAVA_TOKEN_SLASH);
    case '%':
        if (_rava_lexer_match(lexer, '=')) {
            return _rava_lexer_create_token(lexer, RAVA_TOKEN_PERCENTASSIGN);
        }
        return _rava_lexer_create_token(lexer, RAVA_TOKEN_PERCENT);
    case '&':
        if (_rava_lexer_match(lexer, '&')) {
            return _rava_lexer_create_token(lexer, RAVA_TOKEN_AND);
        }
        if (_rava_lexer_match(lexer, '=')) {
            return _rava_lexer_create_token(lexer, RAVA_TOKEN_ANDASSIGN);
        }
        return _rava_lexer_create_token(lexer, RAVA_TOKEN_AMP);
    case '|':
        if (_rava_lexer_match(lexer, '|')) {
            return _rava_lexer_create_token(lexer, RAVA_TOKEN_OR);
        }
        if (_rava_lexer_match(lexer, '=')) {
            return _rava_lexer_create_token(lexer, RAVA_TOKEN_ORASSIGN);
        }
        return _rava_lexer_create_token(lexer, RAVA_TOKEN_PIPE);
    case '^':
        if (_rava_lexer_match(lexer, '=')) {
            return _rava_lexer_create_token(lexer, RAVA_TOKEN_CARETASSIGN);
        }
        return _rava_lexer_create_token(lexer, RAVA_TOKEN_CARET);
    case '!':
        if (_rava_lexer_match(lexer, '=')) {
            return _rava_lexer_create_token(lexer, RAVA_TOKEN_NE);
        }
        return _rava_lexer_create_token(lexer, RAVA_TOKEN_BANG);
    case '=':
        if (_rava_lexer_match(lexer, '=')) {
            return _rava_lexer_create_token(lexer, RAVA_TOKEN_EQUAL);
        }
        return _rava_lexer_create_token(lexer, RAVA_TOKEN_ASSIGN);
    case '<':
        if (_rava_lexer_match(lexer, '<')) {
            if (_rava_lexer_match(lexer, '=')) {
                return _rava_lexer_create_token(lexer, RAVA_TOKEN_LSHIFTASSIGN);
            }
            return _rava_lexer_create_token(lexer, RAVA_TOKEN_LSHIFT);
        }
        if (_rava_lexer_match(lexer, '=')) {
            return _rava_lexer_create_token(lexer, RAVA_TOKEN_LE);
        }
        return _rava_lexer_create_token(lexer, RAVA_TOKEN_LT);
    case '>':
        /* Longest match first: >>>=, >>>, >>=, >>, >=, >. */
        if (_rava_lexer_match(lexer, '>')) {
            if (_rava_lexer_match(lexer, '>')) {
                if (_rava_lexer_match(lexer, '=')) {
                    return _rava_lexer_create_token(lexer, RAVA_TOKEN_URSHIFTASSIGN);
                }
                return _rava_lexer_create_token(lexer, RAVA_TOKEN_URSHIFT);
            }
            if (_rava_lexer_match(lexer, '=')) {
                return _rava_lexer_create_token(lexer, RAVA_TOKEN_RSHIFTASSIGN);
            }
            return _rava_lexer_create_token(lexer, RAVA_TOKEN_RSHIFT);
        }
        if (_rava_lexer_match(lexer, '=')) {
            return _rava_lexer_create_token(lexer, RAVA_TOKEN_GE);
        }
        return _rava_lexer_create_token(lexer, RAVA_TOKEN_GT);
    case '"':
        return rava_lexer_parse_string(lexer);
    case '\'':
        return rava_lexer_parse_character(lexer);
    default:
        /* Drop any message left by an earlier error so it is not leaked. */
        free(lexer->error_message);
        lexer->error_message = malloc(RAVA_ERROR_BUFFER_SIZE);
        if (lexer->error_message) {
            snprintf(lexer->error_message, RAVA_ERROR_BUFFER_SIZE, "Unexpected character '%c'", c);
        }
        return _rava_lexer_create_token(lexer, RAVA_TOKEN_ERROR);
    }
}