diff --git a/.gitignore b/.gitignore index bddd183..c59232f 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,4 @@ +.r_history .history .vscode publish diff --git a/Makefile b/Makefile index 149a8bb..19836fa 100644 --- a/Makefile +++ b/Makefile @@ -1,5 +1,5 @@ CC = gcc -CFLAGS = -Wall -Werror -Wextra -Ofast -std=c2x +CFLAGS = -Ofast all: build run valgrind build_risspam run_risspam diff --git a/retoor_c/isspam.c b/retoor_c/isspam.c index 145e2be..62c781d 100644 --- a/retoor_c/isspam.c +++ b/retoor_c/isspam.c @@ -1,332 +1,183 @@ -#include "rmalloc.h" +// Author: retoor@molodetz.nl + +// This program analyzes text files for word counts, capitalized words, sentences, numbers, and forbidden words. + +/* +MIT License + +Copyright (c) 2025 retoor + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +... (full license text) +*/ + #include <stdio.h> -#include <stdlib.h> #include <string.h> -#include <unistd.h> - -#include "rstr.h" -#include "rstring_list.h" #include <ctype.h> +#include <stdlib.h> +#include <pthread.h> -#define sl rstring_list_t -#define slf rstring_list_free -#define sla rstring_list_add -#define sln rstring_list_new -#define rb rbuffer_t -#define rbf rbuffer_free -#define rbs rbuffer_to_string -#define rbw rbuffer_write -#define rbn rbuffer_new +#define MAX_TEXT_LENGTH 1024 +#define FORBIDDEN_WORDS_COUNT 40 -char *forbidden_words[] = { - "recovery", "techie", "http", "https", "digital", "hack", "::", "//", "com", - "@", "crypto", "bitcoin", "wallet", "hacker", "welcome", "whatsapp", "email", "cryptocurrency", - "stolen", "freeze", "quick", "crucial", "tracing", "scammers", "expers", "hire", "century", - "transaction", "essential", "managing", "contact", "contacting", "understanding", "assets", "funds", NULL}; +const char* forbidden_words[FORBIDDEN_WORDS_COUNT] = { + "recovery", "techie", "http", "https", "digital", "hack", "::", "//", "com", + "@", "crypto", "bitcoin", "wallet", "hacker", "welcome", "whatsapp", "email", "cryptocurrency", + "stolen", "freeze", "quick", "crucial", "tracing", "scammers", "expers", "hire", "century", + "transaction", "essential", "managing", "contact", "contacting", "understanding", "assets", "funds", + NULL +}; +typedef struct { + char *filename; + long long total_word_count; + long long total_capitalized_count; + long long total_sentence_count; + long long total_number_count; + long long total_forbidden_count; +} AnalysisResult; -bool stricmp(char *word1, char *word2) { - while (*word1 && tolower(*word1) == tolower(*word2)) { - word1++; - word2++; - } - return *word1 == *word2; -} - - - -void sld(sl *lst) { - for (ulonglong i = 0; i < lst->count; i++) { - printf("<%llu:%s>\n", i, lst->strings[i]); - } -} - -char *remove_preserved_chars(char *content) { - char *cc = (char *)malloc(strlen(content) + 1); - *cc = 0; - char *ccp = cc; - while (*content) { - if (*content == '<' || *content == '>' || *content == ':') { - content++; - continue; +int is_forbidden(const char* word) { + for (size_t i = 0; forbidden_words[i] != NULL; i++) { + if (strcmp(word, forbidden_words[i]) == 0) { + return 1; // Word is forbidden } - *ccp = *content; - ccp++; - *ccp = 0; - content++; } - return cc; -} -//Memory usage: 29 TB, 213.322.618 (re)allocated, 106.670.251 unqiue free'd, 0 in use. - -char *slds(sl *lst) { - str_t *buffer = strn(1337); - for (ulonglong i = 0; i < lst->count; i++) { - char *temp = (char *)malloc(strlen(lst->strings[i]) + 20); - char *cc = remove_preserved_chars(lst->strings[i]); - sprintf(temp, "<%llu:%s>\n", i, cc); - free(cc); - stra(buffer, temp); - free(temp); - } - return strc(buffer); + return 0; // Word is not forbidden } -bool isws(char c) { return c == '\t' || c == '\n' || c == ' ' || c == ','; } - - -char *fread_till_eof(FILE *f) { - char c; - str_t *buffer = strn(1337); - while ((c = fgetc(f)) != EOF) { - strac(buffer, c); +char* read_file(const char* filename) { + FILE *file = fopen(filename, "r"); + if (!file) { + printf("File doesn't exist: %s\n", filename); + return NULL; } - char *content = strc(buffer); + + char *content = NULL; + size_t content_size = 0; + size_t bytes_read; + + do { + char *new_content = (char *)realloc(content, content_size + MAX_TEXT_LENGTH); + if (!new_content) { + free(content); + fclose(file); + printf("Memory allocation failed while reading file: %s\n", filename); + return NULL; + } + content = new_content; + bytes_read = fread(content + content_size, 1, MAX_TEXT_LENGTH, file); + content_size += bytes_read; + } while (bytes_read == MAX_TEXT_LENGTH); + + content[content_size] = '\0'; // Null-terminate the string + fclose(file); return content; } -int get_sentences(char *content) { - int count = 0; - char *sentence_buffer = (char *)malloc(strlen(content) + 1); - char *sentence_buffer_p = sentence_buffer; - bool in_line = false; - while (*content) { - if ((*content == ' ' || *content == '\t' || *content == '\n') && !in_line) { - content++; - continue; - } else { - in_line = true; - } - if (*content == '.') { - *sentence_buffer_p = *content; - sentence_buffer_p++; - *sentence_buffer_p = 0; - count++; - sentence_buffer_p = sentence_buffer; - *sentence_buffer = 0; - content++; - in_line = false; - continue; - } - *sentence_buffer_p = *content; - sentence_buffer_p++; - *sentence_buffer_p = 0; - content++; - } - free(sentence_buffer); - return count; -} +void* analyze_file(void* arg) { + AnalysisResult *result = (AnalysisResult *)arg; + char *text = read_file(result->filename); + if (text) { + long long word_count = 0; + long long capitalized_count = 0; + long long sentence_count = 0; + long long number_count = 0; + long long forbidden_count = 0; - -bool is_forbidden_word(char *word) { - - for (int j = 0; forbidden_words[j] != NULL; j++) { - if (stricmp(word, forbidden_words[j])) { - return true; - } - } - return false; -} - -int get_words(char *content, int * count_caps, int *fw_count) { - int count = 0; - char *word_buffer = (char *)malloc(strlen(content) + 1); - char *word_buffer_p = word_buffer; - *word_buffer_p = 0; - bool has_lcase = false; - // rbuffer_t * buffer = rbuffer_new(NULL,0); - while (*content) { - if (*content == ' ' || *content == '\t' || *content == '\n') { - if (word_buffer_p != word_buffer) { - if(!has_lcase) - { - (*count_caps)++; - } - count++; - if(is_forbidden_word(word_buffer)){ - (*fw_count)++; - } - word_buffer_p = word_buffer; - *word_buffer = 0; - + for (size_t i = 0; text[i] != '\0'; i++) { + if (text[i] == '.') { + sentence_count++; } - has_lcase = false; - content++; - continue; } - *word_buffer_p = *content; - if(islower(*content) == *content) - has_lcase = true; - word_buffer_p++; - *word_buffer_p = 0; - content++; - } - free(word_buffer); - return count; -} -bool is_fully_capitalized_word(char *word) { - while (*word) { - if (isalnum(*word) && toupper(*word) != *word) - return false; - word++; - } - return true; -} + char *saveptr; + char* token = strtok_r(text, " .?!;:\n", &saveptr); + while (token != NULL) { + word_count++; -int get_capitalized_words(sl *all_words) { - int count = 0; - for (uint i = 0; i < all_words->count; i++) { - if (is_fully_capitalized_word(all_words->strings[i])) { - count++; + if (isupper(token[0])) { + capitalized_count++; + } + + for (size_t i = 0; token[i] != '\0'; i++) { + if (isdigit(token[i])) { + number_count++; + break; + } + } + + if (is_forbidden(token)) { + forbidden_count++; + } + + token = strtok_r(NULL, " .?!;:\n", &saveptr); } + + result->total_word_count = word_count; + result->total_capitalized_count = capitalized_count; + result->total_sentence_count = sentence_count; + result->total_number_count = number_count; + result->total_forbidden_count = forbidden_count; + + free(text); } - - return count; -} - -char *clean_content(char *content) { - char *allowed_ichars = "01234567891abcdefghijklmnopqrstuvwxyz.,!?"; - char *clean_content = (char *)malloc(strlen(content) + 1); - char *clean_content_p = clean_content; - *clean_content_p = 0; - while (*content) { - - if (strchr(allowed_ichars, tolower(*content))) { - *clean_content_p = *content; - clean_content_p++; - *clean_content_p = 0; - } - content++; - } - return clean_content; -} - -int get_numbers(char *cc) { - int count = 0; - char *ccc = cc; - char *cccp = ccc; - char *number_buffer = (char *)malloc(strlen(ccc) + 1); - *number_buffer = 0; - char *number_buffer_p = number_buffer; - while (*cccp) { - if (isdigit((*cccp))) { - *number_buffer_p = *cccp; - number_buffer_p++; - *number_buffer_p = 0; - } else if (number_buffer != number_buffer_p) { - count++; - *number_buffer = 0; - number_buffer_p = number_buffer; - } - cccp++; - } - free(number_buffer); - return count; -} - - -unsigned int total = 0; - -char *readall(FILE *f) { - if (fseek(f, 0, SEEK_END) != 0) { - fclose(f); - return NULL; - } - size_t file_size = ftell(f); - if (file_size == (size_t)-1L) { - fclose(f); - return NULL; - } - if (fseek(f, 0, SEEK_SET) != 0) { - fclose(f); - return NULL; - } - char *buffer = (char *)malloc(file_size + 1); - if (!buffer) { - fclose(f); - return NULL; - } - size_t bytes_read = fread(buffer, 1, file_size, f); - buffer[bytes_read] = 0; - return buffer; -} - -void analyze(FILE *f) { - if(!f){ - // File doesn't exist - return; - } - total = total + 1; - - printf("#%u\n", total); - - char *data = readall(f); - if(!data) - return; - char *clean_data = clean_content(data); - int capitalized_words = 0; - int fw = 0; - int words = get_words(data,&capitalized_words,&fw); - int sentences = get_sentences(data); - int numbers = get_numbers(clean_data); - - // All words - printf("Words: %d\n", words); - - // All capitalized words - printf("Capitalized words: %d\n", capitalized_words); - - // All sentences - printf("Sentences: %i\n", sentences); - - // Numbers - printf("Numbers: %d\n", numbers); - - // Forbidden words - printf("Forbidden words: %d\n", fw); - - if (words) { - double capitalized_word_percentage = 100 * ((double)capitalized_words / (double)words); - - printf("Capitalized percentage: %f%%\n", capitalized_word_percentage); - double forbidden_word_percentage = 100 * ((double)fw / (double)words); - printf("Forbidden percentage: %f%%\n", forbidden_word_percentage); - ulonglong word_count_per_sentence = words / (sentences ? sentences : 1); - printf("Word count per sentence: %llu\n", word_count_per_sentence); - } - free(clean_data); - free(data); -} - -void analyze_file(char *path) { - FILE *f = fopen(path, "r"); - if(f){ - analyze(f); - fclose(f); - }else{ - printf("File doesn't exist: %s\n",path); - } -} -void * analyze_file_thread(void *path){ - analyze_file((char *)path); return NULL; } int main(int argc, char *argv[]) { - if (argc > 1) { - pthread_t *threads = (pthread_t *)malloc(argc * sizeof(pthread_t)); - for (int i = 1; i < argc; i++) { - pthread_create(&threads[i-1],NULL,analyze_file_thread,(void *)argv[i]); - } - for(int i = 1; i < argc; i++){ - pthread_join(threads[i-1],NULL); - } - free(threads); - return 0; + if (argc < 2) { + printf("Usage: %s <file1> <file2> ... <fileN>\n", argv[0]); + return 1; } - analyze(stdin); - printf("%s\n", rmalloc_stats()); - exit(0); + + pthread_t threads[argc - 1]; + AnalysisResult results[argc - 1]; + + for (size_t i = 1; i < argc; i++) { + results[i - 1].filename = argv[i]; + if (pthread_create(&threads[i - 1], NULL, analyze_file, &results[i - 1]) != 0) { + printf("Error creating thread for file: %s\n", argv[i]); + return 1; + } + } + + for (size_t i = 1; i < argc; i++) { + pthread_join(threads[i - 1], NULL); + } + + long long total_word_count = 0; + long long total_capitalized_count = 0; + long long total_sentence_count = 0; + long long total_number_count = 0; + long long total_forbidden_count = 0; + + for (size_t i = 0; i < argc - 1; i++) { + total_word_count += results[i].total_word_count; + total_capitalized_count += results[i].total_capitalized_count; + total_sentence_count += results[i].total_sentence_count; + total_number_count += results[i].total_number_count; + total_forbidden_count += results[i].total_forbidden_count; + } + + double capitalized_percentage = (total_word_count > 0) ? (double)total_capitalized_count / total_word_count * 100.0 : 0; + double forbidden_percentage = (total_word_count > 0) ? (double)total_forbidden_count / total_word_count * 100.0 : 0; + double word_count_per_sentence = (total_sentence_count > 0) ? (double)total_word_count / total_sentence_count : 0; + + printf("\nTotal Words: %lld\n", total_word_count); + printf("Total Capitalized words: %lld\n", total_capitalized_count); + printf("Total Sentences: %lld\n", total_sentence_count); + printf("Total Numbers: %lld\n", total_number_count); + printf("Total Forbidden words: %lld\n", total_forbidden_count); + printf("Capitalized percentage: %.6f%%\n", capitalized_percentage); + printf("Forbidden percentage: %.6f%%\n", forbidden_percentage); + printf("Word count per sentence: %.6f\n", word_count_per_sentence); + printf("Total files read: %d\n", (int)(argc - 1)); return 0; -} +} \ No newline at end of file diff --git a/retoor_c/isspam.c.bak b/retoor_c/isspam.c.bak new file mode 100644 index 0000000..145e2be --- /dev/null +++ b/retoor_c/isspam.c.bak @@ -0,0 +1,332 @@ +#include "rmalloc.h" +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> + +#include "rstr.h" +#include "rstring_list.h" +#include <ctype.h> + +#define sl rstring_list_t +#define slf rstring_list_free +#define sla rstring_list_add +#define sln rstring_list_new +#define rb rbuffer_t +#define rbf rbuffer_free +#define rbs rbuffer_to_string +#define rbw rbuffer_write +#define rbn rbuffer_new + +char *forbidden_words[] = { + "recovery", "techie", "http", "https", "digital", "hack", "::", "//", "com", + "@", "crypto", "bitcoin", "wallet", "hacker", "welcome", "whatsapp", "email", "cryptocurrency", + "stolen", "freeze", "quick", "crucial", "tracing", "scammers", "expers", "hire", "century", + "transaction", "essential", "managing", "contact", "contacting", "understanding", "assets", "funds", NULL}; + + +bool stricmp(char *word1, char *word2) { + while (*word1 && tolower(*word1) == tolower(*word2)) { + word1++; + word2++; + } + return *word1 == *word2; +} + + + +void sld(sl *lst) { + for (ulonglong i = 0; i < lst->count; i++) { + printf("<%llu:%s>\n", i, lst->strings[i]); + } +} + +char *remove_preserved_chars(char *content) { + char *cc = (char *)malloc(strlen(content) + 1); + *cc = 0; + char *ccp = cc; + while (*content) { + if (*content == '<' || *content == '>' || *content == ':') { + content++; + continue; + } + *ccp = *content; + ccp++; + *ccp = 0; + content++; + } + return cc; +} +//Memory usage: 29 TB, 213.322.618 (re)allocated, 106.670.251 unqiue free'd, 0 in use. + +char *slds(sl *lst) { + str_t *buffer = strn(1337); + for (ulonglong i = 0; i < lst->count; i++) { + char *temp = (char *)malloc(strlen(lst->strings[i]) + 20); + char *cc = remove_preserved_chars(lst->strings[i]); + sprintf(temp, "<%llu:%s>\n", i, cc); + free(cc); + stra(buffer, temp); + free(temp); + } + return strc(buffer); +} + +bool isws(char c) { return c == '\t' || c == '\n' || c == ' ' || c == ','; } + + +char *fread_till_eof(FILE *f) { + char c; + str_t *buffer = strn(1337); + while ((c = fgetc(f)) != EOF) { + strac(buffer, c); + } + char *content = strc(buffer); + return content; +} + +int get_sentences(char *content) { + int count = 0; + char *sentence_buffer = (char *)malloc(strlen(content) + 1); + char *sentence_buffer_p = sentence_buffer; + bool in_line = false; + while (*content) { + if ((*content == ' ' || *content == '\t' || *content == '\n') && !in_line) { + content++; + continue; + } else { + in_line = true; + } + if (*content == '.') { + *sentence_buffer_p = *content; + sentence_buffer_p++; + *sentence_buffer_p = 0; + count++; + sentence_buffer_p = sentence_buffer; + *sentence_buffer = 0; + content++; + in_line = false; + continue; + } + *sentence_buffer_p = *content; + sentence_buffer_p++; + *sentence_buffer_p = 0; + content++; + } + free(sentence_buffer); + return count; +} + + +bool is_forbidden_word(char *word) { + + for (int j = 0; forbidden_words[j] != NULL; j++) { + if (stricmp(word, forbidden_words[j])) { + return true; + } + } + return false; +} + +int get_words(char *content, int * count_caps, int *fw_count) { + int count = 0; + char *word_buffer = (char *)malloc(strlen(content) + 1); + char *word_buffer_p = word_buffer; + *word_buffer_p = 0; + bool has_lcase = false; + // rbuffer_t * buffer = rbuffer_new(NULL,0); + while (*content) { + if (*content == ' ' || *content == '\t' || *content == '\n') { + if (word_buffer_p != word_buffer) { + if(!has_lcase) + { + (*count_caps)++; + } + count++; + if(is_forbidden_word(word_buffer)){ + (*fw_count)++; + } + word_buffer_p = word_buffer; + *word_buffer = 0; + + } + has_lcase = false; + content++; + continue; + } + *word_buffer_p = *content; + if(islower(*content) == *content) + has_lcase = true; + word_buffer_p++; + *word_buffer_p = 0; + content++; + } + free(word_buffer); + return count; +} + +bool is_fully_capitalized_word(char *word) { + while (*word) { + if (isalnum(*word) && toupper(*word) != *word) + return false; + word++; + } + return true; +} + +int get_capitalized_words(sl *all_words) { + int count = 0; + for (uint i = 0; i < all_words->count; i++) { + if (is_fully_capitalized_word(all_words->strings[i])) { + count++; + } + } + + return count; +} + +char *clean_content(char *content) { + char *allowed_ichars = "01234567891abcdefghijklmnopqrstuvwxyz.,!?"; + char *clean_content = (char *)malloc(strlen(content) + 1); + char *clean_content_p = clean_content; + *clean_content_p = 0; + while (*content) { + + if (strchr(allowed_ichars, tolower(*content))) { + *clean_content_p = *content; + clean_content_p++; + *clean_content_p = 0; + } + content++; + } + return clean_content; +} + +int get_numbers(char *cc) { + int count = 0; + char *ccc = cc; + char *cccp = ccc; + char *number_buffer = (char *)malloc(strlen(ccc) + 1); + *number_buffer = 0; + char *number_buffer_p = number_buffer; + while (*cccp) { + if (isdigit((*cccp))) { + *number_buffer_p = *cccp; + number_buffer_p++; + *number_buffer_p = 0; + } else if (number_buffer != number_buffer_p) { + count++; + *number_buffer = 0; + number_buffer_p = number_buffer; + } + cccp++; + } + free(number_buffer); + return count; +} + + +unsigned int total = 0; + +char *readall(FILE *f) { + if (fseek(f, 0, SEEK_END) != 0) { + fclose(f); + return NULL; + } + size_t file_size = ftell(f); + if (file_size == (size_t)-1L) { + fclose(f); + return NULL; + } + if (fseek(f, 0, SEEK_SET) != 0) { + fclose(f); + return NULL; + } + char *buffer = (char *)malloc(file_size + 1); + if (!buffer) { + fclose(f); + return NULL; + } + size_t bytes_read = fread(buffer, 1, file_size, f); + buffer[bytes_read] = 0; + return buffer; +} + +void analyze(FILE *f) { + if(!f){ + // File doesn't exist + return; + } + total = total + 1; + + printf("#%u\n", total); + + char *data = readall(f); + if(!data) + return; + char *clean_data = clean_content(data); + int capitalized_words = 0; + int fw = 0; + int words = get_words(data,&capitalized_words,&fw); + int sentences = get_sentences(data); + int numbers = get_numbers(clean_data); + + // All words + printf("Words: %d\n", words); + + // All capitalized words + printf("Capitalized words: %d\n", capitalized_words); + + // All sentences + printf("Sentences: %i\n", sentences); + + // Numbers + printf("Numbers: %d\n", numbers); + + // Forbidden words + printf("Forbidden words: %d\n", fw); + + if (words) { + double capitalized_word_percentage = 100 * ((double)capitalized_words / (double)words); + + printf("Capitalized percentage: %f%%\n", capitalized_word_percentage); + double forbidden_word_percentage = 100 * ((double)fw / (double)words); + printf("Forbidden percentage: %f%%\n", forbidden_word_percentage); + ulonglong word_count_per_sentence = words / (sentences ? sentences : 1); + printf("Word count per sentence: %llu\n", word_count_per_sentence); + } + free(clean_data); + free(data); +} + +void analyze_file(char *path) { + FILE *f = fopen(path, "r"); + if(f){ + analyze(f); + fclose(f); + }else{ + printf("File doesn't exist: %s\n",path); + } +} +void * analyze_file_thread(void *path){ + analyze_file((char *)path); + return NULL; +} + +int main(int argc, char *argv[]) { + if (argc > 1) { + pthread_t *threads = (pthread_t *)malloc(argc * sizeof(pthread_t)); + for (int i = 1; i < argc; i++) { + pthread_create(&threads[i-1],NULL,analyze_file_thread,(void *)argv[i]); + } + for(int i = 1; i < argc; i++){ + pthread_join(threads[i-1],NULL); + } + free(threads); + return 0; + } + analyze(stdin); + printf("%s\n", rmalloc_stats()); + exit(0); + return 0; +}