diff --git a/Makefile b/Makefile index 078b003..22f42fc 100644 --- a/Makefile +++ b/Makefile @@ -1,7 +1,7 @@ CC = gcc CFLAGS = -Wall -Werror -Wextra -Ofast -std=c2x -all: build run valgrind build_risspam run_risspam +all: build run valgrind build_risspam run_risspam benchmark build: @echo "Compiling retoor_c project.". @@ -39,3 +39,10 @@ run_not_spam_risspam: valgrind: build valgrind ./isspam ./spam/*.txt + +benchmark: + -@rm -rf books + echo "Extracting books." + tar -xzf books.tar.gz books/ + echo "Extracted books." + python bench.py diff --git a/isspam b/isspam index 4e85c07..adb20b9 100755 Binary files a/isspam and b/isspam differ diff --git a/retoor_c/isspam.c b/retoor_c/isspam.c index 2907e89..97584fa 100644 --- a/retoor_c/isspam.c +++ b/retoor_c/isspam.c @@ -4,8 +4,8 @@ #include #include -#include "rstring_list.h" #include "rstr.h" +#include "rstring_list.h" #include #define sl rstring_list_t @@ -25,24 +25,16 @@ char *forbidden_words[] = { "transaction", "essential", "managing", "contact", "contacting", "understanding", "assets", "funds", NULL}; -bool show_capitalized = false; -bool show_sentences = false; -bool show_words = false; -bool show_numbers = false; -bool show_forbidden_words = true; - - - - -bool file_exists(char * path){ - FILE * f = fopen(path, "r"); - bool result = f != NULL; - if(f){ - fclose(f); +bool stricmp(char *word1, char *word2) { + while (*word1 && tolower(*word1) == tolower(*word2)) { + word1++; + word2++; } - return result; + return *word1 == *word2; } + + void sld(sl *lst) { for (ulonglong i = 0; i < lst->count; i++) { printf("<%llu:%s>\n", i, lst->strings[i]); @@ -65,6 +57,7 @@ char *remove_preserved_chars(char *content) { } return cc; } +//Memory usage: 29 TB, 213.322.618 (re)allocated, 106.670.251 unqiue free'd, 0 in use. char *slds(sl *lst) { str_t *buffer = strn(1337); @@ -81,20 +74,6 @@ char *slds(sl *lst) { bool isws(char c) { return c == '\t' || c == '\n' || c == ' ' || c == ','; } -char *stripws(char *content) { - char *cc = (char *)malloc(strlen(content) + 1); - *cc = 0; - char *ccp = cc; - while (*content) { - if (!isws(*content)) { - *ccp = *content; - ccp++; - *ccp = 0; - } - content++; - } - return cc; -} char *fread_till_eof(FILE *f) { char c; @@ -106,12 +85,10 @@ char *fread_till_eof(FILE *f) { return content; } -rstring_list_t *get_sentences(char *content) { - - rstring_list_t *sentences = rstring_list_new(); +int get_sentences(char *content) { + int count = 0; char *sentence_buffer = (char *)malloc(strlen(content) + 1); char *sentence_buffer_p = sentence_buffer; - // rbuffer_t * buffer = rbuffer_new(NULL,0); bool in_line = false; while (*content) { if ((*content == ' ' || *content == '\t' || *content == '\n') && !in_line) { @@ -124,7 +101,7 @@ rstring_list_t *get_sentences(char *content) { *sentence_buffer_p = *content; sentence_buffer_p++; *sentence_buffer_p = 0; - rstring_list_add(sentences, sentence_buffer); + count++; sentence_buffer_p = sentence_buffer; *sentence_buffer = 0; content++; @@ -137,32 +114,55 @@ rstring_list_t *get_sentences(char *content) { content++; } free(sentence_buffer); - return sentences; + return count; } -rstring_list_t *get_words(char *content) { - rstring_list_t *words = rstring_list_new(); + +bool is_forbidden_word(char *word) { + + for (int j = 0; forbidden_words[j] != NULL; j++) { + if (stricmp(word, forbidden_words[j])) { + return true; + } + } + return false; +} + +int get_words(char *content, int * count_caps, int *fw_count) { + int count = 0; char *word_buffer = (char *)malloc(strlen(content) + 1); char *word_buffer_p = word_buffer; *word_buffer_p = 0; + bool has_lcase = false; // rbuffer_t * buffer = rbuffer_new(NULL,0); while (*content) { if (*content == ' ' || *content == '\t' || *content == '\n') { if (word_buffer_p != word_buffer) { - rstring_list_add(words, word_buffer); + if(!has_lcase) + { + (*count_caps)++; + } + count++; + if(is_forbidden_word(word_buffer)){ + (*fw_count)++; + } word_buffer_p = word_buffer; *word_buffer = 0; + } + has_lcase = false; content++; continue; } *word_buffer_p = *content; + if(islower(*content) == *content) + has_lcase = true; word_buffer_p++; *word_buffer_p = 0; content++; } free(word_buffer); - return words; + return count; } bool is_fully_capitalized_word(char *word) { @@ -174,23 +174,24 @@ bool is_fully_capitalized_word(char *word) { return true; } -sl *get_capitalized_words(sl *all_words) { - sl *capitalized_words = sln(); - for (uint i = 0; i < all_words->count; i++) { - if (is_fully_capitalized_word(all_words->strings[i])) { - rstring_list_add(capitalized_words, all_words->strings[i]); - } +int get_capitalized_words(sl *all_words) { + int count = 0; + for (uint i = 0; i < all_words->count; i++) { + if (is_fully_capitalized_word(all_words->strings[i])) { + count++; } - - return capitalized_words; + } + + return count; } char *clean_content(char *content) { - char *allowed_ichars = "01234567891abcdefghijklmnopqrstuvwxyz \n.,!?"; + char *allowed_ichars = "01234567891abcdefghijklmnopqrstuvwxyz.,!?"; char *clean_content = (char *)malloc(strlen(content) + 1); char *clean_content_p = clean_content; *clean_content_p = 0; while (*content) { + if (strchr(allowed_ichars, tolower(*content))) { *clean_content_p = *content; clean_content_p++; @@ -201,176 +202,123 @@ char *clean_content(char *content) { return clean_content; } -sl *get_numbers(char *content) { - char *cc = clean_content(content); - char *ccc = stripws(cc); +int get_numbers(char *cc) { + int count = 0; + char *ccc = cc; char *cccp = ccc; - free(cc); char *number_buffer = (char *)malloc(strlen(ccc) + 1); *number_buffer = 0; char *number_buffer_p = number_buffer; - sl *numbers = sln(); while (*cccp) { if (isdigit((*cccp))) { *number_buffer_p = *cccp; number_buffer_p++; *number_buffer_p = 0; } else if (number_buffer != number_buffer_p) { - sla(numbers, number_buffer); + count++; *number_buffer = 0; number_buffer_p = number_buffer; } cccp++; } free(number_buffer); - free(ccc); - return numbers; + return count; } -bool stricmp(char *word1, char *word2) { - while (*word1 && tolower(*word1) == tolower(*word2)) { - word1++; - word2++; - } - return *word1 == *word2; -} -bool containswordi(sl *words, char *word) { - for (uint i = 0; i < words->count; i++) { - if (stricmp(words->strings[i], word)) - return true; - } - return false; -} - -sl *get_forbidden_words(sl *words) { - sl *found = sln(); - for (int j = 0; forbidden_words[j] != NULL; j++) { - if (containswordi(words, forbidden_words[j])) { - rstring_list_add(found, forbidden_words[j]); - } - } - return found; -} unsigned int total = 0; +char *readall(FILE *f) { + if (fseek(f, 0, SEEK_END) != 0) { + fclose(f); + return NULL; + } + size_t file_size = ftell(f); + if (file_size == (size_t)-1L) { + fclose(f); + return NULL; + } + if (fseek(f, 0, SEEK_SET) != 0) { + fclose(f); + return NULL; + } + char *buffer = (char *)malloc(file_size + 1); + if (!buffer) { + fclose(f); + return NULL; + } + size_t bytes_read = fread(buffer, 1, file_size, f); + buffer[bytes_read] = 0; + return buffer; +} + void analyze(FILE *f) { + if(!f){ + // File doesn't exist + return; + } total = total + 1; printf("#%u\n", total); - char *data = fread_till_eof(f); - - str_t *all = strn(1337); - char *sbuf = NULL; + char *data = readall(f); + if(!data) + return; char *clean_data = clean_content(data); - - free(clean_data); - - sl *words = get_words(data); + int capitalized_words = 0; + int fw = 0; + int words = get_words(data,&capitalized_words,&fw); + int sentences = get_sentences(data); + int numbers = get_numbers(clean_data); // All words - printf("Words: %llu\n", words->count); - if(show_words) - sld(words); - sbuf = slds(words); - stra(all, sbuf); - free(sbuf); + printf("Words: %d\n", words); // All capitalized words - sl *capitalized_words = get_capitalized_words(words); - ulonglong capitalized_words_count = capitalized_words->count; - printf("Capitalized words: %llu\n", capitalized_words_count); - if(show_capitalized) - sld(capitalized_words); - sbuf = slds(capitalized_words); - stra(all, sbuf); - free(sbuf); - - sl *sentences = get_sentences(data); - + printf("Capitalized words: %d\n", capitalized_words); + // All sentences - printf("Sentences: %llu\n", sentences->count); - if(show_sentences) - sld(sentences); - sbuf = slds(sentences); - stra(all, sbuf); - free(sbuf); - - + printf("Sentences: %i\n", sentences); // Numbers - sl *numbers = get_numbers(data); - printf("Numbers: %llu\n", numbers->count); - if(show_numbers) - sld(numbers); - sbuf = slds(numbers); - stra(all, sbuf); - free(sbuf); + printf("Numbers: %d\n", numbers); // Forbidden words - sl *fw = get_forbidden_words(words); - printf("Forbidden words: %llu\n", fw->count); - if(show_forbidden_words) - sld(fw); - sbuf = slds(fw); - stra(all, sbuf); - free(sbuf); - strd(all); - if(words->count){ - double capitalized_word_percentage = 100 * ((double)capitalized_words->count / (double)words->count); - - printf("Capitalized percentage: %f%%\n",capitalized_word_percentage); - double forbidden_word_percentage = 100 * ((double)fw->count / (double)words->count); - printf("Forbidden percentage: %f%%\n",forbidden_word_percentage); - ulonglong word_count_per_sentence = words->count / (sentences->count ? sentences->count : 1); - printf("Word count per sentence: %llu\n", word_count_per_sentence); - } - slf(capitalized_words); - slf(sentences); - slf(words); - slf(numbers); - slf(fw); + printf("Forbidden words: %d\n", fw); + + if (words) { + double capitalized_word_percentage = 100 * ((double)capitalized_words / (double)words); + printf("Capitalized percentage: %f%%\n", capitalized_word_percentage); + double forbidden_word_percentage = 100 * ((double)fw / (double)words); + printf("Forbidden percentage: %f%%\n", forbidden_word_percentage); + ulonglong word_count_per_sentence = words / (sentences ? sentences : 1); + printf("Word count per sentence: %llu\n", word_count_per_sentence); + } + free(clean_data); free(data); } void analyze_file(char *path) { FILE *f = fopen(path, "r"); + if(f){ analyze(f); fclose(f); + }else{ + printf("File doesn't exist: %s\n",path); + } } int main(int argc, char *argv[]) { - + if (argc > 1) { for (int i = 1; i < argc; i++) { - if(!strcmp(argv[1],"--hide-capitalized")){ - show_capitalized=false; - }else if(!strcmp(argv[1],"--show-sentences")){ - show_sentences=true; - }else if(!strcmp(argv[1],"--show-words")){ - show_words=true; - }else if(!strcmp(argv[1],"--show-numbers")){ - show_words=true; - }else if(!strcmp(argv[1],"--hide-forbidden-words")){ - show_forbidden_words=false; - }else if(!strcmp(argv[1],"help") || !strcmp(argv[1],"--help")){ - printf("%s", - "Usage: spam [file] [file] [file]\n" - "Flag defaults:\n" - " hide-capitalized = true\n" - " show-sentences = false\n" - " show-words = false\n" - " show-numbers = false\n" - " hide-forbidden-words = false\n"); - return 0; - } - + printf("File: %s\n", argv[i]); + analyze_file(argv[i]); - printf("%s\n", rmalloc_stats()); printf("\n"); + } return 0; diff --git a/retoor_c/rstr.h b/retoor_c/rstr.h index 9ae9c72..8ad3cb6 100644 --- a/retoor_c/rstr.h +++ b/retoor_c/rstr.h @@ -26,8 +26,6 @@ void stra(str_t *str, const char *to_append) { if (required_new_length > str->size) { str->size += required_new_length + str->buffer_size; str->content = (char *)realloc(str->content, str->size + 1); - } else { - // printf("NO NDEED\n"); } strcat(str->content, to_append); str->content[str->length] = 0; @@ -49,4 +47,4 @@ char *strc(str_t *str) { return content; } -#endif \ No newline at end of file +#endif