Performance upgrade C version.
This commit is contained in:
parent
6be2d372f2
commit
00d8bdf3ce
9
Makefile
9
Makefile
@ -1,7 +1,7 @@
|
|||||||
CC = gcc
|
CC = gcc
|
||||||
CFLAGS = -Wall -Werror -Wextra -Ofast -std=c2x
|
CFLAGS = -Wall -Werror -Wextra -Ofast -std=c2x
|
||||||
|
|
||||||
all: build run valgrind build_risspam run_risspam
|
all: build run valgrind build_risspam run_risspam benchmark
|
||||||
|
|
||||||
build:
|
build:
|
||||||
@echo "Compiling retoor_c project.".
|
@echo "Compiling retoor_c project.".
|
||||||
@ -39,3 +39,10 @@ run_not_spam_risspam:
|
|||||||
|
|
||||||
valgrind: build
|
valgrind: build
|
||||||
valgrind ./isspam ./spam/*.txt
|
valgrind ./isspam ./spam/*.txt
|
||||||
|
|
||||||
|
benchmark:
|
||||||
|
-@rm -rf books
|
||||||
|
echo "Extracting books."
|
||||||
|
tar -xzf books.tar.gz books/
|
||||||
|
echo "Extracted books."
|
||||||
|
python bench.py
|
||||||
|
@ -4,8 +4,8 @@
|
|||||||
#include <string.h>
|
#include <string.h>
|
||||||
#include <unistd.h>
|
#include <unistd.h>
|
||||||
|
|
||||||
#include "rstring_list.h"
|
|
||||||
#include "rstr.h"
|
#include "rstr.h"
|
||||||
|
#include "rstring_list.h"
|
||||||
#include <ctype.h>
|
#include <ctype.h>
|
||||||
|
|
||||||
#define sl rstring_list_t
|
#define sl rstring_list_t
|
||||||
@ -25,24 +25,16 @@ char *forbidden_words[] = {
|
|||||||
"transaction", "essential", "managing", "contact", "contacting", "understanding", "assets", "funds", NULL};
|
"transaction", "essential", "managing", "contact", "contacting", "understanding", "assets", "funds", NULL};
|
||||||
|
|
||||||
|
|
||||||
bool show_capitalized = false;
|
bool stricmp(char *word1, char *word2) {
|
||||||
bool show_sentences = false;
|
while (*word1 && tolower(*word1) == tolower(*word2)) {
|
||||||
bool show_words = false;
|
word1++;
|
||||||
bool show_numbers = false;
|
word2++;
|
||||||
bool show_forbidden_words = true;
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
bool file_exists(char * path){
|
|
||||||
FILE * f = fopen(path, "r");
|
|
||||||
bool result = f != NULL;
|
|
||||||
if(f){
|
|
||||||
fclose(f);
|
|
||||||
}
|
}
|
||||||
return result;
|
return *word1 == *word2;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
void sld(sl *lst) {
|
void sld(sl *lst) {
|
||||||
for (ulonglong i = 0; i < lst->count; i++) {
|
for (ulonglong i = 0; i < lst->count; i++) {
|
||||||
printf("<%llu:%s>\n", i, lst->strings[i]);
|
printf("<%llu:%s>\n", i, lst->strings[i]);
|
||||||
@ -65,6 +57,7 @@ char *remove_preserved_chars(char *content) {
|
|||||||
}
|
}
|
||||||
return cc;
|
return cc;
|
||||||
}
|
}
|
||||||
|
// Memory usage: 29 TB, 213.322.618 (re)allocated, 106.670.251 unique free'd, 0 in use.
|
||||||
|
|
||||||
char *slds(sl *lst) {
|
char *slds(sl *lst) {
|
||||||
str_t *buffer = strn(1337);
|
str_t *buffer = strn(1337);
|
||||||
@ -81,20 +74,6 @@ char *slds(sl *lst) {
|
|||||||
|
|
||||||
bool isws(char c) { return c == '\t' || c == '\n' || c == ' ' || c == ','; }
|
bool isws(char c) { return c == '\t' || c == '\n' || c == ' ' || c == ','; }
|
||||||
|
|
||||||
/*
 * Build a copy of `content` with every character for which isws() is true
 * left out.
 *
 * The result is a freshly malloc'ed, NUL-terminated string sized for the
 * worst case (nothing removed); the caller is responsible for free()ing it.
 */
char *stripws(char *content) {
    char *stripped = (char *)malloc(strlen(content) + 1);
    size_t kept = 0;

    stripped[0] = 0;
    for (const char *src = content; *src; src++) {
        if (isws(*src)) {
            continue;
        }
        stripped[kept++] = *src;
        /* Keep the output terminated after every appended character. */
        stripped[kept] = 0;
    }
    return stripped;
}
|
|
||||||
|
|
||||||
char *fread_till_eof(FILE *f) {
|
char *fread_till_eof(FILE *f) {
|
||||||
char c;
|
char c;
|
||||||
@ -106,12 +85,10 @@ char *fread_till_eof(FILE *f) {
|
|||||||
return content;
|
return content;
|
||||||
}
|
}
|
||||||
|
|
||||||
rstring_list_t *get_sentences(char *content) {
|
int get_sentences(char *content) {
|
||||||
|
int count = 0;
|
||||||
rstring_list_t *sentences = rstring_list_new();
|
|
||||||
char *sentence_buffer = (char *)malloc(strlen(content) + 1);
|
char *sentence_buffer = (char *)malloc(strlen(content) + 1);
|
||||||
char *sentence_buffer_p = sentence_buffer;
|
char *sentence_buffer_p = sentence_buffer;
|
||||||
// rbuffer_t * buffer = rbuffer_new(NULL,0);
|
|
||||||
bool in_line = false;
|
bool in_line = false;
|
||||||
while (*content) {
|
while (*content) {
|
||||||
if ((*content == ' ' || *content == '\t' || *content == '\n') && !in_line) {
|
if ((*content == ' ' || *content == '\t' || *content == '\n') && !in_line) {
|
||||||
@ -124,7 +101,7 @@ rstring_list_t *get_sentences(char *content) {
|
|||||||
*sentence_buffer_p = *content;
|
*sentence_buffer_p = *content;
|
||||||
sentence_buffer_p++;
|
sentence_buffer_p++;
|
||||||
*sentence_buffer_p = 0;
|
*sentence_buffer_p = 0;
|
||||||
rstring_list_add(sentences, sentence_buffer);
|
count++;
|
||||||
sentence_buffer_p = sentence_buffer;
|
sentence_buffer_p = sentence_buffer;
|
||||||
*sentence_buffer = 0;
|
*sentence_buffer = 0;
|
||||||
content++;
|
content++;
|
||||||
@ -137,32 +114,55 @@ rstring_list_t *get_sentences(char *content) {
|
|||||||
content++;
|
content++;
|
||||||
}
|
}
|
||||||
free(sentence_buffer);
|
free(sentence_buffer);
|
||||||
return sentences;
|
return count;
|
||||||
}
|
}
|
||||||
|
|
||||||
rstring_list_t *get_words(char *content) {
|
|
||||||
rstring_list_t *words = rstring_list_new();
|
bool is_forbidden_word(char *word) {
|
||||||
|
|
||||||
|
for (int j = 0; forbidden_words[j] != NULL; j++) {
|
||||||
|
if (stricmp(word, forbidden_words[j])) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
int get_words(char *content, int * count_caps, int *fw_count) {
|
||||||
|
int count = 0;
|
||||||
char *word_buffer = (char *)malloc(strlen(content) + 1);
|
char *word_buffer = (char *)malloc(strlen(content) + 1);
|
||||||
char *word_buffer_p = word_buffer;
|
char *word_buffer_p = word_buffer;
|
||||||
*word_buffer_p = 0;
|
*word_buffer_p = 0;
|
||||||
|
bool has_lcase = false;
|
||||||
// rbuffer_t * buffer = rbuffer_new(NULL,0);
|
// rbuffer_t * buffer = rbuffer_new(NULL,0);
|
||||||
while (*content) {
|
while (*content) {
|
||||||
if (*content == ' ' || *content == '\t' || *content == '\n') {
|
if (*content == ' ' || *content == '\t' || *content == '\n') {
|
||||||
if (word_buffer_p != word_buffer) {
|
if (word_buffer_p != word_buffer) {
|
||||||
rstring_list_add(words, word_buffer);
|
if(!has_lcase)
|
||||||
|
{
|
||||||
|
(*count_caps)++;
|
||||||
|
}
|
||||||
|
count++;
|
||||||
|
if(is_forbidden_word(word_buffer)){
|
||||||
|
(*fw_count)++;
|
||||||
|
}
|
||||||
word_buffer_p = word_buffer;
|
word_buffer_p = word_buffer;
|
||||||
*word_buffer = 0;
|
*word_buffer = 0;
|
||||||
|
|
||||||
}
|
}
|
||||||
|
has_lcase = false;
|
||||||
content++;
|
content++;
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
*word_buffer_p = *content;
|
*word_buffer_p = *content;
|
||||||
|
if(islower(*content) == *content)
|
||||||
|
has_lcase = true;
|
||||||
word_buffer_p++;
|
word_buffer_p++;
|
||||||
*word_buffer_p = 0;
|
*word_buffer_p = 0;
|
||||||
content++;
|
content++;
|
||||||
}
|
}
|
||||||
free(word_buffer);
|
free(word_buffer);
|
||||||
return words;
|
return count;
|
||||||
}
|
}
|
||||||
|
|
||||||
bool is_fully_capitalized_word(char *word) {
|
bool is_fully_capitalized_word(char *word) {
|
||||||
@ -174,23 +174,24 @@ bool is_fully_capitalized_word(char *word) {
|
|||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
sl *get_capitalized_words(sl *all_words) {
|
int get_capitalized_words(sl *all_words) {
|
||||||
sl *capitalized_words = sln();
|
int count = 0;
|
||||||
for (uint i = 0; i < all_words->count; i++) {
|
for (uint i = 0; i < all_words->count; i++) {
|
||||||
if (is_fully_capitalized_word(all_words->strings[i])) {
|
if (is_fully_capitalized_word(all_words->strings[i])) {
|
||||||
rstring_list_add(capitalized_words, all_words->strings[i]);
|
count++;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return capitalized_words;
|
return count;
|
||||||
}
|
}
|
||||||
|
|
||||||
char *clean_content(char *content) {
|
char *clean_content(char *content) {
|
||||||
char *allowed_ichars = "01234567891abcdefghijklmnopqrstuvwxyz \n.,!?";
|
char *allowed_ichars = "01234567891abcdefghijklmnopqrstuvwxyz.,!?";
|
||||||
char *clean_content = (char *)malloc(strlen(content) + 1);
|
char *clean_content = (char *)malloc(strlen(content) + 1);
|
||||||
char *clean_content_p = clean_content;
|
char *clean_content_p = clean_content;
|
||||||
*clean_content_p = 0;
|
*clean_content_p = 0;
|
||||||
while (*content) {
|
while (*content) {
|
||||||
|
|
||||||
if (strchr(allowed_ichars, tolower(*content))) {
|
if (strchr(allowed_ichars, tolower(*content))) {
|
||||||
*clean_content_p = *content;
|
*clean_content_p = *content;
|
||||||
clean_content_p++;
|
clean_content_p++;
|
||||||
@ -201,176 +202,123 @@ char *clean_content(char *content) {
|
|||||||
return clean_content;
|
return clean_content;
|
||||||
}
|
}
|
||||||
|
|
||||||
sl *get_numbers(char *content) {
|
int get_numbers(char *cc) {
|
||||||
char *cc = clean_content(content);
|
int count = 0;
|
||||||
char *ccc = stripws(cc);
|
char *ccc = cc;
|
||||||
char *cccp = ccc;
|
char *cccp = ccc;
|
||||||
free(cc);
|
|
||||||
char *number_buffer = (char *)malloc(strlen(ccc) + 1);
|
char *number_buffer = (char *)malloc(strlen(ccc) + 1);
|
||||||
*number_buffer = 0;
|
*number_buffer = 0;
|
||||||
char *number_buffer_p = number_buffer;
|
char *number_buffer_p = number_buffer;
|
||||||
sl *numbers = sln();
|
|
||||||
while (*cccp) {
|
while (*cccp) {
|
||||||
if (isdigit((*cccp))) {
|
if (isdigit((*cccp))) {
|
||||||
*number_buffer_p = *cccp;
|
*number_buffer_p = *cccp;
|
||||||
number_buffer_p++;
|
number_buffer_p++;
|
||||||
*number_buffer_p = 0;
|
*number_buffer_p = 0;
|
||||||
} else if (number_buffer != number_buffer_p) {
|
} else if (number_buffer != number_buffer_p) {
|
||||||
sla(numbers, number_buffer);
|
count++;
|
||||||
*number_buffer = 0;
|
*number_buffer = 0;
|
||||||
number_buffer_p = number_buffer;
|
number_buffer_p = number_buffer;
|
||||||
}
|
}
|
||||||
cccp++;
|
cccp++;
|
||||||
}
|
}
|
||||||
free(number_buffer);
|
free(number_buffer);
|
||||||
free(ccc);
|
return count;
|
||||||
return numbers;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
 * Case-insensitive string equality.
 *
 * NOTE: unlike the conventional stricmp()/strcasecmp(), which return 0 on
 * equality, this function returns a boolean: true when `word1` and `word2`
 * are equal ignoring case, false otherwise.
 *
 * Both arguments must be non-NULL, NUL-terminated strings.
 */
bool stricmp(char *word1, char *word2) {
    /*
     * <ctype.h> functions require a value representable as unsigned char
     * (or EOF); passing a negative plain char is undefined behaviour, so
     * the strings are walked through unsigned char pointers.
     */
    const unsigned char *a = (const unsigned char *)word1;
    const unsigned char *b = (const unsigned char *)word2;

    while (*a != '\0' && tolower(*a) == tolower(*b)) {
        a++;
        b++;
    }
    /*
     * The loop stops either at the end of word1 or at the first position
     * that differs ignoring case. Only in the former case can the two
     * bytes be equal, and then only if word2 ends at the same place.
     */
    return *a == *b;
}
|
|
||||||
|
|
||||||
/*
 * Scan the string list `words` for an entry that matches `word` according
 * to stricmp().
 *
 * Returns true as soon as a matching entry is found, false if none of the
 * words->count entries match.
 */
bool containswordi(sl *words, char *word) {
    uint idx = 0;

    while (idx < words->count) {
        if (stricmp(words->strings[idx], word)) {
            return true;
        }
        idx++;
    }
    return false;
}
|
|
||||||
|
|
||||||
/*
 * Collect the entries of the NULL-terminated forbidden_words table that
 * occur in `words` (as decided by containswordi()).
 *
 * Returns a new list created with sln(); each matching table entry is
 * appended once, in table order, via rstring_list_add().
 */
sl *get_forbidden_words(sl *words) {
    sl *matches = sln();

    for (char **entry = forbidden_words; *entry != NULL; entry++) {
        if (!containswordi(words, *entry)) {
            continue;
        }
        rstring_list_add(matches, *entry);
    }
    return matches;
}
|
|
||||||
unsigned int total = 0;
|
unsigned int total = 0;
|
||||||
|
|
||||||
|
/*
 * Read the entire contents of a seekable stream into memory.
 *
 * The stream is rewound to its start before reading. On success the
 * function returns a malloc'ed, NUL-terminated buffer that the caller must
 * free(). On failure (NULL stream, seek/tell failure, allocation failure
 * or read error) it returns NULL.
 *
 * Ownership: the stream always stays open and remains owned by the caller.
 * This function never calls fclose(), so a caller that closes the stream
 * unconditionally after the call cannot end up closing it twice.
 */
char *readall(FILE *f) {
    if (f == NULL) {
        return NULL;
    }
    if (fseek(f, 0, SEEK_END) != 0) {
        return NULL;
    }

    /* ftell() reports failure as -1L, so test it before converting. */
    long end = ftell(f);
    if (end < 0) {
        return NULL;
    }
    if (fseek(f, 0, SEEK_SET) != 0) {
        return NULL;
    }

    size_t file_size = (size_t)end;
    char *buffer = malloc(file_size + 1);
    if (buffer == NULL) {
        return NULL;
    }

    /*
     * A short read without an error indicator is accepted (the byte count
     * can legitimately be smaller than the reported size); a real read
     * error is reported to the caller instead of returning partial data.
     */
    size_t bytes_read = fread(buffer, 1, file_size, f);
    if (bytes_read < file_size && ferror(f)) {
        free(buffer);
        return NULL;
    }
    buffer[bytes_read] = '\0';
    return buffer;
}
|
||||||
|
|
||||||
void analyze(FILE *f) {
|
void analyze(FILE *f) {
|
||||||
|
if(!f){
|
||||||
|
// File doesn't exist
|
||||||
|
return;
|
||||||
|
}
|
||||||
total = total + 1;
|
total = total + 1;
|
||||||
|
|
||||||
printf("#%u\n", total);
|
printf("#%u\n", total);
|
||||||
char *data = fread_till_eof(f);
|
|
||||||
|
|
||||||
str_t *all = strn(1337);
|
|
||||||
char *sbuf = NULL;
|
|
||||||
|
|
||||||
|
char *data = readall(f);
|
||||||
|
if(!data)
|
||||||
|
return;
|
||||||
char *clean_data = clean_content(data);
|
char *clean_data = clean_content(data);
|
||||||
|
int capitalized_words = 0;
|
||||||
free(clean_data);
|
int fw = 0;
|
||||||
|
int words = get_words(data,&capitalized_words,&fw);
|
||||||
sl *words = get_words(data);
|
int sentences = get_sentences(data);
|
||||||
|
int numbers = get_numbers(clean_data);
|
||||||
|
|
||||||
// All words
|
// All words
|
||||||
printf("Words: %llu\n", words->count);
|
printf("Words: %d\n", words);
|
||||||
if(show_words)
|
|
||||||
sld(words);
|
|
||||||
sbuf = slds(words);
|
|
||||||
stra(all, sbuf);
|
|
||||||
free(sbuf);
|
|
||||||
|
|
||||||
// All capitalized words
|
// All capitalized words
|
||||||
sl *capitalized_words = get_capitalized_words(words);
|
printf("Capitalized words: %d\n", capitalized_words);
|
||||||
ulonglong capitalized_words_count = capitalized_words->count;
|
|
||||||
printf("Capitalized words: %llu\n", capitalized_words_count);
|
|
||||||
if(show_capitalized)
|
|
||||||
sld(capitalized_words);
|
|
||||||
sbuf = slds(capitalized_words);
|
|
||||||
stra(all, sbuf);
|
|
||||||
free(sbuf);
|
|
||||||
|
|
||||||
sl *sentences = get_sentences(data);
|
|
||||||
|
|
||||||
// All sentences
|
// All sentences
|
||||||
printf("Sentences: %llu\n", sentences->count);
|
printf("Sentences: %i\n", sentences);
|
||||||
if(show_sentences)
|
|
||||||
sld(sentences);
|
|
||||||
sbuf = slds(sentences);
|
|
||||||
stra(all, sbuf);
|
|
||||||
free(sbuf);
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
// Numbers
|
// Numbers
|
||||||
sl *numbers = get_numbers(data);
|
printf("Numbers: %d\n", numbers);
|
||||||
printf("Numbers: %llu\n", numbers->count);
|
|
||||||
if(show_numbers)
|
|
||||||
sld(numbers);
|
|
||||||
sbuf = slds(numbers);
|
|
||||||
stra(all, sbuf);
|
|
||||||
free(sbuf);
|
|
||||||
|
|
||||||
// Forbidden words
|
// Forbidden words
|
||||||
sl *fw = get_forbidden_words(words);
|
printf("Forbidden words: %d\n", fw);
|
||||||
printf("Forbidden words: %llu\n", fw->count);
|
|
||||||
if(show_forbidden_words)
|
|
||||||
sld(fw);
|
|
||||||
sbuf = slds(fw);
|
|
||||||
stra(all, sbuf);
|
|
||||||
free(sbuf);
|
|
||||||
strd(all);
|
|
||||||
if(words->count){
|
|
||||||
double capitalized_word_percentage = 100 * ((double)capitalized_words->count / (double)words->count);
|
|
||||||
|
|
||||||
printf("Capitalized percentage: %f%%\n",capitalized_word_percentage);
|
if (words) {
|
||||||
double forbidden_word_percentage = 100 * ((double)fw->count / (double)words->count);
|
double capitalized_word_percentage = 100 * ((double)capitalized_words / (double)words);
|
||||||
printf("Forbidden percentage: %f%%\n",forbidden_word_percentage);
|
|
||||||
ulonglong word_count_per_sentence = words->count / (sentences->count ? sentences->count : 1);
|
printf("Capitalized percentage: %f%%\n", capitalized_word_percentage);
|
||||||
|
double forbidden_word_percentage = 100 * ((double)fw / (double)words);
|
||||||
|
printf("Forbidden percentage: %f%%\n", forbidden_word_percentage);
|
||||||
|
ulonglong word_count_per_sentence = words / (sentences ? sentences : 1);
|
||||||
printf("Word count per sentence: %llu\n", word_count_per_sentence);
|
printf("Word count per sentence: %llu\n", word_count_per_sentence);
|
||||||
}
|
}
|
||||||
slf(capitalized_words);
|
free(clean_data);
|
||||||
slf(sentences);
|
|
||||||
slf(words);
|
|
||||||
slf(numbers);
|
|
||||||
slf(fw);
|
|
||||||
|
|
||||||
free(data);
|
free(data);
|
||||||
}
|
}
|
||||||
|
|
||||||
void analyze_file(char *path) {
|
void analyze_file(char *path) {
|
||||||
FILE *f = fopen(path, "r");
|
FILE *f = fopen(path, "r");
|
||||||
|
if(f){
|
||||||
analyze(f);
|
analyze(f);
|
||||||
fclose(f);
|
fclose(f);
|
||||||
|
}else{
|
||||||
|
printf("File doesn't exist: %s\n",path);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
int main(int argc, char *argv[]) {
|
int main(int argc, char *argv[]) {
|
||||||
|
|
||||||
if (argc > 1) {
|
if (argc > 1) {
|
||||||
for (int i = 1; i < argc; i++) {
|
for (int i = 1; i < argc; i++) {
|
||||||
if(!strcmp(argv[1],"--hide-capitalized")){
|
|
||||||
show_capitalized=false;
|
|
||||||
}else if(!strcmp(argv[1],"--show-sentences")){
|
|
||||||
show_sentences=true;
|
|
||||||
}else if(!strcmp(argv[1],"--show-words")){
|
|
||||||
show_words=true;
|
|
||||||
}else if(!strcmp(argv[1],"--show-numbers")){
|
|
||||||
show_words=true;
|
|
||||||
}else if(!strcmp(argv[1],"--hide-forbidden-words")){
|
|
||||||
show_forbidden_words=false;
|
|
||||||
}else if(!strcmp(argv[1],"help") || !strcmp(argv[1],"--help")){
|
|
||||||
printf("%s",
|
|
||||||
"Usage: spam [file] [file] [file]\n"
|
|
||||||
"Flag defaults:\n"
|
|
||||||
" hide-capitalized = true\n"
|
|
||||||
" show-sentences = false\n"
|
|
||||||
" show-words = false\n"
|
|
||||||
" show-numbers = false\n"
|
|
||||||
" hide-forbidden-words = false\n");
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
printf("File: %s\n", argv[i]);
|
printf("File: %s\n", argv[i]);
|
||||||
|
|
||||||
analyze_file(argv[i]);
|
analyze_file(argv[i]);
|
||||||
printf("%s\n", rmalloc_stats());
|
|
||||||
printf("\n");
|
printf("\n");
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
return 0;
|
return 0;
|
||||||
|
@ -26,8 +26,6 @@ void stra(str_t *str, const char *to_append) {
|
|||||||
if (required_new_length > str->size) {
|
if (required_new_length > str->size) {
|
||||||
str->size += required_new_length + str->buffer_size;
|
str->size += required_new_length + str->buffer_size;
|
||||||
str->content = (char *)realloc(str->content, str->size + 1);
|
str->content = (char *)realloc(str->content, str->size + 1);
|
||||||
} else {
|
|
||||||
// printf("NO NDEED\n");
|
|
||||||
}
|
}
|
||||||
strcat(str->content, to_append);
|
strcat(str->content, to_append);
|
||||||
str->content[str->length] = 0;
|
str->content[str->length] = 0;
|
||||||
|
Loading…
Reference in New Issue
Block a user