333 lines
		
	
	
		
			8.5 KiB
		
	
	
	
		
			C
		
	
	
	
	
	
		
		
			
		
	
	
			333 lines
		
	
	
		
			8.5 KiB
		
	
	
	
		
			C
		
	
	
	
	
	
#include "rmalloc.h"
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

#include "rstr.h"
#include "rstring_list.h"
#include <ctype.h>
#include <pthread.h>
#include <stdbool.h>
|  | 
 | ||
/*
 * Short local aliases for longer identifiers.
 *   sl, slf, sla, sln  -> rstring_list_t / rstring_list_free / rstring_list_add / rstring_list_new
 *   rb, rbf, rbs, rbw, rbn -> rbuffer_t / rbuffer_free / rbuffer_to_string / rbuffer_write / rbuffer_new
 * These are plain textual substitutions, so the short names must not be
 * reused as ordinary identifiers anywhere below.
 * NOTE(review): in the portion of the file reviewed here only `sl` is
 * actually referenced; the other aliases look unused - confirm before removing.
 */
|  | #define sl rstring_list_t
 | ||
|  | #define slf rstring_list_free
 | ||
|  | #define sla rstring_list_add
 | ||
|  | #define sln rstring_list_new
 | ||
|  | #define rb rbuffer_t
 | ||
|  | #define rbf rbuffer_free
 | ||
|  | #define rbs rbuffer_to_string
 | ||
|  | #define rbw rbuffer_write
 | ||
|  | #define rbn rbuffer_new
 | ||
|  | 
 | ||
/*
 * NULL-terminated table of words that is_forbidden_word() reports as
 * forbidden. Entries are compared case-insensitively against a complete word,
 * so an entry such as "com" or "@" only matches a word that consists of
 * exactly that text.
 *
 * "experts" was previously misspelled "expers" and therefore never matched
 * the intended word.
 */
char *forbidden_words[] = {
    "recovery",    "techie",    "http",     "https",   "digital",    "hack",          "::",       "//",    "com",
    "@",           "crypto",    "bitcoin",  "wallet",  "hacker",     "welcome",       "whatsapp", "email", "cryptocurrency",
    "stolen",      "freeze",    "quick",    "crucial", "tracing",    "scammers",      "experts",  "hire",  "century",
    "transaction", "essential", "managing", "contact", "contacting", "understanding", "assets",   "funds", NULL};
|  | 
 | ||
|  | 
 | ||
/**
 * Case-insensitive string equality test.
 *
 * Beware of the return convention: unlike the strcmp() family this returns
 * true when the strings are EQUAL (ignoring case) and false otherwise.
 *
 * @param word1 NUL-terminated string.
 * @param word2 NUL-terminated string.
 * @return true if both strings have the same length and every character
 *         pair matches after tolower(); false otherwise.
 */
bool stricmp(char *word1, char *word2) {
    /* <ctype.h> functions are only defined for EOF and values representable
     * as unsigned char, so plain (possibly negative) chars are converted
     * before the call. */
    while (*word1 && tolower((unsigned char)*word1) == tolower((unsigned char)*word2)) {
        word1++;
        word2++;
    }
    /* The loop stops at the end of word1 or at the first mismatch; the
     * strings are equal only if both ended at the same position. */
    return *word1 == '\0' && *word2 == '\0';
}
|  | 
 | ||
|  | 
 | ||
|  | 
 | ||
|  | void sld(sl *lst) { | ||
|  |     for (ulonglong i = 0; i < lst->count; i++) { | ||
|  |         printf("<%llu:%s>\n", i, lst->strings[i]); | ||
|  |     } | ||
|  | } | ||
|  | 
 | ||
/**
 * Return a copy of `content` with every '<', '>' and ':' removed.
 *
 * These are the characters used as delimiters by the "<index:string>" dump
 * format produced by slds().
 *
 * @param content NUL-terminated input; not modified.
 * @return Newly allocated NUL-terminated string that the caller must free(),
 *         or NULL if the allocation fails.
 */
char *remove_preserved_chars(char *content) {
    /* The result can never be longer than the input. */
    char *cc = malloc(strlen(content) + 1);
    if (!cc) {
        return NULL;
    }
    char *out = cc;
    for (; *content; content++) {
        if (*content == '<' || *content == '>' || *content == ':') {
            continue;
        }
        *out++ = *content;
    }
    *out = '\0';
    return cc;
}
// Memory usage: 29 TB, 213.322.618 (re)allocated, 106.670.251 unique free'd, 0 in use.
|  | 
 | ||
|  | char *slds(sl *lst) { | ||
|  |     str_t *buffer = strn(1337); | ||
|  |     for (ulonglong i = 0; i < lst->count; i++) { | ||
|  |         char *temp = (char *)malloc(strlen(lst->strings[i]) + 20); | ||
|  |         char *cc = remove_preserved_chars(lst->strings[i]); | ||
|  |         sprintf(temp, "<%llu:%s>\n", i, cc); | ||
|  |         free(cc); | ||
|  |         stra(buffer, temp); | ||
|  |         free(temp); | ||
|  |     } | ||
|  |     return strc(buffer); | ||
|  | } | ||
|  | 
 | ||
/**
 * Tell whether `c` is one of the separator characters this file treats as
 * whitespace: tab, newline, space or comma.
 *
 * @param c Character to classify.
 * @return true for '\t', '\n', ' ' and ','; false for everything else.
 */
bool isws(char c) {
    switch (c) {
    case '\t':
    case '\n':
    case ' ':
    case ',':
        return true;
    default:
        return false;
    }
}
|  | 
 | ||
|  | 
 | ||
|  | char *fread_till_eof(FILE *f) { | ||
|  |     char c; | ||
|  |     str_t *buffer = strn(1337); | ||
|  |     while ((c = fgetc(f)) != EOF) { | ||
|  |         strac(buffer, c); | ||
|  |     } | ||
|  |     char *content = strc(buffer); | ||
|  |     return content; | ||
|  | } | ||
|  | 
 | ||
/**
 * Count the sentences in `content`.
 *
 * A sentence is considered finished at every '.' character, so the result is
 * the number of periods in the text: "a...b" yields 3, and text without a
 * closing period contributes nothing.
 *
 * The earlier implementation copied every sentence into a heap buffer that
 * was never read and whose allocation was not checked; the count it returned
 * is the same as the one computed here.
 *
 * @param content NUL-terminated text; not modified.
 * @return Number of '.' characters in `content`.
 */
int get_sentences(char *content) {
    int count = 0;
    for (; *content; content++) {
        if (*content == '.') {
            count++;
        }
    }
    return count;
}
|  | 
 | ||
|  | 
 | ||
|  | bool is_forbidden_word(char *word) { | ||
|  |      | ||
|  |     for (int j = 0; forbidden_words[j] != NULL; j++) { | ||
|  |         if (stricmp(word, forbidden_words[j])) { | ||
|  |             return true;  | ||
|  |         } | ||
|  |     } | ||
|  |     return false; | ||
|  | } | ||
|  | 
 | ||
/**
 * Count the words in `content` and classify them.
 *
 * Words are maximal runs of characters separated by ' ', '\t' or '\n'.
 * Punctuation stays attached to its word.
 *
 * @param content    NUL-terminated text; not modified.
 * @param count_caps Incremented once for every word that contains no
 *                   lowercase letter (so "HELLO", "123" and "!!" all count).
 * @param fw_count   Incremented once for every word for which
 *                   is_forbidden_word() returns true.
 * @return Number of words found, or 0 if the scratch buffer cannot be
 *         allocated (the counters are left untouched in that case).
 *
 * Fixes over the previous version:
 *  - lowercase detection used `islower(c) == c`; islower() only returns
 *    zero/non-zero, so the test practically never succeeded and every word
 *    was reported as capitalized;
 *  - the final word was dropped when the text did not end in whitespace;
 *  - the malloc() result was not checked.
 */
int get_words(char *content, int *count_caps, int *fw_count) {
    char *word_buffer = malloc(strlen(content) + 1);
    if (!word_buffer) {
        return 0;
    }

    int count = 0;
    size_t length = 0;
    bool has_lcase = false;

    for (;; content++) {
        char c = *content;
        bool at_end = (c == '\0');

        /* The terminating NUL is handled like a separator so that a word
         * running up to the end of the text is flushed as well. */
        if (at_end || c == ' ' || c == '\t' || c == '\n') {
            if (length > 0) {
                word_buffer[length] = '\0';
                count++;
                if (!has_lcase) {
                    (*count_caps)++;
                }
                if (is_forbidden_word(word_buffer)) {
                    (*fw_count)++;
                }
                length = 0;
            }
            has_lcase = false;
            if (at_end) {
                break;
            }
            continue;
        }

        word_buffer[length++] = c;
        /* ctype functions need an unsigned char value. */
        if (islower((unsigned char)c)) {
            has_lcase = true;
        }
    }

    free(word_buffer);
    return count;
}
|  | 
 | ||
/**
 * Tell whether a word contains no lowercase letters.
 *
 * Characters that are not alphanumeric are ignored, and digits never
 * disqualify a word, so "ABC123!" and the empty string are both reported as
 * fully capitalized.
 *
 * @param word NUL-terminated word; not modified.
 * @return false as soon as an alphanumeric character differs from its
 *         toupper() form, true otherwise.
 */
bool is_fully_capitalized_word(char *word) {
    for (; *word; word++) {
        /* Convert first: passing a negative plain char to <ctype.h>
         * functions is undefined behaviour. */
        unsigned char c = (unsigned char)*word;
        if (isalnum(c) && toupper(c) != c) {
            return false;
        }
    }
    return true;
}
|  | 
 | ||
|  | int get_capitalized_words(sl *all_words) { | ||
|  |     int count = 0; | ||
|  |     for (uint i = 0; i < all_words->count; i++) { | ||
|  |         if (is_fully_capitalized_word(all_words->strings[i])) { | ||
|  |            count++; | ||
|  |         } | ||
|  |     } | ||
|  | 
 | ||
|  |     return count; | ||
|  | } | ||
|  | 
 | ||
/**
 * Return a copy of `content` reduced to ASCII letters, digits and the
 * punctuation characters '.', ',', '!' and '?'.
 *
 * Letter case is preserved in the output; the comparison against the
 * allow-list is done on the lowercased character. Everything else,
 * including all whitespace, is dropped.
 *
 * @param content NUL-terminated input; not modified.
 * @return Newly allocated NUL-terminated string that the caller must free(),
 *         or NULL if the allocation fails.
 */
char *clean_content(char *content) {
    static const char allowed_ichars[] = "0123456789abcdefghijklmnopqrstuvwxyz.,!?";

    /* The result can never be longer than the input. */
    char *cleaned = malloc(strlen(content) + 1);
    if (!cleaned) {
        return NULL;
    }

    char *out = cleaned;
    for (; *content; content++) {
        /* tolower() needs an unsigned char value; the loop condition
         * guarantees the character is not NUL, so strchr() cannot match the
         * terminator of the allow-list. */
        if (strchr(allowed_ichars, tolower((unsigned char)*content))) {
            *out++ = *content;
        }
    }
    *out = '\0';
    return cleaned;
}
|  | 
 | ||
/**
 * Count the numbers in a string, where a number is a maximal run of
 * consecutive decimal digits ("12.5" therefore contains two numbers).
 *
 * The previous version only counted a run when a non-digit followed it, so
 * a number at the very end of the string was missed. It also copied digits
 * into an unchecked heap buffer that was never read.
 *
 * @param cc NUL-terminated text; not modified.
 * @return Number of digit runs in `cc`.
 */
int get_numbers(char *cc) {
    int count = 0;
    bool in_number = false;
    for (; *cc; cc++) {
        /* isdigit() needs an unsigned char value. */
        bool digit = isdigit((unsigned char)*cc) != 0;
        if (digit && !in_number) {
            count++; /* first digit of a new run */
        }
        in_number = digit;
    }
    return count;
}
|  | 
 | ||
|  | 
 | ||
/* Running count of inputs handed to analyze(); printed there as the "#N"
 * header of each report. analyze() is also run from the threads started in
 * main(), so every access to this counter has to be synchronized. */
|  | unsigned int total = 0; | ||
|  | 
 | ||
/**
 * Read the complete contents of a stream into a NUL-terminated buffer.
 *
 * Seekable streams are read from the beginning, with the file size used as
 * an allocation hint. Streams that cannot be seeked (for example a piped
 * stdin) are read from their current position until end-of-file; previously
 * such input made the function fail.
 *
 * The stream is never closed here: it belongs to the caller. The old
 * behaviour of closing it on failure made analyze_file() close the same
 * FILE twice.
 *
 * @param f Open input stream.
 * @return Newly allocated NUL-terminated buffer that the caller must free(),
 *         or NULL on allocation failure or read error.
 */
char *readall(FILE *f) {
    size_t capacity = 4096;

    if (fseek(f, 0, SEEK_END) == 0) {
        long end = ftell(f);
        if (end < 0 || fseek(f, 0, SEEK_SET) != 0) {
            return NULL;
        }
        /* One byte for the terminator plus one spare byte, so that reading
         * the whole file shows up as a short read and ends the loop below
         * without growing the buffer. */
        capacity = (size_t)end + 2;
    } else {
        /* Not seekable: fall back to streaming from the current position. */
        clearerr(f);
    }

    char *buffer = malloc(capacity);
    if (!buffer) {
        return NULL;
    }

    size_t length = 0;
    for (;;) {
        length += fread(buffer + length, 1, capacity - 1 - length, f);
        if (length < capacity - 1) {
            break; /* short read: end-of-file or error */
        }
        /* Buffer is full; double it, never overwriting `buffer` directly so
         * it can still be released if realloc() fails. */
        size_t grown_capacity = capacity * 2;
        char *grown = grown_capacity > capacity ? realloc(buffer, grown_capacity) : NULL;
        if (!grown) {
            free(buffer);
            return NULL;
        }
        buffer = grown;
        capacity = grown_capacity;
    }

    if (ferror(f)) {
        free(buffer);
        return NULL;
    }
    buffer[length] = '\0';
    return buffer;
}
|  | 
 | ||
|  | void analyze(FILE *f) { | ||
|  |     if(!f){ | ||
|  |         // File doesn't exist
 | ||
|  |         return; | ||
|  |     } | ||
|  |     total = total + 1; | ||
|  | 
 | ||
|  |     printf("#%u\n", total); | ||
|  | 
 | ||
|  |     char *data = readall(f); | ||
|  |     if(!data) | ||
|  |         return; | ||
|  |     char *clean_data = clean_content(data); | ||
|  |     int capitalized_words = 0; | ||
|  |     int fw = 0; | ||
|  |     int words = get_words(data,&capitalized_words,&fw); | ||
|  |     int sentences = get_sentences(data); | ||
|  |     int numbers = get_numbers(clean_data); | ||
|  | 
 | ||
|  |     // All words
 | ||
|  |     printf("Words: %d\n", words); | ||
|  | 
 | ||
|  |     // All capitalized words
 | ||
|  |     printf("Capitalized words: %d\n", capitalized_words); | ||
|  |      | ||
|  |     // All sentences
 | ||
|  |     printf("Sentences: %i\n", sentences); | ||
|  | 
 | ||
|  |     // Numbers
 | ||
|  |     printf("Numbers: %d\n", numbers); | ||
|  | 
 | ||
|  |     // Forbidden words
 | ||
|  |     printf("Forbidden words: %d\n", fw); | ||
|  |      | ||
|  |     if (words) { | ||
|  |         double capitalized_word_percentage = 100 * ((double)capitalized_words / (double)words); | ||
|  | 
 | ||
|  |         printf("Capitalized percentage: %f%%\n", capitalized_word_percentage); | ||
|  |         double forbidden_word_percentage = 100 * ((double)fw / (double)words); | ||
|  |         printf("Forbidden percentage: %f%%\n", forbidden_word_percentage); | ||
|  |         ulonglong word_count_per_sentence = words / (sentences ? sentences : 1); | ||
|  |         printf("Word count per sentence: %llu\n", word_count_per_sentence); | ||
|  |     } | ||
|  |     free(clean_data); | ||
|  |     free(data); | ||
|  | } | ||
|  | 
 | ||
/**
 * Open the file at `path` for reading, run analyze() on it and close it.
 *
 * @param path Path of the file to analyze. If it cannot be opened a
 *             "File doesn't exist" message is printed to stdout instead.
 */
void analyze_file(char *path) {
    FILE *input = fopen(path, "r");
    if (!input) {
        printf("File doesn't exist: %s\n", path);
        return;
    }
    analyze(input);
    fclose(input);
}
/**
 * Thread entry point: adapts analyze_file() to the signature expected by
 * pthread_create().
 *
 * @param path The file path, passed as the thread argument (a char *).
 * @return Always NULL.
 */
void *analyze_file_thread(void *path) {
    char *file_path = (char *)path;
    analyze_file(file_path);
    return NULL;
}
|  | 
 | ||
|  | int main(int argc, char *argv[]) { | ||
|  |     if (argc > 1) { | ||
|  |         pthread_t *threads = (pthread_t *)malloc(argc * sizeof(pthread_t)); | ||
|  |         for (int i = 1; i < argc; i++) { | ||
|  |            pthread_create(&threads[i-1],NULL,analyze_file_thread,(void *)argv[i]); | ||
|  |         } | ||
|  |         for(int i = 1; i < argc; i++){ | ||
|  |             pthread_join(threads[i-1],NULL); | ||
|  |         } | ||
|  |         free(threads); | ||
|  |         return 0; | ||
|  |     } | ||
|  |     analyze(stdin); | ||
|  |     printf("%s\n", rmalloc_stats()); | ||
|  |     exit(0); | ||
|  |     return 0; | ||
|  | } |