333 lines
8.6 KiB
C
333 lines
8.6 KiB
C
|
#include "rmalloc.h"
|
||
|
#include <stdio.h>
|
||
|
#include <stdlib.h>
|
||
|
#include <string.h>
|
||
|
#include <unistd.h>
|
||
|
|
||
|
#include "rstring_list.h"
|
||
|
#include "rstr.h"
|
||
|
#include <ctype.h>
|
||
|
|
||
|
/*
 * Short aliases for the rstring_list and rbuffer APIs.
 * The sl* aliases are used throughout this file; the rb* aliases are not
 * referenced in the visible part of it.
 */
#define sl rstring_list_t
#define slf rstring_list_free
#define sla rstring_list_add
#define sln rstring_list_new
#define rb rbuffer_t
#define rbf rbuffer_free
#define rbs rbuffer_to_string
#define rbw rbuffer_write
#define rbn rbuffer_new
|
||
|
|
||
|
/*
 * NULL-terminated table of tokens that get_forbidden_words() searches for.
 * Entries are compared case-insensitively against whole whitespace-separated
 * words of the input.
 *
 * NOTE(review): "expers" looks like a typo for "experts" - confirm before
 * changing it, since entries are matched literally.
 */
char *forbidden_words[] = {
    "recovery",
    "techie",
    "http",
    "https",
    "digital",
    "hack",
    "::",
    "//",
    "com",
    "@",
    "crypto",
    "bitcoin",
    "wallet",
    "hacker",
    "welcome",
    "whatsapp",
    "email",
    "cryptocurrency",
    "stolen",
    "freeze",
    "quick",
    "crucial",
    "tracing",
    "scammers",
    "expers",
    "hire",
    "century",
    "transaction",
    "essential",
    "managing",
    "contact",
    "contacting",
    "understanding",
    "assets",
    "funds",
    NULL,
};
|
||
|
|
||
|
/*
 * Debug helper: print every entry of lst to stdout, one per line, in the
 * form "<index:string>".
 */
void sld(sl *lst) {
    uint idx = 0;
    while (idx < lst->count) {
        printf("<%u:%s>\n", idx, lst->strings[idx]);
        idx++;
    }
}
|
||
|
|
||
|
/*
 * Return a newly allocated copy of content with every '<', '>' and ':'
 * removed. These are the delimiter characters of the "<index:string>"
 * format that slds() emits.
 *
 * Returns NULL when content is NULL or the allocation fails.
 * The caller owns the result and must free() it.
 */
char *remove_preserved_chars(char *content) {
    if (content == NULL) {
        return NULL;
    }

    /* The output can never be longer than the input. */
    char *filtered = malloc(strlen(content) + 1);
    if (filtered == NULL) {
        return NULL;
    }

    char *out = filtered;
    for (const char *in = content; *in != '\0'; in++) {
        if (*in == '<' || *in == '>' || *in == ':') {
            continue;
        }
        *out++ = *in;
    }
    *out = '\0';
    return filtered;
}
|
||
|
|
||
|
/*
 * Serialize lst into a single string made of "<index:string>\n" lines.
 * Each entry is passed through remove_preserved_chars() first so that the
 * delimiters '<', '>' and ':' cannot appear inside an entry.
 *
 * An entry whose temporary buffers cannot be allocated is skipped instead
 * of dereferencing NULL.
 *
 * The result comes from strc(); callers in this file release it with free().
 * NOTE(review): strn(1337) presumably creates a string builder with an
 * initial capacity of 1337, and strc() presumably converts it to a plain
 * char* - confirm against rstr.h.
 */
char *slds(sl *lst) {
    str_t *buffer = strn(1337);
    for (uint i = 0; i < lst->count; i++) {
        char *cc = remove_preserved_chars(lst->strings[i]);
        if (cc == NULL) {
            continue;
        }

        /* 20 spare bytes cover '<', up to 10 index digits, ':', '>', '\n', NUL. */
        size_t capacity = strlen(cc) + 20;
        char *line = malloc(capacity);
        if (line != NULL) {
            snprintf(line, capacity, "<%u:%s>\n", i, cc);
            stra(buffer, line);
            free(line);
        }
        free(cc);
    }
    return strc(buffer);
}
|
||
|
|
||
|
/*
 * Return true for the separator characters this file strips in stripws():
 * tab, newline, space and comma.
 */
bool isws(char c) {
    switch (c) {
    case '\t':
    case '\n':
    case ' ':
    case ',':
        return true;
    default:
        return false;
    }
}
|
||
|
|
||
|
/*
 * Return a newly allocated copy of content with every character for which
 * isws() is true (tab, newline, space, comma) removed.
 *
 * Returns NULL when content is NULL or the allocation fails.
 * The caller owns the result and must free() it.
 */
char *stripws(char *content) {
    if (content == NULL) {
        return NULL;
    }

    /* The output can never be longer than the input. */
    char *stripped = malloc(strlen(content) + 1);
    if (stripped == NULL) {
        return NULL;
    }

    char *out = stripped;
    for (const char *in = content; *in != '\0'; in++) {
        if (!isws(*in)) {
            *out++ = *in;
        }
    }
    *out = '\0';
    return stripped;
}
|
||
|
|
||
|
char *fread_till_eof(FILE *f) {
|
||
|
char c;
|
||
|
str_t *buffer = strn(1337);
|
||
|
while ((c = fgetc(f)) != EOF) {
|
||
|
strac(buffer, c);
|
||
|
}
|
||
|
char *content = strc(buffer);
|
||
|
return content;
|
||
|
}
|
||
|
|
||
|
/*
 * Split content into sentences. A sentence runs up to and including the
 * next '.'; spaces, tabs and newlines in front of a sentence are skipped.
 * Text after the last '.' is not returned.
 *
 * Returns a new list that the caller releases with rstring_list_free().
 * NOTE(review): assumes rstring_list_add() copies the string it is given,
 * because the scratch buffer is reused for every sentence and freed here.
 */
rstring_list_t *get_sentences(char *content) {
    rstring_list_t *sentences = rstring_list_new();
    /* One sentence can never be longer than the whole input. */
    char *scratch = (char *)malloc(strlen(content) + 1);
    size_t length = 0;
    bool started = false;

    for (; *content; content++) {
        char ch = *content;

        if (!started) {
            /* Still in front of the sentence: drop leading blanks. */
            if (ch == ' ' || ch == '\t' || ch == '\n') {
                continue;
            }
            started = true;
        }

        scratch[length++] = ch;
        scratch[length] = 0;

        if (ch == '.') {
            /* The period closes the sentence; begin collecting the next one. */
            rstring_list_add(sentences, scratch);
            length = 0;
            scratch[0] = 0;
            started = false;
        }
    }

    free(scratch);
    return sentences;
}
|
||
|
|
||
|
/*
 * Split content into words separated by spaces, tabs or newlines.
 * Consecutive separators produce no empty words.
 *
 * The word still being collected when the input ends is added as well, so
 * the last word is kept even when no separator follows it.
 *
 * Returns a new list that the caller releases with rstring_list_free().
 * If the scratch buffer cannot be allocated the list is returned empty.
 * NOTE(review): assumes rstring_list_add() copies the string it is given,
 * because the scratch buffer is reused for every word and freed here.
 */
rstring_list_t *get_words(char *content) {
    rstring_list_t *words = rstring_list_new();

    /* One word can never be longer than the whole input. */
    char *word_buffer = malloc(strlen(content) + 1);
    if (word_buffer == NULL) {
        return words;
    }

    size_t length = 0;
    for (; *content != '\0'; content++) {
        if (*content == ' ' || *content == '\t' || *content == '\n') {
            if (length > 0) {
                word_buffer[length] = '\0';
                rstring_list_add(words, word_buffer);
                length = 0;
            }
            continue;
        }
        word_buffer[length++] = *content;
    }

    /* Flush the final word when the input does not end with a separator. */
    if (length > 0) {
        word_buffer[length] = '\0';
        rstring_list_add(words, word_buffer);
    }

    free(word_buffer);
    return words;
}
|
||
|
|
||
|
/*
 * Return true when word contains no lower-case alphanumeric character.
 * Non-alphanumeric characters are ignored, so an empty string or a word
 * made only of digits or punctuation also yields true.
 *
 * Each character is converted to unsigned char before it reaches the
 * <ctype.h> functions; passing a negative char value (any byte >= 0x80
 * where char is signed) is undefined behaviour.
 */
bool is_fully_capitalized_word(char *word) {
    for (; *word != '\0'; word++) {
        unsigned char ch = (unsigned char)*word;
        if (isalnum(ch) && toupper(ch) != ch) {
            return false;
        }
    }
    return true;
}
|
||
|
|
||
|
sl *get_capitalized_words(char *content) {
|
||
|
sl *capitalized_words = sln();
|
||
|
sl *sentences = get_sentences(content);
|
||
|
for (uint j = 0; j < sentences->count; j++) {
|
||
|
char *sentence = sentences->strings[j];
|
||
|
sl *all_words = get_words(sentence);
|
||
|
|
||
|
// Always skip the first word since sentences start with
|
||
|
for (uint i = 0; i < all_words->count; i++) {
|
||
|
if (is_fully_capitalized_word(all_words->strings[i])) {
|
||
|
rstring_list_add(capitalized_words, all_words->strings[i]);
|
||
|
}
|
||
|
}
|
||
|
slf(all_words);
|
||
|
}
|
||
|
slf(sentences);
|
||
|
return capitalized_words;
|
||
|
}
|
||
|
|
||
|
/*
 * Return a newly allocated copy of content that keeps only ASCII letters
 * (either case), digits, space, newline and the punctuation ". , ! ?".
 * Every other character, including tab, is dropped. Kept characters retain
 * their original case; lower-casing is only used for the lookup.
 *
 * Returns NULL when content is NULL or the allocation fails.
 * The caller owns the result and must free() it.
 */
char *clean_content(char *content) {
    static const char allowed_ichars[] = "0123456789abcdefghijklmnopqrstuvwxyz \n.,!?";

    if (content == NULL) {
        return NULL;
    }

    /* The output can never be longer than the input. */
    char *cleaned = malloc(strlen(content) + 1);
    if (cleaned == NULL) {
        return NULL;
    }

    char *out = cleaned;
    for (const char *in = content; *in != '\0'; in++) {
        /* tolower() requires an unsigned char value; a negative char is undefined. */
        int lowered = tolower((unsigned char)*in);
        if (strchr(allowed_ichars, lowered) != NULL) {
            *out++ = *in;
        }
    }
    *out = '\0';
    return cleaned;
}
|
||
|
|
||
|
/*
 * Extract every run of decimal digits from content.
 *
 * The text is first filtered with clean_content() and then stripped of
 * spaces, tabs, newlines and commas with stripws(), so digit groups that
 * were separated only by those characters (e.g. "1,000") form one number.
 *
 * A digit run that reaches the end of the text is added as well, so a
 * number in last position is not lost.
 *
 * Returns a new list that the caller releases with slf(). If a temporary
 * buffer cannot be obtained, the numbers found so far (possibly none) are
 * returned.
 * NOTE(review): assumes sla() copies the string it is given, because the
 * digit buffer is reused for every number and freed here.
 */
sl *get_numbers(char *content) {
    sl *numbers = sln();

    char *cleaned = clean_content(content);
    if (cleaned == NULL) {
        return numbers;
    }
    char *compact = stripws(cleaned);
    free(cleaned);
    if (compact == NULL) {
        return numbers;
    }

    /* One number can never be longer than the whole compacted text. */
    char *digits = malloc(strlen(compact) + 1);
    if (digits == NULL) {
        free(compact);
        return numbers;
    }

    size_t length = 0;
    for (const char *p = compact; *p != '\0'; p++) {
        if (isdigit((unsigned char)*p)) {
            digits[length++] = *p;
            continue;
        }
        if (length > 0) {
            digits[length] = '\0';
            sla(numbers, digits);
            length = 0;
        }
    }

    /* Flush a number that runs up to the end of the text. */
    if (length > 0) {
        digits[length] = '\0';
        sla(numbers, digits);
    }

    free(digits);
    free(compact);
    return numbers;
}
|
||
|
|
||
|
/*
 * Case-insensitive string equality.
 *
 * Returns true when word1 and word2 have the same length and are equal
 * ignoring case, false otherwise. Note that this is a boolean "equal"
 * result, not the <0 / 0 / >0 ordering that strcmp-style functions return.
 *
 * Characters are converted to unsigned char before tolower(); passing a
 * negative char value is undefined behaviour.
 */
bool stricmp(char *word1, char *word2) {
    while (*word1 != '\0' &&
           tolower((unsigned char)*word1) == tolower((unsigned char)*word2)) {
        word1++;
        word2++;
    }
    /* Equal only if both strings ended together; a mismatch leaves *word1 set. */
    return *word1 == '\0' && *word2 == '\0';
}
|
||
|
|
||
|
/*
 * Return true when any entry of words equals word ignoring case (as decided
 * by stricmp()). The search stops at the first match.
 */
bool containswordi(sl *words, char *word) {
    bool found = false;
    for (uint idx = 0; !found && idx < words->count; idx++) {
        found = stricmp(words->strings[idx], word);
    }
    return found;
}
|
||
|
|
||
|
sl *get_forbidden_words(char *content) {
|
||
|
sl *words = get_words(content);
|
||
|
sl *found = sln();
|
||
|
for (int j = 0; forbidden_words[j] != NULL; j++) {
|
||
|
if (containswordi(words, forbidden_words[j])) {
|
||
|
rstring_list_add(found, forbidden_words[j]);
|
||
|
}
|
||
|
}
|
||
|
|
||
|
slf(words);
|
||
|
return found;
|
||
|
}
|
||
|
|
||
|
void analyze(FILE *f) {
|
||
|
char *data = fread_till_eof(f);
|
||
|
|
||
|
str_t *all = strn(1337);
|
||
|
char *sbuf = NULL;
|
||
|
|
||
|
char *clean_data = clean_content(data);
|
||
|
|
||
|
free(clean_data);
|
||
|
|
||
|
// All capitalized words
|
||
|
sl *capitalized_words = get_capitalized_words(data);
|
||
|
uint capitalized_words_count = capitalized_words->count;
|
||
|
printf("Capitalized words: %u\n", capitalized_words_count);
|
||
|
sbuf = slds(capitalized_words);
|
||
|
stra(all, sbuf);
|
||
|
free(sbuf);
|
||
|
|
||
|
sl *sentences = get_sentences(data);
|
||
|
|
||
|
// All sentences
|
||
|
printf("Sentences: %u\n", sentences->count);
|
||
|
// sld(sentences);
|
||
|
sbuf = slds(sentences);
|
||
|
stra(all, sbuf);
|
||
|
free(sbuf);
|
||
|
|
||
|
sl *words = get_words(data);
|
||
|
|
||
|
// All words
|
||
|
printf("Words: %u\n", words->count);
|
||
|
// sld(words);
|
||
|
sbuf = slds(words);
|
||
|
stra(all, sbuf);
|
||
|
free(sbuf);
|
||
|
|
||
|
// Numbers
|
||
|
sl *numbers = get_numbers(data);
|
||
|
printf("Numbers: %u\n", numbers->count);
|
||
|
// sld(numbers);
|
||
|
sbuf = slds(numbers);
|
||
|
stra(all, sbuf);
|
||
|
free(sbuf);
|
||
|
|
||
|
// Forbidden words
|
||
|
sl *fw = get_forbidden_words(data);
|
||
|
printf("Forbidden words: %u\n", fw->count);
|
||
|
// sld(fw);
|
||
|
sbuf = slds(fw);
|
||
|
stra(all, sbuf);
|
||
|
free(sbuf);
|
||
|
|
||
|
strd(all);
|
||
|
uint word_count_per_sentence = words->count / sentences->count;
|
||
|
printf("Word count per sentence: %u\n", word_count_per_sentence);
|
||
|
|
||
|
slf(capitalized_words);
|
||
|
slf(sentences);
|
||
|
slf(words);
|
||
|
slf(numbers);
|
||
|
slf(fw);
|
||
|
|
||
|
free(data);
|
||
|
}
|
||
|
|
||
|
/*
 * Open the file at path for reading, run analyze() on it and close it.
 *
 * If the file cannot be opened, the reason is reported on stderr with
 * perror() and the file is skipped; analyze() is never called with a NULL
 * stream.
 */
void analyze_file(char *path) {
    FILE *f = fopen(path, "r");
    if (f == NULL) {
        perror(path);
        return;
    }
    analyze(f);
    fclose(f);
}
|
||
|
|
||
|
/*
 * Entry point. With file arguments, each file is announced with
 * "File: <path>", analyzed, and followed by a blank line. Without
 * arguments, standard input is analyzed instead. In both cases the string
 * returned by rmalloc_stats() is printed last and the exit status is 0.
 */
int main(int argc, char *argv[]) {
    if (argc <= 1) {
        analyze(stdin);
    } else {
        int arg = 1;
        while (arg < argc) {
            printf("File: %s\n", argv[arg]);
            analyze_file(argv[arg]);
            printf("\n");
            arg++;
        }
    }

    printf("%s\n", rmalloc_stats());
    return 0;
}
|