parent
45a60fd3e0
commit
89f3345746
1
.gitignore
vendored
1
.gitignore
vendored
@ -1,3 +1,4 @@
|
|||||||
|
.r_history
|
||||||
.history
|
.history
|
||||||
.vscode
|
.vscode
|
||||||
publish
|
publish
|
||||||
|
2
Makefile
2
Makefile
@ -1,5 +1,5 @@
|
|||||||
CC = gcc
|
CC = gcc
|
||||||
CFLAGS = -Wall -Werror -Wextra -Ofast -std=c2x
|
CFLAGS = -Ofast
|
||||||
|
|
||||||
all: build run valgrind build_risspam run_risspam
|
all: build run valgrind build_risspam run_risspam
|
||||||
|
|
||||||
|
@ -1,332 +1,183 @@
|
|||||||
#include "rmalloc.h"
|
// Author: retoor@molodetz.nl
|
||||||
|
|
||||||
|
// This program analyzes text files for word counts, capitalized words, sentences, numbers, and forbidden words.
|
||||||
|
|
||||||
|
/*
|
||||||
|
MIT License
|
||||||
|
|
||||||
|
Copyright (c) 2025 retoor
|
||||||
|
|
||||||
|
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||||
|
of this software and associated documentation files (the "Software"), to deal
|
||||||
|
in the Software without restriction, including without limitation the rights
|
||||||
|
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||||
|
copies of the Software, and to permit persons to whom the Software is
|
||||||
|
furnished to do so, subject to the following conditions:
|
||||||
|
|
||||||
|
... (full license text)
|
||||||
|
*/
|
||||||
|
|
||||||
#include <stdio.h>
|
#include <stdio.h>
|
||||||
#include <stdlib.h>
|
|
||||||
#include <string.h>
|
#include <string.h>
|
||||||
#include <unistd.h>
|
|
||||||
|
|
||||||
#include "rstr.h"
|
|
||||||
#include "rstring_list.h"
|
|
||||||
#include <ctype.h>
|
#include <ctype.h>
|
||||||
|
#include <stdlib.h>
|
||||||
|
#include <pthread.h>
|
||||||
|
|
||||||
#define sl rstring_list_t
|
#define MAX_TEXT_LENGTH 1024
|
||||||
#define slf rstring_list_free
|
#define FORBIDDEN_WORDS_COUNT 40
|
||||||
#define sla rstring_list_add
|
|
||||||
#define sln rstring_list_new
|
|
||||||
#define rb rbuffer_t
|
|
||||||
#define rbf rbuffer_free
|
|
||||||
#define rbs rbuffer_to_string
|
|
||||||
#define rbw rbuffer_write
|
|
||||||
#define rbn rbuffer_new
|
|
||||||
|
|
||||||
char *forbidden_words[] = {
|
const char* forbidden_words[FORBIDDEN_WORDS_COUNT] = {
|
||||||
"recovery", "techie", "http", "https", "digital", "hack", "::", "//", "com",
|
"recovery", "techie", "http", "https", "digital", "hack", "::", "//", "com",
|
||||||
"@", "crypto", "bitcoin", "wallet", "hacker", "welcome", "whatsapp", "email", "cryptocurrency",
|
"@", "crypto", "bitcoin", "wallet", "hacker", "welcome", "whatsapp", "email", "cryptocurrency",
|
||||||
"stolen", "freeze", "quick", "crucial", "tracing", "scammers", "expers", "hire", "century",
|
"stolen", "freeze", "quick", "crucial", "tracing", "scammers", "expers", "hire", "century",
|
||||||
"transaction", "essential", "managing", "contact", "contacting", "understanding", "assets", "funds", NULL};
|
"transaction", "essential", "managing", "contact", "contacting", "understanding", "assets", "funds",
|
||||||
|
NULL
|
||||||
|
};
|
||||||
|
|
||||||
|
typedef struct {
|
||||||
|
char *filename;
|
||||||
|
long long total_word_count;
|
||||||
|
long long total_capitalized_count;
|
||||||
|
long long total_sentence_count;
|
||||||
|
long long total_number_count;
|
||||||
|
long long total_forbidden_count;
|
||||||
|
} AnalysisResult;
|
||||||
|
|
||||||
bool stricmp(char *word1, char *word2) {
|
int is_forbidden(const char* word) {
|
||||||
while (*word1 && tolower(*word1) == tolower(*word2)) {
|
for (size_t i = 0; forbidden_words[i] != NULL; i++) {
|
||||||
word1++;
|
if (strcmp(word, forbidden_words[i]) == 0) {
|
||||||
word2++;
|
return 1; // Word is forbidden
|
||||||
}
|
|
||||||
return *word1 == *word2;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
void sld(sl *lst) {
|
|
||||||
for (ulonglong i = 0; i < lst->count; i++) {
|
|
||||||
printf("<%llu:%s>\n", i, lst->strings[i]);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
char *remove_preserved_chars(char *content) {
|
|
||||||
char *cc = (char *)malloc(strlen(content) + 1);
|
|
||||||
*cc = 0;
|
|
||||||
char *ccp = cc;
|
|
||||||
while (*content) {
|
|
||||||
if (*content == '<' || *content == '>' || *content == ':') {
|
|
||||||
content++;
|
|
||||||
continue;
|
|
||||||
}
|
}
|
||||||
*ccp = *content;
|
|
||||||
ccp++;
|
|
||||||
*ccp = 0;
|
|
||||||
content++;
|
|
||||||
}
|
}
|
||||||
return cc;
|
return 0; // Word is not forbidden
|
||||||
}
|
|
||||||
//Memory usage: 29 TB, 213.322.618 (re)allocated, 106.670.251 unqiue free'd, 0 in use.
|
|
||||||
|
|
||||||
char *slds(sl *lst) {
|
|
||||||
str_t *buffer = strn(1337);
|
|
||||||
for (ulonglong i = 0; i < lst->count; i++) {
|
|
||||||
char *temp = (char *)malloc(strlen(lst->strings[i]) + 20);
|
|
||||||
char *cc = remove_preserved_chars(lst->strings[i]);
|
|
||||||
sprintf(temp, "<%llu:%s>\n", i, cc);
|
|
||||||
free(cc);
|
|
||||||
stra(buffer, temp);
|
|
||||||
free(temp);
|
|
||||||
}
|
|
||||||
return strc(buffer);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
bool isws(char c) { return c == '\t' || c == '\n' || c == ' ' || c == ','; }
|
char* read_file(const char* filename) {
|
||||||
|
FILE *file = fopen(filename, "r");
|
||||||
|
if (!file) {
|
||||||
char *fread_till_eof(FILE *f) {
|
printf("File doesn't exist: %s\n", filename);
|
||||||
char c;
|
return NULL;
|
||||||
str_t *buffer = strn(1337);
|
|
||||||
while ((c = fgetc(f)) != EOF) {
|
|
||||||
strac(buffer, c);
|
|
||||||
}
|
}
|
||||||
char *content = strc(buffer);
|
|
||||||
|
char *content = NULL;
|
||||||
|
size_t content_size = 0;
|
||||||
|
size_t bytes_read;
|
||||||
|
|
||||||
|
do {
|
||||||
|
char *new_content = (char *)realloc(content, content_size + MAX_TEXT_LENGTH);
|
||||||
|
if (!new_content) {
|
||||||
|
free(content);
|
||||||
|
fclose(file);
|
||||||
|
printf("Memory allocation failed while reading file: %s\n", filename);
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
content = new_content;
|
||||||
|
bytes_read = fread(content + content_size, 1, MAX_TEXT_LENGTH, file);
|
||||||
|
content_size += bytes_read;
|
||||||
|
} while (bytes_read == MAX_TEXT_LENGTH);
|
||||||
|
|
||||||
|
content[content_size] = '\0'; // Null-terminate the string
|
||||||
|
fclose(file);
|
||||||
return content;
|
return content;
|
||||||
}
|
}
|
||||||
|
|
||||||
int get_sentences(char *content) {
|
void* analyze_file(void* arg) {
|
||||||
int count = 0;
|
AnalysisResult *result = (AnalysisResult *)arg;
|
||||||
char *sentence_buffer = (char *)malloc(strlen(content) + 1);
|
char *text = read_file(result->filename);
|
||||||
char *sentence_buffer_p = sentence_buffer;
|
if (text) {
|
||||||
bool in_line = false;
|
long long word_count = 0;
|
||||||
while (*content) {
|
long long capitalized_count = 0;
|
||||||
if ((*content == ' ' || *content == '\t' || *content == '\n') && !in_line) {
|
long long sentence_count = 0;
|
||||||
content++;
|
long long number_count = 0;
|
||||||
continue;
|
long long forbidden_count = 0;
|
||||||
} else {
|
|
||||||
in_line = true;
|
|
||||||
}
|
|
||||||
if (*content == '.') {
|
|
||||||
*sentence_buffer_p = *content;
|
|
||||||
sentence_buffer_p++;
|
|
||||||
*sentence_buffer_p = 0;
|
|
||||||
count++;
|
|
||||||
sentence_buffer_p = sentence_buffer;
|
|
||||||
*sentence_buffer = 0;
|
|
||||||
content++;
|
|
||||||
in_line = false;
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
*sentence_buffer_p = *content;
|
|
||||||
sentence_buffer_p++;
|
|
||||||
*sentence_buffer_p = 0;
|
|
||||||
content++;
|
|
||||||
}
|
|
||||||
free(sentence_buffer);
|
|
||||||
return count;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
for (size_t i = 0; text[i] != '\0'; i++) {
|
||||||
bool is_forbidden_word(char *word) {
|
if (text[i] == '.') {
|
||||||
|
sentence_count++;
|
||||||
for (int j = 0; forbidden_words[j] != NULL; j++) {
|
|
||||||
if (stricmp(word, forbidden_words[j])) {
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
int get_words(char *content, int * count_caps, int *fw_count) {
|
|
||||||
int count = 0;
|
|
||||||
char *word_buffer = (char *)malloc(strlen(content) + 1);
|
|
||||||
char *word_buffer_p = word_buffer;
|
|
||||||
*word_buffer_p = 0;
|
|
||||||
bool has_lcase = false;
|
|
||||||
// rbuffer_t * buffer = rbuffer_new(NULL,0);
|
|
||||||
while (*content) {
|
|
||||||
if (*content == ' ' || *content == '\t' || *content == '\n') {
|
|
||||||
if (word_buffer_p != word_buffer) {
|
|
||||||
if(!has_lcase)
|
|
||||||
{
|
|
||||||
(*count_caps)++;
|
|
||||||
}
|
|
||||||
count++;
|
|
||||||
if(is_forbidden_word(word_buffer)){
|
|
||||||
(*fw_count)++;
|
|
||||||
}
|
|
||||||
word_buffer_p = word_buffer;
|
|
||||||
*word_buffer = 0;
|
|
||||||
|
|
||||||
}
|
}
|
||||||
has_lcase = false;
|
|
||||||
content++;
|
|
||||||
continue;
|
|
||||||
}
|
}
|
||||||
*word_buffer_p = *content;
|
|
||||||
if(islower(*content) == *content)
|
|
||||||
has_lcase = true;
|
|
||||||
word_buffer_p++;
|
|
||||||
*word_buffer_p = 0;
|
|
||||||
content++;
|
|
||||||
}
|
|
||||||
free(word_buffer);
|
|
||||||
return count;
|
|
||||||
}
|
|
||||||
|
|
||||||
bool is_fully_capitalized_word(char *word) {
|
char *saveptr;
|
||||||
while (*word) {
|
char* token = strtok_r(text, " .?!;:\n", &saveptr);
|
||||||
if (isalnum(*word) && toupper(*word) != *word)
|
while (token != NULL) {
|
||||||
return false;
|
word_count++;
|
||||||
word++;
|
|
||||||
}
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
int get_capitalized_words(sl *all_words) {
|
if (isupper(token[0])) {
|
||||||
int count = 0;
|
capitalized_count++;
|
||||||
for (uint i = 0; i < all_words->count; i++) {
|
}
|
||||||
if (is_fully_capitalized_word(all_words->strings[i])) {
|
|
||||||
count++;
|
for (size_t i = 0; token[i] != '\0'; i++) {
|
||||||
|
if (isdigit(token[i])) {
|
||||||
|
number_count++;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (is_forbidden(token)) {
|
||||||
|
forbidden_count++;
|
||||||
|
}
|
||||||
|
|
||||||
|
token = strtok_r(NULL, " .?!;:\n", &saveptr);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
result->total_word_count = word_count;
|
||||||
|
result->total_capitalized_count = capitalized_count;
|
||||||
|
result->total_sentence_count = sentence_count;
|
||||||
|
result->total_number_count = number_count;
|
||||||
|
result->total_forbidden_count = forbidden_count;
|
||||||
|
|
||||||
|
free(text);
|
||||||
}
|
}
|
||||||
|
|
||||||
return count;
|
|
||||||
}
|
|
||||||
|
|
||||||
char *clean_content(char *content) {
|
|
||||||
char *allowed_ichars = "01234567891abcdefghijklmnopqrstuvwxyz.,!?";
|
|
||||||
char *clean_content = (char *)malloc(strlen(content) + 1);
|
|
||||||
char *clean_content_p = clean_content;
|
|
||||||
*clean_content_p = 0;
|
|
||||||
while (*content) {
|
|
||||||
|
|
||||||
if (strchr(allowed_ichars, tolower(*content))) {
|
|
||||||
*clean_content_p = *content;
|
|
||||||
clean_content_p++;
|
|
||||||
*clean_content_p = 0;
|
|
||||||
}
|
|
||||||
content++;
|
|
||||||
}
|
|
||||||
return clean_content;
|
|
||||||
}
|
|
||||||
|
|
||||||
int get_numbers(char *cc) {
|
|
||||||
int count = 0;
|
|
||||||
char *ccc = cc;
|
|
||||||
char *cccp = ccc;
|
|
||||||
char *number_buffer = (char *)malloc(strlen(ccc) + 1);
|
|
||||||
*number_buffer = 0;
|
|
||||||
char *number_buffer_p = number_buffer;
|
|
||||||
while (*cccp) {
|
|
||||||
if (isdigit((*cccp))) {
|
|
||||||
*number_buffer_p = *cccp;
|
|
||||||
number_buffer_p++;
|
|
||||||
*number_buffer_p = 0;
|
|
||||||
} else if (number_buffer != number_buffer_p) {
|
|
||||||
count++;
|
|
||||||
*number_buffer = 0;
|
|
||||||
number_buffer_p = number_buffer;
|
|
||||||
}
|
|
||||||
cccp++;
|
|
||||||
}
|
|
||||||
free(number_buffer);
|
|
||||||
return count;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
unsigned int total = 0;
|
|
||||||
|
|
||||||
char *readall(FILE *f) {
|
|
||||||
if (fseek(f, 0, SEEK_END) != 0) {
|
|
||||||
fclose(f);
|
|
||||||
return NULL;
|
|
||||||
}
|
|
||||||
size_t file_size = ftell(f);
|
|
||||||
if (file_size == (size_t)-1L) {
|
|
||||||
fclose(f);
|
|
||||||
return NULL;
|
|
||||||
}
|
|
||||||
if (fseek(f, 0, SEEK_SET) != 0) {
|
|
||||||
fclose(f);
|
|
||||||
return NULL;
|
|
||||||
}
|
|
||||||
char *buffer = (char *)malloc(file_size + 1);
|
|
||||||
if (!buffer) {
|
|
||||||
fclose(f);
|
|
||||||
return NULL;
|
|
||||||
}
|
|
||||||
size_t bytes_read = fread(buffer, 1, file_size, f);
|
|
||||||
buffer[bytes_read] = 0;
|
|
||||||
return buffer;
|
|
||||||
}
|
|
||||||
|
|
||||||
void analyze(FILE *f) {
|
|
||||||
if(!f){
|
|
||||||
// File doesn't exist
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
total = total + 1;
|
|
||||||
|
|
||||||
printf("#%u\n", total);
|
|
||||||
|
|
||||||
char *data = readall(f);
|
|
||||||
if(!data)
|
|
||||||
return;
|
|
||||||
char *clean_data = clean_content(data);
|
|
||||||
int capitalized_words = 0;
|
|
||||||
int fw = 0;
|
|
||||||
int words = get_words(data,&capitalized_words,&fw);
|
|
||||||
int sentences = get_sentences(data);
|
|
||||||
int numbers = get_numbers(clean_data);
|
|
||||||
|
|
||||||
// All words
|
|
||||||
printf("Words: %d\n", words);
|
|
||||||
|
|
||||||
// All capitalized words
|
|
||||||
printf("Capitalized words: %d\n", capitalized_words);
|
|
||||||
|
|
||||||
// All sentences
|
|
||||||
printf("Sentences: %i\n", sentences);
|
|
||||||
|
|
||||||
// Numbers
|
|
||||||
printf("Numbers: %d\n", numbers);
|
|
||||||
|
|
||||||
// Forbidden words
|
|
||||||
printf("Forbidden words: %d\n", fw);
|
|
||||||
|
|
||||||
if (words) {
|
|
||||||
double capitalized_word_percentage = 100 * ((double)capitalized_words / (double)words);
|
|
||||||
|
|
||||||
printf("Capitalized percentage: %f%%\n", capitalized_word_percentage);
|
|
||||||
double forbidden_word_percentage = 100 * ((double)fw / (double)words);
|
|
||||||
printf("Forbidden percentage: %f%%\n", forbidden_word_percentage);
|
|
||||||
ulonglong word_count_per_sentence = words / (sentences ? sentences : 1);
|
|
||||||
printf("Word count per sentence: %llu\n", word_count_per_sentence);
|
|
||||||
}
|
|
||||||
free(clean_data);
|
|
||||||
free(data);
|
|
||||||
}
|
|
||||||
|
|
||||||
void analyze_file(char *path) {
|
|
||||||
FILE *f = fopen(path, "r");
|
|
||||||
if(f){
|
|
||||||
analyze(f);
|
|
||||||
fclose(f);
|
|
||||||
}else{
|
|
||||||
printf("File doesn't exist: %s\n",path);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
void * analyze_file_thread(void *path){
|
|
||||||
analyze_file((char *)path);
|
|
||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
|
|
||||||
int main(int argc, char *argv[]) {
|
int main(int argc, char *argv[]) {
|
||||||
if (argc > 1) {
|
if (argc < 2) {
|
||||||
pthread_t *threads = (pthread_t *)malloc(argc * sizeof(pthread_t));
|
printf("Usage: %s <file1> <file2> ... <fileN>\n", argv[0]);
|
||||||
for (int i = 1; i < argc; i++) {
|
return 1;
|
||||||
pthread_create(&threads[i-1],NULL,analyze_file_thread,(void *)argv[i]);
|
|
||||||
}
|
|
||||||
for(int i = 1; i < argc; i++){
|
|
||||||
pthread_join(threads[i-1],NULL);
|
|
||||||
}
|
|
||||||
free(threads);
|
|
||||||
return 0;
|
|
||||||
}
|
}
|
||||||
analyze(stdin);
|
|
||||||
printf("%s\n", rmalloc_stats());
|
pthread_t threads[argc - 1];
|
||||||
exit(0);
|
AnalysisResult results[argc - 1];
|
||||||
|
|
||||||
|
for (size_t i = 1; i < argc; i++) {
|
||||||
|
results[i - 1].filename = argv[i];
|
||||||
|
if (pthread_create(&threads[i - 1], NULL, analyze_file, &results[i - 1]) != 0) {
|
||||||
|
printf("Error creating thread for file: %s\n", argv[i]);
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
for (size_t i = 1; i < argc; i++) {
|
||||||
|
pthread_join(threads[i - 1], NULL);
|
||||||
|
}
|
||||||
|
|
||||||
|
long long total_word_count = 0;
|
||||||
|
long long total_capitalized_count = 0;
|
||||||
|
long long total_sentence_count = 0;
|
||||||
|
long long total_number_count = 0;
|
||||||
|
long long total_forbidden_count = 0;
|
||||||
|
|
||||||
|
for (size_t i = 0; i < argc - 1; i++) {
|
||||||
|
total_word_count += results[i].total_word_count;
|
||||||
|
total_capitalized_count += results[i].total_capitalized_count;
|
||||||
|
total_sentence_count += results[i].total_sentence_count;
|
||||||
|
total_number_count += results[i].total_number_count;
|
||||||
|
total_forbidden_count += results[i].total_forbidden_count;
|
||||||
|
}
|
||||||
|
|
||||||
|
double capitalized_percentage = (total_word_count > 0) ? (double)total_capitalized_count / total_word_count * 100.0 : 0;
|
||||||
|
double forbidden_percentage = (total_word_count > 0) ? (double)total_forbidden_count / total_word_count * 100.0 : 0;
|
||||||
|
double word_count_per_sentence = (total_sentence_count > 0) ? (double)total_word_count / total_sentence_count : 0;
|
||||||
|
|
||||||
|
printf("\nTotal Words: %lld\n", total_word_count);
|
||||||
|
printf("Total Capitalized words: %lld\n", total_capitalized_count);
|
||||||
|
printf("Total Sentences: %lld\n", total_sentence_count);
|
||||||
|
printf("Total Numbers: %lld\n", total_number_count);
|
||||||
|
printf("Total Forbidden words: %lld\n", total_forbidden_count);
|
||||||
|
printf("Capitalized percentage: %.6f%%\n", capitalized_percentage);
|
||||||
|
printf("Forbidden percentage: %.6f%%\n", forbidden_percentage);
|
||||||
|
printf("Word count per sentence: %.6f\n", word_count_per_sentence);
|
||||||
|
printf("Total files read: %d\n", (int)(argc - 1));
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
332
retoor_c/isspam.c.bak
Normal file
332
retoor_c/isspam.c.bak
Normal file
@ -0,0 +1,332 @@
|
|||||||
|
#include "rmalloc.h"
|
||||||
|
#include <stdio.h>
|
||||||
|
#include <stdlib.h>
|
||||||
|
#include <string.h>
|
||||||
|
#include <unistd.h>
|
||||||
|
|
||||||
|
#include "rstr.h"
|
||||||
|
#include "rstring_list.h"
|
||||||
|
#include <ctype.h>
|
||||||
|
|
||||||
|
#define sl rstring_list_t
|
||||||
|
#define slf rstring_list_free
|
||||||
|
#define sla rstring_list_add
|
||||||
|
#define sln rstring_list_new
|
||||||
|
#define rb rbuffer_t
|
||||||
|
#define rbf rbuffer_free
|
||||||
|
#define rbs rbuffer_to_string
|
||||||
|
#define rbw rbuffer_write
|
||||||
|
#define rbn rbuffer_new
|
||||||
|
|
||||||
|
char *forbidden_words[] = {
|
||||||
|
"recovery", "techie", "http", "https", "digital", "hack", "::", "//", "com",
|
||||||
|
"@", "crypto", "bitcoin", "wallet", "hacker", "welcome", "whatsapp", "email", "cryptocurrency",
|
||||||
|
"stolen", "freeze", "quick", "crucial", "tracing", "scammers", "expers", "hire", "century",
|
||||||
|
"transaction", "essential", "managing", "contact", "contacting", "understanding", "assets", "funds", NULL};
|
||||||
|
|
||||||
|
|
||||||
|
bool stricmp(char *word1, char *word2) {
|
||||||
|
while (*word1 && tolower(*word1) == tolower(*word2)) {
|
||||||
|
word1++;
|
||||||
|
word2++;
|
||||||
|
}
|
||||||
|
return *word1 == *word2;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
void sld(sl *lst) {
|
||||||
|
for (ulonglong i = 0; i < lst->count; i++) {
|
||||||
|
printf("<%llu:%s>\n", i, lst->strings[i]);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
char *remove_preserved_chars(char *content) {
|
||||||
|
char *cc = (char *)malloc(strlen(content) + 1);
|
||||||
|
*cc = 0;
|
||||||
|
char *ccp = cc;
|
||||||
|
while (*content) {
|
||||||
|
if (*content == '<' || *content == '>' || *content == ':') {
|
||||||
|
content++;
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
*ccp = *content;
|
||||||
|
ccp++;
|
||||||
|
*ccp = 0;
|
||||||
|
content++;
|
||||||
|
}
|
||||||
|
return cc;
|
||||||
|
}
|
||||||
|
//Memory usage: 29 TB, 213.322.618 (re)allocated, 106.670.251 unqiue free'd, 0 in use.
|
||||||
|
|
||||||
|
char *slds(sl *lst) {
|
||||||
|
str_t *buffer = strn(1337);
|
||||||
|
for (ulonglong i = 0; i < lst->count; i++) {
|
||||||
|
char *temp = (char *)malloc(strlen(lst->strings[i]) + 20);
|
||||||
|
char *cc = remove_preserved_chars(lst->strings[i]);
|
||||||
|
sprintf(temp, "<%llu:%s>\n", i, cc);
|
||||||
|
free(cc);
|
||||||
|
stra(buffer, temp);
|
||||||
|
free(temp);
|
||||||
|
}
|
||||||
|
return strc(buffer);
|
||||||
|
}
|
||||||
|
|
||||||
|
bool isws(char c) { return c == '\t' || c == '\n' || c == ' ' || c == ','; }
|
||||||
|
|
||||||
|
|
||||||
|
char *fread_till_eof(FILE *f) {
|
||||||
|
char c;
|
||||||
|
str_t *buffer = strn(1337);
|
||||||
|
while ((c = fgetc(f)) != EOF) {
|
||||||
|
strac(buffer, c);
|
||||||
|
}
|
||||||
|
char *content = strc(buffer);
|
||||||
|
return content;
|
||||||
|
}
|
||||||
|
|
||||||
|
int get_sentences(char *content) {
|
||||||
|
int count = 0;
|
||||||
|
char *sentence_buffer = (char *)malloc(strlen(content) + 1);
|
||||||
|
char *sentence_buffer_p = sentence_buffer;
|
||||||
|
bool in_line = false;
|
||||||
|
while (*content) {
|
||||||
|
if ((*content == ' ' || *content == '\t' || *content == '\n') && !in_line) {
|
||||||
|
content++;
|
||||||
|
continue;
|
||||||
|
} else {
|
||||||
|
in_line = true;
|
||||||
|
}
|
||||||
|
if (*content == '.') {
|
||||||
|
*sentence_buffer_p = *content;
|
||||||
|
sentence_buffer_p++;
|
||||||
|
*sentence_buffer_p = 0;
|
||||||
|
count++;
|
||||||
|
sentence_buffer_p = sentence_buffer;
|
||||||
|
*sentence_buffer = 0;
|
||||||
|
content++;
|
||||||
|
in_line = false;
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
*sentence_buffer_p = *content;
|
||||||
|
sentence_buffer_p++;
|
||||||
|
*sentence_buffer_p = 0;
|
||||||
|
content++;
|
||||||
|
}
|
||||||
|
free(sentence_buffer);
|
||||||
|
return count;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
bool is_forbidden_word(char *word) {
|
||||||
|
|
||||||
|
for (int j = 0; forbidden_words[j] != NULL; j++) {
|
||||||
|
if (stricmp(word, forbidden_words[j])) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
int get_words(char *content, int * count_caps, int *fw_count) {
|
||||||
|
int count = 0;
|
||||||
|
char *word_buffer = (char *)malloc(strlen(content) + 1);
|
||||||
|
char *word_buffer_p = word_buffer;
|
||||||
|
*word_buffer_p = 0;
|
||||||
|
bool has_lcase = false;
|
||||||
|
// rbuffer_t * buffer = rbuffer_new(NULL,0);
|
||||||
|
while (*content) {
|
||||||
|
if (*content == ' ' || *content == '\t' || *content == '\n') {
|
||||||
|
if (word_buffer_p != word_buffer) {
|
||||||
|
if(!has_lcase)
|
||||||
|
{
|
||||||
|
(*count_caps)++;
|
||||||
|
}
|
||||||
|
count++;
|
||||||
|
if(is_forbidden_word(word_buffer)){
|
||||||
|
(*fw_count)++;
|
||||||
|
}
|
||||||
|
word_buffer_p = word_buffer;
|
||||||
|
*word_buffer = 0;
|
||||||
|
|
||||||
|
}
|
||||||
|
has_lcase = false;
|
||||||
|
content++;
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
*word_buffer_p = *content;
|
||||||
|
if(islower(*content) == *content)
|
||||||
|
has_lcase = true;
|
||||||
|
word_buffer_p++;
|
||||||
|
*word_buffer_p = 0;
|
||||||
|
content++;
|
||||||
|
}
|
||||||
|
free(word_buffer);
|
||||||
|
return count;
|
||||||
|
}
|
||||||
|
|
||||||
|
bool is_fully_capitalized_word(char *word) {
|
||||||
|
while (*word) {
|
||||||
|
if (isalnum(*word) && toupper(*word) != *word)
|
||||||
|
return false;
|
||||||
|
word++;
|
||||||
|
}
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
int get_capitalized_words(sl *all_words) {
|
||||||
|
int count = 0;
|
||||||
|
for (uint i = 0; i < all_words->count; i++) {
|
||||||
|
if (is_fully_capitalized_word(all_words->strings[i])) {
|
||||||
|
count++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return count;
|
||||||
|
}
|
||||||
|
|
||||||
|
char *clean_content(char *content) {
|
||||||
|
char *allowed_ichars = "01234567891abcdefghijklmnopqrstuvwxyz.,!?";
|
||||||
|
char *clean_content = (char *)malloc(strlen(content) + 1);
|
||||||
|
char *clean_content_p = clean_content;
|
||||||
|
*clean_content_p = 0;
|
||||||
|
while (*content) {
|
||||||
|
|
||||||
|
if (strchr(allowed_ichars, tolower(*content))) {
|
||||||
|
*clean_content_p = *content;
|
||||||
|
clean_content_p++;
|
||||||
|
*clean_content_p = 0;
|
||||||
|
}
|
||||||
|
content++;
|
||||||
|
}
|
||||||
|
return clean_content;
|
||||||
|
}
|
||||||
|
|
||||||
|
int get_numbers(char *cc) {
|
||||||
|
int count = 0;
|
||||||
|
char *ccc = cc;
|
||||||
|
char *cccp = ccc;
|
||||||
|
char *number_buffer = (char *)malloc(strlen(ccc) + 1);
|
||||||
|
*number_buffer = 0;
|
||||||
|
char *number_buffer_p = number_buffer;
|
||||||
|
while (*cccp) {
|
||||||
|
if (isdigit((*cccp))) {
|
||||||
|
*number_buffer_p = *cccp;
|
||||||
|
number_buffer_p++;
|
||||||
|
*number_buffer_p = 0;
|
||||||
|
} else if (number_buffer != number_buffer_p) {
|
||||||
|
count++;
|
||||||
|
*number_buffer = 0;
|
||||||
|
number_buffer_p = number_buffer;
|
||||||
|
}
|
||||||
|
cccp++;
|
||||||
|
}
|
||||||
|
free(number_buffer);
|
||||||
|
return count;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
unsigned int total = 0;
|
||||||
|
|
||||||
|
char *readall(FILE *f) {
|
||||||
|
if (fseek(f, 0, SEEK_END) != 0) {
|
||||||
|
fclose(f);
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
size_t file_size = ftell(f);
|
||||||
|
if (file_size == (size_t)-1L) {
|
||||||
|
fclose(f);
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
if (fseek(f, 0, SEEK_SET) != 0) {
|
||||||
|
fclose(f);
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
char *buffer = (char *)malloc(file_size + 1);
|
||||||
|
if (!buffer) {
|
||||||
|
fclose(f);
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
size_t bytes_read = fread(buffer, 1, file_size, f);
|
||||||
|
buffer[bytes_read] = 0;
|
||||||
|
return buffer;
|
||||||
|
}
|
||||||
|
|
||||||
|
void analyze(FILE *f) {
|
||||||
|
if(!f){
|
||||||
|
// File doesn't exist
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
total = total + 1;
|
||||||
|
|
||||||
|
printf("#%u\n", total);
|
||||||
|
|
||||||
|
char *data = readall(f);
|
||||||
|
if(!data)
|
||||||
|
return;
|
||||||
|
char *clean_data = clean_content(data);
|
||||||
|
int capitalized_words = 0;
|
||||||
|
int fw = 0;
|
||||||
|
int words = get_words(data,&capitalized_words,&fw);
|
||||||
|
int sentences = get_sentences(data);
|
||||||
|
int numbers = get_numbers(clean_data);
|
||||||
|
|
||||||
|
// All words
|
||||||
|
printf("Words: %d\n", words);
|
||||||
|
|
||||||
|
// All capitalized words
|
||||||
|
printf("Capitalized words: %d\n", capitalized_words);
|
||||||
|
|
||||||
|
// All sentences
|
||||||
|
printf("Sentences: %i\n", sentences);
|
||||||
|
|
||||||
|
// Numbers
|
||||||
|
printf("Numbers: %d\n", numbers);
|
||||||
|
|
||||||
|
// Forbidden words
|
||||||
|
printf("Forbidden words: %d\n", fw);
|
||||||
|
|
||||||
|
if (words) {
|
||||||
|
double capitalized_word_percentage = 100 * ((double)capitalized_words / (double)words);
|
||||||
|
|
||||||
|
printf("Capitalized percentage: %f%%\n", capitalized_word_percentage);
|
||||||
|
double forbidden_word_percentage = 100 * ((double)fw / (double)words);
|
||||||
|
printf("Forbidden percentage: %f%%\n", forbidden_word_percentage);
|
||||||
|
ulonglong word_count_per_sentence = words / (sentences ? sentences : 1);
|
||||||
|
printf("Word count per sentence: %llu\n", word_count_per_sentence);
|
||||||
|
}
|
||||||
|
free(clean_data);
|
||||||
|
free(data);
|
||||||
|
}
|
||||||
|
|
||||||
|
void analyze_file(char *path) {
|
||||||
|
FILE *f = fopen(path, "r");
|
||||||
|
if(f){
|
||||||
|
analyze(f);
|
||||||
|
fclose(f);
|
||||||
|
}else{
|
||||||
|
printf("File doesn't exist: %s\n",path);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
void * analyze_file_thread(void *path){
|
||||||
|
analyze_file((char *)path);
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
|
||||||
|
int main(int argc, char *argv[]) {
|
||||||
|
if (argc > 1) {
|
||||||
|
pthread_t *threads = (pthread_t *)malloc(argc * sizeof(pthread_t));
|
||||||
|
for (int i = 1; i < argc; i++) {
|
||||||
|
pthread_create(&threads[i-1],NULL,analyze_file_thread,(void *)argv[i]);
|
||||||
|
}
|
||||||
|
for(int i = 1; i < argc; i++){
|
||||||
|
pthread_join(threads[i-1],NULL);
|
||||||
|
}
|
||||||
|
free(threads);
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
analyze(stdin);
|
||||||
|
printf("%s\n", rmalloc_stats());
|
||||||
|
exit(0);
|
||||||
|
return 0;
|
||||||
|
}
|
Loading…
Reference in New Issue
Block a user