This commit is contained in:
retoor 2025-03-20 01:18:41 +01:00
parent 45a60fd3e0
commit 89f3345746
4 changed files with 489 additions and 305 deletions

1
.gitignore vendored
View File

@ -1,3 +1,4 @@
.r_history
.history
.vscode
publish

View File

@ -1,5 +1,5 @@
CC = gcc
CFLAGS = -Wall -Werror -Wextra -Ofast -std=c2x
CFLAGS = -Ofast
all: build run valgrind build_risspam run_risspam

View File

@ -1,332 +1,183 @@
#include "rmalloc.h"
// Author: retoor@molodetz.nl
// This program analyzes text files for word counts, capitalized words, sentences, numbers, and forbidden words.
/*
MIT License
Copyright (c) 2025 retoor
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
... (full license text)
*/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include "rstr.h"
#include "rstring_list.h"
#include <ctype.h>
#include <stdlib.h>
#include <pthread.h>
#define sl rstring_list_t
#define slf rstring_list_free
#define sla rstring_list_add
#define sln rstring_list_new
#define rb rbuffer_t
#define rbf rbuffer_free
#define rbs rbuffer_to_string
#define rbw rbuffer_write
#define rbn rbuffer_new
#define MAX_TEXT_LENGTH 1024
#define FORBIDDEN_WORDS_COUNT 40
// NULL-terminated table of words and fragments that count as "forbidden".
// is_forbidden_word() walks it until the NULL sentinel and compares each entry
// case-insensitively against a whole word.
// Fix: "expers" was a typo for "experts" and could never match real text.
char *forbidden_words[] = {
"recovery", "techie", "http", "https", "digital", "hack", "::", "//", "com",
"@", "crypto", "bitcoin", "wallet", "hacker", "welcome", "whatsapp", "email", "cryptocurrency",
"stolen", "freeze", "quick", "crucial", "tracing", "scammers", "experts", "hire", "century",
"transaction", "essential", "managing", "contact", "contacting", "understanding", "assets", "funds", NULL};
// NULL-terminated table of words and fragments that count as "forbidden".
// is_forbidden() walks it until the first NULL and compares entries with strcmp().
// The array has FORBIDDEN_WORDS_COUNT slots; slots without an initializer are NULL.
// Fix: "expers" was a typo for "experts" and could never match real text.
const char* forbidden_words[FORBIDDEN_WORDS_COUNT] = {
"recovery", "techie", "http", "https", "digital", "hack", "::", "//", "com",
"@", "crypto", "bitcoin", "wallet", "hacker", "welcome", "whatsapp", "email", "cryptocurrency",
"stolen", "freeze", "quick", "crucial", "tracing", "scammers", "experts", "hire", "century",
"transaction", "essential", "managing", "contact", "contacting", "understanding", "assets", "funds",
NULL
};
// Per-file work item handed to analyze_file(): main() fills in `filename`,
// analyze_file() fills in the counters.
// NOTE(review): the counters are only written when the file could be read;
// main() sums them unconditionally, so callers should zero them first — confirm.
typedef struct {
// Path of the file to analyze (main() stores the argv entry here).
char *filename;
// Number of tokens produced by splitting the text on " .?!;:\n".
long long total_word_count;
// Number of tokens whose first character is an uppercase letter.
long long total_capitalized_count;
// Number of '.' characters in the text.
long long total_sentence_count;
// Number of tokens containing at least one digit.
long long total_number_count;
// Number of tokens for which is_forbidden() returned non-zero.
long long total_forbidden_count;
} AnalysisResult;
// Case-insensitive string equality.
//
// Unlike the conventional stricmp(), this returns true when the two strings are
// equal ignoring case and false otherwise (it is a predicate, not an ordering).
//
// Characters are read as unsigned char before being passed to tolower():
// handing a negative plain-char value to a <ctype.h> function is undefined.
bool stricmp(char *word1, char *word2) {
    const unsigned char *a = (const unsigned char *)word1;
    const unsigned char *b = (const unsigned char *)word2;
    while (*a && tolower(*a) == tolower(*b)) {
        a++;
        b++;
    }
    // Equal only when both strings ended at the same position.
    return *a == *b;
}
// Debug dump: prints every string of the list to stdout as "<index:string>",
// one entry per line.
void sld(sl *lst) {
    ulonglong idx = 0;
    while (idx < lst->count) {
        printf("<%llu:%s>\n", idx, lst->strings[idx]);
        idx++;
    }
}
char *remove_preserved_chars(char *content) {
char *cc = (char *)malloc(strlen(content) + 1);
*cc = 0;
char *ccp = cc;
while (*content) {
if (*content == '<' || *content == '>' || *content == ':') {
content++;
continue;
int is_forbidden(const char* word) {
for (size_t i = 0; forbidden_words[i] != NULL; i++) {
if (strcmp(word, forbidden_words[i]) == 0) {
return 1; // Word is forbidden
}
*ccp = *content;
ccp++;
*ccp = 0;
content++;
}
return cc;
}
// Memory usage: 29 TB, 213.322.618 (re)allocated, 106.670.251 unique free'd, 0 in use.
char *slds(sl *lst) {
str_t *buffer = strn(1337);
for (ulonglong i = 0; i < lst->count; i++) {
char *temp = (char *)malloc(strlen(lst->strings[i]) + 20);
char *cc = remove_preserved_chars(lst->strings[i]);
sprintf(temp, "<%llu:%s>\n", i, cc);
free(cc);
stra(buffer, temp);
free(temp);
}
return strc(buffer);
return 0; // Word is not forbidden
}
// Returns true when c is a tab, a newline, a space or a comma.
bool isws(char c) {
    switch (c) {
    case '\t':
    case '\n':
    case ' ':
    case ',':
        return true;
    default:
        return false;
    }
}
char *fread_till_eof(FILE *f) {
char c;
str_t *buffer = strn(1337);
while ((c = fgetc(f)) != EOF) {
strac(buffer, c);
char* read_file(const char* filename) {
FILE *file = fopen(filename, "r");
if (!file) {
printf("File doesn't exist: %s\n", filename);
return NULL;
}
char *content = strc(buffer);
char *content = NULL;
size_t content_size = 0;
size_t bytes_read;
do {
char *new_content = (char *)realloc(content, content_size + MAX_TEXT_LENGTH);
if (!new_content) {
free(content);
fclose(file);
printf("Memory allocation failed while reading file: %s\n", filename);
return NULL;
}
content = new_content;
bytes_read = fread(content + content_size, 1, MAX_TEXT_LENGTH, file);
content_size += bytes_read;
} while (bytes_read == MAX_TEXT_LENGTH);
content[content_size] = '\0'; // Null-terminate the string
fclose(file);
return content;
}
// Counts sentences in content; a sentence is terminated by a '.' character,
// so the result is the number of '.' characters in the text.
//
// The previous version copied every character into a scratch buffer obtained
// from an unchecked malloc() (crashing on allocation failure) although the
// buffer never influenced the result. The count is unchanged.
int get_sentences(char *content) {
    int count = 0;
    for (const char *p = content; *p; p++) {
        if (*p == '.') {
            count++;
        }
    }
    return count;
}
void* analyze_file(void* arg) {
AnalysisResult *result = (AnalysisResult *)arg;
char *text = read_file(result->filename);
if (text) {
long long word_count = 0;
long long capitalized_count = 0;
long long sentence_count = 0;
long long number_count = 0;
long long forbidden_count = 0;
// Returns true when word matches any entry of the NULL-terminated
// forbidden_words table according to stricmp().
bool is_forbidden_word(char *word) {
    int idx = 0;
    while (forbidden_words[idx] != NULL) {
        if (stricmp(word, forbidden_words[idx])) {
            return true;
        }
        idx++;
    }
    return false;
}
int get_words(char *content, int * count_caps, int *fw_count) {
int count = 0;
char *word_buffer = (char *)malloc(strlen(content) + 1);
char *word_buffer_p = word_buffer;
*word_buffer_p = 0;
bool has_lcase = false;
// rbuffer_t * buffer = rbuffer_new(NULL,0);
while (*content) {
if (*content == ' ' || *content == '\t' || *content == '\n') {
if (word_buffer_p != word_buffer) {
if(!has_lcase)
{
(*count_caps)++;
}
count++;
if(is_forbidden_word(word_buffer)){
(*fw_count)++;
}
word_buffer_p = word_buffer;
*word_buffer = 0;
for (size_t i = 0; text[i] != '\0'; i++) {
if (text[i] == '.') {
sentence_count++;
}
has_lcase = false;
content++;
continue;
}
*word_buffer_p = *content;
if(islower(*content) == *content)
has_lcase = true;
word_buffer_p++;
*word_buffer_p = 0;
content++;
}
free(word_buffer);
return count;
}
// Returns true when word contains no lowercase alphanumeric character, i.e.
// every alphanumeric character is unchanged by toupper(). Punctuation and
// digits never disqualify a word; the empty string yields true.
//
// Bytes are read as unsigned char because passing a negative plain-char value
// to isalnum()/toupper() is undefined.
bool is_fully_capitalized_word(char *word) {
    for (const unsigned char *p = (const unsigned char *)word; *p; p++) {
        if (isalnum(*p) && toupper(*p) != *p) {
            return false;
        }
    }
    return true;
}
char *saveptr;
char* token = strtok_r(text, " .?!;:\n", &saveptr);
while (token != NULL) {
word_count++;
int get_capitalized_words(sl *all_words) {
int count = 0;
for (uint i = 0; i < all_words->count; i++) {
if (is_fully_capitalized_word(all_words->strings[i])) {
count++;
if (isupper(token[0])) {
capitalized_count++;
}
for (size_t i = 0; token[i] != '\0'; i++) {
if (isdigit(token[i])) {
number_count++;
break;
}
}
if (is_forbidden(token)) {
forbidden_count++;
}
token = strtok_r(NULL, " .?!;:\n", &saveptr);
}
result->total_word_count = word_count;
result->total_capitalized_count = capitalized_count;
result->total_sentence_count = sentence_count;
result->total_number_count = number_count;
result->total_forbidden_count = forbidden_count;
free(text);
}
return count;
}
// Returns a newly allocated copy of content that keeps only ASCII letters
// (either case), digits and the punctuation ". , ! ?"; everything else,
// including whitespace, is dropped. The caller must free() the result.
//
// Returns NULL when the allocation fails (the previous version wrote through
// the unchecked malloc() result). Bytes are read as unsigned char because
// tolower() is undefined for negative plain-char values.
char *clean_content(char *content) {
    static const char allowed[] = "0123456789abcdefghijklmnopqrstuvwxyz.,!?";
    char *cleaned = malloc(strlen(content) + 1);
    if (!cleaned) {
        return NULL;
    }
    char *dst = cleaned;
    for (const unsigned char *src = (const unsigned char *)content; *src; src++) {
        if (strchr(allowed, tolower(*src))) {
            *dst++ = (char)*src;
        }
    }
    *dst = '\0';
    return cleaned;
}
// Counts the numbers in cc, where a number is a maximal run of consecutive
// decimal digits.
//
// Fixes over the previous version:
//  - a run of digits at the very end of the string was never counted, because
//    counting only happened when a non-digit followed the run;
//  - the digits were copied into a scratch buffer from an unchecked malloc()
//    that was never used for anything but "is a run in progress";
//  - bytes are read as unsigned char, since isdigit() is undefined for
//    negative plain-char values.
int get_numbers(char *cc) {
    int count = 0;
    bool in_number = false;
    for (const unsigned char *p = (const unsigned char *)cc; *p; p++) {
        bool digit = isdigit(*p) != 0;
        if (digit && !in_number) {
            count++;  // first digit of a new run
        }
        in_number = digit;
    }
    return count;
}
// Running number of inputs handed to analyze(); analyze() increments it and
// prints it as "#<n>" before each report.
// NOTE(review): analyze() is reached from the worker threads started in main()
// and this counter is updated without synchronization — confirm whether
// duplicate or skipped numbers are acceptable.
unsigned int total = 0;
// Reads the whole content of the seekable stream f, from its beginning, into
// a newly allocated NUL-terminated buffer that the caller must free().
//
// Returns NULL when the stream cannot be seeked/measured or memory runs out.
// The stream is never closed here: it belongs to the caller. The previous
// version called fclose() on every failure path, but analyze_file() closes the
// same stream again afterwards (double fclose), and analyze(stdin) would have
// closed stdin.
//
// ftell() returns a long with -1 signalling an error; it is checked as such
// before being converted to size_t.
char *readall(FILE *f) {
    if (fseek(f, 0, SEEK_END) != 0) {
        return NULL;
    }
    long file_size = ftell(f);
    if (file_size < 0) {
        return NULL;
    }
    if (fseek(f, 0, SEEK_SET) != 0) {
        return NULL;
    }
    char *buffer = malloc((size_t)file_size + 1);
    if (!buffer) {
        return NULL;
    }
    // fread may deliver fewer bytes than the size (e.g. text-mode translation);
    // terminate at what was actually read.
    size_t bytes_read = fread(buffer, 1, (size_t)file_size, f);
    buffer[bytes_read] = '\0';
    return buffer;
}
// Analyzes one open text stream and prints its report to stdout.
//
// Bumps the global `total`, prints it as "#<n>", loads the stream with
// readall(), then prints word, capitalized-word, sentence, number and
// forbidden-word counts. Percentages and words-per-sentence are printed only
// when at least one word was found. Does nothing for a NULL stream and stops
// after the "#<n>" line when the stream could not be read.
void analyze(FILE *f) {
    if (f == NULL) {
        // No stream, nothing to analyze
        return;
    }

    total++;
    printf("#%u\n", total);

    char *text = readall(f);
    if (text == NULL) {
        return;
    }

    char *filtered = clean_content(text);
    int caps_total = 0;
    int forbidden_total = 0;
    int word_total = get_words(text, &caps_total, &forbidden_total);
    int sentence_total = get_sentences(text);
    // Numbers are counted on the filtered copy, not on the raw text
    int number_total = get_numbers(filtered);

    printf("Words: %d\n", word_total);
    printf("Capitalized words: %d\n", caps_total);
    printf("Sentences: %i\n", sentence_total);
    printf("Numbers: %d\n", number_total);
    printf("Forbidden words: %d\n", forbidden_total);

    if (word_total != 0) {
        double caps_pct = 100 * ((double)caps_total / (double)word_total);
        double forbidden_pct = 100 * ((double)forbidden_total / (double)word_total);
        // Avoid dividing by zero when the text has no sentence terminator
        int divisor = sentence_total ? sentence_total : 1;
        ulonglong words_per_sentence = word_total / divisor;
        printf("Capitalized percentage: %f%%\n", caps_pct);
        printf("Forbidden percentage: %f%%\n", forbidden_pct);
        printf("Word count per sentence: %llu\n", words_per_sentence);
    }

    free(filtered);
    free(text);
}
// Opens path for reading, runs analyze() on it and closes it again.
// Prints "File doesn't exist: <path>" to stdout when the file cannot be opened.
void analyze_file(char *path) {
    FILE *stream = fopen(path, "r");
    if (stream == NULL) {
        printf("File doesn't exist: %s\n", path);
        return;
    }
    analyze(stream);
    fclose(stream);
}
// pthread start routine: treats its argument as a file path, analyzes that
// file with analyze_file() and always returns NULL.
void *analyze_file_thread(void *path) {
    char *file_path = (char *)path;
    analyze_file(file_path);
    return NULL;
}
int main(int argc, char *argv[]) {
if (argc > 1) {
pthread_t *threads = (pthread_t *)malloc(argc * sizeof(pthread_t));
for (int i = 1; i < argc; i++) {
pthread_create(&threads[i-1],NULL,analyze_file_thread,(void *)argv[i]);
}
for(int i = 1; i < argc; i++){
pthread_join(threads[i-1],NULL);
}
free(threads);
return 0;
if (argc < 2) {
printf("Usage: %s <file1> <file2> ... <fileN>\n", argv[0]);
return 1;
}
analyze(stdin);
printf("%s\n", rmalloc_stats());
exit(0);
pthread_t threads[argc - 1];
AnalysisResult results[argc - 1];
for (size_t i = 1; i < argc; i++) {
results[i - 1].filename = argv[i];
if (pthread_create(&threads[i - 1], NULL, analyze_file, &results[i - 1]) != 0) {
printf("Error creating thread for file: %s\n", argv[i]);
return 1;
}
}
for (size_t i = 1; i < argc; i++) {
pthread_join(threads[i - 1], NULL);
}
long long total_word_count = 0;
long long total_capitalized_count = 0;
long long total_sentence_count = 0;
long long total_number_count = 0;
long long total_forbidden_count = 0;
for (size_t i = 0; i < argc - 1; i++) {
total_word_count += results[i].total_word_count;
total_capitalized_count += results[i].total_capitalized_count;
total_sentence_count += results[i].total_sentence_count;
total_number_count += results[i].total_number_count;
total_forbidden_count += results[i].total_forbidden_count;
}
double capitalized_percentage = (total_word_count > 0) ? (double)total_capitalized_count / total_word_count * 100.0 : 0;
double forbidden_percentage = (total_word_count > 0) ? (double)total_forbidden_count / total_word_count * 100.0 : 0;
double word_count_per_sentence = (total_sentence_count > 0) ? (double)total_word_count / total_sentence_count : 0;
printf("\nTotal Words: %lld\n", total_word_count);
printf("Total Capitalized words: %lld\n", total_capitalized_count);
printf("Total Sentences: %lld\n", total_sentence_count);
printf("Total Numbers: %lld\n", total_number_count);
printf("Total Forbidden words: %lld\n", total_forbidden_count);
printf("Capitalized percentage: %.6f%%\n", capitalized_percentage);
printf("Forbidden percentage: %.6f%%\n", forbidden_percentage);
printf("Word count per sentence: %.6f\n", word_count_per_sentence);
printf("Total files read: %d\n", (int)(argc - 1));
return 0;
}
}

332
retoor_c/isspam.c.bak Normal file
View File

@ -0,0 +1,332 @@
#include "rmalloc.h"
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include "rstr.h"
#include "rstring_list.h"
#include <ctype.h>
#define sl rstring_list_t
#define slf rstring_list_free
#define sla rstring_list_add
#define sln rstring_list_new
#define rb rbuffer_t
#define rbf rbuffer_free
#define rbs rbuffer_to_string
#define rbw rbuffer_write
#define rbn rbuffer_new
// NULL-terminated table of words and fragments that count as "forbidden".
// is_forbidden_word() walks it until the NULL sentinel and compares each entry
// case-insensitively against a whole word.
// Fix: "expers" was a typo for "experts" and could never match real text.
char *forbidden_words[] = {
"recovery", "techie", "http", "https", "digital", "hack", "::", "//", "com",
"@", "crypto", "bitcoin", "wallet", "hacker", "welcome", "whatsapp", "email", "cryptocurrency",
"stolen", "freeze", "quick", "crucial", "tracing", "scammers", "experts", "hire", "century",
"transaction", "essential", "managing", "contact", "contacting", "understanding", "assets", "funds", NULL};
// Case-insensitive string equality.
//
// Unlike the conventional stricmp(), this returns true when the two strings are
// equal ignoring case and false otherwise (it is a predicate, not an ordering).
//
// Characters are read as unsigned char before being passed to tolower():
// handing a negative plain-char value to a <ctype.h> function is undefined.
bool stricmp(char *word1, char *word2) {
    const unsigned char *a = (const unsigned char *)word1;
    const unsigned char *b = (const unsigned char *)word2;
    while (*a && tolower(*a) == tolower(*b)) {
        a++;
        b++;
    }
    // Equal only when both strings ended at the same position.
    return *a == *b;
}
// Debug dump: prints every string of the list to stdout as "<index:string>",
// one entry per line.
void sld(sl *lst) {
    ulonglong idx = 0;
    while (idx < lst->count) {
        printf("<%llu:%s>\n", idx, lst->strings[idx]);
        idx++;
    }
}
// Returns a newly allocated copy of content with every '<', '>' and ':'
// removed. The caller must free() the result.
//
// Returns NULL when the allocation fails; the previous version wrote through
// the unchecked malloc() result.
char *remove_preserved_chars(char *content) {
    char *out = malloc(strlen(content) + 1);
    if (!out) {
        return NULL;
    }
    char *dst = out;
    for (const char *src = content; *src; src++) {
        if (*src == '<' || *src == '>' || *src == ':') {
            continue;
        }
        *dst++ = *src;
    }
    *dst = '\0';
    return out;
}
// Memory usage: 29 TB, 213.322.618 (re)allocated, 106.670.251 unique free'd, 0 in use.
// Builds one string holding a "<index:text>\n" line per list entry, where text
// is the entry after remove_preserved_chars() stripped '<', '>' and ':'.
// The result is whatever strc() returns for the accumulated buffer.
// NOTE(review): presumably the caller owns the returned string — confirm
// against rstr.h.
char *slds(sl *lst) {
    str_t *out = strn(1337);
    ulonglong idx = 0;
    while (idx < lst->count) {
        char *entry = lst->strings[idx];
        // Entry length plus 20 bytes for the "<idx:" / ">\n" decoration
        char *line = (char *)malloc(strlen(entry) + 20);
        char *stripped = remove_preserved_chars(entry);
        sprintf(line, "<%llu:%s>\n", idx, stripped);
        free(stripped);
        stra(out, line);
        free(line);
        idx++;
    }
    return strc(out);
}
// Returns true when c is a tab, a newline, a space or a comma.
bool isws(char c) {
    switch (c) {
    case '\t':
    case '\n':
    case ' ':
    case ',':
        return true;
    default:
        return false;
    }
}
// Reads f one character at a time until end of file, appending each character
// to a string buffer, and returns what strc() produces for that buffer.
//
// Fix: the character is kept in an int. fgetc() returns an int so that EOF can
// be told apart from every byte value; with a plain char, a 0xFF byte ends the
// read early where char is signed, and the loop never ends where char is
// unsigned.
char *fread_till_eof(FILE *f) {
    int c;
    str_t *buffer = strn(1337);
    while ((c = fgetc(f)) != EOF) {
        strac(buffer, (char)c);
    }
    return strc(buffer);
}
// Counts sentences in content; a sentence is terminated by a '.' character,
// so the result is the number of '.' characters in the text.
//
// The previous version copied every character into a scratch buffer obtained
// from an unchecked malloc() (crashing on allocation failure) although the
// buffer never influenced the result. The count is unchanged.
int get_sentences(char *content) {
    int count = 0;
    for (const char *p = content; *p; p++) {
        if (*p == '.') {
            count++;
        }
    }
    return count;
}
// Returns true when word matches any entry of the NULL-terminated
// forbidden_words table according to stricmp().
bool is_forbidden_word(char *word) {
    int idx = 0;
    while (forbidden_words[idx] != NULL) {
        if (stricmp(word, forbidden_words[idx])) {
            return true;
        }
        idx++;
    }
    return false;
}
// Splits content on spaces, tabs and newlines and returns the number of words.
//
// For every word found, *count_caps is incremented when the word contains no
// lowercase letter, and *fw_count is incremented when is_forbidden_word()
// accepts it. Both counters are only ever incremented, never reset.
// Returns 0 without touching the counters when the scratch buffer cannot be
// allocated.
//
// Fixes over the previous version:
//  - `islower(c) == c` compared the (boolean-like) result of islower() with
//    the character itself, which is never true, so every word was counted as
//    capitalized; the result is now used as a truth value;
//  - the last word was dropped unless the text ended with whitespace; the end
//    of the string now terminates a word as well;
//  - the malloc() result is checked, and islower() receives an unsigned char.
int get_words(char *content, int *count_caps, int *fw_count) {
    char *word = malloc(strlen(content) + 1);
    if (!word) {
        return 0;
    }
    int count = 0;
    size_t len = 0;
    bool has_lcase = false;
    for (const char *p = content;; p++) {
        bool at_end = (*p == '\0');
        if (at_end || *p == ' ' || *p == '\t' || *p == '\n') {
            if (len > 0) {
                word[len] = '\0';
                count++;
                if (!has_lcase) {
                    (*count_caps)++;
                }
                if (is_forbidden_word(word)) {
                    (*fw_count)++;
                }
                len = 0;
            }
            has_lcase = false;
            if (at_end) {
                break;
            }
            continue;
        }
        word[len++] = *p;
        if (islower((unsigned char)*p)) {
            has_lcase = true;
        }
    }
    free(word);
    return count;
}
// Returns true when word contains no lowercase alphanumeric character, i.e.
// every alphanumeric character is unchanged by toupper(). Punctuation and
// digits never disqualify a word; the empty string yields true.
//
// Bytes are read as unsigned char because passing a negative plain-char value
// to isalnum()/toupper() is undefined.
bool is_fully_capitalized_word(char *word) {
    for (const unsigned char *p = (const unsigned char *)word; *p; p++) {
        if (isalnum(*p) && toupper(*p) != *p) {
            return false;
        }
    }
    return true;
}
// Returns how many strings in all_words are accepted by
// is_fully_capitalized_word().
int get_capitalized_words(sl *all_words) {
    int capitalized = 0;
    uint idx = 0;
    while (idx < all_words->count) {
        if (is_fully_capitalized_word(all_words->strings[idx])) {
            capitalized++;
        }
        idx++;
    }
    return capitalized;
}
// Returns a newly allocated copy of content that keeps only ASCII letters
// (either case), digits and the punctuation ". , ! ?"; everything else,
// including whitespace, is dropped. The caller must free() the result.
//
// Returns NULL when the allocation fails (the previous version wrote through
// the unchecked malloc() result). Bytes are read as unsigned char because
// tolower() is undefined for negative plain-char values.
char *clean_content(char *content) {
    static const char allowed[] = "0123456789abcdefghijklmnopqrstuvwxyz.,!?";
    char *cleaned = malloc(strlen(content) + 1);
    if (!cleaned) {
        return NULL;
    }
    char *dst = cleaned;
    for (const unsigned char *src = (const unsigned char *)content; *src; src++) {
        if (strchr(allowed, tolower(*src))) {
            *dst++ = (char)*src;
        }
    }
    *dst = '\0';
    return cleaned;
}
// Counts the numbers in cc, where a number is a maximal run of consecutive
// decimal digits.
//
// Fixes over the previous version:
//  - a run of digits at the very end of the string was never counted, because
//    counting only happened when a non-digit followed the run;
//  - the digits were copied into a scratch buffer from an unchecked malloc()
//    that was never used for anything but "is a run in progress";
//  - bytes are read as unsigned char, since isdigit() is undefined for
//    negative plain-char values.
int get_numbers(char *cc) {
    int count = 0;
    bool in_number = false;
    for (const unsigned char *p = (const unsigned char *)cc; *p; p++) {
        bool digit = isdigit(*p) != 0;
        if (digit && !in_number) {
            count++;  // first digit of a new run
        }
        in_number = digit;
    }
    return count;
}
// Running number of inputs handed to analyze(); analyze() increments it and
// prints it as "#<n>" before each report.
// NOTE(review): analyze() is reached from the worker threads started in main()
// and this counter is updated without synchronization — confirm whether
// duplicate or skipped numbers are acceptable.
unsigned int total = 0;
// Reads the whole content of the seekable stream f, from its beginning, into
// a newly allocated NUL-terminated buffer that the caller must free().
//
// Returns NULL when the stream cannot be seeked/measured or memory runs out.
// The stream is never closed here: it belongs to the caller. The previous
// version called fclose() on every failure path, but analyze_file() closes the
// same stream again afterwards (double fclose), and analyze(stdin) would have
// closed stdin.
//
// ftell() returns a long with -1 signalling an error; it is checked as such
// before being converted to size_t.
char *readall(FILE *f) {
    if (fseek(f, 0, SEEK_END) != 0) {
        return NULL;
    }
    long file_size = ftell(f);
    if (file_size < 0) {
        return NULL;
    }
    if (fseek(f, 0, SEEK_SET) != 0) {
        return NULL;
    }
    char *buffer = malloc((size_t)file_size + 1);
    if (!buffer) {
        return NULL;
    }
    // fread may deliver fewer bytes than the size (e.g. text-mode translation);
    // terminate at what was actually read.
    size_t bytes_read = fread(buffer, 1, (size_t)file_size, f);
    buffer[bytes_read] = '\0';
    return buffer;
}
// Analyzes one open text stream and prints its report to stdout.
//
// Bumps the global `total`, prints it as "#<n>", loads the stream with
// readall(), then prints word, capitalized-word, sentence, number and
// forbidden-word counts. Percentages and words-per-sentence are printed only
// when at least one word was found. Does nothing for a NULL stream and stops
// after the "#<n>" line when the stream could not be read.
void analyze(FILE *f) {
    if (f == NULL) {
        // No stream, nothing to analyze
        return;
    }

    total++;
    printf("#%u\n", total);

    char *text = readall(f);
    if (text == NULL) {
        return;
    }

    char *filtered = clean_content(text);
    int caps_total = 0;
    int forbidden_total = 0;
    int word_total = get_words(text, &caps_total, &forbidden_total);
    int sentence_total = get_sentences(text);
    // Numbers are counted on the filtered copy, not on the raw text
    int number_total = get_numbers(filtered);

    printf("Words: %d\n", word_total);
    printf("Capitalized words: %d\n", caps_total);
    printf("Sentences: %i\n", sentence_total);
    printf("Numbers: %d\n", number_total);
    printf("Forbidden words: %d\n", forbidden_total);

    if (word_total != 0) {
        double caps_pct = 100 * ((double)caps_total / (double)word_total);
        double forbidden_pct = 100 * ((double)forbidden_total / (double)word_total);
        // Avoid dividing by zero when the text has no sentence terminator
        int divisor = sentence_total ? sentence_total : 1;
        ulonglong words_per_sentence = word_total / divisor;
        printf("Capitalized percentage: %f%%\n", caps_pct);
        printf("Forbidden percentage: %f%%\n", forbidden_pct);
        printf("Word count per sentence: %llu\n", words_per_sentence);
    }

    free(filtered);
    free(text);
}
// Opens path for reading, runs analyze() on it and closes it again.
// Prints "File doesn't exist: <path>" to stdout when the file cannot be opened.
void analyze_file(char *path) {
    FILE *stream = fopen(path, "r");
    if (stream == NULL) {
        printf("File doesn't exist: %s\n", path);
        return;
    }
    analyze(stream);
    fclose(stream);
}
// pthread start routine: treats its argument as a file path, analyzes that
// file with analyze_file() and always returns NULL.
void *analyze_file_thread(void *path) {
    char *file_path = (char *)path;
    analyze_file(file_path);
    return NULL;
}
// Entry point.
//
// With file arguments: analyzes every file on its own thread, waits for all of
// them and returns 0. Without arguments: analyzes stdin, prints the
// rmalloc_stats() line and returns 0.
//
// Fixes over the previous version:
//  - the thread array allocation is checked (returns 1 with a message on
//    stderr when it fails);
//  - the pthread_create() result is checked. Only threads that were actually
//    created are joined (joining an uninitialized pthread_t is undefined); a
//    file whose thread could not be started is analyzed on the calling thread
//    instead, so no input is silently skipped.
int main(int argc, char *argv[]) {
    if (argc > 1) {
        size_t file_count = (size_t)(argc - 1);
        pthread_t *threads = malloc(file_count * sizeof *threads);
        if (!threads) {
            fprintf(stderr, "Out of memory while preparing %d analysis threads\n", argc - 1);
            return 1;
        }
        size_t started = 0;
        for (int i = 1; i < argc; i++) {
            if (pthread_create(&threads[started], NULL, analyze_file_thread, (void *)argv[i]) == 0) {
                started++;
            } else {
                // No worker available: do the work right here.
                analyze_file_thread((void *)argv[i]);
            }
        }
        for (size_t t = 0; t < started; t++) {
            pthread_join(threads[t], NULL);
        }
        free(threads);
        return 0;
    }
    analyze(stdin);
    printf("%s\n", rmalloc_stats());
    return 0;
}