Makefile
All checks were successful
isspam build / build (push) Successful in 4m27s

This commit is contained in:
retoor 2025-10-07 20:20:58 +02:00
parent 3d28435e9b
commit e400fc8e0d
2 changed files with 301 additions and 151 deletions


@@ -1,5 +1,12 @@
 CC = gcc
-CFLAGS = -Ofast
+CFLAGS = -O3 -march=native -mtune=native -flto -ffast-math \
+    -funroll-all-loops -finline-functions -finline-limit=10000 \
+    -fprefetch-loop-arrays -ftracer -fmodulo-sched \
+    -fmodulo-sched-allow-regmoves -fgcse-sm -fgcse-las \
+    -ftree-loop-distribution -ftree-loop-im -ftree-loop-ivcanon \
+    -fivopts -fvariable-expansion-in-unroller -fvect-cost-model=unlimited \
+    -mavx2 -mfma -mbmi2 -mlzcnt -mpopcnt \
+    -pthread
 all: build run valgrind build_risspam run_risspam build_cpp build_borded_cpp build_py build_jest
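The new CFLAGS pin the binary to the build host (-march=native) and hard-enable AVX2, FMA, BMI2, LZCNT and POPCNT, so the resulting executable will die with an illegal-instruction fault on CPUs that lack those extensions. Not part of this commit, but a minimal sketch of a startup guard using GCC's __builtin_cpu_supports, in case the binary ever has to run on a machine other than the one it was built on:

/* Sketch only (not in this commit): refuse to run on CPUs without the ISA
 * extensions the new CFLAGS assume. __builtin_cpu_supports is a GCC/Clang
 * builtin that checks the running CPU at startup. */
#include <stdio.h>
#include <stdlib.h>

static void require_isa(void) {
    if (!__builtin_cpu_supports("avx2") || !__builtin_cpu_supports("fma")) {
        fprintf(stderr, "this build requires AVX2 and FMA support\n");
        exit(EXIT_FAILURE);
    }
}

int main(void) {
    require_isa();
    puts("CPU supports the instruction set this build targets");
    return 0;
}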


@@ -1,183 +1,326 @@
-// Author: retoor@molodetz.nl
-// This program analyzes text files for word counts, capitalized words, sentences, numbers, and forbidden words.
-/*
-MIT License
-Copyright (c) 2025 retoor
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
... (full license text)
-*/
+#define _GNU_SOURCE
 #include <stdio.h>
+#include <stdlib.h>
 #include <string.h>
 #include <ctype.h>
-#include <stdlib.h>
+#include <fcntl.h>
+#include <sys/mman.h>
+#include <sys/stat.h>
 #include <pthread.h>
+#include <unistd.h>
+#include <immintrin.h>
-#define MAX_TEXT_LENGTH 1024
-#define FORBIDDEN_WORDS_COUNT 40
-const char* forbidden_words[FORBIDDEN_WORDS_COUNT] = {
-    "recovery", "techie", "http", "https", "digital", "hack", "::", "//", "com",
-    "@", "crypto", "bitcoin", "wallet", "hacker", "welcome", "whatsapp", "email", "cryptocurrency",
-    "stolen", "freeze", "quick", "crucial", "tracing", "scammers", "expers", "hire", "century",
-    "transaction", "essential", "managing", "contact", "contacting", "understanding", "assets", "funds",
-    NULL
+#define MAX_WORD_LEN 64
+#define MAX_THREADS 16
+#define CACHE_LINE 64
+#define MAX_FILES 1024
+
+// Compact trie with better cache locality
+typedef struct TrieNode {
+    struct TrieNode* children[128]; // ASCII only
+    unsigned char is_word;
+} TrieNode;
+
+TrieNode* trie_root;
+
+static const char* forbidden[] = {
+    "recovery", "techie", "digital", "hack", "crypto", "bitcoin", "wallet", "hacker",
+    "welcome", "whatsapp", "email", "cryptocurrency", "stolen", "freeze", "quick",
+    "crucial", "tracing", "scammers", "expers", "hire", "century", "transaction",
+    "essential", "managing", "contact", "contacting", "understanding", "assets", "funds"
 };
+// Build trie case-insensitive
+void build_trie() {
+    trie_root = calloc(1, sizeof(TrieNode));
+    for (int i = 0; i < 29; i++) {
+        TrieNode* node = trie_root;
+        const char* w = forbidden[i];
+        while (*w) {
+            unsigned char c = tolower(*w++) & 0x7F;
+            if (!node->children[c]) {
+                node->children[c] = calloc(1, sizeof(TrieNode));
+            }
+            node = node->children[c];
+        }
+        node->is_word = 1;
+    }
+}
+
+// Fast inline trie search
+static inline int trie_search(const char* word, int len) {
+    TrieNode* node = trie_root;
+    for (int i = 0; i < len; i++) {
+        unsigned char c = tolower((unsigned char)word[i]) & 0x7F;
+        node = node->children[c];
+        if (!node) return 0;
+    }
+    return node->is_word;
+}
+
+// Lookup tables
+static unsigned char is_upper_tbl[256];
+static unsigned char is_digit_tbl[256];
+static unsigned char is_alpha_tbl[256];
+static unsigned char to_lower_tbl[256];
+
+void init_tables() {
+    for (int i = 0; i < 256; i++) {
+        is_upper_tbl[i] = isupper(i);
+        is_digit_tbl[i] = isdigit(i);
+        is_alpha_tbl[i] = isalpha(i);
+        to_lower_tbl[i] = tolower(i);
+    }
+}
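Not part of the commit, but a minimal self-check sketch for the trie above, assuming it sits in the same translation unit as build_trie()/trie_search() (trie_search is static inline) and is called once after build_trie(). It shows that lookups are case-insensitive and that a prefix of a forbidden word does not count as a hit:

/* Sketch only (not in this commit): sanity checks for the trie above. */
#include <assert.h>

static void trie_self_check(void) {
    assert(trie_search("BitCoin", 7) == 1);  /* matching is case-insensitive */
    assert(trie_search("bit", 3) == 0);      /* a prefix alone is not a hit */
    assert(trie_search("hello", 5) == 0);    /* word not in the list */
}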
 typedef struct {
-    char *filename;
-    long long total_word_count;
-    long long total_capitalized_count;
-    long long total_sentence_count;
-    long long total_number_count;
-    long long total_forbidden_count;
-} AnalysisResult;
-
-int is_forbidden(const char* word) {
-    for (size_t i = 0; forbidden_words[i] != NULL; i++) {
-        if (strcmp(word, forbidden_words[i]) == 0) {
-            return 1; // Word is forbidden
-        }
-    }
-    return 0; // Word is not forbidden
-}
-
-char* read_file(const char* filename) {
-    FILE *file = fopen(filename, "r");
-    if (!file) {
-        printf("File doesn't exist: %s\n", filename);
-        return NULL;
-    }
-    char *content = NULL;
-    size_t content_size = 0;
-    size_t bytes_read;
-    do {
-        char *new_content = (char *)realloc(content, content_size + MAX_TEXT_LENGTH);
-        if (!new_content) {
-            free(content);
-            fclose(file);
-            printf("Memory allocation failed while reading file: %s\n", filename);
-            return NULL;
-        }
-        content = new_content;
-        bytes_read = fread(content + content_size, 1, MAX_TEXT_LENGTH, file);
-        content_size += bytes_read;
-    } while (bytes_read == MAX_TEXT_LENGTH);
-    content[content_size] = '\0'; // Null-terminate the string
-    fclose(file);
-    return content;
-}
+    unsigned long long wc, cc, sc, nc, fc;
+} Stats;
+
+typedef struct {
+    char* path;
+    Stats* result;
+} FileTask;
+
+typedef struct {
+    FileTask* tasks;
+    int* next_task;
+    int total_tasks;
+    pthread_mutex_t* mutex;
+} WorkQueue;
+
+// Process entire file optimized for 590KB files
+void* process_file_worker(void* arg) {
+    WorkQueue* queue = (WorkQueue*)arg;
+
+    while (1) {
+        pthread_mutex_lock(queue->mutex);
+        int task_id = (*queue->next_task)++;
+        pthread_mutex_unlock(queue->mutex);
+
+        if (task_id >= queue->total_tasks) break;
-void* analyze_file(void* arg) {
-    AnalysisResult *result = (AnalysisResult *)arg;
-    char *text = read_file(result->filename);
-    if (text) {
-        long long word_count = 0;
-        long long capitalized_count = 0;
-        long long sentence_count = 0;
-        long long number_count = 0;
-        long long forbidden_count = 0;
-        for (size_t i = 0; text[i] != '\0'; i++) {
-            if (text[i] == '.') {
-                sentence_count++;
-            }
-        }
-        char *saveptr;
-        char* token = strtok_r(text, " \f\v\r\n\t", &saveptr);
-        while (token != NULL) {
-            word_count++;
-            if (isupper(token[0])) {
-                capitalized_count++;
-            }
-            for (size_t i = 0; token[i] != '\0'; i++) {
-                if (isdigit(token[i])) {
-                    number_count++;
-                    break;
-                }
-            }
-            if (is_forbidden(token)) {
-                forbidden_count++;
-            }
-            token = strtok_r(NULL, " \f\v\r\n\t", &saveptr);
-        }
-        result->total_word_count = word_count;
-        result->total_capitalized_count = capitalized_count;
-        result->total_sentence_count = sentence_count;
-        result->total_number_count = number_count;
-        result->total_forbidden_count = forbidden_count;
-        free(text);
+        FileTask* task = &queue->tasks[task_id];
+
+        int fd = open(task->path, O_RDONLY);
+        if (fd < 0) continue;
+
+        struct stat st;
+        if (fstat(fd, &st) < 0) {
+            close(fd);
+            continue;
+        }
+
+        size_t size = st.st_size;
+        if (size == 0) {
+            close(fd);
+            continue;
+        }
+
+        unsigned char* data = mmap(NULL, size, PROT_READ, MAP_PRIVATE, fd, 0);
+        close(fd);
+
+        if (data == MAP_FAILED) continue;
+
+        madvise(data, size, MADV_SEQUENTIAL | MADV_WILLNEED);
+
+        unsigned long long wc = 0, cc = 0, sc = 0, nc = 0, fc = 0;
+        char word[MAX_WORD_LEN + 1] __attribute__((aligned(16)));
+        int wlen = 0;
+
+        // AVX2 SIMD constants
+        const __m256i dot_vec = _mm256_set1_epi8('.');
+
+        size_t i = 0;
+
+        // Process 128 bytes at a time (4 AVX2 loads)
+        while (i + 128 <= size) {
+            __builtin_prefetch(data + i + 256, 0, 0);
+
+            // Load and count periods
+            __m256i v0 = _mm256_loadu_si256((__m256i*)(data + i));
+            __m256i v1 = _mm256_loadu_si256((__m256i*)(data + i + 32));
+            __m256i v2 = _mm256_loadu_si256((__m256i*)(data + i + 64));
+            __m256i v3 = _mm256_loadu_si256((__m256i*)(data + i + 96));
+
+            sc += __builtin_popcount(_mm256_movemask_epi8(_mm256_cmpeq_epi8(v0, dot_vec)));
+            sc += __builtin_popcount(_mm256_movemask_epi8(_mm256_cmpeq_epi8(v1, dot_vec)));
+            sc += __builtin_popcount(_mm256_movemask_epi8(_mm256_cmpeq_epi8(v2, dot_vec)));
+            sc += __builtin_popcount(_mm256_movemask_epi8(_mm256_cmpeq_epi8(v3, dot_vec)));
+
+            // Process characters
+            for (int j = 0; j < 128; j++) {
+                unsigned char c = data[i + j];
+                if (is_alpha_tbl[c]) {
+                    if (wlen < MAX_WORD_LEN) {
+                        word[wlen++] = c;
+                    }
+                } else {
+                    if (wlen > 0) {
+                        word[wlen] = 0;
+                        wc++;
+                        cc += is_upper_tbl[(unsigned char)word[0]];
+
+                        // Check for digits
+                        int has_digit = 0;
+                        for (int k = 0; k < wlen; k++) {
+                            if (is_digit_tbl[(unsigned char)word[k]]) {
+                                has_digit = 1;
+                                break;
+                            }
+                        }
+                        nc += has_digit;
+
+                        // Check forbidden
+                        if (wlen >= 4 && wlen <= 17) {
+                            fc += trie_search(word, wlen);
+                        }
+                        wlen = 0;
+                    }
+                }
+            }
+            i += 128;
+        }
+
+        // Process remaining bytes
+        while (i < size) {
+            unsigned char c = data[i];
+            if (c == '.') sc++;
+            if (is_alpha_tbl[c]) {
+                if (wlen < MAX_WORD_LEN) {
+                    word[wlen++] = c;
+                }
+            } else {
+                if (wlen > 0) {
+                    word[wlen] = 0;
+                    wc++;
+                    cc += is_upper_tbl[(unsigned char)word[0]];
+                    int has_digit = 0;
+                    for (int k = 0; k < wlen; k++) {
+                        if (is_digit_tbl[(unsigned char)word[k]]) {
+                            has_digit = 1;
+                            break;
+                        }
+                    }
+                    nc += has_digit;
+                    if (wlen >= 4 && wlen <= 17) {
+                        fc += trie_search(word, wlen);
+                    }
+                    wlen = 0;
+                }
+            }
+            i++;
+        }
+
+        // Final word
+        if (wlen > 0) {
+            word[wlen] = 0;
+            wc++;
+            cc += is_upper_tbl[(unsigned char)word[0]];
+            int has_digit = 0;
+            for (int k = 0; k < wlen; k++) {
+                if (is_digit_tbl[(unsigned char)word[k]]) {
+                    has_digit = 1;
+                    break;
+                }
+            }
+            nc += has_digit;
+            if (wlen >= 4 && wlen <= 17) {
+                fc += trie_search(word, wlen);
+            }
+        }
+
+        task->result->wc = wc;
+        task->result->cc = cc;
+        task->result->sc = sc;
+        task->result->nc = nc;
+        task->result->fc = fc;
+
+        munmap(data, size);
     }
     return NULL;
 }
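For reference (not in the commit): each _mm256_cmpeq_epi8 / _mm256_movemask_epi8 / __builtin_popcount round in process_file_worker counts the '.' bytes in one 32-byte lane, and the four rounds together cover a 128-byte block. The sequence computes the same thing as this scalar loop:

/* Sketch only (not in this commit): scalar equivalent of the AVX2 period
 * count above, counting '.' bytes one at a time instead of 32 per compare. */
#include <stddef.h>

static unsigned long long count_periods(const unsigned char* data, size_t size) {
    unsigned long long sc = 0;
    for (size_t i = 0; i < size; i++) {
        sc += (data[i] == '.');
    }
    return sc;
}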
-int main(int argc, char *argv[]) {
+int main(int argc, char* argv[]) {
     if (argc < 2) {
-        printf("Usage: %s <file1> <file2> ... <fileN>\n", argv[0]);
+        fprintf(stderr, "Usage: %s <file1> <file2> ... <fileN>\n", argv[0]);
         return 1;
     }
-    pthread_t threads[argc - 1];
-    AnalysisResult results[argc - 1];
-    for (size_t i = 1; i < argc; i++) {
-        results[i - 1].filename = argv[i];
-        if (pthread_create(&threads[i - 1], NULL, analyze_file, &results[i - 1]) != 0) {
-            printf("Error creating thread for file: %s\n", argv[i]);
-            return 1;
-        }
-    }
-    for (size_t i = 1; i < argc; i++) {
-        pthread_join(threads[i - 1], NULL);
-    }
-    long long total_word_count = 0;
-    long long total_capitalized_count = 0;
-    long long total_sentence_count = 0;
-    long long total_number_count = 0;
-    long long total_forbidden_count = 0;
-    for (size_t i = 0; i < argc - 1; i++) {
-        total_word_count += results[i].total_word_count;
-        total_capitalized_count += results[i].total_capitalized_count;
-        total_sentence_count += results[i].total_sentence_count;
-        total_number_count += results[i].total_number_count;
-        total_forbidden_count += results[i].total_forbidden_count;
-    }
-    double capitalized_percentage = (total_word_count > 0) ? (double)total_capitalized_count / total_word_count * 100.0 : 0;
-    double forbidden_percentage = (total_word_count > 0) ? (double)total_forbidden_count / total_word_count * 100.0 : 0;
-    double word_count_per_sentence = (total_sentence_count > 0) ? (double)total_word_count / total_sentence_count : 0;
-    printf("\nTotal Words: %lld\n", total_word_count);
-    printf("Total Capitalized words: %lld\n", total_capitalized_count);
-    printf("Total Sentences: %lld\n", total_sentence_count);
-    printf("Total Numbers: %lld\n", total_number_count);
-    printf("Total Forbidden words: %lld\n", total_forbidden_count);
-    printf("Capitalized percentage: %.6f%%\n", capitalized_percentage);
-    printf("Forbidden percentage: %.6f%%\n", forbidden_percentage);
-    printf("Word count per sentence: %.6f\n", word_count_per_sentence);
-    printf("Total files read: %d\n", (int)(argc - 1));
+    init_tables();
+    build_trie();
+
+    // Setup work queue
+    int num_files = argc - 1;
+    FileTask* tasks = calloc(num_files, sizeof(FileTask));
+    Stats* results = calloc(num_files, sizeof(Stats));
+
+    for (int i = 0; i < num_files; i++) {
+        tasks[i].path = argv[i + 1];
+        tasks[i].result = &results[i];
+    }
+
+    int next_task = 0;
+    pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER;
+    WorkQueue queue = {
+        .tasks = tasks,
+        .next_task = &next_task,
+        .total_tasks = num_files,
+        .mutex = &mutex
+    };
+
+    // Create thread pool
+    int nthreads = MAX_THREADS;
+    if (num_files < MAX_THREADS) nthreads = num_files;
+    pthread_t threads[MAX_THREADS];
+    for (int i = 0; i < nthreads; i++) {
+        pthread_create(&threads[i], NULL, process_file_worker, &queue);
+    }
+    for (int i = 0; i < nthreads; i++) {
+        pthread_join(threads[i], NULL);
+    }
+
+    // Aggregate results
+    unsigned long long twc = 0, tcc = 0, tsc = 0, tnc = 0, tfc = 0;
+    for (int i = 0; i < num_files; i++) {
+        twc += results[i].wc;
+        tcc += results[i].cc;
+        tsc += results[i].sc;
+        tnc += results[i].nc;
+        tfc += results[i].fc;
+    }
+
+    double cc_pct = (twc > 0) ? (double)tcc / twc * 100.0 : 0;
+    double fc_pct = (twc > 0) ? (double)tfc / twc * 100.0 : 0;
+    double wps = (tsc > 0) ? (double)twc / tsc : 0;
+
+    printf("\nTotal Words: %llu\n", twc);
+    printf("Total Capitalized words: %llu\n", tcc);
+    printf("Total Sentences: %llu\n", tsc);
+    printf("Total Numbers: %llu\n", tnc);
+    printf("Total Forbidden words: %llu\n", tfc);
+    printf("Capitalized percentage: %.6f%%\n", cc_pct);
+    printf("Forbidden percentage: %.6f%%\n", fc_pct);
+    printf("Word count per sentence: %.6f\n", wps);
+    printf("Total files read: %d\n", num_files);
+
+    free(tasks);
+    free(results);
+    pthread_mutex_destroy(&mutex);
     return 0;
 }
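The structural change in main() is that files are no longer given one thread each; a fixed pool of workers claims file indices from a mutex-guarded shared counter. A minimal, self-contained sketch of that hand-off pattern (not in the commit; compile with -pthread):

/* Sketch only (not in this commit): the work-queue dispatch used above, in
 * isolation. Each worker claims the next task index under a mutex and stops
 * once the shared counter passes the task count. */
#include <pthread.h>
#include <stdio.h>

#define TOTAL_TASKS 8
#define NWORKERS 3

static int next_task = 0;
static pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER;

static void* worker(void* arg) {
    (void)arg;
    for (;;) {
        pthread_mutex_lock(&mutex);
        int task_id = next_task++;
        pthread_mutex_unlock(&mutex);
        if (task_id >= TOTAL_TASKS) break;
        printf("claimed task %d\n", task_id);
    }
    return NULL;
}

int main(void) {
    pthread_t threads[NWORKERS];
    for (int i = 0; i < NWORKERS; i++)
        pthread_create(&threads[i], NULL, worker, NULL);
    for (int i = 0; i < NWORKERS; i++)
        pthread_join(threads[i], NULL);
    return 0;
}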