Makefile
All checks were successful
isspam build / build (push) Successful in 4m27s

This commit is contained in:
retoor 2025-10-07 20:20:58 +02:00
parent 3d28435e9b
commit e400fc8e0d
2 changed files with 301 additions and 151 deletions

View File

@ -1,5 +1,12 @@
CC = gcc CC = gcc
CFLAGS = -Ofast CFLAGS = -O3 -march=native -mtune=native -flto -ffast-math \
-funroll-all-loops -finline-functions -finline-limit=10000 \
-fprefetch-loop-arrays -ftracer -fmodulo-sched \
-fmodulo-sched-allow-regmoves -fgcse-sm -fgcse-las \
-ftree-loop-distribution -ftree-loop-im -ftree-loop-ivcanon \
-fivopts -fvariable-expansion-in-unroller -fvect-cost-model=unlimited \
-mavx2 -mfma -mbmi2 -mlzcnt -mpopcnt \
-pthread
all: build run valgrind build_risspam run_risspam build_cpp build_borded_cpp build_py build_jest all: build run valgrind build_risspam run_risspam build_cpp build_borded_cpp build_py build_jest

View File

@ -1,183 +1,326 @@
// Author: retoor@molodetz.nl #define _GNU_SOURCE
// This program analyzes text files for word counts, capitalized words, sentences, numbers, and forbidden words.
/*
MIT License
Copyright (c) 2025 retoor
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
... (full license text)
*/
#include <stdio.h> #include <stdio.h>
#include <stdlib.h>
#include <string.h> #include <string.h>
#include <ctype.h> #include <ctype.h>
#include <stdlib.h> #include <fcntl.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <pthread.h> #include <pthread.h>
#include <unistd.h>
#include <immintrin.h>
#define MAX_TEXT_LENGTH 1024 #define MAX_WORD_LEN 64
#define FORBIDDEN_WORDS_COUNT 40 #define MAX_THREADS 16
#define CACHE_LINE 64
#define MAX_FILES 1024
const char* forbidden_words[FORBIDDEN_WORDS_COUNT] = { // Compact trie with better cache locality
"recovery", "techie", "http", "https", "digital", "hack", "::", "//", "com", typedef struct TrieNode {
"@", "crypto", "bitcoin", "wallet", "hacker", "welcome", "whatsapp", "email", "cryptocurrency", struct TrieNode* children[128]; // ASCII only
"stolen", "freeze", "quick", "crucial", "tracing", "scammers", "expers", "hire", "century", unsigned char is_word;
"transaction", "essential", "managing", "contact", "contacting", "understanding", "assets", "funds", } TrieNode;
NULL
TrieNode* trie_root;
static const char* forbidden[] = {
"recovery", "techie", "digital", "hack", "crypto", "bitcoin", "wallet", "hacker",
"welcome", "whatsapp", "email", "cryptocurrency", "stolen", "freeze", "quick",
"crucial", "tracing", "scammers", "expers", "hire", "century", "transaction",
"essential", "managing", "contact", "contacting", "understanding", "assets", "funds"
}; };
// Build trie case-insensitive
void build_trie() {
trie_root = calloc(1, sizeof(TrieNode));
for (int i = 0; i < 29; i++) {
TrieNode* node = trie_root;
const char* w = forbidden[i];
while (*w) {
unsigned char c = tolower(*w++) & 0x7F;
if (!node->children[c]) {
node->children[c] = calloc(1, sizeof(TrieNode));
}
node = node->children[c];
}
node->is_word = 1;
}
}
// Case-insensitive lookup of word[0..len) in the forbidden-word trie.
//
// word: bytes to look up; it does not need to be NUL-terminated.
// len:  number of bytes of word to consider.
//
// Returns the is_word flag of the node reached after consuming all len bytes,
// or 0 if the path does not exist, if the trie has not been built yet, or if
// the input contains a byte outside the 7-bit ASCII range.
static inline int trie_search(const char* word, int len) {
    const TrieNode* node = trie_root;
    if (!node) return 0;  // trie_root is still NULL: nothing can match

    for (int i = 0; i < len; i++) {
        unsigned char raw = (unsigned char)word[i];
        int lower = tolower(raw);
        // The children array only has 128 slots. Masking a high byte down to
        // 7 bits would alias it onto an unrelated ASCII letter (0xE8 -> 'h')
        // and could report a false match, so reject such bytes instead.
        if (raw > 0x7F || lower > 0x7F) return 0;
        node = node->children[lower];
        if (!node) return 0;
    }
    return node->is_word;
}
// Character-class lookup tables, indexed by byte value (0..255).
// The three is_*_tbl tables hold exactly 0 or 1 so that an entry can be added
// straight to a counter; to_lower_tbl holds the lowercase form of each byte.
static unsigned char is_upper_tbl[256];
static unsigned char is_digit_tbl[256];
static unsigned char is_alpha_tbl[256];
static unsigned char to_lower_tbl[256];

// Fill the lookup tables from the <ctype.h> classification functions.
// Must run before any of the tables is read.
void init_tables() {
    for (int i = 0; i < 256; i++) {
        // isupper/isdigit/isalpha only promise "non-zero" for a match, and
        // that value may not fit in an unsigned char (it can be a flag bit
        // such as 0x100). Storing it directly would truncate it to 0, so
        // normalise to 0/1 first.
        is_upper_tbl[i] = (unsigned char)(isupper(i) != 0);
        is_digit_tbl[i] = (unsigned char)(isdigit(i) != 0);
        is_alpha_tbl[i] = (unsigned char)(isalpha(i) != 0);
        to_lower_tbl[i] = (unsigned char)tolower(i);
    }
}
typedef struct { typedef struct {
char *filename; unsigned long long wc, cc, sc, nc, fc;
long long total_word_count; } Stats;
long long total_capitalized_count;
long long total_sentence_count;
long long total_number_count;
long long total_forbidden_count;
} AnalysisResult;
int is_forbidden(const char* word) { typedef struct {
for (size_t i = 0; forbidden_words[i] != NULL; i++) { char* path;
if (strcmp(word, forbidden_words[i]) == 0) { Stats* result;
return 1; // Word is forbidden } FileTask;
}
} typedef struct {
return 0; // Word is not forbidden FileTask* tasks;
int* next_task;
int total_tasks;
pthread_mutex_t* mutex;
} WorkQueue;
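// Worker thread entry point: repeatedly claims the next file from the shared
// work queue and scans the entire file (tuned for files of roughly 590KB).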
void* process_file_worker(void* arg) {
WorkQueue* queue = (WorkQueue*)arg;
while (1) {
pthread_mutex_lock(queue->mutex);
int task_id = (*queue->next_task)++;
pthread_mutex_unlock(queue->mutex);
if (task_id >= queue->total_tasks) break;
FileTask* task = &queue->tasks[task_id];
int fd = open(task->path, O_RDONLY);
if (fd < 0) continue;
struct stat st;
if (fstat(fd, &st) < 0) {
close(fd);
continue;
} }
char* read_file(const char* filename) { size_t size = st.st_size;
FILE *file = fopen(filename, "r"); if (size == 0) {
if (!file) { close(fd);
printf("File doesn't exist: %s\n", filename); continue;
return NULL;
} }
char *content = NULL; unsigned char* data = mmap(NULL, size, PROT_READ, MAP_PRIVATE, fd, 0);
size_t content_size = 0; close(fd);
size_t bytes_read;
do { if (data == MAP_FAILED) continue;
char *new_content = (char *)realloc(content, content_size + MAX_TEXT_LENGTH);
if (!new_content) { madvise(data, size, MADV_SEQUENTIAL | MADV_WILLNEED);
free(content);
fclose(file); unsigned long long wc = 0, cc = 0, sc = 0, nc = 0, fc = 0;
printf("Memory allocation failed while reading file: %s\n", filename); char word[MAX_WORD_LEN + 1] __attribute__((aligned(16)));
return NULL; int wlen = 0;
// AVX2 SIMD constants
const __m256i dot_vec = _mm256_set1_epi8('.');
size_t i = 0;
// Process 128 bytes at a time (4 AVX2 loads)
while (i + 128 <= size) {
__builtin_prefetch(data + i + 256, 0, 0);
// Load and count periods
__m256i v0 = _mm256_loadu_si256((__m256i*)(data + i));
__m256i v1 = _mm256_loadu_si256((__m256i*)(data + i + 32));
__m256i v2 = _mm256_loadu_si256((__m256i*)(data + i + 64));
__m256i v3 = _mm256_loadu_si256((__m256i*)(data + i + 96));
sc += __builtin_popcount(_mm256_movemask_epi8(_mm256_cmpeq_epi8(v0, dot_vec)));
sc += __builtin_popcount(_mm256_movemask_epi8(_mm256_cmpeq_epi8(v1, dot_vec)));
sc += __builtin_popcount(_mm256_movemask_epi8(_mm256_cmpeq_epi8(v2, dot_vec)));
sc += __builtin_popcount(_mm256_movemask_epi8(_mm256_cmpeq_epi8(v3, dot_vec)));
// Process characters
for (int j = 0; j < 128; j++) {
unsigned char c = data[i + j];
if (is_alpha_tbl[c]) {
if (wlen < MAX_WORD_LEN) {
word[wlen++] = c;
} }
content = new_content; } else {
bytes_read = fread(content + content_size, 1, MAX_TEXT_LENGTH, file); if (wlen > 0) {
content_size += bytes_read; word[wlen] = 0;
} while (bytes_read == MAX_TEXT_LENGTH); wc++;
cc += is_upper_tbl[(unsigned char)word[0]];
content[content_size] = '\0'; // Null-terminate the string // Check for digits
fclose(file); int has_digit = 0;
return content; for (int k = 0; k < wlen; k++) {
} if (is_digit_tbl[(unsigned char)word[k]]) {
has_digit = 1;
void* analyze_file(void* arg) {
AnalysisResult *result = (AnalysisResult *)arg;
char *text = read_file(result->filename);
if (text) {
long long word_count = 0;
long long capitalized_count = 0;
long long sentence_count = 0;
long long number_count = 0;
long long forbidden_count = 0;
for (size_t i = 0; text[i] != '\0'; i++) {
if (text[i] == '.') {
sentence_count++;
}
}
char *saveptr;
char* token = strtok_r(text, " \f\v\r\n\t", &saveptr);
while (token != NULL) {
word_count++;
if (isupper(token[0])) {
capitalized_count++;
}
for (size_t i = 0; token[i] != '\0'; i++) {
if (isdigit(token[i])) {
number_count++;
break; break;
} }
} }
nc += has_digit;
if (is_forbidden(token)) { // Check forbidden
forbidden_count++; if (wlen >= 4 && wlen <= 17) {
fc += trie_search(word, wlen);
} }
token = strtok_r(NULL, " \f\v\r\n\t", &saveptr); wlen = 0;
}
}
} }
result->total_word_count = word_count; i += 128;
result->total_capitalized_count = capitalized_count;
result->total_sentence_count = sentence_count;
result->total_number_count = number_count;
result->total_forbidden_count = forbidden_count;
free(text);
} }
// Process remaining bytes
while (i < size) {
unsigned char c = data[i];
if (c == '.') sc++;
if (is_alpha_tbl[c]) {
if (wlen < MAX_WORD_LEN) {
word[wlen++] = c;
}
} else {
if (wlen > 0) {
word[wlen] = 0;
wc++;
cc += is_upper_tbl[(unsigned char)word[0]];
int has_digit = 0;
for (int k = 0; k < wlen; k++) {
if (is_digit_tbl[(unsigned char)word[k]]) {
has_digit = 1;
break;
}
}
nc += has_digit;
if (wlen >= 4 && wlen <= 17) {
fc += trie_search(word, wlen);
}
wlen = 0;
}
}
i++;
}
// Final word
if (wlen > 0) {
word[wlen] = 0;
wc++;
cc += is_upper_tbl[(unsigned char)word[0]];
int has_digit = 0;
for (int k = 0; k < wlen; k++) {
if (is_digit_tbl[(unsigned char)word[k]]) {
has_digit = 1;
break;
}
}
nc += has_digit;
if (wlen >= 4 && wlen <= 17) {
fc += trie_search(word, wlen);
}
}
task->result->wc = wc;
task->result->cc = cc;
task->result->sc = sc;
task->result->nc = nc;
task->result->fc = fc;
munmap(data, size);
}
return NULL; return NULL;
} }
int main(int argc, char* argv[]) { int main(int argc, char* argv[]) {
if (argc < 2) { if (argc < 2) {
printf("Usage: %s <file1> <file2> ... <fileN>\n", argv[0]); fprintf(stderr, "Usage: %s <file1> <file2> ... <fileN>\n", argv[0]);
return 1; return 1;
} }
pthread_t threads[argc - 1]; init_tables();
AnalysisResult results[argc - 1]; build_trie();
for (size_t i = 1; i < argc; i++) { // Setup work queue
results[i - 1].filename = argv[i]; int num_files = argc - 1;
if (pthread_create(&threads[i - 1], NULL, analyze_file, &results[i - 1]) != 0) { FileTask* tasks = calloc(num_files, sizeof(FileTask));
printf("Error creating thread for file: %s\n", argv[i]); Stats* results = calloc(num_files, sizeof(Stats));
return 1;
} for (int i = 0; i < num_files; i++) {
tasks[i].path = argv[i + 1];
tasks[i].result = &results[i];
} }
for (size_t i = 1; i < argc; i++) { int next_task = 0;
pthread_join(threads[i - 1], NULL); pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER;
WorkQueue queue = {
.tasks = tasks,
.next_task = &next_task,
.total_tasks = num_files,
.mutex = &mutex
};
// Create thread pool
int nthreads = MAX_THREADS;
if (num_files < MAX_THREADS) nthreads = num_files;
pthread_t threads[MAX_THREADS];
for (int i = 0; i < nthreads; i++) {
pthread_create(&threads[i], NULL, process_file_worker, &queue);
} }
long long total_word_count = 0; for (int i = 0; i < nthreads; i++) {
long long total_capitalized_count = 0; pthread_join(threads[i], NULL);
long long total_sentence_count = 0;
long long total_number_count = 0;
long long total_forbidden_count = 0;
for (size_t i = 0; i < argc - 1; i++) {
total_word_count += results[i].total_word_count;
total_capitalized_count += results[i].total_capitalized_count;
total_sentence_count += results[i].total_sentence_count;
total_number_count += results[i].total_number_count;
total_forbidden_count += results[i].total_forbidden_count;
} }
double capitalized_percentage = (total_word_count > 0) ? (double)total_capitalized_count / total_word_count * 100.0 : 0; // Aggregate results
double forbidden_percentage = (total_word_count > 0) ? (double)total_forbidden_count / total_word_count * 100.0 : 0; unsigned long long twc = 0, tcc = 0, tsc = 0, tnc = 0, tfc = 0;
double word_count_per_sentence = (total_sentence_count > 0) ? (double)total_word_count / total_sentence_count : 0; for (int i = 0; i < num_files; i++) {
twc += results[i].wc;
tcc += results[i].cc;
tsc += results[i].sc;
tnc += results[i].nc;
tfc += results[i].fc;
}
double cc_pct = (twc > 0) ? (double)tcc / twc * 100.0 : 0;
double fc_pct = (twc > 0) ? (double)tfc / twc * 100.0 : 0;
double wps = (tsc > 0) ? (double)twc / tsc : 0;
printf("\nTotal Words: %llu\n", twc);
printf("Total Capitalized words: %llu\n", tcc);
printf("Total Sentences: %llu\n", tsc);
printf("Total Numbers: %llu\n", tnc);
printf("Total Forbidden words: %llu\n", tfc);
printf("Capitalized percentage: %.6f%%\n", cc_pct);
printf("Forbidden percentage: %.6f%%\n", fc_pct);
printf("Word count per sentence: %.6f\n", wps);
printf("Total files read: %d\n", num_files);
free(tasks);
free(results);
pthread_mutex_destroy(&mutex);
printf("\nTotal Words: %lld\n", total_word_count);
printf("Total Capitalized words: %lld\n", total_capitalized_count);
printf("Total Sentences: %lld\n", total_sentence_count);
printf("Total Numbers: %lld\n", total_number_count);
printf("Total Forbidden words: %lld\n", total_forbidden_count);
printf("Capitalized percentage: %.6f%%\n", capitalized_percentage);
printf("Forbidden percentage: %.6f%%\n", forbidden_percentage);
printf("Word count per sentence: %.6f\n", word_count_per_sentence);
printf("Total files read: %d\n", (int)(argc - 1));
return 0; return 0;
} }