Makefile
All checks were successful
isspam build / build (push) Successful in 4m27s

This commit is contained in:
retoor 2025-10-07 20:20:58 +02:00
parent 3d28435e9b
commit e400fc8e0d
2 changed files with 301 additions and 151 deletions

View File

@ -1,5 +1,12 @@
CC = gcc
CFLAGS = -Ofast
CFLAGS = -O3 -march=native -mtune=native -flto -ffast-math \
-funroll-all-loops -finline-functions -finline-limit=10000 \
-fprefetch-loop-arrays -ftracer -fmodulo-sched \
-fmodulo-sched-allow-regmoves -fgcse-sm -fgcse-las \
-ftree-loop-distribution -ftree-loop-im -ftree-loop-ivcanon \
-fivopts -fvariable-expansion-in-unroller -fvect-cost-model=unlimited \
-mavx2 -mfma -mbmi2 -mlzcnt -mpopcnt \
-pthread
all: build run valgrind build_risspam run_risspam build_cpp build_borded_cpp build_py build_jest

View File

@ -1,183 +1,326 @@
// Author: retoor@molodetz.nl
// This program analyzes text files for word counts, capitalized words, sentences, numbers, and forbidden words.
/*
MIT License
Copyright (c) 2025 retoor
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
... (full license text)
*/
#define _GNU_SOURCE
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
#include <stdlib.h>
#include <fcntl.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <pthread.h>
#include <unistd.h>
#include <immintrin.h>
#define MAX_TEXT_LENGTH 1024
#define FORBIDDEN_WORDS_COUNT 40
#define MAX_WORD_LEN 64
#define MAX_THREADS 16
#define CACHE_LINE 64
#define MAX_FILES 1024
const char* forbidden_words[FORBIDDEN_WORDS_COUNT] = {
"recovery", "techie", "http", "https", "digital", "hack", "::", "//", "com",
"@", "crypto", "bitcoin", "wallet", "hacker", "welcome", "whatsapp", "email", "cryptocurrency",
"stolen", "freeze", "quick", "crucial", "tracing", "scammers", "expers", "hire", "century",
"transaction", "essential", "managing", "contact", "contacting", "understanding", "assets", "funds",
NULL
/*
 * Node of the forbidden-word trie.
 * Every node carries one child slot per 7-bit ASCII code plus a flag that
 * marks the end of a stored word.
 */
typedef struct TrieNode TrieNode;
struct TrieNode {
    TrieNode *children[128]; /* indexed by ASCII code, 0..127 */
    unsigned char is_word;   /* set to 1 where an inserted word ends */
};

/* Root of the trie; NULL until build_trie() allocates it. */
TrieNode *trie_root;
/*
 * Words counted as "forbidden" by the scanner. build_trie() inserts every
 * entry into the trie, lower-cased.
 * NOTE(review): "expers" looks like a typo for "experts" - confirm before
 * changing, since it alters the reported counts.
 */
static const char *forbidden[] = {
    "recovery",
    "techie",
    "digital",
    "hack",
    "crypto",
    "bitcoin",
    "wallet",
    "hacker",
    "welcome",
    "whatsapp",
    "email",
    "cryptocurrency",
    "stolen",
    "freeze",
    "quick",
    "crucial",
    "tracing",
    "scammers",
    "expers",
    "hire",
    "century",
    "transaction",
    "essential",
    "managing",
    "contact",
    "contacting",
    "understanding",
    "assets",
    "funds",
};
// Build trie case-insensitive
void build_trie() {
trie_root = calloc(1, sizeof(TrieNode));
for (int i = 0; i < 29; i++) {
TrieNode* node = trie_root;
const char* w = forbidden[i];
while (*w) {
unsigned char c = tolower(*w++) & 0x7F;
if (!node->children[c]) {
node->children[c] = calloc(1, sizeof(TrieNode));
}
node = node->children[c];
}
node->is_word = 1;
}
}
/*
 * Case-insensitive lookup of the `len` bytes at `word` in the forbidden-word
 * trie rooted at `trie_root`.
 *
 * Returns the is_word flag of the node reached (1 for a stored word), or 0
 * when the path does not exist, when the trie has not been built, or when
 * the input contains a byte outside 7-bit ASCII.
 */
static inline int trie_search(const char* word, int len) {
    const TrieNode *node = trie_root;
    for (int i = 0; node && i < len; i++) {
        unsigned char c = (unsigned char)word[i];
        /* Reject high bytes instead of masking them: 0xE8 & 0x7F would
           otherwise alias onto 'h' and could produce a false match. */
        if (c > 0x7F) return 0;
        node = node->children[tolower(c)];
    }
    return node ? node->is_word : 0;
}
/*
 * Byte-indexed lookup tables, filled once by init_tables().
 * The is_* tables hold exactly 0 or 1 so they can be added directly to
 * counters; to_lower_tbl maps each byte to its tolower() value.
 */
static unsigned char is_upper_tbl[256];
static unsigned char is_digit_tbl[256];
static unsigned char is_alpha_tbl[256];
static unsigned char to_lower_tbl[256];

/*
 * Populate the lookup tables from the <ctype.h> classification functions.
 *
 * The ctype predicates only promise a non-zero int for "true"; some C
 * libraries return bit masks wider than 8 bits, which would truncate to 0
 * if stored straight into an unsigned char. Each result is therefore
 * normalised to 0/1 before it is stored.
 */
void init_tables(void) {
    for (int i = 0; i < 256; i++) {
        is_upper_tbl[i] = (unsigned char)(isupper(i) != 0);
        is_digit_tbl[i] = (unsigned char)(isdigit(i) != 0);
        is_alpha_tbl[i] = (unsigned char)(isalpha(i) != 0);
        to_lower_tbl[i] = (unsigned char)tolower(i);
    }
}
typedef struct {
char *filename;
long long total_word_count;
long long total_capitalized_count;
long long total_sentence_count;
long long total_number_count;
long long total_forbidden_count;
} AnalysisResult;
unsigned long long wc, cc, sc, nc, fc;
} Stats;
int is_forbidden(const char* word) {
for (size_t i = 0; forbidden_words[i] != NULL; i++) {
if (strcmp(word, forbidden_words[i]) == 0) {
return 1; // Word is forbidden
/* One unit of work for a worker thread: a file and the slot for its counts. */
typedef struct {
    char *path;    /* path handed to open() by the worker */
    Stats *result; /* destination for this file's wc/cc/sc/nc/fc counters */
} FileTask;
/* Shared description of the task list that all worker threads pull from. */
typedef struct {
    FileTask *tasks;        /* array of tasks, total_tasks entries long */
    int *next_task;         /* index of the next unclaimed task; advanced under mutex */
    int total_tasks;        /* number of entries in tasks */
    pthread_mutex_t *mutex; /* held while a worker reads and increments *next_task */
} WorkQueue;
// Process entire file optimized for 590KB files
void* process_file_worker(void* arg) {
WorkQueue* queue = (WorkQueue*)arg;
while (1) {
pthread_mutex_lock(queue->mutex);
int task_id = (*queue->next_task)++;
pthread_mutex_unlock(queue->mutex);
if (task_id >= queue->total_tasks) break;
FileTask* task = &queue->tasks[task_id];
int fd = open(task->path, O_RDONLY);
if (fd < 0) continue;
struct stat st;
if (fstat(fd, &st) < 0) {
close(fd);
continue;
}
}
return 0; // Word is not forbidden
}
char* read_file(const char* filename) {
FILE *file = fopen(filename, "r");
if (!file) {
printf("File doesn't exist: %s\n", filename);
return NULL;
}
char *content = NULL;
size_t content_size = 0;
size_t bytes_read;
do {
char *new_content = (char *)realloc(content, content_size + MAX_TEXT_LENGTH);
if (!new_content) {
free(content);
fclose(file);
printf("Memory allocation failed while reading file: %s\n", filename);
return NULL;
size_t size = st.st_size;
if (size == 0) {
close(fd);
continue;
}
content = new_content;
bytes_read = fread(content + content_size, 1, MAX_TEXT_LENGTH, file);
content_size += bytes_read;
} while (bytes_read == MAX_TEXT_LENGTH);
content[content_size] = '\0'; // Null-terminate the string
fclose(file);
return content;
}
void* analyze_file(void* arg) {
AnalysisResult *result = (AnalysisResult *)arg;
char *text = read_file(result->filename);
if (text) {
long long word_count = 0;
long long capitalized_count = 0;
long long sentence_count = 0;
long long number_count = 0;
long long forbidden_count = 0;
for (size_t i = 0; text[i] != '\0'; i++) {
if (text[i] == '.') {
sentence_count++;
unsigned char* data = mmap(NULL, size, PROT_READ, MAP_PRIVATE, fd, 0);
close(fd);
if (data == MAP_FAILED) continue;
madvise(data, size, MADV_SEQUENTIAL | MADV_WILLNEED);
unsigned long long wc = 0, cc = 0, sc = 0, nc = 0, fc = 0;
char word[MAX_WORD_LEN + 1] __attribute__((aligned(16)));
int wlen = 0;
// AVX2 SIMD constants
const __m256i dot_vec = _mm256_set1_epi8('.');
size_t i = 0;
// Process 128 bytes at a time (4 AVX2 loads)
while (i + 128 <= size) {
__builtin_prefetch(data + i + 256, 0, 0);
// Load and count periods
__m256i v0 = _mm256_loadu_si256((__m256i*)(data + i));
__m256i v1 = _mm256_loadu_si256((__m256i*)(data + i + 32));
__m256i v2 = _mm256_loadu_si256((__m256i*)(data + i + 64));
__m256i v3 = _mm256_loadu_si256((__m256i*)(data + i + 96));
sc += __builtin_popcount(_mm256_movemask_epi8(_mm256_cmpeq_epi8(v0, dot_vec)));
sc += __builtin_popcount(_mm256_movemask_epi8(_mm256_cmpeq_epi8(v1, dot_vec)));
sc += __builtin_popcount(_mm256_movemask_epi8(_mm256_cmpeq_epi8(v2, dot_vec)));
sc += __builtin_popcount(_mm256_movemask_epi8(_mm256_cmpeq_epi8(v3, dot_vec)));
// Process characters
for (int j = 0; j < 128; j++) {
unsigned char c = data[i + j];
if (is_alpha_tbl[c]) {
if (wlen < MAX_WORD_LEN) {
word[wlen++] = c;
}
} else {
if (wlen > 0) {
word[wlen] = 0;
wc++;
cc += is_upper_tbl[(unsigned char)word[0]];
// Check for digits
int has_digit = 0;
for (int k = 0; k < wlen; k++) {
if (is_digit_tbl[(unsigned char)word[k]]) {
has_digit = 1;
break;
}
}
nc += has_digit;
// Check forbidden
if (wlen >= 4 && wlen <= 17) {
fc += trie_search(word, wlen);
}
wlen = 0;
}
}
}
i += 128;
}
char *saveptr;
char* token = strtok_r(text, " \f\v\r\n\t", &saveptr);
while (token != NULL) {
word_count++;
if (isupper(token[0])) {
capitalized_count++;
// Process remaining bytes
while (i < size) {
unsigned char c = data[i];
if (c == '.') sc++;
if (is_alpha_tbl[c]) {
if (wlen < MAX_WORD_LEN) {
word[wlen++] = c;
}
} else {
if (wlen > 0) {
word[wlen] = 0;
wc++;
cc += is_upper_tbl[(unsigned char)word[0]];
int has_digit = 0;
for (int k = 0; k < wlen; k++) {
if (is_digit_tbl[(unsigned char)word[k]]) {
has_digit = 1;
break;
}
}
nc += has_digit;
if (wlen >= 4 && wlen <= 17) {
fc += trie_search(word, wlen);
}
wlen = 0;
}
}
for (size_t i = 0; token[i] != '\0'; i++) {
if (isdigit(token[i])) {
number_count++;
i++;
}
// Final word
if (wlen > 0) {
word[wlen] = 0;
wc++;
cc += is_upper_tbl[(unsigned char)word[0]];
int has_digit = 0;
for (int k = 0; k < wlen; k++) {
if (is_digit_tbl[(unsigned char)word[k]]) {
has_digit = 1;
break;
}
}
if (is_forbidden(token)) {
forbidden_count++;
nc += has_digit;
if (wlen >= 4 && wlen <= 17) {
fc += trie_search(word, wlen);
}
token = strtok_r(NULL, " \f\v\r\n\t", &saveptr);
}
result->total_word_count = word_count;
result->total_capitalized_count = capitalized_count;
result->total_sentence_count = sentence_count;
result->total_number_count = number_count;
result->total_forbidden_count = forbidden_count;
free(text);
task->result->wc = wc;
task->result->cc = cc;
task->result->sc = sc;
task->result->nc = nc;
task->result->fc = fc;
munmap(data, size);
}
return NULL;
}
int main(int argc, char *argv[]) {
int main(int argc, char* argv[]) {
if (argc < 2) {
printf("Usage: %s <file1> <file2> ... <fileN>\n", argv[0]);
fprintf(stderr, "Usage: %s <file1> <file2> ... <fileN>\n", argv[0]);
return 1;
}
pthread_t threads[argc - 1];
AnalysisResult results[argc - 1];
for (size_t i = 1; i < argc; i++) {
results[i - 1].filename = argv[i];
if (pthread_create(&threads[i - 1], NULL, analyze_file, &results[i - 1]) != 0) {
printf("Error creating thread for file: %s\n", argv[i]);
return 1;
}
init_tables();
build_trie();
// Setup work queue
int num_files = argc - 1;
FileTask* tasks = calloc(num_files, sizeof(FileTask));
Stats* results = calloc(num_files, sizeof(Stats));
for (int i = 0; i < num_files; i++) {
tasks[i].path = argv[i + 1];
tasks[i].result = &results[i];
}
for (size_t i = 1; i < argc; i++) {
pthread_join(threads[i - 1], NULL);
int next_task = 0;
pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER;
WorkQueue queue = {
.tasks = tasks,
.next_task = &next_task,
.total_tasks = num_files,
.mutex = &mutex
};
// Create thread pool
int nthreads = MAX_THREADS;
if (num_files < MAX_THREADS) nthreads = num_files;
pthread_t threads[MAX_THREADS];
for (int i = 0; i < nthreads; i++) {
pthread_create(&threads[i], NULL, process_file_worker, &queue);
}
long long total_word_count = 0;
long long total_capitalized_count = 0;
long long total_sentence_count = 0;
long long total_number_count = 0;
long long total_forbidden_count = 0;
for (size_t i = 0; i < argc - 1; i++) {
total_word_count += results[i].total_word_count;
total_capitalized_count += results[i].total_capitalized_count;
total_sentence_count += results[i].total_sentence_count;
total_number_count += results[i].total_number_count;
total_forbidden_count += results[i].total_forbidden_count;
for (int i = 0; i < nthreads; i++) {
pthread_join(threads[i], NULL);
}
// Aggregate results
unsigned long long twc = 0, tcc = 0, tsc = 0, tnc = 0, tfc = 0;
for (int i = 0; i < num_files; i++) {
twc += results[i].wc;
tcc += results[i].cc;
tsc += results[i].sc;
tnc += results[i].nc;
tfc += results[i].fc;
}
double cc_pct = (twc > 0) ? (double)tcc / twc * 100.0 : 0;
double fc_pct = (twc > 0) ? (double)tfc / twc * 100.0 : 0;
double wps = (tsc > 0) ? (double)twc / tsc : 0;
double capitalized_percentage = (total_word_count > 0) ? (double)total_capitalized_count / total_word_count * 100.0 : 0;
double forbidden_percentage = (total_word_count > 0) ? (double)total_forbidden_count / total_word_count * 100.0 : 0;
double word_count_per_sentence = (total_sentence_count > 0) ? (double)total_word_count / total_sentence_count : 0;
printf("\nTotal Words: %lld\n", total_word_count);
printf("Total Capitalized words: %lld\n", total_capitalized_count);
printf("Total Sentences: %lld\n", total_sentence_count);
printf("Total Numbers: %lld\n", total_number_count);
printf("Total Forbidden words: %lld\n", total_forbidden_count);
printf("Capitalized percentage: %.6f%%\n", capitalized_percentage);
printf("Forbidden percentage: %.6f%%\n", forbidden_percentage);
printf("Word count per sentence: %.6f\n", word_count_per_sentence);
printf("Total files read: %d\n", (int)(argc - 1));
printf("\nTotal Words: %llu\n", twc);
printf("Total Capitalized words: %llu\n", tcc);
printf("Total Sentences: %llu\n", tsc);
printf("Total Numbers: %llu\n", tnc);
printf("Total Forbidden words: %llu\n", tfc);
printf("Capitalized percentage: %.6f%%\n", cc_pct);
printf("Forbidden percentage: %.6f%%\n", fc_pct);
printf("Word count per sentence: %.6f\n", wps);
printf("Total files read: %d\n", num_files);
free(tasks);
free(results);
pthread_mutex_destroy(&mutex);
return 0;
}