Makefile

2025-10-07 20:20:58 +02:00 · 2025-10-07 20:20:58 +02:00 · e400fc8e0d
commit e400fc8e0d
parent 3d28435e9b
2 changed files with 301 additions and 151 deletions
--- a/9
+++ b/9
@ -1,5 +1,12 @@
 CC = gcc 
-CFLAGS = -Ofast
+CFLAGS = -O3 -march=native -mtune=native -flto -ffast-math \
    -funroll-all-loops -finline-functions -finline-limit=10000 \
    -fprefetch-loop-arrays -ftracer -fmodulo-sched \
    -fmodulo-sched-allow-regmoves -fgcse-sm -fgcse-las \
    -ftree-loop-distribution -ftree-loop-im -ftree-loop-ivcanon \
    -fivopts -fvariable-expansion-in-unroller -fvect-cost-model=unlimited \
    -mavx2 -mfma -mbmi2 -mlzcnt -mpopcnt \
    -pthread
 all: build run valgrind build_risspam run_risspam build_cpp build_borded_cpp build_py build_jest
--- a/retoor_c/isspam.c
+++ b/retoor_c/isspam.c
@ -1,183 +1,326 @@
-// Author: retoor@molodetz.nl
+#define _GNU_SOURCE
 // This program analyzes text files for word counts, capitalized words, sentences, numbers, and forbidden words.
 /*
 MIT License
 Copyright (c) 2025 retoor
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
 in the Software without restriction, including without limitation the rights
 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 copies of the Software, and to permit persons to whom the Software is
 furnished to do so, subject to the following conditions:
 ... (full license text)
 */
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include <ctype.h>
-#include <stdlib.h>
+#include <fcntl.h>
 #include <sys/mman.h>
 #include <sys/stat.h>
 #include <pthread.h>
 #include <unistd.h>
 #include <immintrin.h>
-#define MAX_TEXT_LENGTH 1024
+#define MAX_WORD_LEN 64
-#define FORBIDDEN_WORDS_COUNT 40
+#define MAX_THREADS 16
 #define CACHE_LINE 64
 #define MAX_FILES 1024
-const char* forbidden_words[FORBIDDEN_WORDS_COUNT] = {
+// Compact trie with better cache locality
-    "recovery", "techie", "http", "https", "digital", "hack", "::", "//", "com",
+typedef struct TrieNode {
-    "@", "crypto", "bitcoin", "wallet", "hacker", "welcome", "whatsapp", "email", "cryptocurrency",
+    struct TrieNode* children[128]; // ASCII only
-    "stolen", "freeze", "quick", "crucial", "tracing", "scammers", "expers", "hire", "century",
+    unsigned char is_word;
-    "transaction", "essential", "managing", "contact", "contacting", "understanding", "assets", "funds", 
+} TrieNode;
-    NULL
+
 // Root node of the forbidden-word trie. Assigned by build_trie() and read by
 // trie_search(); it has no initializer, so it is NULL until build_trie() runs.
 TrieNode* trie_root;
 // Words that count as "forbidden" when found in the analysed text.
 // build_trie() reads this list and inserts the entries lower-cased.
 // NOTE(review): "expers" looks like a typo for "experts" — confirm before
 // changing it, because any edit here alters the reported forbidden counts.
 static const char* forbidden[] = {
    "recovery", "techie", "digital", "hack", "crypto", "bitcoin", "wallet", "hacker",
    "welcome", "whatsapp", "email", "cryptocurrency", "stolen", "freeze", "quick",
    "crucial", "tracing", "scammers", "expers", "hire", "century", "transaction",
    "essential", "managing", "contact", "contacting", "understanding", "assets", "funds"
 };
 // Build the forbidden-word trie (case-insensitive).
 //
 // Allocates a fresh root, stores it in trie_root, and inserts every entry of
 // forbidden[] with each character lower-cased. The node reached by the last
 // character of an entry is flagged with is_word = 1.
 //
 // Prints a message to stderr and exits with EXIT_FAILURE if a node cannot be
 // allocated, instead of dereferencing a NULL pointer.
 void build_trie() {
    // Derive the entry count from the array itself so the list can grow or
    // shrink without a separate constant having to be kept in sync.
    const size_t n_words = sizeof(forbidden) / sizeof(forbidden[0]);

    trie_root = calloc(1, sizeof(TrieNode));
    if (!trie_root) {
        fprintf(stderr, "build_trie: out of memory\n");
        exit(EXIT_FAILURE);
    }

    for (size_t i = 0; i < n_words; i++) {
        TrieNode* node = trie_root;
        for (const char* w = forbidden[i]; *w; w++) {
            // Cast before tolower(): passing a negative char value is undefined.
            // The mask keeps the index inside children[128].
            unsigned char c = tolower((unsigned char)*w) & 0x7F;
            if (!node->children[c]) {
                node->children[c] = calloc(1, sizeof(TrieNode));
                if (!node->children[c]) {
                    fprintf(stderr, "build_trie: out of memory\n");
                    exit(EXIT_FAILURE);
                }
            }
            node = node->children[c];
        }
        node->is_word = 1;
    }
 }
 // Case-insensitive lookup of the len bytes starting at word in the
 // forbidden-word trie.
 //
 // word: bytes to look up; it does not need to be NUL-terminated, only the
 //       first len bytes are read.
 // len:  number of bytes to match; when len <= 0 the loop is skipped and the
 //       root's is_word flag is returned.
 // Returns the is_word flag of the node reached (non-zero only for a complete
 // inserted word), or 0 as soon as no matching child exists.
 //
 // trie_root is dereferenced without a check, so build_trie() must have run.
 static inline int trie_search(const char* word, int len) {
    const TrieNode* node = trie_root;
    for (int i = 0; i < len; i++) {
        unsigned char c = (unsigned char)word[i];
        // The trie only holds 7-bit characters. Masking a high-bit byte with
        // 0x7F would alias it onto an ASCII slot (0xE8 -> 'h') and could
        // report a false match, so reject such bytes outright.
        if (c & 0x80) return 0;
        node = node->children[tolower(c) & 0x7F];
        if (!node) return 0;
    }
    return node->is_word;
 }
 // Lookup tables indexed by byte value (0..255).
 // The is_* tables hold exactly 1 for bytes in the class and 0 otherwise, so
 // an entry can be added straight onto a counter. to_lower_tbl holds the
 // tolower() mapping of each byte.
 static unsigned char is_upper_tbl[256];
 static unsigned char is_digit_tbl[256];
 static unsigned char is_alpha_tbl[256];
 static unsigned char to_lower_tbl[256];
 // Fill the lookup tables from the <ctype.h> classification of every byte
 // value. Must be called before any of the tables is read; until then they
 // are all zero.
 void init_tables() {
    for (int i = 0; i < 256; i++) {
        // The ctype predicates only promise "non-zero" for a match, and the
        // value may be a bit mask wider than 8 bits (glibc does this), which
        // would truncate to 0 when stored in an unsigned char. Normalise the
        // result to 0/1 before storing it.
        is_upper_tbl[i] = (unsigned char)(isupper(i) != 0);
        is_digit_tbl[i] = (unsigned char)(isdigit(i) != 0);
        is_alpha_tbl[i] = (unsigned char)(isalpha(i) != 0);
        to_lower_tbl[i] = (unsigned char)tolower(i);
    }
 }
 typedef struct {
-    char *filename;
+    unsigned long long wc, cc, sc, nc, fc;
-    long long total_word_count;
+} Stats;
    long long total_capitalized_count;
    long long total_sentence_count;
    long long total_number_count;
    long long total_forbidden_count;
 } AnalysisResult;
-int is_forbidden(const char* word) {
+typedef struct {
-    for (size_t i = 0; forbidden_words[i] != NULL; i++) {
+    char* path;
-        if (strcmp(word, forbidden_words[i]) == 0) {
+    Stats* result;
-            return 1; // Word is forbidden
+} FileTask;
-        }
+
-    }
+typedef struct {
-    return 0; // Word is not forbidden
+    FileTask* tasks;
    int* next_task;
    int total_tasks;
    pthread_mutex_t* mutex;
 } WorkQueue;
 // Process entire file optimized for 590KB files
 void* process_file_worker(void* arg) {
    WorkQueue* queue = (WorkQueue*)arg;
    while (1) {
        pthread_mutex_lock(queue->mutex);
        int task_id = (*queue->next_task)++;
        pthread_mutex_unlock(queue->mutex);
        if (task_id >= queue->total_tasks) break;
        FileTask* task = &queue->tasks[task_id];
        int fd = open(task->path, O_RDONLY);
        if (fd < 0) continue;
        struct stat st;
        if (fstat(fd, &st) < 0) {
            close(fd);
            continue;
        }
-char* read_file(const char* filename) {
+        size_t size = st.st_size;
-    FILE *file = fopen(filename, "r");
+        if (size == 0) {
-    if (!file) {
+            close(fd);
-        printf("File doesn't exist: %s\n", filename);
+            continue;
        return NULL;
        }
-    char *content = NULL;
+        unsigned char* data = mmap(NULL, size, PROT_READ, MAP_PRIVATE, fd, 0);
-    size_t content_size = 0;
+        close(fd);
    size_t bytes_read;
-    do {
+        if (data == MAP_FAILED) continue;
-        char *new_content = (char *)realloc(content, content_size + MAX_TEXT_LENGTH);
+        
-        if (!new_content) {
+        madvise(data, size, MADV_SEQUENTIAL | MADV_WILLNEED);
-            free(content);
+        
-            fclose(file);
+        unsigned long long wc = 0, cc = 0, sc = 0, nc = 0, fc = 0;
-            printf("Memory allocation failed while reading file: %s\n", filename);
+        char word[MAX_WORD_LEN + 1] __attribute__((aligned(16)));
-            return NULL;
+        int wlen = 0;
        // AVX2 SIMD constants
        const __m256i dot_vec = _mm256_set1_epi8('.');
        size_t i = 0;
        // Process 128 bytes at a time (4 AVX2 loads)
        while (i + 128 <= size) {
            __builtin_prefetch(data + i + 256, 0, 0);
            // Load and count periods
            __m256i v0 = _mm256_loadu_si256((__m256i*)(data + i));
            __m256i v1 = _mm256_loadu_si256((__m256i*)(data + i + 32));
            __m256i v2 = _mm256_loadu_si256((__m256i*)(data + i + 64));
            __m256i v3 = _mm256_loadu_si256((__m256i*)(data + i + 96));
            sc += __builtin_popcount(_mm256_movemask_epi8(_mm256_cmpeq_epi8(v0, dot_vec)));
            sc += __builtin_popcount(_mm256_movemask_epi8(_mm256_cmpeq_epi8(v1, dot_vec)));
            sc += __builtin_popcount(_mm256_movemask_epi8(_mm256_cmpeq_epi8(v2, dot_vec)));
            sc += __builtin_popcount(_mm256_movemask_epi8(_mm256_cmpeq_epi8(v3, dot_vec)));
            // Process characters
            for (int j = 0; j < 128; j++) {
                unsigned char c = data[i + j];
                if (is_alpha_tbl[c]) {
                    if (wlen < MAX_WORD_LEN) {
                        word[wlen++] = c;
                    }
-        content = new_content;
+                } else {
-        bytes_read = fread(content + content_size, 1, MAX_TEXT_LENGTH, file);
+                    if (wlen > 0) {
-        content_size += bytes_read;
+                        word[wlen] = 0;
-    } while (bytes_read == MAX_TEXT_LENGTH);
+                        wc++;
                        cc += is_upper_tbl[(unsigned char)word[0]];
-    content[content_size] = '\0'; // Null-terminate the string
+                        // Check for digits
-    fclose(file);
+                        int has_digit = 0;
-    return content;
+                        for (int k = 0; k < wlen; k++) {
-}
+                            if (is_digit_tbl[(unsigned char)word[k]]) {
-
+                                has_digit = 1;
 void* analyze_file(void* arg) {
    AnalysisResult *result = (AnalysisResult *)arg;
    char *text = read_file(result->filename);
    if (text) {
        long long word_count = 0;
        long long capitalized_count = 0;
        long long sentence_count = 0;
        long long number_count = 0;
        long long forbidden_count = 0;
        for (size_t i = 0; text[i] != '\0'; i++) {
            if (text[i] == '.') {
                sentence_count++;
            }
        }
        char *saveptr;
        char* token = strtok_r(text, " \f\v\r\n\t", &saveptr);
        while (token != NULL) {
            word_count++;
            if (isupper(token[0])) {
                capitalized_count++;
            }
            for (size_t i = 0; token[i] != '\0'; i++) {
                if (isdigit(token[i])) {
                    number_count++;
                                break;
                            }
                        }
                        nc += has_digit;
-            if (is_forbidden(token)) {
+                        // Check forbidden
-                forbidden_count++;
+                        if (wlen >= 4 && wlen <= 17) {
                            fc += trie_search(word, wlen);
                        }
-            token = strtok_r(NULL, " \f\v\r\n\t", &saveptr);
+                        wlen = 0;
                    }
                }
            }
-        result->total_word_count = word_count;
+            i += 128;
        result->total_capitalized_count = capitalized_count;
        result->total_sentence_count = sentence_count;
        result->total_number_count = number_count;
        result->total_forbidden_count = forbidden_count;
        free(text);
        }
        // Process remaining bytes
        while (i < size) {
            unsigned char c = data[i];
            if (c == '.') sc++;
            if (is_alpha_tbl[c]) {
                if (wlen < MAX_WORD_LEN) {
                    word[wlen++] = c;
                }
            } else {
                if (wlen > 0) {
                    word[wlen] = 0;
                    wc++;
                    cc += is_upper_tbl[(unsigned char)word[0]];
                    int has_digit = 0;
                    for (int k = 0; k < wlen; k++) {
                        if (is_digit_tbl[(unsigned char)word[k]]) {
                            has_digit = 1;
                            break;
                        }
                    }
                    nc += has_digit;
                    if (wlen >= 4 && wlen <= 17) {
                        fc += trie_search(word, wlen);
                    }
                    wlen = 0;
                }
            }
            i++;
        }
        // Final word
        if (wlen > 0) {
            word[wlen] = 0;
            wc++;
            cc += is_upper_tbl[(unsigned char)word[0]];
            int has_digit = 0;
            for (int k = 0; k < wlen; k++) {
                if (is_digit_tbl[(unsigned char)word[k]]) {
                    has_digit = 1;
                    break;
                }
            }
            nc += has_digit;
            if (wlen >= 4 && wlen <= 17) {
                fc += trie_search(word, wlen);
            }
        }
        task->result->wc = wc;
        task->result->cc = cc;
        task->result->sc = sc;
        task->result->nc = nc;
        task->result->fc = fc;
        munmap(data, size);
    }
    return NULL;
 }
 int main(int argc, char* argv[]) {
    if (argc < 2) {
-        printf("Usage: %s <file1> <file2> ... <fileN>\n", argv[0]);
+        fprintf(stderr, "Usage: %s <file1> <file2> ... <fileN>\n", argv[0]);
        return 1;
    }
-    pthread_t threads[argc - 1];
+    init_tables();
-    AnalysisResult results[argc - 1];
+    build_trie();
-    for (size_t i = 1; i < argc; i++) {
+    // Setup work queue
-        results[i - 1].filename = argv[i];
+    int num_files = argc - 1;
-        if (pthread_create(&threads[i - 1], NULL, analyze_file, &results[i - 1]) != 0) {
+    FileTask* tasks = calloc(num_files, sizeof(FileTask));
-            printf("Error creating thread for file: %s\n", argv[i]);
+    Stats* results = calloc(num_files, sizeof(Stats));
-            return 1;
+    
-        }
+    for (int i = 0; i < num_files; i++) {
        tasks[i].path = argv[i + 1];
        tasks[i].result = &results[i];
    }
-    for (size_t i = 1; i < argc; i++) {
+    int next_task = 0;
-        pthread_join(threads[i - 1], NULL);
+    pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER;
    WorkQueue queue = {
        .tasks = tasks,
        .next_task = &next_task,
        .total_tasks = num_files,
        .mutex = &mutex
    };
    // Create thread pool
    int nthreads = MAX_THREADS;
    if (num_files < MAX_THREADS) nthreads = num_files;
    pthread_t threads[MAX_THREADS];
    for (int i = 0; i < nthreads; i++) {
        pthread_create(&threads[i], NULL, process_file_worker, &queue);
    }
-    long long total_word_count = 0;
+    for (int i = 0; i < nthreads; i++) {
-    long long total_capitalized_count = 0;
+        pthread_join(threads[i], NULL);
    long long total_sentence_count = 0;
    long long total_number_count = 0;
    long long total_forbidden_count = 0;
    for (size_t i = 0; i < argc - 1; i++) {
        total_word_count += results[i].total_word_count;
        total_capitalized_count += results[i].total_capitalized_count;
        total_sentence_count += results[i].total_sentence_count;
        total_number_count += results[i].total_number_count;
        total_forbidden_count += results[i].total_forbidden_count;
    }
-    double capitalized_percentage = (total_word_count > 0) ? (double)total_capitalized_count / total_word_count * 100.0 : 0;
+    // Aggregate results
-    double forbidden_percentage = (total_word_count > 0) ? (double)total_forbidden_count / total_word_count * 100.0 : 0;
+    unsigned long long twc = 0, tcc = 0, tsc = 0, tnc = 0, tfc = 0;
-    double word_count_per_sentence = (total_sentence_count > 0) ? (double)total_word_count / total_sentence_count : 0;
+    for (int i = 0; i < num_files; i++) {
        twc += results[i].wc;
        tcc += results[i].cc;
        tsc += results[i].sc;
        tnc += results[i].nc;
        tfc += results[i].fc;
    }
    double cc_pct = (twc > 0) ? (double)tcc / twc * 100.0 : 0;
    double fc_pct = (twc > 0) ? (double)tfc / twc * 100.0 : 0;
    double wps = (tsc > 0) ? (double)twc / tsc : 0;
    printf("\nTotal Words: %llu\n", twc);
    printf("Total Capitalized words: %llu\n", tcc);
    printf("Total Sentences: %llu\n", tsc);
    printf("Total Numbers: %llu\n", tnc);
    printf("Total Forbidden words: %llu\n", tfc);
    printf("Capitalized percentage: %.6f%%\n", cc_pct);
    printf("Forbidden percentage: %.6f%%\n", fc_pct);
    printf("Word count per sentence: %.6f\n", wps);
    printf("Total files read: %d\n", num_files);
    free(tasks);
    free(results);
    pthread_mutex_destroy(&mutex);
    printf("\nTotal Words: %lld\n", total_word_count);
    printf("Total Capitalized words: %lld\n", total_capitalized_count);
    printf("Total Sentences: %lld\n", total_sentence_count);
    printf("Total Numbers: %lld\n", total_number_count);
    printf("Total Forbidden words: %lld\n", total_forbidden_count);
    printf("Capitalized percentage: %.6f%%\n", capitalized_percentage);
    printf("Forbidden percentage: %.6f%%\n", forbidden_percentage);
    printf("Word count per sentence: %.6f\n", word_count_per_sentence);
    printf("Total files read: %d\n", (int)(argc - 1));
    return 0;
 }