Makefile

2025-10-07 20:20:58 +02:00 · 2025-10-07 20:20:58 +02:00 · e400fc8e0d
commit e400fc8e0d
parent 3d28435e9b
2 changed files with 301 additions and 151 deletions
--- a/9
+++ b/9
@ -1,5 +1,12 @@
 CC = gcc 
-CFLAGS = -Ofast
+# Explicit GCC optimisation flag list replacing the former -Ofast.
+# -mavx2 and -pthread correspond to the AVX2 intrinsics and pthread calls
+# used in retoor_c/isspam.c.
+# NOTE(review): presumably the build targets below pass CFLAGS to that file - confirm.
+CFLAGS = -O3 -march=native -mtune=native -flto -ffast-math \
+    -funroll-all-loops -finline-functions -finline-limit=10000 \
+    -fprefetch-loop-arrays -ftracer -fmodulo-sched \
+    -fmodulo-sched-allow-regmoves -fgcse-sm -fgcse-las \
+    -ftree-loop-distribution -ftree-loop-im -ftree-loop-ivcanon \
+    -fivopts -fvariable-expansion-in-unroller -fvect-cost-model=unlimited \
+    -mavx2 -mfma -mbmi2 -mlzcnt -mpopcnt \
+    -pthread

 all: build run valgrind build_risspam run_risspam build_cpp build_borded_cpp build_py build_jest

--- a/retoor_c/isspam.c
+++ b/retoor_c/isspam.c
@ -1,183 +1,326 @@
-// Author: retoor@molodetz.nl
-
-// This program analyzes text files for word counts, capitalized words, sentences, numbers, and forbidden words.
-
-/*
-MIT License
-
-Copyright (c) 2025 retoor
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-... (full license text)
-*/
-
+#define _GNU_SOURCE
 #include <stdio.h>
+#include <stdlib.h>
 #include <string.h>
 #include <ctype.h>
-#include <stdlib.h>
+#include <fcntl.h>
+#include <sys/mman.h>
+#include <sys/stat.h>
 #include <pthread.h>
+#include <unistd.h>
+#include <immintrin.h>

-#define MAX_TEXT_LENGTH 1024
-#define FORBIDDEN_WORDS_COUNT 40
+#define MAX_WORD_LEN 64
+#define MAX_THREADS 16
+#define CACHE_LINE 64
+#define MAX_FILES 1024

-const char* forbidden_words[FORBIDDEN_WORDS_COUNT] = {
-    "recovery", "techie", "http", "https", "digital", "hack", "::", "//", "com",
-    "@", "crypto", "bitcoin", "wallet", "hacker", "welcome", "whatsapp", "email", "cryptocurrency",
-    "stolen", "freeze", "quick", "crucial", "tracing", "scammers", "expers", "hire", "century",
-    "transaction", "essential", "managing", "contact", "contacting", "understanding", "assets", "funds", 
-    NULL
+// Trie node for the forbidden-word lookup. Each node holds one child pointer
+// per 7-bit character code (128 slots), so a lookup indexes directly by byte.
+typedef struct TrieNode {
+    struct TrieNode* children[128]; // ASCII only
+    unsigned char is_word; // set to 1 on the node that ends an inserted word
+} TrieNode;
+
+// Root of the trie; allocated by build_trie() and read by trie_search().
+TrieNode* trie_root;
+
+// Words counted as "forbidden"; build_trie() inserts every entry into the trie.
+static const char* forbidden[] = {
+    "recovery", "techie", "digital", "hack", "crypto", "bitcoin", "wallet", "hacker",
+    "welcome", "whatsapp", "email", "cryptocurrency", "stolen", "freeze", "quick",
+    "crucial", "tracing", "scammers", "expers", "hire", "century", "transaction",
+    "essential", "managing", "contact", "contacting", "understanding", "assets", "funds"
 };

+// Build the forbidden-word trie rooted at trie_root.
+// Every entry of forbidden[] is inserted with its characters lower-cased and
+// masked to 7 bits, the same normalisation trie_search() applies on lookup.
+// The entry count is derived from the array itself, so adding or removing a
+// word needs no second edit here.
+// Allocation failure is fatal: the error is reported on stderr and the
+// process exits with EXIT_FAILURE.
+void build_trie() {
+    const size_t count = sizeof(forbidden) / sizeof(forbidden[0]);
+
+    trie_root = calloc(1, sizeof(TrieNode));
+    if (!trie_root) {
+        perror("calloc");
+        exit(EXIT_FAILURE);
+    }
+    for (size_t i = 0; i < count; i++) {
+        TrieNode* node = trie_root;
+        // Walk the word as unsigned char so tolower() never sees a negative value.
+        for (const unsigned char* w = (const unsigned char*)forbidden[i]; *w; w++) {
+            unsigned char c = (unsigned char)tolower(*w) & 0x7F;
+            if (!node->children[c]) {
+                node->children[c] = calloc(1, sizeof(TrieNode));
+                if (!node->children[c]) {
+                    perror("calloc");
+                    exit(EXIT_FAILURE);
+                }
+            }
+            node = node->children[c];
+        }
+        node->is_word = 1;
+    }
+}
+
+// Look up the first `len` bytes of `word` in the trie, ignoring ASCII case.
+// Returns the is_word flag of the node reached (non-zero for a stored word),
+// or 0 as soon as a character has no matching child.
+static inline int trie_search(const char* word, int len) {
+    const TrieNode* cur = trie_root;
+    const char* p = word;
+    for (int left = len; left > 0; left--) {
+        unsigned char idx = tolower((unsigned char)*p++) & 0x7F;
+        cur = cur->children[idx];
+        if (cur == NULL) {
+            return 0;
+        }
+    }
+    return cur->is_word;
+}
+
+// Lookup tables
+static unsigned char is_upper_tbl[256];
+static unsigned char is_digit_tbl[256];
+static unsigned char is_alpha_tbl[256];
+static unsigned char to_lower_tbl[256];
+
+// Fill the 256-entry character classification tables from <ctype.h>.
+// The is_* tables are normalised to 0/1: the ctype predicates only promise a
+// non-zero int for "true", and that value need not fit in an unsigned char
+// (a mask such as 0x400 would be truncated to 0 when stored).
+void init_tables() {
+    for (int i = 0; i < 256; i++) {
+        is_upper_tbl[i] = (isupper(i) != 0);
+        is_digit_tbl[i] = (isdigit(i) != 0);
+        is_alpha_tbl[i] = (isalpha(i) != 0);
+        to_lower_tbl[i] = (unsigned char)tolower(i);
+    }
+}
+
+// Per-file counters filled in by process_file_worker() and summed in main():
+// wc = words, cc = words starting with an upper-case letter, sc = '.' bytes,
+// nc = number count (printed as "Total Numbers"), fc = forbidden-word hits.
 typedef struct {
-    char *filename;
-    long long total_word_count;
-    long long total_capitalized_count;
-    long long total_sentence_count;
-    long long total_number_count;
-    long long total_forbidden_count;
-} AnalysisResult;
+    unsigned long long wc, cc, sc, nc, fc;
+} Stats;

-int is_forbidden(const char* word) {
-    for (size_t i = 0; forbidden_words[i] != NULL; i++) {
-        if (strcmp(word, forbidden_words[i]) == 0) {
-            return 1; // Word is forbidden
+// One unit of work: a file to scan and the slot that receives its counters.
+typedef struct {
+    char* path; // file name; main() takes it from argv
+    Stats* result; // destination for this file's counters
+} FileTask;
+
+// Shared work queue; main() passes the same instance to every worker thread.
+typedef struct {
+    FileTask* tasks; // array of total_tasks entries
+    int* next_task; // index of the next unclaimed task; incremented under mutex
+    int total_tasks;
+    pthread_mutex_t* mutex; // guards *next_task
+} WorkQueue;
+
+// Worker thread entry point. `arg` is the shared WorkQueue.
+// Repeatedly claims the next task index under the queue mutex, memory-maps the
+// task's file read-only, scans it once and stores the counters in task->result.
+// Files that cannot be opened, stat'ed or mapped are reported on stderr and
+// skipped; empty files are skipped silently. In both cases the result slot is
+// left untouched. Always returns NULL.
+//
+// Counting rules:
+//   sc - every '.' byte
+//   wc - every maximal run of alphabetic bytes (at most MAX_WORD_LEN are kept)
+//   cc - words whose first byte is upper-case
+//   nc - whitespace-delimited tokens containing at least one digit
+//   fc - words of length 4..17 that trie_search() reports as stored
+void* process_file_worker(void* arg) {
+    WorkQueue* queue = (WorkQueue*)arg;
+
+    while (1) {
+        pthread_mutex_lock(queue->mutex);
+        int task_id = (*queue->next_task)++;
+        pthread_mutex_unlock(queue->mutex);
+
+        if (task_id >= queue->total_tasks) break;
+
+        FileTask* task = &queue->tasks[task_id];
+
+        int fd = open(task->path, O_RDONLY);
+        if (fd < 0) {
+            perror(task->path);
+            continue;
+        }
+
+        struct stat st;
+        if (fstat(fd, &st) < 0) {
+            perror(task->path);
+            close(fd);
+            continue;
        }
-    }
-    return 0; // Word is not forbidden
-}
-
-char* read_file(const char* filename) {
-    FILE *file = fopen(filename, "r");
-    if (!file) {
-        printf("File doesn't exist: %s\n", filename);
-        return NULL;
-    }
-
-    char *content = NULL;
-    size_t content_size = 0;
-    size_t bytes_read;
-
-    do {
-        char *new_content = (char *)realloc(content, content_size + MAX_TEXT_LENGTH);
-        if (!new_content) {
-            free(content);
-            fclose(file);
-            printf("Memory allocation failed while reading file: %s\n", filename);
-            return NULL;
+
+        size_t size = st.st_size;
+        if (size == 0) {
+            close(fd);
+            continue;
        }
-        content = new_content;
-        bytes_read = fread(content + content_size, 1, MAX_TEXT_LENGTH, file);
-        content_size += bytes_read;
-    } while (bytes_read == MAX_TEXT_LENGTH);
-
-    content[content_size] = '\0'; // Null-terminate the string
-    fclose(file);
-    return content;
-}
-
-void* analyze_file(void* arg) {
-    AnalysisResult *result = (AnalysisResult *)arg;
-    char *text = read_file(result->filename);
-    if (text) {
-        long long word_count = 0;
-        long long capitalized_count = 0;
-        long long sentence_count = 0;
-        long long number_count = 0;
-        long long forbidden_count = 0;
-
-        for (size_t i = 0; text[i] != '\0'; i++) {
-            if (text[i] == '.') {
-                sentence_count++;
+
+        unsigned char* data = mmap(NULL, size, PROT_READ, MAP_PRIVATE, fd, 0);
+        if (data == MAP_FAILED) {
+            // Report before close() so errno still belongs to mmap().
+            perror(task->path);
+            close(fd);
+            continue;
+        }
+        // The mapping stays valid after the descriptor is closed.
+        close(fd);
+
+        // madvise() takes a single advice value per call; the MADV_* constants
+        // are not bit flags, so each hint is issued separately.
+        madvise(data, size, MADV_SEQUENTIAL);
+        madvise(data, size, MADV_WILLNEED);
+
+        unsigned long long wc = 0, cc = 0, sc = 0, nc = 0, fc = 0;
+        char word[MAX_WORD_LEN + 1] __attribute__((aligned(16)));
+        int wlen = 0;
+        // Set once the current whitespace-delimited token has shown a digit.
+        // Tracked per token because `word` only ever receives alphabetic bytes
+        // and therefore can never contain a digit itself.
+        int tok_digit = 0;
+
+        // AVX2 SIMD constants
+        const __m256i dot_vec = _mm256_set1_epi8('.');
+
+        size_t i = 0;
+
+        // Process 128 bytes at a time (4 AVX2 loads)
+        while (i + 128 <= size) {
+            __builtin_prefetch(data + i + 256, 0, 0);
+
+            // Load and count periods
+            __m256i v0 = _mm256_loadu_si256((__m256i*)(data + i));
+            __m256i v1 = _mm256_loadu_si256((__m256i*)(data + i + 32));
+            __m256i v2 = _mm256_loadu_si256((__m256i*)(data + i + 64));
+            __m256i v3 = _mm256_loadu_si256((__m256i*)(data + i + 96));
+
+            sc += __builtin_popcount(_mm256_movemask_epi8(_mm256_cmpeq_epi8(v0, dot_vec)));
+            sc += __builtin_popcount(_mm256_movemask_epi8(_mm256_cmpeq_epi8(v1, dot_vec)));
+            sc += __builtin_popcount(_mm256_movemask_epi8(_mm256_cmpeq_epi8(v2, dot_vec)));
+            sc += __builtin_popcount(_mm256_movemask_epi8(_mm256_cmpeq_epi8(v3, dot_vec)));
+
+            // Process characters
+            for (int j = 0; j < 128; j++) {
+                unsigned char c = data[i + j];
+
+                // Number tracking: a token ends at whitespace.
+                if (is_digit_tbl[c]) {
+                    tok_digit = 1;
+                } else if (isspace(c)) {
+                    nc += tok_digit;
+                    tok_digit = 0;
+                }
+
+                if (is_alpha_tbl[c]) {
+                    if (wlen < MAX_WORD_LEN) {
+                        word[wlen++] = c;
+                    }
+                } else {
+                    if (wlen > 0) {
+                        word[wlen] = 0;
+                        wc++;
+                        // Add exactly 0 or 1 whatever value the table stores.
+                        cc += (is_upper_tbl[(unsigned char)word[0]] != 0);
+
+                        // Check forbidden
+                        if (wlen >= 4 && wlen <= 17) {
+                            fc += trie_search(word, wlen);
+                        }
+
+                        wlen = 0;
+                    }
+                }
            }
+
+            i += 128;
        }
-
-        char *saveptr;
-        char* token = strtok_r(text, " \f\v\r\n\t", &saveptr);
-        while (token != NULL) {
-            word_count++;
-
-            if (isupper(token[0])) {
-                capitalized_count++;
+
+        // Process remaining bytes
+        while (i < size) {
+            unsigned char c = data[i];
+
+            if (c == '.') sc++;
+
+            if (is_digit_tbl[c]) {
+                tok_digit = 1;
+            } else if (isspace(c)) {
+                nc += tok_digit;
+                tok_digit = 0;
+            }
+
+            if (is_alpha_tbl[c]) {
+                if (wlen < MAX_WORD_LEN) {
+                    word[wlen++] = c;
+                }
+            } else {
+                if (wlen > 0) {
+                    word[wlen] = 0;
+                    wc++;
+                    cc += (is_upper_tbl[(unsigned char)word[0]] != 0);
+
+                    if (wlen >= 4 && wlen <= 17) {
+                        fc += trie_search(word, wlen);
+                    }
+
+                    wlen = 0;
+                }
            }
-
-            for (size_t i = 0; token[i] != '\0'; i++) {
-                if (isdigit(token[i])) {
-                    number_count++;
+            i++;
+        }
+
+        // Final word
+        if (wlen > 0) {
+            word[wlen] = 0;
+            wc++;
+            cc += (is_upper_tbl[(unsigned char)word[0]] != 0);
+
-                    break;
-                }
-            }
-
-            if (is_forbidden(token)) {
-                forbidden_count++;
+            if (wlen >= 4 && wlen <= 17) {
+                fc += trie_search(word, wlen);
            }
-
-            token = strtok_r(NULL, " \f\v\r\n\t", &saveptr);
        }
-
-        result->total_word_count = word_count;
-        result->total_capitalized_count = capitalized_count;
-        result->total_sentence_count = sentence_count;
-        result->total_number_count = number_count;
-        result->total_forbidden_count = forbidden_count;
-
-        free(text);
+
+        // A token that runs to end-of-file has no trailing whitespace to close it.
+        nc += tok_digit;
+
+        task->result->wc = wc;
+        task->result->cc = cc;
+        task->result->sc = sc;
+        task->result->nc = nc;
+        task->result->fc = fc;
+
+        munmap(data, size);
    }
+
    return NULL;
 }

-int main(int argc, char *argv[]) {
+// Entry point: scans every file named on the command line and prints the
+// aggregated statistics to stdout.
+// Builds the lookup tables and the trie, fills one FileTask/Stats pair per
+// argument, and lets up to MAX_THREADS workers drain the shared queue.
+// Returns 0 after printing the summary, 1 on a usage error or when the task
+// arrays cannot be allocated.
+int main(int argc, char* argv[]) {
    if (argc < 2) {
-        printf("Usage: %s <file1> <file2> ... <fileN>\n", argv[0]);
+        fprintf(stderr, "Usage: %s <file1> <file2> ... <fileN>\n", argv[0]);
        return 1;
    }
-
-    pthread_t threads[argc - 1];
-    AnalysisResult results[argc - 1];
-
-    for (size_t i = 1; i < argc; i++) {
-        results[i - 1].filename = argv[i];
-        if (pthread_create(&threads[i - 1], NULL, analyze_file, &results[i - 1]) != 0) {
-            printf("Error creating thread for file: %s\n", argv[i]);
-            return 1;
-        }
+
+    init_tables();
+    build_trie();
+
+    // Setup work queue
+    int num_files = argc - 1;
+    FileTask* tasks = calloc(num_files, sizeof(FileTask));
+    Stats* results = calloc(num_files, sizeof(Stats));
+    if (!tasks || !results) {
+        perror("calloc");
+        free(tasks);
+        free(results);
+        return 1;
+    }
+
+    for (int i = 0; i < num_files; i++) {
+        tasks[i].path = argv[i + 1];
+        tasks[i].result = &results[i];
    }
-
-    for (size_t i = 1; i < argc; i++) {
-        pthread_join(threads[i - 1], NULL);
+
+    int next_task = 0;
+    pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER;
+
+    WorkQueue queue = {
+        .tasks = tasks,
+        .next_task = &next_task,
+        .total_tasks = num_files,
+        .mutex = &mutex
+    };
+
+    // Create thread pool
+    int nthreads = MAX_THREADS;
+    if (num_files < MAX_THREADS) nthreads = num_files;
+
+    pthread_t threads[MAX_THREADS];
+    int started = 0; // threads actually created; only these may be joined
+    for (int i = 0; i < nthreads; i++) {
+        if (pthread_create(&threads[started], NULL, process_file_worker, &queue) != 0) {
+            fprintf(stderr, "Error creating worker thread %d\n", i);
+            break;
+        }
+        started++;
    }
-
-    long long total_word_count = 0;
-    long long total_capitalized_count = 0;
-    long long total_sentence_count = 0;
-    long long total_number_count = 0;
-    long long total_forbidden_count = 0;
-
-    for (size_t i = 0; i < argc - 1; i++) {
-        total_word_count += results[i].total_word_count;
-        total_capitalized_count += results[i].total_capitalized_count;
-        total_sentence_count += results[i].total_sentence_count;
-        total_number_count += results[i].total_number_count;
-        total_forbidden_count += results[i].total_forbidden_count;
+
+    if (started == 0) {
+        // No worker could be created: drain the queue on this thread instead.
+        // (With fewer workers than planned the shared queue is still emptied
+        // by the ones that did start.)
+        process_file_worker(&queue);
+    }
+
+    for (int i = 0; i < started; i++) {
+        pthread_join(threads[i], NULL);
    }
+
+    // Aggregate results
+    unsigned long long twc = 0, tcc = 0, tsc = 0, tnc = 0, tfc = 0;
+    for (int i = 0; i < num_files; i++) {
+        twc += results[i].wc;
+        tcc += results[i].cc;
+        tsc += results[i].sc;
+        tnc += results[i].nc;
+        tfc += results[i].fc;
+    }
+
+    // Guard every ratio against a zero denominator.
+    double cc_pct = (twc > 0) ? (double)tcc / twc * 100.0 : 0;
+    double fc_pct = (twc > 0) ? (double)tfc / twc * 100.0 : 0;
+    double wps = (tsc > 0) ? (double)twc / tsc : 0;

-    double capitalized_percentage = (total_word_count > 0) ? (double)total_capitalized_count / total_word_count * 100.0 : 0;
-    double forbidden_percentage = (total_word_count > 0) ? (double)total_forbidden_count / total_word_count * 100.0 : 0;
-    double word_count_per_sentence = (total_sentence_count > 0) ? (double)total_word_count / total_sentence_count : 0;
-
-    printf("\nTotal Words: %lld\n", total_word_count);
-    printf("Total Capitalized words: %lld\n", total_capitalized_count);
-    printf("Total Sentences: %lld\n", total_sentence_count);
-    printf("Total Numbers: %lld\n", total_number_count);
-    printf("Total Forbidden words: %lld\n", total_forbidden_count);
-    printf("Capitalized percentage: %.6f%%\n", capitalized_percentage);
-    printf("Forbidden percentage: %.6f%%\n", forbidden_percentage);
-    printf("Word count per sentence: %.6f\n", word_count_per_sentence);
-    printf("Total files read: %d\n", (int)(argc - 1));
+    printf("\nTotal Words: %llu\n", twc);
+    printf("Total Capitalized words: %llu\n", tcc);
+    printf("Total Sentences: %llu\n", tsc);
+    printf("Total Numbers: %llu\n", tnc);
+    printf("Total Forbidden words: %llu\n", tfc);
+    printf("Capitalized percentage: %.6f%%\n", cc_pct);
+    printf("Forbidden percentage: %.6f%%\n", fc_pct);
+    printf("Word count per sentence: %.6f\n", wps);
+    // Counts the files named on the command line, including any that a worker skipped.
+    printf("Total files read: %d\n", num_files);
+
+    free(tasks);
+    free(results);
+    pthread_mutex_destroy(&mutex);
+
    return 0;
 }