// isspam/isspam.c at e400fc8e0d8d81feb1d59b7b67510c1f3b96e8c8

 #define _GNU_SOURCE
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include <ctype.h>
 #include <fcntl.h>
 #include <sys/mman.h>
 #include <sys/stat.h>
 #include <pthread.h>
 #include <unistd.h>
 #include <immintrin.h>
 #define MAX_WORD_LEN 64
 #define MAX_THREADS 16
 #define CACHE_LINE 64
 #define MAX_FILES 1024
 // Node of the forbidden-word trie. Every node owns one child slot per
 // 7-bit character code, so a lookup costs one array index per letter.
 typedef struct TrieNode {
     struct TrieNode *children[128]; // next node for each character code 0..127, or NULL
     unsigned char is_word;          // set to 1 when the path from the root spells a whole word
 } TrieNode;
 // Root of the trie; stays NULL until build_trie() allocates it.
 TrieNode *trie_root;
 // Words counted as "forbidden" when they appear in an input file.
 // build_trie() inserts every entry; matching is case-insensitive.
 // NOTE(review): "expers" may be a typo for "experts" -- confirm against the
 // intended word list before changing it, since it alters the counts.
 static const char *forbidden[] = {
     "recovery", "techie", "digital", "hack", "crypto",
     "bitcoin", "wallet", "hacker", "welcome", "whatsapp",
     "email", "cryptocurrency", "stolen", "freeze", "quick",
     "crucial", "tracing", "scammers", "expers", "hire",
     "century", "transaction", "essential", "managing", "contact",
     "contacting", "understanding", "assets", "funds"
 };
 // Build trie case-insensitive
 void build_trie() {
     trie_root = calloc(1, sizeof(TrieNode));
     for (int i = 0; i < 29; i++) {
         TrieNode* node = trie_root;
         const char* w = forbidden[i];
         while (*w) {
             unsigned char c = tolower(*w++) & 0x7F;
             if (!node->children[c]) {
                 node->children[c] = calloc(1, sizeof(TrieNode));
             }
             node = node->children[c];
         }
         node->is_word = 1;
     }
 }
 // Case-insensitive lookup of word[0..len) in the global trie.
 // Returns the is_word flag of the node reached (1 for a stored word),
 // or 0 when the path does not exist, when the trie has not been built,
 // or when the input contains a byte outside 7-bit ASCII.
 static inline int trie_search(const char* word, int len) {
     const TrieNode* node = trie_root;
     if (!node) return 0;
     for (int i = 0; i < len; i++) {
         unsigned char raw = (unsigned char)word[i];
         // Bytes >= 0x80 must not be masked down onto ASCII letters, otherwise
         // e.g. 'h' 0xE1 'c' 'k' would be reported as "hack".
         if (raw & 0x80) return 0;
         unsigned char c = (unsigned char)tolower(raw) & 0x7F;
         node = node->children[c];
         if (!node) return 0;
     }
     return node->is_word;
 }
 // Byte-indexed classification tables, filled once by init_tables().
 // The three is_*_tbl tables hold exactly 0 or 1; to_lower_tbl holds the
 // lower-case form of each byte value.
 static unsigned char is_upper_tbl[256];
 static unsigned char is_digit_tbl[256];
 static unsigned char is_alpha_tbl[256];
 static unsigned char to_lower_tbl[256];
 // Fills the lookup tables from the <ctype.h> classification of each byte
 // value under the current locale. Must run before the tables are read.
 void init_tables(void) {
     for (int i = 0; i < 256; i++) {
         // The is*() functions only promise "nonzero" for a match; some C
         // libraries return a mask bit above bit 7, which would truncate to 0
         // in an unsigned char. Normalise to 0/1 before storing.
         is_upper_tbl[i] = (unsigned char)(isupper(i) != 0);
         is_digit_tbl[i] = (unsigned char)(isdigit(i) != 0);
         is_alpha_tbl[i] = (unsigned char)(isalpha(i) != 0);
         to_lower_tbl[i] = (unsigned char)tolower(i);
     }
 }
 // Per-file counters produced by a worker and summed by main().
 typedef struct {
     unsigned long long wc; // words
     unsigned long long cc; // words whose first letter is upper case
     unsigned long long sc; // sentences, counted as '.' bytes
     unsigned long long nc; // words in which a digit was found
     unsigned long long fc; // words found in the forbidden-word trie
 } Stats;
 // One unit of work: a file to scan and the slot that receives its counters.
 typedef struct {
     char *path;    // file name, borrowed from argv (not owned)
     Stats *result; // destination for this file's totals
 } FileTask;
 // Shared description of the pending work, handed to every worker thread.
 typedef struct {
     FileTask *tasks;        // array of total_tasks entries
     int *next_task;         // index of the next unclaimed task; guarded by mutex
     int total_tasks;        // number of entries in tasks
     pthread_mutex_t *mutex; // serialises updates of *next_task
 } WorkQueue;
 // Counters that advance once per completed word while a file is scanned.
 typedef struct {
     unsigned long long wc; // words
     unsigned long long cc; // words whose first letter is upper case
     unsigned long long nc; // words in which a digit was found
     unsigned long long fc; // words found in the forbidden-word trie
 } WordCounts;
 // Folds the completed word word[0..wlen) into *counts. Requires wlen > 0.
 static inline void tally_word(const char* word, int wlen, WordCounts* counts) {
     counts->wc++;
     // Compare against 0 so the count stays correct even if a table entry
     // holds a nonzero value other than 1.
     counts->cc += (is_upper_tbl[(unsigned char)word[0]] != 0);
     // NOTE(review): the tokenizer only admits bytes flagged in is_alpha_tbl,
     // and alphabetic and digit classes are disjoint, so this scan cannot
     // currently find a digit and nc stays 0. Kept unchanged because the
     // intended definition of a "number" is not visible here -- confirm.
     for (int k = 0; k < wlen; k++) {
         if (is_digit_tbl[(unsigned char)word[k]]) {
             counts->nc++;
             break;
         }
     }
     // Length pre-filter: the forbidden list holds words of 4 to 14 letters,
     // so anything outside 4..17 cannot match and skips the trie walk.
     if (wlen >= 4 && wlen <= 17) {
         if (trie_search(word, wlen)) {
             counts->fc++;
         }
     }
 }
 // Counts '.' bytes in the 128 bytes starting at p with four unaligned
 // 32-byte AVX2 loads.
 static inline unsigned count_periods_128(const unsigned char* p) {
     const __m256i dot_vec = _mm256_set1_epi8('.');
     unsigned total = 0;
     for (int off = 0; off < 128; off += 32) {
         __m256i v = _mm256_loadu_si256((const __m256i*)(p + off));
         unsigned mask = (unsigned)_mm256_movemask_epi8(_mm256_cmpeq_epi8(v, dot_vec));
         total += (unsigned)__builtin_popcount(mask);
     }
     return total;
 }
 // Thread entry point. arg is a WorkQueue*. Repeatedly claims the next task
 // index under queue->mutex, maps that file read-only and scans it once:
 //   - a word is a maximal run of bytes flagged in is_alpha_tbl; only its
 //     first MAX_WORD_LEN letters are kept, the run still counts as one word;
 //   - sentences are counted as '.' bytes.
 // The totals are written to task->result. Empty files and files that cannot
 // be opened, stat'ed or mapped leave *task->result untouched; failures are
 // reported on stderr. Always returns NULL.
 void* process_file_worker(void* arg) {
     WorkQueue* queue = (WorkQueue*)arg;
     while (1) {
         pthread_mutex_lock(queue->mutex);
         int task_id = (*queue->next_task)++;
         pthread_mutex_unlock(queue->mutex);
         if (task_id >= queue->total_tasks) break;
         FileTask* task = &queue->tasks[task_id];
         int fd = open(task->path, O_RDONLY);
         if (fd < 0) {
             perror(task->path);
             continue;
         }
         struct stat st;
         if (fstat(fd, &st) < 0) {
             perror(task->path); // report before close() can disturb errno
             close(fd);
             continue;
         }
         if (st.st_size <= 0) {
             close(fd); // nothing to count; mmap rejects a zero length anyway
             continue;
         }
         size_t size = (size_t)st.st_size;
         void* map = mmap(NULL, size, PROT_READ, MAP_PRIVATE, fd, 0);
         if (map == MAP_FAILED) {
             perror(task->path);
             close(fd);
             continue;
         }
         close(fd); // the mapping stays valid after the descriptor is closed
         // madvise() advice values are enumerated constants, not bit flags, so
         // each hint needs its own call. Both are best-effort; errors ignored.
         (void)madvise(map, size, MADV_SEQUENTIAL);
         (void)madvise(map, size, MADV_WILLNEED);
         const unsigned char* data = map;
         WordCounts counts = {0, 0, 0, 0};
         unsigned long long sc = 0;
         char word[MAX_WORD_LEN];
         int wlen = 0; // letters of the word in progress; carried across chunks
         size_t pos = 0;
         while (pos < size) {
             size_t chunk = size - pos;
             if (chunk >= 128) {
                 // Full block: count periods with SIMD, then tokenize below.
                 if (chunk > 256) {
                     __builtin_prefetch(data + pos + 256, 0, 0); // stays inside the mapping
                 }
                 chunk = 128;
                 sc += count_periods_128(data + pos);
             } else {
                 // Final partial block: count periods byte by byte.
                 for (size_t j = 0; j < chunk; j++) {
                     sc += (data[pos + j] == '.');
                 }
             }
             for (size_t j = 0; j < chunk; j++) {
                 unsigned char c = data[pos + j];
                 if (is_alpha_tbl[c]) {
                     if (wlen < MAX_WORD_LEN) {
                         word[wlen++] = (char)c;
                     }
                 } else if (wlen > 0) {
                     tally_word(word, wlen, &counts);
                     wlen = 0;
                 }
             }
             pos += chunk;
         }
         // A word that runs up to end-of-file has no terminating byte.
         if (wlen > 0) {
             tally_word(word, wlen, &counts);
         }
         task->result->wc = counts.wc;
         task->result->cc = counts.cc;
         task->result->sc = sc;
         task->result->nc = counts.nc;
         task->result->fc = counts.fc;
         munmap(map, size);
     }
     return NULL;
 }
 // Entry point. Treats every command-line argument as a file path, scans the
 // files on up to MAX_THREADS worker threads and prints the summed counters
 // plus three derived ratios. Returns 1 on a usage or allocation error,
 // 0 otherwise.
 int main(int argc, char* argv[]) {
     if (argc < 2) {
         fprintf(stderr, "Usage: %s <file1> <file2> ... <fileN>\n", argv[0]);
         return 1;
     }
     init_tables();
     build_trie();
     // One task and one zero-initialised result slot per path argument.
     int num_files = argc - 1;
     FileTask* tasks = calloc((size_t)num_files, sizeof *tasks);
     Stats* results = calloc((size_t)num_files, sizeof *results);
     if (!tasks || !results) {
         perror("calloc");
         free(tasks);
         free(results);
         return 1;
     }
     for (int i = 0; i < num_files; i++) {
         tasks[i].path = argv[i + 1];
         tasks[i].result = &results[i];
     }
     int next_task = 0;
     pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER;
     WorkQueue queue = {
         .tasks = tasks,
         .next_task = &next_task,
         .total_tasks = num_files,
         .mutex = &mutex
     };
     // Never start more threads than there are files.
     int nthreads = (num_files < MAX_THREADS) ? num_files : MAX_THREADS;
     pthread_t threads[MAX_THREADS];
     int started = 0;
     for (int i = 0; i < nthreads; i++) {
         int rc = pthread_create(&threads[started], NULL, process_file_worker, &queue);
         if (rc != 0) {
             // Carry on with the threads already running; they drain the queue.
             fprintf(stderr, "pthread_create: %s\n", strerror(rc));
             break;
         }
         started++;
     }
     if (started == 0) {
         // No worker could be started: do the work on this thread instead.
         (void)process_file_worker(&queue);
     }
     // Join only the threads that were actually created.
     for (int i = 0; i < started; i++) {
         pthread_join(threads[i], NULL);
     }
     // Aggregate results
     unsigned long long twc = 0, tcc = 0, tsc = 0, tnc = 0, tfc = 0;
     for (int i = 0; i < num_files; i++) {
         twc += results[i].wc;
         tcc += results[i].cc;
         tsc += results[i].sc;
         tnc += results[i].nc;
         tfc += results[i].fc;
     }
     // Guard each ratio against a zero denominator.
     double cc_pct = (twc > 0) ? (double)tcc / twc * 100.0 : 0;
     double fc_pct = (twc > 0) ? (double)tfc / twc * 100.0 : 0;
     double wps = (tsc > 0) ? (double)twc / tsc : 0;
     printf("\nTotal Words: %llu\n", twc);
     printf("Total Capitalized words: %llu\n", tcc);
     printf("Total Sentences: %llu\n", tsc);
     printf("Total Numbers: %llu\n", tnc);
     printf("Total Forbidden words: %llu\n", tfc);
     printf("Capitalized percentage: %.6f%%\n", cc_pct);
     printf("Forbidden percentage: %.6f%%\n", fc_pct);
     printf("Word count per sentence: %.6f\n", wps);
     // NOTE(review): this is the number of path arguments; files a worker
     // skipped (unreadable or empty) are still included -- confirm intent.
     printf("Total files read: %d\n", num_files);
     free(tasks);
     free(results);
     pthread_mutex_destroy(&mutex);
     return 0;
 }