#ifndef _GNU_SOURCE
#define _GNU_SOURCE
#endif

// NOTE(review): the header names on the original #include lines were lost
// (only ten bare "#include" tokens survive). The list below is reconstructed
// from the identifiers this file uses -- confirm against the original file.
#include <ctype.h>
#include <fcntl.h>
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <unistd.h>
#if defined(__AVX2__)
#include <immintrin.h> // only usable (and only needed) when AVX2 is enabled
#endif

#define MAX_WORD_LEN 64
#define MAX_THREADS 16
#define CACHE_LINE 64
#define MAX_FILES 1024

// Number of child slots per trie node: one per 7-bit ASCII code.
#define TRIE_FANOUT 128

// Compact trie with better cache locality
typedef struct TrieNode {
    struct TrieNode *children[TRIE_FANOUT]; // ASCII only
    unsigned char is_word;                  // 1 when a forbidden word ends here
} TrieNode;

// Root of the forbidden-word trie; NULL until build_trie() has run.
TrieNode *trie_root;

// Words looked up (case-insensitively) by trie_search().
// NOTE(review): "expers" may be a typo for "experts"; left unchanged because
// it alters what the program matches -- confirm with the word-list owner.
static const char *forbidden[] = {
    "recovery", "techie", "digital", "hack", "crypto", "bitcoin", "wallet",
    "hacker", "welcome", "whatsapp", "email", "cryptocurrency", "stolen",
    "freeze", "quick", "crucial", "tracing", "scammers", "expers", "hire",
    "century", "transaction", "essential", "managing", "contact",
    "contacting", "understanding", "assets", "funds"
};

// Element count derived from the array so the list can grow without a
// separately maintained magic number.
#define FORBIDDEN_COUNT (sizeof forbidden / sizeof forbidden[0])

// Maps a byte to its child slot (lower-cased ASCII code), or -1 when the
// byte is outside 7-bit ASCII and therefore cannot appear in the trie.
static inline int trie_slot(unsigned char byte)
{
    return byte < TRIE_FANOUT ? (tolower(byte) & 0x7F) : -1;
}

// Allocates one zeroed trie node; terminates the process if memory is
// exhausted, since nothing useful can be done without the trie.
static TrieNode *trie_new_node(void)
{
    TrieNode *node = calloc(1, sizeof *node);
    if (!node) {
        perror("calloc");
        exit(EXIT_FAILURE);
    }
    return node;
}

// Build trie case-insensitive
//
// Populates trie_root with every entry of forbidden[]. Must be called once
// before trie_search() can report any match.
void build_trie(void)
{
    trie_root = trie_new_node();

    for (size_t i = 0; i < FORBIDDEN_COUNT; i++) {
        TrieNode *node = trie_root;
        const char *w = forbidden[i];
        int insertable = 1;

        for (; *w; w++) {
            int slot = trie_slot((unsigned char)*w);
            if (slot < 0) {
                // A word with a non-ASCII byte can never be matched; leave
                // it unmarked rather than aliasing it onto an ASCII slot.
                insertable = 0;
                break;
            }
            if (!node->children[slot]) {
                node->children[slot] = trie_new_node();
            }
            node = node->children[slot];
        }

        if (insertable) {
            node->is_word = 1;
        }
    }
}

// Fast inline trie search
//
// word: bytes to look up (need not be NUL-terminated).
// len:  number of bytes of `word` to consider.
// Returns 1 when the first `len` bytes spell a forbidden word, ignoring
// ASCII case, and 0 otherwise -- including when the trie has not been built
// yet or when `word` contains a byte outside 7-bit ASCII.
static inline int trie_search(const char *word, int len)
{
    TrieNode *node = trie_root;
    if (!node) {
        return 0;
    }

    for (int i = 0; i < len; i++) {
        int slot = trie_slot((unsigned char)word[i]);
        if (slot < 0) {
            return 0; // previously such bytes were masked onto ASCII letters
        }
        node = node->children[slot];
        if (!node) {
            return 0;
        }
    }
    return node->is_word;
}

// Lookup tables
// is_*_tbl[b] is exactly 1 when byte b is in the class and 0 otherwise;
// to_lower_tbl[b] is tolower(b). All are filled in by init_tables().
static unsigned char is_upper_tbl[256];
static unsigned char is_digit_tbl[256];
static unsigned char is_alpha_tbl[256];
static unsigned char to_lower_tbl[256];

// Fills the byte-classification tables from <ctype.h> for the locale that is
// current at the time of the call.
void init_tables(void)
{
    for (int i = 0; i < 256; i++) {
        // The is*() functions only promise "nonzero" for a match; the value
        // can exceed 255 and would truncate to 0 in an unsigned char, so
        // normalise to 0/1 before storing.
        is_upper_tbl[i] = (unsigned char)(isupper(i) != 0);
        is_digit_tbl[i] = (unsigned char)(isdigit(i) != 0);
        is_alpha_tbl[i] = (unsigned char)(isalpha(i) != 0);
        to_lower_tbl[i] = (unsigned char)tolower(i);
    }
}

// Per-file counters: words, capitalized words, '.' bytes ("sentences"),
// words containing a digit, and forbidden words.
typedef struct {
    unsigned long long wc, cc, sc, nc, fc;
} Stats;

// One unit of work: the file to read and where its counters are stored.
typedef struct {
    char *path;
    Stats *result;
} FileTask;

// Task list shared by all workers; *next_task is the index of the next
// unclaimed task and is read/updated while holding *mutex.
typedef struct {
    FileTask *tasks;
    int *next_task;
    int total_tasks;
    pthread_mutex_t *mutex;
} WorkQueue;

// Process entire file optimized for 590KB
files void* process_file_worker(void* arg) { WorkQueue* queue = (WorkQueue*)arg; while (1) { pthread_mutex_lock(queue->mutex); int task_id = (*queue->next_task)++; pthread_mutex_unlock(queue->mutex); if (task_id >= queue->total_tasks) break; FileTask* task = &queue->tasks[task_id]; int fd = open(task->path, O_RDONLY); if (fd < 0) continue; struct stat st; if (fstat(fd, &st) < 0) { close(fd); continue; } size_t size = st.st_size; if (size == 0) { close(fd); continue; } unsigned char* data = mmap(NULL, size, PROT_READ, MAP_PRIVATE, fd, 0); close(fd); if (data == MAP_FAILED) continue; madvise(data, size, MADV_SEQUENTIAL | MADV_WILLNEED); unsigned long long wc = 0, cc = 0, sc = 0, nc = 0, fc = 0; char word[MAX_WORD_LEN + 1] __attribute__((aligned(16))); int wlen = 0; // AVX2 SIMD constants const __m256i dot_vec = _mm256_set1_epi8('.'); size_t i = 0; // Process 128 bytes at a time (4 AVX2 loads) while (i + 128 <= size) { __builtin_prefetch(data + i + 256, 0, 0); // Load and count periods __m256i v0 = _mm256_loadu_si256((__m256i*)(data + i)); __m256i v1 = _mm256_loadu_si256((__m256i*)(data + i + 32)); __m256i v2 = _mm256_loadu_si256((__m256i*)(data + i + 64)); __m256i v3 = _mm256_loadu_si256((__m256i*)(data + i + 96)); sc += __builtin_popcount(_mm256_movemask_epi8(_mm256_cmpeq_epi8(v0, dot_vec))); sc += __builtin_popcount(_mm256_movemask_epi8(_mm256_cmpeq_epi8(v1, dot_vec))); sc += __builtin_popcount(_mm256_movemask_epi8(_mm256_cmpeq_epi8(v2, dot_vec))); sc += __builtin_popcount(_mm256_movemask_epi8(_mm256_cmpeq_epi8(v3, dot_vec))); // Process characters for (int j = 0; j < 128; j++) { unsigned char c = data[i + j]; if (is_alpha_tbl[c]) { if (wlen < MAX_WORD_LEN) { word[wlen++] = c; } } else { if (wlen > 0) { word[wlen] = 0; wc++; cc += is_upper_tbl[(unsigned char)word[0]]; // Check for digits int has_digit = 0; for (int k = 0; k < wlen; k++) { if (is_digit_tbl[(unsigned char)word[k]]) { has_digit = 1; break; } } nc += has_digit; // Check forbidden if (wlen >= 4 && wlen 
<= 17) { fc += trie_search(word, wlen); } wlen = 0; } } } i += 128; } // Process remaining bytes while (i < size) { unsigned char c = data[i]; if (c == '.') sc++; if (is_alpha_tbl[c]) { if (wlen < MAX_WORD_LEN) { word[wlen++] = c; } } else { if (wlen > 0) { word[wlen] = 0; wc++; cc += is_upper_tbl[(unsigned char)word[0]]; int has_digit = 0; for (int k = 0; k < wlen; k++) { if (is_digit_tbl[(unsigned char)word[k]]) { has_digit = 1; break; } } nc += has_digit; if (wlen >= 4 && wlen <= 17) { fc += trie_search(word, wlen); } wlen = 0; } } i++; } // Final word if (wlen > 0) { word[wlen] = 0; wc++; cc += is_upper_tbl[(unsigned char)word[0]]; int has_digit = 0; for (int k = 0; k < wlen; k++) { if (is_digit_tbl[(unsigned char)word[k]]) { has_digit = 1; break; } } nc += has_digit; if (wlen >= 4 && wlen <= 17) { fc += trie_search(word, wlen); } } task->result->wc = wc; task->result->cc = cc; task->result->sc = sc; task->result->nc = nc; task->result->fc = fc; munmap(data, size); } return NULL; } int main(int argc, char* argv[]) { if (argc < 2) { fprintf(stderr, "Usage: %s ... 
\n", argv[0]); return 1; } init_tables(); build_trie(); // Setup work queue int num_files = argc - 1; FileTask* tasks = calloc(num_files, sizeof(FileTask)); Stats* results = calloc(num_files, sizeof(Stats)); for (int i = 0; i < num_files; i++) { tasks[i].path = argv[i + 1]; tasks[i].result = &results[i]; } int next_task = 0; pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER; WorkQueue queue = { .tasks = tasks, .next_task = &next_task, .total_tasks = num_files, .mutex = &mutex }; // Create thread pool int nthreads = MAX_THREADS; if (num_files < MAX_THREADS) nthreads = num_files; pthread_t threads[MAX_THREADS]; for (int i = 0; i < nthreads; i++) { pthread_create(&threads[i], NULL, process_file_worker, &queue); } for (int i = 0; i < nthreads; i++) { pthread_join(threads[i], NULL); } // Aggregate results unsigned long long twc = 0, tcc = 0, tsc = 0, tnc = 0, tfc = 0; for (int i = 0; i < num_files; i++) { twc += results[i].wc; tcc += results[i].cc; tsc += results[i].sc; tnc += results[i].nc; tfc += results[i].fc; } double cc_pct = (twc > 0) ? (double)tcc / twc * 100.0 : 0; double fc_pct = (twc > 0) ? (double)tfc / twc * 100.0 : 0; double wps = (tsc > 0) ? (double)twc / tsc : 0; printf("\nTotal Words: %llu\n", twc); printf("Total Capitalized words: %llu\n", tcc); printf("Total Sentences: %llu\n", tsc); printf("Total Numbers: %llu\n", tnc); printf("Total Forbidden words: %llu\n", tfc); printf("Capitalized percentage: %.6f%%\n", cc_pct); printf("Forbidden percentage: %.6f%%\n", fc_pct); printf("Word count per sentence: %.6f\n", wps); printf("Total files read: %d\n", num_files); free(tasks); free(results); pthread_mutex_destroy(&mutex); return 0; }