// retoor
//
// Text statistics over the files named on the command line: words, words made
// up only of 'A'..'Z' (reported as "Capitalized"), '.' bytes (reported as
// "Sentences"), ASCII digit bytes (reported as "Numbers") and words that match
// a fixed forbidden list. Files are handed out to at most MAX_THREADS worker
// threads; the per-file results are summed and printed at the end.
//
// Uses AVX2 intrinsics and pthreads, so build with AVX2 and thread support
// enabled (e.g. -mavx2 -pthread).
//
// NOTE(review): the header names on the #include lines were missing from the
// text under review; the ten below are reconstructed from the identifiers the
// code uses -- confirm against the original file.

#define _GNU_SOURCE
#include <fcntl.h>
#include <immintrin.h>
#include <pthread.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <unistd.h>

#define MAX_THREADS 16

enum {
    FW_MAX_LEN = 14,    /* longest word is_forbidden() will try to match */
    FW_BUCKET_CAP = 8   /* forbidden words stored per first byte */
};

/* Per-file counters: words, all-uppercase words, '.' bytes, digit bytes,
 * forbidden words. */
typedef struct {
    uint64_t wc, cc, sc, nc, fc;
} Stats;

/* One input file and the counters computed for it. */
typedef struct {
    char *path;
    Stats result;
} FileTask;

/* Shared list of tasks; next_task is claimed under mutex. */
typedef struct {
    FileTask *tasks;
    int next_task;
    int total_tasks;
    pthread_mutex_t mutex;
} WorkQueue;

/* Byte classification tables, filled by init_tables(). */
static uint8_t is_ws[256];
static uint8_t is_upper[256];

/* Forbidden-word index, keyed by the word's first byte:
 *   fw_len_bits[b]  bit n set => some forbidden word starting with b has length n
 *   fw_words[b][k]  the k-th such word, its bytes copied into a zeroed 128-bit value
 *   fw_counts[b]    number of valid entries in fw_words[b]
 */
static uint16_t fw_len_bits[256];
static __uint128_t fw_words[256][FW_BUCKET_CAP];
static uint8_t fw_counts[256];

/*
 * Fill the byte classification tables and the forbidden-word index.
 * Must run before any call to is_forbidden() or process_file().
 */
static void init_tables(void)
{
    static const char *const forbidden[] = {
        "recovery", "techie", "http", "https", "digital", "hack", "::", "//",
        "com", "@", "crypto", "bitcoin", "wallet", "hacker", "welcome",
        "whatsapp", "email", "cryptocurrency", "stolen", "freeze", "quick",
        "crucial", "tracing", "scammers", "expers", "hire", "century",
        "transaction", "essential", "managing", "contact", "contacting",
        "understanding", "assets", "funds"
    };
    const size_t n_forbidden = sizeof forbidden / sizeof forbidden[0];

    for (int c = 0; c < 256; c++) {
        is_ws[c] = (c == ' ' || c == '\t' || c == '\n' || c == '\r' || c == '\f');
        is_upper[c] = (c >= 'A' && c <= 'Z');
        fw_len_bits[c] = 0;
        fw_counts[c] = 0;
    }

    for (size_t k = 0; k < n_forbidden; k++) {
        size_t len = strlen(forbidden[k]);
        uint8_t first = (uint8_t)forbidden[k][0];

        /* Skip anything the fixed-size index cannot represent instead of
         * writing past a bucket or shifting outside the 16-bit length mask. */
        if (len == 0 || len > FW_MAX_LEN || fw_counts[first] >= FW_BUCKET_CAP) {
            continue;
        }

        fw_len_bits[first] |= (uint16_t)(1u << len);

        __uint128_t packed = 0;
        memcpy(&packed, forbidden[k], len);
        fw_words[first][fw_counts[first]++] = packed;
    }
}

/*
 * Return 1 if the len bytes at word are exactly one of the forbidden words
 * (byte-for-byte, case-sensitive), 0 otherwise.
 */
static inline __attribute__((always_inline, hot))
int is_forbidden(const uint8_t *word, size_t len)
{
    if (__builtin_expect(len == 0 || len > FW_MAX_LEN, 0)) {
        return 0;
    }

    uint8_t first = word[0];

    /* Cheap reject: no forbidden word with this first byte has this length. */
    if (__builtin_expect((fw_len_bits[first] & (1u << len)) == 0, 1)) {
        return 0;
    }

    /* Pack the candidate the same way init_tables() packed the list entries:
     * len bytes copied into a zeroed value, so the unused bytes compare equal. */
    __uint128_t packed = 0;
    memcpy(&packed, word, len);

    const __uint128_t *bucket = fw_words[first];
    int n = fw_counts[first];
    for (int k = 0; k < n; k++) {
        if (bucket[k] == packed) {
            return 1;
        }
    }
    return 0;
}

/* Number of bytes in v equal to the byte replicated in needle. */
static inline unsigned count_eq(__m256i v, __m256i needle)
{
    return (unsigned)__builtin_popcount(
        (unsigned)_mm256_movemask_epi8(_mm256_cmpeq_epi8(v, needle)));
}

/* Number of bytes b in v with below < b < above, compared as signed bytes. */
static inline unsigned count_between(__m256i v, __m256i below, __m256i above)
{
    __m256i in_range = _mm256_and_si256(_mm256_cmpgt_epi8(v, below),
                                        _mm256_cmpgt_epi8(above, v));
    return (unsigned)__builtin_popcount((unsigned)_mm256_movemask_epi8(in_range));
}

/* Count '.' bytes and ASCII digit bytes in data[0..size). */
static void scan_bytes(const uint8_t *data, size_t size,
                       uint64_t *dots_out, uint64_t *digits_out)
{
    uint64_t dots = 0, digits = 0;

    const __m256i dot_vec = _mm256_set1_epi8('.');
    /* Exclusive bounds for the signed compare; bytes >= 0x80 are negative as
     * signed values and therefore never fall inside the range. */
    const __m256i below_zero = _mm256_set1_epi8('0' - 1);
    const __m256i above_nine = _mm256_set1_epi8('9' + 1);

    size_t pos = 0;
    const size_t simd_end = size & ~(size_t)127;   /* whole 128-byte chunks */

    while (pos < simd_end) {
        __builtin_prefetch(data + pos + 512, 0, 0);

        for (size_t off = 0; off < 128; off += 32) {
            __m256i v = _mm256_loadu_si256((const __m256i *)(data + pos + off));
            dots += count_eq(v, dot_vec);
            digits += count_between(v, below_zero, above_nine);
        }
        pos += 128;
    }

    /* Tail shorter than one chunk. */
    for (; pos < size; pos++) {
        uint8_t c = data[pos];
        dots += (c == '.');
        digits += (c >= '0' && c <= '9');
    }

    *dots_out = dots;
    *digits_out = digits;
}

/*
 * Split data[0..size) into maximal runs of non-whitespace bytes and count
 * them (words), those consisting only of 'A'..'Z' (upper) and those on the
 * forbidden list (forbidden).
 */
static void scan_words(const uint8_t *data, size_t size,
                       uint64_t *words_out, uint64_t *upper_out,
                       uint64_t *forbidden_out)
{
    uint64_t words = 0, upper = 0, forbidden = 0;
    size_t pos = 0;

    for (;;) {
        while (pos < size && is_ws[data[pos]]) {
            pos++;
        }
        if (pos >= size) {
            break;
        }

        const size_t start = pos;
        int all_upper = 1;
        while (pos < size && !is_ws[data[pos]]) {
            all_upper &= is_upper[data[pos]];
            pos++;
        }

        words++;
        upper += (uint64_t)all_upper;
        /* is_forbidden() rejects words longer than FW_MAX_LEN itself. */
        forbidden += (uint64_t)is_forbidden(data + start, pos - start);
    }

    *words_out = words;
    *upper_out = upper;
    *forbidden_out = forbidden;
}

/*
 * Map task->path read-only and store its counters in task->result.
 * If the file cannot be opened, stat'ed or mapped, or is empty, the function
 * returns without touching task->result.
 */
static void process_file(FileTask *task)
{
    int fd = open(task->path, O_RDONLY);
    if (fd < 0) {
        return;
    }

    struct stat st;
    if (fstat(fd, &st) < 0 || st.st_size <= 0) {
        close(fd);
        return;
    }

    size_t size = (size_t)st.st_size;
    uint8_t *data = mmap(NULL, size, PROT_READ, MAP_PRIVATE, fd, 0);
    close(fd);   /* the mapping stays valid after the descriptor is closed */
    if (data == MAP_FAILED) {
        return;
    }

    /* madvise() takes a single advice value, not a bit mask, so each hint
     * needs its own call. Both are only hints; failure is not an error here. */
    (void)madvise(data, size, MADV_SEQUENTIAL);
    (void)madvise(data, size, MADV_WILLNEED);

    uint64_t wc = 0, cc = 0, sc = 0, nc = 0, fc = 0;
    scan_bytes(data, size, &sc, &nc);
    scan_words(data, size, &wc, &cc, &fc);

    task->result.wc = wc;
    task->result.cc = cc;
    task->result.sc = sc;
    task->result.nc = nc;
    task->result.fc = fc;

    munmap(data, size);
}

/*
 * Thread entry point. arg is the shared WorkQueue; the thread repeatedly
 * claims the next task index under the queue mutex and processes that file
 * until the indices run out. Always returns NULL.
 */
static void *worker(void *arg)
{
    WorkQueue *q = (WorkQueue *)arg;

    for (;;) {
        pthread_mutex_lock(&q->mutex);
        int task_id = q->next_task++;
        pthread_mutex_unlock(&q->mutex);

        if (task_id >= q->total_tasks) {
            break;
        }
        process_file(&q->tasks[task_id]);
    }
    return NULL;
}

/*
 * Process every file named in argv[1..] and print the combined statistics.
 * Returns 1 on missing arguments or allocation failure, 0 otherwise.
 */
int main(int argc, char *argv[])
{
    if (argc < 2) {
        fprintf(stderr, "Usage: %s <file1> [file2] ...\n", argv[0]);
        return 1;
    }

    init_tables();

    int num_files = argc - 1;
    FileTask *tasks = calloc((size_t)num_files, sizeof *tasks);
    if (!tasks) {
        perror("calloc");
        return 1;
    }
    for (int i = 0; i < num_files; i++) {
        tasks[i].path = argv[i + 1];
    }

    WorkQueue queue = {
        .tasks = tasks,
        .next_task = 0,
        .total_tasks = num_files,
        .mutex = PTHREAD_MUTEX_INITIALIZER
    };

    int wanted = num_files < MAX_THREADS ? num_files : MAX_THREADS;
    pthread_t threads[MAX_THREADS];
    int started = 0;

    /* Only threads that were actually created may be joined. Any thread that
     * did start drains the whole queue, so stopping at the first failure
     * loses no work. */
    for (int i = 0; i < wanted; i++) {
        if (pthread_create(&threads[started], NULL, worker, &queue) != 0) {
            break;
        }
        started++;
    }
    if (started == 0) {
        /* No thread could be created: do the work on this thread instead. */
        worker(&queue);
    }
    for (int i = 0; i < started; i++) {
        pthread_join(threads[i], NULL);
    }

    unsigned long long twc = 0, tcc = 0, tsc = 0, tnc = 0, tfc = 0;
    for (int i = 0; i < num_files; i++) {
        twc += tasks[i].result.wc;
        tcc += tasks[i].result.cc;
        tsc += tasks[i].result.sc;
        tnc += tasks[i].result.nc;
        tfc += tasks[i].result.fc;
    }

    /* Ratios are reported as 0 when their denominator is 0. */
    double cc_pct = twc > 0 ? (double)tcc / twc * 100.0 : 0;
    double fc_pct = twc > 0 ? (double)tfc / twc * 100.0 : 0;
    double wps = tsc > 0 ? (double)twc / tsc : 0;

    printf("\nTotal Words: %llu\n", twc);
    printf("Total Capitalized words: %llu\n", tcc);
    printf("Total Sentences: %llu\n", tsc);
    printf("Total Numbers: %llu\n", tnc);
    printf("Total Forbidden words: %llu\n", tfc);
    printf("Capitalized percentage: %.6f%%\n", cc_pct);
    printf("Forbidden percentage: %.6f%%\n", fc_pct);
    printf("Word count per sentence: %.6f\n", wps);
    printf("Total files read: %d\n", num_files);

    free(tasks);
    pthread_mutex_destroy(&queue.mutex);
    return 0;
}