diff --git a/retoor_c/isspam.c b/retoor_c/isspam.c index 764f031..4670554 100644 --- a/retoor_c/isspam.c +++ b/retoor_c/isspam.c @@ -1,312 +1,232 @@ +// retoor #define _GNU_SOURCE #include #include #include -#include #include #include #include #include #include #include +#include -#define MAX_WORD_LEN 64 #define MAX_THREADS 16 -#define CACHE_LINE 64 -#define MAX_FILES 1024 - -// Compact trie with better cache locality -typedef struct TrieNode { - struct TrieNode* children[128]; // ASCII only - unsigned char is_word; -} TrieNode; - -TrieNode* trie_root; - -static const char* forbidden[] = { - "recovery", "techie", "digital", "hack", "crypto", "bitcoin", "wallet", "hacker", - "welcome", "whatsapp", "email", "cryptocurrency", "stolen", "freeze", "quick", - "crucial", "tracing", "scammers", "expers", "hire", "century", "transaction", - "essential", "managing", "contact", "contacting", "understanding", "assets", "funds" -}; - -// Build trie case-insensitive -void build_trie() { - trie_root = calloc(1, sizeof(TrieNode)); - for (int i = 0; i < 29; i++) { - TrieNode* node = trie_root; - const char* w = forbidden[i]; - while (*w) { - unsigned char c = tolower(*w++) & 0x7F; - if (!node->children[c]) { - node->children[c] = calloc(1, sizeof(TrieNode)); - } - node = node->children[c]; - } - node->is_word = 1; - } -} - -// Fast inline trie search -static inline int trie_search(const char* word, int len) { - TrieNode* node = trie_root; - for (int i = 0; i < len; i++) { - unsigned char c = tolower((unsigned char)word[i]) & 0x7F; - node = node->children[c]; - if (!node) return 0; - } - return node->is_word; -} - -// Lookup tables -static unsigned char is_upper_tbl[256]; -static unsigned char is_digit_tbl[256]; -static unsigned char is_alpha_tbl[256]; -static unsigned char to_lower_tbl[256]; - -void init_tables() { - for (int i = 0; i < 256; i++) { - is_upper_tbl[i] = isupper(i); - is_digit_tbl[i] = isdigit(i); - is_alpha_tbl[i] = isalpha(i); - to_lower_tbl[i] = tolower(i); - } -} typedef struct { - unsigned long long wc, cc, sc, nc, fc; + uint64_t wc, cc, sc, nc, fc; } Stats; typedef struct { char* path; - Stats* result; + Stats result; } FileTask; typedef struct { FileTask* tasks; - int* next_task; + int next_task; int total_tasks; - pthread_mutex_t* mutex; + pthread_mutex_t mutex; } WorkQueue; -// Process entire file optimized for 590KB files -void* process_file_worker(void* arg) { - WorkQueue* queue = (WorkQueue*)arg; - - while (1) { - pthread_mutex_lock(queue->mutex); - int task_id = (*queue->next_task)++; - pthread_mutex_unlock(queue->mutex); - - if (task_id >= queue->total_tasks) break; - - FileTask* task = &queue->tasks[task_id]; - - int fd = open(task->path, O_RDONLY); - if (fd < 0) continue; - - struct stat st; - if (fstat(fd, &st) < 0) { - close(fd); - continue; - } - - size_t size = st.st_size; - if (size == 0) { - close(fd); - continue; - } - - unsigned char* data = mmap(NULL, size, PROT_READ, MAP_PRIVATE, fd, 0); +static uint8_t is_ws[256]; +static uint8_t is_upper[256]; +static uint16_t fw_len_bits[256]; +static __uint128_t fw_words[256][8]; +static uint8_t fw_counts[256]; + +static void init_tables(void) { + static const char* forbidden[] = { + "recovery", "techie", "http", "https", "digital", "hack", "::", "//", "com", + "@", "crypto", "bitcoin", "wallet", "hacker", "welcome", "whatsapp", "email", + "cryptocurrency", "stolen", "freeze", "quick", "crucial", "tracing", "scammers", + "expers", "hire", "century", "transaction", "essential", "managing", "contact", + "contacting", "understanding", "assets", "funds" + }; + for (int i = 0; i < 256; i++) { + is_ws[i] = (i == ' ' || i == '\t' || i == '\n' || i == '\r' || i == '\f'); + is_upper[i] = (i >= 'A' && i <= 'Z'); + fw_len_bits[i] = 0; + fw_counts[i] = 0; + } + for (int i = 0; i < 35; i++) { + int len = strlen(forbidden[i]); + uint8_t first = (uint8_t)forbidden[i][0]; + fw_len_bits[first] |= (1u << len); + __uint128_t val = 0; + memcpy(&val, forbidden[i], len); + fw_words[first][fw_counts[first]++] = val; + } +} + +static const __uint128_t len_masks[17] = { + 0, 0xFFULL, 0xFFFFULL, 0xFFFFFFULL, 0xFFFFFFFFULL, + 0xFFFFFFFFFFULL, 0xFFFFFFFFFFFFULL, 0xFFFFFFFFFFFFFFULL, 0xFFFFFFFFFFFFFFFFULL, + ((__uint128_t)0xFF << 64) | 0xFFFFFFFFFFFFFFFFULL, + ((__uint128_t)0xFFFF << 64) | 0xFFFFFFFFFFFFFFFFULL, + ((__uint128_t)0xFFFFFF << 64) | 0xFFFFFFFFFFFFFFFFULL, + ((__uint128_t)0xFFFFFFFF << 64) | 0xFFFFFFFFFFFFFFFFULL, + ((__uint128_t)0xFFFFFFFFFF << 64) | 0xFFFFFFFFFFFFFFFFULL, + ((__uint128_t)0xFFFFFFFFFFFF << 64) | 0xFFFFFFFFFFFFFFFFULL, + ((__uint128_t)0xFFFFFFFFFFFFFF << 64) | 0xFFFFFFFFFFFFFFFFULL, + (__uint128_t)-1 +}; + +static inline __attribute__((always_inline, hot)) int is_forbidden(const uint8_t* word, size_t len) { + if (__builtin_expect(len > 14, 0)) return 0; + uint8_t first = word[0]; + uint16_t bits = fw_len_bits[first]; + if (__builtin_expect((bits & (1u << len)) == 0, 1)) return 0; + __uint128_t w = 0; + memcpy(&w, word, len); + w &= len_masks[len]; + __uint128_t* fwords = fw_words[first]; + int cnt = fw_counts[first]; + for (int i = 0; i < cnt; i++) { + if (fwords[i] == w) return 1; + } + return 0; +} + +static void process_file(FileTask* task) { + int fd = open(task->path, O_RDONLY); + if (fd < 0) return; + + struct stat st; + if (fstat(fd, &st) < 0 || st.st_size == 0) { close(fd); - - if (data == MAP_FAILED) continue; - - madvise(data, size, MADV_SEQUENTIAL | MADV_WILLNEED); - - unsigned long long wc = 0, cc = 0, sc = 0, nc = 0, fc = 0; - char word[MAX_WORD_LEN + 1] __attribute__((aligned(16))); - int wlen = 0; - - // AVX2 SIMD constants - const __m256i dot_vec = _mm256_set1_epi8('.'); - - size_t i = 0; - - // Process 128 bytes at a time (4 AVX2 loads) - while (i + 128 <= size) { - __builtin_prefetch(data + i + 256, 0, 0); - - // Load and count periods - __m256i v0 = _mm256_loadu_si256((__m256i*)(data + i)); - __m256i v1 = _mm256_loadu_si256((__m256i*)(data + i + 32)); - __m256i v2 = _mm256_loadu_si256((__m256i*)(data + i + 64)); - __m256i v3 = _mm256_loadu_si256((__m256i*)(data + i + 96)); - - sc += __builtin_popcount(_mm256_movemask_epi8(_mm256_cmpeq_epi8(v0, dot_vec))); - sc += __builtin_popcount(_mm256_movemask_epi8(_mm256_cmpeq_epi8(v1, dot_vec))); - sc += __builtin_popcount(_mm256_movemask_epi8(_mm256_cmpeq_epi8(v2, dot_vec))); - sc += __builtin_popcount(_mm256_movemask_epi8(_mm256_cmpeq_epi8(v3, dot_vec))); - - // Process characters - for (int j = 0; j < 128; j++) { - unsigned char c = data[i + j]; - - if (is_alpha_tbl[c]) { - if (wlen < MAX_WORD_LEN) { - word[wlen++] = c; - } - } else { - if (wlen > 0) { - word[wlen] = 0; - wc++; - cc += is_upper_tbl[(unsigned char)word[0]]; - - // Check for digits - int has_digit = 0; - for (int k = 0; k < wlen; k++) { - if (is_digit_tbl[(unsigned char)word[k]]) { - has_digit = 1; - break; - } - } - nc += has_digit; - - // Check forbidden - if (wlen >= 4 && wlen <= 17) { - fc += trie_search(word, wlen); - } - - wlen = 0; - } - } - } - - i += 128; - } - - // Process remaining bytes - while (i < size) { - unsigned char c = data[i]; - - if (c == '.') sc++; - - if (is_alpha_tbl[c]) { - if (wlen < MAX_WORD_LEN) { - word[wlen++] = c; - } - } else { - if (wlen > 0) { - word[wlen] = 0; - wc++; - cc += is_upper_tbl[(unsigned char)word[0]]; - - int has_digit = 0; - for (int k = 0; k < wlen; k++) { - if (is_digit_tbl[(unsigned char)word[k]]) { - has_digit = 1; - break; - } - } - nc += has_digit; - - if (wlen >= 4 && wlen <= 17) { - fc += trie_search(word, wlen); - } - - wlen = 0; - } - } + return; + } + + size_t size = st.st_size; + uint8_t* data = mmap(NULL, size, PROT_READ, MAP_PRIVATE, fd, 0); + close(fd); + if (data == MAP_FAILED) return; + + madvise(data, size, MADV_SEQUENTIAL | MADV_WILLNEED); + + uint64_t wc = 0, cc = 0, sc = 0, nc = 0, fc = 0; + + const __m256i dot_vec = _mm256_set1_epi8('.'); + const __m256i zero_vec = _mm256_set1_epi8('0' - 1); + const __m256i nine_vec = _mm256_set1_epi8('9' + 1); + size_t i = 0; + size_t simd_end = size & ~127ULL; + + while (i < simd_end) { + __builtin_prefetch(data + i + 512, 0, 0); + __m256i v0 = _mm256_loadu_si256((__m256i*)(data + i)); + __m256i v1 = _mm256_loadu_si256((__m256i*)(data + i + 32)); + __m256i v2 = _mm256_loadu_si256((__m256i*)(data + i + 64)); + __m256i v3 = _mm256_loadu_si256((__m256i*)(data + i + 96)); + + sc += __builtin_popcount(_mm256_movemask_epi8(_mm256_cmpeq_epi8(v0, dot_vec))); + sc += __builtin_popcount(_mm256_movemask_epi8(_mm256_cmpeq_epi8(v1, dot_vec))); + sc += __builtin_popcount(_mm256_movemask_epi8(_mm256_cmpeq_epi8(v2, dot_vec))); + sc += __builtin_popcount(_mm256_movemask_epi8(_mm256_cmpeq_epi8(v3, dot_vec))); + + nc += __builtin_popcount(_mm256_movemask_epi8(_mm256_and_si256(_mm256_cmpgt_epi8(v0, zero_vec), _mm256_cmpgt_epi8(nine_vec, v0)))); + nc += __builtin_popcount(_mm256_movemask_epi8(_mm256_and_si256(_mm256_cmpgt_epi8(v1, zero_vec), _mm256_cmpgt_epi8(nine_vec, v1)))); + nc += __builtin_popcount(_mm256_movemask_epi8(_mm256_and_si256(_mm256_cmpgt_epi8(v2, zero_vec), _mm256_cmpgt_epi8(nine_vec, v2)))); + nc += __builtin_popcount(_mm256_movemask_epi8(_mm256_and_si256(_mm256_cmpgt_epi8(v3, zero_vec), _mm256_cmpgt_epi8(nine_vec, v3)))); + i += 128; + } + while (i < size) { + uint8_t c = data[i]; + sc += (c == '.'); + nc += (c >= '0' && c <= '9'); + i++; + } + + i = 0; + while (i < size) { + while (i < size && is_ws[data[i]]) i++; + if (i >= size) break; + + size_t word_start = i; + int all_upper = 1; + + while (i < size && !is_ws[data[i]]) { + all_upper &= is_upper[data[i]]; i++; } - - // Final word - if (wlen > 0) { - word[wlen] = 0; - wc++; - cc += is_upper_tbl[(unsigned char)word[0]]; - - int has_digit = 0; - for (int k = 0; k < wlen; k++) { - if (is_digit_tbl[(unsigned char)word[k]]) { - has_digit = 1; - break; - } - } - nc += has_digit; - - if (wlen >= 4 && wlen <= 17) { - fc += trie_search(word, wlen); - } + + wc++; + cc += all_upper; + size_t wlen = i - word_start; + if (wlen <= 14) { + fc += is_forbidden(data + word_start, wlen); } - - task->result->wc = wc; - task->result->cc = cc; - task->result->sc = sc; - task->result->nc = nc; - task->result->fc = fc; - - munmap(data, size); } - + + task->result.wc = wc; + task->result.cc = cc; + task->result.sc = sc; + task->result.nc = nc; + task->result.fc = fc; + + munmap(data, size); +} + +static void* worker(void* arg) { + WorkQueue* q = (WorkQueue*)arg; + + while (1) { + pthread_mutex_lock(&q->mutex); + int task_id = q->next_task++; + pthread_mutex_unlock(&q->mutex); + + if (task_id >= q->total_tasks) break; + process_file(&q->tasks[task_id]); + } return NULL; } int main(int argc, char* argv[]) { if (argc < 2) { - fprintf(stderr, "Usage: %s ... \n", argv[0]); + fprintf(stderr, "Usage: %s [file2] ...\n", argv[0]); return 1; } - + init_tables(); - build_trie(); - - // Setup work queue + int num_files = argc - 1; FileTask* tasks = calloc(num_files, sizeof(FileTask)); - Stats* results = calloc(num_files, sizeof(Stats)); - + if (!tasks) return 1; + for (int i = 0; i < num_files; i++) { tasks[i].path = argv[i + 1]; - tasks[i].result = &results[i]; } - - int next_task = 0; - pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER; - + WorkQueue queue = { .tasks = tasks, - .next_task = &next_task, + .next_task = 0, .total_tasks = num_files, - .mutex = &mutex + .mutex = PTHREAD_MUTEX_INITIALIZER }; - - // Create thread pool - int nthreads = MAX_THREADS; - if (num_files < MAX_THREADS) nthreads = num_files; - + + int nthreads = num_files < MAX_THREADS ? num_files : MAX_THREADS; pthread_t threads[MAX_THREADS]; + for (int i = 0; i < nthreads; i++) { - pthread_create(&threads[i], NULL, process_file_worker, &queue); + pthread_create(&threads[i], NULL, worker, &queue); } - for (int i = 0; i < nthreads; i++) { pthread_join(threads[i], NULL); } - - // Aggregate results + unsigned long long twc = 0, tcc = 0, tsc = 0, tnc = 0, tfc = 0; for (int i = 0; i < num_files; i++) { - twc += results[i].wc; - tcc += results[i].cc; - tsc += results[i].sc; - tnc += results[i].nc; - tfc += results[i].fc; + twc += tasks[i].result.wc; + tcc += tasks[i].result.cc; + tsc += tasks[i].result.sc; + tnc += tasks[i].result.nc; + tfc += tasks[i].result.fc; } - - double cc_pct = (twc > 0) ? (double)tcc / twc * 100.0 : 0; - double fc_pct = (twc > 0) ? (double)tfc / twc * 100.0 : 0; - double wps = (tsc > 0) ? (double)twc / tsc : 0; + + double cc_pct = twc > 0 ? (double)tcc / twc * 100.0 : 0; + double fc_pct = twc > 0 ? (double)tfc / twc * 100.0 : 0; + double wps = tsc > 0 ? (double)twc / tsc : 0; printf("\nTotal Words: %llu\n", twc); printf("Total Capitalized words: %llu\n", tcc); @@ -317,10 +237,9 @@ int main(int argc, char* argv[]) { printf("Forbidden percentage: %.6f%%\n", fc_pct); printf("Word count per sentence: %.6f\n", wps); printf("Total files read: %d\n", num_files); - + free(tasks); - free(results); - pthread_mutex_destroy(&mutex); - + pthread_mutex_destroy(&queue.mutex); + return 0; }