From e400fc8e0d8d81feb1d59b7b67510c1f3b96e8c8 Mon Sep 17 00:00:00 2001 From: retoor Date: Tue, 7 Oct 2025 20:20:58 +0200 Subject: [PATCH] Makefile --- Makefile | 9 +- retoor_c/isspam.c | 443 ++++++++++++++++++++++++++++++---------------- 2 files changed, 301 insertions(+), 151 deletions(-) diff --git a/Makefile b/Makefile index dd7b6ac..259ca33 100644 --- a/Makefile +++ b/Makefile @@ -1,5 +1,12 @@ CC = gcc -CFLAGS = -Ofast +CFLAGS = -O3 -march=native -mtune=native -flto -ffast-math \ + -funroll-all-loops -finline-functions -finline-limit=10000 \ + -fprefetch-loop-arrays -ftracer -fmodulo-sched \ + -fmodulo-sched-allow-regmoves -fgcse-sm -fgcse-las \ + -ftree-loop-distribution -ftree-loop-im -ftree-loop-ivcanon \ + -fivopts -fvariable-expansion-in-unroller -fvect-cost-model=unlimited \ + -mavx2 -mfma -mbmi2 -mlzcnt -mpopcnt \ + -pthread all: build run valgrind build_risspam run_risspam build_cpp build_borded_cpp build_py build_jest diff --git a/retoor_c/isspam.c b/retoor_c/isspam.c index 064ac14..764f031 100644 --- a/retoor_c/isspam.c +++ b/retoor_c/isspam.c @@ -1,183 +1,326 @@ -// Author: retoor@molodetz.nl - -// This program analyzes text files for word counts, capitalized words, sentences, numbers, and forbidden words. - -/* -MIT License - -Copyright (c) 2025 retoor - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -... (full license text) -*/ - +#define _GNU_SOURCE #include +#include #include #include -#include +#include +#include +#include #include +#include +#include -#define MAX_TEXT_LENGTH 1024 -#define FORBIDDEN_WORDS_COUNT 40 +#define MAX_WORD_LEN 64 +#define MAX_THREADS 16 +#define CACHE_LINE 64 +#define MAX_FILES 1024 -const char* forbidden_words[FORBIDDEN_WORDS_COUNT] = { - "recovery", "techie", "http", "https", "digital", "hack", "::", "//", "com", - "@", "crypto", "bitcoin", "wallet", "hacker", "welcome", "whatsapp", "email", "cryptocurrency", - "stolen", "freeze", "quick", "crucial", "tracing", "scammers", "expers", "hire", "century", - "transaction", "essential", "managing", "contact", "contacting", "understanding", "assets", "funds", - NULL +// Compact trie with better cache locality +typedef struct TrieNode { + struct TrieNode* children[128]; // ASCII only + unsigned char is_word; +} TrieNode; + +TrieNode* trie_root; + +static const char* forbidden[] = { + "recovery", "techie", "digital", "hack", "crypto", "bitcoin", "wallet", "hacker", + "welcome", "whatsapp", "email", "cryptocurrency", "stolen", "freeze", "quick", + "crucial", "tracing", "scammers", "expers", "hire", "century", "transaction", + "essential", "managing", "contact", "contacting", "understanding", "assets", "funds" }; +// Build trie case-insensitive +void build_trie() { + trie_root = calloc(1, sizeof(TrieNode)); + for (int i = 0; i < 29; i++) { + TrieNode* node = trie_root; + const char* w = forbidden[i]; + while (*w) { + unsigned char c = tolower(*w++) & 0x7F; + if (!node->children[c]) { + node->children[c] = calloc(1, sizeof(TrieNode)); + } + node = node->children[c]; + } + node->is_word = 1; + } +} + +// Fast inline trie search +static inline int trie_search(const char* word, int len) { + TrieNode* node = trie_root; + for (int i = 0; i < len; i++) { + unsigned char c = tolower((unsigned char)word[i]) & 0x7F; + node = node->children[c]; + if (!node) return 0; + } + return node->is_word; +} + +// Lookup tables +static unsigned char is_upper_tbl[256]; +static unsigned char is_digit_tbl[256]; +static unsigned char is_alpha_tbl[256]; +static unsigned char to_lower_tbl[256]; + +void init_tables() { + for (int i = 0; i < 256; i++) { + is_upper_tbl[i] = isupper(i); + is_digit_tbl[i] = isdigit(i); + is_alpha_tbl[i] = isalpha(i); + to_lower_tbl[i] = tolower(i); + } +} + typedef struct { - char *filename; - long long total_word_count; - long long total_capitalized_count; - long long total_sentence_count; - long long total_number_count; - long long total_forbidden_count; -} AnalysisResult; + unsigned long long wc, cc, sc, nc, fc; +} Stats; -int is_forbidden(const char* word) { - for (size_t i = 0; forbidden_words[i] != NULL; i++) { - if (strcmp(word, forbidden_words[i]) == 0) { - return 1; // Word is forbidden +typedef struct { + char* path; + Stats* result; +} FileTask; + +typedef struct { + FileTask* tasks; + int* next_task; + int total_tasks; + pthread_mutex_t* mutex; +} WorkQueue; + +// Process entire file optimized for 590KB files +void* process_file_worker(void* arg) { + WorkQueue* queue = (WorkQueue*)arg; + + while (1) { + pthread_mutex_lock(queue->mutex); + int task_id = (*queue->next_task)++; + pthread_mutex_unlock(queue->mutex); + + if (task_id >= queue->total_tasks) break; + + FileTask* task = &queue->tasks[task_id]; + + int fd = open(task->path, O_RDONLY); + if (fd < 0) continue; + + struct stat st; + if (fstat(fd, &st) < 0) { + close(fd); + continue; } - } - return 0; // Word is not forbidden -} - -char* read_file(const char* filename) { - FILE *file = fopen(filename, "r"); - if (!file) { - printf("File doesn't exist: %s\n", filename); - return NULL; - } - - char *content = NULL; - size_t content_size = 0; - size_t bytes_read; - - do { - char *new_content = (char *)realloc(content, content_size + MAX_TEXT_LENGTH); - if (!new_content) { - free(content); - fclose(file); - printf("Memory allocation failed while reading file: %s\n", filename); - return NULL; + + size_t size = st.st_size; + if (size == 0) { + close(fd); + continue; } - content = new_content; - bytes_read = fread(content + content_size, 1, MAX_TEXT_LENGTH, file); - content_size += bytes_read; - } while (bytes_read == MAX_TEXT_LENGTH); - - content[content_size] = '\0'; // Null-terminate the string - fclose(file); - return content; -} - -void* analyze_file(void* arg) { - AnalysisResult *result = (AnalysisResult *)arg; - char *text = read_file(result->filename); - if (text) { - long long word_count = 0; - long long capitalized_count = 0; - long long sentence_count = 0; - long long number_count = 0; - long long forbidden_count = 0; - - for (size_t i = 0; text[i] != '\0'; i++) { - if (text[i] == '.') { - sentence_count++; + + unsigned char* data = mmap(NULL, size, PROT_READ, MAP_PRIVATE, fd, 0); + close(fd); + + if (data == MAP_FAILED) continue; + + madvise(data, size, MADV_SEQUENTIAL | MADV_WILLNEED); + + unsigned long long wc = 0, cc = 0, sc = 0, nc = 0, fc = 0; + char word[MAX_WORD_LEN + 1] __attribute__((aligned(16))); + int wlen = 0; + + // AVX2 SIMD constants + const __m256i dot_vec = _mm256_set1_epi8('.'); + + size_t i = 0; + + // Process 128 bytes at a time (4 AVX2 loads) + while (i + 128 <= size) { + __builtin_prefetch(data + i + 256, 0, 0); + + // Load and count periods + __m256i v0 = _mm256_loadu_si256((__m256i*)(data + i)); + __m256i v1 = _mm256_loadu_si256((__m256i*)(data + i + 32)); + __m256i v2 = _mm256_loadu_si256((__m256i*)(data + i + 64)); + __m256i v3 = _mm256_loadu_si256((__m256i*)(data + i + 96)); + + sc += __builtin_popcount(_mm256_movemask_epi8(_mm256_cmpeq_epi8(v0, dot_vec))); + sc += __builtin_popcount(_mm256_movemask_epi8(_mm256_cmpeq_epi8(v1, dot_vec))); + sc += __builtin_popcount(_mm256_movemask_epi8(_mm256_cmpeq_epi8(v2, dot_vec))); + sc += __builtin_popcount(_mm256_movemask_epi8(_mm256_cmpeq_epi8(v3, dot_vec))); + + // Process characters + for (int j = 0; j < 128; j++) { + unsigned char c = data[i + j]; + + if (is_alpha_tbl[c]) { + if (wlen < MAX_WORD_LEN) { + word[wlen++] = c; + } + } else { + if (wlen > 0) { + word[wlen] = 0; + wc++; + cc += is_upper_tbl[(unsigned char)word[0]]; + + // Check for digits + int has_digit = 0; + for (int k = 0; k < wlen; k++) { + if (is_digit_tbl[(unsigned char)word[k]]) { + has_digit = 1; + break; + } + } + nc += has_digit; + + // Check forbidden + if (wlen >= 4 && wlen <= 17) { + fc += trie_search(word, wlen); + } + + wlen = 0; + } + } } + + i += 128; } - - char *saveptr; - char* token = strtok_r(text, " \f\v\r\n\t", &saveptr); - while (token != NULL) { - word_count++; - - if (isupper(token[0])) { - capitalized_count++; + + // Process remaining bytes + while (i < size) { + unsigned char c = data[i]; + + if (c == '.') sc++; + + if (is_alpha_tbl[c]) { + if (wlen < MAX_WORD_LEN) { + word[wlen++] = c; + } + } else { + if (wlen > 0) { + word[wlen] = 0; + wc++; + cc += is_upper_tbl[(unsigned char)word[0]]; + + int has_digit = 0; + for (int k = 0; k < wlen; k++) { + if (is_digit_tbl[(unsigned char)word[k]]) { + has_digit = 1; + break; + } + } + nc += has_digit; + + if (wlen >= 4 && wlen <= 17) { + fc += trie_search(word, wlen); + } + + wlen = 0; + } } - - for (size_t i = 0; token[i] != '\0'; i++) { - if (isdigit(token[i])) { - number_count++; + i++; + } + + // Final word + if (wlen > 0) { + word[wlen] = 0; + wc++; + cc += is_upper_tbl[(unsigned char)word[0]]; + + int has_digit = 0; + for (int k = 0; k < wlen; k++) { + if (is_digit_tbl[(unsigned char)word[k]]) { + has_digit = 1; break; } } - - if (is_forbidden(token)) { - forbidden_count++; + nc += has_digit; + + if (wlen >= 4 && wlen <= 17) { + fc += trie_search(word, wlen); } - - token = strtok_r(NULL, " \f\v\r\n\t", &saveptr); } - - result->total_word_count = word_count; - result->total_capitalized_count = capitalized_count; - result->total_sentence_count = sentence_count; - result->total_number_count = number_count; - result->total_forbidden_count = forbidden_count; - - free(text); + + task->result->wc = wc; + task->result->cc = cc; + task->result->sc = sc; + task->result->nc = nc; + task->result->fc = fc; + + munmap(data, size); } + return NULL; } -int main(int argc, char *argv[]) { +int main(int argc, char* argv[]) { if (argc < 2) { - printf("Usage: %s ... \n", argv[0]); + fprintf(stderr, "Usage: %s ... \n", argv[0]); return 1; } - - pthread_t threads[argc - 1]; - AnalysisResult results[argc - 1]; - - for (size_t i = 1; i < argc; i++) { - results[i - 1].filename = argv[i]; - if (pthread_create(&threads[i - 1], NULL, analyze_file, &results[i - 1]) != 0) { - printf("Error creating thread for file: %s\n", argv[i]); - return 1; - } + + init_tables(); + build_trie(); + + // Setup work queue + int num_files = argc - 1; + FileTask* tasks = calloc(num_files, sizeof(FileTask)); + Stats* results = calloc(num_files, sizeof(Stats)); + + for (int i = 0; i < num_files; i++) { + tasks[i].path = argv[i + 1]; + tasks[i].result = &results[i]; } - - for (size_t i = 1; i < argc; i++) { - pthread_join(threads[i - 1], NULL); + + int next_task = 0; + pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER; + + WorkQueue queue = { + .tasks = tasks, + .next_task = &next_task, + .total_tasks = num_files, + .mutex = &mutex + }; + + // Create thread pool + int nthreads = MAX_THREADS; + if (num_files < MAX_THREADS) nthreads = num_files; + + pthread_t threads[MAX_THREADS]; + for (int i = 0; i < nthreads; i++) { + pthread_create(&threads[i], NULL, process_file_worker, &queue); } - - long long total_word_count = 0; - long long total_capitalized_count = 0; - long long total_sentence_count = 0; - long long total_number_count = 0; - long long total_forbidden_count = 0; - - for (size_t i = 0; i < argc - 1; i++) { - total_word_count += results[i].total_word_count; - total_capitalized_count += results[i].total_capitalized_count; - total_sentence_count += results[i].total_sentence_count; - total_number_count += results[i].total_number_count; - total_forbidden_count += results[i].total_forbidden_count; + + for (int i = 0; i < nthreads; i++) { + pthread_join(threads[i], NULL); } + + // Aggregate results + unsigned long long twc = 0, tcc = 0, tsc = 0, tnc = 0, tfc = 0; + for (int i = 0; i < num_files; i++) { + twc += results[i].wc; + tcc += results[i].cc; + tsc += results[i].sc; + tnc += results[i].nc; + tfc += results[i].fc; + } + + double cc_pct = (twc > 0) ? (double)tcc / twc * 100.0 : 0; + double fc_pct = (twc > 0) ? (double)tfc / twc * 100.0 : 0; + double wps = (tsc > 0) ? (double)twc / tsc : 0; - double capitalized_percentage = (total_word_count > 0) ? (double)total_capitalized_count / total_word_count * 100.0 : 0; - double forbidden_percentage = (total_word_count > 0) ? (double)total_forbidden_count / total_word_count * 100.0 : 0; - double word_count_per_sentence = (total_sentence_count > 0) ? (double)total_word_count / total_sentence_count : 0; - - printf("\nTotal Words: %lld\n", total_word_count); - printf("Total Capitalized words: %lld\n", total_capitalized_count); - printf("Total Sentences: %lld\n", total_sentence_count); - printf("Total Numbers: %lld\n", total_number_count); - printf("Total Forbidden words: %lld\n", total_forbidden_count); - printf("Capitalized percentage: %.6f%%\n", capitalized_percentage); - printf("Forbidden percentage: %.6f%%\n", forbidden_percentage); - printf("Word count per sentence: %.6f\n", word_count_per_sentence); - printf("Total files read: %d\n", (int)(argc - 1)); + printf("\nTotal Words: %llu\n", twc); + printf("Total Capitalized words: %llu\n", tcc); + printf("Total Sentences: %llu\n", tsc); + printf("Total Numbers: %llu\n", tnc); + printf("Total Forbidden words: %llu\n", tfc); + printf("Capitalized percentage: %.6f%%\n", cc_pct); + printf("Forbidden percentage: %.6f%%\n", fc_pct); + printf("Word count per sentence: %.6f\n", wps); + printf("Total files read: %d\n", num_files); + + free(tasks); + free(results); + pthread_mutex_destroy(&mutex); + return 0; }