DEMOLITION OF RUST
All checks were successful
isspam build / build (push) Successful in 1m55s

This commit is contained in:
retoor 2025-12-14 21:09:30 +01:00
parent e400fc8e0d
commit 88ee6914b2

View File

@ -1,143 +1,120 @@
// retoor <retoor@molodetz.nl>
#define _GNU_SOURCE
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
#include <fcntl.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <pthread.h>
#include <unistd.h>
#include <immintrin.h>
#include <stdint.h>
// Capacity (excluding the terminator) of the word buffer declared as word[MAX_WORD_LEN + 1].
#define MAX_WORD_LEN 64
// Upper bound on the number of worker threads main() starts; also sizes its pthread_t array.
#define MAX_THREADS 16
// NOTE(review): not referenced in the visible source - presumably a cache-line size in bytes; confirm or remove.
#define CACHE_LINE 64
// NOTE(review): not referenced in the visible source; main() sizes its task array from argc instead.
#define MAX_FILES 1024
/* Trie node: one child slot per 7-bit character code plus an end-of-word flag. */
typedef struct TrieNode TrieNode;
struct TrieNode {
    TrieNode *children[128]; /* indexed by the 7-bit character code */
    unsigned char is_word;   /* non-zero when a stored word ends at this node */
};

/* Root of the forbidden-word trie; allocated by build_trie(). */
TrieNode *trie_root;
/* Words counted as "forbidden" hits; stored lower-case, one entry per line. */
static const char *forbidden[] = {
    "recovery",
    "techie",
    "digital",
    "hack",
    "crypto",
    "bitcoin",
    "wallet",
    "hacker",
    "welcome",
    "whatsapp",
    "email",
    "cryptocurrency",
    "stolen",
    "freeze",
    "quick",
    "crucial",
    "tracing",
    "scammers",
    "expers",
    "hire",
    "century",
    "transaction",
    "essential",
    "managing",
    "contact",
    "contacting",
    "understanding",
    "assets",
    "funds",
};
// Build trie case-insensitive
void build_trie() {
trie_root = calloc(1, sizeof(TrieNode));
for (int i = 0; i < 29; i++) {
TrieNode* node = trie_root;
const char* w = forbidden[i];
while (*w) {
unsigned char c = tolower(*w++) & 0x7F;
if (!node->children[c]) {
node->children[c] = calloc(1, sizeof(TrieNode));
}
node = node->children[c];
}
node->is_word = 1;
}
}
/*
 * Case-insensitive lookup of word[0..len) in the global trie.
 *
 * Returns non-zero when the exact word was inserted by build_trie(), 0
 * otherwise. A word containing any byte above 0x7F never matches: masking
 * such a byte down to 7 bits would alias it onto an unrelated ASCII letter
 * and produce false hits. Returns 0 when the trie has not been built yet.
 */
static inline int trie_search(const char* word, int len) {
    const TrieNode *node = trie_root;
    if (!node) return 0;
    for (int i = 0; i < len; i++) {
        unsigned char raw = (unsigned char)word[i];
        if (raw > 0x7F) return 0;
        node = node->children[(unsigned char)tolower(raw)];
        if (!node) return 0;
    }
    return node->is_word;
}
/* Per-byte classification tables, filled in by init_tables(). Flags are exactly 0 or 1. */
static unsigned char is_upper_tbl[256];
static unsigned char is_digit_tbl[256];
static unsigned char is_alpha_tbl[256];
static unsigned char to_lower_tbl[256];

/*
 * Populate the lookup tables from <ctype.h> for every byte value 0..255.
 *
 * The is*() functions only promise "non-zero" for a match; the value can be
 * larger than 255 and would truncate to 0 when stored in an unsigned char.
 * Each result is therefore normalised to 0/1 before it is stored, which also
 * makes the flags safe to add directly to counters.
 */
void init_tables(void) {
    for (int i = 0; i < 256; i++) {
        is_upper_tbl[i] = (unsigned char)(isupper(i) != 0);
        is_digit_tbl[i] = (unsigned char)(isdigit(i) != 0);
        is_alpha_tbl[i] = (unsigned char)(isalpha(i) != 0);
        to_lower_tbl[i] = (unsigned char)tolower(i);
    }
}
// Counters gathered for one file: wc, cc, sc, nc, fc.
// In the visible code sc is bumped per '.' byte, nc per ASCII digit and fc per forbidden-word
// hit; the summed wc/cc are printed as "Total Words" / "Total Capitalized words".
// NOTE(review): the two member lines declare the same names with different types
// (unsigned long long vs uint64_t) - they look like the removed and the added line of a diff
// shown together; only one of them can remain for this struct to compile.
typedef struct {
unsigned long long wc, cc, sc, nc, fc;
uint64_t wc, cc, sc, nc, fc;
} Stats;
// One unit of work: the path of a file to scan and the place its counters are written.
// NOTE(review): `result` appears twice, once as a pointer and once as an embedded Stats -
// these look like the removed and the added line of a diff; callers in this source use both
// `task->result->wc` and `task->result.wc`, so confirm which form is current.
typedef struct {
char* path;
Stats* result;
Stats result;
} FileTask;
// Shared work queue handed to every worker thread: the task array, the index of the next
// task to claim, the number of tasks, and the mutex taken around the next_task increment.
// NOTE(review): next_task and mutex are each declared twice (pointer form and by-value form) -
// these look like the removed and the added lines of a diff shown together; keep one of each.
typedef struct {
FileTask* tasks;
int* next_task;
int next_task;
int total_tasks;
pthread_mutex_t* mutex;
pthread_mutex_t mutex;
} WorkQueue;
// Process entire file optimized for 590KB files
void* process_file_worker(void* arg) {
WorkQueue* queue = (WorkQueue*)arg;
/* Table dimensions: one row per byte value, and a fixed number of word slots per row. */
enum { BYTE_VALUES = 256, FW_BUCKET_SLOTS = 8 };

/* Lookup tables filled in by init_tables(); zero until then. */
static uint8_t is_ws[BYTE_VALUES];        /* 1 for ' ', '\t', '\n', '\r', '\f' */
static uint8_t is_upper[BYTE_VALUES];     /* 1 for 'A'..'Z' */
static uint16_t fw_len_bits[BYTE_VALUES]; /* bit n set: some forbidden word of length n starts with this byte */
static __uint128_t fw_words[BYTE_VALUES][FW_BUCKET_SLOTS]; /* forbidden words packed byte-wise, bucketed by first byte */
static uint8_t fw_counts[BYTE_VALUES];    /* number of used slots in each fw_words row */
while (1) {
pthread_mutex_lock(queue->mutex);
int task_id = (*queue->next_task)++;
pthread_mutex_unlock(queue->mutex);
/*
 * Fill the byte-classification tables (is_ws, is_upper) and index the
 * forbidden words for fast lookup.
 *
 * Each forbidden word is filed under its first byte: its length is recorded
 * as a bit in fw_len_bits[first] and its bytes are packed into the next free
 * slot of fw_words[first]. The word count is derived from the array, and a
 * word that cannot be represented (length does not fit in the length bitmap
 * or the 16-byte packed value, or the bucket for its first byte is full)
 * aborts with a diagnostic instead of silently writing out of bounds.
 */
static void init_tables(void) {
    static const char* forbidden[] = {
        "recovery", "techie", "http", "https", "digital", "hack", "::", "//", "com",
        "@", "crypto", "bitcoin", "wallet", "hacker", "welcome", "whatsapp", "email",
        "cryptocurrency", "stolen", "freeze", "quick", "crucial", "tracing", "scammers",
        "expers", "hire", "century", "transaction", "essential", "managing", "contact",
        "contacting", "understanding", "assets", "funds"
    };
    const size_t word_count = sizeof forbidden / sizeof forbidden[0];
    const size_t bucket_slots = sizeof fw_words[0] / sizeof fw_words[0][0];
    /* Largest length whose bit still fits in one fw_len_bits entry. */
    const size_t max_len = 8 * sizeof fw_len_bits[0] - 1;

    for (int i = 0; i < 256; i++) {
        is_ws[i] = (i == ' ' || i == '\t' || i == '\n' || i == '\r' || i == '\f');
        is_upper[i] = (i >= 'A' && i <= 'Z');
        fw_len_bits[i] = 0;
        fw_counts[i] = 0;
    }

    for (size_t i = 0; i < word_count; i++) {
        size_t len = strlen(forbidden[i]);
        uint8_t first = (uint8_t)forbidden[i][0];

        if (len > max_len || len > sizeof(__uint128_t) || fw_counts[first] >= bucket_slots) {
            fprintf(stderr, "init_tables: cannot index forbidden word \"%s\"\n", forbidden[i]);
            abort();
        }

        fw_len_bits[first] |= (uint16_t)(1u << len);
        __uint128_t val = 0;
        memcpy(&val, forbidden[i], len);
        fw_words[first][fw_counts[first]++] = val;
    }
}
if (task_id >= queue->total_tasks) break;
/*
 * len_masks[n] has its low n bytes set to 0xFF and every higher byte clear
 * (n = 0..16). is_forbidden() ANDs a packed word with len_masks[len].
 */
#define LOW_BYTES_MASK(n) ((((__uint128_t)1) << (8 * (n))) - 1)
static const __uint128_t len_masks[17] = {
    0,
    LOW_BYTES_MASK(1),  LOW_BYTES_MASK(2),  LOW_BYTES_MASK(3),  LOW_BYTES_MASK(4),
    LOW_BYTES_MASK(5),  LOW_BYTES_MASK(6),  LOW_BYTES_MASK(7),  LOW_BYTES_MASK(8),
    LOW_BYTES_MASK(9),  LOW_BYTES_MASK(10), LOW_BYTES_MASK(11), LOW_BYTES_MASK(12),
    LOW_BYTES_MASK(13), LOW_BYTES_MASK(14), LOW_BYTES_MASK(15),
    ~(__uint128_t)0
};
#undef LOW_BYTES_MASK
FileTask* task = &queue->tasks[task_id];
static inline __attribute__((always_inline, hot)) int is_forbidden(const uint8_t* word, size_t len) {
if (__builtin_expect(len > 14, 0)) return 0;
uint8_t first = word[0];
uint16_t bits = fw_len_bits[first];
if (__builtin_expect((bits & (1u << len)) == 0, 1)) return 0;
__uint128_t w = 0;
memcpy(&w, word, len);
w &= len_masks[len];
__uint128_t* fwords = fw_words[first];
int cnt = fw_counts[first];
for (int i = 0; i < cnt; i++) {
if (fwords[i] == w) return 1;
}
return 0;
}
// process_file: scan the file at task->path and store the wc, cc, sc, nc and fc counters
// into the task's result.
// NOTE(review): this span interleaves removed and added lines of a diff without +/- markers
// (e.g. both `continue` and `return` after the same check, both `task->result->wc` and
// `task->result.wc`), and a hunk header in the middle hides some lines. The comments below
// describe only what the visible lines do; reconcile against the real file before relying on them.
static void process_file(FileTask* task) {
// Open read-only; give up on this task when the path cannot be opened.
int fd = open(task->path, O_RDONLY);
if (fd < 0) continue;
if (fd < 0) return;
struct stat st;
if (fstat(fd, &st) < 0) {
if (fstat(fd, &st) < 0 || st.st_size == 0) {
close(fd);
continue;
return;
}
size_t size = st.st_size;
if (size == 0) {
// Map the whole file read-only; the descriptor is closed right after the mapping is made.
uint8_t* data = mmap(NULL, size, PROT_READ, MAP_PRIVATE, fd, 0);
close(fd);
continue;
}
unsigned char* data = mmap(NULL, size, PROT_READ, MAP_PRIVATE, fd, 0);
close(fd);
if (data == MAP_FAILED) continue;
if (data == MAP_FAILED) return;
// NOTE(review): madvise(2) documents `advice` as a single value, not a bit mask - confirm that
// OR-ing MADV_SEQUENTIAL with MADV_WILLNEED yields the intended hint. The return value is ignored.
madvise(data, size, MADV_SEQUENTIAL | MADV_WILLNEED);
unsigned long long wc = 0, cc = 0, sc = 0, nc = 0, fc = 0;
char word[MAX_WORD_LEN + 1] __attribute__((aligned(16)));
int wlen = 0;
uint64_t wc = 0, cc = 0, sc = 0, nc = 0, fc = 0;
// AVX2 SIMD constants
const __m256i dot_vec = _mm256_set1_epi8('.');
const __m256i zero_vec = _mm256_set1_epi8('0' - 1);
const __m256i nine_vec = _mm256_set1_epi8('9' + 1);
size_t i = 0;
// simd_end is size rounded down to a multiple of 128.
size_t simd_end = size & ~127ULL;
// Process 128 bytes at a time (4 AVX2 loads)
while (i + 128 <= size) {
__builtin_prefetch(data + i + 256, 0, 0);
// Load and count periods
while (i < simd_end) {
__builtin_prefetch(data + i + 512, 0, 0);
__m256i v0 = _mm256_loadu_si256((__m256i*)(data + i));
__m256i v1 = _mm256_loadu_si256((__m256i*)(data + i + 32));
__m256i v2 = _mm256_loadu_si256((__m256i*)(data + i + 64));
// NOTE(review): diff hunk boundary below - the load of v3 and, presumably, the '.' counts for
// v0 and v1 are not visible in this view.
@ -148,165 +125,108 @@ void* process_file_worker(void* arg) {
sc += __builtin_popcount(_mm256_movemask_epi8(_mm256_cmpeq_epi8(v2, dot_vec)));
sc += __builtin_popcount(_mm256_movemask_epi8(_mm256_cmpeq_epi8(v3, dot_vec)));
// Process characters
for (int j = 0; j < 128; j++) {
unsigned char c = data[i + j];
if (is_alpha_tbl[c]) {
if (wlen < MAX_WORD_LEN) {
word[wlen++] = c;
}
} else {
if (wlen > 0) {
word[wlen] = 0;
wc++;
cc += is_upper_tbl[(unsigned char)word[0]];
// Check for digits
int has_digit = 0;
for (int k = 0; k < wlen; k++) {
if (is_digit_tbl[(unsigned char)word[k]]) {
has_digit = 1;
break;
}
}
nc += has_digit;
// Check forbidden
if (wlen >= 4 && wlen <= 17) {
fc += trie_search(word, wlen);
}
wlen = 0;
}
}
}
// Add to nc the number of bytes in each vector that compare greater than '0' - 1 and less
// than '9' + 1 (signed byte compares), i.e. the ASCII digits.
nc += __builtin_popcount(_mm256_movemask_epi8(_mm256_and_si256(_mm256_cmpgt_epi8(v0, zero_vec), _mm256_cmpgt_epi8(nine_vec, v0))));
nc += __builtin_popcount(_mm256_movemask_epi8(_mm256_and_si256(_mm256_cmpgt_epi8(v1, zero_vec), _mm256_cmpgt_epi8(nine_vec, v1))));
nc += __builtin_popcount(_mm256_movemask_epi8(_mm256_and_si256(_mm256_cmpgt_epi8(v2, zero_vec), _mm256_cmpgt_epi8(nine_vec, v2))));
nc += __builtin_popcount(_mm256_movemask_epi8(_mm256_and_si256(_mm256_cmpgt_epi8(v3, zero_vec), _mm256_cmpgt_epi8(nine_vec, v3))));
i += 128;
}
// Process remaining bytes
while (i < size) {
unsigned char c = data[i];
if (c == '.') sc++;
if (is_alpha_tbl[c]) {
if (wlen < MAX_WORD_LEN) {
word[wlen++] = c;
}
} else {
if (wlen > 0) {
word[wlen] = 0;
wc++;
cc += is_upper_tbl[(unsigned char)word[0]];
int has_digit = 0;
for (int k = 0; k < wlen; k++) {
if (is_digit_tbl[(unsigned char)word[k]]) {
has_digit = 1;
break;
}
}
nc += has_digit;
if (wlen >= 4 && wlen <= 17) {
fc += trie_search(word, wlen);
}
wlen = 0;
}
}
uint8_t c = data[i];
sc += (c == '.');
nc += (c >= '0' && c <= '9');
i++;
}
// Second pass from the start of the mapping: skip bytes flagged in is_ws[], then consume a
// run of non-whitespace bytes as one token.
i = 0;
while (i < size) {
while (i < size && is_ws[data[i]]) i++;
if (i >= size) break;
size_t word_start = i;
// all_upper stays 1 only while every byte of the token is flagged in is_upper[].
int all_upper = 1;
while (i < size && !is_ws[data[i]]) {
all_upper &= is_upper[data[i]];
i++;
}
// Final word
if (wlen > 0) {
word[wlen] = 0;
wc++;
cc += is_upper_tbl[(unsigned char)word[0]];
int has_digit = 0;
for (int k = 0; k < wlen; k++) {
if (is_digit_tbl[(unsigned char)word[k]]) {
has_digit = 1;
break;
}
}
nc += has_digit;
if (wlen >= 4 && wlen <= 17) {
fc += trie_search(word, wlen);
cc += all_upper;
// Only tokens of at most 14 bytes are checked against the forbidden-word index.
size_t wlen = i - word_start;
if (wlen <= 14) {
fc += is_forbidden(data + word_start, wlen);
}
}
// Publish the counters for this file, then release the mapping.
task->result->wc = wc;
task->result->cc = cc;
task->result->sc = sc;
task->result->nc = nc;
task->result->fc = fc;
task->result.wc = wc;
task->result.cc = cc;
task->result.sc = sc;
task->result.nc = nc;
task->result.fc = fc;
munmap(data, size);
}
/*
 * Thread entry point. arg is the shared WorkQueue.
 * Claims task indices one at a time (the next_task increment happens under the
 * queue mutex) and runs process_file() on each claimed task. Returns NULL once
 * the claimed index reaches total_tasks.
 */
static void* worker(void* arg) {
    WorkQueue* queue = arg;

    for (;;) {
        int claimed;

        pthread_mutex_lock(&queue->mutex);
        claimed = queue->next_task++;
        pthread_mutex_unlock(&queue->mutex);

        if (claimed >= queue->total_tasks) {
            return NULL;
        }
        process_file(&queue->tasks[claimed]);
    }
}
// Entry point: treats every command-line argument as a file path, scans the files on up to
// MAX_THREADS worker threads, sums the per-file counters and prints totals.
// Returns 1 when no path is given or the task array cannot be allocated, 0 otherwise.
// NOTE(review): this span interleaves removed and added lines of a diff without +/- markers
// (two usage messages, two `nthreads` definitions, two thread-creation loops, two sets of
// aggregation lines), and a hunk header near the end hides some printf lines. Reconcile against
// the real file before editing.
int main(int argc, char* argv[]) {
if (argc < 2) {
fprintf(stderr, "Usage: %s <file1> <file2> ... <fileN>\n", argv[0]);
fprintf(stderr, "Usage: %s <file1> [file2] ...\n", argv[0]);
return 1;
}
init_tables();
build_trie();
// Setup work queue
int num_files = argc - 1;
FileTask* tasks = calloc(num_files, sizeof(FileTask));
Stats* results = calloc(num_files, sizeof(Stats));
if (!tasks) return 1;
// One task per path argument; argv[0] is skipped.
for (int i = 0; i < num_files; i++) {
tasks[i].path = argv[i + 1];
tasks[i].result = &results[i];
}
int next_task = 0;
pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER;
WorkQueue queue = {
.tasks = tasks,
.next_task = &next_task,
.next_task = 0,
.total_tasks = num_files,
.mutex = &mutex
.mutex = PTHREAD_MUTEX_INITIALIZER
};
// Create thread pool
int nthreads = MAX_THREADS;
if (num_files < MAX_THREADS) nthreads = num_files;
// Never start more threads than there are files, and never more than MAX_THREADS.
int nthreads = num_files < MAX_THREADS ? num_files : MAX_THREADS;
pthread_t threads[MAX_THREADS];
for (int i = 0; i < nthreads; i++) {
pthread_create(&threads[i], NULL, process_file_worker, &queue);
}
// NOTE(review): the pthread_create return value is not checked; a failed create would leave
// threads[i] unset before it is joined below.
for (int i = 0; i < nthreads; i++) {
pthread_create(&threads[i], NULL, worker, &queue);
}
for (int i = 0; i < nthreads; i++) {
pthread_join(threads[i], NULL);
}
// Aggregate results
unsigned long long twc = 0, tcc = 0, tsc = 0, tnc = 0, tfc = 0;
for (int i = 0; i < num_files; i++) {
twc += results[i].wc;
tcc += results[i].cc;
tsc += results[i].sc;
tnc += results[i].nc;
tfc += results[i].fc;
twc += tasks[i].result.wc;
tcc += tasks[i].result.cc;
tsc += tasks[i].result.sc;
tnc += tasks[i].result.nc;
tfc += tasks[i].result.fc;
}
// Ratios fall back to 0 when their denominator is 0.
double cc_pct = (twc > 0) ? (double)tcc / twc * 100.0 : 0;
double fc_pct = (twc > 0) ? (double)tfc / twc * 100.0 : 0;
double wps = (tsc > 0) ? (double)twc / tsc : 0;
double cc_pct = twc > 0 ? (double)tcc / twc * 100.0 : 0;
double fc_pct = twc > 0 ? (double)tfc / twc * 100.0 : 0;
double wps = tsc > 0 ? (double)twc / tsc : 0;
printf("\nTotal Words: %llu\n", twc);
printf("Total Capitalized words: %llu\n", tcc);
// NOTE(review): diff hunk boundary below - the remaining printf lines are not visible here.
@ -319,8 +239,7 @@ int main(int argc, char* argv[]) {
printf("Total files read: %d\n", num_files);
free(tasks);
free(results);
pthread_mutex_destroy(&mutex);
pthread_mutex_destroy(&queue.mutex);
return 0;
}