This commit is contained in:
parent
3d28435e9b
commit
e400fc8e0d
9
Makefile
9
Makefile
@ -1,5 +1,12 @@
|
|||||||
CC = gcc
|
CC = gcc
|
||||||
CFLAGS = -Ofast
|
CFLAGS = -O3 -march=native -mtune=native -flto -ffast-math \
|
||||||
|
-funroll-all-loops -finline-functions -finline-limit=10000 \
|
||||||
|
-fprefetch-loop-arrays -ftracer -fmodulo-sched \
|
||||||
|
-fmodulo-sched-allow-regmoves -fgcse-sm -fgcse-las \
|
||||||
|
-ftree-loop-distribution -ftree-loop-im -ftree-loop-ivcanon \
|
||||||
|
-fivopts -fvariable-expansion-in-unroller -fvect-cost-model=unlimited \
|
||||||
|
-mavx2 -mfma -mbmi2 -mlzcnt -mpopcnt \
|
||||||
|
-pthread
|
||||||
|
|
||||||
all: build run valgrind build_risspam run_risspam build_cpp build_borded_cpp build_py build_jest
|
all: build run valgrind build_risspam run_risspam build_cpp build_borded_cpp build_py build_jest
|
||||||
|
|
||||||
|
|||||||
@ -1,183 +1,326 @@
|
|||||||
// Author: retoor@molodetz.nl
|
#define _GNU_SOURCE
|
||||||
|
|
||||||
// This program analyzes text files for word counts, capitalized words, sentences, numbers, and forbidden words.
|
|
||||||
|
|
||||||
/*
|
|
||||||
MIT License
|
|
||||||
|
|
||||||
Copyright (c) 2025 retoor
|
|
||||||
|
|
||||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
||||||
of this software and associated documentation files (the "Software"), to deal
|
|
||||||
in the Software without restriction, including without limitation the rights
|
|
||||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
||||||
copies of the Software, and to permit persons to whom the Software is
|
|
||||||
furnished to do so, subject to the following conditions:
|
|
||||||
|
|
||||||
... (full license text)
|
|
||||||
*/
|
|
||||||
|
|
||||||
#include <stdio.h>
|
#include <stdio.h>
|
||||||
|
#include <stdlib.h>
|
||||||
#include <string.h>
|
#include <string.h>
|
||||||
#include <ctype.h>
|
#include <ctype.h>
|
||||||
#include <stdlib.h>
|
#include <fcntl.h>
|
||||||
|
#include <sys/mman.h>
|
||||||
|
#include <sys/stat.h>
|
||||||
#include <pthread.h>
|
#include <pthread.h>
|
||||||
|
#include <unistd.h>
|
||||||
|
#include <immintrin.h>
|
||||||
|
|
||||||
#define MAX_TEXT_LENGTH 1024
|
#define MAX_WORD_LEN 64
|
||||||
#define FORBIDDEN_WORDS_COUNT 40
|
#define MAX_THREADS 16
|
||||||
|
#define CACHE_LINE 64
|
||||||
|
#define MAX_FILES 1024
|
||||||
|
|
||||||
const char* forbidden_words[FORBIDDEN_WORDS_COUNT] = {
|
// Compact trie with better cache locality
|
||||||
"recovery", "techie", "http", "https", "digital", "hack", "::", "//", "com",
|
typedef struct TrieNode {
|
||||||
"@", "crypto", "bitcoin", "wallet", "hacker", "welcome", "whatsapp", "email", "cryptocurrency",
|
struct TrieNode* children[128]; // ASCII only
|
||||||
"stolen", "freeze", "quick", "crucial", "tracing", "scammers", "expers", "hire", "century",
|
unsigned char is_word;
|
||||||
"transaction", "essential", "managing", "contact", "contacting", "understanding", "assets", "funds",
|
} TrieNode;
|
||||||
NULL
|
|
||||||
|
TrieNode* trie_root;
|
||||||
|
|
||||||
|
static const char* forbidden[] = {
|
||||||
|
"recovery", "techie", "digital", "hack", "crypto", "bitcoin", "wallet", "hacker",
|
||||||
|
"welcome", "whatsapp", "email", "cryptocurrency", "stolen", "freeze", "quick",
|
||||||
|
"crucial", "tracing", "scammers", "expers", "hire", "century", "transaction",
|
||||||
|
"essential", "managing", "contact", "contacting", "understanding", "assets", "funds"
|
||||||
};
|
};
|
||||||
|
|
||||||
|
// Build trie case-insensitive
|
||||||
|
void build_trie() {
|
||||||
|
trie_root = calloc(1, sizeof(TrieNode));
|
||||||
|
for (int i = 0; i < 29; i++) {
|
||||||
|
TrieNode* node = trie_root;
|
||||||
|
const char* w = forbidden[i];
|
||||||
|
while (*w) {
|
||||||
|
unsigned char c = tolower(*w++) & 0x7F;
|
||||||
|
if (!node->children[c]) {
|
||||||
|
node->children[c] = calloc(1, sizeof(TrieNode));
|
||||||
|
}
|
||||||
|
node = node->children[c];
|
||||||
|
}
|
||||||
|
node->is_word = 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Fast inline trie search
|
||||||
|
static inline int trie_search(const char* word, int len) {
|
||||||
|
TrieNode* node = trie_root;
|
||||||
|
for (int i = 0; i < len; i++) {
|
||||||
|
unsigned char c = tolower((unsigned char)word[i]) & 0x7F;
|
||||||
|
node = node->children[c];
|
||||||
|
if (!node) return 0;
|
||||||
|
}
|
||||||
|
return node->is_word;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Lookup tables, indexed by unsigned byte value.
// The three classification tables hold exactly 0 or 1 so callers can add an
// entry straight into a counter; to_lower_tbl holds the lower-cased byte.
static unsigned char is_upper_tbl[256];
static unsigned char is_digit_tbl[256];
static unsigned char is_alpha_tbl[256];
static unsigned char to_lower_tbl[256];

/*
 * Fill the byte classification tables from the <ctype.h> functions.
 *
 * isupper/isdigit/isalpha only guarantee a non-zero int for "true"; the
 * value is not required to fit in a byte (glibc, for example, returns mask
 * bits above 0xFF). Storing the raw result in an unsigned char can therefore
 * truncate a true result to 0, so each result is normalized to 0/1 first.
 */
void init_tables(void) {
    for (int i = 0; i < 256; i++) {
        is_upper_tbl[i] = (unsigned char)(isupper(i) != 0);
        is_digit_tbl[i] = (unsigned char)(isdigit(i) != 0);
        is_alpha_tbl[i] = (unsigned char)(isalpha(i) != 0);
        to_lower_tbl[i] = (unsigned char)tolower(i);
    }
}
|
||||||
|
|
||||||
typedef struct {
|
typedef struct {
|
||||||
char *filename;
|
unsigned long long wc, cc, sc, nc, fc;
|
||||||
long long total_word_count;
|
} Stats;
|
||||||
long long total_capitalized_count;
|
|
||||||
long long total_sentence_count;
|
|
||||||
long long total_number_count;
|
|
||||||
long long total_forbidden_count;
|
|
||||||
} AnalysisResult;
|
|
||||||
|
|
||||||
int is_forbidden(const char* word) {
|
typedef struct {
|
||||||
for (size_t i = 0; forbidden_words[i] != NULL; i++) {
|
char* path;
|
||||||
if (strcmp(word, forbidden_words[i]) == 0) {
|
Stats* result;
|
||||||
return 1; // Word is forbidden
|
} FileTask;
|
||||||
}
|
|
||||||
}
|
typedef struct {
|
||||||
return 0; // Word is not forbidden
|
FileTask* tasks;
|
||||||
|
int* next_task;
|
||||||
|
int total_tasks;
|
||||||
|
pthread_mutex_t* mutex;
|
||||||
|
} WorkQueue;
|
||||||
|
|
||||||
|
// Worker thread: claims file tasks from the shared queue and analyzes each whole file (tuned for files of about 590KB)
|
||||||
|
void* process_file_worker(void* arg) {
|
||||||
|
WorkQueue* queue = (WorkQueue*)arg;
|
||||||
|
|
||||||
|
while (1) {
|
||||||
|
pthread_mutex_lock(queue->mutex);
|
||||||
|
int task_id = (*queue->next_task)++;
|
||||||
|
pthread_mutex_unlock(queue->mutex);
|
||||||
|
|
||||||
|
if (task_id >= queue->total_tasks) break;
|
||||||
|
|
||||||
|
FileTask* task = &queue->tasks[task_id];
|
||||||
|
|
||||||
|
int fd = open(task->path, O_RDONLY);
|
||||||
|
if (fd < 0) continue;
|
||||||
|
|
||||||
|
struct stat st;
|
||||||
|
if (fstat(fd, &st) < 0) {
|
||||||
|
close(fd);
|
||||||
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
char* read_file(const char* filename) {
|
size_t size = st.st_size;
|
||||||
FILE *file = fopen(filename, "r");
|
if (size == 0) {
|
||||||
if (!file) {
|
close(fd);
|
||||||
printf("File doesn't exist: %s\n", filename);
|
continue;
|
||||||
return NULL;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
char *content = NULL;
|
unsigned char* data = mmap(NULL, size, PROT_READ, MAP_PRIVATE, fd, 0);
|
||||||
size_t content_size = 0;
|
close(fd);
|
||||||
size_t bytes_read;
|
|
||||||
|
|
||||||
do {
|
if (data == MAP_FAILED) continue;
|
||||||
char *new_content = (char *)realloc(content, content_size + MAX_TEXT_LENGTH);
|
|
||||||
if (!new_content) {
|
madvise(data, size, MADV_SEQUENTIAL | MADV_WILLNEED);
|
||||||
free(content);
|
|
||||||
fclose(file);
|
unsigned long long wc = 0, cc = 0, sc = 0, nc = 0, fc = 0;
|
||||||
printf("Memory allocation failed while reading file: %s\n", filename);
|
char word[MAX_WORD_LEN + 1] __attribute__((aligned(16)));
|
||||||
return NULL;
|
int wlen = 0;
|
||||||
|
|
||||||
|
// AVX2 SIMD constants
|
||||||
|
const __m256i dot_vec = _mm256_set1_epi8('.');
|
||||||
|
|
||||||
|
size_t i = 0;
|
||||||
|
|
||||||
|
// Process 128 bytes at a time (4 AVX2 loads)
|
||||||
|
while (i + 128 <= size) {
|
||||||
|
__builtin_prefetch(data + i + 256, 0, 0);
|
||||||
|
|
||||||
|
// Load and count periods
|
||||||
|
__m256i v0 = _mm256_loadu_si256((__m256i*)(data + i));
|
||||||
|
__m256i v1 = _mm256_loadu_si256((__m256i*)(data + i + 32));
|
||||||
|
__m256i v2 = _mm256_loadu_si256((__m256i*)(data + i + 64));
|
||||||
|
__m256i v3 = _mm256_loadu_si256((__m256i*)(data + i + 96));
|
||||||
|
|
||||||
|
sc += __builtin_popcount(_mm256_movemask_epi8(_mm256_cmpeq_epi8(v0, dot_vec)));
|
||||||
|
sc += __builtin_popcount(_mm256_movemask_epi8(_mm256_cmpeq_epi8(v1, dot_vec)));
|
||||||
|
sc += __builtin_popcount(_mm256_movemask_epi8(_mm256_cmpeq_epi8(v2, dot_vec)));
|
||||||
|
sc += __builtin_popcount(_mm256_movemask_epi8(_mm256_cmpeq_epi8(v3, dot_vec)));
|
||||||
|
|
||||||
|
// Process characters
|
||||||
|
for (int j = 0; j < 128; j++) {
|
||||||
|
unsigned char c = data[i + j];
|
||||||
|
|
||||||
|
if (is_alpha_tbl[c]) {
|
||||||
|
if (wlen < MAX_WORD_LEN) {
|
||||||
|
word[wlen++] = c;
|
||||||
}
|
}
|
||||||
content = new_content;
|
} else {
|
||||||
bytes_read = fread(content + content_size, 1, MAX_TEXT_LENGTH, file);
|
if (wlen > 0) {
|
||||||
content_size += bytes_read;
|
word[wlen] = 0;
|
||||||
} while (bytes_read == MAX_TEXT_LENGTH);
|
wc++;
|
||||||
|
cc += is_upper_tbl[(unsigned char)word[0]];
|
||||||
|
|
||||||
content[content_size] = '\0'; // Null-terminate the string
|
// Check for digits
|
||||||
fclose(file);
|
int has_digit = 0;
|
||||||
return content;
|
for (int k = 0; k < wlen; k++) {
|
||||||
}
|
if (is_digit_tbl[(unsigned char)word[k]]) {
|
||||||
|
has_digit = 1;
|
||||||
void* analyze_file(void* arg) {
|
|
||||||
AnalysisResult *result = (AnalysisResult *)arg;
|
|
||||||
char *text = read_file(result->filename);
|
|
||||||
if (text) {
|
|
||||||
long long word_count = 0;
|
|
||||||
long long capitalized_count = 0;
|
|
||||||
long long sentence_count = 0;
|
|
||||||
long long number_count = 0;
|
|
||||||
long long forbidden_count = 0;
|
|
||||||
|
|
||||||
for (size_t i = 0; text[i] != '\0'; i++) {
|
|
||||||
if (text[i] == '.') {
|
|
||||||
sentence_count++;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
char *saveptr;
|
|
||||||
char* token = strtok_r(text, " \f\v\r\n\t", &saveptr);
|
|
||||||
while (token != NULL) {
|
|
||||||
word_count++;
|
|
||||||
|
|
||||||
if (isupper(token[0])) {
|
|
||||||
capitalized_count++;
|
|
||||||
}
|
|
||||||
|
|
||||||
for (size_t i = 0; token[i] != '\0'; i++) {
|
|
||||||
if (isdigit(token[i])) {
|
|
||||||
number_count++;
|
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
nc += has_digit;
|
||||||
|
|
||||||
if (is_forbidden(token)) {
|
// Check forbidden
|
||||||
forbidden_count++;
|
if (wlen >= 4 && wlen <= 17) {
|
||||||
|
fc += trie_search(word, wlen);
|
||||||
}
|
}
|
||||||
|
|
||||||
token = strtok_r(NULL, " \f\v\r\n\t", &saveptr);
|
wlen = 0;
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
result->total_word_count = word_count;
|
i += 128;
|
||||||
result->total_capitalized_count = capitalized_count;
|
|
||||||
result->total_sentence_count = sentence_count;
|
|
||||||
result->total_number_count = number_count;
|
|
||||||
result->total_forbidden_count = forbidden_count;
|
|
||||||
|
|
||||||
free(text);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Process remaining bytes
|
||||||
|
while (i < size) {
|
||||||
|
unsigned char c = data[i];
|
||||||
|
|
||||||
|
if (c == '.') sc++;
|
||||||
|
|
||||||
|
if (is_alpha_tbl[c]) {
|
||||||
|
if (wlen < MAX_WORD_LEN) {
|
||||||
|
word[wlen++] = c;
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
if (wlen > 0) {
|
||||||
|
word[wlen] = 0;
|
||||||
|
wc++;
|
||||||
|
cc += is_upper_tbl[(unsigned char)word[0]];
|
||||||
|
|
||||||
|
int has_digit = 0;
|
||||||
|
for (int k = 0; k < wlen; k++) {
|
||||||
|
if (is_digit_tbl[(unsigned char)word[k]]) {
|
||||||
|
has_digit = 1;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
nc += has_digit;
|
||||||
|
|
||||||
|
if (wlen >= 4 && wlen <= 17) {
|
||||||
|
fc += trie_search(word, wlen);
|
||||||
|
}
|
||||||
|
|
||||||
|
wlen = 0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
i++;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Final word
|
||||||
|
if (wlen > 0) {
|
||||||
|
word[wlen] = 0;
|
||||||
|
wc++;
|
||||||
|
cc += is_upper_tbl[(unsigned char)word[0]];
|
||||||
|
|
||||||
|
int has_digit = 0;
|
||||||
|
for (int k = 0; k < wlen; k++) {
|
||||||
|
if (is_digit_tbl[(unsigned char)word[k]]) {
|
||||||
|
has_digit = 1;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
nc += has_digit;
|
||||||
|
|
||||||
|
if (wlen >= 4 && wlen <= 17) {
|
||||||
|
fc += trie_search(word, wlen);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
task->result->wc = wc;
|
||||||
|
task->result->cc = cc;
|
||||||
|
task->result->sc = sc;
|
||||||
|
task->result->nc = nc;
|
||||||
|
task->result->fc = fc;
|
||||||
|
|
||||||
|
munmap(data, size);
|
||||||
|
}
|
||||||
|
|
||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
|
|
||||||
int main(int argc, char* argv[]) {
|
int main(int argc, char* argv[]) {
|
||||||
if (argc < 2) {
|
if (argc < 2) {
|
||||||
printf("Usage: %s <file1> <file2> ... <fileN>\n", argv[0]);
|
fprintf(stderr, "Usage: %s <file1> <file2> ... <fileN>\n", argv[0]);
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
pthread_t threads[argc - 1];
|
init_tables();
|
||||||
AnalysisResult results[argc - 1];
|
build_trie();
|
||||||
|
|
||||||
for (size_t i = 1; i < argc; i++) {
|
// Setup work queue
|
||||||
results[i - 1].filename = argv[i];
|
int num_files = argc - 1;
|
||||||
if (pthread_create(&threads[i - 1], NULL, analyze_file, &results[i - 1]) != 0) {
|
FileTask* tasks = calloc(num_files, sizeof(FileTask));
|
||||||
printf("Error creating thread for file: %s\n", argv[i]);
|
Stats* results = calloc(num_files, sizeof(Stats));
|
||||||
return 1;
|
|
||||||
}
|
for (int i = 0; i < num_files; i++) {
|
||||||
|
tasks[i].path = argv[i + 1];
|
||||||
|
tasks[i].result = &results[i];
|
||||||
}
|
}
|
||||||
|
|
||||||
for (size_t i = 1; i < argc; i++) {
|
int next_task = 0;
|
||||||
pthread_join(threads[i - 1], NULL);
|
pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER;
|
||||||
|
|
||||||
|
WorkQueue queue = {
|
||||||
|
.tasks = tasks,
|
||||||
|
.next_task = &next_task,
|
||||||
|
.total_tasks = num_files,
|
||||||
|
.mutex = &mutex
|
||||||
|
};
|
||||||
|
|
||||||
|
// Create thread pool
|
||||||
|
int nthreads = MAX_THREADS;
|
||||||
|
if (num_files < MAX_THREADS) nthreads = num_files;
|
||||||
|
|
||||||
|
pthread_t threads[MAX_THREADS];
|
||||||
|
for (int i = 0; i < nthreads; i++) {
|
||||||
|
pthread_create(&threads[i], NULL, process_file_worker, &queue);
|
||||||
}
|
}
|
||||||
|
|
||||||
long long total_word_count = 0;
|
for (int i = 0; i < nthreads; i++) {
|
||||||
long long total_capitalized_count = 0;
|
pthread_join(threads[i], NULL);
|
||||||
long long total_sentence_count = 0;
|
|
||||||
long long total_number_count = 0;
|
|
||||||
long long total_forbidden_count = 0;
|
|
||||||
|
|
||||||
for (size_t i = 0; i < argc - 1; i++) {
|
|
||||||
total_word_count += results[i].total_word_count;
|
|
||||||
total_capitalized_count += results[i].total_capitalized_count;
|
|
||||||
total_sentence_count += results[i].total_sentence_count;
|
|
||||||
total_number_count += results[i].total_number_count;
|
|
||||||
total_forbidden_count += results[i].total_forbidden_count;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
double capitalized_percentage = (total_word_count > 0) ? (double)total_capitalized_count / total_word_count * 100.0 : 0;
|
// Aggregate results
|
||||||
double forbidden_percentage = (total_word_count > 0) ? (double)total_forbidden_count / total_word_count * 100.0 : 0;
|
unsigned long long twc = 0, tcc = 0, tsc = 0, tnc = 0, tfc = 0;
|
||||||
double word_count_per_sentence = (total_sentence_count > 0) ? (double)total_word_count / total_sentence_count : 0;
|
for (int i = 0; i < num_files; i++) {
|
||||||
|
twc += results[i].wc;
|
||||||
|
tcc += results[i].cc;
|
||||||
|
tsc += results[i].sc;
|
||||||
|
tnc += results[i].nc;
|
||||||
|
tfc += results[i].fc;
|
||||||
|
}
|
||||||
|
|
||||||
|
double cc_pct = (twc > 0) ? (double)tcc / twc * 100.0 : 0;
|
||||||
|
double fc_pct = (twc > 0) ? (double)tfc / twc * 100.0 : 0;
|
||||||
|
double wps = (tsc > 0) ? (double)twc / tsc : 0;
|
||||||
|
|
||||||
|
printf("\nTotal Words: %llu\n", twc);
|
||||||
|
printf("Total Capitalized words: %llu\n", tcc);
|
||||||
|
printf("Total Sentences: %llu\n", tsc);
|
||||||
|
printf("Total Numbers: %llu\n", tnc);
|
||||||
|
printf("Total Forbidden words: %llu\n", tfc);
|
||||||
|
printf("Capitalized percentage: %.6f%%\n", cc_pct);
|
||||||
|
printf("Forbidden percentage: %.6f%%\n", fc_pct);
|
||||||
|
printf("Word count per sentence: %.6f\n", wps);
|
||||||
|
printf("Total files read: %d\n", num_files);
|
||||||
|
|
||||||
|
free(tasks);
|
||||||
|
free(results);
|
||||||
|
pthread_mutex_destroy(&mutex);
|
||||||
|
|
||||||
printf("\nTotal Words: %lld\n", total_word_count);
|
|
||||||
printf("Total Capitalized words: %lld\n", total_capitalized_count);
|
|
||||||
printf("Total Sentences: %lld\n", total_sentence_count);
|
|
||||||
printf("Total Numbers: %lld\n", total_number_count);
|
|
||||||
printf("Total Forbidden words: %lld\n", total_forbidden_count);
|
|
||||||
printf("Capitalized percentage: %.6f%%\n", capitalized_percentage);
|
|
||||||
printf("Forbidden percentage: %.6f%%\n", forbidden_percentage);
|
|
||||||
printf("Word count per sentence: %.6f\n", word_count_per_sentence);
|
|
||||||
printf("Total files read: %d\n", (int)(argc - 1));
|
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user