|
// retoor <retoor@molodetz.nl>
|
|
#define _GNU_SOURCE
|
|
#include <stdio.h>
|
|
#include <stdlib.h>
|
|
#include <string.h>
|
|
#include <fcntl.h>
|
|
#include <sys/mman.h>
|
|
#include <sys/stat.h>
|
|
#include <pthread.h>
|
|
#include <unistd.h>
|
|
#include <immintrin.h>
|
|
#include <stdint.h>
|
|
|
|
#define MAX_THREADS 16
|
|
|
|
// Per-file counters produced by process_file().
typedef struct {
    uint64_t wc;  // whitespace-delimited words
    uint64_t cc;  // words made up solely of 'A'..'Z'
    uint64_t sc;  // '.' bytes (reported as sentences)
    uint64_t nc;  // '0'..'9' bytes (reported as numbers)
    uint64_t fc;  // words that match the forbidden list
} Stats;
|
|
|
|
typedef struct {
|
|
char* path;
|
|
Stats result;
|
|
} FileTask;
|
|
|
|
typedef struct {
|
|
FileTask* tasks;
|
|
int next_task;
|
|
int total_tasks;
|
|
pthread_mutex_t mutex;
|
|
} WorkQueue;
|
|
|
|
// Byte classification tables, indexed by byte value; filled by init_tables().
static uint8_t is_ws[256];     // 1 for ' ', '\t', '\n', '\r', '\f'
static uint8_t is_upper[256];  // 1 for 'A'..'Z'

// Forbidden-word index, keyed by the first byte of the word.
static uint16_t fw_len_bits[256];     // bit n set: some word of length n starts with this byte
static __uint128_t fw_words[256][8];  // word bytes copied into a zeroed 128-bit value
static uint8_t fw_counts[256];        // number of used entries in fw_words[byte]
|
|
|
|
static void init_tables(void) {
|
|
static const char* forbidden[] = {
|
|
"recovery", "techie", "http", "https", "digital", "hack", "::", "//", "com",
|
|
"@", "crypto", "bitcoin", "wallet", "hacker", "welcome", "whatsapp", "email",
|
|
"cryptocurrency", "stolen", "freeze", "quick", "crucial", "tracing", "scammers",
|
|
"expers", "hire", "century", "transaction", "essential", "managing", "contact",
|
|
"contacting", "understanding", "assets", "funds"
|
|
};
|
|
for (int i = 0; i < 256; i++) {
|
|
is_ws[i] = (i == ' ' || i == '\t' || i == '\n' || i == '\r' || i == '\f');
|
|
is_upper[i] = (i >= 'A' && i <= 'Z');
|
|
fw_len_bits[i] = 0;
|
|
fw_counts[i] = 0;
|
|
}
|
|
for (int i = 0; i < 35; i++) {
|
|
int len = strlen(forbidden[i]);
|
|
uint8_t first = (uint8_t)forbidden[i][0];
|
|
fw_len_bits[first] |= (1u << len);
|
|
__uint128_t val = 0;
|
|
memcpy(&val, forbidden[i], len);
|
|
fw_words[first][fw_counts[first]++] = val;
|
|
}
|
|
}
|
|
|
|
// len_masks[n] has the low n bytes set (n = 0..16); is_forbidden() uses it to
// clear everything beyond the first n bytes of a packed word.
#define LOW_BYTES_MASK(n) ((((__uint128_t)1) << (8 * (n))) - 1)
static const __uint128_t len_masks[17] = {
    LOW_BYTES_MASK(0),  LOW_BYTES_MASK(1),  LOW_BYTES_MASK(2),  LOW_BYTES_MASK(3),
    LOW_BYTES_MASK(4),  LOW_BYTES_MASK(5),  LOW_BYTES_MASK(6),  LOW_BYTES_MASK(7),
    LOW_BYTES_MASK(8),  LOW_BYTES_MASK(9),  LOW_BYTES_MASK(10), LOW_BYTES_MASK(11),
    LOW_BYTES_MASK(12), LOW_BYTES_MASK(13), LOW_BYTES_MASK(14), LOW_BYTES_MASK(15),
    ~(__uint128_t)0  // all 16 bytes; the shift form would overflow here
};
#undef LOW_BYTES_MASK
|
|
|
|
static inline __attribute__((always_inline, hot)) int is_forbidden(const uint8_t* word, size_t len) {
|
|
if (__builtin_expect(len > 14, 0)) return 0;
|
|
uint8_t first = word[0];
|
|
uint16_t bits = fw_len_bits[first];
|
|
if (__builtin_expect((bits & (1u << len)) == 0, 1)) return 0;
|
|
__uint128_t w = 0;
|
|
memcpy(&w, word, len);
|
|
w &= len_masks[len];
|
|
__uint128_t* fwords = fw_words[first];
|
|
int cnt = fw_counts[first];
|
|
for (int i = 0; i < cnt; i++) {
|
|
if (fwords[i] == w) return 1;
|
|
}
|
|
return 0;
|
|
}
|
|
|
|
// Number of bytes in `v` equal to the corresponding byte of `needle`.
static inline __attribute__((always_inline)) unsigned count_eq_bytes(__m256i v, __m256i needle) {
    return (unsigned)__builtin_popcount(
        (unsigned)_mm256_movemask_epi8(_mm256_cmpeq_epi8(v, needle)));
}

// Number of bytes b in `v` with lo < b < hi, compared as signed 8-bit values.
static inline __attribute__((always_inline)) unsigned count_between_bytes(__m256i v, __m256i lo, __m256i hi) {
    __m256i in_range = _mm256_and_si256(_mm256_cmpgt_epi8(v, lo), _mm256_cmpgt_epi8(hi, v));
    return (unsigned)__builtin_popcount((unsigned)_mm256_movemask_epi8(in_range));
}

// Scan the file named by task->path and store its counters in task->result.
//
// The file is memory-mapped read-only and scanned twice: a byte pass counting
// '.' and decimal digits, then a word pass counting whitespace-delimited
// words, all-uppercase words and forbidden words. If the file cannot be
// opened, stat'ed or mapped, or is empty, task->result is left untouched and
// no error is reported.
static void process_file(FileTask* task) {
    int fd = open(task->path, O_RDONLY);
    if (fd < 0) {
        return;
    }

    struct stat st;
    if (fstat(fd, &st) < 0 || st.st_size <= 0) {
        close(fd);
        return;
    }

    size_t size = (size_t)st.st_size;
    uint8_t *data = mmap(NULL, size, PROT_READ, MAP_PRIVATE, fd, 0);
    close(fd);  // the descriptor is not used again; all reads go through `data`
    if (data == MAP_FAILED) {
        return;
    }

    // madvise() advice values are an enumeration, not bit flags: OR-ing two of
    // them yields a single, different advice value. Issue each hint on its own.
    // Both are advisory, so failures are deliberately ignored.
    (void)madvise(data, size, MADV_SEQUENTIAL);
    (void)madvise(data, size, MADV_WILLNEED);

    uint64_t wc = 0, cc = 0, sc = 0, nc = 0, fc = 0;

    // Pass 1: count '.' and '0'..'9' bytes, 128 bytes per iteration.
    // The digit test is an open interval ('0'-1, '9'+1) under signed compare;
    // bytes >= 0x80 are negative there and therefore never counted.
    const __m256i dot_vec = _mm256_set1_epi8('.');
    const __m256i below_zero = _mm256_set1_epi8('0' - 1);
    const __m256i above_nine = _mm256_set1_epi8('9' + 1);
    size_t i = 0;
    const size_t simd_end = size & ~(size_t)127;

    while (i < simd_end) {
        __builtin_prefetch(data + i + 512, 0, 0);
        __m256i v0 = _mm256_loadu_si256((const __m256i *)(data + i));
        __m256i v1 = _mm256_loadu_si256((const __m256i *)(data + i + 32));
        __m256i v2 = _mm256_loadu_si256((const __m256i *)(data + i + 64));
        __m256i v3 = _mm256_loadu_si256((const __m256i *)(data + i + 96));

        sc += count_eq_bytes(v0, dot_vec);
        sc += count_eq_bytes(v1, dot_vec);
        sc += count_eq_bytes(v2, dot_vec);
        sc += count_eq_bytes(v3, dot_vec);

        nc += count_between_bytes(v0, below_zero, above_nine);
        nc += count_between_bytes(v1, below_zero, above_nine);
        nc += count_between_bytes(v2, below_zero, above_nine);
        nc += count_between_bytes(v3, below_zero, above_nine);

        i += 128;
    }

    // Scalar tail for the last (size % 128) bytes.
    while (i < size) {
        uint8_t c = data[i];
        sc += (c == '.');
        nc += (c >= '0' && c <= '9');
        i++;
    }

    // Pass 2: split on whitespace and classify each word.
    i = 0;
    while (i < size) {
        while (i < size && is_ws[data[i]]) {
            i++;
        }
        if (i >= size) {
            break;
        }

        size_t word_start = i;
        int all_upper = 1;
        while (i < size && !is_ws[data[i]]) {
            all_upper &= is_upper[data[i]];
            i++;
        }

        wc++;
        cc += (uint64_t)all_upper;
        // is_forbidden() rejects words longer than 14 bytes by itself.
        fc += (uint64_t)is_forbidden(data + word_start, i - word_start);
    }

    task->result.wc = wc;
    task->result.cc = cc;
    task->result.sc = sc;
    task->result.nc = nc;
    task->result.fc = fc;

    munmap(data, size);
}
|
|
|
|
static void* worker(void* arg) {
|
|
WorkQueue* q = (WorkQueue*)arg;
|
|
|
|
while (1) {
|
|
pthread_mutex_lock(&q->mutex);
|
|
int task_id = q->next_task++;
|
|
pthread_mutex_unlock(&q->mutex);
|
|
|
|
if (task_id >= q->total_tasks) break;
|
|
process_file(&q->tasks[task_id]);
|
|
}
|
|
return NULL;
|
|
}
|
|
|
|
int main(int argc, char* argv[]) {
|
|
if (argc < 2) {
|
|
fprintf(stderr, "Usage: %s <file1> [file2] ...\n", argv[0]);
|
|
return 1;
|
|
}
|
|
|
|
init_tables();
|
|
|
|
int num_files = argc - 1;
|
|
FileTask* tasks = calloc(num_files, sizeof(FileTask));
|
|
if (!tasks) return 1;
|
|
|
|
for (int i = 0; i < num_files; i++) {
|
|
tasks[i].path = argv[i + 1];
|
|
}
|
|
|
|
WorkQueue queue = {
|
|
.tasks = tasks,
|
|
.next_task = 0,
|
|
.total_tasks = num_files,
|
|
.mutex = PTHREAD_MUTEX_INITIALIZER
|
|
};
|
|
|
|
int nthreads = num_files < MAX_THREADS ? num_files : MAX_THREADS;
|
|
pthread_t threads[MAX_THREADS];
|
|
|
|
for (int i = 0; i < nthreads; i++) {
|
|
pthread_create(&threads[i], NULL, worker, &queue);
|
|
}
|
|
for (int i = 0; i < nthreads; i++) {
|
|
pthread_join(threads[i], NULL);
|
|
}
|
|
|
|
unsigned long long twc = 0, tcc = 0, tsc = 0, tnc = 0, tfc = 0;
|
|
for (int i = 0; i < num_files; i++) {
|
|
twc += tasks[i].result.wc;
|
|
tcc += tasks[i].result.cc;
|
|
tsc += tasks[i].result.sc;
|
|
tnc += tasks[i].result.nc;
|
|
tfc += tasks[i].result.fc;
|
|
}
|
|
|
|
double cc_pct = twc > 0 ? (double)tcc / twc * 100.0 : 0;
|
|
double fc_pct = twc > 0 ? (double)tfc / twc * 100.0 : 0;
|
|
double wps = tsc > 0 ? (double)twc / tsc : 0;
|
|
|
|
printf("\nTotal Words: %llu\n", twc);
|
|
printf("Total Capitalized words: %llu\n", tcc);
|
|
printf("Total Sentences: %llu\n", tsc);
|
|
printf("Total Numbers: %llu\n", tnc);
|
|
printf("Total Forbidden words: %llu\n", tfc);
|
|
printf("Capitalized percentage: %.6f%%\n", cc_pct);
|
|
printf("Forbidden percentage: %.6f%%\n", fc_pct);
|
|
printf("Word count per sentence: %.6f\n", wps);
|
|
printf("Total files read: %d\n", num_files);
|
|
|
|
free(tasks);
|
|
pthread_mutex_destroy(&queue.mutex);
|
|
|
|
return 0;
|
|
}
|