2025-03-24 04:58:34 +01:00
|
|
|
mod stats;
|
|
|
|
|
mod trie;
|
2025-03-24 01:03:09 +01:00
|
|
|
|
2025-03-24 04:58:34 +01:00
|
|
|
use stats::Stats;
|
2025-03-24 05:20:15 +01:00
|
|
|
use std::{env, fs, sync::LazyLock};
|
2025-03-24 04:58:34 +01:00
|
|
|
use tokio::sync::mpsc;
|
|
|
|
|
use trie::Trie;
|
2025-03-24 01:03:09 +01:00
|
|
|
|
2025-03-24 05:20:15 +01:00
|
|
|
static FORBIDDEN_WORDS: LazyLock<Trie> = LazyLock::new(|| {
|
|
|
|
|
let mut trie = Trie::default();
|
|
|
|
|
for word in [
|
|
|
|
|
"recovery",
|
|
|
|
|
"techie",
|
|
|
|
|
"http",
|
|
|
|
|
"https",
|
|
|
|
|
"digital",
|
|
|
|
|
"hack",
|
|
|
|
|
"::",
|
|
|
|
|
"//",
|
|
|
|
|
"@",
|
|
|
|
|
"com",
|
|
|
|
|
"crypto",
|
|
|
|
|
"bitcoin",
|
|
|
|
|
"wallet",
|
|
|
|
|
"hacker",
|
|
|
|
|
"welcome",
|
|
|
|
|
"whatsapp",
|
|
|
|
|
"email",
|
|
|
|
|
"cryptocurrency",
|
|
|
|
|
"stolen",
|
|
|
|
|
"freeze",
|
|
|
|
|
"quick",
|
|
|
|
|
"crucial",
|
|
|
|
|
"tracing",
|
|
|
|
|
"scammers",
|
|
|
|
|
"expers",
|
|
|
|
|
"hire",
|
|
|
|
|
"century",
|
|
|
|
|
"transaction",
|
|
|
|
|
"essential",
|
|
|
|
|
"managing",
|
|
|
|
|
"contact",
|
|
|
|
|
"contacting",
|
|
|
|
|
"understanding",
|
|
|
|
|
"assets",
|
|
|
|
|
"funds",
|
|
|
|
|
] {
|
|
|
|
|
trie.insert(word);
|
|
|
|
|
}
|
|
|
|
|
trie
|
|
|
|
|
});
|
2025-03-24 04:58:34 +01:00
|
|
|
|
2025-03-24 02:39:12 +01:00
|
|
|
impl Stats {
|
|
|
|
|
pub fn process(&mut self, file: &str) {
|
|
|
|
|
let Ok(text) = fs::read_to_string(&file) else {
|
|
|
|
|
self.failed_file_count += 1;
|
|
|
|
|
return;
|
|
|
|
|
};
|
|
|
|
|
self.file_count += 1;
|
2025-03-24 04:23:40 +01:00
|
|
|
// self.muncher(&text);
|
|
|
|
|
self.for_loops(&text);
|
2025-03-24 04:14:30 +01:00
|
|
|
}
|
|
|
|
|
#[allow(dead_code)]
|
|
|
|
|
/// probably buggy. for example, are new lines sentences? what if the text has no last period?
|
|
|
|
|
/// 500ms is without forbidden words check, but...
|
|
|
|
|
/// 6000ms if adding forbidden words.. so not faster
|
2025-03-24 04:58:34 +01:00
|
|
|
/// with trie this is 2600ms
|
2025-03-24 04:14:30 +01:00
|
|
|
fn muncher(&mut self, text: &str) {
|
|
|
|
|
let mut capitalized = true;
|
|
|
|
|
let mut whitespaced = false;
|
|
|
|
|
let mut dotted = false;
|
|
|
|
|
let mut word = String::new();
|
|
|
|
|
for char in text.chars() {
|
|
|
|
|
if whitespaced {
|
|
|
|
|
if !char.is_whitespace() {
|
|
|
|
|
whitespaced = false; //end whiteness
|
|
|
|
|
}
|
|
|
|
|
continue;
|
|
|
|
|
} else if char.is_whitespace() {
|
|
|
|
|
whitespaced = true;
|
|
|
|
|
self.word_count += 1; //end of word
|
|
|
|
|
if capitalized {
|
|
|
|
|
self.capitalized_count += 1;
|
|
|
|
|
} else {
|
|
|
|
|
//reset capitalized word
|
|
|
|
|
capitalized = true;
|
|
|
|
|
}
|
|
|
|
|
let lowercase_word = word.to_lowercase();
|
2025-03-24 05:20:15 +01:00
|
|
|
if FORBIDDEN_WORDS.contains(&lowercase_word) {
|
|
|
|
|
self.forbidden_count += 1;
|
|
|
|
|
}
|
2025-03-24 04:14:30 +01:00
|
|
|
word = String::new();
|
|
|
|
|
continue;
|
|
|
|
|
}
|
|
|
|
|
if dotted {
|
|
|
|
|
if char != '.' {
|
|
|
|
|
dotted = false; //end sentencing
|
|
|
|
|
}
|
|
|
|
|
continue;
|
|
|
|
|
} else if char == '.' {
|
|
|
|
|
dotted = true;
|
|
|
|
|
self.sentence_count += 1;
|
|
|
|
|
self.word_count += 1; //end of word
|
|
|
|
|
if capitalized {
|
|
|
|
|
self.capitalized_count += 1;
|
|
|
|
|
} else {
|
|
|
|
|
//reset capitalized word
|
|
|
|
|
capitalized = true;
|
|
|
|
|
}
|
|
|
|
|
let lowercase_word = word.to_lowercase();
|
2025-03-24 05:20:15 +01:00
|
|
|
if FORBIDDEN_WORDS.contains(&lowercase_word) {
|
|
|
|
|
self.forbidden_count += 1;
|
|
|
|
|
}
|
2025-03-24 04:14:30 +01:00
|
|
|
word = String::new();
|
|
|
|
|
continue;
|
|
|
|
|
}
|
|
|
|
|
word += &char.to_string();
|
|
|
|
|
if char.is_numeric() {
|
|
|
|
|
self.numeric_count += 1;
|
|
|
|
|
capitalized = false;
|
|
|
|
|
}
|
|
|
|
|
if !char.is_ascii_uppercase() {
|
|
|
|
|
capitalized = false;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
#[allow(dead_code)]
|
|
|
|
|
/// typically 5000ms
|
2025-03-24 04:58:34 +01:00
|
|
|
/// with trie this is 1600ms
|
2025-03-24 04:14:30 +01:00
|
|
|
fn for_loops(&mut self, text: &str) {
|
2025-03-24 02:39:12 +01:00
|
|
|
for sentence in text
|
|
|
|
|
.split('.')
|
|
|
|
|
.map(|s| s.trim())
|
|
|
|
|
.filter(|s| !s.is_empty())
|
|
|
|
|
{
|
|
|
|
|
self.sentence_count += 1;
|
|
|
|
|
for word in sentence
|
|
|
|
|
.split_whitespace()
|
|
|
|
|
.map(|s| s.trim())
|
|
|
|
|
.filter(|s| !s.is_empty())
|
|
|
|
|
{
|
|
|
|
|
self.word_count += 1;
|
|
|
|
|
//get all numbers counted
|
|
|
|
|
let mut all_capitalized = true;
|
|
|
|
|
for char in word.chars() {
|
|
|
|
|
if char.is_numeric() {
|
|
|
|
|
self.numeric_count += 1;
|
2025-03-24 04:14:30 +01:00
|
|
|
//TODO are numbers capitalized or not? I don't know!
|
2025-03-24 02:39:12 +01:00
|
|
|
}
|
|
|
|
|
if !char.is_ascii_uppercase() {
|
|
|
|
|
all_capitalized = false;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
if all_capitalized {
|
|
|
|
|
self.capitalized_count += 1;
|
|
|
|
|
}
|
|
|
|
|
let lowercase_word = word.to_lowercase();
|
2025-03-24 05:20:15 +01:00
|
|
|
if FORBIDDEN_WORDS.contains(&lowercase_word) {
|
|
|
|
|
self.forbidden_count += 1;
|
|
|
|
|
}
|
2025-03-24 02:39:12 +01:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
2025-03-24 01:03:09 +01:00
|
|
|
|
2025-03-24 03:29:54 +01:00
|
|
|
#[tokio::main]
|
|
|
|
|
async fn main() {
|
2025-03-24 01:03:09 +01:00
|
|
|
let files = env::args().skip(1);
|
2025-03-24 04:58:34 +01:00
|
|
|
|
2025-03-24 03:29:54 +01:00
|
|
|
let mut rx = {
|
|
|
|
|
let (tx, rx) = mpsc::unbounded_channel();
|
|
|
|
|
for file in files {
|
|
|
|
|
let tx = tx.clone();
|
|
|
|
|
tokio::spawn(async move {
|
|
|
|
|
let mut stats = Stats::default();
|
|
|
|
|
stats.process(&file);
|
|
|
|
|
tx.send(stats).unwrap();
|
|
|
|
|
});
|
|
|
|
|
}
|
|
|
|
|
rx
|
|
|
|
|
};
|
2025-03-24 04:58:34 +01:00
|
|
|
let mut stats = Stats::default();
|
2025-03-24 03:29:54 +01:00
|
|
|
while let Some(file_stat) = rx.recv().await {
|
|
|
|
|
stats += file_stat;
|
|
|
|
|
}
|
|
|
|
|
println!("{stats}");
|
2025-03-24 01:03:09 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/// Builds the release binary, runs it over every entry of `dir`, and prints
/// the program output followed by the elapsed wall-clock time.
///
/// # Panics
///
/// Panics (failing the calling test) when the build or the run cannot be
/// started or exits unsuccessfully, or when `dir` or one of its entries
/// cannot be read. Without these checks a broken build would still report a
/// meaningless benchmark.
#[cfg(test)]
fn benchmark_directory(dir: &str) {
    use std::{env, fs, path::PathBuf, process::Command, time::Instant};

    println!(
        "cwd: {}",
        env::current_dir()
            .expect("current directory should be accessible")
            .display()
    );

    //compile
    let build = Command::new("cargo")
        .args(["build", "--release"])
        .output()
        .unwrap_or_else(|err| panic!("compile failed: {err}"));
    // cargo reports diagnostics on stderr, so surface that on failure.
    assert!(
        build.status.success(),
        "compile failed: {}",
        String::from_utf8_lossy(&build.stderr)
    );
    println!("compiled {}", String::from_utf8_lossy(&build.stdout));

    //get test files
    let files: Vec<PathBuf> = fs::read_dir(dir)
        .unwrap_or_else(|err| panic!("cannot read directory {dir}: {err}"))
        .map(|entry| {
            entry
                .expect("directory entry should be readable")
                .path()
                .canonicalize()
                .expect("path should be canonicalizable")
        })
        .collect();
    println!("test files found: {:#?}", files);

    //benchmark run
    let benchmark = Instant::now();
    let run = Command::new("target/release/jisspam")
        .args(&files)
        .output()
        .unwrap_or_else(|err| panic!("run failed: {err}"));
    // Stop the clock before any printing so only the run itself is measured.
    let elapsed = benchmark.elapsed();
    assert!(
        run.status.success(),
        "run failed: {}",
        String::from_utf8_lossy(&run.stderr)
    );
    println!("{}", String::from_utf8_lossy(&run.stdout));
    println!("benchmark: {}ms", elapsed.as_millis());
}

/// Benchmarks the release binary against the files in `test_files`.
#[test]
fn test() {
    benchmark_directory("test_files");
}

/// Benchmarks the release binary against the files in `../books`.
#[test]
fn books_test() {
    benchmark_directory("../books");
}
|