273 lines
6.0 KiB
Rust
Raw Normal View History

mod stats;
mod trie;
2025-03-24 01:03:09 +01:00
use stats::Stats;
use std::{cell::LazyCell, env, fs};
use tokio::sync::mpsc;
use trie::Trie;
2025-03-24 01:03:09 +01:00
thread_local! {
    /// Per-thread trie holding the forbidden-word list.
    ///
    /// The trie is built lazily, the first time the current thread accesses it.
    /// Every entry is lowercase; the lookups in this file lowercase the
    /// candidate word before querying.
    static FORBIDON: LazyCell<Trie> = LazyCell::new(|| {
        const WORDS: &[&str] = &[
            "recovery",
            "techie",
            "http",
            "https",
            "digital",
            "hack",
            "::",
            "//",
            "@",
            "com",
            "crypto",
            "bitcoin",
            "wallet",
            "hacker",
            "welcome",
            "whatsapp",
            "email",
            "cryptocurrency",
            "stolen",
            "freeze",
            "quick",
            "crucial",
            "tracing",
            "scammers",
            // NOTE(review): possibly a typo for "experts" — confirm before changing,
            // since it alters the forbidden-word counts.
            "expers",
            "hire",
            "century",
            "transaction",
            "essential",
            "managing",
            "contact",
            "contacting",
            "understanding",
            "assets",
            "funds",
        ];

        let mut forbidden = Trie::default();
        for &entry in WORDS {
            forbidden.insert(entry);
        }
        forbidden
    });
}
2025-03-24 02:39:12 +01:00
impl Stats {
pub fn process(&mut self, file: &str) {
let Ok(text) = fs::read_to_string(&file) else {
self.failed_file_count += 1;
return;
};
self.file_count += 1;
// self.muncher(&text);
self.for_loops(&text);
2025-03-24 04:14:30 +01:00
}
#[allow(dead_code)]
/// probably buggy. for example, are new lines sentences? what if the text has no last period?
/// 500ms is without forbidden words check, but...
/// 6000ms if adding forbidden words.. so not faster
/// with trie this is 2600ms
2025-03-24 04:14:30 +01:00
fn muncher(&mut self, text: &str) {
let mut capitalized = true;
let mut whitespaced = false;
let mut dotted = false;
let mut word = String::new();
for char in text.chars() {
if whitespaced {
if !char.is_whitespace() {
whitespaced = false; //end whiteness
}
continue;
} else if char.is_whitespace() {
whitespaced = true;
self.word_count += 1; //end of word
if capitalized {
self.capitalized_count += 1;
} else {
//reset capitalized word
capitalized = true;
}
let lowercase_word = word.to_lowercase();
FORBIDON.with(|trie| {
if trie.contains(&lowercase_word) {
2025-03-24 04:14:30 +01:00
self.forbidden_count += 1;
}
});
2025-03-24 04:14:30 +01:00
word = String::new();
continue;
}
if dotted {
if char != '.' {
dotted = false; //end sentencing
}
continue;
} else if char == '.' {
dotted = true;
self.sentence_count += 1;
self.word_count += 1; //end of word
if capitalized {
self.capitalized_count += 1;
} else {
//reset capitalized word
capitalized = true;
}
let lowercase_word = word.to_lowercase();
FORBIDON.with(|trie| {
if trie.contains(&lowercase_word) {
2025-03-24 04:14:30 +01:00
self.forbidden_count += 1;
}
});
2025-03-24 04:14:30 +01:00
word = String::new();
continue;
}
word += &char.to_string();
if char.is_numeric() {
self.numeric_count += 1;
capitalized = false;
}
if !char.is_ascii_uppercase() {
capitalized = false;
}
}
}
#[allow(dead_code)]
/// typically 5000ms
/// with trie this is 1600ms
2025-03-24 04:14:30 +01:00
fn for_loops(&mut self, text: &str) {
2025-03-24 02:39:12 +01:00
for sentence in text
.split('.')
.map(|s| s.trim())
.filter(|s| !s.is_empty())
{
self.sentence_count += 1;
for word in sentence
.split_whitespace()
.map(|s| s.trim())
.filter(|s| !s.is_empty())
{
self.word_count += 1;
//get all numbers counted
let mut all_capitalized = true;
for char in word.chars() {
if char.is_numeric() {
self.numeric_count += 1;
2025-03-24 04:14:30 +01:00
//TODO are numbers capitalized or not? I don't know!
2025-03-24 02:39:12 +01:00
}
if !char.is_ascii_uppercase() {
all_capitalized = false;
}
}
if all_capitalized {
self.capitalized_count += 1;
}
let lowercase_word = word.to_lowercase();
FORBIDON.with(|trie| {
if trie.contains(&lowercase_word) {
2025-03-24 02:39:12 +01:00
self.forbidden_count += 1;
}
});
2025-03-24 02:39:12 +01:00
}
}
}
}
2025-03-24 01:03:09 +01:00
#[tokio::main]
async fn main() {
2025-03-24 01:03:09 +01:00
let files = env::args().skip(1);
let mut rx = {
let (tx, rx) = mpsc::unbounded_channel();
for file in files {
let tx = tx.clone();
tokio::spawn(async move {
let mut stats = Stats::default();
stats.process(&file);
tx.send(stats).unwrap();
});
}
rx
};
let mut stats = Stats::default();
while let Some(file_stat) = rx.recv().await {
stats += file_stat;
}
println!("{stats}");
2025-03-24 01:03:09 +01:00
}
/// Builds the release binary, runs it over every entry of `dir` and prints the
/// program's output together with the elapsed wall-clock time of the run.
///
/// Build and run failures are reported on stderr (including the child's exit
/// status and stderr output) but do not fail the calling test.
///
/// # Panics
///
/// Panics if the current directory cannot be determined, if `dir` cannot be
/// listed, or if an entry cannot be canonicalized or is not valid UTF-8.
#[cfg(test)]
fn run_benchmark(dir: &str) {
    use std::{env, fs, process::Command, time::Instant};

    println!("cwd: {}", env::current_dir().unwrap().display());

    //compile
    match Command::new("cargo").args(["build", "--release"]).output() {
        Ok(output) if output.status.success() => {
            println!("compiled {}", String::from_utf8_lossy(&output.stdout));
        }
        // `output()` succeeds whenever the process could be spawned, so a
        // failed build has to be detected through its exit status.
        Ok(output) => eprintln!(
            "compile failed ({}): {}",
            output.status,
            String::from_utf8_lossy(&output.stderr)
        ),
        Err(err) => eprintln!("compile failed: {err}"),
    }

    //get test files
    let files = fs::read_dir(dir)
        .unwrap()
        .map(|entry| {
            entry
                .unwrap()
                .path()
                .canonicalize()
                .unwrap()
                .to_str()
                .unwrap()
                .to_string()
        })
        .collect::<Vec<_>>();
    println!("test files found: {:#?}", files);

    //benchmark run
    let benchmark = Instant::now();
    match Command::new("target/release/jisspam").args(files).output() {
        Ok(output) => {
            println!("{}", String::from_utf8_lossy(&output.stdout));
            if !output.status.success() {
                eprintln!(
                    "run failed ({}): {}",
                    output.status,
                    String::from_utf8_lossy(&output.stderr)
                );
            }
        }
        Err(err) => eprintln!("run failed: {err}"),
    }
    println!("benchmark: {}ms", benchmark.elapsed().as_millis());
}

/// Benchmarks the release binary against the files in `test_files`.
#[test]
fn test() {
    run_benchmark("test_files");
}

/// Benchmarks the release binary against the files in `../books`.
#[test]
fn books_test() {
    run_benchmark("../books");
}