mod stats; mod trie; use stats::Stats; use std::{cell::LazyCell, env, fs}; use tokio::sync::mpsc; use trie::Trie; thread_local! { static FORBIDON: LazyCell = LazyCell::new(|| { let mut trie = Trie::default(); for word in [ "recovery", "techie", "http", "https", "digital", "hack", "::", "//", "@", "com", "crypto", "bitcoin", "wallet", "hacker", "welcome", "whatsapp", "email", "cryptocurrency", "stolen", "freeze", "quick", "crucial", "tracing", "scammers", "expers", "hire", "century", "transaction", "essential", "managing", "contact", "contacting", "understanding", "assets", "funds", ] { trie.insert(word); } trie }); } impl Stats { pub fn process(&mut self, file: &str) { let Ok(text) = fs::read_to_string(&file) else { self.failed_file_count += 1; return; }; self.file_count += 1; // self.muncher(&text); self.for_loops(&text); } #[allow(dead_code)] /// probably buggy. for example, are new lines sentences? what if the text has no last period? /// 500ms is without forbidden words check, but... /// 6000ms if adding forbidden words.. so not faster /// with trie this is 2600ms fn muncher(&mut self, text: &str) { let mut capitalized = true; let mut whitespaced = false; let mut dotted = false; let mut word = String::new(); for char in text.chars() { if whitespaced { if !char.is_whitespace() { whitespaced = false; //end whiteness } continue; } else if char.is_whitespace() { whitespaced = true; self.word_count += 1; //end of word if capitalized { self.capitalized_count += 1; } else { //reset capitalized word capitalized = true; } let lowercase_word = word.to_lowercase(); FORBIDON.with(|trie| { if trie.contains(&lowercase_word) { self.forbidden_count += 1; } }); word = String::new(); continue; } if dotted { if char != '.' { dotted = false; //end sentencing } continue; } else if char == '.' { dotted = true; self.sentence_count += 1; self.word_count += 1; //end of word if capitalized { self.capitalized_count += 1; } else { //reset capitalized word capitalized = true; } let lowercase_word = word.to_lowercase(); FORBIDON.with(|trie| { if trie.contains(&lowercase_word) { self.forbidden_count += 1; } }); word = String::new(); continue; } word += &char.to_string(); if char.is_numeric() { self.numeric_count += 1; capitalized = false; } if !char.is_ascii_uppercase() { capitalized = false; } } } #[allow(dead_code)] /// typically 5000ms /// with trie this is 1600ms fn for_loops(&mut self, text: &str) { for sentence in text .split('.') .map(|s| s.trim()) .filter(|s| !s.is_empty()) { self.sentence_count += 1; for word in sentence .split_whitespace() .map(|s| s.trim()) .filter(|s| !s.is_empty()) { self.word_count += 1; //get all numbers counted let mut all_capitalized = true; for char in word.chars() { if char.is_numeric() { self.numeric_count += 1; //TODO are numbers capitalized or not? I don't know! } if !char.is_ascii_uppercase() { all_capitalized = false; } } if all_capitalized { self.capitalized_count += 1; } let lowercase_word = word.to_lowercase(); FORBIDON.with(|trie| { if trie.contains(&lowercase_word) { self.forbidden_count += 1; } }); } } } } #[tokio::main] async fn main() { let files = env::args().skip(1); let mut rx = { let (tx, rx) = mpsc::unbounded_channel(); for file in files { let tx = tx.clone(); tokio::spawn(async move { let mut stats = Stats::default(); stats.process(&file); tx.send(stats).unwrap(); }); } rx }; let mut stats = Stats::default(); while let Some(file_stat) = rx.recv().await { stats += file_stat; } println!("{stats}"); } #[test] fn test() { use std::{env, fs, process::Command, time::Instant}; println!("cwd: {}", env::current_dir().unwrap().display()); //compile let mut compile = Command::new("cargo"); let compile_arged = compile.arg("build").arg("--release"); match compile_arged.output() { Ok(output) => println!("compiled {}", String::from_utf8_lossy(&output.stdout)), Err(err) => eprintln!("compile failed: {err}"), } //get test files let files = fs::read_dir("test_files") .unwrap() .map(|f| { f.unwrap() .path() .canonicalize() .unwrap() .to_str() .unwrap() .to_string() }) .collect::>(); println!("test files found: {:#?}", files); //benchmark run let benchmark = Instant::now(); let mut run = Command::new("target/release/jisspam"); let run_arged = run.args(files); match run_arged.output() { Ok(output) => println!("{}", String::from_utf8_lossy(&output.stdout)), Err(err) => eprintln!("run failed: {err}"), } println!("benchmark: {}ms", benchmark.elapsed().as_millis()); } #[test] fn books_test() { use std::{env, fs, process::Command, time::Instant}; println!("cwd: {}", env::current_dir().unwrap().display()); //compile let mut compile = Command::new("cargo"); let compile_arged = compile.arg("build").arg("--release"); match compile_arged.output() { Ok(output) => println!("compiled {}", String::from_utf8_lossy(&output.stdout)), Err(err) => eprintln!("compile failed: {err}"), } //get test files let files = fs::read_dir("../books") .unwrap() .map(|f| { f.unwrap() .path() .canonicalize() .unwrap() .to_str() .unwrap() .to_string() }) .collect::>(); println!("test files found: {:#?}", files); //benchmark run let benchmark = Instant::now(); let mut run = Command::new("target/release/jisspam"); let run_arged = run.args(files); match run_arged.output() { Ok(output) => println!("{}", String::from_utf8_lossy(&output.stdout)), Err(err) => eprintln!("run failed: {err}"), } println!("benchmark: {}ms", benchmark.elapsed().as_millis()); }