diff --git a/jest_rust/README.md b/jest_rust/README.md index 038c064..7eb3278 100644 --- a/jest_rust/README.md +++ b/jest_rust/README.md @@ -131,6 +131,21 @@ capitalized word percentage: 2% benchmark: 4737ms ``` +with trie: +``` +file count: 904 +failed file count: 0 +sentence count: 5602301 +word count: 81701260 +capitalized count: 1753639 +numeric count: 14981248 +forbidden count: 176528 +words per sentence average: 14.6 +forbidden word percentage: 0% +capitalized word percentage: 2% + +benchmark: 1588ms +``` muncher: ``` @@ -161,4 +176,5 @@ forbidden word percentage: 0% capitalized word percentage: 16% benchmark: 6078ms -``` \ No newline at end of file +``` +this is 2600ms with trie. eee \ No newline at end of file diff --git a/jest_rust/src/main.rs b/jest_rust/src/main.rs index 9066c52..1f5464d 100644 --- a/jest_rust/src/main.rs +++ b/jest_rust/src/main.rs @@ -1,56 +1,57 @@ -use std::{env, fmt::Display, fs, ops::AddAssign}; +mod stats; +mod trie; + +use stats::Stats; +use std::{cell::LazyCell, env, fs}; use tokio::sync::mpsc; +use trie::Trie; -static FORBIDDEN_WORDS: &'static [&'static str] = &[ - "recovery", - "techie", - "http", - "https", - "digital", - "hack", - "::", - "//", - "@", - "com", - "crypto", - "bitcoin", - "wallet", - "hacker", - "welcome", - "whatsapp", - "email", - "cryptocurrency", - "stolen", - "freeze", - "quick", - "crucial", - "tracing", - "scammers", - "expers", - "hire", - "century", - "transaction", - "essential", - "managing", - "contact", - "contacting", - "understanding", - "assets", - "funds", -]; - -#[derive(Debug, Default)] -pub struct Stats { - file_count: u32, - failed_file_count: u32, - - sentence_count: u32, - word_count: u32, - - capitalized_count: u32, - numeric_count: u32, - forbidden_count: u32, +thread_local! 
{ + static FORBIDON: LazyCell<Trie> = LazyCell::new(|| { + let mut trie = Trie::default(); + for word in [ + "recovery", + "techie", + "http", + "https", + "digital", + "hack", + "::", + "//", + "@", + "com", + "crypto", + "bitcoin", + "wallet", + "hacker", + "welcome", + "whatsapp", + "email", + "cryptocurrency", + "stolen", + "freeze", + "quick", + "crucial", + "tracing", + "scammers", + "expers", + "hire", + "century", + "transaction", + "essential", + "managing", + "contact", + "contacting", + "understanding", + "assets", + "funds", + ] { + trie.insert(word); + } + trie + }); } + impl Stats { pub fn process(&mut self, file: &str) { let Ok(text) = fs::read_to_string(&file) else { @@ -65,6 +66,7 @@ impl Stats { /// probably buggy. for example, are new lines sentences? what if the text has no last period? /// 500ms is without forbidden words check, but... /// 6000ms if adding forbidden words.. so not faster + /// with trie this is 2600ms fn muncher(&mut self, text: &str) { let mut capitalized = true; let mut whitespaced = false; @@ -86,12 +88,11 @@ impl Stats { capitalized = true; } let lowercase_word = word.to_lowercase(); - for forbidden_word in FORBIDDEN_WORDS { - if lowercase_word.contains(forbidden_word) { + FORBIDON.with(|trie| { + if trie.contains(&lowercase_word) { self.forbidden_count += 1; - break; //if you find one count it as a whole word } - } + }); word = String::new(); continue; } @@ -111,12 +112,11 @@ impl Stats { capitalized = true; } let lowercase_word = word.to_lowercase(); - for forbidden_word in FORBIDDEN_WORDS { - if lowercase_word.contains(forbidden_word) { + FORBIDON.with(|trie| { + if trie.contains(&lowercase_word) { self.forbidden_count += 1; - break; //if you find one count it as a whole word } - } + }); word = String::new(); continue; } @@ -132,6 +132,7 @@ impl Stats { } #[allow(dead_code)] /// typically 5000ms + /// with trie this is 1600ms fn for_loops(&mut self, text: &str) { for sentence in text .split('.') @@ -160,64 +161,20 @@ 
impl Stats { self.capitalized_count += 1; } let lowercase_word = word.to_lowercase(); - for forbidden_word in FORBIDDEN_WORDS { - if lowercase_word.contains(forbidden_word) { + FORBIDON.with(|trie| { + if trie.contains(&lowercase_word) { self.forbidden_count += 1; - break; //if you find one count it as a whole word } - } + }); } } } } -impl AddAssign for Stats { - fn add_assign(&mut self, rhs: Self) { - self.file_count += rhs.file_count; - self.failed_file_count += rhs.failed_file_count; - - self.sentence_count += rhs.sentence_count; - self.word_count += rhs.word_count; - - self.capitalized_count += rhs.capitalized_count; - self.numeric_count += rhs.numeric_count; - self.forbidden_count += rhs.forbidden_count; - } -} -impl Display for Stats { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - writeln!(f, "file count: {}", self.file_count)?; - writeln!(f, "failed file count: {}", self.failed_file_count)?; - - writeln!(f, "sentence count: {}", self.sentence_count)?; - writeln!(f, "word count: {}", self.word_count)?; - - writeln!(f, "capitalized count: {}", self.capitalized_count)?; - writeln!(f, "numeric count: {}", self.numeric_count)?; - writeln!(f, "forbidden count: {}", self.forbidden_count)?; - - let word_count = self.word_count as f32; - writeln!( - f, - "words per sentence average: {:.1}", - word_count / self.sentence_count as f32 - )?; - writeln!( - f, - "forbidden word percentage: {:.0}%", - (self.forbidden_count as f32 / word_count) * 100.0, - )?; - write!( - f, - "capitalized word percentage: {:.0}%", - (self.capitalized_count as f32 / word_count) * 100.0, - ) - } -} #[tokio::main] async fn main() { let files = env::args().skip(1); - let mut stats = Stats::default(); + let mut rx = { let (tx, rx) = mpsc::unbounded_channel(); for file in files { @@ -230,6 +187,7 @@ async fn main() { } rx }; + let mut stats = Stats::default(); while let Some(file_stat) = rx.recv().await { stats += file_stat; } diff --git a/jest_rust/src/stats.rs 
// ---- jest_rust/src/stats.rs ----
use std::{fmt::Display, ops::AddAssign};

/// Counters that are summed with `+=` and rendered as a plain-text report via [`Display`].
#[derive(Debug, Default)]
pub struct Stats {
    /// Printed as `file count`.
    pub file_count: u32,
    /// Printed as `failed file count`.
    pub failed_file_count: u32,

    /// Printed as `sentence count`; denominator of the words-per-sentence average.
    pub sentence_count: u32,
    /// Printed as `word count`; denominator of both percentages.
    pub word_count: u32,

    /// Printed as `capitalized count`.
    pub capitalized_count: u32,
    /// Printed as `numeric count`.
    pub numeric_count: u32,
    /// Printed as `forbidden count`.
    pub forbidden_count: u32,
}

impl AddAssign for Stats {
    /// Adds every counter of `rhs` onto the matching counter of `self`.
    fn add_assign(&mut self, rhs: Self) {
        self.file_count += rhs.file_count;
        self.failed_file_count += rhs.failed_file_count;

        self.sentence_count += rhs.sentence_count;
        self.word_count += rhs.word_count;

        self.capitalized_count += rhs.capitalized_count;
        self.numeric_count += rhs.numeric_count;
        self.forbidden_count += rhs.forbidden_count;
    }
}

/// Divides two counters, returning `0.0` when `denominator` is zero.
///
/// The guard keeps the report free of `NaN`/`inf` when nothing was counted. `f64` is used
/// because every `u32` converts to it exactly, whereas `f32` is exact only up to 2^24.
fn ratio(numerator: u32, denominator: u32) -> f64 {
    if denominator == 0 {
        0.0
    } else {
        f64::from(numerator) / f64::from(denominator)
    }
}

impl Display for Stats {
    /// Writes one `label: value` line per counter, followed by three derived figures:
    /// words per sentence (one decimal) and the forbidden / capitalized word percentages
    /// (no decimals). The final line has no trailing newline.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        writeln!(f, "file count: {}", self.file_count)?;
        writeln!(f, "failed file count: {}", self.failed_file_count)?;

        writeln!(f, "sentence count: {}", self.sentence_count)?;
        writeln!(f, "word count: {}", self.word_count)?;

        writeln!(f, "capitalized count: {}", self.capitalized_count)?;
        writeln!(f, "numeric count: {}", self.numeric_count)?;
        writeln!(f, "forbidden count: {}", self.forbidden_count)?;

        writeln!(
            f,
            "words per sentence average: {:.1}",
            ratio(self.word_count, self.sentence_count),
        )?;
        writeln!(
            f,
            "forbidden word percentage: {:.0}%",
            ratio(self.forbidden_count, self.word_count) * 100.0,
        )?;
        write!(
            f,
            "capitalized word percentage: {:.0}%",
            ratio(self.capitalized_count, self.word_count) * 100.0,
        )
    }
}

// ---- jest_rust/src/trie.rs ----
use std::collections::HashMap;

/// A single trie node: child nodes keyed by `char`, plus a flag marking that an inserted
/// word ends exactly here.
#[derive(Default, Debug, Clone)]
struct Node {
    end: bool,
    children: HashMap<char, Node>,
}

/// A set of strings stored character by character.
#[derive(Default, Debug, Clone)]
pub struct Trie {
    root: Node,
}

impl Trie {
    /// Inserts `word`, creating any missing nodes along its path.
    ///
    /// Inserting the empty string marks the root, after which `contains("")` is `true`.
    pub fn insert(&mut self, word: &str) {
        let mut node = &mut self.root;
        for ch in word.chars() {
            node = node.children.entry(ch).or_default();
        }
        node.end = true;
    }

    /// Returns `true` only if `word` is exactly one of the inserted strings.
    ///
    /// This is a whole-string match: a proper prefix of an inserted word, or a longer
    /// string that merely contains one, yields `false`. It is therefore NOT equivalent to
    /// a `str::contains` substring test against each inserted word.
    pub fn contains(&self, word: &str) -> bool {
        word.chars()
            // Follow one edge per character; a missing edge short-circuits to `None`.
            .try_fold(&self.root, |node, ch| node.children.get(&ch))
            .is_some_and(|node| node.end)
    }
}