do or do not, there is always a trie organization

This commit is contained in:
JestDotty 2025-03-23 23:58:34 -04:00
parent 12f2494411
commit 5b8dd08348
4 changed files with 169 additions and 107 deletions

View File

@ -131,6 +131,21 @@ capitalized word percentage: 2%
benchmark: 4737ms
```
with trie:
```
file count: 904
failed file count: 0
sentence count: 5602301
word count: 81701260
capitalized count: 1753639
numeric count: 14981248
forbidden count: 176528
words per sentence average: 14.6
forbidden word percentage: 0%
capitalized word percentage: 2%
benchmark: 1588ms
```
muncher:
```
@ -161,4 +176,5 @@ forbidden word percentage: 0%
capitalized word percentage: 16%
benchmark: 6078ms
```
```
With the trie, this takes 2600ms.

View File

@ -1,56 +1,57 @@
use std::{env, fmt::Display, fs, ops::AddAssign};
mod stats;
mod trie;
use stats::Stats;
use std::{cell::LazyCell, env, fs};
use tokio::sync::mpsc;
use trie::Trie;
static FORBIDDEN_WORDS: &'static [&'static str] = &[
"recovery",
"techie",
"http",
"https",
"digital",
"hack",
"::",
"//",
"@",
"com",
"crypto",
"bitcoin",
"wallet",
"hacker",
"welcome",
"whatsapp",
"email",
"cryptocurrency",
"stolen",
"freeze",
"quick",
"crucial",
"tracing",
"scammers",
"expers",
"hire",
"century",
"transaction",
"essential",
"managing",
"contact",
"contacting",
"understanding",
"assets",
"funds",
];
#[derive(Debug, Default)]
pub struct Stats {
file_count: u32,
failed_file_count: u32,
sentence_count: u32,
word_count: u32,
capitalized_count: u32,
numeric_count: u32,
forbidden_count: u32,
thread_local! {
    /// Trie holding the forbidden words, built on first access.
    ///
    /// Because this lives in `thread_local!`, every thread that touches it
    /// builds and owns its own copy. Lookups go through `Trie::contains`,
    /// which only reports a hit when the whole queried word was inserted.
    // NOTE(review): "expers" looks like a typo for "experts" — confirm before
    // changing, since editing the literal changes what gets counted.
    static FORBIDON: LazyCell<Trie> = LazyCell::new(|| {
        let words = [
            "recovery",
            "techie",
            "http",
            "https",
            "digital",
            "hack",
            "::",
            "//",
            "@",
            "com",
            "crypto",
            "bitcoin",
            "wallet",
            "hacker",
            "welcome",
            "whatsapp",
            "email",
            "cryptocurrency",
            "stolen",
            "freeze",
            "quick",
            "crucial",
            "tracing",
            "scammers",
            "expers",
            "hire",
            "century",
            "transaction",
            "essential",
            "managing",
            "contact",
            "contacting",
            "understanding",
            "assets",
            "funds",
        ];
        let mut forbidden = Trie::default();
        words.iter().for_each(|word| forbidden.insert(word));
        forbidden
    });
}
impl Stats {
pub fn process(&mut self, file: &str) {
let Ok(text) = fs::read_to_string(&file) else {
@ -65,6 +66,7 @@ impl Stats {
/// probably buggy. for example, are new lines sentences? what if the text has no last period?
/// 500ms is without forbidden words check, but...
/// 6000ms if adding forbidden words.. so not faster
/// with trie this is 2600ms
fn muncher(&mut self, text: &str) {
let mut capitalized = true;
let mut whitespaced = false;
@ -86,12 +88,11 @@ impl Stats {
capitalized = true;
}
let lowercase_word = word.to_lowercase();
for forbidden_word in FORBIDDEN_WORDS {
if lowercase_word.contains(forbidden_word) {
FORBIDON.with(|trie| {
if trie.contains(&lowercase_word) {
self.forbidden_count += 1;
break; //if you find one count it as a whole word
}
}
});
word = String::new();
continue;
}
@ -111,12 +112,11 @@ impl Stats {
capitalized = true;
}
let lowercase_word = word.to_lowercase();
for forbidden_word in FORBIDDEN_WORDS {
if lowercase_word.contains(forbidden_word) {
FORBIDON.with(|trie| {
if trie.contains(&lowercase_word) {
self.forbidden_count += 1;
break; //if you find one count it as a whole word
}
}
});
word = String::new();
continue;
}
@ -132,6 +132,7 @@ impl Stats {
}
#[allow(dead_code)]
/// typically 5000ms
/// with trie this is 1600ms
fn for_loops(&mut self, text: &str) {
for sentence in text
.split('.')
@ -160,64 +161,20 @@ impl Stats {
self.capitalized_count += 1;
}
let lowercase_word = word.to_lowercase();
for forbidden_word in FORBIDDEN_WORDS {
if lowercase_word.contains(forbidden_word) {
FORBIDON.with(|trie| {
if trie.contains(&lowercase_word) {
self.forbidden_count += 1;
break; //if you find one count it as a whole word
}
}
});
}
}
}
}
impl AddAssign for Stats {
fn add_assign(&mut self, rhs: Self) {
self.file_count += rhs.file_count;
self.failed_file_count += rhs.failed_file_count;
self.sentence_count += rhs.sentence_count;
self.word_count += rhs.word_count;
self.capitalized_count += rhs.capitalized_count;
self.numeric_count += rhs.numeric_count;
self.forbidden_count += rhs.forbidden_count;
}
}
impl Display for Stats {
    /// Writes each raw counter on its own line, then three derived figures:
    /// average words per sentence, forbidden-word percentage and
    /// capitalized-word percentage. The last line has no trailing newline.
    ///
    /// A derived figure whose denominator is zero is reported as `0` rather
    /// than `NaN`/`inf`, so an empty run still prints a readable report.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        // `f64` represents every `u32` exactly; `f32` starts rounding above
        // 2^24, which real word counts exceed.
        fn ratio(numerator: u32, denominator: u32) -> f64 {
            if denominator == 0 {
                0.0
            } else {
                f64::from(numerator) / f64::from(denominator)
            }
        }

        writeln!(f, "file count: {}", self.file_count)?;
        writeln!(f, "failed file count: {}", self.failed_file_count)?;
        writeln!(f, "sentence count: {}", self.sentence_count)?;
        writeln!(f, "word count: {}", self.word_count)?;
        writeln!(f, "capitalized count: {}", self.capitalized_count)?;
        writeln!(f, "numeric count: {}", self.numeric_count)?;
        writeln!(f, "forbidden count: {}", self.forbidden_count)?;
        writeln!(
            f,
            "words per sentence average: {:.1}",
            ratio(self.word_count, self.sentence_count)
        )?;
        writeln!(
            f,
            "forbidden word percentage: {:.0}%",
            ratio(self.forbidden_count, self.word_count) * 100.0,
        )?;
        write!(
            f,
            "capitalized word percentage: {:.0}%",
            ratio(self.capitalized_count, self.word_count) * 100.0,
        )
    }
}
#[tokio::main]
async fn main() {
let files = env::args().skip(1);
let mut stats = Stats::default();
let mut rx = {
let (tx, rx) = mpsc::unbounded_channel();
for file in files {
@ -230,6 +187,7 @@ async fn main() {
}
rx
};
let mut stats = Stats::default();
while let Some(file_stat) = rx.recv().await {
stats += file_stat;
}

58
jest_rust/src/stats.rs Normal file
View File

@ -0,0 +1,58 @@
use std::{fmt::Display, ops::AddAssign};
/// Plain counters describing scanned text.
///
/// Two values are merged with `+=` (see the `AddAssign` impl) and rendered as
/// a multi-line report through `Display`.
#[derive(Debug, Default)]
pub struct Stats {
    /// Printed as `file count`.
    pub file_count: u32,
    /// Printed as `failed file count`.
    pub failed_file_count: u32,
    /// Printed as `sentence count`; denominator of the words-per-sentence average.
    pub sentence_count: u32,
    /// Printed as `word count`; denominator of both percentages.
    pub word_count: u32,
    /// Printed as `capitalized count`.
    pub capitalized_count: u32,
    /// Printed as `numeric count`.
    pub numeric_count: u32,
    /// Printed as `forbidden count`.
    pub forbidden_count: u32,
}
impl AddAssign for Stats {
    /// Folds another set of counters into this one, field by field.
    fn add_assign(&mut self, rhs: Self) {
        // Exhaustive destructuring: adding a field to `Stats` without summing
        // it here becomes a compile error instead of a silently dropped count.
        let Stats {
            file_count,
            failed_file_count,
            sentence_count,
            word_count,
            capitalized_count,
            numeric_count,
            forbidden_count,
        } = rhs;
        self.file_count += file_count;
        self.failed_file_count += failed_file_count;
        self.sentence_count += sentence_count;
        self.word_count += word_count;
        self.capitalized_count += capitalized_count;
        self.numeric_count += numeric_count;
        self.forbidden_count += forbidden_count;
    }
}
impl Display for Stats {
    /// Writes each raw counter on its own line, then three derived figures:
    /// average words per sentence, forbidden-word percentage and
    /// capitalized-word percentage. The last line has no trailing newline.
    ///
    /// A derived figure whose denominator is zero is reported as `0` rather
    /// than `NaN`/`inf`, so an empty run still prints a readable report.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        // `f64` represents every `u32` exactly; `f32` starts rounding above
        // 2^24, which real word counts exceed.
        fn ratio(numerator: u32, denominator: u32) -> f64 {
            if denominator == 0 {
                0.0
            } else {
                f64::from(numerator) / f64::from(denominator)
            }
        }

        writeln!(f, "file count: {}", self.file_count)?;
        writeln!(f, "failed file count: {}", self.failed_file_count)?;
        writeln!(f, "sentence count: {}", self.sentence_count)?;
        writeln!(f, "word count: {}", self.word_count)?;
        writeln!(f, "capitalized count: {}", self.capitalized_count)?;
        writeln!(f, "numeric count: {}", self.numeric_count)?;
        writeln!(f, "forbidden count: {}", self.forbidden_count)?;
        writeln!(
            f,
            "words per sentence average: {:.1}",
            ratio(self.word_count, self.sentence_count)
        )?;
        writeln!(
            f,
            "forbidden word percentage: {:.0}%",
            ratio(self.forbidden_count, self.word_count) * 100.0,
        )?;
        write!(
            f,
            "capitalized word percentage: {:.0}%",
            ratio(self.capitalized_count, self.word_count) * 100.0,
        )
    }
}

30
jest_rust/src/trie.rs Normal file
View File

@ -0,0 +1,30 @@
use std::collections::HashMap;
/// One position in the trie: the outgoing edges plus a marker telling whether
/// an inserted word stops exactly here.
#[derive(Default, Debug, Clone)]
struct Node {
    /// `true` when the path from the root to this node spells an inserted word.
    end: bool,
    /// Next nodes, keyed by the character that leads to them.
    children: HashMap<char, Node>,
}
/// Character-keyed prefix tree answering exact whole-word membership queries.
#[derive(Default, Debug, Clone)]
pub struct Trie {
    /// Node reached by the empty string.
    root: Node,
}
impl Trie {
    /// Adds `word`, creating any missing nodes along its character path and
    /// flagging the final node as a word ending.
    pub fn insert(&mut self, word: &str) {
        let terminal = word
            .chars()
            .fold(&mut self.root, |at, ch| at.children.entry(ch).or_default());
        terminal.end = true;
    }
    /// Returns `true` only when `word` itself was inserted; a prefix of an
    /// inserted word, or a longer word that merely starts with one, is a miss.
    pub fn contains(&self, word: &str) -> bool {
        // `try_fold` stops with `None` at the first character that has no edge.
        let reached = word
            .chars()
            .try_fold(&self.root, |at, ch| at.children.get(&ch));
        matches!(reached, Some(node) if node.end)
    }
}