parent
12f2494411
commit
5b8dd08348
@ -131,6 +131,21 @@ capitalized word percentage: 2%
|
||||
|
||||
benchmark: 4737ms
|
||||
```
|
||||
with trie:
|
||||
```
|
||||
file count: 904
|
||||
failed file count: 0
|
||||
sentence count: 5602301
|
||||
word count: 81701260
|
||||
capitalized count: 1753639
|
||||
numeric count: 14981248
|
||||
forbidden count: 176528
|
||||
words per sentence average: 14.6
|
||||
forbidden word percentage: 0%
|
||||
capitalized word percentage: 2%
|
||||
|
||||
benchmark: 1588ms
|
||||
```
|
||||
|
||||
muncher:
|
||||
```
|
||||
@ -161,4 +176,5 @@ forbidden word percentage: 0%
|
||||
capitalized word percentage: 16%
|
||||
|
||||
benchmark: 6078ms
|
||||
```
|
||||
```
|
||||
With the trie, this takes 2600ms.
|
@ -1,56 +1,57 @@
|
||||
use std::{env, fmt::Display, fs, ops::AddAssign};
|
||||
mod stats;
|
||||
mod trie;
|
||||
|
||||
use stats::Stats;
|
||||
use std::{cell::LazyCell, env, fs};
|
||||
use tokio::sync::mpsc;
|
||||
use trie::Trie;
|
||||
|
||||
/// Lower-case fragments that mark a word as "forbidden".
///
/// Callers lower-case each word before comparing it against these entries,
/// so every entry here must itself be lower-case.
static FORBIDDEN_WORDS: &[&str] = &[
    "recovery",
    "techie",
    "http",
    "https",
    "digital",
    "hack",
    "::",
    "//",
    "@",
    "com",
    "crypto",
    "bitcoin",
    "wallet",
    "hacker",
    "welcome",
    "whatsapp",
    "email",
    "cryptocurrency",
    "stolen",
    "freeze",
    "quick",
    "crucial",
    "tracing",
    "scammers",
    // NOTE(review): possibly a typo for "experts" — confirm before changing,
    // since editing the string changes what gets counted.
    "expers",
    "hire",
    "century",
    "transaction",
    "essential",
    "managing",
    "contact",
    "contacting",
    "understanding",
    "assets",
    "funds",
];
|
||||
|
||||
#[derive(Debug, Default)]
|
||||
pub struct Stats {
|
||||
file_count: u32,
|
||||
failed_file_count: u32,
|
||||
|
||||
sentence_count: u32,
|
||||
word_count: u32,
|
||||
|
||||
capitalized_count: u32,
|
||||
numeric_count: u32,
|
||||
forbidden_count: u32,
|
||||
thread_local! {
|
||||
static FORBIDON: LazyCell<Trie> = LazyCell::new(|| {
|
||||
let mut trie = Trie::default();
|
||||
for word in [
|
||||
"recovery",
|
||||
"techie",
|
||||
"http",
|
||||
"https",
|
||||
"digital",
|
||||
"hack",
|
||||
"::",
|
||||
"//",
|
||||
"@",
|
||||
"com",
|
||||
"crypto",
|
||||
"bitcoin",
|
||||
"wallet",
|
||||
"hacker",
|
||||
"welcome",
|
||||
"whatsapp",
|
||||
"email",
|
||||
"cryptocurrency",
|
||||
"stolen",
|
||||
"freeze",
|
||||
"quick",
|
||||
"crucial",
|
||||
"tracing",
|
||||
"scammers",
|
||||
"expers",
|
||||
"hire",
|
||||
"century",
|
||||
"transaction",
|
||||
"essential",
|
||||
"managing",
|
||||
"contact",
|
||||
"contacting",
|
||||
"understanding",
|
||||
"assets",
|
||||
"funds",
|
||||
] {
|
||||
trie.insert(word);
|
||||
}
|
||||
trie
|
||||
});
|
||||
}
|
||||
|
||||
impl Stats {
|
||||
pub fn process(&mut self, file: &str) {
|
||||
let Ok(text) = fs::read_to_string(&file) else {
|
||||
@ -65,6 +66,7 @@ impl Stats {
|
||||
/// probably buggy. for example, are new lines sentences? what if the text has no last period?
|
||||
/// 500ms is without forbidden words check, but...
|
||||
/// 6000ms if adding forbidden words.. so not faster
|
||||
/// with trie this is 2600ms
|
||||
fn muncher(&mut self, text: &str) {
|
||||
let mut capitalized = true;
|
||||
let mut whitespaced = false;
|
||||
@ -86,12 +88,11 @@ impl Stats {
|
||||
capitalized = true;
|
||||
}
|
||||
let lowercase_word = word.to_lowercase();
|
||||
for forbidden_word in FORBIDDEN_WORDS {
|
||||
if lowercase_word.contains(forbidden_word) {
|
||||
FORBIDON.with(|trie| {
|
||||
if trie.contains(&lowercase_word) {
|
||||
self.forbidden_count += 1;
|
||||
break; //if you find one count it as a whole word
|
||||
}
|
||||
}
|
||||
});
|
||||
word = String::new();
|
||||
continue;
|
||||
}
|
||||
@ -111,12 +112,11 @@ impl Stats {
|
||||
capitalized = true;
|
||||
}
|
||||
let lowercase_word = word.to_lowercase();
|
||||
for forbidden_word in FORBIDDEN_WORDS {
|
||||
if lowercase_word.contains(forbidden_word) {
|
||||
FORBIDON.with(|trie| {
|
||||
if trie.contains(&lowercase_word) {
|
||||
self.forbidden_count += 1;
|
||||
break; //if you find one count it as a whole word
|
||||
}
|
||||
}
|
||||
});
|
||||
word = String::new();
|
||||
continue;
|
||||
}
|
||||
@ -132,6 +132,7 @@ impl Stats {
|
||||
}
|
||||
#[allow(dead_code)]
|
||||
/// typically 5000ms
|
||||
/// with trie this is 1600ms
|
||||
fn for_loops(&mut self, text: &str) {
|
||||
for sentence in text
|
||||
.split('.')
|
||||
@ -160,64 +161,20 @@ impl Stats {
|
||||
self.capitalized_count += 1;
|
||||
}
|
||||
let lowercase_word = word.to_lowercase();
|
||||
for forbidden_word in FORBIDDEN_WORDS {
|
||||
if lowercase_word.contains(forbidden_word) {
|
||||
FORBIDON.with(|trie| {
|
||||
if trie.contains(&lowercase_word) {
|
||||
self.forbidden_count += 1;
|
||||
break; //if you find one count it as a whole word
|
||||
}
|
||||
}
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
impl AddAssign for Stats {
|
||||
fn add_assign(&mut self, rhs: Self) {
|
||||
self.file_count += rhs.file_count;
|
||||
self.failed_file_count += rhs.failed_file_count;
|
||||
|
||||
self.sentence_count += rhs.sentence_count;
|
||||
self.word_count += rhs.word_count;
|
||||
|
||||
self.capitalized_count += rhs.capitalized_count;
|
||||
self.numeric_count += rhs.numeric_count;
|
||||
self.forbidden_count += rhs.forbidden_count;
|
||||
}
|
||||
}
|
||||
impl Display for Stats {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
writeln!(f, "file count: {}", self.file_count)?;
|
||||
writeln!(f, "failed file count: {}", self.failed_file_count)?;
|
||||
|
||||
writeln!(f, "sentence count: {}", self.sentence_count)?;
|
||||
writeln!(f, "word count: {}", self.word_count)?;
|
||||
|
||||
writeln!(f, "capitalized count: {}", self.capitalized_count)?;
|
||||
writeln!(f, "numeric count: {}", self.numeric_count)?;
|
||||
writeln!(f, "forbidden count: {}", self.forbidden_count)?;
|
||||
|
||||
let word_count = self.word_count as f32;
|
||||
writeln!(
|
||||
f,
|
||||
"words per sentence average: {:.1}",
|
||||
word_count / self.sentence_count as f32
|
||||
)?;
|
||||
writeln!(
|
||||
f,
|
||||
"forbidden word percentage: {:.0}%",
|
||||
(self.forbidden_count as f32 / word_count) * 100.0,
|
||||
)?;
|
||||
write!(
|
||||
f,
|
||||
"capitalized word percentage: {:.0}%",
|
||||
(self.capitalized_count as f32 / word_count) * 100.0,
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
#[tokio::main]
|
||||
async fn main() {
|
||||
let files = env::args().skip(1);
|
||||
let mut stats = Stats::default();
|
||||
|
||||
let mut rx = {
|
||||
let (tx, rx) = mpsc::unbounded_channel();
|
||||
for file in files {
|
||||
@ -230,6 +187,7 @@ async fn main() {
|
||||
}
|
||||
rx
|
||||
};
|
||||
let mut stats = Stats::default();
|
||||
while let Some(file_stat) = rx.recv().await {
|
||||
stats += file_stat;
|
||||
}
|
||||
|
58
jest_rust/src/stats.rs
Normal file
58
jest_rust/src/stats.rs
Normal file
@ -0,0 +1,58 @@
|
||||
use std::{fmt::Display, ops::AddAssign};
|
||||
|
||||
/// Counters gathered while scanning text files.
///
/// Two values can be merged with `+=` (see the `AddAssign` impl), and the
/// `Display` impl prints every counter plus a few derived ratios.
#[derive(Debug, Default, Clone, PartialEq, Eq)]
pub struct Stats {
    // File-level counters.
    pub file_count: u32,
    pub failed_file_count: u32,

    // Text-structure counters.
    pub sentence_count: u32,
    pub word_count: u32,

    // Per-word classification counters.
    pub capitalized_count: u32,
    pub numeric_count: u32,
    pub forbidden_count: u32,
}
|
||||
|
||||
impl AddAssign for Stats {
|
||||
fn add_assign(&mut self, rhs: Self) {
|
||||
self.file_count += rhs.file_count;
|
||||
self.failed_file_count += rhs.failed_file_count;
|
||||
|
||||
self.sentence_count += rhs.sentence_count;
|
||||
self.word_count += rhs.word_count;
|
||||
|
||||
self.capitalized_count += rhs.capitalized_count;
|
||||
self.numeric_count += rhs.numeric_count;
|
||||
self.forbidden_count += rhs.forbidden_count;
|
||||
}
|
||||
}
|
||||
impl Display for Stats {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
writeln!(f, "file count: {}", self.file_count)?;
|
||||
writeln!(f, "failed file count: {}", self.failed_file_count)?;
|
||||
|
||||
writeln!(f, "sentence count: {}", self.sentence_count)?;
|
||||
writeln!(f, "word count: {}", self.word_count)?;
|
||||
|
||||
writeln!(f, "capitalized count: {}", self.capitalized_count)?;
|
||||
writeln!(f, "numeric count: {}", self.numeric_count)?;
|
||||
writeln!(f, "forbidden count: {}", self.forbidden_count)?;
|
||||
|
||||
let word_count = self.word_count as f32;
|
||||
writeln!(
|
||||
f,
|
||||
"words per sentence average: {:.1}",
|
||||
word_count / self.sentence_count as f32
|
||||
)?;
|
||||
writeln!(
|
||||
f,
|
||||
"forbidden word percentage: {:.0}%",
|
||||
(self.forbidden_count as f32 / word_count) * 100.0,
|
||||
)?;
|
||||
write!(
|
||||
f,
|
||||
"capitalized word percentage: {:.0}%",
|
||||
(self.capitalized_count as f32 / word_count) * 100.0,
|
||||
)
|
||||
}
|
||||
}
|
30
jest_rust/src/trie.rs
Normal file
30
jest_rust/src/trie.rs
Normal file
@ -0,0 +1,30 @@
|
||||
use std::collections::HashMap;
|
||||
|
||||
/// One node of the trie: a flag plus the outgoing edges keyed by character.
#[derive(Default, Debug, Clone)]
struct Node {
    /// `true` when an inserted word ends exactly at this node.
    end: bool,
    /// Child nodes, one per next character.
    children: HashMap<char, Node>,
}
|
||||
#[derive(Default, Debug, Clone)]
|
||||
pub struct Trie {
|
||||
root: Node,
|
||||
}
|
||||
impl Trie {
|
||||
pub fn insert(&mut self, word: &str) {
|
||||
let mut node = &mut self.root;
|
||||
for char in word.chars() {
|
||||
node = node.children.entry(char).or_default();
|
||||
}
|
||||
node.end = true;
|
||||
}
|
||||
pub fn contains(&self, word: &str) -> bool {
|
||||
let mut current_node = &self.root;
|
||||
for char in word.chars() {
|
||||
match current_node.children.get(&char) {
|
||||
Some(node) => current_node = node,
|
||||
None => return false,
|
||||
}
|
||||
}
|
||||
current_node.end
|
||||
}
|
||||
}
|
Loading…
Reference in New Issue
Block a user