parent
12f2494411
commit
5b8dd08348
@ -131,6 +131,21 @@ capitalized word percentage: 2%
|
|||||||
|
|
||||||
benchmark: 4737ms
|
benchmark: 4737ms
|
||||||
```
|
```
|
||||||
|
with trie:
|
||||||
|
```
|
||||||
|
file count: 904
|
||||||
|
failed file count: 0
|
||||||
|
sentence count: 5602301
|
||||||
|
word count: 81701260
|
||||||
|
capitalized count: 1753639
|
||||||
|
numeric count: 14981248
|
||||||
|
forbidden count: 176528
|
||||||
|
words per sentence average: 14.6
|
||||||
|
forbidden word percentage: 0%
|
||||||
|
capitalized word percentage: 2%
|
||||||
|
|
||||||
|
benchmark: 1588ms
|
||||||
|
```
|
||||||
|
|
||||||
muncher:
|
muncher:
|
||||||
```
|
```
|
||||||
@ -161,4 +176,5 @@ forbidden word percentage: 0%
|
|||||||
capitalized word percentage: 16%
|
capitalized word percentage: 16%
|
||||||
|
|
||||||
benchmark: 6078ms
|
benchmark: 6078ms
|
||||||
```
|
```
|
||||||
|
this is 2600ms with trie. eee
|
@ -1,56 +1,57 @@
|
|||||||
use std::{env, fmt::Display, fs, ops::AddAssign};
|
mod stats;
|
||||||
|
mod trie;
|
||||||
|
|
||||||
|
use stats::Stats;
|
||||||
|
use std::{cell::LazyCell, env, fs};
|
||||||
use tokio::sync::mpsc;
|
use tokio::sync::mpsc;
|
||||||
|
use trie::Trie;
|
||||||
|
|
||||||
static FORBIDDEN_WORDS: &'static [&'static str] = &[
|
thread_local! {
|
||||||
"recovery",
|
static FORBIDON: LazyCell<Trie> = LazyCell::new(|| {
|
||||||
"techie",
|
let mut trie = Trie::default();
|
||||||
"http",
|
for word in [
|
||||||
"https",
|
"recovery",
|
||||||
"digital",
|
"techie",
|
||||||
"hack",
|
"http",
|
||||||
"::",
|
"https",
|
||||||
"//",
|
"digital",
|
||||||
"@",
|
"hack",
|
||||||
"com",
|
"::",
|
||||||
"crypto",
|
"//",
|
||||||
"bitcoin",
|
"@",
|
||||||
"wallet",
|
"com",
|
||||||
"hacker",
|
"crypto",
|
||||||
"welcome",
|
"bitcoin",
|
||||||
"whatsapp",
|
"wallet",
|
||||||
"email",
|
"hacker",
|
||||||
"cryptocurrency",
|
"welcome",
|
||||||
"stolen",
|
"whatsapp",
|
||||||
"freeze",
|
"email",
|
||||||
"quick",
|
"cryptocurrency",
|
||||||
"crucial",
|
"stolen",
|
||||||
"tracing",
|
"freeze",
|
||||||
"scammers",
|
"quick",
|
||||||
"expers",
|
"crucial",
|
||||||
"hire",
|
"tracing",
|
||||||
"century",
|
"scammers",
|
||||||
"transaction",
|
"expers",
|
||||||
"essential",
|
"hire",
|
||||||
"managing",
|
"century",
|
||||||
"contact",
|
"transaction",
|
||||||
"contacting",
|
"essential",
|
||||||
"understanding",
|
"managing",
|
||||||
"assets",
|
"contact",
|
||||||
"funds",
|
"contacting",
|
||||||
];
|
"understanding",
|
||||||
|
"assets",
|
||||||
#[derive(Debug, Default)]
|
"funds",
|
||||||
pub struct Stats {
|
] {
|
||||||
file_count: u32,
|
trie.insert(word);
|
||||||
failed_file_count: u32,
|
}
|
||||||
|
trie
|
||||||
sentence_count: u32,
|
});
|
||||||
word_count: u32,
|
|
||||||
|
|
||||||
capitalized_count: u32,
|
|
||||||
numeric_count: u32,
|
|
||||||
forbidden_count: u32,
|
|
||||||
}
|
}
|
||||||
|
|
||||||
impl Stats {
|
impl Stats {
|
||||||
pub fn process(&mut self, file: &str) {
|
pub fn process(&mut self, file: &str) {
|
||||||
let Ok(text) = fs::read_to_string(&file) else {
|
let Ok(text) = fs::read_to_string(&file) else {
|
||||||
@ -65,6 +66,7 @@ impl Stats {
|
|||||||
/// probably buggy. for example, are new lines sentences? what if the text has no last period?
|
/// probably buggy. for example, are new lines sentences? what if the text has no last period?
|
||||||
/// 500ms is without forbidden words check, but...
|
/// 500ms is without forbidden words check, but...
|
||||||
/// 6000ms if adding forbidden words.. so not faster
|
/// 6000ms if adding forbidden words.. so not faster
|
||||||
|
/// with trie this is 2600ms
|
||||||
fn muncher(&mut self, text: &str) {
|
fn muncher(&mut self, text: &str) {
|
||||||
let mut capitalized = true;
|
let mut capitalized = true;
|
||||||
let mut whitespaced = false;
|
let mut whitespaced = false;
|
||||||
@ -86,12 +88,11 @@ impl Stats {
|
|||||||
capitalized = true;
|
capitalized = true;
|
||||||
}
|
}
|
||||||
let lowercase_word = word.to_lowercase();
|
let lowercase_word = word.to_lowercase();
|
||||||
for forbidden_word in FORBIDDEN_WORDS {
|
FORBIDON.with(|trie| {
|
||||||
if lowercase_word.contains(forbidden_word) {
|
if trie.contains(&lowercase_word) {
|
||||||
self.forbidden_count += 1;
|
self.forbidden_count += 1;
|
||||||
break; //if you find one count it as a whole word
|
|
||||||
}
|
}
|
||||||
}
|
});
|
||||||
word = String::new();
|
word = String::new();
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
@ -111,12 +112,11 @@ impl Stats {
|
|||||||
capitalized = true;
|
capitalized = true;
|
||||||
}
|
}
|
||||||
let lowercase_word = word.to_lowercase();
|
let lowercase_word = word.to_lowercase();
|
||||||
for forbidden_word in FORBIDDEN_WORDS {
|
FORBIDON.with(|trie| {
|
||||||
if lowercase_word.contains(forbidden_word) {
|
if trie.contains(&lowercase_word) {
|
||||||
self.forbidden_count += 1;
|
self.forbidden_count += 1;
|
||||||
break; //if you find one count it as a whole word
|
|
||||||
}
|
}
|
||||||
}
|
});
|
||||||
word = String::new();
|
word = String::new();
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
@ -132,6 +132,7 @@ impl Stats {
|
|||||||
}
|
}
|
||||||
#[allow(dead_code)]
|
#[allow(dead_code)]
|
||||||
/// typically 5000ms
|
/// typically 5000ms
|
||||||
|
/// with trie this is 1600ms
|
||||||
fn for_loops(&mut self, text: &str) {
|
fn for_loops(&mut self, text: &str) {
|
||||||
for sentence in text
|
for sentence in text
|
||||||
.split('.')
|
.split('.')
|
||||||
@ -160,64 +161,20 @@ impl Stats {
|
|||||||
self.capitalized_count += 1;
|
self.capitalized_count += 1;
|
||||||
}
|
}
|
||||||
let lowercase_word = word.to_lowercase();
|
let lowercase_word = word.to_lowercase();
|
||||||
for forbidden_word in FORBIDDEN_WORDS {
|
FORBIDON.with(|trie| {
|
||||||
if lowercase_word.contains(forbidden_word) {
|
if trie.contains(&lowercase_word) {
|
||||||
self.forbidden_count += 1;
|
self.forbidden_count += 1;
|
||||||
break; //if you find one count it as a whole word
|
|
||||||
}
|
}
|
||||||
}
|
});
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
impl AddAssign for Stats {
|
|
||||||
fn add_assign(&mut self, rhs: Self) {
|
|
||||||
self.file_count += rhs.file_count;
|
|
||||||
self.failed_file_count += rhs.failed_file_count;
|
|
||||||
|
|
||||||
self.sentence_count += rhs.sentence_count;
|
|
||||||
self.word_count += rhs.word_count;
|
|
||||||
|
|
||||||
self.capitalized_count += rhs.capitalized_count;
|
|
||||||
self.numeric_count += rhs.numeric_count;
|
|
||||||
self.forbidden_count += rhs.forbidden_count;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
impl Display for Stats {
|
|
||||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
|
||||||
writeln!(f, "file count: {}", self.file_count)?;
|
|
||||||
writeln!(f, "failed file count: {}", self.failed_file_count)?;
|
|
||||||
|
|
||||||
writeln!(f, "sentence count: {}", self.sentence_count)?;
|
|
||||||
writeln!(f, "word count: {}", self.word_count)?;
|
|
||||||
|
|
||||||
writeln!(f, "capitalized count: {}", self.capitalized_count)?;
|
|
||||||
writeln!(f, "numeric count: {}", self.numeric_count)?;
|
|
||||||
writeln!(f, "forbidden count: {}", self.forbidden_count)?;
|
|
||||||
|
|
||||||
let word_count = self.word_count as f32;
|
|
||||||
writeln!(
|
|
||||||
f,
|
|
||||||
"words per sentence average: {:.1}",
|
|
||||||
word_count / self.sentence_count as f32
|
|
||||||
)?;
|
|
||||||
writeln!(
|
|
||||||
f,
|
|
||||||
"forbidden word percentage: {:.0}%",
|
|
||||||
(self.forbidden_count as f32 / word_count) * 100.0,
|
|
||||||
)?;
|
|
||||||
write!(
|
|
||||||
f,
|
|
||||||
"capitalized word percentage: {:.0}%",
|
|
||||||
(self.capitalized_count as f32 / word_count) * 100.0,
|
|
||||||
)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
#[tokio::main]
|
#[tokio::main]
|
||||||
async fn main() {
|
async fn main() {
|
||||||
let files = env::args().skip(1);
|
let files = env::args().skip(1);
|
||||||
let mut stats = Stats::default();
|
|
||||||
let mut rx = {
|
let mut rx = {
|
||||||
let (tx, rx) = mpsc::unbounded_channel();
|
let (tx, rx) = mpsc::unbounded_channel();
|
||||||
for file in files {
|
for file in files {
|
||||||
@ -230,6 +187,7 @@ async fn main() {
|
|||||||
}
|
}
|
||||||
rx
|
rx
|
||||||
};
|
};
|
||||||
|
let mut stats = Stats::default();
|
||||||
while let Some(file_stat) = rx.recv().await {
|
while let Some(file_stat) = rx.recv().await {
|
||||||
stats += file_stat;
|
stats += file_stat;
|
||||||
}
|
}
|
||||||
|
58
jest_rust/src/stats.rs
Normal file
58
jest_rust/src/stats.rs
Normal file
@ -0,0 +1,58 @@
|
|||||||
|
use std::{fmt::Display, ops::AddAssign};
|
||||||
|
|
||||||
|
#[derive(Debug, Default)]
|
||||||
|
pub struct Stats {
|
||||||
|
pub file_count: u32,
|
||||||
|
pub failed_file_count: u32,
|
||||||
|
|
||||||
|
pub sentence_count: u32,
|
||||||
|
pub word_count: u32,
|
||||||
|
|
||||||
|
pub capitalized_count: u32,
|
||||||
|
pub numeric_count: u32,
|
||||||
|
pub forbidden_count: u32,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl AddAssign for Stats {
|
||||||
|
fn add_assign(&mut self, rhs: Self) {
|
||||||
|
self.file_count += rhs.file_count;
|
||||||
|
self.failed_file_count += rhs.failed_file_count;
|
||||||
|
|
||||||
|
self.sentence_count += rhs.sentence_count;
|
||||||
|
self.word_count += rhs.word_count;
|
||||||
|
|
||||||
|
self.capitalized_count += rhs.capitalized_count;
|
||||||
|
self.numeric_count += rhs.numeric_count;
|
||||||
|
self.forbidden_count += rhs.forbidden_count;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
impl Display for Stats {
|
||||||
|
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||||
|
writeln!(f, "file count: {}", self.file_count)?;
|
||||||
|
writeln!(f, "failed file count: {}", self.failed_file_count)?;
|
||||||
|
|
||||||
|
writeln!(f, "sentence count: {}", self.sentence_count)?;
|
||||||
|
writeln!(f, "word count: {}", self.word_count)?;
|
||||||
|
|
||||||
|
writeln!(f, "capitalized count: {}", self.capitalized_count)?;
|
||||||
|
writeln!(f, "numeric count: {}", self.numeric_count)?;
|
||||||
|
writeln!(f, "forbidden count: {}", self.forbidden_count)?;
|
||||||
|
|
||||||
|
let word_count = self.word_count as f32;
|
||||||
|
writeln!(
|
||||||
|
f,
|
||||||
|
"words per sentence average: {:.1}",
|
||||||
|
word_count / self.sentence_count as f32
|
||||||
|
)?;
|
||||||
|
writeln!(
|
||||||
|
f,
|
||||||
|
"forbidden word percentage: {:.0}%",
|
||||||
|
(self.forbidden_count as f32 / word_count) * 100.0,
|
||||||
|
)?;
|
||||||
|
write!(
|
||||||
|
f,
|
||||||
|
"capitalized word percentage: {:.0}%",
|
||||||
|
(self.capitalized_count as f32 / word_count) * 100.0,
|
||||||
|
)
|
||||||
|
}
|
||||||
|
}
|
30
jest_rust/src/trie.rs
Normal file
30
jest_rust/src/trie.rs
Normal file
@ -0,0 +1,30 @@
|
|||||||
|
use std::collections::HashMap;
|
||||||
|
|
||||||
|
#[derive(Default, Debug, Clone)]
|
||||||
|
struct Node {
|
||||||
|
end: bool,
|
||||||
|
children: HashMap<char, Node>,
|
||||||
|
}
|
||||||
|
#[derive(Default, Debug, Clone)]
|
||||||
|
pub struct Trie {
|
||||||
|
root: Node,
|
||||||
|
}
|
||||||
|
impl Trie {
|
||||||
|
pub fn insert(&mut self, word: &str) {
|
||||||
|
let mut node = &mut self.root;
|
||||||
|
for char in word.chars() {
|
||||||
|
node = node.children.entry(char).or_default();
|
||||||
|
}
|
||||||
|
node.end = true;
|
||||||
|
}
|
||||||
|
pub fn contains(&self, word: &str) -> bool {
|
||||||
|
let mut current_node = &self.root;
|
||||||
|
for char in word.chars() {
|
||||||
|
match current_node.children.get(&char) {
|
||||||
|
Some(node) => current_node = node,
|
||||||
|
None => return false,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
current_node.end
|
||||||
|
}
|
||||||
|
}
|
Loading…
Reference in New Issue
Block a user