parent
ab32a81d9c
commit
854a1c3991
@ -97,4 +97,37 @@ opt-level z: `Time Jest Rust: 7.045313119888306` slower
|
|||||||
strip true: `Time Jest Rust: 4.337219476699829` faster
|
strip true: `Time Jest Rust: 4.337219476699829` faster
|
||||||
lto true: `Time Jest Rust: 4.703521728515625` slower
|
lto true: `Time Jest Rust: 4.703521728515625` slower
|
||||||
lto none: `Time Jest Rust: 4.817203998565674`
|
lto none: `Time Jest Rust: 4.817203998565674`
|
||||||
lto thin: `Time Jest Rust: 4.429729223251343` faster
|
lto thin: `Time Jest Rust: 4.429729223251343` faster
|
||||||
|
|
||||||
|
# data integrity
|
||||||
|
(These numbers are unverified: the counting rules are a best guess, and I don't have reference data to compare them with.)
|
||||||
|
for loops:
|
||||||
|
```
|
||||||
|
file count: 904
|
||||||
|
failed file count: 0
|
||||||
|
sentence count: 5602301
|
||||||
|
word count: 81701260
|
||||||
|
capitalized count: 1753639
|
||||||
|
numeric count: 14981248
|
||||||
|
forbidden count: 1237059
|
||||||
|
words per sentence average: 14.6
|
||||||
|
forbidden word percentage: 2%
|
||||||
|
capitalized word percentage: 2%
|
||||||
|
|
||||||
|
benchmark: 5033ms
|
||||||
|
```
|
||||||
|
muncher:
|
||||||
|
```
|
||||||
|
file count: 904
|
||||||
|
failed file count: 0
|
||||||
|
sentence count: 5338705
|
||||||
|
word count: 86765116
|
||||||
|
capitalized count: 13640820
|
||||||
|
numeric count: 10902254
|
||||||
|
forbidden count: 0
|
||||||
|
words per sentence average: 16.3
|
||||||
|
forbidden word percentage: 0%
|
||||||
|
capitalized word percentage: 16%
|
||||||
|
|
||||||
|
benchmark: 504ms
|
||||||
|
```
|
@ -58,6 +58,79 @@ impl Stats {
|
|||||||
return;
|
return;
|
||||||
};
|
};
|
||||||
self.file_count += 1;
|
self.file_count += 1;
|
||||||
|
self.muncher(&text);
|
||||||
|
// self.for_loops(&text);
|
||||||
|
}
|
||||||
|
#[allow(dead_code)]
|
||||||
|
/// probably buggy. for example, are new lines sentences? what if the text has no last period?
|
||||||
|
/// 500ms is without forbidden words check, but...
|
||||||
|
/// 6000ms if adding forbidden words.. so not faster
|
||||||
|
fn muncher(&mut self, text: &str) {
|
||||||
|
let mut capitalized = true;
|
||||||
|
let mut whitespaced = false;
|
||||||
|
let mut dotted = false;
|
||||||
|
let mut word = String::new();
|
||||||
|
for char in text.chars() {
|
||||||
|
if whitespaced {
|
||||||
|
if !char.is_whitespace() {
|
||||||
|
whitespaced = false; //end whiteness
|
||||||
|
}
|
||||||
|
continue;
|
||||||
|
} else if char.is_whitespace() {
|
||||||
|
whitespaced = true;
|
||||||
|
self.word_count += 1; //end of word
|
||||||
|
if capitalized {
|
||||||
|
self.capitalized_count += 1;
|
||||||
|
} else {
|
||||||
|
//reset capitalized word
|
||||||
|
capitalized = true;
|
||||||
|
}
|
||||||
|
let lowercase_word = word.to_lowercase();
|
||||||
|
for forbidden_word in FORBIDDEN_WORDS {
|
||||||
|
if lowercase_word.contains(forbidden_word) {
|
||||||
|
self.forbidden_count += 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
word = String::new();
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
if dotted {
|
||||||
|
if char != '.' {
|
||||||
|
dotted = false; //end sentencing
|
||||||
|
}
|
||||||
|
continue;
|
||||||
|
} else if char == '.' {
|
||||||
|
dotted = true;
|
||||||
|
self.sentence_count += 1;
|
||||||
|
self.word_count += 1; //end of word
|
||||||
|
if capitalized {
|
||||||
|
self.capitalized_count += 1;
|
||||||
|
} else {
|
||||||
|
//reset capitalized word
|
||||||
|
capitalized = true;
|
||||||
|
}
|
||||||
|
let lowercase_word = word.to_lowercase();
|
||||||
|
for forbidden_word in FORBIDDEN_WORDS {
|
||||||
|
if lowercase_word.contains(forbidden_word) {
|
||||||
|
self.forbidden_count += 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
word = String::new();
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
word += &char.to_string();
|
||||||
|
if char.is_numeric() {
|
||||||
|
self.numeric_count += 1;
|
||||||
|
capitalized = false;
|
||||||
|
}
|
||||||
|
if !char.is_ascii_uppercase() {
|
||||||
|
capitalized = false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
#[allow(dead_code)]
|
||||||
|
/// typically 5000ms
|
||||||
|
fn for_loops(&mut self, text: &str) {
|
||||||
for sentence in text
|
for sentence in text
|
||||||
.split('.')
|
.split('.')
|
||||||
.map(|s| s.trim())
|
.map(|s| s.trim())
|
||||||
@ -75,6 +148,7 @@ impl Stats {
|
|||||||
for char in word.chars() {
|
for char in word.chars() {
|
||||||
if char.is_numeric() {
|
if char.is_numeric() {
|
||||||
self.numeric_count += 1;
|
self.numeric_count += 1;
|
||||||
|
//TODO are numbers capitalized or not? I don't know!
|
||||||
}
|
}
|
||||||
if !char.is_ascii_uppercase() {
|
if !char.is_ascii_uppercase() {
|
||||||
all_capitalized = false;
|
all_capitalized = false;
|
||||||
@ -197,3 +271,41 @@ fn test() {
|
|||||||
}
|
}
|
||||||
println!("benchmark: {}ms", benchmark.elapsed().as_millis());
|
println!("benchmark: {}ms", benchmark.elapsed().as_millis());
|
||||||
}
|
}
|
||||||
|
/// End-to-end benchmark: builds the release binary with `cargo build --release`,
/// runs `target/release/jisspam` with every entry of `../books` as an argument,
/// and prints the program output plus the elapsed wall-clock time.
///
/// All paths are relative to the current working directory, which is printed
/// first to make a wrong invocation directory easy to spot.
///
/// # Panics
///
/// Fails the test when the build or the run cannot be started or exits with a
/// non-zero status, or when `../books` cannot be read. Without these checks a
/// broken build would go unnoticed and the benchmark would time a stale or
/// missing binary.
#[test]
fn books_test() {
    use std::{env, fs, process::Command, time::Instant};

    println!(
        "cwd: {}",
        env::current_dir()
            .expect("current directory is not accessible")
            .display()
    );

    // Compile the release binary that is benchmarked below.
    let compile = Command::new("cargo")
        .arg("build")
        .arg("--release")
        .output()
        .expect("failed to start `cargo build --release`");
    println!("compiled {}", String::from_utf8_lossy(&compile.stdout));
    assert!(
        compile.status.success(),
        "compile failed ({}): {}",
        compile.status,
        String::from_utf8_lossy(&compile.stderr)
    );

    // Collect the test files. Paths stay as `PathBuf` so that non-UTF-8 file
    // names do not panic; `Command::args` accepts them directly.
    let files = fs::read_dir("../books")
        .expect("cannot read directory ../books")
        .map(|entry| {
            entry
                .expect("cannot read directory entry in ../books")
                .path()
                .canonicalize()
                .expect("cannot canonicalize book path")
        })
        .collect::<Vec<_>>();
    println!("test files found: {:#?}", files);

    // Benchmark run: stop the clock as soon as the process has exited so that
    // printing the captured output is not part of the measurement.
    let benchmark = Instant::now();
    let run = Command::new("target/release/jisspam")
        .args(&files)
        .output()
        .expect("failed to start target/release/jisspam");
    let elapsed = benchmark.elapsed();

    println!("{}", String::from_utf8_lossy(&run.stdout));
    assert!(
        run.status.success(),
        "run failed ({}): {}",
        run.status,
        String::from_utf8_lossy(&run.stderr)
    );
    println!("benchmark: {}ms", elapsed.as_millis());
}
|
||||||
|
Loading…
Reference in New Issue
Block a user