more tests

This commit is contained in:
JestDotty 2025-10-04 09:58:40 -04:00
parent d73d4ff7c1
commit c459fe6d79
7 changed files with 54 additions and 128 deletions

View File

@ -9,6 +9,7 @@ tokio = { version = "1.44.1", features = ["full"] }
[profile.release]
codegen-units = 1 # fewer units mean more compile work but a better-optimized binary
lto = "thin" # thin has best performance. fat the worst
lto = "fat" # thin has best performance. fat the worst
strip = true
# opt-level = "z" # slows down
panic = "abort"

View File

@ -4,116 +4,28 @@ extract `../books.tar.gz`
# local machine benchmarks
single threaded: `Time Jest Rust: 33.63373279571533`
single threaded: `33.63373279571533`
rayon: `Time Jest Rust: 4.294418811798096`
rayon: `4.294418811798096`
tokio: `Time Jest Rust: 4.717588901519775`
tokio: `4.717588901519775`
tokio:
muncher: `2486ms`
for_loops: `1227ms`
for_loops_forbidden_only: `987ms`
trie creation and stats accumulation take 0ms
## compile options benchmarks
lto not thin: `Time Jest Rust: 5.306957483291626` slower
`lto` thin vs fat doesn't change much
lto fat: `Time Jest Rust: 5.413678407669067` slower
`codegen-units` 0 vs 1 doesn't change much
codegen-units 1: `Time Jest Rust: 4.451631546020508` faster
opt-level z: `Time Jest Rust: 7.045313119888306` slower
strip true: `Time Jest Rust: 4.337219476699829` faster
lto true: `Time Jest Rust: 4.703521728515625` slower
lto none: `Time Jest Rust: 4.817203998565674`
lto thin: `Time Jest Rust: 4.429729223251343` faster
# data integrity
(this isn't tested, just guessed, and I don't have data to compare it with)
for loops:
```
file count: 904
failed file count: 0
sentence count: 5602301
word count: 81701260
capitalized count: 1753639
numeric count: 14981248
forbidden count: 1237059
words per sentence average: 14.6
forbidden word percentage: 2%
capitalized word percentage: 2%
benchmark: 5033ms
```
muncher:
```
file count: 904
failed file count: 0
sentence count: 5338705
word count: 86765116
capitalized count: 13640820
numeric count: 10902254
forbidden count: 0
words per sentence average: 16.3
forbidden word percentage: 0%
capitalized word percentage: 16%
benchmark: 504ms
```
with forbidden words:
```
file count: 904
failed file count: 0
sentence count: 5338705
word count: 86765116
capitalized count: 13640820
numeric count: 10902254
forbidden count: 279717
words per sentence average: 16.3
forbidden word percentage: 0%
capitalized word percentage: 16%
benchmark: 6078ms
```
# forbidden words benchmarks
the forbidden-word checks seem to take about 4000ms to churn through in the original version
for loops count forbidden word once only:
```
file count: 904
failed file count: 0
sentence count: 5602301
word count: 81701260
capitalized count: 1753639
numeric count: 14981248
forbidden count: 1143234
words per sentence average: 14.6
forbidden word percentage: 1%
capitalized word percentage: 2%
benchmark: 4737ms
```
for loops with trie:
```
file count: 904
failed file count: 0
sentence count: 5602301
word count: 81701260
capitalized count: 1753639
numeric count: 14981248
forbidden count: 176528
words per sentence average: 14.6
forbidden word percentage: 0%
capitalized word percentage: 2%
benchmark: 1588ms
```
muncher with trie is 2600ms
for loops with fxhash trie: 1200ms
`opt-level = "z"` slows things down
# ubuntu terminal running
https://snek.molodetz.nl/terminal.html — instructions for the in-browser Ubuntu terminal:

View File

@ -54,32 +54,32 @@ static FORBIDDEN_WORDS: LazyLock<Trie> = LazyLock::new(|| {
#[tokio::main]
async fn main() {
let files = env::args().skip(1);
let mut stats = Stats::default();
let mut rx = {
let (tx, rx) = mpsc::unbounded_channel();
for file in files {
//reading files not sequentially average shaves 30ms (of 1250ms), and that's on a NVMe SSD so why not
if let Ok(text) = fs::read_to_string(&file) {
stats.file_count += 1;
let tx = tx.clone();
tokio::spawn(async move {
let mut stats = Stats::default();
//reading files in threads doesn't change speed of any sort but oh well
if let Ok(text) = fs::read_to_string(&file) {
stats.file_count += 1;
parser::for_loops::parse(&mut stats, &text);
let _ = tx.send(stats);
});
} else {
stats.failed_file_count += 1;
}
let _ = tx.send(stats);
});
}
rx
};
let mut stats = Stats::default();
while let Some(file_stat) = rx.recv().await {
stats += file_stat;
}
println!("{stats}");
}
/// needs ../books.tar.gz to be extracted
/// needs ../books.tar.gz to be extracted into ../books
#[test]
fn test() {
use std::{env, fs, process::Command, time::Instant};

View File

@ -11,7 +11,7 @@ pub fn parse(stats: &mut Stats, text: &str) {
{
stats.sentence_count += 1;
for word in sentence
.split_whitespace()
.split_ascii_whitespace()
.map(|s| s.trim())
.filter(|s| !s.is_empty())
{
@ -21,17 +21,15 @@ pub fn parse(stats: &mut Stats, text: &str) {
for char in word.chars() {
if char.is_numeric() {
stats.numeric_count += 1;
//TODO are numbers capitalized or not? I don't know!
}
if !char.is_ascii_uppercase() {
all_capitalized = false;
} else if !char.is_ascii_uppercase() {
all_capitalized = false;
}
}
if all_capitalized {
stats.capitalized_count += 1;
}
let lowercase_word = word.to_lowercase();
if FORBIDDEN_WORDS.contains(&lowercase_word) {
if FORBIDDEN_WORDS.contains(&word.to_lowercase()) {
stats.forbidden_count += 1;
}
}

View File

@ -0,0 +1,14 @@
use crate::{FORBIDDEN_WORDS, stats::Stats};

/// Count forbidden words in `text`, bumping `stats.forbidden_count` once per
/// matching token. Tokens are whitespace-separated and compared
/// case-insensitively against the forbidden-word trie; no other `Stats`
/// fields are touched.
#[allow(dead_code)]
pub fn parse(stats: &mut Stats, text: &str) {
    for raw in text.split_ascii_whitespace() {
        // split_ascii_whitespace only breaks on ASCII whitespace, so trim any
        // leftover Unicode whitespace and skip tokens that were nothing but.
        let word = raw.trim();
        if word.is_empty() {
            continue;
        }
        if FORBIDDEN_WORDS.contains(&word.to_lowercase()) {
            stats.forbidden_count += 1;
        }
    }
}

View File

@ -1,2 +1,3 @@
pub mod muncher;
pub mod for_loops;
pub mod for_loops_forbidden_only;
pub mod muncher;

View File

@ -46,12 +46,12 @@ impl Display for Stats {
)?;
writeln!(
f,
"forbidden word percentage: {:.0}%",
"forbidden word percentage: {:.2}%",
(self.forbidden_count as f32 / word_count) * 100.0,
)?;
write!(
f,
"capitalized word percentage: {:.0}%",
"capitalized word percentage: {:.2}%",
(self.capitalized_count as f32 / word_count) * 100.0,
)
}