From c459fe6d79d971881e52f7bfb65e3930036a3b78 Mon Sep 17 00:00:00 2001 From: JestDotty Date: Sat, 4 Oct 2025 09:58:40 -0400 Subject: [PATCH] more tests --- jest_rust/Cargo.toml | 3 +- jest_rust/README.md | 120 +++--------------- jest_rust/src/main.rs | 26 ++-- jest_rust/src/parser/for_loops.rs | 10 +- .../src/parser/for_loops_forbidden_only.rs | 14 ++ jest_rust/src/parser/mod.rs | 5 +- jest_rust/src/stats.rs | 4 +- 7 files changed, 54 insertions(+), 128 deletions(-) create mode 100644 jest_rust/src/parser/for_loops_forbidden_only.rs diff --git a/jest_rust/Cargo.toml b/jest_rust/Cargo.toml index 4143cb2..3ce4afa 100644 --- a/jest_rust/Cargo.toml +++ b/jest_rust/Cargo.toml @@ -9,6 +9,7 @@ tokio = { version = "1.44.1", features = ["full"] } [profile.release] codegen-units = 1 # less means more compile work but better optimized -lto = "thin" # thin has best performance. fat the worst +lto = "fat" # thin vs fat doesn't change much (see README) strip = true +# opt-level = "z" # slows down panic = "abort" diff --git a/jest_rust/README.md b/jest_rust/README.md index fa8dc67..21af281 100644 --- a/jest_rust/README.md +++ b/jest_rust/README.md @@ -4,116 +4,28 @@ extract `../books.tar.gz` # local machine benchmarks -single threaded: `Time Jest Rust: 33.63373279571533` +single threaded: `33.63373279571533` -rayon: `Time Jest Rust: 4.294418811798096` +rayon: `4.294418811798096` -tokio: `Time Jest Rust: 4.717588901519775` +tokio: `4.717588901519775` + +tokio: + +muncher: `2486ms` + +for_loops: `1227ms` + +for_loops_forbidden_only: `987ms` + +trie creation and stats accumulation take 0ms ## compile options benchmarks -lto not thin: `Time Jest Rust: 5.306957483291626` slower +`lto` thin, fat doesn't change much -lto fat: `Time Jest Rust: 5.413678407669067` slower +`codegen-units` 0, 1 doesn't change much -codegen-units 1: `Time Jest Rust: 4.451631546020508` faster - -opt-level z: `Time Jest Rust: 7.045313119888306` slower - -strip true: `Time Jest Rust: 4.337219476699829` faster - -lto
true: `Time Jest Rust: 4.703521728515625` slower - -lto none: `Time Jest Rust: 4.817203998565674` - -lto thin: `Time Jest Rust: 4.429729223251343` faster - -# data integrity -(this isn't tested, just guessed, and I don't have data to compare it with) - -for loops: -``` -file count: 904 -failed file count: 0 -sentence count: 5602301 -word count: 81701260 -capitalized count: 1753639 -numeric count: 14981248 -forbidden count: 1237059 -words per sentence average: 14.6 -forbidden word percentage: 2% -capitalized word percentage: 2% - -benchmark: 5033ms -``` - -muncher: -``` -file count: 904 -failed file count: 0 -sentence count: 5338705 -word count: 86765116 -capitalized count: 13640820 -numeric count: 10902254 -forbidden count: 0 -words per sentence average: 16.3 -forbidden word percentage: 0% -capitalized word percentage: 16% - -benchmark: 504ms -``` -with forbidden words: -``` -file count: 904 -failed file count: 0 -sentence count: 5338705 -word count: 86765116 -capitalized count: 13640820 -numeric count: 10902254 -forbidden count: 279717 -words per sentence average: 16.3 -forbidden word percentage: 0% -capitalized word percentage: 16% - -benchmark: 6078ms -``` - -# forbidden words benchmarks -seems they take up about 4000ms to churn through in the original version - -for loops count forbidden word once only: -``` -file count: 904 -failed file count: 0 -sentence count: 5602301 -word count: 81701260 -capitalized count: 1753639 -numeric count: 14981248 -forbidden count: 1143234 -words per sentence average: 14.6 -forbidden word percentage: 1% -capitalized word percentage: 2% - -benchmark: 4737ms -``` -for loops with trie: -``` -file count: 904 -failed file count: 0 -sentence count: 5602301 -word count: 81701260 -capitalized count: 1753639 -numeric count: 14981248 -forbidden count: 176528 -words per sentence average: 14.6 -forbidden word percentage: 0% -capitalized word percentage: 2% - -benchmark: 1588ms -``` - -muncher with trie is 2600ms - -for loops with fxhash trie: 
1200ms +`opt-level = "z"` slows things down # ubuntu terminal running https://snek.molodetz.nl/terminal.html ubuntu running thing instructions: diff --git a/jest_rust/src/main.rs b/jest_rust/src/main.rs index de2051c..4c1614a 100644 --- a/jest_rust/src/main.rs +++ b/jest_rust/src/main.rs @@ -54,32 +54,32 @@ static FORBIDDEN_WORDS: LazyLock = LazyLock::new(|| { #[tokio::main] async fn main() { let files = env::args().skip(1); - let mut stats = Stats::default(); let mut rx = { let (tx, rx) = mpsc::unbounded_channel(); for file in files { - //reading files not sequentially average shaves 30ms (of 1250ms), and that's on a NVMe SSD so why not - if let Ok(text) = fs::read_to_string(&file) { - stats.file_count += 1; - let tx = tx.clone(); - tokio::spawn(async move { - let mut stats = Stats::default(); + let tx = tx.clone(); + tokio::spawn(async move { + let mut stats = Stats::default(); + //reading files in threads doesn't change speed of any sort but oh well + if let Ok(text) = fs::read_to_string(&file) { + stats.file_count += 1; parser::for_loops::parse(&mut stats, &text); - let _ = tx.send(stats); - }); - } else { - stats.failed_file_count += 1; - } + } else { + stats.failed_file_count += 1; + } + let _ = tx.send(stats); + }); } rx }; + let mut stats = Stats::default(); while let Some(file_stat) = rx.recv().await { stats += file_stat; } println!("{stats}"); } -/// needs ../books.tar.gz to be extracted +/// needs ../books.tar.gz to be extracted into ../books #[test] fn test() { use std::{env, fs, process::Command, time::Instant}; diff --git a/jest_rust/src/parser/for_loops.rs b/jest_rust/src/parser/for_loops.rs index c0d23ef..eb01bd6 100644 --- a/jest_rust/src/parser/for_loops.rs +++ b/jest_rust/src/parser/for_loops.rs @@ -11,7 +11,7 @@ pub fn parse(stats: &mut Stats, text: &str) { { stats.sentence_count += 1; for word in sentence - .split_whitespace() + .split_ascii_whitespace() .map(|s| s.trim()) .filter(|s| !s.is_empty()) { @@ -21,17 +21,15 @@ pub fn parse(stats: &mut
Stats, text: &str) { for char in word.chars() { if char.is_numeric() { stats.numeric_count += 1; - //TODO are numbers capitalized or not? I don't know! - } - if !char.is_ascii_uppercase() { + all_capitalized = false; + } else if !char.is_ascii_uppercase() { all_capitalized = false; } } if all_capitalized { stats.capitalized_count += 1; } - let lowercase_word = word.to_lowercase(); - if FORBIDDEN_WORDS.contains(&lowercase_word) { + if FORBIDDEN_WORDS.contains(&word.to_lowercase()) { stats.forbidden_count += 1; } } diff --git a/jest_rust/src/parser/for_loops_forbidden_only.rs b/jest_rust/src/parser/for_loops_forbidden_only.rs new file mode 100644 index 0000000..e0818f4 --- /dev/null +++ b/jest_rust/src/parser/for_loops_forbidden_only.rs @@ -0,0 +1,14 @@ +use crate::{FORBIDDEN_WORDS, stats::Stats}; + +#[allow(dead_code)] +pub fn parse(stats: &mut Stats, text: &str) { + for word in text + .split_ascii_whitespace() + .map(|s| s.trim()) + .filter(|s| !s.is_empty()) + { + if FORBIDDEN_WORDS.contains(&word.to_lowercase()) { + stats.forbidden_count += 1; + } + } +} diff --git a/jest_rust/src/parser/mod.rs b/jest_rust/src/parser/mod.rs index a07edad..aacab99 100644 --- a/jest_rust/src/parser/mod.rs +++ b/jest_rust/src/parser/mod.rs @@ -1,2 +1,3 @@ -pub mod muncher; -pub mod for_loops; \ No newline at end of file +pub mod for_loops; +pub mod for_loops_forbidden_only; +pub mod muncher; \ No newline at end of file diff --git a/jest_rust/src/stats.rs b/jest_rust/src/stats.rs index 8be9453..3d6fc54 100644 --- a/jest_rust/src/stats.rs +++ b/jest_rust/src/stats.rs @@ -46,12 +46,12 @@ impl Display for Stats { )?; writeln!( f, - "forbidden word percentage: {:.0}%", + "forbidden word percentage: {:.2}%", (self.forbidden_count as f32 / word_count) * 100.0, )?; write!( f, - "capitalized word percentage: {:.0}%", + "capitalized word percentage: {:.2}%", (self.capitalized_count as f32 / word_count) * 100.0, ) }