more tests

parent d73d4ff7c1
commit c459fe6d79
@@ -9,6 +9,7 @@ tokio = { version = "1.44.1", features = ["full"] }
 
 [profile.release]
 codegen-units = 1 # less means more compile work but better optimized
-lto = "thin" # thin has best performance. fat the worst
+lto = "fat" # thin has best performance. fat the worst
 strip = true
 # opt-level = "z" # slows down
+panic = "abort"
@@ -4,116 +4,28 @@ extract `../books.tar.gz`

# local machine benchmarks
-single threaded: `Time Jest Rust: 33.63373279571533`
+single threaded: `33.63373279571533`

-rayon: `Time Jest Rust: 4.294418811798096`
+rayon: `4.294418811798096`

-tokio: `Time Jest Rust: 4.717588901519775`
+tokio: `4.717588901519775`

tokio:

muncher: `2486ms`

for_loops: `1227ms`

for_loops_forbidden_only: `987ms`

trie creation and stats accumulation take 0ms
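The rayon variant isn't shown in this commit; a rough sketch of what it presumably looks like (hypothetical code, assuming `Stats` implements `Default`, `AddAssign` and `Send`, and using the same `parser::for_loops::parse` as in the diff below):

```
use rayon::prelude::*;

use crate::{parser, stats::Stats};

// Hypothetical sketch of the rayon variant: build a Stats per file in
// parallel, then reduce everything into one total.
fn run_rayon(files: Vec<String>) -> Stats {
    files
        .par_iter()
        .map(|file| {
            let mut stats = Stats::default();
            match std::fs::read_to_string(file) {
                Ok(text) => {
                    stats.file_count += 1;
                    parser::for_loops::parse(&mut stats, &text);
                }
                Err(_) => stats.failed_file_count += 1,
            }
            stats
        })
        .reduce(Stats::default, |mut acc, s| {
            acc += s;
            acc
        })
}
```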
## compile options benchmarks

lto not thin: `Time Jest Rust: 5.306957483291626` slower
`lto` thin vs fat doesn't change much

lto fat: `Time Jest Rust: 5.413678407669067` slower
`codegen-units` 0 vs 1 doesn't change much

codegen-units 1: `Time Jest Rust: 4.451631546020508` faster

opt-level z: `Time Jest Rust: 7.045313119888306` slower

strip true: `Time Jest Rust: 4.337219476699829` faster

lto true: `Time Jest Rust: 4.703521728515625` slower

lto none: `Time Jest Rust: 4.817203998565674`

lto thin: `Time Jest Rust: 4.429729223251343` faster
# data integrity
(this isn't tested, just guessed, and I don't have data to compare it with)

for loops:
```
file count: 904
failed file count: 0
sentence count: 5602301
word count: 81701260
capitalized count: 1753639
numeric count: 14981248
forbidden count: 1237059
words per sentence average: 14.6
forbidden word percentage: 2%
capitalized word percentage: 2%

benchmark: 5033ms
```

muncher:
```
file count: 904
failed file count: 0
sentence count: 5338705
word count: 86765116
capitalized count: 13640820
numeric count: 10902254
forbidden count: 0
words per sentence average: 16.3
forbidden word percentage: 0%
capitalized word percentage: 16%

benchmark: 504ms
```

with forbidden words:
```
file count: 904
failed file count: 0
sentence count: 5338705
word count: 86765116
capitalized count: 13640820
numeric count: 10902254
forbidden count: 279717
words per sentence average: 16.3
forbidden word percentage: 0%
capitalized word percentage: 16%

benchmark: 6078ms
```
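The derived numbers in these blocks follow from the raw counts; a quick sanity check against the for_loops block above (14.6 words per sentence, 2% forbidden, 2% capitalized):

```
fn main() {
    let sentence_count = 5_602_301_f64;
    let word_count = 81_701_260_f64;
    let forbidden_count = 1_237_059_f64;
    let capitalized_count = 1_753_639_f64;

    println!("{:.1}", word_count / sentence_count); // 14.6
    println!("{:.0}%", forbidden_count / word_count * 100.0); // 2%
    println!("{:.0}%", capitalized_count / word_count * 100.0); // 2%
}
```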
# forbidden words benchmarks
it seems they take about 4000ms to churn through in the original version

for loops, counting each forbidden word once only:
```
file count: 904
failed file count: 0
sentence count: 5602301
word count: 81701260
capitalized count: 1753639
numeric count: 14981248
forbidden count: 1143234
words per sentence average: 14.6
forbidden word percentage: 1%
capitalized word percentage: 2%

benchmark: 4737ms
```

for loops with trie:
```
file count: 904
failed file count: 0
sentence count: 5602301
word count: 81701260
capitalized count: 1753639
numeric count: 14981248
forbidden count: 176528
words per sentence average: 14.6
forbidden word percentage: 0%
capitalized word percentage: 2%

benchmark: 1588ms
```

muncher with trie is 2600ms

for loops with fxhash trie: 1200ms
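The `Trie` behind `FORBIDDEN_WORDS` isn't part of this diff; a minimal sketch of what a word trie with `insert`/`contains` could look like (hypothetical code; the fxhash variant mentioned above would presumably just swap `HashMap` for `fxhash::FxHashMap`):

```
use std::collections::HashMap;

// Hypothetical character trie for forbidden-word lookup.
#[derive(Default)]
pub struct Trie {
    children: HashMap<char, Trie>,
    is_word: bool,
}

impl Trie {
    pub fn insert(&mut self, word: &str) {
        let mut node = self;
        for c in word.chars() {
            node = node.children.entry(c).or_default();
        }
        node.is_word = true;
    }

    pub fn contains(&self, word: &str) -> bool {
        let mut node = self;
        for c in word.chars() {
            match node.children.get(&c) {
                Some(next) => node = next,
                None => return false,
            }
        }
        node.is_word
    }
}
```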
`opt-level = "z"` slows things down

# ubuntu terminal running
instructions for the Ubuntu-running thing at https://snek.molodetz.nl/terminal.html:
@@ -54,32 +54,32 @@ static FORBIDDEN_WORDS: LazyLock<Trie> = LazyLock::new(|| {
 #[tokio::main]
 async fn main() {
     let files = env::args().skip(1);
-    let mut stats = Stats::default();
     let mut rx = {
         let (tx, rx) = mpsc::unbounded_channel();
         for file in files {
-            //reading files not sequentially average shaves 30ms (of 1250ms), and that's on a NVMe SSD so why not
-            if let Ok(text) = fs::read_to_string(&file) {
-                stats.file_count += 1;
-                let tx = tx.clone();
-                tokio::spawn(async move {
-                    let mut stats = Stats::default();
+            let tx = tx.clone();
+            tokio::spawn(async move {
+                let mut stats = Stats::default();
+                //reading files in threads doesn't change speed of any sort but oh well
+                if let Ok(text) = fs::read_to_string(&file) {
+                    stats.file_count += 1;
                     parser::for_loops::parse(&mut stats, &text);
-                    let _ = tx.send(stats);
-                });
-            } else {
-                stats.failed_file_count += 1;
-            }
+                } else {
+                    stats.failed_file_count += 1;
+                }
+                let _ = tx.send(stats);
+            });
         }
         rx
     };
+    let mut stats = Stats::default();
     while let Some(file_stat) = rx.recv().await {
         stats += file_stat;
     }
     println!("{stats}");
 }

-/// needs ../books.tar.gz to be extracted
+/// needs ../books.tar.gz to be extracted into ../books
 #[test]
 fn test() {
     use std::{env, fs, process::Command, time::Instant};
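`stats += file_stat` implies an `AddAssign` impl for `Stats` that isn't shown in this diff; it presumably just sums the per-file counters, roughly like this (hypothetical sketch; field names follow the parser code and the stats printout in the README above, the real struct may differ):

```
use std::ops::AddAssign;

// Hypothetical sketch: fold one file's Stats into the running total.
impl AddAssign for Stats {
    fn add_assign(&mut self, other: Self) {
        self.file_count += other.file_count;
        self.failed_file_count += other.failed_file_count;
        self.sentence_count += other.sentence_count;
        self.word_count += other.word_count;
        self.capitalized_count += other.capitalized_count;
        self.numeric_count += other.numeric_count;
        self.forbidden_count += other.forbidden_count;
    }
}
```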
@@ -11,7 +11,7 @@ pub fn parse(stats: &mut Stats, text: &str) {
     {
         stats.sentence_count += 1;
         for word in sentence
-            .split_whitespace()
+            .split_ascii_whitespace()
             .map(|s| s.trim())
             .filter(|s| !s.is_empty())
         {
@@ -21,17 +21,15 @@ pub fn parse(stats: &mut Stats, text: &str) {
             for char in word.chars() {
                 if char.is_numeric() {
                     stats.numeric_count += 1;
                     //TODO are numbers capitalized or not? I don't know!
-                }
-                if !char.is_ascii_uppercase() {
-                    all_capitalized = false;
+                } else if !char.is_ascii_uppercase() {
+                    all_capitalized = false;
                 }
             }
             if all_capitalized {
                 stats.capitalized_count += 1;
             }
-            let lowercase_word = word.to_lowercase();
-            if FORBIDDEN_WORDS.contains(&lowercase_word) {
+            if FORBIDDEN_WORDS.contains(&word.to_lowercase()) {
                 stats.forbidden_count += 1;
             }
         }
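The practical effect of the `else if` above: a digit is not `is_ascii_uppercase`, so in the old version a word like `B2B` dropped out of the capitalized count; now numeric characters skip the capitalization check, which is what the TODO is getting at.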
jest_rust/src/parser/for_loops_forbidden_only.rs (new file, 14 lines)
@@ -0,0 +1,14 @@
+use crate::{FORBIDDEN_WORDS, stats::Stats};
+
+#[allow(dead_code)]
+pub fn parse(stats: &mut Stats, text: &str) {
+    for word in text
+        .split_ascii_whitespace()
+        .map(|s| s.trim())
+        .filter(|s| !s.is_empty())
+    {
+        if FORBIDDEN_WORDS.contains(&word.to_lowercase()) {
+            stats.forbidden_count += 1;
+        }
+    }
+}
@@ -1,2 +1,3 @@
-pub mod muncher;
-pub mod for_loops;
+pub mod for_loops;
+pub mod for_loops_forbidden_only;
+pub mod muncher;
@@ -46,12 +46,12 @@ impl Display for Stats {
         )?;
         writeln!(
             f,
-            "forbidden word percentage: {:.0}%",
+            "forbidden word percentage: {:.2}%",
             (self.forbidden_count as f32 / word_count) * 100.0,
         )?;
         write!(
             f,
-            "capitalized word percentage: {:.0}%",
+            "capitalized word percentage: {:.2}%",
             (self.capitalized_count as f32 / word_count) * 100.0,
         )
     }
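For context on the `{:.0}` to `{:.2}` change: the muncher run with forbidden words counts 279717 forbidden out of 86765116 words, which `{:.0}` rounds to `0%` while `{:.2}` prints roughly `0.32%`:

```
fn main() {
    let pct = (279_717_f64 / 86_765_116_f64) * 100.0;
    println!("{pct:.0}%"); // 0%
    println!("{pct:.2}%"); // 0.32%
}
```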