more tests

2025-10-04 09:58:40 -04:00 · 2025-10-04 09:58:40 -04:00 · c459fe6d79
commit c459fe6d79
parent d73d4ff7c1
7 changed files with 54 additions and 128 deletions
--- a/jest_rust/Cargo.toml
+++ b/jest_rust/Cargo.toml
@ -9,6 +9,7 @@ tokio = { version = "1.44.1", features = ["full"] }
 [profile.release]
 codegen-units = 1 # less means more compile work but better optimized
-lto = "thin"      # thin has best performance. fat the worst
+lto = "fat"      # thin vs. fat makes little measurable difference in the benchmarks (see README)
 strip = true
 # opt-level = "z" # slows down
 panic = "abort"
--- a/jest_rust/README.md
+++ b/jest_rust/README.md
@ -4,116 +4,28 @@ extract `../books.tar.gz`
 # local machine benchmarks
-single threaded: `Time Jest Rust: 33.63373279571533`
+single threaded: `33.63373279571533`
-rayon: `Time Jest Rust: 4.294418811798096`
+rayon: `4.294418811798096`
-tokio: `Time Jest Rust: 4.717588901519775`
+tokio: `4.717588901519775`
 tokio:
 muncher: `2486ms`
 for_loops: `1227ms`
 for_loops_forbidden_only: `987ms`
 trie creation and stats accumulation take 0ms
 ## compile options benchmarks
-lto not thin: `Time Jest Rust: 5.306957483291626` slower
+`lto`: `thin` vs. `fat` doesn't change much
-lto fat: `Time Jest Rust: 5.413678407669067` slower
+`codegen-units`: 0 vs. 1 doesn't change much
-codegen-units 1: `Time Jest Rust: 4.451631546020508` faster
+`opt-level = "z"` slows things down
 opt-level z: `Time Jest Rust: 7.045313119888306` slower
 strip true: `Time Jest Rust: 4.337219476699829` faster
 lto true: `Time Jest Rust: 4.703521728515625` slower
 lto none: `Time Jest Rust: 4.817203998565674`
 lto thin: `Time Jest Rust: 4.429729223251343` faster
 # data integrity
 (this isn't tested, just guessed, and I don't have data to compare it with)
 for loops:
 ```
 file count: 904
 failed file count: 0
 sentence count: 5602301
 word count: 81701260
 capitalized count: 1753639
 numeric count: 14981248
 forbidden count: 1237059
 words per sentence average: 14.6
 forbidden word percentage: 2%
 capitalized word percentage: 2%
 benchmark: 5033ms
 ```
 muncher:
 ```
 file count: 904
 failed file count: 0
 sentence count: 5338705
 word count: 86765116
 capitalized count: 13640820
 numeric count: 10902254
 forbidden count: 0
 words per sentence average: 16.3
 forbidden word percentage: 0%
 capitalized word percentage: 16%
 benchmark: 504ms
 ```
 with forbidden words:
 ```
 file count: 904
 failed file count: 0
 sentence count: 5338705
 word count: 86765116
 capitalized count: 13640820
 numeric count: 10902254
 forbidden count: 279717
 words per sentence average: 16.3
 forbidden word percentage: 0%
 capitalized word percentage: 16%
 benchmark: 6078ms
 ```
 # forbidden words benchmarks
 It seems the forbidden words take about 4000ms to churn through in the original version.
 for loops count forbidden word once only:
 ```
 file count: 904
 failed file count: 0
 sentence count: 5602301
 word count: 81701260
 capitalized count: 1753639
 numeric count: 14981248
 forbidden count: 1143234
 words per sentence average: 14.6
 forbidden word percentage: 1%
 capitalized word percentage: 2%
 benchmark: 4737ms
 ```
 for loops with trie:
 ```
 file count: 904
 failed file count: 0
 sentence count: 5602301
 word count: 81701260
 capitalized count: 1753639
 numeric count: 14981248
 forbidden count: 176528
 words per sentence average: 14.6
 forbidden word percentage: 0%
 capitalized word percentage: 2%
 benchmark: 1588ms
 ```
 muncher with trie is 2600ms
 for loops with fxhash trie: 1200ms
 # ubuntu terminal running
 Instructions for running this in the Ubuntu terminal at https://snek.molodetz.nl/terminal.html:
--- a/jest_rust/src/main.rs
+++ b/jest_rust/src/main.rs
@ -54,32 +54,32 @@ static FORBIDDEN_WORDS: LazyLock<Trie> = LazyLock::new(|| {
 #[tokio::main]
 async fn main() {
 	let files = env::args().skip(1);
 	let mut stats = Stats::default();
 	let mut rx = {
 		let (tx, rx) = mpsc::unbounded_channel();
 		for file in files {
-			//reading files not sequentially average shaves 30ms (of 1250ms), and that's on a NVMe SSD so why not
+			let tx = tx.clone();
-			if let Ok(text) = fs::read_to_string(&file) {
+			tokio::spawn(async move {
-				stats.file_count += 1;
+				let mut stats = Stats::default();
-				let tx = tx.clone();
+				// reading the file inside the spawned task makes no measurable speed difference, but it is harmless
-				tokio::spawn(async move {
+				if let Ok(text) = fs::read_to_string(&file) {
-					let mut stats = Stats::default();
+					stats.file_count += 1;
 					parser::for_loops::parse(&mut stats, &text);
-					let _ = tx.send(stats);
+				} else {
-				});
+					stats.failed_file_count += 1;
-			} else {
+				}
-				stats.failed_file_count += 1;
+				let _ = tx.send(stats);
-			}
+			});
 		}
 		rx
 	};
 	let mut stats = Stats::default();
 	while let Some(file_stat) = rx.recv().await {
 		stats += file_stat;
 	}
 	println!("{stats}");
 }
-/// needs ../books.tar.gz to be extracted
+/// needs ../books.tar.gz to be extracted into ../books
 #[test]
 fn test() {
 	use std::{env, fs, process::Command, time::Instant};
--- a/jest_rust/src/parser/for_loops.rs
+++ b/jest_rust/src/parser/for_loops.rs
@ -11,7 +11,7 @@ pub fn parse(stats: &mut Stats, text: &str) {
 	{
 		stats.sentence_count += 1;
 		for word in sentence
-			.split_whitespace()
+			.split_ascii_whitespace()
 			.map(|s| s.trim())
 			.filter(|s| !s.is_empty())
 		{
@ -21,17 +21,15 @@ pub fn parse(stats: &mut Stats, text: &str) {
 			for char in word.chars() {
 				if char.is_numeric() {
 					stats.numeric_count += 1;
-					//TODO are numbers capitalized or not? I don't know!
+					all_capitalized = false;
-				}
+				} else if !char.is_ascii_uppercase() {
 				if !char.is_ascii_uppercase() {
 					all_capitalized = false;
 				}
 			}
 			if all_capitalized {
 				stats.capitalized_count += 1;
 			}
-			let lowercase_word = word.to_lowercase();
+			if FORBIDDEN_WORDS.contains(&word.to_lowercase()) {
 			if FORBIDDEN_WORDS.contains(&lowercase_word) {
 				stats.forbidden_count += 1;
 			}
 		}
--- a/jest_rust/src/parser/for_loops_forbidden_only.rs
+++ b/jest_rust/src/parser/for_loops_forbidden_only.rs
@ -0,0 +1,14 @@
 use crate::{FORBIDDEN_WORDS, stats::Stats};
 /// Counts only the forbidden words in `text`.
 ///
 /// `text` is split on ASCII whitespace, each token is trimmed and empty
 /// tokens are dropped. Every remaining token whose lowercased form is
 /// contained in `FORBIDDEN_WORDS` increments `stats.forbidden_count` by one.
 /// No other field of `stats` is touched.
 #[allow(dead_code)]
 pub fn parse(stats: &mut Stats, text: &str) {
 	text.split_ascii_whitespace()
 		// `trim` also strips non-ASCII whitespace that the ASCII split keeps.
 		.map(str::trim)
 		.filter(|token| !token.is_empty())
 		.filter(|token| FORBIDDEN_WORDS.contains(&token.to_lowercase()))
 		.for_each(|_| stats.forbidden_count += 1);
 }
--- a/jest_rust/src/parser/mod.rs
+++ b/jest_rust/src/parser/mod.rs
@ -1,2 +1,3 @@
-pub mod muncher;
+pub mod for_loops;
-pub mod for_loops;
+pub mod for_loops_forbidden_only;
 pub mod muncher;
--- a/jest_rust/src/stats.rs
+++ b/jest_rust/src/stats.rs
@ -46,12 +46,12 @@ impl Display for Stats {
 		)?;
 		writeln!(
 			f,
-			"forbidden word percentage: {:.0}%",
+			"forbidden word percentage: {:.2}%",
 			(self.forbidden_count as f32 / word_count) * 100.0,
 		)?;
 		write!(
 			f,
-			"capitalized word percentage: {:.0}%",
+			"capitalized word percentage: {:.2}%",
 			(self.capitalized_count as f32 / word_count) * 100.0,
 		)
 	}