muncher benchmark and data

2025-03-23 23:14:30 -04:00 · 2025-03-23 23:14:30 -04:00 · 854a1c3991
commit 854a1c3991
parent ab32a81d9c
2 changed files with 146 additions and 1 deletions
--- a/jest_rust/README.md
+++ b/jest_rust/README.md
@ -97,4 +97,37 @@ opt-level z: `Time Jest Rust: 7.045313119888306` slower
 strip true: `Time Jest Rust: 4.337219476699829` faster
 lto true: `Time Jest Rust: 4.703521728515625` slower
 lto none: `Time Jest Rust: 4.817203998565674`
-lto thin: `Time Jest Rust: 4.429729223251343` faster
+lto thin: `Time Jest Rust: 4.429729223251343` faster
+
+# data integrity
+(These numbers are unverified: the counting rules are guesses, and I have no reference data to compare them against.)
+for loops:
+```
+file count: 904
+failed file count: 0
+sentence count: 5602301
+word count: 81701260
+capitalized count: 1753639
+numeric count: 14981248
+forbidden count: 1237059
+words per sentence average: 14.6
+forbidden word percentage: 2%
+capitalized word percentage: 2%
+
+benchmark: 5033ms
+```
+muncher (run without the forbidden words check, hence the forbidden count of 0):
+```
+file count: 904
+failed file count: 0
+sentence count: 5338705
+word count: 86765116
+capitalized count: 13640820
+numeric count: 10902254
+forbidden count: 0
+words per sentence average: 16.3
+forbidden word percentage: 0%
+capitalized word percentage: 16%
+
+benchmark: 504ms
+```
--- a/jest_rust/src/main.rs
+++ b/jest_rust/src/main.rs
@ -58,6 +58,79 @@ impl Stats {
 			return;
 		};
 		self.file_count += 1;
+		self.muncher(&text);
+		// self.for_loops(&text);
+	}
+	#[allow(dead_code)]
+	/// Counts words, sentences, capitalized words, numeric characters and
+	/// forbidden words of `text` in a single pass over its characters.
+	///
+	/// Counting rules:
+	/// - A word is a non-empty run of characters delimited by whitespace or `.`;
+	///   a word still open when `text` ends is counted as well.
+	/// - A word is capitalized when every one of its characters is ASCII uppercase
+	///   (so a word containing digits is never capitalized).
+	/// - `numeric_count` counts numeric characters, not words.
+	/// - `forbidden_count` grows once per entry of `FORBIDDEN_WORDS` contained in
+	///   the lowercased word.
+	/// - A sentence ends at a `.`; further dots are ignored until the next word
+	///   character, so `...` counts as a single sentence end.
+	///
+	/// Still undecided: newlines do not end a sentence, and text after the last
+	/// period is not counted as a sentence.
+	///
+	/// Timing noted when this was written: about 500ms without the forbidden
+	/// words check, about 6000ms with it, so not faster overall.
+	fn muncher(&mut self, text: &str) {
+		let mut capitalized = true;
+		let mut dotted = false;
+		let mut word = String::new();
+		// The trailing space is a sentinel: it pushes a final word that has no
+		// delimiter after it through the same end-of-word path as every other word.
+		for ch in text.chars().chain(std::iter::once(' ')) {
+			let is_dot = ch == '.';
+			if is_dot || ch.is_whitespace() {
+				// Only a non-empty buffer is a word; runs of delimiters (e.g. ". ")
+				// must not be counted as empty words.
+				if !word.is_empty() {
+					self.word_count += 1; //end of word
+					if capitalized {
+						self.capitalized_count += 1;
+					}
+					let lowercase_word = word.to_lowercase();
+					for forbidden_word in FORBIDDEN_WORDS {
+						if lowercase_word.contains(forbidden_word) {
+							self.forbidden_count += 1;
+						}
+					}
+					// keep the allocation for the next word
+					word.clear();
+				}
+				//reset capitalized word
+				capitalized = true;
+				if is_dot && !dotted {
+					dotted = true;
+					self.sentence_count += 1;
+				}
+				continue;
+			}
+			// A word character closes the current run of dots.
+			dotted = false;
+			word.push(ch);
+			if ch.is_numeric() {
+				self.numeric_count += 1;
+			}
+			if !ch.is_ascii_uppercase() {
+				capitalized = false;
+			}
+		}
+	}
+	#[allow(dead_code)]
+	/// typically 5000ms
+	fn for_loops(&mut self, text: &str) {
 		for sentence in text
 			.split('.')
 			.map(|s| s.trim())
@ -75,6 +148,7 @@ impl Stats {
 				for char in word.chars() {
 					if char.is_numeric() {
 						self.numeric_count += 1;
+						//TODO are numbers capitalized or not? I don't know!
 					}
 					if !char.is_ascii_uppercase() {
 						all_capitalized = false;
@ -197,3 +271,41 @@ fn test() {
 	}
 	println!("benchmark: {}ms", benchmark.elapsed().as_millis());
 }
+/// Builds the release binary, runs it over every file in `../books` and
+/// prints its output together with the elapsed wall-clock time.
+///
+/// Panics (failing the test) when the build or the run cannot be started or
+/// exits with a non-zero status, when `../books` cannot be read, or when a
+/// directory entry cannot be canonicalized.
+#[test]
+fn books_test() {
+	use std::{env, fs, process::Command, time::Instant};
+	println!("cwd: {}", env::current_dir().unwrap().display());
+
+	//compile
+	let compile = Command::new("cargo")
+		.arg("build")
+		.arg("--release")
+		.output()
+		.unwrap_or_else(|err| panic!("compile failed: {err}"));
+	println!("compiled {}", String::from_utf8_lossy(&compile.stdout));
+	// A started-but-failed build must fail the test instead of being ignored.
+	assert!(
+		compile.status.success(),
+		"compile failed: {}",
+		String::from_utf8_lossy(&compile.stderr)
+	);
+
+	//get test files
+	// Paths are passed on as paths, so non-UTF-8 file names do not panic.
+	let files = fs::read_dir("../books")
+		.unwrap()
+		.map(|f| f.unwrap().path().canonicalize().unwrap())
+		.collect::<Vec<_>>();
+	println!("test files found: {:#?}", files);
+
+	//benchmark run
+	let benchmark = Instant::now();
+	let run = Command::new("target/release/jisspam")
+		.args(&files)
+		.output()
+		.unwrap_or_else(|err| panic!("run failed: {err}"));
+	// Stop the clock before printing so output formatting is not measured.
+	let elapsed = benchmark.elapsed();
+	println!("{}", String::from_utf8_lossy(&run.stdout));
+	assert!(
+		run.status.success(),
+		"run failed: {}",
+		String::from_utf8_lossy(&run.stderr)
+	);
+	println!("benchmark: {}ms", elapsed.as_millis());
+}