reorg

2025-10-04 09:18:03 -04:00 · 2025-10-04 09:18:03 -04:00 · d73d4ff7c1
commit d73d4ff7c1
parent 94b786f83a
6 changed files with 183 additions and 242 deletions
--- a/jest_rust/.gitignore
+++ b/jest_rust/.gitignore
@ -1,2 +1,3 @@
 /target
 /Cargo.lock
+/test_books
--- a/jest_rust/README.md
+++ b/jest_rust/README.md
@ -1,95 +1,15 @@
 for https://retoor.molodetz.nl/retoor/isspam

-https://snek.molodetz.nl/terminal.html ubuntu running thing instructions:
-```
-mkdir /project
-cd /project
-git clone https://retoor.molodetz.nl/retoor/isspam.git
-apt install valgrind curl
-export RUSTUP_HOME=/project/.rustup
-export CARGO_HOME=/project/.cargo
-curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh
-. "/project/.cargo/env"
-cd isspam
-rustup install nightly
-rustup default nightly
-make
-make benchmark
-python3 bench.py
-```
-
-clone: `git clone https://gitlab.com/jestdotty-group/draft/jisspam.git jest_rust`
-
-edit make: `vi makefile` and add build:
-```
-build_jest:
-	@echo "compiling jest_rust project"
-	cd jest_rust && cargo build --release && cp target/release/jisspam ..
-```
-append to all script:
-```
-all: build run valgrind build_risspam run_risspam build_cpp build_borded_cpp build_py build_jest
-```
-
-add to bench: `vi bench.py`
-```py
-time_start = time.time()
-subprocess.check_output('./jisspam books/*.txt', shell=True)
-print("Time Jest Rust:", time.time() - time_start)
-```
-
-run: `python3 bench.py`
-output looks something like this:
-```
-***benchmarking***
-Time C: 31.315868377685547
-Time Rust: 41.232205867767334
-Time CPP: 20.1683189868927
-Time Borded CPP: 15.468477964401245
-Time Jest Rust: 54.74523115158081
-Time Retoor Python: 287.63036131858826
-***end benchmark***
-```
-
-add `/jisspam` to `.gitignore` to not commit the executable accidentally
+extract `../books.tar.gz` first; the test reads its input files from `../books`

 # local machine benchmarks

-single threaded:
-```
-***benchmarking***
-Time C: 2.4082751274108887
-Time Rust: 2.865687847137451
-Time CPP: 1.1568822860717773
-Time Borded CPP: 1.9657189846038818
-Time Jest Rust: 33.63373279571533
-Time Retoor Python: 133.92413425445557
-***end benchmark***
-```
+single threaded: `Time Jest Rust: 33.63373279571533`

-rayon:
-```
-***benchmarking***
-Time C: 2.457853317260742
-Time Rust: 3.0170154571533203
-Time CPP: 1.1482579708099365
-Time Borded CPP: 2.002591371536255
-Time Jest Rust: 4.294418811798096
-Time Retoor Python: 201.2997748851776
-***end benchmark***
-```
+rayon: `Time Jest Rust: 4.294418811798096`
+
+tokio: `Time Jest Rust: 4.717588901519775`

-tokio:
-```
-***benchmarking***
-Time C: 2.448648452758789
-Time Rust: 3.095592737197876
-Time CPP: 1.1662013530731201
-Time Borded CPP: 1.9207634925842285
-Time Jest Rust: 4.717588901519775
-Time Retoor Python: 139.8203284740448
-***end benchmark***
-```
 ## compile options benchmarks
 lto not thin: `Time Jest Rust: 5.306957483291626` slower

@ -109,6 +29,7 @@ lto thin: `Time Jest Rust: 4.429729223251343` faster

 # data integrity
 (this isn't tested, just guessed, and I don't have data to compare it with)
+
 for loops:
 ```
 file count: 904
@ -193,3 +114,57 @@ benchmark: 1588ms
 muncher with trie is 2600ms

 for loops with fxhash trie: 1200ms
+
+# ubuntu terminal running
+instructions for running on the ubuntu terminal at https://snek.molodetz.nl/terminal.html:
+```
+mkdir /project
+cd /project
+git clone https://retoor.molodetz.nl/retoor/isspam.git
+apt install valgrind curl
+export RUSTUP_HOME=/project/.rustup
+export CARGO_HOME=/project/.cargo
+curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh
+. "/project/.cargo/env"
+cd isspam
+rustup install nightly
+rustup default nightly
+make
+make benchmark
+python3 bench.py
+```
+
+clone: `git clone https://gitlab.com/jestdotty-group/draft/jisspam.git jest_rust`
+
+edit make: `vi makefile` and add build:
+```
+build_jest:
+	@echo "compiling jest_rust project"
+	cd jest_rust && cargo build --release && cp target/release/jisspam ..
+```
+append `build_jest` to the `all` target:
+```
+all: build run valgrind build_risspam run_risspam build_cpp build_borded_cpp build_py build_jest
+```
+
+add to bench: `vi bench.py`
+```py
+time_start = time.time()
+subprocess.check_output('./jisspam books/*.txt', shell=True)
+print("Time Jest Rust:", time.time() - time_start)
+```
+
+run: `python3 bench.py`
+output looks something like this:
+```
+***benchmarking***
+Time C: 31.315868377685547
+Time Rust: 41.232205867767334
+Time CPP: 20.1683189868927
+Time Borded CPP: 15.468477964401245
+Time Jest Rust: 54.74523115158081
+Time Retoor Python: 287.63036131858826
+***end benchmark***
+```
+
+add `/jisspam` to `.gitignore` to not commit the executable accidentally
--- a/jest_rust/src/main.rs
+++ b/jest_rust/src/main.rs
@ -1,3 +1,4 @@
+mod parser;
 mod stats;
 mod trie;

@ -50,114 +51,6 @@ static FORBIDDEN_WORDS: LazyLock<Trie> = LazyLock::new(|| {
 	trie
 });

-impl Stats {
-	pub fn process(&mut self, text: &str) {
-		// self.muncher(&text);
-		self.for_loops(&text);
-	}
-	#[allow(dead_code)]
-	/// probably buggy. for example, are new lines sentences? what if the text has no last period?
-	/// 500ms is without forbidden words check, but...
-	/// 6000ms if adding forbidden words.. so not faster
-	/// with trie this is 2600ms
-	fn muncher(&mut self, text: &str) {
-		let mut capitalized = true;
-		let mut whitespaced = false;
-		let mut dotted = false;
-		let mut word = String::new();
-		for char in text.chars() {
-			if whitespaced {
-				if !char.is_whitespace() {
-					whitespaced = false; //end whiteness
-				}
-				continue;
-			} else if char.is_whitespace() {
-				whitespaced = true;
-				self.word_count += 1; //end of word
-				if capitalized {
-					self.capitalized_count += 1;
-				} else {
-					//reset capitalized word
-					capitalized = true;
-				}
-				let lowercase_word = word.to_lowercase();
-				if FORBIDDEN_WORDS.contains(&lowercase_word) {
-					self.forbidden_count += 1;
-				}
-				word = String::new();
-				continue;
-			}
-			if dotted {
-				if char != '.' {
-					dotted = false; //end sentencing
-				}
-				continue;
-			} else if char == '.' {
-				dotted = true;
-				self.sentence_count += 1;
-				self.word_count += 1; //end of word
-				if capitalized {
-					self.capitalized_count += 1;
-				} else {
-					//reset capitalized word
-					capitalized = true;
-				}
-				let lowercase_word = word.to_lowercase();
-				if FORBIDDEN_WORDS.contains(&lowercase_word) {
-					self.forbidden_count += 1;
-				}
-				word = String::new();
-				continue;
-			}
-			word += &char.to_string();
-			if char.is_numeric() {
-				self.numeric_count += 1;
-				capitalized = false;
-			}
-			if !char.is_ascii_uppercase() {
-				capitalized = false;
-			}
-		}
-	}
-	#[allow(dead_code)]
-	/// typically 5000ms
-	/// with trie this is 1600ms
-	fn for_loops(&mut self, text: &str) {
-		for sentence in text
-			.split('.')
-			.map(|s| s.trim())
-			.filter(|s| !s.is_empty())
-		{
-			self.sentence_count += 1;
-			for word in sentence
-				.split_whitespace()
-				.map(|s| s.trim())
-				.filter(|s| !s.is_empty())
-			{
-				self.word_count += 1;
-				//get all numbers counted
-				let mut all_capitalized = true;
-				for char in word.chars() {
-					if char.is_numeric() {
-						self.numeric_count += 1;
-						//TODO are numbers capitalized or not? I don't know!
-					}
-					if !char.is_ascii_uppercase() {
-						all_capitalized = false;
-					}
-				}
-				if all_capitalized {
-					self.capitalized_count += 1;
-				}
-				let lowercase_word = word.to_lowercase();
-				if FORBIDDEN_WORDS.contains(&lowercase_word) {
-					self.forbidden_count += 1;
-				}
-			}
-		}
-	}
-}
-
 #[tokio::main]
 async fn main() {
 	let files = env::args().skip(1);
@ -166,17 +59,17 @@ async fn main() {
 		let (tx, rx) = mpsc::unbounded_channel();
 		for file in files {
 			//reading files not sequentially average shaves 30ms (of 1250ms), and that's on a NVMe SSD so why not
-			let Ok(text) = fs::read_to_string(&file) else {
-				stats.failed_file_count += 1;
-				continue;
-			};
+			if let Ok(text) = fs::read_to_string(&file) {
 				stats.file_count += 1;
 				let tx = tx.clone();
 				tokio::spawn(async move {
 					let mut stats = Stats::default();
-				stats.process(&text);
-				tx.send(stats).unwrap();
+					parser::for_loops::parse(&mut stats, &text);
+					let _ = tx.send(stats);
 				});
+			} else {
+				stats.failed_file_count += 1;
+			}
 		}
 		rx
 	};
@ -186,6 +79,7 @@ async fn main() {
 	println!("{stats}");
 }

+/// needs `../books.tar.gz` to be extracted first: the test files are read from the `../books` directory
 #[test]
 fn test() {
 	use std::{env, fs, process::Command, time::Instant};
@ -199,44 +93,6 @@ fn test() {
 		Err(err) => eprintln!("compile failed: {err}"),
 	}

-	//get test files
-	let files = fs::read_dir("test_files")
-		.unwrap()
-		.map(|f| {
-			f.unwrap()
-				.path()
-				.canonicalize()
-				.unwrap()
-				.to_str()
-				.unwrap()
-				.to_string()
-		})
-		.collect::<Vec<_>>();
-	println!("test files found: {:#?}", files);
-
-	//benchmark run
-	let benchmark = Instant::now();
-	let mut run = Command::new("target/release/jisspam");
-	let run_arged = run.args(files);
-	match run_arged.output() {
-		Ok(output) => println!("{}", String::from_utf8_lossy(&output.stdout)),
-		Err(err) => eprintln!("run failed: {err}"),
-	}
-	println!("benchmark: {}ms", benchmark.elapsed().as_millis());
-}
-#[test]
-fn books_test() {
-	use std::{env, fs, process::Command, time::Instant};
-	println!("cwd: {}", env::current_dir().unwrap().display());
-
-	//compile
-	let mut compile = Command::new("cargo");
-	let compile_arged = compile.arg("build").arg("--release");
-	match compile_arged.output() {
-		Ok(output) => println!("compiled {}", String::from_utf8_lossy(&output.stdout)),
-		Err(err) => eprintln!("compile failed: {err}"),
-	}
-
 	//get test files
 	let files = fs::read_dir("../books")
 		.unwrap()
@ -250,7 +106,9 @@ fn books_test() {
 				.to_string()
 		})
 		.collect::<Vec<_>>();
-	println!("test files found: {:#?}", files);
+	println!("test files found: {}", files.len());
+
+	println!();

 	//benchmark run
 	let benchmark = Instant::now();
--- a/jest_rust/src/parser/for_loops.rs
+++ b/jest_rust/src/parser/for_loops.rs
@ -0,0 +1,39 @@
+use crate::{FORBIDDEN_WORDS, stats::Stats};
+
+#[allow(dead_code)]
+/// Counts sentences, words, numeric chars, all-ASCII-uppercase words and forbidden words of `text` into `stats`.
+/// Timing notes: typically 5000ms; with trie this is 1600ms.
+pub fn parse(stats: &mut Stats, text: &str) {
+	for sentence in text
+		.split('.') //a sentence is whatever sits between two '.' characters
+		.map(|s| s.trim())
+		.filter(|s| !s.is_empty()) //pieces that are empty after trimming (e.g. between "..") are not counted as sentences
+	{
+		stats.sentence_count += 1;
+		for word in sentence
+			.split_whitespace()
+			.map(|s| s.trim()) //NOTE(review): split_whitespace already yields non-empty pieces without surrounding whitespace, so this map and the filter below look redundant
+			.filter(|s| !s.is_empty())
+		{
+			stats.word_count += 1;
+			//count numeric chars and track whether every char of the word is ASCII uppercase
+			let mut all_capitalized = true;
+			for char in word.chars() {
+				if char.is_numeric() {
+					stats.numeric_count += 1;
+					//TODO are numbers capitalized or not? as written, a digit fails the is_ascii_uppercase check below, so the word is not counted as capitalized
+				}
+				if !char.is_ascii_uppercase() {
+					all_capitalized = false; //any char outside A-Z (lowercase, digit, punctuation, non-ASCII) disqualifies the word
+				}
+			}
+			if all_capitalized {
+				stats.capitalized_count += 1;
+			}
+			let lowercase_word = word.to_lowercase(); //the lookup below is done on the lowercased word
+			if FORBIDDEN_WORDS.contains(&lowercase_word) {
+				stats.forbidden_count += 1;
+			}
+		}
+	}
+}
--- a/jest_rust/src/parser/mod.rs
+++ b/jest_rust/src/parser/mod.rs
@ -0,0 +1,2 @@
+pub mod muncher;
+pub mod for_loops;
--- a/jest_rust/src/parser/muncher.rs
+++ b/jest_rust/src/parser/muncher.rs
@ -0,0 +1,66 @@
+use crate::{FORBIDDEN_WORDS, stats::Stats};
+
+#[allow(dead_code)]
+/// Single-pass variant of the parser: walks `text` one char at a time and updates the counters in `stats`.
+/// Probably buggy. For example, are new lines sentences? If the text has no last period (or trailing whitespace), the final word is never counted.
+/// Timing notes: 500ms is without forbidden words check, but 6000ms if adding forbidden words.. so not faster;
+/// with trie this is 2600ms.
+pub fn parse(stats: &mut Stats, text: &str) {
+	let mut capitalized = true; //stays true while every char of the current word is ASCII uppercase
+	let mut whitespaced = false; //true while skipping a run of whitespace
+	let mut dotted = false; //set by a '.', cleared by the next non-'.' char that reaches the dotted check
+	let mut word = String::new(); //chars of the word currently being built
+	for char in text.chars() {
+		if whitespaced {
+			if !char.is_whitespace() {
+				whitespaced = false; //end of the whitespace run
+			}
+			continue; //NOTE(review): this also skips the first char of the next word, so it never reaches `word` or the counters below
+		} else if char.is_whitespace() {
+			whitespaced = true;
+			stats.word_count += 1; //end of word (NOTE(review): also counted when `word` is empty, e.g. whitespace right after a '.')
+			if capitalized {
+				stats.capitalized_count += 1;
+			} else {
+				//reset the flag for the next word
+				capitalized = true;
+			}
+			let lowercase_word = word.to_lowercase(); //the lookup below is done on the lowercased word
+			if FORBIDDEN_WORDS.contains(&lowercase_word) {
+				stats.forbidden_count += 1;
+			}
+			word = String::new(); //start collecting the next word
+			continue;
+		}
+		if dotted {
+			if char != '.' {
+				dotted = false; //end of the run of dots
+			}
+			continue; //NOTE(review): the char that ends the dot run is skipped too
+		} else if char == '.' {
+			dotted = true;
+			stats.sentence_count += 1; //a '.' outside a dot run closes a sentence
+			stats.word_count += 1; //end of word
+			if capitalized {
+				stats.capitalized_count += 1;
+			} else {
+				//reset the flag for the next word
+				capitalized = true;
+			}
+			let lowercase_word = word.to_lowercase(); //the lookup below is done on the lowercased word
+			if FORBIDDEN_WORDS.contains(&lowercase_word) {
+				stats.forbidden_count += 1;
+			}
+			word = String::new(); //start collecting the next word
+			continue;
+		}
+		word += &char.to_string(); //ordinary char: append it to the current word
+		if char.is_numeric() {
+			stats.numeric_count += 1;
+			capitalized = false; //a digit makes the word not capitalized
+		}
+		if !char.is_ascii_uppercase() {
+			capitalized = false; //any char outside A-Z disqualifies the word
+		}
+	}
+}