diff --git a/jest_rust/README.md b/jest_rust/README.md index d90b2bf..ff93277 100644 --- a/jest_rust/README.md +++ b/jest_rust/README.md @@ -97,4 +97,37 @@ opt-level z: `Time Jest Rust: 7.045313119888306` slower strip true: `Time Jest Rust: 4.337219476699829` faster lto true: `Time Jest Rust: 4.703521728515625` slower lto none: `Time Jest Rust: 4.817203998565674` -lto thin: `Time Jest Rust: 4.429729223251343` faster \ No newline at end of file +lto thin: `Time Jest Rust: 4.429729223251343` faster + +# data integrity +(this isn't tested, just guessed, and I don't have data to compare it with) +for loops: +``` +file count: 904 +failed file count: 0 +sentence count: 5602301 +word count: 81701260 +capitalized count: 1753639 +numeric count: 14981248 +forbidden count: 1237059 +words per sentence average: 14.6 +forbidden word percentage: 2% +capitalized word percentage: 2% + +benchmark: 5033ms +``` +muncher: +``` +file count: 904 +failed file count: 0 +sentence count: 5338705 +word count: 86765116 +capitalized count: 13640820 +numeric count: 10902254 +forbidden count: 0 +words per sentence average: 16.3 +forbidden word percentage: 0% +capitalized word percentage: 16% + +benchmark: 504ms +``` \ No newline at end of file diff --git a/jest_rust/src/main.rs b/jest_rust/src/main.rs index 7482fd0..19a93fd 100644 --- a/jest_rust/src/main.rs +++ b/jest_rust/src/main.rs @@ -58,6 +58,79 @@ impl Stats { return; }; self.file_count += 1; + self.muncher(&text); + // self.for_loops(&text); + } + #[allow(dead_code)] + /// probably buggy. for example, are new lines sentences? what if the text has no last period? + /// 500ms is without forbidden words check, but... + /// 6000ms if adding forbidden words.. 
so not faster + fn muncher(&mut self, text: &str) { + let mut capitalized = true; + let mut whitespaced = false; + let mut dotted = false; + let mut word = String::new(); + for char in text.chars() { + if whitespaced { + if !char.is_whitespace() { + whitespaced = false; //end whiteness + } + continue; + } else if char.is_whitespace() { + whitespaced = true; + self.word_count += 1; //end of word + if capitalized { + self.capitalized_count += 1; + } else { + //reset capitalized word + capitalized = true; + } + let lowercase_word = word.to_lowercase(); + for forbidden_word in FORBIDDEN_WORDS { + if lowercase_word.contains(forbidden_word) { + self.forbidden_count += 1; + } + } + word = String::new(); + continue; + } + if dotted { + if char != '.' { + dotted = false; //end sentencing + } + continue; + } else if char == '.' { + dotted = true; + self.sentence_count += 1; + self.word_count += 1; //end of word + if capitalized { + self.capitalized_count += 1; + } else { + //reset capitalized word + capitalized = true; + } + let lowercase_word = word.to_lowercase(); + for forbidden_word in FORBIDDEN_WORDS { + if lowercase_word.contains(forbidden_word) { + self.forbidden_count += 1; + } + } + word = String::new(); + continue; + } + word += &char.to_string(); + if char.is_numeric() { + self.numeric_count += 1; + capitalized = false; + } + if !char.is_ascii_uppercase() { + capitalized = false; + } + } + } + #[allow(dead_code)] + /// typically 5000ms + fn for_loops(&mut self, text: &str) { for sentence in text .split('.') .map(|s| s.trim()) @@ -75,6 +148,7 @@ impl Stats { for char in word.chars() { if char.is_numeric() { self.numeric_count += 1; + //TODO are numbers capitalized or not? I don't know! 
/// End-to-end benchmark: builds the release binary, then runs it over every
/// entry of `../books` and prints its output plus the wall-clock run time.
///
/// Paths are relative to the current working directory (printed first).
/// The test fails if the build cannot be launched or exits unsuccessfully,
/// if `../books` cannot be read, or if the binary cannot be launched or exits
/// unsuccessfully; previously those conditions were only printed and the test
/// still passed.
#[test]
fn books_test() {
    use std::{env, fs, process::Command, time::Instant};
    println!(
        "cwd: {}",
        env::current_dir()
            .expect("current working directory is not accessible")
            .display()
    );

    //compile
    let build = Command::new("cargo")
        .arg("build")
        .arg("--release")
        .output()
        .expect("failed to launch `cargo build --release`");
    println!("compiled {}", String::from_utf8_lossy(&build.stdout));
    // cargo reports build progress and diagnostics on stderr, not stdout
    eprint!("{}", String::from_utf8_lossy(&build.stderr));
    // `output()` returning Ok only means the process ran; check how it exited.
    assert!(
        build.status.success(),
        "cargo build --release failed: {}",
        build.status
    );

    //get test files
    // Paths stay as PathBuf so non-UTF-8 file names do not abort the test.
    let files = fs::read_dir("../books")
        .expect("cannot read directory ../books")
        .map(|entry| {
            entry
                .expect("cannot read an entry of ../books")
                .path()
                .canonicalize()
                .expect("cannot canonicalize a path in ../books")
        })
        .collect::<Vec<_>>();
    println!("test files found: {:#?}", files);

    //benchmark run
    let benchmark = Instant::now();
    let run = Command::new("target/release/jisspam")
        .args(&files)
        .output()
        .expect("failed to launch target/release/jisspam");
    // Stop the clock before printing so output formatting is not measured.
    let elapsed = benchmark.elapsed();
    println!("{}", String::from_utf8_lossy(&run.stdout));
    assert!(
        run.status.success(),
        "run failed: {}\n{}",
        run.status,
        String::from_utf8_lossy(&run.stderr)
    );
    println!("benchmark: {}ms", elapsed.as_millis());
}