diff --git a/jest_rust/.gitignore b/jest_rust/.gitignore index 4fffb2f..f7e9d5a 100644 --- a/jest_rust/.gitignore +++ b/jest_rust/.gitignore @@ -1,2 +1,3 @@ /target /Cargo.lock +/test_books diff --git a/jest_rust/README.md b/jest_rust/README.md index 5d4364b..fa8dc67 100644 --- a/jest_rust/README.md +++ b/jest_rust/README.md @@ -1,95 +1,15 @@ for https://retoor.molodetz.nl/retoor/isspam -https://snek.molodetz.nl/terminal.html ubuntu running thing instructions: -``` -mkdir /project -cd /project -git clone https://retoor.molodetz.nl/retoor/isspam.git -apt install valgrind curl -export RUSTUP_HOME=/project/.rustup -export CARGO_HOME=/project/.cargo -curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -. "/project/.cargo/env" -cd isspam -rustup install nightly -rustup default nightly -make -make benchmark -python3 bench.py -``` - -clone: `git clone https://gitlab.com/jestdotty-group/draft/jisspam.git jest_rust` - -edit make: `vi makefile` and add build: -``` -build_jest: - @echo "compiling jest_rust project" - cd jest_rust && cargo build --release && cp target/release/jisspam .. -``` -append to all script: -``` -all: build run valgrind build_risspam run_risspam build_cpp build_borded_cpp build_py build_jest -``` - -add to bench: `vi bench.py` -```py -time_start = time.time() -subprocess.check_output('./jisspam books/*.txt', shell=True) -print("Time Jest Rust:", time.time() - time_start) -``` - -run: `python3 bench.py` -output looks something like this: -``` -***benchmarking*** -Time C: 31.315868377685547 -Time Rust: 41.232205867767334 -Time CPP: 20.1683189868927 -Time Borded CPP: 15.468477964401245 -Time Jest Rust: 54.74523115158081 -Time Retoor Python: 287.63036131858826 -***end benchmark*** -``` - -add `/jisspam` to `.gitignore` to not commit the executable accidentally +extract `../books.tar.gz` # local machine benchmarks -single threaded: -``` -***benchmarking*** -Time C: 2.4082751274108887 -Time Rust: 2.865687847137451 -Time CPP: 1.1568822860717773 -Time Borded CPP: 1.9657189846038818 -Time Jest Rust: 33.63373279571533 -Time Retoor Python: 133.92413425445557 -***end benchmark*** -``` +single threaded: `Time Jest Rust: 33.63373279571533` -rayon: -``` -***benchmarking*** -Time C: 2.457853317260742 -Time Rust: 3.0170154571533203 -Time CPP: 1.1482579708099365 -Time Borded CPP: 2.002591371536255 -Time Jest Rust: 4.294418811798096 -Time Retoor Python: 201.2997748851776 -***end benchmark*** -``` +rayon: `Time Jest Rust: 4.294418811798096` + +tokio: `Time Jest Rust: 4.717588901519775` -tokio: -``` -***benchmarking*** -Time C: 2.448648452758789 -Time Rust: 3.095592737197876 -Time CPP: 1.1662013530731201 -Time Borded CPP: 1.9207634925842285 -Time Jest Rust: 4.717588901519775 -Time Retoor Python: 139.8203284740448 -***end benchmark*** -``` ## compile options benchmarks lto not thin: `Time Jest Rust: 5.306957483291626` slower @@ -109,6 +29,7 @@ lto thin: `Time Jest Rust: 4.429729223251343` faster # data integrity (this isn't tested, just guessed, and I don't have data to compare it with) + for loops: ``` file count: 904 @@ -193,3 +114,57 @@ benchmark: 1588ms muncher with trie is 2600ms for loops with fxhash trie: 1200ms + +# ubuntu terminal running +https://snek.molodetz.nl/terminal.html ubuntu running thing instructions: +``` +mkdir /project +cd /project +git clone https://retoor.molodetz.nl/retoor/isspam.git +apt install valgrind curl +export RUSTUP_HOME=/project/.rustup +export CARGO_HOME=/project/.cargo +curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh +. "/project/.cargo/env" +cd isspam +rustup install nightly +rustup default nightly +make +make benchmark +python3 bench.py +``` + +clone: `git clone https://gitlab.com/jestdotty-group/draft/jisspam.git jest_rust` + +edit make: `vi makefile` and add build: +``` +build_jest: + @echo "compiling jest_rust project" + cd jest_rust && cargo build --release && cp target/release/jisspam .. +``` +append to all script: +``` +all: build run valgrind build_risspam run_risspam build_cpp build_borded_cpp build_py build_jest +``` + +add to bench: `vi bench.py` +```py +time_start = time.time() +subprocess.check_output('./jisspam books/*.txt', shell=True) +print("Time Jest Rust:", time.time() - time_start) +``` + +run: `python3 bench.py` +output looks something like this: +``` +***benchmarking*** +Time C: 31.315868377685547 +Time Rust: 41.232205867767334 +Time CPP: 20.1683189868927 +Time Borded CPP: 15.468477964401245 +Time Jest Rust: 54.74523115158081 +Time Retoor Python: 287.63036131858826 +***end benchmark*** +``` + +add `/jisspam` to `.gitignore` to not commit the executable accidentally \ No newline at end of file diff --git a/jest_rust/src/main.rs b/jest_rust/src/main.rs index 1e0609e..de2051c 100644 --- a/jest_rust/src/main.rs +++ b/jest_rust/src/main.rs @@ -1,3 +1,4 @@ +mod parser; mod stats; mod trie; @@ -50,114 +51,6 @@ static FORBIDDEN_WORDS: LazyLock = LazyLock::new(|| { trie }); -impl Stats { - pub fn process(&mut self, text: &str) { - // self.muncher(&text); - self.for_loops(&text); - } - #[allow(dead_code)] - /// probably buggy. for example, are new lines sentences? what if the text has no last period? - /// 500ms is without forbidden words check, but... - /// 6000ms if adding forbidden words.. so not faster - /// with trie this is 2600ms - fn muncher(&mut self, text: &str) { - let mut capitalized = true; - let mut whitespaced = false; - let mut dotted = false; - let mut word = String::new(); - for char in text.chars() { - if whitespaced { - if !char.is_whitespace() { - whitespaced = false; //end whiteness - } - continue; - } else if char.is_whitespace() { - whitespaced = true; - self.word_count += 1; //end of word - if capitalized { - self.capitalized_count += 1; - } else { - //reset capitalized word - capitalized = true; - } - let lowercase_word = word.to_lowercase(); - if FORBIDDEN_WORDS.contains(&lowercase_word) { - self.forbidden_count += 1; - } - word = String::new(); - continue; - } - if dotted { - if char != '.' { - dotted = false; //end sentencing - } - continue; - } else if char == '.' { - dotted = true; - self.sentence_count += 1; - self.word_count += 1; //end of word - if capitalized { - self.capitalized_count += 1; - } else { - //reset capitalized word - capitalized = true; - } - let lowercase_word = word.to_lowercase(); - if FORBIDDEN_WORDS.contains(&lowercase_word) { - self.forbidden_count += 1; - } - word = String::new(); - continue; - } - word += &char.to_string(); - if char.is_numeric() { - self.numeric_count += 1; - capitalized = false; - } - if !char.is_ascii_uppercase() { - capitalized = false; - } - } - } - #[allow(dead_code)] - /// typically 5000ms - /// with trie this is 1600ms - fn for_loops(&mut self, text: &str) { - for sentence in text - .split('.') - .map(|s| s.trim()) - .filter(|s| !s.is_empty()) - { - self.sentence_count += 1; - for word in sentence - .split_whitespace() - .map(|s| s.trim()) - .filter(|s| !s.is_empty()) - { - self.word_count += 1; - //get all numbers counted - let mut all_capitalized = true; - for char in word.chars() { - if char.is_numeric() { - self.numeric_count += 1; - //TODO are numbers capitalized or not? I don't know! - } - if !char.is_ascii_uppercase() { - all_capitalized = false; - } - } - if all_capitalized { - self.capitalized_count += 1; - } - let lowercase_word = word.to_lowercase(); - if FORBIDDEN_WORDS.contains(&lowercase_word) { - self.forbidden_count += 1; - } - } - } - } -} - #[tokio::main] async fn main() { let files = env::args().skip(1); @@ -166,17 +59,17 @@ async fn main() { let (tx, rx) = mpsc::unbounded_channel(); for file in files { //reading files not sequentially average shaves 30ms (of 1250ms), and that's on a NVMe SSD so why not - let Ok(text) = fs::read_to_string(&file) else { + if let Ok(text) = fs::read_to_string(&file) { + stats.file_count += 1; + let tx = tx.clone(); + tokio::spawn(async move { + let mut stats = Stats::default(); + parser::for_loops::parse(&mut stats, &text); + let _ = tx.send(stats); + }); + } else { stats.failed_file_count += 1; - continue; - }; - stats.file_count += 1; - let tx = tx.clone(); - tokio::spawn(async move { - let mut stats = Stats::default(); - stats.process(&text); - tx.send(stats).unwrap(); - }); + } } rx }; @@ -186,6 +79,7 @@ async fn main() { println!("{stats}"); } +/// needs ../books.tar.gz to be extracted #[test] fn test() { use std::{env, fs, process::Command, time::Instant}; @@ -199,44 +93,6 @@ fn test() { Err(err) => eprintln!("compile failed: {err}"), } - //get test files - let files = fs::read_dir("test_files") - .unwrap() - .map(|f| { - f.unwrap() - .path() - .canonicalize() - .unwrap() - .to_str() - .unwrap() - .to_string() - }) - .collect::>(); - println!("test files found: {:#?}", files); - - //benchmark run - let benchmark = Instant::now(); - let mut run = Command::new("target/release/jisspam"); - let run_arged = run.args(files); - match run_arged.output() { - Ok(output) => println!("{}", String::from_utf8_lossy(&output.stdout)), - Err(err) => eprintln!("run failed: {err}"), - } - println!("benchmark: {}ms", benchmark.elapsed().as_millis()); -} -#[test] -fn books_test() { - use std::{env, fs, process::Command, time::Instant}; - println!("cwd: {}", env::current_dir().unwrap().display()); - - //compile - let mut compile = Command::new("cargo"); - let compile_arged = compile.arg("build").arg("--release"); - match compile_arged.output() { - Ok(output) => println!("compiled {}", String::from_utf8_lossy(&output.stdout)), - Err(err) => eprintln!("compile failed: {err}"), - } - //get test files let files = fs::read_dir("../books") .unwrap() @@ -250,7 +106,9 @@ fn books_test() { .to_string() }) .collect::>(); - println!("test files found: {:#?}", files); + println!("test files found: {}", files.len()); + + println!(); //benchmark run let benchmark = Instant::now(); diff --git a/jest_rust/src/parser/for_loops.rs b/jest_rust/src/parser/for_loops.rs new file mode 100644 index 0000000..c0d23ef --- /dev/null +++ b/jest_rust/src/parser/for_loops.rs @@ -0,0 +1,39 @@ +use crate::{FORBIDDEN_WORDS, stats::Stats}; + +#[allow(dead_code)] +/// typically 5000ms +/// with trie this is 1600ms +pub fn parse(stats: &mut Stats, text: &str) { + for sentence in text + .split('.') + .map(|s| s.trim()) + .filter(|s| !s.is_empty()) + { + stats.sentence_count += 1; + for word in sentence + .split_whitespace() + .map(|s| s.trim()) + .filter(|s| !s.is_empty()) + { + stats.word_count += 1; + //get all numbers counted + let mut all_capitalized = true; + for char in word.chars() { + if char.is_numeric() { + stats.numeric_count += 1; + //TODO are numbers capitalized or not? I don't know! + } + if !char.is_ascii_uppercase() { + all_capitalized = false; + } + } + if all_capitalized { + stats.capitalized_count += 1; + } + let lowercase_word = word.to_lowercase(); + if FORBIDDEN_WORDS.contains(&lowercase_word) { + stats.forbidden_count += 1; + } + } + } +} diff --git a/jest_rust/src/parser/mod.rs b/jest_rust/src/parser/mod.rs new file mode 100644 index 0000000..a07edad --- /dev/null +++ b/jest_rust/src/parser/mod.rs @@ -0,0 +1,2 @@ +pub mod muncher; +pub mod for_loops; \ No newline at end of file diff --git a/jest_rust/src/parser/muncher.rs b/jest_rust/src/parser/muncher.rs new file mode 100644 index 0000000..2323691 --- /dev/null +++ b/jest_rust/src/parser/muncher.rs @@ -0,0 +1,66 @@ +use crate::{FORBIDDEN_WORDS, stats::Stats}; + +#[allow(dead_code)] +/// probably buggy. for example, are new lines sentences? what if the text has no last period? +/// 500ms is without forbidden words check, but... +/// 6000ms if adding forbidden words.. so not faster +/// with trie this is 2600ms +pub fn parse(stats: &mut Stats, text: &str) { + let mut capitalized = true; + let mut whitespaced = false; + let mut dotted = false; + let mut word = String::new(); + for char in text.chars() { + if whitespaced { + if !char.is_whitespace() { + whitespaced = false; //end whiteness + } + continue; + } else if char.is_whitespace() { + whitespaced = true; + stats.word_count += 1; //end of word + if capitalized { + stats.capitalized_count += 1; + } else { + //reset capitalized word + capitalized = true; + } + let lowercase_word = word.to_lowercase(); + if FORBIDDEN_WORDS.contains(&lowercase_word) { + stats.forbidden_count += 1; + } + word = String::new(); + continue; + } + if dotted { + if char != '.' { + dotted = false; //end sentencing + } + continue; + } else if char == '.' { + dotted = true; + stats.sentence_count += 1; + stats.word_count += 1; //end of word + if capitalized { + stats.capitalized_count += 1; + } else { + //reset capitalized word + capitalized = true; + } + let lowercase_word = word.to_lowercase(); + if FORBIDDEN_WORDS.contains(&lowercase_word) { + stats.forbidden_count += 1; + } + word = String::new(); + continue; + } + word += &char.to_string(); + if char.is_numeric() { + stats.numeric_count += 1; + capitalized = false; + } + if !char.is_ascii_uppercase() { + capitalized = false; + } + } +}