This commit is contained in:
JestDotty 2025-10-04 09:18:03 -04:00
parent 94b786f83a
commit d73d4ff7c1
6 changed files with 183 additions and 242 deletions

View File

@ -1,2 +1,3 @@
/target /target
/Cargo.lock /Cargo.lock
/test_books

View File

@ -1,95 +1,15 @@
for https://retoor.molodetz.nl/retoor/isspam for https://retoor.molodetz.nl/retoor/isspam
https://snek.molodetz.nl/terminal.html ubuntu running thing instructions: extract `../books.tar.gz`
```
mkdir /project
cd /project
git clone https://retoor.molodetz.nl/retoor/isspam.git
apt install valgrind curl
export RUSTUP_HOME=/project/.rustup
export CARGO_HOME=/project/.cargo
curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh
. "/project/.cargo/env"
cd isspam
rustup install nightly
rustup default nightly
make
make benchmark
python3 bench.py
```
clone: `git clone https://gitlab.com/jestdotty-group/draft/jisspam.git jest_rust`
edit make: `vi makefile` and add build:
```
build_jest:
@echo "compiling jest_rust project"
cd jest_rust && cargo build --release && cp target/release/jisspam ..
```
append to all script:
```
all: build run valgrind build_risspam run_risspam build_cpp build_borded_cpp build_py build_jest
```
add to bench: `vi bench.py`
```py
time_start = time.time()
subprocess.check_output('./jisspam books/*.txt', shell=True)
print("Time Jest Rust:", time.time() - time_start)
```
run: `python3 bench.py`
output looks something like this:
```
***benchmarking***
Time C: 31.315868377685547
Time Rust: 41.232205867767334
Time CPP: 20.1683189868927
Time Borded CPP: 15.468477964401245
Time Jest Rust: 54.74523115158081
Time Retoor Python: 287.63036131858826
***end benchmark***
```
add `/jisspam` to `.gitignore` to not commit the executable accidentally
# local machine benchmarks # local machine benchmarks
single threaded: single threaded: `Time Jest Rust: 33.63373279571533`
```
***benchmarking***
Time C: 2.4082751274108887
Time Rust: 2.865687847137451
Time CPP: 1.1568822860717773
Time Borded CPP: 1.9657189846038818
Time Jest Rust: 33.63373279571533
Time Retoor Python: 133.92413425445557
***end benchmark***
```
rayon: rayon: `Time Jest Rust: 4.294418811798096`
```
***benchmarking*** tokio: `Time Jest Rust: 4.717588901519775`
Time C: 2.457853317260742
Time Rust: 3.0170154571533203
Time CPP: 1.1482579708099365
Time Borded CPP: 2.002591371536255
Time Jest Rust: 4.294418811798096
Time Retoor Python: 201.2997748851776
***end benchmark***
```
tokio:
```
***benchmarking***
Time C: 2.448648452758789
Time Rust: 3.095592737197876
Time CPP: 1.1662013530731201
Time Borded CPP: 1.9207634925842285
Time Jest Rust: 4.717588901519775
Time Retoor Python: 139.8203284740448
***end benchmark***
```
## compile options benchmarks ## compile options benchmarks
lto not thin: `Time Jest Rust: 5.306957483291626` slower lto not thin: `Time Jest Rust: 5.306957483291626` slower
@ -109,6 +29,7 @@ lto thin: `Time Jest Rust: 4.429729223251343` faster
# data integrity # data integrity
(this isn't tested, just guessed, and I don't have data to compare it with) (this isn't tested, just guessed, and I don't have data to compare it with)
for loops: for loops:
``` ```
file count: 904 file count: 904
@ -193,3 +114,57 @@ benchmark: 1588ms
muncher with trie is 2600ms muncher with trie is 2600ms
for loops with fxhash trie: 1200ms for loops with fxhash trie: 1200ms
# ubuntu terminal running
Instructions for running this on the Ubuntu terminal at https://snek.molodetz.nl/terminal.html:
```
mkdir /project
cd /project
git clone https://retoor.molodetz.nl/retoor/isspam.git
apt install valgrind curl
export RUSTUP_HOME=/project/.rustup
export CARGO_HOME=/project/.cargo
curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh
. "/project/.cargo/env"
cd isspam
rustup install nightly
rustup default nightly
make
make benchmark
python3 bench.py
```
clone: `git clone https://gitlab.com/jestdotty-group/draft/jisspam.git jest_rust`
edit make: `vi makefile` and add build:
```
build_jest:
@echo "compiling jest_rust project"
cd jest_rust && cargo build --release && cp target/release/jisspam ..
```
append `build_jest` to the `all` target:
```
all: build run valgrind build_risspam run_risspam build_cpp build_borded_cpp build_py build_jest
```
add to bench: `vi bench.py`
```py
time_start = time.time()
subprocess.check_output('./jisspam books/*.txt', shell=True)
print("Time Jest Rust:", time.time() - time_start)
```
run: `python3 bench.py`
output looks something like this:
```
***benchmarking***
Time C: 31.315868377685547
Time Rust: 41.232205867767334
Time CPP: 20.1683189868927
Time Borded CPP: 15.468477964401245
Time Jest Rust: 54.74523115158081
Time Retoor Python: 287.63036131858826
***end benchmark***
```
add `/jisspam` to `.gitignore` to not commit the executable accidentally

View File

@ -1,3 +1,4 @@
mod parser;
mod stats; mod stats;
mod trie; mod trie;
@ -50,114 +51,6 @@ static FORBIDDEN_WORDS: LazyLock<Trie> = LazyLock::new(|| {
trie trie
}); });
impl Stats {
    /// Parses `text` and adds its sentence, word, capitalized-word,
    /// numeric-character and forbidden-word counts to `self`.
    ///
    /// Delegates to [`Stats::for_loops`]; [`Stats::muncher`] is an alternative
    /// single-pass implementation that applies the same counting rules.
    pub fn process(&mut self, text: &str) {
        self.for_loops(text);
    }

    /// Single-pass parser: walks `text` one character at a time.
    ///
    /// Counting rules (the same ones [`Stats::for_loops`] applies):
    /// - a word is a maximal run of characters that are neither whitespace nor `.`
    /// - a sentence is a `.`-delimited stretch holding at least one word
    ///   character; trailing text without a final period still counts
    /// - a word is capitalized when every character is ASCII uppercase
    /// - every numeric character adds one to `numeric_count`
    /// - a word is forbidden when its lowercase form is in `FORBIDDEN_WORDS`
    ///
    /// Earlier revisions skipped the first character after each run of
    /// whitespace or dots, counted empty words, and never flushed the final
    /// word; all three are handled here.
    ///
    /// Timings noted for the earlier revision: 500ms without the forbidden
    /// words check, 6000ms with it, 2600ms once the trie was used.
    // Kept as an alternative to `for_loops`, so it is normally unused.
    #[allow(dead_code)]
    fn muncher(&mut self, text: &str) {
        let mut word = String::new();
        let mut capitalized = true;
        // True once the current sentence has seen a word character.
        let mut sentence_open = false;
        for ch in text.chars() {
            let is_period = ch == '.';
            if is_period || ch.is_whitespace() {
                // Any separator ends the word currently being built.
                self.finish_word(&mut word, &mut capitalized);
                if is_period && sentence_open {
                    self.sentence_count += 1;
                    sentence_open = false;
                }
                continue;
            }
            sentence_open = true;
            word.push(ch);
            if ch.is_numeric() {
                self.numeric_count += 1;
            }
            if !ch.is_ascii_uppercase() {
                capitalized = false;
            }
        }
        // Text may end without a separator: flush what is still pending.
        self.finish_word(&mut word, &mut capitalized);
        if sentence_open {
            self.sentence_count += 1;
        }
    }

    /// Records the word buffered in `word` (if any), then resets the
    /// per-word state so the next word starts fresh.
    // Only called from `muncher`, which is itself normally unused.
    #[allow(dead_code)]
    fn finish_word(&mut self, word: &mut String, capitalized: &mut bool) {
        if word.is_empty() {
            return;
        }
        self.word_count += 1;
        if *capitalized {
            self.capitalized_count += 1;
        }
        let lowercase_word = word.to_lowercase();
        if FORBIDDEN_WORDS.contains(&lowercase_word) {
            self.forbidden_count += 1;
        }
        // Reuse the allocation instead of building a new String per word.
        word.clear();
        *capitalized = true;
    }

    /// Iterator-based parser: splits `text` on `.` into sentences and each
    /// non-blank sentence on whitespace into words.
    ///
    /// Timings noted by the author: typically 5000ms, 1600ms with the trie.
    // Becomes unused if `process` is switched over to `muncher`.
    #[allow(dead_code)]
    fn for_loops(&mut self, text: &str) {
        for sentence in text.split('.').map(str::trim).filter(|s| !s.is_empty()) {
            self.sentence_count += 1;
            // `split_whitespace` never yields empty or padded items, so no
            // further trimming or filtering is needed.
            for word in sentence.split_whitespace() {
                self.word_count += 1;
                let mut all_capitalized = true;
                for ch in word.chars() {
                    if ch.is_numeric() {
                        self.numeric_count += 1;
                        //TODO are numbers capitalized or not? I don't know!
                    }
                    if !ch.is_ascii_uppercase() {
                        all_capitalized = false;
                    }
                }
                if all_capitalized {
                    self.capitalized_count += 1;
                }
                let lowercase_word = word.to_lowercase();
                if FORBIDDEN_WORDS.contains(&lowercase_word) {
                    self.forbidden_count += 1;
                }
            }
        }
    }
}
#[tokio::main] #[tokio::main]
async fn main() { async fn main() {
let files = env::args().skip(1); let files = env::args().skip(1);
@ -166,17 +59,17 @@ async fn main() {
let (tx, rx) = mpsc::unbounded_channel(); let (tx, rx) = mpsc::unbounded_channel();
for file in files { for file in files {
//reading files not sequentially average shaves 30ms (of 1250ms), and that's on a NVMe SSD so why not //reading files not sequentially average shaves 30ms (of 1250ms), and that's on a NVMe SSD so why not
let Ok(text) = fs::read_to_string(&file) else { if let Ok(text) = fs::read_to_string(&file) {
stats.file_count += 1;
let tx = tx.clone();
tokio::spawn(async move {
let mut stats = Stats::default();
parser::for_loops::parse(&mut stats, &text);
let _ = tx.send(stats);
});
} else {
stats.failed_file_count += 1; stats.failed_file_count += 1;
continue; }
};
stats.file_count += 1;
let tx = tx.clone();
tokio::spawn(async move {
let mut stats = Stats::default();
stats.process(&text);
tx.send(stats).unwrap();
});
} }
rx rx
}; };
@ -186,6 +79,7 @@ async fn main() {
println!("{stats}"); println!("{stats}");
} }
/// needs ../books.tar.gz to be extracted
#[test] #[test]
fn test() { fn test() {
use std::{env, fs, process::Command, time::Instant}; use std::{env, fs, process::Command, time::Instant};
@ -199,44 +93,6 @@ fn test() {
Err(err) => eprintln!("compile failed: {err}"), Err(err) => eprintln!("compile failed: {err}"),
} }
//get test files
let files = fs::read_dir("test_files")
.unwrap()
.map(|f| {
f.unwrap()
.path()
.canonicalize()
.unwrap()
.to_str()
.unwrap()
.to_string()
})
.collect::<Vec<_>>();
println!("test files found: {:#?}", files);
//benchmark run
let benchmark = Instant::now();
let mut run = Command::new("target/release/jisspam");
let run_arged = run.args(files);
match run_arged.output() {
Ok(output) => println!("{}", String::from_utf8_lossy(&output.stdout)),
Err(err) => eprintln!("run failed: {err}"),
}
println!("benchmark: {}ms", benchmark.elapsed().as_millis());
}
#[test]
fn books_test() {
use std::{env, fs, process::Command, time::Instant};
println!("cwd: {}", env::current_dir().unwrap().display());
//compile
let mut compile = Command::new("cargo");
let compile_arged = compile.arg("build").arg("--release");
match compile_arged.output() {
Ok(output) => println!("compiled {}", String::from_utf8_lossy(&output.stdout)),
Err(err) => eprintln!("compile failed: {err}"),
}
//get test files //get test files
let files = fs::read_dir("../books") let files = fs::read_dir("../books")
.unwrap() .unwrap()
@ -250,7 +106,9 @@ fn books_test() {
.to_string() .to_string()
}) })
.collect::<Vec<_>>(); .collect::<Vec<_>>();
println!("test files found: {:#?}", files); println!("test files found: {}", files.len());
println!();
//benchmark run //benchmark run
let benchmark = Instant::now(); let benchmark = Instant::now();

View File

@ -0,0 +1,39 @@
use crate::{FORBIDDEN_WORDS, stats::Stats};
#[allow(dead_code)]
/// Iterator-based parser: cuts `text` into `.`-separated sentences, then each
/// non-blank sentence into whitespace-separated words, updating `stats`.
///
/// Per word it bumps `word_count`, adds one to `numeric_count` for every
/// numeric character, bumps `capitalized_count` when every character is ASCII
/// uppercase, and bumps `forbidden_count` when the lowercase form of the word
/// is in `FORBIDDEN_WORDS`.
///
/// typically 5000ms
/// with trie this is 1600ms
pub fn parse(stats: &mut Stats, text: &str) {
    for segment in text.split('.') {
        let segment = segment.trim();
        if segment.is_empty() {
            // blank stretch between periods is not a sentence
            continue;
        }
        stats.sentence_count += 1;

        // split_whitespace only yields non-empty, already-trimmed tokens
        for token in segment.split_whitespace() {
            stats.word_count += 1;

            let mut shouting = true;
            for c in token.chars() {
                if c.is_numeric() {
                    stats.numeric_count += 1;
                    //TODO are numbers capitalized or not? I don't know!
                }
                shouting = shouting && c.is_ascii_uppercase();
            }
            if shouting {
                stats.capitalized_count += 1;
            }

            let lowered = token.to_lowercase();
            if FORBIDDEN_WORDS.contains(&lowered) {
                stats.forbidden_count += 1;
            }
        }
    }
}

View File

@ -0,0 +1,2 @@
pub mod muncher;
pub mod for_loops;

View File

@ -0,0 +1,66 @@
use crate::{FORBIDDEN_WORDS, stats::Stats};
#[allow(dead_code)] // alternative to `for_loops::parse`; not necessarily called
/// Single-pass parser: walks `text` one character at a time and accumulates
/// the counts into `stats`.
///
/// Counting rules (the same ones `for_loops::parse` applies):
/// - a word is a maximal run of characters that are neither whitespace nor `.`
/// - a sentence is a `.`-delimited stretch holding at least one word
///   character; trailing text without a final period still counts
/// - a word is capitalized when every character is ASCII uppercase
/// - every numeric character adds one to `numeric_count`
/// - a word is forbidden when its lowercase form is in `FORBIDDEN_WORDS`
///
/// Earlier revisions skipped the first character after each run of whitespace
/// or dots, counted empty words (e.g. the space after a period), and never
/// flushed the final word; all three are handled here.
///
/// Timings noted for the earlier revision: 500ms without the forbidden words
/// check, 6000ms with it, 2600ms once the trie was used.
pub fn parse(stats: &mut Stats, text: &str) {
    let mut word = String::new();
    let mut capitalized = true;
    // True once the current sentence has seen a word character.
    let mut sentence_open = false;
    for ch in text.chars() {
        let is_period = ch == '.';
        if is_period || ch.is_whitespace() {
            // Any separator ends the word currently being built.
            finish_word(stats, &mut word, &mut capitalized);
            if is_period && sentence_open {
                stats.sentence_count += 1;
                sentence_open = false;
            }
            continue;
        }
        sentence_open = true;
        word.push(ch);
        if ch.is_numeric() {
            stats.numeric_count += 1;
        }
        if !ch.is_ascii_uppercase() {
            capitalized = false;
        }
    }
    // Text may end without a separator: flush what is still pending.
    finish_word(stats, &mut word, &mut capitalized);
    if sentence_open {
        stats.sentence_count += 1;
    }
}

/// Records the word buffered in `word` (if any) into `stats`, then resets the
/// per-word state so the next word starts fresh.
#[allow(dead_code)] // only reachable through `parse` above
fn finish_word(stats: &mut Stats, word: &mut String, capitalized: &mut bool) {
    if word.is_empty() {
        return;
    }
    stats.word_count += 1;
    if *capitalized {
        stats.capitalized_count += 1;
    }
    let lowercase_word = word.to_lowercase();
    if FORBIDDEN_WORDS.contains(&lowercase_word) {
        stats.forbidden_count += 1;
    }
    // Reuse the allocation instead of building a new String per word.
    word.clear();
    *capitalized = true;
}