reorg
This commit is contained in:
parent
94b786f83a
commit
d73d4ff7c1
1
jest_rust/.gitignore
vendored
1
jest_rust/.gitignore
vendored
@ -1,2 +1,3 @@
|
|||||||
/target
|
/target
|
||||||
/Cargo.lock
|
/Cargo.lock
|
||||||
|
/test_books
|
||||||
|
|||||||
@ -1,95 +1,15 @@
|
|||||||
for https://retoor.molodetz.nl/retoor/isspam
|
for https://retoor.molodetz.nl/retoor/isspam
|
||||||
|
|
||||||
https://snek.molodetz.nl/terminal.html ubuntu running thing instructions:
|
extract `../books.tar.gz`
|
||||||
```
|
|
||||||
mkdir /project
|
|
||||||
cd /project
|
|
||||||
git clone https://retoor.molodetz.nl/retoor/isspam.git
|
|
||||||
apt install valgrind curl
|
|
||||||
export RUSTUP_HOME=/project/.rustup
|
|
||||||
export CARGO_HOME=/project/.cargo
|
|
||||||
curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh
|
|
||||||
. "/project/.cargo/env"
|
|
||||||
cd isspam
|
|
||||||
rustup install nightly
|
|
||||||
rustup default nightly
|
|
||||||
make
|
|
||||||
make benchmark
|
|
||||||
python3 bench.py
|
|
||||||
```
|
|
||||||
|
|
||||||
clone: `git clone https://gitlab.com/jestdotty-group/draft/jisspam.git jest_rust`
|
|
||||||
|
|
||||||
edit make: `vi makefile` and add build:
|
|
||||||
```
|
|
||||||
build_jest:
|
|
||||||
@echo "compiling jest_rust project"
|
|
||||||
cd jest_rust && cargo build --release && cp target/release/jisspam ..
|
|
||||||
```
|
|
||||||
append to all script:
|
|
||||||
```
|
|
||||||
all: build run valgrind build_risspam run_risspam build_cpp build_borded_cpp build_py build_jest
|
|
||||||
```
|
|
||||||
|
|
||||||
add to bench: `vi bench.py`
|
|
||||||
```py
|
|
||||||
time_start = time.time()
|
|
||||||
subprocess.check_output('./jisspam books/*.txt', shell=True)
|
|
||||||
print("Time Jest Rust:", time.time() - time_start)
|
|
||||||
```
|
|
||||||
|
|
||||||
run: `python3 bench.py`
|
|
||||||
output looks something like this:
|
|
||||||
```
|
|
||||||
***benchmarking***
|
|
||||||
Time C: 31.315868377685547
|
|
||||||
Time Rust: 41.232205867767334
|
|
||||||
Time CPP: 20.1683189868927
|
|
||||||
Time Borded CPP: 15.468477964401245
|
|
||||||
Time Jest Rust: 54.74523115158081
|
|
||||||
Time Retoor Python: 287.63036131858826
|
|
||||||
***end benchmark***
|
|
||||||
```
|
|
||||||
|
|
||||||
add `/jisspam` to `.gitignore` to not commit the executable accidentally
|
|
||||||
|
|
||||||
# local machine benchmarks
|
# local machine benchmarks
|
||||||
|
|
||||||
single threaded:
|
single threaded: `Time Jest Rust: 33.63373279571533`
|
||||||
```
|
|
||||||
***benchmarking***
|
|
||||||
Time C: 2.4082751274108887
|
|
||||||
Time Rust: 2.865687847137451
|
|
||||||
Time CPP: 1.1568822860717773
|
|
||||||
Time Borded CPP: 1.9657189846038818
|
|
||||||
Time Jest Rust: 33.63373279571533
|
|
||||||
Time Retoor Python: 133.92413425445557
|
|
||||||
***end benchmark***
|
|
||||||
```
|
|
||||||
|
|
||||||
rayon:
|
rayon: `Time Jest Rust: 4.294418811798096`
|
||||||
```
|
|
||||||
***benchmarking***
|
tokio: `Time Jest Rust: 4.717588901519775`
|
||||||
Time C: 2.457853317260742
|
|
||||||
Time Rust: 3.0170154571533203
|
|
||||||
Time CPP: 1.1482579708099365
|
|
||||||
Time Borded CPP: 2.002591371536255
|
|
||||||
Time Jest Rust: 4.294418811798096
|
|
||||||
Time Retoor Python: 201.2997748851776
|
|
||||||
***end benchmark***
|
|
||||||
```
|
|
||||||
|
|
||||||
tokio:
|
|
||||||
```
|
|
||||||
***benchmarking***
|
|
||||||
Time C: 2.448648452758789
|
|
||||||
Time Rust: 3.095592737197876
|
|
||||||
Time CPP: 1.1662013530731201
|
|
||||||
Time Borded CPP: 1.9207634925842285
|
|
||||||
Time Jest Rust: 4.717588901519775
|
|
||||||
Time Retoor Python: 139.8203284740448
|
|
||||||
***end benchmark***
|
|
||||||
```
|
|
||||||
## compile options benchmarks
|
## compile options benchmarks
|
||||||
lto not thin: `Time Jest Rust: 5.306957483291626` slower
|
lto not thin: `Time Jest Rust: 5.306957483291626` slower
|
||||||
|
|
||||||
@ -109,6 +29,7 @@ lto thin: `Time Jest Rust: 4.429729223251343` faster
|
|||||||
|
|
||||||
# data integrity
|
# data integrity
|
||||||
(this isn't tested, just guessed, and I don't have data to compare it with)
|
(this isn't tested, just guessed, and I don't have data to compare it with)
|
||||||
|
|
||||||
for loops:
|
for loops:
|
||||||
```
|
```
|
||||||
file count: 904
|
file count: 904
|
||||||
@ -193,3 +114,57 @@ benchmark: 1588ms
|
|||||||
muncher with trie is 2600ms
|
muncher with trie is 2600ms
|
||||||
|
|
||||||
for loops with fxhash trie: 1200ms
|
for loops with fxhash trie: 1200ms
|
||||||
|
|
||||||
|
# ubuntu terminal running
|
||||||
|
Instructions for running on the Ubuntu terminal at https://snek.molodetz.nl/terminal.html:
|
||||||
|
```
|
||||||
|
mkdir /project
|
||||||
|
cd /project
|
||||||
|
git clone https://retoor.molodetz.nl/retoor/isspam.git
|
||||||
|
apt install valgrind curl
|
||||||
|
export RUSTUP_HOME=/project/.rustup
|
||||||
|
export CARGO_HOME=/project/.cargo
|
||||||
|
curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh
|
||||||
|
. "/project/.cargo/env"
|
||||||
|
cd isspam
|
||||||
|
rustup install nightly
|
||||||
|
rustup default nightly
|
||||||
|
make
|
||||||
|
make benchmark
|
||||||
|
python3 bench.py
|
||||||
|
```
|
||||||
|
|
||||||
|
clone: `git clone https://gitlab.com/jestdotty-group/draft/jisspam.git jest_rust`
|
||||||
|
|
||||||
|
edit make: `vi makefile` and add build:
|
||||||
|
```
|
||||||
|
build_jest:
|
||||||
|
@echo "compiling jest_rust project"
|
||||||
|
cd jest_rust && cargo build --release && cp target/release/jisspam ..
|
||||||
|
```
|
||||||
|
append `build_jest` to the `all` target:
|
||||||
|
```
|
||||||
|
all: build run valgrind build_risspam run_risspam build_cpp build_borded_cpp build_py build_jest
|
||||||
|
```
|
||||||
|
|
||||||
|
add to bench: `vi bench.py`
|
||||||
|
```py
|
||||||
|
time_start = time.time()
|
||||||
|
subprocess.check_output('./jisspam books/*.txt', shell=True)
|
||||||
|
print("Time Jest Rust:", time.time() - time_start)
|
||||||
|
```
|
||||||
|
|
||||||
|
run: `python3 bench.py`
|
||||||
|
output looks something like this:
|
||||||
|
```
|
||||||
|
***benchmarking***
|
||||||
|
Time C: 31.315868377685547
|
||||||
|
Time Rust: 41.232205867767334
|
||||||
|
Time CPP: 20.1683189868927
|
||||||
|
Time Borded CPP: 15.468477964401245
|
||||||
|
Time Jest Rust: 54.74523115158081
|
||||||
|
Time Retoor Python: 287.63036131858826
|
||||||
|
***end benchmark***
|
||||||
|
```
|
||||||
|
|
||||||
|
add `/jisspam` to `.gitignore` so the executable is not committed accidentally
|
||||||
@ -1,3 +1,4 @@
|
|||||||
|
mod parser;
|
||||||
mod stats;
|
mod stats;
|
||||||
mod trie;
|
mod trie;
|
||||||
|
|
||||||
@ -50,114 +51,6 @@ static FORBIDDEN_WORDS: LazyLock<Trie> = LazyLock::new(|| {
|
|||||||
trie
|
trie
|
||||||
});
|
});
|
||||||
|
|
||||||
impl Stats {
|
|
||||||
pub fn process(&mut self, text: &str) {
|
|
||||||
// self.muncher(&text);
|
|
||||||
self.for_loops(&text);
|
|
||||||
}
|
|
||||||
#[allow(dead_code)]
|
|
||||||
/// probably buggy. for example, are new lines sentences? what if the text has no last period?
|
|
||||||
/// 500ms is without forbidden words check, but...
|
|
||||||
/// 6000ms if adding forbidden words.. so not faster
|
|
||||||
/// with trie this is 2600ms
|
|
||||||
fn muncher(&mut self, text: &str) {
|
|
||||||
let mut capitalized = true;
|
|
||||||
let mut whitespaced = false;
|
|
||||||
let mut dotted = false;
|
|
||||||
let mut word = String::new();
|
|
||||||
for char in text.chars() {
|
|
||||||
if whitespaced {
|
|
||||||
if !char.is_whitespace() {
|
|
||||||
whitespaced = false; //end whiteness
|
|
||||||
}
|
|
||||||
continue;
|
|
||||||
} else if char.is_whitespace() {
|
|
||||||
whitespaced = true;
|
|
||||||
self.word_count += 1; //end of word
|
|
||||||
if capitalized {
|
|
||||||
self.capitalized_count += 1;
|
|
||||||
} else {
|
|
||||||
//reset capitalized word
|
|
||||||
capitalized = true;
|
|
||||||
}
|
|
||||||
let lowercase_word = word.to_lowercase();
|
|
||||||
if FORBIDDEN_WORDS.contains(&lowercase_word) {
|
|
||||||
self.forbidden_count += 1;
|
|
||||||
}
|
|
||||||
word = String::new();
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
if dotted {
|
|
||||||
if char != '.' {
|
|
||||||
dotted = false; //end sentencing
|
|
||||||
}
|
|
||||||
continue;
|
|
||||||
} else if char == '.' {
|
|
||||||
dotted = true;
|
|
||||||
self.sentence_count += 1;
|
|
||||||
self.word_count += 1; //end of word
|
|
||||||
if capitalized {
|
|
||||||
self.capitalized_count += 1;
|
|
||||||
} else {
|
|
||||||
//reset capitalized word
|
|
||||||
capitalized = true;
|
|
||||||
}
|
|
||||||
let lowercase_word = word.to_lowercase();
|
|
||||||
if FORBIDDEN_WORDS.contains(&lowercase_word) {
|
|
||||||
self.forbidden_count += 1;
|
|
||||||
}
|
|
||||||
word = String::new();
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
word += &char.to_string();
|
|
||||||
if char.is_numeric() {
|
|
||||||
self.numeric_count += 1;
|
|
||||||
capitalized = false;
|
|
||||||
}
|
|
||||||
if !char.is_ascii_uppercase() {
|
|
||||||
capitalized = false;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
#[allow(dead_code)]
|
|
||||||
/// typically 5000ms
|
|
||||||
/// with trie this is 1600ms
|
|
||||||
fn for_loops(&mut self, text: &str) {
|
|
||||||
for sentence in text
|
|
||||||
.split('.')
|
|
||||||
.map(|s| s.trim())
|
|
||||||
.filter(|s| !s.is_empty())
|
|
||||||
{
|
|
||||||
self.sentence_count += 1;
|
|
||||||
for word in sentence
|
|
||||||
.split_whitespace()
|
|
||||||
.map(|s| s.trim())
|
|
||||||
.filter(|s| !s.is_empty())
|
|
||||||
{
|
|
||||||
self.word_count += 1;
|
|
||||||
//get all numbers counted
|
|
||||||
let mut all_capitalized = true;
|
|
||||||
for char in word.chars() {
|
|
||||||
if char.is_numeric() {
|
|
||||||
self.numeric_count += 1;
|
|
||||||
//TODO are numbers capitalized or not? I don't know!
|
|
||||||
}
|
|
||||||
if !char.is_ascii_uppercase() {
|
|
||||||
all_capitalized = false;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if all_capitalized {
|
|
||||||
self.capitalized_count += 1;
|
|
||||||
}
|
|
||||||
let lowercase_word = word.to_lowercase();
|
|
||||||
if FORBIDDEN_WORDS.contains(&lowercase_word) {
|
|
||||||
self.forbidden_count += 1;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
#[tokio::main]
|
#[tokio::main]
|
||||||
async fn main() {
|
async fn main() {
|
||||||
let files = env::args().skip(1);
|
let files = env::args().skip(1);
|
||||||
@ -166,17 +59,17 @@ async fn main() {
|
|||||||
let (tx, rx) = mpsc::unbounded_channel();
|
let (tx, rx) = mpsc::unbounded_channel();
|
||||||
for file in files {
|
for file in files {
|
||||||
//reading files not sequentially average shaves 30ms (of 1250ms), and that's on a NVMe SSD so why not
|
//reading files not sequentially average shaves 30ms (of 1250ms), and that's on a NVMe SSD so why not
|
||||||
let Ok(text) = fs::read_to_string(&file) else {
|
if let Ok(text) = fs::read_to_string(&file) {
|
||||||
stats.failed_file_count += 1;
|
|
||||||
continue;
|
|
||||||
};
|
|
||||||
stats.file_count += 1;
|
stats.file_count += 1;
|
||||||
let tx = tx.clone();
|
let tx = tx.clone();
|
||||||
tokio::spawn(async move {
|
tokio::spawn(async move {
|
||||||
let mut stats = Stats::default();
|
let mut stats = Stats::default();
|
||||||
stats.process(&text);
|
parser::for_loops::parse(&mut stats, &text);
|
||||||
tx.send(stats).unwrap();
|
let _ = tx.send(stats);
|
||||||
});
|
});
|
||||||
|
} else {
|
||||||
|
stats.failed_file_count += 1;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
rx
|
rx
|
||||||
};
|
};
|
||||||
@ -186,6 +79,7 @@ async fn main() {
|
|||||||
println!("{stats}");
|
println!("{stats}");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// needs ../books.tar.gz to be extracted
|
||||||
#[test]
|
#[test]
|
||||||
fn test() {
|
fn test() {
|
||||||
use std::{env, fs, process::Command, time::Instant};
|
use std::{env, fs, process::Command, time::Instant};
|
||||||
@ -199,44 +93,6 @@ fn test() {
|
|||||||
Err(err) => eprintln!("compile failed: {err}"),
|
Err(err) => eprintln!("compile failed: {err}"),
|
||||||
}
|
}
|
||||||
|
|
||||||
//get test files
|
|
||||||
let files = fs::read_dir("test_files")
|
|
||||||
.unwrap()
|
|
||||||
.map(|f| {
|
|
||||||
f.unwrap()
|
|
||||||
.path()
|
|
||||||
.canonicalize()
|
|
||||||
.unwrap()
|
|
||||||
.to_str()
|
|
||||||
.unwrap()
|
|
||||||
.to_string()
|
|
||||||
})
|
|
||||||
.collect::<Vec<_>>();
|
|
||||||
println!("test files found: {:#?}", files);
|
|
||||||
|
|
||||||
//benchmark run
|
|
||||||
let benchmark = Instant::now();
|
|
||||||
let mut run = Command::new("target/release/jisspam");
|
|
||||||
let run_arged = run.args(files);
|
|
||||||
match run_arged.output() {
|
|
||||||
Ok(output) => println!("{}", String::from_utf8_lossy(&output.stdout)),
|
|
||||||
Err(err) => eprintln!("run failed: {err}"),
|
|
||||||
}
|
|
||||||
println!("benchmark: {}ms", benchmark.elapsed().as_millis());
|
|
||||||
}
|
|
||||||
#[test]
|
|
||||||
fn books_test() {
|
|
||||||
use std::{env, fs, process::Command, time::Instant};
|
|
||||||
println!("cwd: {}", env::current_dir().unwrap().display());
|
|
||||||
|
|
||||||
//compile
|
|
||||||
let mut compile = Command::new("cargo");
|
|
||||||
let compile_arged = compile.arg("build").arg("--release");
|
|
||||||
match compile_arged.output() {
|
|
||||||
Ok(output) => println!("compiled {}", String::from_utf8_lossy(&output.stdout)),
|
|
||||||
Err(err) => eprintln!("compile failed: {err}"),
|
|
||||||
}
|
|
||||||
|
|
||||||
//get test files
|
//get test files
|
||||||
let files = fs::read_dir("../books")
|
let files = fs::read_dir("../books")
|
||||||
.unwrap()
|
.unwrap()
|
||||||
@ -250,7 +106,9 @@ fn books_test() {
|
|||||||
.to_string()
|
.to_string()
|
||||||
})
|
})
|
||||||
.collect::<Vec<_>>();
|
.collect::<Vec<_>>();
|
||||||
println!("test files found: {:#?}", files);
|
println!("test files found: {}", files.len());
|
||||||
|
|
||||||
|
println!();
|
||||||
|
|
||||||
//benchmark run
|
//benchmark run
|
||||||
let benchmark = Instant::now();
|
let benchmark = Instant::now();
|
||||||
|
|||||||
39
jest_rust/src/parser/for_loops.rs
Normal file
39
jest_rust/src/parser/for_loops.rs
Normal file
@ -0,0 +1,39 @@
|
|||||||
|
use crate::{FORBIDDEN_WORDS, stats::Stats};
|
||||||
|
|
||||||
|
#[allow(dead_code)]
|
||||||
|
/// typically 5000ms
|
||||||
|
/// with trie this is 1600ms
|
||||||
|
pub fn parse(stats: &mut Stats, text: &str) {
|
||||||
|
for sentence in text
|
||||||
|
.split('.')
|
||||||
|
.map(|s| s.trim())
|
||||||
|
.filter(|s| !s.is_empty())
|
||||||
|
{
|
||||||
|
stats.sentence_count += 1;
|
||||||
|
for word in sentence
|
||||||
|
.split_whitespace()
|
||||||
|
.map(|s| s.trim())
|
||||||
|
.filter(|s| !s.is_empty())
|
||||||
|
{
|
||||||
|
stats.word_count += 1;
|
||||||
|
//get all numbers counted
|
||||||
|
let mut all_capitalized = true;
|
||||||
|
for char in word.chars() {
|
||||||
|
if char.is_numeric() {
|
||||||
|
stats.numeric_count += 1;
|
||||||
|
//TODO are numbers capitalized or not? I don't know!
|
||||||
|
}
|
||||||
|
if !char.is_ascii_uppercase() {
|
||||||
|
all_capitalized = false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if all_capitalized {
|
||||||
|
stats.capitalized_count += 1;
|
||||||
|
}
|
||||||
|
let lowercase_word = word.to_lowercase();
|
||||||
|
if FORBIDDEN_WORDS.contains(&lowercase_word) {
|
||||||
|
stats.forbidden_count += 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
2
jest_rust/src/parser/mod.rs
Normal file
2
jest_rust/src/parser/mod.rs
Normal file
@ -0,0 +1,2 @@
|
|||||||
|
pub mod muncher;
|
||||||
|
pub mod for_loops;
|
||||||
66
jest_rust/src/parser/muncher.rs
Normal file
66
jest_rust/src/parser/muncher.rs
Normal file
@ -0,0 +1,66 @@
|
|||||||
|
use crate::{FORBIDDEN_WORDS, stats::Stats};
|
||||||
|
|
||||||
|
#[allow(dead_code)]
|
||||||
|
/// probably buggy. for example, are new lines sentences? what if the text has no last period?
|
||||||
|
/// 500ms is without forbidden words check, but...
|
||||||
|
/// 6000ms if adding forbidden words.. so not faster
|
||||||
|
/// with trie this is 2600ms
|
||||||
|
pub fn parse(stats: &mut Stats, text: &str) {
|
||||||
|
let mut capitalized = true;
|
||||||
|
let mut whitespaced = false;
|
||||||
|
let mut dotted = false;
|
||||||
|
let mut word = String::new();
|
||||||
|
for char in text.chars() {
|
||||||
|
if whitespaced {
|
||||||
|
if !char.is_whitespace() {
|
||||||
|
whitespaced = false; //end whiteness
|
||||||
|
}
|
||||||
|
continue;
|
||||||
|
} else if char.is_whitespace() {
|
||||||
|
whitespaced = true;
|
||||||
|
stats.word_count += 1; //end of word
|
||||||
|
if capitalized {
|
||||||
|
stats.capitalized_count += 1;
|
||||||
|
} else {
|
||||||
|
//reset capitalized word
|
||||||
|
capitalized = true;
|
||||||
|
}
|
||||||
|
let lowercase_word = word.to_lowercase();
|
||||||
|
if FORBIDDEN_WORDS.contains(&lowercase_word) {
|
||||||
|
stats.forbidden_count += 1;
|
||||||
|
}
|
||||||
|
word = String::new();
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
if dotted {
|
||||||
|
if char != '.' {
|
||||||
|
dotted = false; //end sentencing
|
||||||
|
}
|
||||||
|
continue;
|
||||||
|
} else if char == '.' {
|
||||||
|
dotted = true;
|
||||||
|
stats.sentence_count += 1;
|
||||||
|
stats.word_count += 1; //end of word
|
||||||
|
if capitalized {
|
||||||
|
stats.capitalized_count += 1;
|
||||||
|
} else {
|
||||||
|
//reset capitalized word
|
||||||
|
capitalized = true;
|
||||||
|
}
|
||||||
|
let lowercase_word = word.to_lowercase();
|
||||||
|
if FORBIDDEN_WORDS.contains(&lowercase_word) {
|
||||||
|
stats.forbidden_count += 1;
|
||||||
|
}
|
||||||
|
word = String::new();
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
word += &char.to_string();
|
||||||
|
if char.is_numeric() {
|
||||||
|
stats.numeric_count += 1;
|
||||||
|
capitalized = false;
|
||||||
|
}
|
||||||
|
if !char.is_ascii_uppercase() {
|
||||||
|
capitalized = false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
Loading…
Reference in New Issue
Block a user