diff --git a/12bitfloat_rust/risspam/.cargo/config.toml b/12bitfloat_rust/risspam/.cargo/config.toml index df7011c..afcb2d2 100644 --- a/12bitfloat_rust/risspam/.cargo/config.toml +++ b/12bitfloat_rust/risspam/.cargo/config.toml @@ -3,3 +3,7 @@ rustflags = [ "-Ztls-model=initial-exec", "-Ctarget-cpu=native" ] + +#[unstable] +#build-std = ["compiler_builtins", "alloc", "std", "panic_abort"] # choose only what you need +#build-std-features = ["compiler-builtins-mem"] diff --git a/12bitfloat_rust/risspam/Cargo.lock b/12bitfloat_rust/risspam/Cargo.lock index f3bc7f5..ebe8c85 100644 --- a/12bitfloat_rust/risspam/Cargo.lock +++ b/12bitfloat_rust/risspam/Cargo.lock @@ -2,17 +2,6 @@ # It is not intended for manual editing. version = 4 -[[package]] -name = "core_affinity" -version = "0.8.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a034b3a7b624016c6e13f5df875747cc25f884156aad2abd12b6c46797971342" -dependencies = [ - "libc", - "num_cpus", - "winapi", -] - [[package]] name = "crossbeam-deque" version = "0.8.5" @@ -44,37 +33,6 @@ version = "1.13.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "60b1af1c220855b6ceac025d3f6ecdd2b7c4894bfe9cd9bda4fbb4bc7c0d4cf0" -[[package]] -name = "hermit-abi" -version = "0.5.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fc0fef456e4baa96da950455cd02c081ca953b141298e41db3fc7e36b1da849c" - -[[package]] -name = "libc" -version = "0.2.176" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "58f929b4d672ea937a23a1ab494143d968337a5f47e56d0815df1e0890ddf174" - -[[package]] -name = "memmap2" -version = "0.9.8" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "843a98750cd611cc2965a8213b53b43e715f13c37a9e096c6408e69990961db7" -dependencies = [ - "libc", -] - -[[package]] -name = "num_cpus" -version = "1.17.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "91df4bbde75afed763b708b7eee1e8e7651e02d97f6d5dd763e89367e957b23b" -dependencies = [ - "hermit-abi", - "libc", -] - [[package]] name = "rayon" version = "1.10.0" @@ -99,29 +57,5 @@ dependencies = [ name = "risspam" version = "0.1.0" dependencies = [ - "core_affinity", - "memmap2", "rayon", ] - -[[package]] -name = "winapi" -version = "0.3.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419" -dependencies = [ - "winapi-i686-pc-windows-gnu", - "winapi-x86_64-pc-windows-gnu", -] - -[[package]] -name = "winapi-i686-pc-windows-gnu" -version = "0.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" - -[[package]] -name = "winapi-x86_64-pc-windows-gnu" -version = "0.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" diff --git a/12bitfloat_rust/risspam/Cargo.toml b/12bitfloat_rust/risspam/Cargo.toml index 13d4136..25530b0 100644 --- a/12bitfloat_rust/risspam/Cargo.toml +++ b/12bitfloat_rust/risspam/Cargo.toml @@ -11,8 +11,10 @@ debug = "line-tables-only" [dependencies] rayon = "1.10.0" -memmap2 = "0.9.8" -core_affinity = "0.8.3" +#mimalloc = "0.1.48" +#io-uring = "0.7" +#memmap2 = "0.9.8" +#core_affinity = "0.8.3" #libc = "0.2.176" #glommio = "0.9.0" #monoio = "0.2.4" diff --git a/12bitfloat_rust/risspam/src/main.rs b/12bitfloat_rust/risspam/src/main.rs index 546b3a1..fe4999c 100644 --- a/12bitfloat_rust/risspam/src/main.rs +++ b/12bitfloat_rust/risspam/src/main.rs @@ -5,6 +5,8 @@ //mod books; +mod uring; + use rayon::prelude::*; use std::cell::RefCell; use std::ffi::OsStr; @@ -15,6 +17,10 @@ use std::thread::available_parallelism; use std::time::{Duration, Instant}; use std::{array, env, hint, process}; +// NOTE: mimalloc is actually slower +//#[global_allocator] +//static GLOBAL: mimalloc::MiMalloc = mimalloc::MiMalloc; + #[inline] fn is_ascii_whitespace(b: u8) -> bool { matches!(b, b'\t' | b'\n' | b'\x0C' | b'\r' | b' ') @@ -74,9 +80,9 @@ extern "rust-cold" fn die() -> ! { fn work(file_path: &OsStr, stats: &mut Stats) { WORK_STATE.with_borrow_mut(|state: &mut WorkState| { -// // Load file -// let start_time = Instant::now(); - + // Load file + let start_time = Instant::now(); + // let Ok(text) = fs::read(file_path) else { // eprintln!("invalid file!"); // process::abort(); @@ -102,19 +108,260 @@ fn work(file_path: &OsStr, stats: &mut Stats) { read_offset += rb; } let text = &state.work_mem[..read_offset]; - + // file.read_exact(&mut state.work_mem[..file_len]).unwrap(); - + // let text = include_bytes!("../../../books/Advanced Techniques in Web Intelligence – Part II.txt").as_slice(); + + let time_reading = start_time.elapsed(); + { + let mut guard = TIME_SPENT_READING_FILES.lock().unwrap(); + *guard += time_reading; + } + +// analyze(&text, stats); +// analyze2(&text, stats); + analyze_simd(&text, stats); + }); +} -// let time_reading = start_time.elapsed(); -// { -// let mut guard = TIME_SPENT_READING_FILES.lock().unwrap(); -// *guard += time_reading; +fn analyze_simd(text: &[u8], stats: &mut Stats) { + use std::arch::x86_64::*; + + let mut sentences = 0; + let mut words = 0; + let mut capitalizeds = 0; + let mut numbers = 0; + let mut forbiddens = 0; + + const BLOCK_ITEMS: usize = 32; // MUST be 32! 256 bit ymm regs + const REGION_NUM_BLOCKS: usize = 16; + + if text.len() < (BLOCK_ITEMS * REGION_NUM_BLOCKS) { + return; + } + + let mut offset = 0; + while offset < text.len() { + for block_idx in 0..REGION_NUM_BLOCKS { + // Get block ptr from mem + let block_slice: &[u8; 32] = unsafe { + &*text.as_ptr() + .byte_offset((offset + block_idx * BLOCK_ITEMS) as isize) + .cast() + }; + + // NOTE: This has *truly* terrible codegen + // for i in 0..32 { + // dots += (chunk[i] == b'.') as u32; + // numbers += (chunk[i] >= b'0' && chunk[i] <= b'9') as u32; + // } + + unsafe { + let m_chars = _mm256_loadu_si256(block_slice.as_ptr().cast()); + + // Count dots + let m_dots_masks = _mm256_cmpeq_epi8(m_chars, _mm256_set1_epi8(b'.' as i8)); + sentences += i32::count_ones(_mm256_movemask_epi8(m_dots_masks)); + + // Count digits + let m_ge_0_masks = _mm256_cmpgt_epi8(m_chars, _mm256_set1_epi8(b'0' as i8 - 1)); + let m_le_9_masks = _mm256_cmpgt_epi8(_mm256_set1_epi8(b'9' as i8 + 1), m_chars); + numbers += i32::count_ones(_mm256_movemask_epi8(_mm256_and_si256(m_ge_0_masks, m_le_9_masks))); + } + } + + offset += BLOCK_ITEMS * REGION_NUM_BLOCKS; + } + + if text.len() == 0 { + return; + } + + let mut idx = 0; + 'full_loop: loop { + unsafe { + hint::assert_unchecked(idx < text.len()); + } + + // Skip whitespace + while is_ascii_whitespace(text[idx]) { // Pretty sure this is UB. There have been out of bounds panics + idx += 1; + if hint::unlikely(idx >= text.len()) { + break 'full_loop; + } + } + + // Find end of word + let word_start = idx; + let mut has_non_upper = false; + + 'find_word_end: while let b = text[idx] && !is_ascii_whitespace(b) { + idx += 1; + if hint::unlikely(idx >= text.len()) { + break 'find_word_end; + } + + if !is_ascii_upper(b) { + has_non_upper = true; + } + } + + unsafe { + hint::assert_unchecked(word_start < idx && idx <= text.len()); + } + + let word = &text[word_start..idx]; + + words += 1; + + if !has_non_upper { + capitalizeds += 1; + } + + // Check forbidden + if unsafe { FW_TAB.lookup(word) } { +// if FW_PHF.contains(word) { // phf is a lot slower than my FwTab + forbiddens += 1; + } + } + + stats.sentences = sentences; + stats.words = words; + stats.capitalizeds = capitalizeds; + stats.numbers = numbers; + stats.forbiddens = forbiddens; +} + +fn analyze2(text: &[u8], stats: &mut Stats) { +// // NOTE: mmap is quite a bit slower +// // Load file +// let Ok(file) = File::open(file_path) else { +// eprintln!("invalid file!"); +// std::process::abort(); +// }; +// let mmap = unsafe { +// Mmap::map(&file).unwrap() +// }; +// mem::forget(file); +// let text = &*mmap; + +// // Load file +// let start_time = Instant::now(); +// let Ok(text) = fs::read(file_path) else { +// eprintln!("invalid file!"); +// process::abort(); +// }; +// let time_reading = start_time.elapsed(); +// { +// let mut guard = TIME_SPENT_READING_FILES.lock().unwrap(); +// *guard += time_reading; +// } + + let mut sentences = 0; + let mut words = 0; + let mut capitalizeds = 0; + let mut numbers = 0; + let mut forbiddens = 0; + + // __A_ + + // B_BB + + if text.len() == 0 { + return; + } + + let mut idx = 0; + 'full_loop: loop { + unsafe { + hint::assert_unchecked(idx < text.len()); + } + + // Skip whitespace + while is_ascii_whitespace(text[idx]) { // Pretty sure this is UB. There have been out of bounds panics + idx += 1; + if hint::unlikely(idx >= text.len()) { + break 'full_loop; + } + } + + // Find end of word + let word_start = idx; + let mut has_non_upper = false; + + 'find_word_end: while let b = text[idx] && !is_ascii_whitespace(b) { + idx += 1; + if hint::unlikely(idx >= text.len()) { + break 'find_word_end; + } + + if !is_ascii_upper(b) { + has_non_upper = true; + } + if b == b'.' { + sentences += 1; + } + if is_ascii_digit(b) { + numbers += 1; + } + +// sentences += (b == b'.') as u32; +// numbers += is_ascii_digit(b) as u32; + } + + unsafe { + hint::assert_unchecked(word_start < idx && idx <= text.len()); + } + + let word = &text[word_start..idx]; + + // Per-char logic +// for &b in word { +// if !is_ascii_upper(b) { +// has_non_upper = true; +// } +// if b == b'.' { +// sentences += 1; +// } +// if is_ascii_digit(b) { +// numbers += 1; +// } +// } +// for &b in word { +// if !is_ascii_upper(b) { +// has_non_upper = true; +// } +// } +// for &b in word { +// if b == b'.' { +// sentences += 1; +// } +// } +// for &b in word { +// if is_ascii_digit(b) { +// numbers += 1; +// } // } - analyze(&text, stats); - }); + words += 1; + + if !has_non_upper { + capitalizeds += 1; + } + + // Check forbidden + if unsafe { FW_TAB.lookup(word) } { +// if FW_PHF.contains(word) { // phf is a lot slower than my FwTab + forbiddens += 1; + } + } + + stats.sentences = sentences; + stats.words = words; + stats.capitalizeds = capitalizeds; + stats.numbers = numbers; + stats.forbiddens = forbiddens; } fn analyze(text: &[u8], stats: &mut Stats) { @@ -148,6 +395,14 @@ fn analyze(text: &[u8], stats: &mut Stats) { let mut numbers = 0; let mut forbiddens = 0; + // __A_ + + // B_BB + + if text.len() == 0 { + return; + } + let mut idx = 0; 'full_loop: loop { // Skip whitespace @@ -184,7 +439,7 @@ fn analyze(text: &[u8], stats: &mut Stats) { let word = &text[word_start..idx]; // let word = unsafe { &text.get_unchecked(word_start..idx) }; - + // dbg!(str::from_utf8(word).unwrap()); words += 1; @@ -360,6 +615,12 @@ fn analyze_old(file_path: &OsStr, stats: &mut Stats) { */ fn main() { +// // DEBUG: +// uring::test(); +// if 1 == 1 { +// return; +// } + // Read in files from args let mut files = Vec::with_capacity(env::args().len()); // let mut do_parallel = false; diff --git a/12bitfloat_rust/risspam/src/uring.rs b/12bitfloat_rust/risspam/src/uring.rs new file mode 100644 index 0000000..8a1ebac --- /dev/null +++ b/12bitfloat_rust/risspam/src/uring.rs @@ -0,0 +1,8 @@ + +pub fn test() { +// let ring = io_uring::Builder::::default() +// .build(128) +// .unwrap(); +// +// ring. +} diff --git a/Makefile b/Makefile index ea7c836..dd7b6ac 100644 --- a/Makefile +++ b/Makefile @@ -33,6 +33,8 @@ run: run_spam wl run_not_spam run_risspam: run_spam_risspam run_not_spam_risspam bench_rust: build_risspam benchmark_only +bench_rust_only: build_risspam + cd 12bitfloat_rust/risspam && time ./target/release/risspam ../../books/*.txt format: clang-format *.c *.h -i