Improve Rust performance with an initial SIMD (AVX2) pass for counting sentences and digits

2025-10-07 00:55:06 +02:00 · 2025-10-07 00:55:06 +02:00 · dc3e47f365
commit dc3e47f365
parent 8224a609c5
6 changed files with 291 additions and 80 deletions
--- a/12bitfloat_rust/risspam/.cargo/config.toml
+++ b/12bitfloat_rust/risspam/.cargo/config.toml
@ -3,3 +3,7 @@ rustflags = [
    "-Ztls-model=initial-exec",
    "-Ctarget-cpu=native"
 ]
+
+#[unstable]
+#build-std = ["compiler_builtins", "alloc", "std", "panic_abort"] # choose only what you need
+#build-std-features = ["compiler-builtins-mem"]    
--- a/12bitfloat_rust/risspam/Cargo.lock
+++ b/12bitfloat_rust/risspam/Cargo.lock
@ -2,17 +2,6 @@
 # It is not intended for manual editing.
 version = 4

-[[package]]
-name = "core_affinity"
-version = "0.8.3"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a034b3a7b624016c6e13f5df875747cc25f884156aad2abd12b6c46797971342"
-dependencies = [
- "libc",
- "num_cpus",
- "winapi",
-]
-
 [[package]]
 name = "crossbeam-deque"
 version = "0.8.5"
@ -44,37 +33,6 @@ version = "1.13.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "60b1af1c220855b6ceac025d3f6ecdd2b7c4894bfe9cd9bda4fbb4bc7c0d4cf0"

-[[package]]
-name = "hermit-abi"
-version = "0.5.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "fc0fef456e4baa96da950455cd02c081ca953b141298e41db3fc7e36b1da849c"
-
-[[package]]
-name = "libc"
-version = "0.2.176"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "58f929b4d672ea937a23a1ab494143d968337a5f47e56d0815df1e0890ddf174"
-
-[[package]]
-name = "memmap2"
-version = "0.9.8"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "843a98750cd611cc2965a8213b53b43e715f13c37a9e096c6408e69990961db7"
-dependencies = [
- "libc",
-]
-
-[[package]]
-name = "num_cpus"
-version = "1.17.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "91df4bbde75afed763b708b7eee1e8e7651e02d97f6d5dd763e89367e957b23b"
-dependencies = [
- "hermit-abi",
- "libc",
-]
-
 [[package]]
 name = "rayon"
 version = "1.10.0"
@ -99,29 +57,5 @@ dependencies = [
 name = "risspam"
 version = "0.1.0"
 dependencies = [
- "core_affinity",
- "memmap2",
 "rayon",
 ]
-
-[[package]]
-name = "winapi"
-version = "0.3.9"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419"
-dependencies = [
- "winapi-i686-pc-windows-gnu",
- "winapi-x86_64-pc-windows-gnu",
-]
-
-[[package]]
-name = "winapi-i686-pc-windows-gnu"
-version = "0.4.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6"
-
-[[package]]
-name = "winapi-x86_64-pc-windows-gnu"
-version = "0.4.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f"
--- a/12bitfloat_rust/risspam/Cargo.toml
+++ b/12bitfloat_rust/risspam/Cargo.toml
@ -11,8 +11,10 @@ debug = "line-tables-only"

 [dependencies]
 rayon = "1.10.0"
-memmap2 = "0.9.8"
-core_affinity = "0.8.3"
+#mimalloc = "0.1.48"
+#io-uring = "0.7"
+#memmap2 = "0.9.8"
+#core_affinity = "0.8.3"
 #libc = "0.2.176"
 #glommio = "0.9.0"
 #monoio = "0.2.4"
--- a/12bitfloat_rust/risspam/src/main.rs
+++ b/12bitfloat_rust/risspam/src/main.rs
@ -5,6 +5,8 @@

 //mod books;

+mod uring;
+
 use rayon::prelude::*;
 use std::cell::RefCell;
 use std::ffi::OsStr;
@ -15,6 +17,10 @@ use std::thread::available_parallelism;
 use std::time::{Duration, Instant};
 use std::{array, env, hint, process};

+// NOTE: mimalloc is actually slower than the default allocator, so it stays disabled
+//#[global_allocator]
+//static GLOBAL: mimalloc::MiMalloc = mimalloc::MiMalloc;
+
 #[inline]
 fn is_ascii_whitespace(b: u8) -> bool {
 	matches!(b, b'\t' | b'\n' | b'\x0C' | b'\r' | b' ')
@ -74,9 +80,9 @@ extern "rust-cold" fn die() -> ! {

 fn work(file_path: &OsStr, stats: &mut Stats) {
 	WORK_STATE.with_borrow_mut(|state: &mut WorkState| {
-//		// Load file
-//		let start_time = Instant::now();
-
+		// Load file
+		let start_time = Instant::now();
+		
 //		let Ok(text) = fs::read(file_path) else {
 //			eprintln!("invalid file!");
 //			process::abort();
@ -102,19 +108,260 @@ fn work(file_path: &OsStr, stats: &mut Stats) {
 			read_offset += rb;
 		}
 		let text = &state.work_mem[..read_offset];
-
+		
 //		file.read_exact(&mut state.work_mem[..file_len]).unwrap();
-
+		
 //		let text = include_bytes!("../../../books/Advanced Techniques in Web Intelligence – Part II.txt").as_slice();
+		
+		let time_reading = start_time.elapsed();
+		{
+			let mut guard = TIME_SPENT_READING_FILES.lock().unwrap();
+			*guard += time_reading;
+		}
+		
+//		analyze(&text, stats);
+//		analyze2(&text, stats);
+		analyze_simd(&text, stats);
+	});
+}

-//		let time_reading = start_time.elapsed();
-//		{
-//			let mut guard = TIME_SPENT_READING_FILES.lock().unwrap();
-//			*guard += time_reading;
+/// Computes the per-file statistics for `text` and stores them in `stats`.
+///
+/// Pass 1 counts `.` bytes (sentences) and ASCII digits (numbers) 32 bytes at
+/// a time with AVX2 compares + movemask + bit count; the trailing `len % 32`
+/// bytes are counted with scalar code.
+///
+/// Pass 2 walks the words, where a word is a maximal run of bytes for which
+/// `is_ascii_whitespace` is false. It counts the words, the words in which
+/// every byte passes `is_ascii_upper`, and the words `FW_TAB.lookup` accepts.
+///
+/// Fixes over the first version: inputs shorter than one 512-byte region are
+/// no longer skipped, the vector loop no longer loads past the end of `text`
+/// when the length is not a multiple of 512, the last byte of the input is
+/// included in the uppercase check, and the word loop terminates at the end
+/// of the input instead of asserting `idx < text.len()` when that is false.
+///
+/// An empty `text` leaves `stats` untouched.
+fn analyze_simd(text: &[u8], stats: &mut Stats) {
+	use std::arch::x86_64::*;
+
+	const BLOCK_ITEMS: usize = 32; // MUST be 32! 256 bit ymm regs
+
+	if text.is_empty() {
+		return;
+	}
+
+	let mut sentences = 0;
+	let mut words = 0;
+	let mut capitalizeds = 0;
+	let mut numbers = 0;
+	let mut forbiddens = 0;
+
+	// Pass 1: byte-class counts. `chunks_exact` only yields full 32-byte
+	// blocks, so every vector load stays inside `text`.
+	let mut blocks = text.chunks_exact(BLOCK_ITEMS);
+	for block in &mut blocks {
+		// NOTE: A scalar loop over the block has *truly* terrible codegen,
+		// hence the explicit intrinsics.
+		//
+		// SAFETY: `block` is exactly BLOCK_ITEMS (32) readable bytes and
+		// `_mm256_loadu_si256` has no alignment requirement. The AVX2
+		// intrinsics additionally require an AVX2-capable CPU.
+		// NOTE(review): that relies on the `-Ctarget-cpu=native` rustflag in
+		// .cargo/config.toml; confirm before running on other machines.
+		unsafe {
+			let m_chars = _mm256_loadu_si256(block.as_ptr().cast());
+
+			// Count dots
+			let m_dots_masks = _mm256_cmpeq_epi8(m_chars, _mm256_set1_epi8(b'.' as i8));
+			sentences += i32::count_ones(_mm256_movemask_epi8(m_dots_masks));
+
+			// Count digits. The compares are signed, so bytes >= 0x80 are
+			// negative and fail the `> '0' - 1` test, as they should.
+			let m_ge_0_masks = _mm256_cmpgt_epi8(m_chars, _mm256_set1_epi8(b'0' as i8 - 1));
+			let m_le_9_masks = _mm256_cmpgt_epi8(_mm256_set1_epi8(b'9' as i8 + 1), m_chars);
+			numbers += i32::count_ones(_mm256_movemask_epi8(_mm256_and_si256(m_ge_0_masks, m_le_9_masks)));
+		}
+	}
+
+	// Scalar tail: the last `len % 32` bytes that do not fill a block.
+	for &b in blocks.remainder() {
+		sentences += (b == b'.') as u32;
+		numbers += b.is_ascii_digit() as u32;
+	}
+
+	// Pass 2: word scan. `idx < len` is tested before every index.
+	let len = text.len();
+	let mut idx = 0;
+	while idx < len {
+		// Skip whitespace
+		if is_ascii_whitespace(text[idx]) {
+			idx += 1;
+			continue;
+		}
+
+		// Find end of word. The bounds test comes first, so the final byte
+		// of the input is classified like every other byte.
+		let word_start = idx;
+		let mut has_non_upper = false;
+		while idx < len && !is_ascii_whitespace(text[idx]) {
+			has_non_upper |= !is_ascii_upper(text[idx]);
+			idx += 1;
+		}
+
+		let word = &text[word_start..idx];
+
+		words += 1;
+
+		if !has_non_upper {
+			capitalizeds += 1;
+		}
+
+		// Check forbidden
+		// NOTE(review): why `lookup` needs `unsafe` is not visible here; the
+		// call is kept exactly as in the original.
+		// NOTE: phf (`FW_PHF.contains(word)`) is a lot slower than FwTab
+		if unsafe { FW_TAB.lookup(word) } {
+			forbiddens += 1;
+		}
+	}
+
+	stats.sentences = sentences;
+	stats.words = words;
+	stats.capitalizeds = capitalizeds;
+	stats.numbers = numbers;
+	stats.forbiddens = forbiddens;
+}
+
+/// Scalar single-pass analyzer: walks `text` word by word, counting `.` bytes,
+/// `is_ascii_digit` bytes, words, words made only of `is_ascii_upper` bytes and
+/// `FW_TAB` hits, then stores the totals in `stats`. Empty `text` returns early.
+/// NOTE(review): currently unused; `work` calls `analyze_simd` instead.
+fn analyze2(text: &[u8], stats: &mut Stats) {
+//	// NOTE: mmap is quite a bit slower
+//	// Load file
+//	let Ok(file) = File::open(file_path) else {
+//		eprintln!("invalid file!");
+//		std::process::abort();
+//	};
+//	let mmap = unsafe {
+//		Mmap::map(&file).unwrap()
+//	};
+//	mem::forget(file);
+//	let text = &*mmap;
+
+//	// Load file
+//	let start_time = Instant::now();
+//	let Ok(text) = fs::read(file_path) else {
+//		eprintln!("invalid file!");
+//		process::abort();
+//	};
+//	let time_reading = start_time.elapsed();
+//	{
+//		let mut guard = TIME_SPENT_READING_FILES.lock().unwrap();
+//		*guard += time_reading;
+//	}
+	
+	let mut sentences = 0;
+	let mut words = 0;
+	let mut capitalizeds = 0;
+	let mut numbers = 0;
+	let mut forbiddens = 0;
+	
+	if text.len() == 0 {
+		return;
+	}
+	
+	let mut idx = 0;
+	'full_loop: loop {
+		unsafe {
+			hint::assert_unchecked(idx < text.len());
+		}
+		
+		// Skip whitespace
+		while is_ascii_whitespace(text[idx]) { // NOTE(review): UB/out-of-bounds here when the previous word ended at text.len()
+			idx += 1;
+			if hint::unlikely(idx >= text.len()) {
+				break 'full_loop;
+			}
+		}
+		
+		// Find end of word
+		let word_start = idx;
+		let mut has_non_upper = false;
+		
+		'find_word_end: while let b = text[idx] && !is_ascii_whitespace(b) {
+			idx += 1;
+			if hint::unlikely(idx >= text.len()) { // NOTE(review): exits before the final byte `b` is classified below
+				break 'find_word_end;
+			}
+			
+			if !is_ascii_upper(b) {
+				has_non_upper = true;
+			}
+			if b == b'.' {
+				sentences += 1;
+			}
+			if is_ascii_digit(b) {
+				numbers += 1;
+			}
+			
+//			sentences += (b == b'.') as u32;
+//			numbers += is_ascii_digit(b) as u32;
+		}
+		
+		unsafe {
+			hint::assert_unchecked(word_start < idx && idx <= text.len());
+		}
+		
+		let word = &text[word_start..idx];
+		
+		// Per-char logic
+//		for &b in word {
+//			if !is_ascii_upper(b) {
+//				has_non_upper = true;
+//			}
+//			if b == b'.' {
+//				sentences += 1;
+//			}
+//			if is_ascii_digit(b) {
+//				numbers += 1;
+//			}
+//		}
+//		for &b in word {
+//			if !is_ascii_upper(b) {
+//				has_non_upper = true;
+//			}
+//		}
+//		for &b in word {
+//			if b == b'.' {
+//				sentences += 1;
+//			}
+//		}
+//		for &b in word {
+//			if is_ascii_digit(b) {
+//				numbers += 1;
+//			}
 //		}
 		
-		analyze(&text, stats);
-	});
+		words += 1;
+		
+		if !has_non_upper {
+			capitalizeds += 1;
+		}
+		
+		// Check forbidden
+		if unsafe { FW_TAB.lookup(word) } {
+//		if FW_PHF.contains(word) {  // phf is a lot slower than my FwTab
+			forbiddens += 1;
+		}
+	}
+	
+	stats.sentences = sentences;
+	stats.words = words;
+	stats.capitalizeds = capitalizeds;
+	stats.numbers = numbers;
+	stats.forbiddens = forbiddens;
 }

 fn analyze(text: &[u8], stats: &mut Stats) {
@ -148,6 +395,14 @@ fn analyze(text: &[u8], stats: &mut Stats) {
 	let mut numbers = 0;
 	let mut forbiddens = 0;
 	
+	// __A_
+	
+	// B_BB
+	
+	if text.len() == 0 {
+		return;
+	}
+	
 	let mut idx = 0;
 	'full_loop: loop {
 		// Skip whitespace
@ -184,7 +439,7 @@ fn analyze(text: &[u8], stats: &mut Stats) {
 		
 		let word = &text[word_start..idx];
 //		let word = unsafe { &text.get_unchecked(word_start..idx) };
-		
+
 //		dbg!(str::from_utf8(word).unwrap());
 		
 		words += 1;
@ -360,6 +615,12 @@ fn analyze_old(file_path: &OsStr, stats: &mut Stats) {
 */

 fn main() {
+//	// DEBUG:
+//	uring::test();
+//	if 1 == 1 {
+//		return;
+//	}
+	
 	// Read in files from args
 	let mut files = Vec::with_capacity(env::args().len());
 //	let mut do_parallel = false;
--- a/12bitfloat_rust/risspam/src/uring.rs
+++ b/12bitfloat_rust/risspam/src/uring.rs
@ -0,0 +1,8 @@
+/// Scratch entry point for io_uring experiments; the body is commented out, so this is a no-op.
+pub fn test() {
+//	let ring = io_uring::Builder::<io_uring::squeue::Entry, io_uring::cqueue::Entry>::default()
+//		.build(128)
+//		.unwrap();
+//	
+//	ring.
+}
--- a/2
+++ b/2
@ -33,6 +33,8 @@ run: run_spam wl run_not_spam
 run_risspam: run_spam_risspam run_not_spam_risspam

 bench_rust: build_risspam benchmark_only
+bench_rust_only: build_risspam
+	cd 12bitfloat_rust/risspam && time ./target/release/risspam ../../books/*.txt

 format:
 	clang-format *.c *.h -i