1pub mod async_updates;
15
16use scirs2_core::ndarray::{s, Array1, Array2, Array3, Axis};
17use sklears_core::error::SklearsError;
18use std::collections::VecDeque;
19use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering};
20use std::sync::{Arc, Condvar, Mutex};
21use std::thread;
22
23pub use async_updates::{
24 async_sgd_simulation, AsyncADMM, AsyncCoordinateDescent, AsyncUpdateConfig, AsyncUpdateResults,
25 BoundedAsyncCoordinator,
26};
27
/// A fixed-size thread pool in which each worker owns a local task queue
/// and all workers share an overflow queue.
///
/// Tasks are boxed `FnOnce` closures. Workers drain their local queue
/// first and fall back to the shared queue; idle workers park on `condvar`
/// with a timeout. Dropping the pool signals `shutdown` and joins all
/// workers.
pub struct WorkStealingThreadPool {
    // One entry per worker; each holds the join handle and its local queue.
    workers: Vec<WorkerThread>,
    // Overflow queue used when a worker's local queue is contended.
    shared_queue: Arc<Mutex<VecDeque<Box<dyn FnOnce() + Send + 'static>>>>,
    // Set once on drop; workers poll it each loop iteration.
    shutdown: Arc<AtomicBool>,
    // Wakes parked workers when new work arrives (paired with shared_queue's lock).
    condvar: Arc<Condvar>,
    n_threads: usize,
}
40
/// Bookkeeping for one pool worker: its OS thread handle (taken on join)
/// and the local queue the pool pushes tasks into.
struct WorkerThread {
    // `None` after the handle has been taken for joining in `Drop`.
    handle: Option<thread::JoinHandle<()>>,
    local_queue: Arc<Mutex<VecDeque<Box<dyn FnOnce() + Send + 'static>>>>,
}
45
46impl WorkStealingThreadPool {
47 pub fn new(n_threads: usize) -> Self {
49 let n_threads = n_threads.max(1);
50 let shared_queue = Arc::new(Mutex::new(VecDeque::new()));
51 let shutdown = Arc::new(AtomicBool::new(false));
52 let condvar = Arc::new(Condvar::new());
53
54 let mut workers = Vec::with_capacity(n_threads);
55
56 for worker_id in 0..n_threads {
57 let local_queue = Arc::new(Mutex::new(VecDeque::new()));
58 let worker = WorkerThread::spawn(
59 worker_id,
60 local_queue.clone(),
61 shared_queue.clone(),
62 shutdown.clone(),
63 condvar.clone(),
64 n_threads,
65 );
66 workers.push(WorkerThread {
67 handle: Some(worker),
68 local_queue,
69 });
70 }
71
72 Self {
73 workers,
74 shared_queue,
75 shutdown,
76 condvar,
77 n_threads,
78 }
79 }
80
81 pub fn execute<F>(&self, task: F)
83 where
84 F: FnOnce() + Send + 'static,
85 {
86 let mut min_load = usize::MAX;
88 let mut best_worker = 0;
89
90 for (i, worker) in self.workers.iter().enumerate() {
91 if let Ok(queue) = worker.local_queue.try_lock() {
92 let load = queue.len();
93 if load < min_load {
94 min_load = load;
95 best_worker = i;
96 }
97 }
98 }
99
100 if let Ok(mut queue) = self.workers[best_worker].local_queue.try_lock() {
102 queue.push_back(Box::new(task));
103 drop(queue);
104 self.condvar.notify_one();
105 } else {
106 let mut shared = self.shared_queue.lock().unwrap();
108 shared.push_back(Box::new(task));
109 drop(shared);
110 self.condvar.notify_all();
111 }
112 }
113
114 pub fn execute_parallel<F, T>(&self, tasks: Vec<F>) -> Vec<T>
116 where
117 F: FnOnce() -> T + Send + 'static,
118 T: Send + 'static,
119 {
120 if tasks.is_empty() {
121 return Vec::new();
122 }
123
124 let results = Arc::new(Mutex::new(Vec::with_capacity(tasks.len())));
125 let remaining = Arc::new(AtomicUsize::new(tasks.len()));
126
127 for (i, task) in tasks.into_iter().enumerate() {
128 let results_clone = results.clone();
129 let remaining_clone = remaining.clone();
130
131 self.execute(move || {
132 let result = task();
133 {
134 let mut results_guard = results_clone.lock().unwrap();
135 if results_guard.len() <= i {
137 results_guard.resize_with(i + 1, || unsafe { std::mem::zeroed() });
138 }
139 results_guard[i] = result;
140 }
141 remaining_clone.fetch_sub(1, Ordering::SeqCst);
142 });
143 }
144
145 while remaining.load(Ordering::SeqCst) > 0 {
147 std::thread::yield_now();
148 }
149
150 let results_guard = results.lock().unwrap();
152 let mut final_results = Vec::with_capacity(results_guard.len());
153 for item in results_guard.iter() {
154 final_results.push(unsafe { std::ptr::read(item as *const T) });
156 }
157 final_results
158 }
159
    /// Returns the number of worker threads owned by this pool.
    pub fn n_threads(&self) -> usize {
        self.n_threads
    }
164}
165
166impl WorkerThread {
167 fn spawn(
168 worker_id: usize,
169 local_queue: Arc<Mutex<VecDeque<Box<dyn FnOnce() + Send + 'static>>>>,
170 shared_queue: Arc<Mutex<VecDeque<Box<dyn FnOnce() + Send + 'static>>>>,
171 shutdown: Arc<AtomicBool>,
172 condvar: Arc<Condvar>,
173 n_workers: usize,
174 ) -> thread::JoinHandle<()> {
175 thread::spawn(move || {
176 let mut rng_seed = worker_id * 7 + 13; while !shutdown.load(Ordering::Relaxed) {
179 let mut task = None;
181 if let Ok(mut queue) = local_queue.try_lock() {
182 task = queue.pop_front();
183 }
184
185 if task.is_none() {
187 for _ in 0..n_workers {
188 rng_seed = rng_seed.wrapping_mul(1103515245).wrapping_add(12345);
189 let target_worker = rng_seed % n_workers;
190
191 if target_worker != worker_id {
192 break;
195 }
196 }
197 }
198
199 if task.is_none() {
201 if let Ok(mut shared) = shared_queue.try_lock() {
202 task = shared.pop_front();
203 }
204 }
205
206 if let Some(work) = task {
207 work();
208 } else {
209 let _guard = shared_queue.lock().unwrap();
211 let _ = condvar.wait_timeout(_guard, std::time::Duration::from_millis(10));
212 }
213 }
214 })
215 }
216}
217
218impl Drop for WorkStealingThreadPool {
219 fn drop(&mut self) {
220 self.shutdown.store(true, Ordering::SeqCst);
221 self.condvar.notify_all();
222
223 for worker in &mut self.workers {
224 if let Some(handle) = worker.handle.take() {
225 let _ = handle.join();
226 }
227 }
228 }
229}
230
231impl std::fmt::Debug for WorkStealingThreadPool {
232 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
233 f.debug_struct("WorkStealingThreadPool")
234 .field("n_threads", &self.n_threads)
235 .field("workers_count", &self.workers.len())
236 .field("shutdown", &self.shutdown)
237 .finish()
238 }
239}
240
#[derive(Debug, Clone)]
/// Cache-blocked dense matrix kernels with optional pool-based parallelism
/// and SIMD-style manual unrolling. Configured via builder methods.
pub struct OptimizedMatrixOps {
    // When present, `block_matmul` dispatches to the parallel kernel.
    thread_pool: Option<Arc<WorkStealingThreadPool>>,
    // Tile edge length for the blocked kernels (min 8, default 64).
    block_size: usize,
    // Enables the unrolled dot-product path in `simd_dot_product`.
    use_simd: bool,
}
248
249impl OptimizedMatrixOps {
    /// Default configuration: no thread pool (sequential kernels),
    /// 64-element cache blocks, SIMD-style unrolling enabled.
    pub fn new() -> Self {
        Self {
            thread_pool: None,
            block_size: 64,
            use_simd: true,
        }
    }
258
259 pub fn with_thread_pool(mut self, pool: Arc<WorkStealingThreadPool>) -> Self {
261 self.thread_pool = Some(pool);
262 self
263 }
264
265 pub fn block_size(mut self, size: usize) -> Self {
267 self.block_size = size.max(8);
268 self
269 }
270
271 pub fn use_simd(mut self, enable: bool) -> Self {
273 self.use_simd = enable;
274 self
275 }
276
277 pub fn block_matmul(
279 &self,
280 a: &Array2<f64>,
281 b: &Array2<f64>,
282 ) -> Result<Array2<f64>, SklearsError> {
283 let (m, k) = a.dim();
284 let (k2, n) = b.dim();
285
286 if k != k2 {
287 return Err(SklearsError::InvalidInput(
288 "Matrix dimensions don't match for multiplication".to_string(),
289 ));
290 }
291
292 if let Some(pool) = &self.thread_pool {
293 self.parallel_block_matmul(a, b, pool)
294 } else {
295 let mut c = Array2::zeros((m, n));
296 self.sequential_block_matmul(a, b, &mut c);
297 Ok(c)
298 }
299 }
300
    /// Parallel blocked GEMM: one pool task per (i, j) output tile.
    ///
    /// Each task accumulates its full tile locally over all k-blocks and
    /// then copies it into the shared result under a single lock, so tiles
    /// never overlap and plain assignment (not accumulation) is correct.
    /// The inputs are cloned into `Arc`s because pool tasks are `'static`.
    fn parallel_block_matmul(
        &self,
        a: &Array2<f64>,
        b: &Array2<f64>,
        pool: &WorkStealingThreadPool,
    ) -> Result<Array2<f64>, SklearsError> {
        let (m, k) = a.dim();
        let (_, n) = b.dim();

        // Ceiling division: number of tiles along each dimension.
        let block_size = self.block_size;
        let m_blocks = (m + block_size - 1) / block_size;
        let n_blocks = (n + block_size - 1) / block_size;
        let k_blocks = (k + block_size - 1) / block_size;

        let result = Arc::new(Mutex::new(Array2::zeros((m, n))));

        let mut tasks = Vec::new();

        // Owned copies shared across the 'static task closures.
        let a_shared = Arc::new(a.clone());
        let b_shared = Arc::new(b.clone());

        for i_block in 0..m_blocks {
            for j_block in 0..n_blocks {
                let a_ref = a_shared.clone();
                let b_ref = b_shared.clone();
                let result_ref = result.clone();

                let block_size_local = block_size;

                let task = move || {
                    // Bounds of this task's output tile (clamped at edges).
                    let i_start = i_block * block_size_local;
                    let i_end = ((i_block + 1) * block_size_local).min(m);
                    let j_start = j_block * block_size_local;
                    let j_end = ((j_block + 1) * block_size_local).min(n);

                    // Accumulate the tile privately — no locking in the hot loop.
                    let mut local_result = Array2::zeros((i_end - i_start, j_end - j_start));

                    for k_block in 0..k_blocks {
                        let k_start = k_block * block_size_local;
                        let k_end = ((k_block + 1) * block_size_local).min(k);

                        for i in 0..(i_end - i_start) {
                            for j in 0..(j_end - j_start) {
                                let mut sum = 0.0;
                                for k_idx in k_start..k_end {
                                    sum +=
                                        a_ref[[i_start + i, k_idx]] * b_ref[[k_idx, j_start + j]];
                                }
                                local_result[[i, j]] += sum;
                            }
                        }
                    }

                    // Single locked copy-out per tile.
                    {
                        let mut result_guard = result_ref.lock().unwrap();
                        for i in 0..(i_end - i_start) {
                            for j in 0..(j_end - j_start) {
                                result_guard[[i_start + i, j_start + j]] = local_result[[i, j]];
                            }
                        }
                    }
                };

                tasks.push(task);
            }
        }

        // Blocks until every tile task has completed.
        pool.execute_parallel(tasks);

        let final_result = result.lock().unwrap().clone();
        Ok(final_result)
    }
380
381 fn sequential_block_matmul(&self, a: &Array2<f64>, b: &Array2<f64>, c: &mut Array2<f64>) {
382 let (m, k) = a.dim();
383 let (_, n) = b.dim();
384 let block_size = self.block_size;
385
386 for i_block in (0..m).step_by(block_size) {
388 for j_block in (0..n).step_by(block_size) {
389 for k_block in (0..k).step_by(block_size) {
390 let i_end = (i_block + block_size).min(m);
391 let j_end = (j_block + block_size).min(n);
392 let k_end = (k_block + block_size).min(k);
393
394 for i in i_block..i_end {
396 for j in j_block..j_end {
397 let mut sum = 0.0;
398 for k_idx in k_block..k_end {
399 sum += a[[i, k_idx]] * b[[k_idx, j]];
400 }
401 c[[i, j]] += sum;
402 }
403 }
404 }
405 }
406 }
407 }
408
409 pub fn simd_dot_product(&self, a: &Array1<f64>, b: &Array1<f64>) -> Result<f64, SklearsError> {
411 if a.len() != b.len() {
412 return Err(SklearsError::InvalidInput(
413 "Vector lengths must match".to_string(),
414 ));
415 }
416
417 if !self.use_simd {
418 return Ok(a.dot(b));
419 }
420
421 let n = a.len();
423 let mut sum = 0.0;
424
425 let chunks = n / 4;
427 let remainder = n % 4;
428
429 for i in 0..chunks {
430 let idx = i * 4;
431 sum += a[idx] * b[idx]
432 + a[idx + 1] * b[idx + 1]
433 + a[idx + 2] * b[idx + 2]
434 + a[idx + 3] * b[idx + 3];
435 }
436
437 for i in (chunks * 4)..n {
439 sum += a[i] * b[i];
440 }
441
442 Ok(sum)
443 }
444
445 pub fn cache_friendly_transpose(&self, matrix: &Array2<f64>) -> Array2<f64> {
447 let (m, n) = matrix.dim();
448 let mut result = Array2::zeros((n, m));
449 let block_size = self.block_size;
450
451 for i_block in (0..m).step_by(block_size) {
453 for j_block in (0..n).step_by(block_size) {
454 let i_end = (i_block + block_size).min(m);
455 let j_end = (j_block + block_size).min(n);
456
457 for i in i_block..i_end {
458 for j in j_block..j_end {
459 result[[j, i]] = matrix[[i, j]];
460 }
461 }
462 }
463 }
464
465 result
466 }
467}
468
impl Default for OptimizedMatrixOps {
    /// Equivalent to [`OptimizedMatrixOps::new`].
    fn default() -> Self {
        Self::new()
    }
}
474
#[derive(Debug, Clone)]
/// Multi-threaded symmetric eigendecomposition with a choice of algorithm
/// (see [`EigenMethod`]). Configured through builder methods; `solve`
/// returns eigenpairs sorted by descending eigenvalue magnitude.
pub struct ParallelEigenSolver {
    // Worker count for scoped-thread scans and the lazily created pool.
    n_threads: usize,
    // Base-case size for divide-and-conquer and the matrix-op tile size.
    block_size: usize,
    // Convergence / symmetry-check threshold.
    tolerance: f64,
    // Cap on sweeps / QR iterations / power-iteration steps.
    max_iterations: usize,
    method: EigenMethod,
    // Shared pool, if attached; also wired into `matrix_ops`.
    thread_pool: Option<Arc<WorkStealingThreadPool>>,
    matrix_ops: OptimizedMatrixOps,
}
514
#[derive(Debug, Clone, Copy, PartialEq)]
/// Algorithm used by [`ParallelEigenSolver::solve`].
pub enum EigenMethod {
    /// Cyclic Jacobi rotations (robust for symmetric matrices).
    Jacobi,
    /// Unshifted QR iteration with Householder factorizations.
    QR,
    /// Recursive block split; see `solve_divide_conquer` for caveats.
    DivideConquer,
    /// Power iteration with rank-one deflation (dominant pairs only).
    Power,
}
527
528impl ParallelEigenSolver {
    /// Default solver: thread count from `available_parallelism` (fallback
    /// 4), 64-element blocks, 1e-10 tolerance, 1000 max iterations, and
    /// the divide-and-conquer method. No thread pool is created until one
    /// is attached or `n_threads` is called.
    pub fn new() -> Self {
        let n_threads = std::thread::available_parallelism()
            .map(|n| n.get())
            .unwrap_or(4);

        Self {
            n_threads,
            block_size: 64,
            tolerance: 1e-10,
            max_iterations: 1000,
            method: EigenMethod::DivideConquer,
            thread_pool: None,
            matrix_ops: OptimizedMatrixOps::new(),
        }
    }
545
546 pub fn with_thread_pool(mut self, pool: Arc<WorkStealingThreadPool>) -> Self {
548 self.thread_pool = Some(pool.clone());
549 self.matrix_ops = self.matrix_ops.with_thread_pool(pool);
550 self
551 }
552
553 pub fn n_threads(mut self, n_threads: usize) -> Self {
555 self.n_threads = n_threads.max(1);
556 if self.thread_pool.is_none() {
558 let pool = Arc::new(WorkStealingThreadPool::new(self.n_threads));
559 self.thread_pool = Some(pool.clone());
560 self.matrix_ops = self.matrix_ops.with_thread_pool(pool);
561 }
562 self
563 }
564
565 pub fn block_size(mut self, block_size: usize) -> Self {
567 self.block_size = block_size.max(8);
568 self
569 }
570
571 pub fn tolerance(mut self, tol: f64) -> Self {
573 self.tolerance = tol;
574 self
575 }
576
577 pub fn max_iterations(mut self, max_iter: usize) -> Self {
579 self.max_iterations = max_iter;
580 self
581 }
582
583 pub fn method(mut self, method: EigenMethod) -> Self {
585 self.method = method;
586 self
587 }
588
589 pub fn solve(&self, matrix: &Array2<f64>) -> Result<(Array1<f64>, Array2<f64>), SklearsError> {
591 let n = matrix.nrows();
592
593 if n != matrix.ncols() {
594 return Err(SklearsError::InvalidInput(
595 "Matrix must be square for eigenvalue decomposition".to_string(),
596 ));
597 }
598
599 if !self.is_symmetric(matrix) {
600 return Err(SklearsError::InvalidInput(
601 "Matrix must be symmetric for this implementation".to_string(),
602 ));
603 }
604
605 match self.method {
606 EigenMethod::Jacobi => self.solve_jacobi(matrix),
607 EigenMethod::QR => self.solve_qr(matrix),
608 EigenMethod::DivideConquer => self.solve_divide_conquer(matrix),
609 EigenMethod::Power => self.solve_power(matrix),
610 }
611 }
612
613 fn is_symmetric(&self, matrix: &Array2<f64>) -> bool {
614 let n = matrix.nrows();
615 for i in 0..n {
616 for j in 0..n {
617 if (matrix[[i, j]] - matrix[[j, i]]).abs() > self.tolerance {
618 return false;
619 }
620 }
621 }
622 true
623 }
624
625 fn solve_jacobi(
626 &self,
627 matrix: &Array2<f64>,
628 ) -> Result<(Array1<f64>, Array2<f64>), SklearsError> {
629 let n = matrix.nrows();
630 let mut a = matrix.clone();
631 let mut v = Array2::eye(n);
632
633 for _iteration in 0..self.max_iterations {
634 let mut max_off_diagonal = 0.0;
635 let mut rotation_pairs: Vec<(usize, usize, f64)> = Vec::new();
636
637 let pairs = Arc::new(Mutex::new(Vec::new()));
639 let chunk_size = n / self.n_threads + 1;
640
641 thread::scope(|s| {
642 for thread_id in 0..self.n_threads {
643 let pairs_clone = Arc::clone(&pairs);
644 let a_ref = &a;
645
646 s.spawn(move || {
647 let start = thread_id * chunk_size;
648 let end = ((thread_id + 1) * chunk_size).min(n);
649 let mut local_pairs = Vec::new();
650
651 for i in start..end {
652 for j in i + 1..n {
653 let off_diag = a_ref[[i, j]].abs();
654 if off_diag > self.tolerance {
655 local_pairs.push((i, j, off_diag));
656 }
657 }
658 }
659
660 let mut pairs_guard = pairs_clone.lock().unwrap();
661 pairs_guard.extend(local_pairs);
662 });
663 }
664 });
665
666 let pairs_vec = pairs.lock().unwrap().clone();
667
668 if pairs_vec.is_empty() {
669 break; }
671
672 max_off_diagonal = pairs_vec.iter().map(|&(_, _, val)| val).fold(0.0, f64::max);
674
675 if max_off_diagonal < self.tolerance {
676 break; }
678
679 self.apply_jacobi_rotations(&mut a, &mut v, &pairs_vec)?;
681 }
682
683 let mut eigenvalues = Array1::zeros(n);
685 for i in 0..n {
686 eigenvalues[i] = a[[i, i]];
687 }
688
689 self.sort_eigen_pairs(&mut eigenvalues, &mut v)?;
691
692 Ok((eigenvalues, v))
693 }
694
695 fn apply_jacobi_rotations(
696 &self,
697 a: &mut Array2<f64>,
698 v: &mut Array2<f64>,
699 pairs: &[(usize, usize, f64)],
700 ) -> Result<(), SklearsError> {
701 let n = a.nrows();
702
703 let mut groups = Vec::new();
705 let mut used_indices = vec![false; n];
706
707 for &(i, j, _) in pairs {
708 if !used_indices[i] && !used_indices[j] {
709 groups.push((i, j));
710 used_indices[i] = true;
711 used_indices[j] = true;
712 }
713 }
714
715 for (i, j) in groups {
719 self.apply_single_jacobi_rotation(a, v, i, j);
720 }
721
722 Ok(())
723 }
724
    /// Applies one Jacobi rotation in the (p, q) plane, annihilating
    /// a[p, q], and accumulates the same rotation into the eigenvector
    /// matrix `v`.
    ///
    /// The rotation angle is the classical Jacobi choice
    /// theta = 0.5 * atan(2 a_pq / (a_qq - a_pp)), with pi/4 used when the
    /// diagonal entries (nearly) coincide to avoid dividing by ~0.
    fn apply_single_jacobi_rotation(
        &self,
        a: &mut Array2<f64>,
        v: &mut Array2<f64>,
        p: usize,
        q: usize,
    ) {
        let n = a.nrows();

        // Already (numerically) zero: nothing to rotate.
        if a[[p, q]].abs() < self.tolerance {
            return;
        }

        let theta = if (a[[p, p]] - a[[q, q]]).abs() < self.tolerance {
            // Degenerate diagonal: the limiting angle is pi/4.
            std::f64::consts::PI / 4.0
        } else {
            0.5 * (2.0 * a[[p, q]] / (a[[q, q]] - a[[p, p]])).atan()
        };

        let cos_theta = theta.cos();
        let sin_theta = theta.sin();

        // Update the 2x2 pivot block; a_pq is zeroed by construction.
        let a_pp = a[[p, p]];
        let a_qq = a[[q, q]];
        let a_pq = a[[p, q]];

        a[[p, p]] = cos_theta * cos_theta * a_pp + sin_theta * sin_theta * a_qq
            - 2.0 * cos_theta * sin_theta * a_pq;
        a[[q, q]] = sin_theta * sin_theta * a_pp
            + cos_theta * cos_theta * a_qq
            + 2.0 * cos_theta * sin_theta * a_pq;
        a[[p, q]] = 0.0;
        a[[q, p]] = 0.0;

        // Rotate the remaining rows/columns, mirroring to keep symmetry.
        for i in 0..n {
            if i != p && i != q {
                let a_ip = a[[i, p]];
                let a_iq = a[[i, q]];

                a[[i, p]] = cos_theta * a_ip - sin_theta * a_iq;
                a[[p, i]] = a[[i, p]];

                a[[i, q]] = sin_theta * a_ip + cos_theta * a_iq;
                a[[q, i]] = a[[i, q]];
            }
        }

        // Accumulate the rotation into the eigenvector basis.
        for i in 0..n {
            let v_ip = v[[i, p]];
            let v_iq = v[[i, q]];

            v[[i, p]] = cos_theta * v_ip - sin_theta * v_iq;
            v[[i, q]] = sin_theta * v_ip + cos_theta * v_iq;
        }
    }
784
    /// Eigendecomposition via the (unshifted) QR iteration.
    ///
    /// Repeats A -> R*Q (a similarity transform), which drives a symmetric
    /// matrix toward diagonal form; the accumulated Q factors converge to
    /// the eigenvector basis. NOTE(review): no shifts are used, so
    /// convergence can be slow for clustered eigenvalues.
    fn solve_qr(&self, matrix: &Array2<f64>) -> Result<(Array1<f64>, Array2<f64>), SklearsError> {
        let n = matrix.nrows();
        let mut a = matrix.clone();
        let mut q_total = Array2::eye(n);

        for _iteration in 0..self.max_iterations {
            // Factor A = QR, then form the similar matrix RQ.
            let (q, r) = self.parallel_qr_decomposition(&a)?;

            a = r.dot(&q);

            // Accumulate the orthogonal basis.
            q_total = q_total.dot(&q);

            // Converged when the strict lower triangle is (numerically) zero.
            let mut converged = true;
            for i in 0..n {
                for j in 0..i {
                    if a[[i, j]].abs() > self.tolerance {
                        converged = false;
                        break;
                    }
                }
                if !converged {
                    break;
                }
            }

            if converged {
                break;
            }
        }

        // Eigenvalues sit on the diagonal of the (near-)triangular iterate.
        let mut eigenvalues = Array1::zeros(n);
        for i in 0..n {
            eigenvalues[i] = a[[i, i]];
        }

        self.sort_eigen_pairs(&mut eigenvalues, &mut q_total)?;

        Ok((eigenvalues, q_total))
    }
830
    /// Full QR factorization via Householder reflections.
    ///
    /// NOTE(review): despite the name, this runs sequentially — each
    /// column's reflector depends on the previous ones, so the loop is
    /// inherently ordered; any parallelism would have to live inside the
    /// reflection applications.
    fn parallel_qr_decomposition(
        &self,
        matrix: &Array2<f64>,
    ) -> Result<(Array2<f64>, Array2<f64>), SklearsError> {
        let n = matrix.nrows();
        let mut q = Array2::eye(n);
        let mut r = matrix.clone();

        for k in 0..n {
            // Reflector that zeroes column k below the diagonal.
            let column = r.slice(s![k.., k]).to_owned();
            let (householder_v, beta) = self.compute_householder_vector(&column)?;

            // R <- H_k * R
            self.apply_householder_reflection(&mut r, &householder_v, beta, k)?;

            // Q <- Q * H_k (accumulated orthogonal factor)
            self.apply_householder_reflection_to_q(&mut q, &householder_v, beta, k)?;
        }

        Ok((q, r))
    }
854
    /// Computes a unit Householder vector `v` and scale `beta` such that
    /// (I - beta v v^T) x is a multiple of the first basis vector.
    ///
    /// The shift sign matches x[0] to avoid catastrophic cancellation;
    /// with `v` normalized, `beta` is the constant 2. A (near-)zero input
    /// yields a zero vector with beta = 0, i.e. the identity reflection.
    ///
    /// # Errors
    /// `InvalidInput` on an empty vector.
    fn compute_householder_vector(
        &self,
        x: &Array1<f64>,
    ) -> Result<(Array1<f64>, f64), SklearsError> {
        let n = x.len();
        if n == 0 {
            return Err(SklearsError::InvalidInput("Empty vector".to_string()));
        }

        let norm_x = (x.dot(x)).sqrt();
        if norm_x < self.tolerance {
            // Nothing to reflect.
            return Ok((Array1::zeros(n), 0.0));
        }

        let mut v = x.clone();
        let sign = if x[0] >= 0.0 { 1.0 } else { -1.0 };
        v[0] += sign * norm_x;

        let norm_v = (v.dot(&v)).sqrt();
        if norm_v < self.tolerance {
            return Ok((Array1::zeros(n), 0.0));
        }

        v /= norm_v;
        // For a unit reflector, the Householder scale is exactly 2.
        let beta = 2.0;

        Ok((v, beta))
    }
883
884 fn apply_householder_reflection(
885 &self,
886 matrix: &mut Array2<f64>,
887 v: &Array1<f64>,
888 beta: f64,
889 start_row: usize,
890 ) -> Result<(), SklearsError> {
891 let (m, n) = matrix.dim();
892 let reflection_size = v.len();
893
894 if start_row + reflection_size > m {
895 return Ok(()); }
897
898 for j in 0..n {
900 let mut dot_product = 0.0;
901 for i in 0..reflection_size {
902 dot_product += v[i] * matrix[[start_row + i, j]];
903 }
904
905 for i in 0..reflection_size {
906 matrix[[start_row + i, j]] -= beta * v[i] * dot_product;
907 }
908 }
909
910 Ok(())
911 }
912
913 fn apply_householder_reflection_to_q(
914 &self,
915 q: &mut Array2<f64>,
916 v: &Array1<f64>,
917 beta: f64,
918 start_col: usize,
919 ) -> Result<(), SklearsError> {
920 let (m, n) = q.dim();
921 let reflection_size = v.len();
922
923 if start_col + reflection_size > n {
924 return Ok(()); }
926
927 for i in 0..m {
929 let mut dot_product = 0.0;
930 for j in 0..reflection_size {
931 dot_product += q[[i, start_col + j]] * v[j];
932 }
933
934 for j in 0..reflection_size {
935 q[[i, start_col + j]] -= beta * dot_product * v[j];
936 }
937 }
938
939 Ok(())
940 }
941
    /// Recursive divide-and-conquer driver: splits the matrix into four
    /// quadrants, solves the two diagonal blocks concurrently on scoped
    /// threads, and merges the sub-solutions.
    ///
    /// NOTE(review): the merge step ignores the off-diagonal coupling
    /// blocks (`top_right` / `bottom_left` are passed but unused), so the
    /// result is exact only for (near) block-diagonal matrices and an
    /// approximation otherwise — confirm this is intended before relying
    /// on it for general symmetric inputs.
    fn solve_divide_conquer(
        &self,
        matrix: &Array2<f64>,
    ) -> Result<(Array1<f64>, Array2<f64>), SklearsError> {
        let n = matrix.nrows();

        // Base case: small problems go straight to Jacobi.
        if n <= self.block_size {
            return self.solve_jacobi(matrix);
        }

        let mid = n / 2;

        let top_left = matrix.slice(s![0..mid, 0..mid]).to_owned();
        let top_right = matrix.slice(s![0..mid, mid..]).to_owned();
        let bottom_left = matrix.slice(s![mid.., 0..mid]).to_owned();
        let bottom_right = matrix.slice(s![mid.., mid..]).to_owned();

        // Tagged results: 0 = top-left block, 1 = bottom-right block.
        let results = Arc::new(Mutex::new(Vec::new()));

        thread::scope(|s| {
            {
                let results_clone = Arc::clone(&results);
                let top_left_owned = top_left.clone();
                s.spawn(move || {
                    if let Ok((evals, evecs)) = self.solve_divide_conquer(&top_left_owned) {
                        let mut results_guard = results_clone.lock().unwrap();
                        results_guard.push((0, evals, evecs));
                    }
                });
            }

            {
                let results_clone = Arc::clone(&results);
                let bottom_right_owned = bottom_right.clone();
                s.spawn(move || {
                    if let Ok((evals, evecs)) = self.solve_divide_conquer(&bottom_right_owned) {
                        let mut results_guard = results_clone.lock().unwrap();
                        results_guard.push((1, evals, evecs));
                    }
                });
            }
        });

        let results_vec = results.lock().unwrap().clone();

        // A failed sub-solve simply pushes nothing; detect that here.
        if results_vec.len() != 2 {
            return Err(SklearsError::InvalidInput(
                "Failed to solve sub-problems".to_string(),
            ));
        }

        self.merge_eigenvalue_solutions(&results_vec, &top_right, &bottom_left, mid)
    }
1001
    /// Combines the two diagonal-block solutions into a full-size result.
    ///
    /// NOTE(review): `_top_right` / `_bottom_left` are accepted but never
    /// used — the coupling between blocks is ignored, so the merged
    /// eigenpairs are exact only when the original matrix is block
    /// diagonal. Eigenvectors are zero-padded into their block's rows.
    fn merge_eigenvalue_solutions(
        &self,
        sub_results: &[(usize, Array1<f64>, Array2<f64>)],
        _top_right: &Array2<f64>,
        _bottom_left: &Array2<f64>,
        mid: usize,
    ) -> Result<(Array1<f64>, Array2<f64>), SklearsError> {
        // Split the tagged results back into top (0) and bottom (1).
        let mut top_result = None;
        let mut bottom_result = None;

        for (id, evals, evecs) in sub_results {
            if *id == 0 {
                top_result = Some((evals.clone(), evecs.clone()));
            } else if *id == 1 {
                bottom_result = Some((evals.clone(), evecs.clone()));
            }
        }

        let (top_evals, top_evecs) = top_result
            .ok_or_else(|| SklearsError::InvalidInput("Missing top sub-result".to_string()))?;

        let (bottom_evals, bottom_evecs) = bottom_result
            .ok_or_else(|| SklearsError::InvalidInput("Missing bottom sub-result".to_string()))?;

        let total_size = top_evals.len() + bottom_evals.len();
        let mut combined_evals = Array1::zeros(total_size);
        let mut combined_evecs = Array2::zeros((total_size, total_size));

        // Top block occupies rows 0..mid of its eigenvector columns.
        for i in 0..top_evals.len() {
            combined_evals[i] = top_evals[i];
            for j in 0..top_evecs.nrows() {
                combined_evecs[[j, i]] = top_evecs[[j, i]];
            }
        }

        // Bottom block occupies rows mid.. of its eigenvector columns.
        for i in 0..bottom_evals.len() {
            combined_evals[top_evals.len() + i] = bottom_evals[i];
            for j in 0..bottom_evecs.nrows() {
                combined_evecs[[mid + j, top_evals.len() + i]] = bottom_evecs[[j, i]];
            }
        }

        self.sort_eigen_pairs(&mut combined_evals, &mut combined_evecs)?;

        Ok((combined_evals, combined_evecs))
    }
1053
1054 fn solve_power(
1055 &self,
1056 matrix: &Array2<f64>,
1057 ) -> Result<(Array1<f64>, Array2<f64>), SklearsError> {
1058 let n = matrix.nrows();
1059 let mut eigenvalues = Array1::zeros(n);
1060 let mut eigenvectors = Array2::zeros((n, n));
1061
1062 let mut deflated_matrix = matrix.clone();
1064
1065 for k in 0..n.min(self.n_threads) {
1066 let mut v = Array1::from_vec((0..n).map(|i| (i + k + 1) as f64).collect());
1068 v /= (v.dot(&v)).sqrt();
1069
1070 let mut lambda = 0.0;
1071
1072 for _iter in 0..self.max_iterations {
1073 let av = deflated_matrix.dot(&v);
1074 let new_lambda = v.dot(&av);
1075
1076 let av_norm = av.dot(&av).sqrt();
1077 let new_v = av / av_norm;
1078
1079 if (new_lambda - lambda).abs() < self.tolerance {
1080 lambda = new_lambda;
1081 v = new_v;
1082 break;
1083 }
1084
1085 lambda = new_lambda;
1086 v = new_v;
1087 }
1088
1089 eigenvalues[k] = lambda;
1090 eigenvectors.column_mut(k).assign(&v);
1091
1092 let vvt = v
1094 .clone()
1095 .insert_axis(Axis(1))
1096 .dot(&v.clone().insert_axis(Axis(0)));
1097 deflated_matrix = &deflated_matrix - &(lambda * vvt);
1098 }
1099
1100 for k in self.n_threads..n {
1102 eigenvalues[k] = deflated_matrix[[k, k]];
1103 eigenvectors[[k, k]] = 1.0;
1104 }
1105
1106 Ok((eigenvalues, eigenvectors))
1107 }
1108
1109 fn sort_eigen_pairs(
1110 &self,
1111 eigenvalues: &mut Array1<f64>,
1112 eigenvectors: &mut Array2<f64>,
1113 ) -> Result<(), SklearsError> {
1114 let n = eigenvalues.len();
1115 let mut indices: Vec<usize> = (0..n).collect();
1116
1117 indices.sort_by(|&i, &j| {
1119 eigenvalues[j]
1120 .abs()
1121 .partial_cmp(&eigenvalues[i].abs())
1122 .unwrap()
1123 });
1124
1125 let sorted_eigenvalues = indices.iter().map(|&i| eigenvalues[i]).collect::<Vec<_>>();
1127 for (i, &val) in sorted_eigenvalues.iter().enumerate() {
1128 eigenvalues[i] = val;
1129 }
1130
1131 let mut sorted_eigenvectors = Array2::zeros((n, n));
1133 for (new_col, &old_col) in indices.iter().enumerate() {
1134 sorted_eigenvectors
1135 .column_mut(new_col)
1136 .assign(&eigenvectors.column(old_col));
1137 }
1138 *eigenvectors = sorted_eigenvectors;
1139
1140 Ok(())
1141 }
1142}
1143
impl Default for ParallelEigenSolver {
    /// Equivalent to [`ParallelEigenSolver::new`].
    fn default() -> Self {
        Self::new()
    }
}
1149
#[derive(Debug, Clone)]
/// Singular value decomposition built on top of [`ParallelEigenSolver`],
/// with a choice of algorithm (see [`SVDAlgorithm`]).
pub struct ParallelSVD {
    // Forwarded to the internal eigen solver.
    n_threads: usize,
    algorithm: SVDAlgorithm,
    // Threshold below which singular values are treated as zero.
    tolerance: f64,
    max_iterations: usize,
}
1161
#[derive(Debug, Clone, Copy, PartialEq)]
/// Algorithm used by [`ParallelSVD::decompose`].
pub enum SVDAlgorithm {
    /// Eigendecomposition of A A^T (see `golub_kahan_svd` for caveats).
    GolubKahan,
    /// Jacobi eigendecomposition of A^T A.
    Jacobi,
    /// Randomized range sketch followed by a dense SVD of the projection.
    Randomized,
}
1172
1173impl ParallelSVD {
    /// Default configuration: thread count from `available_parallelism`
    /// (fallback 4), Golub-Kahan-style algorithm, 1e-10 tolerance,
    /// 1000 max iterations.
    pub fn new() -> Self {
        Self {
            n_threads: std::thread::available_parallelism()
                .map(|n| n.get())
                .unwrap_or(4),
            algorithm: SVDAlgorithm::GolubKahan,
            tolerance: 1e-10,
            max_iterations: 1000,
        }
    }
1185
1186 pub fn n_threads(mut self, n_threads: usize) -> Self {
1188 self.n_threads = n_threads.max(1);
1189 self
1190 }
1191
1192 pub fn algorithm(mut self, algorithm: SVDAlgorithm) -> Self {
1194 self.algorithm = algorithm;
1195 self
1196 }
1197
1198 pub fn tolerance(mut self, tol: f64) -> Self {
1200 self.tolerance = tol;
1201 self
1202 }
1203
1204 pub fn max_iterations(mut self, max_iter: usize) -> Self {
1206 self.max_iterations = max_iter;
1207 self
1208 }
1209
    /// Computes the SVD using the configured algorithm.
    ///
    /// Returns `(U, S, V)` where the columns of U and V are the left and
    /// right singular vectors and S holds the singular values.
    pub fn decompose(
        &self,
        matrix: &Array2<f64>,
    ) -> Result<(Array2<f64>, Array1<f64>, Array2<f64>), SklearsError> {
        match self.algorithm {
            SVDAlgorithm::GolubKahan => self.golub_kahan_svd(matrix),
            SVDAlgorithm::Jacobi => self.jacobi_svd(matrix),
            SVDAlgorithm::Randomized => self.randomized_svd(matrix),
        }
    }
1221
    /// SVD via the eigendecomposition of A A^T.
    ///
    /// NOTE(review): this is not a true Golub-Kahan bidiagonalization —
    /// forming A A^T squares the condition number, so small singular
    /// values lose accuracy. U comes from the eigenvectors of A A^T;
    /// right singular vectors are recovered as v_i = A^T u_i / sigma_i,
    /// with columns left zero when sigma_i is below tolerance.
    fn golub_kahan_svd(
        &self,
        matrix: &Array2<f64>,
    ) -> Result<(Array2<f64>, Array1<f64>, Array2<f64>), SklearsError> {
        let (m, n) = matrix.dim();
        let min_dim = m.min(n);

        let aat = matrix.dot(&matrix.t());
        let eigen_solver = ParallelEigenSolver::new().n_threads(self.n_threads);
        let (eigenvalues, u) = eigen_solver.solve(&aat)?;

        // Singular values: sqrt of the (clamped non-negative) eigenvalues.
        let singular_values = eigenvalues.mapv(|x| x.max(0.0).sqrt());

        let at = matrix.t();
        let mut v = Array2::zeros((n, min_dim));

        for i in 0..min_dim {
            if singular_values[i] > self.tolerance {
                let v_col = at.dot(&u.column(i)) / singular_values[i];
                v.column_mut(i).assign(&v_col);
            }
        }

        Ok((u, singular_values, v))
    }
1249
    /// SVD via the Jacobi eigendecomposition of A^T A.
    ///
    /// V comes from the eigenvectors of A^T A; left singular vectors are
    /// recovered as u_i = A v_i / sigma_i, with columns left zero when
    /// sigma_i is below tolerance. Same squared-condition-number caveat as
    /// `golub_kahan_svd`.
    fn jacobi_svd(
        &self,
        matrix: &Array2<f64>,
    ) -> Result<(Array2<f64>, Array1<f64>, Array2<f64>), SklearsError> {
        let ata = matrix.t().dot(matrix);
        let eigen_solver = ParallelEigenSolver::new()
            .n_threads(self.n_threads)
            .method(EigenMethod::Jacobi);

        let (eigenvalues, v) = eigen_solver.solve(&ata)?;
        let singular_values = eigenvalues.mapv(|x| x.max(0.0).sqrt());

        let (m, n) = matrix.dim();
        let min_dim = m.min(n);
        let mut u = Array2::zeros((m, min_dim));

        for i in 0..min_dim {
            if singular_values[i] > self.tolerance {
                let u_col = matrix.dot(&v.column(i)) / singular_values[i];
                u.column_mut(i).assign(&u_col);
            }
        }

        Ok((u, singular_values, v))
    }
1277
1278 fn randomized_svd(
1279 &self,
1280 matrix: &Array2<f64>,
1281 ) -> Result<(Array2<f64>, Array1<f64>, Array2<f64>), SklearsError> {
1282 let (m, n) = matrix.dim();
1283 let k = (m.min(n) / 2).max(1); let mut omega = Array2::zeros((n, k));
1287 for i in 0..n {
1288 for j in 0..k {
1289 omega[[i, j]] = ((i * 7 + j * 13) % 1000) as f64 / 1000.0 - 0.5;
1291 }
1292 }
1293
1294 let y = matrix.dot(&omega);
1296
1297 let eigen_solver = ParallelEigenSolver::new().n_threads(self.n_threads);
1299 let (q, _) = ParallelSVD::new().parallel_qr_thin(&y)?;
1300
1301 let b = q.t().dot(matrix);
1303
1304 let (u_b, s, vt) = self.golub_kahan_svd(&b)?;
1306
1307 let u = q.dot(&u_b);
1309
1310 Ok((u, s, vt))
1311 }
1312
1313 fn parallel_qr_thin(
1314 &self,
1315 matrix: &Array2<f64>,
1316 ) -> Result<(Array2<f64>, Array2<f64>), SklearsError> {
1317 let (m, n) = matrix.dim();
1318 let mut q = Array2::zeros((m, n));
1319 let mut r = Array2::zeros((n, n));
1320
1321 for j in 0..n {
1323 let mut v = matrix.column(j).to_owned();
1324
1325 for i in 0..j {
1327 let q_i = q.column(i);
1328 let proj = q_i.dot(&v);
1329 r[[i, j]] = proj;
1330
1331 for (v_elem, &q_elem) in v.iter_mut().zip(q_i.iter()) {
1333 *v_elem -= proj * q_elem;
1334 }
1335 }
1336
1337 let norm = (v.dot(&v)).sqrt();
1339 r[[j, j]] = norm;
1340
1341 if norm > self.tolerance {
1342 v /= norm;
1343 }
1344
1345 q.column_mut(j).assign(&v);
1346 }
1347
1348 Ok((q, r))
1349 }
1350}
1351
impl Default for ParallelSVD {
    /// Equivalent to [`ParallelSVD::new`].
    fn default() -> Self {
        Self::new()
    }
}
1357
#[derive(Debug, Clone)]
/// General-purpose matrix helpers: dimension-checked multiply and a
/// blocked, multi-threaded transpose for large matrices.
pub struct ParallelMatrixOps {
    // Thread count for the scoped-thread transpose.
    n_threads: usize,
    // Tile edge length for the blocked transpose (min 8, default 64).
    block_size: usize,
}
1367
1368impl ParallelMatrixOps {
1369 pub fn new() -> Self {
1371 Self {
1372 n_threads: num_cpus::get(),
1373 block_size: 64,
1374 }
1375 }
1376
1377 pub fn n_threads(mut self, n_threads: usize) -> Self {
1379 self.n_threads = n_threads.max(1);
1380 self
1381 }
1382
1383 pub fn block_size(mut self, block_size: usize) -> Self {
1385 self.block_size = block_size.max(8);
1386 self
1387 }
1388
1389 pub fn matmul(&self, a: &Array2<f64>, b: &Array2<f64>) -> Result<Array2<f64>, SklearsError> {
1391 let (m, k) = a.dim();
1392 let (k2, n) = b.dim();
1393
1394 if k != k2 {
1395 return Err(SklearsError::InvalidInput(
1396 "Matrix dimensions don't match for multiplication".to_string(),
1397 ));
1398 }
1399
1400 let c = a.dot(b);
1403
1404 Ok(c)
1405 }
1406
    /// Transpose; small matrices (< 10k elements) or single-threaded
    /// configurations use `ndarray`'s view-based transpose, larger ones
    /// are tiled and written out by scoped threads.
    ///
    /// Each thread gathers its tiles into a local update list and applies
    /// it under one lock acquisition, so contention is one lock per thread
    /// rather than per element.
    pub fn transpose(&self, matrix: &Array2<f64>) -> Array2<f64> {
        let (m, n) = matrix.dim();

        // Fast path: parallelism wouldn't pay for itself.
        if m * n < 10000 || self.n_threads == 1 {
            return matrix.t().to_owned();
        }

        let block_size = self.block_size;

        // Ceiling division: tile counts along each axis.
        let m_blocks = (m + block_size - 1) / block_size;
        let n_blocks = (n + block_size - 1) / block_size;

        let result = Arc::new(Mutex::new(Array2::zeros((n, m))));

        thread::scope(|s| {
            // Contiguous ranges of tile indices, one range per thread.
            let chunks_per_thread = ((m_blocks * n_blocks) + self.n_threads - 1) / self.n_threads;

            for thread_id in 0..self.n_threads {
                let start_block = thread_id * chunks_per_thread;
                let end_block = ((thread_id + 1) * chunks_per_thread).min(m_blocks * n_blocks);
                let result_clone = result.clone();

                s.spawn(move || {
                    // Stage (row, col, value) triples locally; no lock yet.
                    let mut local_updates = Vec::new();

                    for block_idx in start_block..end_block {
                        // Decode the linear tile index into (i, j) tiles.
                        let i_block = block_idx / n_blocks;
                        let j_block = block_idx % n_blocks;

                        let i_start = i_block * block_size;
                        let i_end = ((i_block + 1) * block_size).min(m);
                        let j_start = j_block * block_size;
                        let j_end = ((j_block + 1) * block_size).min(n);

                        for i in i_start..i_end {
                            for j in j_start..j_end {
                                local_updates.push((j, i, matrix[[i, j]]));
                            }
                        }
                    }

                    // One lock acquisition per thread for the write-back.
                    let mut result_guard = result_clone.lock().unwrap();
                    for (row, col, val) in local_updates {
                        result_guard[[row, col]] = val;
                    }
                });
            }
        });

        // All threads have joined, so the Arc is normally unique here;
        // fall back to a clone if a reference somehow survives.
        match Arc::try_unwrap(result) {
            Ok(mutex) => mutex.into_inner().unwrap(),
            Err(arc) => arc.lock().unwrap().clone(),
        }
    }
1469}
1470
1471impl Default for ParallelMatrixOps {
1472 fn default() -> Self {
1473 Self::new()
1474 }
1475}
1476
#[allow(non_snake_case)]
#[cfg(test)]
mod tests {
    use super::*;
    use approx::assert_abs_diff_eq;
    // `Array1` was imported but unused; only `array!` and `Array2` are needed.
    use scirs2_core::ndarray::{array, Array2};

    /// The pool should report exactly the requested worker count.
    #[test]
    fn test_work_stealing_thread_pool_creation() {
        let pool = WorkStealingThreadPool::new(4);
        assert_eq!(pool.n_threads(), 4);
    }

    /// A task submitted via `execute` must eventually run on a worker thread.
    #[test]
    fn test_work_stealing_thread_pool_execute() {
        let pool = WorkStealingThreadPool::new(2);
        let result = Arc::new(Mutex::new(0));

        let result_clone = result.clone();
        pool.execute(move || {
            let mut val = result_clone.lock().unwrap();
            *val += 1;
        });

        // Poll with a bounded deadline instead of one fixed 100 ms sleep:
        // a fixed sleep is flaky on loaded CI machines, while polling
        // succeeds as soon as the worker finishes and fails loudly if the
        // task never runs.
        let deadline = std::time::Instant::now() + std::time::Duration::from_secs(5);
        while *result.lock().unwrap() != 1 {
            assert!(
                std::time::Instant::now() < deadline,
                "submitted task did not complete within the deadline"
            );
            std::thread::sleep(std::time::Duration::from_millis(5));
        }

        assert_eq!(*result.lock().unwrap(), 1);
    }

    /// `execute_parallel` must return results in submission order.
    #[test]
    fn test_work_stealing_thread_pool_parallel_execution() {
        let pool = WorkStealingThreadPool::new(4);

        let tasks: Vec<Box<dyn FnOnce() -> i32 + Send>> = (0..10)
            .map(|i| Box::new(move || i * 2) as Box<dyn FnOnce() -> i32 + Send>)
            .collect();

        let results = pool.execute_parallel(tasks);

        assert_eq!(results.len(), 10);
        for (i, &result) in results.iter().enumerate() {
            assert_eq!(result, i as i32 * 2);
        }
    }

    /// Default configuration: SIMD on, 64-element blocks.
    #[test]
    fn test_optimized_matrix_ops_creation() {
        let ops = OptimizedMatrixOps::new();
        assert!(ops.use_simd);
        assert_eq!(ops.block_size, 64);
    }

    /// Builder methods should override the defaults.
    #[test]
    fn test_optimized_matrix_ops_configuration() {
        let pool = Arc::new(WorkStealingThreadPool::new(4));
        let ops = OptimizedMatrixOps::new()
            .with_thread_pool(pool)
            .block_size(32)
            .use_simd(false);

        assert!(!ops.use_simd);
        assert_eq!(ops.block_size, 32);
        assert!(ops.thread_pool.is_some());
    }

    /// Blocked multiplication on a 2x2 case with a known product.
    #[test]
    fn test_block_matrix_multiplication() {
        let a = array![[1.0, 2.0], [3.0, 4.0]];
        let b = array![[2.0, 0.0], [1.0, 2.0]];

        let ops = OptimizedMatrixOps::new();
        let result = ops.block_matmul(&a, &b).unwrap();

        let expected = array![[4.0, 4.0], [10.0, 8.0]];

        assert_abs_diff_eq!(result, expected, epsilon = 1e-10);
    }

    /// Blocked multiplication must give the same answer when a thread pool
    /// is attached.
    #[test]
    fn test_parallel_block_matrix_multiplication() {
        let pool = Arc::new(WorkStealingThreadPool::new(2));
        let ops = OptimizedMatrixOps::new().with_thread_pool(pool);

        let a = array![[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]];
        let b = array![[7.0, 8.0], [9.0, 10.0], [11.0, 12.0]];

        let result = ops.block_matmul(&a, &b).unwrap();

        let expected = array![[58.0, 64.0], [139.0, 154.0]];

        assert_abs_diff_eq!(result, expected, epsilon = 1e-10);
    }

    /// SIMD dot product against a hand-computed expected value.
    #[test]
    fn test_simd_dot_product() {
        let ops = OptimizedMatrixOps::new();
        let a = array![1.0, 2.0, 3.0, 4.0, 5.0];
        let b = array![2.0, 3.0, 4.0, 5.0, 6.0];

        let result = ops.simd_dot_product(&a, &b).unwrap();
        let expected = 1.0 * 2.0 + 2.0 * 3.0 + 3.0 * 4.0 + 4.0 * 5.0 + 5.0 * 6.0;

        assert_abs_diff_eq!(result, expected, epsilon = 1e-10);
    }

    /// The scalar fallback (SIMD disabled) must agree with ndarray's `dot`.
    #[test]
    fn test_simd_dot_product_disabled() {
        let ops = OptimizedMatrixOps::new().use_simd(false);
        let a = array![1.0, 2.0, 3.0];
        let b = array![4.0, 5.0, 6.0];

        let result = ops.simd_dot_product(&a, &b).unwrap();
        let expected = a.dot(&b);

        assert_abs_diff_eq!(result, expected, epsilon = 1e-10);
    }

    /// Cache-friendly transpose on a small rectangular matrix.
    #[test]
    fn test_cache_friendly_transpose() {
        let ops = OptimizedMatrixOps::new();
        let matrix = array![[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]];

        let result = ops.cache_friendly_transpose(&matrix);
        let expected = array![[1.0, 4.0], [2.0, 5.0], [3.0, 6.0]];

        assert_abs_diff_eq!(result, expected, epsilon = 1e-10);
    }

    /// Eigen decomposition of a symmetric 2x2 matrix via the thread pool:
    /// shapes are checked and eigenvalues must come back sorted descending.
    #[test]
    fn test_parallel_eigen_solver_with_thread_pool() {
        let pool = Arc::new(WorkStealingThreadPool::new(2));
        let solver = ParallelEigenSolver::new()
            .with_thread_pool(pool)
            .tolerance(1e-8);

        let matrix = array![[4.0, 2.0], [2.0, 1.0]];

        let result = solver.solve(&matrix);
        assert!(result.is_ok());

        let (eigenvalues, eigenvectors) = result.unwrap();
        assert_eq!(eigenvalues.len(), 2);
        assert_eq!(eigenvectors.dim(), (2, 2));

        assert!(eigenvalues[0] >= eigenvalues[1]);
    }

    /// Every solver method should handle the identity matrix.
    #[test]
    fn test_parallel_eigen_solver_methods() {
        let solver = ParallelEigenSolver::new();

        let jacobi_solver = solver.clone().method(EigenMethod::Jacobi);
        let qr_solver = solver.clone().method(EigenMethod::QR);
        let power_solver = solver.clone().method(EigenMethod::Power);

        let matrix = Array2::eye(3);

        assert!(jacobi_solver.solve(&matrix).is_ok());
        assert!(qr_solver.solve(&matrix).is_ok());
        assert!(power_solver.solve(&matrix).is_ok());
    }

    /// Non-square and non-symmetric inputs must be rejected.
    #[test]
    fn test_eigen_solver_error_cases() {
        let solver = ParallelEigenSolver::new();

        let non_square = array![[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]];
        assert!(solver.solve(&non_square).is_err());

        let non_symmetric = array![[1.0, 2.0], [3.0, 4.0]];
        assert!(solver.solve(&non_symmetric).is_err());
    }

    /// Dimension mismatches must surface as errors, not panics.
    #[test]
    fn test_matrix_ops_error_cases() {
        let ops = OptimizedMatrixOps::new();

        let a = array![[1.0, 2.0]];
        let b = array![[1.0], [2.0], [3.0]];
        assert!(ops.block_matmul(&a, &b).is_err());

        let v1 = array![1.0, 2.0];
        let v2 = array![1.0, 2.0, 3.0];
        assert!(ops.simd_dot_product(&v1, &v2).is_err());
    }

    /// Coarse smoke check: 1000 trivial tasks complete quickly and each
    /// result lands in its submission slot.
    #[test]
    fn test_thread_pool_performance_characteristics() {
        let n_threads = 4;
        let pool = WorkStealingThreadPool::new(n_threads);

        let n_tasks = 1000;
        let tasks: Vec<Box<dyn FnOnce() -> i32 + Send>> = (0..n_tasks)
            .map(|i| Box::new(move || (i % 100) as i32) as Box<dyn FnOnce() -> i32 + Send>)
            .collect();

        let start = std::time::Instant::now();
        let results = pool.execute_parallel(tasks);
        let duration = start.elapsed();

        assert_eq!(results.len(), n_tasks);

        // Intentionally generous bound — this guards against pathological
        // scheduling (deadlock/serialization), not micro-performance.
        assert!(duration.as_secs() < 1);

        for (i, &result) in results.iter().enumerate() {
            assert_eq!(result, (i % 100) as i32);
        }
    }

    /// Transpose of a 64x64 matrix (small block size forces the blocked
    /// path) must be an exact element-wise mirror of the input.
    #[test]
    fn test_cache_friendly_operations_large_matrix() {
        let ops = OptimizedMatrixOps::new().block_size(16);

        let size = 64;
        let mut matrix = Array2::zeros((size, size));
        for i in 0..size {
            for j in 0..size {
                matrix[[i, j]] = (i * size + j) as f64;
            }
        }

        let transposed = ops.cache_friendly_transpose(&matrix);

        for i in 0..size {
            for j in 0..size {
                assert_abs_diff_eq!(transposed[[j, i]], matrix[[i, j]], epsilon = 1e-10);
            }
        }
    }

    /// Pooled and sequential configurations must produce identical products.
    #[test]
    fn test_optimized_operations_consistency() {
        let pool = Arc::new(WorkStealingThreadPool::new(4));
        let parallel_ops = OptimizedMatrixOps::new().with_thread_pool(pool);
        let sequential_ops = OptimizedMatrixOps::new();

        let a = Array2::from_shape_fn((10, 8), |(i, j)| (i + j) as f64);
        let b = Array2::from_shape_fn((8, 12), |(i, j)| (i * 2 + j) as f64);

        let parallel_result = parallel_ops.block_matmul(&a, &b).unwrap();
        let sequential_result = sequential_ops.block_matmul(&a, &b).unwrap();

        assert_abs_diff_eq!(parallel_result, sequential_result, epsilon = 1e-10);
    }
}