// grassmann/functions/gemm.rs

1extern crate wasm_bindgen;
2extern crate num_cpus;
3use std::{cell::RefCell, future::Future, mem::size_of, pin::Pin, rc::Rc, task::{Context, Poll}};
4
5use num::ToPrimitive;
6use wasm_bindgen::prelude::*;
7use wasm_bindgen::JsCast;
8use js_sys::{Array, Atomics, Float32Array, Float64Array, Int32Array, SharedArrayBuffer, Uint32Array, Uint8Array};
9use rand::prelude::*;
10use rand::Rng;
11use web_sys::Event;
12
13use crate::{Number, core::matrix::{Matrix, add}, workers::{WorkerOperation, Workers}};
14
15
16
//TODO: what can we conclude from the matrices' shapes to help select the multiplication method that maximizes performance?
18
// Binding to the browser's `console.log`, used for progress and diagnostic
// output from the wasm module.
#[wasm_bindgen]
extern "C" {
    #[wasm_bindgen(js_namespace = console)]
    fn log(s: &str);
}
24
25
26
27pub fn pack_mul_task(
28    t: [usize; 8], 
29    sab:&SharedArrayBuffer, 
30    A_rows: u32,
31    A_columns: u32,
32    B_rows: u32,
33    B_columns: u32
34) -> js_sys::Array {
35    let array: js_sys::Array = js_sys::Array::new();
36
37    array.push(&sab);
38    
39    let a_rows = JsValue::from(A_rows);
40    let a_columns = JsValue::from(A_columns);
41
42    let b_rows = JsValue::from(B_rows);
43    let b_columns = JsValue::from(B_columns);
44
45    let t0 = JsValue::from(t[0] as u32);
46    let t1 = JsValue::from(t[1] as u32);
47    let t2 = JsValue::from(t[2] as u32); 
48    let t3 = JsValue::from(t[3] as u32); 
49
50    let t4 = JsValue::from(t[4] as u32); 
51    let t5 = JsValue::from(t[5] as u32); 
52    let t6 = JsValue::from(t[6] as u32); 
53    let t7 = JsValue::from(t[7] as u32); 
54
55    array.push(&a_rows);
56    array.push(&a_columns);
57
58    array.push(&b_rows);
59    array.push(&b_columns);
60
61    array.push(&t0);
62    array.push(&t1);
63    array.push(&t2);
64    array.push(&t3);
65
66    array.push(&t4);
67    array.push(&t5);
68    array.push(&t6);
69    array.push(&t7);
70
71    array
72}
73
74
75
76pub fn division_level<T: Number>(A: &Matrix<T>, optimal_block_size: usize, threads: usize) -> usize {
77    
78    let mut s = optimal_block_size;
79
80    if (s as i32) == -1 {
81        s = 10000;
82    }
83
84    let total_size = A.size();
85
86    let mut blocks = 1;
87
88    let x = total_size / optimal_block_size;
89
90    if x < 1 {
91
92        blocks = 1;
93
94    } else {
95
96        let c = if x > threads { threads } else { x };
97
98        let n = (c as f64).log(4.).ceil();
99
100        blocks = (4. as f64).powf(n) as usize;
101    }
102
103    blocks
104}
105
106
107
108pub fn get_optimal_depth<T: Number> (A: &Matrix<T>, optimal_element_size: usize) -> usize {
109
110    assert_eq!(A.rows, A.columns, "A should be square matrix");
111
112    let p = (optimal_element_size as f64).log(4.).ceil();
113
114    let optimal_element_size = (4. as f64).powf(p) as usize;
115
116    let size = A.rows * A.columns;
117
118    if size < 64 {
119        return 0;
120    }
121    
122    let chunks = size / optimal_element_size;
123
124    if chunks < 2 {
125        return 4;
126    }
127    
128    (chunks as f64).log(4.).ceil() as usize
129}
130
131
132
//TODO: what property of a matrix indicates that blocked multiplication will be advantageous?
/// Blocked (tiled) single-threaded matrix multiplication.
///
/// Both operands are zero-padded into equal power-of-two square matrices
/// (via `augment_sq2n_size`), then multiplied tile by tile with the classic
/// three-loop blocked algorithm, accumulating into `acc`.
///
/// * `optimal_block_size` - requested elements per tile; passed to
///   `division_level` to choose the tile count.
/// * `discard_zero_blocks` - skip a tile pair when either tile sums to zero.
///   NOTE(review): a zero sum does not imply an all-zero tile (entries of
///   mixed sign can cancel) — confirm inputs are non-negative before relying
///   on this shortcut.
/// * `threads` - only bounds the tile count inside `division_level`; this
///   function itself runs on one thread.
///
/// Returns the product at the PADDED size (`A.rows x B.columns`), not the
/// original `a.rows x b.columns` shape; callers must crop if needed.
pub fn mul_blocks<T: Number>(
    a: &mut Matrix<T>, 
    b: &mut Matrix<T>, 
    optimal_block_size: usize,
    discard_zero_blocks: bool,
    threads: usize
) -> Matrix<T> {

    // Pad both operands to the same power-of-two square size so tiles divide
    // the matrices evenly.
    let s1 = Matrix::augment_sq2n_size(&a);

    let s2 = Matrix::augment_sq2n_size(&b);

    let s = std::cmp::max(s1,s2);

    let mut A: Matrix<T> = Matrix::new(s,s); 

    A = &A + a;

    let mut B: Matrix<T> = Matrix::new(s,s); 

    B = &B + b;
    
    let blocks = division_level(&A, optimal_block_size, threads);
    
    unsafe {
        log(&format!("A:({},{}), B:({},{}), size {}, computing with {} blocks", A.rows, A.columns, B.rows, B.columns, A.size(), blocks));
    }
    
    let block_size = A.size() / blocks;

    // Side length (in elements) of one square tile.
    let y = (block_size as f64).sqrt() as usize;

    let mut acc: Matrix<T> = Matrix::new(A.rows, B.columns);

    // i: row tile of A/acc, j: column tile of B/acc, k: inner (shared) tile.
    for i in (0..A.rows).step_by(y) {
        for j in (0..B.columns).step_by(y) {
            for k in (0..A.columns).step_by(y) {
                // Bounds of the current A tile (ax*/ay*) and B tile (bx*/by*).
                let ax0 = k;
                let ax1 = k + y;
                let ay0 = i;
                let ay1 = i + y;
                
                let bx0 = j;
                let bx1 = j + y;
                let by0 = k;
                let by1 = k + y;
                
                if discard_zero_blocks {
                    // Sum each tile; a zero total is taken to mean the tile
                    // contributes nothing (see NOTE(review) above).
                    let mut sa = T::zero();
                    let mut sb = T::zero();

                    for m in ay0..ay1 {
                        for n in ax0..ax1 {
                            sa += A[[m,n]];
                        }
                    }

                    if sa == T::zero() {
                        continue;
                    }
                    
                    for m in by0..by1 {
                        for n in bx0..bx1 {
                            sb += B[[m,n]];
                        }
                    }

                    if sb == T::zero() {
                        continue;
                    }
                }
                
                // Dense tile multiply: acc[m,n] += A[m,p] * B[p,n].
                for m in ay0..ay1 {
                    for n in bx0..bx1 {
                        for p in ax0..ax1 {
                            acc[[m,n]] += A[[m,p]] * B[[p,n]];    
                        }
                    }
                }
            }
        }
    }

    acc
}
219
220
221
222pub fn prepare_multiply_threads(
223    a: &Matrix<i32>, 
224    b: &Matrix<i32>, 
225    optimal_block_size: usize,
226    threads: usize
227) -> (
228    usize,
229    Matrix<i32>,
230    Matrix<i32>,
231    Vec<[usize; 8]>
232) {
233    
234    let mut tasks: Vec<[usize; 8]> = Vec::new();
235
236    let s1 = Matrix::augment_sq2n_size(&a);
237
238    let s2 = Matrix::augment_sq2n_size(&b);
239
240    let s = std::cmp::max(s1,s2);
241
242    let mut A: Matrix<i32> = Matrix::new(s,s); 
243
244    A = &A + &a;
245
246    let mut B: Matrix<i32> = Matrix::new(s,s); 
247
248    B = &B + &b;
249    
250    let blocks = division_level(&A, optimal_block_size, threads);
251    
252    let block_size = A.size() / blocks;
253
254    let y = (block_size as f64).sqrt() as usize;
255    
256    for i in (0..A.rows).step_by(y) {
257        for j in (0..B.columns).step_by(y) {
258            for k in (0..A.columns).step_by(y) {
259
260                let ax0 = k;
261                let ax1 = k + y;
262                let ay0 = i;
263                let ay1 = i + y;
264                
265                let bx0 = j;
266                let bx1 = j + y;
267                let by0 = k;
268                let by1 = k + y;
269                
270                let t: [usize; 8] = [
271
272                    ax0, ay0, ax1, ay1, bx0, by0, bx1, by1
273
274                ];
275
276                tasks.push(t);
277            }
278        }
279    }
280
281    (
282        blocks,
283        A,
284        B,
285        tasks
286    )
287}
288
289
290
/// Multiplies `A * B` across `hc` web workers, returning a future-like
/// `WorkerOperation` that resolves to the product once every worker reports
/// completion.
///
/// Both matrices are padded and copied into one `SharedArrayBuffer` laid out
/// as `[A | B | C]`; each worker accumulates its tile into the C region (see
/// `ml_thread`). The last `onmessage` callback to fire reads C back out,
/// stores the result, and wakes the pending waker.
pub fn multiply_threads(
    hc: usize,
    optimal_block_size: usize, 
    A: &Matrix<i32>, 
    B: &Matrix<i32>
) -> WorkerOperation<Matrix<i32>> {
    
    let (blocks, mut A, mut B, tasks) = prepare_multiply_threads(
        A,
        B,
        optimal_block_size,
        hc
    );
    
    // Captured by the completion callback: result shape and the byte sizes of
    // the A and B regions (C starts at offset sa + sb).
    let ar = A.rows;
    let ac = A.columns;
    let bc = B.columns;
    let sa = A.mem_size();
    let sb = B.mem_size();

    // Single shared buffer holding A, B and the (zeroed) result region C.
    let sab_rc: Rc<SharedArrayBuffer> = Rc::new(
        Matrix::<i32>::transfer_into_sab(&A, &B)
    );

    // One worker per task; the pool is shared with each callback through
    // Rc<RefCell<...>> so callbacks can update the remaining-work counter.
    let workers = Workers::new("./worker.js", tasks.len());
    let workers = Rc::new( RefCell::new(workers) );
    let mut list = workers.borrow_mut();
    
    list.work = tasks.len() as i32;

    for i in 0..tasks.len() {
        let task = tasks[i];
        let worker = &mut list.ws[i];
        let sab = sab_rc.clone();
        let array = pack_mul_task(task, &sab, A.rows as u32, A.columns as u32, B.rows as u32, B.columns as u32);
        let hook = workers.clone();

        // Fires when worker i posts back; the last one assembles the result.
        let c = Box::new(
            move |event: Event| {

                let mut list = hook.borrow_mut();

                // Drop this worker's closure so it can be freed.
                list.ws[i].cb = None;

                list.work -= 1;
                
                if list.work == 0 {
                    // All tiles done: read the C region out of the buffer.
                    let result = Int32Array::new_with_byte_offset(&sab, (sa + sb) as u32);
                    let data = result.to_vec();
                    let mut ma: Matrix<i32> = Matrix::new(ar, bc);
                    ma.set_vec(data);
                    
                    unsafe {
                        log(&format!("\n \n result {} \n \n", ma));
                    }
                    list.result = Some(ma);
                    // Wake the WorkerOperation future so `extract` runs.
                    list.waker.take().unwrap().wake();
                }
            }
        ) as Box<dyn FnMut(Event)>;
        
        let callback = Closure::wrap(c);
        
        worker.w.set_onmessage(
            Some(
                callback.as_ref().dyn_ref().unwrap()
            )
        );
        
        // Keep the closure alive for as long as the worker may call it.
        worker.cb = Some(callback);

        // NOTE(review): post_message result is ignored; a send failure would
        // leave `work` stuck above zero — consider handling it.
        let result = worker.w.post_message(&array);
    }
    
    WorkerOperation {
        _ref: workers.clone(),
        extract: Box::new(
            move |s:&mut Workers<Matrix<i32>>| -> Option<Matrix<i32>> {
                let m = s.result.take();
                m
            }
        )
    }
}
375
376
377
378#[wasm_bindgen]
379pub fn ml_thread(
380    sab:&SharedArrayBuffer,
381
382    a_rows:usize,
383    a_columns:usize,
384    b_rows:usize,
385    b_columns:usize,
386    
387    ax0:usize, ay0:usize,
388    ax1:usize, ay1:usize,
389    bx0:usize, by0:usize,
390    bx1:usize, by1:usize
391) {
392    
393    console_error_panic_hook::set_once();
394    
395    let s = size_of::<i32>();
396    let sa = a_rows * a_columns * s;
397    let sb = b_rows * b_columns * s;
398    
399    let ra = Int32Array::new_with_byte_offset(&sab, 0);
400    let rb = Int32Array::new_with_byte_offset(&sab, sa as u32);
401    let rc = Int32Array::new_with_byte_offset(&sab, (sa + sb) as u32);
402
403    let vec1 = ra.to_vec();
404    let vec2 = rb.to_vec();
405    let vec3 = rc.to_vec();
406    
407    for m in ay0..ay1 {
408        for n in bx0..bx1 {
409            for p in ax0..ax1 {
410                    let v1: i32 = ra.get_index((m * a_columns + p) as u32);
411                    let v2: i32 = rb.get_index((p * b_columns + n) as u32);
412                    let result = v1.saturating_mul(v2);
413                    unsafe{
414                        Atomics::add(&rc, (m * b_columns + n) as u32, result);
415                    }
416                    //compare performance
417                    //rc.set_index((m * b_columns + n) as u32, result);
418                    //let old = Atomics::load(&rc2, (m * b_columns + n) as u32).unwrap(); 
419                    //Atomics::store(&rc2, (m * b_columns + n) as u32, kkk as i32);
420            }
421        }
422    }
423}
424
425//TODO discarding zero blocks
426//TODO different partitioning strategy
/// Browser-side benchmark entry point: multiplies random matrices with the
/// threaded block algorithm and with the naive `&A * &B` operator, logging
/// both timings and results to the console.
///
/// `hc` is the hardware concurrency (worker count) reported by the page.
#[wasm_bindgen]
pub async fn test_multiplication(hc: f64) {

    unsafe {
        log(&format!("\n hardware concurrency is {} \n", hc));
    }

    console_error_panic_hook::set_once();

    // performance.now() is used for wall-clock timing of both paths.
    let window = web_sys::window().unwrap();
    let performance = window.performance().unwrap();
    let c = 12;
    // Upper bound for random matrix entries (2^3 = 8).
    let max = (2. as f64).powf(3.);

    
    
    for i in 10..c {
        let optimal_block_size = 5 * i;
        let max_side = 1000; //4 * i;
        // B is shaped so that A * B is well-defined (B: A.columns x A.rows).
        let mut A: Matrix<f64> = Matrix::rand_shape(max_side, max);
        //let mut B: Matrix<f64> = Matrix::rand_shape(max_side, max);
        let mut B: Matrix<f64> = Matrix::rand(A.columns, A.rows, max);
        
        // Threaded path works on i32 matrices.
        let mut A = A.cast();
        let mut B = B.cast();


        unsafe {
            log(&format!("\n multiplying A({}, {}) B({},{}) \n", A.rows, A.columns, B.rows, B.columns));
        }

        let start = performance.now();
        //TODO profile, when this is advantageous ?
        let r: Matrix<i32> = multiply_threads(hc as usize, optimal_block_size, &A, &B).await;
        //mul_blocks(&mut A, &mut B, optimal_block_size, false, hc as usize); 
        
        let end = performance.now();
        
        unsafe {
            log(&format!("\n by blocks {} \n", end - start));
        }

        // Baseline: naive single-threaded operator.
        let start = performance.now();
        let r2: Matrix<i32> = &A * &B;
        let end = performance.now();

        unsafe {
            log(&format!("\n naive {} \n", end - start));
        }




        unsafe {
            log(&format!("\n by blocks {} \n", r));
            log(&format!("\n naive {} \n", r2));
        }


        // NOTE(review): the equality check between the two results is
        // currently disabled (commented out below); only timings and raw
        // output are compared by eye.
        let mut r3: Matrix<f64> = Matrix::new(A.rows, B.columns);
        /*
        let r: Matrix<f64> = add(&r, &r3, 0); //TODO ? dim B
        
        if !(r == r2) {
            let diff: Matrix<f64> = &r - &r2;
            let s = diff.sum(); 
            unsafe {
                log(&format!("\n not equal, sum {} \n | r ({},{}) | r2 ({},{}) | {} \n", s, r.rows, r.columns, r2.rows, r2.columns, diff));

                //log(&format!("\n in threads {} \n in local {} \n", r, r2));
            }
            break;
        }
        */

        unsafe {
            //log(&format!("\n without threads {} \n", end - start));
        }
        
        //assert!(r == r2, "they should be equal {} \n \n {}", r, r2);

        unsafe {
            //log(&format!("\n final result is {} \n \n {} \n", r, r2));
        }
    }
}
513
514
515
#[cfg(test)]
mod tests {
    
    use super::{ Matrix, mul_blocks };

    // Disabled (`#[test]` commented out): Matrix::rand* helpers presumably
    // need the wasm/browser environment — confirm before re-enabling.
    //#[test]
    fn multiply_test() {
                
        let discard_zero_blocks = true;
        let threads = 10;

        for _ in 1..2 {
            let max_side = 156;
            let max = 20000.;
            let optimal_block_size = 5000;
            let mut A: Matrix<f64> = Matrix::rand_shape(max_side, max);
            let mut B: Matrix<f64> = Matrix::rand(A.columns, A.rows, max); //_shape(max_side, max);
        
            let C1: Matrix<f64> = &A * &B;
            let C: Matrix<f64> = mul_blocks(&mut A, &mut B, optimal_block_size, discard_zero_blocks, threads);

            assert_eq!(C.sum(), C1.sum(), "sum should be equal");

            for i in 0..C1.rows {
                for j in 0..C1.columns {
                    // BUG FIX: this previously compared C1 against itself,
                    // which can never fail; compare blocked vs naive results.
                    assert_eq!(C[[i,j]], C1[[i,j]], "all entries should be equal");
                }
            }
        } 
    }
}