1extern crate wasm_bindgen;
2extern crate num_cpus;
3use std::{cell::RefCell, future::Future, mem::size_of, pin::Pin, rc::Rc, task::{Context, Poll}};
4
5use num::ToPrimitive;
6use wasm_bindgen::prelude::*;
7use wasm_bindgen::JsCast;
8use js_sys::{Array, Atomics, Float32Array, Float64Array, Int32Array, SharedArrayBuffer, Uint32Array, Uint8Array};
9use rand::prelude::*;
10use rand::Rng;
11use web_sys::Event;
12
13use crate::{Number, core::matrix::{Matrix, add}, workers::{WorkerOperation, Workers}};
14
15
16
17#[wasm_bindgen]
20extern "C" {
21 #[wasm_bindgen(js_namespace = console)]
22 fn log(s: &str);
23}
24
25
26
27pub fn pack_mul_task(
28 t: [usize; 8],
29 sab:&SharedArrayBuffer,
30 A_rows: u32,
31 A_columns: u32,
32 B_rows: u32,
33 B_columns: u32
34) -> js_sys::Array {
35 let array: js_sys::Array = js_sys::Array::new();
36
37 array.push(&sab);
38
39 let a_rows = JsValue::from(A_rows);
40 let a_columns = JsValue::from(A_columns);
41
42 let b_rows = JsValue::from(B_rows);
43 let b_columns = JsValue::from(B_columns);
44
45 let t0 = JsValue::from(t[0] as u32);
46 let t1 = JsValue::from(t[1] as u32);
47 let t2 = JsValue::from(t[2] as u32);
48 let t3 = JsValue::from(t[3] as u32);
49
50 let t4 = JsValue::from(t[4] as u32);
51 let t5 = JsValue::from(t[5] as u32);
52 let t6 = JsValue::from(t[6] as u32);
53 let t7 = JsValue::from(t[7] as u32);
54
55 array.push(&a_rows);
56 array.push(&a_columns);
57
58 array.push(&b_rows);
59 array.push(&b_columns);
60
61 array.push(&t0);
62 array.push(&t1);
63 array.push(&t2);
64 array.push(&t3);
65
66 array.push(&t4);
67 array.push(&t5);
68 array.push(&t6);
69 array.push(&t7);
70
71 array
72}
73
74
75
76pub fn division_level<T: Number>(A: &Matrix<T>, optimal_block_size: usize, threads: usize) -> usize {
77
78 let mut s = optimal_block_size;
79
80 if (s as i32) == -1 {
81 s = 10000;
82 }
83
84 let total_size = A.size();
85
86 let mut blocks = 1;
87
88 let x = total_size / optimal_block_size;
89
90 if x < 1 {
91
92 blocks = 1;
93
94 } else {
95
96 let c = if x > threads { threads } else { x };
97
98 let n = (c as f64).log(4.).ceil();
99
100 blocks = (4. as f64).powf(n) as usize;
101 }
102
103 blocks
104}
105
106
107
108pub fn get_optimal_depth<T: Number> (A: &Matrix<T>, optimal_element_size: usize) -> usize {
109
110 assert_eq!(A.rows, A.columns, "A should be square matrix");
111
112 let p = (optimal_element_size as f64).log(4.).ceil();
113
114 let optimal_element_size = (4. as f64).powf(p) as usize;
115
116 let size = A.rows * A.columns;
117
118 if size < 64 {
119 return 0;
120 }
121
122 let chunks = size / optimal_element_size;
123
124 if chunks < 2 {
125 return 4;
126 }
127
128 (chunks as f64).log(4.).ceil() as usize
129}
130
131
132
133pub fn mul_blocks<T: Number>(
135 a: &mut Matrix<T>,
136 b: &mut Matrix<T>,
137 optimal_block_size: usize,
138 discard_zero_blocks: bool,
139 threads: usize
140) -> Matrix<T> {
141
142 let s1 = Matrix::augment_sq2n_size(&a);
143
144 let s2 = Matrix::augment_sq2n_size(&b);
145
146 let s = std::cmp::max(s1,s2);
147
148 let mut A: Matrix<T> = Matrix::new(s,s);
149
150 A = &A + a;
151
152 let mut B: Matrix<T> = Matrix::new(s,s);
153
154 B = &B + b;
155
156 let blocks = division_level(&A, optimal_block_size, threads);
157
158 unsafe {
159 log(&format!("A:({},{}), B:({},{}), size {}, computing with {} blocks", A.rows, A.columns, B.rows, B.columns, A.size(), blocks));
160 }
161
162 let block_size = A.size() / blocks;
163
164 let y = (block_size as f64).sqrt() as usize;
165
166 let mut acc: Matrix<T> = Matrix::new(A.rows, B.columns);
167
168 for i in (0..A.rows).step_by(y) {
169 for j in (0..B.columns).step_by(y) {
170 for k in (0..A.columns).step_by(y) {
171 let ax0 = k;
172 let ax1 = k + y;
173 let ay0 = i;
174 let ay1 = i + y;
175
176 let bx0 = j;
177 let bx1 = j + y;
178 let by0 = k;
179 let by1 = k + y;
180
181 if discard_zero_blocks {
182 let mut sa = T::zero();
183 let mut sb = T::zero();
184
185 for m in ay0..ay1 {
186 for n in ax0..ax1 {
187 sa += A[[m,n]];
188 }
189 }
190
191 if sa == T::zero() {
192 continue;
193 }
194
195 for m in by0..by1 {
196 for n in bx0..bx1 {
197 sb += B[[m,n]];
198 }
199 }
200
201 if sb == T::zero() {
202 continue;
203 }
204 }
205
206 for m in ay0..ay1 {
207 for n in bx0..bx1 {
208 for p in ax0..ax1 {
209 acc[[m,n]] += A[[m,p]] * B[[p,n]];
210 }
211 }
212 }
213 }
214 }
215 }
216
217 acc
218}
219
220
221
222pub fn prepare_multiply_threads(
223 a: &Matrix<i32>,
224 b: &Matrix<i32>,
225 optimal_block_size: usize,
226 threads: usize
227) -> (
228 usize,
229 Matrix<i32>,
230 Matrix<i32>,
231 Vec<[usize; 8]>
232) {
233
234 let mut tasks: Vec<[usize; 8]> = Vec::new();
235
236 let s1 = Matrix::augment_sq2n_size(&a);
237
238 let s2 = Matrix::augment_sq2n_size(&b);
239
240 let s = std::cmp::max(s1,s2);
241
242 let mut A: Matrix<i32> = Matrix::new(s,s);
243
244 A = &A + &a;
245
246 let mut B: Matrix<i32> = Matrix::new(s,s);
247
248 B = &B + &b;
249
250 let blocks = division_level(&A, optimal_block_size, threads);
251
252 let block_size = A.size() / blocks;
253
254 let y = (block_size as f64).sqrt() as usize;
255
256 for i in (0..A.rows).step_by(y) {
257 for j in (0..B.columns).step_by(y) {
258 for k in (0..A.columns).step_by(y) {
259
260 let ax0 = k;
261 let ax1 = k + y;
262 let ay0 = i;
263 let ay1 = i + y;
264
265 let bx0 = j;
266 let bx1 = j + y;
267 let by0 = k;
268 let by1 = k + y;
269
270 let t: [usize; 8] = [
271
272 ax0, ay0, ax1, ay1, bx0, by0, bx1, by1
273
274 ];
275
276 tasks.push(t);
277 }
278 }
279 }
280
281 (
282 blocks,
283 A,
284 B,
285 tasks
286 )
287}
288
289
290
291pub fn multiply_threads(
292 hc: usize,
293 optimal_block_size: usize,
294 A: &Matrix<i32>,
295 B: &Matrix<i32>
296) -> WorkerOperation<Matrix<i32>> {
297
298 let (blocks, mut A, mut B, tasks) = prepare_multiply_threads(
299 A,
300 B,
301 optimal_block_size,
302 hc
303 );
304
305 let ar = A.rows;
306 let ac = A.columns;
307 let bc = B.columns;
308 let sa = A.mem_size();
309 let sb = B.mem_size();
310
311 let sab_rc: Rc<SharedArrayBuffer> = Rc::new(
312 Matrix::<i32>::transfer_into_sab(&A, &B)
313 );
314
315 let workers = Workers::new("./worker.js", tasks.len());
316 let workers = Rc::new( RefCell::new(workers) );
317 let mut list = workers.borrow_mut();
318
319 list.work = tasks.len() as i32;
320
321 for i in 0..tasks.len() {
322 let task = tasks[i];
323 let worker = &mut list.ws[i];
324 let sab = sab_rc.clone();
325 let array = pack_mul_task(task, &sab, A.rows as u32, A.columns as u32, B.rows as u32, B.columns as u32);
326 let hook = workers.clone();
327
328 let c = Box::new(
329 move |event: Event| {
330
331 let mut list = hook.borrow_mut();
332
333 list.ws[i].cb = None;
334
335 list.work -= 1;
336
337 if list.work == 0 {
338 let result = Int32Array::new_with_byte_offset(&sab, (sa + sb) as u32);
339 let data = result.to_vec();
340 let mut ma: Matrix<i32> = Matrix::new(ar, bc);
341 ma.set_vec(data);
342
343 unsafe {
344 log(&format!("\n \n result {} \n \n", ma));
345 }
346 list.result = Some(ma);
347 list.waker.take().unwrap().wake();
348 }
349 }
350 ) as Box<dyn FnMut(Event)>;
351
352 let callback = Closure::wrap(c);
353
354 worker.w.set_onmessage(
355 Some(
356 callback.as_ref().dyn_ref().unwrap()
357 )
358 );
359
360 worker.cb = Some(callback);
361
362 let result = worker.w.post_message(&array);
363 }
364
365 WorkerOperation {
366 _ref: workers.clone(),
367 extract: Box::new(
368 move |s:&mut Workers<Matrix<i32>>| -> Option<Matrix<i32>> {
369 let m = s.result.take();
370 m
371 }
372 )
373 }
374}
375
376
377
378#[wasm_bindgen]
379pub fn ml_thread(
380 sab:&SharedArrayBuffer,
381
382 a_rows:usize,
383 a_columns:usize,
384 b_rows:usize,
385 b_columns:usize,
386
387 ax0:usize, ay0:usize,
388 ax1:usize, ay1:usize,
389 bx0:usize, by0:usize,
390 bx1:usize, by1:usize
391) {
392
393 console_error_panic_hook::set_once();
394
395 let s = size_of::<i32>();
396 let sa = a_rows * a_columns * s;
397 let sb = b_rows * b_columns * s;
398
399 let ra = Int32Array::new_with_byte_offset(&sab, 0);
400 let rb = Int32Array::new_with_byte_offset(&sab, sa as u32);
401 let rc = Int32Array::new_with_byte_offset(&sab, (sa + sb) as u32);
402
403 let vec1 = ra.to_vec();
404 let vec2 = rb.to_vec();
405 let vec3 = rc.to_vec();
406
407 for m in ay0..ay1 {
408 for n in bx0..bx1 {
409 for p in ax0..ax1 {
410 let v1: i32 = ra.get_index((m * a_columns + p) as u32);
411 let v2: i32 = rb.get_index((p * b_columns + n) as u32);
412 let result = v1.saturating_mul(v2);
413 unsafe{
414 Atomics::add(&rc, (m * b_columns + n) as u32, result);
415 }
416 }
421 }
422 }
423}
424
425#[wasm_bindgen]
428pub async fn test_multiplication(hc: f64) {
429
430 unsafe {
431 log(&format!("\n hardware concurrency is {} \n", hc));
432 }
433
434 console_error_panic_hook::set_once();
435
436 let window = web_sys::window().unwrap();
437 let performance = window.performance().unwrap();
438 let c = 12;
439 let max = (2. as f64).powf(3.);
440
441
442
443 for i in 10..c {
444 let optimal_block_size = 5 * i;
445 let max_side = 1000; let mut A: Matrix<f64> = Matrix::rand_shape(max_side, max);
447 let mut B: Matrix<f64> = Matrix::rand(A.columns, A.rows, max);
449
450 let mut A = A.cast();
451 let mut B = B.cast();
452
453
454 unsafe {
455 log(&format!("\n multiplying A({}, {}) B({},{}) \n", A.rows, A.columns, B.rows, B.columns));
456 }
457
458 let start = performance.now();
459 let r: Matrix<i32> = multiply_threads(hc as usize, optimal_block_size, &A, &B).await;
461 let end = performance.now();
464
465 unsafe {
466 log(&format!("\n by blocks {} \n", end - start));
467 }
468
469 let start = performance.now();
470 let r2: Matrix<i32> = &A * &B;
471 let end = performance.now();
472
473 unsafe {
474 log(&format!("\n naive {} \n", end - start));
475 }
476
477
478
479
480 unsafe {
481 log(&format!("\n by blocks {} \n", r));
482 log(&format!("\n naive {} \n", r2));
483 }
484
485
486 let mut r3: Matrix<f64> = Matrix::new(A.rows, B.columns);
487 unsafe {
503 }
505
506 unsafe {
509 }
511 }
512}
513
514
515
516mod tests {
517
518 use super::{ Matrix, mul_blocks };
519
520 fn multiply_test() {
522
523 let discard_zero_blocks = true;
524 let threads = 10;
525
526 for h in (1..2) {
527 let max_side = 156;
528 let max = 20000.;
529 let optimal_block_size = 5000;
530 let mut A: Matrix<f64> = Matrix::rand_shape(max_side, max);
531 let mut B: Matrix<f64> = Matrix::rand(A.columns, A.rows, max); let C1: Matrix<f64> = &A * &B;
534 let C: Matrix<f64> = mul_blocks(&mut A, &mut B, optimal_block_size, discard_zero_blocks, threads);
535
536 assert_eq!(C.sum(), C1.sum(), "sum should be equal");
537
538 for i in 0..C1.rows {
539 for j in 0..C1.columns {
540 assert_eq!(C1[[i,j]], C1[[i,j]], "all entries should be equal");
541 }
542 }
543 }
544 }
545}