Skip to main content

webarkitlib_rs/
pattern.rs

1/*
2 *  pattern.rs
3 *  WebARKitLib-rs
4 *
5 *  This file is part of WebARKitLib-rs - WebARKit.
6 *
7 *  WebARKitLib-rs is free software: you can redistribute it and/or modify
8 *  it under the terms of the GNU Lesser General Public License as published by
9 *  the Free Software Foundation, either version 3 of the License, or
10 *  (at your option) any later version.
11 *
12 *  WebARKitLib-rs is distributed in the hope that it will be useful,
13 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
14 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15 *  GNU Lesser General Public License for more details.
16 *
17 *  You should have received a copy of the GNU Lesser General Public License
18 *  along with WebARKitLib-rs.  If not, see <http://www.gnu.org/licenses/>.
19 *
20 *  As a special exception, the copyright holders of this library give you
21 *  permission to link this library with independent modules to produce an
22 *  executable, regardless of the license terms of these independent modules, and to
23 *  copy and distribute the resulting executable under terms of your choice,
24 *  provided that you also meet, for each linked independent module, the terms and
25 *  conditions of the license of that module. An independent module is a module
26 *  which is neither derived from nor based on this library. If you modify this
27 *  library, you may extend this exception to your version of the library, but you
28 *  are not obligated to do so. If you do not wish to do so, delete this exception
29 *  statement from your version.
30 *
31 *  Copyright 2026 WebARKit.
32 *
33 *  Author(s): Walter Perdan @kalwalt https://github.com/kalwalt
34 *
35 */
36
37//! Pattern Template matching logic
38//! Ported natively to safe Rust from arPattLoad.c and arPattGetID.c
39
40use crate::types::{ARPattHandle, ARdouble};
41
42pub const AR_PATT_NUM_MAX: i32 = 50;
43pub const AR_PATT_SIZE1: i32 = 16;
44pub const AR_TEMPLATE_MATCHING_COLOR: i32 = 0;
45pub const AR_TEMPLATE_MATCHING_MONO: i32 = 1;
46pub const AR_PATT_CONTRAST_THRESH1: ARdouble = 5.0;
47
48impl ARPattHandle {
49    /// Creates a new Handle for pattern matching with pre-allocated buffer capacities.
50    pub fn new(patt_size: i32, pattern_count_max: i32) -> Self {
51        let max_alloc = (pattern_count_max * 4) as usize;
52        Self {
53            patt_num: 0,
54            patt_num_max: pattern_count_max,
55            pattf: vec![0; pattern_count_max as usize],
56            patt: vec![vec![0; (patt_size * patt_size * 3) as usize]; max_alloc],
57            pattpow: vec![0.0; max_alloc],
58            patt_bw: vec![vec![0; (patt_size * patt_size) as usize]; max_alloc],
59            pattpow_bw: vec![0.0; max_alloc],
60            patt_size,
61        }
62    }
63}
64
65/// Loads a pattern into the provided ARPattHandle from a text-based buffer.
66/// Returns the index of the loaded pattern.
67pub fn ar_patt_load_from_buffer(patt_handle: &mut ARPattHandle, buffer: &str) -> Result<i32, &'static str> {
68    let mut tokens = buffer.split_whitespace();
69    
70    let mut patno = -1;
71    for i in 0..patt_handle.patt_num_max as usize {
72        if patt_handle.pattf[i] == 0 {
73            patno = i as i32;
74            break;
75        }
76    }
77    
78    if patno == -1 {
79        return Err("Maximum pattern limit reached");
80    }
81
82    let p_idx = patno as usize;
83    let size = patt_handle.patt_size as usize;
84
85    for h in 0..4 {
86        let mut l = 0;
87        let mut i_out = 0;
88        
89        let mut read_tokens = Vec::with_capacity(size * size * 3);
90        
91        for _ in 0..(size * size * 3) {
92            if let Some(t) = tokens.next() {
93                if let Ok(val) = t.parse::<i32>() {
94                    read_tokens.push(val);
95                } else {
96                    return Err("Failed to parse pattern number");
97                }
98            } else {
99                return Err("Pattern data read error (unexpected EOF)");
100            }
101        }
102        
103        for i3 in 0..3 {
104            for i2 in 0..size {
105                for i1 in 0..size {
106                    let mut j = read_tokens[i_out];
107                    i_out += 1;
108                    
109                    j = 255 - j;
110                    patt_handle.patt[p_idx * 4 + h][(i2 * size + i1) * 3 + i3] = j;
111                    
112                    if i3 == 0 {
113                        patt_handle.patt_bw[p_idx * 4 + h][i2 * size + i1] = j;
114                    } else {
115                        patt_handle.patt_bw[p_idx * 4 + h][i2 * size + i1] += j;
116                    }
117                    
118                    if i3 == 2 {
119                        patt_handle.patt_bw[p_idx * 4 + h][i2 * size + i1] /= 3;
120                    }
121                    l += j;
122                }
123            }
124        }
125        
126        l /= (size * size * 3) as i32;
127
128        let mut m_col = 0i64;
129        for i in 0..(size * size * 3) {
130            patt_handle.patt[p_idx * 4 + h][i] -= l;
131            m_col += (patt_handle.patt[p_idx * 4 + h][i] * patt_handle.patt[p_idx * 4 + h][i]) as i64;
132        }
133        
134        patt_handle.pattpow[p_idx * 4 + h] = (m_col as ARdouble).sqrt();
135        if patt_handle.pattpow[p_idx * 4 + h] == 0.0 {
136            patt_handle.pattpow[p_idx * 4 + h] = 0.0000001;
137        }
138
139        let mut m_bw = 0i64;
140        for i in 0..(size * size) {
141            patt_handle.patt_bw[p_idx * 4 + h][i] -= l;
142            m_bw += (patt_handle.patt_bw[p_idx * 4 + h][i] * patt_handle.patt_bw[p_idx * 4 + h][i]) as i64;
143        }
144        
145        patt_handle.pattpow_bw[p_idx * 4 + h] = (m_bw as ARdouble).sqrt();
146        if patt_handle.pattpow_bw[p_idx * 4 + h] == 0.0 {
147            patt_handle.pattpow_bw[p_idx * 4 + h] = 0.0000001;
148        }
149    }
150
151    patt_handle.pattf[p_idx] = 1;
152    patt_handle.patt_num += 1;
153
154    Ok(patno)
155}
156
157/// Loads a pattern from a specified file path.
158pub fn ar_patt_load(patt_handle: &mut ARPattHandle, filename: &str) -> Result<i32, String> {
159    use std::fs;
160    
161    let buffer = fs::read_to_string(filename)
162        .map_err(|e| format!("Error reading pattern file '{}': {}", filename, e))?;
163        
164    ar_patt_load_from_buffer(patt_handle, &buffer).map_err(|e| e.to_string())
165}
166
167/// Matches the unwarped square marker against loaded templates via NCC (Normalized Cross Correlation).
168pub fn pattern_match(
169    patt_handle: &ARPattHandle,
170    mode: i32,
171    data: &[u8],
172    size: i32,
173    code: &mut i32,
174    dir: &mut i32,
175    cf: &mut ARdouble,
176) -> Result<(), &'static str> {
177    if size <= 0 {
178        *code = 0;
179        *dir = 0;
180        *cf = -1.0;
181        return Err("Invalid size");
182    }
183
184    let size_u = size as usize;
185
186    if mode == AR_TEMPLATE_MATCHING_COLOR {
187        let size_sqd_x3 = size_u * size_u * 3;
188        if data.len() < size_sqd_x3 {
189            return Err("Data array too small");
190        }
191        let mut input = vec![0; size_sqd_x3];
192
193        let mut ave = 0i32;
194        for i in 0..size_sqd_x3 {
195            ave += (255 - data[i]) as i32;
196        }
197        ave /= size_sqd_x3 as i32;
198
199        let mut sum = 0i64;
200        for i in 0..size_sqd_x3 {
201            input[i] = (255 - data[i]) as i32 - ave;
202            sum += (input[i] * input[i]) as i64;
203        }
204
205        let datapow = (sum as ARdouble).sqrt();
206        if datapow / ((size as ARdouble) * 3.0f64.sqrt()) < AR_PATT_CONTRAST_THRESH1 {
207            *code = 0;
208            *dir = 0;
209            *cf = -1.0;
210            return Err("Insufficient contrast");
211        }
212
213        let mut res1 = -1;
214        let mut res2 = -1;
215        let mut max = 0.0;
216        
217        let mut k = -1isize;
218        for _ in 0..patt_handle.patt_num {
219            k += 1;
220            while (k as usize) < patt_handle.pattf.len() && patt_handle.pattf[k as usize] == 0 {
221                k += 1;
222            }
223            if k as usize >= patt_handle.pattf.len() { break; }
224            if patt_handle.pattf[k as usize] == 2 {
225                continue;
226            }
227            
228            for j in 0..4 {
229                let pattern_ref = &patt_handle.patt[k as usize * 4 + j];
230                let sum_cc = dot_product(&input, pattern_ref);
231                
232                let sum2 = (sum_cc as ARdouble) / patt_handle.pattpow[k as usize * 4 + j] / datapow;
233                if sum2 > max {
234                    max = sum2;
235                    res1 = j as i32;
236                    res2 = k as i32;
237                }
238            }
239        }
240
241        *dir = res1;
242        *code = res2;
243        *cf = max;
244        Ok(())
245    } else if mode == AR_TEMPLATE_MATCHING_MONO {
246        let size_sqd = size_u * size_u;
247        if data.len() < size_sqd {
248            return Err("Data array too small");
249        }
250        
251        let mut input = vec![0; size_sqd];
252
253        let mut ave = 0i32;
254        for i in 0..size_sqd {
255            ave += (255 - data[i]) as i32;
256        }
257        ave /= size_sqd as i32;
258
259        let mut sum = 0i64;
260        for i in 0..size_sqd {
261            input[i] = (255 - data[i]) as i32 - ave;
262            sum += (input[i] * input[i]) as i64;
263        }
264
265        let datapow = (sum as ARdouble).sqrt();
266        if datapow / (size as ARdouble) < AR_PATT_CONTRAST_THRESH1 {
267            *code = 0;
268            *dir = 0;
269            *cf = -1.0;
270            return Err("Insufficient contrast");
271        }
272
273        let mut res1 = -1;
274        let mut res2 = -1;
275        let mut max = 0.0;
276        
277        let mut k = -1isize;
278        for _ in 0..patt_handle.patt_num {
279            k += 1;
280            while (k as usize) < patt_handle.pattf.len() && patt_handle.pattf[k as usize] == 0 {
281                k += 1;
282            }
283            if k as usize >= patt_handle.pattf.len() { break; }
284            if patt_handle.pattf[k as usize] == 2 {
285                continue;
286            }
287            
288            for j in 0..4 {
289                let pattern_ref = &patt_handle.patt_bw[k as usize * 4 + j];
290                let sum_cc = dot_product(&input, pattern_ref);
291                
292                let sum2 = (sum_cc as ARdouble) / patt_handle.pattpow_bw[k as usize * 4 + j] / datapow;
293                if sum2 > max {
294                    max = sum2;
295                    res1 = j as i32;
296                    res2 = k as i32;
297                }
298            }
299        }
300
301        *dir = res1;
302        *code = res2;
303        *cf = max;
304        Ok(())
305    } else {
306        Err("Unsupported matching mode")
307    }
308}
309
310pub fn get_cpara(world: &[[ARdouble; 2]; 4], vertex: &[[ARdouble; 2]; 4], para: &mut [[ARdouble; 3]; 3]) -> Result<(), &'static str> {
311    use crate::math::{ARMat};
312    let mut a = ARMat::new(8, 8);
313    let mut b = ARMat::new(8, 1);
314
315    for i in 0..4 {
316        a.m[i * 16 + 0] = world[i][0];
317        a.m[i * 16 + 1] = world[i][1];
318        a.m[i * 16 + 2] = 1.0;
319        a.m[i * 16 + 3] = 0.0;
320        a.m[i * 16 + 4] = 0.0;
321        a.m[i * 16 + 5] = 0.0;
322        a.m[i * 16 + 6] = -world[i][0] * vertex[i][0];
323        a.m[i * 16 + 7] = -world[i][1] * vertex[i][0];
324        
325        a.m[i * 16 + 8] = 0.0;
326        a.m[i * 16 + 9] = 0.0;
327        a.m[i * 16 + 10] = 0.0;
328        a.m[i * 16 + 11] = world[i][0];
329        a.m[i * 16 + 12] = world[i][1];
330        a.m[i * 16 + 13] = 1.0;
331        a.m[i * 16 + 14] = -world[i][0] * vertex[i][1];
332        a.m[i * 16 + 15] = -world[i][1] * vertex[i][1];
333
334        b.m[i * 2 + 0] = vertex[i][0];
335        b.m[i * 2 + 1] = vertex[i][1];
336    }
337
338    a.self_inv()?;
339    let c = (&a * &b)?;
340
341    for i in 0..2 {
342        para[i][0] = c.m[i * 3 + 0];
343        para[i][1] = c.m[i * 3 + 1];
344        para[i][2] = c.m[i * 3 + 2];
345    }
346    para[2][0] = c.m[6];
347    para[2][1] = c.m[7];
348    para[2][2] = 1.0;
349
350    Ok(())
351}
352
353pub fn ar_patt_get_image(
354    image_proc_mode: i32,
355    patt_detect_mode: i32,
356    patt_size: i32,
357    sample_size: i32,
358    image: &[u8],
359    xsize: i32,
360    ysize: i32,
361    _pixel_format: crate::types::ARPixelFormat,
362    vertex: &[[ARdouble; 2]; 4],
363    patt_ratio: ARdouble,
364    ext_patt: &mut [u8],
365) -> Result<(), &'static str> {
366    let mut world = [[0.0; 2]; 4];
367    let mut para = [[0.0; 3]; 3];
368
369    world[0][0] = 100.0;
370    world[0][1] = 100.0;
371    world[1][0] = 110.0;
372    world[1][1] = 100.0;
373    world[2][0] = 110.0;
374    world[2][1] = 110.0;
375    world[3][0] = 100.0;
376    world[3][1] = 110.0;
377
378    get_cpara(&world, vertex, &mut para)?;
379
380    let mut lx1 = ((vertex[0][0] - vertex[1][0]).powi(2) + (vertex[0][1] - vertex[1][1]).powi(2)) as i32;
381    let lx2 = ((vertex[2][0] - vertex[3][0]).powi(2) + (vertex[2][1] - vertex[3][1]).powi(2)) as i32;
382    let mut ly1 = ((vertex[1][0] - vertex[2][0]).powi(2) + (vertex[1][1] - vertex[2][1]).powi(2)) as i32;
383    let ly2 = ((vertex[3][0] - vertex[0][0]).powi(2) + (vertex[3][1] - vertex[0][1]).powi(2)) as i32;
384
385    if lx2 > lx1 { lx1 = lx2; }
386    if ly2 > ly1 { ly1 = ly2; }
387
388    let lx_patt = (lx1 as ARdouble * patt_ratio * patt_ratio) as i32;
389    let ly_patt = (ly1 as ARdouble * patt_ratio * patt_ratio) as i32;
390
391    let mut xdiv2 = patt_size;
392    let mut ydiv2 = patt_size;
393
394    if image_proc_mode == 0 { // AR_IMAGE_PROC_FRAME_IMAGE
395        while xdiv2 * xdiv2 < lx_patt && xdiv2 < sample_size { xdiv2 *= 2; }
396        while ydiv2 * ydiv2 < ly_patt && ydiv2 < sample_size { ydiv2 *= 2; }
397    } else {
398        while xdiv2 * xdiv2 * 4 < lx_patt && xdiv2 < sample_size { xdiv2 *= 2; }
399        while ydiv2 * ydiv2 * 4 < ly_patt && ydiv2 < sample_size { ydiv2 *= 2; }
400    }
401    
402    if xdiv2 > sample_size { xdiv2 = sample_size; }
403    if ydiv2 > sample_size { ydiv2 = sample_size; }
404
405    let xdiv = xdiv2 / patt_size;
406    let ydiv = ydiv2 / patt_size;
407    
408    let patt_ratio1 = (1.0 - patt_ratio) / 2.0 * 10.0;
409    let patt_ratio2 = patt_ratio * 10.0;
410
411    if patt_detect_mode == AR_TEMPLATE_MATCHING_COLOR {
412        let mut ext_patt2 = vec![0u32; (patt_size * patt_size * 3) as usize];
413        
414        for j in 0..ydiv2 {
415            let yw = (100.0 + patt_ratio1) + patt_ratio2 * (j as f64 + 0.5) / (ydiv2 as f64);
416            for i in 0..xdiv2 {
417                let xw = (100.0 + patt_ratio1) + patt_ratio2 * (i as f64 + 0.5) / (xdiv2 as f64);
418                let d = para[2][0] * xw + para[2][1] * yw + para[2][2];
419                if d == 0.0 { return Err("Division by zero in homography"); }
420                
421                let xc = ((para[0][0] * xw + para[0][1] * yw + para[0][2]) / d) as i32;
422                let yc = ((para[1][0] * xw + para[1][1] * yw + para[1][2]) / d) as i32;
423
424                if xc >= 0 && xc < xsize && yc >= 0 && yc < ysize {
425                    // RGB assumes 3 bytes per pixel in the source buffer
426                    let src_idx = ((yc * xsize + xc) * 3) as usize;
427                    if src_idx + 2 < image.len() {
428                        let dst_idx = (((j / ydiv) * patt_size + (i / xdiv)) * 3) as usize;
429                        ext_patt2[dst_idx + 0] += image[src_idx + 0] as u32; // R
430                        ext_patt2[dst_idx + 1] += image[src_idx + 1] as u32; // G
431                        ext_patt2[dst_idx + 2] += image[src_idx + 2] as u32; // B
432                    }
433                }
434            }
435        }
436        
437        for i in 0..(patt_size * patt_size * 3) as usize {
438            if i < ext_patt.len() {
439                ext_patt[i] = (ext_patt2[i] / (xdiv * ydiv) as u32) as u8;
440            }
441        }
442    } else {
443        let mut ext_patt2 = vec![0u32; (patt_size * patt_size) as usize];
444        
445        for j in 0..ydiv2 {
446            let yw = (100.0 + patt_ratio1) + patt_ratio2 * (j as f64 + 0.5) / (ydiv2 as f64);
447            for i in 0..xdiv2 {
448                let xw = (100.0 + patt_ratio1) + patt_ratio2 * (i as f64 + 0.5) / (xdiv2 as f64);
449                let d = para[2][0] * xw + para[2][1] * yw + para[2][2];
450                if d == 0.0 { return Err("Division by zero in homography"); }
451                
452                let xc = ((para[0][0] * xw + para[0][1] * yw + para[0][2]) / d) as i32;
453                let yc = ((para[1][0] * xw + para[1][1] * yw + para[1][2]) / d) as i32;
454
455                if xc >= 0 && xc < xsize && yc >= 0 && yc < ysize {
456                    // Assuming Luma takes 1 byte per pixel! Wait, if image is RGB, Luma would be 3 bytes?
457                    // if it's RGB buffer, we must convert to luma. Assuming Luma/Mono buffer passed in natively:
458                    let src_idx = (yc * xsize + xc) as usize;
459                    if src_idx < image.len() {
460                        let dst_idx = ((j / ydiv) * patt_size + (i / xdiv)) as usize;
461                        ext_patt2[dst_idx] += image[src_idx] as u32;
462                    }
463                }
464            }
465        }
466        
467        for i in 0..(patt_size * patt_size) as usize {
468            if i < ext_patt.len() {
469                ext_patt[i] = (ext_patt2[i] / (xdiv * ydiv) as u32) as u8;
470            }
471        }
472    }
473
474    Ok(())
475}
476
477#[cfg(test)]
478mod tests {
479    use super::*;
480
481    #[test]
482    fn test_pattern_load_and_match() {
483        let mut handle = ARPattHandle::new(AR_PATT_SIZE1, AR_PATT_NUM_MAX);
484        
485        let mut mock_patt = String::new();
486        // 4 orientations * size * size * 3 colors
487        for _ in 0..(4 * AR_PATT_SIZE1 * AR_PATT_SIZE1 * 3) {
488            mock_patt.push_str("128 ");
489        }
490        
491        let patno = ar_patt_load_from_buffer(&mut handle, &mock_patt).unwrap();
492        assert_eq!(patno, 0);
493        assert_eq!(handle.patt_num, 1);
494        
495        // Mock extracted pattern with high contrast
496        let mut mock_data = vec![0; (AR_PATT_SIZE1 * AR_PATT_SIZE1 * 3) as usize];
497        for i in 0..mock_data.len() {
498            if i % 2 == 0 {
499                mock_data[i] = 10;
500            } else {
501                mock_data[i] = 240;
502            }
503        }
504        
505        let mut code = 0;
506        let mut dir = 0;
507        let mut cf = 0.0;
508        let result = pattern_match(&handle, AR_TEMPLATE_MATCHING_COLOR, &mock_data, AR_PATT_SIZE1, &mut code, &mut dir, &mut cf);
509        assert!(result.is_ok());
510    }
511}
512
513#[inline]
514fn dot_product(a: &[i32], b: &[i32]) -> i64 {
515    #[cfg(feature = "simd-pattern")]
516    {
517        #[cfg(all(target_arch = "wasm32", target_feature = "simd128"))]
518        {
519            return unsafe { dot_product_simd_wasm(a, b) };
520        }
521        #[cfg(all(target_arch = "x86_64", target_feature = "sse4.1"))]
522        {
523            if is_x86_feature_detected!("sse4.1") {
524                return unsafe { dot_product_simd_x86(a, b) };
525            }
526        }
527    }
528
529    #[cfg(not(any(
530        all(target_arch = "wasm32", target_feature = "simd128"),
531        all(target_arch = "x86_64", target_feature = "sse4.1")
532    )))]
533    dot_product_scalar(a, b)
534}
535
536pub fn dot_product_scalar(a: &[i32], b: &[i32]) -> i64 {
537    let mut sum = 0i64;
538    for i in 0..a.len() {
539        sum += (a[i] * b[i]) as i64;
540    }
541    sum
542}
543
544#[cfg(all(feature = "simd-pattern", target_arch = "wasm32", target_feature = "simd128"))]
545#[target_feature(enable = "simd128")]
546unsafe fn dot_product_simd_wasm(a: &[i32], b: &[i32]) -> i64 {
547    use std::arch::wasm32::*;
548    
549    let mut sum_v = i64x2_splat(0);
550    let chunks_len = a.len() / 4;
551    
552    let mut a_ptr = a.as_ptr();
553    let mut b_ptr = b.as_ptr();
554    
555    for _ in 0..chunks_len {
556        let va = v128_load(a_ptr as *const v128);
557        let vb = v128_load(b_ptr as *const v128);
558        
559        // Low parts
560        let va_low = i64x2_extend_low_i32x4(va);
561        let vb_low = i64x2_extend_low_i32x4(vb);
562        sum_v = i64x2_add(sum_v, i64x2_mul(va_low, vb_low));
563        
564        // High parts
565        let va_high = i64x2_extend_high_i32x4(va);
566        let vb_high = i64x2_extend_high_i32x4(vb);
567        sum_v = i64x2_add(sum_v, i64x2_mul(va_high, vb_high));
568        
569        a_ptr = a_ptr.add(4);
570        b_ptr = b_ptr.add(4);
571    }
572    
573    let mut res = [0i64; 2];
574    v128_store(res.as_mut_ptr() as *mut v128, sum_v);
575    let mut total = res[0] + res[1];
576    
577    let rem_start = chunks_len * 4;
578    for i in rem_start..a.len() {
579        total += (a[i] * b[i]) as i64;
580    }
581    
582    total
583}
584
585#[cfg(all(feature = "simd-pattern", target_arch = "x86_64", target_feature = "sse4.1"))]
586#[target_feature(enable = "sse4.1")]
587pub unsafe fn dot_product_simd_x86(a: &[i32], b: &[i32]) -> i64 {
588    use std::arch::x86_64::*;
589    
590    let mut sum_v = _mm_setzero_si128(); // i64x2
591    let chunks_len = a.len() / 4;
592    
593    let mut a_ptr = a.as_ptr();
594    let mut b_ptr = b.as_ptr();
595    
596    for _ in 0..chunks_len {
597        let va = _mm_loadu_si128(a_ptr as *const __m128i);
598        let vb = _mm_loadu_si128(b_ptr as *const __m128i);
599        
600        // Use 32-bit multiplication (SSE4.1). 
601        // This is safe for pattern matching where (255*255)*768 << 2^31.
602        let prod = _mm_mullo_epi32(va, vb);
603        sum_v = _mm_add_epi32(sum_v, prod);
604        
605        a_ptr = a_ptr.add(4);
606        b_ptr = b_ptr.add(4);
607    }
608    
609    let mut res = [0i32; 4];
610    _mm_storeu_si128(res.as_mut_ptr() as *mut __m128i, sum_v);
611    let mut total = (res[0] as i64) + (res[1] as i64) + (res[2] as i64) + (res[3] as i64);
612    
613    let rem_start = chunks_len * 4;
614    for i in rem_start..a.len() {
615        total += (a[i] * b[i]) as i64;
616    }
617    
618    total
619}