Skip to main content

webarkitlib_rs/
pattern.rs

1/*
2 *  pattern.rs
3 *  WebARKitLib-rs
4 *
5 *  This file is part of WebARKitLib-rs - WebARKit.
6 *
7 *  WebARKitLib-rs is free software: you can redistribute it and/or modify
8 *  it under the terms of the GNU Lesser General Public License as published by
9 *  the Free Software Foundation, either version 3 of the License, or
10 *  (at your option) any later version.
11 *
12 *  WebARKitLib-rs is distributed in the hope that it will be useful,
13 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
14 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15 *  GNU Lesser General Public License for more details.
16 *
17 *  You should have received a copy of the GNU Lesser General Public License
18 *  along with WebARKitLib-rs.  If not, see <http://www.gnu.org/licenses/>.
19 *
20 *  As a special exception, the copyright holders of this library give you
21 *  permission to link this library with independent modules to produce an
22 *  executable, regardless of the license terms of these independent modules, and to
23 *  copy and distribute the resulting executable under terms of your choice,
24 *  provided that you also meet, for each linked independent module, the terms and
25 *  conditions of the license of that module. An independent module is a module
26 *  which is neither derived from nor based on this library. If you modify this
27 *  library, you may extend this exception to your version of the library, but you
28 *  are not obligated to do so. If you do not wish to do so, delete this exception
29 *  statement from your version.
30 *
31 *  Copyright 2026 WebARKit.
32 *
33 *  Author(s): Walter Perdan @kalwalt https://github.com/kalwalt
34 *
35 */
36
37//! Pattern Template matching logic
38//! Ported natively to safe Rust from arPattLoad.c and arPattGetID.c
39
40use crate::types::*;
41
42
43
44pub const AR_PATT_NUM_MAX: i32 = 50;
45pub const AR_PATT_SIZE1: i32 = 16;
46pub const AR_TEMPLATE_MATCHING_COLOR: i32 = 0;
47pub const AR_TEMPLATE_MATCHING_MONO: i32 = 1;
48pub const AR_PATT_CONTRAST_THRESH1: ARdouble = 5.0;
49
50impl ARPattHandle {
51    /// Creates a new Handle for pattern matching with pre-allocated buffer capacities.
52    pub fn new(patt_size: i32, pattern_count_max: i32) -> Self {
53        let max_alloc = (pattern_count_max * 4) as usize;
54        Self {
55            patt_num: 0,
56            patt_num_max: pattern_count_max,
57            pattf: vec![0; pattern_count_max as usize],
58            patt: vec![vec![0i16; (patt_size * patt_size * 3) as usize]; max_alloc],
59            pattpow: vec![0.0; max_alloc],
60            patt_bw: vec![vec![0i16; (patt_size * patt_size) as usize]; max_alloc],
61            pattpow_bw: vec![0.0; max_alloc],
62            patt_size,
63        }
64    }
65}
66
67/// Loads a pattern into the provided ARPattHandle from a text-based buffer.
68/// Returns the index of the loaded pattern.
69pub fn ar_patt_load_from_buffer(patt_handle: &mut ARPattHandle, buffer: &str) -> Result<i32, &'static str> {
70    let mut tokens = buffer.split_whitespace();
71    
72    let mut patno = -1;
73    for i in 0..patt_handle.patt_num_max as usize {
74        if patt_handle.pattf[i] == 0 {
75            patno = i as i32;
76            break;
77        }
78    }
79    
80    if patno == -1 {
81        return Err("Maximum pattern limit reached");
82    }
83
84    let p_idx = patno as usize;
85    let size = patt_handle.patt_size as usize;
86
87    for h in 0..4 {
88        let mut l = 0;
89        let mut i_out = 0;
90        
91        let mut read_tokens = Vec::with_capacity(size * size * 3);
92        
93        for _ in 0..(size * size * 3) {
94            if let Some(t) = tokens.next() {
95                if let Ok(val) = t.parse::<i32>() {
96                    read_tokens.push(val);
97                } else {
98                    return Err("Failed to parse pattern number");
99                }
100            } else {
101                return Err("Pattern data read error (unexpected EOF)");
102            }
103        }
104        
105        for i3 in 0..3 {
106            for i2 in 0..size {
107                for i1 in 0..size {
108                    let mut j = read_tokens[i_out];
109                    i_out += 1;
110                    
111                    j = 255 - j;
112                    patt_handle.patt[p_idx * 4 + h][(i2 * size + i1) * 3 + i3] = j as i16;
113                    
114                    if i3 == 0 {
115                        patt_handle.patt_bw[p_idx * 4 + h][i2 * size + i1] = j as i16;
116                    } else {
117                        patt_handle.patt_bw[p_idx * 4 + h][i2 * size + i1] += j as i16;
118                    }
119                    
120                    if i3 == 2 {
121                        patt_handle.patt_bw[p_idx * 4 + h][i2 * size + i1] /= 3;
122                    }
123                    l += j;
124                }
125            }
126        }
127        
128        l /= (size * size * 3) as i32;
129
130        let mut m_col = 0i64;
131        let l16 = l as i16;
132        for i in 0..(size * size * 3) {
133            patt_handle.patt[p_idx * 4 + h][i] -= l16;
134            let val = patt_handle.patt[p_idx * 4 + h][i] as i32;
135            m_col += (val * val) as i64;
136        }
137        
138        patt_handle.pattpow[p_idx * 4 + h] = (m_col as ARdouble).sqrt();
139        if patt_handle.pattpow[p_idx * 4 + h] == 0.0 {
140            patt_handle.pattpow[p_idx * 4 + h] = 0.0000001;
141        }
142
143        let mut m_bw = 0i64;
144        for i in 0..(size * size) {
145            patt_handle.patt_bw[p_idx * 4 + h][i] -= l16;
146            let val = patt_handle.patt_bw[p_idx * 4 + h][i] as i32;
147            m_bw += (val * val) as i64;
148        }
149        
150        patt_handle.pattpow_bw[p_idx * 4 + h] = (m_bw as ARdouble).sqrt();
151        if patt_handle.pattpow_bw[p_idx * 4 + h] == 0.0 {
152            patt_handle.pattpow_bw[p_idx * 4 + h] = 0.0000001;
153        }
154    }
155
156    patt_handle.pattf[p_idx] = 1;
157    patt_handle.patt_num += 1;
158
159    Ok(patno)
160}
161
162/// Loads a pattern from a specified file path.
163pub fn ar_patt_load(patt_handle: &mut ARPattHandle, filename: &str) -> Result<i32, String> {
164    use std::fs;
165    
166    let buffer = fs::read_to_string(filename)
167        .map_err(|e| format!("Error reading pattern file '{}': {}", filename, e))?;
168        
169    ar_patt_load_from_buffer(patt_handle, &buffer).map_err(|e| e.to_string())
170}
171
172/// Matches the unwarped square marker against loaded templates via NCC (Normalized Cross Correlation).
173pub fn pattern_match(
174    patt_handle: &ARPattHandle,
175    mode: i32,
176    data: &[u8],
177    size: i32,
178    code: &mut i32,
179    dir: &mut i32,
180    cf: &mut ARdouble,
181) -> Result<(), &'static str> {
182    if size <= 0 {
183        *code = 0;
184        *dir = 0;
185        *cf = -1.0;
186        return Err("Invalid size");
187    }
188
189    let size_u = size as usize;
190
191    if mode == AR_TEMPLATE_MATCHING_COLOR {
192        let size_sqd_x3 = size_u * size_u * 3;
193        if data.len() < size_sqd_x3 {
194            return Err("Data array too small");
195        }
196        let mut input = vec![0i16; size_sqd_x3];
197
198        let mut ave = 0i32;
199        for i in 0..size_sqd_x3 {
200            ave += (255 - data[i]) as i32;
201        }
202        ave /= size_sqd_x3 as i32;
203
204        let mut sum = 0i64;
205        let ave16 = ave as i16;
206        for i in 0..size_sqd_x3 {
207            input[i] = ((255 - data[i]) as i16) - ave16;
208            let val = input[i] as i32;
209            sum += (val * val) as i64;
210        }
211
212        let datapow = (sum as ARdouble).sqrt();
213        if datapow / ((size as ARdouble) * 3.0f64.sqrt()) < AR_PATT_CONTRAST_THRESH1 {
214            *code = 0;
215            *dir = 0;
216            *cf = -1.0;
217            return Err("Insufficient contrast");
218        }
219
220        let mut res1 = -1;
221        let mut res2 = -1;
222        let mut max = 0.0;
223        
224        let mut k = -1isize;
225        for _ in 0..patt_handle.patt_num {
226            k += 1;
227            while (k as usize) < patt_handle.pattf.len() && patt_handle.pattf[k as usize] == 0 {
228                k += 1;
229            }
230            if k as usize >= patt_handle.pattf.len() { break; }
231            if patt_handle.pattf[k as usize] == 2 {
232                continue;
233            }
234            
235            for j in 0..4 {
236                let pattern_ref = &patt_handle.patt[k as usize * 4 + j];
237                let sum_cc = dot_product(&input, pattern_ref);
238                
239                let sum2 = (sum_cc as ARdouble) / patt_handle.pattpow[k as usize * 4 + j] / datapow;
240                if sum2 > max {
241                    max = sum2;
242                    res1 = j as i32;
243                    res2 = k as i32;
244                }
245            }
246        }
247
248        *dir = res1;
249        *code = res2;
250        *cf = max;
251        Ok(())
252    } else if mode == AR_TEMPLATE_MATCHING_MONO {
253        let size_sqd = size_u * size_u;
254        if data.len() < size_sqd {
255            return Err("Data array too small");
256        }
257        
258        let mut input = vec![0i16; size_sqd];
259
260        let mut ave = 0i32;
261        for i in 0..size_sqd {
262            ave += (255 - data[i]) as i32;
263        }
264        ave /= size_sqd as i32;
265
266        let mut sum = 0i64;
267        let ave16 = ave as i16;
268        for i in 0..size_sqd {
269            input[i] = ((255 - data[i]) as i16) - ave16;
270            let val = input[i] as i32;
271            sum += (val * val) as i64;
272        }
273
274        let datapow = (sum as ARdouble).sqrt();
275        if datapow / (size as ARdouble) < AR_PATT_CONTRAST_THRESH1 {
276            *code = 0;
277            *dir = 0;
278            *cf = -1.0;
279            return Err("Insufficient contrast");
280        }
281
282        let mut res1 = -1;
283        let mut res2 = -1;
284        let mut max = 0.0;
285        
286        let mut k = -1isize;
287        for _ in 0..patt_handle.patt_num {
288            k += 1;
289            while (k as usize) < patt_handle.pattf.len() && patt_handle.pattf[k as usize] == 0 {
290                k += 1;
291            }
292            if k as usize >= patt_handle.pattf.len() { break; }
293            if patt_handle.pattf[k as usize] == 2 {
294                continue;
295            }
296            
297            for j in 0..4 {
298                let pattern_ref = &patt_handle.patt_bw[k as usize * 4 + j];
299                let sum_cc = dot_product(&input, pattern_ref);
300                
301                let sum2 = (sum_cc as ARdouble) / patt_handle.pattpow_bw[k as usize * 4 + j] / datapow;
302                if sum2 > max {
303                    max = sum2;
304                    res1 = j as i32;
305                    res2 = k as i32;
306                }
307            }
308        }
309
310        *dir = res1;
311        *code = res2;
312        *cf = max;
313        Ok(())
314    } else {
315        Err("Unsupported matching mode")
316    }
317}
318
319pub fn get_cpara(world: &[[ARdouble; 2]; 4], vertex: &[[ARdouble; 2]; 4], para: &mut [[ARdouble; 3]; 3]) -> Result<(), &'static str> {
320    use crate::math::{ARMat};
321    let mut a = ARMat::new(8, 8);
322    let mut b = ARMat::new(8, 1);
323
324    for i in 0..4 {
325        a.m[i * 16 + 0] = world[i][0];
326        a.m[i * 16 + 1] = world[i][1];
327        a.m[i * 16 + 2] = 1.0;
328        a.m[i * 16 + 3] = 0.0;
329        a.m[i * 16 + 4] = 0.0;
330        a.m[i * 16 + 5] = 0.0;
331        a.m[i * 16 + 6] = -world[i][0] * vertex[i][0];
332        a.m[i * 16 + 7] = -world[i][1] * vertex[i][0];
333        
334        a.m[i * 16 + 8] = 0.0;
335        a.m[i * 16 + 9] = 0.0;
336        a.m[i * 16 + 10] = 0.0;
337        a.m[i * 16 + 11] = world[i][0];
338        a.m[i * 16 + 12] = world[i][1];
339        a.m[i * 16 + 13] = 1.0;
340        a.m[i * 16 + 14] = -world[i][0] * vertex[i][1];
341        a.m[i * 16 + 15] = -world[i][1] * vertex[i][1];
342
343        b.m[i * 2 + 0] = vertex[i][0];
344        b.m[i * 2 + 1] = vertex[i][1];
345    }
346
347    a.self_inv()?;
348    let c = (&a * &b)?;
349
350    for i in 0..2 {
351        para[i][0] = c.m[i * 3 + 0];
352        para[i][1] = c.m[i * 3 + 1];
353        para[i][2] = c.m[i * 3 + 2];
354    }
355    para[2][0] = c.m[6];
356    para[2][1] = c.m[7];
357    para[2][2] = 1.0;
358
359    Ok(())
360}
361
362pub fn ar_patt_get_image(
363    image_proc_mode: i32,
364    patt_detect_mode: i32,
365    patt_size: i32,
366    sample_size: i32,
367    image: &[u8],
368    xsize: i32,
369    ysize: i32,
370    _pixel_format: crate::types::ARPixelFormat,
371    vertex: &[[ARdouble; 2]; 4],
372    patt_ratio: ARdouble,
373    ext_patt: &mut [u8],
374) -> Result<(), &'static str> {
375    let mut world = [[0.0; 2]; 4];
376    let mut para = [[0.0; 3]; 3];
377
378    world[0][0] = 100.0;
379    world[0][1] = 100.0;
380    world[1][0] = 110.0;
381    world[1][1] = 100.0;
382    world[2][0] = 110.0;
383    world[2][1] = 110.0;
384    world[3][0] = 100.0;
385    world[3][1] = 110.0;
386
387    get_cpara(&world, vertex, &mut para)?;
388
389    let mut lx1 = ((vertex[0][0] - vertex[1][0]).powi(2) + (vertex[0][1] - vertex[1][1]).powi(2)) as i32;
390    let lx2 = ((vertex[2][0] - vertex[3][0]).powi(2) + (vertex[2][1] - vertex[3][1]).powi(2)) as i32;
391    let mut ly1 = ((vertex[1][0] - vertex[2][0]).powi(2) + (vertex[1][1] - vertex[2][1]).powi(2)) as i32;
392    let ly2 = ((vertex[3][0] - vertex[0][0]).powi(2) + (vertex[3][1] - vertex[0][1]).powi(2)) as i32;
393
394    if lx2 > lx1 { lx1 = lx2; }
395    if ly2 > ly1 { ly1 = ly2; }
396
397    let lx_patt = (lx1 as ARdouble * patt_ratio * patt_ratio) as i32;
398    let ly_patt = (ly1 as ARdouble * patt_ratio * patt_ratio) as i32;
399
400    let mut xdiv2 = patt_size;
401    let mut ydiv2 = patt_size;
402
403    if image_proc_mode == 0 { // AR_IMAGE_PROC_FRAME_IMAGE
404        while xdiv2 * xdiv2 < lx_patt && xdiv2 < sample_size { xdiv2 *= 2; }
405        while ydiv2 * ydiv2 < ly_patt && ydiv2 < sample_size { ydiv2 *= 2; }
406    } else {
407        while xdiv2 * xdiv2 * 4 < lx_patt && xdiv2 < sample_size { xdiv2 *= 2; }
408        while ydiv2 * ydiv2 * 4 < ly_patt && ydiv2 < sample_size { ydiv2 *= 2; }
409    }
410    
411    if xdiv2 > sample_size { xdiv2 = sample_size; }
412    if ydiv2 > sample_size { ydiv2 = sample_size; }
413
414    let xdiv = xdiv2 / patt_size;
415    let ydiv = ydiv2 / patt_size;
416    
417    let patt_ratio1 = (1.0 - patt_ratio) / 2.0 * 10.0;
418    let patt_ratio2 = patt_ratio * 10.0;
419
420    if patt_detect_mode == AR_TEMPLATE_MATCHING_COLOR {
421        let mut ext_patt2 = vec![0u32; (patt_size * patt_size * 3) as usize];
422        
423        for j in 0..ydiv2 {
424            let yw = (100.0 + patt_ratio1) + patt_ratio2 * (j as f64 + 0.5) / (ydiv2 as f64);
425            for i in 0..xdiv2 {
426                let xw = (100.0 + patt_ratio1) + patt_ratio2 * (i as f64 + 0.5) / (xdiv2 as f64);
427                let d = para[2][0] * xw + para[2][1] * yw + para[2][2];
428                if d == 0.0 { return Err("Division by zero in homography"); }
429                
430                let xc = ((para[0][0] * xw + para[0][1] * yw + para[0][2]) / d) as i32;
431                let yc = ((para[1][0] * xw + para[1][1] * yw + para[1][2]) / d) as i32;
432
433                if xc >= 0 && xc < xsize && yc >= 0 && yc < ysize {
434                    // RGB assumes 3 bytes per pixel in the source buffer
435                    let src_idx = ((yc * xsize + xc) * 3) as usize;
436                    if src_idx + 2 < image.len() {
437                        let dst_idx = (((j / ydiv) * patt_size + (i / xdiv)) * 3) as usize;
438                        ext_patt2[dst_idx + 0] += image[src_idx + 0] as u32; // R
439                        ext_patt2[dst_idx + 1] += image[src_idx + 1] as u32; // G
440                        ext_patt2[dst_idx + 2] += image[src_idx + 2] as u32; // B
441                    }
442                }
443            }
444        }
445        
446        for i in 0..(patt_size * patt_size * 3) as usize {
447            if i < ext_patt.len() {
448                ext_patt[i] = (ext_patt2[i] / (xdiv * ydiv) as u32) as u8;
449            }
450        }
451    } else {
452        let mut ext_patt2 = vec![0u32; (patt_size * patt_size) as usize];
453        
454        for j in 0..ydiv2 {
455            let yw = (100.0 + patt_ratio1) + patt_ratio2 * (j as f64 + 0.5) / (ydiv2 as f64);
456            for i in 0..xdiv2 {
457                let xw = (100.0 + patt_ratio1) + patt_ratio2 * (i as f64 + 0.5) / (xdiv2 as f64);
458                let d = para[2][0] * xw + para[2][1] * yw + para[2][2];
459                if d == 0.0 { return Err("Division by zero in homography"); }
460                
461                let xc = ((para[0][0] * xw + para[0][1] * yw + para[0][2]) / d) as i32;
462                let yc = ((para[1][0] * xw + para[1][1] * yw + para[1][2]) / d) as i32;
463
464                if xc >= 0 && xc < xsize && yc >= 0 && yc < ysize {
465                    // Assuming Luma takes 1 byte per pixel! Wait, if image is RGB, Luma would be 3 bytes?
466                    // if it's RGB buffer, we must convert to luma. Assuming Luma/Mono buffer passed in natively:
467                    let src_idx = (yc * xsize + xc) as usize;
468                    if src_idx < image.len() {
469                        let dst_idx = ((j / ydiv) * patt_size + (i / xdiv)) as usize;
470                        ext_patt2[dst_idx] += image[src_idx] as u32;
471                    }
472                }
473            }
474        }
475        
476        for i in 0..(patt_size * patt_size) as usize {
477            if i < ext_patt.len() {
478                ext_patt[i] = (ext_patt2[i] / (xdiv * ydiv) as u32) as u8;
479            }
480        }
481    }
482
483    Ok(())
484}
485
486#[cfg(test)]
487mod tests {
488    use super::*;
489
490    #[test]
491    fn test_pattern_load_and_match() {
492        let mut handle = ARPattHandle::new(AR_PATT_SIZE1, AR_PATT_NUM_MAX);
493        
494        let mut mock_patt = String::new();
495        // 4 orientations * size * size * 3 colors
496        for _ in 0..(4 * AR_PATT_SIZE1 * AR_PATT_SIZE1 * 3) {
497            mock_patt.push_str("128 ");
498        }
499        
500        let patno = ar_patt_load_from_buffer(&mut handle, &mock_patt).unwrap();
501        assert_eq!(patno, 0);
502        assert_eq!(handle.patt_num, 1);
503        
504        // Mock extracted pattern with high contrast
505        let mut mock_data = vec![0; (AR_PATT_SIZE1 * AR_PATT_SIZE1 * 3) as usize];
506        for i in 0..mock_data.len() {
507            if i % 2 == 0 {
508                mock_data[i] = 10;
509            } else {
510                mock_data[i] = 240;
511            }
512        }
513        
514        let mut code = 0;
515        let mut dir = 0;
516        let mut cf = 0.0;
517        let result = pattern_match(&handle, AR_TEMPLATE_MATCHING_COLOR, &mock_data, AR_PATT_SIZE1, &mut code, &mut dir, &mut cf);
518        assert!(result.is_ok());
519    }
520}
521
522#[inline]
523fn dot_product(a: &[i16], b: &[i16]) -> i64 {
524    #[cfg(feature = "simd-pattern")]
525    {
526        #[cfg(all(target_arch = "wasm32", target_feature = "simd128"))]
527        {
528            return unsafe { dot_product_simd_wasm(a, b) };
529        }
530        #[cfg(all(target_arch = "x86_64", target_feature = "sse4.1"))]
531        {
532            if is_x86_feature_detected!("sse4.1") {
533                return unsafe { dot_product_simd_x86(a, b) };
534            }
535        }
536    }
537
538    dot_product_scalar(a, b)
539}
540
541pub fn dot_product_scalar(a: &[i16], b: &[i16]) -> i64 {
542    let mut sum = 0i64;
543    for i in 0..a.len() {
544        sum += (a[i] as i32 * b[i] as i32) as i64;
545    }
546    sum
547}
548
549#[cfg(all(feature = "simd-pattern", target_arch = "wasm32", target_feature = "simd128"))]
550#[target_feature(enable = "simd128")]
551unsafe fn dot_product_simd_wasm(a: &[i16], b: &[i16]) -> i64 {
552    use std::arch::wasm32::*;
553    
554    let mut sum_v = i32x4_splat(0);
555    let chunks_len = a.len() / 8;
556    
557    let mut a_ptr = a.as_ptr();
558    let mut b_ptr = b.as_ptr();
559    
560    for _ in 0..chunks_len {
561        let va = v128_load(a_ptr as *const v128);
562        let vb = v128_load(b_ptr as *const v128);
563        
564        // i32x4_dot_i16x8 computes (a0*b0 + a1*b1), (a2*b2 + a3*b3), ...
565        let dot = i32x4_dot_i16x8(va, vb);
566        sum_v = i32x4_add(sum_v, dot);
567        
568        a_ptr = a_ptr.add(8);
569        b_ptr = b_ptr.add(8);
570    }
571    
572    // Sum the 4 horizontal parts into i64x2
573    let low = i64x2_extend_low_i32x4(sum_v);
574    let high = i64x2_extend_high_i32x4(sum_v);
575    let sum64 = i64x2_add(low, high);
576
577    let mut res = [0i64; 2];
578    v128_store(res.as_mut_ptr() as *mut v128, sum64);
579    let mut total = res[0] + res[1];
580    
581    let rem_start = chunks_len * 8;
582    for i in rem_start..a.len() {
583        total += (a[i] as i32 * b[i] as i32) as i64;
584    }
585    
586    total
587}
588
589/// Calculates the dot product of two i16 slices using SSE4.1 intrinsics.
590///
591/// Processes 8 elements at a time using `_mm_madd_epi16`.
592///
593/// # Safety
594/// This function is unsafe because it uses SIMD intrinsics and raw pointer arithmetic.
595/// The caller must ensure that the target CPU supports SSE4.1.
596#[cfg(all(feature = "simd-pattern", target_arch = "x86_64", target_feature = "sse4.1"))]
597#[target_feature(enable = "sse4.1")]
598pub unsafe fn dot_product_simd_x86(a: &[i16], b: &[i16]) -> i64 {
599    use std::arch::x86_64::*;
600    
601    let mut sum_v = _mm_setzero_si128(); // i32x4
602    let chunks_len = a.len() / 8;
603    
604    let mut a_ptr = a.as_ptr();
605    let mut b_ptr = b.as_ptr();
606    
607    for _ in 0..chunks_len {
608        let va = _mm_loadu_si128(a_ptr as *const __m128i);
609        let vb = _mm_loadu_si128(b_ptr as *const __m128i);
610        
611        // Multiply and add adjacent pairs.
612        let m = _mm_madd_epi16(va, vb);
613        sum_v = _mm_add_epi32(sum_v, m);
614        
615        a_ptr = a_ptr.add(8);
616        b_ptr = b_ptr.add(8);
617    }
618    
619    let mut res = [0i32; 4];
620    _mm_storeu_si128(res.as_mut_ptr() as *mut __m128i, sum_v);
621    let mut total = (res[0] as i64) + (res[1] as i64) + (res[2] as i64) + (res[3] as i64);
622    
623    let rem_start = chunks_len * 8;
624    for i in rem_start..a.len() {
625        total += (a[i] as i32 * b[i] as i32) as i64;
626    }
627    
628    total
629}