/***********************************************************************
Copyright (c) 2006-2012, Skype Limited. All rights reserved. 
Redistribution and use in source and binary forms, with or without 
modification, (subject to the limitations in the disclaimer below) 
are permitted provided that the following conditions are met:
- Redistributions of source code must retain the above copyright notice,
this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright 
notice, this list of conditions and the following disclaimer in the 
documentation and/or other materials provided with the distribution.
- Neither the name of Skype Limited, nor the names of specific 
contributors, may be used to endorse or promote products derived from 
this software without specific prior written permission.
NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE GRANTED 
BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND 
CONTRIBUTORS ''AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING,
BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND 
FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 
COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 
INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF 
USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON 
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
***********************************************************************/

#if defined(__arm__)

#include "SKP_Silk_AsmPreproc.h"
#if ( EMBEDDED_ARM >= 7 )

	VARDEF len32, r3
	VARDEF len32tmp, lr
	VARDEF ptr1, r2
	VARDEF ptr2, r1
	VARDEF tmp1, r4
	VARDEF tmp2, r5

	VARDEFD val_0, d0
	VARDEFD val_1, d1
	VARDEFD val_2, d2
	VARDEFD val_3, d3
	VARDEFQ sum_tmp1, q2
	VARDEFQ sum_tmp2, q3
	VARDEFD sum_tmp1_lo, d4
	VARDEFD sum_tmp1_hi, d5
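
//	SKP_Silk_inner_prod_aligned (NEON):
//	in:  r0 = first 16-bit input vector, r1 = second 16-bit input vector, r2 = length
//	out: r0 = 32-bit inner product (wraps on overflow, no saturation)
//	Lengths >= 24 use the 8-samples-per-iteration vmlal.s16 loop below, lengths of
//	12..23 use the 4-samples-per-iteration loop at L(3), and the remainder (or a
//	short input) falls through to the scalar smlabb loop at L(1).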

.globl	SYM(SKP_Silk_inner_prod_aligned)
SYM(SKP_Silk_inner_prod_aligned):
	stmdb	sp!,  {r4-r10, fp, ip, lr}
	vpush	{q0-q7}
	add		fp, sp, #164
	mov			len32, r2								//	put length into len32 (r3)
	mov			ptr1, r0								//	put in1 into ptr1 (r2)
	mov			r0, #0									//	clear r0, which accumulates the result
//	USE SL8D, SI4D
L(2)
	cmp			len32, #24
	and			len32tmp, len32, #0x7
	blt			LR(3, f)

	vmov.i32	sum_tmp1, #0
	vld1.16		{val_0, val_1}, [ptr2]!
	vld1.16		{val_2, val_3}, [ptr1]!
	vmov.i32	sum_tmp2, #0								// Set Q2, Q3 to 0
	sub			len32, len32, #16
L(0)	
	subs		len32, len32, #8
	vmlal.s16	sum_tmp1, val_0, val_2
	vmlal.s16	sum_tmp2, val_1, val_3
	vld1.16		{val_0, val_1}, [ptr2]!
	vld1.16		{val_2, val_3}, [ptr1]!
	bge			LR(0, b)
	
	vmlal.s16	sum_tmp1, val_0, val_2
	vmlal.s16	sum_tmp2, val_1, val_3
	vadd.s32	sum_tmp1, sum_tmp1, sum_tmp2
	vadd.s32	val_0, sum_tmp1_lo, sum_tmp1_hi
	vmov		tmp1, tmp2, val_0
	cmp			len32tmp, #0								// Check if length % 8 == 0
	add			r0, r0, tmp1
	add			r0, r0, tmp2
	bgt			LR(1, f)									// Jump to process the remainder
	
	vpop	{q0-q7}
	ldmia	sp!,  {r4-r10, fp, ip, pc}
	
	VARDEFQ sum_tmp3, q1
	VARDEFD sum_tmp3_lo, d2
	VARDEFD sum_tmp3_hi, d3
//	USE SL4D, SI4D
L(3)	
	cmp			len32, #12
	and			len32tmp, len32, #0x3
	movlt		len32tmp, len32								//	length too short for SIMD: handle all samples in the scalar loop
	blt			LR(1, f)
	
	vld1.16		val_0, [ptr2]!
	vld1.16		val_1, [ptr1]!
	vmov.i32	sum_tmp3, #0 
	sub			len32, len32, #8
L(0)	
	subs		len32, len32, #4
	vmlal.s16	sum_tmp3, val_0, val_1
	vld1.16		val_0, [ptr2]!
	vld1.16		val_1, [ptr1]!
	bge			LR(0, b)
	
	vmlal.s16	sum_tmp3, val_0, val_1
	vadd.s32	val_0, sum_tmp3_lo, sum_tmp3_hi
	vmov		tmp1, tmp2, val_0
	cmp			len32tmp, #0								// Check if length%4 == 0
	add			r0, r0, tmp1
	add			r0, r0, tmp2
	bgt			LR(1, f)									// Jump to process the remainder
	
	vpop	{q0-q7}
	ldmia	sp!,  {r4-r10, fp, ip, pc}

	VARDEF tmp0, r3

L(1)
	subs		len32tmp, len32tmp, #1
	ldrsh		tmp0, [ptr2], #2
	ldrsh		tmp1, [ptr1], #2
	beq			LR(2, f)	
L(0)	
	smlabb		r0, tmp0, tmp1, r0	
	ldrsh		tmp0, [ptr2], #2
	ldrsh		tmp1, [ptr1], #2
	subs		len32tmp, len32tmp, #1
	bgt			LR(0, b)

L(2)
	smlabb		r0, tmp0, tmp1, r0
	vpop	{q0-q7}
	ldmia	sp!,  {r4-r10, fp, ip, pc}

	VARDEF len64, r4
	VARDEF len64tmp, lr
	VARDEF ptr00, r2
	VARDEF ptr01, r3
	VARDEFD val0, d0
	VARDEFD val1, d1
	VARDEFD val2, d2
	VARDEFD val3, d3
	VARDEFQ mul0, q2
	VARDEFD mul0_lo, d4
	VARDEFD mul0_hi, d5
	VARDEFQ mul1, q3
	VARDEFD mul1_lo, d6
	VARDEFD mul1_hi, d7
	VARDEFQ accu0, q4
	VARDEFD accu0_lo, d8
	VARDEFD accu0_hi, d9
	VARDEFQ accu1, q5
	VARDEFD accu1_lo, d10
	VARDEFD accu1_hi, d11
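
//	SKP_Silk_inner_prod16_aligned_64 (NEON):
//	in:  r0 = first 16-bit input vector, r1 = second 16-bit input vector, r2 = length
//	out: r0 = low 32 bits, r1 = high 32 bits of the 64-bit inner product
//	Same structure as the routine above: an 8-samples-per-iteration loop for len >= 24,
//	a 4-samples-per-iteration loop at L(3) for len >= 12, and a scalar smlalbb loop
//	at L(1) for the remainder.  Products are widened with vmull.s16 and accumulated
//	into 64-bit lanes with vaddw.s32; the final reduction uses vqadd.s64.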
	
.globl	SYM(SKP_Silk_inner_prod16_aligned_64)
SYM(SKP_Silk_inner_prod16_aligned_64):
	stmdb	sp!,  {r4-r10, fp, ip, lr}
	vpush	{q0-q7}
	add		fp, sp, #164
	mov			len64, r2
	mov			ptr00, r0
	mov			ptr01, r1
	mov			r0, #0						/*Output*/
	mov			r1, #0

//	USE SL8D, SI4D
L(2)	
	cmp			len64, #24
	and			len64tmp, len64, #0x7
	blt			LR(3, f)
	
	vld1.16		{val0, val1}, [ptr00]!
	vld1.16		{val2, val3}, [ptr01]!
	vmov		accu0_lo, r0, r1 
	vmov		accu0_hi, r0, r1
	vmov		accu1, accu0 
	sub			len64, len64, #16
L(0)	
	vmull.s16	mul0, val0, val2
	vmull.s16	mul1, val1, val3
	vld1.16		{val0, val1}, [ptr00]!
	subs		len64, len64, #8
	//vqadd.s32	mul0, mul0, mul1
	vld1.16		{val2, val3}, [ptr01]!
	vaddw.s32	accu0, accu0, mul0_lo
	vaddw.s32	accu1, accu1, mul0_hi
	vaddw.s32	accu0, accu0, mul1_lo
	vaddw.s32	accu1, accu1, mul1_hi
	bge			LR(0, b)
	
	vmull.s16	mul0, val0, val2
	vmull.s16	mul1, val1, val3
	//vqadd.s32	mul0, mul0, mul1
	vaddw.s32	accu0, accu0, mul0_lo
	vaddw.s32	accu1, accu1, mul0_hi
	vaddw.s32	accu0, accu0, mul1_lo
	vaddw.s32	accu1, accu1, mul1_hi
	vqadd.s64	accu0, accu0, accu1
	vqadd.s64	accu0_lo, accu0_lo, accu0_hi
	vmov		r0, r1, accu0_lo
	cmp			len64tmp, #0								// Check if length % 8 == 0
	bgt			LR(1, f)									// Jump to process the remainder
	
	vpop	{q0-q7}
	ldmia	sp!,  {r4-r10, fp, ip, pc}

	VARDEFQ mul2, q1
	VARDEFD mul2_lo, d2
	VARDEFD mul2_hi, d3
	VARDEFQ accu2, q2
	VARDEFD accu2_lo, d4
	VARDEFD accu2_hi, d5
	VARDEFQ accu3, q3

//	USE SL4D, SI4D
L(3)	
	cmp			len64, #12
	and			len64tmp, len64, #0x3
	movlt		len64tmp, len64								//	length too short for SIMD: handle all samples in the scalar loop
	blt			LR(1, f)
	
	vld1.16		val0, [ptr00]!
	vld1.16		val1, [ptr01]!
	vmov		accu2_lo, r0, r1 
	vmov		accu2_hi, r0, r1
	vmov		accu3, accu2 
	sub			len64, len64, #8
L(0)	
	vmull.s16	mul2, val0, val1
	vld1.16		val0, [ptr00]!
	subs		len64, len64, #4
	vaddw.s32	accu2, accu2, mul2_lo
	vld1.16		val1, [ptr01]!
	vaddw.s32	accu3, accu3, mul2_hi
	bge			LR(0, b)
	
	vmull.s16	mul2, val0, val1
	vaddw.s32	accu2, accu2, mul2_lo
	vaddw.s32	accu3, accu3, mul2_hi
	vqadd.s64	accu2, accu2, accu3
	vqadd.s64	accu2_lo, accu2_lo, accu2_hi
	vmov		r0, r1, accu2_lo
	cmp			len64tmp, #0
	bgt			LR(1, f)
	
	vpop	{q0-q7}
	ldmia	sp!,  {r4-r10, fp, ip, pc}

	VARDEF val4, r4
	VARDEF val5, r5
L(1)
	subs		len64tmp, len64tmp, #1
	ldrsh		val4, [ptr00], #2
	ldrsh		val5, [ptr01], #2
	beq			LR(2, f)	
L(0)	
	smlalbb		r0, r1, val4, val5
	ldrsh		val4, [ptr00], #2
	ldrsh		val5, [ptr01], #2
	subs		len64tmp, len64tmp, #1
	bgt			LR(0, b)

L(2)
	smlalbb		r0, r1, val4, val5
	vpop	{q0-q7}
	ldmia	sp!,  {r4-r10, fp, ip, pc}

#elif EMBEDDED_ARM >= 5

/*
 *	SKP_Silk_inner_prod_aligned(val1_16bit[], val2_16bit[], len)
 *
 *	Known issues:
 *		1. val1_16bit and val2_16bit need to be 16-bit aligned.
 *		2. The result is 32-bit; on overflow it wraps around instead of saturating.
 */
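/*
 *	For reference, the scalar behaviour this routine implements, as a minimal C
 *	sketch (C99 <stdint.h> types assumed; the actual SILK C prototype and type
 *	names may differ):
 *
 *		int32_t inner_prod_aligned(const int16_t *v1, const int16_t *v2, int len)
 *		{
 *			int32_t sum = 0;
 *			for (int i = 0; i < len; i++)
 *				sum += (int32_t)v1[i] * (int32_t)v2[i];	// 32-bit accumulator, wraps
 *			return sum;
 *		}
 */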
	VARDEF sum, r0
	VARDEF val_p1, r1
	VARDEF val_p2, r2
	VARDEF len, r3
	VARDEF val1, r4
	VARDEF val2, r5
	VARDEF val3, r6

#ifdef IPHONE
	VARDEF val4, r8
	VARDEF tmp, sb
	VARDEF val5, sl
	VARDEF val6, _r7
	VARDEF val7, lr
	VARDEF val8, ip
#else
	VARDEF val4, _r7
	VARDEF tmp, r8
	VARDEF val5, sb
	VARDEF val6, sl
	VARDEF val7, lr
	VARDEF val8, ip
#endif

.globl	SYM(SKP_Silk_inner_prod_aligned)
SYM(SKP_Silk_inner_prod_aligned):
	stmdb	sp!,  {r4-r10, fp, ip, lr}
	add		fp, sp, #36

	cmp		r2, #14
	blt		LR(9, f)/*LenLessThan14*/
	
	ands	tmp, r2, #1						/*check whether len is an even number*/
	mov		len, r2
	mov		val_p2, r0
	mov		sum, #0
	beq		LR(0, f)/*LenEven*/
	
	ldrsh	val3, [val_p1], #2
	ldrsh	val4, [val_p2], #2
	sub		len, len, #1
	smulbb	sum, val3, val4
/*LenEven:*/
L(0)
	ands	val1, val_p1, #2				/*Check if val_p1 is 4-byte aligned.*/
	bgt		LR(1, f)/*R1Odd*/
	ands	val2, val_p2, #2				/*Check if val_p2 is 4-byte aligned*/
	bgt		LR(2, f)/*R2Odd*/
	
/*R1R2Even:*/
	ands	tmp, len, #3
	beq		LR(4, f)/*Len4*/
	sub		len, len, #2
	ldr		val1, [val_p1], #4
	ldr		val2, [val_p2], #4
	SKP_SMLAD	sum, val1, val2, sum
L(4)/*Len4:*/
	ands	tmp, len, #7
	beq		LR(8, f)/*Len8*/
	ldmia	val_p1!, {val1, val3}
	ldmia	val_p2!, {val2, val4}
	sub		len, len, #4
	SKP_SMLAD	sum, val1, val2, sum
	SKP_SMLAD	sum, val3, val4, sum

L(8)/*Len8:*/
	ldmia	val_p1!, {val1, val3, val5, val7}
	ldmia	val_p2!, {val2, val4, val6, val8}
L(0)
	subs	len, len, #8	
	SKP_SMLAD	sum, val1, val2, sum
	SKP_SMLAD	sum, val3, val4, sum
	SKP_SMLAD	sum, val5, val6, sum
	SKP_SMLAD	sum, val7, val8, sum
	ldmgtia	val_p1!, {val1, val3, val5, val7}
	ldmgtia	val_p2!, {val2, val4, val6, val8}
	bgt		LR(0, b)	
	ldmia	sp!,  {r4-r10, fp, ip, pc}
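
/*
 *	Misaligned-pointer paths below: in R2Odd val_p1 is word-aligned but val_p2
 *	is not; in R1Odd the opposite; in R1R2Odd neither is.  A leading halfword
 *	load realigns the odd pointer; where only one pointer was realigned, the
 *	carried-over sample is kept in the top half of a register and paired with
 *	the word loads via smlabt/smlatb.
 */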

L(2)/*R2Odd:*/
	ands	tmp, len, #3
	beq		LR(6, f)/*Len4R2Odd*/
	ldr		val1, [val_p1], #4
	ldrsh	val3, [val_p2], #2
	ldrsh	val4, [val_p2], #2				/*make val_p2 even*/
	sub		len, len, #2
	smlabb	sum, val1, val3, sum
	smlatb	sum, val1, val4, sum
L(6)/*Len4R2Odd:*/
	sub		len, len, #4
	ldrsh	tmp, [val_p2], #2				/*make val_p2 even*/
	ldmia	val_p1!, {val1, val3}
	ldmia	val_p2!, {val2, val4}
	mov		tmp, tmp, lsl #16
L(0)
	subs	len, len, #4	
	smlabt	sum, val1, tmp, sum
	smlatb	sum, val1, val2, sum
	smlabt	sum, val3, val2, sum
	smlatb	sum, val3, val4, sum
	mov		tmp, val4
	ldmia	val_p1!, {val1, val3}
	ldmia	val_p2!, {val2, val4}
	bgt		LR(0, b)	
	smlabt	sum, val1, tmp, sum
	smlatb	sum, val1, val2, sum
	smlabt	sum, val3, val2, sum
	smlatb	sum, val3, val4, sum
	ldmia	sp!,  {r4-r10, fp, ip, pc}
	
L(1)/*R1Odd:*/
	ands	val2, val_p2, #2				/*Check if val_p2 is 4-byte aligned*/
	bgt		LR(3, f)/*R1R2Odd*/
	
	ands	tmp, len, #3
	beq		LR(5, f)/*Len4R1Odd*/
	ldrsh	val1, [val_p1], #2
	ldrsh	val2, [val_p1], #2
	ldr		val3, [val_p2], #4				/*val_p2 is already word-aligned here*/
	sub		len, len, #2
	smlabb	sum, val1, val3, sum
	smlabt	sum, val2, val3, sum
L(5)/*Len4R1Odd:*/
	sub		len, len, #4
	ldrsh	tmp, [val_p1], #2				/*make val_p1 word-aligned*/
	ldmia	val_p1!, {val1, val3}
	ldmia	val_p2!, {val2, val4}
	mov		tmp, tmp, lsl #16
L(0)
	subs	len, len, #4	
	smlatb	sum, tmp, val2, sum
	smlabt	sum, val1, val2, sum
	smlatb	sum, val1, val4, sum
	smlabt	sum, val3, val4, sum
	mov		tmp, val3
	ldmia	val_p1!, {val1, val3}
	ldmia	val_p2!, {val2, val4}
	bgt		LR(0, b)	
	smlatb	sum, tmp, val2, sum
	smlabt	sum, val1, val2, sum
	smlatb	sum, val1, val4, sum
	smlabt	sum, val3, val4, sum
	ldmia	sp!,  {r4-r10, fp, ip, pc}

L(3)/*R1R2Odd:*/
	sub		len, len, #4
	ldrsh	val3, [val_p1], #2
	ldrsh	val4, [val_p2], #2
	ldr		val1, [val_p1], #4
	ldr		val2, [val_p2], #4
	smlabb	sum, val3, val4, sum
L(0)
	subs	len, len, #2	
	SKP_SMLAD	sum, val1, val2, sum
	ldr		val1, [val_p1], #4
	ldr		val2, [val_p2], #4
	bgt		LR(0, b)	
	ldrsh	val3, [val_p1], #2
	ldrsh	val4, [val_p2], #2
	SKP_SMLAD	sum, val1, val2, sum
	smlabb	sum, val3, val4, sum	
	ldmia	sp!,  {r4-r10, fp, ip, pc}
	
L(9)/*LenLessThan14:*/
	mov		len, r2
	mov		val_p2, r0
	mov		sum, #0
L(0)
	ldrsh	val1, [val_p1], #2
	ldrsh	val2, [val_p2], #2
	subs	len, len, #1
	smlabb	sum, val1, val2, sum
	bgt		LR(0, b)
	ldmia	sp!,  {r4-r10, fp, ip, pc}

/*
 *	SKP_Silk_inner_prod16_aligned_64(val1_16bit[], val2_16bit[], len)
 *
 *	Known issues:
 *		1. val1_16bit and val2_16bit need to be 16-bit aligned.
 *		2. The result is 64-bit, returned in r0 (low word) and r1 (high word).
 */
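/*
 *	For reference, a minimal C sketch of the scalar behaviour (C99 <stdint.h>
 *	types assumed; the actual SILK C prototype and type names may differ):
 *
 *		int64_t inner_prod16_aligned_64(const int16_t *v1, const int16_t *v2, int len)
 *		{
 *			int64_t sum = 0;
 *			for (int i = 0; i < len; i++)
 *				sum += (int64_t)((int32_t)v1[i] * (int32_t)v2[i]);
 *			return sum;	// this routine returns the value in r0 (low) / r1 (high)
 *		}
 */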

// Only redefine the registers that differ from the 32-bit routine above.
	VARDEF sumLo, r0
	VARDEF sumHi, r1

#ifdef IPHONE
	VARDEF val_p3, sl
	VARDEF val_5, sb
	VARDEF val_6, _r7
	VARDEF val_7, lr
	VARDEF val_8, ip
#else
	VARDEF val_p3, sb
	VARDEF val_5, r8
	VARDEF val_6, sl
	VARDEF val_7, lr
	VARDEF val_8, ip
#endif

 
.globl	SYM(SKP_Silk_inner_prod16_aligned_64)
SYM(SKP_Silk_inner_prod16_aligned_64):
	stmdb	sp!,  {r4-r10, fp, ip, lr}
	add		fp, sp, #36
	
	cmp			r2, #14
	blt			LR(9, f)/*LenLessThan14_64*/
	
	ands		tmp, r2, #1						/*check whether len is an even number*/
	mov			len, r2
	mov			val_p2, r0
	mov			val_p3, r1
	mov			sumLo, #0
	mov			sumHi, #0
	beq			LR(0, f)/*LenEven64*/
	
	ldrsh		val3, [val_p3], #2
	ldrsh		val4, [val_p2], #2
	sub			len, len, #1
	smlalbb		sumLo, sumHi, val3, val4
L(0)/*LenEven64:*/
	ands		val1, val_p3, #2				/*Check if val_p3 is 4-byte aligned.*/
	bgt			LR(1, f)/*R1Odd64*/
	ands		val2, val_p2, #2				/*Check if val_p2 is 4-byte aligned*/
	bgt			LR(2, f)/*R2Odd64*/
/*R1R2Even64:*/			
	ands		tmp, len, #3
	beq			LR(4, f)/*Len464*/
	sub			len, len, #2
	ldr			val1, [val_p3], #4
	ldr			val2, [val_p2], #4
	SKP_SMLALD	sumLo, sumHi, val1, val2
	
L(4)/*Len464:*/	
	ands		tmp, len, #7
	beq			LR(8, f)/*Len864*/
	sub			len, len, #4
	ldmia		val_p3!, {val1, val3}
	ldmia		val_p2!, {val2, val4}
	SKP_SMLALD	sumLo, sumHi, val1, val2
	SKP_SMLALD	sumLo, sumHi, val3, val4
	
L(8)/*Len864:*/	
	ldmia		val_p3!, {val1, val3, val_5, val_7}
	ldmia		val_p2!, {val2, val4, val_6, val_8}
L(0)
	subs		len, len, #8	
	SKP_SMLALD	sumLo, sumHi, val1, val2
	SKP_SMLALD	sumLo, sumHi, val3, val4
	SKP_SMLALD	sumLo, sumHi, val_5, val_6
	SKP_SMLALD	sumLo, sumHi, val_7, val_8
	ldmgtia		val_p3!, {val1, val3, val_5, val_7}
	ldmgtia		val_p2!, {val2, val4, val_6, val_8}
	bgt			LR(0, b)	
	ldmia	sp!,  {r4-r10, fp, ip, pc}
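
/*
 *	Misaligned-pointer handling below mirrors the 32-bit routine above,
 *	accumulating into the 64-bit sum with smlalbt/smlaltb.
 */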

L(2)/*R2Odd64:*/
	sub			len, len, #2
	sub			val_p2, val_p2, #2				/*make val_p2 even*/
	ldr			val1, [val_p3], #4
	ldr			val3, [val_p2], #4
	ldr			val2, [val_p2], #4
L(0)
	subs		len, len, #2	
	smlalbt		sumLo, sumHi, val1, val3
	smlaltb		sumLo, sumHi, val1, val2
	mov			val3, val2
	ldr			val1, [val_p3], #4
	ldr			val2, [val_p2], #4
	bgt			LR(0, b)	
	smlalbt		sumLo, sumHi, val1, val3
	smlaltb		sumLo, sumHi, val1, val2
	ldmia	sp!,  {r4-r10, fp, ip, pc}

L(1)/*R1Odd64:*/
	ands		val2, r2, #2					/*Check if val_p2 is 4-byte aligned*/
	bgt			LR(3, f)/*R1R2Odd64*/
	sub			len, len, #2
	sub			val_p3, val_p3, #2				/*make val_p3 even*/
	ldr			val3, [val_p3], #4
	ldr			val1, [val_p3], #4
	ldr			val2, [val_p2], #4
L(0)
	subs		len, len, #2	
	smlaltb		sumLo, sumHi, val3, val2
	smlalbt		sumLo, sumHi, val1, val2
	mov			val3, val1
	ldr			val1, [val_p3], #4
	ldr			val2, [val_p2], #4
	bgt			LR(0, b)	
	smlaltb		sumLo, sumHi, val3, val2
	smlalbt		sumLo, sumHi, val1, val2
	ldmia	sp!,  {r4-r10, fp, ip, pc}

L(3)/*R1R2Odd64:*/
	sub			len, len, #4
	ldrsh		val3, [val_p3], #2
	ldrsh		val4, [val_p2], #2
	ldr			val1, [val_p3], #4
	ldr			val2, [val_p2], #4
	smlalbb		sumLo, sumHi, val3, val4
L(0)
	subs		len, len, #2	
	SKP_SMLALD		sumLo, sumHi, val1, val2
	ldr			val1, [val_p3], #4
	ldr			val2, [val_p2], #4
	bgt			LR(0, b)	
	ldrsh		val3, [val_p3], #2
	ldrsh		val4, [val_p2], #2
	SKP_SMLALD		sumLo, sumHi, val1, val2
	smlalbb		sumLo, sumHi, val3, val4
	ldmia	sp!,  {r4-r10, fp, ip, pc}
	
L(9)/*LenLessThan14_64:*/
	mov			len, r2
	mov			val_p2, r0
	mov			val_p3, r1
	mov			sumLo, #0
	mov			sumHi, #0
L(0)
	ldrsh		val1, [val_p3], #2
	ldrsh		val2, [val_p2], #2
	subs		len, len, #1
	smlalbb		sumLo, sumHi, val1, val2
	bgt			LR(0, b)
	ldmia	sp!,  {r4-r10, fp, ip, pc}
#endif	
	END
#endif