sdl2-sys 0.38.0

Raw SDL2 bindings for Rust, used internally by rust-sdl2
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
/*
 * Copyright (c) 2016 RISC OS Open Ltd
 *
 * This software is provided 'as-is', without any express or implied
 * warranty.  In no event will the authors be held liable for any damages
 * arising from the use of this software.
 *
 * Permission is granted to anyone to use this software for any purpose,
 * including commercial applications, and to alter it and redistribute it
 * freely, subject to the following restrictions:
 *
 * 1. The origin of this software must not be misrepresented; you must not
 *    claim that you wrote the original software. If you use this software
 *    in a product, an acknowledgment in the product documentation would be
 *    appreciated but is not required.
 * 2. Altered source versions must be plainly marked as such, and must not be
 *    misrepresented as being the original software.
 * 3. This notice may not be removed or altered from any source distribution.
 */

/* Prevent the stack from becoming executable */
#if defined(__linux__) && defined(__ELF__)
.section .note.GNU-stack,"",%progbits
#endif

/* Assembler set-up for everything below:
 *   .text               emit into the code section
 *   .arch armv6         accept ARMv6 instructions (uxtb16, pkhbt, sel, uadd8, ...)
 *   .object_arch armv4  record ARMv4 as the architecture in the object file
 *   .arm                ARM (not Thumb) instruction encoding
 *   .altmacro           alternate macro syntax; the macros below rely on
 *                       %(expr) evaluation in macro arguments
 *   .p2align 2          align to a 4-byte boundary
 */
	.text
	.arch armv6
	.object_arch armv4
	.arm
	.altmacro
	.p2align 2

#include "pixman-arm-asm.h"
#include "pixman-arm-simd-asm.h"

/* A head macro should do all processing which results in an output of up to
 * 16 bytes, as far as the final load instruction. The corresponding tail macro
 * should complete the processing of the up-to-16 bytes. The calling macro will
 * sometimes choose to insert a preload or a decrement of X between them.
 *   cond           ARM condition code for code block
 *   numbytes       Number of output bytes that should be generated this time
 *   firstreg       First WK register in which to place output
 *   unaligned_src  Whether to use non-wordaligned loads of source image
 *   unaligned_mask Whether to use non-wordaligned loads of mask image
 *   preload        If outputting 16 bytes causes 64 bytes to be read, whether an extra preload should be output
 */

/******************************************************************************/

.macro FillRect32_init
        /* Fetch the 32-bit fill value from the stacked argument, then copy it
         * into the three other registers that FillRect_process_tail stores
         * from (it aliases SRC, STRIDE_S, MASK, STRIDE_M as WK4-WK7). */
        ldr     SRC, [sp, #ARGS_STACK_OFFSET]
        mov     STRIDE_M, SRC
        mov     MASK, SRC
        mov     STRIDE_S, SRC
.endm

.macro FillRect16_init
        /* Fetch the 16-bit fill value (zero-extended) from the stacked
         * argument and duplicate it into both halfwords of SRC. */
        ldrh    SRC, [sp, #ARGS_STACK_OFFSET]
        orr     SRC, SRC, SRC, lsl #16
        /* Replicate the pattern into the other three registers that
         * FillRect_process_tail stores from. */
        mov     STRIDE_M, SRC
        mov     MASK, SRC
        mov     STRIDE_S, SRC
.endm

.macro FillRect8_init
        /* Fetch the 8-bit fill value (zero-extended) from the stacked
         * argument and smear it across all four bytes of SRC:
         * 000000vv -> 0000vvvv -> vvvvvvvv. */
        ldrb    SRC, [sp, #ARGS_STACK_OFFSET]
        orr     SRC, SRC, SRC, lsl #8
        orr     SRC, SRC, SRC, lsl #16
        /* Replicate the pattern into the other three registers that
         * FillRect_process_tail stores from. */
        mov     STRIDE_M, SRC
        mov     MASK, SRC
        mov     STRIDE_S, SRC
.endm

.macro FillRect_process_tail  cond, numbytes, firstreg
    /* The fill pattern lives in SRC, STRIDE_S, MASK and STRIDE_M (loaded by
     * the FillRect*_init macros). Give those registers the temporary names
     * WK4-WK7 so that pixst, invoked with register index 4, can refer to
     * them; firstreg is deliberately not used. The aliases are dropped again
     * before the macro ends. */
    WK4     .req    SRC
    WK5     .req    STRIDE_S
    WK6     .req    MASK
    WK7     .req    STRIDE_M
        pixst   cond, numbytes, 4, DST
    .unreq  WK7
    .unreq  WK6
    .unreq  WK5
    .unreq  WK4
.endm

/* Instantiate FillRect32ARMSIMDAsm through generate_composite_function (from
 * the included pixman header). The 0, 0, 32 arguments are presumably the
 * source, mask and destination bits per pixel - confirm against the header.
 * There is no process-head work: the fill value is set up once by
 * FillRect32_init and FillRect_process_tail performs the stores.
 * Every argument is separated by an explicit comma, as in the other
 * generate_composite_function invocations in this file, rather than relying
 * on whitespace splitting of macro arguments. */
generate_composite_function \
    FillRect32ARMSIMDAsm, 0, 0, 32, \
    FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_PROCESS_PRESERVES_PSR | FLAG_PROCESS_DOES_STORE | FLAG_PROCESS_PRESERVES_SCRATCH, \
    0, /* prefetch distance doesn't apply */ \
    FillRect32_init, \
    nop_macro, /* newline */ \
    nop_macro, /* cleanup */ \
    nop_macro, /* process head */ \
    FillRect_process_tail

/* Instantiate FillRect16ARMSIMDAsm through generate_composite_function (from
 * the included pixman header). The 0, 0, 16 arguments are presumably the
 * source, mask and destination bits per pixel - confirm against the header.
 * There is no process-head work: the fill value is set up once by
 * FillRect16_init and FillRect_process_tail performs the stores.
 * Every argument is separated by an explicit comma, as in the other
 * generate_composite_function invocations in this file, rather than relying
 * on whitespace splitting of macro arguments. */
generate_composite_function \
    FillRect16ARMSIMDAsm, 0, 0, 16, \
    FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_PROCESS_PRESERVES_PSR | FLAG_PROCESS_DOES_STORE | FLAG_PROCESS_PRESERVES_SCRATCH, \
    0, /* prefetch distance doesn't apply */ \
    FillRect16_init, \
    nop_macro, /* newline */ \
    nop_macro, /* cleanup */ \
    nop_macro, /* process head */ \
    FillRect_process_tail

/* Instantiate FillRect8ARMSIMDAsm through generate_composite_function (from
 * the included pixman header). The 0, 0, 8 arguments are presumably the
 * source, mask and destination bits per pixel - confirm against the header.
 * There is no process-head work: the fill value is set up once by
 * FillRect8_init and FillRect_process_tail performs the stores.
 * Every argument is separated by an explicit comma, as in the other
 * generate_composite_function invocations in this file, rather than relying
 * on whitespace splitting of macro arguments. */
generate_composite_function \
    FillRect8ARMSIMDAsm, 0, 0, 8, \
    FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_PROCESS_PRESERVES_PSR | FLAG_PROCESS_DOES_STORE | FLAG_PROCESS_PRESERVES_SCRATCH, \
    0, /* prefetch distance doesn't apply */ \
    FillRect8_init, \
    nop_macro, /* newline */ \
    nop_macro, /* cleanup */ \
    nop_macro, /* process head */ \
    FillRect_process_tail

/******************************************************************************/

/* This differs from the over_8888_8888 routine in Pixman in that the destination
 * alpha component is always left unchanged, and RGB components are not
 * premultiplied by alpha. It differs from BlitRGBtoRGBPixelAlpha in that
 * renormalisation is done by multiplying by 257/256 (with rounding) rather than
 * simply shifting right by 8 bits - removing the need to special-case alpha=0xff.
 */

/* One-time set-up for BlitRGBtoRGBPixelAlphaARMSIMDAsm. */
.macro RGBtoRGBPixelAlpha_init
        /* line_saved_regs is defined in the included header.
         * NOTE(review): presumably it declares that STRIDE_S and ORIG_W are
         * spilled so the process macros may use them as temporaries (this
         * function sets FLAG_SPILL_LINE_VARS) - confirm against the header. */
        line_saved_regs STRIDE_S, ORIG_W
        /* MASK = 0x80: handed to RGBtoRGBPixelAlpha_1pixel_translucent as
         * its "half" rounding term. */
        mov     MASK, #0x80
.endm

/* Blend one 32bpp source pixel into one 32bpp destination pixel.
 *   s          source pixel; destroyed (left holding its own top byte, the alpha)
 *   d          destination pixel; receives the blended result
 *   tmp0-tmp3  scratch registers
 *   half       rounding constant added to every product (callers pass MASK,
 *              which RGBtoRGBPixelAlpha_init sets to 0x80)
 * For each of bytes 0, 1 and 2 the macro computes
 *   t = (s.byte - d.byte) * alpha + half;  t += t >> 8;
 * and adds bits 8-15 of t to d.byte (modulo 256). Byte 3 of d is left
 * unchanged because the addend has a zero in byte 3.
 * uadd8 updates the GE flags.
 */
.macro RGBtoRGBPixelAlpha_1pixel_translucent  s, d, tmp0, tmp1, tmp2, tmp3, half
        uxtb    tmp3, s                         /* byte 0 of s */
        uxtb    tmp0, d                         /* byte 0 of d */
        sub     tmp0, tmp3, tmp0                /* tmp0 = s.b0 - d.b0 */
        uxtb    tmp3, s, ror #16                /* byte 2 of s */
        uxtb    tmp1, d, ror #16                /* byte 2 of d */
        sub     tmp1, tmp3, tmp1                /* tmp1 = s.b2 - d.b2 */
        uxtb    tmp3, s, ror #8                 /* byte 1 of s */
        mov     s, s, lsr #24                   /* s = alpha (byte 3); source pixel no longer needed */
        uxtb    tmp2, d, ror #8                 /* byte 1 of d */
        sub     tmp2, tmp3, tmp2                /* tmp2 = s.b1 - d.b1 */
        smlabb  tmp0, tmp0, s, half             /* difference * alpha + half */
        smlabb  tmp1, tmp1, s, half
        smlabb  tmp2, tmp2, s, half
        add     tmp0, tmp0, asr #8              /* t += t >> 8 (the 257/256 renormalisation) */
        add     tmp1, tmp1, asr #8
        add     tmp2, tmp2, asr #8
        pkhbt   tmp0, tmp0, tmp1, lsl #16       /* low half = byte-0 term, high half = byte-2 term */
        and     tmp2, tmp2, #0xff00             /* byte-1 delta, already in bits 8-15 */
        uxtb16  tmp0, tmp0, ror #8              /* bits 8-15 of each half -> bytes 0 and 2 */
        orr     tmp0, tmp0, tmp2                /* addend = 00 / b2 delta / b1 delta / b0 delta */
        uadd8   d, d, tmp0                      /* per-byte add into the destination */
.endm

/* Fully opaque source pixel: the result is the low three bytes of s combined
 * with the unchanged top byte of d. Both s and d are modified. */
.macro RGBtoRGBPixelAlpha_1pixel_opaque  s, d
        bic     s, s, #0xff000000       /* drop the source top byte */
        and     d, d, #0xff000000       /* keep only the destination top byte */
        orr     d, d, s
.endm

/* Load stage for BlitRGBtoRGBPixelAlpha. SRC and DST are post-incremented
 * past the pixels of this block. On exit the Z flag is set when the top
 * (alpha) byte of every loaded source pixel is zero; for the 8- and 16-byte
 * cases ORIG_W holds the AND of all loaded source pixels so the tail can
 * detect the all-opaque case.
 * Only numbytes is used; the other parameters are ignored.
 */
.macro RGBtoRGBPixelAlpha_process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
 .if numbytes == 16
        /* Four source pixels in WK0, WK1, STRIDE_S, STRIDE_M; only the first
         * two destination pixels are loaded, although DST advances by 16. */
        ldm     SRC!, {WK0, WK1}
        ldm     SRC!, {STRIDE_S, STRIDE_M}
        ldrd    WK2, WK3, [DST], #16
        /* SCRATCH = OR of the four sources, ORIG_W = AND of the four sources */
        orr     SCRATCH, WK0, WK1
        and     ORIG_W, WK0, WK1
        orr     SCRATCH, SCRATCH, STRIDE_S
        and     ORIG_W, ORIG_W, STRIDE_S
        orr     SCRATCH, SCRATCH, STRIDE_M
        and     ORIG_W, ORIG_W, STRIDE_M
        tst     SCRATCH, #0xff000000
 .elseif numbytes == 8
        /* Two source pixels in WK0/WK1, two destination pixels in WK2/WK3 */
        ldm     SRC!, {WK0, WK1}
        ldm     DST!, {WK2, WK3}
        orr     SCRATCH, WK0, WK1
        and     ORIG_W, WK0, WK1
        tst     SCRATCH, #0xff000000
 .else // numbytes == 4
        /* One source pixel in WK0, one destination pixel in WK2 */
        ldr     WK0, [SRC], #4
        ldr     WK2, [DST], #4
        tst     WK0, #0xff000000
 .endif
.endm

/* Blend-and-store stage for BlitRGBtoRGBPixelAlpha. Expects the registers and
 * the Z flag exactly as left by RGBtoRGBPixelAlpha_process_head.
 *   Z set                       -> every source alpha is zero: nothing is stored
 *   ORIG_W (or WK0) >= 0xff000000 -> every source alpha is 0xff: opaque path
 *   otherwise                   -> each pixel goes through the translucent blend
 * Results are stored at negative offsets from DST because the head has
 * already advanced it. Only numbytes is used.
 */
.macro RGBtoRGBPixelAlpha_process_tail  cond, numbytes, firstreg
        beq     20f @ all transparent
 .if numbytes == 16
        cmp     ORIG_W, #0xff000000
        bhs     10f @ all opaque
        /* First two pixels. STRIDE_S/STRIDE_M still hold source pixels 2 and
         * 3 here but are used as temporaries by the blend, and WK0/WK1 are
         * destroyed, so the second pair is reloaded from memory afterwards. */
        RGBtoRGBPixelAlpha_1pixel_translucent WK0, WK2, STRIDE_S, STRIDE_M, SCRATCH, ORIG_W, MASK
        RGBtoRGBPixelAlpha_1pixel_translucent WK1, WK3, STRIDE_S, STRIDE_M, SCRATCH, ORIG_W, MASK
        strd    WK2, WK3, [DST, #-16]
        ldrd    WK0, WK1, [SRC, #-8]
        ldrd    WK2, WK3, [DST, #-8]
        RGBtoRGBPixelAlpha_1pixel_translucent WK0, WK2, STRIDE_S, STRIDE_M, SCRATCH, ORIG_W, MASK
        RGBtoRGBPixelAlpha_1pixel_translucent WK1, WK3, STRIDE_S, STRIDE_M, SCRATCH, ORIG_W, MASK
        b       19f
        /* Opaque path, same two-pixels-at-a-time shape */
10:     RGBtoRGBPixelAlpha_1pixel_opaque WK0, WK2
        RGBtoRGBPixelAlpha_1pixel_opaque WK1, WK3
        strd    WK2, WK3, [DST, #-16]
        ldrd    WK0, WK1, [SRC, #-8]
        ldrd    WK2, WK3, [DST, #-8]
        RGBtoRGBPixelAlpha_1pixel_opaque WK0, WK2
        RGBtoRGBPixelAlpha_1pixel_opaque WK1, WK3
        /* Both paths meet here to store the second pair */
19:     strd    WK2, WK3, [DST, #-8]
 .elseif numbytes == 8
        cmp     ORIG_W, #0xff000000
        bhs     10f @ all opaque
        RGBtoRGBPixelAlpha_1pixel_translucent WK0, WK2, STRIDE_S, STRIDE_M, SCRATCH, ORIG_W, MASK
        RGBtoRGBPixelAlpha_1pixel_translucent WK1, WK3, STRIDE_S, STRIDE_M, SCRATCH, ORIG_W, MASK
        b       19f
10:     RGBtoRGBPixelAlpha_1pixel_opaque WK0, WK2
        RGBtoRGBPixelAlpha_1pixel_opaque WK1, WK3
19:     strd    WK2, WK3, [DST, #-8]
 .else // numbytes == 4
        cmp     WK0, #0xff000000
        bhs     10f @ opaque
        RGBtoRGBPixelAlpha_1pixel_translucent WK0, WK2, STRIDE_S, STRIDE_M, SCRATCH, ORIG_W, MASK
        b       19f
10:     RGBtoRGBPixelAlpha_1pixel_opaque WK0, WK2
19:     str     WK2, [DST, #-4]
 .endif
        /* Landing point for the all-transparent early exit */
20:
.endm

/* Instantiate BlitRGBtoRGBPixelAlphaARMSIMDAsm through
 * generate_composite_function (from the included pixman header).
 * NOTE(review): the 32, 0, 32 arguments are presumably the source, mask and
 * destination bits per pixel - confirm against the header.
 * FLAG_PROCESS_CORRUPTS_PSR / FLAG_PROCESS_CORRUPTS_WK0 match the process
 * macros above, which change the flags and overwrite WK0. */
generate_composite_function \
    BlitRGBtoRGBPixelAlphaARMSIMDAsm, 32, 0, 32, \
    FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS | FLAG_PROCESS_CORRUPTS_WK0, \
    2, /* prefetch distance */ \
    RGBtoRGBPixelAlpha_init, \
    nop_macro, /* newline */ \
    nop_macro, /* cleanup */ \
    RGBtoRGBPixelAlpha_process_head, \
    RGBtoRGBPixelAlpha_process_tail

/******************************************************************************/

/* One-time set-up for BlitARGBto565PixelAlphaARMSIMDAsm: builds the two
 * constants that stay live for the whole blit. */
.macro ARGBto565PixelAlpha_init
        line_saved_regs STRIDE_D, STRIDE_S, ORIG_W
        /* MASK = 0x001f001f, passed to the pixel macros as rbmask */
        mov     MASK, #0x001f
        orr     MASK, MASK, MASK, lsl #16
        /* STRIDE_M = 0x00100010, passed to the pixel macros as rbhalf */
        mov     STRIDE_M, #0x0010
        orr     STRIDE_M, STRIDE_M, STRIDE_M, lsl #16
.endm

/* Start-of-line set-up: (re)load STRIDE_S with 0x0200, the value the pixel
 * macros receive as ghalf. */
.macro ARGBto565PixelAlpha_newline
        mov     STRIDE_S, #0x0200
.endm

/* Blend one 32bpp source pixel into one 16bpp (565) destination pixel.
 * On entry:
 * s holds 1 32bpp source pixel
 * d holds 1 16bpp destination pixel
 * rbmask, rbhalf, ghalf hold 0x001f001f, 0x00100010, 0x00000200 respectively
 * other registers (alpha, rb, g, misc) are temporaries
 * On exit:
 * Constant registers preserved
 * The low halfword of d holds the blended 565 pixel; the upper halfword of d
 * is not meaningful. s is destroyed.
 * The top 5 bits of the source are used as the alpha. The two 5-bit fields
 * are processed together in one register (bits 0-4 and 16-20); the 6-bit
 * middle field is processed separately in its 565 position (bits 5-10).
 */

.macro ARGBto565PixelAlpha_1pixel_translucent  s, d, rbmask, rbhalf, ghalf, alpha, rb, g, misc
        mov     alpha, s, lsr #27               /* alpha = top 5 bits of s */
        and     misc, s, #0xfc00                /* top 6 bits of source byte 1 */
        and     g, d, #0x07e0                   /* destination bits 5-10 */
        pkhbt   rb, d, d, lsl #5                /* d bits 11-15 moved up to bits 16-20 */
        rsb     misc, g, misc, lsr #5           /* misc = source middle field (565 position) - g */
        and     s, rbmask, s, lsr #3            /* top 5 bits of source bytes 0 and 2 */
        and     rb, rbmask, rb                  /* destination 5-bit fields at bits 0-4 and 16-20 */
        sub     s, s, rb                        /* per-field source - destination */
        smlabb  misc, misc, alpha, ghalf        /* difference * alpha + rounding */
        mla     s, s, alpha, rbhalf
        add     misc, misc, misc, lsl #5        /* multiply by 33 */
        add     g, g, misc, asr #10             /* g += scaled difference */
        add     s, s, s, lsl #5
        and     g, g, #0x07e0
        add     rb, rb, s, asr #10
        and     rb, rb, rbmask
        pkhbt   rb, rb, rb, lsl #11             /* bits 16-20 field moved to bits 27-31 */
        orr     d, rb, g
        orr     d, d, rb, lsr #16               /* ...and brought down to bits 11-15 */
.endm

/* Convert one fully opaque 32bpp source pixel to 565.
 * On entry:
 * s holds 1 32bpp source pixel
 * d is overwritten without being read
 * rbmask holds 0x001f001f
 * On exit:
 * Constant registers preserved
 * The low halfword of d holds the 565 pixel; the upper halfword of d is not
 * meaningful. s is destroyed.
 * The instruction order matters: d is derived from s before s is masked.
 */

.macro ARGBto565PixelAlpha_1pixel_opaque  s, d, rbmask
        and     d, rbmask, s, lsr #3            /* top 5 bits of source bytes 0 and 2 */
        and     s, s, #0xfc00                   /* top 6 bits of source byte 1 */
        orr     d, d, d, lsr #5                 /* copy the bits 16-20 field down to bits 11-15 */
        orr     d, d, s, lsr #5                 /* middle field into bits 5-10 */
.endm

/* Blend two 32bpp source pixels into two 16bpp (565) destination pixels.
 * On entry:
 * s1, s2 hold 2 32bpp source pixels
 * d holds 2 16bpp destination pixels
 * rbmask, rbhalf, ghalf hold 0x001f001f, 0x00100010, 0x00000200 respectively
 * other registers are temporaries
 * On exit:
 * Constant registers preserved
 * Blended results have been written through destination pointer
 * (halfwords at [DST, #-4] and [DST, #-2]); s1, s2 and d are destroyed.
 *
 * This is the single-pixel translucent blend performed twice with the two
 * instruction streams interleaved. Instructions at the normal indent belong
 * to the first pixel (s1, low halfword of d); instructions indented two
 * extra columns belong to the second pixel (s2, high halfword of d). For the
 * second pixel, d is reused as the two-field accumulator, s1 as the middle
 * field accumulator, and alpha to assemble each output halfword.
 */

.macro ARGBto565PixelAlpha_2pixels_translucent  s1, s2, d, rbmask, rbhalf, ghalf, alpha, rb, g, misc
        mov     alpha, s1, lsr #27
        and     misc, s1, #0xfc00
        and     g, d, #0x07e0
        pkhbt   rb, d, d, lsl #5
        rsb     misc, g, misc, lsr #5
        and     s1, rbmask, s1, lsr #3
        and     rb, rbmask, rb
        sub     s1, s1, rb
        smlabb  misc, misc, alpha, ghalf
        mla     s1, s1, alpha, rbhalf
          uxth    d, d, ror #16
        add     misc, misc, misc, lsl #5
          mov     alpha, s2, lsr #27
        add     g, g, misc, asr #10
        add     s1, s1, s1, lsl #5
        and     g, g, #0x07e0
        add     rb, rb, s1, asr #10
        and     rb, rb, rbmask
          and     misc, s2, #0xfc00
        pkhbt   rb, rb, rb, lsl #11
          and     s1, d, #0x07e0
          pkhbt   d, d, d, lsl #5
          rsb     misc, s1, misc, lsr #5
          and     s2, rbmask, s2, lsr #3
          and     d, rbmask, d
          sub     s2, s2, d
          smlabb  misc, misc, alpha, ghalf
          mla     s2, s2, alpha, rbhalf
        orr     alpha, rb, g
          add     misc, misc, misc, lsl #5
        orr     alpha, alpha, rb, lsr #16
          add     s1, s1, misc, asr #10
          add     s2, s2, s2, lsl #5
          and     s1, s1, #0x07e0
          add     d, d, s2, asr #10
          and     d, d, rbmask
        strh    alpha, [DST, #-4]
          pkhbt   d, d, d, lsl #11
          orr     alpha, d, s1
          orr     alpha, alpha, d, lsr #16
          strh    alpha, [DST, #-2]
.endm

/* Convert two fully opaque 32bpp source pixels to 565 and store them.
 * On entry:
 * s1, s2 hold 2 32bpp source pixels
 * rbmask holds 0x001f001f
 * other registers are temporaries (d is overwritten without being read;
 * g holds the masked middle field of the pixel being converted)
 * On exit:
 * Constant registers preserved
 * Results have been written through destination pointer
 * (halfwords at [DST, #-4] and [DST, #-2]); s1 is destroyed.
 *
 * Instructions at the normal indent belong to the first pixel (built in d);
 * instructions indented two extra columns belong to the second pixel, which
 * is built in s1 once the first pixel no longer needs it.
 */

.macro ARGBto565PixelAlpha_2pixels_opaque  s1, s2, d, rbmask, g
        and     g, s1, #0xfc00
        and     d, rbmask, s1, lsr #3
          and     s1, rbmask, s2, lsr #3
        orr     d, d, d, lsr #5
        orr     d, d, g, lsr #5
          and     g, s2, #0xfc00
        strh    d, [DST, #-4]
          orr     s1, s1, s1, lsr #5
          orr     s1, s1, g, lsr #5
          strh    s1, [DST, #-2]
.endm

/* Load two source pixels (WK0, WK1) and the word holding two destination
 * pixels (WK2), post-incrementing SRC by 8 and DST by 4.
 * On exit: SCRATCH = WK0 | WK1, ORIG_W = WK0 & WK1, and the Z flag is set
 * when both source top (alpha) bytes are zero. */
.macro ARGBto565PixelAlpha_2pixels_head
        ldrd    WK0, WK1, [SRC], #8
        ldr     WK2, [DST], #4
        orr     SCRATCH, WK0, WK1
        and     ORIG_W, WK0, WK1
        tst     SCRATCH, #0xff000000
.endm

/* Finish a pixel pair loaded by ARGBto565PixelAlpha_2pixels_head, using the
 * flags and ORIG_W it left behind:
 *   Z set                  -> both pixels transparent, nothing is stored
 *   ORIG_W >= 0xff000000   -> both pixels opaque, plain conversion
 *   otherwise              -> translucent blend of both pixels
 * The invoked macros perform the stores themselves. */
.macro ARGBto565PixelAlpha_2pixels_tail
        beq     20f @ all transparent
        cmp     ORIG_W, #0xff000000
        bhs     10f @ all opaque
        ARGBto565PixelAlpha_2pixels_translucent  WK0, WK1, WK2, MASK, STRIDE_M, STRIDE_S, STRIDE_D, WK3, SCRATCH, ORIG_W
        b       20f
10:     ARGBto565PixelAlpha_2pixels_opaque  WK0, WK1, WK2, MASK, SCRATCH
20:
.endm

/* Head stage for BlitARGBto565PixelAlpha. numbytes counts destination bytes
 * (2 per pixel). All pixel pairs except the last are processed completely
 * here (head immediately followed by tail); only the loads for the last pair
 * are issued, leaving its blend to ARGBto565PixelAlpha_process_tail:
 *   numbytes == 16 -> 3 complete pairs + 1 pair loaded
 *   numbytes == 8  -> 1 complete pair  + 1 pair loaded
 *   numbytes == 4  -> 1 pair loaded
 *   otherwise (2)  -> a single pixel is loaded (source in WK0, destination
 *                     halfword in WK2) and Z is set when its alpha is zero
 * Only numbytes is used; the other parameters are ignored.
 */
.macro ARGBto565PixelAlpha_process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
 .if numbytes == 16
        ARGBto565PixelAlpha_2pixels_head
        ARGBto565PixelAlpha_2pixels_tail
        ARGBto565PixelAlpha_2pixels_head
        ARGBto565PixelAlpha_2pixels_tail
 .endif
 .if numbytes >= 8
        ARGBto565PixelAlpha_2pixels_head
        ARGBto565PixelAlpha_2pixels_tail
 .endif
 .if numbytes >= 4
        ARGBto565PixelAlpha_2pixels_head
 .else // numbytes == 2
        ldr     WK0, [SRC], #4
        ldrh    WK2, [DST], #2
        tst     WK0, #0xff000000
 .endif
.endm

/* Tail stage for BlitARGBto565PixelAlpha: completes the last pixel pair (or
 * the single pixel when numbytes is 2) whose loads were issued by
 * ARGBto565PixelAlpha_process_head. Relies on the flags left by that head.
 * Only numbytes is used. */
.macro ARGBto565PixelAlpha_process_tail  cond, numbytes, firstreg
 .if numbytes >= 4
        ARGBto565PixelAlpha_2pixels_tail
 .else // numbytes == 2
        beq     20f @ all transparent
        cmp     WK0, #0xff000000
        bhs     10f @ opaque
        ARGBto565PixelAlpha_1pixel_translucent  WK0, WK2, MASK, STRIDE_M, STRIDE_S, STRIDE_D, WK3, SCRATCH, ORIG_W
        b       19f
10:     ARGBto565PixelAlpha_1pixel_opaque WK0, WK2, MASK
        /* Both non-transparent paths store the resulting halfword here */
19:     strh    WK2, [DST, #-2]
20:
 .endif
.endm

/* Instantiate BlitARGBto565PixelAlphaARMSIMDAsm through
 * generate_composite_function (from the included pixman header).
 * NOTE(review): the 32, 0, 16 arguments are presumably the source, mask and
 * destination bits per pixel - confirm against the header.
 * Unlike the other blitters in this file it supplies a newline macro, which
 * reloads the ghalf constant into STRIDE_S. */
generate_composite_function \
    BlitARGBto565PixelAlphaARMSIMDAsm, 32, 0, 16, \
    FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS | FLAG_PROCESS_CORRUPTS_WK0, \
    2, /* prefetch distance */ \
    ARGBto565PixelAlpha_init, \
    ARGBto565PixelAlpha_newline, \
    nop_macro, /* cleanup */ \
    ARGBto565PixelAlpha_process_head, \
    ARGBto565PixelAlpha_process_tail

 /******************************************************************************/

/* Swap bytes 0 and 2 of one 32-bit pixel in place; bytes 1 and 3 are kept.
 *   cond  condition-code suffix appended to every instruction
 *   reg   index of the WK register holding the pixel
 *   tmp   scratch register
 */
.macro BGR888toRGB888_1pixel cond, reg, tmp
        uxtb16&cond  tmp, WK&reg, ror #8            /* tmp = 00 b3 00 b1 */
        uxtb16&cond  WK&reg, WK&reg, ror #16        /* WK  = 00 b0 00 b2 */
        orr&cond     WK&reg, WK&reg, tmp, lsl #8    /* WK  = b3 b0 b1 b2 */
.endm

/* Swap bytes 0 and 2 of two 32-bit pixels in place; this is the one-pixel
 * sequence applied to two registers with the instructions interleaved.
 *   cond        condition-code suffix appended to every instruction
 *   reg1, reg2  indices of the WK registers holding the pixels
 *   tmp1, tmp2  scratch registers, one per pixel
 */
.macro BGR888toRGB888_2pixels cond, reg1, reg2, tmp1, tmp2
        uxtb16&cond  tmp1, WK&reg1, ror #8
        uxtb16&cond  WK&reg1, WK&reg1, ror #16
        uxtb16&cond  tmp2, WK&reg2, ror #8
        uxtb16&cond  WK&reg2, WK&reg2, ror #16
        orr&cond     WK&reg1, WK&reg1, tmp1, lsl #8
        orr&cond     WK&reg2, WK&reg2, tmp2, lsl #8
.endm

/* Head stage for Blit_BGR888_RGB888: delegates to pixld (from the included
 * header) for numbytes of source data starting at WK register firstreg.
 * unaligned_mask and preload are not used. */
.macro BGR888toRGB888_process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
        pixld   cond, numbytes, firstreg, SRC, unaligned_src
.endm

/* Tail stage for Blit_BGR888_RGB888: byte-swaps the loaded pixels in place in
 * WK registers firstreg onwards - one pixel for 4 bytes, two for 8, four for
 * 16. MASK and STRIDE_M serve as scratch registers. No store is issued here. */
.macro BGR888toRGB888_process_tail  cond, numbytes, firstreg
 .if numbytes >= 8
        BGR888toRGB888_2pixels cond, %(firstreg+0), %(firstreg+1), MASK, STRIDE_M
  .if numbytes == 16
        BGR888toRGB888_2pixels cond, %(firstreg+2), %(firstreg+3), MASK, STRIDE_M
  .endif
 .else @ numbytes == 4
        BGR888toRGB888_1pixel cond, %(firstreg+0), MASK
 .endif
.endm

/* Instantiate Blit_BGR888_RGB888ARMSIMDAsm through
 * generate_composite_function (from the included pixman header).
 * NOTE(review): the 32, 0, 32 arguments are presumably the source, mask and
 * destination bits per pixel - confirm against the header.
 * No init, newline or cleanup work is needed. */
generate_composite_function \
    Blit_BGR888_RGB888ARMSIMDAsm, 32, 0, 32, \
    FLAG_DST_WRITEONLY | FLAG_COND_EXEC | FLAG_PROCESS_PRESERVES_SCRATCH, \
    2, /* prefetch distance */ \
    nop_macro, /* init */ \
    nop_macro, /* newline */ \
    nop_macro, /* cleanup */ \
    BGR888toRGB888_process_head, \
    BGR888toRGB888_process_tail

/******************************************************************************/

/* One-time set-up for Blit_RGB444_RGB888ARMSIMDAsm. */
.macro RGB444toRGB888_init
        /* GE[3:0] = 0101: the SEL instructions in the pixel macros then take
         * bytes 0 and 2 from their first operand and bytes 1 and 3 from
         * their second. */
        msr     CPSR_s, #0x50000
        /* Mask selecting the low nibble of every byte */
        ldr     MASK, =0x0f0f0f0f
.endm

/* Expand one 16-bit pixel of four 4-bit fields (in the low halfword of
 * WK<reg>) to a 32-bit pixel of four 8-bit fields, in place. Each nibble is
 * duplicated into both halves of its byte.
 *   reg   index of the WK register holding the pixel
 *   mask  register holding 0x0f0f0f0f
 *   tmp   scratch register
 * Requires GE[3:0] = 0101 (set by RGB444toRGB888_init) for the final SEL.
 * The trailing comments show the register contents after each instruction.
 */
.macro RGB444toRGB888_1pixel reg, mask, tmp
        pkhbt   WK&reg, WK&reg, WK&reg, lsl #12      @ 0000aaaarrrrggggaaaarrrrggggbbbb
        and     WK&reg, mask, WK&reg                 @ 0000aaaa0000gggg0000rrrr0000bbbb
        orr     WK&reg, WK&reg, WK&reg, lsl #4       @ aaaaaaaaggggggggrrrrrrrrbbbbbbbb
        pkhtb   tmp, WK&reg, WK&reg, asr #8          @ aaaaaaaaggggggggggggggggrrrrrrrr
        pkhbt   WK&reg, WK&reg, WK&reg, lsl #8       @ ggggggggrrrrrrrrrrrrrrrrbbbbbbbb
        sel     WK&reg, WK&reg, tmp                  @ aaaaaaaarrrrrrrrggggggggbbbbbbbb
.endm

/* Expand the two 16-bit pixels packed in WK<in> to two 32-bit pixels.
 *   in          index of the WK register holding both input pixels
 *   out1, out2  indices of the WK registers receiving the expanded low-
 *               and high-halfword pixel respectively
 *   mask        register holding 0x0f0f0f0f
 *   tmp1, tmp2  scratch registers
 * WK<in> is read only by the first two instructions, before either output
 * is written, so an output register may be the same as the input (the
 * caller passes the same index for in and out1).
 * Requires GE[3:0] = 0101 (set by RGB444toRGB888_init) for the SELs.
 * In the trailing comments lower-case letters are fields of the
 * low-halfword pixel and upper-case letters those of the high-halfword one.
 */
.macro RGB444toRGB888_2pixels in, out1, out2, mask, tmp1, tmp2
        and     tmp1, mask, WK&in                    @ 0000RRRR0000BBBB0000rrrr0000bbbb
        and     tmp2, mask, WK&in, lsr #4            @ 0000AAAA0000GGGG0000aaaa0000gggg
        orr     tmp1, tmp1, tmp1, lsl #4             @ RRRRRRRRBBBBBBBBrrrrrrrrbbbbbbbb
        orr     tmp2, tmp2, tmp2, lsl #4             @ AAAAAAAAGGGGGGGGaaaaaaaagggggggg
        pkhtb   WK&out2, tmp2, tmp1, asr #16         @ AAAAAAAAGGGGGGGGRRRRRRRRBBBBBBBB
        pkhbt   WK&out1, tmp1, tmp2, lsl #16         @ aaaaaaaaggggggggrrrrrrrrbbbbbbbb
        pkhtb   tmp2, WK&out2, WK&out2, asr #8       @ AAAAAAAAGGGGGGGGGGGGGGGGRRRRRRRR
        pkhtb   tmp1, WK&out1, WK&out1, asr #8       @ aaaaaaaaggggggggggggggggrrrrrrrr
        pkhbt   WK&out1, WK&out1, WK&out1, lsl #8    @ ggggggggrrrrrrrrrrrrrrrrbbbbbbbb
        pkhbt   WK&out2, WK&out2, WK&out2, lsl #8    @ GGGGGGGGRRRRRRRRRRRRRRRRBBBBBBBB
        sel     WK&out1, WK&out1, tmp1               @ aaaaaaaarrrrrrrrggggggggbbbbbbbb
        sel     WK&out2, WK&out2, tmp2               @ AAAAAAAARRRRRRRRGGGGGGGGBBBBBBBB
.endm

/* Head stage for Blit_RGB444_RGB888: delegates to pixld (from the included
 * header) for numbytes/2 bytes of source data starting at WK register
 * firstreg. Half the byte count is requested because each output pixel is
 * produced from a 16-bit input pixel (see RGB444toRGB888_process_tail).
 * unaligned_mask and preload are not used. */
.macro RGB444toRGB888_process_head  cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload
        pixld   cond, numbytes/2, firstreg, SRC, unaligned_src
.endm

/* Tail stage for Blit_RGB444_RGB888: expands the packed 16-bit pixels loaded
 * by the head into 32-bit pixels in WK registers firstreg onwards.
 * For 16 output bytes the second input word (firstreg+1) is expanded first,
 * into firstreg+2/+3, because expanding the first input word overwrites
 * firstreg+1. STRIDE_M and SCRATCH serve as scratch registers.
 * cond is not used. */
.macro RGB444toRGB888_process_tail  cond, numbytes, firstreg
 .if numbytes >= 8
  .if numbytes == 16
        RGB444toRGB888_2pixels %(firstreg+1), %(firstreg+2), %(firstreg+3), MASK, STRIDE_M, SCRATCH
  .endif
        RGB444toRGB888_2pixels %(firstreg+0), %(firstreg+0), %(firstreg+1), MASK, STRIDE_M, SCRATCH
 .else @ numbytes == 4
        RGB444toRGB888_1pixel %(firstreg+0), MASK, SCRATCH
 .endif
.endm

/* Instantiate Blit_RGB444_RGB888ARMSIMDAsm through
 * generate_composite_function (from the included pixman header).
 * NOTE(review): the 16, 0, 32 arguments are presumably the source, mask and
 * destination bits per pixel - confirm against the header.
 * The init macro loads the nibble mask and sets the GE flags that the
 * process macros depend on. */
generate_composite_function \
    Blit_RGB444_RGB888ARMSIMDAsm, 16, 0, 32, \
    FLAG_DST_WRITEONLY | FLAG_BRANCH_OVER, \
    2, /* prefetch distance */ \
    RGB444toRGB888_init, \
    nop_macro, /* newline */ \
    nop_macro, /* cleanup */ \
    RGB444toRGB888_process_head, \
    RGB444toRGB888_process_tail