use cranelift_codegen::ir::condcodes::IntCC;
use cranelift_codegen::ir::types;
use cranelift_codegen::ir::{InstBuilder, MemFlags, Type, Value};
use cranelift_frontend::FunctionBuilder;
use super::{
block_args, emit_expand_packed_coverage_i32x4, emit_extract_channels_simd,
emit_pack_channels_simd,
};
/// Emits the IR for a solid fill: writes `src_solid` to `count` consecutive
/// 32-bit slots starting at `dst`.
///
/// The generated function reads its arguments from the entry block parameters
/// in the order `dst`, `src_solid`, `count`. `ptr_type` is the integer type
/// used for the address cursor, the loop counters and their constants.
///
/// Generated control flow:
/// * `simd_loop` runs `count >> 2` times, storing a splat of `src_solid`
///   (one `I32X4`, 16 bytes) per iteration;
/// * `scalar_loop` runs `count & 3` times, storing `src_solid` itself
///   (4 bytes) per iteration;
/// * either loop is skipped entirely when its trip count is zero.
///
/// The builder is consumed: all blocks are sealed and the function is
/// finalized before returning.
pub(super) fn build_src_copy(mut bcx: FunctionBuilder, ptr_type: Type) {
    let entry = bcx.create_block();
    let simd_loop = bcx.create_block();
    let scalar_check = bcx.create_block();
    let scalar_loop = bcx.create_block();
    let exit = bcx.create_block();

    // entry: split `count` into whole quads and leftover pixels.
    bcx.switch_to_block(entry);
    bcx.append_block_params_for_function_params(entry);
    let (dst, src_solid, count) = {
        let params = bcx.block_params(entry);
        (params[0], params[1], params[2])
    };
    let quad_total = bcx.ins().ushr_imm(count, 2);
    let leftover = bcx.ins().band_imm(count, 3);
    let zero = bcx.ins().iconst(ptr_type, 0);
    let fill_vec = bcx.ins().splat(types::I32X4, src_solid);
    let any_quads = bcx.ins().icmp(IntCC::NotEqual, quad_total, zero);
    bcx.ins().brif(
        any_quads,
        simd_loop,
        &block_args(&[dst, zero]),
        scalar_check,
        &block_args(&[dst]),
    );

    // simd_loop(cursor, index): one 16-byte store per iteration.
    let quad_dst = bcx.append_block_param(simd_loop, ptr_type);
    let quad_idx = bcx.append_block_param(simd_loop, ptr_type);
    bcx.switch_to_block(simd_loop);
    bcx.ins().store(MemFlags::new(), fill_vec, quad_dst, 0);
    let quad_stride = bcx.ins().iconst(ptr_type, 16);
    let quad_dst_next = bcx.ins().iadd(quad_dst, quad_stride);
    let quad_step = bcx.ins().iconst(ptr_type, 1);
    let quad_idx_next = bcx.ins().iadd(quad_idx, quad_step);
    let more_quads = bcx
        .ins()
        .icmp(IntCC::UnsignedLessThan, quad_idx_next, quad_total);
    bcx.ins().brif(
        more_quads,
        simd_loop,
        &block_args(&[quad_dst_next, quad_idx_next]),
        scalar_check,
        &block_args(&[quad_dst_next]),
    );

    // scalar_check(cursor): decide whether any leftover pixels remain.
    let tail_start = bcx.append_block_param(scalar_check, ptr_type);
    bcx.switch_to_block(scalar_check);
    let any_leftover = bcx.ins().icmp(IntCC::NotEqual, leftover, zero);
    bcx.ins().brif(
        any_leftover,
        scalar_loop,
        &block_args(&[tail_start, zero]),
        exit,
        &[],
    );

    // scalar_loop(cursor, index): one 4-byte store per iteration.
    let px_dst = bcx.append_block_param(scalar_loop, ptr_type);
    let px_idx = bcx.append_block_param(scalar_loop, ptr_type);
    bcx.switch_to_block(scalar_loop);
    bcx.ins().store(MemFlags::new(), src_solid, px_dst, 0);
    let px_stride = bcx.ins().iconst(ptr_type, 4);
    let px_dst_next = bcx.ins().iadd(px_dst, px_stride);
    let px_step = bcx.ins().iconst(ptr_type, 1);
    let px_idx_next = bcx.ins().iadd(px_idx, px_step);
    let more_px = bcx
        .ins()
        .icmp(IntCC::UnsignedLessThan, px_idx_next, leftover);
    bcx.ins().brif(
        more_px,
        scalar_loop,
        &block_args(&[px_dst_next, px_idx_next]),
        exit,
        &[],
    );

    bcx.switch_to_block(exit);
    bcx.ins().return_(&[]);

    bcx.seal_all_blocks();
    bcx.finalize();
}
pub(super) fn build_src_copy_cov(mut bcx: FunctionBuilder, ptr_type: Type) {
let entry = bcx.create_block();
let simd_loop = bcx.create_block();
let simd_fast = bcx.create_block();
let simd_slow = bcx.create_block();
let simd_next = bcx.create_block();
let scalar_check = bcx.create_block();
let scalar_loop = bcx.create_block();
let exit = bcx.create_block();
bcx.switch_to_block(entry);
bcx.append_block_params_for_function_params(entry);
let dst = bcx.block_params(entry)[0];
let src_solid = bcx.block_params(entry)[1];
let count = bcx.block_params(entry)[2];
let coverage = bcx.block_params(entry)[3];
let src_a = bcx.ins().ushr_imm(src_solid, 24);
let src_a = bcx.ins().band_imm(src_a, 0xFF);
let src_r = bcx.ins().ushr_imm(src_solid, 16);
let src_r = bcx.ins().band_imm(src_r, 0xFF);
let src_g = bcx.ins().ushr_imm(src_solid, 8);
let src_g = bcx.ins().band_imm(src_g, 0xFF);
let src_b = bcx.ins().band_imm(src_solid, 0xFF);
let src_a_vec = bcx.ins().splat(types::I32X4, src_a);
let src_r_vec = bcx.ins().splat(types::I32X4, src_r);
let src_g_vec = bcx.ins().splat(types::I32X4, src_g);
let src_b_vec = bcx.ins().splat(types::I32X4, src_b);
let c257_scalar = bcx.ins().iconst(types::I32, 257);
let c257_vec = bcx.ins().splat(types::I32X4, c257_scalar);
let src_vec = bcx.ins().splat(types::I32X4, src_solid);
let all_ff = bcx.ins().iconst(types::I32, -1);
let simd_count = bcx.ins().ushr_imm(count, 2);
let remainder = bcx.ins().band_imm(count, 3);
let zero = bcx.ins().iconst(ptr_type, 0);
let has_simd = bcx.ins().icmp(IntCC::NotEqual, simd_count, zero);
let args_simd = block_args(&[dst, coverage, zero]);
let args_scalar = block_args(&[dst, coverage]);
bcx.ins()
.brif(has_simd, simd_loop, &args_simd, scalar_check, &args_scalar);
bcx.append_block_param(simd_loop, ptr_type);
bcx.append_block_param(simd_loop, ptr_type);
bcx.append_block_param(simd_loop, ptr_type);
bcx.switch_to_block(simd_loop);
let current_dst = bcx.block_params(simd_loop)[0];
let current_cov = bcx.block_params(simd_loop)[1];
let simd_i = bcx.block_params(simd_loop)[2];
let packed_cov = bcx.ins().load(types::I32, MemFlags::new(), current_cov, 0);
let is_all_ff = bcx.ins().icmp(IntCC::Equal, packed_cov, all_ff);
bcx.ins().brif(is_all_ff, simd_fast, &[], simd_slow, &[]);
bcx.switch_to_block(simd_fast);
bcx.ins().store(MemFlags::new(), src_vec, current_dst, 0);
bcx.ins().jump(simd_next, &[]);
bcx.switch_to_block(simd_slow);
let cov_vec = emit_expand_packed_coverage_i32x4(&mut bcx, packed_cov);
let ca = bcx.ins().imul(src_a_vec, cov_vec);
let ca = bcx.ins().imul(ca, c257_vec);
let ca = bcx.ins().iadd(ca, c257_vec);
let out_a = bcx.ins().ushr_imm(ca, 16);
let cr = bcx.ins().imul(src_r_vec, cov_vec);
let cr = bcx.ins().imul(cr, c257_vec);
let cr = bcx.ins().iadd(cr, c257_vec);
let out_r = bcx.ins().ushr_imm(cr, 16);
let cg = bcx.ins().imul(src_g_vec, cov_vec);
let cg = bcx.ins().imul(cg, c257_vec);
let cg = bcx.ins().iadd(cg, c257_vec);
let out_g = bcx.ins().ushr_imm(cg, 16);
let cb = bcx.ins().imul(src_b_vec, cov_vec);
let cb = bcx.ins().imul(cb, c257_vec);
let cb = bcx.ins().iadd(cb, c257_vec);
let out_b = bcx.ins().ushr_imm(cb, 16);
let result = emit_pack_channels_simd(&mut bcx, out_a, out_r, out_g, out_b);
bcx.ins().store(MemFlags::new(), result, current_dst, 0);
bcx.ins().jump(simd_next, &[]);
bcx.switch_to_block(simd_next);
let sixteen = bcx.ins().iconst(ptr_type, 16);
let next_dst = bcx.ins().iadd(current_dst, sixteen);
let four_ptr = bcx.ins().iconst(ptr_type, 4);
let next_cov = bcx.ins().iadd(current_cov, four_ptr);
let one = bcx.ins().iconst(ptr_type, 1);
let next_si = bcx.ins().iadd(simd_i, one);
let cont = bcx.ins().icmp(IntCC::UnsignedLessThan, next_si, simd_count);
let args_loop = block_args(&[next_dst, next_cov, next_si]);
let args_check = block_args(&[next_dst, next_cov]);
bcx.ins()
.brif(cont, simd_loop, &args_loop, scalar_check, &args_check);
bcx.append_block_param(scalar_check, ptr_type);
bcx.append_block_param(scalar_check, ptr_type);
bcx.switch_to_block(scalar_check);
let current_dst = bcx.block_params(scalar_check)[0];
let current_cov = bcx.block_params(scalar_check)[1];
let has_remainder = bcx.ins().icmp(IntCC::NotEqual, remainder, zero);
let args_scalar = block_args(&[current_dst, current_cov, zero]);
bcx.ins()
.brif(has_remainder, scalar_loop, &args_scalar, exit, &[]);
bcx.append_block_param(scalar_loop, ptr_type);
bcx.append_block_param(scalar_loop, ptr_type);
bcx.append_block_param(scalar_loop, ptr_type);
bcx.switch_to_block(scalar_loop);
let current_dst = bcx.block_params(scalar_loop)[0];
let current_cov = bcx.block_params(scalar_loop)[1];
let scalar_i = bcx.block_params(scalar_loop)[2];
let cov_u8 = bcx.ins().load(types::I8, MemFlags::new(), current_cov, 0);
let cov = bcx.ins().uextend(types::I32, cov_u8);
let ca = bcx.ins().imul(src_a, cov);
let ca = bcx.ins().imul(ca, c257_scalar);
let ca = bcx.ins().iadd(ca, c257_scalar);
let out_a = bcx.ins().ushr_imm(ca, 16);
let cr = bcx.ins().imul(src_r, cov);
let cr = bcx.ins().imul(cr, c257_scalar);
let cr = bcx.ins().iadd(cr, c257_scalar);
let out_r = bcx.ins().ushr_imm(cr, 16);
let cg = bcx.ins().imul(src_g, cov);
let cg = bcx.ins().imul(cg, c257_scalar);
let cg = bcx.ins().iadd(cg, c257_scalar);
let out_g = bcx.ins().ushr_imm(cg, 16);
let cb = bcx.ins().imul(src_b, cov);
let cb = bcx.ins().imul(cb, c257_scalar);
let cb = bcx.ins().iadd(cb, c257_scalar);
let out_b = bcx.ins().ushr_imm(cb, 16);
let result = bcx.ins().ishl_imm(out_a, 24);
let tmp = bcx.ins().ishl_imm(out_r, 16);
let result = bcx.ins().bor(result, tmp);
let tmp = bcx.ins().ishl_imm(out_g, 8);
let result = bcx.ins().bor(result, tmp);
let result = bcx.ins().bor(result, out_b);
bcx.ins().store(MemFlags::new(), result, current_dst, 0);
let four = bcx.ins().iconst(ptr_type, 4);
let next_dst = bcx.ins().iadd(current_dst, four);
let one_ptr = bcx.ins().iconst(ptr_type, 1);
let next_cov = bcx.ins().iadd(current_cov, one_ptr);
let next_si = bcx.ins().iadd(scalar_i, one_ptr);
let cont = bcx.ins().icmp(IntCC::UnsignedLessThan, next_si, remainder);
let args_loop = block_args(&[next_dst, next_cov, next_si]);
bcx.ins().brif(cont, scalar_loop, &args_loop, exit, &[]);
bcx.switch_to_block(exit);
bcx.ins().return_(&[]);
bcx.seal_all_blocks();
bcx.finalize();
}
pub(super) fn build_src_over_cov(mut bcx: FunctionBuilder, ptr_type: Type) {
let entry = bcx.create_block();
let simd_loop = bcx.create_block();
let simd_fast = bcx.create_block();
let simd_slow = bcx.create_block();
let simd_next = bcx.create_block();
let scalar_check = bcx.create_block();
let scalar_loop = bcx.create_block();
let exit = bcx.create_block();
bcx.switch_to_block(entry);
bcx.append_block_params_for_function_params(entry);
let dst = bcx.block_params(entry)[0];
let src_solid = bcx.block_params(entry)[1];
let count = bcx.block_params(entry)[2];
let coverage = bcx.block_params(entry)[3];
let src_a = bcx.ins().ushr_imm(src_solid, 24);
let src_a = bcx.ins().band_imm(src_a, 0xFF);
let src_r = bcx.ins().ushr_imm(src_solid, 16);
let src_r = bcx.ins().band_imm(src_r, 0xFF);
let src_g = bcx.ins().ushr_imm(src_solid, 8);
let src_g = bcx.ins().band_imm(src_g, 0xFF);
let src_b = bcx.ins().band_imm(src_solid, 0xFF);
let src_a_vec = bcx.ins().splat(types::I32X4, src_a);
let src_r_vec = bcx.ins().splat(types::I32X4, src_r);
let src_g_vec = bcx.ins().splat(types::I32X4, src_g);
let src_b_vec = bcx.ins().splat(types::I32X4, src_b);
let c257_scalar = bcx.ins().iconst(types::I32, 257);
let c257_vec = bcx.ins().splat(types::I32X4, c257_scalar);
let c256_scalar = bcx.ins().iconst(types::I32, 256);
let c256_vec = bcx.ins().splat(types::I32X4, c256_scalar);
let mask_0xff = bcx.ins().iconst(types::I32, 0xFF);
let mask_0xff_vec = bcx.ins().splat(types::I32X4, mask_0xff);
let inv_alpha_src = bcx.ins().isub(c256_scalar, src_a);
let inv_alpha_src_vec = bcx.ins().splat(types::I32X4, inv_alpha_src);
let all_ff = bcx.ins().iconst(types::I32, -1);
let simd_count = bcx.ins().ushr_imm(count, 2);
let remainder = bcx.ins().band_imm(count, 3);
let zero = bcx.ins().iconst(ptr_type, 0);
let has_simd = bcx.ins().icmp(IntCC::NotEqual, simd_count, zero);
let args_simd = block_args(&[dst, coverage, zero]);
let args_scalar = block_args(&[dst, coverage]);
bcx.ins()
.brif(has_simd, simd_loop, &args_simd, scalar_check, &args_scalar);
bcx.append_block_param(simd_loop, ptr_type);
bcx.append_block_param(simd_loop, ptr_type);
bcx.append_block_param(simd_loop, ptr_type);
bcx.switch_to_block(simd_loop);
let current_dst = bcx.block_params(simd_loop)[0];
let current_cov = bcx.block_params(simd_loop)[1];
let simd_i = bcx.block_params(simd_loop)[2];
let packed_cov = bcx.ins().load(types::I32, MemFlags::new(), current_cov, 0);
let is_all_ff = bcx.ins().icmp(IntCC::Equal, packed_cov, all_ff);
bcx.ins().brif(is_all_ff, simd_fast, &[], simd_slow, &[]);
bcx.switch_to_block(simd_fast);
let dst_pixels = bcx
.ins()
.load(types::I32X4, MemFlags::new(), current_dst, 0);
let (dst_a_v, dst_r_v, dst_g_v, dst_b_v) =
emit_extract_channels_simd(&mut bcx, dst_pixels, mask_0xff_vec);
let da = bcx.ins().imul(dst_a_v, inv_alpha_src_vec);
let da = bcx.ins().ushr_imm(da, 8);
let out_a = bcx.ins().iadd(src_a_vec, da);
let dr = bcx.ins().imul(dst_r_v, inv_alpha_src_vec);
let dr = bcx.ins().ushr_imm(dr, 8);
let out_r = bcx.ins().iadd(src_r_vec, dr);
let dg = bcx.ins().imul(dst_g_v, inv_alpha_src_vec);
let dg = bcx.ins().ushr_imm(dg, 8);
let out_g = bcx.ins().iadd(src_g_vec, dg);
let db = bcx.ins().imul(dst_b_v, inv_alpha_src_vec);
let db = bcx.ins().ushr_imm(db, 8);
let out_b = bcx.ins().iadd(src_b_vec, db);
let result = emit_pack_channels_simd(&mut bcx, out_a, out_r, out_g, out_b);
bcx.ins().store(MemFlags::new(), result, current_dst, 0);
bcx.ins().jump(simd_next, &[]);
bcx.switch_to_block(simd_slow);
let cov_vec = emit_expand_packed_coverage_i32x4(&mut bcx, packed_cov);
let ca = bcx.ins().imul(src_a_vec, cov_vec);
let ca = bcx.ins().imul(ca, c257_vec);
let ca = bcx.ins().iadd(ca, c257_vec);
let cov_src_a = bcx.ins().ushr_imm(ca, 16);
let cr = bcx.ins().imul(src_r_vec, cov_vec);
let cr = bcx.ins().imul(cr, c257_vec);
let cr = bcx.ins().iadd(cr, c257_vec);
let cov_src_r = bcx.ins().ushr_imm(cr, 16);
let cg = bcx.ins().imul(src_g_vec, cov_vec);
let cg = bcx.ins().imul(cg, c257_vec);
let cg = bcx.ins().iadd(cg, c257_vec);
let cov_src_g = bcx.ins().ushr_imm(cg, 16);
let cb = bcx.ins().imul(src_b_vec, cov_vec);
let cb = bcx.ins().imul(cb, c257_vec);
let cb = bcx.ins().iadd(cb, c257_vec);
let cov_src_b = bcx.ins().ushr_imm(cb, 16);
let inv_alpha_v = bcx.ins().isub(c256_vec, cov_src_a);
let dst_pixels = bcx
.ins()
.load(types::I32X4, MemFlags::new(), current_dst, 0);
let (dst_a_v, dst_r_v, dst_g_v, dst_b_v) =
emit_extract_channels_simd(&mut bcx, dst_pixels, mask_0xff_vec);
let da = bcx.ins().imul(dst_a_v, inv_alpha_v);
let da = bcx.ins().ushr_imm(da, 8);
let out_a = bcx.ins().iadd(cov_src_a, da);
let dr = bcx.ins().imul(dst_r_v, inv_alpha_v);
let dr = bcx.ins().ushr_imm(dr, 8);
let out_r = bcx.ins().iadd(cov_src_r, dr);
let dg = bcx.ins().imul(dst_g_v, inv_alpha_v);
let dg = bcx.ins().ushr_imm(dg, 8);
let out_g = bcx.ins().iadd(cov_src_g, dg);
let db = bcx.ins().imul(dst_b_v, inv_alpha_v);
let db = bcx.ins().ushr_imm(db, 8);
let out_b = bcx.ins().iadd(cov_src_b, db);
let result = emit_pack_channels_simd(&mut bcx, out_a, out_r, out_g, out_b);
bcx.ins().store(MemFlags::new(), result, current_dst, 0);
bcx.ins().jump(simd_next, &[]);
bcx.switch_to_block(simd_next);
let sixteen = bcx.ins().iconst(ptr_type, 16);
let next_dst = bcx.ins().iadd(current_dst, sixteen);
let four_ptr = bcx.ins().iconst(ptr_type, 4);
let next_cov = bcx.ins().iadd(current_cov, four_ptr);
let one = bcx.ins().iconst(ptr_type, 1);
let next_si = bcx.ins().iadd(simd_i, one);
let cont = bcx.ins().icmp(IntCC::UnsignedLessThan, next_si, simd_count);
let args_loop = block_args(&[next_dst, next_cov, next_si]);
let args_check = block_args(&[next_dst, next_cov]);
bcx.ins()
.brif(cont, simd_loop, &args_loop, scalar_check, &args_check);
bcx.append_block_param(scalar_check, ptr_type);
bcx.append_block_param(scalar_check, ptr_type);
bcx.switch_to_block(scalar_check);
let current_dst = bcx.block_params(scalar_check)[0];
let current_cov = bcx.block_params(scalar_check)[1];
let has_remainder = bcx.ins().icmp(IntCC::NotEqual, remainder, zero);
let args_scalar = block_args(&[current_dst, current_cov, zero]);
bcx.ins()
.brif(has_remainder, scalar_loop, &args_scalar, exit, &[]);
bcx.append_block_param(scalar_loop, ptr_type);
bcx.append_block_param(scalar_loop, ptr_type);
bcx.append_block_param(scalar_loop, ptr_type);
bcx.switch_to_block(scalar_loop);
let current_dst = bcx.block_params(scalar_loop)[0];
let current_cov = bcx.block_params(scalar_loop)[1];
let scalar_i = bcx.block_params(scalar_loop)[2];
let cov_u8 = bcx.ins().load(types::I8, MemFlags::new(), current_cov, 0);
let cov = bcx.ins().uextend(types::I32, cov_u8);
let ca = bcx.ins().imul(src_a, cov);
let ca = bcx.ins().imul(ca, c257_scalar);
let ca = bcx.ins().iadd(ca, c257_scalar);
let cov_src_a = bcx.ins().ushr_imm(ca, 16);
let cr = bcx.ins().imul(src_r, cov);
let cr = bcx.ins().imul(cr, c257_scalar);
let cr = bcx.ins().iadd(cr, c257_scalar);
let cov_src_r = bcx.ins().ushr_imm(cr, 16);
let cg = bcx.ins().imul(src_g, cov);
let cg = bcx.ins().imul(cg, c257_scalar);
let cg = bcx.ins().iadd(cg, c257_scalar);
let cov_src_g = bcx.ins().ushr_imm(cg, 16);
let cb = bcx.ins().imul(src_b, cov);
let cb = bcx.ins().imul(cb, c257_scalar);
let cb = bcx.ins().iadd(cb, c257_scalar);
let cov_src_b = bcx.ins().ushr_imm(cb, 16);
let inv_alpha = bcx.ins().isub(c256_scalar, cov_src_a);
let dst_pixel = bcx.ins().load(types::I32, MemFlags::new(), current_dst, 0);
let dst_a_s = bcx.ins().ushr_imm(dst_pixel, 24);
let dst_a_s = bcx.ins().band_imm(dst_a_s, 0xFF);
let dst_r_s = bcx.ins().ushr_imm(dst_pixel, 16);
let dst_r_s = bcx.ins().band_imm(dst_r_s, 0xFF);
let dst_g_s = bcx.ins().ushr_imm(dst_pixel, 8);
let dst_g_s = bcx.ins().band_imm(dst_g_s, 0xFF);
let dst_b_s = bcx.ins().band_imm(dst_pixel, 0xFF);
let da = bcx.ins().imul(dst_a_s, inv_alpha);
let da = bcx.ins().ushr_imm(da, 8);
let out_a = bcx.ins().iadd(cov_src_a, da);
let dr = bcx.ins().imul(dst_r_s, inv_alpha);
let dr = bcx.ins().ushr_imm(dr, 8);
let out_r = bcx.ins().iadd(cov_src_r, dr);
let dg = bcx.ins().imul(dst_g_s, inv_alpha);
let dg = bcx.ins().ushr_imm(dg, 8);
let out_g = bcx.ins().iadd(cov_src_g, dg);
let db = bcx.ins().imul(dst_b_s, inv_alpha);
let db = bcx.ins().ushr_imm(db, 8);
let out_b = bcx.ins().iadd(cov_src_b, db);
let result = bcx.ins().ishl_imm(out_a, 24);
let tmp = bcx.ins().ishl_imm(out_r, 16);
let result = bcx.ins().bor(result, tmp);
let tmp = bcx.ins().ishl_imm(out_g, 8);
let result = bcx.ins().bor(result, tmp);
let result = bcx.ins().bor(result, out_b);
bcx.ins().store(MemFlags::new(), result, current_dst, 0);
let four = bcx.ins().iconst(ptr_type, 4);
let next_dst = bcx.ins().iadd(current_dst, four);
let one_ptr = bcx.ins().iconst(ptr_type, 1);
let next_cov = bcx.ins().iadd(current_cov, one_ptr);
let next_si = bcx.ins().iadd(scalar_i, one_ptr);
let cont = bcx.ins().icmp(IntCC::UnsignedLessThan, next_si, remainder);
let args_loop = block_args(&[next_dst, next_cov, next_si]);
bcx.ins().brif(cont, scalar_loop, &args_loop, exit, &[]);
bcx.switch_to_block(exit);
bcx.ins().return_(&[]);
bcx.seal_all_blocks();
bcx.finalize();
}
/// Emits the IR for a "source over" blend of a solid color onto the pixels at
/// `dst`, without coverage.
///
/// The generated function reads its arguments from the entry block parameters
/// in the order `dst`, `src_solid`, `count`. `ptr_type` is the integer type
/// used for the address cursor, loop counters and their constants.
///
/// Each 32-bit pixel is processed as two pairs of bytes selected with the
/// mask `0x00FF00FF`: the "ag" pair (`pixel >> 8`) and the "rb" pair
/// (`pixel`). With `inv_alpha = 256 - (src_solid >> 24)`, each pair becomes
/// `src_pair + (((dst_pair * inv_alpha) >> 8) & 0x00FF00FF)`, and the result
/// is `(out_ag << 8) | out_rb`.
///
/// Generated control flow:
/// * `unroll_loop` runs `count >> 4` times, blending four `I32X4` groups
///   (16 pixels, 64 bytes) per iteration;
/// * `tail_loop` runs `(count & 0xF) >> 2` times, one `I32X4` each;
/// * `scalar_loop` runs `count & 3` times, one `I32` pixel each.
///
/// All three paths emit their blend through [`emit_src_over_ag_rb_simd`].
/// That helper's instruction sequence only uses operations that the scalar
/// tail was already applying to `I32` values, so sharing it keeps the scalar
/// and vector math from drifting apart.
///
/// NOTE(review): the pair sums are not clamped or re-masked, so this
/// presumably expects premultiplied colors — confirm against callers.
///
/// The builder is consumed: all blocks are sealed and the function is
/// finalized before returning.
pub(super) fn build_src_over(mut bcx: FunctionBuilder, ptr_type: Type) {
    // Byte offsets of the four I32X4 groups handled per unrolled iteration.
    const UNROLL_OFFSETS: [i32; 4] = [0, 16, 32, 48];

    let entry = bcx.create_block();
    let unroll_loop = bcx.create_block();
    let tail_check = bcx.create_block();
    let tail_loop = bcx.create_block();
    let scalar_check = bcx.create_block();
    let scalar_loop = bcx.create_block();
    let exit = bcx.create_block();

    // entry: split the source into byte pairs and precompute trip counts.
    bcx.switch_to_block(entry);
    bcx.append_block_params_for_function_params(entry);
    let dst = bcx.block_params(entry)[0];
    let src_solid = bcx.block_params(entry)[1];
    let count = bcx.block_params(entry)[2];

    let mask_00ff00ff = bcx.ins().iconst(types::I32, 0x00FF_00FF_i64);
    let src_ag = bcx.ins().ushr_imm(src_solid, 8);
    let src_ag = bcx.ins().band(src_ag, mask_00ff00ff);
    let src_rb = bcx.ins().band(src_solid, mask_00ff00ff);
    let src_a = bcx.ins().ushr_imm(src_solid, 24);
    let src_a = bcx.ins().band_imm(src_a, 0xFF);
    let c256 = bcx.ins().iconst(types::I32, 256);
    let inv_alpha = bcx.ins().isub(c256, src_a);
    let src_ag_vec = bcx.ins().splat(types::I32X4, src_ag);
    let src_rb_vec = bcx.ins().splat(types::I32X4, src_rb);
    let inv_alpha_vec = bcx.ins().splat(types::I32X4, inv_alpha);
    let mask_vec = bcx.ins().splat(types::I32X4, mask_00ff00ff);
    let count16 = bcx.ins().ushr_imm(count, 4);
    let tail_quads = bcx.ins().band_imm(count, 0xF);
    let tail_quads = bcx.ins().ushr_imm(tail_quads, 2);
    let remainder = bcx.ins().band_imm(count, 3);
    let zero = bcx.ins().iconst(ptr_type, 0);
    let has_unroll = bcx.ins().icmp(IntCC::NotEqual, count16, zero);
    let to_unroll = block_args(&[dst, zero]);
    let skip_unroll = block_args(&[dst]);
    bcx.ins()
        .brif(has_unroll, unroll_loop, &to_unroll, tail_check, &skip_unroll);

    // unroll_loop(dst, index): 16 pixels per iteration.
    let block_dst = bcx.append_block_param(unroll_loop, ptr_type);
    let block_idx = bcx.append_block_param(unroll_loop, ptr_type);
    bcx.switch_to_block(unroll_loop);
    for offset in UNROLL_OFFSETS {
        let quad = bcx
            .ins()
            .load(types::I32X4, MemFlags::new(), block_dst, offset);
        let blended = emit_src_over_ag_rb_simd(
            &mut bcx,
            quad,
            src_ag_vec,
            src_rb_vec,
            inv_alpha_vec,
            mask_vec,
        );
        bcx.ins().store(MemFlags::new(), blended, block_dst, offset);
    }
    let sixty_four = bcx.ins().iconst(ptr_type, 64);
    let block_dst_next = bcx.ins().iadd(block_dst, sixty_four);
    let one = bcx.ins().iconst(ptr_type, 1);
    let block_idx_next = bcx.ins().iadd(block_idx, one);
    let more_blocks = bcx
        .ins()
        .icmp(IntCC::UnsignedLessThan, block_idx_next, count16);
    let again = block_args(&[block_dst_next, block_idx_next]);
    let done = block_args(&[block_dst_next]);
    bcx.ins()
        .brif(more_blocks, unroll_loop, &again, tail_check, &done);

    // tail_check(dst): are there whole quads left after the unrolled loop?
    let tail_start = bcx.append_block_param(tail_check, ptr_type);
    bcx.switch_to_block(tail_check);
    let has_tail = bcx.ins().icmp(IntCC::NotEqual, tail_quads, zero);
    let to_tail = block_args(&[tail_start, zero]);
    let skip_tail = block_args(&[tail_start]);
    bcx.ins()
        .brif(has_tail, tail_loop, &to_tail, scalar_check, &skip_tail);

    // tail_loop(dst, index): 4 pixels per iteration.
    let quad_dst = bcx.append_block_param(tail_loop, ptr_type);
    let quad_idx = bcx.append_block_param(tail_loop, ptr_type);
    bcx.switch_to_block(tail_loop);
    let quad = bcx
        .ins()
        .load(types::I32X4, MemFlags::new(), quad_dst, 0);
    let blended = emit_src_over_ag_rb_simd(
        &mut bcx,
        quad,
        src_ag_vec,
        src_rb_vec,
        inv_alpha_vec,
        mask_vec,
    );
    bcx.ins().store(MemFlags::new(), blended, quad_dst, 0);
    let sixteen = bcx.ins().iconst(ptr_type, 16);
    let quad_dst_next = bcx.ins().iadd(quad_dst, sixteen);
    let one = bcx.ins().iconst(ptr_type, 1);
    let quad_idx_next = bcx.ins().iadd(quad_idx, one);
    let more_quads = bcx
        .ins()
        .icmp(IntCC::UnsignedLessThan, quad_idx_next, tail_quads);
    let again = block_args(&[quad_dst_next, quad_idx_next]);
    let done = block_args(&[quad_dst_next]);
    bcx.ins()
        .brif(more_quads, tail_loop, &again, scalar_check, &done);

    // scalar_check(dst): are there single pixels left?
    let scalar_start = bcx.append_block_param(scalar_check, ptr_type);
    bcx.switch_to_block(scalar_check);
    let has_remainder = bcx.ins().icmp(IntCC::NotEqual, remainder, zero);
    let to_scalar = block_args(&[scalar_start, zero]);
    bcx.ins()
        .brif(has_remainder, scalar_loop, &to_scalar, exit, &[]);

    // scalar_loop(dst, index): one pixel per iteration.
    let px_dst = bcx.append_block_param(scalar_loop, ptr_type);
    let px_idx = bcx.append_block_param(scalar_loop, ptr_type);
    bcx.switch_to_block(scalar_loop);
    let dst_pixel = bcx.ins().load(types::I32, MemFlags::new(), px_dst, 0);
    // Same blend as the vector paths, applied to scalar I32 operands.
    let px_result = emit_src_over_ag_rb_simd(
        &mut bcx,
        dst_pixel,
        src_ag,
        src_rb,
        inv_alpha,
        mask_00ff00ff,
    );
    bcx.ins().store(MemFlags::new(), px_result, px_dst, 0);
    let four = bcx.ins().iconst(ptr_type, 4);
    let px_dst_next = bcx.ins().iadd(px_dst, four);
    let one = bcx.ins().iconst(ptr_type, 1);
    let px_idx_next = bcx.ins().iadd(px_idx, one);
    let more_px = bcx
        .ins()
        .icmp(IntCC::UnsignedLessThan, px_idx_next, remainder);
    let again = block_args(&[px_dst_next, px_idx_next]);
    bcx.ins().brif(more_px, scalar_loop, &again, exit, &[]);

    bcx.switch_to_block(exit);
    bcx.ins().return_(&[]);

    bcx.seal_all_blocks();
    bcx.finalize();
}
/// Emits the "source over" blend for pixels handled as two masked halves.
///
/// `dst_pixels` is split into `(dst_pixels >> 8) & mask_vec` (the "ag" half)
/// and `dst_pixels & mask_vec` (the "rb" half). Each half is turned into
/// `src_half + (((dst_half * inv_alpha_vec) >> 8) & mask_vec)`, and the
/// returned value is `(out_ag << 8) | out_rb`.
///
/// All operands must share one value type; `build_src_over` in this file
/// passes `I32X4` splats with a mask built from `0x00FF00FF`.
///
/// The instructions are appended at the builder's current position.
pub(super) fn emit_src_over_ag_rb_simd(
    bcx: &mut FunctionBuilder,
    dst_pixels: Value,
    src_ag_vec: Value,
    src_rb_vec: Value,
    inv_alpha_vec: Value,
    mask_vec: Value,
) -> Value {
    /// Emits `src_half + (((dst_half * inv_alpha) >> 8) & mask)`.
    fn blend_half(
        bcx: &mut FunctionBuilder,
        dst_half: Value,
        src_half: Value,
        inv_alpha: Value,
        mask: Value,
    ) -> Value {
        let weighted = bcx.ins().imul(dst_half, inv_alpha);
        let weighted = bcx.ins().ushr_imm(weighted, 8);
        let weighted = bcx.ins().band(weighted, mask);
        bcx.ins().iadd(src_half, weighted)
    }

    // Separate the destination into its two interleaved byte pairs.
    let upper_bytes = bcx.ins().ushr_imm(dst_pixels, 8);
    let ag_half = bcx.ins().band(upper_bytes, mask_vec);
    let rb_half = bcx.ins().band(dst_pixels, mask_vec);

    let blended_ag = blend_half(bcx, ag_half, src_ag_vec, inv_alpha_vec, mask_vec);
    let blended_rb = blend_half(bcx, rb_half, src_rb_vec, inv_alpha_vec, mask_vec);

    // Move the "ag" half back to the odd byte positions and merge.
    let ag_in_place = bcx.ins().ishl_imm(blended_ag, 8);
    bcx.ins().bor(ag_in_place, blended_rb)
}