basic_dsp_vector 0.5.6

Digital signal processing based on real or complex vectors in time or frequency domain. Vectors come with basic arithmetic, convolution, Fourier transformation and interpolation operations. The vectors are optimized for sizes of a couple of thousand elements or more.
Documentation
pub const CONV_KERNEL: &str = r#"
    #define mulc32(a,b) ((float4)((float)mad(-(a).y, (b).y, (a).x * (b).x), (float)mad((a).y, (b).x, (a).x * (b).y), (float)mad(-(a).z, (b).z, (a).w * (b).w), (float)mad((a).z, (b).w, (a).w * (b).z)))

    __kernel
    void conv_vecs_r(
                __global float4 const* const src,
                __global float4 const* const conv,
                __global float4* const res)
    {
        ulong const idx = get_global_id(0);

        float4 sum1 = (float4)0.0;
        float4 sum2 = (float4)0.0;
        float4 sum3 = (float4)0.0;
        float4 sum4 = (float4)0.0;
        long start = idx - FILTER_LENGTH / 2;
        long end = start + FILTER_LENGTH;
        long j = 0;
        for (long i = start; i < end; i++) {
            float4 current = src[i];
            sum1 += current * conv[j];
            j++;
            sum2 += current * conv[j];
            j++;
            sum3 += current * conv[j];
            j++;
            sum4 += current * conv[j];
            j++;
        }

        res[idx] = (float4)
                   (sum1.x+sum1.y+sum1.z+sum1.w,
                    sum2.x+sum2.y+sum2.z+sum2.w,
                    sum3.x+sum3.y+sum3.z+sum3.w,
                    sum4.x+sum4.y+sum4.z+sum4.w);
    }

    __kernel
    void conv_vecs_c(
                __global float4 const* const src,
                __global float4 const* const conv,
                __global float4* const res)
    {
        ulong const idx = get_global_id(0);

        float4 sum1 = (float4)0.0;
        float4 sum2 = (float4)0.0;
        long start = idx - FILTER_LENGTH / 2;
        long end = start + FILTER_LENGTH;
        long j = 0;
        for (long i = start; i < end; i++) {
            float4 current = src[i];
            sum1 += mulc32(current, conv[j]);
            j++;
            sum2 += mulc32(current, conv[j]);
            j++;
        }

        // float4 order is: xyzw
        res[idx] = (float4)
                   (sum1.x+sum1.z,
                    sum1.y+sum1.w,
                    sum2.x+sum2.z,
                    sum2.y+sum2.w);
    }
"#;

pub static MUL_KERNEL: &'static str = r#"
    #define mulc32(a,b) ((float2)((float)mad(-(a).y, (b).y, (a).x * (b).x), (float)mad((a).y, (b).x, (a).x * (b).y)))

    __kernel void multiply_vector(
                __global float2 const* const coeff,
                __global float2* const srcres)
    {
        uint const idx = get_global_id(0);
        srcres[idx] = mulc32(srcres[idx], coeff[idx]);
    }
"#;