ispc-downsampler 0.4.0

#include "weights.ispc"
#include "formats.ispc"
#include "image.ispc"
#include "math.ispc"


// Loads three consecutive 8-bit channels starting at `pixel_ptr` and returns them
// as floats scaled by 1/255.
inline float<3> sample_3_channels(const uniform uint8* varying pixel_ptr) {
    // View the bytes as a single 3-wide vector so all channels are fetched with one
    // dereference; per the original author's measurements this beat per-channel reads.
    const uniform uint8<3>* rgb = (const uniform uint8<3>*)(pixel_ptr);
    varying float<3> normalized = *rgb / 255.0f;

    return normalized;
}

// Loads four consecutive 8-bit channels starting at `pixel_ptr` and returns them
// as floats scaled by 1/255.
inline float<4> sample_4_channels(const uniform uint8* varying pixel_ptr) {
    // View the bytes as a single 4-wide vector so all channels are fetched with one dereference.
    const uniform uint8<4>* rgba = (const uniform uint8<4>*)(pixel_ptr);
    varying float<4> normalized = *rgba / 255.0f;

    return normalized;
}

// Decodes the normal-map texel at `pixel_ptr` into a float<3> whose stored components are
// remapped from [0, 1] to [-1, 1].
//   R8g8b8:                         x, y and z are all read from memory.
//   R8g8TangentSpaceReconstructedZ: only x and y are read; z is rebuilt from them.
// Any other format value leaves the result at {0, 0, 0}.
inline float<3> sample_normal(const uniform uint8* varying pixel_ptr, uniform NormalMapFormat normal_map_format) {
    float<3> decoded = {0.0f, 0.0f, 0.0f};

    if (normal_map_format == R8g8b8) {
        // Fetch the three stored channels through one vector-typed dereference.
        const uniform uint8<3>* texel3 = (const uniform uint8<3>*)(pixel_ptr);

        decoded = *texel3 / 255.0f;
        decoded = decoded * 2.0f - 1.0f;
    } else if (normal_map_format == R8g8TangentSpaceReconstructedZ) {
        // Only two channels are stored for this format.
        const uniform uint8<2>* texel2 = (const uniform uint8<2>*)(pixel_ptr);
        decoded.x = texel2->x / 255.0f;
        decoded.y = texel2->y / 255.0f;
        decoded = decoded * 2.0f - 1.0f;

        // z = sqrt(1 - (x^2 + y^2)), with the radicand kept at or above 0.01 so the
        // square root never sees a negative value.
        const float xy_length_sq = decoded.x * decoded.x + decoded.y * decoded.y;
        decoded.z = sqrt(max(0.01, 1.0 - xy_length_sq));
    }

    return decoded;
}

// Clamps each of the three channels of `color` to [0, 1], scales it to the 0-255 range and
// stores it as one byte at pixel_ptr[0..2].
inline void clean_and_write_3_channels(varying float<3> color, uniform uint8* varying pixel_ptr) {
    // `color` is a weighted sum of many samples, so floating point error can push it
    // marginally outside [0, 1]. Without the clamp the conversion to uint8 below would
    // underflow/overflow.
    for (uniform int channel = 0; channel < 3; channel++) {
        const float scaled = clamp(color[channel], 0.0f, 1.0f) * 255.0f;
        pixel_ptr[channel] = scaled;
    }
}

// Clamps each of the four channels of `color` to [0, 1], scales it to the 0-255 range and
// stores it as one byte at pixel_ptr[0..3].
inline void clean_and_write_4_channels(varying float<4> color, uniform uint8* varying pixel_ptr) {
    // The clamp keeps slightly out-of-range accumulated values from wrapping when
    // they are converted to uint8.
    for (uniform int channel = 0; channel < 4; channel++) {
        const float scaled = clamp(color[channel], 0.0f, 1.0f) * 255.0f;
        pixel_ptr[channel] = scaled;
    }
}

// Renormalizes `normal`, remaps it with `* 0.5 + 0.5`, and stores it as 8-bit channels
// at `pixel_ptr`.
//   R8g8b8:                         writes x, y, z to pixel_ptr[0..2].
//   R8g8TangentSpaceReconstructedZ: writes only x, y to pixel_ptr[0..1].
// Any other format value writes nothing.
inline void clean_and_write_normal(varying float<3> normal, uniform uint8* varying pixel_ptr, uniform NormalMapFormat normal_map_format) {
    varying float<3> encoded = normalize(normal);
    encoded = encoded * 0.5 + 0.5;

    if (normal_map_format == R8g8b8) {
        // Clamp before scaling so the byte conversion cannot wrap.
        const float x = clamp(encoded[0], 0.0f, 1.0f) * 255.0f;
        const float y = clamp(encoded[1], 0.0f, 1.0f) * 255.0f;
        const float z = clamp(encoded[2], 0.0f, 1.0f) * 255.0f;

        pixel_ptr[0] = (uint8)x;
        pixel_ptr[1] = (uint8)y;
        pixel_ptr[2] = (uint8)z;
    } else if (normal_map_format == R8g8TangentSpaceReconstructedZ) {
        // z is not stored for this format.
        const float x = clamp(encoded[0], 0.0f, 1.0f) * 255.0f;
        const float y = clamp(encoded[1], 0.0f, 1.0f) * 255.0f;

        pixel_ptr[0] = x;
        pixel_ptr[1] = y;
    }
}

// Read-only description of the image being downsampled.
// The kernels in this file locate pixel (x, y) at byte offset (y * width + x) * pixel_stride in `data`.
struct SourceImage {
    // Image width in pixels.
    uniform uint32 width;
    // Image height in pixels.
    uniform uint32 height;
    // Pointer to the first byte of pixel data; never written through.
    uniform const uint8* data;
    // Distance in bytes between the starts of two consecutive pixels.
    uniform uint pixel_stride;
};

// Description of the output image the kernels write into.
// The kernels in this file locate pixel (x, y) at byte offset (y * width + x) * pixel_stride in `data`.
struct DownsampledImage {
    // Output width in pixels.
    uniform uint32 width;
    // Output height in pixels.
    uniform uint32 height;
    // Pointer to the first byte of the output buffer.
    uniform uint8* data;
    // Distance in bytes between the starts of two consecutive output pixels.
    uniform uint pixel_stride;
};

// State shared by the cached-weight resampling kernels.
struct DownsamplingContext {
    // Precomputed filter weights; resample_with_cached_weights reads its
    // `horizontal_weights` and `vertical_weights` collections.
    uniform const SampleWeights weights;
    // Intermediate buffer holding the horizontally filtered image between the two passes
    // of resample_with_cached_weights (src->height x dst->width pixels, tightly packed).
    uniform uint8* scratch_space;
};

// Resamples `src` into `dst` in two separable passes using the filter weights cached in `ctx`.
//
// Pass 1 filters every source row horizontally and stores the result, quantized back to 8 bits,
// in ctx->scratch_space as a tightly packed src->height x dst->width image
// (num_channels bytes per pixel).
// Pass 2 filters that scratch image vertically and writes the final pixels to dst->data.
//
// Only pixel formats with 3 or 4 channels are handled; for any other channel count nothing is
// sampled or written.
// ctx->scratch_space must hold at least src->height * dst->width * channel_count(pixel_format) bytes.
//
// `src` is only read, so it is taken as a pointer to const; callers holding a non-const
// pointer can still pass it.
inline void resample_with_cached_weights(const uniform SourceImage * uniform src, uniform DownsampledImage * uniform dst, uniform PixelFormat pixel_format, uniform DownsamplingContext * uniform ctx) {
    uniform WeightCollection * uniform vertical_weight_collection = ctx->weights.vertical_weights;
    uniform WeightCollection * uniform horizontal_weight_collection = ctx->weights.horizontal_weights;
    const uniform uint num_channels = channel_count(pixel_format);

    // Accumulate only along the width for each pixel, sampling from the source image.
    // Results in the source image being downsampled to src->height x dst->width.
    foreach_tiled(y =  0 ... src->height, x = 0 ... dst->width) {

        uint32 src_width_start = horizontal_weight_collection->starts[x];
        uint32 num_horizontal_weights = horizontal_weight_collection->weight_counts[x];
        float* horizontal_weights = horizontal_weight_collection->values[x];

        float<3> color3 = {0.0f, 0.0f, 0.0f};
        float<4> color4 = {0.0f, 0.0f, 0.0f, 0.0f};
        for (uint32 i = 0; i < num_horizontal_weights; i++) {
            float weight = horizontal_weights[i];
            uint32 src_x = src_width_start + i;
            // Promote to 64 bits before multiplying: a 32-bit product wraps once the
            // byte offset reaches 4 GiB.
            uint64 src_read_address = ((uint64)y * src->width + src_x) * src->pixel_stride;

            if (num_channels == 3)
                color3 += sample_3_channels(src->data + src_read_address) * weight;
            else if (num_channels == 4)
                color4 += sample_4_channels(src->data + src_read_address) * weight;
        }

        // The scratch image is tightly packed, so it is indexed with num_channels
        // rather than with either image's pixel_stride.
        uint64 scratch_write_address = ((uint64)y * dst->width + x) * num_channels;

        if (num_channels == 3)
            clean_and_write_3_channels(color3, ctx->scratch_space + scratch_write_address);
        else if (num_channels == 4)
            clean_and_write_4_channels(color4, ctx->scratch_space + scratch_write_address);
    }

    // Accumulate the scratch space data along the height.
    // Downsamples the src->height x dst->width image to dst->height x dst->width.
    foreach_tiled(y =  0 ... dst->height, x = 0 ... dst->width) {

        uint32 src_height_start = vertical_weight_collection->starts[y];
        uint32 num_vertical_weights = vertical_weight_collection->weight_counts[y];
        float* vertical_weights = vertical_weight_collection->values[y];

        float<3> color3 = {0.0f, 0.0f, 0.0f};
        float<4> color4 = {0.0f, 0.0f, 0.0f, 0.0f};
        for (uint32 i = 0; i < num_vertical_weights; i++) {
            float weight = vertical_weights[i];
            uint32 scratch_y = src_height_start + i;
            uint64 scratch_read_address = ((uint64)scratch_y * dst->width + x) * num_channels;

            if (num_channels == 3)
                color3 += sample_3_channels(ctx->scratch_space + scratch_read_address) * weight;
            else if (num_channels == 4)
                color4 += sample_4_channels(ctx->scratch_space + scratch_read_address) * weight;
        }

        uint64 out_write_address = ((uint64)y * dst->width + x) * dst->pixel_stride;
        // The bound is computed in 64 bits as well so that it cannot wrap either.
        assert(out_write_address < (uint64)dst->height * dst->width * dst->pixel_stride);
        if (num_channels == 3)
            clean_and_write_3_channels(color3, dst->data + out_write_address);
        else if (num_channels == 4)
            clean_and_write_4_channels(color4, dst->data + out_write_address);
    }
}

/// Exported entry point for resampling images whose pixel format has 3 channels.
/// Forwards to resample_with_cached_weights after telling the compiler that
/// channel_count(pixel_format) is 3.
///
/// ctx->scratch_space must be at least src->height * dst->width pixels big
/// (3 bytes per pixel).
export void resample_with_cached_weights_3(const uniform SourceImage * uniform src, uniform DownsampledImage * uniform dst, uniform PixelFormat pixel_format, uniform DownsamplingContext * uniform ctx) {
    assume(channel_count(pixel_format) == 3);
    resample_with_cached_weights(src, dst, pixel_format, ctx);
}

/// Exported entry point for resampling images whose pixel format has 4 channels.
/// Forwards to resample_with_cached_weights after telling the compiler that
/// channel_count(pixel_format) is 4.
///
/// ctx->scratch_space must be at least src->height * dst->width pixels big
/// (4 bytes per pixel).
export void resample_with_cached_weights_4(const uniform SourceImage * uniform src, uniform DownsampledImage * uniform dst, uniform PixelFormat pixel_format, uniform DownsamplingContext * uniform ctx) {
    // Hint only: the caller is responsible for passing a 4-channel format.
    assume(channel_count(pixel_format) == 4);
    resample_with_cached_weights(src, dst, pixel_format, ctx);
}

/// Downsamples a normal map from `src` into `dst`.
///
/// For every destination pixel, all source normals inside a rectangular window around the
/// corresponding source position are decoded, summed with equal weight and divided by the
/// window's pixel count. The result is renormalized and encoded by clean_and_write_normal
/// in the same `normal_map_format` that was used for reading.
///
/// This function writes directly to dst->data and does not use any scratch buffer.
export void downsample_normal_map(const uniform SourceImage * uniform src, uniform DownsampledImage * uniform dst, uniform NormalMapFormat normal_map_format) {
    // Source pixels per destination pixel along each axis; identical for every lane.
    uniform float<2> ratio = { (float)src->width / (float)dst->width, (float)src->height / (float)dst->height };
    foreach_tiled(y =  0 ... dst->height, x = 0 ... dst->width) {

        uint<2> start = { 0, 0 };
        uint<2> end = { 0, 0 };
        float pixel_weight = 0.0;

        {
            // Window bounds in source pixels, clamped to the source image.
            start.x = max((int)floor(x * ratio.x - floor(ratio.x / 2.0f) + 0.5f), 0);
            start.y = max((int)floor(y * ratio.y - floor(ratio.y / 2.0f) + 0.5f), 0);

            end.x = min((int)floor(x * ratio.x + ceil(ratio.x / 2.0f) + 0.5f) + 1, src->width);
            end.y = min((int)floor(y * ratio.y + ceil(ratio.y / 2.0f) + 0.5f) + 1, src->height);

            // Number of source pixels covered by the window.
            pixel_weight = (float)((end.x - start.x) * (end.y - start.y));
        }

        float<3> normal = {0.0f, 0.0f, 0.0f};
        for (uint32 i = start.x; i < end.x; i++) {
            for (uint32 j = start.y; j < end.y; j++) {
                // Promote to 64 bits before multiplying: a 32-bit product wraps once the
                // byte offset reaches 4 GiB.
                uint64 read_address = ((uint64)j * src->width + i) * src->pixel_stride;
                normal += sample_normal(src->data + read_address, normal_map_format);
            }
        }

        normal /= pixel_weight;

        uint64 out_write_address = ((uint64)y * dst->width + x) * dst->pixel_stride;
        assert(out_write_address < (uint64)dst->height * dst->width * dst->pixel_stride);
        clean_and_write_normal(normal, dst->data + out_write_address, normal_map_format);
    }
}