use crate::buffer::{GpuBuffer, GpuRasterBuffer};
use crate::context::GpuContext;
use crate::error::{GpuError, GpuResult};
use crate::kernels::{
convolution::gaussian_blur,
raster::{ElementWiseOp, RasterKernel, ScalarKernel, ScalarOp, UnaryKernel, UnaryOp},
resampling::{ResamplingMethod, resize},
statistics::{
HistogramKernel, HistogramParams, ReductionKernel, ReductionOp, Statistics,
compute_statistics,
},
};
use crate::shaders::{
ComputePipelineBuilder, WgslShader, create_compute_bind_group_layout, storage_buffer_layout,
uniform_buffer_layout,
};
use bytemuck::{Pod, Zeroable};
use std::marker::PhantomData;
use tracing::debug;
use wgpu::{
BindGroupDescriptor, BindGroupEntry, BufferUsages, CommandEncoderDescriptor,
ComputePassDescriptor, ComputePipeline as WgpuComputePipeline,
};
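/// Raster data types supported by the GPU conversion kernels.
///
/// Sub-32-bit integers are packed into `u32` words in storage buffers (four
/// `u8`/`i8` or two `u16`/`i16` per word), and `F64Emulated` stores each value
/// as a high/low pair of `f32`s.
///
/// Illustrative use of the type metadata:
///
/// ```ignore
/// assert_eq!(GpuDataType::U16.size_bytes(), 2);
/// assert_eq!(GpuDataType::I16.min_value(), -32768.0);
/// assert!(GpuDataType::F32.is_float());
/// ```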
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum GpuDataType {
U8,
U16,
U32,
I8,
I16,
I32,
F32,
F64Emulated,
}
impl GpuDataType {
pub fn size_bytes(&self) -> usize {
match self {
Self::U8 | Self::I8 => 1,
Self::U16 | Self::I16 => 2,
Self::U32 | Self::I32 | Self::F32 => 4,
Self::F64Emulated => 8,
}
}
pub fn min_value(&self) -> f64 {
match self {
Self::U8 => 0.0,
Self::U16 => 0.0,
Self::U32 => 0.0,
Self::I8 => -128.0,
Self::I16 => -32768.0,
Self::I32 => -2147483648.0,
Self::F32 => f32::MIN as f64,
Self::F64Emulated => f64::MIN,
}
}
pub fn max_value(&self) -> f64 {
match self {
Self::U8 => 255.0,
Self::U16 => 65535.0,
Self::U32 => 4294967295.0,
Self::I8 => 127.0,
Self::I16 => 32767.0,
Self::I32 => 2147483647.0,
Self::F32 => f32::MAX as f64,
Self::F64Emulated => f64::MAX,
}
}
pub fn is_signed(&self) -> bool {
matches!(
self,
Self::I8 | Self::I16 | Self::I32 | Self::F32 | Self::F64Emulated
)
}
pub fn is_float(&self) -> bool {
matches!(self, Self::F32 | Self::F64Emulated)
}
fn wgsl_storage_type(&self) -> &'static str {
match self {
Self::U8 | Self::I8 | Self::U16 | Self::I16 | Self::U32 | Self::I32 => "u32",
Self::F32 => "f32",
Self::F64Emulated => "vec2<f32>",
}
}
}
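/// Uniform parameters for a conversion pass. Each element is transformed as
/// `clamp(value * scale + offset, out_min, out_max)`, with an optional nodata
/// substitution when `use_nodata != 0`. The field order and padding must stay
/// in sync with the `ConversionParams` struct declared in the generated WGSL.
///
/// Illustrative builder chain (the values are arbitrary):
///
/// ```ignore
/// let params = ConversionParams::new(0.5, 10.0)
///     .with_clamp(0.0, 255.0)
///     .with_nodata(-9999.0, f32::NAN);
/// ```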
#[derive(Debug, Clone, Copy, Pod, Zeroable)]
#[repr(C)]
pub struct ConversionParams {
pub scale: f32,
pub offset: f32,
pub out_min: f32,
pub out_max: f32,
pub nodata_in: f32,
pub nodata_out: f32,
pub use_nodata: u32,
_padding: u32,
}
impl Default for ConversionParams {
fn default() -> Self {
Self {
scale: 1.0,
offset: 0.0,
out_min: f32::MIN,
out_max: f32::MAX,
nodata_in: 0.0,
nodata_out: 0.0,
use_nodata: 0,
_padding: 0,
}
}
}
impl ConversionParams {
pub fn new(scale: f32, offset: f32) -> Self {
Self {
scale,
offset,
..Default::default()
}
}
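    /// Derives a scale/offset pair that linearly maps the full range of `src`
    /// onto the full range of `dst`, clamped to the destination bounds.
    ///
    /// ```ignore
    /// // U8 [0, 255] -> U16 [0, 65535]: scale = 65535.0 / 255.0 = 257.0
    /// let params = ConversionParams::for_type_conversion(GpuDataType::U8, GpuDataType::U16);
    /// ```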
    pub fn for_type_conversion(src: GpuDataType, dst: GpuDataType) -> Self {
        // Linearly map the full source range onto the full destination range:
        // output = value * scale + offset, clamped to the destination bounds.
        let src_range = src.max_value() - src.min_value();
        let dst_range = dst.max_value() - dst.min_value();
        let scale = if src_range > 0.0 && dst_range > 0.0 {
            (dst_range / src_range) as f32
        } else {
            1.0
        };
        // Pick the offset so that the source minimum maps onto the
        // destination minimum.
        let offset = if src.min_value() != dst.min_value() {
            (dst.min_value() - src.min_value() * scale as f64) as f32
        } else {
            0.0
        };
Self {
scale,
offset,
out_min: dst.min_value() as f32,
out_max: dst.max_value() as f32,
..Default::default()
}
}
pub fn with_clamp(mut self, min: f32, max: f32) -> Self {
self.out_min = min;
self.out_max = max;
self
}
pub fn with_nodata(mut self, input_nodata: f32, output_nodata: f32) -> Self {
self.nodata_in = input_nodata;
self.nodata_out = output_nodata;
self.use_nodata = 1;
self
}
pub fn u8_to_normalized() -> Self {
Self {
scale: 1.0 / 255.0,
offset: 0.0,
out_min: 0.0,
out_max: 1.0,
..Default::default()
}
}
pub fn normalized_to_u8() -> Self {
Self {
scale: 255.0,
offset: 0.0,
out_min: 0.0,
out_max: 255.0,
..Default::default()
}
}
pub fn u16_to_normalized() -> Self {
Self {
scale: 1.0 / 65535.0,
offset: 0.0,
out_min: 0.0,
out_max: 1.0,
..Default::default()
}
}
}
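/// Compute kernel that converts a (possibly packed) buffer of `src_type`
/// values into an `f32` buffer, applying [`ConversionParams`] per element.
///
/// Usage sketch (illustrative; `context`, `packed_input`, and `f32_output`
/// are assumed to exist):
///
/// ```ignore
/// let kernel = DataTypeConversionKernel::new(&context, GpuDataType::U16)?;
/// kernel.execute(&packed_input, &mut f32_output, &ConversionParams::u16_to_normalized())?;
/// ```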
pub struct DataTypeConversionKernel {
context: GpuContext,
pipeline: WgpuComputePipeline,
bind_group_layout: wgpu::BindGroupLayout,
workgroup_size: u32,
}
impl DataTypeConversionKernel {
pub fn new(context: &GpuContext, src_type: GpuDataType) -> GpuResult<Self> {
debug!(
"Creating data type conversion kernel for {:?} -> f32",
src_type
);
let shader_source = Self::conversion_shader(src_type);
let mut shader = WgslShader::new(shader_source, "convert_type");
let shader_module = shader.compile(context.device())?;
let bind_group_layout = create_compute_bind_group_layout(
context.device(),
&[
                storage_buffer_layout(0, true),
                uniform_buffer_layout(1),
                storage_buffer_layout(2, false),
            ],
Some("DataTypeConversionKernel BindGroupLayout"),
)?;
let pipeline = ComputePipelineBuilder::new(context.device(), shader_module, "convert_type")
.bind_group_layout(&bind_group_layout)
.label(format!(
"DataTypeConversion Pipeline: {:?} -> f32",
src_type
))
.build()?;
Ok(Self {
context: context.clone(),
pipeline,
bind_group_layout,
workgroup_size: 256,
})
}
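    /// Generates the WGSL source for a `src_type -> f32` conversion. Packed
    /// integer types are unpacked from `u32` words with shifts and masks, and
    /// signed sub-word types are sign-extended manually, since WGSL storage
    /// buffers have no 8- or 16-bit integer element types.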
fn conversion_shader(src_type: GpuDataType) -> String {
let (input_type, unpack_code) = match src_type {
GpuDataType::U8 => (
"u32",
r#"
// Unpack 4 u8 values from one u32
let packed = input[idx / 4u];
let byte_idx = idx % 4u;
var value: f32;
switch (byte_idx) {
case 0u: { value = f32(packed & 0xFFu); }
case 1u: { value = f32((packed >> 8u) & 0xFFu); }
case 2u: { value = f32((packed >> 16u) & 0xFFu); }
case 3u: { value = f32((packed >> 24u) & 0xFFu); }
default: { value = 0.0; }
}"#,
),
GpuDataType::I8 => (
"u32",
r#"
// Unpack 4 i8 values from one u32
let packed = input[idx / 4u];
let byte_idx = idx % 4u;
var raw: u32;
switch (byte_idx) {
case 0u: { raw = packed & 0xFFu; }
case 1u: { raw = (packed >> 8u) & 0xFFu; }
case 2u: { raw = (packed >> 16u) & 0xFFu; }
case 3u: { raw = (packed >> 24u) & 0xFFu; }
default: { raw = 0u; }
}
// Sign extend from 8 bits
var value: f32;
if (raw >= 128u) {
value = f32(i32(raw) - 256);
} else {
value = f32(raw);
}"#,
),
GpuDataType::U16 => (
"u32",
r#"
// Unpack 2 u16 values from one u32
let packed = input[idx / 2u];
let half_idx = idx % 2u;
var value: f32;
if (half_idx == 0u) {
value = f32(packed & 0xFFFFu);
} else {
value = f32((packed >> 16u) & 0xFFFFu);
}"#,
),
GpuDataType::I16 => (
"u32",
r#"
// Unpack 2 i16 values from one u32
let packed = input[idx / 2u];
let half_idx = idx % 2u;
var raw: u32;
if (half_idx == 0u) {
raw = packed & 0xFFFFu;
} else {
raw = (packed >> 16u) & 0xFFFFu;
}
// Sign extend from 16 bits
var value: f32;
if (raw >= 32768u) {
value = f32(i32(raw) - 65536);
} else {
value = f32(raw);
}"#,
),
GpuDataType::U32 => (
"u32",
r#"
let value = f32(input[idx]);"#,
),
GpuDataType::I32 => (
"u32",
r#"
let value = f32(bitcast<i32>(input[idx]));"#,
),
GpuDataType::F32 => (
"f32",
r#"
let value = input[idx];"#,
),
GpuDataType::F64Emulated => (
"vec2<f32>",
r#"
// Emulate f64 using two f32s (high and low parts)
let packed = input[idx];
// This is a simplified conversion - full f64 support would need more complex handling
let value = packed.x + packed.y;"#,
),
};
format!(
r#"
struct ConversionParams {{
scale: f32,
offset: f32,
out_min: f32,
out_max: f32,
nodata_in: f32,
nodata_out: f32,
use_nodata: u32,
_padding: u32,
}}
@group(0) @binding(0) var<storage, read> input: array<{input_type}>;
@group(0) @binding(1) var<uniform> params: ConversionParams;
@group(0) @binding(2) var<storage, read_write> output: array<f32>;
@compute @workgroup_size(256)
fn convert_type(@builtin(global_invocation_id) global_id: vec3<u32>) {{
let idx = global_id.x;
let output_len = arrayLength(&output);
if (idx >= output_len) {{
return;
}}
{unpack_code}
// Check for nodata
if (params.use_nodata != 0u && abs(value - params.nodata_in) < 1e-6) {{
output[idx] = params.nodata_out;
return;
}}
// Apply scale and offset
var result = value * params.scale + params.offset;
// Clamp to output range
result = clamp(result, params.out_min, params.out_max);
output[idx] = result;
}}
"#,
input_type = input_type,
unpack_code = unpack_code
)
}
pub fn execute<T: Pod>(
&self,
input: &GpuBuffer<T>,
output: &mut GpuBuffer<f32>,
params: &ConversionParams,
) -> GpuResult<()> {
let params_buffer = GpuBuffer::from_data(
&self.context,
&[*params],
BufferUsages::UNIFORM | BufferUsages::COPY_DST,
)?;
let bind_group = self
.context
.device()
.create_bind_group(&BindGroupDescriptor {
label: Some("DataTypeConversionKernel BindGroup"),
layout: &self.bind_group_layout,
entries: &[
BindGroupEntry {
binding: 0,
resource: input.buffer().as_entire_binding(),
},
BindGroupEntry {
binding: 1,
resource: params_buffer.buffer().as_entire_binding(),
},
BindGroupEntry {
binding: 2,
resource: output.buffer().as_entire_binding(),
},
],
});
let mut encoder = self
.context
.device()
.create_command_encoder(&CommandEncoderDescriptor {
label: Some("DataTypeConversionKernel Encoder"),
});
{
let mut compute_pass = encoder.begin_compute_pass(&ComputePassDescriptor {
label: Some("DataTypeConversionKernel Pass"),
timestamp_writes: None,
});
compute_pass.set_pipeline(&self.pipeline);
compute_pass.set_bind_group(0, &bind_group, &[]);
            let num_workgroups = (output.len() as u32).div_ceil(self.workgroup_size);
compute_pass.dispatch_workgroups(num_workgroups, 1, 1);
}
self.context.queue().submit(Some(encoder.finish()));
debug!(
"Executed type conversion kernel on {} elements",
output.len()
);
Ok(())
}
}
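/// The inverse of [`DataTypeConversionKernel`]: converts an `f32` buffer into
/// a (possibly packed) buffer of `dst_type` values.
///
/// Usage sketch (illustrative; the buffers are assumed to exist):
///
/// ```ignore
/// let kernel = F32ToTypeKernel::new(&context, GpuDataType::U8)?;
/// kernel.execute(&f32_input, &mut packed_output, &ConversionParams::normalized_to_u8())?;
/// ```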
pub struct F32ToTypeKernel {
context: GpuContext,
pipeline: WgpuComputePipeline,
bind_group_layout: wgpu::BindGroupLayout,
workgroup_size: u32,
dst_type: GpuDataType,
}
impl F32ToTypeKernel {
pub fn new(context: &GpuContext, dst_type: GpuDataType) -> GpuResult<Self> {
debug!(
"Creating data type conversion kernel for f32 -> {:?}",
dst_type
);
let shader_source = Self::conversion_shader(dst_type);
let mut shader = WgslShader::new(shader_source, "convert_to_type");
let shader_module = shader.compile(context.device())?;
let bind_group_layout = create_compute_bind_group_layout(
context.device(),
&[
                storage_buffer_layout(0, true),
                uniform_buffer_layout(1),
                storage_buffer_layout(2, false),
            ],
Some("F32ToTypeKernel BindGroupLayout"),
)?;
let pipeline =
ComputePipelineBuilder::new(context.device(), shader_module, "convert_to_type")
.bind_group_layout(&bind_group_layout)
.label(format!("F32ToType Pipeline: f32 -> {:?}", dst_type))
.build()?;
Ok(Self {
context: context.clone(),
pipeline,
bind_group_layout,
workgroup_size: 256,
dst_type,
})
}
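    /// Generates the WGSL source for an `f32 -> dst_type` conversion. For
    /// packed integer types, each invocation gathers the two or four source
    /// values that share one output `u32` word, which is why the dispatch in
    /// `execute` is sized by the output length rather than the input length.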
fn conversion_shader(dst_type: GpuDataType) -> String {
let (output_type, pack_code) = match dst_type {
GpuDataType::U8 => (
"u32",
r#"
// Pack 4 u8 values into one u32
let base_idx = idx * 4u;
var packed = 0u;
for (var i = 0u; i < 4u; i = i + 1u) {
let src_idx = base_idx + i;
if (src_idx < arrayLength(&input)) {
var value = input[src_idx];
// Check nodata
if (params.use_nodata != 0u && abs(value - params.nodata_in) < 1e-6) {
value = params.nodata_out;
}
// Apply scale and offset, then clamp
value = clamp(value * params.scale + params.offset, params.out_min, params.out_max);
let byte_val = u32(value) & 0xFFu;
packed = packed | (byte_val << (i * 8u));
}
}
output[idx] = packed;"#,
),
GpuDataType::U16 => (
"u32",
r#"
// Pack 2 u16 values into one u32
let base_idx = idx * 2u;
var packed = 0u;
for (var i = 0u; i < 2u; i = i + 1u) {
let src_idx = base_idx + i;
if (src_idx < arrayLength(&input)) {
var value = input[src_idx];
if (params.use_nodata != 0u && abs(value - params.nodata_in) < 1e-6) {
value = params.nodata_out;
}
value = clamp(value * params.scale + params.offset, params.out_min, params.out_max);
let half_val = u32(value) & 0xFFFFu;
packed = packed | (half_val << (i * 16u));
}
}
output[idx] = packed;"#,
),
GpuDataType::U32 => (
"u32",
r#"
var value = input[idx];
if (params.use_nodata != 0u && abs(value - params.nodata_in) < 1e-6) {
value = params.nodata_out;
}
value = clamp(value * params.scale + params.offset, params.out_min, params.out_max);
output[idx] = u32(value);"#,
),
GpuDataType::I8 => (
"u32",
r#"
// Pack 4 i8 values into one u32
let base_idx = idx * 4u;
var packed = 0u;
for (var i = 0u; i < 4u; i = i + 1u) {
let src_idx = base_idx + i;
if (src_idx < arrayLength(&input)) {
var value = input[src_idx];
if (params.use_nodata != 0u && abs(value - params.nodata_in) < 1e-6) {
value = params.nodata_out;
}
value = clamp(value * params.scale + params.offset, params.out_min, params.out_max);
var byte_val: u32;
if (value < 0.0) {
byte_val = u32(i32(value) + 256) & 0xFFu;
} else {
byte_val = u32(value) & 0xFFu;
}
packed = packed | (byte_val << (i * 8u));
}
}
output[idx] = packed;"#,
),
GpuDataType::I16 => (
"u32",
r#"
// Pack 2 i16 values into one u32
let base_idx = idx * 2u;
var packed = 0u;
for (var i = 0u; i < 2u; i = i + 1u) {
let src_idx = base_idx + i;
if (src_idx < arrayLength(&input)) {
var value = input[src_idx];
if (params.use_nodata != 0u && abs(value - params.nodata_in) < 1e-6) {
value = params.nodata_out;
}
value = clamp(value * params.scale + params.offset, params.out_min, params.out_max);
var half_val: u32;
if (value < 0.0) {
half_val = u32(i32(value) + 65536) & 0xFFFFu;
} else {
half_val = u32(value) & 0xFFFFu;
}
packed = packed | (half_val << (i * 16u));
}
}
output[idx] = packed;"#,
),
GpuDataType::I32 => (
"u32",
r#"
var value = input[idx];
if (params.use_nodata != 0u && abs(value - params.nodata_in) < 1e-6) {
value = params.nodata_out;
}
value = clamp(value * params.scale + params.offset, params.out_min, params.out_max);
output[idx] = bitcast<u32>(i32(value));"#,
),
GpuDataType::F32 => (
"f32",
r#"
var value = input[idx];
if (params.use_nodata != 0u && abs(value - params.nodata_in) < 1e-6) {
value = params.nodata_out;
}
output[idx] = clamp(value * params.scale + params.offset, params.out_min, params.out_max);"#,
),
GpuDataType::F64Emulated => (
"vec2<f32>",
r#"
var value = input[idx];
if (params.use_nodata != 0u && abs(value - params.nodata_in) < 1e-6) {
value = params.nodata_out;
}
value = clamp(value * params.scale + params.offset, params.out_min, params.out_max);
// Split into high and low parts for f64 emulation
output[idx] = vec2<f32>(value, 0.0);"#,
),
};
format!(
r#"
struct ConversionParams {{
scale: f32,
offset: f32,
out_min: f32,
out_max: f32,
nodata_in: f32,
nodata_out: f32,
use_nodata: u32,
_padding: u32,
}}
@group(0) @binding(0) var<storage, read> input: array<f32>;
@group(0) @binding(1) var<uniform> params: ConversionParams;
@group(0) @binding(2) var<storage, read_write> output: array<{output_type}>;
@compute @workgroup_size(256)
fn convert_to_type(@builtin(global_invocation_id) global_id: vec3<u32>) {{
let idx = global_id.x;
let output_len = arrayLength(&output);
if (idx >= output_len) {{
return;
}}
{pack_code}
}}
"#,
output_type = output_type,
pack_code = pack_code
)
}
pub fn execute<T: Pod>(
&self,
input: &GpuBuffer<f32>,
output: &mut GpuBuffer<T>,
params: &ConversionParams,
) -> GpuResult<()> {
let params_buffer = GpuBuffer::from_data(
&self.context,
&[*params],
BufferUsages::UNIFORM | BufferUsages::COPY_DST,
)?;
let bind_group = self
.context
.device()
.create_bind_group(&BindGroupDescriptor {
label: Some("F32ToTypeKernel BindGroup"),
layout: &self.bind_group_layout,
entries: &[
BindGroupEntry {
binding: 0,
resource: input.buffer().as_entire_binding(),
},
BindGroupEntry {
binding: 1,
resource: params_buffer.buffer().as_entire_binding(),
},
BindGroupEntry {
binding: 2,
resource: output.buffer().as_entire_binding(),
},
],
});
let mut encoder = self
.context
.device()
.create_command_encoder(&CommandEncoderDescriptor {
label: Some("F32ToTypeKernel Encoder"),
});
{
let mut compute_pass = encoder.begin_compute_pass(&ComputePassDescriptor {
label: Some("F32ToTypeKernel Pass"),
timestamp_writes: None,
});
compute_pass.set_pipeline(&self.pipeline);
compute_pass.set_bind_group(0, &bind_group, &[]);
            let num_workgroups = (output.len() as u32).div_ceil(self.workgroup_size);
compute_pass.dispatch_workgroups(num_workgroups, 1, 1);
}
self.context.queue().submit(Some(encoder.finish()));
debug!(
"Executed f32 -> {:?} conversion on {} elements",
self.dst_type,
input.len()
);
Ok(())
}
}
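/// Convenience wrapper that picks the right conversion kernel and allocates an
/// output buffer sized for the packing of the source or destination type.
///
/// Usage sketch (illustrative; `u8_buffer` is an assumed packed input):
///
/// ```ignore
/// let converter = BatchTypeConverter::new(&context);
/// let f32_buffer = converter.convert_to_f32(&u8_buffer, GpuDataType::U8, &params)?;
/// ```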
pub struct BatchTypeConverter {
context: GpuContext,
tile_size: usize,
}
impl BatchTypeConverter {
pub fn new(context: &GpuContext) -> Self {
Self {
context: context.clone(),
            tile_size: 1024 * 1024, // default: 1024 * 1024 elements per tile
        }
}
pub fn with_tile_size(mut self, size: usize) -> Self {
self.tile_size = size;
self
}
pub fn convert_to_f32<T: Pod>(
&self,
input: &GpuBuffer<T>,
src_type: GpuDataType,
params: &ConversionParams,
) -> GpuResult<GpuBuffer<f32>> {
let kernel = DataTypeConversionKernel::new(&self.context, src_type)?;
        // Each packed u32 word of 8-bit input expands to four f32 outputs and
        // each word of 16-bit input to two.
        let output_len = match src_type {
GpuDataType::U8 | GpuDataType::I8 => input.len() * 4,
GpuDataType::U16 | GpuDataType::I16 => input.len() * 2,
_ => input.len(),
};
let mut output = GpuBuffer::new(
&self.context,
output_len,
BufferUsages::STORAGE | BufferUsages::COPY_SRC | BufferUsages::COPY_DST,
)?;
kernel.execute(input, &mut output, params)?;
Ok(output)
}
pub fn convert_from_f32<T: Pod>(
&self,
input: &GpuBuffer<f32>,
dst_type: GpuDataType,
params: &ConversionParams,
) -> GpuResult<GpuBuffer<T>> {
let kernel = F32ToTypeKernel::new(&self.context, dst_type)?;
        // 8-bit outputs pack four values per u32 word and 16-bit outputs pack
        // two, so the packed output buffer is correspondingly shorter.
        let output_len = match dst_type {
GpuDataType::U8 | GpuDataType::I8 => (input.len() + 3) / 4,
GpuDataType::U16 | GpuDataType::I16 => (input.len() + 1) / 2,
_ => input.len(),
};
let mut output = GpuBuffer::new(
&self.context,
output_len,
BufferUsages::STORAGE | BufferUsages::COPY_SRC | BufferUsages::COPY_DST,
)?;
kernel.execute(input, &mut output, params)?;
Ok(output)
}
}
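/// A fluent, GPU-resident processing pipeline over a single raster band. Each
/// operation consumes `self`, writes into a fresh buffer, and returns the
/// pipeline, so stages chain without intermediate host readbacks.
///
/// Usage sketch (illustrative; mirrors the tests at the bottom of this file):
///
/// ```ignore
/// let output = ComputePipeline::from_data(&context, &data, 64, 64)?
///     .add(10.0)?
///     .multiply(2.0)?
///     .clamp(0.0, 100.0)?
///     .read_blocking()?;
/// ```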
pub struct ComputePipeline<T: Pod> {
context: GpuContext,
current_buffer: GpuBuffer<T>,
width: u32,
height: u32,
_phantom: PhantomData<T>,
}
impl<T: Pod + Zeroable> ComputePipeline<T> {
pub fn new(
context: &GpuContext,
input: GpuBuffer<T>,
width: u32,
height: u32,
) -> GpuResult<Self> {
let expected_size = (width as usize) * (height as usize);
if input.len() != expected_size {
return Err(GpuError::invalid_kernel_params(format!(
"Buffer size mismatch: expected {}, got {}",
expected_size,
input.len()
)));
}
Ok(Self {
context: context.clone(),
current_buffer: input,
width,
height,
_phantom: PhantomData,
})
}
pub fn from_data(context: &GpuContext, data: &[T], width: u32, height: u32) -> GpuResult<Self> {
let buffer = GpuBuffer::from_data(
context,
data,
BufferUsages::STORAGE | BufferUsages::COPY_SRC | BufferUsages::COPY_DST,
)?;
Self::new(context, buffer, width, height)
}
pub fn buffer(&self) -> &GpuBuffer<T> {
&self.current_buffer
}
pub fn dimensions(&self) -> (u32, u32) {
(self.width, self.height)
}
pub fn element_wise(mut self, op: ElementWiseOp, other: &GpuBuffer<T>) -> GpuResult<Self> {
debug!("Pipeline: applying {:?}", op);
let kernel = RasterKernel::new(&self.context, op)?;
let mut output = GpuBuffer::new(
&self.context,
self.current_buffer.len(),
BufferUsages::STORAGE | BufferUsages::COPY_SRC | BufferUsages::COPY_DST,
)?;
kernel.execute(&self.current_buffer, other, &mut output)?;
self.current_buffer = output;
Ok(self)
}
pub fn unary(mut self, op: UnaryOp) -> GpuResult<Self> {
debug!("Pipeline: applying unary {:?}", op);
let kernel = UnaryKernel::new(&self.context, op)?;
let mut output = GpuBuffer::new(
&self.context,
self.current_buffer.len(),
BufferUsages::STORAGE | BufferUsages::COPY_SRC | BufferUsages::COPY_DST,
)?;
kernel.execute(&self.current_buffer, &mut output)?;
self.current_buffer = output;
Ok(self)
}
pub fn scalar(mut self, op: ScalarOp) -> GpuResult<Self> {
debug!("Pipeline: applying scalar {:?}", op);
let kernel = ScalarKernel::new(&self.context, op)?;
let mut output = GpuBuffer::new(
&self.context,
self.current_buffer.len(),
BufferUsages::STORAGE | BufferUsages::COPY_SRC | BufferUsages::COPY_DST,
)?;
kernel.execute(&self.current_buffer, &mut output)?;
self.current_buffer = output;
Ok(self)
}
pub fn gaussian_blur(mut self, sigma: f32) -> GpuResult<Self> {
debug!("Pipeline: applying Gaussian blur (sigma={})", sigma);
let output = gaussian_blur(
&self.context,
&self.current_buffer,
self.width,
self.height,
sigma,
)?;
self.current_buffer = output;
Ok(self)
}
pub fn resize(
mut self,
new_width: u32,
new_height: u32,
method: ResamplingMethod,
) -> GpuResult<Self> {
debug!(
"Pipeline: resizing {}x{} -> {}x{} ({:?})",
self.width, self.height, new_width, new_height, method
);
let output = resize(
&self.context,
&self.current_buffer,
self.width,
self.height,
new_width,
new_height,
method,
)?;
self.width = new_width;
self.height = new_height;
self.current_buffer = output;
Ok(self)
}
pub fn add(self, value: f32) -> GpuResult<Self> {
self.scalar(ScalarOp::Add(value))
}
pub fn multiply(self, value: f32) -> GpuResult<Self> {
self.scalar(ScalarOp::Multiply(value))
}
pub fn clamp(self, min: f32, max: f32) -> GpuResult<Self> {
self.scalar(ScalarOp::Clamp { min, max })
}
pub fn threshold(self, threshold: f32, above: f32, below: f32) -> GpuResult<Self> {
self.scalar(ScalarOp::Threshold {
threshold,
above,
below,
})
}
pub fn abs(self) -> GpuResult<Self> {
self.unary(UnaryOp::Abs)
}
pub fn sqrt(self) -> GpuResult<Self> {
self.unary(UnaryOp::Sqrt)
}
pub fn log(self) -> GpuResult<Self> {
self.unary(UnaryOp::Log)
}
pub fn exp(self) -> GpuResult<Self> {
self.unary(UnaryOp::Exp)
}
    pub async fn statistics(&self) -> GpuResult<Statistics> {
        // The staging clone shares the underlying wgpu buffer, so copying
        // into the clone also fills `staging` for the read below.
        let staging = GpuBuffer::staging(&self.context, self.current_buffer.len())?;
        let mut staging_mut = staging.clone();
        staging_mut.copy_from(&self.current_buffer)?;
        let data = staging.read().await?;
        // Reinterpret each element's bytes as a little-endian f32. This is
        // only meaningful when T is f32; other 4-byte types are merely
        // bit-reinterpreted, and other sizes fall back to 0.0.
        let f32_data: Vec<f32> = data
            .into_iter()
            .map(|v: T| {
                let bytes = bytemuck::bytes_of(&v);
                if bytes.len() == 4 {
                    f32::from_le_bytes([bytes[0], bytes[1], bytes[2], bytes[3]])
                } else {
                    0.0f32
                }
            })
            .collect();
let input_buffer = GpuBuffer::from_data(
&self.context,
&f32_data,
BufferUsages::STORAGE | BufferUsages::COPY_SRC,
)?;
compute_statistics(&self.context, &input_buffer).await
}
pub async fn statistics_with_conversion(
&self,
src_type: GpuDataType,
params: &ConversionParams,
) -> GpuResult<Statistics> {
let converter = BatchTypeConverter::new(&self.context);
let f32_buffer = converter.convert_to_f32(&self.current_buffer, src_type, params)?;
compute_statistics(&self.context, &f32_buffer).await
}
pub async fn histogram(
&self,
num_bins: u32,
min_value: f32,
max_value: f32,
) -> GpuResult<Vec<u32>> {
let kernel = HistogramKernel::new(&self.context)?;
let params = HistogramParams::new(num_bins, min_value, max_value);
kernel.execute(&self.current_buffer, params).await
}
pub async fn reduce(&self, op: ReductionOp) -> GpuResult<T>
where
T: Copy,
{
let kernel = ReductionKernel::new(&self.context, op)?;
kernel.execute(&self.current_buffer, op).await
}
pub fn finish(self) -> GpuBuffer<T> {
self.current_buffer
}
pub async fn read(self) -> GpuResult<Vec<T>> {
        // As in `statistics`, the staging clone shares the underlying buffer.
        let staging = GpuBuffer::staging(&self.context, self.current_buffer.len())?;
let mut staging_mut = staging.clone();
staging_mut.copy_from(&self.current_buffer)?;
staging.read().await
}
pub fn read_blocking(self) -> GpuResult<Vec<T>> {
pollster::block_on(self.read())
}
pub fn convert_to_f32(
self,
src_type: GpuDataType,
params: &ConversionParams,
) -> GpuResult<ComputePipeline<f32>> {
let converter = BatchTypeConverter::new(&self.context);
let f32_buffer = converter.convert_to_f32(&self.current_buffer, src_type, params)?;
Ok(ComputePipeline {
context: self.context,
current_buffer: f32_buffer,
width: self.width,
height: self.height,
_phantom: PhantomData,
})
}
pub fn linear_transform(self, scale: f32, offset: f32) -> GpuResult<Self> {
self.scalar(ScalarOp::Multiply(scale))?
.scalar(ScalarOp::Add(offset))
}
pub fn normalize_range(
self,
current_min: f32,
current_max: f32,
new_min: f32,
new_max: f32,
) -> GpuResult<Self> {
let current_range = current_max - current_min;
let new_range = new_max - new_min;
if current_range.abs() < 1e-10 {
return Err(GpuError::invalid_kernel_params(
"Current range is too small for normalization",
));
}
let scale = new_range / current_range;
let offset = new_min - current_min * scale;
self.linear_transform(scale, offset)
}
}
impl ComputePipeline<f32> {
pub fn convert_to_type<U: Pod + Zeroable>(
self,
dst_type: GpuDataType,
params: &ConversionParams,
) -> GpuResult<ComputePipeline<U>> {
let converter = BatchTypeConverter::new(&self.context);
let output_buffer: GpuBuffer<U> =
converter.convert_from_f32(&self.current_buffer, dst_type, params)?;
        // Packed integer outputs no longer preserve the original 2D layout,
        // so the packed word count is reported as a single row.
        let (new_width, new_height) = match dst_type {
GpuDataType::U8 | GpuDataType::I8 => {
let total_elements = (self.width * self.height) as usize;
let packed_len = (total_elements + 3) / 4;
(packed_len as u32, 1)
}
GpuDataType::U16 | GpuDataType::I16 => {
let total_elements = (self.width * self.height) as usize;
let packed_len = (total_elements + 1) / 2;
(packed_len as u32, 1)
}
_ => (self.width, self.height),
};
Ok(ComputePipeline {
context: self.context,
current_buffer: output_buffer,
width: new_width,
height: new_height,
_phantom: PhantomData,
})
}
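    /// Uploads `u8` data as `f32` normalized to `[0.0, 1.0]`. The division
    /// happens on the CPU; for large rasters, the GPU path via
    /// [`BatchTypeConverter::convert_to_f32`] avoids the host-side loop.
    ///
    /// ```ignore
    /// let pipeline = ComputePipeline::<f32>::from_u8_normalized(&context, &u8_data, 10, 10)?;
    /// ```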
pub fn from_u8_normalized(
context: &GpuContext,
data: &[u8],
width: u32,
height: u32,
) -> GpuResult<Self> {
let f32_data: Vec<f32> = data.iter().map(|&v| v as f32 / 255.0).collect();
Self::from_data(context, &f32_data, width, height)
}
pub fn from_u16_normalized(
context: &GpuContext,
data: &[u16],
width: u32,
height: u32,
) -> GpuResult<Self> {
let f32_data: Vec<f32> = data.iter().map(|&v| v as f32 / 65535.0).collect();
Self::from_data(context, &f32_data, width, height)
}
pub fn scale_offset(self, scale: f32, offset: f32) -> GpuResult<Self> {
if (scale - 1.0).abs() < 1e-10 && offset.abs() < 1e-10 {
return Ok(self);
}
self.linear_transform(scale, offset)
}
}
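/// Wraps one [`ComputePipeline`] per band of a multiband raster so that an
/// operation can be mapped across every band, or bands can be combined into a
/// derived index such as [`MultibandPipeline::ndvi`].
///
/// Usage sketch (illustrative; `rgbn_raster` is an assumed 4-band raster):
///
/// ```ignore
/// let ndvi = MultibandPipeline::new(&context, &rgbn_raster)?.ndvi()?;
/// let values = ndvi.read().await?;
/// ```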
pub struct MultibandPipeline<T: Pod> {
context: GpuContext,
bands: Vec<ComputePipeline<T>>,
}
impl<T: Pod + Zeroable> MultibandPipeline<T> {
pub fn new(context: &GpuContext, raster: &GpuRasterBuffer<T>) -> GpuResult<Self> {
let (width, height) = raster.dimensions();
let bands = raster
.bands()
.iter()
.map(|band| ComputePipeline::new(context, band.clone(), width, height))
.collect::<GpuResult<Vec<_>>>()?;
Ok(Self {
context: context.clone(),
bands,
})
}
pub fn num_bands(&self) -> usize {
self.bands.len()
}
pub fn band(&self, index: usize) -> Option<&ComputePipeline<T>> {
self.bands.get(index)
}
pub fn map<F>(mut self, mut f: F) -> GpuResult<Self>
where
F: FnMut(ComputePipeline<T>) -> GpuResult<ComputePipeline<T>>,
{
self.bands = self
.bands
.into_iter()
.map(|band| f(band))
.collect::<GpuResult<Vec<_>>>()?;
Ok(self)
}
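    /// Computes `NDVI = (NIR - Red) / (NIR + Red)` entirely on the GPU,
    /// assuming R, G, B, NIR band order: Red is read from band 0 and NIR
    /// from band 3.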
pub fn ndvi(self) -> GpuResult<ComputePipeline<T>> {
if self.bands.len() < 4 {
return Err(GpuError::invalid_kernel_params(
"NDVI requires at least 4 bands (R,G,B,NIR)",
));
}
let nir = self
.bands
.get(3)
.ok_or_else(|| GpuError::internal("Missing NIR band"))?;
let red = self
.bands
.get(0)
.ok_or_else(|| GpuError::internal("Missing Red band"))?;
let nir_buffer = nir.buffer().clone();
let red_buffer = red.buffer().clone();
let width = nir.width;
let height = nir.height;
let diff_kernel = RasterKernel::new(&self.context, ElementWiseOp::Subtract)?;
let mut diff_buffer = GpuBuffer::new(
&self.context,
nir_buffer.len(),
BufferUsages::STORAGE | BufferUsages::COPY_SRC | BufferUsages::COPY_DST,
)?;
diff_kernel.execute(&nir_buffer, &red_buffer, &mut diff_buffer)?;
let sum_kernel = RasterKernel::new(&self.context, ElementWiseOp::Add)?;
let mut sum_buffer = GpuBuffer::new(
&self.context,
nir_buffer.len(),
BufferUsages::STORAGE | BufferUsages::COPY_SRC | BufferUsages::COPY_DST,
)?;
sum_kernel.execute(&nir_buffer, &red_buffer, &mut sum_buffer)?;
let div_kernel = RasterKernel::new(&self.context, ElementWiseOp::Divide)?;
let mut ndvi_buffer = GpuBuffer::new(
&self.context,
nir_buffer.len(),
BufferUsages::STORAGE | BufferUsages::COPY_SRC | BufferUsages::COPY_DST,
)?;
div_kernel.execute(&diff_buffer, &sum_buffer, &mut ndvi_buffer)?;
ComputePipeline::new(&self.context, ndvi_buffer, width, height)
}
pub fn finish(self) -> Vec<GpuBuffer<T>> {
self.bands.into_iter().map(|b| b.finish()).collect()
}
pub async fn read_all(self) -> GpuResult<Vec<Vec<T>>> {
let mut results = Vec::with_capacity(self.bands.len());
for band in self.bands {
results.push(band.read().await?);
}
Ok(results)
}
}
#[cfg(test)]
mod tests {
use super::*;
#[tokio::test]
async fn test_compute_pipeline() {
if let Ok(context) = GpuContext::new().await {
let data: Vec<f32> = (0..100).map(|i| i as f32).collect();
if let Ok(pipeline) = ComputePipeline::from_data(&context, &data, 10, 10) {
if let Ok(result) = pipeline.add(5.0).and_then(|p| p.multiply(2.0)) {
let _ = result.finish();
}
}
}
}
#[tokio::test]
#[ignore]
async fn test_pipeline_chaining() {
if let Ok(context) = GpuContext::new().await {
let data: Vec<f32> = vec![1.0; 64 * 64];
if let Ok(pipeline) = ComputePipeline::from_data(&context, &data, 64, 64) {
if let Ok(result) = pipeline
.add(10.0)
.and_then(|p| p.multiply(2.0))
.and_then(|p| p.clamp(0.0, 100.0))
{
let stats = result.statistics().await;
if let Ok(stats) = stats {
println!("Mean: {}", stats.mean());
}
}
}
}
}
#[test]
fn test_gpu_data_type_properties() {
assert_eq!(GpuDataType::U8.size_bytes(), 1);
assert_eq!(GpuDataType::U16.size_bytes(), 2);
assert_eq!(GpuDataType::U32.size_bytes(), 4);
assert_eq!(GpuDataType::F32.size_bytes(), 4);
assert_eq!(GpuDataType::F64Emulated.size_bytes(), 8);
assert_eq!(GpuDataType::U8.min_value(), 0.0);
assert_eq!(GpuDataType::U8.max_value(), 255.0);
assert_eq!(GpuDataType::I8.min_value(), -128.0);
assert_eq!(GpuDataType::I8.max_value(), 127.0);
assert_eq!(GpuDataType::U16.max_value(), 65535.0);
assert!(!GpuDataType::U8.is_signed());
assert!(GpuDataType::I8.is_signed());
assert!(GpuDataType::F32.is_signed());
assert!(!GpuDataType::U8.is_float());
assert!(GpuDataType::F32.is_float());
assert!(GpuDataType::F64Emulated.is_float());
}
#[test]
fn test_conversion_params_default() {
let params = ConversionParams::default();
assert_eq!(params.scale, 1.0);
assert_eq!(params.offset, 0.0);
assert_eq!(params.use_nodata, 0);
}
#[test]
fn test_conversion_params_u8_to_normalized() {
let params = ConversionParams::u8_to_normalized();
assert!((params.scale - (1.0 / 255.0)).abs() < 1e-6);
assert_eq!(params.offset, 0.0);
assert_eq!(params.out_min, 0.0);
assert_eq!(params.out_max, 1.0);
}
#[test]
fn test_conversion_params_normalized_to_u8() {
let params = ConversionParams::normalized_to_u8();
assert_eq!(params.scale, 255.0);
assert_eq!(params.offset, 0.0);
assert_eq!(params.out_min, 0.0);
assert_eq!(params.out_max, 255.0);
}
#[test]
fn test_conversion_params_with_clamp() {
let params = ConversionParams::new(2.0, 10.0).with_clamp(0.0, 100.0);
assert_eq!(params.scale, 2.0);
assert_eq!(params.offset, 10.0);
assert_eq!(params.out_min, 0.0);
assert_eq!(params.out_max, 100.0);
}
#[test]
fn test_conversion_params_with_nodata() {
let params = ConversionParams::default().with_nodata(-9999.0, f32::NAN);
assert_eq!(params.nodata_in, -9999.0);
assert_eq!(params.use_nodata, 1);
}
#[test]
fn test_conversion_params_for_type_conversion() {
let params = ConversionParams::for_type_conversion(GpuDataType::U8, GpuDataType::U16);
let expected_scale = 65535.0 / 255.0;
assert!((params.scale - expected_scale as f32).abs() < 0.01);
}
#[tokio::test]
async fn test_data_type_conversion_kernel_creation() {
if let Ok(context) = GpuContext::new().await {
for dtype in &[
GpuDataType::U8,
GpuDataType::U16,
GpuDataType::U32,
GpuDataType::I8,
GpuDataType::I16,
GpuDataType::I32,
GpuDataType::F32,
] {
let result = DataTypeConversionKernel::new(&context, *dtype);
assert!(result.is_ok(), "Failed to create kernel for {:?}", dtype);
}
}
}
#[tokio::test]
async fn test_f32_to_type_kernel_creation() {
if let Ok(context) = GpuContext::new().await {
for dtype in &[
GpuDataType::U8,
GpuDataType::U16,
GpuDataType::U32,
GpuDataType::F32,
] {
let result = F32ToTypeKernel::new(&context, *dtype);
assert!(
result.is_ok(),
"Failed to create F32ToType kernel for {:?}",
dtype
);
}
}
}
#[tokio::test]
async fn test_batch_type_converter() {
if let Ok(context) = GpuContext::new().await {
let converter = BatchTypeConverter::new(&context);
let f32_data: Vec<f32> = vec![1.0, 2.0, 3.0, 4.0];
if let Ok(buffer) = GpuBuffer::from_data(
&context,
&f32_data,
BufferUsages::STORAGE | BufferUsages::COPY_SRC | BufferUsages::COPY_DST,
) {
let params = ConversionParams::default();
let result = converter.convert_to_f32(&buffer, GpuDataType::F32, ¶ms);
assert!(result.is_ok());
}
}
}
#[tokio::test]
#[ignore]
async fn test_pipeline_with_u8_normalized() {
if let Ok(context) = GpuContext::new().await {
let u8_data: Vec<u8> = (0..100).collect();
if let Ok(pipeline) =
ComputePipeline::<f32>::from_u8_normalized(&context, &u8_data, 10, 10)
{
if let Ok(data) = pipeline.read_blocking() {
assert!(data[0].abs() < 1e-6);
let expected = 99.0 / 255.0;
assert!((data[99] - expected).abs() < 1e-4);
}
}
}
}
#[tokio::test]
#[ignore]
async fn test_pipeline_linear_transform() {
if let Ok(context) = GpuContext::new().await {
let data: Vec<f32> = vec![1.0, 2.0, 3.0, 4.0];
if let Ok(pipeline) = ComputePipeline::from_data(&context, &data, 2, 2) {
if let Ok(result) = pipeline.linear_transform(2.0, 10.0) {
if let Ok(output) = result.read_blocking() {
                        assert!((output[0] - 12.0).abs() < 1e-4);
                        assert!((output[1] - 14.0).abs() < 1e-4);
                        assert!((output[2] - 16.0).abs() < 1e-4);
                        assert!((output[3] - 18.0).abs() < 1e-4);
                    }
}
}
}
}
#[tokio::test]
#[ignore]
async fn test_pipeline_normalize_range() {
if let Ok(context) = GpuContext::new().await {
let data: Vec<f32> = vec![0.0, 50.0, 100.0, 25.0];
if let Ok(pipeline) = ComputePipeline::from_data(&context, &data, 2, 2) {
if let Ok(result) = pipeline.normalize_range(0.0, 100.0, 0.0, 1.0) {
if let Ok(output) = result.read_blocking() {
                        assert!(output[0].abs() < 1e-4);
                        assert!((output[1] - 0.5).abs() < 1e-4);
                        assert!((output[2] - 1.0).abs() < 1e-4);
                        assert!((output[3] - 0.25).abs() < 1e-4);
                    }
}
}
}
}
#[tokio::test]
#[ignore]
async fn test_pipeline_scale_offset_noop() {
if let Ok(context) = GpuContext::new().await {
let data: Vec<f32> = vec![1.0, 2.0, 3.0, 4.0];
if let Ok(pipeline) = ComputePipeline::from_data(&context, &data, 2, 2) {
if let Ok(result) = pipeline.scale_offset(1.0, 0.0) {
if let Ok(output) = result.read_blocking() {
for (i, &v) in output.iter().enumerate() {
assert!((v - data[i]).abs() < 1e-6);
}
}
}
}
}
}
#[test]
fn test_gpu_data_type_wgsl_storage_type() {
assert_eq!(GpuDataType::U8.wgsl_storage_type(), "u32");
assert_eq!(GpuDataType::F32.wgsl_storage_type(), "f32");
assert_eq!(GpuDataType::F64Emulated.wgsl_storage_type(), "vec2<f32>");
}
}