runmat-accelerate 0.5.0

use anyhow::{anyhow, ensure, Result};
use bytemuck::cast_slice;
use runmat_accelerate_api::{
    GpuTensorHandle, ProviderConv1dOptions, ProviderConvMode, ProviderPolyderQuotient,
    ProviderPolyfitResult, ProviderPolyvalOptions,
};
use runmat_runtime::builtins::math::poly::polyfit::polyfit_host_real_for_provider;
use std::sync::Arc;
use wgpu::util::DeviceExt;

use super::{
    conv_orientation_for, polynomial_orientation, shape_for_orientation, trim_leading_zeros_real,
    NumericPrecision, PolyderParams, PolyintParamsF32, PolyintParamsF64, PolynomialOrientation,
    WgpuProvider,
};

impl WgpuProvider {
    pub(crate) async fn polyfit_exec(
        &self,
        x: &GpuTensorHandle,
        y: &GpuTensorHandle,
        degree: usize,
        weights: Option<&GpuTensorHandle>,
    ) -> Result<ProviderPolyfitResult> {
        let x_host = self.download_exec(x).await?;
        let y_host = self.download_exec(y).await?;
        ensure!(
            x_host.data.len() == y_host.data.len(),
            "polyfit: X and Y vectors must match in length"
        );
        let weights_host = match weights {
            Some(handle) => Some(self.download_exec(handle).await?),
            None => None,
        };
        let weights_slice = weights_host.as_ref().map(|w| w.data.as_slice());
        let host_result =
            polyfit_host_real_for_provider(&x_host.data, &y_host.data, degree, weights_slice)
                .map_err(|err| anyhow!(err))?;
        Ok(ProviderPolyfitResult {
            coefficients: host_result.coefficients,
            r_matrix: host_result.r_matrix,
            normr: host_result.normr,
            df: host_result.df,
            mu: host_result.mu,
        })
    }

    pub(crate) fn polyval_exec(
        &self,
        coeffs: &GpuTensorHandle,
        points: &GpuTensorHandle,
        options: &ProviderPolyvalOptions,
    ) -> Result<GpuTensorHandle> {
        let coeff_entry = self.get_entry(coeffs)?;
        let points_entry = self.get_entry(points)?;

        ensure!(
            coeff_entry.precision == self.precision && points_entry.precision == self.precision,
            "polyval: precision mismatch between tensors and provider"
        );
        ensure!(
            coeff_entry.len > 0,
            "polyval: coefficient vector must contain at least one element"
        );

        let len = points_entry.len;
        let shape = points_entry.shape.clone();
        if len == 0 {
            let out_buffer = self.create_storage_buffer(0, "runmat-polyval-out");
            return Ok(self.register_existing_buffer(out_buffer, shape, 0));
        }

        ensure!(
            len <= u32::MAX as usize,
            "polyval: evaluation tensor exceeds GPU limits"
        );
        ensure!(
            coeff_entry.len <= u32::MAX as usize,
            "polyval: coefficient vector exceeds GPU limits"
        );

        {
            let mut enc =
                self.device_ref()
                    .create_command_encoder(&wgpu::CommandEncoderDescriptor {
                        label: Some("runmat-polyval-warmup"),
                    });
            let mut pass = enc.begin_compute_pass(&wgpu::ComputePassDescriptor {
                label: Some("runmat-polyval-warmup-pass"),
                timestamp_writes: None,
            });
            pass.set_pipeline(&self.pipelines.polyval.pipeline);
            drop(pass);
            self.submit(enc);
        }

        self.device_ref().poll(wgpu::Maintain::Poll);
        {
            let enc = self
                .device_ref()
                .create_command_encoder(&wgpu::CommandEncoderDescriptor {
                    label: Some("runmat-polyval-gap"),
                });
            self.submit(enc);
        }

        let out_buffer = self.create_storage_buffer_checked(len, "runmat-polyval-out")?;
        let chunk_capacity = (crate::backend::wgpu::config::MAX_DISPATCH_WORKGROUPS as usize)
            * crate::backend::wgpu::config::WORKGROUP_SIZE as usize;
        let (mu_mean, mu_scale) = options.mu.map(|m| (m.mean, m.scale)).unwrap_or((0.0, 1.0));
        let has_mu = options.mu.is_some() as u32;

        let mut offset = 0usize;
        while offset < len {
            let remaining = len - offset;
            let chunk_len = remaining.min(chunk_capacity).max(1);
            let params_buffer = match self.precision {
                NumericPrecision::F64 => {
                    let params = crate::backend::wgpu::params::PolyvalParamsF64 {
                        len: chunk_len as u32,
                        coeff_len: coeff_entry.len as u32,
                        offset: offset as u32,
                        has_mu,
                        mu_mean,
                        mu_scale,
                    };
                    self.uniform_buffer(&params, "runmat-polyval-params-f64")
                }
                NumericPrecision::F32 => {
                    let params = crate::backend::wgpu::params::PolyvalParamsF32 {
                        len: chunk_len as u32,
                        coeff_len: coeff_entry.len as u32,
                        offset: offset as u32,
                        has_mu,
                        mu_mean: mu_mean as f32,
                        mu_scale: mu_scale as f32,
                        _pad0: 0,
                        _pad1: 0,
                    };
                    self.uniform_buffer(&params, "runmat-polyval-params-f32")
                }
            };

            let bind_group = self
                .device_ref()
                .create_bind_group(&wgpu::BindGroupDescriptor {
                    label: Some("runmat-polyval-bind"),
                    layout: &self.pipelines.polyval.layout,
                    entries: &[
                        wgpu::BindGroupEntry {
                            binding: 0,
                            resource: coeff_entry.buffer.as_ref().as_entire_binding(),
                        },
                        wgpu::BindGroupEntry {
                            binding: 1,
                            resource: points_entry.buffer.as_ref().as_entire_binding(),
                        },
                        wgpu::BindGroupEntry {
                            binding: 2,
                            resource: out_buffer.as_ref().as_entire_binding(),
                        },
                        wgpu::BindGroupEntry {
                            binding: 3,
                            resource: params_buffer.as_entire_binding(),
                        },
                    ],
                });

            let workgroups = crate::backend::wgpu::dispatch::common::dispatch_size(
                chunk_len as u32,
                crate::backend::wgpu::config::WORKGROUP_SIZE,
            );

            crate::backend::wgpu::dispatch::elementwise::run(
                self.device_ref(),
                self.queue_ref(),
                &self.pipelines.polyval.pipeline,
                &bind_group,
                workgroups,
            );

            offset += chunk_len;
        }

        Ok(self.register_existing_buffer(out_buffer, shape, len))
    }

    pub(crate) fn polyint_exec(
        &self,
        polynomial: &GpuTensorHandle,
        constant: f64,
    ) -> Result<GpuTensorHandle> {
        let entry = self.get_entry(polynomial)?;
        ensure!(
            entry.precision == self.precision,
            "polyint: precision mismatch between tensor and provider"
        );
        let orientation = polynomial_orientation(&entry.shape)?;

        if entry.len == 0 {
            let shape = shape_for_orientation(orientation, 1);
            let buffer = match self.precision {
                NumericPrecision::F64 => Arc::new(self.device.create_buffer_init(
                    &wgpu::util::BufferInitDescriptor {
                        label: Some("runmat-polyint-const-f64"),
                        contents: cast_slice(&[constant]),
                        usage: wgpu::BufferUsages::STORAGE
                            | wgpu::BufferUsages::COPY_DST
                            | wgpu::BufferUsages::COPY_SRC,
                    },
                )),
                NumericPrecision::F32 => {
                    let value = constant as f32;
                    Arc::new(
                        self.device
                            .create_buffer_init(&wgpu::util::BufferInitDescriptor {
                                label: Some("runmat-polyint-const-f32"),
                                contents: cast_slice(&[value]),
                                usage: wgpu::BufferUsages::STORAGE
                                    | wgpu::BufferUsages::COPY_DST
                                    | wgpu::BufferUsages::COPY_SRC,
                            }),
                    )
                }
            };
            return Ok(self.register_existing_buffer(buffer, shape, 1));
        }

        ensure!(
            entry.len <= u32::MAX as usize,
            "polyint: polynomial length exceeds GPU limits"
        );

        let output_len = entry.len + 1;
        let out_buffer = self.create_storage_buffer_checked(output_len, "runmat-polyint-out")?;
        let params_buffer = match self.precision {
            NumericPrecision::F64 => {
                let params = PolyintParamsF64 {
                    input_len: entry.len as u32,
                    output_len: output_len as u32,
                    constant,
                };
                self.uniform_buffer(&params, "runmat-polyint-params-f64")
            }
            NumericPrecision::F32 => {
                let params = PolyintParamsF32 {
                    input_len: entry.len as u32,
                    output_len: output_len as u32,
                    constant: constant as f32,
                    _pad0: 0.0,
                };
                self.uniform_buffer(&params, "runmat-polyint-params-f32")
            }
        };

        let bind_group = self.device.create_bind_group(&wgpu::BindGroupDescriptor {
            label: Some("runmat-polyint-bind"),
            layout: &self.pipelines.polyint.layout,
            entries: &[
                wgpu::BindGroupEntry {
                    binding: 0,
                    resource: entry.buffer.as_ref().as_entire_binding(),
                },
                wgpu::BindGroupEntry {
                    binding: 1,
                    resource: out_buffer.as_ref().as_entire_binding(),
                },
                wgpu::BindGroupEntry {
                    binding: 2,
                    resource: params_buffer.as_entire_binding(),
                },
            ],
        });

        let workgroups = crate::backend::wgpu::dispatch::common::dispatch_size(
            output_len as u32,
            crate::backend::wgpu::config::WORKGROUP_SIZE,
        );
        let mut encoder = self
            .device
            .create_command_encoder(&wgpu::CommandEncoderDescriptor {
                label: Some("runmat-polyint-encoder"),
            });
        {
            let mut pass = encoder.begin_compute_pass(&wgpu::ComputePassDescriptor {
                label: Some("runmat-polyint-pass"),
                timestamp_writes: None,
            });
            pass.set_pipeline(&self.pipelines.polyint.pipeline);
            pass.set_bind_group(0, &bind_group, &[]);
            pass.dispatch_workgroups(workgroups, 1, 1);
        }
        self.submit(encoder);

        let out_shape = shape_for_orientation(orientation, output_len);
        Ok(self.register_existing_buffer(out_buffer, out_shape, output_len))
    }
    pub(crate) async fn polyder_exec(
        &self,
        polynomial: &GpuTensorHandle,
    ) -> Result<GpuTensorHandle> {
        let entry = self.get_entry(polynomial)?;
        ensure!(
            entry.precision == self.precision,
            "polyder: precision mismatch between tensor and provider"
        );
        let orientation = polynomial_orientation(&entry.shape)?;
        if entry.len <= 1 {
            let shape = shape_for_orientation(orientation, 1);
            let buffer = match self.precision {
                NumericPrecision::F64 => Arc::new(self.device.create_buffer_init(
                    &wgpu::util::BufferInitDescriptor {
                        label: Some("runmat-polyder-const-f64"),
                        contents: cast_slice(&[0.0f64]),
                        usage: wgpu::BufferUsages::STORAGE
                            | wgpu::BufferUsages::COPY_DST
                            | wgpu::BufferUsages::COPY_SRC,
                    },
                )),
                NumericPrecision::F32 => {
                    let zeros: [f32; 1] = [0.0];
                    Arc::new(
                        self.device
                            .create_buffer_init(&wgpu::util::BufferInitDescriptor {
                                label: Some("runmat-polyder-const-f32"),
                                contents: cast_slice(&zeros),
                                usage: wgpu::BufferUsages::STORAGE
                                    | wgpu::BufferUsages::COPY_DST
                                    | wgpu::BufferUsages::COPY_SRC,
                            }),
                    )
                }
            };
            return Ok(self.register_existing_buffer(buffer, shape, 1));
        }

        ensure!(
            entry.len <= u32::MAX as usize,
            "polyder: polynomial length exceeds GPU limits"
        );
        let output_len = entry.len - 1;
        let out_buffer = self.create_storage_buffer_checked(output_len, "runmat-polyder-out")?;
        let params = PolyderParams {
            input_len: entry.len as u32,
            output_len: output_len as u32,
            _pad0: 0,
            _pad1: 0,
        };
        let params_buffer = self.uniform_buffer(&params, "runmat-polyder-params");
        let bind_group = self.device.create_bind_group(&wgpu::BindGroupDescriptor {
            label: Some("runmat-polyder-bind"),
            layout: &self.pipelines.polyder.layout,
            entries: &[
                wgpu::BindGroupEntry {
                    binding: 0,
                    resource: entry.buffer.as_ref().as_entire_binding(),
                },
                wgpu::BindGroupEntry {
                    binding: 1,
                    resource: out_buffer.as_ref().as_entire_binding(),
                },
                wgpu::BindGroupEntry {
                    binding: 2,
                    resource: params_buffer.as_entire_binding(),
                },
            ],
        });
        let groups = crate::backend::wgpu::dispatch::common::dispatch_size(
            params.output_len,
            crate::backend::wgpu::config::WORKGROUP_SIZE,
        );
        let mut encoder = self
            .device
            .create_command_encoder(&wgpu::CommandEncoderDescriptor {
                label: Some("runmat-polyder-encoder"),
            });
        {
            let mut pass = encoder.begin_compute_pass(&wgpu::ComputePassDescriptor {
                label: Some("runmat-polyder-pass"),
                timestamp_writes: None,
            });
            pass.set_pipeline(&self.pipelines.polyder.pipeline);
            pass.set_bind_group(0, &bind_group, &[]);
            pass.dispatch_workgroups(groups, 1, 1);
        }
        self.submit(encoder);

        let out_shape = shape_for_orientation(orientation, output_len);
        let handle = self.register_existing_buffer(out_buffer, out_shape, output_len);
        self.trim_polynomial_handle(handle, orientation).await
    }

    pub(crate) async fn polyder_product_exec(
        &self,
        p: &GpuTensorHandle,
        q: &GpuTensorHandle,
    ) -> Result<GpuTensorHandle> {
        let p_entry = self.get_entry(p)?;
        let q_entry = self.get_entry(q)?;
        ensure!(
            p_entry.precision == self.precision && q_entry.precision == self.precision,
            "polyder: precision mismatch between tensors and provider"
        );
        let orientation = polynomial_orientation(&p_entry.shape)?;
        let conv_orientation = conv_orientation_for(orientation);

        let dp = self.polyder_exec(p).await?;
        let dq = self.polyder_exec(q).await?;
        let options = ProviderConv1dOptions {
            mode: ProviderConvMode::Full,
            orientation: conv_orientation,
        };
        let term1 = self.conv1d_exec(&dp, q, options)?;
        let term2 = self.conv1d_exec(p, &dq, options)?;
        let result = self.binary_op_exec(
            crate::backend::wgpu::types::BinaryOpCode::Add,
            &term1,
            &term2,
        )?;
        self.free_exec(&dp).ok();
        self.free_exec(&dq).ok();
        self.free_exec(&term1).ok();
        self.free_exec(&term2).ok();
        self.trim_polynomial_handle(result, orientation).await
    }

    pub(crate) async fn polyder_quotient_exec(
        &self,
        u: &GpuTensorHandle,
        v: &GpuTensorHandle,
    ) -> Result<ProviderPolyderQuotient> {
        let u_entry = self.get_entry(u)?;
        let v_entry = self.get_entry(v)?;
        ensure!(
            u_entry.precision == self.precision && v_entry.precision == self.precision,
            "polyder: precision mismatch between tensors and provider"
        );
        let orientation_u = polynomial_orientation(&u_entry.shape)?;
        let orientation_v = polynomial_orientation(&v_entry.shape)?;
        let options_num = ProviderConv1dOptions {
            mode: ProviderConvMode::Full,
            orientation: conv_orientation_for(orientation_u),
        };
        let options_den = ProviderConv1dOptions {
            mode: ProviderConvMode::Full,
            orientation: conv_orientation_for(orientation_v),
        };

        let du = self.polyder_exec(u).await?;
        let dv = self.polyder_exec(v).await?;
        let term1 = self.conv1d_exec(&du, v, options_num)?;
        let term2 = self.conv1d_exec(u, &dv, options_num)?;
        let numerator_handle = self.binary_op_exec(
            crate::backend::wgpu::types::BinaryOpCode::Sub,
            &term1,
            &term2,
        )?;
        let denominator_handle = self.conv1d_exec(v, v, options_den)?;
        self.free_exec(&du).ok();
        self.free_exec(&dv).ok();
        self.free_exec(&term1).ok();
        self.free_exec(&term2).ok();

        let numerator = self
            .trim_polynomial_handle(numerator_handle, orientation_u)
            .await?;
        let denominator = self
            .trim_polynomial_handle(denominator_handle, orientation_v)
            .await?;
        Ok(ProviderPolyderQuotient {
            numerator,
            denominator,
        })
    }

    async fn trim_polynomial_handle(
        &self,
        handle: GpuTensorHandle,
        orientation: PolynomialOrientation,
    ) -> Result<GpuTensorHandle> {
        let host = self.download_exec(&handle).await?;
        let trimmed = trim_leading_zeros_real(&host.data);
        if trimmed.len() == host.data.len() {
            return Ok(handle);
        }
        let shape_vec = shape_for_orientation(orientation, trimmed.len());
        let new_handle = if trimmed.is_empty() {
            self.register_existing_buffer(
                self.create_storage_buffer(0, "runmat-polyder-trim-empty"),
                shape_vec,
                0,
            )
        } else {
            match self.precision {
                NumericPrecision::F64 => {
                    let buffer = Arc::new(self.device.create_buffer_init(
                        &wgpu::util::BufferInitDescriptor {
                            label: Some("runmat-polyder-trim-f64"),
                            contents: cast_slice(trimmed.as_slice()),
                            usage: wgpu::BufferUsages::STORAGE
                                | wgpu::BufferUsages::COPY_DST
                                | wgpu::BufferUsages::COPY_SRC,
                        },
                    ));
                    self.register_existing_buffer(buffer, shape_vec, trimmed.len())
                }
                NumericPrecision::F32 => {
                    let data_f32: Vec<f32> = trimmed.iter().map(|v| *v as f32).collect();
                    let buffer = Arc::new(self.device.create_buffer_init(
                        &wgpu::util::BufferInitDescriptor {
                            label: Some("runmat-polyder-trim-f32"),
                            contents: cast_slice(&data_f32),
                            usage: wgpu::BufferUsages::STORAGE
                                | wgpu::BufferUsages::COPY_DST
                                | wgpu::BufferUsages::COPY_SRC,
                        },
                    ));
                    self.register_existing_buffer(buffer, shape_vec, trimmed.len())
                }
            }
        };
        self.free_exec(&handle).ok();
        Ok(new_handle)
    }
}