tokitai-operator 0.1.0

//! GPU scaffold backend (default-features stub).
//!
//! The `GpuScaffoldBackend` is a CPU-side stub that the
//! `rocm-hip` feature replaces with a real HIP implementation. In
//! the default build, compiling any non-empty plan fails with an
//! `Error::Backend` whose text comes from `GpuUnsupportedReason::message`
//! (for example `gpu_scaffold has no executable kernel for X`), and
//! `execute` always fails. The stub lets the default `cargo test`
//! run without a GPU device.
//!
//! `GpuUnsupportedReport` is the structured reason for each
//! unimplemented op; the support matrix references these reasons.
//!
use crate::backend::hardware::{
    ComputeHardware, DeviceCapabilities, DeviceKind, HardwareTarget, MemorySpace,
};
use crate::backend::memory::{DeviceBuffer, TransferFallbackReason, TransferPlan, TransferStatus};
use crate::backend::{Backend, BackendCapabilities, Executable, ObjectRef};
use crate::planner::{ExecutionPlan, PlanStepKind};
use crate::{Error, Result};

#[cfg(feature = "accelerated-pilot")]
use crate::backend::TensorStore;
#[cfg(feature = "accelerated-pilot")]
use crate::backend::cpu::CpuScalarBackend;
#[cfg(feature = "accelerated-pilot")]
use crate::domain::DomainId;
#[cfg(feature = "accelerated-pilot")]
use crate::ir::SemanticGraph;
#[cfg(feature = "accelerated-pilot")]
use crate::object::Tensor;
#[cfg(feature = "accelerated-pilot")]
use crate::op::{LoweringCapability, LoweringEvidenceKind, LoweringRule, OperatorRegistry};
#[cfg(feature = "accelerated-pilot")]
use crate::planner::HeuristicPlanner;

/// Zero-sized handle for the scaffold GPU backend (`"gpu_scaffold"`).
///
/// The `Backend` implementation in this file never runs a kernel: `compile`
/// rejects every non-empty plan and `execute` always returns a backend error.
#[derive(Debug, Clone, Copy, Default)]
pub struct GpuScaffoldBackend;

/// Zero-sized handle for the dense i64 `add` pilot backend
/// (`"gpu_dense_i64_pilot"`).
///
/// Only compiled when the `accelerated-pilot` feature is enabled.
#[cfg(feature = "accelerated-pilot")]
#[derive(Debug, Clone, Copy, Default)]
pub struct GpuDenseI64PilotBackend;

/// Identifier string handed to `LoweringRule::new` as its first argument by
/// `GpuDenseI64PilotBackend::lowering_rule`.
#[cfg(feature = "accelerated-pilot")]
pub const GPU_DENSE_I64_PILOT_LOWERING_ID: &str = "gpu_pilot.add.dense_i64";

/// Synchronization model recorded in a [`GpuTransferLifecycle`].
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum GpuSynchronizationModel {
    /// The model used by both execution contracts built in this file.
    HostSynchronous,
    /// Not constructed anywhere in this file; presumably `stream` names the
    /// device stream that work is ordered on — TODO confirm with the real
    /// backend that uses it.
    StreamSynchronized { stream: String },
}

/// Flags describing which memory and verification steps a backend's
/// execution contract includes.
///
/// Both contracts built in this file set the three memory flags to `false`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct GpuTransferLifecycle {
    /// Whether the backend allocates device memory.
    pub allocates_device_memory: bool,
    /// Whether the backend copies inputs from host to device.
    pub host_to_device_copy: bool,
    /// Whether the backend copies results from device back to host.
    pub device_to_host_copy: bool,
    /// How work is synchronized.
    pub synchronization: GpuSynchronizationModel,
    /// Whether outputs are compared against a CPU oracle (`true` for the
    /// dense i64 pilot contract, `false` for the scaffold contract).
    pub cpu_oracle_verification: bool,
}

/// One kernel listed in a [`GpuExecutionContract`].
///
/// Every field is a free-form string; the example values below are the ones
/// the dense i64 pilot contract in this file uses.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct GpuKernelRegistryEntry {
    /// Operator name, e.g. `"add"`.
    pub op_name: String,
    /// Kernel symbol, e.g. `"host_vector_dense_i64_add"`.
    pub kernel_symbol: String,
    /// Scalar element type, e.g. `"i64"`.
    pub scalar_type: String,
    /// Mathematical domain the kernel supports, e.g. `"integer"`.
    pub supported_domain: String,
    /// Representation id the kernel supports (the pilot uses the id of
    /// `Representation::dense_cpu()`).
    pub supported_representation: String,
    /// Tag identifying the kernel source, e.g. `"host-vector-rust"`.
    pub source_fingerprint: String,
}

/// Declarative description of what a GPU-flavoured backend does and, just as
/// importantly, what it does not claim to do.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct GpuExecutionContract {
    /// Backend name, e.g. `"gpu_scaffold"`.
    pub backend: String,
    /// Hardware target the backend advertises.
    pub target: HardwareTarget,
    /// Human-readable scope statement.
    pub scope: String,
    /// Whether work runs on a real device (`false` for both contracts built
    /// in this file).
    pub real_device_execution: bool,
    /// Memory-transfer, synchronization and verification flags.
    pub lifecycle: GpuTransferLifecycle,
    /// Kernels the backend provides (empty for the scaffold).
    pub kernels: Vec<GpuKernelRegistryEntry>,
    /// Human-readable statements supporting the contract.
    pub evidence: Vec<String>,
    /// Human-readable statements of what the backend explicitly does not claim.
    pub non_claims: Vec<String>,
}

impl GpuExecutionContract {
    /// Contract for the scaffold backend: no kernels, no device memory, no
    /// transfers, and no CPU-oracle verification.
    pub fn scaffold_fallback() -> Self {
        let lifecycle = GpuTransferLifecycle {
            allocates_device_memory: false,
            host_to_device_copy: false,
            device_to_host_copy: false,
            synchronization: GpuSynchronizationModel::HostSynchronous,
            cpu_oracle_verification: false,
        };
        let evidence = [
            "gpu_scaffold intentionally has no executable kernels",
            "all executable work must fall back to cpu_scalar",
        ];
        let non_claims = ["not real GPU execution", "not generic GPU support"];

        Self {
            backend: String::from("gpu_scaffold"),
            target: GpuScaffoldBackend::target(),
            scope: String::from("fallback-only GPU planning scaffold"),
            real_device_execution: false,
            lifecycle,
            kernels: Vec::new(),
            evidence: evidence.iter().map(|line| line.to_string()).collect(),
            non_claims: non_claims.iter().map(|line| line.to_string()).collect(),
        }
    }

    /// Contract for the feature-gated dense i64 `add` pilot: a single
    /// host-vector kernel, still no device memory or transfers, but with
    /// CPU-oracle verification switched on.
    #[cfg(feature = "accelerated-pilot")]
    pub fn dense_i64_host_vector_pilot() -> Self {
        let host_vector_add = GpuKernelRegistryEntry {
            op_name: String::from("add"),
            kernel_symbol: String::from("host_vector_dense_i64_add"),
            scalar_type: String::from("i64"),
            supported_domain: String::from("integer"),
            supported_representation: crate::object::Representation::dense_cpu().id().0,
            source_fingerprint: String::from("host-vector-rust"),
        };
        let lifecycle = GpuTransferLifecycle {
            allocates_device_memory: false,
            host_to_device_copy: false,
            device_to_host_copy: false,
            synchronization: GpuSynchronizationModel::HostSynchronous,
            cpu_oracle_verification: true,
        };
        let evidence = [
            "selected through public prefer-gpu planning only under accelerated-pilot",
            "outputs must match CpuScalarBackend exactly",
        ];
        let non_claims = [
            "not real device allocation",
            "not production GPU acceleration",
            "not generic GPU support",
        ];

        Self {
            backend: String::from("gpu_dense_i64_pilot"),
            target: GpuDenseI64PilotBackend::target(),
            scope: String::from("feature-gated dense i64 add host-vector pilot"),
            real_device_execution: false,
            lifecycle,
            kernels: vec![host_vector_add],
            evidence: evidence.iter().map(|line| line.to_string()).collect(),
            non_claims: non_claims.iter().map(|line| line.to_string()).collect(),
        }
    }
}

/// Structured explanation of why a GPU backend refused a plan or a transfer.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct GpuUnsupportedReport {
    /// Name of the backend that refused the work.
    pub backend: String,
    /// Classified reason for the refusal.
    pub reason: GpuUnsupportedReason,
    /// Transfer-planner reason; only set by reports built from a rejected
    /// transfer, `None` otherwise.
    pub transfer_reason: Option<TransferFallbackReason>,
    /// Backend to fall back to (both constructors in this file use
    /// `"cpu_scalar"`).
    pub fallback_backend: String,
    /// Human-readable evidence lines; the last one is the reason's message.
    pub evidence: Vec<String>,
}

/// Classified reason a GPU backend cannot take a plan step or a transfer.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum GpuUnsupportedReason {
    /// The step's mathematical domain is not one the backend handles.
    NonstandardDomain { domain: String },
    /// The step needs finite-site sheaf locality.
    SheafLocality { op_name: String },
    /// The step needs fixed-precision p-adic execution.
    PadicPrecision { domain: String },
    /// A device transfer was rejected; `message` is the full, ready-made text.
    DeviceTransfer { message: String },
    /// No executable kernel exists for the operator.
    NoKernel { op_name: String },
}

impl GpuUnsupportedReason {
    /// Renders the reason as a human-readable sentence.
    ///
    /// Every variant except `DeviceTransfer` produces text that names
    /// `gpu_scaffold`; `DeviceTransfer` returns its stored message verbatim.
    pub fn message(&self) -> String {
        match self {
            Self::DeviceTransfer { message: text } => text.to_owned(),
            Self::NoKernel { op_name: op } => {
                format!("gpu_scaffold has no executable kernel for {op}")
            }
            Self::SheafLocality { op_name: op } => {
                format!("gpu_scaffold does not support finite-site sheaf locality for {op}")
            }
            Self::PadicPrecision { domain: field } => {
                format!(
                    "gpu_scaffold does not support fixed-precision p-adic execution for {field}"
                )
            }
            Self::NonstandardDomain { domain: field } => {
                format!("gpu_scaffold does not support mathematical domain {field}")
            }
        }
    }
}

impl GpuUnsupportedReport {
    /// Builds a report attributed to `gpu_scaffold` that falls back to
    /// `cpu_scalar`. The evidence is a fixed scope note followed by the
    /// reason's message.
    fn new(reason: GpuUnsupportedReason, transfer_reason: Option<TransferFallbackReason>) -> Self {
        let scope_note =
            "P183 GPU support is scaffold/fallback only; no optimized kernels are claimed";
        let evidence = vec![scope_note.to_owned(), reason.message()];
        Self {
            backend: "gpu_scaffold".to_owned(),
            reason,
            transfer_reason,
            fallback_backend: "cpu_scalar".to_owned(),
            evidence,
        }
    }

    /// Builds a report attributed to `backend` (no transfer reason) that falls
    /// back to `cpu_scalar`.
    ///
    /// NOTE(review): the second evidence line comes from
    /// `GpuUnsupportedReason::message`, whose text names `gpu_scaffold` for
    /// every variant except `DeviceTransfer`, regardless of `backend` —
    /// confirm that this is intended.
    pub fn new_for_real_backend(backend: &str, reason: GpuUnsupportedReason) -> Self {
        let rejection = format!("{backend} rejected unsupported public GPU plan");
        let evidence = vec![rejection, reason.message()];
        Self {
            backend: backend.to_owned(),
            reason,
            transfer_reason: None,
            fallback_backend: "cpu_scalar".to_owned(),
            evidence,
        }
    }
}

impl GpuScaffoldBackend {
    /// Hardware target descriptor advertised by the scaffold backend.
    pub fn target() -> HardwareTarget {
        HardwareTarget {
            id: "gpu_scaffold".to_string(),
            kind: DeviceKind::Gpu,
            memory_space: MemorySpace::Device,
        }
    }

    /// Capabilities advertised by the scaffold backend: neither exact nor
    /// deterministic, with the scaffold-only and unsupported-domain
    /// degradations spelled out.
    pub fn capabilities() -> BackendCapabilities {
        BackendCapabilities {
            name: "gpu_scaffold".to_string(),
            exact: false,
            deterministic: false,
            supported_representations: vec![crate::object::Representation::dense_cpu().id().0],
            supported_domains: vec!["integer".to_string()],
            semantic_degradations: vec![
                "scaffold_only:no_kernel_execution".to_string(),
                "unsupported:padic:fixed_precision".to_string(),
                "unsupported:sheaf:finite_site".to_string(),
            ],
        }
    }

    /// Explains why the scaffold cannot run `plan`.
    ///
    /// The scaffold has no kernels, so any plan with at least one step is
    /// rejected. Only the first step is inspected, and it alone decides the
    /// reported reason. Checks run in this order: p-adic domain, sheaf
    /// locality, any other domain that is not `integer`/`unknown`, and finally
    /// the generic "no kernel" reason.
    ///
    /// Returns `None` only when the plan has no steps.
    pub fn unsupported_plan_report(&self, plan: &ExecutionPlan) -> Option<GpuUnsupportedReport> {
        // Every step is unsupported, so the first one already settles the
        // answer; there is nothing to iterate over.
        let step = plan.steps.first()?;

        let reason = if step.domain.starts_with("Q_") || step.domain.contains("padic") {
            GpuUnsupportedReason::PadicPrecision {
                domain: step.domain.clone(),
            }
        } else if matches!(step.kind, PlanStepKind::CoverGlueCheck { .. })
            || step.domain.starts_with("cover:")
        {
            GpuUnsupportedReason::SheafLocality {
                op_name: step.op_name.clone(),
            }
        } else if step.domain != "integer" && step.domain != "unknown" {
            GpuUnsupportedReason::NonstandardDomain {
                domain: step.domain.clone(),
            }
        } else {
            GpuUnsupportedReason::NoKernel {
                op_name: step.op_name.clone(),
            }
        };

        Some(GpuUnsupportedReport::new(reason, None))
    }

    /// Plans a transfer from `source` to `destination` and, if the transfer
    /// planner marks it unsupported, wraps the planner's reason in a report.
    ///
    /// Returns `None` when the transfer is supported or is a no-op.
    pub fn unsupported_transfer_report(
        &self,
        source: DeviceBuffer,
        destination: DeviceBuffer,
    ) -> Option<GpuUnsupportedReport> {
        let transfer = TransferPlan::plan(source, destination);
        match transfer.status {
            TransferStatus::Supported | TransferStatus::NoOp => None,
            TransferStatus::Unsupported(reason) => {
                // Format first: the message borrows `reason` before it is
                // moved into the report.
                let message = format!("gpu_scaffold rejected transfer: {reason:?}");
                Some(GpuUnsupportedReport::new(
                    GpuUnsupportedReason::DeviceTransfer { message },
                    Some(reason),
                ))
            }
        }
    }
}

impl Backend for GpuScaffoldBackend {
    fn name(&self) -> &'static str {
        "gpu_scaffold"
    }

    fn capabilities(&self) -> BackendCapabilities {
        Self::capabilities()
    }

    fn compile(&self, plan: &ExecutionPlan) -> Result<Executable> {
        if let Some(report) = self.unsupported_plan_report(plan) {
            return Err(Error::backend(report.reason.message()));
        }
        Ok(Executable {
            backend: self.name().to_string(),
        })
    }

    fn execute(&self, _executable: &Executable, _args: &[ObjectRef]) -> Result<()> {
        Err(Error::backend(
            "gpu_scaffold has no runtime execution kernels; use CPU fallback",
        ))
    }
}

impl ComputeHardware for GpuScaffoldBackend {
    fn target(&self) -> HardwareTarget {
        Self::target()
    }

    fn device_capabilities(&self) -> DeviceCapabilities {
        DeviceCapabilities::from_backend(Self::target(), Self::capabilities())
    }
}

/// Outcome of one `execute_i64_add_with_cpu_oracle` run.
#[cfg(feature = "accelerated-pilot")]
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct GpuDenseI64PilotReport {
    /// Name of the backend that produced the report.
    pub backend: String,
    /// Operator that was executed (the pilot always reports `"add"`).
    pub op_name: String,
    /// Output ids written by the pilot, in plan-step order.
    pub checked_outputs: Vec<usize>,
    /// Whether the compared outputs equalled the CPU oracle's outputs.
    pub cpu_oracle_matches: bool,
    /// Runtime measurement; the pilot in this file always leaves it `None`.
    pub preliminary_runtime_ns: Option<u64>,
    /// Human-readable evidence lines describing the run.
    pub evidence: Vec<String>,
}

#[cfg(feature = "accelerated-pilot")]
impl GpuDenseI64PilotBackend {
    /// Hardware target descriptor advertised by the pilot backend.
    pub fn target() -> HardwareTarget {
        HardwareTarget {
            id: "gpu_dense_i64_pilot".to_string(),
            kind: DeviceKind::Gpu,
            memory_space: MemorySpace::Device,
        }
    }

    /// Capabilities advertised by the pilot backend: exact and deterministic,
    /// restricted to the integer domain, with the pilot's limitations listed
    /// as semantic degradations.
    pub fn capabilities() -> BackendCapabilities {
        BackendCapabilities {
            name: "gpu_dense_i64_pilot".to_string(),
            exact: true,
            deterministic: true,
            supported_representations: vec![crate::object::Representation::dense_cpu().id().0],
            supported_domains: vec!["integer".to_string()],
            semantic_degradations: vec![
                "pilot:feature_gated_dense_i64_add_only".to_string(),
                "pilot:host_vector_kernel_no_device_allocator".to_string(),
                "unsupported:padic:fixed_precision".to_string(),
                "unsupported:sheaf:finite_site".to_string(),
            ],
        }
    }

    /// Lowering rule that maps `add` on the dense CPU representation in the
    /// integer domain onto this backend, together with its evidence and
    /// obligations.
    pub fn lowering_rule() -> LoweringRule {
        LoweringRule::new(
            GPU_DENSE_I64_PILOT_LOWERING_ID,
            "add",
            "gpu_dense_i64_pilot",
            vec![crate::object::Representation::dense_cpu().id().0],
        )
        .with_supported_domain("integer")
        .with_capability(LoweringCapability::dense_integer())
        .with_required_evidence(
            LoweringEvidenceKind::ExactnessPreserved,
            "feature-gated dense i64 pilot preserves integer addition after CPU oracle comparison",
        )
        .with_obligation(
            "inputs must be dense i64 tensors with identical shape",
            "the pilot executes host-vector elementwise addition and checks CpuScalarBackend output",
        )
        .with_obligation(
            "performance claims remain disabled for the host-vector pilot",
            "the accelerated-pilot path has no device allocator, stream, or external GPU dependency",
        )
    }

    /// Runs every step of `plan` as an elementwise i64 add and compares the
    /// results with a `CpuScalarBackend` run of the same graph.
    ///
    /// The oracle runs first, on a clone of `store`, using a plan produced by
    /// `HeuristicPlanner` for the CPU scalar capabilities. The pilot then
    /// writes its own outputs into `store`. An oracle mismatch is reported
    /// through `cpu_oracle_matches == false`, not as an `Err`.
    ///
    /// # Errors
    ///
    /// Returns a backend error when the plan is not a supported add-only plan,
    /// when the oracle run fails, when a step names an unknown node, when a
    /// node has fewer than two inputs or no output, when a tensor is missing
    /// from a store, or when the addition itself is rejected. `store` may
    /// already hold the outputs of earlier steps when an error is returned.
    pub fn execute_i64_add_with_cpu_oracle(
        &self,
        graph: &SemanticGraph,
        plan: &ExecutionPlan,
        store: &mut TensorStore<i64>,
    ) -> Result<GpuDenseI64PilotReport> {
        self.ensure_supported_i64_add_plan(plan)?;

        // Build the reference results before the caller's store is touched.
        let mut oracle_store = store.clone();
        let cpu_plan = HeuristicPlanner::new(BackendCapabilities::cpu_scalar()).plan(graph)?;
        CpuScalarBackend.execute_i64(graph, &cpu_plan, &mut oracle_store)?;

        let mut checked_outputs = Vec::with_capacity(plan.steps.len());
        for step in &plan.steps {
            let node = graph
                .nodes()
                .get(step.node_id)
                .ok_or_else(|| Error::backend(format!("unknown node {}", step.node_id)))?;

            // The plan check does not cover graph arity, so look the operands
            // up with checked access instead of panicking on a malformed node.
            let (lhs_id, rhs_id, output_id) = match (
                node.inputs.first(),
                node.inputs.get(1),
                node.output_ids.first(),
            ) {
                (Some(&lhs_id), Some(&rhs_id), Some(&output_id)) => (lhs_id, rhs_id, output_id),
                _ => {
                    return Err(Error::backend(format!(
                        "gpu_dense_i64_pilot add node {} needs two inputs and one output",
                        step.node_id
                    )))
                }
            };

            // Both inputs are only read; their borrows end before the insert,
            // so no tensor copies are needed.
            let output = dense_i64_add_tensor(store.get(lhs_id)?, store.get(rhs_id)?)?;
            store.insert(output_id, output);
            checked_outputs.push(output_id);
        }

        // Stop at the first mismatch, as soon as the verdict is known.
        let mut cpu_oracle_matches = true;
        for &output_id in &checked_outputs {
            if store.get(output_id)? != oracle_store.get(output_id)? {
                cpu_oracle_matches = false;
                break;
            }
        }

        let evidence = if cpu_oracle_matches {
            vec![
                "P187 accelerated pilot is feature-gated and scoped to dense i64 add".to_string(),
                "candidate output matched CpuScalarBackend oracle exactly".to_string(),
                "performance is preliminary: no device allocator, stream, or external GPU dependency is used"
                    .to_string(),
            ]
        } else {
            vec!["P187 dense i64 pilot executed but failed CPU oracle comparison".to_string()]
        };

        Ok(GpuDenseI64PilotReport {
            backend: self.name().to_string(),
            op_name: "add".to_string(),
            checked_outputs,
            cpu_oracle_matches,
            preliminary_runtime_ns: None,
            evidence,
        })
    }

    /// Checks that `plan` targets this backend, is non-empty, and consists
    /// solely of single `add` steps in the `integer` domain.
    ///
    /// P-adic and sheaf steps are rejected first, with their own messages.
    fn ensure_supported_i64_add_plan(&self, plan: &ExecutionPlan) -> Result<()> {
        let backend_name = self.name();
        if plan.backend != backend_name {
            return Err(Error::backend(format!(
                "plan targets backend {}, but executor is {}",
                plan.backend, backend_name
            )));
        }
        if plan.steps.is_empty() {
            return Err(Error::backend(
                "gpu_dense_i64_pilot requires at least one dense i64 add step",
            ));
        }

        for step in &plan.steps {
            if step.domain.starts_with("Q_") || step.domain.contains("padic") {
                return Err(Error::backend(format!(
                    "gpu_dense_i64_pilot does not support fixed-precision p-adic execution for {}",
                    step.domain
                )));
            }
            if matches!(step.kind, PlanStepKind::CoverGlueCheck { .. })
                || step.domain.starts_with("cover:")
            {
                return Err(Error::backend(format!(
                    "gpu_dense_i64_pilot does not support finite-site sheaf locality for {}",
                    step.op_name
                )));
            }

            let is_single_integer_add = matches!(step.kind, PlanStepKind::Single)
                && step.op_name == "add"
                && step.domain == "integer";
            if !is_single_integer_add {
                return Err(Error::backend(format!(
                    "gpu_dense_i64_pilot only supports single dense integer add steps, got op={} domain={}",
                    step.op_name, step.domain
                )));
            }
        }
        Ok(())
    }
}

/// Registers the pilot's lowering rule with `registry`, forwarding whatever
/// result the registry returns.
#[cfg(feature = "accelerated-pilot")]
pub fn register_gpu_dense_i64_pilot_lowering(registry: &mut OperatorRegistry) -> Result<()> {
    let pilot_rule = GpuDenseI64PilotBackend::lowering_rule();
    registry.register_lowering(pilot_rule)
}

#[cfg(feature = "accelerated-pilot")]
impl Backend for GpuDenseI64PilotBackend {
    /// Stable backend name.
    fn name(&self) -> &'static str {
        "gpu_dense_i64_pilot"
    }

    /// Same value as the inherent `GpuDenseI64PilotBackend::capabilities()`.
    fn capabilities(&self) -> BackendCapabilities {
        GpuDenseI64PilotBackend::capabilities()
    }

    /// Validates that `plan` is a supported add-only plan, then returns an
    /// `Executable` tagged with this backend's name.
    fn compile(&self, plan: &ExecutionPlan) -> Result<Executable> {
        self.ensure_supported_i64_add_plan(plan)
            .map(|()| Executable {
                backend: self.name().to_string(),
            })
    }

    /// Always fails: callers must go through `execute_i64_add_with_cpu_oracle`
    /// so that the CPU-oracle comparison is never skipped.
    fn execute(&self, _executable: &Executable, _args: &[ObjectRef]) -> Result<()> {
        let reason =
            "gpu_dense_i64_pilot requires execute_i64_add_with_cpu_oracle for semantic guardrails";
        Err(Error::backend(reason))
    }
}

#[cfg(feature = "accelerated-pilot")]
impl ComputeHardware for GpuDenseI64PilotBackend {
    /// Same value as the inherent `GpuDenseI64PilotBackend::target()`.
    fn target(&self) -> HardwareTarget {
        GpuDenseI64PilotBackend::target()
    }

    /// Device capabilities derived from the pilot's target and backend
    /// capabilities.
    fn device_capabilities(&self) -> DeviceCapabilities {
        let target = GpuDenseI64PilotBackend::target();
        let backend_capabilities = GpuDenseI64PilotBackend::capabilities();
        DeviceCapabilities::from_backend(target, backend_capabilities)
    }
}

/// Adds two integer-domain i64 tensors element by element.
///
/// The result reuses `lhs`'s metadata.
///
/// # Errors
///
/// Returns a backend error when either tensor is not in the `integer` domain,
/// when the shapes or data lengths differ, or when any element sum does not
/// fit in `i64`. The overflow case is an explicit error so that the result is
/// the same in debug and release builds and never silently wraps.
#[cfg(feature = "accelerated-pilot")]
fn dense_i64_add_tensor(lhs: &Tensor<i64>, rhs: &Tensor<i64>) -> Result<Tensor<i64>> {
    let integer = DomainId::new("integer");
    if lhs.meta.domain != integer || rhs.meta.domain != integer {
        return Err(Error::backend(
            "gpu_dense_i64_pilot only supports integer-domain tensors",
        ));
    }
    if lhs.meta.shape != rhs.meta.shape {
        return Err(Error::backend(format!(
            "gpu_dense_i64_pilot add shape mismatch: lhs={:?}, rhs={:?}",
            lhs.meta.shape, rhs.meta.shape
        )));
    }
    if lhs.data.len() != rhs.data.len() {
        return Err(Error::backend(format!(
            "gpu_dense_i64_pilot add length mismatch: lhs={}, rhs={}",
            lhs.data.len(),
            rhs.data.len()
        )));
    }

    Ok(Tensor {
        meta: lhs.meta.clone(),
        // Collecting into `Result` stops at the first overflowing element.
        data: lhs
            .data
            .iter()
            .zip(rhs.data.iter())
            .enumerate()
            .map(|(index, (&left, &right))| {
                left.checked_add(right).ok_or_else(|| {
                    Error::backend(format!(
                        "gpu_dense_i64_pilot add overflows i64 at element {index}: {left} + {right}"
                    ))
                })
            })
            .collect::<Result<_>>()?,
    })
}