# aprender-compute 0.31.2
#
# High-performance SIMD compute library with GPU support, LLM inference engine, and GGUF model loading (was: trueno)
# NOTE: Workspace declaration removed — this crate is now a member of
# the root paiml/aprender workspace (APR-MONO consolidation).
# Sub-crates (aprender-gpu, aprender-quant, etc.) are also root workspace members.
# Old workspace.lints.clippy entries removed — root workspace handles linting.

# Package metadata for aprender-compute (formerly `trueno`).
# Keys written as `<key>.workspace = true` are inherited from the root
# workspace's [workspace.package] table.
[package]
name = "aprender-compute"
version.workspace = true
authors.workspace = true
categories = ["algorithms", "mathematics", "science"]
edition.workspace = true
# `exclude` uses gitignore-style patterns. A bare ".profraw" only matches an
# entry named exactly `.profraw`, so the coverage artifacts need a `*` glob
# (which still matches the bare name as well).
exclude = [
    "target/",
    "*.profraw",
    "*.profdata",
    ".vscode/",
    ".idea/",
    ".pmat",
    "proptest-regressions",
]
keywords = ["simd", "gpu", "wasm", "performance", "vectorization"]
license.workspace = true
readme = "README.md"
repository.workspace = true
# Set per crate rather than inherited from the workspace.
rust-version = "1.89"
description = "High-performance SIMD compute library with GPU support, LLM inference engine, and GGUF model loading (was: trueno)"

# docs.rs build settings: document the crate with every feature enabled and
# ask rustdoc to link identifiers in source views to their definitions.
[package.metadata.docs.rs]
rustdoc-args = ["--generate-link-to-definition"]
all-features = true

# Crate-level rustc lint levels.
[lints.rust]
# Permit `unsafe` code throughout the crate.
# NOTE(review): presumably required by the SIMD/GPU code paths — confirm.
unsafe_code = "allow"


# Runtime dependencies. Entries marked `optional = true` are only compiled
# when a feature in [features] enables them.
[dependencies]
thiserror = "2.0.17"
serde = { version = "1.0", features = ["derive"] }  # Hardware capability serialization (PMAT-447)
serde_json = "1.0"  # ML tuner data serialization (TUNER-010)
# Enabled by the `parallel` feature
rayon = { version = "1.11", optional = true }
# GPU backend (enabled by the `gpu` / `gpu-wasm` features; pollster is `gpu` only)
wgpu = { version = "27.0", optional = true }
pollster = { version = "0.4", optional = true }
bytemuck = { version = "1.24", features = ["derive"], optional = true }
futures-intrusive = { version = "0.5", optional = true }
# WASM async support for GPU (enabled by the `gpu-wasm` feature)
wasm-bindgen-futures = { version = "0.4", optional = true }
wasm-bindgen = { version = "0.2", optional = true }
# Native CUDA monitoring (TRUENO-SPEC-010).
# The package is aprender-gpu; it is imported under its legacy name trueno-gpu.
trueno-gpu = { version = "0.31.2", path = "../aprender-gpu", optional = true, package = "aprender-gpu" }
# PMAT-336: Provable contracts compile-time enforcement
provable-contracts-macros = "0.3"
# P1a: Sovereign GEMM microkernel codegen (compile-time shape specialization)
# (package aprender-gemm-codegen, imported under its legacy name)
trueno-gemm-codegen = { version = "0.31.2", path = "../aprender-gemm-codegen", package = "aprender-gemm-codegen" }
# Tracing/profiling support (renacer integration)
tracing = { version = "0.1", optional = true }
tracing-subscriber = { version = "0.3", features = ["env-filter", "fmt"], optional = true }
tracing-appender = { version = "0.2", optional = true }
dirs = { version = "5.0", optional = true }  # Hardware capability path detection (PMAT-447)
# TUI monitoring (TRUENO-SPEC-020); presentar-terminal is declared at the end of this table
crossterm = { version = "0.28", optional = true }
presentar-core = { version = "0.3", optional = true }
# Stress test reporting (TRUENO-SPEC-025) uses local types compatible with renacer.
# Note: renacer 0.9.1 has a compilation issue, so no renacer dependency is declared here.
# Heap profiling (enabled by the `dhat-heap` feature)
dhat = { version = "0.3", optional = true }
# Package aprender-quant, imported under its legacy name
trueno-quant = { version = "0.31.2", path = "../aprender-quant", package = "aprender-quant" }
num_cpus = "1.17.0"
anyhow = "1.0.100"
# Hardware capability detection (PMAT-447)
chrono = "0.4"
toml = "0.8"
# ML tuner integration (SHOWCASE-BRICK-001, Section 12)
# Uses the aprender-core RandomForest for learned optimization (enabled by the `ml-tuner` feature)
aprender = { path = "../aprender-core", version = "0.31.2", package = "aprender-core", optional = true, default-features = false }
# Execution path graph (PAR-201, Section E.7) (enabled by the `execution-graph` feature)
trueno-graph = { version = "0.1.17", optional = true }
# TUI visualization for execution graphs (PAR-201)
presentar-terminal = { version = "0.3", optional = true }

# Dependencies compiled only for wasm32 targets.
[target.'cfg(target_arch = "wasm32")'.dependencies]
# Browser bindings for the WebGPU path; optional, and switched on by the
# `gpu-wasm` feature.
web-sys = { version = "0.3", features = ["console"], optional = true }

# Dependencies compiled for every target except wasm32.
[target.'cfg(not(target_arch = "wasm32"))'.dependencies]
# Hostname detection only on native (not available on WASM)
hostname = "0.4"

# Dependencies available to the build script only.
# NOTE(review): presumably the build script deserializes YAML inputs with
# serde — confirm against build.rs.
[build-dependencies]
serde = { version = "1", features = ["derive"] }
serde_yaml_ng = "0.10"

# Dependencies used only by tests, benchmarks, and examples.
# `serde_json` is intentionally not repeated here: it is already a
# non-optional entry in [dependencies] with the same version requirement, and
# normal dependencies are available to tests, benches, and examples.
[dev-dependencies]
proptest = "1.9"
criterion = { workspace = true }
tokio = { version = "1", features = ["macros", "rt"] }
nalgebra = "0.34"  # For eigendecomposition benchmark comparison
# Simulation testing framework (TRUENO-SPEC-012)
simular = "0.2.0"
regex = "1.11"  # For PTX/WGSL pattern validation in falsification tests
# GPU edge-case test framework (TCE-001)
trueno-cuda-edge = { version = "0.31.2", path = "../aprender-cuda-edge", package = "aprender-cuda-edge" }
# Sparse and solver crates for contract tests
trueno-sparse = { version = "0.31.2", path = "../aprender-sparse", package = "aprender-sparse" }
trueno-solve = { version = "0.31.2", path = "../aprender-solve", package = "aprender-solve" }
ndarray = "0.17.2"
faer = "0.24"  # For GEMM benchmark comparison (fastest pure-Rust BLAS)
matrixmultiply = "0.3"  # Direct matrixmultiply crate benchmark (engine behind ndarray)

# Cargo features. Nothing is enabled by default; every backend and tool
# integration below is opt-in.
[features]
default = []

# Multi-threaded execution via rayon.
parallel = ["rayon"]

# Native GPU backend (wgpu, with pollster to block on async calls).
gpu = [
    "wgpu",
    "pollster",
    "bytemuck",
    "futures-intrusive",
]
# WebGPU backend for WASM: same GPU stack, but wasm-bindgen-futures takes the
# place of pollster.
gpu-wasm = [
    "wgpu",
    "bytemuck",
    "futures-intrusive",
    "wasm-bindgen-futures",
    "wasm-bindgen",
    "web-sys",
]

# Native CUDA monitoring through trueno-gpu (TRUENO-SPEC-010): accurate device
# info (e.g., "NVIDIA GeForce RTX 4090") and real-time memory metrics.
cuda-monitor = ["trueno-gpu/cuda"]

# Chaos engineering tiers (from renacer v0.4.1); each tier builds on chaos-basic.
chaos-basic = []
chaos-network = ["chaos-basic"]
chaos-byzantine = ["chaos-basic"]
chaos-full = [
    "chaos-network",
    "chaos-byzantine",
]

# Tracing/profiling hooks (renacer integration).
tracing = ["dep:tracing"]

# Hardware capability detection with home-directory lookup (PMAT-447).
hardware-detect = ["dep:dirs"]

# TUI monitoring (TRUENO-SPEC-020), including stress reporting and logging.
tui-monitor = [
    "presentar-core",
    "presentar-terminal",
    "crossterm",
    "trueno-gpu",
    "dep:tracing",
    "tracing-subscriber",
    "tracing-appender",
    "dep:dirs",
]
# CUDA support for the TUI (real hardware monitoring).
cuda = ["trueno-gpu/cuda"]

# ML-based kernel selection and throughput prediction (SHOWCASE-BRICK-001),
# backed by the aprender RandomForest.
ml-tuner = ["aprender"]

# Execution path graph for profiling (PAR-201): tracks PTX→kernel→brick call
# relationships and exports them to CsrGraph.
execution-graph = ["trueno-graph"]

# TUI visualization of execution graphs (PAR-201) using the presentar-terminal
# Tree widget for hierarchical display.
presentar-tui = ["presentar-terminal"]

# Heap profiling via dhat-rs (DHAT viewer: https://nnethercote.github.io/dh_view/dh_view.html)
dhat-heap = ["dep:dhat"]

# NOTE: Cargo reads [profile.*] tables only from the workspace root manifest.
# This crate is a workspace member, so a release profile declared here is
# ignored and makes Cargo warn "profiles for the non root package will be
# ignored". The intended settings are kept below, commented out, as the
# reference for the root Cargo.toml:
#
# [profile.release]
# opt-level = 3
# lto = true
# codegen-units = 1
# debug = true  # Enable debug symbols for profiling (flamegraph, perf)

# Library target. The library crate is named `trueno` (the package's former
# name), which is the name dependents use in `use trueno::...` paths.
[lib]
path = "src/lib.rs"
name = "trueno"

# Benchmark targets. `harness = false` turns off the built-in libtest bench
# harness, so each benchmark must supply its own `main`.
# NOTE(review): presumably provided by criterion (a dev-dependency) — confirm.
[[bench]]
name = "vector_ops"
harness = false

[[bench]]
name = "matrix_ops"
harness = false

# The GPU benchmarks below are built only when the `gpu` feature is enabled.
[[bench]]
name = "gpu_ops"
# Explicit path: this benchmark lives in its own directory.
path = "benches/gpu_ops/main.rs"
harness = false
required-features = ["gpu"]

[[bench]]
name = "gpu_reduction"
harness = false
required-features = ["gpu"]

[[bench]]
name = "async_gpu_ops"
harness = false
required-features = ["gpu"]

[[bench]]
name = "eigen_ops"
harness = false

[[bench]]
name = "tiling_ops"
harness = false

[[bench]]
name = "gemm_comparison"
harness = false

# Example targets that need optional features. Cargo skips an example unless
# every feature listed in its `required-features` is enabled.
[[example]]
name = "gpu_batch_demo"
required-features = ["gpu"]

[[example]]
name = "gpu_tiled_reduction"
required-features = ["gpu"]

[[example]]
name = "wgpu_backward_demo"
required-features = ["gpu"]

[[example]]
name = "coop_gemm_bench"
required-features = ["gpu"]

# Needs both the GPU backend and the TUI monitoring stack.
[[example]]
name = "gpu_monitor_demo"
required-features = ["gpu", "tui-monitor"]

# cargo-release settings for this crate.
[package.metadata.release]
shared-version = true

# At release time, rewrite the "Unreleased" changelog heading so it carries
# the released version and date. `search` is a regex, written as a literal
# string so the bracket escapes need no doubling.
[[package.metadata.release.pre-release-replacements]]
file = "CHANGELOG.md"
replace = "## [{{version}}] - {{date}}"
search = '## \[Unreleased\]'

# [workspace.dependencies] removed — root workspace handles shared deps

# NOTE: Cargo reads [profile.*] tables only from the workspace root manifest.
# This crate is a workspace member, so test/dev profiles declared here are
# ignored and make Cargo warn "profiles for the non root package will be
# ignored". The intended settings are kept below, commented out, as the
# reference for the root Cargo.toml:
#
# [profile.test]
# opt-level = 1              # Slight optimization for faster test execution
# incremental = true         # Reuse previous builds
#
# [profile.dev]
# panic = "abort"