#![allow(clippy::manual_div_ceil)]
#![allow(clippy::manual_is_multiple_of)]
#![allow(clippy::needless_range_loop)]
#![allow(clippy::empty_line_after_doc_comments)]
#![allow(clippy::similar_names)]
#![allow(clippy::many_single_char_names)]
#![allow(clippy::too_many_arguments)]
#![allow(clippy::type_complexity)]
#![allow(clippy::macro_metavars_in_unsafe)]
#![allow(clippy::missing_panics_doc)]
#![allow(clippy::missing_errors_doc)]
#![allow(clippy::missing_safety_doc)]
#![allow(clippy::excessive_precision)]
#![allow(clippy::unnecessary_cast)]
#![allow(clippy::cast_possible_truncation)]
#![allow(clippy::cast_sign_loss)]
#![allow(clippy::cast_precision_loss)]
#![allow(clippy::large_stack_arrays)]
#![cfg_attr(test, allow(clippy::disallowed_methods, clippy::float_cmp))]
#[macro_use]
#[allow(unused_macros)]
mod generated_contracts;
pub mod activations;
pub mod backends;
pub mod blis;
pub mod brick;
pub mod chaos;
pub mod contracts;
pub mod eigen;
pub mod error;
pub mod hardware;
pub mod hash;
pub mod matrix;
pub mod monitor;
pub mod simulation;
pub mod tiling;
pub mod tuner;
pub mod vector;
pub use activations::{
f16_to_f32, f32_to_f16, gelu_scalar, relu_scalar, sigmoid_scalar, silu_scalar, tanh_scalar,
};
pub use eigen::SymmetricEigen;
pub use error::{Result, TruenoError};
pub use hash::{hash_bytes, hash_key, hash_keys_batch, hash_keys_batch_with_backend};
pub use matrix::Matrix;
pub use monitor::{
cuda_monitor_available, GpuBackend, GpuClockMetrics, GpuDeviceInfo, GpuMemoryMetrics,
GpuMetrics, GpuMonitor, GpuPcieMetrics, GpuPowerMetrics, GpuThermalMetrics, GpuUtilization,
GpuVendor, MonitorConfig, MonitorError,
};
#[cfg(feature = "cuda-monitor")]
pub use monitor::{enumerate_cuda_devices, query_cuda_device_info, query_cuda_memory};
pub use vector::Vector;
pub use brick::{
fnv1a_f32_checksum,
AddOp,
AssertionResult,
AttentionOp,
BlockQ5K,
BlockQ6K,
BrickBottleneck,
BrickCategory,
BrickError,
BrickId,
BrickIdTimer,
BrickLayer,
BrickProfiler,
BrickSample,
BrickStats,
BrickTimer,
BrickVerification,
ByteBudget,
CategoryStats,
ComputeAssertion,
ComputeBackend,
ComputeBrick,
ComputeOp,
DivergenceInfo,
DotOp,
DotQ5KOp,
DotQ6KOp,
EdgeType,
ExecutionEdge,
ExecutionGraph,
ExecutionNode,
ExecutionNodeId,
FusedGateUpOp,
FusedGateUpWeights,
FusedQKVOp,
FusedQKVWeights,
KernelChecksum,
MatmulOp,
PtxRegistry,
SoftmaxOp,
SyncMode,
TileLevel,
TileStats,
TileTimer,
TokenBudget,
TokenResult,
};
pub use hardware::{
default_hardware_path, Bottleneck, CpuCapability, GpuBackend as HardwareGpuBackend,
GpuCapability, HardwareCapability, RooflineParams, SimdWidth,
};
pub use tuner::{
BottleneckClass, BottleneckPrediction, BrickTuner, ConceptDriftStatus, ExperimentSuggestion,
FeatureExtractor, KernelClassifier, KernelRecommendation, KernelType, QuantType, RunConfig,
ThroughputPrediction, ThroughputRegressor, TrainingSample, TrainingStats, TunerDataCollector,
TunerError, TunerFeatures, TunerRecommendation, UserFeedback,
};
pub use tiling::{
optimal_prefetch_distance, pack_a_index, pack_b_index, swizzle_index, PackingLayout,
PrefetchLocality, TcbGeometry, TcbIndexCalculator, TcbLevel, TiledQ4KMatvec, TilingBackend,
TilingConfig, TilingError, TilingStats, Q4K_SUPERBLOCK_BYTES, Q4K_SUPERBLOCK_SIZE,
};
/// Execution backend that an operation can be dispatched to.
///
/// The detection helpers in this module return one of the CPU variants
/// depending on the target architecture and the CPU features found.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum Backend {
    /// Plain scalar code; returned by the detection helpers when no SIMD
    /// feature is found or the architecture is not recognised.
    Scalar,
    /// x86 / x86_64: selected when the `sse2` CPU feature is detected.
    SSE2,
    /// x86 / x86_64: selected when the `avx` CPU feature is detected.
    AVX,
    /// x86 / x86_64: selected when both `avx2` and `fma` are detected.
    AVX2,
    /// x86 / x86_64: selected only by [`select_backend_for_operation`] for
    /// compute-bound work when `avx512f` is detected.
    AVX512,
    /// ARM / AArch64: selected when the build enables the `neon` target feature.
    NEON,
    /// wasm32: selected when the build enables the `simd128` target feature.
    WasmSIMD,
    /// GPU execution. NOTE(review): never returned by the detection helpers
    /// in this file; presumably requested explicitly by callers — confirm.
    GPU,
    /// NOTE(review): presumably asks the library to pick a backend
    /// automatically; the resolution logic is not in this file — confirm.
    Auto,
}
impl Backend {
    /// Returns the best backend for the machine the program is running on.
    ///
    /// This is a convenience wrapper that forwards to
    /// [`select_best_available_backend`] and returns its result unchanged.
    pub fn select_best() -> Self {
        select_best_available_backend()
    }
}
/// Coarse complexity class of an operation.
///
/// The variants are totally ordered (`Low < Medium < High`) through the
/// derived `Ord`, and carry explicit discriminants `0`, `1`, `2`.
///
/// NOTE(review): how callers map operations onto these classes is not
/// visible in this file.
#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)]
pub enum OpComplexity {
    /// Lowest complexity class (discriminant `0`).
    Low = 0,
    /// Middle complexity class (discriminant `1`).
    Medium = 1,
    /// Highest complexity class (discriminant `2`).
    High = 2,
}
/// Performance character of an operation, used as a hint by
/// [`select_backend_for_operation`].
///
/// On x86 only [`OperationType::ComputeBound`] allows AVX-512 to be chosen;
/// the other two variants are treated identically there.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum OperationType {
    /// NOTE(review): presumably an operation limited by memory bandwidth.
    MemoryBound,
    /// The only variant for which the x86 selector considers AVX-512.
    ComputeBound,
    /// NOTE(review): presumably neither clearly memory- nor compute-bound.
    Mixed,
}
/// Picks the widest general-purpose SIMD backend the running x86 CPU supports.
///
/// Features are probed at runtime in descending order of preference:
/// AVX2 (only together with FMA), then AVX, then SSE2. If none is present
/// the scalar backend is returned. AVX-512 is never returned from here.
#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
fn detect_x86_backend() -> Backend {
    // AVX2 is only chosen when FMA is available as well.
    let has_avx2_with_fma = is_x86_feature_detected!("avx2") && is_x86_feature_detected!("fma");

    if has_avx2_with_fma {
        Backend::AVX2
    } else if is_x86_feature_detected!("avx") {
        Backend::AVX
    } else if is_x86_feature_detected!("sse2") {
        Backend::SSE2
    } else {
        Backend::Scalar
    }
}
/// Chooses the backend for ARM / AArch64 targets.
///
/// The decision is made at compile time: NEON is returned when the build
/// enables the `neon` target feature, otherwise the scalar backend.
/// No runtime CPU probing happens here.
#[cfg(any(target_arch = "aarch64", target_arch = "arm"))]
fn detect_arm_backend() -> Backend {
    // `cfg!` expands to a boolean literal, so the branch is resolved at compile time.
    if cfg!(target_feature = "neon") {
        Backend::NEON
    } else {
        Backend::Scalar
    }
}
/// Chooses the backend for `wasm32` targets.
///
/// The decision is made at compile time: the WebAssembly SIMD backend is
/// returned when the build enables the `simd128` target feature, otherwise
/// the scalar backend.
#[cfg(target_arch = "wasm32")]
fn detect_wasm_backend() -> Backend {
    // `cfg!` expands to a boolean literal, so the branch is resolved at compile time.
    if cfg!(target_feature = "simd128") {
        Backend::WasmSIMD
    } else {
        Backend::Scalar
    }
}
/// Returns the best backend for the current platform, detecting it only once.
///
/// The first call runs the architecture-specific detection routine
/// (x86, ARM, or wasm32; any other architecture yields [`Backend::Scalar`])
/// and stores the answer in a function-local `OnceLock`. Every later call
/// returns the stored value without detecting again.
pub fn select_best_available_backend() -> Backend {
    use std::sync::OnceLock;

    // Exactly one `probe` is compiled in, depending on the target architecture.
    #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
    fn probe() -> Backend {
        detect_x86_backend()
    }

    #[cfg(any(target_arch = "aarch64", target_arch = "arm"))]
    fn probe() -> Backend {
        detect_arm_backend()
    }

    #[cfg(target_arch = "wasm32")]
    fn probe() -> Backend {
        detect_wasm_backend()
    }

    #[cfg(not(any(
        target_arch = "x86_64",
        target_arch = "x86",
        target_arch = "aarch64",
        target_arch = "arm",
        target_arch = "wasm32"
    )))]
    fn probe() -> Backend {
        Backend::Scalar
    }

    static CACHED_CHOICE: OnceLock<Backend> = OnceLock::new();

    *CACHED_CHOICE.get_or_init(probe)
}
/// Chooses a backend for an operation with the given performance character.
///
/// On x86 / x86_64 the choice depends on `op_type` (see
/// `select_x86_backend_for_operation`). On ARM and wasm32 the hint is
/// ignored and the architecture's detection routine is used. On any other
/// architecture the scalar backend is returned.
///
/// Unlike [`select_best_available_backend`], the result is not cached by
/// this function.
pub fn select_backend_for_operation(op_type: OperationType) -> Backend {
    // Exactly one `dispatch` is compiled in, depending on the target
    // architecture; only the x86 variant reads the operation hint.
    #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
    fn dispatch(kind: OperationType) -> Backend {
        select_x86_backend_for_operation(kind)
    }

    #[cfg(any(target_arch = "aarch64", target_arch = "arm"))]
    fn dispatch(_kind: OperationType) -> Backend {
        detect_arm_backend()
    }

    #[cfg(target_arch = "wasm32")]
    fn dispatch(_kind: OperationType) -> Backend {
        detect_wasm_backend()
    }

    #[cfg(not(any(
        target_arch = "x86_64",
        target_arch = "x86",
        target_arch = "aarch64",
        target_arch = "arm",
        target_arch = "wasm32"
    )))]
    fn dispatch(_kind: OperationType) -> Backend {
        Backend::Scalar
    }

    dispatch(op_type)
}
/// Picks an x86 backend, taking the operation's performance character into account.
///
/// AVX-512 is returned only when `op_type` is
/// [`OperationType::ComputeBound`] and the CPU reports `avx512f`. In every
/// other case the usual preference order applies: AVX2 (only together with
/// FMA), then AVX, then SSE2, and finally the scalar backend.
///
/// NOTE(review): presumably AVX-512 is withheld from memory-bound and mixed
/// work because it does not pay off there — confirm against benchmarks.
#[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
fn select_x86_backend_for_operation(op_type: OperationType) -> Backend {
    use std::arch::is_x86_feature_detected;

    // The operation hint is checked first, so `avx512f` is only probed for
    // compute-bound requests.
    if matches!(op_type, OperationType::ComputeBound) && is_x86_feature_detected!("avx512f") {
        Backend::AVX512
    } else if is_x86_feature_detected!("avx2") && is_x86_feature_detected!("fma") {
        Backend::AVX2
    } else if is_x86_feature_detected!("avx") {
        Backend::AVX
    } else if is_x86_feature_detected!("sse2") {
        Backend::SSE2
    } else {
        Backend::Scalar
    }
}
#[cfg(test)]
mod contract_tests;
#[cfg(test)]
mod contract_tests_image;
#[cfg(test)]
mod contract_tests_linalg;
#[cfg(test)]
mod tests;