# rage-quant 0.1.0
#
# High-performance quantized GEMV kernels for CPU-only LLM inference. Direct dot product on Q8_0/Q6_K/Q4_K GGUF blocks with AVX2+FMA SIMD — 3.0x decode speedup.
# Documentation
# Package metadata. Key order: name, version, then the remaining keys
# alphabetically, with description kept last.
[package]
name = "rage-quant"
version = "0.1.0"
authors = ["Carlos Enrique Castro Lazaro <the@angriestboy.com>"]
categories = ["science", "algorithms"]
documentation = "https://docs.rs/rage-quant"
edition = "2021"
# NOTE(review): homepage is identical to repository below; confirm whether a
# separate project site exists, otherwise this key is redundant.
homepage = "https://github.com/OnCeUponTry/rage-quant"
keywords = ["llm", "inference", "quantization", "simd", "gguf"]
license = "AGPL-3.0-only"
readme = "README.md"
repository = "https://github.com/OnCeUponTry/rage-quant"
description = "High-performance quantized GEMV kernels for CPU-only LLM inference. Direct dot product on Q8_0/Q6_K/Q4_K GGUF blocks with AVX2+FMA SIMD — 3.0x decode speedup."

# Runtime dependencies, sorted alphabetically.
[dependencies]
# Error handling
anyhow = "1.0"
# Dense GEMM backend (used by gemm_par)
gemm = "0.19"
# f16 support for GGML scale factors
half = "2.4"
# Parallel GEMV/GEMM
rayon = "1.10"

# Development-only dependencies (not pulled in by downstream users of the crate).
[dev-dependencies]
# criterion with its "html_reports" feature enabled.
criterion = { version = "0.5", features = ["html_reports"] }

# Benchmark target named "gemv_benchmark".
# harness = false turns off the built-in libtest harness, so the benchmark
# source must supply its own entry point (presumably criterion's — confirm
# against the bench file).
[[bench]]
name = "gemv_benchmark"
harness = false