pub struct QuantizeParams {
pub nthread: i32,
pub ftype: LlamaFtype,
pub output_tensor_type: Option<GgmlType>,
pub token_embedding_type: Option<GgmlType>,
pub allow_requantize: bool,
pub quantize_output_tensor: bool,
pub only_copy: bool,
pub pure: bool,
pub keep_split: bool,
pub dry_run: bool,
/* private fields */
}
Parameters for quantizing a model.
Create with QuantizeParams::new and chain with_* builder methods to
configure, then pass a reference to crate::model_quantize.
Example
use llama_cpp_4::quantize::{GgmlType, LlamaFtype, QuantizeParams, TensorTypeOverride};
let ov = TensorTypeOverride::new("output", GgmlType::F16).unwrap();
let params = QuantizeParams::new(LlamaFtype::MostlyQ4KM)
.with_nthread(8)
.with_allow_requantize(false)
.with_quantize_output_tensor(true)
.with_pure(false)
.with_tensor_type_override(ov);
llama_cpp_4::model_quantize("in.gguf", "out.gguf", &params).unwrap();

Fields
nthread: i32 — Number of threads (0 = auto-detect).
ftype: LlamaFtype — Target quantization type.
output_tensor_type: Option<GgmlType> — Force this storage type for the output/lm-head tensor (None = use ftype default).
token_embedding_type: Option<GgmlType> — Force this storage type for the token-embedding tensor (None = use ftype default).
allow_requantize: bool — Allow re-quantizing tensors that are already quantized.
quantize_output_tensor: bool — Quantize the output/lm-head weight tensor.
only_copy: bool — Copy all tensors without quantizing (ignores ftype).
pure: bool — Quantize every tensor to the same type (no mixed k-quant strategy).
keep_split: bool — Keep the same number of shards as the input (for split models).
dry_run: bool — Estimate output size without writing anything to disk.
Implementations

impl QuantizeParams
pub fn new(ftype: LlamaFtype) -> Self
Create a new params set targeting ftype.
All other options are set to the same defaults as
llama_model_quantize_default_params().
pub fn with_nthread(self, n: i32) -> Self
Set the number of quantization threads (0 = auto).
pub fn with_output_tensor_type(self, ty: GgmlType) -> Self
Override the output-tensor storage type.
pub fn with_token_embedding_type(self, ty: GgmlType) -> Self
Override the token-embedding storage type.
pub fn with_allow_requantize(self, v: bool) -> Self
Allow (or disallow) re-quantizing already-quantized tensors.
pub fn with_quantize_output_tensor(self, v: bool) -> Self
Quantize the output/lm-head weight (true by default).
pub fn with_only_copy(self, v: bool) -> Self
When true, only copy tensors verbatim (no quantization at all).
pub fn with_pure(self, v: bool) -> Self
When true, quantize all tensors to the same type (no mixed k-quant strategy).
pub fn with_keep_split(self, v: bool) -> Self
Preserve the number of shards when quantizing a split model.
pub fn with_dry_run(self, v: bool) -> Self
Only estimate the output size; do not write anything to disk.
pub fn with_imatrix(self, imatrix: Imatrix) -> Self
Supply importance matrix data to improve quantization quality.
The imatrix is generated by the imatrix tool (or the imatrix example
in this crate) and contains per-tensor activation statistics collected
from a calibration dataset.
pub fn with_imatrix_entry(self, entry: ImatrixEntry) -> Self
Append a single imatrix entry.
pub fn with_kv_override(self, kv: KvOverride) -> Self
Add (or replace) a GGUF metadata key-value pair in the output file.
pub fn with_tensor_type_override(self, ov: TensorTypeOverride) -> Self
Override the quantization type for tensors whose name matches pattern.
Can be called multiple times; overrides are applied in order.
pub fn with_pruned_layer(self, layer: i32) -> Self
Mark a layer index for pruning (removal) from the output model.
pub fn with_pruned_layers(self, layers: impl IntoIterator<Item = i32>) -> Self
Mark multiple layer indices for pruning.
Trait Implementations

impl Clone for QuantizeParams

fn clone(&self) -> QuantizeParams
Returns a duplicate of the value.
fn clone_from(&mut self, source: &Self)
Performs copy-assignment from `source`.