use anyhow::{bail, Result};
use candle_core::Device;
use std::path::{Path, PathBuf};
use crate::device::{
fits_in_memory, fmt_gb, qwen3_vram_threshold, should_use_gpu, t5_vram_threshold,
QWEN3_FP16_VRAM_THRESHOLD, T5_VRAM_THRESHOLD,
};
use crate::progress::ProgressReporter;
/// Selects which Qwen3 text-encoder family `resolve_qwen3_variant` works with.
///
/// The size determines the variant table, the lookup function, the BF16 VRAM
/// threshold and the GGUF cache sub-directory used during resolution.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub(crate) enum Qwen3Size {
/// Reported as "Qwen3-4B" in messages; GGUF files are cached under `shared/qwen3-gguf`.
B4,
/// Reported as "Qwen3-8B" in messages; GGUF files are cached under `shared/qwen3-8b-gguf`.
B8,
}
pub(crate) fn resolve_t5_variant(
progress: &ProgressReporter,
preference: Option<&str>,
gpu_device: &Device,
free_vram: u64,
default_t5_path: &Path,
) -> Result<(PathBuf, bool, String)> {
use mold_core::download::{cached_file_path, download_single_file_sync};
use mold_core::manifest::{find_t5_variant, known_t5_variants, T5_FP16_SIZE};
let is_cuda = gpu_device.is_cuda();
let is_metal = gpu_device.is_metal();
match preference {
Some(tag) if tag != "fp16" && tag != "auto" => {
let variant = find_t5_variant(tag).ok_or_else(|| {
anyhow::anyhow!(
"unknown T5 variant '{}'. Valid: fp16, auto, q8, q6, q5, q4, q3",
tag,
)
})?;
let path = resolve_t5_gguf_path(progress, variant)?;
let threshold = t5_vram_threshold(variant.size_bytes);
let on_gpu = should_use_gpu(is_cuda, is_metal, free_vram, threshold);
let label = if on_gpu {
"GPU, quantized"
} else {
"CPU, quantized"
};
progress.info(&format!(
"Using T5 {} ({}) on {} (explicit)",
variant.tag,
fmt_gb(variant.size_bytes),
if on_gpu { "GPU" } else { "CPU" },
));
Ok((path, on_gpu, label.to_string()))
}
Some("fp16") => {
let on_gpu = should_use_gpu(is_cuda, is_metal, free_vram, T5_VRAM_THRESHOLD);
let label = if on_gpu { "GPU" } else { "CPU" };
progress.info(&format!("Using FP16 T5 on {} (explicit)", label));
Ok((default_t5_path.to_path_buf(), on_gpu, label.to_string()))
}
_ => {
if fits_in_memory(is_cuda, is_metal, free_vram, T5_VRAM_THRESHOLD) {
if is_metal {
progress.info("Loading FP16 T5 on GPU (unified memory)");
} else {
progress.info(&format!(
"Loading FP16 T5 on GPU ({} free > {} threshold)",
fmt_gb(free_vram),
fmt_gb(T5_VRAM_THRESHOLD),
));
}
return Ok((default_t5_path.to_path_buf(), true, "GPU".to_string()));
}
if is_cuda || is_metal {
for variant in known_t5_variants() {
let threshold = t5_vram_threshold(variant.size_bytes);
if fits_in_memory(is_cuda, is_metal, free_vram, threshold) {
let path = match cached_file_path(
variant.hf_repo,
variant.hf_filename,
Some("shared/t5-gguf"),
) {
Some(p) => p,
None => {
progress.info(&format!(
"Downloading T5 {} ({})...",
variant.tag,
fmt_gb(variant.size_bytes),
));
tracing::info!(
variant = variant.tag,
repo = variant.hf_repo,
file = variant.hf_filename,
"downloading quantized T5 encoder"
);
download_single_file_sync(
variant.hf_repo,
variant.hf_filename,
Some("shared/t5-gguf"),
)
.map_err(|e| {
anyhow::anyhow!("failed to download T5 {}: {e}", variant.tag)
})?
}
};
progress.info(&format!(
"FP16 T5 ({}) exceeds remaining VRAM ({}). Using quantized T5 {} ({}) on GPU instead.",
fmt_gb(T5_FP16_SIZE),
fmt_gb(free_vram),
variant.tag,
fmt_gb(variant.size_bytes),
));
return Ok((path, true, format!("GPU, quantized {}", variant.tag)));
}
}
}
if is_metal {
let variants = known_t5_variants();
if let Some(smallest) = variants.last() {
let path = resolve_t5_gguf_path(progress, smallest)?;
progress.info(&format!(
"Memory tight — using smallest T5 {} ({}) on GPU to reduce page pressure",
smallest.tag,
fmt_gb(smallest.size_bytes),
));
return Ok((path, true, format!("GPU, quantized {}", smallest.tag)));
}
}
if is_cuda || is_metal {
progress.info(&format!(
"Loading FP16 T5 on CPU ({} free, no variant fits on GPU)",
fmt_gb(free_vram),
));
} else {
progress.info("No GPU detected, loading T5 on CPU");
}
Ok((default_t5_path.to_path_buf(), false, "CPU".to_string()))
}
}
}
/// Returns the local path of a quantized T5 GGUF file, downloading it if it
/// is not already cached.
///
/// The cache is consulted first under the `shared/t5-gguf` sub-directory; on a
/// miss a "Downloading…" notice is sent to `progress` and the file is fetched
/// synchronously.
///
/// # Errors
/// Returns an error naming the variant tag when the download fails.
pub(crate) fn resolve_t5_gguf_path(
    progress: &ProgressReporter,
    variant: &mold_core::manifest::T5Variant,
) -> Result<PathBuf> {
    use mold_core::download::{cached_file_path, download_single_file_sync};

    const CACHE_SUBDIR: &str = "shared/t5-gguf";

    match cached_file_path(variant.hf_repo, variant.hf_filename, Some(CACHE_SUBDIR)) {
        Some(cached) => Ok(cached),
        None => {
            let notice = format!(
                "Downloading T5 {} ({})...",
                variant.tag,
                fmt_gb(variant.size_bytes),
            );
            progress.info(&notice);
            download_single_file_sync(variant.hf_repo, variant.hf_filename, Some(CACHE_SUBDIR))
                .map_err(|e| anyhow::anyhow!("failed to download T5 {}: {e}", variant.tag))
        }
    }
}
/// Resolves which Qwen3 text-encoder weights to load and whether they run on the GPU.
///
/// `preference` is interpreted as:
/// - `Some(tag)` other than `"bf16"` / `"auto"`: an explicit quantized variant,
///   looked up through the size-specific `find_*_variant` function.
/// - `Some("bf16")`: the BF16 shards in `bf16_paths` (requires `have_bf16`).
/// - `None` or `Some("auto")`: automatic selection, whose order depends on
///   `prefer_gguf` (see the inline comments).
///
/// Returns `(paths, is_gguf, on_gpu, label)`:
/// - `paths`: either a single GGUF file or a copy of `bf16_paths`.
/// - `is_gguf`: `true` in every branch that returns a single GGUF path and
///   `false` in every branch that returns `bf16_paths`.
/// - `on_gpu`: whether the encoder was placed on the GPU.
/// - `label`: short human-readable placement description.
///
/// # Errors
/// Fails when an explicit tag is unknown, when BF16 is requested but
/// `have_bf16` is false, when a GGUF download fails, or when automatic
/// selection finds no usable encoder.
// The allow covers the eight-parameter signature and the 4-tuple annotation below.
#[allow(clippy::too_many_arguments, clippy::type_complexity)]
pub(crate) fn resolve_qwen3_variant(
progress: &ProgressReporter,
preference: Option<&str>,
gpu_device: &Device,
free_vram: u64,
bf16_paths: &[PathBuf],
have_bf16: bool,
prefer_gguf: bool,
qwen3_size: Qwen3Size,
) -> Result<(Vec<PathBuf>, bool, bool, String)> {
use mold_core::download::{cached_file_path, download_single_file_sync};
let is_cuda = gpu_device.is_cuda();
let is_metal = gpu_device.is_metal();
// Pick the size-specific variant table, tag lookup, BF16 VRAM threshold and
// GGUF cache sub-directory in one place so the rest of the function is size-agnostic.
let (variants, find_variant, fp16_threshold, cache_subdir): (
&[mold_core::manifest::Qwen3Variant],
fn(&str) -> Option<&'static mold_core::manifest::Qwen3Variant>,
u64,
&str,
) = match qwen3_size {
Qwen3Size::B4 => (
mold_core::manifest::known_qwen3_variants(),
mold_core::manifest::find_qwen3_variant,
QWEN3_FP16_VRAM_THRESHOLD,
"shared/qwen3-gguf",
),
Qwen3Size::B8 => {
// The 8B threshold is computed here as 1.25x the FP16 size, truncated to whole bytes.
let threshold_8b = (mold_core::manifest::QWEN3_8B_FP16_SIZE as f64 * 1.25) as u64;
(
mold_core::manifest::known_qwen3_8b_variants(),
mold_core::manifest::find_qwen3_8b_variant,
threshold_8b,
"shared/qwen3-8b-gguf",
)
}
};
// Name used in every progress and error message below.
let size_label = match qwen3_size {
Qwen3Size::B4 => "Qwen3-4B",
Qwen3Size::B8 => "Qwen3-8B",
};
match preference {
// Explicit quantized variant: resolve (and download if needed) the GGUF file,
// then let `should_use_gpu` decide placement against the variant's threshold.
Some(tag) if tag != "bf16" && tag != "auto" => {
let variant = find_variant(tag).ok_or_else(|| {
anyhow::anyhow!(
"unknown {} variant '{}'. Valid: bf16, auto, q8, q6, iq4, q3",
size_label,
tag,
)
})?;
let path = resolve_qwen3_gguf_path_with_cache(progress, variant, cache_subdir)?;
let threshold = qwen3_vram_threshold(variant.size_bytes);
let on_gpu = should_use_gpu(is_cuda, is_metal, free_vram, threshold);
let label = if on_gpu {
"GPU, quantized"
} else {
"CPU, quantized"
};
progress.info(&format!(
"Using {} {} ({}) on {} (explicit)",
size_label,
variant.tag,
fmt_gb(variant.size_bytes),
if on_gpu { "GPU" } else { "CPU" },
));
Ok((vec![path], true, on_gpu, label.to_string()))
}
// Explicit BF16: only possible when the caller reports the shards as available.
Some("bf16") => {
if !have_bf16 {
bail!(
"BF16 {} encoder requested but shard files are missing or not configured. \
Either run `mold pull` for a model with Qwen3 or use --qwen3-variant q8/q6/iq4/q3.",
size_label,
);
}
let on_gpu = should_use_gpu(is_cuda, is_metal, free_vram, fp16_threshold);
let label = if on_gpu { "GPU" } else { "CPU" };
progress.info(&format!(
"Using BF16 {} on {} (explicit)",
size_label, label
));
Ok((bf16_paths.to_vec(), false, on_gpu, label.to_string()))
}
// Automatic selection (`None` or `Some("auto")`).
_ => {
if prefer_gguf {
// GGUF-first mode: BF16 on the GPU is never considered here.
if is_cuda || is_metal {
// Take the first variant, in table order, that passes the fit check.
for variant in variants {
let threshold = qwen3_vram_threshold(variant.size_bytes);
if fits_in_memory(is_cuda, is_metal, free_vram, threshold) {
let path = match cached_file_path(
variant.hf_repo,
variant.hf_filename,
Some(cache_subdir),
) {
Some(p) => p,
None => {
progress.info(&format!(
"Downloading {} {} ({})...",
size_label,
variant.tag,
fmt_gb(variant.size_bytes),
));
download_single_file_sync(
variant.hf_repo,
variant.hf_filename,
Some(cache_subdir),
)
.map_err(|e| {
anyhow::anyhow!(
"failed to download {} {}: {e}",
size_label,
variant.tag
)
})?
}
};
progress.info(&format!(
"Using quantized {} {} ({}) on GPU",
size_label,
variant.tag,
fmt_gb(variant.size_bytes),
));
return Ok((
vec![path],
true,
true,
format!("GPU, quantized {}", variant.tag),
));
}
}
}
// No GPU, or nothing fit: fall back to BF16 on the CPU when available.
// Note that this mode never downloads a GGUF for CPU use.
if have_bf16 {
progress.info(&format!(
"Loading BF16 {} on CPU (no variant fits on GPU)",
size_label
));
Ok((bf16_paths.to_vec(), false, false, "CPU".to_string()))
} else {
bail!(
"No {} encoder available (no BF16 files and no GGUF cached)",
size_label
)
}
} else {
// BF16-first mode: prefer the full-precision shards on the GPU when they fit.
if have_bf16 && fits_in_memory(is_cuda, is_metal, free_vram, fp16_threshold) {
if is_metal {
progress.info(&format!(
"Loading BF16 {} on GPU (unified memory)",
size_label
));
} else {
progress.info(&format!(
"Loading BF16 {} on GPU ({} free > {} threshold, drop-and-reload)",
size_label,
fmt_gb(free_vram),
fmt_gb(fp16_threshold),
));
}
return Ok((bf16_paths.to_vec(), false, true, "GPU".to_string()));
}
// Quantized variants are tried when a GPU is present, or when there is no
// BF16 to fall back on (the only way a CPU-only host enters this loop).
if is_cuda || is_metal || !have_bf16 {
for variant in variants {
let threshold = qwen3_vram_threshold(variant.size_bytes);
// With no GPU at all, the second operand is true, so the first table
// entry is accepted whatever the fit check returns.
if fits_in_memory(is_cuda, is_metal, free_vram, threshold)
|| (!is_cuda && !is_metal)
{
let path = match cached_file_path(
variant.hf_repo,
variant.hf_filename,
Some(cache_subdir),
) {
Some(p) => p,
None => {
progress.info(&format!(
"Downloading {} {} ({})...",
size_label,
variant.tag,
fmt_gb(variant.size_bytes),
));
tracing::info!(
variant = variant.tag,
repo = variant.hf_repo,
file = variant.hf_filename,
"downloading quantized Qwen3 encoder"
);
download_single_file_sync(
variant.hf_repo,
variant.hf_filename,
Some(cache_subdir),
)
.map_err(|e| {
anyhow::anyhow!(
"failed to download {} {}: {e}",
size_label,
variant.tag
)
})?
}
};
// Placement follows device availability: GPU if one exists, else CPU.
let on_gpu = is_cuda || is_metal;
progress.info(&format!(
"Using {} {} ({}) on {}",
size_label,
variant.tag,
fmt_gb(variant.size_bytes),
if on_gpu { "GPU" } else { "CPU" },
));
return Ok((
vec![path],
true,
on_gpu,
format!(
"{}, quantized {}",
if on_gpu { "GPU" } else { "CPU" },
variant.tag
),
));
}
}
}
// Metal-only fallback: nothing passed the fit check, so use the last table
// entry on the GPU anyway.
// NOTE(review): the message calls it the "smallest" variant, which assumes
// the table is ordered largest to smallest — confirm in mold_core::manifest.
if is_metal {
if let Some(smallest) = variants.last() {
let path =
resolve_qwen3_gguf_path_with_cache(progress, smallest, cache_subdir)?;
progress.info(&format!(
"Memory tight — using smallest {} {} ({}) on GPU to reduce page pressure",
size_label,
smallest.tag,
fmt_gb(smallest.size_bytes),
));
return Ok((
vec![path],
true,
true,
format!("GPU, quantized {}", smallest.tag),
));
}
}
// Last resort: BF16 on the CPU, with a message that depends on whether a GPU exists.
if have_bf16 {
if is_cuda || is_metal {
progress.info(&format!(
"Loading BF16 {} on CPU ({} free, no variant fits on GPU)",
size_label,
fmt_gb(free_vram),
));
} else {
progress.info(&format!("No GPU detected, loading {} on CPU", size_label));
}
return Ok((bf16_paths.to_vec(), false, false, "CPU".to_string()));
}
// Reached only when there is no BF16 and no quantized variant was selected above.
bail!(
"no {} text encoder available: BF16 shards not configured and no \
quantized variant could be resolved. Run `mold pull` for a model with \
Qwen3 or use --qwen3-variant q8/q6/iq4/q3.",
size_label,
);
}
}
}
}
/// Returns the local path of a quantized Qwen3 GGUF file, downloading it into
/// `cache_subdir` if it is not already cached.
///
/// On a cache miss a "Downloading…" notice is sent to `progress` before the
/// synchronous download starts.
///
/// # Errors
/// Returns an error naming the variant tag when the download fails.
fn resolve_qwen3_gguf_path_with_cache(
    progress: &ProgressReporter,
    variant: &mold_core::manifest::Qwen3Variant,
    cache_subdir: &str,
) -> Result<PathBuf> {
    use mold_core::download::{cached_file_path, download_single_file_sync};

    let subdir = Some(cache_subdir);

    match cached_file_path(variant.hf_repo, variant.hf_filename, subdir) {
        Some(cached) => Ok(cached),
        None => {
            let notice = format!(
                "Downloading Qwen3 {} ({})...",
                variant.tag,
                fmt_gb(variant.size_bytes),
            );
            progress.info(&notice);
            download_single_file_sync(variant.hf_repo, variant.hf_filename, subdir)
                .map_err(|e| anyhow::anyhow!("failed to download Qwen3 {}: {e}", variant.tag))
        }
    }
}
/// Returns the local path of a quantized Qwen2.5-VL GGUF file, downloading it
/// if it is not already cached.
///
/// Files are looked up and stored under the `shared/qwen2-vl-gguf` cache
/// sub-directory. On a cache miss a "Downloading…" notice is sent to
/// `progress` before the synchronous download starts.
///
/// # Errors
/// Returns an error naming the variant tag when the download fails.
pub(crate) fn resolve_qwen2_vl_gguf_path(
    progress: &ProgressReporter,
    variant: &mold_core::manifest::Qwen2VlVariant,
) -> Result<PathBuf> {
    use mold_core::download::{cached_file_path, download_single_file_sync};

    const CACHE_SUBDIR: &str = "shared/qwen2-vl-gguf";

    let repo = variant.hf_repo;
    let filename = variant.hf_filename;

    if let Some(cached) = cached_file_path(repo, filename, Some(CACHE_SUBDIR)) {
        Ok(cached)
    } else {
        let notice = format!(
            "Downloading Qwen2.5-VL {} ({})...",
            variant.tag,
            fmt_gb(variant.size_bytes),
        );
        progress.info(&notice);
        download_single_file_sync(repo, filename, Some(CACHE_SUBDIR))
            .map_err(|e| anyhow::anyhow!("failed to download Qwen2.5-VL {}: {e}", variant.tag))
    }
}