mold-ai-inference 0.13.1

Candle-based inference engine for mold — FLUX, SDXL, SD3.5, Z-Image diffusion models
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
//! Local LLM-powered prompt expansion using quantized Qwen3 GGUF models.
//!
//! Loads a small Qwen3 model (1.7B or 0.6B at Q4), generates expanded prompts,
//! then drops the weights to free VRAM for the diffusion pipeline.

use anyhow::{bail, Result};
use candle_core::{DType, Device, Tensor};
use candle_transformers::models::quantized_qwen3::ModelWeights;
use std::io::Seek;
use std::path::{Path, PathBuf};
use tokenizers::Tokenizer;

use mold_core::expand::{ExpandConfig, ExpandResult, PromptExpander};
use mold_core::expand_prompts::{build_batch_messages, build_single_messages, format_chatml};

use crate::device::{
    discover_gpus, expand_vram_threshold, filter_gpus, memory_status_string,
    preflight_memory_check, select_expand_device_with_preference, ExpandPlacement,
};
use crate::progress::{ProgressCallback, ProgressReporter};
use mold_core::types::GpuSelection;

/// Local prompt expander using quantized Qwen3 GGUF.
///
/// Holds only file paths and device-placement preferences; the model weights
/// and tokenizer are loaded inside each generation call and released before
/// that call returns.
pub struct LocalExpander {
    /// Path to the GGUF model file that is opened and read on every generation.
    model_path: PathBuf,
    /// Path to the tokenizer file handed to `Tokenizer::from_file`.
    tokenizer_path: PathBuf,
    /// Reporter used for device-selection, loading, and generation messages.
    progress: ProgressReporter,
    /// GPUs that expansion is allowed to use; `new` starts with `GpuSelection::All`.
    gpu_selection: GpuSelection,
    /// GPU ordinal passed as the preference during device selection; `None` by default.
    preferred_gpu: Option<usize>,
}

impl LocalExpander {
    /// Build an expander from the GGUF model path and the tokenizer path.
    ///
    /// The expander starts with a default progress reporter, every GPU
    /// allowed, and no preferred GPU ordinal.
    pub fn new(model_path: impl Into<PathBuf>, tokenizer_path: impl Into<PathBuf>) -> Self {
        let model_path: PathBuf = model_path.into();
        let tokenizer_path: PathBuf = tokenizer_path.into();
        Self {
            model_path,
            tokenizer_path,
            progress: ProgressReporter::default(),
            gpu_selection: GpuSelection::All,
            preferred_gpu: None,
        }
    }

    /// Install the callback that receives device-selection, loading, and
    /// generation status reports.
    pub fn set_on_progress(&mut self, callback: ProgressCallback) {
        let reporter = &mut self.progress;
        reporter.set_callback(callback);
    }

    /// Builder-style setter: limit local expansion to the GPU ordinals named
    /// by `gpu_selection` and hand the expander back.
    pub fn with_gpu_selection(self, gpu_selection: GpuSelection) -> Self {
        let mut updated = self;
        updated.gpu_selection = gpu_selection;
        updated
    }

    /// Builder-style setter: record the GPU ordinal to prefer when it is
    /// allowed and has enough free VRAM, and hand the expander back.
    pub fn with_preferred_gpu(self, preferred_gpu: Option<usize>) -> Self {
        let mut updated = self;
        updated.preferred_gpu = preferred_gpu;
        updated
    }

    /// Locate the expand model's files on disk and build an expander from them.
    ///
    /// Looks under the resolved mold models directory for a GGUF file in the
    /// manifest storage directories (e.g., `qwen3-expand-q8/`), then for a
    /// tokenizer — first at the shared location
    /// `shared/qwen3-expand/tokenizer.json`, then in the same model
    /// directories.
    ///
    /// `expand_model` replaces the model spec from config (e.g., the value of
    /// `--expand-model` or `MOLD_EXPAND_MODEL`); with `None`,
    /// `config.expand.model` is used.
    ///
    /// Returns `None` when either the GGUF file or the tokenizer cannot be
    /// found, i.e. the model hasn't been pulled yet.
    pub fn from_config(config: &mold_core::Config, expand_model: Option<&str>) -> Option<Self> {
        let models_dir = config.resolved_models_dir();
        let expand_model = expand_model.unwrap_or(&config.expand.model);

        // The tag is whatever follows the first ':' in the spec
        // ("qwen3-expand:q4" → "q4"); specs without one fall back to "q8".
        let tag = expand_model.split(':').nth(1).unwrap_or("q8");

        // Manifest storage keeps each model under <model-name>/ with the colon
        // turned into a dash ("qwen3-expand-q8/", "qwen3-expand-small-q8/").
        // The variant the caller asked for goes first so that, when both the
        // regular and the small model are installed, the requested one wins.
        let regular_dir = format!("qwen3-expand-{tag}");
        let small_dir = format!("qwen3-expand-small-{tag}");
        let untagged_dir = "qwen3-expand".to_string();
        let candidate_dirs = if expand_model.contains("small") {
            [small_dir, regular_dir, untagged_dir]
        } else {
            [regular_dir, small_dir, untagged_dir]
        };

        // First existing candidate directory that yields a GGUF file.
        let gguf_path = candidate_dirs
            .iter()
            .map(|name| models_dir.join(name))
            .filter(|dir| dir.exists())
            .find_map(|dir| find_gguf_in_dir(&dir, expand_model))?;

        // Tokenizer: the shared copy is preferred; otherwise take the first
        // one found in the candidate directories.
        let shared_tokenizer = models_dir.join("shared/qwen3-expand/tokenizer.json");
        let tokenizer_path = if shared_tokenizer.exists() {
            shared_tokenizer
        } else {
            candidate_dirs
                .iter()
                .find_map(|name| find_tokenizer_in_dir(&models_dir.join(name)))?
        };

        Some(Self::new(gguf_path, tokenizer_path))
    }

    /// Load model, generate text, drop model.
    fn generate_text(&self, prompt_text: &str, config: &ExpandConfig) -> Result<String> {
        // Size the VRAM/RAM budget off the actual GGUF file (weights + 2 GB
        // activations), not a static constant — a 4 GB q8 model needs ~6 GB
        // to fit, not 2 GB.
        let model_size = std::fs::metadata(&self.model_path)
            .map(|m| m.len())
            .unwrap_or(0);
        let threshold = expand_vram_threshold(model_size);

        // Cascade: main GPU → remaining GPUs (ordinal order) → CPU.
        // `discover_gpus()` returns an empty list on CPU-only builds, which
        // lands us directly on CPU.
        let discovered = discover_gpus();
        let gpus = filter_gpus(&discovered, &self.gpu_selection);
        let is_metal = candle_core::utils::metal_is_available();
        let placement =
            select_expand_device_with_preference(&gpus, threshold, is_metal, self.preferred_gpu);

        let device = match placement {
            ExpandPlacement::Gpu(ordinal) => {
                // Metal is always ordinal 0 (single device). CUDA may pick any
                // ordinal; create_device() respects MOLD_DEVICE=cpu override.
                let dev = crate::device::create_device(ordinal, &self.progress)?;
                // Metal and CPU fallback both draw from system RAM — preflight
                // either way. Pure CUDA allocation doesn't touch RAM and is
                // already bounded by the VRAM threshold check above.
                if dev.is_cpu() || dev.is_metal() {
                    // Expand LLMs generate ≤ 512 tokens; the existing
                    // `EXPAND_ACTIVATION_HEADROOM` (2 GB) covers KV cache +
                    // residuals on the worst-case prompt.
                    preflight_memory_check(
                        "Expand LLM",
                        model_size,
                        crate::device::EXPAND_ACTIVATION_HEADROOM,
                    )?;
                }
                dev
            }
            ExpandPlacement::Cpu => {
                if gpus.is_empty() {
                    self.progress
                        .info("Using CPU for prompt expansion (no GPU detected)");
                } else {
                    self.progress.info(&format!(
                        "Using CPU for prompt expansion (needed {:.1} GB, no GPU had room)",
                        threshold as f64 / 1_000_000_000.0,
                    ));
                }
                // Final guard: if system RAM can't hold the model, fail fast
                // with a clear message rather than OOM mid-load.
                preflight_memory_check(
                    "Expand LLM",
                    model_size,
                    crate::device::EXPAND_ACTIVATION_HEADROOM,
                )?;
                Device::Cpu
            }
        };

        let device_label = if device.is_metal() {
            "Metal"
        } else if device.is_cuda() {
            "CUDA"
        } else {
            "CPU"
        };

        // Report memory status
        if let Some(mem_status) = memory_status_string() {
            self.progress.info(&mem_status);
        }

        // Load GGUF model
        let load_start = std::time::Instant::now();
        let stage_name = format!("Loading expand model ({device_label})");
        self.progress.stage_start(&stage_name);

        let mut file = std::fs::File::open(&self.model_path)
            .map_err(|e| anyhow::anyhow!("failed to open expand model: {e}"))?;
        let ct = candle_core::quantized::gguf_file::Content::read(&mut file)
            .map_err(|e| anyhow::anyhow!("failed to read GGUF content: {e}"))?;
        file.seek(std::io::SeekFrom::Start(0))?;
        let mut model = ModelWeights::from_gguf(ct, &mut file, &device)
            .map_err(|e| anyhow::anyhow!("failed to load expand model weights: {e}"))?;

        // Load tokenizer
        let tokenizer = Tokenizer::from_file(&self.tokenizer_path)
            .map_err(|e| anyhow::anyhow!("failed to load expand tokenizer: {e}"))?;

        self.progress.stage_done(&stage_name, load_start.elapsed());

        // Tokenize prompt
        let encoding = tokenizer
            .encode(prompt_text, false)
            .map_err(|e| anyhow::anyhow!("failed to tokenize expand prompt: {e}"))?;
        let input_ids = encoding.get_ids();

        // Generate tokens autoregressively
        let gen_start = std::time::Instant::now();
        let mut all_tokens: Vec<u32> = input_ids.to_vec();
        let mut generated_tokens: Vec<u32> = Vec::new();

        // Get stop tokens
        let eos_token = tokenizer
            .token_to_id("<|im_end|>")
            .or_else(|| tokenizer.token_to_id("<|endoftext|>"));
        let start_think_token = tokenizer.token_to_id("<think>");
        let end_think_token = tokenizer.token_to_id("</think>");

        let max_new_tokens = config.max_tokens as usize;
        let mut in_thinking = false;

        // Process the prompt through the model first
        let mut offset = {
            let input = Tensor::new(input_ids, &device)?.unsqueeze(0)?;
            let _logits = model.forward(&input, 0)?;
            input_ids.len()
        };

        // Generate new tokens one at a time
        let mut last_token = *input_ids.last().unwrap_or(&0);
        for _ in 0..max_new_tokens {
            let input = Tensor::new(&[last_token], &device)?.unsqueeze(0)?;
            let logits = model.forward(&input, offset)?;
            offset += 1;

            // Sample next token
            let next_token = sample_token(&logits, config.temperature, config.top_p)?;

            // Check for stop conditions
            if let Some(eos) = eos_token {
                if next_token == eos {
                    break;
                }
            }

            // Track thinking mode — skip <think>...</think> tokens from output
            if let Some(st) = start_think_token {
                if next_token == st {
                    in_thinking = true;
                }
            }
            if let Some(et) = end_think_token {
                if next_token == et {
                    in_thinking = false;
                    all_tokens.push(next_token);
                    last_token = next_token;
                    continue; // Don't include </think> in generated_tokens
                }
            }

            all_tokens.push(next_token);
            if !in_thinking {
                generated_tokens.push(next_token);
            }
            last_token = next_token;
        }

        // Report generation speed
        let gen_elapsed = gen_start.elapsed().as_secs_f64();
        let tok_per_sec = generated_tokens.len() as f64 / gen_elapsed.max(0.001);
        self.progress.info(&format!(
            "Generated {} tokens ({:.1} tok/s)",
            generated_tokens.len(),
            tok_per_sec
        ));

        // Decode generated tokens
        let output = tokenizer
            .decode(&generated_tokens, true)
            .map_err(|e| anyhow::anyhow!("failed to decode generated tokens: {e}"))?;

        // Clear KV cache and drop model weights to free VRAM.
        // NOTE: We intentionally do NOT call reclaim_gpu_memory() here.
        // That function resets the CUDA primary context, but cudarc caches
        // CudaDevice per ordinal — the next Device::new_cuda(0) would get
        // the stale cached handle and segfault.  Dropping the model frees
        // all tensor allocations; the device handle is cheap and the context
        // stays valid for the diffusion engine that runs next.
        model.clear_kv_cache();
        drop(model);
        let _ = device;

        Ok(output)
    }
}

impl PromptExpander for LocalExpander {
    /// Expand `prompt` with the local model.
    ///
    /// Builds batch messages when more than one variation is requested and
    /// single-prompt messages otherwise, formats them as ChatML, runs
    /// generation, and parses the raw output into the expanded prompt(s).
    ///
    /// # Errors
    ///
    /// Propagates any generation error, and fails when the parsed output
    /// contains no non-empty prompt.
    fn expand(&self, prompt: &str, config: &ExpandConfig) -> Result<ExpandResult> {
        let wants_batch = config.variations > 1;
        let family_override = config.family_overrides.get(&config.model_family);

        let messages = match wants_batch {
            true => build_batch_messages(
                prompt,
                &config.model_family,
                config.variations,
                config.batch_prompt.as_deref(),
                family_override,
            ),
            false => build_single_messages(
                prompt,
                &config.model_family,
                config.system_prompt.as_deref(),
                family_override,
            ),
        };

        let chatml = format_chatml(&messages, config.thinking);
        let raw_output = self.generate_text(&chatml, config)?;

        let expanded = match wants_batch {
            true => mold_core::expand::parse_variations_public(&raw_output, config.variations),
            false => vec![mold_core::expand::clean_expanded_prompt_public(&raw_output)],
        };

        // Reject output with nothing usable in it. `all` is also true for an
        // empty list, so this covers both "no entries" and "only blank entries".
        if expanded.iter().all(|s| s.is_empty()) {
            bail!(
                "expand model produced empty output. The model may need re-downloading: \
                 mold pull qwen3-expand"
            );
        }

        let original = prompt.to_string();
        Ok(ExpandResult { original, expanded })
    }
}

/// Pick the next token id from `logits` with temperature scaling and top-p
/// (nucleus) sampling.
///
/// `logits` is squeezed on its first dimension and read as a 1-D `f32`
/// vector whose index is the token id. A `temperature` of zero or below
/// switches to greedy selection of the largest logit. Otherwise the logits
/// are divided by `temperature`, turned into probabilities with a softmax,
/// ranked from most to least likely, cut to the shortest prefix whose
/// cumulative probability reaches `top_p`, and one token is drawn from that
/// prefix in proportion to its probability.
///
/// Returns token id 0 when there are no logits at all.
fn sample_token(logits: &Tensor, temperature: f64, top_p: f64) -> Result<u32> {
    let raw: Vec<f32> = logits.squeeze(0)?.to_dtype(DType::F32)?.to_vec1()?;

    // Greedy path: index of the largest logit.
    if temperature <= 0.0 {
        let best = raw
            .iter()
            .enumerate()
            .max_by(|lhs, rhs| lhs.1.partial_cmp(rhs.1).unwrap_or(std::cmp::Ordering::Equal))
            .map_or(0, |(idx, _)| idx);
        return Ok(best as u32);
    }

    // Temperature scaling followed by a softmax; the maximum is subtracted
    // before exponentiating to keep the exponentials in range.
    let scaled: Vec<f64> = raw.iter().map(|&logit| logit as f64 / temperature).collect();
    let peak = scaled.iter().copied().fold(f64::NEG_INFINITY, f64::max);
    let weights: Vec<f64> = scaled.iter().map(|&s| (s - peak).exp()).collect();
    let norm: f64 = weights.iter().sum();

    // (token id, probability) pairs, most likely first.
    let mut ranked: Vec<(usize, f64)> = weights.iter().map(|&w| w / norm).enumerate().collect();
    ranked.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal));

    // Nucleus: shortest prefix whose cumulative probability reaches `top_p`;
    // if the threshold is never reached, every token stays in play.
    let mut covered = 0.0_f64;
    let nucleus_len = ranked
        .iter()
        .position(|&(_, p)| {
            covered += p;
            covered >= top_p
        })
        .map_or(ranked.len(), |i| i + 1);
    let nucleus = &ranked[..nucleus_len];

    // Draw a point inside the nucleus' total mass instead of re-normalising
    // each probability.
    let nucleus_mass: f64 = nucleus.iter().map(|&(_, p)| p).sum();
    let draw = rand::random::<f64>() * nucleus_mass;

    // Walk the nucleus until the running mass reaches the draw; if that never
    // happens, settle for the last candidate.
    let mut walked = 0.0_f64;
    let chosen = nucleus
        .iter()
        .find(|&&(_, p)| {
            walked += p;
            walked >= draw
        })
        .or_else(|| nucleus.last());

    Ok(chosen.map_or(0, |&(idx, _)| idx as u32))
}

/// Find a GGUF file in `dir`, preferring one whose file name contains the
/// model spec's quantization tag (e.g., "q8" from "qwen3-expand:q8").
///
/// The tag is the text after the first `:` in `model_spec` (up to the next
/// `:`), defaulting to "q8", and is compared case-insensitively. When no file
/// name contains the tag, any `.gguf` file is returned instead. Candidates
/// are sorted by path before choosing, so the result does not depend on the
/// order in which the filesystem happens to list the directory.
///
/// Returns `None` if the directory cannot be read or holds no `.gguf` file.
fn find_gguf_in_dir(dir: &Path, model_spec: &str) -> Option<PathBuf> {
    let mut candidates: Vec<PathBuf> = std::fs::read_dir(dir)
        .ok()?
        .filter_map(|entry| entry.ok())
        .map(|entry| entry.path())
        .filter(|path| path.extension().map_or(false, |ext| ext == "gguf"))
        .collect();

    // Directory iteration order is unspecified; sorting makes both the
    // tag match and the fallback pick reproducible.
    candidates.sort();

    // Extract quantization tag from model spec (e.g., "qwen3-expand:q8" → "Q8")
    let quant_tag = model_spec.split(':').nth(1).unwrap_or("q8").to_uppercase();

    let name_has_tag = |path: &&PathBuf| {
        path.file_name().map_or(false, |name| {
            name.to_string_lossy().to_uppercase().contains(&quant_tag)
        })
    };

    // Prefer a file matching the requested quantization, else any GGUF.
    candidates
        .iter()
        .find(name_has_tag)
        .or_else(|| candidates.first())
        .cloned()
}

/// Locate a tokenizer file directly inside `dir`.
///
/// Well-known file names are tried first, in priority order:
/// `tokenizer.json`, `qwen3-tokenizer.json`, `tokenizer_config.json`. If none
/// of them exists, the directory's own entries are scanned (not its
/// subdirectories) and the first entry whose name contains "tokenizer" and
/// ends with ".json" is returned.
///
/// Returns `None` when nothing matches or the directory cannot be read.
fn find_tokenizer_in_dir(dir: &Path) -> Option<PathBuf> {
    const KNOWN_NAMES: [&str; 3] = [
        "tokenizer.json",
        "qwen3-tokenizer.json",
        "tokenizer_config.json",
    ];

    KNOWN_NAMES
        .iter()
        .map(|name| dir.join(name))
        .find(|path| path.exists())
        .or_else(|| {
            // Fallback: any entry of `dir` that looks like a tokenizer JSON.
            std::fs::read_dir(dir).ok()?.flatten().find_map(|entry| {
                let file_name = entry.file_name().to_string_lossy().to_string();
                let looks_like_tokenizer =
                    file_name.contains("tokenizer") && file_name.ends_with(".json");
                looks_like_tokenizer.then(|| entry.path())
            })
        })
}