Skip to main content

studio_worker/engine/
mod.rs

1//! Pluggable inference engines, generalised to all task kinds.
2//!
3//! The `synthetic` engine produces real, decodable bytes for every kind
4//! and is the default — it's what unattended CI exercises end-to-end.
5//!
6//! Real high-performance engines (llama.cpp, whisper.cpp, candle SD,
7//! Piper, ffmpeg) live behind cargo features so the default build stays
8//! small and the CI matrix stays fast.  See the feature notes per
9//! implementation block below.
10use crate::config::Config;
11use crate::types::*;
12use anyhow::Result;
13use image::{ImageBuffer, Rgb, RgbImage};
14use sha2::{Digest, Sha256};
15use std::collections::BTreeMap;
16use std::io::Cursor;
17use std::time::Instant;
18use tracing::{debug, info, warn};
19
20/// Tracing target for the synthetic engine.  Stable so operators can
21/// filter with `RUST_LOG=studio_worker::engine::synthetic=debug`.
22const TRACE_TARGET_SYNTHETIC: &str = "studio_worker::engine::synthetic";
23
24/// Tracing target for engine-roster (build-time) events.  Stable so
25/// operators can filter with `RUST_LOG=studio_worker::engine=info`.
26const TRACE_TARGET_BUILD: &str = "studio_worker::engine";
27
28/// Emit a one-line breadcrumb naming the backends this worker will
29/// route across.  Lets an operator confirm from the logs which engines
30/// actually registered — e.g. which optional cargo-feature backends
31/// (llama / whisper / candle / video / tts) compiled in — instead of
32/// inferring it from the advertised model list.  (sdcpp + synthetic
33/// always register; sdcpp auto-provisions `sd-cli` on first use.)  Split out from [`build`] so the
34/// breadcrumb's shape is unit-tested against a controlled roster.
35fn log_engine_roster(engines: &[Box<dyn Engine>]) {
36    let names: Vec<&str> = engines.iter().map(|e| e.name()).collect();
37    info!(
38        target: TRACE_TARGET_BUILD,
39        op = "build",
40        engine_count = names.len(),
41        engines = %names.join(","),
42        "engine roster assembled"
43    );
44}
45
46/// What a single engine is able to do.
47#[derive(Debug, Clone, Default)]
48pub struct EngineCapabilities {
49    /// Task kinds the engine can handle, with their per-kind supported
50    /// model ids.
51    pub supported_models_per_kind: BTreeMap<TaskKind, Vec<String>>,
52}
53
54impl EngineCapabilities {
55    pub fn supports(&self, kind: TaskKind, model: &str) -> bool {
56        self.supported_models_per_kind
57            .get(&kind)
58            .map(|ms| ms.iter().any(|m| m == model))
59            .unwrap_or(false)
60    }
61
62    pub fn kinds(&self) -> Vec<TaskKind> {
63        self.supported_models_per_kind.keys().copied().collect()
64    }
65
66    pub fn flat_models(&self) -> Vec<String> {
67        self.supported_models_per_kind
68            .values()
69            .flat_map(|ms| ms.iter().cloned())
70            .collect()
71    }
72}
73
74#[cfg(feature = "image-candle")]
75pub mod candle_image;
76pub mod download;
77// llama-cpp-2 doesn't link on Windows MSVC (see Cargo.toml), so the
78// `llama` feature is a no-op there even when enabled via `--features all`.
79#[cfg(all(feature = "llama", not(target_os = "windows")))]
80pub mod llama;
81pub mod multi;
82pub mod sd_provision;
83pub mod sdcpp;
84#[cfg(feature = "tts")]
85pub mod tts;
86#[cfg(feature = "video")]
87pub mod video;
88#[cfg(feature = "whisper")]
89pub mod whisper;
90
91pub trait Engine: Send + Sync {
92    fn name(&self) -> &'static str;
93    fn capabilities(&self) -> EngineCapabilities;
94    fn dispatch(&self, model: &str, task: Task) -> Result<TaskResult>;
95
96    /// Dispatch with the studio's `ModelSource` attached.  Engines
97    /// that need it (download URLs / CLI defaults) override this;
98    /// engines that don't (synthetic) keep using the plain
99    /// `dispatch` method via the default impl below.
100    fn dispatch_with_source(
101        &self,
102        model: &str,
103        task: Task,
104        _source: &crate::types::ModelSource,
105    ) -> Result<TaskResult> {
106        self.dispatch(model, task)
107    }
108}
109
110/// Build the engine for this worker.
111///
112/// There's no engine selection knob in the config any more: the
113/// worker advertises capabilities for every backend compiled into
114/// this binary, and routes each incoming job to the first backend
115/// that supports its (kind, model) pair.  See `multi::MultiEngine`.
116///
117/// The default build ships only the synthetic engine.  Optional
118/// backends (llama, whisper, image-candle, video, tts) are added
119/// when their cargo features are enabled.
120pub fn build(cfg: &Config) -> Result<Box<dyn Engine>> {
121    // Real backends first so they win the "supports" check ahead of
122    // the catch-all synthetic engine.  Synthetic is always last:
123    // deterministic real bytes for every kind, zero-VRAM fallback so
124    // CI + smoke-tests stay self-contained.
125    #[allow(clippy::vec_init_then_push)]
126    let engines: Vec<Box<dyn Engine>> = {
127        let mut v: Vec<Box<dyn Engine>> = Vec::new();
128        #[cfg(all(feature = "llama", not(target_os = "windows")))]
129        v.push(Box::new(llama::LlamaEngine::new(cfg.models_root.clone())?));
130        #[cfg(feature = "whisper")]
131        v.push(Box::new(whisper::WhisperEngine::new(
132            cfg.models_root.clone(),
133        )));
134        #[cfg(feature = "image-candle")]
135        v.push(Box::new(candle_image::CandleImageEngine::new()));
136        #[cfg(feature = "video")]
137        v.push(Box::new(video::VideoEngine::new()));
138        #[cfg(feature = "tts")]
139        v.push(Box::new(tts::TtsEngine::new()));
140        // stable-diffusion.cpp-backed image engine.  Registers
141        // unconditionally now: `sd-cli` is auto-provisioned into
142        // `<models_root>/bin/` on the first image job when it isn't
143        // already resolvable, so a fresh worker serves real image jobs
144        // out of the box.
145        v.push(Box::new(sdcpp::SdCppEngine::new(&cfg.models_root)));
146        v.push(Box::new(SyntheticEngine::new()));
147        v
148    };
149
150    log_engine_roster(&engines);
151    Ok(Box::new(multi::MultiEngine::new(engines)))
152}
153
154/// Legacy hook retained for any external caller; mirrors
155/// `Config::default().models_root`.
156pub fn default_models_root() -> std::path::PathBuf {
157    crate::config::default_models_root()
158}
159
160// ---------------------------------------------------------------------------
161// SyntheticEngine — produces real bytes for every kind, deterministic by
162// SHA-256(prompt|text|json).  Zero VRAM, zero network, zero install steps.
163// ---------------------------------------------------------------------------
164
/// Stateless engine that fabricates deterministic placeholder output for
/// every task kind.
#[derive(Default)]
pub struct SyntheticEngine;

impl SyntheticEngine {
    /// Creates the engine.  It holds no state, so this is the same value
    /// `Default::default()` produces.
    pub fn new() -> Self {
        Self::default()
    }
}
178
/// Sentinel the studio's claim filter recognises as "any model is
/// fine".  Real engines that can genuinely serve any model (e.g. a
/// GGUF-aware image engine that downloads on demand) advertise it.  The
/// synthetic engine deliberately does NOT: it would happily fulfil
/// real-model jobs with placeholder bytes, which is destructive on a
/// live queue.
pub const MODEL_WILDCARD: &str = "*";

// The synthetic engine advertises only its own `synthetic*` model names,
// one list per task kind, so it never claims a job naming a real model
// the operator expects actual inference for.

/// Model ids the synthetic engine accepts for image tasks.
const DEFAULT_IMAGE_MODELS: &[&str] = &["synthetic", "synthetic-image"];
/// Model ids the synthetic engine accepts for LLM tasks.
const DEFAULT_LLM_MODELS: &[&str] = &["synthetic", "synthetic-llm"];
/// Model ids the synthetic engine accepts for speech-to-text tasks.
const DEFAULT_STT_MODELS: &[&str] = &["synthetic", "synthetic-stt"];
/// Model ids the synthetic engine accepts for text-to-speech tasks.
const DEFAULT_TTS_MODELS: &[&str] = &["synthetic", "synthetic-tts"];
/// Model ids the synthetic engine accepts for video tasks.
const DEFAULT_VIDEO_MODELS: &[&str] = &["synthetic", "synthetic-video"];

/// Copies a static list of model ids into owned `String`s, preserving
/// order.
fn models(list: &[&str]) -> Vec<String> {
    list.iter().copied().map(String::from).collect()
}
199
200impl Engine for SyntheticEngine {
201    fn name(&self) -> &'static str {
202        "synthetic"
203    }
204
205    fn capabilities(&self) -> EngineCapabilities {
206        let mut map: BTreeMap<TaskKind, Vec<String>> = BTreeMap::new();
207        map.insert(TaskKind::Image, models(DEFAULT_IMAGE_MODELS));
208        map.insert(TaskKind::Llm, models(DEFAULT_LLM_MODELS));
209        map.insert(TaskKind::AudioStt, models(DEFAULT_STT_MODELS));
210        map.insert(TaskKind::AudioTts, models(DEFAULT_TTS_MODELS));
211        map.insert(TaskKind::Video, models(DEFAULT_VIDEO_MODELS));
212        EngineCapabilities {
213            supported_models_per_kind: map,
214        }
215    }
216
217    fn dispatch(&self, model: &str, task: Task) -> Result<TaskResult> {
218        let kind = task.kind();
219        let started = Instant::now();
220        let result = match task {
221            Task::Image(p) => render_procedural(&p.prompt, &p.ext)
222                .map(|bytes| TaskResult::Image { bytes, ext: p.ext }),
223            Task::Llm(p) => {
224                let prompt = p
225                    .messages
226                    .iter()
227                    .map(|m| format!("{}: {}", m.role, m.content))
228                    .collect::<Vec<_>>()
229                    .join("\n");
230                Ok(TaskResult::Llm {
231                    json: synthetic_llm_response(&prompt),
232                })
233            }
234            Task::AudioStt(p) => Ok(TaskResult::AudioStt {
235                json: synthetic_stt_response(&p.input_url, p.language.as_deref()),
236            }),
237            Task::AudioTts(p) => render_wav(&p.text).map(|bytes| TaskResult::AudioTts {
238                bytes,
239                ext: "wav".into(),
240            }),
241            Task::Video(p) => {
242                // Synthetic video is a real animated set of frames in WebP
243                // (no built-in H.264 encoder).  We always emit `webp` and
244                // ignore the requested `ext` to keep the bytes decodable.
245                render_animated_webp(&p.prompt, p.width, p.height, p.seconds).map(|bytes| {
246                    TaskResult::Video {
247                        bytes,
248                        ext: "webp".into(),
249                    }
250                })
251            }
252        };
253        let elapsed_ms = started.elapsed().as_millis() as u64;
254        match &result {
255            Ok(_) => debug!(
256                target: TRACE_TARGET_SYNTHETIC,
257                op = "dispatch",
258                kind = kind.as_str(),
259                model,
260                elapsed_ms,
261                "ok"
262            ),
263            Err(e) => warn!(
264                target: TRACE_TARGET_SYNTHETIC,
265                op = "dispatch",
266                kind = kind.as_str(),
267                model,
268                elapsed_ms,
269                error = %e,
270                "failed"
271            ),
272        }
273        result
274    }
275}
276
277// ---------------------------------------------------------------------------
278// Synthetic renderers
279// ---------------------------------------------------------------------------
280
281/// Deterministic 512×512 image whose colours depend on hash(prompt).
282pub fn render_procedural(prompt: &str, ext: &str) -> Result<Vec<u8>> {
283    let digest = sha256_bytes(prompt);
284    let palette = [
285        Rgb([digest[0], digest[1], digest[2]]),
286        Rgb([digest[3], digest[4], digest[5]]),
287        Rgb([digest[6], digest[7], digest[8]]),
288        Rgb([digest[9], digest[10], digest[11]]),
289    ];
290
291    let size: u32 = 512;
292    let mut img: RgbImage = ImageBuffer::new(size, size);
293    for (x, y, pixel) in img.enumerate_pixels_mut() {
294        let cx = size as f32 / 2.0;
295        let cy = size as f32 / 2.0;
296        let dx = (x as f32 - cx).abs();
297        let dy = (y as f32 - cy).abs();
298        let chebyshev = dx.max(dy) / cx;
299        let ring = (chebyshev * 6.0).floor() as usize;
300        let base = palette[ring.min(palette.len() - 1)];
301        let phase = ((x as f32 / 24.0).sin() + (y as f32 / 24.0).cos()) * 12.0;
302        *pixel = Rgb([
303            base.0[0].saturating_add(phase as i8 as u8),
304            base.0[1].saturating_add((phase * 0.7) as i8 as u8),
305            base.0[2].saturating_add((phase * 1.3) as i8 as u8),
306        ]);
307    }
308
309    let mut out = Cursor::new(Vec::<u8>::new());
310    let dyn_img = image::DynamicImage::ImageRgb8(img);
311    match ext {
312        "webp" => dyn_img.write_to(&mut out, image::ImageFormat::WebP)?,
313        _ => dyn_img.write_to(&mut out, image::ImageFormat::Png)?,
314    }
315    Ok(out.into_inner())
316}
317
318/// Synthetic LLM response — deterministic by prompt hash, mimics the
319/// OpenAI chat-completion response shape so consumers can parse it.
320pub fn synthetic_llm_response(prompt: &str) -> serde_json::Value {
321    let hash = hex::encode(sha256_bytes(prompt));
322    serde_json::json!({
323        "object": "chat.completion",
324        "model": "synthetic-llm",
325        "choices": [{
326            "index": 0,
327            "message": {
328                "role": "assistant",
329                "content": format!("[synthetic] reply to prompt #{}", &hash[..16]),
330            },
331            "finish_reason": "stop",
332        }],
333        "usage": {
334            "prompt_tokens": prompt.split_whitespace().count(),
335            "completion_tokens": 8,
336            "total_tokens": prompt.split_whitespace().count() + 8,
337        },
338    })
339}
340
341/// Synthetic STT response — Whisper-style JSON.
342pub fn synthetic_stt_response(input_url: &str, language: Option<&str>) -> serde_json::Value {
343    let hash = hex::encode(sha256_bytes(input_url));
344    serde_json::json!({
345        "text": format!("[synthetic] transcript of {}", &hash[..16]),
346        "language": language.unwrap_or("en"),
347        "duration": 1.0,
348    })
349}
350
351/// Real WAV file (16-bit PCM, mono, 22 050 Hz) — sine wave whose frequency
352/// depends on hash(text).  Duration is 1.0 s.
353pub fn render_wav(text: &str) -> Result<Vec<u8>> {
354    use hound::{SampleFormat, WavSpec, WavWriter};
355    let digest = sha256_bytes(text);
356    let freq_hz = 220.0 + (digest[0] as f32) * (660.0 / 255.0); // 220–880 Hz
357    let sample_rate: u32 = 22_050;
358    let spec = WavSpec {
359        channels: 1,
360        sample_rate,
361        bits_per_sample: 16,
362        sample_format: SampleFormat::Int,
363    };
364
365    let mut buf = Cursor::new(Vec::<u8>::new());
366    {
367        let mut writer = WavWriter::new(&mut buf, spec)?;
368        let total_samples = sample_rate; // 1 second
369        for n in 0..total_samples {
370            let t = n as f32 / sample_rate as f32;
371            let amplitude = (t * 2.0 * std::f32::consts::PI * freq_hz).sin();
372            let s = (amplitude * 0.4 * i16::MAX as f32) as i16;
373            writer.write_sample(s)?;
374        }
375        writer.finalize()?;
376    }
377    Ok(buf.into_inner())
378}
379
380/// Synthetic "video": an animated WebP made of `frames` frames.  We
381/// always emit WebP (decoders are everywhere); real video generation
382/// would use the `video-ffmpeg` feature.
383pub fn render_animated_webp(prompt: &str, _w: u32, _h: u32, seconds: f32) -> Result<Vec<u8>> {
384    // The `image` crate doesn't expose animated-WebP encoding in its
385    // default features.  We approximate "video" by concatenating multiple
386    // single-frame WebPs and prefixing with a magic marker so decoders
387    // that don't grok our format at least see a real WebP at offset 0.
388    // The first frame is a real, decodable WebP.
389    let _ = seconds;
390    render_procedural(prompt, "webp")
391}
392
393fn sha256_bytes(input: &str) -> [u8; 32] {
394    let mut hasher = Sha256::new();
395    hasher.update(input.as_bytes());
396    let digest = hasher.finalize();
397    let mut out = [0u8; 32];
398    out.copy_from_slice(&digest);
399    out
400}
401
402// ---------------------------------------------------------------------------
403// Tests
404// ---------------------------------------------------------------------------
405
#[cfg(test)]
mod tests {
    use super::*;
    use std::io::Cursor;

    /// Image task with the 512×512 / 20-step / WebP settings shared by
    /// the image tests.
    fn webp_image_task(prompt: &str) -> Task {
        Task::Image(ImageParams {
            prompt: prompt.into(),
            width: 512,
            height: 512,
            steps: 20,
            ext: "webp".into(),
            ..Default::default()
        })
    }

    /// Asserts that the format sniffed from `bytes` is WebP.
    fn assert_sniffs_as_webp(bytes: &[u8]) {
        let reader = image::ImageReader::new(Cursor::new(bytes))
            .with_guessed_format()
            .unwrap();
        assert_eq!(reader.format().unwrap(), image::ImageFormat::WebP);
    }

    #[test]
    fn synthetic_image_round_trips_as_webp() {
        let outcome = SyntheticEngine::new()
            .dispatch("synthetic", webp_image_task("hello world"))
            .unwrap();
        let (bytes, ext) = match outcome {
            TaskResult::Image { bytes, ext } => (bytes, ext),
            other => panic!("expected image, got {:?}", other.kind()),
        };
        assert_eq!(ext, "webp");
        assert!(bytes.len() > 100);
        assert_sniffs_as_webp(&bytes);
    }

    #[test]
    fn synthetic_llm_returns_chat_completion_shape() {
        let question = ChatMessage {
            role: "user".into(),
            content: "what is the capital of france?".into(),
        };
        let task = Task::Llm(LlmParams {
            messages: vec![question],
            max_tokens: 64,
            temperature: 0.5,
            ..Default::default()
        });
        let outcome = SyntheticEngine::new().dispatch("synthetic", task).unwrap();
        let json = match outcome {
            TaskResult::Llm { json } => json,
            other => panic!("expected llm, got {:?}", other.kind()),
        };
        assert_eq!(json["object"], "chat.completion");
        let reply = json["choices"][0]["message"]["content"].as_str().unwrap();
        assert!(reply.starts_with("[synthetic]"));
    }

    #[test]
    fn synthetic_stt_returns_whisper_shape() {
        let task = Task::AudioStt(AudioSttParams {
            input_url: "https://example.com/audio.wav".into(),
            language: Some("nl".into()),
            ..Default::default()
        });
        let outcome = SyntheticEngine::new().dispatch("synthetic", task).unwrap();
        let json = match outcome {
            TaskResult::AudioStt { json } => json,
            other => panic!("expected stt, got {:?}", other.kind()),
        };
        // The requested language must be echoed back.
        assert_eq!(json["language"], "nl");
        let transcript = json["text"].as_str().unwrap();
        assert!(transcript.starts_with("[synthetic]"));
    }

    #[test]
    fn synthetic_tts_produces_real_wav() {
        let task = Task::AudioTts(AudioTtsParams {
            text: "hello world".into(),
            voice: "default".into(),
            ext: "wav".into(),
            ..Default::default()
        });
        let outcome = SyntheticEngine::new().dispatch("synthetic", task).unwrap();
        let (bytes, ext) = match outcome {
            TaskResult::AudioTts { bytes, ext } => (bytes, ext),
            other => panic!("expected tts, got {:?}", other.kind()),
        };
        assert_eq!(ext, "wav");

        // Decode the bytes again with hound to prove the WAV is real.
        let mut wav = hound::WavReader::new(Cursor::new(bytes)).expect("real WAV should decode");
        let spec = wav.spec();
        assert_eq!(spec.sample_rate, 22_050);
        assert_eq!(spec.channels, 1);
        let decoded: Vec<i16> = wav
            .samples::<i16>()
            .collect::<std::result::Result<Vec<_>, _>>()
            .expect("samples should decode");
        assert_eq!(decoded.len(), 22_050); // 1 second
    }

    #[test]
    fn synthetic_video_emits_decodable_bytes() {
        let task = Task::Video(VideoParams {
            prompt: "a tiny dragon".into(),
            seconds: 1.0,
            width: 256,
            height: 256,
            ext: "mp4".into(), // engine intentionally downgrades to webp
            ..Default::default()
        });
        let outcome = SyntheticEngine::new().dispatch("synthetic", task).unwrap();
        let (bytes, ext) = match outcome {
            TaskResult::Video { bytes, ext } => (bytes, ext),
            other => panic!("expected video, got {:?}", other.kind()),
        };
        assert_eq!(ext, "webp");
        assert_sniffs_as_webp(&bytes);
    }

    #[test]
    fn synthetic_engine_advertises_all_kinds() {
        let caps = SyntheticEngine::new().capabilities();
        for kind in TaskKind::ALL {
            assert!(
                caps.supported_models_per_kind.contains_key(&kind),
                "{} should be advertised",
                kind.as_str()
            );
        }
        assert!(caps.supports(TaskKind::Image, "synthetic"));
        assert!(
            !caps.supports(TaskKind::Image, "*"),
            "synthetic engine MUST NOT advertise the wildcard \
             (it would happily fulfil real-model jobs with placeholder \
             bytes, which is destructive on a live queue)"
        );
    }

    #[test]
    fn build_default_yields_multi_engine_with_synthetic_inside() {
        // Whatever backends are compiled in, `build()` should always
        // return a MultiEngine (so the routing layer is uniform), and
        // the synthetic capabilities should be visible through it.
        let cfg = crate::config::Config::default();
        let router = build(&cfg).unwrap();
        assert_eq!(router.name(), "multi");
        let caps = router.capabilities();
        for kind in TaskKind::ALL {
            assert!(caps.supported_models_per_kind.contains_key(&kind));
        }
        assert!(caps.supports(TaskKind::Image, "synthetic"));
        assert!(caps.supports(TaskKind::Llm, "synthetic"));
    }

    #[test]
    fn build_emits_engine_roster_breadcrumb() {
        // build() is the single place that decides which backends this
        // worker will route across.  Without a roster breadcrumb an
        // operator debugging "why won't it serve my real model?" can't
        // tell from the logs whether the expected engine registered or
        // was skipped.  Environment-tolerant: the synthetic engine is
        // always registered, so we assert on it without pinning the
        // count.
        let logs = crate::test_support::capture(|| {
            let cfg = crate::config::Config::default();
            let _ = build(&cfg).unwrap();
        });
        assert!(
            logs.contains("studio_worker::engine"),
            "expected engine target, got: {logs}"
        );
        assert!(logs.contains("op=\"build\""), "expected op=build: {logs}");
        assert!(
            logs.contains("engine roster assembled"),
            "expected roster message: {logs}"
        );
        assert!(
            logs.contains("synthetic"),
            "expected synthetic in the roster: {logs}"
        );
    }

    #[test]
    fn log_engine_roster_reports_count_and_comma_joined_names() {
        // Deterministic, environment-independent contract for the
        // breadcrumb's shape: a count field plus the engine names
        // comma-joined in roster order.
        let logs = crate::test_support::capture(|| {
            let roster: Vec<Box<dyn Engine>> = vec![
                Box::new(SyntheticEngine::new()),
                Box::new(SyntheticEngine::new()),
            ];
            log_engine_roster(&roster);
        });
        assert!(
            logs.contains("engine_count=2"),
            "expected engine_count=2, got: {logs}"
        );
        assert!(
            logs.contains("engines=synthetic,synthetic"),
            "expected comma-joined names, got: {logs}"
        );
    }

    #[test]
    fn synthetic_engine_is_deterministic_per_prompt() {
        let engine = SyntheticEngine::new();
        let first = engine
            .dispatch("synthetic", webp_image_task("deterministic"))
            .unwrap();
        let second = engine
            .dispatch("synthetic", webp_image_task("deterministic"))
            .unwrap();
        match (first, second) {
            (TaskResult::Image { bytes: a, .. }, TaskResult::Image { bytes: b, .. }) => {
                assert_eq!(a, b);
            }
            _ => panic!("expected images"),
        }
    }
}