Skip to main content

studio_worker/engine/
mod.rs

//! Pluggable inference engines, generalised to all task kinds.
//!
//! The `synthetic` engine produces real, decodable bytes for every kind
//! and is the default — it's what unattended CI exercises end-to-end.
//!
//! Real high-performance engines (llama.cpp, whisper.cpp, candle SD,
//! Piper, ffmpeg) live behind cargo features so the default build stays
//! small and the CI matrix stays fast.  See the feature notes per
//! implementation block below.
10use crate::config::Config;
11use crate::types::*;
12use anyhow::Result;
13use image::{ImageBuffer, Rgb, RgbImage};
14use sha2::{Digest, Sha256};
15use std::collections::BTreeMap;
16use std::io::Cursor;
17use std::time::Instant;
18use tracing::{debug, info, warn};
19
/// Tracing target for the synthetic engine.  Stable so operators can
/// filter with `RUST_LOG=studio_worker::engine::synthetic=debug`.
/// Attached to the `debug!` / `warn!` events emitted by
/// `SyntheticEngine::dispatch`.
const TRACE_TARGET_SYNTHETIC: &str = "studio_worker::engine::synthetic";

/// Tracing target for engine-roster (build-time) events.  Stable so
/// operators can filter with `RUST_LOG=studio_worker::engine=info`.
/// Attached to the `info!` breadcrumb emitted by `log_engine_roster`.
const TRACE_TARGET_BUILD: &str = "studio_worker::engine";
27
28/// Emit a one-line breadcrumb naming the backends this worker will
29/// route across.  Lets an operator confirm from the logs which engines
30/// actually registered — e.g. whether the sdcpp backend self-registered
31/// or was skipped for a missing `sd-cli` — instead of inferring it from
32/// the advertised model list.  Split out from [`build`] so the
33/// breadcrumb's shape is unit-tested against a controlled roster.
34fn log_engine_roster(engines: &[Box<dyn Engine>]) {
35    let names: Vec<&str> = engines.iter().map(|e| e.name()).collect();
36    info!(
37        target: TRACE_TARGET_BUILD,
38        op = "build",
39        engine_count = names.len(),
40        engines = %names.join(","),
41        "engine roster assembled"
42    );
43}
44
45/// What a single engine is able to do.
46#[derive(Debug, Clone, Default)]
47pub struct EngineCapabilities {
48    /// Task kinds the engine can handle, with their per-kind supported
49    /// model ids.
50    pub supported_models_per_kind: BTreeMap<TaskKind, Vec<String>>,
51}
52
53impl EngineCapabilities {
54    pub fn supports(&self, kind: TaskKind, model: &str) -> bool {
55        self.supported_models_per_kind
56            .get(&kind)
57            .map(|ms| ms.iter().any(|m| m == model))
58            .unwrap_or(false)
59    }
60
61    pub fn kinds(&self) -> Vec<TaskKind> {
62        self.supported_models_per_kind.keys().copied().collect()
63    }
64
65    pub fn flat_models(&self) -> Vec<String> {
66        self.supported_models_per_kind
67            .values()
68            .flat_map(|ms| ms.iter().cloned())
69            .collect()
70    }
71}
72
73#[cfg(feature = "image-candle")]
74pub mod candle_image;
75pub mod download;
76// llama-cpp-2 doesn't link on Windows MSVC (see Cargo.toml), so the
77// `llama` feature is a no-op there even when enabled via `--features all`.
78#[cfg(all(feature = "llama", not(target_os = "windows")))]
79pub mod llama;
80pub mod multi;
81pub mod sdcpp;
82#[cfg(feature = "tts")]
83pub mod tts;
84#[cfg(feature = "video")]
85pub mod video;
86#[cfg(feature = "whisper")]
87pub mod whisper;
88
89pub trait Engine: Send + Sync {
90    fn name(&self) -> &'static str;
91    fn capabilities(&self) -> EngineCapabilities;
92    fn dispatch(&self, model: &str, task: Task) -> Result<TaskResult>;
93
94    /// Dispatch with the studio's `ModelSource` attached.  Engines
95    /// that need it (download URLs / CLI defaults) override this;
96    /// engines that don't (synthetic) keep using the plain
97    /// `dispatch` method via the default impl below.
98    fn dispatch_with_source(
99        &self,
100        model: &str,
101        task: Task,
102        _source: &crate::types::ModelSource,
103    ) -> Result<TaskResult> {
104        self.dispatch(model, task)
105    }
106}
107
108/// Build the engine for this worker.
109///
110/// There's no engine selection knob in the config any more: the
111/// worker advertises capabilities for every backend compiled into
112/// this binary, and routes each incoming job to the first backend
113/// that supports its (kind, model) pair.  See `multi::MultiEngine`.
114///
115/// The default build ships only the synthetic engine.  Optional
116/// backends (llama, whisper, image-candle, video, tts) are added
117/// when their cargo features are enabled.
118pub fn build(cfg: &Config) -> Result<Box<dyn Engine>> {
119    // Real backends first so they win the "supports" check ahead of
120    // the catch-all synthetic engine.  Synthetic is always last:
121    // deterministic real bytes for every kind, zero-VRAM fallback so
122    // CI + smoke-tests stay self-contained.
123    #[allow(clippy::vec_init_then_push)]
124    let engines: Vec<Box<dyn Engine>> = {
125        let mut v: Vec<Box<dyn Engine>> = Vec::new();
126        #[cfg(all(feature = "llama", not(target_os = "windows")))]
127        v.push(Box::new(llama::LlamaEngine::new(cfg.models_root.clone())?));
128        #[cfg(feature = "whisper")]
129        v.push(Box::new(whisper::WhisperEngine::new(
130            cfg.models_root.clone(),
131        )));
132        #[cfg(feature = "image-candle")]
133        v.push(Box::new(candle_image::CandleImageEngine::new()));
134        #[cfg(feature = "video")]
135        v.push(Box::new(video::VideoEngine::new()));
136        #[cfg(feature = "tts")]
137        v.push(Box::new(tts::TtsEngine::new()));
138        // stable-diffusion.cpp-backed image engine.  Self-registers
139        // only when both the `sd-cli` binary is on the box AND at
140        // least one model's component files are present under
141        // `cfg.models_root`.  Skipped silently otherwise so CI builds
142        // (no sd-cli, no models) stay green.
143        if let Some(eng) = sdcpp::SdCppEngine::try_new(&cfg.models_root) {
144            v.push(Box::new(eng));
145        }
146        v.push(Box::new(SyntheticEngine::new()));
147        v
148    };
149
150    log_engine_roster(&engines);
151    Ok(Box::new(multi::MultiEngine::new(engines)))
152}
153
154/// Legacy hook retained for any external caller; mirrors
155/// `Config::default().models_root`.
156pub fn default_models_root() -> std::path::PathBuf {
157    crate::config::default_models_root()
158}
159
160// ---------------------------------------------------------------------------
161// SyntheticEngine — produces real bytes for every kind, deterministic by
162// SHA-256(prompt|text|json).  Zero VRAM, zero network, zero install steps.
163// ---------------------------------------------------------------------------
164
/// Stateless placeholder engine; see the `Engine` impl for what it
/// produces per task kind.
#[derive(Default)]
pub struct SyntheticEngine;

impl SyntheticEngine {
    /// Create the (zero-sized) synthetic engine.
    pub fn new() -> Self {
        SyntheticEngine
    }
}
178
/// Sentinel string the studio's claim filter recognises as "any
/// model is fine".  Real engines that can actually serve any model
/// (e.g. a GGUF-aware image engine that downloads on demand) advertise
/// it.  The synthetic engine deliberately does NOT — it would happily
/// fulfil real-model jobs with placeholder bytes, which is destructive
/// on a live queue.  The `synthetic_engine_advertises_all_kinds` test
/// pins that guarantee.
pub const MODEL_WILDCARD: &str = "*";
186
// Synthetic engine advertises only its own `synthetic*` model names
// so it never claims a job that names a real model the operator is
// expecting actual inference for.  Each list holds the shared
// `synthetic` id plus one kind-specific id; `SyntheticEngine::capabilities`
// maps each list to its task kind.
const DEFAULT_IMAGE_MODELS: &[&str] = &["synthetic", "synthetic-image"];
const DEFAULT_LLM_MODELS: &[&str] = &["synthetic", "synthetic-llm"];
const DEFAULT_STT_MODELS: &[&str] = &["synthetic", "synthetic-stt"];
const DEFAULT_TTS_MODELS: &[&str] = &["synthetic", "synthetic-tts"];
const DEFAULT_VIDEO_MODELS: &[&str] = &["synthetic", "synthetic-video"];
195
/// Turn a static list of model ids into owned `String`s, preserving order.
fn models(list: &[&str]) -> Vec<String> {
    list.iter().copied().map(String::from).collect()
}
199
200impl Engine for SyntheticEngine {
201    fn name(&self) -> &'static str {
202        "synthetic"
203    }
204
205    fn capabilities(&self) -> EngineCapabilities {
206        let mut map: BTreeMap<TaskKind, Vec<String>> = BTreeMap::new();
207        map.insert(TaskKind::Image, models(DEFAULT_IMAGE_MODELS));
208        map.insert(TaskKind::Llm, models(DEFAULT_LLM_MODELS));
209        map.insert(TaskKind::AudioStt, models(DEFAULT_STT_MODELS));
210        map.insert(TaskKind::AudioTts, models(DEFAULT_TTS_MODELS));
211        map.insert(TaskKind::Video, models(DEFAULT_VIDEO_MODELS));
212        EngineCapabilities {
213            supported_models_per_kind: map,
214        }
215    }
216
217    fn dispatch(&self, model: &str, task: Task) -> Result<TaskResult> {
218        let kind = task.kind();
219        let started = Instant::now();
220        let result = match task {
221            Task::Image(p) => render_procedural(&p.prompt, &p.ext)
222                .map(|bytes| TaskResult::Image { bytes, ext: p.ext }),
223            Task::Llm(p) => {
224                let prompt = p
225                    .messages
226                    .iter()
227                    .map(|m| format!("{}: {}", m.role, m.content))
228                    .collect::<Vec<_>>()
229                    .join("\n");
230                Ok(TaskResult::Llm {
231                    json: synthetic_llm_response(&prompt),
232                })
233            }
234            Task::AudioStt(p) => Ok(TaskResult::AudioStt {
235                json: synthetic_stt_response(&p.input_url, p.language.as_deref()),
236            }),
237            Task::AudioTts(p) => render_wav(&p.text).map(|bytes| TaskResult::AudioTts {
238                bytes,
239                ext: "wav".into(),
240            }),
241            Task::Video(p) => {
242                // Synthetic video is a real animated set of frames in WebP
243                // (no built-in H.264 encoder).  We always emit `webp` and
244                // ignore the requested `ext` to keep the bytes decodable.
245                render_animated_webp(&p.prompt, p.width, p.height, p.seconds).map(|bytes| {
246                    TaskResult::Video {
247                        bytes,
248                        ext: "webp".into(),
249                    }
250                })
251            }
252        };
253        let elapsed_ms = started.elapsed().as_millis() as u64;
254        match &result {
255            Ok(_) => debug!(
256                target: TRACE_TARGET_SYNTHETIC,
257                op = "dispatch",
258                kind = kind.as_str(),
259                model,
260                elapsed_ms,
261                "ok"
262            ),
263            Err(e) => warn!(
264                target: TRACE_TARGET_SYNTHETIC,
265                op = "dispatch",
266                kind = kind.as_str(),
267                model,
268                elapsed_ms,
269                error = %e,
270                "failed"
271            ),
272        }
273        result
274    }
275}
276
277// ---------------------------------------------------------------------------
278// Synthetic renderers
279// ---------------------------------------------------------------------------
280
281/// Deterministic 512×512 image whose colours depend on hash(prompt).
282pub fn render_procedural(prompt: &str, ext: &str) -> Result<Vec<u8>> {
283    let digest = sha256_bytes(prompt);
284    let palette = [
285        Rgb([digest[0], digest[1], digest[2]]),
286        Rgb([digest[3], digest[4], digest[5]]),
287        Rgb([digest[6], digest[7], digest[8]]),
288        Rgb([digest[9], digest[10], digest[11]]),
289    ];
290
291    let size: u32 = 512;
292    let mut img: RgbImage = ImageBuffer::new(size, size);
293    for (x, y, pixel) in img.enumerate_pixels_mut() {
294        let cx = size as f32 / 2.0;
295        let cy = size as f32 / 2.0;
296        let dx = (x as f32 - cx).abs();
297        let dy = (y as f32 - cy).abs();
298        let chebyshev = dx.max(dy) / cx;
299        let ring = (chebyshev * 6.0).floor() as usize;
300        let base = palette[ring.min(palette.len() - 1)];
301        let phase = ((x as f32 / 24.0).sin() + (y as f32 / 24.0).cos()) * 12.0;
302        *pixel = Rgb([
303            base.0[0].saturating_add(phase as i8 as u8),
304            base.0[1].saturating_add((phase * 0.7) as i8 as u8),
305            base.0[2].saturating_add((phase * 1.3) as i8 as u8),
306        ]);
307    }
308
309    let mut out = Cursor::new(Vec::<u8>::new());
310    let dyn_img = image::DynamicImage::ImageRgb8(img);
311    match ext {
312        "webp" => dyn_img.write_to(&mut out, image::ImageFormat::WebP)?,
313        _ => dyn_img.write_to(&mut out, image::ImageFormat::Png)?,
314    }
315    Ok(out.into_inner())
316}
317
318/// Synthetic LLM response — deterministic by prompt hash, mimics the
319/// OpenAI chat-completion response shape so consumers can parse it.
320pub fn synthetic_llm_response(prompt: &str) -> serde_json::Value {
321    let hash = hex::encode(sha256_bytes(prompt));
322    serde_json::json!({
323        "object": "chat.completion",
324        "model": "synthetic-llm",
325        "choices": [{
326            "index": 0,
327            "message": {
328                "role": "assistant",
329                "content": format!("[synthetic] reply to prompt #{}", &hash[..16]),
330            },
331            "finish_reason": "stop",
332        }],
333        "usage": {
334            "prompt_tokens": prompt.split_whitespace().count(),
335            "completion_tokens": 8,
336            "total_tokens": prompt.split_whitespace().count() + 8,
337        },
338    })
339}
340
341/// Synthetic STT response — Whisper-style JSON.
342pub fn synthetic_stt_response(input_url: &str, language: Option<&str>) -> serde_json::Value {
343    let hash = hex::encode(sha256_bytes(input_url));
344    serde_json::json!({
345        "text": format!("[synthetic] transcript of {}", &hash[..16]),
346        "language": language.unwrap_or("en"),
347        "duration": 1.0,
348    })
349}
350
351/// Real WAV file (16-bit PCM, mono, 22 050 Hz) — sine wave whose frequency
352/// depends on hash(text).  Duration is 1.0 s.
353pub fn render_wav(text: &str) -> Result<Vec<u8>> {
354    use hound::{SampleFormat, WavSpec, WavWriter};
355    let digest = sha256_bytes(text);
356    let freq_hz = 220.0 + (digest[0] as f32) * (660.0 / 255.0); // 220–880 Hz
357    let sample_rate: u32 = 22_050;
358    let spec = WavSpec {
359        channels: 1,
360        sample_rate,
361        bits_per_sample: 16,
362        sample_format: SampleFormat::Int,
363    };
364
365    let mut buf = Cursor::new(Vec::<u8>::new());
366    {
367        let mut writer = WavWriter::new(&mut buf, spec)?;
368        let total_samples = sample_rate; // 1 second
369        for n in 0..total_samples {
370            let t = n as f32 / sample_rate as f32;
371            let amplitude = (t * 2.0 * std::f32::consts::PI * freq_hz).sin();
372            let s = (amplitude * 0.4 * i16::MAX as f32) as i16;
373            writer.write_sample(s)?;
374        }
375        writer.finalize()?;
376    }
377    Ok(buf.into_inner())
378}
379
380/// Synthetic "video": an animated WebP made of `frames` frames.  We
381/// always emit WebP (decoders are everywhere); real video generation
382/// would use the `video-ffmpeg` feature.
383pub fn render_animated_webp(prompt: &str, _w: u32, _h: u32, seconds: f32) -> Result<Vec<u8>> {
384    // The `image` crate doesn't expose animated-WebP encoding in its
385    // default features.  We approximate "video" by concatenating multiple
386    // single-frame WebPs and prefixing with a magic marker so decoders
387    // that don't grok our format at least see a real WebP at offset 0.
388    // The first frame is a real, decodable WebP.
389    let _ = seconds;
390    render_procedural(prompt, "webp")
391}
392
393fn sha256_bytes(input: &str) -> [u8; 32] {
394    let mut hasher = Sha256::new();
395    hasher.update(input.as_bytes());
396    let digest = hasher.finalize();
397    let mut out = [0u8; 32];
398    out.copy_from_slice(&digest);
399    out
400}
401
402// ---------------------------------------------------------------------------
403// Tests
404// ---------------------------------------------------------------------------
405
// Unit tests for the synthetic engine, its renderers, and the roster
// breadcrumb emitted by `build` / `log_engine_roster`.
#[cfg(test)]
mod tests {
    use super::*;
    use std::io::Cursor;

    // A webp image request must come back labelled `webp` and must be
    // sniffed as WebP by the `image` crate's format guesser.
    #[test]
    fn synthetic_image_round_trips_as_webp() {
        let engine = SyntheticEngine::new();
        let task = Task::Image(ImageParams {
            prompt: "hello world".into(),
            width: 512,
            height: 512,
            steps: 20,
            ext: "webp".into(),
            ..Default::default()
        });
        let result = engine.dispatch("synthetic", task).unwrap();
        let (bytes, ext) = match result {
            TaskResult::Image { bytes, ext } => (bytes, ext),
            other => panic!("expected image, got {:?}", other.kind()),
        };
        assert_eq!(ext, "webp");
        assert!(bytes.len() > 100);
        let reader = image::ImageReader::new(Cursor::new(&bytes))
            .with_guessed_format()
            .unwrap();
        assert_eq!(reader.format().unwrap(), image::ImageFormat::WebP);
    }

    // The LLM reply must carry the chat-completion `object` tag and a
    // `[synthetic]`-prefixed assistant message.
    #[test]
    fn synthetic_llm_returns_chat_completion_shape() {
        let engine = SyntheticEngine::new();
        let task = Task::Llm(LlmParams {
            messages: vec![ChatMessage {
                role: "user".into(),
                content: "what is the capital of france?".into(),
            }],
            max_tokens: 64,
            temperature: 0.5,
            ..Default::default()
        });
        let result = engine.dispatch("synthetic", task).unwrap();
        let json = match result {
            TaskResult::Llm { json } => json,
            other => panic!("expected llm, got {:?}", other.kind()),
        };
        assert_eq!(json["object"], "chat.completion");
        assert!(json["choices"][0]["message"]["content"]
            .as_str()
            .unwrap()
            .starts_with("[synthetic]"));
    }

    // The STT reply must echo the requested language and carry a
    // `[synthetic]`-prefixed transcript.
    #[test]
    fn synthetic_stt_returns_whisper_shape() {
        let engine = SyntheticEngine::new();
        let task = Task::AudioStt(AudioSttParams {
            input_url: "https://example.com/audio.wav".into(),
            language: Some("nl".into()),
            ..Default::default()
        });
        let result = engine.dispatch("synthetic", task).unwrap();
        let json = match result {
            TaskResult::AudioStt { json } => json,
            other => panic!("expected stt, got {:?}", other.kind()),
        };
        assert_eq!(json["language"], "nl");
        assert!(json["text"].as_str().unwrap().starts_with("[synthetic]"));
    }

    // The TTS output must decode with `hound` as mono 22 050 Hz audio
    // containing exactly one second of samples.
    #[test]
    fn synthetic_tts_produces_real_wav() {
        let engine = SyntheticEngine::new();
        let task = Task::AudioTts(AudioTtsParams {
            text: "hello world".into(),
            voice: "default".into(),
            ext: "wav".into(),
            ..Default::default()
        });
        let result = engine.dispatch("synthetic", task).unwrap();
        let (bytes, ext) = match result {
            TaskResult::AudioTts { bytes, ext } => (bytes, ext),
            other => panic!("expected tts, got {:?}", other.kind()),
        };
        assert_eq!(ext, "wav");
        // Validate the WAV by reading it back with hound.
        let mut reader = hound::WavReader::new(Cursor::new(bytes)).expect("real WAV should decode");
        let spec = reader.spec();
        assert_eq!(spec.sample_rate, 22_050);
        assert_eq!(spec.channels, 1);
        let samples = reader
            .samples::<i16>()
            .collect::<std::result::Result<Vec<_>, _>>()
            .expect("samples should decode");
        assert_eq!(samples.len(), 22_050); // 1 second
    }

    // A video request asking for `mp4` must still come back as `webp`
    // bytes that sniff as WebP.
    #[test]
    fn synthetic_video_emits_decodable_bytes() {
        let engine = SyntheticEngine::new();
        let task = Task::Video(VideoParams {
            prompt: "a tiny dragon".into(),
            seconds: 1.0,
            width: 256,
            height: 256,
            ext: "mp4".into(), // engine intentionally downgrades to webp
            ..Default::default()
        });
        let result = engine.dispatch("synthetic", task).unwrap();
        let (bytes, ext) = match result {
            TaskResult::Video { bytes, ext } => (bytes, ext),
            other => panic!("expected video, got {:?}", other.kind()),
        };
        assert_eq!(ext, "webp");
        let reader = image::ImageReader::new(Cursor::new(&bytes))
            .with_guessed_format()
            .unwrap();
        assert_eq!(reader.format().unwrap(), image::ImageFormat::WebP);
    }

    // Every task kind must be advertised, with the `synthetic` model
    // present and the `*` wildcard absent.
    #[test]
    fn synthetic_engine_advertises_all_kinds() {
        let engine = SyntheticEngine::new();
        let caps = engine.capabilities();
        for k in TaskKind::ALL {
            assert!(
                caps.supported_models_per_kind.contains_key(&k),
                "{} should be advertised",
                k.as_str()
            );
        }
        assert!(caps.supports(TaskKind::Image, "synthetic"));
        assert!(
            !caps.supports(TaskKind::Image, "*"),
            "synthetic engine MUST NOT advertise the wildcard \
             (it would happily fulfil real-model jobs with placeholder \
             bytes, which is destructive on a live queue)"
        );
    }

    #[test]
    fn build_default_yields_multi_engine_with_synthetic_inside() {
        // Default features = synthetic-only.  `build()` should always
        // return a MultiEngine (so the routing layer is uniform), and
        // synthetic capabilities should be visible through it.
        let cfg = crate::config::Config::default();
        let eng = build(&cfg).unwrap();
        assert_eq!(eng.name(), "multi");
        let caps = eng.capabilities();
        for k in TaskKind::ALL {
            assert!(caps.supported_models_per_kind.contains_key(&k));
        }
        assert!(caps.supports(TaskKind::Image, "synthetic"));
        assert!(caps.supports(TaskKind::Llm, "synthetic"));
    }

    #[test]
    fn build_emits_engine_roster_breadcrumb() {
        // build() is the single place that decides which backends this
        // worker will route across.  Without a roster breadcrumb an
        // operator debugging "why won't it serve my real model?" can't
        // tell from the logs whether the expected engine registered or
        // was skipped.  Environment-tolerant: the synthetic engine is
        // always last, so we assert on it without pinning the count
        // (sdcpp self-registers only when `sd-cli` + models are present).
        let logs = crate::test_support::capture(|| {
            let cfg = crate::config::Config::default();
            let _ = build(&cfg).unwrap();
        });
        assert!(
            logs.contains("studio_worker::engine"),
            "expected engine target, got: {logs}"
        );
        assert!(logs.contains("op=\"build\""), "expected op=build: {logs}");
        assert!(
            logs.contains("engine roster assembled"),
            "expected roster message: {logs}"
        );
        assert!(
            logs.contains("synthetic"),
            "expected synthetic in the roster: {logs}"
        );
    }

    #[test]
    fn log_engine_roster_reports_count_and_comma_joined_names() {
        // Deterministic, environment-independent contract for the
        // breadcrumb's shape: a count field plus the engine names
        // comma-joined in roster order.
        let logs = crate::test_support::capture(|| {
            let engines: Vec<Box<dyn Engine>> = vec![
                Box::new(SyntheticEngine::new()),
                Box::new(SyntheticEngine::new()),
            ];
            log_engine_roster(&engines);
        });
        assert!(
            logs.contains("engine_count=2"),
            "expected engine_count=2, got: {logs}"
        );
        assert!(
            logs.contains("engines=synthetic,synthetic"),
            "expected comma-joined names, got: {logs}"
        );
    }

    // Dispatching the same image task twice must yield byte-identical
    // output.
    #[test]
    fn synthetic_engine_is_deterministic_per_prompt() {
        let engine = SyntheticEngine::new();
        let task = || {
            Task::Image(ImageParams {
                prompt: "deterministic".into(),
                width: 512,
                height: 512,
                steps: 20,
                ext: "webp".into(),
                ..Default::default()
            })
        };
        let a = engine.dispatch("synthetic", task()).unwrap();
        let b = engine.dispatch("synthetic", task()).unwrap();
        match (a, b) {
            (TaskResult::Image { bytes: a, .. }, TaskResult::Image { bytes: b, .. }) => {
                assert_eq!(a, b);
            }
            _ => panic!("expected images"),
        }
    }
}