Skip to main content

trusty_embedder/
lib.rs

1//! Shared text-embedding abstraction for trusty-* projects.
2//!
3//! Why: trusty-memory and trusty-search both shipped near-identical
4//! `Embedder` traits and `FastEmbedder` implementations, with subtle
5//! drift (cache vs no-cache, sync vs async warmup, `dim()` vs `dimension()`).
6//! Centralising fixes one bug in one place and lets future consumers pick up
7//! the embedder for free.
8//!
9//! What: an async `Embedder` trait with `embed_batch` as the single primitive
10//! (single-text embed is a free helper), plus a production `FastEmbedder`
11//! (fastembed-rs, all-MiniLM-L6-v2, 384-d) with LRU caching and ORT warmup,
12//! and a `MockEmbedder` test double behind the `test-support` feature.
13//!
//! Test: `cargo test -p trusty-embedder` exercises the mock embedder (shape,
//! determinism, empty input). The ONNX-backed shape and cache-hit tests are
//! `#[ignore]` because they download the model; run them explicitly with
//! `cargo test -p trusty-embedder -- --ignored`.
17
18use std::num::NonZeroUsize;
19use std::sync::Arc;
20
21use anyhow::{Context, Result};
22use async_trait::async_trait;
23use fastembed::{EmbeddingModel, TextEmbedding, TextInitOptions};
24use lru::LruCache;
25use parking_lot::Mutex;
26
/// Output dimension of the all-MiniLM-L6-v2 model.
///
/// Note: we now load the INT8-quantised variant (`AllMiniLML6V2Q`) which
/// produces identical 384-dim vectors but runs ~3-4× faster on CPU ONNX
/// and ships as a ~22MB file (vs 86MB for the f32 model).
///
/// `FastEmbedder` reports this value from `dimension()`; it is a fixed
/// constant, not something read back from the loaded model.
pub const EMBED_DIM: usize = 384;
33
/// Default LRU cache capacity (number of cached texts), used by
/// `FastEmbedder::new`.
///
/// Picked to be large enough to keep the hot working set of repeat queries in
/// memory but small enough that the cache itself fits well inside L2/L3 on a
/// typical developer machine: 256 entries × 384 `f32`s is 384 KiB of vector
/// payload, not counting the key strings or map overhead.
pub const DEFAULT_CACHE_CAPACITY: usize = 256;
38
/// Tag naming the execution provider an embedder selected.
///
/// Why: operators want to confirm from a single log line whether the daemon
/// is running on CPU, CoreML (Apple GPU/ANE) or CUDA, without digging through
/// debug output.
/// What: a small `Copy` enum returned by `FastEmbedder::provider()`. Its
/// `Display` output is exactly [`ExecutionProvider::as_str`].
/// Test: `FastEmbedder::new()` on Apple Silicon should yield `CoreML`; on
/// other platforms it yields `Cpu` (or `Cuda` when the `cuda` feature is on).
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ExecutionProvider {
    /// Plain CPU execution.
    Cpu,
    /// Apple CoreML execution.
    CoreML,
    /// NVIDIA CUDA execution.
    Cuda,
}

impl ExecutionProvider {
    /// Stable, human-friendly label for this provider (`"CPU"`, `"CoreML"`
    /// or `"CUDA"`), suitable for log lines.
    pub fn as_str(&self) -> &'static str {
        match *self {
            Self::Cpu => "CPU",
            Self::CoreML => "CoreML",
            Self::Cuda => "CUDA",
        }
    }
}

impl std::fmt::Display for ExecutionProvider {
    /// Writes the label from [`ExecutionProvider::as_str`] verbatim; width
    /// and alignment flags on the formatter are not applied.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let label = self.as_str();
        f.write_str(label)
    }
}
69
70/// Abstraction over embedding backends.
71///
72/// Why: Decouple consumers from any one model so we can swap in remote APIs,
73/// quantised models, or deterministic mocks without changing call sites.
74/// What: a single primitive — `embed_batch` — plus a dimension accessor.
75/// Single-text callers should use the [`embed_one`] convenience helper.
76/// Test: covered by `FastEmbedder` and `MockEmbedder` tests below.
77#[async_trait]
78pub trait Embedder: Send + Sync {
79    /// Embed a batch of texts. Returns one `Vec<f32>` per input, each of
80    /// length `self.dimension()`. An empty input batch returns an empty Vec.
81    async fn embed_batch(&self, texts: &[String]) -> Result<Vec<Vec<f32>>>;
82
83    /// Output dimension of the produced embeddings.
84    fn dimension(&self) -> usize;
85}
86
87/// Convenience helper: embed a single text via `embed_batch` and return the
88/// lone vector.
89///
90/// Why: Most call sites only need one embedding at a time and writing
91/// `.embed_batch(&[text]).await?.into_iter().next()` everywhere is noise.
92/// What: builds a 1-element batch, calls `embed_batch`, returns the first
93/// vector (or errors if the embedder produced nothing).
94/// Test: covered indirectly by `mock_embedder_round_trip`.
95pub async fn embed_one(embedder: &dyn Embedder, text: &str) -> Result<Vec<f32>> {
96    let mut v = embedder.embed_batch(&[text.to_string()]).await?;
97    v.pop()
98        .context("embedder returned no embedding for non-empty input")
99}
100
/// Local embedder backed by fastembed-rs (ONNX runtime, all-MiniLM-L6-v2).
///
/// Why: Default to local-only embeddings so consumers have zero external
/// network dependency and predictable latency. The LRU cache keeps the hot
/// path free of redundant ONNX work for repeat strings (queries, common
/// chunks).
/// What: wraps a single `TextEmbedding` behind a `parking_lot::Mutex` (the
/// underlying `embed` requires `&mut self`) and an `LruCache<String, Vec<f32>>`.
/// Initialisation warms the ORT graph with a small batch so the first user
/// query doesn't pay the one-shot compile cost. Despite running locally, the
/// session is not necessarily CPU-bound: the selected backend is recorded in
/// `provider` and may be CoreML or CUDA.
/// Test: `fastembed_returns_correct_dim` and
/// `fastembed_cache_hit_is_idempotent` (marked `#[ignore]` — they download a
/// real model).
pub struct FastEmbedder {
    /// The fastembed session. `Arc` so a handle can be moved into
    /// `spawn_blocking`; `Mutex` because embedding needs exclusive access.
    model: Arc<Mutex<TextEmbedding>>,
    /// Text → embedding LRU cache consulted before any model call.
    cache: Arc<Mutex<LruCache<String, Vec<f32>>>>,
    /// Value reported by `dimension()`; set to `EMBED_DIM` at construction.
    dim: usize,
    /// Execution-provider tag chosen while building the session options.
    provider: ExecutionProvider,
}
119
120impl FastEmbedder {
121    /// Construct a new `FastEmbedder` with the default cache size.
122    pub async fn new() -> Result<Self> {
123        Self::with_cache_size(DEFAULT_CACHE_CAPACITY).await
124    }
125
126    /// Identifier for the execution provider this embedder is actually using.
127    ///
128    /// Why: callers (e.g. `trusty-search` startup logs) want to surface
129    /// whether the daemon is running on CPU or GPU/ANE without poking at
130    /// internals.
131    /// What: returns `ExecutionProvider::CoreML` on Apple Silicon (when EP
132    /// registration succeeded), otherwise `Cpu` (or `Cuda` if/when wired).
133    /// Test: covered by the public-surface compile check.
134    pub fn provider(&self) -> ExecutionProvider {
135        self.provider
136    }
137
138    /// Build `TextInitOptions` for the given model, attempting to register
139    /// the CoreML execution provider at runtime when on Apple Silicon.
140    ///
141    /// Why: We want zero-friction GPU/ANE acceleration on Apple Silicon
142    /// without forcing users to pass `--features coreml`. fastembed-rs accepts
143    /// a `Vec<ExecutionProviderDispatch>` via `with_execution_providers`, and
144    /// our `ort` dep (pinned to the exact `=2.0.0-rc.12` fastembed uses) has
145    /// the `coreml` feature on by default on macOS, so we can always try to
146    /// build and register CoreML at runtime. On non-Apple platforms, or if
147    /// CoreML registration fails for any reason, we transparently fall back
148    /// to the default CPU provider.
149    /// What: returns `(TextInitOptions, ExecutionProvider)` where the tag
150    /// reflects which backend was actually wired in.
151    /// Test: on an M-series Mac the tag is `CoreML`; on Intel/Linux/Windows
152    /// (or if CoreML build fails) the tag is `Cpu`.
153    fn init_options(model: EmbeddingModel) -> (TextInitOptions, ExecutionProvider) {
154        use ort::execution_providers::ExecutionProviderDispatch;
155
156        let opts = TextInitOptions::new(model);
157
158        // Always register an explicit CPU EP with the memory arena DISABLED.
159        //
160        // Why: ORT's default CPU memory arena pre-allocates a large contiguous
161        // slab sized to the peak tensor shape on first inference. For repos
162        // with 16k+ files this arena grows to 19-53 GB before any RSS soft cap
163        // can react (issue bobmatnyc/trusty-search#89). Disabling the arena
164        // forces per-inference allocations that are freed after each call,
165        // capping steady-state RSS at ~hundreds of MB instead of tens of GB.
166        let cpu_no_arena: ExecutionProviderDispatch =
167            ort::ep::CPU::default().with_arena_allocator(false).build();
168
169        // ──────────────────────────────────────────────────────────────────
170        // CUDA (Linux/Windows, NVIDIA GPU)
171        //
172        // Why: when the operator opts in with `--features cuda` and runs on a
173        // host with a CUDA-capable GPU, we should auto-prefer the CUDA EP so
174        // embedding throughput jumps from CPU-bound (~5h for a 40k-file repo)
175        // to GPU-bound (target <30 min). This mirrors the always-on CoreML
176        // pattern on Apple Silicon but is gated on the build-time `cuda`
177        // feature because the `ort/cuda` feature requires a CUDA toolkit at
178        // compile time. If the binary was built without `cuda`, this branch
179        // is compiled out entirely (no runtime cost, no link-time CUDA dep).
180        //
181        // Operator override: setting `TRUSTY_DEVICE=cpu` forces CPU even on a
182        // GPU-enabled binary. Useful for A/B benchmarking or for running on a
183        // host whose GPU is reserved for another workload.
184        // Test: on a g4dn.xlarge with `--features cuda` the provider tag
185        // resolves to `Cuda`; setting `TRUSTY_DEVICE=cpu` reverts to `Cpu`.
186        #[cfg(feature = "cuda")]
187        {
188            let force_cpu = std::env::var("TRUSTY_DEVICE")
189                .map(|v| v.eq_ignore_ascii_case("cpu"))
190                .unwrap_or(false);
191            if !force_cpu {
192                let cuda: ExecutionProviderDispatch = ort::ep::CUDA::default().build();
193                let providers: Vec<ExecutionProviderDispatch> = vec![cuda, cpu_no_arena];
194                tracing::info!(
195                    "trusty-embedder: registering CUDA + CPU(no-arena) execution providers \
196                     (will fall back to CPU at session-init if no CUDA device is available)"
197                );
198                return (
199                    opts.with_execution_providers(providers),
200                    ExecutionProvider::Cuda,
201                );
202            }
203            tracing::info!(
204                "trusty-embedder: TRUSTY_DEVICE=cpu set — skipping CUDA EP registration"
205            );
206        }
207
208        #[cfg(all(target_arch = "aarch64", target_os = "macos"))]
209        {
210            // Operator override: setting `TRUSTY_DEVICE=cpu` forces CPU even on
211            // Apple Silicon. This is the load-bearing escape hatch for the
212            // macOS jetsam kill (trusty-search#118 / blocking bug): CoreML on
213            // M-series allocates from the unified memory pool, which inflates
214            // *virtual* RSS to ~100+ GB during indexing of large repos
215            // (>~50 MB of source). macOS jetsam treats that virtual footprint
216            // as memory pressure and SIGKILLs the process, even though
217            // physical RAM is fine. Falling through to the CPU-only EP path
218            // (which already disables the ORT memory arena) keeps the
219            // footprint bounded — at the cost of slower embedding throughput.
220            // Operators who want GPU on Apple Silicon explicitly pass
221            // `--device auto` (default) or `--device gpu`.
222            let force_cpu = std::env::var("TRUSTY_DEVICE")
223                .map(|v| v.eq_ignore_ascii_case("cpu"))
224                .unwrap_or(false);
225            if !force_cpu {
226                let coreml: ExecutionProviderDispatch = ort::ep::CoreML::default().build();
227                // CoreML first (GPU/ANE), CPU-no-arena as fallback. The CPU EP
228                // still applies its session-level DisableCpuMemArena flag even
229                // when CoreML handles most ops, which is what prevents the spike.
230                let providers: Vec<ExecutionProviderDispatch> = vec![coreml, cpu_no_arena];
231                tracing::info!(
232                    "trusty-embedder: registering CoreML + CPU(no-arena) execution providers (Apple Silicon)"
233                );
234                return (
235                    opts.with_execution_providers(providers),
236                    ExecutionProvider::CoreML,
237                );
238            }
239            tracing::info!(
240                "trusty-embedder: TRUSTY_DEVICE=cpu set — skipping CoreML EP registration (Apple Silicon jetsam-kill avoidance)"
241            );
242        }
243
244        #[allow(unreachable_code)]
245        {
246            tracing::info!("trusty-embedder: registering CPU(no-arena) execution provider");
247            let providers: Vec<ExecutionProviderDispatch> = vec![cpu_no_arena];
248            (
249                opts.with_execution_providers(providers),
250                ExecutionProvider::Cpu,
251            )
252        }
253    }
254
255    /// Construct with an explicit LRU capacity.
256    pub async fn with_cache_size(capacity: usize) -> Result<Self> {
257        let capacity =
258            NonZeroUsize::new(capacity.max(1)).expect("capacity.max(1) is always non-zero");
259
260        // fastembed's `try_new` downloads + builds an ONNX session — blocking
261        // work that must run off the async reactor.
262        let (model, provider) =
263            tokio::task::spawn_blocking(|| -> Result<(TextEmbedding, ExecutionProvider)> {
264                // Honour the explicit `TRUSTY_DEVICE=gpu` requirement: when the
265                // operator asks for GPU, init_options will have selected an
266                // accelerated EP. If that EP fails to initialise (no GPU, no
267                // CUDA driver, etc.) AND the user did NOT explicitly require
268                // GPU, we transparently fall back to CPU. With `gpu` we
269                // surface the failure so the operator notices instead of
270                // silently running CPU-bound on a "GPU node".
271                let require_gpu = std::env::var("TRUSTY_DEVICE")
272                    .map(|v| v.eq_ignore_ascii_case("gpu"))
273                    .unwrap_or(false);
274
275                let (q_opts, q_provider) = Self::init_options(EmbeddingModel::AllMiniLML6V2Q);
276                let (m, provider) = match TextEmbedding::try_new(q_opts) {
277                    Ok(m) => (m, q_provider),
278                    Err(q_err) => {
279                        // Hardware-accelerated EP build failed — most often
280                        // "no CUDA device" or "CoreML EP not available". On a
281                        // best-effort tier (default), retry once with CPU only
282                        // so the daemon still starts. On `TRUSTY_DEVICE=gpu`
283                        // we propagate the original error.
284                        if q_provider != ExecutionProvider::Cpu && !require_gpu {
285                            tracing::warn!(
286                                "{} EP init failed ({q_err:#}); retrying with CPU-only \
287                                 execution provider",
288                                q_provider
289                            );
290                            // SAFETY: see TRUSTY_DEVICE comment in
291                            // init_options — the env mutation happens before
292                            // any worker thread reads it.
293                            unsafe { std::env::set_var("TRUSTY_DEVICE", "cpu") };
294                            let (cpu_opts, cpu_provider) =
295                                Self::init_options(EmbeddingModel::AllMiniLML6V2Q);
296                            match TextEmbedding::try_new(cpu_opts) {
297                                Ok(m) => (m, cpu_provider),
298                                Err(cpu_err) => {
299                                    tracing::warn!(
300                                        "AllMiniLML6V2Q init failed on CPU ({cpu_err:#}), \
301                                         falling back to AllMiniLML6V2"
302                                    );
303                                    let (fb_opts, fb_provider) =
304                                        Self::init_options(EmbeddingModel::AllMiniLML6V2);
305                                    let m = TextEmbedding::try_new(fb_opts).context(
306                                        "failed to initialise fastembed (tried CUDA→CPU on AllMiniLML6V2Q, then AllMiniLML6V2)",
307                                    )?;
308                                    (m, fb_provider)
309                                }
310                            }
311                        } else if require_gpu {
312                            return Err(anyhow::anyhow!(
313                                "TRUSTY_DEVICE=gpu requested but accelerated execution provider \
314                                 failed to initialise: {q_err:#}"
315                            ));
316                        } else {
317                            tracing::warn!(
318                                "AllMiniLML6V2Q init failed ({q_err:#}), falling back to AllMiniLML6V2"
319                            );
320                            let (fb_opts, fb_provider) =
321                                Self::init_options(EmbeddingModel::AllMiniLML6V2);
322                            let m = TextEmbedding::try_new(fb_opts).context(
323                                "failed to initialise fastembed (tried AllMiniLML6V2Q and AllMiniLML6V2)",
324                            )?;
325                            (m, fb_provider)
326                        }
327                    }
328                };
329                let mut m = m;
330
331                // Warm the graph so the first real user query is hot.
332                let warmup: Vec<&str> = vec![
333                    "hello world",
334                    "the quick brown fox",
335                    "memory palace warmup",
336                    "embedding model ready",
337                    "trusty common warmup",
338                ];
339                let _ = m
340                    .embed(warmup, None)
341                    .context("fastembed warmup batch failed")?;
342                Ok((m, provider))
343            })
344            .await
345            .context("spawn_blocking joined with error during embedder init")??;
346
347        tracing::info!(
348            "trusty-embedder: FastEmbedder ready (provider={}, dim={})",
349            provider,
350            EMBED_DIM
351        );
352
353        Ok(Self {
354            model: Arc::new(Mutex::new(model)),
355            cache: Arc::new(Mutex::new(LruCache::new(capacity))),
356            dim: EMBED_DIM,
357            provider,
358        })
359    }
360}
361
362#[async_trait]
363impl Embedder for FastEmbedder {
364    async fn embed_batch(&self, texts: &[String]) -> Result<Vec<Vec<f32>>> {
365        if texts.is_empty() {
366            return Ok(Vec::new());
367        }
368
369        // Split into cached hits vs misses.
370        let mut results: Vec<Option<Vec<f32>>> = vec![None; texts.len()];
371        let mut to_compute: Vec<(usize, String)> = Vec::new();
372        {
373            let mut cache = self.cache.lock();
374            for (i, t) in texts.iter().enumerate() {
375                if let Some(v) = cache.get(t) {
376                    results[i] = Some(v.clone());
377                } else {
378                    to_compute.push((i, t.clone()));
379                }
380            }
381        }
382
383        if !to_compute.is_empty() {
384            let model = Arc::clone(&self.model);
385            let owned: Vec<String> = to_compute.iter().map(|(_, s)| s.clone()).collect();
386            let computed = tokio::task::spawn_blocking(move || -> Result<Vec<Vec<f32>>> {
387                let mut guard = model.lock();
388                guard
389                    .embed(owned, None)
390                    .context("fastembed embed call failed")
391            })
392            .await
393            .context("spawn_blocking joined with error during embed")??;
394
395            if computed.len() != to_compute.len() {
396                anyhow::bail!(
397                    "fastembed returned {} embeddings, expected {}",
398                    computed.len(),
399                    to_compute.len()
400                );
401            }
402
403            let mut cache = self.cache.lock();
404            for ((idx, key), vector) in to_compute.into_iter().zip(computed.into_iter()) {
405                cache.put(key, vector.clone());
406                results[idx] = Some(vector);
407            }
408        }
409
410        results
411            .into_iter()
412            .map(|opt| opt.context("missing embedding slot after batch"))
413            .collect()
414    }
415
416    fn dimension(&self) -> usize {
417        self.dim
418    }
419}
420
/// Deterministic test double — hashes input bytes into a fixed-dim vector.
///
/// Why: ONNX model downloads dominate test runtime and can race on cold
/// caches when multiple tests construct embedders in parallel. The mock
/// gives integration tests a "rank by similarity" surface without any I/O.
/// What: a tiny per-byte hash spread across `dim` slots, with the first byte
/// additionally added to slot 0 so short strings still differ. The empty
/// string maps to the all-zero vector, and a `dim` of `0` yields empty
/// vectors rather than panicking.
/// Test: `mock_embedder_round_trip` confirms shape + determinism.
#[cfg(any(test, feature = "test-support"))]
#[derive(Debug, Clone)]
pub struct MockEmbedder {
    /// Length of every vector produced by this mock.
    dim: usize,
}

#[cfg(any(test, feature = "test-support"))]
impl MockEmbedder {
    /// Create a mock that emits vectors of length `dim`.
    pub fn new(dim: usize) -> Self {
        Self { dim }
    }

    /// Map `text` to a deterministic vector of length `self.dim`.
    ///
    /// Byte `b` at index `i` adds `b / 255` to slot `(i + b) % dim`; the
    /// first byte is additionally added to slot 0.
    fn hash_to_vec(&self, text: &str) -> Vec<f32> {
        let mut v = vec![0.0_f32; self.dim];
        // A zero-width embedder has no slots: bail out before `% self.dim`
        // (division by zero) and `v[0]` (out of bounds) can panic.
        if self.dim == 0 {
            return v;
        }
        for (i, b) in text.bytes().enumerate() {
            let slot = (i + usize::from(b)) % self.dim;
            v[slot] += f32::from(b) / 255.0;
        }
        if let Some(first) = text.bytes().next() {
            v[0] += f32::from(first) / 255.0;
        }
        v
    }
}
452
#[cfg(any(test, feature = "test-support"))]
#[async_trait]
impl Embedder for MockEmbedder {
    /// Hash each text independently; never fails and performs no I/O.
    async fn embed_batch(&self, texts: &[String]) -> Result<Vec<Vec<f32>>> {
        let mut vectors = Vec::with_capacity(texts.len());
        for text in texts {
            vectors.push(self.hash_to_vec(text));
        }
        Ok(vectors)
    }

    /// Dimension passed to `MockEmbedder::new`.
    fn dimension(&self) -> usize {
        self.dim
    }
}
464
#[cfg(test)]
mod tests {
    use super::*;

    /// Single lock shared by every test that mutates `TRUSTY_DEVICE`.
    ///
    /// Why: the test harness runs tests on parallel threads and the
    /// environment is process-global. A `static` declared inside each test
    /// function would be a separate lock per test and serialise nothing.
    #[cfg(all(target_arch = "aarch64", target_os = "macos"))]
    static ENV_LOCK: std::sync::Mutex<()> = std::sync::Mutex::new(());

    /// RAII override for `TRUSTY_DEVICE`: holds `ENV_LOCK` for its lifetime
    /// and restores the previous value on drop, including when the owning
    /// test panics on a failed assertion.
    #[cfg(all(target_arch = "aarch64", target_os = "macos"))]
    struct TrustyDeviceOverride {
        previous: Option<String>,
        _lock: std::sync::MutexGuard<'static, ()>,
    }

    #[cfg(all(target_arch = "aarch64", target_os = "macos"))]
    impl TrustyDeviceOverride {
        /// Set `TRUSTY_DEVICE` to `value`, or remove it when `value` is `None`.
        fn apply(value: Option<&str>) -> Self {
            // A sibling test that failed while holding the lock poisons it;
            // the protected state is just `()`, so recover and carry on.
            let lock = ENV_LOCK
                .lock()
                .unwrap_or_else(|poisoned| poisoned.into_inner());
            let previous = std::env::var("TRUSTY_DEVICE").ok();
            // SAFETY: every test in this module that writes `TRUSTY_DEVICE`
            // does so through this guard while holding `ENV_LOCK`, so writers
            // never overlap. The only other readers here are the `#[ignore]`d
            // ONNX tests, which are not part of a default run.
            unsafe {
                match value {
                    Some(v) => std::env::set_var("TRUSTY_DEVICE", v),
                    None => std::env::remove_var("TRUSTY_DEVICE"),
                }
            }
            Self {
                previous,
                _lock: lock,
            }
        }
    }

    #[cfg(all(target_arch = "aarch64", target_os = "macos"))]
    impl Drop for TrustyDeviceOverride {
        fn drop(&mut self) {
            // SAFETY: `_lock` is still held here (fields are dropped only
            // after this body returns), so the restore is serialised exactly
            // like the initial write.
            unsafe {
                match self.previous.take() {
                    Some(v) => std::env::set_var("TRUSTY_DEVICE", v),
                    None => std::env::remove_var("TRUSTY_DEVICE"),
                }
            }
        }
    }

    #[tokio::test]
    async fn mock_embedder_round_trip() {
        let e = MockEmbedder::new(EMBED_DIM);
        assert_eq!(e.dimension(), EMBED_DIM);
        let v = embed_one(&e, "hello").await.unwrap();
        assert_eq!(v.len(), EMBED_DIM);
        let batch = e
            .embed_batch(&["a".to_string(), "b".to_string()])
            .await
            .unwrap();
        assert_eq!(batch.len(), 2);
        assert_ne!(batch[0], batch[1]);
    }

    #[tokio::test]
    async fn mock_embedder_empty_input_returns_empty() {
        let e = MockEmbedder::new(EMBED_DIM);
        let v = e.embed_batch(&[]).await.unwrap();
        assert!(v.is_empty());
    }

    // ONNX-backed test: downloads ~23MB on first run. Marked ignored so default
    // `cargo test` stays offline; run with `cargo test -- --ignored` when needed.
    #[tokio::test]
    #[ignore]
    async fn fastembed_returns_correct_dim() {
        let e = FastEmbedder::new().await.unwrap();
        assert_eq!(e.dimension(), EMBED_DIM);
        let v = embed_one(&e, "fn authenticate(user: &str) -> bool")
            .await
            .unwrap();
        assert_eq!(v.len(), EMBED_DIM);
        assert!(v.iter().any(|x| *x != 0.0));
    }

    #[tokio::test]
    #[ignore]
    async fn fastembed_cache_hit_is_idempotent() {
        let e = FastEmbedder::new().await.unwrap();
        let v1 = embed_one(&e, "cached").await.unwrap();
        let v2 = embed_one(&e, "cached").await.unwrap();
        assert_eq!(v1, v2);
    }

    /// Why: `TRUSTY_DEVICE=cpu` MUST suppress CoreML EP registration on Apple
    /// Silicon. CoreML on M-series uses the unified memory pool and inflates
    /// virtual RSS to ~100 GB during indexing of large repos, which triggers
    /// macOS jetsam SIGKILL even though physical RAM is fine. The override is
    /// the operator's escape hatch, so `init_options` must honour it.
    /// What: sets `TRUSTY_DEVICE=cpu` under the shared env lock, calls
    /// `init_options`, and asserts the returned `ExecutionProvider` is `Cpu`.
    /// The previous value is restored when the guard drops.
    #[cfg(all(target_arch = "aarch64", target_os = "macos"))]
    #[test]
    fn trusty_device_cpu_disables_coreml_on_apple_silicon() {
        let _env = TrustyDeviceOverride::apply(Some("cpu"));

        let (_opts, provider) = FastEmbedder::init_options(EmbeddingModel::AllMiniLML6V2Q);
        assert_eq!(
            provider,
            ExecutionProvider::Cpu,
            "TRUSTY_DEVICE=cpu must suppress CoreML EP on Apple Silicon"
        );
    }

    /// Why: counterpart to the test above — confirms the default path still
    /// registers CoreML when `TRUSTY_DEVICE` is unset, so we don't regress
    /// GPU acceleration for operators who *want* it.
    /// What: clears `TRUSTY_DEVICE` under the shared env lock, calls
    /// `init_options`, asserts `CoreML`. The previous value is restored when
    /// the guard drops.
    /// Test: this test, on M-series Mac.
    #[cfg(all(target_arch = "aarch64", target_os = "macos"))]
    #[test]
    fn default_apple_silicon_uses_coreml() {
        let _env = TrustyDeviceOverride::apply(None);

        let (_opts, provider) = FastEmbedder::init_options(EmbeddingModel::AllMiniLML6V2Q);
        assert_eq!(
            provider,
            ExecutionProvider::CoreML,
            "default behaviour on Apple Silicon must still register CoreML"
        );
    }
}