blazen-embed-tract 0.5.2

Pure-Rust ONNX inference backend for Blazen embeddings (tract-onnx)
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
//! The [`TractEmbedModel`] type providing local embeddings via `tract-onnx`.
//!
//! This is the pure-Rust counterpart to `blazen-embed-fastembed`. It loads the
//! same model catalog via `tract-onnx` instead of `onnxruntime`, so it builds
//! on targets where ONNX Runtime's prebuilt binaries are unavailable (musl,
//! WASM, etc.). The public API mirrors `FastEmbedModel` so callers can swap
//! backends without touching their own code.

use std::path::PathBuf;
use std::sync::{Arc, Mutex};

use tract_onnx::prelude::*;

use crate::options::{ModelInfo, Pooling, TractOptions, lookup};

/// Error type for tract embedding operations.
///
/// Variants mirror `blazen_embed_fastembed::FastEmbedError` so consumers can
/// swap backends without touching error-handling code. Each variant carries a
/// human-readable detail string that is interpolated into its `Display`
/// message.
#[derive(Debug, thiserror::Error)]
pub enum TractError {
    /// The caller requested a model name that is not in our registry. The
    /// payload is the requested name, or `"<none>"` when no name was given.
    #[error("unknown tract embed model: {0}")]
    UnknownModel(String),

    /// Setting up the model failed during [`TractEmbedModel::from_options`]:
    /// initialising the model cache, downloading the weights or tokenizer,
    /// loading the tokenizer, or parsing/optimizing the ONNX graph.
    #[error("tract model init failed: {0}")]
    Init(String),

    /// Tokenizing the input, running the ONNX graph, or pooling the output
    /// failed during an embed call.
    #[error("tract embed failed: {0}")]
    Embed(String),

    /// The internal mutex guarding the tract model handle was poisoned by a
    /// previous panic.
    #[error("mutex poisoned: {0}")]
    MutexPoisoned(String),

    /// The blocking task that ran the tract pipeline did not complete
    /// (typically because it panicked). The payload is the join error text.
    #[error("blocking task panicked: {0}")]
    TaskPanicked(String),
}

/// Response from a tract embedding operation. Same shape as
/// `blazen_embed_fastembed::FastEmbedResponse` for drop-in compatibility.
#[derive(Debug, Clone)]
pub struct TractResponse {
    /// The embedding vectors — one per input text, in input order.
    pub embeddings: Vec<Vec<f32>>,
    /// The model identifier that produced these embeddings (the registry's
    /// `model_code`, typically the Hugging Face repo id).
    pub model: String,
}

/// Type alias for the runnable tract graph we hold onto: a `SimplePlan` over
/// a typed graph (`TypedFact` facts, boxed `TypedOp` operators). Bare
/// `SimplePlan` satisfies every tract version we've tested — the concrete
/// generics match what `into_runnable()` returns after `into_optimized()`.
type TractModel = SimplePlan<TypedFact, Box<dyn TypedOp>, Graph<TypedFact, Box<dyn TypedOp>>>;

/// A local embedding model backed by [`tract_onnx`] (pure Rust ONNX inference).
///
/// Wraps a loaded ONNX graph plus its tokenizer. All inference is dispatched
/// onto [`tokio::task::spawn_blocking`] because tract runs synchronously and
/// is CPU-bound.
pub struct TractEmbedModel {
    /// The runnable tract plan. The `Arc` lets each embed call hand a handle
    /// to its `spawn_blocking` closure; the `Mutex` serializes `run()` calls.
    /// `run()` takes `&self` and the underlying plan itself is already
    /// internally re-entrant, but we serialize to keep scratch-buffer reuse
    /// predictable.
    model: Arc<Mutex<TractModel>>,
    /// The tokenizer. `tokenizers::Tokenizer` is `Send + Sync` so a bare `Arc`
    /// is sufficient — no mutex needed.
    tokenizer: Arc<tokenizers::Tokenizer>,
    /// Hugging Face repo id used to load the weights. Exposed via
    /// [`Self::model_id`].
    model_id: String,
    /// Output embedding dimensionality. Exposed via [`Self::dimensions`].
    dims: usize,
    /// Pooling strategy applied to the final hidden states tensor.
    pooling: Pooling,
    /// Maximum sequence length fed to the graph; longer encodings are cut to
    /// this many tokens. Tract graphs in fastembed's registry are all trained
    /// with the BERT-family default of 512, so `from_options` hardcodes that
    /// value.
    max_length: usize,
    /// How many input texts to batch into a single forward pass. `None` means
    /// "run the whole input vector in one pass"; a value of zero is treated
    /// as one.
    batch_size: Option<usize>,
    /// Number of model inputs. BERT-family graphs expect 3 (`input_ids`,
    /// `attention_mask`, `token_type_ids`); distilled or 2-input variants
    /// (common for sentence-transformers) expect 2. We inspect this at load
    /// time and feed the matching tensors at inference time: three tensors
    /// when the count is at least 3, otherwise two.
    input_count: usize,
}

// `Debug` is written by hand: the tract plan holds boxed trait objects that
// do not implement `Debug`, so it cannot be derived. Only the identity and
// configuration fields are printed; the graph and tokenizer handles are
// deliberately omitted, which `finish_non_exhaustive` signals with `..`.
impl std::fmt::Debug for TractEmbedModel {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let Self {
            model_id,
            dims,
            pooling,
            max_length,
            batch_size,
            input_count,
            ..
        } = self;

        let mut builder = f.debug_struct("TractEmbedModel");
        builder.field("model_id", model_id);
        builder.field("dims", dims);
        builder.field("pooling", pooling);
        builder.field("max_length", max_length);
        builder.field("batch_size", batch_size);
        builder.field("input_count", input_count);
        builder.finish_non_exhaustive()
    }
}

impl TractEmbedModel {
    /// Construct a model from [`TractOptions`], fetching any missing files.
    ///
    /// The model name is resolved through the registry, the ONNX graph and
    /// `tokenizer.json` are obtained through the model cache, and the graph
    /// is optimized into a runnable plan.
    ///
    /// The downloads themselves are asynchronous, but this constructor is
    /// synchronous: it blocks until `block_on_downloads` has driven them to
    /// completion. That keeps the signature aligned with
    /// `FastEmbedModel::from_options` so the two backends stay swappable.
    ///
    /// # Errors
    ///
    /// * [`TractError::UnknownModel`] when the requested name has no registry
    ///   entry.
    /// * [`TractError::Init`] when the cache cannot be created, a download
    ///   fails, the tokenizer cannot be loaded, or the ONNX graph cannot be
    ///   parsed, optimized, or made runnable.
    pub fn from_options(opts: TractOptions) -> Result<Self, TractError> {
        // Exhaustive destructuring: a new `TractOptions` field becomes a
        // compile error here instead of being silently ignored.
        let TractOptions {
            model_name,
            cache_dir,
            max_batch_size,
            show_download_progress: _,
        } = opts;

        let requested = model_name.as_deref();
        let Some(info) = lookup(requested) else {
            return Err(TractError::UnknownModel(
                requested.unwrap_or("<none>").to_string(),
            ));
        };

        // An explicit directory wins; otherwise fall back to the default
        // cache location, which can fail to initialise.
        let cache = match cache_dir {
            Some(dir) => blazen_model_cache::ModelCache::with_dir(dir),
            None => blazen_model_cache::ModelCache::new()
                .map_err(|e| TractError::Init(format!("cache init failed: {e}")))?,
        };

        // Blocks until every required file is on disk.
        let (onnx_path, tokenizer_path) = block_on_downloads(&cache, info)?;

        let tokenizer = match tokenizers::Tokenizer::from_file(&tokenizer_path) {
            Ok(loaded) => loaded,
            Err(e) => return Err(TractError::Init(format!("tokenizer load failed: {e}"))),
        };

        // Parse -> optimize -> plan, with a distinct message per stage.
        let parsed = tract_onnx::onnx()
            .model_for_path(&onnx_path)
            .map_err(|e| TractError::Init(format!("onnx parse failed: {e}")))?;
        let optimized = parsed
            .into_optimized()
            .map_err(|e| TractError::Init(format!("onnx optimize failed: {e}")))?;
        let plan = optimized
            .into_runnable()
            .map_err(|e| TractError::Init(format!("onnx runnable failed: {e}")))?;

        // The graph's arity decides at inference time whether
        // `token_type_ids` is fed (3 inputs) or not (2 inputs).
        let input_count = plan.model().inputs.len();

        Ok(Self {
            model: Arc::new(Mutex::new(plan)),
            tokenizer: Arc::new(tokenizer),
            model_id: info.model_code.to_string(),
            dims: info.dim,
            pooling: info.pooling,
            max_length: 512,
            batch_size: max_batch_size,
            input_count,
        })
    }

    /// Identifier of the model this instance was loaded from, e.g.
    /// `"Xenova/bge-small-en-v1.5"`.
    #[must_use]
    pub fn model_id(&self) -> &str {
        self.model_id.as_str()
    }

    /// Length of every vector in [`TractResponse::embeddings`].
    #[must_use]
    pub fn dimensions(&self) -> usize {
        self.dims
    }

    /// Embed `texts`, producing one L2-normalized vector per input, in input
    /// order.
    ///
    /// An empty slice short-circuits to an empty response. Otherwise the
    /// tokenization and the forward pass run inside
    /// [`tokio::task::spawn_blocking`], since tract is synchronous CPU-bound
    /// work that must stay off the async worker threads.
    ///
    /// # Errors
    ///
    /// * [`TractError::Embed`] on tokenization or inference failure.
    /// * [`TractError::MutexPoisoned`] if an earlier panic poisoned the model
    ///   lock.
    /// * [`TractError::TaskPanicked`] if the blocking task itself does not
    ///   complete.
    pub async fn embed(&self, texts: &[String]) -> Result<TractResponse, TractError> {
        if texts.is_empty() {
            return Ok(TractResponse {
                embeddings: Vec::new(),
                model: self.model_id.clone(),
            });
        }

        // Everything the blocking closure touches must be owned (`'static`),
        // so clone the handles and copy the scalar configuration up front.
        let plan = Arc::clone(&self.model);
        let tokenizer = Arc::clone(&self.tokenizer);
        let owned_texts: Vec<String> = texts.to_vec();
        let (dims, pooling, max_length, batch_size, input_count) = (
            self.dims,
            self.pooling,
            self.max_length,
            self.batch_size,
            self.input_count,
        );
        let model_id = self.model_id.clone();

        let job = move || {
            embed_blocking(
                &plan,
                &tokenizer,
                &owned_texts,
                dims,
                pooling,
                max_length,
                batch_size,
                input_count,
            )
        };

        // Outer layer: did the task complete? Inner layer: did embedding
        // succeed?
        let embeddings = match tokio::task::spawn_blocking(job).await {
            Ok(outcome) => outcome?,
            Err(join_err) => return Err(TractError::TaskPanicked(join_err.to_string())),
        };

        Ok(TractResponse {
            embeddings,
            model: model_id,
        })
    }
}

/// Run the async `ModelCache` downloads from a sync context, returning the
/// paths to the ONNX file and the tokenizer file. All `additional_files` are
/// fetched too (and kept co-located on disk next to the main model file so
/// tract can find them via relative path lookups), but their paths are not
/// returned because tract doesn't need to know them explicitly.
///
/// How the downloads are driven depends on the calling context:
///
/// * **No ambient tokio runtime** — a temporary current-thread runtime is
///   built on the calling thread.
/// * **Ambient current-thread runtime** — `tokio::task::block_in_place`
///   panics on that flavor, so the downloads run on a short-lived scoped
///   helper thread with its own temporary runtime while the caller waits.
/// * **Any other ambient runtime** — the calling thread blocks via
///   `block_in_place` and drives the downloads on the existing runtime.
///
/// # Errors
///
/// Returns [`TractError::Init`] if any download fails, if a temporary runtime
/// or the helper thread cannot be created, or if the helper thread panics.
fn block_on_downloads(
    cache: &blazen_model_cache::ModelCache,
    info: &ModelInfo,
) -> Result<(PathBuf, PathBuf), TractError> {
    /// Fetch the model file, `tokenizer.json`, and every additional file.
    async fn download_all(
        cache: &blazen_model_cache::ModelCache,
        info: &ModelInfo,
    ) -> Result<(PathBuf, PathBuf), TractError> {
        let onnx = cache
            .download(info.model_code, info.model_file, None)
            .await
            .map_err(|e| {
                TractError::Init(format!("failed to download {}: {}", info.model_file, e))
            })?;

        let tokenizer = cache
            .download(info.model_code, "tokenizer.json", None)
            .await
            .map_err(|e| TractError::Init(format!("failed to download tokenizer.json: {e}")))?;

        for extra in info.additional_files {
            cache
                .download(info.model_code, extra, None)
                .await
                .map_err(|e| TractError::Init(format!("failed to download {extra}: {e}")))?;
        }

        Ok((onnx, tokenizer))
    }

    /// Drive the downloads on a throwaway current-thread runtime owned by the
    /// calling thread. Must not be called from a thread that is currently
    /// executing inside a runtime.
    fn download_on_fresh_runtime(
        cache: &blazen_model_cache::ModelCache,
        info: &ModelInfo,
    ) -> Result<(PathBuf, PathBuf), TractError> {
        let rt = tokio::runtime::Builder::new_current_thread()
            .enable_all()
            .build()
            .map_err(|e| TractError::Init(format!("runtime build failed: {e}")))?;
        rt.block_on(download_all(cache, info))
    }

    let Ok(handle) = tokio::runtime::Handle::try_current() else {
        // Outside any runtime: build a single-thread runtime ad hoc.
        return download_on_fresh_runtime(cache, info);
    };

    match handle.runtime_flavor() {
        // `block_in_place` panics on a current-thread runtime, and blocking
        // on the handle from the runtime's own thread is not allowed either.
        // Run the downloads on a scoped helper thread instead; the scope
        // lets the thread borrow `cache` and `info` without `'static`.
        tokio::runtime::RuntimeFlavor::CurrentThread => std::thread::scope(|scope| {
            std::thread::Builder::new()
                .name("blazen-embed-tract-download".to_string())
                .spawn_scoped(scope, move || download_on_fresh_runtime(cache, info))
                .map_err(|e| TractError::Init(format!("download thread spawn failed: {e}")))?
                .join()
                .map_err(|_| TractError::Init("download thread panicked".to_string()))?
        }),
        // Prefer the current runtime so we don't double-schedule inside an
        // already-running reactor. `block_in_place` lets us block this worker
        // thread without starving the runtime (it moves other tasks off this
        // thread).
        _ => tokio::task::block_in_place(|| handle.block_on(download_all(cache, info))),
    }
}

/// The synchronous hot path. Invoked from [`tokio::task::spawn_blocking`] —
/// must not touch the async runtime.
///
/// `texts` is processed in chunks of `batch_size` (the whole slice when
/// `None`; zero is treated as one). Each chunk is tokenized, padded to the
/// chunk's longest encoding (capped at `max_length`), run through the graph
/// under the model mutex, pooled, and L2-normalized. A chunk whose encodings
/// are all empty yields all-zero vectors of length `dims` so the output still
/// has one row per input.
///
/// `input_count` is the graph's number of inputs: with three or more,
/// `token_type_ids` is fed in addition to `input_ids` and `attention_mask`.
///
/// # Errors
///
/// Returns [`TractError::Embed`] for tokenization, tensor construction,
/// inference, or pooling failures, and [`TractError::MutexPoisoned`] if the
/// model lock is poisoned.
#[allow(clippy::too_many_arguments)] // internal helper, arity is load-bearing
fn embed_blocking(
    model_handle: &Mutex<TractModel>,
    tokenizer: &tokenizers::Tokenizer,
    texts: &[String],
    dims: usize,
    pooling: Pooling,
    max_length: usize,
    batch_size: Option<usize>,
    input_count: usize,
) -> Result<Vec<Vec<f32>>, TractError> {
    // Chunk inputs by batch_size so memory stays bounded on large calls.
    let chunk_size = batch_size.unwrap_or(texts.len()).max(1);
    let mut all_embeddings: Vec<Vec<f32>> = Vec::with_capacity(texts.len());

    // Convention for BERT-family ONNX: [input_ids, attention_mask,
    // token_type_ids]; 2-input graphs take only the first two.
    let needs_type_ids = input_count >= 3;

    for chunk in texts.chunks(chunk_size) {
        // Tokenize the chunk by reference — the tokenizer accepts `&str`, so
        // there is no need to clone every input string again.
        // `encode_batch` returns one Encoding per input.
        let chunk_refs: Vec<&str> = chunk.iter().map(String::as_str).collect();
        let encodings = tokenizer
            .encode_batch(chunk_refs, true)
            .map_err(|e| TractError::Embed(format!("tokenize failed: {e}")))?;

        let batch = encodings.len();

        // Determine padded sequence length for this chunk: min of longest
        // encoding and the model's max_length.
        let seq_len = encodings
            .iter()
            .map(tokenizers::Encoding::len)
            .max()
            .unwrap_or(0)
            .min(max_length);

        if seq_len == 0 {
            // All inputs were empty post-tokenization — emit zero vectors so
            // the caller still gets one output per input.
            all_embeddings.extend(std::iter::repeat_with(|| vec![0.0; dims]).take(batch));
            continue;
        }

        // Build flat, zero-padded [batch * seq_len] i64 buffers. Each row is
        // filled from its encoding and truncated to `seq_len`.
        let mut input_ids = vec![0_i64; batch * seq_len];
        let mut attention_mask = vec![0_i64; batch * seq_len];
        for ((ids_row, mask_row), enc) in input_ids
            .chunks_exact_mut(seq_len)
            .zip(attention_mask.chunks_exact_mut(seq_len))
            .zip(&encodings)
        {
            write_padded_row(ids_row, enc.get_ids());
            write_padded_row(mask_row, enc.get_attention_mask());
        }

        // Only materialize token_type_ids when the graph actually takes them.
        let token_type_ids = needs_type_ids.then(|| {
            let mut buf = vec![0_i64; batch * seq_len];
            for (row, enc) in buf.chunks_exact_mut(seq_len).zip(&encodings) {
                write_padded_row(row, enc.get_type_ids());
            }
            buf
        });

        // Wrap into tract tensors via tract_ndarray (tract's re-export of
        // ndarray). The ids array is moved straight into its tensor; the mask
        // array is kept because pooling reads it after the forward pass.
        let ids_tensor: Tensor =
            tract_ndarray::Array2::from_shape_vec((batch, seq_len), input_ids)
                .map_err(|e| TractError::Embed(format!("ids reshape failed: {e}")))?
                .into();
        let mask_arr = tract_ndarray::Array2::from_shape_vec((batch, seq_len), attention_mask)
            .map_err(|e| TractError::Embed(format!("mask reshape failed: {e}")))?;
        let mask_tensor: Tensor = mask_arr.clone().into();

        // Assemble inputs matching the graph's arity.
        let mut inputs: TVec<TValue> = tvec!(ids_tensor.into(), mask_tensor.into());
        if let Some(type_ids) = token_type_ids {
            let types_tensor: Tensor =
                tract_ndarray::Array2::from_shape_vec((batch, seq_len), type_ids)
                    .map_err(|e| TractError::Embed(format!("types reshape failed: {e}")))?
                    .into();
            inputs.push(types_tensor.into());
        }

        // Run the graph under the mutex; the guard is dropped as soon as the
        // forward pass returns so pooling happens outside the lock.
        let outputs = {
            let locked = model_handle
                .lock()
                .map_err(|e| TractError::MutexPoisoned(e.to_string()))?;
            locked
                .run(inputs)
                .map_err(|e| TractError::Embed(format!("tract run failed: {e}")))?
        };

        let hidden = outputs
            .first()
            .ok_or_else(|| TractError::Embed("no outputs from tract graph".to_string()))?;

        // Expect shape [batch, seq_len, hidden]. Some graphs emit the pooled
        // vector directly as [batch, hidden] — handle that too.
        let view = hidden
            .to_array_view::<f32>()
            .map_err(|e| TractError::Embed(format!("output view failed: {e}")))?;

        let pooled: Vec<Vec<f32>> = match view.ndim() {
            3 => {
                // [batch, seq_len, hidden] — apply the requested pooling.
                let array = view
                    .view()
                    .into_dimensionality::<tract_ndarray::Ix3>()
                    .map_err(|e| TractError::Embed(format!("output ndim coerce failed: {e}")))?;
                pool_hidden_states(array, &mask_arr, pooling, dims)?
            }
            2 => {
                // [batch, hidden] — already pooled, pass through.
                let array = view
                    .view()
                    .into_dimensionality::<tract_ndarray::Ix2>()
                    .map_err(|e| TractError::Embed(format!("output ndim coerce failed: {e}")))?;
                array
                    .outer_iter()
                    .map(|row| row.iter().copied().collect::<Vec<f32>>())
                    .collect()
            }
            other => {
                return Err(TractError::Embed(format!(
                    "unexpected output rank {other}, expected 2 or 3"
                )));
            }
        };

        // L2-normalize each row so cosine similarity == dot product downstream.
        all_embeddings.extend(pooled.into_iter().map(|mut row| {
            l2_normalize(&mut row);
            row
        }));
    }

    Ok(all_embeddings)
}

/// Copy `src` into the front of `dst`, widening each value to `i64`.
///
/// Copies `min(dst.len(), src.len())` elements: a longer `src` is truncated
/// and any remaining tail of `dst` is left untouched (callers pass
/// zero-initialized rows, so the tail acts as padding).
fn write_padded_row(dst: &mut [i64], src: &[u32]) {
    for (slot, &value) in dst.iter_mut().zip(src) {
        *slot = i64::from(value);
    }
}

/// Collapse `[batch, seq_len, hidden]` into `[batch, hidden]` using the given
/// pooling strategy and the attention mask.
///
/// * [`Pooling::Cls`] takes the hidden state at sequence position 0 of every
///   row; the mask is not consulted.
/// * [`Pooling::Mean`] averages the hidden states weighted by `mask`, so
///   positions whose mask value is 0 do not contribute. A row whose mask is
///   all zeros yields an all-zero vector.
///
/// # Errors
///
/// Returns [`TractError::Embed`] when the hidden size differs from `dims`,
/// when CLS pooling is requested on an empty sequence axis, or when mean
/// pooling would read outside `mask`. The latter two would otherwise be
/// index panics.
fn pool_hidden_states(
    hidden: tract_ndarray::ArrayView3<f32>,
    mask: &tract_ndarray::Array2<i64>,
    pooling: Pooling,
    dims: usize,
) -> Result<Vec<Vec<f32>>, TractError> {
    let (batch, seq_len, hidden_dim) = hidden.dim();
    if hidden_dim != dims {
        return Err(TractError::Embed(format!(
            "model output hidden size {hidden_dim} != expected dim {dims}"
        )));
    }

    let mut out: Vec<Vec<f32>> = Vec::with_capacity(batch);

    match pooling {
        Pooling::Cls => {
            // Slicing position 0 of an empty sequence axis would panic.
            if batch > 0 && seq_len == 0 {
                return Err(TractError::Embed(
                    "cannot CLS-pool model output with an empty sequence axis".to_string(),
                ));
            }
            // First token per row.
            out.extend((0..batch).map(|b| {
                hidden
                    .slice(tract_ndarray::s![b, 0_usize, ..])
                    .iter()
                    .copied()
                    .collect::<Vec<f32>>()
            }));
        }
        Pooling::Mean => {
            // Every (b, t) visited below must exist in the mask; otherwise
            // indexing it would panic inside the blocking task.
            let (mask_rows, mask_cols) = mask.dim();
            let reads_mask = batch > 0 && seq_len > 0;
            if reads_mask && (batch > mask_rows || seq_len > mask_cols) {
                return Err(TractError::Embed(format!(
                    "model output shape [{batch}, {seq_len}] exceeds attention mask shape \
                     [{mask_rows}, {mask_cols}]"
                )));
            }

            // Sum hidden[b, t, :] * mask[b, t] over t, divide by sum(mask[b, :]).
            for b in 0..batch {
                let mut acc = vec![0.0_f32; hidden_dim];
                let mut weight_sum: f32 = 0.0;
                for t in 0..seq_len {
                    #[allow(clippy::cast_precision_loss)]
                    let w = mask[[b, t]] as f32;
                    if w == 0.0 {
                        continue;
                    }
                    weight_sum += w;
                    let token = hidden.slice(tract_ndarray::s![b, t, ..]);
                    for (sum, &value) in acc.iter_mut().zip(token.iter()) {
                        *sum += value * w;
                    }
                }
                // Clamp so a fully masked row divides by a tiny epsilon
                // instead of zero.
                let denom = weight_sum.max(1e-12);
                for v in &mut acc {
                    *v /= denom;
                }
                out.push(acc);
            }
        }
    }

    Ok(out)
}

/// Scale `v` in place so that its Euclidean (L2) norm becomes 1.
///
/// The norm is clamped to at least `1e-12` before dividing, so an all-zero
/// (or empty) slice stays as it is instead of turning into NaNs.
fn l2_normalize(v: &mut [f32]) {
    let sum_of_squares: f32 = v.iter().map(|component| component * component).sum();
    let scale = sum_of_squares.sqrt().max(1e-12);
    v.iter_mut().for_each(|component| *component /= scale);
}

#[cfg(test)]
mod tests {
    use super::*;

    // `from_options` blocks synchronously on the model download, which can go
    // through `tokio::task::block_in_place`. That call panics on a
    // current-thread runtime, so every async test that builds a model must
    // run on the multi-thread flavor.
    #[tokio::test(flavor = "multi_thread", worker_threads = 2)]
    async fn embed_empty_input_returns_empty() {
        // We can construct a fake model without downloading by hand-building
        // the struct — but that's fragile. Instead, skip if the default model
        // cannot be loaded (e.g. offline with a cold cache); otherwise,
        // exercise the empty path.
        let Ok(model) = TractEmbedModel::from_options(TractOptions::default()) else {
            eprintln!("skipping embed_empty_input_returns_empty: model not available");
            return;
        };
        let response = model.embed(&[]).await.expect("empty embed should succeed");
        assert!(response.embeddings.is_empty());
        assert_eq!(response.model, model.model_id());
    }

    #[test]
    fn unknown_model_name_is_rejected() {
        let opts = TractOptions {
            model_name: Some("NotARealModel".to_string()),
            ..TractOptions::default()
        };
        let err = TractEmbedModel::from_options(opts).unwrap_err();
        assert!(matches!(err, TractError::UnknownModel(_)));
    }

    // Multi-thread flavor for the same reason as above: the default
    // `#[tokio::test]` runtime is current-thread, where `block_in_place`
    // panics before the model is ever loaded.
    #[tokio::test(flavor = "multi_thread", worker_threads = 2)]
    #[ignore = "requires model download from HuggingFace"]
    async fn embed_returns_correct_count_and_dims() {
        let model = TractEmbedModel::from_options(TractOptions::default())
            .expect("should create model with default options");
        let response = model
            .embed(&["hello".into(), "world".into()])
            .await
            .expect("embedding should succeed");
        assert_eq!(response.embeddings.len(), 2);
        assert_eq!(response.embeddings[0].len(), model.dimensions());
        // L2-normalized vectors have a norm of ~1.0.
        let norm: f32 = response.embeddings[0]
            .iter()
            .map(|x| x * x)
            .sum::<f32>()
            .sqrt();
        assert!((norm - 1.0).abs() < 1e-3, "expected ~1.0 norm, got {norm}");
    }
}