#![allow(clippy::unwrap_used, clippy::expect_used)]
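
//! CLIP multimodal embedding support: model registry, a serialized vector
//! index with brute-force L2 search, ONNX Runtime inference sessions, and
//! PDF page/image extraction helpers for visual search.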
use blake3::hash;
#[cfg(feature = "clip")]
use image::DynamicImage;
#[cfg(all(feature = "clip", not(feature = "pdfium")))]
use image::{ImageBuffer, Luma, Rgb};
#[cfg(all(feature = "clip", not(feature = "pdfium")))]
use lopdf::{Dictionary, Document, Object, ObjectId};
use serde::{Deserialize, Serialize};
use std::borrow::Cow;
#[cfg(all(feature = "clip", not(feature = "pdfium")))]
use std::collections::HashSet;
use std::path::{Path, PathBuf};
use std::time::Duration;

use crate::{MemvidError, Result, types::FrameId};
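
/// RAII guard that redirects stderr to `/dev/null` while noisy native code
/// (e.g. ONNX Runtime session construction) runs, restoring the original
/// descriptor on drop.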
#[cfg(all(feature = "clip", target_os = "macos"))]
mod stderr_suppress {
    use std::fs::File;
    use std::io;
    use std::os::unix::io::{AsRawFd, RawFd};

    pub struct StderrSuppressor {
        original_stderr: RawFd,
        #[allow(dead_code)]
        dev_null: File,
    }

    impl StderrSuppressor {
        pub fn new() -> io::Result<Self> {
            // Open /dev/null for writing: dup2'ing a read-only descriptor
            // onto stderr would make subsequent writes fail with EBADF.
            let dev_null = File::options().write(true).open("/dev/null")?;
            let original_stderr = unsafe { libc::dup(2) };
            if original_stderr == -1 {
                return Err(io::Error::last_os_error());
            }
            let result = unsafe { libc::dup2(dev_null.as_raw_fd(), 2) };
            if result == -1 {
                unsafe { libc::close(original_stderr) };
                return Err(io::Error::last_os_error());
            }
            Ok(Self {
                original_stderr,
                dev_null,
            })
        }
    }

    impl Drop for StderrSuppressor {
        fn drop(&mut self) {
            // Restore the saved stderr and close the duplicate descriptor.
            unsafe {
                libc::dup2(self.original_stderr, 2);
                libc::close(self.original_stderr);
            }
        }
    }
}

#[cfg(all(feature = "clip", not(target_os = "macos")))]
mod stderr_suppress {
    /// No-op stand-in so call sites compile unchanged off macOS.
    pub struct StderrSuppressor;
    impl StderrSuppressor {
        pub fn new() -> std::io::Result<Self> {
            Ok(Self)
        }
    }
}

/// Maximum byte length accepted when decoding a serialized CLIP index.
#[allow(clippy::cast_possible_truncation)]
const CLIP_DECODE_LIMIT: usize = crate::MAX_INDEX_BYTES as usize;

/// Embedding dimension produced by the MobileCLIP-S2 models.
pub const MOBILECLIP_DIMS: u32 = 512;

/// Embedding dimension produced by the SigLIP base model.
pub const SIGLIP_DIMS: u32 = 768;

/// Input resolution (pixels per side) expected by MobileCLIP-S2.
pub const MOBILECLIP_INPUT_SIZE: u32 = 256;

/// Input resolution (pixels per side) expected by SigLIP base.
pub const SIGLIP_INPUT_SIZE: u32 = 224;

/// Images narrower or shorter than this are skipped as junk.
pub const MIN_IMAGE_DIM: u32 = 64;

/// Images with a width/height ratio beyond this (either way) are skipped.
pub const MAX_ASPECT_RATIO: f32 = 10.0;

/// Minimum normalized color variance; near-uniform images are skipped.
pub const MIN_COLOR_VARIANCE: f32 = 0.01;

/// Idle period after which loaded ONNX sessions are eligible for unloading.
pub const MODEL_UNLOAD_TIMEOUT: Duration = Duration::from_secs(300);
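
/// Bincode configuration used for the CLIP index: fixed-width little-endian
/// integers keep the on-disk layout stable across platforms.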
fn clip_config() -> impl bincode::config::Config {
    bincode::config::standard()
        .with_fixed_int_encoding()
        .with_little_endian()
}

/// Metadata for a downloadable CLIP model variant.
#[derive(Debug, Clone)]
pub struct ClipModelInfo {
    /// Registry name, e.g. "mobileclip-s2".
    pub name: &'static str,
    /// Download URL for the ONNX vision encoder.
    pub vision_url: &'static str,
    /// Download URL for the ONNX text encoder.
    pub text_url: &'static str,
    /// Download URL for the tokenizer definition.
    pub tokenizer_url: &'static str,
    /// Approximate vision model size in megabytes.
    pub vision_size_mb: f32,
    /// Approximate text model size in megabytes.
    pub text_size_mb: f32,
    /// Output embedding dimension.
    pub dims: u32,
    /// Expected square input resolution in pixels.
    pub input_resolution: u32,
    /// Whether this entry is the fallback when no model is named.
    pub is_default: bool,
}
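
/// Registry of known CLIP model variants; exactly one entry is marked
/// `is_default` and is used when an unknown or empty name is requested.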
pub static CLIP_MODELS: &[ClipModelInfo] = &[
    ClipModelInfo {
        name: "mobileclip-s2-int8",
        vision_url: "https://huggingface.co/Xenova/mobileclip_s2/resolve/main/onnx/vision_model_int8.onnx",
        text_url: "https://huggingface.co/Xenova/mobileclip_s2/resolve/main/onnx/text_model_int8.onnx",
        tokenizer_url: "https://huggingface.co/Xenova/mobileclip_s2/resolve/main/tokenizer.json",
        vision_size_mb: 36.7,
        text_size_mb: 64.1,
        dims: MOBILECLIP_DIMS,
        input_resolution: MOBILECLIP_INPUT_SIZE,
        is_default: false,
    },
    ClipModelInfo {
        name: "siglip-base",
        vision_url: "https://huggingface.co/Xenova/siglip-base-patch16-224/resolve/main/onnx/vision_model_quantized.onnx",
        text_url: "https://huggingface.co/Xenova/siglip-base-patch16-224/resolve/main/onnx/text_model_quantized.onnx",
        tokenizer_url: "https://huggingface.co/Xenova/siglip-base-patch16-224/resolve/main/tokenizer.json",
        vision_size_mb: 99.5,
        text_size_mb: 111.0,
        dims: SIGLIP_DIMS,
        input_resolution: SIGLIP_INPUT_SIZE,
        is_default: false,
    },
    ClipModelInfo {
        name: "mobileclip-s2",
        vision_url: "https://huggingface.co/Xenova/mobileclip_s2/resolve/main/onnx/vision_model_fp16.onnx",
        text_url: "https://huggingface.co/Xenova/mobileclip_s2/resolve/main/onnx/text_model_fp16.onnx",
        tokenizer_url: "https://huggingface.co/Xenova/mobileclip_s2/resolve/main/tokenizer.json",
        vision_size_mb: 71.7,
        text_size_mb: 127.0,
        dims: MOBILECLIP_DIMS,
        input_resolution: MOBILECLIP_INPUT_SIZE,
        is_default: true,
    },
];

/// Looks up a model by name, falling back to the default entry when the
/// name is not in the registry.
#[must_use]
pub fn get_model_info(name: &str) -> &'static ClipModelInfo {
    CLIP_MODELS
        .iter()
        .find(|m| m.name == name)
        .unwrap_or_else(|| {
            CLIP_MODELS
                .iter()
                .find(|m| m.is_default)
                .expect("default model")
        })
}

/// Returns the registry entry marked as default.
#[must_use]
pub fn default_model_info() -> &'static ClipModelInfo {
    CLIP_MODELS
        .iter()
        .find(|m| m.is_default)
        .expect("default model exists")
}

/// A single embedded frame stored in the CLIP index.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ClipDocument {
    /// Frame this embedding belongs to.
    pub frame_id: FrameId,
    /// The embedding vector.
    pub embedding: Vec<f32>,
    /// Source page number, when the frame came from a paginated document.
    #[serde(default)]
    pub page: Option<u32>,
}
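
/// Accumulates per-frame embeddings and serializes them into a
/// [`ClipIndexArtifact`].
///
/// A minimal round-trip sketch (mirroring the tests at the bottom of this
/// module):
///
/// ```ignore
/// let mut builder = ClipIndexBuilder::new();
/// builder.add_document(1, None, vec![0.1, 0.2, 0.3, 0.4]);
/// let artifact = builder.finish()?;
/// let index = ClipIndex::decode(&artifact.bytes)?;
/// assert_eq!(index.len(), 1);
/// ```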
#[derive(Default)]
pub struct ClipIndexBuilder {
    documents: Vec<ClipDocument>,
}

impl ClipIndexBuilder {
    #[must_use]
    pub fn new() -> Self {
        Self::default()
    }

    pub fn add_document<I>(&mut self, frame_id: FrameId, page: Option<u32>, embedding: I)
    where
        I: Into<Vec<f32>>,
    {
        self.documents.push(ClipDocument {
            frame_id,
            embedding: embedding.into(),
            page,
        });
    }

    /// Serializes the accumulated documents and computes integrity metadata.
    pub fn finish(self) -> Result<ClipIndexArtifact> {
        let bytes = bincode::serde::encode_to_vec(&self.documents, clip_config())?;

        let checksum = *hash(&bytes).as_bytes();
        // The dimension is taken from the first document; an empty index
        // reports zero.
        let dimension = self
            .documents
            .first()
            .map_or(0, |doc| u32::try_from(doc.embedding.len()).unwrap_or(0));

        Ok(ClipIndexArtifact {
            bytes,
            vector_count: self.documents.len() as u64,
            dimension,
            checksum,
        })
    }
}
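
/// Encoded CLIP index bytes plus integrity metadata.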
#[derive(Debug, Clone)]
pub struct ClipIndexArtifact {
    /// Bincode-encoded `Vec<ClipDocument>`.
    pub bytes: Vec<u8>,
    /// Number of embeddings encoded.
    pub vector_count: u64,
    /// Embedding dimension (zero when the index is empty).
    pub dimension: u32,
    /// blake3 hash of `bytes`.
    pub checksum: [u8; 32],
}
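
/// In-memory CLIP vector index; search is a brute-force L2 scan over all
/// stored embeddings.
///
/// ```ignore
/// let hits = index.search(&query_embedding, 5);
/// for hit in &hits {
///     println!("frame {} page {:?} dist {}", hit.frame_id, hit.page, hit.distance);
/// }
/// ```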
#[derive(Debug, Clone)]
pub struct ClipIndex {
    documents: Vec<ClipDocument>,
}

impl Default for ClipIndex {
    fn default() -> Self {
        Self::new()
    }
}

impl ClipIndex {
    #[must_use]
    pub fn new() -> Self {
        Self {
            documents: Vec::new(),
        }
    }

    pub fn add_document<I>(&mut self, frame_id: FrameId, page: Option<u32>, embedding: I)
    where
        I: Into<Vec<f32>>,
    {
        self.documents.push(ClipDocument {
            frame_id,
            embedding: embedding.into(),
            page,
        });
    }

    /// Decodes an index produced by [`ClipIndexBuilder::finish`] or
    /// [`ClipIndex::encode`], rejecting inputs with trailing bytes.
    pub fn decode(bytes: &[u8]) -> Result<Self> {
        let (documents, read) = bincode::serde::decode_from_slice::<Vec<ClipDocument>, _>(
            bytes,
            bincode::config::standard()
                .with_fixed_int_encoding()
                .with_little_endian()
                .with_limit::<CLIP_DECODE_LIMIT>(),
        )?;

        if read != bytes.len() {
            return Err(MemvidError::InvalidToc {
                reason: Cow::Owned(format!(
                    "CLIP index decode: expected {} bytes, read {}",
                    bytes.len(),
                    read
                )),
            });
        }

        tracing::debug!(
            bytes_len = bytes.len(),
            docs_count = documents.len(),
            "decoded CLIP index"
        );

        Ok(Self { documents })
    }

    /// Returns up to `limit` hits sorted by ascending L2 distance.
    #[must_use]
    pub fn search(&self, query: &[f32], limit: usize) -> Vec<ClipSearchHit> {
        if query.is_empty() {
            return Vec::new();
        }

        let mut hits: Vec<ClipSearchHit> = self
            .documents
            .iter()
            .map(|doc| {
                let distance = l2_distance(query, &doc.embedding);
                ClipSearchHit {
                    frame_id: doc.frame_id,
                    page: doc.page,
                    distance,
                }
            })
            .collect();

        // NaN distances compare as equal so a malformed embedding cannot
        // panic the sort.
        hits.sort_by(|a, b| {
            a.distance
                .partial_cmp(&b.distance)
                .unwrap_or(std::cmp::Ordering::Equal)
        });
        hits.truncate(limit);
        hits
    }

    pub fn entries(&self) -> impl Iterator<Item = (FrameId, Option<u32>, &[f32])> + '_ {
        self.documents
            .iter()
            .map(|doc| (doc.frame_id, doc.page, doc.embedding.as_slice()))
    }

    #[must_use]
    pub fn embedding_for(&self, frame_id: FrameId) -> Option<&[f32]> {
        self.documents
            .iter()
            .find(|doc| doc.frame_id == frame_id)
            .map(|doc| doc.embedding.as_slice())
    }

    pub fn remove(&mut self, frame_id: FrameId) {
        self.documents.retain(|doc| doc.frame_id != frame_id);
    }

    #[must_use]
    pub fn len(&self) -> usize {
        self.documents.len()
    }

    #[must_use]
    pub fn is_empty(&self) -> bool {
        self.documents.is_empty()
    }

    /// Serializes the index; see [`ClipIndexBuilder::finish`] for the format.
    pub fn encode(&self) -> Result<ClipIndexArtifact> {
        let bytes = bincode::serde::encode_to_vec(&self.documents, clip_config())?;

        let checksum = *hash(&bytes).as_bytes();
        let dimension = self
            .documents
            .first()
            .map_or(0, |doc| u32::try_from(doc.embedding.len()).unwrap_or(0));

        Ok(ClipIndexArtifact {
            bytes,
            vector_count: self.documents.len() as u64,
            dimension,
            checksum,
        })
    }
}

/// A single search result.
#[derive(Debug, Clone, PartialEq)]
pub struct ClipSearchHit {
    /// Frame the matching embedding belongs to.
    pub frame_id: FrameId,
    /// Source page, if known.
    pub page: Option<u32>,
    /// L2 distance from the query (smaller is closer).
    pub distance: f32,
}
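
/// Euclidean (L2) distance between two embeddings; slices of unequal length
/// are compared over their common prefix because `zip` stops at the shorter.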
fn l2_distance(a: &[f32], b: &[f32]) -> f32 {
    a.iter()
        .zip(b.iter())
        .map(|(x, y)| (x - y).powi(2))
        .sum::<f32>()
        .sqrt()
}

/// Basic image statistics used to filter out junk before embedding.
#[derive(Debug, Clone)]
pub struct ImageInfo {
    pub width: u32,
    pub height: u32,
    pub color_variance: f32,
}

impl ImageInfo {
    /// Heuristic filter: reject images that are too small, extremely
    /// elongated, or nearly uniform in color.
    #[must_use]
    pub fn should_embed(&self) -> bool {
        if self.width < MIN_IMAGE_DIM || self.height < MIN_IMAGE_DIM {
            return false;
        }

        let aspect = self.width as f32 / self.height as f32;
        if !((1.0 / MAX_ASPECT_RATIO)..=MAX_ASPECT_RATIO).contains(&aspect) {
            return false;
        }

        if self.color_variance < MIN_COLOR_VARIANCE {
            return false;
        }

        true
    }
}
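
/// Drops images whose [`ImageInfo::should_embed`] heuristics fail.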
pub fn filter_junk_images<T, F>(images: Vec<T>, get_info: F) -> Vec<T>
where
    F: Fn(&T) -> ImageInfo,
{
    images
        .into_iter()
        .filter(|img| get_info(img).should_embed())
        .collect()
}
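
/// Runtime CLIP configuration, resolved from the `MEMVID_MODELS_DIR`,
/// `MEMVID_CLIP_MODEL`, and `MEMVID_OFFLINE` environment variables.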
#[derive(Debug, Clone)]
pub struct ClipConfig {
    /// Registry name of the model to use.
    pub model_name: String,
    /// Directory where model and tokenizer files are stored.
    pub models_dir: PathBuf,
    /// When set, never attempt a download; fail if files are missing.
    pub offline: bool,
}

impl Default for ClipConfig {
    fn default() -> Self {
        let models_dir = std::env::var("MEMVID_MODELS_DIR")
            .ok()
            .map(PathBuf::from)
            .or_else(|| dirs_next::home_dir().map(|d| d.join(".memvid/models")))
            .unwrap_or_else(|| PathBuf::from(".memvid/models"));

        let model_name =
            std::env::var("MEMVID_CLIP_MODEL").unwrap_or_else(|_| "mobileclip-s2".to_string());

        let offline = std::env::var("MEMVID_OFFLINE").is_ok();

        Self {
            model_name,
            models_dir,
            offline,
        }
    }
}
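
/// Errors specific to CLIP model management and inference; converted into
/// [`MemvidError::EmbeddingFailed`] at the crate boundary.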
#[derive(Debug, thiserror::Error)]
pub enum ClipError {
    #[error("CLIP model '{model}' not found. {hint}")]
    ModelNotFound { model: String, hint: String },

    #[error("Failed to decode image at {path:?}: {cause}")]
    ImageDecodeError { path: PathBuf, cause: String },

    #[error("Failed to decode image bytes: {cause}")]
    ImageBytesDecodeError { cause: String },

    #[error("CLIP inference error: {cause}")]
    InferenceError { cause: String },

    #[error("Failed to download CLIP model: {cause}")]
    DownloadError { cause: String },

    #[error("CLIP model file is corrupted: {cause}")]
    ModelCorrupted { cause: String },
}

impl From<ClipError> for MemvidError {
    fn from(err: ClipError) -> Self {
        MemvidError::EmbeddingFailed {
            reason: err.to_string().into_boxed_str(),
        }
    }
}
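
/// ONNX Runtime-backed CLIP model with lazily created vision/text sessions
/// and tokenizer, plus idle-based unloading.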
#[cfg(feature = "clip")]
mod model {
    use super::*;
    use image::{DynamicImage, GenericImageView, imageops::FilterType};
    use ndarray::{Array, Array4};
    use ort::session::{Session, builder::GraphOptimizationLevel};
    use ort::value::Tensor;
    use std::sync::Mutex;
    use std::time::Instant;
    use tokenizers::{
        PaddingDirection, PaddingParams, PaddingStrategy, Tokenizer, TruncationDirection,
        TruncationParams, TruncationStrategy,
    };

    pub struct ClipModel {
        config: ClipConfig,
        model_info: &'static ClipModelInfo,
        /// Lazily created ONNX session for the vision encoder.
        vision_session: Mutex<Option<Session>>,
        /// Lazily created ONNX session for the text encoder.
        text_session: Mutex<Option<Session>>,
        /// Lazily loaded tokenizer for the text encoder.
        tokenizer: Mutex<Option<Tokenizer>>,
        /// Timestamp of the most recent inference, for idle unloading.
        last_used: Mutex<Instant>,
    }

    impl ClipModel {
        /// Creates a model handle without loading anything from disk.
        pub fn new(config: ClipConfig) -> Result<Self> {
            let model_info = get_model_info(&config.model_name);

            Ok(Self {
                config,
                model_info,
                vision_session: Mutex::new(None),
                text_session: Mutex::new(None),
                tokenizer: Mutex::new(None),
                last_used: Mutex::new(Instant::now()),
            })
        }

        pub fn default_model() -> Result<Self> {
            Self::new(ClipConfig::default())
        }

        pub fn model_info(&self) -> &'static ClipModelInfo {
            self.model_info
        }

        pub fn dims(&self) -> u32 {
            self.model_info.dims
        }

        fn ensure_model_file(&self, kind: &str) -> Result<PathBuf> {
            let filename = format!("{}_{}.onnx", self.model_info.name, kind);
            let path = self.config.models_dir.join(&filename);

            if path.exists() {
                return Ok(path);
            }

            if self.config.offline {
                return Err(ClipError::ModelNotFound {
                    model: self.model_info.name.to_string(),
                    hint: format!(
                        "Run: memvid model download {} (or disable MEMVID_OFFLINE)",
                        self.model_info.name
                    ),
                }
                .into());
            }

            // Create the directory up front so the suggested curl command
            // below works as-is.
            std::fs::create_dir_all(&self.config.models_dir).map_err(|e| {
                ClipError::DownloadError {
                    cause: format!("Failed to create models directory: {}", e),
                }
            })?;

            Err(ClipError::DownloadError {
                cause: format!(
                    "Automatic download not yet implemented. Please download manually:\n\
                     curl -L '{}' -o '{}'",
                    if kind == "vision" {
                        self.model_info.vision_url
                    } else {
                        self.model_info.text_url
                    },
                    path.display()
                ),
            }
            .into())
        }

        fn ensure_tokenizer_file(&self) -> Result<PathBuf> {
            let filename = format!("{}_tokenizer.json", self.model_info.name);
            let path = self.config.models_dir.join(&filename);

            if path.exists() {
                return Ok(path);
            }

            if self.config.offline {
                return Err(ClipError::ModelNotFound {
                    model: self.model_info.name.to_string(),
                    hint: format!(
                        "Tokenizer missing at {}. Copy tokenizer.json from {}",
                        path.display(),
                        self.model_info.tokenizer_url
                    ),
                }
                .into());
            }

            std::fs::create_dir_all(&self.config.models_dir).map_err(|e| {
                ClipError::DownloadError {
                    cause: format!("Failed to create models directory: {}", e),
                }
            })?;

            Err(ClipError::DownloadError {
                cause: format!(
                    "Automatic download not yet implemented. Please download manually:\n\
                     curl -L '{}' -o '{}'",
                    self.model_info.tokenizer_url,
                    path.display()
                ),
            }
            .into())
        }

        fn load_vision_session(&self) -> Result<()> {
            let mut session_guard = self
                .vision_session
                .lock()
                .map_err(|_| MemvidError::Lock("Failed to lock vision session".into()))?;

            if session_guard.is_some() {
                return Ok(());
            }

            let vision_path = self.ensure_model_file("vision")?;

            tracing::debug!(path = %vision_path.display(), "Loading CLIP vision model");

            // Hide noisy native-library output during session construction.
            let _stderr_guard = stderr_suppress::StderrSuppressor::new().ok();

            let session = Session::builder()
                .map_err(|e| ClipError::InferenceError {
                    cause: e.to_string(),
                })?
                .with_optimization_level(GraphOptimizationLevel::Level3)
                .map_err(|e| ClipError::InferenceError {
                    cause: e.to_string(),
                })?
                .with_intra_threads(4)
                .map_err(|e| ClipError::InferenceError {
                    cause: e.to_string(),
                })?
                .commit_from_file(&vision_path)
                .map_err(|e| ClipError::InferenceError {
                    cause: format!("Failed to load vision model: {}", e),
                })?;

            *session_guard = Some(session);
            tracing::info!(model = %self.model_info.name, "CLIP vision model loaded");

            Ok(())
        }

        fn load_text_session(&self) -> Result<()> {
            let mut session_guard = self
                .text_session
                .lock()
                .map_err(|_| MemvidError::Lock("Failed to lock text session".into()))?;

            if session_guard.is_some() {
                return Ok(());
            }

            let text_path = self.ensure_model_file("text")?;

            tracing::debug!(path = %text_path.display(), "Loading CLIP text model");

            let _stderr_guard = stderr_suppress::StderrSuppressor::new().ok();

            let session = Session::builder()
                .map_err(|e| ClipError::InferenceError {
                    cause: e.to_string(),
                })?
                .with_optimization_level(GraphOptimizationLevel::Level3)
                .map_err(|e| ClipError::InferenceError {
                    cause: e.to_string(),
                })?
                .with_intra_threads(4)
                .map_err(|e| ClipError::InferenceError {
                    cause: e.to_string(),
                })?
                .commit_from_file(&text_path)
                .map_err(|e| ClipError::InferenceError {
                    cause: format!("Failed to load text model: {}", e),
                })?;

            *session_guard = Some(session);
            tracing::info!(model = %self.model_info.name, "CLIP text model loaded");

            Ok(())
        }

        fn load_tokenizer(&self) -> Result<()> {
            let mut tokenizer_guard = self
                .tokenizer
                .lock()
                .map_err(|_| MemvidError::Lock("Failed to lock CLIP tokenizer".into()))?;

            if tokenizer_guard.is_some() {
                return Ok(());
            }

            let tokenizer_path = self.ensure_tokenizer_file()?;

            tracing::debug!(path = %tokenizer_path.display(), "Loading CLIP tokenizer");

            let mut tokenizer =
                Tokenizer::from_file(&tokenizer_path).map_err(|e| ClipError::InferenceError {
                    cause: format!("Failed to load tokenizer: {}", e),
                })?;

            // Pad and truncate every prompt to a fixed 77-token sequence.
            tokenizer.with_padding(Some(PaddingParams {
                strategy: PaddingStrategy::Fixed(77),
                direction: PaddingDirection::Right,
                pad_to_multiple_of: None,
                pad_id: 0,
                pad_type_id: 0,
                pad_token: "[PAD]".to_string(),
            }));

            tokenizer
                .with_truncation(Some(TruncationParams {
                    max_length: 77,
                    strategy: TruncationStrategy::LongestFirst,
                    stride: 0,
                    direction: TruncationDirection::Right,
                }))
                .map_err(|e| ClipError::InferenceError {
                    cause: format!("Failed to apply truncation config: {}", e),
                })?;

            *tokenizer_guard = Some(tokenizer);
            tracing::info!(model = %self.model_info.name, "CLIP tokenizer loaded");

            Ok(())
        }

        fn preprocess_image(&self, image: &DynamicImage) -> Array4<f32> {
            let size = self.model_info.input_resolution;
            let (w, h) = image.dimensions();

            // Resize so the shorter side matches the model input, then
            // center-crop to a square.
            let scale = size as f32 / w.min(h) as f32;
            let new_w = ((w as f32) * scale).round().max(1.0) as u32;
            let new_h = ((h as f32) * scale).round().max(1.0) as u32;
            let resized = image.resize_exact(new_w, new_h, FilterType::Triangle);

            let start_x = (resized.width().saturating_sub(size)) / 2;
            let start_y = (resized.height().saturating_sub(size)) / 2;

            // NCHW layout with channel values scaled to [0, 1].
            let mut array = Array4::<f32>::zeros((1, 3, size as usize, size as usize));

            for y in 0..size as usize {
                for x in 0..size as usize {
                    let pixel = resized.get_pixel(start_x + x as u32, start_y + y as u32);
                    array[[0, 0, y, x]] = pixel[0] as f32 / 255.0;
                    array[[0, 1, y, x]] = pixel[1] as f32 / 255.0;
                    array[[0, 2, y, x]] = pixel[2] as f32 / 255.0;
                }
            }

            array
        }

        pub fn encode_image(&self, image: &DynamicImage) -> Result<Vec<f32>> {
            self.load_vision_session()?;

            let pixel_values = self.preprocess_image(image);

            if let Ok(mut last) = self.last_used.lock() {
                *last = Instant::now();
            }

            let mut session_guard = self
                .vision_session
                .lock()
                .map_err(|_| MemvidError::Lock("Failed to lock vision session".into()))?;

            let session = session_guard
                .as_mut()
                .ok_or_else(|| ClipError::InferenceError {
                    cause: "Vision session not loaded".to_string(),
                })?;

            // Resolve I/O names from the model itself, falling back to the
            // names used by the Xenova ONNX exports.
            let input_name = session
                .inputs
                .first()
                .map(|i| i.name.clone())
                .unwrap_or_else(|| "pixel_values".into());
            let output_name = session
                .outputs
                .first()
                .map(|o| o.name.clone())
                .unwrap_or_else(|| "image_embeds".into());

            let input_tensor =
                Tensor::from_array(pixel_values).map_err(|e| ClipError::InferenceError {
                    cause: format!("Failed to create input tensor: {}", e),
                })?;

            let outputs = session
                .run(ort::inputs![input_name => input_tensor])
                .map_err(|e| ClipError::InferenceError {
                    cause: format!("Vision inference failed: {}", e),
                })?;

            let output = outputs
                .get(&output_name)
                .ok_or_else(|| ClipError::InferenceError {
                    cause: format!("No output '{}' from vision model", output_name),
                })?;

            let (_shape, data) =
                output
                    .try_extract_tensor::<f32>()
                    .map_err(|e| ClipError::InferenceError {
                        cause: format!("Failed to extract embeddings: {}", e),
                    })?;

            let embedding: Vec<f32> = data.to_vec();
            if embedding.iter().any(|v| !v.is_finite()) {
                return Err(ClipError::InferenceError {
                    cause: "Vision embedding contains non-finite values".to_string(),
                }
                .into());
            }
            let normalized = l2_normalize(&embedding);

            tracing::debug!(dims = normalized.len(), "Generated CLIP image embedding");

            Ok(normalized)
        }

        pub fn encode_image_bytes(&self, bytes: &[u8]) -> Result<Vec<f32>> {
            let image =
                image::load_from_memory(bytes).map_err(|e| ClipError::ImageBytesDecodeError {
                    cause: e.to_string(),
                })?;
            self.encode_image(&image)
        }

        pub fn encode_image_file(&self, path: &Path) -> Result<Vec<f32>> {
            let image = image::open(path).map_err(|e| ClipError::ImageDecodeError {
                path: path.to_path_buf(),
                cause: e.to_string(),
            })?;
            self.encode_image(&image)
        }

        pub fn encode_text(&self, text: &str) -> Result<Vec<f32>> {
            self.load_text_session()?;
            self.load_tokenizer()?;

            let encoding = {
                let tokenizer_guard = self
                    .tokenizer
                    .lock()
                    .map_err(|_| MemvidError::Lock("Failed to lock CLIP tokenizer".into()))?;
                let tokenizer =
                    tokenizer_guard
                        .as_ref()
                        .ok_or_else(|| ClipError::InferenceError {
                            cause: "Tokenizer not loaded".to_string(),
                        })?;

                tokenizer
                    .encode(text, true)
                    .map_err(|e| ClipError::InferenceError {
                        cause: format!("Text tokenization failed: {}", e),
                    })?
            };

            let input_ids: Vec<i64> = encoding.get_ids().iter().map(|id| *id as i64).collect();
            let attention_mask: Vec<i64> = encoding
                .get_attention_mask()
                .iter()
                .map(|id| *id as i64)
                .collect();
            let max_length = input_ids.len();

            let input_ids_array =
                Array::from_shape_vec((1, max_length), input_ids).map_err(|e| {
                    ClipError::InferenceError {
                        cause: e.to_string(),
                    }
                })?;
            let attention_mask_array = Array::from_shape_vec((1, max_length), attention_mask)
                .map_err(|e| ClipError::InferenceError {
                    cause: e.to_string(),
                })?;

            if let Ok(mut last) = self.last_used.lock() {
                *last = Instant::now();
            }

            let mut session_guard = self
                .text_session
                .lock()
                .map_err(|_| MemvidError::Lock("Failed to lock text session".into()))?;

            let session = session_guard
                .as_mut()
                .ok_or_else(|| ClipError::InferenceError {
                    cause: "Text session not loaded".to_string(),
                })?;

            let input_names: Vec<String> = session.inputs.iter().map(|i| i.name.clone()).collect();
            let output_name = session
                .outputs
                .first()
                .map(|o| o.name.clone())
                .unwrap_or_else(|| "text_embeds".into());

            let input_ids_tensor =
                Tensor::from_array(input_ids_array).map_err(|e| ClipError::InferenceError {
                    cause: format!("Failed to create input_ids tensor: {}", e),
                })?;
            let attention_mask_tensor = Tensor::from_array(attention_mask_array).map_err(|e| {
                ClipError::InferenceError {
                    cause: format!("Failed to create attention_mask tensor: {}", e),
                }
            })?;

            // Some text models take only input_ids; pass the attention mask
            // only when the model declares a second input.
            let outputs = if input_names.len() >= 2 {
                session
                    .run(ort::inputs![
                        input_names[0].clone() => input_ids_tensor,
                        input_names[1].clone() => attention_mask_tensor
                    ])
                    .map_err(|e| ClipError::InferenceError {
                        cause: format!("Text inference failed: {}", e),
                    })?
            } else {
                let name = input_names
                    .first()
                    .cloned()
                    .unwrap_or_else(|| "input_ids".to_string());
                session
                    .run(ort::inputs![name => input_ids_tensor])
                    .map_err(|e| ClipError::InferenceError {
                        cause: format!("Text inference failed: {}", e),
                    })?
            };

            let output = outputs
                .get(&output_name)
                .ok_or_else(|| ClipError::InferenceError {
                    cause: format!("No output '{}' from text model", output_name),
                })?;

            let (_shape, data) =
                output
                    .try_extract_tensor::<f32>()
                    .map_err(|e| ClipError::InferenceError {
                        cause: format!("Failed to extract text embeddings: {}", e),
                    })?;

            let embedding: Vec<f32> = data.to_vec();
            if embedding.iter().any(|v| !v.is_finite()) {
                return Err(ClipError::InferenceError {
                    cause: "Text embedding contains non-finite values".to_string(),
                }
                .into());
            }
            let normalized = l2_normalize(&embedding);

            tracing::debug!(
                text_len = text.len(),
                dims = normalized.len(),
                "Generated CLIP text embedding"
            );

            Ok(normalized)
        }

        /// Unloads sessions and tokenizer if the model has been idle longer
        /// than [`MODEL_UNLOAD_TIMEOUT`].
        pub fn maybe_unload(&self) -> Result<()> {
            let last_used = self
                .last_used
                .lock()
                .map_err(|_| MemvidError::Lock("Failed to check last_used".into()))?;

            if last_used.elapsed() > MODEL_UNLOAD_TIMEOUT {
                tracing::debug!(model = %self.model_info.name, "Model idle, unloading sessions");

                if let Ok(mut guard) = self.vision_session.lock() {
                    *guard = None;
                }

                if let Ok(mut guard) = self.text_session.lock() {
                    *guard = None;
                }

                if let Ok(mut guard) = self.tokenizer.lock() {
                    *guard = None;
                }
            }

            Ok(())
        }

        /// Unconditionally drops all loaded sessions and the tokenizer.
        pub fn unload(&self) -> Result<()> {
            if let Ok(mut guard) = self.vision_session.lock() {
                *guard = None;
            }
            if let Ok(mut guard) = self.text_session.lock() {
                *guard = None;
            }
            if let Ok(mut guard) = self.tokenizer.lock() {
                *guard = None;
            }
            tracing::debug!(model = %self.model_info.name, "CLIP sessions unloaded");
            Ok(())
        }

        pub fn is_vision_loaded(&self) -> bool {
            self.vision_session
                .lock()
                .map(|g| g.is_some())
                .unwrap_or(false)
        }

        pub fn is_text_loaded(&self) -> bool {
            self.text_session
                .lock()
                .map(|g| g.is_some())
                .unwrap_or(false)
        }
    }

    /// Scales a vector to unit L2 norm; degenerate inputs (zero or
    /// non-finite norm) collapse to the zero vector.
    fn l2_normalize(v: &[f32]) -> Vec<f32> {
        let norm: f32 = v.iter().map(|x| x * x).sum::<f32>().sqrt();
        if norm.is_finite() && norm > 1e-10 {
            v.iter().map(|x| x / norm).collect()
        } else {
            vec![0.0; v.len()]
        }
    }

    /// Mean per-channel variance, normalized to [0, 1] by the maximum
    /// possible variance (255^2).
    pub fn calculate_color_variance(image: &DynamicImage) -> f32 {
        let rgb = image.to_rgb8();
        let (width, height) = rgb.dimensions();
        let total_pixels = (width * height) as f32;

        if total_pixels == 0.0 {
            return 0.0;
        }

        let mut sum_r = 0.0f32;
        let mut sum_g = 0.0f32;
        let mut sum_b = 0.0f32;

        for pixel in rgb.pixels() {
            sum_r += pixel[0] as f32;
            sum_g += pixel[1] as f32;
            sum_b += pixel[2] as f32;
        }

        let mean_r = sum_r / total_pixels;
        let mean_g = sum_g / total_pixels;
        let mean_b = sum_b / total_pixels;

        let mut var_r = 0.0f32;
        let mut var_g = 0.0f32;
        let mut var_b = 0.0f32;

        for pixel in rgb.pixels() {
            var_r += (pixel[0] as f32 - mean_r).powi(2);
            var_g += (pixel[1] as f32 - mean_g).powi(2);
            var_b += (pixel[2] as f32 - mean_b).powi(2);
        }

        ((var_r + var_g + var_b) / (3.0 * total_pixels)) / (255.0 * 255.0)
    }

    /// Gathers the dimensions and color variance used by junk filtering.
    pub fn get_image_info(image: &DynamicImage) -> ImageInfo {
        let (width, height) = image.dimensions();
        let color_variance = calculate_color_variance(image);

        ImageInfo {
            width,
            height,
            color_variance,
        }
    }
}

#[cfg(feature = "clip")]
pub use model::*;

#[cfg(all(feature = "clip", feature = "pdfium"))]
use pdfium_render::prelude::{PdfPageRenderRotation, PdfRenderConfig, Pdfium};
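
/// Rasterizes up to `max_pages` pages of a PDF for CLIP embedding, bounding
/// each rendered page to `target_px` on a side via the system pdfium library.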
#[cfg(all(feature = "clip", feature = "pdfium"))]
pub fn render_pdf_pages_for_clip(
    path: &Path,
    max_pages: usize,
    target_px: u32,
) -> Result<Vec<(u32, DynamicImage)>> {
    let bindings = Pdfium::bind_to_system_library().map_err(|e| ClipError::InferenceError {
        cause: format!("Failed to bind pdfium: {}", e),
    })?;
    let pdfium = Pdfium::new(bindings);
    let document =
        pdfium
            .load_pdf_from_file(path, None)
            .map_err(|e| ClipError::InferenceError {
                cause: format!("Failed to load PDF for CLIP rendering: {}", e),
            })?;

    let render_config = PdfRenderConfig::new()
        .set_target_width(target_px as i32)
        .set_maximum_height(target_px as i32)
        .set_maximum_width(target_px as i32)
        .rotate_if_landscape(PdfPageRenderRotation::None, false);

    let mut pages = Vec::new();
    for (index, page) in document.pages().iter().enumerate() {
        if index >= max_pages {
            break;
        }
        let rendered = page
            .render_with_config(&render_config)
            .map_err(|e| ClipError::InferenceError {
                cause: format!("Failed to render PDF page {}: {}", index + 1, e),
            })?
            .as_image();
        pages.push(((index + 1) as u32, rendered));
    }

    Ok(pages)
}
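
/// Fallback without pdfium: rather than rasterizing pages, extracts embedded
/// raster images (JPEG/JPX streams, or raw RGB/grayscale samples) from each
/// page's XObject resources via `lopdf`. `max_pages` bounds the number of
/// images collected, and `_target_px` is unused since nothing is rendered.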
#[cfg(all(feature = "clip", not(feature = "pdfium")))]
pub fn render_pdf_pages_for_clip(
    path: &Path,
    max_pages: usize,
    _target_px: u32,
) -> Result<Vec<(u32, DynamicImage)>> {
    fn extract_images_from_page(
        doc: &Document,
        page_id: ObjectId,
        remaining: &mut usize,
        out: &mut Vec<(u32, DynamicImage)>,
    ) -> Result<()> {
        if *remaining == 0 {
            return Ok(());
        }

        let (resources_opt, resource_ids) =
            doc.get_page_resources(page_id)
                .map_err(|e| ClipError::InferenceError {
                    cause: format!("Failed to read PDF resources: {}", e),
                })?;

        // Collect the page's resource dictionaries, deduplicating by id.
        let mut seen = HashSet::new();
        let mut resource_dicts: Vec<Dictionary> = Vec::new();

        if let Some(dict) = resources_opt {
            resource_dicts.push(dict.clone());
        }
        for res_id in resource_ids {
            if seen.insert(res_id) {
                if let Ok(dict) = doc.get_dictionary(res_id) {
                    resource_dicts.push(dict.clone());
                }
            }
        }

        for dict in resource_dicts {
            if let Ok(xobjects) = dict.get(b"XObject") {
                let xobj_dict = match xobjects {
                    Object::Dictionary(d) => Some(d),
                    Object::Reference(id) => doc.get_dictionary(*id).ok(),
                    _ => None,
                };
                if let Some(xobj_dict) = xobj_dict {
                    for (_, obj) in xobj_dict.iter() {
                        let id = match obj {
                            Object::Reference(id) => *id,
                            _ => continue,
                        };
                        let stream = match doc.get_object(id).and_then(Object::as_stream) {
                            Ok(s) => s,
                            Err(_) => continue,
                        };
                        let subtype = stream.dict.get(b"Subtype").ok();
                        let is_image = matches!(subtype, Some(Object::Name(n)) if n == b"Image");
                        if !is_image {
                            continue;
                        }

                        let width = stream
                            .dict
                            .get(b"Width")
                            .ok()
                            .and_then(|o| o.as_i64().ok())
                            .unwrap_or(0);
                        let height = stream
                            .dict
                            .get(b"Height")
                            .ok()
                            .and_then(|o| o.as_i64().ok())
                            .unwrap_or(0);
                        if width <= 0 || height <= 0 {
                            continue;
                        }

                        let filters = stream
                            .dict
                            .get(b"Filter")
                            .ok()
                            .and_then(|f| match f {
                                Object::Name(n) => Some(vec![n.clone()]),
                                Object::Array(arr) => Some(
                                    arr.iter()
                                        .filter_map(|o| o.as_name().ok().map(|n| n.to_vec()))
                                        .collect(),
                                ),
                                _ => None,
                            })
                            .unwrap_or_default();

                        let data = stream
                            .decompressed_content()
                            .unwrap_or_else(|_| stream.content.clone());

                        // JPEG/JPEG2000 streams decode directly. The page
                        // number placeholder 1 is fixed up by the caller;
                        // `remaining` is the image budget and counts down.
                        if filters
                            .iter()
                            .any(|f| f == b"DCTDecode" || f == b"JPXDecode")
                        {
                            if let Ok(img) = image::load_from_memory(&data) {
                                out.push((1, img));
                                *remaining -= 1;
                                if *remaining == 0 {
                                    return Ok(());
                                }
                                continue;
                            }
                        }

                        let color_space = stream
                            .dict
                            .get(b"ColorSpace")
                            .ok()
                            .and_then(|o| o.as_name().ok())
                            .unwrap_or(b"DeviceRGB");
                        let channels = if color_space == b"DeviceGray" { 1 } else { 3 };

                        // Otherwise treat the stream as raw 8-bit samples.
                        let expected = width as usize * height as usize * channels;
                        if data.len() >= expected && channels == 3 {
                            if let Some(buf) = ImageBuffer::<Rgb<u8>, _>::from_raw(
                                width as u32,
                                height as u32,
                                data.clone(),
                            ) {
                                out.push((1, DynamicImage::ImageRgb8(buf)));
                                *remaining -= 1;
                                if *remaining == 0 {
                                    return Ok(());
                                }
                                continue;
                            }
                        } else if data.len() >= expected && channels == 1 {
                            if let Some(buf) = ImageBuffer::<Luma<u8>, _>::from_raw(
                                width as u32,
                                height as u32,
                                data.clone(),
                            ) {
                                out.push((1, DynamicImage::ImageLuma8(buf)));
                                *remaining -= 1;
                                if *remaining == 0 {
                                    return Ok(());
                                }
                                continue;
                            }
                        }
                    }
                }
            }
        }

        Ok(())
    }

    let doc = Document::load(path).map_err(|e| ClipError::InferenceError {
        cause: format!("Failed to load PDF for image extraction: {}", e),
    })?;

    let mut remaining = max_pages;
    let mut pages: Vec<(u32, DynamicImage)> = Vec::new();

    for (page_num, page_id) in doc.get_pages() {
        if remaining == 0 {
            break;
        }
        let start_len = pages.len();
        extract_images_from_page(&doc, page_id, &mut remaining, &mut pages)?;
        // Replace the placeholder page numbers on newly extracted images.
        if pages.len() > start_len {
            for entry in pages.iter_mut().skip(start_len) {
                entry.0 = page_num as u32;
            }
        }
    }

    Ok(pages)
}
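
/// Object-safe abstraction over CLIP-style embedding backends, so callers
/// can swap the ONNX model for a remote service or a test stub.
///
/// A stub provider sketch (names here are illustrative, not part of the
/// crate):
///
/// ```ignore
/// struct FakeClip;
///
/// impl ClipEmbeddingProvider for FakeClip {
///     fn kind(&self) -> &str { "fake" }
///     fn model(&self) -> &str { "fake-0" }
///     fn dimension(&self) -> usize { 4 }
///     fn embed_image_file(&self, _: &Path) -> Result<Vec<f32>> { Ok(vec![0.0; 4]) }
///     fn embed_image_bytes(&self, _: &[u8]) -> Result<Vec<f32>> { Ok(vec![0.0; 4]) }
///     fn embed_query(&self, _: &str) -> Result<Vec<f32>> { Ok(vec![0.0; 4]) }
/// }
/// ```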
pub trait ClipEmbeddingProvider: Send + Sync {
    /// Short backend identifier, e.g. "clip".
    fn kind(&self) -> &str;

    /// Name of the underlying model.
    fn model(&self) -> &str;

    /// Output embedding dimension.
    fn dimension(&self) -> usize;

    fn embed_image_file(&self, path: &Path) -> Result<Vec<f32>>;

    fn embed_image_bytes(&self, bytes: &[u8]) -> Result<Vec<f32>>;

    fn embed_query(&self, text: &str) -> Result<Vec<f32>>;

    /// Default batch implementation: embeds files one at a time.
    fn embed_image_batch(&self, paths: &[&Path]) -> Result<Vec<Vec<f32>>> {
        let mut embeddings = Vec::with_capacity(paths.len());
        for path in paths {
            embeddings.push(self.embed_image_file(path)?);
        }
        Ok(embeddings)
    }

    fn is_ready(&self) -> bool {
        true
    }

    fn init(&mut self) -> Result<()> {
        Ok(())
    }

    fn unload(&self) -> Result<()> {
        Ok(())
    }
}

pub type ClipEmbeddingResult = Result<Vec<f32>>;
pub type ClipBatchEmbeddingResult = Result<Vec<Vec<f32>>>;

#[cfg(feature = "clip")]
impl ClipEmbeddingProvider for ClipModel {
    fn kind(&self) -> &str {
        "clip"
    }

    fn model(&self) -> &str {
        self.model_info().name
    }

    fn dimension(&self) -> usize {
        self.model_info().dims as usize
    }

    fn embed_image_file(&self, path: &Path) -> Result<Vec<f32>> {
        self.encode_image_file(path)
    }

    fn embed_image_bytes(&self, bytes: &[u8]) -> Result<Vec<f32>> {
        self.encode_image_bytes(bytes)
    }

    fn embed_query(&self, text: &str) -> Result<Vec<f32>> {
        self.encode_text(text)
    }

    fn embed_image_batch(&self, paths: &[&Path]) -> Result<Vec<Vec<f32>>> {
        let mut embeddings = Vec::with_capacity(paths.len());
        for path in paths {
            embeddings.push(self.encode_image_file(path)?);
        }
        Ok(embeddings)
    }

    fn is_ready(&self) -> bool {
        true
    }

    fn unload(&self) -> Result<()> {
        ClipModel::unload(self)
    }
}
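
/// Locates and validates a serialized CLIP index: byte range, vector count,
/// dimension, checksum, and the model that produced the embeddings.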
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ClipIndexManifest {
    /// Byte offset of the encoded index.
    pub bytes_offset: u64,
    /// Length of the encoded index in bytes.
    pub bytes_length: u64,
    /// Number of embeddings stored.
    pub vector_count: u64,
    /// Embedding dimension.
    pub dimension: u32,
    /// blake3 hash of the encoded bytes.
    pub checksum: [u8; 32],
    /// Name of the model that produced the embeddings.
    pub model_name: String,
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn clip_index_builder_roundtrip() {
        let mut builder = ClipIndexBuilder::new();
        builder.add_document(1, None, vec![0.1, 0.2, 0.3, 0.4]);
        builder.add_document(2, None, vec![0.5, 0.6, 0.7, 0.8]);

        let artifact = builder.finish().expect("finish");
        assert_eq!(artifact.vector_count, 2);
        assert_eq!(artifact.dimension, 4);

        let index = ClipIndex::decode(&artifact.bytes).expect("decode");
        assert_eq!(index.len(), 2);

        let hits = index.search(&[0.1, 0.2, 0.3, 0.4], 10);
        assert_eq!(hits[0].frame_id, 1);
        assert!(hits[0].distance < 0.001);
    }

    #[test]
    fn clip_index_search() {
        let mut builder = ClipIndexBuilder::new();
        builder.add_document(1, None, vec![1.0, 0.0, 0.0]);
        builder.add_document(2, None, vec![0.0, 1.0, 0.0]);
        builder.add_document(3, None, vec![0.0, 0.0, 1.0]);

        let artifact = builder.finish().expect("finish");
        let index = ClipIndex::decode(&artifact.bytes).expect("decode");

        let hits = index.search(&[1.0, 0.0, 0.0], 3);
        assert_eq!(hits[0].frame_id, 1);

        let hits = index.search(&[0.0, 1.0, 0.0], 3);
        assert_eq!(hits[0].frame_id, 2);
    }

    #[test]
    fn l2_distance_calculation() {
        let d = l2_distance(&[0.0, 0.0], &[3.0, 4.0]);
        assert!((d - 5.0).abs() < 1e-6);

        let d = l2_distance(&[1.0, 1.0, 1.0], &[1.0, 1.0, 1.0]);
        assert!(d.abs() < 1e-6);
    }

    #[test]
    fn image_info_filtering() {
        let tiny = ImageInfo {
            width: 32,
            height: 32,
            color_variance: 0.5,
        };
        assert!(!tiny.should_embed());

        let good = ImageInfo {
            width: 256,
            height: 256,
            color_variance: 0.5,
        };
        assert!(good.should_embed());

        let wide = ImageInfo {
            width: 1000,
            height: 10,
            color_variance: 0.5,
        };
        assert!(!wide.should_embed());

        let solid = ImageInfo {
            width: 256,
            height: 256,
            color_variance: 0.001,
        };
        assert!(!solid.should_embed());
    }

    #[test]
    fn model_registry() {
        let default = default_model_info();
        assert_eq!(default.name, "mobileclip-s2");
        assert_eq!(default.dims, 512);
        assert!(default.is_default);

        let siglip = get_model_info("siglip-base");
        assert_eq!(siglip.dims, 768);

        let unknown = get_model_info("nonexistent");
        assert_eq!(unknown.name, "mobileclip-s2");
    }

    #[test]
    fn clip_config_defaults() {
        unsafe {
            std::env::remove_var("MEMVID_CLIP_MODEL");
            std::env::remove_var("MEMVID_OFFLINE");
        }

        let config = ClipConfig::default();
        assert_eq!(config.model_name, "mobileclip-s2");
        assert!(!config.offline);
    }

    #[test]
    fn clip_embedding_provider_trait() {
        fn assert_send_sync<T: Send + Sync>() {}

        assert_send_sync::<Box<dyn super::ClipEmbeddingProvider>>();
    }
}