memvid_core/clip.rs

// Safe expect: Static CLIP model lookup with guaranteed default.
#![allow(clippy::unwrap_used, clippy::expect_used)]
//! CLIP (Contrastive Language-Image Pre-training) visual embeddings module.
//!
//! This module provides visual understanding capabilities using MobileCLIP-S2,
//! enabling semantic search across images and PDF pages with natural language
//! queries (see the example below).
//!
//! # Design Philosophy
//!
//! - **Synchronous with Parallelism**: CLIP runs in parallel with text embedding via rayon.
//!   Since CLIP (~25ms) is faster than text embedding (~200-500ms), it adds no extra
//!   end-to-end latency.
//! - **Separate Index**: CLIP embeddings (512 dims) are stored in a separate index from
//!   text embeddings (384/768/1536 dims) because dimensions must match within an index.
//! - **Auto-detection**: Images and PDFs with images are automatically processed without flags.
//! - **Graceful Degradation**: Everything works without CLIP; only visual search is lost.
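//!
//! # Example
//!
//! A minimal sketch of the index-and-search flow using the types defined in this
//! module. It assumes the `clip` feature is enabled, a model is installed locally,
//! and `frame_id` is a `FrameId` supplied by the caller; error handling is elided.
//!
//! ```ignore
//! use memvid_core::clip::{ClipConfig, ClipIndex, ClipModel};
//!
//! let model = ClipModel::new(ClipConfig::default())?;
//! let mut index = ClipIndex::new();
//!
//! // Embed an image and store it under its frame.
//! let embedding = model.encode_image_file(std::path::Path::new("photo.jpg"))?;
//! index.add_document(frame_id, None, embedding);
//!
//! // Embed a natural-language query and search the visual index.
//! let query = model.encode_text("a photo of a cat")?;
//! let hits = index.search(&query, 5);
//! ```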

use blake3::hash;
#[cfg(feature = "clip")]
use image::DynamicImage;
#[cfg(all(feature = "clip", not(feature = "pdfium")))]
use image::{ImageBuffer, Luma, Rgb};
#[cfg(all(feature = "clip", not(feature = "pdfium")))]
use lopdf::{Dictionary, Document, Object, ObjectId};
use serde::{Deserialize, Serialize};
use std::borrow::Cow;
#[cfg(all(feature = "clip", not(feature = "pdfium")))]
use std::collections::HashSet;
use std::path::{Path, PathBuf};
use std::time::Duration;

use crate::{MemvidError, Result, types::FrameId};

// ============================================================================
// Stderr Suppression for macOS
// ============================================================================
// ONNX Runtime on macOS emits "Context leak detected, msgtracer returned -1"
// warnings from Apple's tracing infrastructure. These are harmless but noisy.

#[cfg(all(feature = "clip", target_os = "macos"))]
mod stderr_suppress {
    use std::fs::File;
    use std::io;
    use std::os::unix::io::{AsRawFd, RawFd};

    pub struct StderrSuppressor {
        original_stderr: RawFd,
        #[allow(dead_code)]
        dev_null: File,
    }

    impl StderrSuppressor {
        pub fn new() -> io::Result<Self> {
            let dev_null = File::open("/dev/null")?;
            let original_stderr = unsafe { libc::dup(2) };
            if original_stderr == -1 {
                return Err(io::Error::last_os_error());
            }
            let result = unsafe { libc::dup2(dev_null.as_raw_fd(), 2) };
            if result == -1 {
                unsafe { libc::close(original_stderr) };
                return Err(io::Error::last_os_error());
            }
            Ok(Self {
                original_stderr,
                dev_null,
            })
        }
    }

    impl Drop for StderrSuppressor {
        fn drop(&mut self) {
            unsafe {
                libc::dup2(self.original_stderr, 2);
                libc::close(self.original_stderr);
            }
        }
    }
}

#[cfg(all(feature = "clip", not(target_os = "macos")))]
mod stderr_suppress {
    pub struct StderrSuppressor;
    impl StderrSuppressor {
        pub fn new() -> std::io::Result<Self> {
            Ok(Self)
        }
    }
}

// ============================================================================
// Configuration Constants
// ============================================================================

/// CLIP index decode limit (512MB max)
#[allow(clippy::cast_possible_truncation)]
const CLIP_DECODE_LIMIT: usize = crate::MAX_INDEX_BYTES as usize;

/// MobileCLIP-S2 embedding dimensions
pub const MOBILECLIP_DIMS: u32 = 512;

/// SigLIP-base embedding dimensions
pub const SIGLIP_DIMS: u32 = 768;

/// Default input resolution for MobileCLIP-S2
pub const MOBILECLIP_INPUT_SIZE: u32 = 256;

/// Default input resolution for `SigLIP`
pub const SIGLIP_INPUT_SIZE: u32 = 224;

/// Minimum image dimension to process (skip icons, bullets)
pub const MIN_IMAGE_DIM: u32 = 64;

/// Maximum allowed aspect ratio (longest side / shortest side) before an image is skipped (dividers, lines)
pub const MAX_ASPECT_RATIO: f32 = 10.0;

/// Minimum color variance threshold (skip solid backgrounds)
pub const MIN_COLOR_VARIANCE: f32 = 0.01;

/// Model unload timeout (5 minutes idle)
pub const MODEL_UNLOAD_TIMEOUT: Duration = Duration::from_secs(300);

// ============================================================================
// Bincode Configuration
// ============================================================================

fn clip_config() -> impl bincode::config::Config {
    bincode::config::standard()
        .with_fixed_int_encoding()
        .with_little_endian()
}

// ============================================================================
// Model Registry
// ============================================================================

/// Available CLIP models with verified `HuggingFace` URLs
#[derive(Debug, Clone)]
pub struct ClipModelInfo {
    /// Model identifier
    pub name: &'static str,
    /// URL for vision encoder ONNX model
    pub vision_url: &'static str,
    /// URL for text encoder ONNX model
    pub text_url: &'static str,
    /// URL for tokenizer JSON (BPE)
    pub tokenizer_url: &'static str,
    /// Vision model size in MB
    pub vision_size_mb: f32,
    /// Text model size in MB
    pub text_size_mb: f32,
    /// Output embedding dimensions
    pub dims: u32,
    /// Input image resolution
    pub input_resolution: u32,
    /// Whether this is the default model
    pub is_default: bool,
}

/// Available CLIP models registry
pub static CLIP_MODELS: &[ClipModelInfo] = &[
    // MobileCLIP-S2 int8 quantized (smallest, but requires INT8 ONNX support)
    // Note: INT8 quantized models don't work on all platforms (ConvInteger not supported)
    ClipModelInfo {
        name: "mobileclip-s2-int8",
        vision_url: "https://huggingface.co/Xenova/mobileclip_s2/resolve/main/onnx/vision_model_int8.onnx",
        text_url: "https://huggingface.co/Xenova/mobileclip_s2/resolve/main/onnx/text_model_int8.onnx",
        tokenizer_url: "https://huggingface.co/Xenova/mobileclip_s2/resolve/main/tokenizer.json",
        vision_size_mb: 36.7,
        text_size_mb: 64.1,
        dims: MOBILECLIP_DIMS,
        input_resolution: MOBILECLIP_INPUT_SIZE,
        is_default: false,
    },
    // Alternative: SigLIP-base quantized (higher quality, but may have INT8 issues)
    ClipModelInfo {
        name: "siglip-base",
        vision_url: "https://huggingface.co/Xenova/siglip-base-patch16-224/resolve/main/onnx/vision_model_quantized.onnx",
        text_url: "https://huggingface.co/Xenova/siglip-base-patch16-224/resolve/main/onnx/text_model_quantized.onnx",
        tokenizer_url: "https://huggingface.co/Xenova/siglip-base-patch16-224/resolve/main/tokenizer.json",
        vision_size_mb: 99.5,
        text_size_mb: 111.0,
        dims: SIGLIP_DIMS,
        input_resolution: SIGLIP_INPUT_SIZE,
        is_default: false,
    },
    // Default: MobileCLIP-S2 fp16 (works on all platforms, good balance of size/quality)
    ClipModelInfo {
        name: "mobileclip-s2",
        vision_url: "https://huggingface.co/Xenova/mobileclip_s2/resolve/main/onnx/vision_model_fp16.onnx",
        text_url: "https://huggingface.co/Xenova/mobileclip_s2/resolve/main/onnx/text_model_fp16.onnx",
        tokenizer_url: "https://huggingface.co/Xenova/mobileclip_s2/resolve/main/tokenizer.json",
        vision_size_mb: 71.7,
        text_size_mb: 127.0,
        dims: MOBILECLIP_DIMS,
        input_resolution: MOBILECLIP_INPUT_SIZE,
        is_default: true,
    },
];

/// Get model info by name, defaults to mobileclip-s2
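///
/// A small sketch of the fallback behavior, using names from the registry above:
///
/// ```ignore
/// // Unknown names fall back to the default entry.
/// assert_eq!(get_model_info("does-not-exist").name, "mobileclip-s2");
/// assert_eq!(get_model_info("siglip-base").dims, SIGLIP_DIMS);
/// ```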
#[must_use]
pub fn get_model_info(name: &str) -> &'static ClipModelInfo {
    CLIP_MODELS
        .iter()
        .find(|m| m.name == name)
        .unwrap_or_else(|| {
            CLIP_MODELS
                .iter()
                .find(|m| m.is_default)
                .expect("default model")
        })
}

/// Get the default model info
#[must_use]
pub fn default_model_info() -> &'static ClipModelInfo {
    CLIP_MODELS
        .iter()
        .find(|m| m.is_default)
        .expect("default model exists")
}

// ============================================================================
// CLIP Document and Index Types (mirrors vec.rs pattern)
// ============================================================================

/// A document with CLIP embedding stored in the index
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ClipDocument {
    /// Frame ID this embedding belongs to
    pub frame_id: FrameId,
    /// CLIP embedding vector (512 or 768 dims depending on model)
    pub embedding: Vec<f32>,
    /// Optional page number (for PDFs)
    #[serde(default)]
    pub page: Option<u32>,
}

/// Builder for constructing CLIP index artifacts
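///
/// A brief usage sketch (the `frame_id` and `embedding` values are assumed to come
/// from the caller; error handling is elided):
///
/// ```ignore
/// let mut builder = ClipIndexBuilder::new();
/// builder.add_document(frame_id, Some(1), embedding);
/// let artifact = builder.finish()?;
/// assert_eq!(artifact.vector_count, 1);
/// ```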
#[derive(Default)]
pub struct ClipIndexBuilder {
    documents: Vec<ClipDocument>,
}

impl ClipIndexBuilder {
    #[must_use]
    pub fn new() -> Self {
        Self::default()
    }

    /// Add a document with its CLIP embedding
    pub fn add_document<I>(&mut self, frame_id: FrameId, page: Option<u32>, embedding: I)
    where
        I: Into<Vec<f32>>,
    {
        self.documents.push(ClipDocument {
            frame_id,
            embedding: embedding.into(),
            page,
        });
    }

    /// Finish building and produce the index artifact
    pub fn finish(self) -> Result<ClipIndexArtifact> {
        let bytes = bincode::serde::encode_to_vec(&self.documents, clip_config())?;

        let checksum = *hash(&bytes).as_bytes();
        let dimension = self
            .documents
            .first()
            .map_or(0, |doc| u32::try_from(doc.embedding.len()).unwrap_or(0));

        Ok(ClipIndexArtifact {
            bytes,
            vector_count: self.documents.len() as u64,
            dimension,
            checksum,
        })
    }
}

/// Artifact produced by the CLIP index builder
#[derive(Debug, Clone)]
pub struct ClipIndexArtifact {
    /// Serialized index bytes
    pub bytes: Vec<u8>,
    /// Number of vectors in the index
    pub vector_count: u64,
    /// Embedding dimension (512 for `MobileCLIP`, 768 for `SigLIP`)
    pub dimension: u32,
    /// Blake3 checksum of the bytes
    pub checksum: [u8; 32],
}

/// In-memory CLIP index for similarity search
#[derive(Debug, Clone)]
pub struct ClipIndex {
    documents: Vec<ClipDocument>,
}

impl Default for ClipIndex {
    fn default() -> Self {
        Self::new()
    }
}

impl ClipIndex {
    /// Create a new empty CLIP index
    #[must_use]
    pub fn new() -> Self {
        Self {
            documents: Vec::new(),
        }
    }

    /// Add a document with its CLIP embedding
    pub fn add_document<I>(&mut self, frame_id: FrameId, page: Option<u32>, embedding: I)
    where
        I: Into<Vec<f32>>,
    {
        self.documents.push(ClipDocument {
            frame_id,
            embedding: embedding.into(),
            page,
        });
    }

    /// Decode CLIP index from bytes
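    ///
    /// A round-trip sketch with `encode` (error handling elided):
    ///
    /// ```ignore
    /// let artifact = index.encode()?;
    /// let restored = ClipIndex::decode(&artifact.bytes)?;
    /// assert_eq!(restored.len(), index.len());
    /// ```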
    pub fn decode(bytes: &[u8]) -> Result<Self> {
        let (documents, read) = bincode::serde::decode_from_slice::<Vec<ClipDocument>, _>(
            bytes,
            bincode::config::standard()
                .with_fixed_int_encoding()
                .with_little_endian()
                .with_limit::<CLIP_DECODE_LIMIT>(),
        )?;

        if read != bytes.len() {
            return Err(MemvidError::InvalidToc {
                reason: Cow::Owned(format!(
                    "CLIP index decode: expected {} bytes, read {}",
                    bytes.len(),
                    read
                )),
            });
        }

        tracing::debug!(
            bytes_len = bytes.len(),
            docs_count = documents.len(),
            "decoded CLIP index"
        );

        Ok(Self { documents })
    }

    /// Search for similar embeddings using L2 distance
    #[must_use]
    pub fn search(&self, query: &[f32], limit: usize) -> Vec<ClipSearchHit> {
        if query.is_empty() {
            return Vec::new();
        }

        let mut hits: Vec<ClipSearchHit> = self
            .documents
            .iter()
            .map(|doc| {
                let distance = l2_distance(query, &doc.embedding);
                ClipSearchHit {
                    frame_id: doc.frame_id,
                    page: doc.page,
                    distance,
                }
            })
            .collect();

        hits.sort_by(|a, b| {
            a.distance
                .partial_cmp(&b.distance)
                .unwrap_or(std::cmp::Ordering::Equal)
        });
        hits.truncate(limit);
        hits
    }

    /// Get all entries in the index
    pub fn entries(&self) -> impl Iterator<Item = (FrameId, Option<u32>, &[f32])> + '_ {
        self.documents
            .iter()
            .map(|doc| (doc.frame_id, doc.page, doc.embedding.as_slice()))
    }

    /// Get embedding for a specific frame
    #[must_use]
    pub fn embedding_for(&self, frame_id: FrameId) -> Option<&[f32]> {
        self.documents
            .iter()
            .find(|doc| doc.frame_id == frame_id)
            .map(|doc| doc.embedding.as_slice())
    }

    /// Remove a document from the index
    pub fn remove(&mut self, frame_id: FrameId) {
        self.documents.retain(|doc| doc.frame_id != frame_id);
    }

    /// Number of documents in the index
    #[must_use]
    pub fn len(&self) -> usize {
        self.documents.len()
    }

    /// Check if index is empty
    #[must_use]
    pub fn is_empty(&self) -> bool {
        self.documents.is_empty()
    }

    /// Encode the CLIP index to bytes and produce an artifact for persistence
    pub fn encode(&self) -> Result<ClipIndexArtifact> {
        let bytes = bincode::serde::encode_to_vec(&self.documents, clip_config())?;

        let checksum = *hash(&bytes).as_bytes();
        let dimension = self
            .documents
            .first()
            .map_or(0, |doc| u32::try_from(doc.embedding.len()).unwrap_or(0));

        Ok(ClipIndexArtifact {
            bytes,
            vector_count: self.documents.len() as u64,
            dimension,
            checksum,
        })
    }
}

/// Search result from CLIP index
#[derive(Debug, Clone, PartialEq)]
pub struct ClipSearchHit {
    /// Frame ID of the matched document
    pub frame_id: FrameId,
    /// Optional page number (for PDFs)
    pub page: Option<u32>,
    /// L2 distance to query (lower is more similar)
    pub distance: f32,
}

/// L2 (Euclidean) distance between two vectors
fn l2_distance(a: &[f32], b: &[f32]) -> f32 {
    a.iter()
        .zip(b.iter())
        .map(|(x, y)| (x - y).powi(2))
        .sum::<f32>()
        .sqrt()
}

// ============================================================================
// Image Filtering (Junk Detection)
// ============================================================================

/// Metadata about an image for filtering
#[derive(Debug, Clone)]
pub struct ImageInfo {
    pub width: u32,
    pub height: u32,
    pub color_variance: f32,
}

impl ImageInfo {
    /// Check if this image should be processed for CLIP embedding
    #[must_use]
    pub fn should_embed(&self) -> bool {
        // Skip tiny images (icons, bullets)
        if self.width < MIN_IMAGE_DIM || self.height < MIN_IMAGE_DIM {
            return false;
        }

        // Skip extreme aspect ratios (dividers, lines)
        let aspect = self.width as f32 / self.height as f32;
        if !((1.0 / MAX_ASPECT_RATIO)..=MAX_ASPECT_RATIO).contains(&aspect) {
            return false;
        }

        // Skip near-solid colors (backgrounds, spacers)
        if self.color_variance < MIN_COLOR_VARIANCE {
            return false;
        }

        true
    }
}

/// Filter a list of images, keeping only those worth embedding
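///
/// A minimal sketch using `ImageInfo` itself as the item type (thresholds are the
/// constants defined above):
///
/// ```ignore
/// let images = vec![
///     ImageInfo { width: 32, height: 32, color_variance: 0.2 },   // too small, dropped
///     ImageInfo { width: 512, height: 384, color_variance: 0.2 }, // kept
/// ];
/// let kept = filter_junk_images(images, |info| info.clone());
/// assert_eq!(kept.len(), 1);
/// ```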
pub fn filter_junk_images<T, F>(images: Vec<T>, get_info: F) -> Vec<T>
where
    F: Fn(&T) -> ImageInfo,
{
    images
        .into_iter()
        .filter(|img| get_info(img).should_embed())
        .collect()
}

// ============================================================================
// CLIP Model Configuration
// ============================================================================

/// Configuration for CLIP model initialization
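///
/// `ClipConfig::default()` honors the `MEMVID_MODELS_DIR`, `MEMVID_CLIP_MODEL`,
/// and `MEMVID_OFFLINE` environment variables, falling back to `~/.memvid/models`
/// and `mobileclip-s2` when they are unset.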
#[derive(Debug, Clone)]
pub struct ClipConfig {
    /// Model name (e.g., "mobileclip-s2", "siglip-base")
    pub model_name: String,
    /// Directory where models are cached
    pub models_dir: PathBuf,
    /// Whether to run in offline mode (no downloads)
    pub offline: bool,
}

impl Default for ClipConfig {
    fn default() -> Self {
        // Use ~/.memvid/models as default, consistent with CLI's model installation
        let models_dir = std::env::var("MEMVID_MODELS_DIR")
            .ok()
            .map(PathBuf::from)
            .or_else(|| dirs_next::home_dir().map(|d| d.join(".memvid/models")))
            .unwrap_or_else(|| PathBuf::from(".memvid/models"));

        let model_name =
            std::env::var("MEMVID_CLIP_MODEL").unwrap_or_else(|_| "mobileclip-s2".to_string());

        let offline = std::env::var("MEMVID_OFFLINE").is_ok();

        Self {
            model_name,
            models_dir,
            offline,
        }
    }
}

// ============================================================================
// CLIP Error Types
// ============================================================================

/// CLIP-specific errors
#[derive(Debug, thiserror::Error)]
pub enum ClipError {
    /// Model not found and offline mode enabled
    #[error("CLIP model '{model}' not found. {hint}")]
    ModelNotFound { model: String, hint: String },

    /// Image decode failed
    #[error("Failed to decode image at {path:?}: {cause}")]
    ImageDecodeError { path: PathBuf, cause: String },

    /// Image bytes decode failed
    #[error("Failed to decode image bytes: {cause}")]
    ImageBytesDecodeError { cause: String },

    /// ONNX runtime error
    #[error("CLIP inference error: {cause}")]
    InferenceError { cause: String },

    /// Model download failed
    #[error("Failed to download CLIP model: {cause}")]
    DownloadError { cause: String },

    /// Model file corrupted or invalid
    #[error("CLIP model file is corrupted: {cause}")]
    ModelCorrupted { cause: String },
}

impl From<ClipError> for MemvidError {
    fn from(err: ClipError) -> Self {
        MemvidError::EmbeddingFailed {
            reason: err.to_string().into_boxed_str(),
        }
    }
}

// ============================================================================
// CLIP Model (Feature-gated implementation)
// ============================================================================

#[cfg(feature = "clip")]
mod model {
    use super::*;
    use image::{DynamicImage, GenericImageView, imageops::FilterType};
    use ndarray::{Array, Array4};
    use ort::session::{Session, builder::GraphOptimizationLevel};
    use ort::value::Tensor;
    use std::sync::Mutex;
    use std::time::Instant;
    use tokenizers::{
        PaddingDirection, PaddingParams, PaddingStrategy, Tokenizer, TruncationDirection,
        TruncationParams, TruncationStrategy,
    };

    /// CLIP model with lazy-loaded vision and text encoders
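    ///
    /// Sessions are loaded on first use and can be released when idle. A short
    /// sketch (the image path is a placeholder; error handling is elided):
    ///
    /// ```ignore
    /// let model = ClipModel::default_model()?;
    /// assert!(!model.is_vision_loaded());       // nothing loaded yet
    /// let emb = model.encode_image_file(path)?; // vision encoder loads lazily here
    /// model.maybe_unload()?;                    // frees sessions after MODEL_UNLOAD_TIMEOUT idle
    /// ```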
    pub struct ClipModel {
        config: ClipConfig,
        model_info: &'static ClipModelInfo,
        /// Lazy-loaded vision encoder session
        vision_session: Mutex<Option<Session>>,
        /// Lazy-loaded text encoder session
        text_session: Mutex<Option<Session>>,
        /// Lazy-loaded tokenizer matching the text encoder
        tokenizer: Mutex<Option<Tokenizer>>,
        /// Last time the model was used (for idle unloading)
        last_used: Mutex<Instant>,
    }

    impl ClipModel {
        /// Create a new CLIP model with the given configuration
        pub fn new(config: ClipConfig) -> Result<Self> {
            let model_info = get_model_info(&config.model_name);

            Ok(Self {
                config,
                model_info,
                vision_session: Mutex::new(None),
                text_session: Mutex::new(None),
                tokenizer: Mutex::new(None),
                last_used: Mutex::new(Instant::now()),
            })
        }

        /// Create with default configuration
        pub fn default_model() -> Result<Self> {
            Self::new(ClipConfig::default())
        }

        /// Get model info
        pub fn model_info(&self) -> &'static ClipModelInfo {
            self.model_info
        }

        /// Get embedding dimensions
        pub fn dims(&self) -> u32 {
            self.model_info.dims
        }

        /// Ensure model file exists, downloading if necessary
        fn ensure_model_file(&self, kind: &str) -> Result<PathBuf> {
            let filename = format!("{}_{}.onnx", self.model_info.name, kind);
            let path = self.config.models_dir.join(&filename);

            if path.exists() {
                return Ok(path);
            }

            if self.config.offline {
                return Err(ClipError::ModelNotFound {
                    model: self.model_info.name.to_string(),
                    hint: format!(
                        "Run: memvid model download {} (or disable MEMVID_OFFLINE)",
                        self.model_info.name
                    ),
                }
                .into());
            }

            // Create models directory if needed
            std::fs::create_dir_all(&self.config.models_dir).map_err(|e| {
                ClipError::DownloadError {
                    cause: format!("Failed to create models directory: {}", e),
                }
            })?;

            // Provide manual download instructions
            Err(ClipError::DownloadError {
                cause: format!(
                    "Automatic download not yet implemented. Please download manually:\n\
                     curl -L '{}' -o '{}'",
                    if kind == "vision" {
                        self.model_info.vision_url
                    } else {
                        self.model_info.text_url
                    },
                    path.display()
                ),
            }
            .into())
        }

        /// Ensure tokenizer file exists, downloading if necessary
        fn ensure_tokenizer_file(&self) -> Result<PathBuf> {
            let filename = format!("{}_tokenizer.json", self.model_info.name);
            let path = self.config.models_dir.join(&filename);

            if path.exists() {
                return Ok(path);
            }

            if self.config.offline {
                return Err(ClipError::ModelNotFound {
                    model: self.model_info.name.to_string(),
                    hint: format!(
                        "Tokenizer missing at {}. Copy tokenizer.json from {}",
                        path.display(),
                        self.model_info.tokenizer_url
                    ),
                }
                .into());
            }

            std::fs::create_dir_all(&self.config.models_dir).map_err(|e| {
                ClipError::DownloadError {
                    cause: format!("Failed to create models directory: {}", e),
                }
            })?;

            Err(ClipError::DownloadError {
                cause: format!(
                    "Automatic download not yet implemented. Please download manually:\n\
                     curl -L '{}' -o '{}'",
                    self.model_info.tokenizer_url,
                    path.display()
                ),
            }
            .into())
        }

        /// Load vision session lazily
        fn load_vision_session(&self) -> Result<()> {
            let mut session_guard = self
                .vision_session
                .lock()
                .map_err(|_| MemvidError::Lock("Failed to lock vision session".into()))?;

            if session_guard.is_some() {
                return Ok(());
            }

            let vision_path = self.ensure_model_file("vision")?;

            tracing::debug!(path = %vision_path.display(), "Loading CLIP vision model");

            // Suppress stderr during ONNX session creation (macOS emits harmless warnings)
            let _stderr_guard = stderr_suppress::StderrSuppressor::new().ok();

            let session = Session::builder()
                .map_err(|e| ClipError::InferenceError {
                    cause: e.to_string(),
                })?
                .with_optimization_level(GraphOptimizationLevel::Level3)
                .map_err(|e| ClipError::InferenceError {
                    cause: e.to_string(),
                })?
                .with_intra_threads(4)
                .map_err(|e| ClipError::InferenceError {
                    cause: e.to_string(),
                })?
                .commit_from_file(&vision_path)
                .map_err(|e| ClipError::InferenceError {
                    cause: format!("Failed to load vision model: {}", e),
                })?;

            // _stderr_guard is dropped at the end of this function, restoring stderr

            *session_guard = Some(session);
            tracing::info!(model = %self.model_info.name, "CLIP vision model loaded");

            Ok(())
        }

        /// Load text session lazily
        fn load_text_session(&self) -> Result<()> {
            let mut session_guard = self
                .text_session
                .lock()
                .map_err(|_| MemvidError::Lock("Failed to lock text session".into()))?;

            if session_guard.is_some() {
                return Ok(());
            }

            let text_path = self.ensure_model_file("text")?;

            tracing::debug!(path = %text_path.display(), "Loading CLIP text model");

            // Suppress stderr during ONNX session creation (macOS emits harmless warnings)
            let _stderr_guard = stderr_suppress::StderrSuppressor::new().ok();

            let session = Session::builder()
                .map_err(|e| ClipError::InferenceError {
                    cause: e.to_string(),
                })?
                .with_optimization_level(GraphOptimizationLevel::Level3)
                .map_err(|e| ClipError::InferenceError {
                    cause: e.to_string(),
                })?
                .with_intra_threads(4)
                .map_err(|e| ClipError::InferenceError {
                    cause: e.to_string(),
                })?
                .commit_from_file(&text_path)
                .map_err(|e| ClipError::InferenceError {
                    cause: format!("Failed to load text model: {}", e),
                })?;

            // _stderr_guard is dropped at the end of this function, restoring stderr

            *session_guard = Some(session);
            tracing::info!(model = %self.model_info.name, "CLIP text model loaded");

            Ok(())
        }

        /// Load tokenizer lazily (matches the text model vocab/BPE)
        fn load_tokenizer(&self) -> Result<()> {
            let mut tokenizer_guard = self
                .tokenizer
                .lock()
                .map_err(|_| MemvidError::Lock("Failed to lock CLIP tokenizer".into()))?;

            if tokenizer_guard.is_some() {
                return Ok(());
            }

            let tokenizer_path = self.ensure_tokenizer_file()?;

            tracing::debug!(path = %tokenizer_path.display(), "Loading CLIP tokenizer");

            let mut tokenizer =
                Tokenizer::from_file(&tokenizer_path).map_err(|e| ClipError::InferenceError {
                    cause: format!("Failed to load tokenizer: {}", e),
                })?;

            tokenizer.with_padding(Some(PaddingParams {
                strategy: PaddingStrategy::Fixed(77),
                direction: PaddingDirection::Right,
                pad_to_multiple_of: None,
                pad_id: 0,
                pad_type_id: 0,
                pad_token: "[PAD]".to_string(),
            }));

            tokenizer
                .with_truncation(Some(TruncationParams {
                    max_length: 77,
                    strategy: TruncationStrategy::LongestFirst,
                    stride: 0,
                    direction: TruncationDirection::Right,
                }))
                .map_err(|e| ClipError::InferenceError {
                    cause: format!("Failed to apply truncation config: {}", e),
                })?;

            *tokenizer_guard = Some(tokenizer);
            tracing::info!(model = %self.model_info.name, "CLIP tokenizer loaded");

            Ok(())
        }

        /// Preprocess image for CLIP inference
        ///
        /// MobileCLIP-S2 uses:
        /// - Input size: 256x256
        /// - Resize: shortest edge to 256, preserve aspect, center-crop
        /// - Normalization: scale to [0, 1] (no mean/std shift per preprocessor_config)
        /// - Format: NCHW (batch, channels, height, width)
        fn preprocess_image(&self, image: &DynamicImage) -> Array4<f32> {
            let size = self.model_info.input_resolution;
            let rgb_input = image.to_rgb8();
            let (w, h) = rgb_input.dimensions();

            // Resize shortest edge to target while preserving aspect ratio
            let scale = size as f32 / w.min(h) as f32;
            let new_w = ((w as f32) * scale).round().max(1.0) as u32;
            let new_h = ((h as f32) * scale).round().max(1.0) as u32;
            let resized = image.resize_exact(new_w, new_h, FilterType::Triangle);

            // Center crop to (size, size)
            let start_x = (resized.width().saturating_sub(size)) / 2;
            let start_y = (resized.height().saturating_sub(size)) / 2;

            // Create array in NCHW format: [1, 3, H, W]
            let mut array = Array4::<f32>::zeros((1, 3, size as usize, size as usize));

            for y in 0..size as usize {
                for x in 0..size as usize {
                    let pixel = resized.get_pixel(start_x + x as u32, start_y + y as u32);
                    array[[0, 0, y, x]] = pixel[0] as f32 / 255.0;
                    array[[0, 1, y, x]] = pixel[1] as f32 / 255.0;
                    array[[0, 2, y, x]] = pixel[2] as f32 / 255.0;
                }
            }

            array
        }

        /// Encode an image to CLIP embedding
        pub fn encode_image(&self, image: &DynamicImage) -> Result<Vec<f32>> {
            // Ensure vision session is loaded
            self.load_vision_session()?;

            // Preprocess the image
            let pixel_values = self.preprocess_image(image);

            // Update last used timestamp
            if let Ok(mut last) = self.last_used.lock() {
                *last = Instant::now();
            }

            // Run inference
            let mut session_guard = self
                .vision_session
                .lock()
                .map_err(|_| MemvidError::Lock("Failed to lock vision session".into()))?;

            let session = session_guard
                .as_mut()
                .ok_or_else(|| ClipError::InferenceError {
                    cause: "Vision session not loaded".to_string(),
                })?;

            // Get input and output names from session before running
            let input_name = session
                .inputs
                .first()
                .map(|i| i.name.clone())
                .unwrap_or_else(|| "pixel_values".into());
            let output_name = session
                .outputs
                .first()
                .map(|o| o.name.clone())
                .unwrap_or_else(|| "image_embeds".into());

            // Create tensor from ndarray
            let input_tensor =
                Tensor::from_array(pixel_values).map_err(|e| ClipError::InferenceError {
                    cause: format!("Failed to create input tensor: {}", e),
                })?;

            // Run the model
            let outputs = session
                .run(ort::inputs![input_name => input_tensor])
                .map_err(|e| ClipError::InferenceError {
                    cause: format!("Vision inference failed: {}", e),
                })?;

            // Extract embeddings from first output
            let output = outputs
                .get(&output_name)
                .ok_or_else(|| ClipError::InferenceError {
                    cause: format!("No output '{}' from vision model", output_name),
                })?;

            let (_shape, data) =
                output
                    .try_extract_tensor::<f32>()
                    .map_err(|e| ClipError::InferenceError {
                        cause: format!("Failed to extract embeddings: {}", e),
                    })?;

            // Get the embedding from the raw data
            let embedding: Vec<f32> = data.to_vec();
            if embedding.iter().any(|v| !v.is_finite()) {
                return Err(ClipError::InferenceError {
                    cause: "Vision embedding contains non-finite values".to_string(),
                }
                .into());
            }
            let normalized = l2_normalize(&embedding);

            tracing::debug!(dims = normalized.len(), "Generated CLIP image embedding");

            Ok(normalized)
        }

        /// Encode image bytes to CLIP embedding
        pub fn encode_image_bytes(&self, bytes: &[u8]) -> Result<Vec<f32>> {
            let image =
                image::load_from_memory(bytes).map_err(|e| ClipError::ImageBytesDecodeError {
                    cause: e.to_string(),
                })?;
            self.encode_image(&image)
        }

        /// Encode an image file to CLIP embedding
        pub fn encode_image_file(&self, path: &Path) -> Result<Vec<f32>> {
            let image = image::open(path).map_err(|e| ClipError::ImageDecodeError {
                path: path.to_path_buf(),
                cause: e.to_string(),
            })?;
            self.encode_image(&image)
        }

        /// Encode text to CLIP embedding (for query)
        pub fn encode_text(&self, text: &str) -> Result<Vec<f32>> {
            // Ensure text session is loaded
            self.load_text_session()?;
            self.load_tokenizer()?;

            // Tokenize the text using the model's tokenizer
            let encoding = {
                let tokenizer_guard = self
                    .tokenizer
                    .lock()
                    .map_err(|_| MemvidError::Lock("Failed to lock CLIP tokenizer".into()))?;
                let tokenizer =
                    tokenizer_guard
                        .as_ref()
                        .ok_or_else(|| ClipError::InferenceError {
                            cause: "Tokenizer not loaded".to_string(),
                        })?;

                tokenizer
                    .encode(text, true)
                    .map_err(|e| ClipError::InferenceError {
                        cause: format!("Text tokenization failed: {}", e),
                    })?
            };

            let input_ids: Vec<i64> = encoding.get_ids().iter().map(|id| *id as i64).collect();
            let attention_mask: Vec<i64> = encoding
                .get_attention_mask()
                .iter()
                .map(|id| *id as i64)
                .collect();
            let max_length = input_ids.len();

            // Create input arrays
            let input_ids_array =
                Array::from_shape_vec((1, max_length), input_ids).map_err(|e| {
                    ClipError::InferenceError {
                        cause: e.to_string(),
                    }
                })?;
            let attention_mask_array = Array::from_shape_vec((1, max_length), attention_mask)
                .map_err(|e| ClipError::InferenceError {
                    cause: e.to_string(),
                })?;

            // Update last used timestamp
            if let Ok(mut last) = self.last_used.lock() {
                *last = Instant::now();
            }

            // Run inference
            let mut session_guard = self
                .text_session
                .lock()
                .map_err(|_| MemvidError::Lock("Failed to lock text session".into()))?;

            let session = session_guard
                .as_mut()
                .ok_or_else(|| ClipError::InferenceError {
                    cause: "Text session not loaded".to_string(),
                })?;

            // Get input and output names from session before running
            let input_names: Vec<String> = session.inputs.iter().map(|i| i.name.clone()).collect();
            let output_name = session
                .outputs
                .first()
                .map(|o| o.name.clone())
                .unwrap_or_else(|| "text_embeds".into());

            // Create tensors from ndarray
            let input_ids_tensor =
                Tensor::from_array(input_ids_array).map_err(|e| ClipError::InferenceError {
                    cause: format!("Failed to create input_ids tensor: {}", e),
                })?;
            let attention_mask_tensor = Tensor::from_array(attention_mask_array).map_err(|e| {
                ClipError::InferenceError {
                    cause: format!("Failed to create attention_mask tensor: {}", e),
                }
            })?;

            // Build inputs based on what the model expects
            let outputs = if input_names.len() >= 2 {
                session
                    .run(ort::inputs![
                        input_names[0].clone() => input_ids_tensor,
                        input_names[1].clone() => attention_mask_tensor
                    ])
                    .map_err(|e| ClipError::InferenceError {
                        cause: format!("Text inference failed: {}", e),
                    })?
            } else {
                // Single input model
                let name = input_names
                    .first()
                    .cloned()
                    .unwrap_or_else(|| "input_ids".to_string());
                session
                    .run(ort::inputs![name => input_ids_tensor])
                    .map_err(|e| ClipError::InferenceError {
                        cause: format!("Text inference failed: {}", e),
                    })?
            };

            // Extract embeddings from output
            let output = outputs
                .get(&output_name)
                .ok_or_else(|| ClipError::InferenceError {
                    cause: format!("No output '{}' from text model", output_name),
                })?;

            let (_shape, data) =
                output
                    .try_extract_tensor::<f32>()
                    .map_err(|e| ClipError::InferenceError {
                        cause: format!("Failed to extract text embeddings: {}", e),
                    })?;

            // Flatten and normalize the embedding
            let embedding: Vec<f32> = data.to_vec();
            if embedding.iter().any(|v| !v.is_finite()) {
                return Err(ClipError::InferenceError {
                    cause: "Text embedding contains non-finite values".to_string(),
                }
                .into());
            }
            let normalized = l2_normalize(&embedding);

            tracing::debug!(
                text_len = text.len(),
                dims = normalized.len(),
                "Generated CLIP text embedding"
            );

            Ok(normalized)
        }

        /// Maybe unload model if unused for too long (memory management)
        pub fn maybe_unload(&self) -> Result<()> {
            let last_used = self
                .last_used
                .lock()
                .map_err(|_| MemvidError::Lock("Failed to check last_used".into()))?;

            if last_used.elapsed() > MODEL_UNLOAD_TIMEOUT {
                tracing::debug!(model = %self.model_info.name, "Model idle, unloading sessions");

                // Unload vision session
                if let Ok(mut guard) = self.vision_session.lock() {
                    *guard = None;
                }

                // Unload text session
                if let Ok(mut guard) = self.text_session.lock() {
                    *guard = None;
                }

                // Unload tokenizer
                if let Ok(mut guard) = self.tokenizer.lock() {
                    *guard = None;
                }
            }

            Ok(())
        }

        /// Force unload all sessions
        pub fn unload(&self) -> Result<()> {
            if let Ok(mut guard) = self.vision_session.lock() {
                *guard = None;
            }
            if let Ok(mut guard) = self.text_session.lock() {
                *guard = None;
            }
            if let Ok(mut guard) = self.tokenizer.lock() {
                *guard = None;
            }
            tracing::debug!(model = %self.model_info.name, "CLIP sessions unloaded");
            Ok(())
        }

        /// Check if vision model is loaded
        pub fn is_vision_loaded(&self) -> bool {
            self.vision_session
                .lock()
                .map(|g| g.is_some())
                .unwrap_or(false)
        }

        /// Check if text model is loaded
        pub fn is_text_loaded(&self) -> bool {
            self.text_session
                .lock()
                .map(|g| g.is_some())
                .unwrap_or(false)
        }
    }

    /// L2 normalize a vector (unit length)
    fn l2_normalize(v: &[f32]) -> Vec<f32> {
        let norm: f32 = v.iter().map(|x| x * x).sum::<f32>().sqrt();
        if norm.is_finite() && norm > 1e-10 {
            v.iter().map(|x| x / norm).collect()
        } else {
            // Fall back to zeros to avoid NaNs propagating through distances
            vec![0.0; v.len()]
        }
    }

    /// Calculate color variance of an image
    pub fn calculate_color_variance(image: &DynamicImage) -> f32 {
        let rgb = image.to_rgb8();
        let (width, height) = rgb.dimensions();
        let total_pixels = (width * height) as f32;

        if total_pixels == 0.0 {
            return 0.0;
        }

        // Calculate mean
        let mut sum_r = 0.0f32;
        let mut sum_g = 0.0f32;
        let mut sum_b = 0.0f32;

        for pixel in rgb.pixels() {
            sum_r += pixel[0] as f32;
            sum_g += pixel[1] as f32;
            sum_b += pixel[2] as f32;
        }

        let mean_r = sum_r / total_pixels;
        let mean_g = sum_g / total_pixels;
        let mean_b = sum_b / total_pixels;

        // Calculate variance
        let mut var_r = 0.0f32;
        let mut var_g = 0.0f32;
        let mut var_b = 0.0f32;

        for pixel in rgb.pixels() {
            var_r += (pixel[0] as f32 - mean_r).powi(2);
            var_g += (pixel[1] as f32 - mean_g).powi(2);
            var_b += (pixel[2] as f32 - mean_b).powi(2);
        }

        // Average variance across channels, normalized to 0-1
        ((var_r + var_g + var_b) / (3.0 * total_pixels)) / (255.0 * 255.0)
    }

    /// Get ImageInfo from a DynamicImage
    pub fn get_image_info(image: &DynamicImage) -> ImageInfo {
        let (width, height) = image.dimensions();
        let color_variance = calculate_color_variance(image);

        ImageInfo {
            width,
            height,
            color_variance,
        }
    }
}

#[cfg(feature = "clip")]
pub use model::*;

#[cfg(all(feature = "clip", feature = "pdfium"))]
use pdfium_render::prelude::{PdfPageRenderRotation, PdfRenderConfig, Pdfium};

/// Render PDF pages to images suitable for CLIP embedding (feature-gated).
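///
/// Returns `(page_number, image)` pairs, capped at `max_pages`. A short usage
/// sketch (the PDF path and `model` are placeholders; error handling is elided):
///
/// ```ignore
/// let pages = render_pdf_pages_for_clip(Path::new("report.pdf"), 8, 1024)?;
/// for (page_no, image) in &pages {
///     let embedding = model.encode_image(image)?; // embed each rendered page
/// }
/// ```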
#[cfg(all(feature = "clip", feature = "pdfium"))]
pub fn render_pdf_pages_for_clip(
    path: &Path,
    max_pages: usize,
    target_px: u32,
) -> Result<Vec<(u32, DynamicImage)>> {
    let bindings = Pdfium::bind_to_system_library().map_err(|e| ClipError::InferenceError {
        cause: format!("Failed to bind pdfium: {}", e),
    })?;
    let pdfium = Pdfium::new(bindings);
    let document =
        pdfium
            .load_pdf_from_file(path, None)
            .map_err(|e| ClipError::InferenceError {
                cause: format!("Failed to load PDF for CLIP rendering: {}", e),
            })?;

    let render_config = PdfRenderConfig::new()
        .set_target_width(target_px as i32)
        .set_maximum_height(target_px as i32)
        .set_maximum_width(target_px as i32)
        .rotate_if_landscape(PdfPageRenderRotation::None, false);

    let mut pages = Vec::new();
    for (index, page) in document.pages().iter().enumerate() {
        if index >= max_pages {
            break;
        }
        let rendered = page
            .render_with_config(&render_config)
            .map_err(|e| ClipError::InferenceError {
                cause: format!("Failed to render PDF page {}: {}", index + 1, e),
            })?
            .as_image();
        pages.push(((index + 1) as u32, rendered));
    }

    Ok(pages)
}

#[cfg(all(feature = "clip", not(feature = "pdfium")))]
pub fn render_pdf_pages_for_clip(
    path: &Path,
    max_pages: usize,
    _target_px: u32,
) -> Result<Vec<(u32, DynamicImage)>> {
    fn extract_images_from_page(
        doc: &Document,
        page_id: ObjectId,
        remaining: &mut usize,
        out: &mut Vec<(u32, DynamicImage)>,
    ) -> Result<()> {
        if *remaining == 0 {
            return Ok(());
        }

        let (resources_opt, resource_ids) =
            doc.get_page_resources(page_id)
                .map_err(|e| ClipError::InferenceError {
                    cause: format!("Failed to read PDF resources: {}", e),
                })?;

        let mut seen = HashSet::new();
        let mut resource_dicts: Vec<Dictionary> = Vec::new();

        if let Some(dict) = resources_opt {
            resource_dicts.push(dict.clone());
        }
        for res_id in resource_ids {
            if seen.insert(res_id) {
                if let Ok(dict) = doc.get_dictionary(res_id) {
                    resource_dicts.push(dict.clone());
                }
            }
        }

        for dict in resource_dicts {
            if let Ok(xobjects) = dict.get(b"XObject") {
                let xobj_dict = match xobjects {
                    Object::Dictionary(d) => Some(d),
                    Object::Reference(id) => doc.get_dictionary(*id).ok(),
                    _ => None,
                };
                if let Some(xobj_dict) = xobj_dict {
                    for (_, obj) in xobj_dict.iter() {
                        let id = match obj {
                            Object::Reference(id) => *id,
                            _ => continue,
                        };
                        let stream = match doc.get_object(id).and_then(Object::as_stream) {
                            Ok(s) => s,
                            Err(_) => continue,
                        };
                        let subtype = stream.dict.get(b"Subtype").ok();
                        let is_image = matches!(subtype, Some(Object::Name(n)) if n == b"Image");
                        if !is_image {
                            continue;
                        }

                        let width = stream
                            .dict
                            .get(b"Width")
                            .ok()
                            .and_then(|o| o.as_i64().ok())
                            .unwrap_or(0);
                        let height = stream
                            .dict
                            .get(b"Height")
                            .ok()
                            .and_then(|o| o.as_i64().ok())
                            .unwrap_or(0);
                        if width <= 0 || height <= 0 {
                            continue;
                        }

                        let filters = stream
                            .dict
                            .get(b"Filter")
                            .ok()
                            .and_then(|f| match f {
                                Object::Name(n) => Some(vec![n.clone()]),
                                Object::Array(arr) => Some(
                                    arr.iter()
                                        .filter_map(|o| o.as_name().ok().map(|n| n.to_vec()))
                                        .collect(),
                                ),
                                _ => None,
                            })
                            .unwrap_or_default();

                        let data = stream
                            .decompressed_content()
                            .unwrap_or_else(|_| stream.content.clone());

                        // If DCT/JPX, hand to image crate directly
                        if filters
                            .iter()
                            .any(|f| f == b"DCTDecode" || f == b"JPXDecode")
                        {
                            if let Ok(img) = image::load_from_memory(&data) {
                                out.push((1, img));
                                *remaining -= 1;
                                if *remaining == 0 {
                                    return Ok(());
                                }
                                continue;
                            }
                        }

                        let color_space = stream
                            .dict
                            .get(b"ColorSpace")
                            .ok()
                            .and_then(|o| o.as_name().ok())
                            .unwrap_or(b"DeviceRGB");
                        let channels = if color_space == b"DeviceGray" { 1 } else { 3 };

                        let expected = width as usize * height as usize * channels;
                        if data.len() >= expected && channels == 3 {
                            if let Some(buf) = ImageBuffer::<Rgb<u8>, _>::from_raw(
                                width as u32,
                                height as u32,
                                data.clone(),
                            ) {
                                out.push((1, DynamicImage::ImageRgb8(buf)));
                                *remaining -= 1;
                                if *remaining == 0 {
                                    return Ok(());
                                }
                                continue;
                            }
                        } else if data.len() >= expected && channels == 1 {
                            if let Some(buf) = ImageBuffer::<Luma<u8>, _>::from_raw(
                                width as u32,
                                height as u32,
                                data.clone(),
                            ) {
                                out.push((1, DynamicImage::ImageLuma8(buf)));
                                *remaining -= 1;
                                if *remaining == 0 {
                                    return Ok(());
                                }
                                continue;
                            }
                        }
                    }
                }
            }
        }

        Ok(())
    }

    let doc = Document::load(path).map_err(|e| ClipError::InferenceError {
        cause: format!("Failed to load PDF for image extraction: {}", e),
    })?;

    let mut remaining = max_pages;
    let mut pages: Vec<(u32, DynamicImage)> = Vec::new();

    for (page_num, page_id) in doc.get_pages() {
        if remaining == 0 {
            break;
        }
        let start_len = pages.len();
        extract_images_from_page(&doc, page_id, &mut remaining, &mut pages)?;
        if pages.len() > start_len {
            for entry in pages.iter_mut().skip(start_len) {
                entry.0 = page_num as u32;
            }
        }
    }

    Ok(pages)
}

// ============================================================================
// CLIP Embedding Provider Trait
// ============================================================================

/// Trait for CLIP visual embedding providers.
///
/// Unlike text `EmbeddingProvider`, CLIP providers handle both:
/// - **Image encoding**: Generate embeddings from images (for indexing)
/// - **Text encoding**: Generate embeddings from text (for queries)
///
/// This allows natural language queries against visual content.
///
/// # Example
///
/// ```ignore
/// use memvid_core::clip::{ClipEmbeddingProvider, ClipConfig};
///
/// // Create provider
/// let provider = ClipModel::new(ClipConfig::default())?;
///
/// // Encode image for indexing
/// let image_embedding = provider.embed_image_file(&path)?;
///
/// // Encode query text for search
/// let query_embedding = provider.embed_query("a photo of a cat")?;
///
1506/// // Search uses cosine similarity between query and image embeddings
1507/// ```
1508pub trait ClipEmbeddingProvider: Send + Sync {
1509    /// Return the provider kind (e.g., "mobileclip", "siglip").
1510    fn kind(&self) -> &str;
1511
1512    /// Return the model identifier.
1513    fn model(&self) -> &str;
1514
1515    /// Return the embedding dimension.
1516    fn dimension(&self) -> usize;
1517
1518    /// Generate an embedding for an image file.
1519    fn embed_image_file(&self, path: &Path) -> Result<Vec<f32>>;
1520
1521    /// Generate an embedding for image bytes.
1522    fn embed_image_bytes(&self, bytes: &[u8]) -> Result<Vec<f32>>;
1523
1524    /// Generate an embedding for a text query (for searching).
1525    fn embed_query(&self, text: &str) -> Result<Vec<f32>>;
1526
1527    /// Generate embeddings for multiple image files.
1528    ///
1529    /// Default implementation calls `embed_image_file` in a loop.
1530    /// Providers should override this for efficient batch processing.
1531    fn embed_image_batch(&self, paths: &[&Path]) -> Result<Vec<Vec<f32>>> {
1532        let mut embeddings = Vec::with_capacity(paths.len());
1533        for path in paths {
1534            embeddings.push(self.embed_image_file(path)?);
1535        }
1536        Ok(embeddings)
1537    }
1538
1539    /// Check if the provider is ready to generate embeddings.
1540    fn is_ready(&self) -> bool {
1541        true
1542    }
1543
1544    /// Initialize the provider (e.g., load models).
1545    fn init(&mut self) -> Result<()> {
1546        Ok(())
1547    }
1548
1549    /// Unload models to free memory.
1550    fn unload(&self) -> Result<()> {
1551        Ok(())
1552    }
1553}
1554
1555/// Result type for a single CLIP embedding operation.
1556pub type ClipEmbeddingResult = Result<Vec<f32>>;
1557pub type ClipBatchEmbeddingResult = Result<Vec<Vec<f32>>>; // Batch counterpart of `ClipEmbeddingResult`.
1558
1559// ============================================================================
1560// ClipEmbeddingProvider Implementation (Feature-gated)
1561// ============================================================================
1562
1563#[cfg(feature = "clip")]
1564impl ClipEmbeddingProvider for ClipModel {
1565    fn kind(&self) -> &str {
1566        "clip"
1567    }
1568
1569    fn model(&self) -> &str {
1570        self.model_info().name
1571    }
1572
1573    fn dimension(&self) -> usize {
1574        self.model_info().dims as usize
1575    }
1576
1577    fn embed_image_file(&self, path: &Path) -> Result<Vec<f32>> {
1578        self.encode_image_file(path)
1579    }
1580
1581    fn embed_image_bytes(&self, bytes: &[u8]) -> Result<Vec<f32>> {
1582        self.encode_image_bytes(bytes)
1583    }
1584
1585    fn embed_query(&self, text: &str) -> Result<Vec<f32>> {
1586        self.encode_text(text)
1587    }
1588
1589    fn embed_image_batch(&self, paths: &[&Path]) -> Result<Vec<Vec<f32>>> {
1590        let mut embeddings = Vec::with_capacity(paths.len());
1591        for path in paths {
1592            embeddings.push(self.encode_image_file(path)?);
1593        }
1594        Ok(embeddings)
1595    }
1596
1597    fn is_ready(&self) -> bool {
1598        // CLIP models are lazy-loaded, so always "ready"
1599        true
1600    }
1601
1602    fn unload(&self) -> Result<()> {
1603        ClipModel::unload(self)
1604    }
1605}
1606
1607// ============================================================================
1608// CLIP Index Manifest (for TOC)
1609// ============================================================================
1610
1611/// Manifest for CLIP index stored in TOC
1612#[derive(Debug, Clone, Serialize, Deserialize)]
1613pub struct ClipIndexManifest {
1614    /// Byte offset in file
1615    pub bytes_offset: u64,
1616    /// Length in bytes
1617    pub bytes_length: u64,
1618    /// Number of vectors
1619    pub vector_count: u64,
1620    /// Embedding dimensions
1621    pub dimension: u32,
1622    /// Blake3 checksum
1623    pub checksum: [u8; 32],
1624    /// Model name used to generate embeddings
1625    pub model_name: String,
1626}
1627
1628// ============================================================================
1629// Tests
1630// ============================================================================
1631
1632#[cfg(test)]
1633mod tests {
1634    use super::*;
1635
1636    #[test]
1637    fn clip_index_builder_roundtrip() {
1638        let mut builder = ClipIndexBuilder::new();
1639        builder.add_document(1, None, vec![0.1, 0.2, 0.3, 0.4]);
1640        builder.add_document(2, None, vec![0.5, 0.6, 0.7, 0.8]);
1641
1642        let artifact = builder.finish().expect("finish");
1643        assert_eq!(artifact.vector_count, 2);
1644        assert_eq!(artifact.dimension, 4);
1645
1646        let index = ClipIndex::decode(&artifact.bytes).expect("decode");
1647        assert_eq!(index.len(), 2);
1648
1649        let hits = index.search(&[0.1, 0.2, 0.3, 0.4], 10);
1650        assert_eq!(hits[0].frame_id, 1);
1651        assert!(hits[0].distance < 0.001); // Should be very close
1652    }
1653
1654    #[test]
1655    fn clip_index_search() {
1656        let mut builder = ClipIndexBuilder::new();
1657        builder.add_document(1, None, vec![1.0, 0.0, 0.0]);
1658        builder.add_document(2, None, vec![0.0, 1.0, 0.0]);
1659        builder.add_document(3, None, vec![0.0, 0.0, 1.0]);
1660
1661        let artifact = builder.finish().expect("finish");
1662        let index = ClipIndex::decode(&artifact.bytes).expect("decode");
1663
1664        // Search for [1, 0, 0] - should find frame 1 first
1665        let hits = index.search(&[1.0, 0.0, 0.0], 3);
1666        assert_eq!(hits[0].frame_id, 1);
1667
1668        // Search for [0, 1, 0] - should find frame 2 first
1669        let hits = index.search(&[0.0, 1.0, 0.0], 3);
1670        assert_eq!(hits[0].frame_id, 2);
1671    }
1672
1673    #[test]
1674    fn l2_distance_calculation() {
1675        let d = l2_distance(&[0.0, 0.0], &[3.0, 4.0]);
1676        assert!((d - 5.0).abs() < 1e-6);
1677
1678        let d = l2_distance(&[1.0, 1.0, 1.0], &[1.0, 1.0, 1.0]);
1679        assert!(d.abs() < 1e-6);
1680    }
1681
1682    #[test]
1683    fn image_info_filtering() {
1684        // Tiny image - should skip
1685        let tiny = ImageInfo {
1686            width: 32,
1687            height: 32,
1688            color_variance: 0.5,
1689        };
1690        assert!(!tiny.should_embed());
1691
1692        // Good image
1693        let good = ImageInfo {
1694            width: 256,
1695            height: 256,
1696            color_variance: 0.5,
1697        };
1698        assert!(good.should_embed());
1699
1700        // Extreme aspect ratio
1701        let wide = ImageInfo {
1702            width: 1000,
1703            height: 10,
1704            color_variance: 0.5,
1705        };
1706        assert!(!wide.should_embed());
1707
1708        // Solid color
1709        let solid = ImageInfo {
1710            width: 256,
1711            height: 256,
1712            color_variance: 0.001,
1713        };
1714        assert!(!solid.should_embed());
1715    }
1716
1717    #[test]
1718    fn model_registry() {
1719        let default = default_model_info();
1720        assert_eq!(default.name, "mobileclip-s2");
1721        assert_eq!(default.dims, 512);
1722        assert!(default.is_default);
1723
1724        let siglip = get_model_info("siglip-base");
1725        assert_eq!(siglip.dims, 768);
1726
1727        // Unknown model returns default
1728        let unknown = get_model_info("nonexistent");
1729        assert_eq!(unknown.name, "mobileclip-s2");
1730    }
1731
1732    #[test]
1733    fn clip_config_defaults() {
1734        // Clear the env vars to test true defaults
1735        // SAFETY: No other threads are modifying these env vars in this test
1736        unsafe {
1737            std::env::remove_var("MEMVID_CLIP_MODEL");
1738            std::env::remove_var("MEMVID_OFFLINE");
1739        }
1740
1741        let config = ClipConfig::default();
1742        assert_eq!(config.model_name, "mobileclip-s2");
1743        assert!(!config.offline);
1744    }
1745
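    // Sketch (not in the original test suite): constructs a `ClipIndexManifest` by
    // hand to show how the Blake3 checksum field pairs with the serialized index
    // bytes. The offsets, counts, and model name below are illustrative values only;
    // real values come from the index writer.
    #[test]
    fn clip_index_manifest_checksum() {
        let index_bytes = vec![0u8; 128];
        let manifest = ClipIndexManifest {
            bytes_offset: 0,
            bytes_length: index_bytes.len() as u64,
            vector_count: 2,
            dimension: 4,
            checksum: *hash(&index_bytes).as_bytes(),
            model_name: "mobileclip-s2".to_string(),
        };

        // Re-hashing the same bytes must reproduce the stored checksum.
        assert_eq!(manifest.checksum, *hash(&index_bytes).as_bytes());
        assert_eq!(manifest.bytes_length, 128);
    }
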
1746    #[test]
1747    fn clip_embedding_provider_trait() {
1748        // Test that the trait is properly defined
1749        fn assert_send_sync<T: Send + Sync>() {}
1750
1751        // The trait should require Send + Sync
1752        assert_send_sync::<Box<dyn super::ClipEmbeddingProvider>>();
1753    }
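
    // Sketch (illustrative, not part of the crate's API): a stub provider that
    // returns fixed-size zero vectors, used here to exercise the default
    // `embed_image_batch` implementation without loading any CLIP model.
    struct StubClipProvider;

    impl ClipEmbeddingProvider for StubClipProvider {
        fn kind(&self) -> &str {
            "stub"
        }

        fn model(&self) -> &str {
            "stub-model"
        }

        fn dimension(&self) -> usize {
            4
        }

        fn embed_image_file(&self, _path: &Path) -> Result<Vec<f32>> {
            Ok(vec![0.0; 4])
        }

        fn embed_image_bytes(&self, _bytes: &[u8]) -> Result<Vec<f32>> {
            Ok(vec![0.0; 4])
        }

        fn embed_query(&self, _text: &str) -> Result<Vec<f32>> {
            Ok(vec![0.0; 4])
        }
    }

    #[test]
    fn embed_image_batch_default_impl() {
        let provider = StubClipProvider;
        // The default trait method loops over `embed_image_file`; the stub never
        // opens the paths, so placeholder names are fine here.
        let paths = [Path::new("a.png"), Path::new("b.png")];
        let embeddings = provider.embed_image_batch(&paths).expect("batch embed");
        assert_eq!(embeddings.len(), 2);
        assert!(embeddings.iter().all(|e| e.len() == provider.dimension()));
    }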
1754}