Skip to main content

mimirs_embed/
lib.rs

1//! mimirs-embed: LFM2.5-VL-450M embedding provider
2//!
3//! Produces 1024-dimensional f32 embeddings from text, images, documents,
4//! and media files via the LFM2.5-VL multimodal model.
5//!
6//! Supported formats:
7//!   Text:      .txt, .md, .rst, .log, .rs, .py, .js, .ts, .go, .c, .cpp,
8//!              .h, .hpp, .java, .rb, .sh, .toml, .yaml, .yml, .json, .xml,
9//!              .html, .htm, .css, .scss, .less, .sql, .php, .swift, .kt,
10//!              .scala, .r, .m, .mm, .lua, .pl, .pm, .vim, .el, .lisp, .hs,
11//!              .ml, .fs, .fsx, .clj, .ex, .exs, .erl, .hrl
12//!   Rich text: .rtf
13//!   Word:      .docx, .doc (via antiword)
14//!   Excel:     .xlsx, .xls, .csv, .tsv
15//!   PowerPoint:.pptx, .ppt (via libreoffice)
16//!   OpenDoc:   .odt, .ods, .odp
17//!   ePub:      .epub
18//!   LaTeX:     .tex, .latex, .bib
19//!   PDF:       .pdf
20//!   Images:    .png, .jpg, .jpeg, .webp, .bmp, .gif, .tiff, .tif, .svg,
21//!              .ico, .heic, .heif, .avif
22//!   Audio:     .mp3, .wav, .flac, .ogg, .m4a, .aac, .wma, .opus
23//!   Video:     .mp4, .mkv, .avi, .webm, .mov, .wmv, .flv, .m4v, .mpg,
24//!              .mpeg, .3gp
25//!   Archives:  .zip, .tar, .gz, .bz2, .xz, .7z, .rar (extracts & embeds
26//!              contained text files)
27
28use std::fs;
29use std::io::Read;
30
31use std::path::{Path, PathBuf};
32use std::process::Command;
33
34use ndarray::Array1;
35use sha2::{Digest, Sha256};
36
37/// Default embedding dimension for LFM2.5-VL-450M.
38///
39/// Alias for [`mimirs_core::EMBED_DIM`] — the single source of truth.
40pub const EMBED_DIM: usize = mimirs_core::EMBED_DIM;
41
42/// HuggingFace download URL for the LFM2.5-VL-450M GGUF model.
43pub const MODEL_URL: &str = "https://huggingface.co/LiquidAI/LFM2.5-VL-450M-GGUF/resolve/main/LFM2.5-VL-450M-Q8_0.gguf?download=true";
44
45/// Model filename after download.
46pub const MODEL_FILENAME: &str = "LFM2.5-VL-450M-Q8_0.gguf";
47
/// Errors that can occur during embedding.
///
/// The public `embed_*` methods visible in this file swallow these errors and
/// fall back to a hash-based embedding; the variants surface from the private
/// helpers.
#[derive(Debug, thiserror::Error)]
pub enum EmbedError {
    /// Wraps any [`std::io::Error`]; `?` converts into it automatically via `#[from]`.
    #[error("IO error: {0}")]
    Io(#[from] std::io::Error),
    /// The model file was not found; carries the path reported in the message.
    #[error("Model not found at {0}")]
    ModelNotFound(PathBuf),
    /// The `llama-embedding` executable could not be located.
    #[error("llama.cpp binary not found: {0}")]
    LlamaNotFound(String),
    /// The input format is not supported; carries a description for the message.
    #[error("Unsupported format: {0}")]
    UnsupportedFormat(String),
    /// Image decoding, SVG rendering, or PNG encoding failed.
    #[error("Image decode error: {0}")]
    ImageError(String),
    /// PDF text extraction failed.
    #[error("PDF extraction error: {0}")]
    PdfError(String),
    /// A ZIP container (including ZIP-based office formats) could not be opened.
    #[error("ZIP extraction error: {0}")]
    ZipError(String),
    /// An archive could not be processed.
    #[error("Archive error: {0}")]
    ArchiveError(String),
    /// Expected XML content was missing or could not be parsed.
    #[error("XML parse error: {0}")]
    XmlError(String),
    /// Marker for the hash-based fallback used when llama.cpp is unavailable.
    #[error("Fallback embedding (llama.cpp unavailable)")]
    Fallback,
}
72
/// LFM2.5-VL embedding provider.
///
/// Owns the model path and can produce embeddings from text, images, and
/// multimodal (text+image) inputs. Falls back to deterministic hash-based
/// embeddings when llama.cpp is unavailable.
///
/// The type only holds two paths, so it is cheap to clone and safe to print
/// in diagnostics.
#[derive(Debug, Clone)]
pub struct Lfm2Embedder {
    /// Location of the GGUF model file on disk.
    model_path: PathBuf,
    /// Directory that holds the model file; created before a download.
    cache_dir: PathBuf,
}
82
83impl Default for Lfm2Embedder {
84    fn default() -> Self {
85        Self::new()
86    }
87}
88
89impl Lfm2Embedder {
90    /// Creates a new embedder. The model will be downloaded on first use.
91    pub fn new() -> Self {
92        let cache_dir = dirs::cache_dir()
93            .unwrap_or_else(|| PathBuf::from("/tmp"))
94            .join("mimirswell")
95            .join("models");
96        let model_path = cache_dir.join(MODEL_FILENAME);
97        Self {
98            model_path,
99            cache_dir,
100        }
101    }
102
103    /// Ensures the GGUF model is downloaded, downloading if necessary.
104    fn ensure_model(&self) -> Result<(), EmbedError> {
105        if self.model_path.exists() {
106            return Ok(());
107        }
108        fs::create_dir_all(&self.cache_dir)?;
109        eprintln!("[mimirs-embed] Downloading LFM2.5-VL-450M from HuggingFace...");
110        let status = Command::new("curl")
111            .args([
112                "-L",
113                "-o",
114                self.model_path.to_string_lossy().as_ref(),
115                MODEL_URL,
116            ])
117            .status()?;
118        if !status.success() {
119            return Err(EmbedError::Io(std::io::Error::other(
120                "curl download failed",
121            )));
122        }
123        Ok(())
124    }
125
126    /// Returns the path to the llama-embedding binary, or None if not found.
127    fn find_llama_embedding(&self) -> Option<PathBuf> {
128        which::which("llama-embedding").ok().or_else(|| {
129            // Check common build locations
130            let candidates = [
131                "/usr/local/bin/llama-embedding",
132                "/opt/llama.cpp/bin/llama-embedding",
133                "/usr/bin/llama-embedding",
134            ];
135            candidates.iter().map(PathBuf::from).find(|p| p.exists())
136        })
137    }
138
139    /// Produces an embedding from text via llama.cpp.
140    fn embed_via_llama(&self, text: &str) -> Result<Array1<f64>, EmbedError> {
141        self.ensure_model()?;
142        let bin = self
143            .find_llama_embedding()
144            .ok_or_else(|| EmbedError::LlamaNotFound("llama-embedding not in PATH".into()))?;
145        let output = Command::new(&bin)
146            .args([
147                "-m",
148                self.model_path.to_string_lossy().as_ref(),
149                "-c",
150                "512",
151                "--embd-output-format",
152                "array",
153                "--embd-separator",
154                " ",
155                "-p",
156                text,
157            ])
158            .output()?;
159        if !output.status.success() {
160            return Err(EmbedError::Io(std::io::Error::other(
161                String::from_utf8_lossy(&output.stderr).to_string(),
162            )));
163        }
164        parse_embedding_output(&output.stdout)
165    }
166
167    /// Produces an embedding from an image via llama.cpp's --image flag.
168    fn embed_image_via_llama(&self, image_path: &Path) -> Result<Array1<f64>, EmbedError> {
169        self.ensure_model()?;
170        let bin = self
171            .find_llama_embedding()
172            .ok_or_else(|| EmbedError::LlamaNotFound("llama-embedding not in PATH".into()))?;
173        let output = Command::new(&bin)
174            .args([
175                "-m",
176                self.model_path.to_string_lossy().as_ref(),
177                "-c",
178                "512",
179                "--embd-output-format",
180                "array",
181                "--image",
182                image_path.to_string_lossy().as_ref(),
183            ])
184            .output()?;
185        if !output.status.success() {
186            return Err(EmbedError::Io(std::io::Error::other(
187                String::from_utf8_lossy(&output.stderr).to_string(),
188            )));
189        }
190        parse_embedding_output(&output.stdout)
191    }
192
193    /// Produces a joint text+image embedding via llama.cpp.
194    fn embed_multimodal_via_llama(
195        &self,
196        text: &str,
197        image_path: &Path,
198    ) -> Result<Array1<f64>, EmbedError> {
199        self.ensure_model()?;
200        let bin = self
201            .find_llama_embedding()
202            .ok_or_else(|| EmbedError::LlamaNotFound("llama-embedding not in PATH".into()))?;
203        let output = Command::new(&bin)
204            .args([
205                "-m",
206                self.model_path.to_string_lossy().as_ref(),
207                "-c",
208                "512",
209                "--embd-output-format",
210                "array",
211                "--image",
212                image_path.to_string_lossy().as_ref(),
213                "-p",
214                text,
215            ])
216            .output()?;
217        if !output.status.success() {
218            return Err(EmbedError::Io(std::io::Error::other(
219                String::from_utf8_lossy(&output.stderr).to_string(),
220            )));
221        }
222        parse_embedding_output(&output.stdout)
223    }
224
225    // ── Public API ──────────────────────────────────────────────────────
226
227    /// Embeds a text string.
228    pub fn embed(&self, text: &str) -> Array1<f64> {
229        self.embed_via_llama(text)
230            .unwrap_or_else(|_| hash_embedding(text.as_bytes()))
231    }
232
233    /// Embeds an image file (PNG, JPEG, WebP, BMP, GIF, TIFF, SVG, HEIC, AVIF, ICO).
234    pub fn embed_image(&self, path: &Path) -> Array1<f64> {
235        // Try to render/convert to pixels first for formats that need it
236        let pixel_path = match self.load_image_pixels(path) {
237            Ok(p) => p,
238            Err(_) => {
239                return self
240                    .embed_image_via_llama(path)
241                    .unwrap_or_else(|_| hash_embedding(&fs::read(path).unwrap_or_default()));
242            }
243        };
244        self.embed_image_via_llama(&pixel_path)
245            .unwrap_or_else(|_| hash_embedding(&fs::read(&pixel_path).unwrap_or_default()))
246    }
247
248    /// Embeds text and an image jointly.
249    pub fn embed_multimodal(&self, text: &str, image_path: &Path) -> Array1<f64> {
250        let pixel_path = self
251            .load_image_pixels(image_path)
252            .unwrap_or_else(|_| image_path.to_path_buf());
253        self.embed_multimodal_via_llama(text, &pixel_path)
254            .unwrap_or_else(|_| {
255                let mut combined = text.as_bytes().to_vec();
256                if let Ok(img_bytes) = fs::read(&pixel_path) {
257                    combined.extend_from_slice(&img_bytes);
258                }
259                hash_embedding(&combined)
260            })
261    }
262
263    /// Embeds a PDF document (text + rendered page images).
264    pub fn embed_pdf(&self, path: &Path) -> Array1<f64> {
265        let text = self.extract_pdf_text(path);
266        if text.trim().is_empty() {
267            return hash_embedding(&fs::read(path).unwrap_or_default());
268        }
269        self.embed(&text)
270    }
271
272    /// Embeds a PowerPoint file (.pptx or .ppt).
273    pub fn embed_presentation(&self, path: &Path) -> Array1<f64> {
274        let ext = path
275            .extension()
276            .and_then(|e| e.to_str())
277            .unwrap_or("")
278            .to_lowercase();
279        if ext == "pptx" {
280            self.embed_pptx(path)
281        } else {
282            // Legacy .ppt — try libreoffice conversion
283            self.embed_via_libreoffice(path)
284        }
285    }
286
287    /// Embeds a Word document (.docx or .doc).
288    pub fn embed_document(&self, path: &Path) -> Array1<f64> {
289        let ext = path
290            .extension()
291            .and_then(|e| e.to_str())
292            .unwrap_or("")
293            .to_lowercase();
294        if ext == "docx" {
295            self.embed_docx(path)
296        } else if ext == "doc" {
297            // Try antiword for legacy .doc
298            if let Ok(text) = self.run_command("antiword", &[path.to_string_lossy().as_ref()]) {
299                return self.embed(&text);
300            }
301            self.embed_via_libreoffice(path)
302        } else {
303            hash_embedding(&fs::read(path).unwrap_or_default())
304        }
305    }
306
307    /// Embeds a spreadsheet (.xlsx, .xls, .csv, .tsv, .ods).
308    pub fn embed_spreadsheet(&self, path: &Path) -> Array1<f64> {
309        let ext = path
310            .extension()
311            .and_then(|e| e.to_str())
312            .unwrap_or("")
313            .to_lowercase();
314        match ext.as_str() {
315            "csv" | "tsv" => {
316                let text = fs::read_to_string(path).unwrap_or_default();
317                self.embed(&text)
318            }
319            "xlsx" | "xls" | "ods" => {
320                if let Ok(text) = self.extract_spreadsheet_text(path, &ext) {
321                    return self.embed(&text);
322                }
323                self.embed_via_libreoffice(path)
324            }
325            _ => hash_embedding(&fs::read(path).unwrap_or_default()),
326        }
327    }
328
329    /// Embeds an OpenDocument file (.odt, .ods, .odp).
330    pub fn embed_opendocument(&self, path: &Path) -> Array1<f64> {
331        let ext = path
332            .extension()
333            .and_then(|e| e.to_str())
334            .unwrap_or("")
335            .to_lowercase();
336        match ext.as_str() {
337            "odt" => {
338                if let Ok(text) = self.extract_odt_text(path) {
339                    return self.embed(&text);
340                }
341            }
342            "ods" | "odp" => {
343                if let Ok(text) = self.extract_opendocument_xml(path) {
344                    return self.embed(&text);
345                }
346            }
347            _ => {}
348        }
349        self.embed_via_libreoffice(path)
350    }
351
352    /// Embeds an ePub file.
353    pub fn embed_epub(&self, path: &Path) -> Array1<f64> {
354        let text = self.extract_epub_text(path);
355        if text.trim().is_empty() {
356            hash_embedding(&fs::read(path).unwrap_or_default())
357        } else {
358            self.embed(&text)
359        }
360    }
361
362    /// Embeds an RTF file.
363    pub fn embed_rtf(&self, path: &Path) -> Array1<f64> {
364        let raw = fs::read_to_string(path).unwrap_or_default();
365        let text = strip_rtf(&raw);
366        self.embed(&text)
367    }
368
369    /// Embeds a LaTeX file.
370    pub fn embed_latex(&self, path: &Path) -> Array1<f64> {
371        let text = fs::read_to_string(path).unwrap_or_default();
372        self.embed(&text)
373    }
374
375    /// Embeds an audio file (extracts metadata, falls back to hash).
376    pub fn embed_audio(&self, path: &Path) -> Array1<f64> {
377        // Try to extract metadata via symphonia
378        if let Ok(text) = self.extract_audio_metadata(path)
379            && !text.trim().is_empty()
380        {
381            return self.embed(&text);
382        }
383        hash_embedding(&fs::read(path).unwrap_or_default())
384    }
385
386    /// Embeds a video file (extracts keyframes as images, embeds each, averages).
387    pub fn embed_video(&self, path: &Path) -> Array1<f64> {
388        // Try ffmpeg to extract keyframes
389        if let Ok(frames) = self.extract_video_keyframes(path)
390            && !frames.is_empty()
391        {
392            let mut sum = Array1::zeros(EMBED_DIM);
393            let mut count = 0u32;
394            for frame in &frames {
395                let e = self
396                    .embed_image_via_llama(frame)
397                    .unwrap_or_else(|_| hash_embedding(&fs::read(frame).unwrap_or_default()));
398                sum = sum + e;
399                count += 1;
400            }
401            if count > 0 {
402                return sum / count as f64;
403            }
404        }
405        hash_embedding(&fs::read(path).unwrap_or_default())
406    }
407
408    /// Embeds an archive by extracting and embedding contained text files.
409    pub fn embed_archive(&self, path: &Path) -> Array1<f64> {
410        let ext = path
411            .extension()
412            .and_then(|e| e.to_str())
413            .unwrap_or("")
414            .to_lowercase();
415        let texts = match ext.as_str() {
416            "zip" => self.extract_zip_texts(path),
417            "tar" | "gz" | "bz2" | "xz" => self.extract_tar_texts(path),
418            "7z" => self.extract_7z_texts(path),
419            "rar" => self.extract_rar_texts(path),
420            _ => Ok(vec![]),
421        };
422        match texts {
423            Ok(t) if !t.is_empty() => {
424                let mut sum = Array1::zeros(EMBED_DIM);
425                let mut count = 0u32;
426                for text in &t {
427                    let e = self.embed(text);
428                    sum = sum + e;
429                    count += 1;
430                }
431                if count > 0 {
432                    sum / count as f64
433                } else {
434                    hash_embedding(&[])
435                }
436            }
437            _ => hash_embedding(&fs::read(path).unwrap_or_default()),
438        }
439    }
440
441    /// Auto-detects the file format and embeds accordingly.
442    pub fn embed_file(&self, path: &Path) -> Array1<f64> {
443        let ext = path
444            .extension()
445            .and_then(|e| e.to_str())
446            .unwrap_or("")
447            .to_lowercase();
448        match ext.as_str() {
449            // Plain text & code
450            "txt" | "md" | "rst" | "log" | "rs" | "py" | "js" | "ts" | "go" | "c" | "cpp" | "h"
451            | "hpp" | "java" | "rb" | "sh" | "toml" | "yaml" | "yml" | "json" | "xml" | "html"
452            | "htm" | "css" | "scss" | "less" | "sql" | "php" | "swift" | "kt" | "scala" | "r"
453            | "m" | "mm" | "lua" | "pl" | "pm" | "vim" | "el" | "lisp" | "hs" | "ml" | "fs"
454            | "fsx" | "clj" | "ex" | "exs" | "erl" | "hrl" => {
455                let text = fs::read_to_string(path).unwrap_or_default();
456                self.embed(&text)
457            }
458            "rtf" => self.embed_rtf(path),
459            "tex" | "latex" | "bib" => self.embed_latex(path),
460            "pdf" => self.embed_pdf(path),
461            "docx" | "doc" => self.embed_document(path),
462            "xlsx" | "xls" | "csv" | "tsv" | "ods" => self.embed_spreadsheet(path),
463            "pptx" | "ppt" => self.embed_presentation(path),
464            "odt" | "odp" => self.embed_opendocument(path),
465            "epub" => self.embed_epub(path),
466            // Images
467            "png" | "jpg" | "jpeg" | "webp" | "bmp" | "gif" | "tiff" | "tif" | "svg" | "ico"
468            | "heic" | "heif" | "avif" => self.embed_image(path),
469            // Audio
470            "mp3" | "wav" | "flac" | "ogg" | "m4a" | "aac" | "wma" | "opus" => {
471                self.embed_audio(path)
472            }
473            // Video
474            "mp4" | "mkv" | "avi" | "webm" | "mov" | "wmv" | "flv" | "m4v" | "mpg" | "mpeg"
475            | "3gp" => self.embed_video(path),
476            // Archives
477            "zip" | "tar" | "gz" | "bz2" | "xz" | "7z" | "rar" => self.embed_archive(path),
478            // Unknown — try as text, fall back to hash
479            _ => {
480                if let Ok(text) = fs::read_to_string(path) {
481                    self.embed(&text)
482                } else {
483                    hash_embedding(&fs::read(path).unwrap_or_default())
484                }
485            }
486        }
487    }
488
489    // ── Internal helpers ─────────────────────────────────────────────────
490
491    /// Loads an image, converts to 336x336 RGB PNG, returns the temp path.
492    fn load_image_pixels(&self, path: &Path) -> Result<PathBuf, EmbedError> {
493        let ext = path
494            .extension()
495            .and_then(|e| e.to_str())
496            .unwrap_or("")
497            .to_lowercase();
498        let img = if ext == "svg" {
499            self.render_svg_to_image(path)?
500        } else {
501            image::open(path).map_err(|e| EmbedError::ImageError(e.to_string()))?
502        };
503        let resized = img.resize_exact(336, 336, image::imageops::FilterType::Triangle);
504        let mut tmp = tempfile::NamedTempFile::with_suffix(".png")?;
505        resized
506            .write_to(&mut tmp, image::ImageFormat::Png)
507            .map_err(|e| EmbedError::ImageError(e.to_string()))?;
508        let path = tmp.into_temp_path().to_path_buf();
509        Ok(path)
510    }
511
512    fn render_svg_to_image(&self, path: &Path) -> Result<image::DynamicImage, EmbedError> {
513        let data = fs::read(path)?;
514        let opt = resvg::usvg::Options::default();
515        let tree = resvg::usvg::Tree::from_data(&data, &opt)
516            .map_err(|e| EmbedError::ImageError(format!("SVG parse: {e}")))?;
517        let size = tree.size().to_int_size();
518        let mut pixmap = resvg::tiny_skia::Pixmap::new(size.width(), size.height())
519            .ok_or_else(|| EmbedError::ImageError("SVG pixmap alloc failed".into()))?;
520        resvg::render(
521            &tree,
522            resvg::tiny_skia::Transform::default(),
523            &mut pixmap.as_mut(),
524        );
525        let img =
526            image::RgbaImage::from_raw(pixmap.width(), pixmap.height(), pixmap.data().to_vec())
527                .ok_or_else(|| EmbedError::ImageError("SVG image conversion failed".into()))?;
528        Ok(image::DynamicImage::ImageRgba8(img))
529    }
530
531    fn extract_pdf_text(&self, path: &Path) -> String {
532        pdf_extract::extract_text(path).unwrap_or_default()
533    }
534
535    fn embed_pptx(&self, path: &Path) -> Array1<f64> {
536        let file = match fs::File::open(path) {
537            Ok(f) => f,
538            Err(_) => return hash_embedding(&[]),
539        };
540        let mut archive = match zip::ZipArchive::new(file) {
541            Ok(a) => a,
542            Err(_) => return hash_embedding(&[]),
543        };
544        let mut texts = Vec::new();
545        let mut images: Vec<PathBuf> = Vec::new();
546        for i in 0..archive.len() {
547            let mut f = match archive.by_index(i) {
548                Ok(f) => f,
549                Err(_) => continue,
550            };
551            let name = f.name().to_lowercase();
552            if name.starts_with("ppt/slides/slide") && name.ends_with(".xml") {
553                let mut content = String::new();
554                if f.read_to_string(&mut content).is_ok() {
555                    let text = extract_xml_text(&content);
556                    if !text.trim().is_empty() {
557                        texts.push(text);
558                    }
559                }
560            } else if name.starts_with("ppt/media/") {
561                let mut buf = Vec::new();
562                if f.read_to_end(&mut buf).is_ok()
563                    && let Ok(tmp) = tempfile::NamedTempFile::with_suffix(".img")
564                {
565                    let tmp_path = tmp.into_temp_path().to_path_buf();
566                    if fs::write(&tmp_path, &buf).is_ok() {
567                        images.push(tmp_path);
568                    }
569                }
570            }
571        }
572        let mut sum = Array1::zeros(EMBED_DIM);
573        let mut count = 0u32;
574        for text in &texts {
575            let e = self.embed(text);
576            sum = sum + e;
577            count += 1;
578        }
579        for img_path in &images {
580            let e = self
581                .embed_image_via_llama(img_path)
582                .unwrap_or_else(|_| hash_embedding(&[]));
583            sum = sum + e;
584            count += 1;
585        }
586        if count > 0 {
587            sum / count as f64
588        } else {
589            hash_embedding(&[])
590        }
591    }
592
593    fn embed_docx(&self, path: &Path) -> Array1<f64> {
594        let file = match fs::File::open(path) {
595            Ok(f) => f,
596            Err(_) => return hash_embedding(&[]),
597        };
598        let mut archive = match zip::ZipArchive::new(file) {
599            Ok(a) => a,
600            Err(_) => return hash_embedding(&[]),
601        };
602        let mut text = String::new();
603        for i in 0..archive.len() {
604            let mut f = match archive.by_index(i) {
605                Ok(f) => f,
606                Err(_) => continue,
607            };
608            if f.name().to_lowercase() == "word/document.xml" {
609                let mut content = String::new();
610                if f.read_to_string(&mut content).is_ok() {
611                    text = extract_xml_text(&content);
612                }
613                break;
614            }
615        }
616        if text.trim().is_empty() {
617            hash_embedding(&fs::read(path).unwrap_or_default())
618        } else {
619            self.embed(&text)
620        }
621    }
622
623    fn extract_spreadsheet_text(&self, path: &Path, ext: &str) -> Result<String, EmbedError> {
624        let file = fs::File::open(path)?;
625        let mut archive =
626            zip::ZipArchive::new(file).map_err(|e| EmbedError::ZipError(e.to_string()))?;
627        let mut text = String::new();
628        for i in 0..archive.len() {
629            let mut f = match archive.by_index(i) {
630                Ok(f) => f,
631                Err(_) => continue,
632            };
633            let name = f.name().to_lowercase();
634            let is_sheet = (ext == "xlsx"
635                && name.starts_with("xl/worksheets/sheet")
636                && name.ends_with(".xml"))
637                || (ext == "ods" && name == "content.xml");
638            if is_sheet {
639                let mut content = String::new();
640                if f.read_to_string(&mut content).is_ok() {
641                    text.push_str(&extract_xml_text(&content));
642                    text.push('\n');
643                }
644            }
645        }
646        Ok(text)
647    }
648
649    fn extract_odt_text(&self, path: &Path) -> Result<String, EmbedError> {
650        let file = fs::File::open(path)?;
651        let mut archive =
652            zip::ZipArchive::new(file).map_err(|e| EmbedError::ZipError(e.to_string()))?;
653        for i in 0..archive.len() {
654            let mut f = match archive.by_index(i) {
655                Ok(f) => f,
656                Err(_) => continue,
657            };
658            if f.name().to_lowercase() == "content.xml" {
659                let mut content = String::new();
660                f.read_to_string(&mut content)?;
661                return Ok(extract_xml_text(&content));
662            }
663        }
664        Err(EmbedError::XmlError("content.xml not found in ODT".into()))
665    }
666
667    fn extract_opendocument_xml(&self, path: &Path) -> Result<String, EmbedError> {
668        let file = fs::File::open(path)?;
669        let mut archive =
670            zip::ZipArchive::new(file).map_err(|e| EmbedError::ZipError(e.to_string()))?;
671        let mut text = String::new();
672        for i in 0..archive.len() {
673            let mut f = match archive.by_index(i) {
674                Ok(f) => f,
675                Err(_) => continue,
676            };
677            if f.name().to_lowercase() == "content.xml" {
678                let mut content = String::new();
679                if f.read_to_string(&mut content).is_ok() {
680                    text = extract_xml_text(&content);
681                }
682                break;
683            }
684        }
685        Ok(text)
686    }
687
688    fn extract_epub_text(&self, path: &Path) -> String {
689        let file = match fs::File::open(path) {
690            Ok(f) => f,
691            Err(_) => return String::new(),
692        };
693        let mut archive = match zip::ZipArchive::new(file) {
694            Ok(a) => a,
695            Err(_) => return String::new(),
696        };
697        let mut text = String::new();
698        for i in 0..archive.len() {
699            let mut f = match archive.by_index(i) {
700                Ok(f) => f,
701                Err(_) => continue,
702            };
703            let name = f.name().to_lowercase();
704            if (name.ends_with(".xhtml") || name.ends_with(".html") || name.ends_with(".htm"))
705                && !name.contains("nav")
706            {
707                let mut content = String::new();
708                if f.read_to_string(&mut content).is_ok() {
709                    text.push_str(&extract_xml_text(&content));
710                    text.push('\n');
711                }
712            }
713        }
714        text
715    }
716
717    fn extract_audio_metadata(&self, path: &Path) -> Result<String, EmbedError> {
718        use symphonia::core::formats::FormatOptions;
719        use symphonia::core::io::MediaSourceStream;
720        use symphonia::core::meta::MetadataOptions;
721        use symphonia::core::probe::Hint;
722        let file = std::fs::File::open(path)?;
723        let mss = MediaSourceStream::new(Box::new(file), Default::default());
724        let hint = Hint::new();
725        let format_opts = FormatOptions::default();
726        let metadata_opts = MetadataOptions::default();
727        let mut probed = symphonia::default::get_probe()
728            .format(&hint, mss, &format_opts, &metadata_opts)
729            .map_err(|e| EmbedError::Io(std::io::Error::other(e.to_string())))?;
730        let mut meta_text = String::new();
731        if let Some(metadata) = probed.format.metadata().current() {
732            for tag in metadata.tags() {
733                meta_text.push_str(&format!("{}: {}\n", tag.key, tag.value));
734            }
735        }
736        Ok(meta_text)
737    }
738
739    fn extract_video_keyframes(&self, path: &Path) -> Result<Vec<PathBuf>, EmbedError> {
740        let tmp_dir = tempfile::tempdir()?;
741        let pattern = tmp_dir.path().join("frame_%03d.png");
742        let status = Command::new("ffmpeg")
743            .args([
744                "-i",
745                path.to_string_lossy().as_ref(),
746                "-vf",
747                "select=eq(pict_type\\,I)",
748                "-vsync",
749                "vfr",
750                "-frames:v",
751                "5",
752                pattern.to_string_lossy().as_ref(),
753            ])
754            .stdout(std::process::Stdio::null())
755            .stderr(std::process::Stdio::null())
756            .status()?;
757        if !status.success() {
758            return Err(EmbedError::Io(std::io::Error::other("ffmpeg failed")));
759        }
760        let mut frames = Vec::new();
761        for entry in fs::read_dir(tmp_dir.path())? {
762            let entry = entry?;
763            if entry.path().extension().and_then(|e| e.to_str()) == Some("png") {
764                // Copy to a persistent temp file since tmp_dir will be dropped
765                let persistent = tempfile::NamedTempFile::with_suffix(".png")?;
766                let p = persistent.into_temp_path().to_path_buf();
767                fs::copy(entry.path(), &p)?;
768                frames.push(p);
769            }
770        }
771        Ok(frames)
772    }
773
774    fn extract_zip_texts(&self, path: &Path) -> Result<Vec<String>, EmbedError> {
775        let file = fs::File::open(path)?;
776        let mut archive =
777            zip::ZipArchive::new(file).map_err(|e| EmbedError::ZipError(e.to_string()))?;
778        let mut texts = Vec::new();
779        for i in 0..archive.len() {
780            let mut f = match archive.by_index(i) {
781                Ok(f) => f,
782                Err(_) => continue,
783            };
784            let name = f.name().to_lowercase();
785            if name.ends_with('/') {
786                continue;
787            }
788            let is_text = name.ends_with(".txt")
789                || name.ends_with(".md")
790                || name.ends_with(".json")
791                || name.ends_with(".xml")
792                || name.ends_with(".csv")
793                || name.ends_with(".yaml")
794                || name.ends_with(".yml")
795                || name.ends_with(".toml")
796                || name.ends_with(".rs")
797                || name.ends_with(".py")
798                || name.ends_with(".js")
799                || name.ends_with(".ts")
800                || name.ends_with(".html")
801                || name.ends_with(".css")
802                || name.ends_with(".sh")
803                || name.ends_with(".log")
804                || name.ends_with(".rst");
805            if is_text {
806                let mut content = String::new();
807                if f.read_to_string(&mut content).is_ok() {
808                    texts.push(content);
809                }
810            }
811        }
812        Ok(texts)
813    }
814
815    fn extract_tar_texts(&self, path: &Path) -> Result<Vec<String>, EmbedError> {
816        let file = fs::File::open(path)?;
817        let mut archive = tar::Archive::new(file);
818        let mut texts = Vec::new();
819        for entry in archive.entries()? {
820            let mut entry = entry?;
821            let name = entry.path()?.to_string_lossy().to_lowercase();
822            let is_text = name.ends_with(".txt")
823                || name.ends_with(".md")
824                || name.ends_with(".json")
825                || name.ends_with(".xml")
826                || name.ends_with(".csv")
827                || name.ends_with(".yaml")
828                || name.ends_with(".rs")
829                || name.ends_with(".py")
830                || name.ends_with(".js");
831            if is_text {
832                let mut content = String::new();
833                if entry.read_to_string(&mut content).is_ok() {
834                    texts.push(content);
835                }
836            }
837        }
838        Ok(texts)
839    }
840
841    fn extract_7z_texts(&self, path: &Path) -> Result<Vec<String>, EmbedError> {
842        let output = Command::new("7z")
843            .args(["l", "-ba", path.to_string_lossy().as_ref()])
844            .output()?;
845        let _listing = String::from_utf8_lossy(&output.stdout);
846        // 7z doesn't easily extract to stdout; use 7z to extract to temp dir
847        let tmp_dir = tempfile::tempdir()?;
848        let status = Command::new("7z")
849            .args([
850                "x",
851                path.to_string_lossy().as_ref(),
852                &format!("-o{}", tmp_dir.path().display()),
853                "-y",
854            ])
855            .stdout(std::process::Stdio::null())
856            .stderr(std::process::Stdio::null())
857            .status()?;
858        if !status.success() {
859            return Err(EmbedError::Io(std::io::Error::other(
860                "7z extraction failed",
861            )));
862        }
863        let mut texts = Vec::new();
864        self.collect_text_files(tmp_dir.path(), &mut texts)?;
865        Ok(texts)
866    }
867
868    fn extract_rar_texts(&self, path: &Path) -> Result<Vec<String>, EmbedError> {
869        let tmp_dir = tempfile::tempdir()?;
870        let status = Command::new("unrar")
871            .args([
872                "x",
873                "-y",
874                path.to_string_lossy().as_ref(),
875                &format!("{}/", tmp_dir.path().display()),
876            ])
877            .stdout(std::process::Stdio::null())
878            .stderr(std::process::Stdio::null())
879            .status()?;
880        if !status.success() {
881            // Try 7z as fallback
882            return self.extract_7z_texts(path);
883        }
884        let mut texts = Vec::new();
885        self.collect_text_files(tmp_dir.path(), &mut texts)?;
886        Ok(texts)
887    }
888
889    fn collect_text_files(&self, dir: &Path, texts: &mut Vec<String>) -> Result<(), EmbedError> {
890        if dir.is_dir() {
891            for entry in fs::read_dir(dir)? {
892                let entry = entry?;
893                let path = entry.path();
894                if path.is_dir() {
895                    self.collect_text_files(&path, texts)?;
896                } else if let Ok(content) = fs::read_to_string(&path) {
897                    texts.push(content);
898                }
899            }
900        }
901        Ok(())
902    }
903
904    fn embed_via_libreoffice(&self, path: &Path) -> Array1<f64> {
905        let tmp_dir = match tempfile::tempdir() {
906            Ok(d) => d,
907            Err(_) => return hash_embedding(&[]),
908        };
909        let status = Command::new("libreoffice")
910            .args([
911                "--headless",
912                "--convert-to",
913                "txt:Text",
914                path.to_string_lossy().as_ref(),
915                "--outdir",
916                tmp_dir.path().to_string_lossy().as_ref(),
917            ])
918            .stdout(std::process::Stdio::null())
919            .stderr(std::process::Stdio::null())
920            .status();
921        match status {
922            Ok(s) if s.success() => {
923                // Find the converted .txt file
924                if let Ok(entries) = fs::read_dir(tmp_dir.path()) {
925                    for entry in entries.flatten() {
926                        if entry.path().extension().and_then(|e| e.to_str()) == Some("txt")
927                            && let Ok(text) = fs::read_to_string(entry.path())
928                        {
929                            return self.embed(&text);
930                        }
931                    }
932                }
933                hash_embedding(&[])
934            }
935            _ => hash_embedding(&fs::read(path).unwrap_or_default()),
936        }
937    }
938
939    fn run_command(&self, cmd: &str, args: &[&str]) -> Result<String, EmbedError> {
940        let output = Command::new(cmd).args(args).output()?;
941        if output.status.success() {
942            Ok(String::from_utf8_lossy(&output.stdout).to_string())
943        } else {
944            Err(EmbedError::Io(std::io::Error::other(
945                String::from_utf8_lossy(&output.stderr).to_string(),
946            )))
947        }
948    }
949}
950
951// ── Free functions ──────────────────────────────────────────────────────
952
953/// Parses llama.cpp embedding output into an Array1<f64>.
954fn parse_embedding_output(stdout: &[u8]) -> Result<Array1<f64>, EmbedError> {
955    let text = String::from_utf8_lossy(stdout);
956    let values: Vec<f64> = text
957        .split_whitespace()
958        .filter_map(|s| s.parse::<f64>().ok())
959        .collect();
960    if values.is_empty() {
961        return Err(EmbedError::Io(std::io::Error::other(
962            "empty embedding output",
963        )));
964    }
965    Ok(Array1::from(values))
966}
967
968/// Deterministic hash-based embedding fallback.
969pub fn hash_embedding(data: &[u8]) -> Array1<f64> {
970    let mut result = Array1::zeros(EMBED_DIM);
971    if data.is_empty() {
972        return result;
973    }
974    // Chunk the data and hash each chunk to fill the embedding
975    let chunk_size = (data.len() / EMBED_DIM).max(1);
976    for (i, chunk) in data.chunks(chunk_size).enumerate().take(EMBED_DIM) {
977        let hash = Sha256::digest(chunk);
978        let mut val = 0.0f64;
979        for (j, &byte) in hash.iter().take(8).enumerate() {
980            val += (byte as f64) / (256.0f64.powi(j as i32 + 1));
981        }
982        result[i] = val;
983    }
984    // Normalize
985    let norm = result.dot(&result).sqrt();
986    if norm > 1e-12 {
987        result /= norm;
988    }
989    result
990}
991
/// Strips RTF formatting to extract plain text.
///
/// The input is scanned once, tracking group depth:
///
/// * Text is emitted only inside a group (`{ … }`); anything outside is dropped.
/// * A control word is a backslash, ASCII letters, an optional (possibly
///   negative) numeric parameter and one optional whitespace delimiter. It is
///   translated by [`handle_rtf_control`]; parameters never leak into the text.
/// * `\\`, `\{` and `\}` produce the literal character; `\~` a no-break space;
///   `\_` a non-breaking hyphen; a backslash before a line break a newline.
/// * `\'hh` is decoded as one byte. NOTE(review): the byte is interpreted as
///   Windows-1252; `\ansicpg` is not consulted.
/// * After `\uN` the replacement character(s) for non-Unicode readers are
///   skipped. The count comes from the most recent `\ucN` (default 1) and is
///   tracked globally, not per group.
/// * Groups that hold no document text are skipped entirely: `\fonttbl`,
///   `\colortbl`, `\stylesheet`, `\info`, `\pict` and every `{\* … }`
///   ignorable destination.
fn strip_rtf(rtf: &str) -> String {
    const SKIPPED_DESTINATIONS: [&str; 5] = ["fonttbl", "colortbl", "stylesheet", "info", "pict"];

    let mut result = String::new();
    let mut chars = rtf.chars().peekable();
    let mut depth: usize = 0;
    // Depth of the group currently being skipped, if any.
    let mut skip_from: Option<usize> = None;
    // Fallback characters that follow each `\uN` (set by `\ucN`).
    let mut fallback_len: usize = 1;
    // Fallback characters still to drop after the last `\uN`.
    let mut pending_fallback: usize = 0;

    while let Some(ch) = chars.next() {
        let visible = depth > 0 && skip_from.is_none();
        match ch {
            '{' => {
                depth += 1;
                pending_fallback = 0;
            }
            '}' => {
                depth = depth.saturating_sub(1);
                pending_fallback = 0;
                // Leaving the group that started the skip ends it.
                if skip_from.map_or(false, |opened_at| depth < opened_at) {
                    skip_from = None;
                }
            }
            '\\' => {
                let next = match chars.peek() {
                    Some(&c) => c,
                    None => break, // a lone trailing backslash escapes nothing
                };
                if next.is_ascii_alphabetic() {
                    let (word, name_len) = read_rtf_control_word(&mut chars);
                    let name = &word[..name_len];
                    let param = &word[name_len..];
                    let mut fallback_after = 0;
                    if SKIPPED_DESTINATIONS.contains(&name) {
                        if visible {
                            skip_from = Some(depth);
                        }
                    } else if name == "uc" {
                        if let Ok(count) = param.parse::<usize>() {
                            fallback_len = count;
                        }
                    } else if visible {
                        handle_rtf_control(&word, &mut result);
                        if name == "u" && !param.is_empty() {
                            fallback_after = fallback_len;
                        }
                    }
                    pending_fallback = fallback_after;
                } else {
                    // Control symbol: the backslash plus exactly one character.
                    chars.next();
                    match next {
                        '\\' | '{' | '}' => {
                            push_rtf_text(next, visible, &mut pending_fallback, &mut result)
                        }
                        '\'' => {
                            if let Some(decoded) = read_rtf_hex_escape(&mut chars) {
                                push_rtf_text(decoded, visible, &mut pending_fallback, &mut result);
                            }
                        }
                        '*' => {
                            // `{\*\word …}`: destination that readers may ignore.
                            if visible {
                                skip_from = Some(depth);
                            }
                            pending_fallback = 0;
                        }
                        '~' => push_rtf_text('\u{00A0}', visible, &mut pending_fallback, &mut result),
                        '_' => push_rtf_text('\u{2011}', visible, &mut pending_fallback, &mut result),
                        '\n' | '\r' => {
                            pending_fallback = 0;
                            if visible {
                                result.push('\n');
                            }
                        }
                        // Optional hyphen and other symbols carry no text.
                        _ => pending_fallback = 0,
                    }
                }
            }
            '\n' | '\r' => {
                // Raw line breaks are kept as whitespace but never count as a
                // `\uN` fallback character.
                if visible {
                    result.push(ch);
                }
            }
            _ => push_rtf_text(ch, visible, &mut pending_fallback, &mut result),
        }
    }
    result
}

/// Appends `ch` to `out` unless it is a pending `\uN` fallback character (which
/// is consumed instead) or the current position is not visible text.
fn push_rtf_text(ch: char, visible: bool, pending_fallback: &mut usize, out: &mut String) {
    if *pending_fallback > 0 {
        *pending_fallback -= 1;
    } else if visible {
        out.push(ch);
    }
}

/// Reads a control word (letters, optional signed numeric parameter, one
/// optional whitespace delimiter) positioned just after the backslash.
///
/// Returns the word including its parameter, and the byte length of the
/// letters-only name at its start.
fn read_rtf_control_word<I>(chars: &mut std::iter::Peekable<I>) -> (String, usize)
where
    I: Iterator<Item = char> + Clone,
{
    let mut word = String::new();
    while let Some(&c) = chars.peek() {
        if !c.is_ascii_alphabetic() {
            break;
        }
        word.push(c);
        chars.next();
    }
    let name_len = word.len();

    // A hyphen belongs to the parameter only when a digit follows it.
    if chars.peek() == Some(&'-') {
        let mut ahead = (*chars).clone();
        ahead.next();
        if ahead.peek().map_or(false, |c| c.is_ascii_digit()) {
            word.push('-');
            chars.next();
        }
    }
    while let Some(&c) = chars.peek() {
        if !c.is_ascii_digit() {
            break;
        }
        word.push(c);
        chars.next();
    }

    // One whitespace character directly after the word is its delimiter.
    if let Some(&c) = chars.peek() {
        if c == ' ' || c == '\n' || c == '\r' {
            chars.next();
        }
    }
    (word, name_len)
}

/// Reads the two hex digits of a `\'hh` escape and decodes the byte.
///
/// Returns `None` when fewer than two hex digits follow or the byte has no
/// Windows-1252 mapping.
fn read_rtf_hex_escape<I>(chars: &mut std::iter::Peekable<I>) -> Option<char>
where
    I: Iterator<Item = char>,
{
    let mut value: u32 = 0;
    for _ in 0..2 {
        let digit = chars.peek().and_then(|c| c.to_digit(16))?;
        chars.next();
        value = value * 16 + digit;
    }
    decode_cp1252(value as u8)
}

/// Maps a Windows-1252 byte to its Unicode character (`None` if unassigned).
fn decode_cp1252(byte: u8) -> Option<char> {
    // 0x80..=0x9F is the only range where Windows-1252 differs from Latin-1;
    // '\0' marks the five unassigned positions.
    const HIGH: [char; 32] = [
        '\u{20AC}', '\0', '\u{201A}', '\u{0192}', '\u{201E}', '\u{2026}', '\u{2020}', '\u{2021}',
        '\u{02C6}', '\u{2030}', '\u{0160}', '\u{2039}', '\u{0152}', '\0', '\u{017D}', '\0',
        '\0', '\u{2018}', '\u{2019}', '\u{201C}', '\u{201D}', '\u{2022}', '\u{2013}', '\u{2014}',
        '\u{02DC}', '\u{2122}', '\u{0161}', '\u{203A}', '\u{0153}', '\0', '\u{017E}', '\u{0178}',
    ];
    match byte {
        0x80..=0x9F => match HIGH[usize::from(byte - 0x80)] {
            '\0' => None,
            mapped => Some(mapped),
        },
        _ => Some(char::from(byte)),
    }
}

/// Appends the text equivalent of one RTF control word to `output`.
///
/// `word` is the control word without its backslash, including any numeric
/// parameter (for example `par`, `fs24`, `u8217`). Paragraph, line and row
/// breaks become `\n`, `tab` becomes `\t`, the typographic words become their
/// Unicode characters, and `uN` becomes the character with code `N`. Negative
/// `N` is how RTF writes 16-bit values above 32767, so 65536 is added first.
/// Every other word produces nothing.
fn handle_rtf_control(word: &str, output: &mut String) {
    match word {
        "par" | "line" | "row" => output.push('\n'),
        "tab" => output.push('\t'),
        "lquote" => output.push('\u{2018}'),
        "rquote" => output.push('\u{2019}'),
        "ldblquote" => output.push('\u{201C}'),
        "rdblquote" => output.push('\u{201D}'),
        "endash" => output.push('\u{2013}'),
        "emdash" => output.push('\u{2014}'),
        "bullet" => output.push('\u{2022}'),
        "" => {} // skip
        _ => {
            // Unicode escape \uN
            if let Some(digits) = word.strip_prefix('u') {
                if let Ok(n) = digits.parse::<i32>() {
                    let code = if n < 0 { n + 65_536 } else { n };
                    if code >= 0 {
                        // Lone surrogate halves are not valid `char`s and are dropped.
                        if let Some(c) = char::from_u32(code as u32) {
                            output.push(c);
                        }
                    }
                }
            }
        }
    }
}
1070
/// Extracts text content from XML by stripping tags.
///
/// Everything between `<` and the matching `>` is dropped (attribute values
/// included). In the remaining text the five predefined entities (`&amp;`,
/// `&lt;`, `&gt;`, `&quot;`, `&apos;`) and numeric character references
/// (`&#233;`, `&#xE9;`) are decoded; an unrecognised `&…;` sequence is kept
/// verbatim. Finally, runs of whitespace are collapsed to single spaces and
/// leading/trailing whitespace is removed.
///
/// This is a lexical scan, not an XML parser: comments, CDATA sections and
/// processing instructions are treated like ordinary tags.
fn extract_xml_text(xml: &str) -> String {
    let mut text = String::new();
    // Number of `<` seen that have not yet been closed by a `>`.
    let mut tag_depth: usize = 0;
    let mut rest = xml;

    while let Some(ch) = rest.chars().next() {
        rest = &rest[ch.len_utf8()..];
        match ch {
            '<' => tag_depth += 1,
            '>' => tag_depth = tag_depth.saturating_sub(1),
            '&' if tag_depth == 0 => match decode_xml_entity(rest) {
                Some((decoded, consumed)) => {
                    // Push the decoded character directly so that a decoded
                    // `<` or `>` is never mistaken for markup.
                    text.push(decoded);
                    rest = &rest[consumed..];
                }
                None => text.push('&'),
            },
            _ => {
                if tag_depth == 0 {
                    text.push(ch);
                }
            }
        }
    }

    // Collapse whitespace
    text.split_whitespace().collect::<Vec<_>>().join(" ")
}

/// Decodes the entity or character reference at the start of `after_amp`
/// (the input immediately following an `&`).
///
/// Returns the decoded character and the number of bytes consumed, including
/// the terminating `;`, or `None` if no known reference starts here.
fn decode_xml_entity(after_amp: &str) -> Option<(char, usize)> {
    // References are short; bounding the search keeps a stray `&` from
    // scanning the rest of the document.
    let end = after_amp.bytes().take(12).position(|b| b == b';')?;
    let body = &after_amp[..end];
    let decoded = match body {
        "amp" => '&',
        "lt" => '<',
        "gt" => '>',
        "quot" => '"',
        "apos" => '\'',
        _ => {
            let digits = body.strip_prefix('#')?;
            let code = match digits.strip_prefix(|c| c == 'x' || c == 'X') {
                Some(hex) => u32::from_str_radix(hex, 16).ok()?,
                None => digits.parse::<u32>().ok()?,
            };
            char::from_u32(code)?
        }
    };
    Some((decoded, end + 1))
}
1100
#[cfg(test)]
mod tests {
    use super::*;

    /// The embedding width must match the model's hidden size.
    #[test]
    fn test_embed_dim() {
        assert_eq!(EMBED_DIM, 1024, "LFM2.5-VL-450M hidden_size is 1024");
    }

    /// Constructing the embedder must not panic.
    #[test]
    fn test_embedder_creation() {
        let _embedder = Lfm2Embedder::new();
    }

    /// Empty input maps to the all-zero vector of the full dimension.
    #[test]
    fn test_hash_embedding_empty_is_zero() {
        let embedding = hash_embedding(&[]);
        assert_eq!(embedding.len(), EMBED_DIM);
        assert!(embedding.iter().all(|component| *component == 0.0));
    }

    /// The fallback embedding is reproducible and has unit length.
    #[test]
    fn test_hash_embedding_is_deterministic_and_normalized() {
        let first = hash_embedding(b"hello world");
        let second = hash_embedding(b"hello world");
        assert_eq!(first, second);
        assert_eq!(first.len(), EMBED_DIM);
        assert!((first.dot(&first).sqrt() - 1.0).abs() < 1e-9);
    }

    /// Numeric tokens are collected in order; output without any is an error.
    #[test]
    fn test_parse_embedding_output() {
        let parsed = parse_embedding_output(b"0.5 -1.25\n2")
            .ok()
            .expect("numeric output should parse");
        assert_eq!(parsed.to_vec(), vec![0.5, -1.25, 2.0]);
        assert!(parse_embedding_output(b"no numbers here").is_err());
    }

    /// Tags are removed and whitespace is collapsed.
    #[test]
    fn test_extract_xml_text_strips_tags() {
        assert_eq!(
            extract_xml_text("<a>hello</a>\n  <b>world</b>"),
            "hello world"
        );
    }

    /// Plain text inside a group survives RTF stripping.
    #[test]
    fn test_strip_rtf_keeps_group_text() {
        assert_eq!(strip_rtf("{hello}"), "hello");
    }
}