1use std::fs;
29use std::io::Read;
30
31use std::path::{Path, PathBuf};
32use std::process::Command;
33
34use ndarray::Array1;
35use sha2::{Digest, Sha256};
36
/// Length of the vectors built by `hash_embedding` and of the accumulators used when
/// averaging several embeddings; re-exported from `mimirs_core`.
pub const EMBED_DIM: usize = mimirs_core::EMBED_DIM;

/// URL the GGUF model file is downloaded from by `Lfm2Embedder::ensure_model`.
pub const MODEL_URL: &str = "https://huggingface.co/LiquidAI/LFM2.5-VL-450M-GGUF/resolve/main/LFM2.5-VL-450M-Q8_0.gguf?download=true";

/// File name under which the downloaded model is stored inside the cache directory.
pub const MODEL_FILENAME: &str = "LFM2.5-VL-450M-Q8_0.gguf";
47
/// Errors produced by the internal, fallible embedding and extraction helpers.
///
/// The public `embed_*` methods do not surface these: they swallow failures and
/// return a hash-based vector instead.
#[derive(Debug, thiserror::Error)]
pub enum EmbedError {
    /// Any I/O failure; also used to wrap failed external commands
    /// (curl, llama-embedding, ffmpeg, 7z, ...) and unparsable embedding output.
    #[error("IO error: {0}")]
    Io(#[from] std::io::Error),
    /// The model file is missing at the given path.
    // NOTE(review): not constructed anywhere in this file — confirm it is used elsewhere.
    #[error("Model not found at {0}")]
    ModelNotFound(PathBuf),
    /// No `llama-embedding` executable was found on `PATH` or in the known locations.
    #[error("llama.cpp binary not found: {0}")]
    LlamaNotFound(String),
    /// The input format is not supported.
    // NOTE(review): not constructed anywhere in this file.
    #[error("Unsupported format: {0}")]
    UnsupportedFormat(String),
    /// An image could not be decoded, rendered (SVG) or re-encoded.
    #[error("Image decode error: {0}")]
    ImageError(String),
    /// PDF text extraction failed.
    // NOTE(review): not constructed anywhere in this file.
    #[error("PDF extraction error: {0}")]
    PdfError(String),
    /// A ZIP container (zip, xlsx, odt, ...) could not be opened.
    #[error("ZIP extraction error: {0}")]
    ZipError(String),
    /// A non-ZIP archive could not be processed.
    // NOTE(review): not constructed anywhere in this file.
    #[error("Archive error: {0}")]
    ArchiveError(String),
    /// An expected XML part was missing from a document container.
    #[error("XML parse error: {0}")]
    XmlError(String),
    /// Marker for results that came from the fallback path.
    // NOTE(review): not constructed anywhere in this file.
    #[error("Fallback embedding (llama.cpp unavailable)")]
    Fallback,
}
72
/// Embedder that shells out to the `llama-embedding` binary with a locally cached
/// GGUF model, falling back to `hash_embedding` whenever that fails.
pub struct Lfm2Embedder {
    /// Full path of the model file (`cache_dir` joined with `MODEL_FILENAME`).
    model_path: PathBuf,
    /// Directory the model is downloaded into; created on demand by `ensure_model`.
    cache_dir: PathBuf,
}
82
/// `Default` is equivalent to [`Lfm2Embedder::new`].
impl Default for Lfm2Embedder {
    fn default() -> Self {
        Self::new()
    }
}
88
89impl Lfm2Embedder {
90 pub fn new() -> Self {
92 let cache_dir = dirs::cache_dir()
93 .unwrap_or_else(|| PathBuf::from("/tmp"))
94 .join("mimirswell")
95 .join("models");
96 let model_path = cache_dir.join(MODEL_FILENAME);
97 Self {
98 model_path,
99 cache_dir,
100 }
101 }
102
103 fn ensure_model(&self) -> Result<(), EmbedError> {
105 if self.model_path.exists() {
106 return Ok(());
107 }
108 fs::create_dir_all(&self.cache_dir)?;
109 eprintln!("[mimirs-embed] Downloading LFM2.5-VL-450M from HuggingFace...");
110 let status = Command::new("curl")
111 .args([
112 "-L",
113 "-o",
114 self.model_path.to_string_lossy().as_ref(),
115 MODEL_URL,
116 ])
117 .status()?;
118 if !status.success() {
119 return Err(EmbedError::Io(std::io::Error::other(
120 "curl download failed",
121 )));
122 }
123 Ok(())
124 }
125
126 fn find_llama_embedding(&self) -> Option<PathBuf> {
128 which::which("llama-embedding").ok().or_else(|| {
129 let candidates = [
131 "/usr/local/bin/llama-embedding",
132 "/opt/llama.cpp/bin/llama-embedding",
133 "/usr/bin/llama-embedding",
134 ];
135 candidates.iter().map(PathBuf::from).find(|p| p.exists())
136 })
137 }
138
139 fn embed_via_llama(&self, text: &str) -> Result<Array1<f64>, EmbedError> {
141 self.ensure_model()?;
142 let bin = self
143 .find_llama_embedding()
144 .ok_or_else(|| EmbedError::LlamaNotFound("llama-embedding not in PATH".into()))?;
145 let output = Command::new(&bin)
146 .args([
147 "-m",
148 self.model_path.to_string_lossy().as_ref(),
149 "-c",
150 "512",
151 "--embd-output-format",
152 "array",
153 "--embd-separator",
154 " ",
155 "-p",
156 text,
157 ])
158 .output()?;
159 if !output.status.success() {
160 return Err(EmbedError::Io(std::io::Error::other(
161 String::from_utf8_lossy(&output.stderr).to_string(),
162 )));
163 }
164 parse_embedding_output(&output.stdout)
165 }
166
167 fn embed_image_via_llama(&self, image_path: &Path) -> Result<Array1<f64>, EmbedError> {
169 self.ensure_model()?;
170 let bin = self
171 .find_llama_embedding()
172 .ok_or_else(|| EmbedError::LlamaNotFound("llama-embedding not in PATH".into()))?;
173 let output = Command::new(&bin)
174 .args([
175 "-m",
176 self.model_path.to_string_lossy().as_ref(),
177 "-c",
178 "512",
179 "--embd-output-format",
180 "array",
181 "--image",
182 image_path.to_string_lossy().as_ref(),
183 ])
184 .output()?;
185 if !output.status.success() {
186 return Err(EmbedError::Io(std::io::Error::other(
187 String::from_utf8_lossy(&output.stderr).to_string(),
188 )));
189 }
190 parse_embedding_output(&output.stdout)
191 }
192
193 fn embed_multimodal_via_llama(
195 &self,
196 text: &str,
197 image_path: &Path,
198 ) -> Result<Array1<f64>, EmbedError> {
199 self.ensure_model()?;
200 let bin = self
201 .find_llama_embedding()
202 .ok_or_else(|| EmbedError::LlamaNotFound("llama-embedding not in PATH".into()))?;
203 let output = Command::new(&bin)
204 .args([
205 "-m",
206 self.model_path.to_string_lossy().as_ref(),
207 "-c",
208 "512",
209 "--embd-output-format",
210 "array",
211 "--image",
212 image_path.to_string_lossy().as_ref(),
213 "-p",
214 text,
215 ])
216 .output()?;
217 if !output.status.success() {
218 return Err(EmbedError::Io(std::io::Error::other(
219 String::from_utf8_lossy(&output.stderr).to_string(),
220 )));
221 }
222 parse_embedding_output(&output.stdout)
223 }
224
225 pub fn embed(&self, text: &str) -> Array1<f64> {
229 self.embed_via_llama(text)
230 .unwrap_or_else(|_| hash_embedding(text.as_bytes()))
231 }
232
233 pub fn embed_image(&self, path: &Path) -> Array1<f64> {
235 let pixel_path = match self.load_image_pixels(path) {
237 Ok(p) => p,
238 Err(_) => {
239 return self
240 .embed_image_via_llama(path)
241 .unwrap_or_else(|_| hash_embedding(&fs::read(path).unwrap_or_default()));
242 }
243 };
244 self.embed_image_via_llama(&pixel_path)
245 .unwrap_or_else(|_| hash_embedding(&fs::read(&pixel_path).unwrap_or_default()))
246 }
247
248 pub fn embed_multimodal(&self, text: &str, image_path: &Path) -> Array1<f64> {
250 let pixel_path = self
251 .load_image_pixels(image_path)
252 .unwrap_or_else(|_| image_path.to_path_buf());
253 self.embed_multimodal_via_llama(text, &pixel_path)
254 .unwrap_or_else(|_| {
255 let mut combined = text.as_bytes().to_vec();
256 if let Ok(img_bytes) = fs::read(&pixel_path) {
257 combined.extend_from_slice(&img_bytes);
258 }
259 hash_embedding(&combined)
260 })
261 }
262
263 pub fn embed_pdf(&self, path: &Path) -> Array1<f64> {
265 let text = self.extract_pdf_text(path);
266 if text.trim().is_empty() {
267 return hash_embedding(&fs::read(path).unwrap_or_default());
268 }
269 self.embed(&text)
270 }
271
272 pub fn embed_presentation(&self, path: &Path) -> Array1<f64> {
274 let ext = path
275 .extension()
276 .and_then(|e| e.to_str())
277 .unwrap_or("")
278 .to_lowercase();
279 if ext == "pptx" {
280 self.embed_pptx(path)
281 } else {
282 self.embed_via_libreoffice(path)
284 }
285 }
286
287 pub fn embed_document(&self, path: &Path) -> Array1<f64> {
289 let ext = path
290 .extension()
291 .and_then(|e| e.to_str())
292 .unwrap_or("")
293 .to_lowercase();
294 if ext == "docx" {
295 self.embed_docx(path)
296 } else if ext == "doc" {
297 if let Ok(text) = self.run_command("antiword", &[path.to_string_lossy().as_ref()]) {
299 return self.embed(&text);
300 }
301 self.embed_via_libreoffice(path)
302 } else {
303 hash_embedding(&fs::read(path).unwrap_or_default())
304 }
305 }
306
307 pub fn embed_spreadsheet(&self, path: &Path) -> Array1<f64> {
309 let ext = path
310 .extension()
311 .and_then(|e| e.to_str())
312 .unwrap_or("")
313 .to_lowercase();
314 match ext.as_str() {
315 "csv" | "tsv" => {
316 let text = fs::read_to_string(path).unwrap_or_default();
317 self.embed(&text)
318 }
319 "xlsx" | "xls" | "ods" => {
320 if let Ok(text) = self.extract_spreadsheet_text(path, &ext) {
321 return self.embed(&text);
322 }
323 self.embed_via_libreoffice(path)
324 }
325 _ => hash_embedding(&fs::read(path).unwrap_or_default()),
326 }
327 }
328
329 pub fn embed_opendocument(&self, path: &Path) -> Array1<f64> {
331 let ext = path
332 .extension()
333 .and_then(|e| e.to_str())
334 .unwrap_or("")
335 .to_lowercase();
336 match ext.as_str() {
337 "odt" => {
338 if let Ok(text) = self.extract_odt_text(path) {
339 return self.embed(&text);
340 }
341 }
342 "ods" | "odp" => {
343 if let Ok(text) = self.extract_opendocument_xml(path) {
344 return self.embed(&text);
345 }
346 }
347 _ => {}
348 }
349 self.embed_via_libreoffice(path)
350 }
351
352 pub fn embed_epub(&self, path: &Path) -> Array1<f64> {
354 let text = self.extract_epub_text(path);
355 if text.trim().is_empty() {
356 hash_embedding(&fs::read(path).unwrap_or_default())
357 } else {
358 self.embed(&text)
359 }
360 }
361
362 pub fn embed_rtf(&self, path: &Path) -> Array1<f64> {
364 let raw = fs::read_to_string(path).unwrap_or_default();
365 let text = strip_rtf(&raw);
366 self.embed(&text)
367 }
368
369 pub fn embed_latex(&self, path: &Path) -> Array1<f64> {
371 let text = fs::read_to_string(path).unwrap_or_default();
372 self.embed(&text)
373 }
374
375 pub fn embed_audio(&self, path: &Path) -> Array1<f64> {
377 if let Ok(text) = self.extract_audio_metadata(path)
379 && !text.trim().is_empty()
380 {
381 return self.embed(&text);
382 }
383 hash_embedding(&fs::read(path).unwrap_or_default())
384 }
385
386 pub fn embed_video(&self, path: &Path) -> Array1<f64> {
388 if let Ok(frames) = self.extract_video_keyframes(path)
390 && !frames.is_empty()
391 {
392 let mut sum = Array1::zeros(EMBED_DIM);
393 let mut count = 0u32;
394 for frame in &frames {
395 let e = self
396 .embed_image_via_llama(frame)
397 .unwrap_or_else(|_| hash_embedding(&fs::read(frame).unwrap_or_default()));
398 sum = sum + e;
399 count += 1;
400 }
401 if count > 0 {
402 return sum / count as f64;
403 }
404 }
405 hash_embedding(&fs::read(path).unwrap_or_default())
406 }
407
408 pub fn embed_archive(&self, path: &Path) -> Array1<f64> {
410 let ext = path
411 .extension()
412 .and_then(|e| e.to_str())
413 .unwrap_or("")
414 .to_lowercase();
415 let texts = match ext.as_str() {
416 "zip" => self.extract_zip_texts(path),
417 "tar" | "gz" | "bz2" | "xz" => self.extract_tar_texts(path),
418 "7z" => self.extract_7z_texts(path),
419 "rar" => self.extract_rar_texts(path),
420 _ => Ok(vec![]),
421 };
422 match texts {
423 Ok(t) if !t.is_empty() => {
424 let mut sum = Array1::zeros(EMBED_DIM);
425 let mut count = 0u32;
426 for text in &t {
427 let e = self.embed(text);
428 sum = sum + e;
429 count += 1;
430 }
431 if count > 0 {
432 sum / count as f64
433 } else {
434 hash_embedding(&[])
435 }
436 }
437 _ => hash_embedding(&fs::read(path).unwrap_or_default()),
438 }
439 }
440
441 pub fn embed_file(&self, path: &Path) -> Array1<f64> {
443 let ext = path
444 .extension()
445 .and_then(|e| e.to_str())
446 .unwrap_or("")
447 .to_lowercase();
448 match ext.as_str() {
449 "txt" | "md" | "rst" | "log" | "rs" | "py" | "js" | "ts" | "go" | "c" | "cpp" | "h"
451 | "hpp" | "java" | "rb" | "sh" | "toml" | "yaml" | "yml" | "json" | "xml" | "html"
452 | "htm" | "css" | "scss" | "less" | "sql" | "php" | "swift" | "kt" | "scala" | "r"
453 | "m" | "mm" | "lua" | "pl" | "pm" | "vim" | "el" | "lisp" | "hs" | "ml" | "fs"
454 | "fsx" | "clj" | "ex" | "exs" | "erl" | "hrl" => {
455 let text = fs::read_to_string(path).unwrap_or_default();
456 self.embed(&text)
457 }
458 "rtf" => self.embed_rtf(path),
459 "tex" | "latex" | "bib" => self.embed_latex(path),
460 "pdf" => self.embed_pdf(path),
461 "docx" | "doc" => self.embed_document(path),
462 "xlsx" | "xls" | "csv" | "tsv" | "ods" => self.embed_spreadsheet(path),
463 "pptx" | "ppt" => self.embed_presentation(path),
464 "odt" | "odp" => self.embed_opendocument(path),
465 "epub" => self.embed_epub(path),
466 "png" | "jpg" | "jpeg" | "webp" | "bmp" | "gif" | "tiff" | "tif" | "svg" | "ico"
468 | "heic" | "heif" | "avif" => self.embed_image(path),
469 "mp3" | "wav" | "flac" | "ogg" | "m4a" | "aac" | "wma" | "opus" => {
471 self.embed_audio(path)
472 }
473 "mp4" | "mkv" | "avi" | "webm" | "mov" | "wmv" | "flv" | "m4v" | "mpg" | "mpeg"
475 | "3gp" => self.embed_video(path),
476 "zip" | "tar" | "gz" | "bz2" | "xz" | "7z" | "rar" => self.embed_archive(path),
478 _ => {
480 if let Ok(text) = fs::read_to_string(path) {
481 self.embed(&text)
482 } else {
483 hash_embedding(&fs::read(path).unwrap_or_default())
484 }
485 }
486 }
487 }
488
489 fn load_image_pixels(&self, path: &Path) -> Result<PathBuf, EmbedError> {
493 let ext = path
494 .extension()
495 .and_then(|e| e.to_str())
496 .unwrap_or("")
497 .to_lowercase();
498 let img = if ext == "svg" {
499 self.render_svg_to_image(path)?
500 } else {
501 image::open(path).map_err(|e| EmbedError::ImageError(e.to_string()))?
502 };
503 let resized = img.resize_exact(336, 336, image::imageops::FilterType::Triangle);
504 let mut tmp = tempfile::NamedTempFile::with_suffix(".png")?;
505 resized
506 .write_to(&mut tmp, image::ImageFormat::Png)
507 .map_err(|e| EmbedError::ImageError(e.to_string()))?;
508 let path = tmp.into_temp_path().to_path_buf();
509 Ok(path)
510 }
511
    /// Rasterises the SVG file at `path` into an RGBA image.
    ///
    /// The file is parsed with default `usvg` options and rendered, with the default
    /// transform, into a pixmap whose dimensions are the tree's integer size.
    ///
    /// # Errors
    /// * `EmbedError::Io` when the file cannot be read,
    /// * `EmbedError::ImageError` when parsing fails, the pixmap cannot be allocated
    ///   (`Pixmap::new` returned `None`), or the pixel buffer does not match the
    ///   pixmap dimensions.
    fn render_svg_to_image(&self, path: &Path) -> Result<image::DynamicImage, EmbedError> {
        let data = fs::read(path)?;
        let opt = resvg::usvg::Options::default();
        let tree = resvg::usvg::Tree::from_data(&data, &opt)
            .map_err(|e| EmbedError::ImageError(format!("SVG parse: {e}")))?;
        let size = tree.size().to_int_size();
        let mut pixmap = resvg::tiny_skia::Pixmap::new(size.width(), size.height())
            .ok_or_else(|| EmbedError::ImageError("SVG pixmap alloc failed".into()))?;
        resvg::render(
            &tree,
            resvg::tiny_skia::Transform::default(),
            &mut pixmap.as_mut(),
        );
        // NOTE(review): the pixmap bytes are copied as-is into an `RgbaImage`; if
        // tiny-skia stores premultiplied alpha, semi-transparent pixels will be
        // darker than intended — confirm.
        let img =
            image::RgbaImage::from_raw(pixmap.width(), pixmap.height(), pixmap.data().to_vec())
                .ok_or_else(|| EmbedError::ImageError("SVG image conversion failed".into()))?;
        Ok(image::DynamicImage::ImageRgba8(img))
    }
530
531 fn extract_pdf_text(&self, path: &Path) -> String {
532 pdf_extract::extract_text(path).unwrap_or_default()
533 }
534
535 fn embed_pptx(&self, path: &Path) -> Array1<f64> {
536 let file = match fs::File::open(path) {
537 Ok(f) => f,
538 Err(_) => return hash_embedding(&[]),
539 };
540 let mut archive = match zip::ZipArchive::new(file) {
541 Ok(a) => a,
542 Err(_) => return hash_embedding(&[]),
543 };
544 let mut texts = Vec::new();
545 let mut images: Vec<PathBuf> = Vec::new();
546 for i in 0..archive.len() {
547 let mut f = match archive.by_index(i) {
548 Ok(f) => f,
549 Err(_) => continue,
550 };
551 let name = f.name().to_lowercase();
552 if name.starts_with("ppt/slides/slide") && name.ends_with(".xml") {
553 let mut content = String::new();
554 if f.read_to_string(&mut content).is_ok() {
555 let text = extract_xml_text(&content);
556 if !text.trim().is_empty() {
557 texts.push(text);
558 }
559 }
560 } else if name.starts_with("ppt/media/") {
561 let mut buf = Vec::new();
562 if f.read_to_end(&mut buf).is_ok()
563 && let Ok(tmp) = tempfile::NamedTempFile::with_suffix(".img")
564 {
565 let tmp_path = tmp.into_temp_path().to_path_buf();
566 if fs::write(&tmp_path, &buf).is_ok() {
567 images.push(tmp_path);
568 }
569 }
570 }
571 }
572 let mut sum = Array1::zeros(EMBED_DIM);
573 let mut count = 0u32;
574 for text in &texts {
575 let e = self.embed(text);
576 sum = sum + e;
577 count += 1;
578 }
579 for img_path in &images {
580 let e = self
581 .embed_image_via_llama(img_path)
582 .unwrap_or_else(|_| hash_embedding(&[]));
583 sum = sum + e;
584 count += 1;
585 }
586 if count > 0 {
587 sum / count as f64
588 } else {
589 hash_embedding(&[])
590 }
591 }
592
593 fn embed_docx(&self, path: &Path) -> Array1<f64> {
594 let file = match fs::File::open(path) {
595 Ok(f) => f,
596 Err(_) => return hash_embedding(&[]),
597 };
598 let mut archive = match zip::ZipArchive::new(file) {
599 Ok(a) => a,
600 Err(_) => return hash_embedding(&[]),
601 };
602 let mut text = String::new();
603 for i in 0..archive.len() {
604 let mut f = match archive.by_index(i) {
605 Ok(f) => f,
606 Err(_) => continue,
607 };
608 if f.name().to_lowercase() == "word/document.xml" {
609 let mut content = String::new();
610 if f.read_to_string(&mut content).is_ok() {
611 text = extract_xml_text(&content);
612 }
613 break;
614 }
615 }
616 if text.trim().is_empty() {
617 hash_embedding(&fs::read(path).unwrap_or_default())
618 } else {
619 self.embed(&text)
620 }
621 }
622
623 fn extract_spreadsheet_text(&self, path: &Path, ext: &str) -> Result<String, EmbedError> {
624 let file = fs::File::open(path)?;
625 let mut archive =
626 zip::ZipArchive::new(file).map_err(|e| EmbedError::ZipError(e.to_string()))?;
627 let mut text = String::new();
628 for i in 0..archive.len() {
629 let mut f = match archive.by_index(i) {
630 Ok(f) => f,
631 Err(_) => continue,
632 };
633 let name = f.name().to_lowercase();
634 let is_sheet = (ext == "xlsx"
635 && name.starts_with("xl/worksheets/sheet")
636 && name.ends_with(".xml"))
637 || (ext == "ods" && name == "content.xml");
638 if is_sheet {
639 let mut content = String::new();
640 if f.read_to_string(&mut content).is_ok() {
641 text.push_str(&extract_xml_text(&content));
642 text.push('\n');
643 }
644 }
645 }
646 Ok(text)
647 }
648
649 fn extract_odt_text(&self, path: &Path) -> Result<String, EmbedError> {
650 let file = fs::File::open(path)?;
651 let mut archive =
652 zip::ZipArchive::new(file).map_err(|e| EmbedError::ZipError(e.to_string()))?;
653 for i in 0..archive.len() {
654 let mut f = match archive.by_index(i) {
655 Ok(f) => f,
656 Err(_) => continue,
657 };
658 if f.name().to_lowercase() == "content.xml" {
659 let mut content = String::new();
660 f.read_to_string(&mut content)?;
661 return Ok(extract_xml_text(&content));
662 }
663 }
664 Err(EmbedError::XmlError("content.xml not found in ODT".into()))
665 }
666
667 fn extract_opendocument_xml(&self, path: &Path) -> Result<String, EmbedError> {
668 let file = fs::File::open(path)?;
669 let mut archive =
670 zip::ZipArchive::new(file).map_err(|e| EmbedError::ZipError(e.to_string()))?;
671 let mut text = String::new();
672 for i in 0..archive.len() {
673 let mut f = match archive.by_index(i) {
674 Ok(f) => f,
675 Err(_) => continue,
676 };
677 if f.name().to_lowercase() == "content.xml" {
678 let mut content = String::new();
679 if f.read_to_string(&mut content).is_ok() {
680 text = extract_xml_text(&content);
681 }
682 break;
683 }
684 }
685 Ok(text)
686 }
687
688 fn extract_epub_text(&self, path: &Path) -> String {
689 let file = match fs::File::open(path) {
690 Ok(f) => f,
691 Err(_) => return String::new(),
692 };
693 let mut archive = match zip::ZipArchive::new(file) {
694 Ok(a) => a,
695 Err(_) => return String::new(),
696 };
697 let mut text = String::new();
698 for i in 0..archive.len() {
699 let mut f = match archive.by_index(i) {
700 Ok(f) => f,
701 Err(_) => continue,
702 };
703 let name = f.name().to_lowercase();
704 if (name.ends_with(".xhtml") || name.ends_with(".html") || name.ends_with(".htm"))
705 && !name.contains("nav")
706 {
707 let mut content = String::new();
708 if f.read_to_string(&mut content).is_ok() {
709 text.push_str(&extract_xml_text(&content));
710 text.push('\n');
711 }
712 }
713 }
714 text
715 }
716
717 fn extract_audio_metadata(&self, path: &Path) -> Result<String, EmbedError> {
718 use symphonia::core::formats::FormatOptions;
719 use symphonia::core::io::MediaSourceStream;
720 use symphonia::core::meta::MetadataOptions;
721 use symphonia::core::probe::Hint;
722 let file = std::fs::File::open(path)?;
723 let mss = MediaSourceStream::new(Box::new(file), Default::default());
724 let hint = Hint::new();
725 let format_opts = FormatOptions::default();
726 let metadata_opts = MetadataOptions::default();
727 let mut probed = symphonia::default::get_probe()
728 .format(&hint, mss, &format_opts, &metadata_opts)
729 .map_err(|e| EmbedError::Io(std::io::Error::other(e.to_string())))?;
730 let mut meta_text = String::new();
731 if let Some(metadata) = probed.format.metadata().current() {
732 for tag in metadata.tags() {
733 meta_text.push_str(&format!("{}: {}\n", tag.key, tag.value));
734 }
735 }
736 Ok(meta_text)
737 }
738
739 fn extract_video_keyframes(&self, path: &Path) -> Result<Vec<PathBuf>, EmbedError> {
740 let tmp_dir = tempfile::tempdir()?;
741 let pattern = tmp_dir.path().join("frame_%03d.png");
742 let status = Command::new("ffmpeg")
743 .args([
744 "-i",
745 path.to_string_lossy().as_ref(),
746 "-vf",
747 "select=eq(pict_type\\,I)",
748 "-vsync",
749 "vfr",
750 "-frames:v",
751 "5",
752 pattern.to_string_lossy().as_ref(),
753 ])
754 .stdout(std::process::Stdio::null())
755 .stderr(std::process::Stdio::null())
756 .status()?;
757 if !status.success() {
758 return Err(EmbedError::Io(std::io::Error::other("ffmpeg failed")));
759 }
760 let mut frames = Vec::new();
761 for entry in fs::read_dir(tmp_dir.path())? {
762 let entry = entry?;
763 if entry.path().extension().and_then(|e| e.to_str()) == Some("png") {
764 let persistent = tempfile::NamedTempFile::with_suffix(".png")?;
766 let p = persistent.into_temp_path().to_path_buf();
767 fs::copy(entry.path(), &p)?;
768 frames.push(p);
769 }
770 }
771 Ok(frames)
772 }
773
774 fn extract_zip_texts(&self, path: &Path) -> Result<Vec<String>, EmbedError> {
775 let file = fs::File::open(path)?;
776 let mut archive =
777 zip::ZipArchive::new(file).map_err(|e| EmbedError::ZipError(e.to_string()))?;
778 let mut texts = Vec::new();
779 for i in 0..archive.len() {
780 let mut f = match archive.by_index(i) {
781 Ok(f) => f,
782 Err(_) => continue,
783 };
784 let name = f.name().to_lowercase();
785 if name.ends_with('/') {
786 continue;
787 }
788 let is_text = name.ends_with(".txt")
789 || name.ends_with(".md")
790 || name.ends_with(".json")
791 || name.ends_with(".xml")
792 || name.ends_with(".csv")
793 || name.ends_with(".yaml")
794 || name.ends_with(".yml")
795 || name.ends_with(".toml")
796 || name.ends_with(".rs")
797 || name.ends_with(".py")
798 || name.ends_with(".js")
799 || name.ends_with(".ts")
800 || name.ends_with(".html")
801 || name.ends_with(".css")
802 || name.ends_with(".sh")
803 || name.ends_with(".log")
804 || name.ends_with(".rst");
805 if is_text {
806 let mut content = String::new();
807 if f.read_to_string(&mut content).is_ok() {
808 texts.push(content);
809 }
810 }
811 }
812 Ok(texts)
813 }
814
815 fn extract_tar_texts(&self, path: &Path) -> Result<Vec<String>, EmbedError> {
816 let file = fs::File::open(path)?;
817 let mut archive = tar::Archive::new(file);
818 let mut texts = Vec::new();
819 for entry in archive.entries()? {
820 let mut entry = entry?;
821 let name = entry.path()?.to_string_lossy().to_lowercase();
822 let is_text = name.ends_with(".txt")
823 || name.ends_with(".md")
824 || name.ends_with(".json")
825 || name.ends_with(".xml")
826 || name.ends_with(".csv")
827 || name.ends_with(".yaml")
828 || name.ends_with(".rs")
829 || name.ends_with(".py")
830 || name.ends_with(".js");
831 if is_text {
832 let mut content = String::new();
833 if entry.read_to_string(&mut content).is_ok() {
834 texts.push(content);
835 }
836 }
837 }
838 Ok(texts)
839 }
840
841 fn extract_7z_texts(&self, path: &Path) -> Result<Vec<String>, EmbedError> {
842 let output = Command::new("7z")
843 .args(["l", "-ba", path.to_string_lossy().as_ref()])
844 .output()?;
845 let _listing = String::from_utf8_lossy(&output.stdout);
846 let tmp_dir = tempfile::tempdir()?;
848 let status = Command::new("7z")
849 .args([
850 "x",
851 path.to_string_lossy().as_ref(),
852 &format!("-o{}", tmp_dir.path().display()),
853 "-y",
854 ])
855 .stdout(std::process::Stdio::null())
856 .stderr(std::process::Stdio::null())
857 .status()?;
858 if !status.success() {
859 return Err(EmbedError::Io(std::io::Error::other(
860 "7z extraction failed",
861 )));
862 }
863 let mut texts = Vec::new();
864 self.collect_text_files(tmp_dir.path(), &mut texts)?;
865 Ok(texts)
866 }
867
868 fn extract_rar_texts(&self, path: &Path) -> Result<Vec<String>, EmbedError> {
869 let tmp_dir = tempfile::tempdir()?;
870 let status = Command::new("unrar")
871 .args([
872 "x",
873 "-y",
874 path.to_string_lossy().as_ref(),
875 &format!("{}/", tmp_dir.path().display()),
876 ])
877 .stdout(std::process::Stdio::null())
878 .stderr(std::process::Stdio::null())
879 .status()?;
880 if !status.success() {
881 return self.extract_7z_texts(path);
883 }
884 let mut texts = Vec::new();
885 self.collect_text_files(tmp_dir.path(), &mut texts)?;
886 Ok(texts)
887 }
888
889 fn collect_text_files(&self, dir: &Path, texts: &mut Vec<String>) -> Result<(), EmbedError> {
890 if dir.is_dir() {
891 for entry in fs::read_dir(dir)? {
892 let entry = entry?;
893 let path = entry.path();
894 if path.is_dir() {
895 self.collect_text_files(&path, texts)?;
896 } else if let Ok(content) = fs::read_to_string(&path) {
897 texts.push(content);
898 }
899 }
900 }
901 Ok(())
902 }
903
904 fn embed_via_libreoffice(&self, path: &Path) -> Array1<f64> {
905 let tmp_dir = match tempfile::tempdir() {
906 Ok(d) => d,
907 Err(_) => return hash_embedding(&[]),
908 };
909 let status = Command::new("libreoffice")
910 .args([
911 "--headless",
912 "--convert-to",
913 "txt:Text",
914 path.to_string_lossy().as_ref(),
915 "--outdir",
916 tmp_dir.path().to_string_lossy().as_ref(),
917 ])
918 .stdout(std::process::Stdio::null())
919 .stderr(std::process::Stdio::null())
920 .status();
921 match status {
922 Ok(s) if s.success() => {
923 if let Ok(entries) = fs::read_dir(tmp_dir.path()) {
925 for entry in entries.flatten() {
926 if entry.path().extension().and_then(|e| e.to_str()) == Some("txt")
927 && let Ok(text) = fs::read_to_string(entry.path())
928 {
929 return self.embed(&text);
930 }
931 }
932 }
933 hash_embedding(&[])
934 }
935 _ => hash_embedding(&fs::read(path).unwrap_or_default()),
936 }
937 }
938
939 fn run_command(&self, cmd: &str, args: &[&str]) -> Result<String, EmbedError> {
940 let output = Command::new(cmd).args(args).output()?;
941 if output.status.success() {
942 Ok(String::from_utf8_lossy(&output.stdout).to_string())
943 } else {
944 Err(EmbedError::Io(std::io::Error::other(
945 String::from_utf8_lossy(&output.stderr).to_string(),
946 )))
947 }
948 }
949}
950
951fn parse_embedding_output(stdout: &[u8]) -> Result<Array1<f64>, EmbedError> {
955 let text = String::from_utf8_lossy(stdout);
956 let values: Vec<f64> = text
957 .split_whitespace()
958 .filter_map(|s| s.parse::<f64>().ok())
959 .collect();
960 if values.is_empty() {
961 return Err(EmbedError::Io(std::io::Error::other(
962 "empty embedding output",
963 )));
964 }
965 Ok(Array1::from(values))
966}
967
968pub fn hash_embedding(data: &[u8]) -> Array1<f64> {
970 let mut result = Array1::zeros(EMBED_DIM);
971 if data.is_empty() {
972 return result;
973 }
974 let chunk_size = (data.len() / EMBED_DIM).max(1);
976 for (i, chunk) in data.chunks(chunk_size).enumerate().take(EMBED_DIM) {
977 let hash = Sha256::digest(chunk);
978 let mut val = 0.0f64;
979 for (j, &byte) in hash.iter().take(8).enumerate() {
980 val += (byte as f64) / (256.0f64.powi(j as i32 + 1));
981 }
982 result[i] = val;
983 }
984 let norm = result.dot(&result).sqrt();
986 if norm > 1e-12 {
987 result /= norm;
988 }
989 result
990}
991
/// Converts RTF markup to plain text.
///
/// Compared with a naive character filter, this parser:
/// * consumes a control word's numeric parameter (`\fs24`) and its single delimiter
///   whitespace character, so they do not leak into the text;
/// * handles back-to-back control words (`\par\tab`), which used to be treated as an
///   escaped backslash;
/// * decodes `\uN` (including negative values) and skips the fallback characters
///   that follow it, as counted by `\ucN` (default 1);
/// * decodes `\'hh` bytes and the `\\`, `\{`, `\}`, `\~`, `\_` symbols;
/// * drops the content of metadata groups (`\fonttbl`, `\colortbl`, `\stylesheet`,
///   `\info`, `\pict` and any `\*` destination).
///
/// Plain characters are emitted only inside a group, so input without any `{` yields
/// no plain text.
fn strip_rtf(rtf: &str) -> String {
    // Destinations whose group content is metadata rather than document text.
    const SKIPPED_DESTINATIONS: [&str; 5] = ["fonttbl", "colortbl", "stylesheet", "info", "pict"];

    let chars: Vec<char> = rtf.chars().collect();
    let mut result = String::new();
    // One entry per open group; `true` means the group's content is suppressed.
    let mut groups: Vec<bool> = Vec::new();
    // Number of fallback characters following each `\uN`.
    // Simplification: tracked globally rather than per group.
    let mut uc_skip: usize = 1;
    // Fallback characters still to be skipped after the most recent `\uN`.
    let mut pending_skip: usize = 0;
    let mut i = 0;

    while i < chars.len() {
        let ch = chars[i];
        i += 1;
        match ch {
            '{' => {
                pending_skip = 0;
                // Nested groups inherit suppression from their parent.
                let suppressed = groups.last().copied().unwrap_or(false);
                groups.push(suppressed);
            }
            '}' => {
                pending_skip = 0;
                groups.pop();
            }
            '\\' => {
                let suppressed = groups.last().copied().unwrap_or(false);
                match chars.get(i).copied() {
                    // A trailing backslash carries no meaning.
                    None => {}
                    Some(first) if first.is_ascii_alphabetic() => {
                        // Control word: letters, then an optional signed decimal parameter.
                        let start = i;
                        while i < chars.len() && chars[i].is_ascii_alphabetic() {
                            i += 1;
                        }
                        let name: String = chars[start..i].iter().collect();

                        let param_start = i;
                        let signed = chars.get(i) == Some(&'-')
                            && chars.get(i + 1).map_or(false, |d| d.is_ascii_digit());
                        if signed {
                            i += 1;
                        }
                        while i < chars.len() && chars[i].is_ascii_digit() {
                            i += 1;
                        }
                        let param: String = chars[param_start..i].iter().collect();

                        // One whitespace character after the word is its delimiter.
                        if matches!(chars.get(i).copied(), Some(' ') | Some('\n') | Some('\r')) {
                            i += 1;
                        }

                        // Conservative choice: any control word ends fallback skipping.
                        pending_skip = 0;
                        if name == "uc" {
                            if let Ok(n) = param.parse::<usize>() {
                                uc_skip = n;
                            }
                        } else if SKIPPED_DESTINATIONS.contains(&name.as_str()) {
                            if let Some(flag) = groups.last_mut() {
                                *flag = true;
                            }
                        } else {
                            if !suppressed {
                                handle_rtf_control(&format!("{}{}", name, param), &mut result);
                            }
                            if name == "u" && !param.is_empty() {
                                pending_skip = uc_skip;
                            }
                        }
                    }
                    Some(symbol) => {
                        // Control symbol: a backslash followed by one non-letter.
                        i += 1;
                        let literal = match symbol {
                            '\\' | '{' | '}' => Some(symbol),
                            '~' => Some('\u{00A0}'),
                            '_' => Some('\u{2011}'),
                            '\'' => {
                                let hi = chars.get(i).and_then(|c| c.to_digit(16));
                                let lo = chars.get(i + 1).and_then(|c| c.to_digit(16));
                                match (hi, lo) {
                                    (Some(hi), Some(lo)) => {
                                        i += 2;
                                        // NOTE(review): the byte is interpreted as Latin-1; the
                                        // document's declared code page is not consulted.
                                        char::from_u32(hi * 16 + lo)
                                    }
                                    _ => None,
                                }
                            }
                            '\n' | '\r' => {
                                // An escaped line break acts as a paragraph break.
                                pending_skip = 0;
                                if !suppressed {
                                    result.push('\n');
                                }
                                None
                            }
                            '*' => {
                                // `\*` marks a destination that readers may ignore.
                                if let Some(flag) = groups.last_mut() {
                                    *flag = true;
                                }
                                None
                            }
                            _ => None,
                        };
                        if let Some(c) = literal {
                            if pending_skip > 0 {
                                pending_skip -= 1;
                            } else if !suppressed && !c.is_control() {
                                result.push(c);
                            }
                        }
                    }
                }
            }
            '\n' | '\r' => {
                // Raw line breaks never count as `\uN` fallback characters.
                if groups.last() == Some(&false) {
                    result.push(ch);
                }
            }
            _ => {
                if pending_skip > 0 {
                    pending_skip -= 1;
                } else if groups.last() == Some(&false) {
                    result.push(ch);
                }
            }
        }
    }
    result
}

/// Appends the text equivalent of one RTF control word to `output`.
///
/// `word` is the control word without the leading backslash and may carry its numeric
/// parameter (`"par"`, `"u8217"`, `"u-3913"`, `"fs24"`). Paragraph/line/row breaks,
/// tabs and the common typographic symbols are translated. `uN` is decoded as a
/// UTF-16 code unit, with negative `N` meaning `N + 65536`. Every other word is
/// ignored.
fn handle_rtf_control(word: &str, output: &mut String) {
    let split = word
        .find(|c: char| !c.is_ascii_alphabetic())
        .unwrap_or(word.len());
    let (name, param) = word.split_at(split);
    match name {
        "par" | "line" | "row" => output.push('\n'),
        "tab" => output.push('\t'),
        "lquote" => output.push('\u{2018}'),
        "rquote" => output.push('\u{2019}'),
        "ldblquote" => output.push('\u{201C}'),
        "rdblquote" => output.push('\u{201D}'),
        "endash" => output.push('\u{2013}'),
        "emdash" => output.push('\u{2014}'),
        "bullet" => output.push('\u{2022}'),
        "u" => {
            if let Ok(n) = param.parse::<i32>() {
                // The parameter is a signed 16-bit value; negatives wrap around.
                let code = if n < 0 { n + 65_536 } else { n };
                if code >= 0 {
                    // Lone surrogates are not valid `char`s and are dropped.
                    if let Some(c) = char::from_u32(code as u32) {
                        output.push(c);
                    }
                }
            }
        }
        _ => {}
    }
}
1070
/// Extracts the character data of an XML/HTML document as whitespace-normalised text.
///
/// Everything between `<` and `>` is discarded. Each tag boundary counts as a word
/// break, so text from adjacent elements (paragraphs, runs, spreadsheet cells) is no
/// longer glued together. The five predefined XML entities and numeric character
/// references are decoded after the tags are removed, so `&lt;` can never be mistaken
/// for markup. Runs of whitespace collapse to single spaces.
fn extract_xml_text(xml: &str) -> String {
    /// Replaces `&amp;`, `&lt;`, `&gt;`, `&quot;`, `&apos;`, `&#N;` and `&#xH;` with
    /// the characters they denote; anything else after `&` is left untouched.
    fn decode_xml_entities(input: &str) -> String {
        let mut out = String::with_capacity(input.len());
        let mut rest = input;
        while let Some(amp) = rest.find('&') {
            out.push_str(&rest[..amp]);
            let tail = &rest[amp..];
            let decoded = tail[1..]
                .find(';')
                // Entity names are short; a distant ';' belongs to something else.
                .filter(|&end| end <= 10)
                .and_then(|end| {
                    let name = &tail[1..1 + end];
                    let ch = match name {
                        "amp" => Some('&'),
                        "lt" => Some('<'),
                        "gt" => Some('>'),
                        "quot" => Some('"'),
                        "apos" => Some('\''),
                        _ => {
                            let hex = name
                                .strip_prefix("#x")
                                .or_else(|| name.strip_prefix("#X"));
                            let code = match hex {
                                Some(digits) => u32::from_str_radix(digits, 16).ok(),
                                None => name
                                    .strip_prefix('#')
                                    .and_then(|digits| digits.parse::<u32>().ok()),
                            };
                            code.and_then(char::from_u32)
                        }
                    };
                    // '&' + name + ';'
                    ch.map(|c| (c, end + 2))
                });
            match decoded {
                Some((c, consumed)) => {
                    out.push(c);
                    rest = &tail[consumed..];
                }
                None => {
                    out.push('&');
                    rest = &tail[1..];
                }
            }
        }
        out.push_str(rest);
        out
    }

    let mut text = String::with_capacity(xml.len());
    let mut in_tag = false;
    for ch in xml.chars() {
        match ch {
            '<' => in_tag = true,
            '>' => {
                in_tag = false;
                // A tag boundary separates words; extra spaces are collapsed below.
                text.push(' ');
            }
            _ if !in_tag => text.push(ch),
            _ => {}
        }
    }

    decode_xml_entities(&text)
        .split_whitespace()
        .collect::<Vec<_>>()
        .join(" ")
}
1100
#[cfg(test)]
mod tests {
    use super::*;

    /// Pins `EMBED_DIM` to 1024, the value the assertion message attributes to the
    /// model's hidden size.
    #[test]
    fn test_embed_dim() {
        assert_eq!(EMBED_DIM, 1024, "LFM2.5-VL-450M hidden_size is 1024");
    }

    /// Smoke test: constructing an embedder must not panic.
    #[test]
    fn test_embedder_creation() {
        let _embedder = Lfm2Embedder::new();
    }
}