1#[cfg(feature = "local")]
2use anyhow::Context;
3use anyhow::{anyhow, Result};
4#[cfg(feature = "local")]
5use calamine::{open_workbook_auto, Reader};
6#[cfg(feature = "local")]
7use fastembed::{EmbeddingModel, TextEmbedding, TextInitOptions};
8#[cfg(feature = "local")]
9use hnsw_rs::prelude::{DistCosine, Hnsw};
10use mxr_config::SemanticConfig;
11use mxr_core::id::MessageId;
12#[cfg(feature = "local")]
13use mxr_core::id::SemanticProfileId;
14#[cfg(feature = "local")]
15use mxr_core::types::{
16 AttachmentMeta, Envelope, MessageBody, SemanticChunkRecord, SemanticChunkSourceKind,
17 SemanticEmbeddingRecord, SemanticEmbeddingStatus, SemanticProfileStatus,
18};
19use mxr_core::types::{SearchMode, SemanticProfile, SemanticProfileRecord, SemanticStatusSnapshot};
20#[cfg(feature = "local")]
21use mxr_reader::{clean, ReaderConfig};
22use mxr_store::Store;
23#[cfg(feature = "local")]
24use sha2::{Digest, Sha256};
25#[cfg(feature = "local")]
26use std::collections::HashMap;
27use std::path::Path;
28#[cfg(feature = "local")]
29use std::path::{Path as StdPath, PathBuf};
30#[cfg(feature = "local")]
31use std::process::Command;
32use std::sync::Arc;
33
/// Revision label written to `model_revision` when a default profile record is created.
#[cfg(feature = "local")]
const FASTEMBED_REVISION: &str = "fastembed-5.13.0";
/// Last page number passed to `pdftoppm` (`-l`) when a PDF falls back to OCR.
#[cfg(feature = "local")]
const OCR_MAX_PAGES: usize = 5;
38
/// One message returned by semantic search together with its relevance score.
#[derive(Debug, Clone)]
pub struct SemanticHit {
    /// Identifier of the matching message.
    pub message_id: MessageId,
    /// Best chunk similarity found for the message; the local engine computes it
    /// as `1.0 - distance`, so larger values are closer to the query.
    pub score: f32,
}
44
45#[cfg(feature = "local")]
/// Payload kept beside the HNSW graph for each inserted point.
struct IndexedChunk {
    /// Message that owns the indexed chunk.
    message_id: MessageId,
}
49
50#[cfg(feature = "local")]
/// In-memory nearest-neighbour index built for one semantic profile.
struct SemanticIndex {
    /// HNSW graph over chunk embeddings using cosine distance.
    hnsw: Hnsw<'static, f32, DistCosine>,
    /// Maps the point id passed to `hnsw.insert` back to the chunk's message.
    chunks_by_id: HashMap<usize, IndexedChunk>,
}
55
/// Semantic search engine backed by the message store.
///
/// With the `local` feature it additionally holds loaded embedding models and
/// per-profile in-memory indexes; without it only the store handle and the
/// configuration are present.
pub struct SemanticEngine {
    /// Shared store handle used for profiles, envelopes, bodies and embeddings.
    store: Arc<Store>,
    /// Directory handed to the embedding backend as its model cache.
    #[cfg(feature = "local")]
    cache_dir: PathBuf,
    /// Current semantic configuration (enabled flag, active profile, download policy).
    config: SemanticConfig,
    /// Embedding models loaded so far, keyed by profile.
    #[cfg(feature = "local")]
    models: HashMap<SemanticProfile, TextEmbedding>,
    /// Search indexes built so far, keyed by profile.
    #[cfg(feature = "local")]
    indexes: HashMap<SemanticProfile, SemanticIndex>,
}
66
67#[cfg(feature = "local")]
68impl SemanticEngine {
69 pub fn new(store: Arc<Store>, data_dir: &Path, config: SemanticConfig) -> Self {
70 Self {
71 store,
72 cache_dir: data_dir.join("models"),
73 config,
74 models: HashMap::new(),
75 indexes: HashMap::new(),
76 }
77 }
78
79 pub fn apply_config(&mut self, config: SemanticConfig) {
80 self.config = config;
81 }
82
83 pub async fn status_snapshot(&self) -> Result<SemanticStatusSnapshot> {
84 Ok(SemanticStatusSnapshot {
85 enabled: self.config.enabled,
86 active_profile: self.config.active_profile,
87 profiles: self.store.list_semantic_profiles().await?,
88 })
89 }
90
91 pub async fn install_profile(
92 &mut self,
93 profile: SemanticProfile,
94 ) -> Result<SemanticProfileRecord> {
95 let dimensions = {
96 let model = self.ensure_model(profile, true)?;
97 let embeddings = model
98 .embed([prefixed_document(profile, "warmup document")], Some(1))
99 .context("embed warmup document")?;
100 embeddings
101 .first()
102 .map(|embedding| embedding.len() as u32)
103 .ok_or_else(|| anyhow!("embedding backend returned no vector"))?
104 };
105
106 let mut record = self
107 .store
108 .get_semantic_profile(profile)
109 .await?
110 .unwrap_or_else(|| default_profile_record(profile, dimensions));
111 record.dimensions = dimensions;
112 record.status = SemanticProfileStatus::Ready;
113 if record.installed_at.is_none() {
114 record.installed_at = Some(chrono::Utc::now());
115 }
116 record.last_error = None;
117 self.store.upsert_semantic_profile(&record).await?;
118 Ok(record)
119 }
120
121 pub async fn use_profile(&mut self, profile: SemanticProfile) -> Result<SemanticProfileRecord> {
122 self.install_profile(profile).await?;
123 let mut record = self.reindex_all_for_profile(profile).await?;
124 record.activated_at = Some(chrono::Utc::now());
125 self.store.upsert_semantic_profile(&record).await?;
126 Ok(record)
127 }
128
129 pub async fn reindex_active(&mut self) -> Result<SemanticProfileRecord> {
130 self.reindex_all_for_profile(self.config.active_profile)
131 .await
132 }
133
134 pub async fn reindex_messages(&mut self, message_ids: &[MessageId]) -> Result<()> {
135 if !self.config.enabled || message_ids.is_empty() {
136 return Ok(());
137 }
138
139 let profile = self.config.active_profile;
140 let record = self.install_profile(profile).await?;
141 let now = chrono::Utc::now();
142
143 for message_id in message_ids {
144 let Some(envelope) = self.store.get_envelope(message_id).await? else {
145 continue;
146 };
147 let body = self.store.get_body(message_id).await?;
148 let (chunks, embeddings) =
149 self.build_message_records(&record, &envelope, body.as_ref(), now)?;
150 self.store
151 .replace_semantic_message_data(&envelope.id, &record.id, &chunks, &embeddings)
152 .await?;
153 }
154
155 let mut ready_record = record;
156 ready_record.status = SemanticProfileStatus::Ready;
157 ready_record.last_indexed_at = Some(chrono::Utc::now());
158 ready_record.last_error = None;
159 self.store.upsert_semantic_profile(&ready_record).await?;
160 self.rebuild_index(profile).await?;
161 Ok(())
162 }
163
164 pub async fn search(&mut self, query: &str, limit: usize) -> Result<Vec<SemanticHit>> {
165 if !self.config.enabled {
166 return Ok(Vec::new());
167 }
168
169 let profile = self.config.active_profile;
170 self.install_profile(profile).await?;
171 if !self.indexes.contains_key(&profile) {
172 self.rebuild_index(profile).await?;
173 }
174
175 let query_text = prefixed_query(profile, query);
176 let query_embedding = self
177 .ensure_model(profile, self.config.auto_download_models)?
178 .embed([query_text], Some(1))
179 .context("embed query")?
180 .into_iter()
181 .next()
182 .ok_or_else(|| anyhow!("embedding backend returned no query vector"))?;
183
184 let Some(index) = self.indexes.get(&profile) else {
185 return Ok(Vec::new());
186 };
187 if index.chunks_by_id.is_empty() {
188 return Ok(Vec::new());
189 }
190
191 let candidate_limit = limit.max(1);
192 let ef = candidate_limit.max(64);
193 let neighbours = index.hnsw.search(&query_embedding, candidate_limit, ef);
194 let mut best_by_message: HashMap<MessageId, f32> = HashMap::new();
195
196 for neighbour in neighbours {
197 let Some(chunk) = index.chunks_by_id.get(&neighbour.d_id) else {
198 continue;
199 };
200 let similarity = 1.0 - neighbour.distance;
201 best_by_message
202 .entry(chunk.message_id.clone())
203 .and_modify(|score| {
204 if similarity > *score {
205 *score = similarity;
206 }
207 })
208 .or_insert(similarity);
209 }
210
211 let mut hits = best_by_message
212 .into_iter()
213 .map(|(message_id, score)| SemanticHit { message_id, score })
214 .collect::<Vec<_>>();
215 hits.sort_by(|left, right| right.score.total_cmp(&left.score));
216 if hits.len() > limit {
217 hits.truncate(limit);
218 }
219 Ok(hits)
220 }
221
222 async fn reindex_all_for_profile(
223 &mut self,
224 profile: SemanticProfile,
225 ) -> Result<SemanticProfileRecord> {
226 let mut record = self.install_profile(profile).await?;
227 record.status = SemanticProfileStatus::Indexing;
228 record.progress_completed = 0;
229 record.progress_total = 0;
230 record.last_error = None;
231 self.store.upsert_semantic_profile(&record).await?;
232
233 let accounts = self.store.list_accounts().await?;
234 let mut envelopes = Vec::new();
235 for account in accounts {
236 envelopes.extend(
237 self.store
238 .list_envelopes_by_account(&account.id, 10_000, 0)
239 .await?,
240 );
241 }
242
243 record.progress_total = envelopes.len() as u32;
244 self.store.upsert_semantic_profile(&record).await?;
245 let now = chrono::Utc::now();
246
247 for envelope in &envelopes {
248 let body = self.store.get_body(&envelope.id).await?;
249 let (chunks, embeddings) =
250 self.build_message_records(&record, envelope, body.as_ref(), now)?;
251 self.store
252 .replace_semantic_message_data(&envelope.id, &record.id, &chunks, &embeddings)
253 .await?;
254 record.progress_completed += 1;
255 }
256
257 record.status = SemanticProfileStatus::Ready;
258 record.last_indexed_at = Some(chrono::Utc::now());
259 if record.activated_at.is_none() && self.config.active_profile == profile {
260 record.activated_at = Some(chrono::Utc::now());
261 }
262 self.store.upsert_semantic_profile(&record).await?;
263 self.rebuild_index(profile).await?;
264 Ok(record)
265 }
266
267 async fn rebuild_index(&mut self, profile: SemanticProfile) -> Result<()> {
268 let record = self
269 .store
270 .get_semantic_profile(profile)
271 .await?
272 .ok_or_else(|| anyhow!("semantic profile {} not installed", profile.as_str()))?;
273 let rows = self.store.list_semantic_embeddings(&record.id).await?;
274 let max_elements = rows.len().max(1);
275 let mut hnsw = Hnsw::<f32, DistCosine>::new(16, max_elements, 16, 200, DistCosine {});
276 let mut chunks_by_id = HashMap::with_capacity(rows.len());
277
278 for (point_id, (chunk, embedding)) in rows.into_iter().enumerate() {
279 let vector = blob_to_f32s(&embedding.vector);
280 if vector.is_empty() {
281 continue;
282 }
283 hnsw.insert((&vector, point_id));
284 chunks_by_id.insert(
285 point_id,
286 IndexedChunk {
287 message_id: chunk.message_id,
288 },
289 );
290 }
291 hnsw.set_searching_mode(true);
292
293 self.indexes
294 .insert(profile, SemanticIndex { hnsw, chunks_by_id });
295 Ok(())
296 }
297
298 fn build_message_records(
299 &mut self,
300 profile: &SemanticProfileRecord,
301 envelope: &Envelope,
302 body: Option<&MessageBody>,
303 now: chrono::DateTime<chrono::Utc>,
304 ) -> Result<(Vec<SemanticChunkRecord>, Vec<SemanticEmbeddingRecord>)> {
305 let chunks = build_chunks(envelope, body);
306 if chunks.is_empty() {
307 return Ok((Vec::new(), Vec::new()));
308 }
309
310 let texts = chunks
311 .iter()
312 .map(|chunk| prefixed_document(profile.profile, &chunk.1))
313 .collect::<Vec<_>>();
314 let embeddings = self
315 .ensure_model(profile.profile, self.config.auto_download_models)?
316 .embed(texts, Some(32))
317 .context("embed message chunks")?;
318
319 let mut chunk_records = Vec::with_capacity(chunks.len());
320 let mut embedding_records = Vec::with_capacity(chunks.len());
321
322 for (index, ((source_kind, normalized), embedding)) in
323 chunks.into_iter().zip(embeddings.into_iter()).enumerate()
324 {
325 let chunk_id = semantic_chunk_id(&envelope.id.as_str(), &source_kind, index as u32);
326 let chunk_record = SemanticChunkRecord {
327 id: chunk_id.clone(),
328 message_id: envelope.id.clone(),
329 source_kind,
330 ordinal: index as u32,
331 normalized: normalized.clone(),
332 content_hash: content_hash(&normalized),
333 created_at: now,
334 updated_at: now,
335 };
336 let embedding_record = SemanticEmbeddingRecord {
337 chunk_id,
338 profile_id: profile.id.clone(),
339 dimensions: embedding.len() as u32,
340 vector: f32s_to_blob(&embedding),
341 status: SemanticEmbeddingStatus::Ready,
342 created_at: now,
343 updated_at: now,
344 };
345 chunk_records.push(chunk_record);
346 embedding_records.push(embedding_record);
347 }
348
349 Ok((chunk_records, embedding_records))
350 }
351
352 fn ensure_model(
353 &mut self,
354 profile: SemanticProfile,
355 allow_download: bool,
356 ) -> Result<&mut TextEmbedding> {
357 if !self.models.contains_key(&profile) {
358 if !allow_download {
359 return Err(anyhow!(
360 "semantic profile {} is not installed locally",
361 profile.as_str()
362 ));
363 }
364 std::fs::create_dir_all(&self.cache_dir)?;
365 let model = TextEmbedding::try_new(
366 TextInitOptions::new(embedding_model(profile))
367 .with_cache_dir(self.cache_dir.clone())
368 .with_show_download_progress(false),
369 )
370 .with_context(|| format!("load semantic profile {}", profile.as_str()))?;
371 self.models.insert(profile, model);
372 }
373
374 self.models
375 .get_mut(&profile)
376 .ok_or_else(|| anyhow!("semantic profile {} not loaded", profile.as_str()))
377 }
378}
379
380#[cfg(not(feature = "local"))]
381impl SemanticEngine {
382 pub fn new(store: Arc<Store>, data_dir: &Path, config: SemanticConfig) -> Self {
383 let _ = data_dir;
384 Self { store, config }
385 }
386
387 pub fn apply_config(&mut self, config: SemanticConfig) {
388 self.config = config;
389 }
390
391 pub async fn status_snapshot(&self) -> Result<SemanticStatusSnapshot> {
392 Ok(SemanticStatusSnapshot {
393 enabled: false,
394 active_profile: self.config.active_profile,
395 profiles: self.store.list_semantic_profiles().await?,
396 })
397 }
398
399 pub async fn install_profile(
400 &mut self,
401 _profile: SemanticProfile,
402 ) -> Result<SemanticProfileRecord> {
403 Err(semantic_unavailable_error())
404 }
405
406 pub async fn use_profile(
407 &mut self,
408 _profile: SemanticProfile,
409 ) -> Result<SemanticProfileRecord> {
410 Err(semantic_unavailable_error())
411 }
412
413 pub async fn reindex_active(&mut self) -> Result<SemanticProfileRecord> {
414 Err(semantic_unavailable_error())
415 }
416
417 pub async fn reindex_messages(&mut self, _message_ids: &[MessageId]) -> Result<()> {
418 Ok(())
419 }
420
421 pub async fn search(&mut self, _query: &str, _limit: usize) -> Result<Vec<SemanticHit>> {
422 Ok(Vec::new())
423 }
424}
425
426#[cfg(feature = "local")]
427pub fn should_use_semantic(mode: SearchMode) -> bool {
428 matches!(mode, SearchMode::Hybrid | SearchMode::Semantic)
429}
430
431#[cfg(not(feature = "local"))]
/// Without the `local` feature no search mode uses the semantic engine, so
/// this always returns `false`.
pub fn should_use_semantic(_mode: SearchMode) -> bool {
    false
}
435
436#[cfg(feature = "local")]
437fn default_profile_record(profile: SemanticProfile, dimensions: u32) -> SemanticProfileRecord {
438 SemanticProfileRecord {
439 id: semantic_profile_id(profile),
440 profile,
441 backend: "fastembed".to_string(),
442 model_revision: FASTEMBED_REVISION.to_string(),
443 dimensions,
444 status: SemanticProfileStatus::Pending,
445 installed_at: None,
446 activated_at: None,
447 last_indexed_at: None,
448 progress_completed: 0,
449 progress_total: 0,
450 last_error: None,
451 }
452}
453
454#[cfg(not(feature = "local"))]
455fn semantic_unavailable_error() -> anyhow::Error {
456 anyhow!("semantic search unavailable in this binary")
457}
458
459#[cfg(feature = "local")]
460fn semantic_profile_id(profile: SemanticProfile) -> SemanticProfileId {
461 SemanticProfileId::from_provider_id("semantic_profile", profile.as_str())
462}
463
464#[cfg(feature = "local")]
465fn semantic_chunk_id(
466 message_id: &str,
467 source_kind: &SemanticChunkSourceKind,
468 ordinal: u32,
469) -> mxr_core::SemanticChunkId {
470 mxr_core::SemanticChunkId::from_provider_id(
471 "semantic_chunk",
472 &format!("{message_id}:{source_kind:?}:{ordinal}"),
473 )
474}
475
476#[cfg(feature = "local")]
477fn build_chunks(
478 envelope: &Envelope,
479 body: Option<&MessageBody>,
480) -> Vec<(SemanticChunkSourceKind, String)> {
481 let mut chunks = Vec::new();
482
483 let header = normalize_text(&format!(
484 "subject {} from {} {} to {} snippet {}",
485 envelope.subject,
486 envelope.from.name.as_deref().unwrap_or(""),
487 envelope.from.email,
488 envelope
489 .to
490 .iter()
491 .map(|addr| addr.email.as_str())
492 .collect::<Vec<_>>()
493 .join(" "),
494 envelope.snippet
495 ));
496 if !header.is_empty() {
497 chunks.push((SemanticChunkSourceKind::Header, header));
498 }
499
500 if let Some(body) = body {
501 let reader_output = clean(
502 body.text_plain.as_deref(),
503 body.text_html.as_deref(),
504 &ReaderConfig::default(),
505 );
506 for chunk in chunk_text(&reader_output.content, 120, 30) {
507 chunks.push((SemanticChunkSourceKind::Body, chunk));
508 }
509
510 for attachment in &body.attachments {
511 let summary =
512 normalize_text(&format!("{} {}", attachment.filename, attachment.mime_type));
513 if !summary.is_empty() {
514 chunks.push((SemanticChunkSourceKind::AttachmentSummary, summary));
515 }
516
517 if let Some(text) = read_attachment_text(attachment) {
518 for chunk in chunk_text(&text, 120, 30) {
519 chunks.push((SemanticChunkSourceKind::AttachmentText, chunk));
520 }
521 }
522 }
523 }
524
525 chunks
526}
527
528#[cfg(feature = "local")]
529fn read_attachment_text(attachment: &AttachmentMeta) -> Option<String> {
530 let path = attachment.local_path.as_ref()?;
531 match attachment_kind(attachment, path) {
532 AttachmentKind::Text => read_text_attachment(path, false),
533 AttachmentKind::Html => read_text_attachment(path, true),
534 AttachmentKind::Pdf => read_pdf_attachment(path),
535 AttachmentKind::OfficeDocument => read_office_attachment(path),
536 AttachmentKind::Spreadsheet => read_spreadsheet_attachment(attachment, path),
537 AttachmentKind::Image => run_tesseract(path),
538 AttachmentKind::Unknown => None,
539 }
540}
541
542#[cfg(feature = "local")]
543fn read_text_attachment(path: &StdPath, is_html: bool) -> Option<String> {
544 let content = std::fs::read_to_string(path).ok()?;
545 if is_html {
546 return normalized_nonempty(&clean(None, Some(&content), &ReaderConfig::default()).content);
547 }
548 normalized_nonempty(&content)
549}
550
551#[cfg(feature = "local")]
552fn read_office_attachment(path: &StdPath) -> Option<String> {
553 let markdown = undoc::to_markdown(path).ok()?;
554 normalized_nonempty(&markdown)
555}
556
557#[cfg(feature = "local")]
558fn read_spreadsheet_attachment(attachment: &AttachmentMeta, path: &StdPath) -> Option<String> {
559 let extension = attachment_extension(attachment, path);
560 let mime = attachment.mime_type.to_ascii_lowercase();
561 let undoc_text = should_try_undoc_spreadsheet(&mime, extension.as_deref())
562 .then(|| read_office_attachment(path))
563 .flatten();
564 let table_text = read_spreadsheet_tables(path);
565 combine_extracted_texts([undoc_text, table_text])
566}
567
568#[cfg(feature = "local")]
569fn read_spreadsheet_tables(path: &StdPath) -> Option<String> {
570 let mut workbook = open_workbook_auto(path).ok()?;
571 let mut sections = Vec::new();
572
573 for sheet_name in workbook.sheet_names().to_owned() {
574 let Ok(range) = workbook.worksheet_range(&sheet_name) else {
575 continue;
576 };
577
578 let mut rows = Vec::new();
579 for row in range.rows() {
580 let cells = row
581 .iter()
582 .map(ToString::to_string)
583 .map(|cell| normalize_text(&cell))
584 .filter(|cell| !cell.is_empty())
585 .collect::<Vec<_>>();
586 if !cells.is_empty() {
587 rows.push(cells.join(" | "));
588 }
589 }
590
591 if !rows.is_empty() {
592 sections.push(format!("sheet {sheet_name}\n{}", rows.join("\n")));
593 }
594 }
595
596 normalized_nonempty(§ions.join("\n\n"))
597}
598
599#[cfg(feature = "local")]
/// Decides whether the `undoc` converter should be tried for a spreadsheet.
///
/// True when the extension is `xlsx` or the MIME type is the OOXML
/// spreadsheet type. Both inputs are compared verbatim, so callers pass
/// them lowercased.
fn should_try_undoc_spreadsheet(mime: &str, extension: Option<&str>) -> bool {
    const XLSX_MIME: &str = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet";
    match extension {
        Some("xlsx") => true,
        _ => mime == XLSX_MIME,
    }
}
604
605#[cfg(feature = "local")]
/// Merges text extracted by several extractors into one string without
/// repeating content.
///
/// Missing parts are ignored. A part already contained in a kept text is
/// dropped. A part that contains previously kept texts replaces them, so the
/// most complete text survives instead of the fragment that happened to come
/// first. The remaining texts are joined with a blank line.
///
/// Returns `None` when no part is present.
fn combine_extracted_texts<I>(parts: I) -> Option<String>
where
    I: IntoIterator<Item = Option<String>>,
{
    let mut combined: Vec<String> = Vec::new();
    for part in parts.into_iter().flatten() {
        // Equal or already-covered text adds nothing new.
        if combined
            .iter()
            .any(|existing| existing.contains(part.as_str()))
        {
            continue;
        }
        // The new part is not covered; drop earlier fragments it fully includes.
        combined.retain(|existing| !part.contains(existing.as_str()));
        combined.push(part);
    }

    if combined.is_empty() {
        None
    } else {
        Some(combined.join("\n\n"))
    }
}
626
627#[cfg(feature = "local")]
628fn attachment_extension(attachment: &AttachmentMeta, path: &StdPath) -> Option<String> {
629 path.extension()
630 .and_then(|ext| ext.to_str())
631 .or_else(|| attachment.filename.rsplit('.').next())
632 .map(|ext| ext.trim().to_ascii_lowercase())
633 .filter(|ext| !ext.is_empty())
634}
635
636#[cfg(feature = "local")]
637fn normalized_nonempty(text: &str) -> Option<String> {
638 let normalized = normalize_text(text);
639 if normalized.is_empty() {
640 None
641 } else {
642 Some(normalized)
643 }
644}
645
646#[cfg(feature = "local")]
/// Coarse attachment category used to pick a text extractor.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
enum AttachmentKind {
    /// Plain-text formats (text MIME types, JSON, XML, YAML, markdown, CSV/TSV).
    Text,
    /// HTML documents, cleaned before normalization.
    Html,
    /// PDF files, extracted directly with an OCR fallback.
    Pdf,
    /// Word-processing and presentation files (docx, pptx).
    OfficeDocument,
    /// Spreadsheet workbooks (xlsx, xlsm, xlsb, xls, ods).
    Spreadsheet,
    /// Raster images, read through `tesseract`.
    Image,
    /// Anything else; no text is extracted.
    Unknown,
}
657
658#[cfg(feature = "local")]
659fn attachment_kind(attachment: &AttachmentMeta, path: &StdPath) -> AttachmentKind {
660 let mime = attachment.mime_type.to_ascii_lowercase();
661 let extension = attachment_extension(attachment, path);
662 let extension = extension.as_deref();
663
664 if mime == "text/html" || matches!(extension, Some("html" | "htm")) {
665 return AttachmentKind::Html;
666 }
667
668 if mime.starts_with("text/")
669 || matches!(
670 mime.as_str(),
671 "application/json"
672 | "application/xml"
673 | "application/x-yaml"
674 | "application/yaml"
675 | "application/markdown"
676 )
677 || matches!(
678 extension,
679 Some("txt" | "md" | "markdown" | "json" | "xml" | "yaml" | "yml" | "csv" | "tsv")
680 )
681 {
682 return AttachmentKind::Text;
683 }
684
685 if mime == "application/pdf" || matches!(extension, Some("pdf")) {
686 return AttachmentKind::Pdf;
687 }
688
689 if matches!(
690 mime.as_str(),
691 "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
692 | "application/vnd.openxmlformats-officedocument.presentationml.presentation"
693 ) || matches!(extension, Some("docx" | "pptx"))
694 {
695 return AttachmentKind::OfficeDocument;
696 }
697
698 if matches!(
699 mime.as_str(),
700 "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
701 | "application/vnd.ms-excel"
702 | "application/vnd.ms-excel.sheet.binary.macroenabled.12"
703 | "application/vnd.ms-excel.sheet.macroenabled.12"
704 | "application/vnd.oasis.opendocument.spreadsheet"
705 ) || matches!(extension, Some("xlsx" | "xlsm" | "xlsb" | "xls" | "ods"))
706 {
707 return AttachmentKind::Spreadsheet;
708 }
709
710 if mime.starts_with("image/")
711 || matches!(
712 extension,
713 Some("png" | "jpg" | "jpeg" | "gif" | "webp" | "bmp" | "tif" | "tiff")
714 )
715 {
716 return AttachmentKind::Image;
717 }
718
719 AttachmentKind::Unknown
720}
721
722#[cfg(feature = "local")]
723fn read_pdf_attachment(path: &StdPath) -> Option<String> {
724 if let Some(extracted) = unpdf::to_markdown(path)
725 .ok()
726 .and_then(|markdown| normalized_nonempty(&markdown))
727 {
728 return Some(extracted);
729 }
730
731 ocr_pdf(path)
732}
733
734#[cfg(feature = "local")]
735fn ocr_pdf(path: &StdPath) -> Option<String> {
736 let pdftoppm = which::which("pdftoppm").ok()?;
737 let tempdir = tempfile::tempdir().ok()?;
738 let prefix = tempdir.path().join("page");
739 let status = Command::new(pdftoppm)
740 .arg("-f")
741 .arg("1")
742 .arg("-l")
743 .arg(OCR_MAX_PAGES.to_string())
744 .arg("-png")
745 .arg(path)
746 .arg(&prefix)
747 .status()
748 .ok()?;
749 if !status.success() {
750 return None;
751 }
752
753 let mut images = std::fs::read_dir(tempdir.path())
754 .ok()?
755 .flatten()
756 .map(|entry| entry.path())
757 .filter(|path| {
758 path.extension()
759 .and_then(|ext| ext.to_str())
760 .is_some_and(|ext| ext.eq_ignore_ascii_case("png"))
761 })
762 .collect::<Vec<_>>();
763 images.sort();
764
765 let mut output = String::new();
766 for image in images {
767 if let Some(text) = run_tesseract(&image) {
768 if !output.is_empty() {
769 output.push(' ');
770 }
771 output.push_str(&text);
772 }
773 }
774
775 let normalized = normalize_text(&output);
776 if normalized.is_empty() {
777 None
778 } else {
779 Some(normalized)
780 }
781}
782
783#[cfg(feature = "local")]
784fn run_tesseract(path: &StdPath) -> Option<String> {
785 let tesseract = which::which("tesseract").ok()?;
786 let output = Command::new(tesseract)
787 .arg(path)
788 .arg("stdout")
789 .output()
790 .ok()?;
791 if !output.status.success() {
792 return None;
793 }
794
795 let normalized = normalize_text(&String::from_utf8_lossy(&output.stdout));
796 if normalized.is_empty() {
797 None
798 } else {
799 Some(normalized)
800 }
801}
802
803#[cfg(feature = "local")]
/// Maps a semantic profile to the fastembed model it is backed by.
///
/// The match is exhaustive on purpose: adding a profile variant must fail to
/// compile here until a model is chosen for it.
fn embedding_model(profile: SemanticProfile) -> EmbeddingModel {
    match profile {
        SemanticProfile::BgeSmallEnV15 => EmbeddingModel::BGESmallENV15,
        SemanticProfile::MultilingualE5Small => EmbeddingModel::MultilingualE5Small,
        SemanticProfile::BgeM3 => EmbeddingModel::BGEM3,
    }
}
811
812#[cfg(feature = "local")]
813fn prefixed_query(profile: SemanticProfile, text: &str) -> String {
814 let normalized = normalize_text(text);
815 match profile {
816 SemanticProfile::MultilingualE5Small => format!("query: {normalized}"),
817 _ => normalized,
818 }
819}
820
821#[cfg(feature = "local")]
822fn prefixed_document(profile: SemanticProfile, text: &str) -> String {
823 let normalized = normalize_text(text);
824 match profile {
825 SemanticProfile::MultilingualE5Small => format!("passage: {normalized}"),
826 _ => normalized,
827 }
828}
829
830#[cfg(feature = "local")]
/// Collapses every run of whitespace into a single space, drops leading and
/// trailing whitespace, and lowercases the result.
fn normalize_text(text: &str) -> String {
    // `split_whitespace` never yields empty or padded pieces.
    let words: Vec<&str> = text.split_whitespace().collect();
    words.join(" ").to_lowercase()
}
839
840#[cfg(feature = "local")]
841fn chunk_text(text: &str, window_words: usize, overlap_words: usize) -> Vec<String> {
842 let normalized = normalize_text(text);
843 if normalized.is_empty() {
844 return Vec::new();
845 }
846 let words = normalized.split_whitespace().collect::<Vec<_>>();
847 if words.len() <= window_words {
848 return vec![normalized];
849 }
850
851 let mut chunks = Vec::new();
852 let mut start = 0usize;
853 let step = window_words.saturating_sub(overlap_words).max(1);
854 while start < words.len() {
855 let end = (start + window_words).min(words.len());
856 let chunk = words[start..end].join(" ");
857 if !chunk.is_empty() {
858 chunks.push(chunk);
859 }
860 if end == words.len() {
861 break;
862 }
863 start += step;
864 }
865 chunks
866}
867
868#[cfg(feature = "local")]
869fn content_hash(normalized: &str) -> String {
870 let digest = Sha256::digest(normalized.as_bytes());
871 format!("{digest:x}")
872}
873
874#[cfg(feature = "local")]
/// Serializes floats into a byte blob, four little-endian bytes per value,
/// in input order.
fn f32s_to_blob(values: &[f32]) -> Vec<u8> {
    values
        .iter()
        .flat_map(|value| value.to_le_bytes())
        .collect()
}
882
883#[cfg(feature = "local")]
/// Decodes a byte blob into floats, reading four little-endian bytes per
/// value. Trailing bytes that do not fill a whole value are ignored.
fn blob_to_f32s(bytes: &[u8]) -> Vec<f32> {
    let mut values = Vec::with_capacity(bytes.len() / 4);
    for quad in bytes.chunks_exact(4) {
        let mut raw = [0u8; 4];
        raw.copy_from_slice(quad);
        values.push(f32::from_le_bytes(raw));
    }
    values
}
890
891#[cfg(all(test, feature = "local"))]
892mod tests {
893 use super::*;
894 use mxr_core::id::{AttachmentId, MessageId};
895 use std::fs::File;
896 use std::io::Write;
897 use tempfile::tempdir;
898 use zip::write::SimpleFileOptions;
899 use zip::ZipWriter;
900
901 fn attachment(path: &StdPath, filename: &str, mime_type: &str) -> AttachmentMeta {
902 AttachmentMeta {
903 id: AttachmentId::new(),
904 message_id: MessageId::new(),
905 filename: filename.to_string(),
906 mime_type: mime_type.to_string(),
907 size_bytes: std::fs::metadata(path).unwrap().len(),
908 local_path: Some(path.to_path_buf()),
909 provider_id: "att-1".to_string(),
910 }
911 }
912
913 fn write_zip(path: &StdPath, files: &[(&str, String)]) {
914 let file = File::create(path).unwrap();
915 let mut zip = ZipWriter::new(file);
916 let options = SimpleFileOptions::default();
917 for (name, contents) in files {
918 zip.start_file(name, options).unwrap();
919 zip.write_all(contents.as_bytes()).unwrap();
920 }
921 zip.finish().unwrap();
922 }
923
924 fn write_docx(path: &StdPath, text: &str) {
925 write_zip(
926 path,
927 &[
928 (
929 "[Content_Types].xml",
930 r#"<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
931<Types xmlns="http://schemas.openxmlformats.org/package/2006/content-types">
932 <Default Extension="rels" ContentType="application/vnd.openxmlformats-package.relationships+xml"/>
933 <Default Extension="xml" ContentType="application/xml"/>
934 <Override PartName="/word/document.xml" ContentType="application/vnd.openxmlformats-officedocument.wordprocessingml.document.main+xml"/>
935</Types>"#
936 .to_string(),
937 ),
938 (
939 "_rels/.rels",
940 r#"<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
941<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
942 <Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument" Target="word/document.xml"/>
943</Relationships>"#
944 .to_string(),
945 ),
946 (
947 "word/document.xml",
948 format!(
949 r#"<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
950<w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
951 <w:body>
952 <w:p>
953 <w:r><w:t>{text}</w:t></w:r>
954 </w:p>
955 </w:body>
956</w:document>"#
957 ),
958 ),
959 ],
960 );
961 }
962
963 fn write_pptx(path: &StdPath, text: &str) {
964 write_zip(
965 path,
966 &[
967 (
968 "[Content_Types].xml",
969 r#"<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
970<Types xmlns="http://schemas.openxmlformats.org/package/2006/content-types">
971 <Default Extension="rels" ContentType="application/vnd.openxmlformats-package.relationships+xml"/>
972 <Default Extension="xml" ContentType="application/xml"/>
973 <Override PartName="/ppt/presentation.xml" ContentType="application/vnd.openxmlformats-officedocument.presentationml.presentation.main+xml"/>
974 <Override PartName="/ppt/slides/slide1.xml" ContentType="application/vnd.openxmlformats-officedocument.presentationml.slide+xml"/>
975</Types>"#
976 .to_string(),
977 ),
978 (
979 "_rels/.rels",
980 r#"<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
981<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
982 <Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument" Target="ppt/presentation.xml"/>
983</Relationships>"#
984 .to_string(),
985 ),
986 (
987 "ppt/presentation.xml",
988 r#"<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
989<p:presentation xmlns:a="http://schemas.openxmlformats.org/drawingml/2006/main"
990 xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships"
991 xmlns:p="http://schemas.openxmlformats.org/presentationml/2006/main">
992 <p:sldIdLst>
993 <p:sldId id="256" r:id="rId1"/>
994 </p:sldIdLst>
995</p:presentation>"#
996 .to_string(),
997 ),
998 (
999 "ppt/_rels/presentation.xml.rels",
1000 r#"<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
1001<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
1002 <Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/slide" Target="slides/slide1.xml"/>
1003</Relationships>"#
1004 .to_string(),
1005 ),
1006 (
1007 "ppt/slides/slide1.xml",
1008 format!(
1009 r#"<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
1010<p:sld xmlns:a="http://schemas.openxmlformats.org/drawingml/2006/main"
1011 xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships"
1012 xmlns:p="http://schemas.openxmlformats.org/presentationml/2006/main">
1013 <p:cSld>
1014 <p:spTree>
1015 <p:nvGrpSpPr>
1016 <p:cNvPr id="1" name=""/>
1017 <p:cNvGrpSpPr/>
1018 <p:nvPr/>
1019 </p:nvGrpSpPr>
1020 <p:grpSpPr/>
1021 <p:sp>
1022 <p:nvSpPr>
1023 <p:cNvPr id="2" name="Title 1"/>
1024 <p:cNvSpPr/>
1025 <p:nvPr/>
1026 </p:nvSpPr>
1027 <p:txBody>
1028 <a:bodyPr/>
1029 <a:lstStyle/>
1030 <a:p><a:r><a:t>{text}</a:t></a:r></a:p>
1031 </p:txBody>
1032 </p:sp>
1033 </p:spTree>
1034 </p:cSld>
1035</p:sld>"#
1036 ),
1037 ),
1038 ],
1039 );
1040 }
1041
1042 fn write_xlsx(path: &StdPath) {
1043 write_zip(
1044 path,
1045 &[
1046 (
1047 "[Content_Types].xml",
1048 r#"<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
1049<Types xmlns="http://schemas.openxmlformats.org/package/2006/content-types">
1050 <Default Extension="rels" ContentType="application/vnd.openxmlformats-package.relationships+xml"/>
1051 <Default Extension="xml" ContentType="application/xml"/>
1052 <Override PartName="/xl/workbook.xml" ContentType="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet.main+xml"/>
1053 <Override PartName="/xl/worksheets/sheet1.xml" ContentType="application/vnd.openxmlformats-officedocument.spreadsheetml.worksheet+xml"/>
1054 <Override PartName="/xl/sharedStrings.xml" ContentType="application/vnd.openxmlformats-officedocument.spreadsheetml.sharedStrings+xml"/>
1055</Types>"#
1056 .to_string(),
1057 ),
1058 (
1059 "_rels/.rels",
1060 r#"<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
1061<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
1062 <Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument" Target="xl/workbook.xml"/>
1063</Relationships>"#
1064 .to_string(),
1065 ),
1066 (
1067 "xl/workbook.xml",
1068 r#"<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
1069<workbook xmlns="http://schemas.openxmlformats.org/spreadsheetml/2006/main"
1070 xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships">
1071 <sheets>
1072 <sheet name="Summary" sheetId="1" r:id="rId1"/>
1073 </sheets>
1074</workbook>"#
1075 .to_string(),
1076 ),
1077 (
1078 "xl/_rels/workbook.xml.rels",
1079 r#"<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
1080<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
1081 <Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/worksheet" Target="worksheets/sheet1.xml"/>
1082 <Relationship Id="rId2" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/sharedStrings" Target="sharedStrings.xml"/>
1083</Relationships>"#
1084 .to_string(),
1085 ),
1086 (
1087 "xl/sharedStrings.xml",
1088 r#"<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
1089<sst xmlns="http://schemas.openxmlformats.org/spreadsheetml/2006/main" count="4" uniqueCount="4">
1090 <si><t>Name</t></si>
1091 <si><t>Value</t></si>
1092 <si><t>Alice</t></si>
1093 <si><t>42</t></si>
1094</sst>"#
1095 .to_string(),
1096 ),
1097 (
1098 "xl/worksheets/sheet1.xml",
1099 r#"<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
1100<worksheet xmlns="http://schemas.openxmlformats.org/spreadsheetml/2006/main">
1101 <sheetData>
1102 <row r="1">
1103 <c r="A1" t="s"><v>0</v></c>
1104 <c r="B1" t="s"><v>1</v></c>
1105 </row>
1106 <row r="2">
1107 <c r="A2" t="s"><v>2</v></c>
1108 <c r="B2" t="s"><v>3</v></c>
1109 </row>
1110 </sheetData>
1111</worksheet>"#
1112 .to_string(),
1113 ),
1114 ],
1115 );
1116 }
1117
#[test]
/// A `.docx` file announced with the generic `application/octet-stream` MIME
/// type must still be classified as `AttachmentKind::OfficeDocument`.
fn attachment_kind_uses_extension_when_mime_is_generic() {
    // The temp-dir guard has to outlive every use of the fixture path.
    let scratch = tempdir().unwrap();
    let file = scratch.path().join("roadmap.docx");
    write_docx(&file, "Quarterly roadmap");
    let meta = attachment(&file, "roadmap.docx", "application/octet-stream");

    let detected = attachment_kind(&meta, file.as_path());

    assert_eq!(detected, AttachmentKind::OfficeDocument);
}
1130
#[test]
/// Text written into a DOCX fixture must come back from `read_attachment_text`.
///
/// The expected substrings are lowercase even though the fixture sentence is
/// capitalised, so the extracted text is expected in lowercase form.
fn read_attachment_text_extracts_docx_with_undoc() {
    const DOCX_MIME: &str =
        "application/vnd.openxmlformats-officedocument.wordprocessingml.document";

    let scratch = tempdir().unwrap();
    let file = scratch.path().join("roadmap.docx");
    write_docx(&file, "Quarterly roadmap for launch");
    let meta = attachment(&file, "roadmap.docx", DOCX_MIME);

    let text = read_attachment_text(&meta).unwrap();

    // Checked in the same order as before: the phrase first, then the last word.
    for needle in ["quarterly roadmap", "launch"] {
        assert!(text.contains(needle));
    }
}
1146
#[test]
/// Slide text written into a PPTX fixture must come back from
/// `read_attachment_text`; the expected substring is lowercase.
fn read_attachment_text_extracts_pptx_with_undoc() {
    const PPTX_MIME: &str =
        "application/vnd.openxmlformats-officedocument.presentationml.presentation";

    let scratch = tempdir().unwrap();
    let file = scratch.path().join("deck.pptx");
    write_pptx(&file, "Launch metrics");
    let meta = attachment(&file, "deck.pptx", PPTX_MIME);

    let text = read_attachment_text(&meta).unwrap();

    let has_slide_text = text.contains("launch metrics");
    assert!(has_slide_text);
}
1161
#[test]
/// An XLSX fixture must be rendered by `read_attachment_text` as a sheet
/// heading plus pipe-separated rows, all in lowercase.
fn read_attachment_text_extracts_xlsx_with_table_fallback() {
    const XLSX_MIME: &str =
        "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet";

    let scratch = tempdir().unwrap();
    let file = scratch.path().join("table.xlsx");
    write_xlsx(&file);
    let meta = attachment(&file, "table.xlsx", XLSX_MIME);

    let text = read_attachment_text(&meta).unwrap();

    // Sheet heading first, then the header row, then the single data row.
    let expected_fragments = ["sheet summary", "name | value", "alice | 42"];
    for fragment in expected_fragments {
        assert!(text.contains(fragment));
    }
}
1178}