Skip to main content

mxr_semantic/
lib.rs

1#[cfg(feature = "local")]
2use anyhow::Context;
3use anyhow::{anyhow, Result};
4#[cfg(feature = "local")]
5use calamine::{open_workbook_auto, Reader};
6#[cfg(feature = "local")]
7use fastembed::{EmbeddingModel, TextEmbedding, TextInitOptions};
8#[cfg(feature = "local")]
9use hnsw_rs::prelude::{DistCosine, Hnsw};
10use mxr_config::SemanticConfig;
11use mxr_core::id::MessageId;
12#[cfg(feature = "local")]
13use mxr_core::id::SemanticProfileId;
14#[cfg(feature = "local")]
15use mxr_core::types::{
16    AttachmentMeta, Envelope, MessageBody, SemanticChunkRecord, SemanticChunkSourceKind,
17    SemanticEmbeddingRecord, SemanticEmbeddingStatus, SemanticProfileStatus,
18};
19use mxr_core::types::{SearchMode, SemanticProfile, SemanticProfileRecord, SemanticStatusSnapshot};
20#[cfg(feature = "local")]
21use mxr_reader::{clean, ReaderConfig};
22use mxr_store::Store;
23#[cfg(feature = "local")]
24use sha2::{Digest, Sha256};
25#[cfg(feature = "local")]
26use std::collections::HashMap;
27use std::path::Path;
28#[cfg(feature = "local")]
29use std::path::{Path as StdPath, PathBuf};
30#[cfg(feature = "local")]
31use std::process::Command;
32use std::sync::Arc;
33
34#[cfg(feature = "local")]
/// Revision label stored as `model_revision` on newly created profile records.
const FASTEMBED_REVISION: &str = "fastembed-5.13.0";
#[cfg(feature = "local")]
/// Last page number passed to `pdftoppm` (`-l`) when rasterising a PDF for OCR.
const OCR_MAX_PAGES: usize = 5;
38
/// One message returned by [`SemanticEngine::search`], with its relevance score.
#[derive(Debug, Clone)]
pub struct SemanticHit {
    /// Identifier of the matching message.
    pub message_id: MessageId,
    /// Best `1.0 - distance` value among the message's matching chunks; higher is closer.
    pub score: f32,
}
44
45#[cfg(feature = "local")]
/// Payload kept for every point inserted into the in-memory index.
struct IndexedChunk {
    /// Message the indexed chunk belongs to.
    message_id: MessageId,
}
49
50#[cfg(feature = "local")]
/// In-memory nearest-neighbour index built for a single semantic profile.
struct SemanticIndex {
    /// HNSW graph over the stored embedding vectors, using cosine distance.
    hnsw: Hnsw<'static, f32, DistCosine>,
    /// Maps the point id handed to `hnsw` at insert time to the chunk it represents.
    chunks_by_id: HashMap<usize, IndexedChunk>,
}
55
/// Entry point for semantic indexing and search.
///
/// With the `local` feature the engine loads embedding models and keeps one in-memory
/// index per profile; without it only the store handle and configuration are kept.
pub struct SemanticEngine {
    /// Store used to read messages and persist profiles, chunks and embeddings.
    store: Arc<Store>,
    /// Directory where embedding model files are cached.
    #[cfg(feature = "local")]
    cache_dir: PathBuf,
    /// Current semantic configuration (enabled flag, active profile, download policy).
    config: SemanticConfig,
    /// Embedding models loaded so far, keyed by profile.
    #[cfg(feature = "local")]
    models: HashMap<SemanticProfile, TextEmbedding>,
    /// In-memory indexes built so far, keyed by profile.
    #[cfg(feature = "local")]
    indexes: HashMap<SemanticProfile, SemanticIndex>,
}
66
67#[cfg(feature = "local")]
68impl SemanticEngine {
69    pub fn new(store: Arc<Store>, data_dir: &Path, config: SemanticConfig) -> Self {
70        Self {
71            store,
72            cache_dir: data_dir.join("models"),
73            config,
74            models: HashMap::new(),
75            indexes: HashMap::new(),
76        }
77    }
78
    /// Replaces the engine's configuration with `config`.
    ///
    /// Only the stored configuration changes; loaded models and built indexes are untouched.
    pub fn apply_config(&mut self, config: SemanticConfig) {
        self.config = config;
    }
82
83    pub async fn status_snapshot(&self) -> Result<SemanticStatusSnapshot> {
84        Ok(SemanticStatusSnapshot {
85            enabled: self.config.enabled,
86            active_profile: self.config.active_profile,
87            profiles: self.store.list_semantic_profiles().await?,
88        })
89    }
90
91    pub async fn install_profile(
92        &mut self,
93        profile: SemanticProfile,
94    ) -> Result<SemanticProfileRecord> {
95        let dimensions = {
96            let model = self.ensure_model(profile, true)?;
97            let embeddings = model
98                .embed([prefixed_document(profile, "warmup document")], Some(1))
99                .context("embed warmup document")?;
100            embeddings
101                .first()
102                .map(|embedding| embedding.len() as u32)
103                .ok_or_else(|| anyhow!("embedding backend returned no vector"))?
104        };
105
106        let mut record = self
107            .store
108            .get_semantic_profile(profile)
109            .await?
110            .unwrap_or_else(|| default_profile_record(profile, dimensions));
111        record.dimensions = dimensions;
112        record.status = SemanticProfileStatus::Ready;
113        if record.installed_at.is_none() {
114            record.installed_at = Some(chrono::Utc::now());
115        }
116        record.last_error = None;
117        self.store.upsert_semantic_profile(&record).await?;
118        Ok(record)
119    }
120
121    pub async fn use_profile(&mut self, profile: SemanticProfile) -> Result<SemanticProfileRecord> {
122        self.install_profile(profile).await?;
123        let mut record = self.reindex_all_for_profile(profile).await?;
124        record.activated_at = Some(chrono::Utc::now());
125        self.store.upsert_semantic_profile(&record).await?;
126        Ok(record)
127    }
128
129    pub async fn reindex_active(&mut self) -> Result<SemanticProfileRecord> {
130        self.reindex_all_for_profile(self.config.active_profile)
131            .await
132    }
133
134    pub async fn reindex_messages(&mut self, message_ids: &[MessageId]) -> Result<()> {
135        if !self.config.enabled || message_ids.is_empty() {
136            return Ok(());
137        }
138
139        let profile = self.config.active_profile;
140        let record = self.install_profile(profile).await?;
141        let now = chrono::Utc::now();
142
143        for message_id in message_ids {
144            let Some(envelope) = self.store.get_envelope(message_id).await? else {
145                continue;
146            };
147            let body = self.store.get_body(message_id).await?;
148            let (chunks, embeddings) =
149                self.build_message_records(&record, &envelope, body.as_ref(), now)?;
150            self.store
151                .replace_semantic_message_data(&envelope.id, &record.id, &chunks, &embeddings)
152                .await?;
153        }
154
155        let mut ready_record = record;
156        ready_record.status = SemanticProfileStatus::Ready;
157        ready_record.last_indexed_at = Some(chrono::Utc::now());
158        ready_record.last_error = None;
159        self.store.upsert_semantic_profile(&ready_record).await?;
160        self.rebuild_index(profile).await?;
161        Ok(())
162    }
163
164    pub async fn search(&mut self, query: &str, limit: usize) -> Result<Vec<SemanticHit>> {
165        if !self.config.enabled {
166            return Ok(Vec::new());
167        }
168
169        let profile = self.config.active_profile;
170        self.install_profile(profile).await?;
171        if !self.indexes.contains_key(&profile) {
172            self.rebuild_index(profile).await?;
173        }
174
175        let query_text = prefixed_query(profile, query);
176        let query_embedding = self
177            .ensure_model(profile, self.config.auto_download_models)?
178            .embed([query_text], Some(1))
179            .context("embed query")?
180            .into_iter()
181            .next()
182            .ok_or_else(|| anyhow!("embedding backend returned no query vector"))?;
183
184        let Some(index) = self.indexes.get(&profile) else {
185            return Ok(Vec::new());
186        };
187        if index.chunks_by_id.is_empty() {
188            return Ok(Vec::new());
189        }
190
191        let candidate_limit = limit.max(1);
192        let ef = candidate_limit.max(64);
193        let neighbours = index.hnsw.search(&query_embedding, candidate_limit, ef);
194        let mut best_by_message: HashMap<MessageId, f32> = HashMap::new();
195
196        for neighbour in neighbours {
197            let Some(chunk) = index.chunks_by_id.get(&neighbour.d_id) else {
198                continue;
199            };
200            let similarity = 1.0 - neighbour.distance;
201            best_by_message
202                .entry(chunk.message_id.clone())
203                .and_modify(|score| {
204                    if similarity > *score {
205                        *score = similarity;
206                    }
207                })
208                .or_insert(similarity);
209        }
210
211        let mut hits = best_by_message
212            .into_iter()
213            .map(|(message_id, score)| SemanticHit { message_id, score })
214            .collect::<Vec<_>>();
215        hits.sort_by(|left, right| right.score.total_cmp(&left.score));
216        if hits.len() > limit {
217            hits.truncate(limit);
218        }
219        Ok(hits)
220    }
221
222    async fn reindex_all_for_profile(
223        &mut self,
224        profile: SemanticProfile,
225    ) -> Result<SemanticProfileRecord> {
226        let mut record = self.install_profile(profile).await?;
227        record.status = SemanticProfileStatus::Indexing;
228        record.progress_completed = 0;
229        record.progress_total = 0;
230        record.last_error = None;
231        self.store.upsert_semantic_profile(&record).await?;
232
233        let accounts = self.store.list_accounts().await?;
234        let mut envelopes = Vec::new();
235        for account in accounts {
236            envelopes.extend(
237                self.store
238                    .list_envelopes_by_account(&account.id, 10_000, 0)
239                    .await?,
240            );
241        }
242
243        record.progress_total = envelopes.len() as u32;
244        self.store.upsert_semantic_profile(&record).await?;
245        let now = chrono::Utc::now();
246
247        for envelope in &envelopes {
248            let body = self.store.get_body(&envelope.id).await?;
249            let (chunks, embeddings) =
250                self.build_message_records(&record, envelope, body.as_ref(), now)?;
251            self.store
252                .replace_semantic_message_data(&envelope.id, &record.id, &chunks, &embeddings)
253                .await?;
254            record.progress_completed += 1;
255        }
256
257        record.status = SemanticProfileStatus::Ready;
258        record.last_indexed_at = Some(chrono::Utc::now());
259        if record.activated_at.is_none() && self.config.active_profile == profile {
260            record.activated_at = Some(chrono::Utc::now());
261        }
262        self.store.upsert_semantic_profile(&record).await?;
263        self.rebuild_index(profile).await?;
264        Ok(record)
265    }
266
267    async fn rebuild_index(&mut self, profile: SemanticProfile) -> Result<()> {
268        let record = self
269            .store
270            .get_semantic_profile(profile)
271            .await?
272            .ok_or_else(|| anyhow!("semantic profile {} not installed", profile.as_str()))?;
273        let rows = self.store.list_semantic_embeddings(&record.id).await?;
274        let max_elements = rows.len().max(1);
275        let mut hnsw = Hnsw::<f32, DistCosine>::new(16, max_elements, 16, 200, DistCosine {});
276        let mut chunks_by_id = HashMap::with_capacity(rows.len());
277
278        for (point_id, (chunk, embedding)) in rows.into_iter().enumerate() {
279            let vector = blob_to_f32s(&embedding.vector);
280            if vector.is_empty() {
281                continue;
282            }
283            hnsw.insert((&vector, point_id));
284            chunks_by_id.insert(
285                point_id,
286                IndexedChunk {
287                    message_id: chunk.message_id,
288                },
289            );
290        }
291        hnsw.set_searching_mode(true);
292
293        self.indexes
294            .insert(profile, SemanticIndex { hnsw, chunks_by_id });
295        Ok(())
296    }
297
298    fn build_message_records(
299        &mut self,
300        profile: &SemanticProfileRecord,
301        envelope: &Envelope,
302        body: Option<&MessageBody>,
303        now: chrono::DateTime<chrono::Utc>,
304    ) -> Result<(Vec<SemanticChunkRecord>, Vec<SemanticEmbeddingRecord>)> {
305        let chunks = build_chunks(envelope, body);
306        if chunks.is_empty() {
307            return Ok((Vec::new(), Vec::new()));
308        }
309
310        let texts = chunks
311            .iter()
312            .map(|chunk| prefixed_document(profile.profile, &chunk.1))
313            .collect::<Vec<_>>();
314        let embeddings = self
315            .ensure_model(profile.profile, self.config.auto_download_models)?
316            .embed(texts, Some(32))
317            .context("embed message chunks")?;
318
319        let mut chunk_records = Vec::with_capacity(chunks.len());
320        let mut embedding_records = Vec::with_capacity(chunks.len());
321
322        for (index, ((source_kind, normalized), embedding)) in
323            chunks.into_iter().zip(embeddings.into_iter()).enumerate()
324        {
325            let chunk_id = semantic_chunk_id(&envelope.id.as_str(), &source_kind, index as u32);
326            let chunk_record = SemanticChunkRecord {
327                id: chunk_id.clone(),
328                message_id: envelope.id.clone(),
329                source_kind,
330                ordinal: index as u32,
331                normalized: normalized.clone(),
332                content_hash: content_hash(&normalized),
333                created_at: now,
334                updated_at: now,
335            };
336            let embedding_record = SemanticEmbeddingRecord {
337                chunk_id,
338                profile_id: profile.id.clone(),
339                dimensions: embedding.len() as u32,
340                vector: f32s_to_blob(&embedding),
341                status: SemanticEmbeddingStatus::Ready,
342                created_at: now,
343                updated_at: now,
344            };
345            chunk_records.push(chunk_record);
346            embedding_records.push(embedding_record);
347        }
348
349        Ok((chunk_records, embedding_records))
350    }
351
352    fn ensure_model(
353        &mut self,
354        profile: SemanticProfile,
355        allow_download: bool,
356    ) -> Result<&mut TextEmbedding> {
357        if !self.models.contains_key(&profile) {
358            if !allow_download {
359                return Err(anyhow!(
360                    "semantic profile {} is not installed locally",
361                    profile.as_str()
362                ));
363            }
364            std::fs::create_dir_all(&self.cache_dir)?;
365            let model = TextEmbedding::try_new(
366                TextInitOptions::new(embedding_model(profile))
367                    .with_cache_dir(self.cache_dir.clone())
368                    .with_show_download_progress(false),
369            )
370            .with_context(|| format!("load semantic profile {}", profile.as_str()))?;
371            self.models.insert(profile, model);
372        }
373
374        self.models
375            .get_mut(&profile)
376            .ok_or_else(|| anyhow!("semantic profile {} not loaded", profile.as_str()))
377    }
378}
379
#[cfg(not(feature = "local"))]
impl SemanticEngine {
    /// Creates an engine that only holds the store and configuration; `data_dir` is unused.
    pub fn new(store: Arc<Store>, data_dir: &Path, config: SemanticConfig) -> Self {
        let _ = data_dir;
        Self { store, config }
    }

    /// Replaces the engine's configuration with `config`.
    pub fn apply_config(&mut self, config: SemanticConfig) {
        self.config = config;
    }

    /// Reports the stored profile records and configured active profile, always with
    /// `enabled: false` in this build.
    ///
    /// # Errors
    /// Fails if the store cannot list the profile records.
    pub async fn status_snapshot(&self) -> Result<SemanticStatusSnapshot> {
        Ok(SemanticStatusSnapshot {
            enabled: false,
            active_profile: self.config.active_profile,
            profiles: self.store.list_semantic_profiles().await?,
        })
    }

    /// Always fails: profiles cannot be installed in this build.
    pub async fn install_profile(
        &mut self,
        _profile: SemanticProfile,
    ) -> Result<SemanticProfileRecord> {
        Err(semantic_unavailable_error())
    }

    /// Always fails: profiles cannot be activated in this build.
    pub async fn use_profile(
        &mut self,
        _profile: SemanticProfile,
    ) -> Result<SemanticProfileRecord> {
        Err(semantic_unavailable_error())
    }

    /// Always fails: there is nothing to reindex in this build.
    pub async fn reindex_active(&mut self) -> Result<SemanticProfileRecord> {
        Err(semantic_unavailable_error())
    }

    /// No-op that reports success, so callers can request reindexing unconditionally.
    pub async fn reindex_messages(&mut self, _message_ids: &[MessageId]) -> Result<()> {
        Ok(())
    }

    /// Always returns no hits in this build.
    pub async fn search(&mut self, _query: &str, _limit: usize) -> Result<Vec<SemanticHit>> {
        Ok(Vec::new())
    }
}
425
426#[cfg(feature = "local")]
/// Returns `true` for the `Hybrid` and `Semantic` search modes, `false` for any other mode.
pub fn should_use_semantic(mode: SearchMode) -> bool {
    matches!(mode, SearchMode::Hybrid | SearchMode::Semantic)
}
430
431#[cfg(not(feature = "local"))]
/// Always returns `false` in this build, whatever the search mode.
pub fn should_use_semantic(_mode: SearchMode) -> bool {
    false
}
435
436#[cfg(feature = "local")]
437fn default_profile_record(profile: SemanticProfile, dimensions: u32) -> SemanticProfileRecord {
438    SemanticProfileRecord {
439        id: semantic_profile_id(profile),
440        profile,
441        backend: "fastembed".to_string(),
442        model_revision: FASTEMBED_REVISION.to_string(),
443        dimensions,
444        status: SemanticProfileStatus::Pending,
445        installed_at: None,
446        activated_at: None,
447        last_indexed_at: None,
448        progress_completed: 0,
449        progress_total: 0,
450        last_error: None,
451    }
452}
453
454#[cfg(not(feature = "local"))]
/// Error returned by the engine operations that are not available in this build.
fn semantic_unavailable_error() -> anyhow::Error {
    anyhow!("semantic search unavailable in this binary")
}
458
459#[cfg(feature = "local")]
460fn semantic_profile_id(profile: SemanticProfile) -> SemanticProfileId {
461    SemanticProfileId::from_provider_id("semantic_profile", profile.as_str())
462}
463
464#[cfg(feature = "local")]
465fn semantic_chunk_id(
466    message_id: &str,
467    source_kind: &SemanticChunkSourceKind,
468    ordinal: u32,
469) -> mxr_core::SemanticChunkId {
470    mxr_core::SemanticChunkId::from_provider_id(
471        "semantic_chunk",
472        &format!("{message_id}:{source_kind:?}:{ordinal}"),
473    )
474}
475
476#[cfg(feature = "local")]
477fn build_chunks(
478    envelope: &Envelope,
479    body: Option<&MessageBody>,
480) -> Vec<(SemanticChunkSourceKind, String)> {
481    let mut chunks = Vec::new();
482
483    let header = normalize_text(&format!(
484        "subject {} from {} {} to {} snippet {}",
485        envelope.subject,
486        envelope.from.name.as_deref().unwrap_or(""),
487        envelope.from.email,
488        envelope
489            .to
490            .iter()
491            .map(|addr| addr.email.as_str())
492            .collect::<Vec<_>>()
493            .join(" "),
494        envelope.snippet
495    ));
496    if !header.is_empty() {
497        chunks.push((SemanticChunkSourceKind::Header, header));
498    }
499
500    if let Some(body) = body {
501        let reader_output = clean(
502            body.text_plain.as_deref(),
503            body.text_html.as_deref(),
504            &ReaderConfig::default(),
505        );
506        for chunk in chunk_text(&reader_output.content, 120, 30) {
507            chunks.push((SemanticChunkSourceKind::Body, chunk));
508        }
509
510        for attachment in &body.attachments {
511            let summary =
512                normalize_text(&format!("{} {}", attachment.filename, attachment.mime_type));
513            if !summary.is_empty() {
514                chunks.push((SemanticChunkSourceKind::AttachmentSummary, summary));
515            }
516
517            if let Some(text) = read_attachment_text(attachment) {
518                for chunk in chunk_text(&text, 120, 30) {
519                    chunks.push((SemanticChunkSourceKind::AttachmentText, chunk));
520                }
521            }
522        }
523    }
524
525    chunks
526}
527
528#[cfg(feature = "local")]
529fn read_attachment_text(attachment: &AttachmentMeta) -> Option<String> {
530    let path = attachment.local_path.as_ref()?;
531    match attachment_kind(attachment, path) {
532        AttachmentKind::Text => read_text_attachment(path, false),
533        AttachmentKind::Html => read_text_attachment(path, true),
534        AttachmentKind::Pdf => read_pdf_attachment(path),
535        AttachmentKind::OfficeDocument => read_office_attachment(path),
536        AttachmentKind::Spreadsheet => read_spreadsheet_attachment(attachment, path),
537        AttachmentKind::Image => run_tesseract(path),
538        AttachmentKind::Unknown => None,
539    }
540}
541
542#[cfg(feature = "local")]
543fn read_text_attachment(path: &StdPath, is_html: bool) -> Option<String> {
544    let content = std::fs::read_to_string(path).ok()?;
545    if is_html {
546        return normalized_nonempty(&clean(None, Some(&content), &ReaderConfig::default()).content);
547    }
548    normalized_nonempty(&content)
549}
550
551#[cfg(feature = "local")]
552fn read_office_attachment(path: &StdPath) -> Option<String> {
553    let markdown = undoc::to_markdown(path).ok()?;
554    normalized_nonempty(&markdown)
555}
556
557#[cfg(feature = "local")]
558fn read_spreadsheet_attachment(attachment: &AttachmentMeta, path: &StdPath) -> Option<String> {
559    let extension = attachment_extension(attachment, path);
560    let mime = attachment.mime_type.to_ascii_lowercase();
561    let undoc_text = should_try_undoc_spreadsheet(&mime, extension.as_deref())
562        .then(|| read_office_attachment(path))
563        .flatten();
564    let table_text = read_spreadsheet_tables(path);
565    combine_extracted_texts([undoc_text, table_text])
566}
567
568#[cfg(feature = "local")]
569fn read_spreadsheet_tables(path: &StdPath) -> Option<String> {
570    let mut workbook = open_workbook_auto(path).ok()?;
571    let mut sections = Vec::new();
572
573    for sheet_name in workbook.sheet_names().to_owned() {
574        let Ok(range) = workbook.worksheet_range(&sheet_name) else {
575            continue;
576        };
577
578        let mut rows = Vec::new();
579        for row in range.rows() {
580            let cells = row
581                .iter()
582                .map(ToString::to_string)
583                .map(|cell| normalize_text(&cell))
584                .filter(|cell| !cell.is_empty())
585                .collect::<Vec<_>>();
586            if !cells.is_empty() {
587                rows.push(cells.join(" | "));
588            }
589        }
590
591        if !rows.is_empty() {
592            sections.push(format!("sheet {sheet_name}\n{}", rows.join("\n")));
593        }
594    }
595
596    normalized_nonempty(&sections.join("\n\n"))
597}
598
599#[cfg(feature = "local")]
/// Returns `true` when the attachment is an `.xlsx` workbook, judged by its exact
/// (already lower-cased) MIME type or extension.
fn should_try_undoc_spreadsheet(mime: &str, extension: Option<&str>) -> bool {
    const XLSX_MIME: &str = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet";
    extension == Some("xlsx") || mime == XLSX_MIME
}
604
605#[cfg(feature = "local")]
/// Joins the extracted texts that are present, separated by blank lines, keeping each
/// piece of content once.
///
/// A part is skipped when it is empty or already contained in a kept part. When a part
/// contains earlier kept parts, those are replaced by it so the most complete text
/// survives. Returns `None` when nothing is left.
fn combine_extracted_texts<I>(parts: I) -> Option<String>
where
    I: IntoIterator<Item = Option<String>>,
{
    let mut combined: Vec<String> = Vec::new();
    for part in parts.into_iter().flatten() {
        // Every string contains "", so an empty part must never be kept or compared.
        if part.is_empty() {
            continue;
        }
        if combined
            .iter()
            .any(|existing| existing.contains(part.as_str()))
        {
            continue;
        }
        // `part` adds content; drop earlier parts it fully covers instead of dropping it.
        combined.retain(|existing| !part.contains(existing.as_str()));
        combined.push(part);
    }

    if combined.is_empty() {
        None
    } else {
        Some(combined.join("\n\n"))
    }
}
626
627#[cfg(feature = "local")]
628fn attachment_extension(attachment: &AttachmentMeta, path: &StdPath) -> Option<String> {
629    path.extension()
630        .and_then(|ext| ext.to_str())
631        .or_else(|| attachment.filename.rsplit('.').next())
632        .map(|ext| ext.trim().to_ascii_lowercase())
633        .filter(|ext| !ext.is_empty())
634}
635
636#[cfg(feature = "local")]
637fn normalized_nonempty(text: &str) -> Option<String> {
638    let normalized = normalize_text(text);
639    if normalized.is_empty() {
640        None
641    } else {
642        Some(normalized)
643    }
644}
645
646#[cfg(feature = "local")]
/// How an attachment's text is extracted, as decided from its MIME type and extension.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
enum AttachmentKind {
    /// Plain text; read as-is.
    Text,
    /// HTML; read and passed through the reader's clean step.
    Html,
    /// PDF; converted to markdown, with OCR as a fallback.
    Pdf,
    /// Word-processing or presentation document; converted to markdown.
    OfficeDocument,
    /// Spreadsheet; converted and/or dumped sheet by sheet.
    Spreadsheet,
    /// Image; text is recognised with `tesseract`.
    Image,
    /// Anything else; no text is extracted.
    Unknown,
}
657
658#[cfg(feature = "local")]
659fn attachment_kind(attachment: &AttachmentMeta, path: &StdPath) -> AttachmentKind {
660    let mime = attachment.mime_type.to_ascii_lowercase();
661    let extension = attachment_extension(attachment, path);
662    let extension = extension.as_deref();
663
664    if mime == "text/html" || matches!(extension, Some("html" | "htm")) {
665        return AttachmentKind::Html;
666    }
667
668    if mime.starts_with("text/")
669        || matches!(
670            mime.as_str(),
671            "application/json"
672                | "application/xml"
673                | "application/x-yaml"
674                | "application/yaml"
675                | "application/markdown"
676        )
677        || matches!(
678            extension,
679            Some("txt" | "md" | "markdown" | "json" | "xml" | "yaml" | "yml" | "csv" | "tsv")
680        )
681    {
682        return AttachmentKind::Text;
683    }
684
685    if mime == "application/pdf" || matches!(extension, Some("pdf")) {
686        return AttachmentKind::Pdf;
687    }
688
689    if matches!(
690        mime.as_str(),
691        "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
692            | "application/vnd.openxmlformats-officedocument.presentationml.presentation"
693    ) || matches!(extension, Some("docx" | "pptx"))
694    {
695        return AttachmentKind::OfficeDocument;
696    }
697
698    if matches!(
699        mime.as_str(),
700        "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
701            | "application/vnd.ms-excel"
702            | "application/vnd.ms-excel.sheet.binary.macroenabled.12"
703            | "application/vnd.ms-excel.sheet.macroenabled.12"
704            | "application/vnd.oasis.opendocument.spreadsheet"
705    ) || matches!(extension, Some("xlsx" | "xlsm" | "xlsb" | "xls" | "ods"))
706    {
707        return AttachmentKind::Spreadsheet;
708    }
709
710    if mime.starts_with("image/")
711        || matches!(
712            extension,
713            Some("png" | "jpg" | "jpeg" | "gif" | "webp" | "bmp" | "tif" | "tiff")
714        )
715    {
716        return AttachmentKind::Image;
717    }
718
719    AttachmentKind::Unknown
720}
721
722#[cfg(feature = "local")]
723fn read_pdf_attachment(path: &StdPath) -> Option<String> {
724    if let Some(extracted) = unpdf::to_markdown(path)
725        .ok()
726        .and_then(|markdown| normalized_nonempty(&markdown))
727    {
728        return Some(extracted);
729    }
730
731    ocr_pdf(path)
732}
733
734#[cfg(feature = "local")]
735fn ocr_pdf(path: &StdPath) -> Option<String> {
736    let pdftoppm = which::which("pdftoppm").ok()?;
737    let tempdir = tempfile::tempdir().ok()?;
738    let prefix = tempdir.path().join("page");
739    let status = Command::new(pdftoppm)
740        .arg("-f")
741        .arg("1")
742        .arg("-l")
743        .arg(OCR_MAX_PAGES.to_string())
744        .arg("-png")
745        .arg(path)
746        .arg(&prefix)
747        .status()
748        .ok()?;
749    if !status.success() {
750        return None;
751    }
752
753    let mut images = std::fs::read_dir(tempdir.path())
754        .ok()?
755        .flatten()
756        .map(|entry| entry.path())
757        .filter(|path| {
758            path.extension()
759                .and_then(|ext| ext.to_str())
760                .is_some_and(|ext| ext.eq_ignore_ascii_case("png"))
761        })
762        .collect::<Vec<_>>();
763    images.sort();
764
765    let mut output = String::new();
766    for image in images {
767        if let Some(text) = run_tesseract(&image) {
768            if !output.is_empty() {
769                output.push(' ');
770            }
771            output.push_str(&text);
772        }
773    }
774
775    let normalized = normalize_text(&output);
776    if normalized.is_empty() {
777        None
778    } else {
779        Some(normalized)
780    }
781}
782
783#[cfg(feature = "local")]
784fn run_tesseract(path: &StdPath) -> Option<String> {
785    let tesseract = which::which("tesseract").ok()?;
786    let output = Command::new(tesseract)
787        .arg(path)
788        .arg("stdout")
789        .output()
790        .ok()?;
791    if !output.status.success() {
792        return None;
793    }
794
795    let normalized = normalize_text(&String::from_utf8_lossy(&output.stdout));
796    if normalized.is_empty() {
797        None
798    } else {
799        Some(normalized)
800    }
801}
802
803#[cfg(feature = "local")]
/// Maps a semantic profile to the `fastembed` model that implements it.
fn embedding_model(profile: SemanticProfile) -> EmbeddingModel {
    match profile {
        SemanticProfile::BgeSmallEnV15 => EmbeddingModel::BGESmallENV15,
        SemanticProfile::MultilingualE5Small => EmbeddingModel::MultilingualE5Small,
        SemanticProfile::BgeM3 => EmbeddingModel::BGEM3,
    }
}
811
812#[cfg(feature = "local")]
813fn prefixed_query(profile: SemanticProfile, text: &str) -> String {
814    let normalized = normalize_text(text);
815    match profile {
816        SemanticProfile::MultilingualE5Small => format!("query: {normalized}"),
817        _ => normalized,
818    }
819}
820
821#[cfg(feature = "local")]
822fn prefixed_document(profile: SemanticProfile, text: &str) -> String {
823    let normalized = normalize_text(text);
824    match profile {
825        SemanticProfile::MultilingualE5Small => format!("passage: {normalized}"),
826        _ => normalized,
827    }
828}
829
830#[cfg(feature = "local")]
/// Collapses every run of whitespace to a single space, trims both ends, and
/// lower-cases the result.
fn normalize_text(text: &str) -> String {
    let mut collapsed = String::with_capacity(text.len());
    for word in text.split_whitespace() {
        if !collapsed.is_empty() {
            collapsed.push(' ');
        }
        collapsed.push_str(word);
    }
    // Lower-case the joined text in one pass, after the whitespace has been collapsed.
    collapsed.to_lowercase()
}
839
840#[cfg(feature = "local")]
841fn chunk_text(text: &str, window_words: usize, overlap_words: usize) -> Vec<String> {
842    let normalized = normalize_text(text);
843    if normalized.is_empty() {
844        return Vec::new();
845    }
846    let words = normalized.split_whitespace().collect::<Vec<_>>();
847    if words.len() <= window_words {
848        return vec![normalized];
849    }
850
851    let mut chunks = Vec::new();
852    let mut start = 0usize;
853    let step = window_words.saturating_sub(overlap_words).max(1);
854    while start < words.len() {
855        let end = (start + window_words).min(words.len());
856        let chunk = words[start..end].join(" ");
857        if !chunk.is_empty() {
858            chunks.push(chunk);
859        }
860        if end == words.len() {
861            break;
862        }
863        start += step;
864    }
865    chunks
866}
867
868#[cfg(feature = "local")]
869fn content_hash(normalized: &str) -> String {
870    let digest = Sha256::digest(normalized.as_bytes());
871    format!("{digest:x}")
872}
873
874#[cfg(feature = "local")]
/// Serialises floats as consecutive 4-byte little-endian values.
fn f32s_to_blob(values: &[f32]) -> Vec<u8> {
    values
        .iter()
        .flat_map(|value| value.to_le_bytes())
        .collect()
}
882
883#[cfg(feature = "local")]
/// Decodes consecutive 4-byte little-endian groups into floats.
///
/// Trailing bytes that do not fill a whole group are ignored.
fn blob_to_f32s(bytes: &[u8]) -> Vec<f32> {
    let mut values = Vec::with_capacity(bytes.len() / 4);
    for quad in bytes.chunks_exact(4) {
        let mut raw = [0u8; 4];
        raw.copy_from_slice(quad);
        values.push(f32::from_le_bytes(raw));
    }
    values
}
890
891#[cfg(all(test, feature = "local"))]
892mod tests {
893    use super::*;
894    use mxr_core::id::{AttachmentId, MessageId};
895    use std::fs::File;
896    use std::io::Write;
897    use tempfile::tempdir;
898    use zip::write::SimpleFileOptions;
899    use zip::ZipWriter;
900
    /// Builds attachment metadata pointing at the file at `path`.
    ///
    /// The size is read from the file on disk (panics if the file is missing); the ids
    /// are freshly generated and the provider id is a fixed placeholder.
    fn attachment(path: &StdPath, filename: &str, mime_type: &str) -> AttachmentMeta {
        AttachmentMeta {
            id: AttachmentId::new(),
            message_id: MessageId::new(),
            filename: filename.to_string(),
            mime_type: mime_type.to_string(),
            size_bytes: std::fs::metadata(path).unwrap().len(),
            local_path: Some(path.to_path_buf()),
            provider_id: "att-1".to_string(),
        }
    }
912
    /// Writes a zip archive at `path` containing the given `(name, contents)` entries,
    /// using default file options. Panics on any I/O or zip error.
    fn write_zip(path: &StdPath, files: &[(&str, String)]) {
        let file = File::create(path).unwrap();
        let mut zip = ZipWriter::new(file);
        let options = SimpleFileOptions::default();
        for (name, contents) in files {
            zip.start_file(name, options).unwrap();
            zip.write_all(contents.as_bytes()).unwrap();
        }
        zip.finish().unwrap();
    }
923
924    fn write_docx(path: &StdPath, text: &str) {
925        write_zip(
926            path,
927            &[
928                (
929                    "[Content_Types].xml",
930                    r#"<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
931<Types xmlns="http://schemas.openxmlformats.org/package/2006/content-types">
932  <Default Extension="rels" ContentType="application/vnd.openxmlformats-package.relationships+xml"/>
933  <Default Extension="xml" ContentType="application/xml"/>
934  <Override PartName="/word/document.xml" ContentType="application/vnd.openxmlformats-officedocument.wordprocessingml.document.main+xml"/>
935</Types>"#
936                        .to_string(),
937                ),
938                (
939                    "_rels/.rels",
940                    r#"<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
941<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
942  <Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument" Target="word/document.xml"/>
943</Relationships>"#
944                        .to_string(),
945                ),
946                (
947                    "word/document.xml",
948                    format!(
949                        r#"<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
950<w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
951  <w:body>
952    <w:p>
953      <w:r><w:t>{text}</w:t></w:r>
954    </w:p>
955  </w:body>
956</w:document>"#
957                    ),
958                ),
959            ],
960        );
961    }
962
963    fn write_pptx(path: &StdPath, text: &str) {
964        write_zip(
965            path,
966            &[
967                (
968                    "[Content_Types].xml",
969                    r#"<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
970<Types xmlns="http://schemas.openxmlformats.org/package/2006/content-types">
971  <Default Extension="rels" ContentType="application/vnd.openxmlformats-package.relationships+xml"/>
972  <Default Extension="xml" ContentType="application/xml"/>
973  <Override PartName="/ppt/presentation.xml" ContentType="application/vnd.openxmlformats-officedocument.presentationml.presentation.main+xml"/>
974  <Override PartName="/ppt/slides/slide1.xml" ContentType="application/vnd.openxmlformats-officedocument.presentationml.slide+xml"/>
975</Types>"#
976                        .to_string(),
977                ),
978                (
979                    "_rels/.rels",
980                    r#"<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
981<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
982  <Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument" Target="ppt/presentation.xml"/>
983</Relationships>"#
984                        .to_string(),
985                ),
986                (
987                    "ppt/presentation.xml",
988                    r#"<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
989<p:presentation xmlns:a="http://schemas.openxmlformats.org/drawingml/2006/main"
990    xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships"
991    xmlns:p="http://schemas.openxmlformats.org/presentationml/2006/main">
992  <p:sldIdLst>
993    <p:sldId id="256" r:id="rId1"/>
994  </p:sldIdLst>
995</p:presentation>"#
996                        .to_string(),
997                ),
998                (
999                    "ppt/_rels/presentation.xml.rels",
1000                    r#"<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
1001<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
1002  <Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/slide" Target="slides/slide1.xml"/>
1003</Relationships>"#
1004                        .to_string(),
1005                ),
1006                (
1007                    "ppt/slides/slide1.xml",
1008                    format!(
1009                        r#"<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
1010<p:sld xmlns:a="http://schemas.openxmlformats.org/drawingml/2006/main"
1011    xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships"
1012    xmlns:p="http://schemas.openxmlformats.org/presentationml/2006/main">
1013  <p:cSld>
1014    <p:spTree>
1015      <p:nvGrpSpPr>
1016        <p:cNvPr id="1" name=""/>
1017        <p:cNvGrpSpPr/>
1018        <p:nvPr/>
1019      </p:nvGrpSpPr>
1020      <p:grpSpPr/>
1021      <p:sp>
1022        <p:nvSpPr>
1023          <p:cNvPr id="2" name="Title 1"/>
1024          <p:cNvSpPr/>
1025          <p:nvPr/>
1026        </p:nvSpPr>
1027        <p:txBody>
1028          <a:bodyPr/>
1029          <a:lstStyle/>
1030          <a:p><a:r><a:t>{text}</a:t></a:r></a:p>
1031        </p:txBody>
1032      </p:sp>
1033    </p:spTree>
1034  </p:cSld>
1035</p:sld>"#
1036                    ),
1037                ),
1038            ],
1039        );
1040    }
1041
1042    fn write_xlsx(path: &StdPath) {
1043        write_zip(
1044            path,
1045            &[
1046                (
1047                    "[Content_Types].xml",
1048                    r#"<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
1049<Types xmlns="http://schemas.openxmlformats.org/package/2006/content-types">
1050  <Default Extension="rels" ContentType="application/vnd.openxmlformats-package.relationships+xml"/>
1051  <Default Extension="xml" ContentType="application/xml"/>
1052  <Override PartName="/xl/workbook.xml" ContentType="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet.main+xml"/>
1053  <Override PartName="/xl/worksheets/sheet1.xml" ContentType="application/vnd.openxmlformats-officedocument.spreadsheetml.worksheet+xml"/>
1054  <Override PartName="/xl/sharedStrings.xml" ContentType="application/vnd.openxmlformats-officedocument.spreadsheetml.sharedStrings+xml"/>
1055</Types>"#
1056                        .to_string(),
1057                ),
1058                (
1059                    "_rels/.rels",
1060                    r#"<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
1061<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
1062  <Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument" Target="xl/workbook.xml"/>
1063</Relationships>"#
1064                        .to_string(),
1065                ),
1066                (
1067                    "xl/workbook.xml",
1068                    r#"<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
1069<workbook xmlns="http://schemas.openxmlformats.org/spreadsheetml/2006/main"
1070    xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships">
1071  <sheets>
1072    <sheet name="Summary" sheetId="1" r:id="rId1"/>
1073  </sheets>
1074</workbook>"#
1075                        .to_string(),
1076                ),
1077                (
1078                    "xl/_rels/workbook.xml.rels",
1079                    r#"<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
1080<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
1081  <Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/worksheet" Target="worksheets/sheet1.xml"/>
1082  <Relationship Id="rId2" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/sharedStrings" Target="sharedStrings.xml"/>
1083</Relationships>"#
1084                        .to_string(),
1085                ),
1086                (
1087                    "xl/sharedStrings.xml",
1088                    r#"<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
1089<sst xmlns="http://schemas.openxmlformats.org/spreadsheetml/2006/main" count="4" uniqueCount="4">
1090  <si><t>Name</t></si>
1091  <si><t>Value</t></si>
1092  <si><t>Alice</t></si>
1093  <si><t>42</t></si>
1094</sst>"#
1095                        .to_string(),
1096                ),
1097                (
1098                    "xl/worksheets/sheet1.xml",
1099                    r#"<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
1100<worksheet xmlns="http://schemas.openxmlformats.org/spreadsheetml/2006/main">
1101  <sheetData>
1102    <row r="1">
1103      <c r="A1" t="s"><v>0</v></c>
1104      <c r="B1" t="s"><v>1</v></c>
1105    </row>
1106    <row r="2">
1107      <c r="A2" t="s"><v>2</v></c>
1108      <c r="B2" t="s"><v>3</v></c>
1109    </row>
1110  </sheetData>
1111</worksheet>"#
1112                        .to_string(),
1113                ),
1114            ],
1115        );
1116    }
1117
1118    #[test]
1119    fn attachment_kind_uses_extension_when_mime_is_generic() {
1120        let dir = tempdir().unwrap();
1121        let docx_path = dir.path().join("roadmap.docx");
1122        write_docx(&docx_path, "Quarterly roadmap");
1123        let attachment = attachment(&docx_path, "roadmap.docx", "application/octet-stream");
1124
1125        assert_eq!(
1126            attachment_kind(&attachment, docx_path.as_path()),
1127            AttachmentKind::OfficeDocument
1128        );
1129    }
1130
1131    #[test]
1132    fn read_attachment_text_extracts_docx_with_undoc() {
1133        let dir = tempdir().unwrap();
1134        let docx_path = dir.path().join("roadmap.docx");
1135        write_docx(&docx_path, "Quarterly roadmap for launch");
1136        let attachment = attachment(
1137            &docx_path,
1138            "roadmap.docx",
1139            "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
1140        );
1141
1142        let extracted = read_attachment_text(&attachment).unwrap();
1143        assert!(extracted.contains("quarterly roadmap"));
1144        assert!(extracted.contains("launch"));
1145    }
1146
1147    #[test]
1148    fn read_attachment_text_extracts_pptx_with_undoc() {
1149        let dir = tempdir().unwrap();
1150        let pptx_path = dir.path().join("deck.pptx");
1151        write_pptx(&pptx_path, "Launch metrics");
1152        let attachment = attachment(
1153            &pptx_path,
1154            "deck.pptx",
1155            "application/vnd.openxmlformats-officedocument.presentationml.presentation",
1156        );
1157
1158        let extracted = read_attachment_text(&attachment).unwrap();
1159        assert!(extracted.contains("launch metrics"));
1160    }
1161
1162    #[test]
1163    fn read_attachment_text_extracts_xlsx_with_table_fallback() {
1164        let dir = tempdir().unwrap();
1165        let xlsx_path = dir.path().join("table.xlsx");
1166        write_xlsx(&xlsx_path);
1167        let attachment = attachment(
1168            &xlsx_path,
1169            "table.xlsx",
1170            "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
1171        );
1172
1173        let extracted = read_attachment_text(&attachment).unwrap();
1174        assert!(extracted.contains("sheet summary"));
1175        assert!(extracted.contains("name | value"));
1176        assert!(extracted.contains("alice | 42"));
1177    }
1178}