Skip to main content

hematite/memory/
vein.rs

1use rusqlite::{params, Connection};
2use serde::Deserialize;
3use serde_json::Value;
4use std::collections::{HashMap, HashSet};
5use std::path::Path;
6
7/// "The Vein" — local RAG memory engine backed by SQLite FTS5 + semantic embeddings.
8///
9/// Two retrieval modes, used together:
10///
11/// **BM25 (always available)**
12/// Full-text search via SQLite FTS5 with Porter-stemming. Fast, zero extra GPU cost,
13/// works as the fallback when the embedding model isn't loaded.
14///
15/// **Semantic (when LM Studio has an embedding model loaded)**
16/// Calls `/v1/embeddings` (nomic-embed-text-v1.5 or similar) to produce 768-dim float
17/// vectors for each chunk. At search time the query is embedded and cosine similarity
18/// selects the most conceptually relevant chunks — even when no keywords match.
19///
20/// Hybrid search runs BM25 and semantic in parallel, deduplicates by path, and returns
21/// the top-k results ranked by combined score. Semantic results score higher when the
22/// embedding model is available; BM25 fills the gap when it isn't.
23///
24/// Indexing is incremental: files are re-indexed only when their mtime changes. Embedding
25/// vectors are stored in a separate `chunks_vec` SQLite table so they survive re-runs
26/// without hitting the embedding API again.
27pub struct Vein {
28    db: std::sync::Arc<std::sync::Mutex<Connection>>,
29    /// Base URL of the LLM provider, used for the embeddings endpoint.
30    base_url: String,
31}
32
33// SAFETY: rusqlite::Connection is !Send by default, but we wrap it in Arc<Mutex>
34// and ensure all accesses are serialized by the mutex.
35unsafe impl Send for Vein {}
36unsafe impl Sync for Vein {}
37
/// One retrieved chunk together with the metadata of the file it came from.
#[derive(Clone, Debug)]
pub struct SearchResult {
    /// Indexed path of the chunk's source.
    pub path: String,
    /// Text of the matching chunk.
    pub content: String,
    /// Combined relevance score (higher = more relevant).
    pub score: f32,
    /// Subsystem room derived from the file path (e.g. "agent", "ui", "tools").
    pub room: String,
    /// Last-modified timestamp from chunks_meta (unix seconds).
    pub last_modified: i64,
}
49
/// A file together with its edit "heat", as reported in inspection snapshots.
#[derive(Clone, Debug, PartialEq, Eq)]
pub struct VeinHotFile {
    /// Indexed path of the file.
    pub path: String,
    /// Heat value for the file (presumably the `file_heat.heat` counter — confirm
    /// against the code that builds this struct).
    pub heat: i64,
    /// Last-modified timestamp of the file (presumably unix seconds, as elsewhere
    /// in this module).
    pub last_modified: i64,
    /// Subsystem room the file belongs to.
    pub room: String,
}
57
58#[derive(Debug, Clone)]
59pub struct VeinInspectionSnapshot {
60    pub indexed_source_files: usize,
61    pub indexed_docs: usize,
62    pub indexed_session_exchanges: usize,
63    pub embedded_source_doc_chunks: usize,
64    pub has_any_embeddings: bool,
65    pub active_room: Option<String>,
66    pub hot_files: Vec<VeinHotFile>,
67    pub l1_ready: bool,
68}
69
70#[derive(Debug, Default)]
71struct QuerySignals {
72    exact_phrases: Vec<String>,
73    standout_terms: Vec<String>,
74    historical_memory_hint: bool,
75    temporal_reference: Option<TemporalReference>,
76}
77
/// A point in time referenced by a query, with a tolerance window around it.
#[derive(Clone, Copy, Debug)]
struct TemporalReference {
    /// Referenced timestamp (presumably unix seconds, like `last_modified` — confirm).
    target_ts: i64,
    /// Width of the acceptance window in seconds, per the field name.
    window_secs: i64,
}
83
84#[derive(Debug, Deserialize)]
85struct SessionReport {
86    #[serde(default)]
87    session_start: String,
88    #[serde(default)]
89    transcript: Vec<SessionTranscriptEntry>,
90}
91
92#[derive(Debug, Deserialize)]
93struct SessionTranscriptEntry {
94    #[serde(default)]
95    speaker: String,
96    #[serde(default)]
97    text: String,
98}
99
/// One indexable unit produced from a session report by `load_session_exchanges`.
#[derive(Debug)]
struct SessionExchange {
    /// Path under which the exchange is stored in the index.
    path: String,
    /// Timestamp passed to the indexer as the exchange's last-modified time.
    last_modified: i64,
    /// Text indexed as the exchange's single chunk.
    content: String,
}
106
/// Classification of a transcript speaker.
///
/// NOTE(review): the classifier is outside the visible part of the file; `Ignore`
/// presumably marks entries that are neither user nor assistant turns.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
enum SessionSpeakerKind {
    /// The human side of the conversation.
    User,
    /// The model side of the conversation.
    Assistant,
    /// Anything else.
    Ignore,
}
113
/// Derive a subsystem "room" label from a file path.
///
/// The path is lower-cased and `\` is normalised to `/` before matching. Each rule
/// below proposes a room with a fixed priority score, and the highest-scoring rule
/// that matches wins (for example `docs/config.md` is "docs" (80), not "config" (76)).
///
/// When no rule matches, the first directory component becomes the room. Paths
/// without a directory component (a bare file name such as `Makefile`), or whose
/// first component is empty or contains a `.`, map to `"root"`.
pub fn detect_room(path: &str) -> String {
    let lower = path.to_lowercase().replace('\\', "/");
    let filename = lower.rsplit('/').next().unwrap_or(&lower);
    // Only the text after the last '.' is an extension. A dot-less name such as
    // "md" or "iss" has no extension and must not be classified by its own name.
    let ext = filename.rsplit_once('.').map_or("", |(_, e)| e);

    let mut best_room = None::<&str>;
    let mut best_score = 0i32;
    let mut consider = |room: &'static str, score: i32| {
        if score > best_score {
            best_score = score;
            best_room = Some(room);
        }
    };

    // True when `segment` is a whole directory component of the path (or the whole
    // path). A trailing file name equal to `segment` deliberately does not count.
    let is_component = |segment: &str| {
        lower == segment
            || lower.starts_with(&format!("{segment}/"))
            || lower.contains(&format!("/{segment}/"))
    };

    if lower.starts_with("session/")
        || lower.starts_with(".hematite/reports/")
        || lower.starts_with(".hematite/imports/")
        || is_component("reports")
        || is_component("imports")
    {
        consider("session", 100);
    }

    if lower.starts_with(".hematite/docs/")
        || is_component("docs")
        || matches!(filename, "readme.md" | "claude.md" | ".hematite.md")
        || matches!(ext, "md" | "markdown" | "pdf" | "rst")
    {
        consider("docs", 80);
    }

    if is_component("tests")
        || filename.contains("diagnostic")
        || filename.ends_with("_test.rs")
        || filename.ends_with(".test.ts")
    {
        consider("tests", 85);
    }

    if lower.starts_with(".github/workflows/")
        || is_component("workflows")
        || filename == ".pre-commit-config.yaml"
        || filename == ".pre-commit-config.yml"
        || filename.contains("hook")
    {
        consider("automation", 78);
    }

    if lower.starts_with("installer/")
        || lower.starts_with("dist/")
        || lower.starts_with("scripts/package-")
        || filename.contains("release")
        || filename.contains("bump-version")
        || ext == "iss"
    {
        consider("release", 82);
    }

    if matches!(
        filename,
        "cargo.toml"
            | "cargo.lock"
            | "package.json"
            | "pnpm-lock.yaml"
            | "yarn.lock"
            | "bun.lock"
            | "bun.lockb"
            | "pyproject.toml"
            | "setup.py"
            | "go.mod"
            | "pom.xml"
            | "build.gradle"
            | "build.gradle.kts"
            | "cmakelists.txt"
            | ".gitignore"
            | "settings.json"
            | "mcp_servers.json"
    ) || filename.ends_with(".sln")
        || filename.ends_with(".csproj")
        || filename.contains("config")
    {
        consider("config", 76);
    }

    if is_component("ui")
        || matches!(
            filename,
            "tui.rs" | "voice.rs" | "hatch.rs" | "gpu_monitor.rs"
        )
    {
        consider("ui", 70);
    }

    if is_component("memory") || matches!(filename, "vein.rs" | "deep_reflect.rs") {
        consider("memory", 72);
    }

    if is_component("tools")
        || matches!(
            filename,
            "verify_build.rs"
                | "host_inspect.rs"
                | "shell.rs"
                | "code_sandbox.rs"
                | "project_map.rs"
                | "runtime_trace.rs"
        )
    {
        consider("tools", 68);
    }

    if filename.contains("mcp")
        || filename.contains("lsp")
        || lower.contains("/mcp/")
        || lower.contains("/lsp/")
    {
        consider("integration", 67);
    }

    if matches!(filename, "main.rs" | "runtime.rs" | "inference.rs")
        || filename.contains("startup")
        || filename.contains("runtime")
    {
        consider("runtime", 66);
    }

    if is_component("agent") {
        consider("agent", 60);
    }

    if lower.starts_with("libs/") || is_component("libs") {
        consider("libs", 58);
    }

    if lower.starts_with("scripts/") || is_component("scripts") {
        consider("scripts", 55);
    }

    if let Some(room) = best_room {
        return room.to_string();
    }

    // Fall back to the first directory component. A path without '/' is a bare file
    // name: it has no directory component, so it belongs to "root" even when the
    // name carries no extension.
    match lower.split_once('/') {
        Some((first, _)) if !first.is_empty() && !first.contains('.') => first.to_string(),
        _ => "root".to_string(),
    }
}
274
275impl Vein {
276    const SESSION_REPORT_LIMIT: usize = 5;
277    const SESSION_TURN_LIMIT: usize = 50;
278    const IMPORT_FILE_LIMIT: usize = 12;
279    const IMPORT_MAX_BYTES: u64 = 10 * 1024 * 1024;
280
281    pub fn new<P: AsRef<Path>>(
282        db_path: P,
283        base_url: String,
284    ) -> Result<Self, Box<dyn std::error::Error>> {
285        let db = Connection::open(db_path)?;
286
287        // WAL mode for better concurrent read performance.
288        db.execute_batch("PRAGMA journal_mode=WAL; PRAGMA synchronous=NORMAL;")?;
289
290        // chunks_meta: tracks last-modified time per path for incremental indexing.
291        // chunks_fts:  BM25 full-text index of all code chunks.
292        // chunks_vec:  semantic embedding vectors, keyed by (path, chunk_idx).
293        db.execute_batch(
294            "CREATE TABLE IF NOT EXISTS chunks_meta (
295                path TEXT PRIMARY KEY,
296                last_modified INTEGER NOT NULL,
297                room TEXT NOT NULL DEFAULT 'root'
298            );
299            CREATE VIRTUAL TABLE IF NOT EXISTS chunks_fts USING fts5(
300                path UNINDEXED,
301                content,
302                tokenize='porter ascii'
303            );
304            CREATE TABLE IF NOT EXISTS chunks_vec (
305                path TEXT NOT NULL,
306                chunk_idx INTEGER NOT NULL,
307                embedding BLOB NOT NULL,
308                PRIMARY KEY (path, chunk_idx)
309            );
310            CREATE TABLE IF NOT EXISTS file_heat (
311                path TEXT PRIMARY KEY,
312                heat INTEGER NOT NULL DEFAULT 0,
313                last_edit INTEGER NOT NULL DEFAULT 0
314            );",
315        )?;
316
317        // Schema migrations — safe to run on every open (IF NOT EXISTS / ignored if col exists).
318        let _ = db
319            .execute_batch("ALTER TABLE chunks_meta ADD COLUMN room TEXT NOT NULL DEFAULT 'root';");
320        let _ = db.execute_batch(
321            "ALTER TABLE file_heat ADD COLUMN last_edit INTEGER NOT NULL DEFAULT 0;",
322        );
323
324        Ok(Self {
325            db: std::sync::Arc::new(std::sync::Mutex::new(db)),
326            base_url,
327        })
328    }
329
330    // ── Indexing ──────────────────────────────────────────────────────────────
331
332    /// Index a single file for BM25 search. Skip if mtime hasn't changed.
333    /// Returns the chunks that were written (empty if file was unchanged).
334    pub fn index_document(
335        &mut self,
336        path: &str,
337        last_modified: i64,
338        full_text: &str,
339    ) -> Result<Vec<String>, Box<dyn std::error::Error>> {
340        let room = detect_room(path);
341        let ext = std::path::Path::new(path)
342            .extension()
343            .and_then(|e| e.to_str())
344            .unwrap_or("");
345        let chunks = chunk_by_symbols(ext, full_text);
346        self.index_chunks_with_room(path, last_modified, &room, &chunks)
347    }
348
349    fn index_chunks_with_room(
350        &mut self,
351        path: &str,
352        last_modified: i64,
353        room: &str,
354        chunks: &[String],
355    ) -> Result<Vec<String>, Box<dyn std::error::Error>> {
356        let db = self.db.lock().unwrap();
357        let existing: Option<i64> = db
358            .query_row(
359                "SELECT last_modified FROM chunks_meta WHERE path = ?1",
360                params![path],
361                |r| r.get(0),
362            )
363            .ok();
364
365        if let Some(ts) = existing {
366            if ts >= last_modified {
367                return Ok(Vec::new()); // unchanged — skip
368            }
369        }
370
371        // Evict stale BM25 chunks, stale embedding vectors, then update metadata.
372        db.execute("DELETE FROM chunks_fts WHERE path = ?1", params![path])?;
373        db.execute("DELETE FROM chunks_vec WHERE path = ?1", params![path])?;
374        db.execute(
375            "INSERT OR REPLACE INTO chunks_meta (path, last_modified, room) VALUES (?1, ?2, ?3)",
376            params![path, last_modified, room],
377        )?;
378
379        drop(db);
380
381        let mut db = self.db.lock().unwrap();
382        let tx = db.transaction()?;
383        {
384            let mut stmt = tx.prepare("INSERT INTO chunks_fts (path, content) VALUES (?1, ?2)")?;
385            for chunk in chunks {
386                stmt.execute(params![path, chunk.as_str()])?;
387            }
388        }
389        tx.commit()?;
390
391        Ok(chunks.to_vec())
392    }
393
394    /// Embed a set of chunks for one file and store the vectors.
395    /// Called after `index_document` returns new chunks.
396    /// Silently skips if the embedding model is unavailable.
397    pub fn embed_and_store_chunks(&self, path: &str, chunks: &[String]) {
398        for (idx, chunk) in chunks.iter().enumerate() {
399            if let Some(vec) = embed_text_blocking(chunk, &self.base_url) {
400                let blob = floats_to_blob(&vec);
401                let db = self.db.lock().unwrap();
402                let _ = db.execute(
403                    "INSERT OR REPLACE INTO chunks_vec (path, chunk_idx, embedding) VALUES (?1, ?2, ?3)",
404                    params![path, idx as i64, blob],
405                );
406            }
407        }
408    }
409
410    // ── Search ────────────────────────────────────────────────────────────────
411
412    /// BM25-ranked full-text search via FTS5 MATCH.
413    pub fn search_bm25(
414        &self,
415        query: &str,
416        limit: usize,
417    ) -> Result<Vec<SearchResult>, Box<dyn std::error::Error>> {
418        // Strip common English stopwords so FTS5 MATCH gets meaningful tokens only.
419        // FTS5 uses implicit AND by default — passing stopwords like "how", "does",
420        // "the" causes zero results because source code never contains those phrases.
421        const STOPWORDS: &[&str] = &[
422            "how", "does", "do", "did", "what", "where", "when", "why", "which", "who", "is",
423            "are", "was", "were", "be", "been", "being", "have", "has", "had", "a", "an", "the",
424            "and", "or", "but", "in", "on", "at", "to", "for", "of", "with", "by", "from", "get",
425            "gets", "got", "work", "works", "make", "makes", "use", "uses", "into", "that", "this",
426            "it", "its",
427        ];
428
429        let safe_query: String = query
430            .chars()
431            .map(|c| {
432                if c.is_alphanumeric() || c == ' ' || c == '_' {
433                    c
434                } else {
435                    ' '
436                }
437            })
438            .collect();
439
440        // Build an OR query from non-stopword tokens so any relevant term matches.
441        let fts_query = safe_query
442            .split_whitespace()
443            .filter(|w| w.len() >= 3 && !STOPWORDS.contains(&w.to_lowercase().as_str()))
444            .collect::<Vec<_>>()
445            .join(" OR ");
446
447        if fts_query.is_empty() {
448            return Ok(Vec::new());
449        }
450
451        let db = self.db.lock().unwrap();
452        let mut stmt = db.prepare(
453            "SELECT chunks_fts.path, chunks_fts.content, rank, cm.last_modified, cm.room
454             FROM chunks_fts
455             JOIN chunks_meta cm ON cm.path = chunks_fts.path
456             WHERE chunks_fts MATCH ?1
457             ORDER BY rank
458             LIMIT ?2",
459        )?;
460
461        let results: Vec<SearchResult> = stmt
462            .query_map(params![fts_query, limit as i64], |row| {
463                Ok(SearchResult {
464                    path: row.get(0)?,
465                    content: row.get(1)?,
466                    score: -(row.get::<_, f64>(2).unwrap_or(0.0) as f32),
467                    last_modified: row.get(3)?,
468                    room: row.get(4)?,
469                })
470            })?
471            .filter_map(|r| r.ok())
472            .collect();
473
474        Ok(results)
475    }
476
477    /// Semantic search: embed the query, cosine-similarity against all stored vectors.
478    /// Returns empty if the embedding model isn't loaded.
479    pub fn search_semantic(&self, query: &str, limit: usize) -> Vec<SearchResult> {
480        let query_vec = match embed_query_blocking(query, &self.base_url) {
481            Some(v) => v,
482            None => return Vec::new(),
483        };
484
485        // Load all stored embeddings.
486        let rows: Vec<(String, i64, Vec<u8>, i64, String)> = {
487            let db = self.db.lock().unwrap();
488            let mut stmt = match db.prepare(
489                "SELECT cv.path, cv.chunk_idx, cv.embedding, cm.last_modified, cm.room
490                 FROM chunks_vec cv
491                 JOIN chunks_meta cm ON cm.path = cv.path",
492            ) {
493                Ok(s) => s,
494                Err(_) => return Vec::new(),
495            };
496            stmt.query_map([], |row| {
497                Ok((
498                    row.get::<_, String>(0)?,
499                    row.get::<_, i64>(1)?,
500                    row.get::<_, Vec<u8>>(2)?,
501                    row.get::<_, i64>(3)?,
502                    row.get::<_, String>(4)?,
503                ))
504            })
505            .ok()
506            .map(|rows| rows.filter_map(|r| r.ok()).collect())
507            .unwrap_or_default()
508        };
509
510        if rows.is_empty() {
511            return Vec::new();
512        }
513
514        // Score each chunk.
515        let mut scored: Vec<(f32, String, i64, i64, String)> = rows
516            .into_iter()
517            .filter_map(|(path, idx, blob, last_modified, room)| {
518                let vec = blob_to_floats(&blob);
519                let sim = cosine_similarity(&query_vec, &vec);
520                Some((sim, path, idx, last_modified, room))
521            })
522            .collect();
523
524        scored.sort_by(|a, b| b.0.partial_cmp(&a.0).unwrap_or(std::cmp::Ordering::Equal));
525        scored.truncate(limit);
526
527        // Fetch the content for the top chunks.
528        let db = self.db.lock().unwrap();
529        scored
530            .into_iter()
531            .filter_map(|(score, path, idx, last_modified, room)| {
532                // chunks_fts rows for this path are indexed in insertion order = chunk order.
533                // We use LIMIT/OFFSET to fetch the chunk at position `idx`.
534                let content: Option<String> = db
535                    .query_row(
536                        "SELECT content FROM chunks_fts WHERE path = ?1 LIMIT 1 OFFSET ?2",
537                        params![path, idx],
538                        |r| r.get(0),
539                    )
540                    .ok();
541                content.map(|c| SearchResult {
542                    path,
543                    content: c,
544                    score,
545                    room,
546                    last_modified,
547                })
548            })
549            .collect()
550    }
551
552    /// Hybrid search: BM25 + semantic, deduplicated and re-ranked.
553    ///
554    /// Semantic results are preferred (they score higher) when the embedding model
555    /// is available. BM25 fills in or takes over when it isn't.
556    /// Results from the active room (hottest subsystem by edit count) get a
557    /// small boost so the model gravitates toward what's currently being worked on.
558    pub fn search_context(
559        &self,
560        query: &str,
561        limit: usize,
562    ) -> Result<Vec<SearchResult>, Box<dyn std::error::Error>> {
563        let candidate_limit = (limit.max(1) * 4).max(12);
564        let bm25 = self.search_bm25(query, candidate_limit).unwrap_or_default();
565        let semantic = self.search_semantic(query, candidate_limit);
566        let signals = QuerySignals::from_query(query);
567
568        // Determine the active room from heat scores.
569        let active_room = self.active_room();
570
571        // Merge: semantic results win ties (scored 1.0–2.0 range after boost).
572        // BM25 results land in 0.0–1.0 range.
573        let mut merged_by_path: HashMap<String, SearchResult> = HashMap::new();
574
575        for r in semantic {
576            let score = reranked_score(&signals, active_room.as_deref(), &r, true);
577            merge_scored_result(&mut merged_by_path, SearchResult { score, ..r });
578        }
579
580        for r in bm25 {
581            let score = reranked_score(&signals, active_room.as_deref(), &r, false);
582            merge_scored_result(&mut merged_by_path, SearchResult { score, ..r });
583        }
584
585        let mut merged: Vec<SearchResult> = merged_by_path.into_values().collect();
586        merged.sort_by(|a, b| {
587            b.score
588                .partial_cmp(&a.score)
589                .unwrap_or(std::cmp::Ordering::Equal)
590        });
591        merged.truncate(limit);
592        Ok(merged)
593    }
594
595    /// Returns the room with the highest total heat (most edited subsystem).
596    /// Used to bias retrieval toward what the user is actively working on.
597    fn active_room(&self) -> Option<String> {
598        let db = self.db.lock().unwrap();
599        db.query_row(
600            "SELECT cm.room, SUM(fh.heat) as total
601             FROM file_heat fh
602             JOIN chunks_meta cm ON cm.path = fh.path
603             GROUP BY cm.room
604             ORDER BY total DESC
605             LIMIT 1",
606            [],
607            |row| row.get::<_, String>(0),
608        )
609        .ok()
610    }
611
612    // ── Project Indexing ──────────────────────────────────────────────────────
613
614    /// Walk the entire project and index all source files (BM25 + embeddings).
615    ///
616    /// Skips: `target/`, `.git/`, `node_modules/`, `.hematite/`, files > 512 KB.
617    /// Also indexes `.hematite/docs/` — the designated reference document drop folder.
618    /// Returns the number of files processed (unchanged files are fast-pathed).
619    pub fn index_project(&mut self) -> usize {
620        let root = crate::tools::file_ops::workspace_root();
621        let mut count = 0usize;
622
623        const INDEXABLE: &[&str] = &[
624            "rs", "toml", "md", "json", "ts", "tsx", "js", "py", "go", "c", "cpp", "h", "yaml",
625            "yml", "txt",
626        ];
627        const SKIP_DIRS: &[&str] = &["target", ".git", "node_modules", ".hematite"];
628
629        for entry in walkdir::WalkDir::new(&root)
630            .follow_links(false)
631            .into_iter()
632            .filter_entry(|e| {
633                if e.file_type().is_dir() {
634                    let name = e.file_name().to_string_lossy();
635                    return !SKIP_DIRS.contains(&name.as_ref());
636                }
637                true
638            })
639            .filter_map(|e| e.ok())
640            .filter(|e| e.file_type().is_file())
641        {
642            let path = entry.path();
643            let ext = path.extension().and_then(|e| e.to_str()).unwrap_or("");
644            if !INDEXABLE.contains(&ext) {
645                continue;
646            }
647
648            let Ok(meta) = std::fs::metadata(path) else {
649                continue;
650            };
651            if meta.len() > 512_000 {
652                continue;
653            }
654
655            let mtime = meta
656                .modified()
657                .map(|t| {
658                    t.duration_since(std::time::UNIX_EPOCH)
659                        .unwrap_or_default()
660                        .as_secs() as i64
661                })
662                .unwrap_or(0);
663
664            let rel = path.strip_prefix(&root).unwrap_or(path);
665            let rel_str = rel.to_string_lossy().replace('\\', "/");
666
667            if let Ok(content) = std::fs::read_to_string(path) {
668                match self.index_document(&rel_str, mtime, &content) {
669                    Ok(new_chunks) if !new_chunks.is_empty() => {
670                        count += 1;
671                    }
672                    Ok(_) => {}
673                    Err(_) => {}
674                }
675            }
676        }
677
678        count += self.index_workspace_artifacts(&root);
679
680        count
681    }
682
683    /// Index workspace-local supporting context that should be available even
684    /// outside a real project workspace: `.hematite/docs/`, recent session
685    /// reports stored in `.hematite/reports/`, and imported chat exports in
686    /// `.hematite/imports/`.
687    pub fn index_workspace_artifacts(&mut self, workspace_root: &std::path::Path) -> usize {
688        let mut count = self.index_docs_folder(workspace_root);
689        count += self.index_recent_session_reports(workspace_root);
690        count += self.index_imported_session_exports(workspace_root);
691        self.backfill_missing_embeddings();
692        count
693    }
694
695    /// Index reference documents in `.hematite/docs/`.
696    /// Supports PDF (text extraction), markdown, and plain text.
697    /// Documents are stored with path prefix `docs/filename` so they are
698    /// distinguishable from source files in retrieval results.
699    fn index_docs_folder(&mut self, workspace_root: &std::path::Path) -> usize {
700        let docs_dir = workspace_root.join(".hematite").join("docs");
701        const DOCS_INDEXABLE: &[&str] = &["pdf", "md", "txt", "markdown"];
702        let mut count = 0usize;
703        let mut desired_paths = HashSet::new();
704
705        if docs_dir.exists() {
706            for entry in walkdir::WalkDir::new(&docs_dir)
707                .max_depth(3)
708                .follow_links(false)
709                .into_iter()
710                .filter_map(|e| e.ok())
711                .filter(|e| e.file_type().is_file())
712            {
713                let path = entry.path();
714                let ext = path
715                    .extension()
716                    .and_then(|e| e.to_str())
717                    .unwrap_or("")
718                    .to_lowercase();
719                if !DOCS_INDEXABLE.contains(&ext.as_str()) {
720                    continue;
721                }
722
723                let Ok(meta) = std::fs::metadata(path) else {
724                    continue;
725                };
726                if meta.len() > 50_000_000 {
727                    continue;
728                }
729
730                let mtime = meta
731                    .modified()
732                    .map(|t| {
733                        t.duration_since(std::time::UNIX_EPOCH)
734                            .unwrap_or_default()
735                            .as_secs() as i64
736                    })
737                    .unwrap_or(0);
738
739                let rel = path.strip_prefix(workspace_root).unwrap_or(path);
740                let rel_str = rel.to_string_lossy().replace('\\', "/");
741                desired_paths.insert(rel_str.clone());
742
743                let content = if ext == "pdf" {
744                    extract_pdf_text(path).ok().flatten()
745                } else {
746                    std::fs::read_to_string(path).ok()
747                };
748
749                if let Some(text) = content {
750                    if text.trim().is_empty() {
751                        continue;
752                    }
753                    match self.index_document(&rel_str, mtime, &text) {
754                        Ok(new_chunks) if !new_chunks.is_empty() => {
755                            count += 1;
756                        }
757                        Ok(_) => {}
758                        Err(_) => {}
759                    }
760                }
761            }
762        }
763
764        self.prune_indexed_prefix(".hematite/docs/", &desired_paths);
765        count
766    }
767
768    /// Index the most recent local session reports by exchange pair so prior
769    /// decisions remain searchable across launches without flooding the vein.
770    pub fn index_recent_session_reports(&mut self, workspace_root: &std::path::Path) -> usize {
771        let reports_dir = workspace_root.join(".hematite").join("reports");
772        let mut count = 0usize;
773        let mut desired_paths = HashSet::new();
774
775        if reports_dir.exists() {
776            let mut reports: Vec<std::path::PathBuf> = std::fs::read_dir(&reports_dir)
777                .ok()
778                .into_iter()
779                .flat_map(|entries| entries.filter_map(|entry| entry.ok()))
780                .map(|entry| entry.path())
781                .filter(|path| {
782                    path.is_file()
783                        && path.extension().and_then(|ext| ext.to_str()) == Some("json")
784                        && path
785                            .file_stem()
786                            .and_then(|stem| stem.to_str())
787                            .map(|stem| stem.starts_with("session_"))
788                            .unwrap_or(false)
789                })
790                .collect();
791
792            reports.sort_by(|a, b| {
793                let a_name = a
794                    .file_name()
795                    .and_then(|name| name.to_str())
796                    .unwrap_or_default();
797                let b_name = b
798                    .file_name()
799                    .and_then(|name| name.to_str())
800                    .unwrap_or_default();
801                b_name.cmp(a_name)
802            });
803            reports.truncate(Self::SESSION_REPORT_LIMIT);
804
805            for report_path in reports {
806                let Ok(meta) = std::fs::metadata(&report_path) else {
807                    continue;
808                };
809                let mtime = meta
810                    .modified()
811                    .map(|t| {
812                        t.duration_since(std::time::UNIX_EPOCH)
813                            .unwrap_or_default()
814                            .as_secs() as i64
815                    })
816                    .unwrap_or(0);
817
818                for exchange in load_session_exchanges(&report_path, mtime) {
819                    desired_paths.insert(exchange.path.clone());
820                    match self.index_chunks_with_room(
821                        &exchange.path,
822                        exchange.last_modified,
823                        "session",
824                        std::slice::from_ref(&exchange.content),
825                    ) {
826                        Ok(new_chunks) if !new_chunks.is_empty() => {
827                            count += 1;
828                        }
829                        Ok(_) => {}
830                        Err(_) => {}
831                    }
832                }
833            }
834        }
835
836        self.prune_indexed_prefix("session/", &desired_paths);
837        count
838    }
839
840    /// Index imported chat exports from `.hematite/imports/`.
841    /// Supported inputs include already-normalized `>` transcripts, Claude Code
842    /// JSONL, Codex CLI JSONL, simple role/content JSON exports, ChatGPT
843    /// `mapping` exports, and Hematite session-report JSON.
844    pub fn index_imported_session_exports(&mut self, workspace_root: &std::path::Path) -> usize {
845        let imports_dir = workspace_root.join(".hematite").join("imports");
846        let mut count = 0usize;
847        let mut desired_paths = HashSet::new();
848
849        if imports_dir.exists() {
850            let mut imports: Vec<(std::path::PathBuf, i64)> = walkdir::WalkDir::new(&imports_dir)
851                .max_depth(4)
852                .follow_links(false)
853                .into_iter()
854                .filter_map(|entry| entry.ok())
855                .filter(|entry| entry.file_type().is_file())
856                .filter_map(|entry| {
857                    let path = entry.into_path();
858                    let ext = path
859                        .extension()
860                        .and_then(|ext| ext.to_str())
861                        .unwrap_or("")
862                        .to_ascii_lowercase();
863                    if !matches!(ext.as_str(), "json" | "jsonl" | "md" | "txt") {
864                        return None;
865                    }
866                    let meta = std::fs::metadata(&path).ok()?;
867                    if meta.len() > Self::IMPORT_MAX_BYTES {
868                        return None;
869                    }
870                    let mtime = meta
871                        .modified()
872                        .map(|t| {
873                            t.duration_since(std::time::UNIX_EPOCH)
874                                .unwrap_or_default()
875                                .as_secs() as i64
876                        })
877                        .unwrap_or(0);
878                    Some((path, mtime))
879                })
880                .collect();
881
882            imports.sort_by(|(a_path, a_mtime), (b_path, b_mtime)| {
883                b_mtime
884                    .cmp(a_mtime)
885                    .then_with(|| a_path.to_string_lossy().cmp(&b_path.to_string_lossy()))
886            });
887            imports.truncate(Self::IMPORT_FILE_LIMIT);
888
889            for (import_path, mtime) in imports {
890                for exchange in load_imported_session_exchanges(&import_path, &imports_dir, mtime) {
891                    desired_paths.insert(exchange.path.clone());
892                    match self.index_chunks_with_room(
893                        &exchange.path,
894                        exchange.last_modified,
895                        "session",
896                        std::slice::from_ref(&exchange.content),
897                    ) {
898                        Ok(new_chunks) if !new_chunks.is_empty() => {
899                            count += 1;
900                        }
901                        Ok(_) => {}
902                        Err(_) => {}
903                    }
904                }
905            }
906        }
907
908        self.prune_indexed_prefix("session/imports/", &desired_paths);
909        count
910    }
911
912    /// Embed any FTS chunks that don't yet have a vector in chunks_vec.
913    /// Called at the end of index_project so that loading the embedding model
914    /// after the initial index automatically triggers a semantic upgrade on the
915    /// next agent turn — no /forget or file-touch required.
916    fn backfill_missing_embeddings(&self) {
917        // Fast path: if chunk counts match, nothing to do.
918        let (fts_count, vec_count) = {
919            let db = self.db.lock().unwrap();
920            let fts: i64 = db
921                .query_row("SELECT COUNT(*) FROM chunks_fts", [], |r| r.get(0))
922                .unwrap_or(0);
923            let vec: i64 = db
924                .query_row("SELECT COUNT(*) FROM chunks_vec", [], |r| r.get(0))
925                .unwrap_or(0);
926            (fts, vec)
927        };
928        if fts_count == 0 || fts_count == vec_count {
929            return;
930        }
931
932        // Fetch (path, chunk_idx, content) for chunks with no embedding.
933        // chunks_fts rowid serves as chunk_idx (1-based → convert to 0-based).
934        let missing: Vec<(String, i64, String)> = {
935            let db = self.db.lock().unwrap();
936            let mut stmt = db
937                .prepare(
938                    "SELECT f.path, (f.rowid - 1) AS chunk_idx, f.content
939                     FROM chunks_fts f
940                     LEFT JOIN chunks_vec v ON f.path = v.path AND (f.rowid - 1) = v.chunk_idx
941                     WHERE v.path IS NULL
942                     ORDER BY CASE
943                         WHEN f.path LIKE '%.rs' THEN 0
944                         WHEN f.path LIKE '%.toml' THEN 1
945                         WHEN f.path LIKE '%.json' THEN 2
946                         ELSE 3
947                     END, f.path
948                     LIMIT 20",
949                )
950                .unwrap();
951            stmt.query_map([], |r| {
952                Ok((
953                    r.get::<_, String>(0)?,
954                    r.get::<_, i64>(1)?,
955                    r.get::<_, String>(2)?,
956                ))
957            })
958            .unwrap()
959            .filter_map(|r| r.ok())
960            .collect()
961        };
962
963        for (path, idx, content) in missing {
964            if let Some(vec) = embed_text_blocking(&content, &self.base_url) {
965                let blob = floats_to_blob(&vec);
966                let db = self.db.lock().unwrap();
967                let _ = db.execute(
968                    "INSERT OR REPLACE INTO chunks_vec (path, chunk_idx, embedding) VALUES (?1, ?2, ?3)",
969                    params![path, idx, blob],
970                );
971            } else {
972                // Embedding model not available — stop trying for this pass.
973                break;
974            }
975        }
976    }
977
978    /// Total number of unique files currently indexed.
979    /// Session exchange chunks are excluded so status counts stay source/doc centric.
980    pub fn file_count(&self) -> usize {
981        let db = self.db.lock().unwrap();
982        db.query_row(
983            "SELECT COUNT(*) FROM chunks_meta WHERE path NOT LIKE 'session/%'",
984            [],
985            |r| r.get::<_, i64>(0),
986        )
987        .unwrap_or(0) as usize
988    }
989
990    /// Number of source/doc chunks that have semantic embedding vectors stored.
991    /// Session exchange chunks are excluded so status counts stay source/doc centric.
992    pub fn embedded_chunk_count(&self) -> usize {
993        let db = self.db.lock().unwrap();
994        db.query_row(
995            "SELECT COUNT(*) FROM chunks_vec WHERE path NOT LIKE 'session/%'",
996            [],
997            |r| r.get::<_, i64>(0),
998        )
999        .unwrap_or(0) as usize
1000    }
1001
1002    /// True when any chunk type currently has embeddings available.
1003    pub fn has_any_embeddings(&self) -> bool {
1004        let db = self.db.lock().unwrap();
1005        db.query_row("SELECT EXISTS(SELECT 1 FROM chunks_vec LIMIT 1)", [], |r| {
1006            r.get::<_, i64>(0)
1007        })
1008        .unwrap_or(0)
1009            != 0
1010    }
1011
1012    /// Wipe all indexed data. The DB file stays on disk; next index_project()
1013    /// call rebuilds from scratch (re-reads all files, re-embeds all chunks).
1014    pub fn reset(&self) {
1015        let db = self.db.lock().unwrap();
1016        let _ = db.execute_batch(
1017            "DELETE FROM chunks_fts;
1018             DELETE FROM chunks_vec;
1019             DELETE FROM chunks_meta;",
1020        );
1021    }
1022
1023    /// Return a compact operator-facing snapshot of what The Vein currently knows.
1024    /// Intended for trust/debug surfaces like `/vein-inspect`.
1025    pub fn inspect_snapshot(&self, hot_limit: usize) -> VeinInspectionSnapshot {
1026        let db = self.db.lock().unwrap();
1027        let indexed_source_files = db
1028            .query_row(
1029                "SELECT COUNT(*) FROM chunks_meta
1030                 WHERE path NOT LIKE 'session/%'
1031                   AND path NOT LIKE '.hematite/docs/%'",
1032                [],
1033                |r| r.get::<_, i64>(0),
1034            )
1035            .unwrap_or(0) as usize;
1036        let indexed_docs = db
1037            .query_row(
1038                "SELECT COUNT(*) FROM chunks_meta WHERE path LIKE '.hematite/docs/%'",
1039                [],
1040                |r| r.get::<_, i64>(0),
1041            )
1042            .unwrap_or(0) as usize;
1043        let indexed_session_exchanges = db
1044            .query_row(
1045                "SELECT COUNT(*) FROM chunks_meta WHERE path LIKE 'session/%'",
1046                [],
1047                |r| r.get::<_, i64>(0),
1048            )
1049            .unwrap_or(0) as usize;
1050        let embedded_source_doc_chunks = db
1051            .query_row(
1052                "SELECT COUNT(*) FROM chunks_vec WHERE path NOT LIKE 'session/%'",
1053                [],
1054                |r| r.get::<_, i64>(0),
1055            )
1056            .unwrap_or(0) as usize;
1057        let has_any_embeddings = db
1058            .query_row("SELECT EXISTS(SELECT 1 FROM chunks_vec LIMIT 1)", [], |r| {
1059                r.get::<_, i64>(0)
1060            })
1061            .unwrap_or(0)
1062            != 0;
1063        drop(db);
1064
1065        let hot_files = self
1066            .hot_files(hot_limit.max(1))
1067            .into_iter()
1068            .map(|(path, heat, last_modified, room)| VeinHotFile {
1069                path,
1070                heat,
1071                last_modified,
1072                room,
1073            })
1074            .collect::<Vec<_>>();
1075
1076        VeinInspectionSnapshot {
1077            indexed_source_files,
1078            indexed_docs,
1079            indexed_session_exchanges,
1080            embedded_source_doc_chunks,
1081            has_any_embeddings,
1082            active_room: self.active_room(),
1083            l1_ready: !hot_files.is_empty(),
1084            hot_files,
1085        }
1086    }
1087
1088    // ── L1 heat tracking ──────────────────────────────────────────────────────
1089
1090    /// Record an edit to a file. Increments its heat score in file_heat.
1091    /// Called from the tool dispatch after a successful edit_file / write_file /
1092    /// patch_hunk / multi_search_replace so the L1 context stays current.
1093    pub fn bump_heat(&self, path: &str) {
1094        if path.is_empty() {
1095            return;
1096        }
1097        let now = std::time::SystemTime::now()
1098            .duration_since(std::time::UNIX_EPOCH)
1099            .unwrap_or_default()
1100            .as_secs() as i64;
1101        let db = self.db.lock().unwrap();
1102        let _ = db.execute(
1103            "INSERT INTO file_heat (path, heat, last_edit) VALUES (?1, 1, ?2)
1104             ON CONFLICT(path) DO UPDATE SET heat = heat + 1, last_edit = ?2",
1105            params![path, now],
1106        );
1107    }
1108
1109    /// Return the top N hot files ranked by edit count (heat) then recency.
1110    /// Joins file_heat with chunks_meta so only indexed files are included.
1111    /// Returns (path, heat, mtime, room).
1112    fn hot_files(&self, n: usize) -> Vec<(String, i64, i64, String)> {
1113        let db = self.db.lock().unwrap();
1114        let mut stmt = match db.prepare(
1115            "SELECT fh.path, fh.heat, cm.last_modified, cm.room
1116             FROM file_heat fh
1117             JOIN chunks_meta cm ON cm.path = fh.path
1118             ORDER BY fh.heat DESC, cm.last_modified DESC
1119             LIMIT ?1",
1120        ) {
1121            Ok(s) => s,
1122            Err(_) => return vec![],
1123        };
1124        stmt.query_map(params![n as i64], |row| {
1125            Ok((
1126                row.get::<_, String>(0)?,
1127                row.get::<_, i64>(1)?,
1128                row.get::<_, i64>(2)?,
1129                row.get::<_, String>(3)?,
1130            ))
1131        })
1132        .map(|rows| rows.filter_map(|r| r.ok()).collect())
1133        .unwrap_or_default()
1134    }
1135
1136    /// Build the L1 context block — a compact "hot files" summary injected into
1137    /// the system prompt at session start. Capped at ~150 tokens.
1138    /// Files are grouped by room so the model sees subsystem structure at a glance.
1139    /// Returns None when there are no heat records yet (fresh project).
1140    pub fn l1_context(&self) -> Option<String> {
1141        let files = self.hot_files(8);
1142        if files.is_empty() {
1143            return None;
1144        }
1145        let now = std::time::SystemTime::now()
1146            .duration_since(std::time::UNIX_EPOCH)
1147            .unwrap_or_default()
1148            .as_secs() as i64;
1149
1150        // Group by room for readability.
1151        let mut by_room: std::collections::BTreeMap<String, Vec<(String, i64, i64)>> =
1152            std::collections::BTreeMap::new();
1153        for (path, heat, mtime, room) in &files {
1154            by_room
1155                .entry(room.clone())
1156                .or_default()
1157                .push((path.clone(), *heat, *mtime));
1158        }
1159
1160        let mut out = String::from("# Hot Files (most edited — grouped by subsystem)\n");
1161        for (room, entries) in &by_room {
1162            out.push_str(&format!("[{}]\n", room));
1163            for (path, heat, mtime) in entries {
1164                let age_secs = now - mtime;
1165                let age = if age_secs < 3600 {
1166                    "just now".to_string()
1167                } else if age_secs < 86400 {
1168                    format!("{}h ago", age_secs / 3600)
1169                } else {
1170                    format!("{}d ago", age_secs / 86400)
1171                };
1172                out.push_str(&format!(
1173                    "  - {} [{} edit{}, {}]\n",
1174                    path,
1175                    heat,
1176                    if *heat == 1 { "" } else { "s" },
1177                    age
1178                ));
1179            }
1180        }
1181        Some(out)
1182    }
1183
1184    fn prune_indexed_prefix(&self, prefix: &str, desired_paths: &HashSet<String>) {
1185        let pattern = format!("{}%", prefix);
1186        let existing_paths: Vec<String> = {
1187            let db = self.db.lock().unwrap();
1188            let mut stmt = match db.prepare("SELECT path FROM chunks_meta WHERE path LIKE ?1") {
1189                Ok(stmt) => stmt,
1190                Err(_) => return,
1191            };
1192            stmt.query_map(params![pattern], |row| row.get::<_, String>(0))
1193                .map(|rows| rows.filter_map(|row| row.ok()).collect())
1194                .unwrap_or_default()
1195        };
1196
1197        if existing_paths.is_empty() {
1198            return;
1199        }
1200
1201        let db = self.db.lock().unwrap();
1202        for path in existing_paths {
1203            if desired_paths.contains(&path) {
1204                continue;
1205            }
1206            let _ = db.execute("DELETE FROM chunks_fts WHERE path = ?1", params![path]);
1207            let _ = db.execute("DELETE FROM chunks_vec WHERE path = ?1", params![path]);
1208            let _ = db.execute("DELETE FROM chunks_meta WHERE path = ?1", params![path]);
1209        }
1210    }
1211}
1212
1213impl QuerySignals {
1214    fn from_query(query: &str) -> Self {
1215        let lower = query.to_ascii_lowercase();
1216        let historical_memory_hint = [
1217            "remember",
1218            "earlier",
1219            "previous",
1220            "last time",
1221            "what did we decide",
1222            "why did we decide",
1223            "what did we say",
1224            "why did we change",
1225        ]
1226        .iter()
1227        .any(|needle| lower.contains(needle));
1228
1229        Self {
1230            exact_phrases: extract_exact_phrases(query),
1231            standout_terms: extract_standout_terms(query),
1232            historical_memory_hint,
1233            temporal_reference: extract_temporal_reference(query),
1234        }
1235    }
1236}
1237
1238fn merge_scored_result(
1239    merged_by_path: &mut HashMap<String, SearchResult>,
1240    candidate: SearchResult,
1241) {
1242    match merged_by_path.get_mut(&candidate.path) {
1243        Some(existing) if candidate.score > existing.score => *existing = candidate,
1244        Some(_) => {}
1245        None => {
1246            merged_by_path.insert(candidate.path.clone(), candidate);
1247        }
1248    }
1249}
1250
1251fn reranked_score(
1252    signals: &QuerySignals,
1253    active_room: Option<&str>,
1254    result: &SearchResult,
1255    is_semantic: bool,
1256) -> f32 {
1257    let base = if is_semantic {
1258        1.0 + result.score.clamp(0.0, 1.0)
1259    } else {
1260        (result.score / 10.0).clamp(0.0, 1.0)
1261    };
1262    base + room_bias(active_room, result)
1263        + retrieval_signal_boost(signals, result)
1264        + temporal_memory_boost(signals, result)
1265}
1266
1267fn room_bias(active_room: Option<&str>, result: &SearchResult) -> f32 {
1268    if active_room == Some(result.room.as_str()) {
1269        0.15
1270    } else {
1271        0.0
1272    }
1273}
1274
1275fn retrieval_signal_boost(signals: &QuerySignals, result: &SearchResult) -> f32 {
1276    let mut boost = 0.0f32;
1277    let haystack = format!(
1278        "{}\n{}",
1279        result.path.to_ascii_lowercase(),
1280        result.content.to_ascii_lowercase()
1281    );
1282
1283    let phrase_matches = signals
1284        .exact_phrases
1285        .iter()
1286        .filter(|phrase| haystack.contains(phrase.as_str()))
1287        .count();
1288    if phrase_matches > 0 {
1289        boost += 0.35 + ((phrase_matches.saturating_sub(1)) as f32 * 0.1);
1290    }
1291
1292    let standout_matches = signals
1293        .standout_terms
1294        .iter()
1295        .filter(|term| haystack.contains(term.as_str()))
1296        .count()
1297        .min(3);
1298    boost += standout_matches as f32 * 0.12;
1299
1300    if signals.historical_memory_hint && result.room == "session" {
1301        boost += 0.1;
1302    }
1303
1304    boost
1305}
1306
1307fn temporal_memory_boost(signals: &QuerySignals, result: &SearchResult) -> f32 {
1308    if result.room != "session" {
1309        return 0.0;
1310    }
1311    let Some(reference) = signals.temporal_reference else {
1312        return 0.0;
1313    };
1314    let Some(memory_ts) = session_memory_timestamp(result) else {
1315        return 0.0;
1316    };
1317
1318    let span = reference.window_secs.max(86_400);
1319    let full_fade = span.saturating_mul(8);
1320    if full_fade <= 0 {
1321        return 0.0;
1322    }
1323
1324    let distance = (memory_ts - reference.target_ts).abs();
1325    let closeness = 1.0 - (distance as f32 / full_fade as f32).min(1.0);
1326    if closeness <= 0.0 {
1327        0.0
1328    } else {
1329        0.22 * closeness
1330    }
1331}
1332
/// Collect the phrases a query wraps in `"`, `'` or `` ` `` quotes.
///
/// Each phrase is trimmed and ASCII-lowercased; phrases shorter than three
/// bytes and duplicates are dropped. Order of first appearance is kept.
/// A quote that is never closed captures everything up to the end of the query.
///
/// Apostrophes inside words are not quotes: a `'` directly after a letter or
/// digit ("doesn't", "parser's") never opens a phrase, and a `'` directly
/// before a letter or digit never closes one. Without this, a query such as
/// `why doesn't the parser's output change` would yield the bogus phrase
/// "t the parser".
fn extract_exact_phrases(query: &str) -> Vec<String> {
    let chars: Vec<char> = query.chars().collect();
    let mut phrases: Vec<String> = Vec::new();
    let mut i = 0usize;

    while i < chars.len() {
        let quote = chars[i];
        if !matches!(quote, '"' | '\'' | '`') {
            i += 1;
            continue;
        }

        let is_apostrophe = quote == '\'';
        // Contraction / possessive: the apostrophe belongs to the preceding word.
        if is_apostrophe && i > 0 && chars[i - 1].is_alphanumeric() {
            i += 1;
            continue;
        }

        let start = i + 1;
        let mut end = start;
        while end < chars.len() {
            if chars[end] == quote {
                // Inside a single-quoted phrase, `don't` must not end the phrase.
                let mid_word = is_apostrophe
                    && chars
                        .get(end + 1)
                        .map_or(false, |next| next.is_alphanumeric());
                if !mid_word {
                    break;
                }
            }
            end += 1;
        }

        if end > start {
            let phrase = chars[start..end]
                .iter()
                .collect::<String>()
                .trim()
                .to_ascii_lowercase();
            if phrase.len() >= 3 && !phrases.contains(&phrase) {
                phrases.push(phrase);
            }
        }
        // Resume after the closing quote (or past the end when unterminated).
        i = end.saturating_add(1);
    }

    phrases
}
1364
/// Pick out the distinctive, identifier-like terms of a query.
///
/// The query is split on every character that is not ASCII alphanumeric or
/// one of `_ - . / :`. Trailing `.` and `:` (sentence punctuation) are
/// stripped from each token so "vein.rs." yields "vein.rs". A token is kept
/// when it
/// * is at least four bytes long and contains a letter or digit,
/// * is not a stopword, and
/// * looks distinctive: it contains a digit, an uppercase letter or one of
///   the joiner characters, or it is at least nine bytes long.
///
/// Terms are returned ASCII-lowercased, de-duplicated, in order of first
/// appearance.
fn extract_standout_terms(query: &str) -> Vec<String> {
    const STOPWORDS: &[&str] = &[
        "about", "after", "before", "change", "changed", "decide", "decided", "does", "earlier",
        "flow", "from", "have", "into", "just", "last", "local", "make", "more", "remember",
        "should", "that", "their", "there", "these", "they", "this", "those", "what", "when",
        "where", "which", "why", "with", "work",
    ];

    /// Characters that glue identifiers, paths and module names together.
    fn is_joiner(ch: char) -> bool {
        matches!(ch, '_' | '-' | '.' | '/' | ':')
    }

    let mut standout: Vec<String> = Vec::new();
    for token in query.split(|ch: char| !(ch.is_ascii_alphanumeric() || is_joiner(ch))) {
        // "vein.rs." / "note:" — drop punctuation that merely ends a sentence or clause.
        let term = token.trim_end_matches(|ch: char| matches!(ch, '.' | ':'));
        // Too short, or pure punctuation such as "----": not a term.
        if term.len() < 4 || !term.chars().any(|ch| ch.is_ascii_alphanumeric()) {
            continue;
        }
        let lower = term.to_ascii_lowercase();
        if STOPWORDS.contains(&lower.as_str()) {
            continue;
        }

        let interesting = term.len() >= 9
            || term
                .chars()
                .any(|ch| ch.is_ascii_digit() || ch.is_ascii_uppercase() || is_joiner(ch));

        if interesting && !standout.contains(&lower) {
            standout.push(lower);
        }
    }

    standout
}
1400
1401fn extract_temporal_reference(query: &str) -> Option<TemporalReference> {
1402    if let Some(ts) = extract_iso_date_from_query(query) {
1403        return Some(TemporalReference {
1404            target_ts: ts,
1405            window_secs: 86_400,
1406        });
1407    }
1408
1409    let now = current_unix_timestamp();
1410    let lower = query.to_ascii_lowercase();
1411    if lower.contains("yesterday") {
1412        Some(TemporalReference {
1413            target_ts: now.saturating_sub(86_400),
1414            window_secs: 86_400,
1415        })
1416    } else if lower.contains("today") || lower.contains("earlier today") {
1417        Some(TemporalReference {
1418            target_ts: now,
1419            window_secs: 86_400,
1420        })
1421    } else if lower.contains("last week") {
1422        Some(TemporalReference {
1423            target_ts: now.saturating_sub(7 * 86_400),
1424            window_secs: 7 * 86_400,
1425        })
1426    } else if lower.contains("last month") {
1427        Some(TemporalReference {
1428            target_ts: now.saturating_sub(30 * 86_400),
1429            window_secs: 30 * 86_400,
1430        })
1431    } else {
1432        None
1433    }
1434}
1435
1436fn extract_iso_date_from_query(query: &str) -> Option<i64> {
1437    query
1438        .split(|ch: char| !(ch.is_ascii_digit() || ch == '-'))
1439        .find_map(parse_iso_date_token)
1440}
1441
1442fn parse_iso_date_token(token: &str) -> Option<i64> {
1443    if token.len() != 10 {
1444        return None;
1445    }
1446    let bytes = token.as_bytes();
1447    if bytes.get(4) != Some(&b'-') || bytes.get(7) != Some(&b'-') {
1448        return None;
1449    }
1450
1451    let year = token.get(0..4)?.parse::<i32>().ok()?;
1452    let month = token.get(5..7)?.parse::<u32>().ok()?;
1453    let day = token.get(8..10)?.parse::<u32>().ok()?;
1454    if !(1..=12).contains(&month) || !(1..=31).contains(&day) {
1455        return None;
1456    }
1457
1458    Some(days_from_civil(year, month, day).saturating_mul(86_400))
1459}
1460
/// Number of days between 1970-01-01 and the given proleptic Gregorian date
/// (negative for earlier dates).
///
/// The year is shifted to start in March so the leap day falls at the end of
/// the shifted year, then days are counted in 400-year eras of 146 097 days.
fn days_from_civil(year: i32, month: u32, day: u32) -> i64 {
    // January and February belong to the previous shifted year.
    let (shifted_year, shifted_month) = if month > 2 {
        (year, month as i32 - 3)
    } else {
        (year - 1, month as i32 + 9)
    };

    // Floor division by 400, written out for negative years.
    let floor_base = if shifted_year >= 0 {
        shifted_year
    } else {
        shifted_year - 399
    };
    let era = floor_base / 400;
    let year_of_era = shifted_year - era * 400;

    let day_of_year = (153 * shifted_month + 2) / 5 + day as i32 - 1;
    let day_of_era = year_of_era * 365 + year_of_era / 4 - year_of_era / 100 + day_of_year;

    // 719 468 is the day count from 0000-03-01 to 1970-01-01.
    i64::from(era) * 146_097 + i64::from(day_of_era) - 719_468
}
1470
/// Current wall-clock time in whole seconds since the Unix epoch.
/// Yields 0 if the system clock reports a time before the epoch.
fn current_unix_timestamp() -> i64 {
    let since_epoch =
        match std::time::SystemTime::now().duration_since(std::time::UNIX_EPOCH) {
            Ok(elapsed) => elapsed,
            Err(_) => std::time::Duration::default(),
        };
    since_epoch.as_secs() as i64
}
1477
1478fn session_memory_timestamp(result: &SearchResult) -> Option<i64> {
1479    extract_session_path_timestamp(&result.path).or_else(|| {
1480        if result.last_modified > 0 {
1481            Some(result.last_modified)
1482        } else {
1483            None
1484        }
1485    })
1486}
1487
1488fn extract_session_path_timestamp(path: &str) -> Option<i64> {
1489    let normalized = path.replace('\\', "/");
1490    let mut parts = normalized.split('/');
1491    if parts.next()? != "session" {
1492        return None;
1493    }
1494    parse_iso_date_token(parts.next()?)
1495}
1496
1497fn session_speaker_kind(speaker: &str) -> SessionSpeakerKind {
1498    let normalized = speaker.trim().to_ascii_lowercase();
1499    match normalized.as_str() {
1500        "you" | "user" => SessionSpeakerKind::User,
1501        "" | "system" | "tool" => SessionSpeakerKind::Ignore,
1502        _ => SessionSpeakerKind::Assistant,
1503    }
1504}
1505
1506fn load_session_exchanges(report_path: &Path, last_modified: i64) -> Vec<SessionExchange> {
1507    let Ok(raw) = std::fs::read_to_string(report_path) else {
1508        return Vec::new();
1509    };
1510    let Ok(report) = serde_json::from_str::<SessionReport>(&raw) else {
1511        return Vec::new();
1512    };
1513
1514    let session_key = report_path
1515        .file_stem()
1516        .and_then(|stem| stem.to_str())
1517        .and_then(|stem| stem.strip_prefix("session_").or(Some(stem)))
1518        .unwrap_or("unknown-session")
1519        .to_string();
1520    let session_date = report
1521        .session_start
1522        .split('_')
1523        .next()
1524        .filter(|date| !date.is_empty())
1525        .unwrap_or_else(|| session_key.split('_').next().unwrap_or("unknown-date"))
1526        .to_string();
1527
1528    let mut exchanges = Vec::new();
1529    let mut pending_user: Option<String> = None;
1530    let mut turn_index = 0usize;
1531
1532    for entry in report.transcript {
1533        match session_speaker_kind(&entry.speaker) {
1534            SessionSpeakerKind::User => {
1535                let text = entry.text.trim();
1536                if !text.is_empty() {
1537                    pending_user = Some(text.to_string());
1538                }
1539            }
1540            SessionSpeakerKind::Assistant => {
1541                let text = entry.text.trim();
1542                if text.is_empty() {
1543                    continue;
1544                }
1545                let Some(user_text) = pending_user.take() else {
1546                    continue;
1547                };
1548                turn_index += 1;
1549                exchanges.push(SessionExchange {
1550                    path: format!(
1551                        "session/{}/{}/turn-{}",
1552                        session_date, session_key, turn_index
1553                    ),
1554                    last_modified,
1555                    content: format!(
1556                        "Earlier session exchange\nUser:\n{}\n\nAssistant:\n{}",
1557                        user_text, text
1558                    ),
1559                });
1560            }
1561            SessionSpeakerKind::Ignore => {}
1562        }
1563    }
1564
1565    if exchanges.len() > Vein::SESSION_TURN_LIMIT {
1566        let keep_from = exchanges.len() - Vein::SESSION_TURN_LIMIT;
1567        exchanges = exchanges.into_iter().skip(keep_from).collect();
1568    }
1569
1570    exchanges
1571}
1572
1573fn load_imported_session_exchanges(
1574    import_path: &Path,
1575    imports_root: &Path,
1576    last_modified: i64,
1577) -> Vec<SessionExchange> {
1578    let Ok(raw) = std::fs::read_to_string(import_path) else {
1579        return Vec::new();
1580    };
1581
1582    let messages = normalize_import_messages(&raw, import_path);
1583    if messages.is_empty() {
1584        return Vec::new();
1585    }
1586
1587    let rel = import_path
1588        .strip_prefix(imports_root)
1589        .unwrap_or(import_path);
1590    let rel_slug = slugify_import_path(rel);
1591    let mut exchanges = Vec::new();
1592    let mut pending_user: Option<String> = None;
1593    let mut turn_index = 0usize;
1594
1595    for (role, text) in messages {
1596        let cleaned = text.trim();
1597        if cleaned.is_empty() {
1598            continue;
1599        }
1600        match role.as_str() {
1601            "user" => pending_user = Some(cleaned.to_string()),
1602            "assistant" => {
1603                let Some(user_text) = pending_user.take() else {
1604                    continue;
1605                };
1606                turn_index += 1;
1607                exchanges.push(SessionExchange {
1608                    path: format!("session/imports/{}/turn-{}", rel_slug, turn_index),
1609                    last_modified,
1610                    content: format!(
1611                        "Imported session exchange\nSource: .hematite/imports/{}\n\nUser:\n{}\n\nAssistant:\n{}",
1612                        rel.to_string_lossy().replace('\\', "/"),
1613                        user_text,
1614                        cleaned
1615                    ),
1616                });
1617            }
1618            _ => {}
1619        }
1620    }
1621
1622    if exchanges.len() > Vein::SESSION_TURN_LIMIT {
1623        let keep_from = exchanges.len() - Vein::SESSION_TURN_LIMIT;
1624        exchanges = exchanges.into_iter().skip(keep_from).collect();
1625    }
1626
1627    exchanges
1628}
1629
1630fn normalize_import_messages(raw: &str, import_path: &Path) -> Vec<(String, String)> {
1631    if raw.trim().is_empty() {
1632        return Vec::new();
1633    }
1634
1635    if let Some(messages) = parse_marker_transcript(raw) {
1636        return messages;
1637    }
1638
1639    let ext = import_path
1640        .extension()
1641        .and_then(|ext| ext.to_str())
1642        .unwrap_or("")
1643        .to_ascii_lowercase();
1644
1645    if matches!(ext.as_str(), "json" | "jsonl")
1646        || matches!(raw.trim().chars().next(), Some('{') | Some('['))
1647    {
1648        if let Some(messages) = parse_jsonl_messages(raw) {
1649            if !messages.is_empty() {
1650                return messages;
1651            }
1652        }
1653
1654        if let Ok(value) = serde_json::from_str::<Value>(raw) {
1655            if let Some(messages) = parse_session_report_messages(&value) {
1656                return messages;
1657            }
1658            if let Some(messages) = parse_simple_role_messages(&value) {
1659                return messages;
1660            }
1661            if let Some(messages) = parse_chatgpt_mapping_messages(&value) {
1662                return messages;
1663            }
1664        }
1665    }
1666
1667    Vec::new()
1668}
1669
/// Parse a plain-text transcript in which user turns are lines starting with
/// `> ` (leading whitespace allowed) and everything up to the next such line
/// is the assistant's reply.
///
/// Blank lines and `---` separator lines are left out of replies; the
/// remaining reply lines are trimmed and joined with `\n`. Text before the
/// first marker is ignored. A user turn with no reply lines produces only the
/// "user" message. Returns `None` when fewer than two marker lines exist.
fn parse_marker_transcript(raw: &str) -> Option<Vec<(String, String)>> {
    fn is_marker(line: &str) -> bool {
        line.trim_start().starts_with("> ")
    }

    let marker_count = raw.lines().filter(|line| is_marker(*line)).count();
    if marker_count < 2 {
        return None;
    }

    let mut messages = Vec::new();
    let mut lines = raw.lines().peekable();
    while let Some(line) = lines.next() {
        let Some(prompt) = line.trim_start().strip_prefix("> ") else {
            continue;
        };
        messages.push(("user".to_string(), prompt.trim().to_string()));

        // Consume reply lines until the next marker (which stays in the iterator).
        let mut reply: Vec<&str> = Vec::new();
        while let Some(following) = lines.next_if(|candidate| !is_marker(*candidate)) {
            let text = following.trim();
            if !text.is_empty() && text != "---" {
                reply.push(text);
            }
        }
        if !reply.is_empty() {
            messages.push(("assistant".to_string(), reply.join("\n")));
        }
    }

    (!messages.is_empty()).then_some(messages)
}
1710
1711fn parse_jsonl_messages(raw: &str) -> Option<Vec<(String, String)>> {
1712    let mut messages = Vec::new();
1713    let mut has_codex_session_meta = false;
1714    let mut saw_jsonl = false;
1715
1716    for line in raw.lines() {
1717        let trimmed = line.trim();
1718        if trimmed.is_empty() {
1719            continue;
1720        }
1721        let Ok(value) = serde_json::from_str::<Value>(trimmed) else {
1722            continue;
1723        };
1724        saw_jsonl = true;
1725        let Some(object) = value.as_object() else {
1726            continue;
1727        };
1728
1729        match object.get("type").and_then(|v| v.as_str()).unwrap_or("") {
1730            "session_meta" => {
1731                has_codex_session_meta = true;
1732            }
1733            "event_msg" => {
1734                let Some(payload) = object.get("payload").and_then(|v| v.as_object()) else {
1735                    continue;
1736                };
1737                let Some(text) = payload.get("message").and_then(|v| v.as_str()) else {
1738                    continue;
1739                };
1740                match payload.get("type").and_then(|v| v.as_str()).unwrap_or("") {
1741                    "user_message" => messages.push(("user".to_string(), text.trim().to_string())),
1742                    "agent_message" => {
1743                        messages.push(("assistant".to_string(), text.trim().to_string()))
1744                    }
1745                    _ => {}
1746                }
1747            }
1748            "human" | "user" => {
1749                if let Some(text) = extract_text_content(object.get("message").unwrap_or(&value)) {
1750                    messages.push(("user".to_string(), text));
1751                }
1752            }
1753            "assistant" => {
1754                if let Some(text) = extract_text_content(object.get("message").unwrap_or(&value)) {
1755                    messages.push(("assistant".to_string(), text));
1756                }
1757            }
1758            _ => {
1759                if let Some(role) = object.get("role").and_then(|v| v.as_str()) {
1760                    if let Some(text) = extract_text_content(&value) {
1761                        match role {
1762                            "user" | "human" => messages.push(("user".to_string(), text)),
1763                            "assistant" | "ai" => messages.push(("assistant".to_string(), text)),
1764                            _ => {}
1765                        }
1766                    }
1767                }
1768            }
1769        }
1770    }
1771
1772    if !saw_jsonl {
1773        return None;
1774    }
1775
1776    if has_codex_session_meta || !messages.is_empty() {
1777        return Some(messages);
1778    }
1779
1780    None
1781}
1782
1783fn parse_session_report_messages(value: &Value) -> Option<Vec<(String, String)>> {
1784    let report = value.as_object()?;
1785    let transcript = report.get("transcript")?.as_array()?;
1786    let mut messages = Vec::new();
1787
1788    for entry in transcript {
1789        let Some(obj) = entry.as_object() else {
1790            continue;
1791        };
1792        let speaker = obj
1793            .get("speaker")
1794            .and_then(|v| v.as_str())
1795            .unwrap_or_default();
1796        let text = obj
1797            .get("text")
1798            .and_then(|v| v.as_str())
1799            .unwrap_or_default()
1800            .trim()
1801            .to_string();
1802        if text.is_empty() {
1803            continue;
1804        }
1805        match session_speaker_kind(speaker) {
1806            SessionSpeakerKind::User => messages.push(("user".to_string(), text)),
1807            SessionSpeakerKind::Assistant => messages.push(("assistant".to_string(), text)),
1808            SessionSpeakerKind::Ignore => {}
1809        }
1810    }
1811
1812    (!messages.is_empty()).then_some(messages)
1813}
1814
1815fn parse_simple_role_messages(value: &Value) -> Option<Vec<(String, String)>> {
1816    if let Some(array) = value.as_array() {
1817        let messages = collect_role_messages(array);
1818        return (!messages.is_empty()).then_some(messages);
1819    }
1820
1821    let obj = value.as_object()?;
1822    if let Some(messages_value) = obj.get("messages").or_else(|| obj.get("chat_messages")) {
1823        let array = messages_value.as_array()?;
1824        let messages = collect_role_messages(array);
1825        return (!messages.is_empty()).then_some(messages);
1826    }
1827
1828    None
1829}
1830
1831fn collect_role_messages(items: &[Value]) -> Vec<(String, String)> {
1832    let mut messages = Vec::new();
1833    for item in items {
1834        let Some(obj) = item.as_object() else {
1835            continue;
1836        };
1837        let Some(role) = obj.get("role").and_then(|v| v.as_str()) else {
1838            continue;
1839        };
1840        let Some(text) = extract_text_content(item) else {
1841            continue;
1842        };
1843        match role {
1844            "user" | "human" => messages.push(("user".to_string(), text)),
1845            "assistant" | "ai" => messages.push(("assistant".to_string(), text)),
1846            _ => {}
1847        }
1848    }
1849    messages
1850}
1851
1852fn parse_chatgpt_mapping_messages(value: &Value) -> Option<Vec<(String, String)>> {
1853    let mapping = value.get("mapping")?.as_object()?;
1854    let mut current_id = mapping.iter().find_map(|(node_id, node)| {
1855        let obj = node.as_object()?;
1856        (obj.get("parent").is_some_and(|parent| parent.is_null())).then_some(node_id.clone())
1857    })?;
1858
1859    let mut messages = Vec::new();
1860    let mut visited = std::collections::HashSet::new();
1861
1862    while visited.insert(current_id.clone()) {
1863        let Some(node) = mapping.get(&current_id).and_then(|v| v.as_object()) else {
1864            break;
1865        };
1866
1867        if let Some(message) = node.get("message") {
1868            let role = message
1869                .get("author")
1870                .and_then(|author| author.get("role"))
1871                .and_then(|v| v.as_str())
1872                .unwrap_or("");
1873            if let Some(text) = extract_text_content(message) {
1874                match role {
1875                    "user" => messages.push(("user".to_string(), text)),
1876                    "assistant" => messages.push(("assistant".to_string(), text)),
1877                    _ => {}
1878                }
1879            }
1880        }
1881
1882        let Some(next_id) = node
1883            .get("children")
1884            .and_then(|children| children.as_array())
1885            .and_then(|children| children.first())
1886            .and_then(|child| child.as_str())
1887        else {
1888            break;
1889        };
1890        current_id = next_id.to_string();
1891    }
1892
1893    (!messages.is_empty()).then_some(messages)
1894}
1895
1896fn extract_text_content(value: &Value) -> Option<String> {
1897    if let Some(text) = value.as_str() {
1898        let trimmed = text.trim();
1899        return (!trimmed.is_empty()).then_some(trimmed.to_string());
1900    }
1901
1902    if let Some(array) = value.as_array() {
1903        let joined = array
1904            .iter()
1905            .filter_map(extract_text_content)
1906            .filter(|part| !part.is_empty())
1907            .collect::<Vec<_>>()
1908            .join("\n");
1909        return (!joined.is_empty()).then_some(joined);
1910    }
1911
1912    let obj = value.as_object()?;
1913
1914    if let Some(content) = obj.get("content") {
1915        if let Some(text) = extract_text_content(content) {
1916            return Some(text);
1917        }
1918    }
1919
1920    if let Some(text) = obj.get("text").and_then(|v| v.as_str()) {
1921        let trimmed = text.trim();
1922        if !trimmed.is_empty() {
1923            return Some(trimmed.to_string());
1924        }
1925    }
1926
1927    if let Some(parts) = obj.get("parts").and_then(|v| v.as_array()) {
1928        let joined = parts
1929            .iter()
1930            .filter_map(|part| part.as_str().map(|s| s.trim().to_string()))
1931            .filter(|part| !part.is_empty())
1932            .collect::<Vec<_>>()
1933            .join("\n");
1934        if !joined.is_empty() {
1935            return Some(joined);
1936        }
1937    }
1938
1939    None
1940}
1941
/// Turn a filesystem path into a flat slug.
///
/// Backslashes are treated as `/`; every character other than ASCII alphanumerics,
/// `/`, `-` and `_` becomes `_`; leading and trailing `/` are stripped; and each
/// remaining `/` is replaced by `__`.
fn slugify_import_path(path: &Path) -> String {
    let mut sanitized = String::new();
    for ch in path.to_string_lossy().chars() {
        let mapped = match ch {
            '\\' => '/',
            c if c.is_ascii_alphanumeric() || c == '/' || c == '-' || c == '_' => c,
            _ => '_',
        };
        sanitized.push(mapped);
    }

    sanitized
        .trim_matches('/')
        .split('/')
        .collect::<Vec<_>>()
        .join("__")
}
1957
1958// ── Embedding API ─────────────────────────────────────────────────────────────
1959
1960/// Call LM Studio's `/v1/embeddings` endpoint synchronously.
1961///
1962/// Uses nomic-embed-text-v2 MoE. Nomic v2 requires task instruction prefixes:
1963/// - Chunks stored in the index use `"search_document: "` prefix
1964/// - Queries at search time use `"search_query: "` prefix
1965/// LM Studio matches loaded models by substring so the quant suffix doesn't matter.
1966///
1967/// Returns `None` if:
1968/// - No embedding model is loaded in LM Studio
1969/// - LM Studio is not running
1970/// - Any network or parse error occurs
1971///
1972/// Callers must tolerate `None` and fall back to BM25-only search.
1973fn embed_text_blocking(text: &str, base_url: &str) -> Option<Vec<f32>> {
1974    embed_text_with_prefix(text, "search_document", base_url)
1975}
1976
1977fn embed_query_blocking(text: &str, base_url: &str) -> Option<Vec<f32>> {
1978    embed_text_with_prefix(text, "search_query", base_url)
1979}
1980
1981fn embed_text_with_prefix(text: &str, task: &str, base_url: &str) -> Option<Vec<f32>> {
1982    // Nomic v2 task instruction prefix format: "<task>: <text>"
1983    let prefixed = format!("{}: {}", task, text);
1984    // Truncate to ~8000 chars to stay within typical embedding model limits.
1985    let input = if prefixed.len() > 8000 {
1986        &prefixed[..8000]
1987    } else {
1988        &prefixed
1989    };
1990
1991    let client = reqwest::blocking::Client::builder()
1992        .timeout(std::time::Duration::from_secs(10))
1993        .build()
1994        .ok()?;
1995
1996    let body = serde_json::json!({
1997        "model": "nomic-embed-text-v2",
1998        "input": input
1999    });
2000
2001    let url = format!("{}/v1/embeddings", base_url);
2002    let resp = client.post(&url).json(&body).send().ok()?;
2003
2004    if !resp.status().is_success() {
2005        return None;
2006    }
2007
2008    let json: serde_json::Value = resp.json().ok()?;
2009    let embedding = json["data"][0]["embedding"].as_array()?;
2010    let vec: Vec<f32> = embedding
2011        .iter()
2012        .filter_map(|v| v.as_f64().map(|f| f as f32))
2013        .collect();
2014
2015    if vec.is_empty() {
2016        None
2017    } else {
2018        Some(vec)
2019    }
2020}
2021
2022// ── Vector math ───────────────────────────────────────────────────────────────
2023
/// Cosine similarity of two vectors.
///
/// Returns `0.0` when the slices are empty, differ in length, or either has zero
/// magnitude; otherwise `dot(a, b) / (|a| * |b|)`.
fn cosine_similarity(a: &[f32], b: &[f32]) -> f32 {
    let comparable = !a.is_empty() && a.len() == b.len();
    if !comparable {
        return 0.0;
    }

    let magnitude = |v: &[f32]| -> f32 { v.iter().map(|x| x * x).sum::<f32>().sqrt() };

    let dot = a.iter().zip(b).map(|(x, y)| x * y).sum::<f32>();
    let (norm_a, norm_b) = (magnitude(a), magnitude(b));

    if norm_a == 0.0 || norm_b == 0.0 {
        return 0.0;
    }
    dot / (norm_a * norm_b)
}
2037
/// Serialize floats into a byte blob: 4 little-endian bytes per `f32`, in order.
fn floats_to_blob(floats: &[f32]) -> Vec<u8> {
    let mut blob = Vec::with_capacity(floats.len() * 4);
    for value in floats {
        blob.extend_from_slice(&value.to_le_bytes());
    }
    blob
}
2041
/// Deserialize a byte blob into floats, reading 4 little-endian bytes per `f32`.
///
/// Trailing bytes that do not fill a complete 4-byte group are ignored.
fn blob_to_floats(blob: &[u8]) -> Vec<f32> {
    let mut floats = Vec::with_capacity(blob.len() / 4);
    for word in blob.chunks_exact(4) {
        let mut raw = [0u8; 4];
        raw.copy_from_slice(word);
        floats.push(f32::from_le_bytes(raw));
    }
    floats
}
2047
2048// ── Document extraction ───────────────────────────────────────────────────────
2049
/// Normalize text produced by a document extractor.
///
/// Converts `\r\n` and lone `\r` line endings to `\n`, then strips leading and
/// trailing whitespace and NUL characters. Returns `None` when nothing remains.
fn normalize_extracted_document_text(text: String) -> Option<String> {
    let unix_newlines = text.replace("\r\n", "\n").replace('\r', "\n");
    let is_padding = |c: char| c == '\0' || c.is_whitespace();
    let core = unix_newlines.trim_matches(is_padding);
    (!core.is_empty()).then(|| core.to_owned())
}
2062
2063fn extract_pdf_text_with_pdf_extract(path: &std::path::Path) -> Result<Option<String>, String> {
2064    let previous_hook = std::panic::take_hook();
2065    std::panic::set_hook(Box::new(|_| {}));
2066    let result = std::panic::catch_unwind(std::panic::AssertUnwindSafe(|| {
2067        pdf_extract::extract_text(path)
2068    }));
2069    std::panic::set_hook(previous_hook);
2070
2071    match result {
2072        Ok(Ok(text)) => Ok(normalize_extracted_document_text(text)),
2073        Ok(Err(e)) => Err(format!("pdf-extract failed: {}", e)),
2074        Err(payload) => {
2075            let panic_text = if let Some(msg) = payload.downcast_ref::<&str>() {
2076                (*msg).to_string()
2077            } else if let Some(msg) = payload.downcast_ref::<String>() {
2078                msg.clone()
2079            } else {
2080                "unknown parser panic".to_string()
2081            };
2082            Err(format!("pdf-extract panicked: {}", panic_text))
2083        }
2084    }
2085}
2086
2087fn extract_pdf_text_with_lopdf(path: &std::path::Path) -> Result<Option<String>, String> {
2088    let mut doc =
2089        lopdf::Document::load(path).map_err(|e| format!("lopdf could not open PDF: {}", e))?;
2090
2091    if doc.is_encrypted() {
2092        doc.decrypt("")
2093            .map_err(|e| format!("PDF is encrypted and could not be decrypted: {}", e))?;
2094    }
2095
2096    let page_numbers: Vec<u32> = doc.get_pages().keys().copied().collect();
2097    if page_numbers.is_empty() {
2098        return Ok(None);
2099    }
2100
2101    let mut extracted_pages = Vec::new();
2102    let mut page_errors = Vec::new();
2103
2104    for page_number in page_numbers {
2105        match doc.extract_text(&[page_number]) {
2106            Ok(text) => {
2107                if let Some(page_text) = normalize_extracted_document_text(text) {
2108                    extracted_pages.push(page_text);
2109                }
2110            }
2111            Err(e) => page_errors.push(format!("page {page_number}: {e}")),
2112        }
2113    }
2114
2115    if !extracted_pages.is_empty() {
2116        return Ok(Some(extracted_pages.join("\n\n")));
2117    }
2118
2119    if !page_errors.is_empty() {
2120        let sample_errors = page_errors
2121            .into_iter()
2122            .take(3)
2123            .collect::<Vec<_>>()
2124            .join("; ");
2125        return Err(format!(
2126            "lopdf could not extract usable page text ({sample_errors})"
2127        ));
2128    }
2129
2130    Ok(None)
2131}
2132
2133fn extract_pdf_text_inside_helper(path: &std::path::Path) -> Result<Option<String>, String> {
2134    let mut failures = Vec::new();
2135
2136    match extract_pdf_text_with_pdf_extract(path) {
2137        Ok(Some(text)) => return Ok(Some(text)),
2138        Ok(None) => failures.push("pdf-extract found no usable text".to_string()),
2139        Err(e) => failures.push(e),
2140    }
2141
2142    match extract_pdf_text_with_lopdf(path) {
2143        Ok(Some(text)) => return Ok(Some(text)),
2144        Ok(None) => failures.push("lopdf found no usable text".to_string()),
2145        Err(e) => failures.push(e),
2146    }
2147
2148    let detail = failures.into_iter().take(2).collect::<Vec<_>>().join("; ");
2149    Err(format!(
2150        "Could not extract text from PDF. Hematite keeps PDF parsing best-effort so it can stay a lightweight single-binary local coding harness. The file may be scanned/image-only, encrypted, or use unsupported font encoding. Try exporting it to text/markdown or attach page images instead. Detail: {}",
2151        detail
2152    ))
2153}
2154
/// Extract PDF text by re-running the current executable as a helper process.
///
/// Spawns `<current exe> --pdf-extract-helper <path>` with stdin closed and
/// stdout/stderr captured, then waits for it to finish.
///
/// Returns `Ok(Some(text))` with the helper's stdout, `Ok(None)` when stdout is blank,
/// and `Err` when the executable cannot be located or launched, the helper exits
/// unsuccessfully (its stderr is the message, or a generic one if stderr is empty),
/// or stdout is not valid UTF-8.
fn extract_pdf_text(path: &std::path::Path) -> Result<Option<String>, String> {
    let helper_exe = match std::env::current_exe() {
        Ok(exe) => exe,
        Err(e) => {
            return Err(format!(
                "Could not locate Hematite executable for PDF helper: {}",
                e
            ))
        }
    };

    let mut helper = std::process::Command::new(helper_exe);
    helper
        .arg("--pdf-extract-helper")
        .arg(path)
        .stdin(std::process::Stdio::null())
        .stdout(std::process::Stdio::piped())
        .stderr(std::process::Stdio::piped());

    let finished = helper
        .output()
        .map_err(|e| format!("Could not launch PDF helper: {}", e))?;

    if !finished.status.success() {
        let complaint = String::from_utf8_lossy(&finished.stderr)
            .trim()
            .to_string();
        if complaint.is_empty() {
            return Err("PDF extraction failed.".to_string());
        }
        return Err(complaint);
    }

    let text = String::from_utf8(finished.stdout)
        .map_err(|e| format!("PDF helper returned non-UTF8 text: {}", e))?;
    Ok((!text.trim().is_empty()).then_some(text))
}
2184
2185pub fn run_pdf_extract_helper(path: &std::path::Path) -> i32 {
2186    match extract_pdf_text_inside_helper(path) {
2187        Ok(Some(text)) => {
2188            use std::io::Write;
2189            let mut stdout = std::io::stdout();
2190            if stdout.write_all(text.as_bytes()).is_ok() {
2191                0
2192            } else {
2193                let _ = writeln!(
2194                    std::io::stderr(),
2195                    "PDF helper could not write extracted text."
2196                );
2197                1
2198            }
2199        }
2200        Ok(None) => {
2201            eprintln!(
2202                "Could not extract text from PDF. Hematite keeps PDF parsing best-effort so it can stay a lightweight single-binary local coding harness. The file appears to contain no usable embedded text. Try exporting it to text/markdown or attach page images instead."
2203            );
2204            1
2205        }
2206        Err(e) => {
2207            eprintln!("{}", e);
2208            1
2209        }
2210    }
2211}
2212
2213/// Extract text from any supported document type (PDF, markdown, plain text).
2214/// Used by /attach for one-shot context injection.
2215pub fn extract_document_text(path: &std::path::Path) -> Result<String, String> {
2216    let ext = path
2217        .extension()
2218        .and_then(|e| e.to_str())
2219        .unwrap_or("")
2220        .to_lowercase();
2221    match ext.as_str() {
2222        "pdf" => {
2223            let text = extract_pdf_text(path)?.ok_or_else(|| {
2224                "PDF contains no extractable text — it may be scanned/image-only. \
2225                     Try attaching page screenshots with /image instead."
2226                    .to_string()
2227            })?;
2228            pdf_quality_check(text)
2229        }
2230        _ => std::fs::read_to_string(path).map_err(|e| format!("Could not read file: {e}")),
2231    }
2232}
2233
/// Reject PDF text that is too short or looks garbled.
///
/// Two checks run on the trimmed text:
/// 1. Its length must be at least 150 (measured with `str::len`, i.e. bytes).
/// 2. Spaces must make up at least 4% of the characters, not counting `\n` and `\r`.
///    A lower ratio means words were extracted glued together.
///
/// Returns the original, untrimmed `text` when both checks pass, otherwise an
/// explanatory error message.
fn pdf_quality_check(text: String) -> Result<String, String> {
    let body = text.trim();
    let extracted_len = body.len();

    // Too little content to be useful.
    if extracted_len < 150 {
        return Err(format!(
            "PDF extracted only {} characters — likely a scanned or image-only PDF, \
             or uses unsupported custom fonts. Try attaching page screenshots with /image instead.",
            extracted_len
        ));
    }

    // Single pass: count characters other than line breaks, and how many are spaces.
    let (mut visible, mut blanks) = (0usize, 0usize);
    for ch in body.chars() {
        match ch {
            '\n' | '\r' => {}
            ' ' => {
                visible += 1;
                blanks += 1;
            }
            _ => visible += 1,
        }
    }
    let space_ratio = if visible == 0 {
        0.0
    } else {
        blanks as f32 / visible as f32
    };

    if space_ratio < 0.04 {
        return Err(
            "PDF text extraction produced garbled output — words are merged with no spaces. \
             This usually means the PDF uses custom embedded fonts (common with academic publishers \
             like EBSCO, Elsevier, Springer). \
             Try a PDF exported from Word, Google Docs, or LaTeX, \
             or attach page screenshots with /image instead."
                .to_string(),
        );
    }

    Ok(text)
}
2272
2273// ── Chunking strategies ───────────────────────────────────────────────────────
2274
2275/// Dispatch to the correct chunking strategy based on file extension.
2276fn chunk_by_symbols(ext: &str, text: &str) -> Vec<String> {
2277    if ext == "rs" {
2278        chunk_rust_symbols(text)
2279    } else {
2280        chunk_paragraphs(text)
2281    }
2282}
2283
2284/// Chunk Rust source at top-level item boundaries.
2285///
2286/// Detects lines at column 0 that start a Rust declaration keyword, flushes
2287/// the accumulated buffer, then moves any trailing doc-comments / attributes
2288/// forward so they stay with the item they annotate.
2289///
2290/// Items larger than 3000 chars (e.g. large impl blocks) are further split
2291/// by sliding window so no single chunk blows the retrieval budget.
2292fn chunk_rust_symbols(text: &str) -> Vec<String> {
2293    const ITEM_STARTS: &[&str] = &[
2294        "pub fn ",
2295        "pub async fn ",
2296        "pub unsafe fn ",
2297        "async fn ",
2298        "unsafe fn ",
2299        "fn ",
2300        "pub impl",
2301        "impl ",
2302        "pub struct ",
2303        "struct ",
2304        "pub enum ",
2305        "enum ",
2306        "pub trait ",
2307        "trait ",
2308        "pub mod ",
2309        "mod ",
2310        "pub type ",
2311        "type ",
2312        "pub const ",
2313        "const ",
2314        "pub static ",
2315        "static ",
2316    ];
2317
2318    let lines: Vec<&str> = text.lines().collect();
2319    let mut chunks: Vec<String> = Vec::new();
2320    let mut current: Vec<&str> = Vec::new();
2321
2322    for &line in &lines {
2323        let top_level = !line.starts_with(' ') && !line.starts_with('\t');
2324        let is_item = top_level && ITEM_STARTS.iter().any(|s| line.starts_with(s));
2325
2326        if is_item && !current.is_empty() {
2327            // Scan backward to find where trailing doc-comments / attributes start —
2328            // move them to the new chunk so they land with their item.
2329            let mut split = current.len();
2330            while split > 0 {
2331                let prev = current[split - 1].trim();
2332                if prev.starts_with("///")
2333                    || prev.starts_with("//!")
2334                    || prev.starts_with("#[")
2335                    || prev.is_empty()
2336                {
2337                    split -= 1;
2338                } else {
2339                    break;
2340                }
2341            }
2342            let body = current[..split].join("\n");
2343            if !body.trim().is_empty() {
2344                chunks.push(body);
2345            }
2346            current = current[split..].to_vec();
2347        }
2348        current.push(line);
2349    }
2350    if !current.is_empty() {
2351        let body = current.join("\n");
2352        if !body.trim().is_empty() {
2353            chunks.push(body);
2354        }
2355    }
2356
2357    // Subdivide any oversized blocks (e.g. long impl blocks with many methods).
2358    let mut result = Vec::new();
2359    for chunk in chunks {
2360        if chunk.len() > 3000 {
2361            result.extend(sliding_window_chunks(&chunk, 2000, 200));
2362        } else {
2363            result.push(chunk);
2364        }
2365    }
2366    result
2367}
2368
2369/// Chunk non-Rust text at paragraph boundaries (double newline).
2370fn chunk_paragraphs(text: &str) -> Vec<String> {
2371    let mut result: Vec<String> = Vec::new();
2372    let mut current = String::new();
2373
2374    for para in text.split("\n\n") {
2375        if current.len() + para.len() + 2 > 2000 {
2376            if !current.trim().is_empty() {
2377                result.push(current.clone());
2378            }
2379            current = para.to_string();
2380        } else {
2381            if !current.is_empty() {
2382                current.push_str("\n\n");
2383            }
2384            current.push_str(para);
2385        }
2386    }
2387    if !current.trim().is_empty() {
2388        result.push(current);
2389    }
2390
2391    let mut final_result = Vec::new();
2392    for chunk in result {
2393        if chunk.len() > 2000 {
2394            final_result.extend(sliding_window_chunks(&chunk, 2000, 200));
2395        } else {
2396            final_result.push(chunk);
2397        }
2398    }
2399    final_result
2400}
2401
/// Classic sliding-window fallback for oversized blocks.
///
/// Splits `text` into windows of at most `chunk_size` characters (not bytes, so
/// multi-byte characters are never cut), where each window starts
/// `chunk_size - overlap` characters after the previous one. The last window ends
/// exactly at the end of the text. Empty input yields no chunks.
///
/// Degenerate parameters are clamped instead of panicking or looping forever:
/// a `chunk_size` of 0 is treated as 1, and when `overlap >= chunk_size` the window
/// still advances by one character per step.
fn sliding_window_chunks(text: &str, chunk_size: usize, overlap: usize) -> Vec<String> {
    let chars: Vec<char> = text.chars().collect();
    let chunk_size = chunk_size.max(1);
    // `chunk_size - overlap` would underflow for overlap > chunk_size, and a step of
    // zero would never reach the end of the text.
    let step = chunk_size.saturating_sub(overlap).max(1);

    let mut result = Vec::new();
    let mut start = 0;
    while start < chars.len() {
        let end = (start + chunk_size).min(chars.len());
        result.push(chars[start..end].iter().collect());
        if end == chars.len() {
            break;
        }
        start += step;
    }
    result
}