Skip to main content

hematite/memory/
vein.rs

1use rusqlite::{params, Connection};
2use serde::Deserialize;
3use serde_json::Value;
4use std::collections::{HashMap, HashSet};
5use std::path::Path;
6
/// "The Vein" — local RAG memory engine backed by SQLite FTS5 + semantic embeddings.
///
/// Two retrieval modes, used together:
///
/// **BM25 (always available)**
/// Full-text search via SQLite FTS5 with Porter-stemming. Fast, zero extra GPU cost,
/// works as the fallback when the embedding model isn't loaded.
///
/// **Semantic (when LM Studio has an embedding model loaded)**
/// Calls the embeddings endpoint (e.g. `/v1/embeddings` with nomic-embed-text-v1.5 or
/// similar) to produce a float vector for each chunk. At search time the query is
/// embedded and cosine similarity selects the most conceptually relevant chunks —
/// even when no keywords match.
///
/// Hybrid search (`search_context`) runs the BM25 query and then the semantic query,
/// merges the two result sets keyed by path, and returns the top-k results ranked by
/// the re-ranked score. When the embedding model is unavailable the semantic side is
/// simply empty and BM25 results are all that is returned.
///
/// Indexing is incremental: a path is re-indexed only when the supplied mtime is newer
/// than the one recorded in `chunks_meta`. Embedding vectors are stored in a separate
/// `chunks_vec` SQLite table so they persist across runs.
pub struct Vein {
    /// Shared SQLite connection; every access locks the mutex first.
    db: std::sync::Arc<std::sync::Mutex<Connection>>,
    /// Base URL of the LLM provider, used for the embeddings endpoint.
    base_url: String,
}
32
33// SAFETY: rusqlite::Connection is !Send by default, but we wrap it in Arc<Mutex>
34// and ensure all accesses are serialized by the mutex.
35unsafe impl Send for Vein {}
36unsafe impl Sync for Vein {}
37
/// One retrieved chunk, as returned by the BM25, semantic, and hybrid searches.
#[derive(Debug, Clone)]
pub struct SearchResult {
    /// Indexed path of the document the chunk belongs to.
    pub path: String,
    /// Text of the matching chunk.
    pub content: String,
    /// Combined relevance score (higher = more relevant).
    pub score: f32,
    /// Subsystem room derived from the file path (e.g. "agent", "ui", "tools").
    pub room: String,
    /// Last-modified timestamp from chunks_meta (unix seconds).
    pub last_modified: i64,
}
49
/// A file together with its edit-heat counter.
/// NOTE(review): presumably populated from the `file_heat` table joined with
/// `chunks_meta`; the producer is not visible here — confirm.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct VeinHotFile {
    /// Indexed path of the file.
    pub path: String,
    /// Heat counter for the file (higher = hotter).
    pub heat: i64,
    /// Last-modified timestamp; presumably unix seconds like `chunks_meta` — confirm.
    pub last_modified: i64,
    /// Subsystem room the file belongs to.
    pub room: String,
}
57
/// Summary of the current state of the index, for inspection/diagnostics.
/// NOTE(review): the code that fills this struct is outside this view; the field
/// descriptions below are inferred from the field names — confirm against the producer.
#[derive(Debug, Clone)]
pub struct VeinInspectionSnapshot {
    /// Count of indexed source files.
    pub indexed_source_files: usize,
    /// Count of indexed reference documents.
    pub indexed_docs: usize,
    /// Count of indexed session exchanges.
    pub indexed_session_exchanges: usize,
    /// Count of source/doc chunks that have an embedding stored.
    pub embedded_source_doc_chunks: usize,
    /// Whether any embedding exists at all.
    pub has_any_embeddings: bool,
    /// Room currently considered active, if any.
    pub active_room: Option<String>,
    /// Hottest files.
    pub hot_files: Vec<VeinHotFile>,
    /// Readiness flag for "L1"; meaning not visible here — TODO confirm.
    pub l1_ready: bool,
}
69
/// Hints extracted from a search query; built by `QuerySignals::from_query` and
/// passed to `reranked_score` during hybrid search.
/// NOTE(review): the extraction logic is outside this view; field meanings below
/// are inferred from their names — confirm.
#[derive(Debug, Default)]
struct QuerySignals {
    /// Phrases the query presumably asks for verbatim.
    exact_phrases: Vec<String>,
    /// Terms presumably considered distinctive in the query.
    standout_terms: Vec<String>,
    /// Presumably set when the query refers to past sessions/decisions.
    historical_memory_hint: bool,
    /// Point in time the query refers to, if one was detected.
    temporal_reference: Option<TemporalReference>,
}
77
/// A point in time referenced by a query, with a tolerance window.
#[derive(Debug, Clone, Copy)]
struct TemporalReference {
    /// Target timestamp; presumably unix seconds like `last_modified` — confirm.
    target_ts: i64,
    /// Width of the tolerance window in seconds.
    window_secs: i64,
}
83
/// Deserialized shape of a session report JSON file.
/// Both fields fall back to their `Default` value when missing from the input.
#[derive(Debug, Deserialize)]
struct SessionReport {
    /// Session start marker as written in the report (kept as a raw string).
    #[serde(default)]
    session_start: String,
    /// Ordered transcript entries of the session.
    #[serde(default)]
    transcript: Vec<SessionTranscriptEntry>,
}
91
/// One transcript line of a session report.
/// Both fields default to an empty string when missing from the input.
#[derive(Debug, Deserialize)]
struct SessionTranscriptEntry {
    /// Label of who produced the text.
    #[serde(default)]
    speaker: String,
    /// The text of the entry.
    #[serde(default)]
    text: String,
}
99
/// One session exchange prepared for indexing; each exchange is indexed as a
/// single chunk in the "session" room.
#[derive(Debug)]
struct SessionExchange {
    /// Path key under which the exchange is stored in the index.
    path: String,
    /// Timestamp stored in `chunks_meta` for this exchange.
    last_modified: i64,
    /// Text that gets indexed.
    content: String,
}
106
/// Classification of a transcript speaker.
/// NOTE(review): the classifier is outside this view; variant meanings are
/// inferred from their names — confirm.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum SessionSpeakerKind {
    /// Entry spoken by the user.
    User,
    /// Entry spoken by the assistant.
    Assistant,
    /// Entry that should not be treated as part of an exchange.
    Ignore,
}
113
/// Derive a subsystem room label from a file path.
///
/// The path is lower-cased and backslashes are normalised to `/` before matching.
/// Each rule below proposes a room with a fixed priority; the proposal with the
/// highest priority wins (e.g. a markdown file inside `tests/` is "tests", not
/// "docs"). When no rule matches, the first path segment is returned unless it is
/// empty or contains a `.`, in which case the room is "root".
///
/// Only text after the last `.` of the file name counts as its extension, so an
/// extension-less file that happens to be called `md` or `iss` is not mistaken
/// for a markdown document or an installer script.
pub fn detect_room(path: &str) -> String {
    let lower = path.to_lowercase().replace('\\', "/");
    let filename = lower.rsplit('/').next().unwrap_or(&lower);
    // `rsplit('.').next()` would yield the whole name when there is no dot;
    // `rsplit_once` yields an extension only when a dot is actually present.
    let ext = filename.rsplit_once('.').map_or("", |(_, ext)| ext);

    let mut best_room = None::<&str>;
    let mut best_score = 0i32;
    // Keep the proposal with the strictly highest priority seen so far.
    let mut consider = |room: &'static str, score: i32| {
        if score > best_score {
            best_score = score;
            best_room = Some(room);
        }
    };

    // True when `segment` is the whole path, its leading directory, or any
    // interior directory of the path.
    let is_component = |segment: &str| {
        lower == segment
            || lower.starts_with(&format!("{segment}/"))
            || lower.contains(&format!("/{segment}/"))
    };

    if lower.starts_with("session/")
        || lower.starts_with(".hematite/reports/")
        || lower.starts_with(".hematite/imports/")
        || is_component("reports")
        || is_component("imports")
    {
        consider("session", 100);
    }

    if lower.starts_with(".hematite/docs/")
        || is_component("docs")
        || matches!(filename, "readme.md" | "claude.md" | ".hematite.md")
        || matches!(ext, "md" | "markdown" | "pdf" | "rst")
    {
        consider("docs", 80);
    }

    if is_component("tests")
        || filename.contains("diagnostic")
        || filename.ends_with("_test.rs")
        || filename.ends_with(".test.ts")
    {
        consider("tests", 85);
    }

    if lower.starts_with(".github/workflows/")
        || is_component("workflows")
        || filename == ".pre-commit-config.yaml"
        || filename == ".pre-commit-config.yml"
        || filename.contains("hook")
    {
        consider("automation", 84);
    }

    if lower.starts_with("installer/")
        || lower.starts_with("dist/")
        || lower.starts_with("scripts/package-")
        || filename.contains("release")
        || filename.contains("bump-version")
        || ext == "iss"
    {
        consider("release", 82);
    }

    if matches!(
        filename,
        "cargo.toml"
            | "cargo.lock"
            | "package.json"
            | "pnpm-lock.yaml"
            | "yarn.lock"
            | "bun.lock"
            | "bun.lockb"
            | "pyproject.toml"
            | "setup.py"
            | "go.mod"
            | "pom.xml"
            | "build.gradle"
            | "build.gradle.kts"
            | "cmakelists.txt"
            | ".gitignore"
            | "settings.json"
            | "mcp_servers.json"
    ) || filename.ends_with(".sln")
        || filename.ends_with(".csproj")
        || filename.contains("config")
    {
        consider("config", 76);
    }

    if is_component("ui")
        || matches!(
            filename,
            "tui.rs" | "voice.rs" | "hatch.rs" | "gpu_monitor.rs"
        )
    {
        consider("ui", 70);
    }

    if is_component("memory") || matches!(filename, "vein.rs" | "deep_reflect.rs") {
        consider("memory", 72);
    }

    if is_component("tools")
        || matches!(
            filename,
            "verify_build.rs"
                | "host_inspect.rs"
                | "shell.rs"
                | "code_sandbox.rs"
                | "project_map.rs"
                | "runtime_trace.rs"
        )
    {
        consider("tools", 68);
    }

    if filename.contains("mcp")
        || filename.contains("lsp")
        || lower.contains("/mcp/")
        || lower.contains("/lsp/")
    {
        consider("integration", 67);
    }

    if matches!(filename, "main.rs" | "runtime.rs" | "inference.rs")
        || filename.contains("startup")
        || filename.contains("runtime")
    {
        consider("runtime", 66);
    }

    if is_component("agent") {
        consider("agent", 60);
    }

    if lower.starts_with("libs/") || is_component("libs") {
        consider("libs", 58);
    }

    if lower.starts_with("scripts/") || is_component("scripts") {
        consider("scripts", 55);
    }

    if let Some(room) = best_room {
        return room.to_string();
    }

    // No rule matched: fall back to the first path segment, or "root" when that
    // segment is empty (absolute path) or looks like a file name (contains '.').
    lower
        .split('/')
        .next()
        .filter(|s| !s.is_empty() && !s.contains('.'))
        .unwrap_or("root")
        .to_string()
}
274
275impl Vein {
    /// Maximum number of session report files indexed per run (newest by file name).
    const SESSION_REPORT_LIMIT: usize = 5;
    /// Per-session turn cap. NOTE(review): its use is outside this view — confirm.
    const SESSION_TURN_LIMIT: usize = 50;
    /// Maximum number of import files indexed per run (newest by mtime).
    const IMPORT_FILE_LIMIT: usize = 12;
    /// Import files larger than this many bytes (10 MiB) are skipped.
    const IMPORT_MAX_BYTES: u64 = 10 * 1024 * 1024;
280
281    pub fn new<P: AsRef<Path>>(
282        db_path: P,
283        base_url: String,
284    ) -> Result<Self, Box<dyn std::error::Error>> {
285        let db = Connection::open(db_path)?;
286
287        // WAL mode for better concurrent read performance.
288        db.execute_batch("PRAGMA journal_mode=WAL; PRAGMA synchronous=NORMAL;")?;
289
290        // chunks_meta: tracks last-modified time per path for incremental indexing.
291        // chunks_fts:  BM25 full-text index of all code chunks.
292        // chunks_vec:  semantic embedding vectors, keyed by (path, chunk_idx).
293        db.execute_batch(
294            "CREATE TABLE IF NOT EXISTS chunks_meta (
295                path TEXT PRIMARY KEY,
296                last_modified INTEGER NOT NULL,
297                room TEXT NOT NULL DEFAULT 'root'
298            );
299            CREATE VIRTUAL TABLE IF NOT EXISTS chunks_fts USING fts5(
300                path UNINDEXED,
301                content,
302                tokenize='porter ascii'
303            );
304            CREATE TABLE IF NOT EXISTS chunks_vec (
305                path TEXT NOT NULL,
306                chunk_idx INTEGER NOT NULL,
307                embedding BLOB NOT NULL,
308                PRIMARY KEY (path, chunk_idx)
309            );
310            CREATE TABLE IF NOT EXISTS file_heat (
311                path TEXT PRIMARY KEY,
312                heat INTEGER NOT NULL DEFAULT 0,
313                last_edit INTEGER NOT NULL DEFAULT 0
314            );",
315        )?;
316
317        // Schema migrations — safe to run on every open (IF NOT EXISTS / ignored if col exists).
318        let _ = db
319            .execute_batch("ALTER TABLE chunks_meta ADD COLUMN room TEXT NOT NULL DEFAULT 'root';");
320        let _ = db.execute_batch(
321            "ALTER TABLE file_heat ADD COLUMN last_edit INTEGER NOT NULL DEFAULT 0;",
322        );
323
324        Ok(Self {
325            db: std::sync::Arc::new(std::sync::Mutex::new(db)),
326            base_url,
327        })
328    }
329
330    // ── Indexing ──────────────────────────────────────────────────────────────
331
332    /// Index a single file for BM25 search. Skip if mtime hasn't changed.
333    /// Returns the chunks that were written (empty if file was unchanged).
334    pub fn index_document(
335        &mut self,
336        path: &str,
337        last_modified: i64,
338        full_text: &str,
339    ) -> Result<Vec<String>, Box<dyn std::error::Error>> {
340        let room = detect_room(path);
341        let ext = std::path::Path::new(path)
342            .extension()
343            .and_then(|e| e.to_str())
344            .unwrap_or("");
345        let chunks = chunk_by_symbols(ext, full_text);
346        self.index_chunks_with_room(path, last_modified, &room, &chunks)
347    }
348
349    fn index_chunks_with_room(
350        &mut self,
351        path: &str,
352        last_modified: i64,
353        room: &str,
354        chunks: &[String],
355    ) -> Result<Vec<String>, Box<dyn std::error::Error>> {
356        let db = self.db.lock().unwrap();
357        let existing: Option<i64> = db
358            .query_row(
359                "SELECT last_modified FROM chunks_meta WHERE path = ?1",
360                params![path],
361                |r| r.get(0),
362            )
363            .ok();
364
365        if let Some(ts) = existing {
366            if ts >= last_modified {
367                return Ok(Vec::new()); // unchanged — skip
368            }
369        }
370
371        // Evict stale BM25 chunks, stale embedding vectors, then update metadata.
372        db.execute("DELETE FROM chunks_fts WHERE path = ?1", params![path])?;
373        db.execute("DELETE FROM chunks_vec WHERE path = ?1", params![path])?;
374        db.execute(
375            "INSERT OR REPLACE INTO chunks_meta (path, last_modified, room) VALUES (?1, ?2, ?3)",
376            params![path, last_modified, room],
377        )?;
378
379        drop(db);
380
381        let mut db = self.db.lock().unwrap();
382        let tx = db.transaction()?;
383        {
384            let mut stmt = tx.prepare("INSERT INTO chunks_fts (path, content) VALUES (?1, ?2)")?;
385            for chunk in chunks {
386                stmt.execute(params![path, chunk.as_str()])?;
387            }
388        }
389        tx.commit()?;
390
391        Ok(chunks.to_vec())
392    }
393
394    /// Embed a set of chunks for one file and store the vectors.
395    /// Called after `index_document` returns new chunks.
396    /// Silently skips if the embedding model is unavailable.
397    pub fn embed_and_store_chunks(&self, path: &str, chunks: &[String]) {
398        for (idx, chunk) in chunks.iter().enumerate() {
399            if let Some(vec) = embed_text_blocking(chunk, &self.base_url) {
400                let blob = floats_to_blob(&vec);
401                let db = self.db.lock().unwrap();
402                let _ = db.execute(
403                    "INSERT OR REPLACE INTO chunks_vec (path, chunk_idx, embedding) VALUES (?1, ?2, ?3)",
404                    params![path, idx as i64, blob],
405                );
406            }
407        }
408    }
409
410    // ── Search ────────────────────────────────────────────────────────────────
411
412    /// BM25-ranked full-text search via FTS5 MATCH.
413    pub fn search_bm25(
414        &self,
415        query: &str,
416        limit: usize,
417    ) -> Result<Vec<SearchResult>, Box<dyn std::error::Error>> {
418        // Strip common English stopwords so FTS5 MATCH gets meaningful tokens only.
419        // FTS5 uses implicit AND by default — passing stopwords like "how", "does",
420        // "the" causes zero results because source code never contains those phrases.
421        const STOPWORDS: &[&str] = &[
422            "how", "does", "do", "did", "what", "where", "when", "why", "which", "who", "is",
423            "are", "was", "were", "be", "been", "being", "have", "has", "had", "a", "an", "the",
424            "and", "or", "but", "in", "on", "at", "to", "for", "of", "with", "by", "from", "get",
425            "gets", "got", "work", "works", "make", "makes", "use", "uses", "into", "that", "this",
426            "it", "its",
427        ];
428
429        let safe_query: String = query
430            .chars()
431            .map(|c| {
432                if c.is_alphanumeric() || c == ' ' || c == '_' {
433                    c
434                } else {
435                    ' '
436                }
437            })
438            .collect();
439
440        // Build an OR query from non-stopword tokens so any relevant term matches.
441        let fts_query = safe_query
442            .split_whitespace()
443            .filter(|w| w.len() >= 3 && !STOPWORDS.contains(&w.to_lowercase().as_str()))
444            .collect::<Vec<_>>()
445            .join(" OR ");
446
447        if fts_query.is_empty() {
448            return Ok(Vec::new());
449        }
450
451        let db = self.db.lock().unwrap();
452        let mut stmt = db.prepare(
453            "SELECT chunks_fts.path, chunks_fts.content, rank, cm.last_modified, cm.room
454             FROM chunks_fts
455             JOIN chunks_meta cm ON cm.path = chunks_fts.path
456             WHERE chunks_fts MATCH ?1
457             ORDER BY rank
458             LIMIT ?2",
459        )?;
460
461        let results: Vec<SearchResult> = stmt
462            .query_map(params![fts_query, limit as i64], |row| {
463                Ok(SearchResult {
464                    path: row.get(0)?,
465                    content: row.get(1)?,
466                    score: -(row.get::<_, f64>(2).unwrap_or(0.0) as f32),
467                    last_modified: row.get(3)?,
468                    room: row.get(4)?,
469                })
470            })?
471            .filter_map(|r| r.ok())
472            .collect();
473
474        Ok(results)
475    }
476
477    /// Semantic search: embed the query, cosine-similarity against all stored vectors.
478    /// Returns empty if the embedding model isn't loaded.
479    pub fn search_semantic(&self, query: &str, limit: usize) -> Vec<SearchResult> {
480        let query_vec = match embed_query_blocking(query, &self.base_url) {
481            Some(v) => v,
482            None => return Vec::new(),
483        };
484
485        // Load all stored embeddings.
486        let rows: Vec<(String, i64, Vec<u8>, i64, String)> = {
487            let db = self.db.lock().unwrap();
488            let mut stmt = match db.prepare(
489                "SELECT cv.path, cv.chunk_idx, cv.embedding, cm.last_modified, cm.room
490                 FROM chunks_vec cv
491                 JOIN chunks_meta cm ON cm.path = cv.path",
492            ) {
493                Ok(s) => s,
494                Err(_) => return Vec::new(),
495            };
496            stmt.query_map([], |row| {
497                Ok((
498                    row.get::<_, String>(0)?,
499                    row.get::<_, i64>(1)?,
500                    row.get::<_, Vec<u8>>(2)?,
501                    row.get::<_, i64>(3)?,
502                    row.get::<_, String>(4)?,
503                ))
504            })
505            .ok()
506            .map(|rows| rows.filter_map(|r| r.ok()).collect())
507            .unwrap_or_default()
508        };
509
510        if rows.is_empty() {
511            return Vec::new();
512        }
513
514        // Score each chunk.
515        let mut scored: Vec<(f32, String, i64, i64, String)> = rows
516            .into_iter()
517            .filter_map(|(path, idx, blob, last_modified, room)| {
518                let vec = blob_to_floats(&blob);
519                let sim = cosine_similarity(&query_vec, &vec);
520                Some((sim, path, idx, last_modified, room))
521            })
522            .collect();
523
524        scored.sort_by(|a, b| b.0.partial_cmp(&a.0).unwrap_or(std::cmp::Ordering::Equal));
525        scored.truncate(limit);
526
527        // Fetch the content for the top chunks.
528        let db = self.db.lock().unwrap();
529        scored
530            .into_iter()
531            .filter_map(|(score, path, idx, last_modified, room)| {
532                // chunks_fts rows for this path are indexed in insertion order = chunk order.
533                // We use LIMIT/OFFSET to fetch the chunk at position `idx`.
534                let content: Option<String> = db
535                    .query_row(
536                        "SELECT content FROM chunks_fts WHERE path = ?1 LIMIT 1 OFFSET ?2",
537                        params![path, idx],
538                        |r| r.get(0),
539                    )
540                    .ok();
541                content.map(|c| SearchResult {
542                    path,
543                    content: c,
544                    score,
545                    room,
546                    last_modified,
547                })
548            })
549            .collect()
550    }
551
552    /// Hybrid search: BM25 + semantic, deduplicated and re-ranked.
553    ///
554    /// Semantic results are preferred (they score higher) when the embedding model
555    /// is available. BM25 fills in or takes over when it isn't.
556    /// Results from the active room (hottest subsystem by edit count) get a
557    /// small boost so the model gravitates toward what's currently being worked on.
558    pub fn search_context(
559        &self,
560        query: &str,
561        limit: usize,
562    ) -> Result<Vec<SearchResult>, Box<dyn std::error::Error>> {
563        let candidate_limit = (limit.max(1) * 4).max(12);
564        let bm25 = self.search_bm25(query, candidate_limit).unwrap_or_default();
565        let semantic = self.search_semantic(query, candidate_limit);
566        let signals = QuerySignals::from_query(query);
567
568        // Determine the active room from heat scores.
569        let active_room = self.active_room();
570
571        // Merge: semantic results win ties (scored 1.0–2.0 range after boost).
572        // BM25 results land in 0.0–1.0 range.
573        let mut merged_by_path: HashMap<String, SearchResult> = HashMap::new();
574
575        for r in semantic {
576            let score = reranked_score(&signals, active_room.as_deref(), &r, true);
577            merge_scored_result(&mut merged_by_path, SearchResult { score, ..r });
578        }
579
580        for r in bm25 {
581            let score = reranked_score(&signals, active_room.as_deref(), &r, false);
582            merge_scored_result(&mut merged_by_path, SearchResult { score, ..r });
583        }
584
585        let mut merged: Vec<SearchResult> = merged_by_path.into_values().collect();
586        merged.sort_by(|a, b| {
587            b.score
588                .partial_cmp(&a.score)
589                .unwrap_or(std::cmp::Ordering::Equal)
590        });
591        merged.truncate(limit);
592        Ok(merged)
593    }
594
595    /// Returns the room with the highest total heat (most edited subsystem).
596    /// Used to bias retrieval toward what the user is actively working on.
597    fn active_room(&self) -> Option<String> {
598        let db = self.db.lock().unwrap();
599        db.query_row(
600            "SELECT cm.room, SUM(fh.heat) as total
601             FROM file_heat fh
602             JOIN chunks_meta cm ON cm.path = fh.path
603             GROUP BY cm.room
604             ORDER BY total DESC
605             LIMIT 1",
606            [],
607            |row| row.get::<_, String>(0),
608        )
609        .ok()
610    }
611
612    // ── Project Indexing ──────────────────────────────────────────────────────
613
614    /// Walk the entire project and index all source files (BM25 + embeddings).
615    ///
616    /// Skips: `target/`, `.git/`, `node_modules/`, `.hematite/`, files > 512 KB.
617    /// Also indexes `.hematite/docs/` — the designated reference document drop folder.
618    /// Returns the number of files processed (unchanged files are fast-pathed).
619    pub fn index_project(&mut self) -> usize {
620        let root = crate::tools::file_ops::workspace_root();
621        let mut count = 0usize;
622
623        const INDEXABLE: &[&str] = &[
624            "rs", "toml", "md", "json", "ts", "tsx", "js", "py", "go", "c", "cpp", "h", "yaml",
625            "yml", "txt",
626        ];
627        const SKIP_DIRS: &[&str] = &["target", ".git", "node_modules", ".hematite"];
628
629        for entry in walkdir::WalkDir::new(&root)
630            .follow_links(false)
631            .into_iter()
632            .filter_entry(|e| {
633                if e.file_type().is_dir() {
634                    let name = e.file_name().to_string_lossy();
635                    return !SKIP_DIRS.contains(&name.as_ref());
636                }
637                true
638            })
639            .filter_map(|e| e.ok())
640            .filter(|e| e.file_type().is_file())
641        {
642            let path = entry.path();
643            let ext = path.extension().and_then(|e| e.to_str()).unwrap_or("");
644            if !INDEXABLE.contains(&ext) {
645                continue;
646            }
647
648            let Ok(meta) = std::fs::metadata(path) else {
649                continue;
650            };
651            if meta.len() > 512_000 {
652                continue;
653            }
654
655            let mtime = meta
656                .modified()
657                .map(|t| {
658                    t.duration_since(std::time::UNIX_EPOCH)
659                        .unwrap_or_default()
660                        .as_secs() as i64
661                })
662                .unwrap_or(0);
663
664            let rel = path.strip_prefix(&root).unwrap_or(path);
665            let rel_str = rel.to_string_lossy().replace('\\', "/");
666
667            if let Ok(content) = std::fs::read_to_string(path) {
668                match self.index_document(&rel_str, mtime, &content) {
669                    Ok(new_chunks) if !new_chunks.is_empty() => {
670                        count += 1;
671                    }
672                    Ok(_) => {}
673                    Err(_) => {}
674                }
675            }
676        }
677
678        count += self.index_workspace_artifacts(&root);
679
680        count
681    }
682
683    /// Index workspace-local supporting context that should be available even
684    /// outside a real project workspace: `.hematite/docs/`, recent session
685    /// reports stored in `.hematite/reports/`, and imported chat exports in
686    /// `.hematite/imports/`.
687    pub fn index_workspace_artifacts(&mut self, workspace_root: &std::path::Path) -> usize {
688        let mut count = self.index_docs_folder(workspace_root);
689        count += self.index_recent_session_reports(workspace_root);
690        count += self.index_imported_session_exports(workspace_root);
691        self.backfill_missing_embeddings();
692        count
693    }
694
695    /// Index reference documents in `.hematite/docs/`.
696    /// Supports PDF (text extraction), markdown, and plain text.
697    /// Documents are stored with path prefix `docs/filename` so they are
698    /// distinguishable from source files in retrieval results.
699    fn index_docs_folder(&mut self, workspace_root: &std::path::Path) -> usize {
700        let docs_dir = workspace_root.join(".hematite").join("docs");
701        const DOCS_INDEXABLE: &[&str] = &["pdf", "md", "txt", "markdown"];
702        let mut count = 0usize;
703        let mut desired_paths = HashSet::new();
704
705        if docs_dir.exists() {
706            for entry in walkdir::WalkDir::new(&docs_dir)
707                .max_depth(3)
708                .follow_links(false)
709                .into_iter()
710                .filter_map(|e| e.ok())
711                .filter(|e| e.file_type().is_file())
712            {
713                let path = entry.path();
714                let ext = path
715                    .extension()
716                    .and_then(|e| e.to_str())
717                    .unwrap_or("")
718                    .to_lowercase();
719                if !DOCS_INDEXABLE.contains(&ext.as_str()) {
720                    continue;
721                }
722
723                let Ok(meta) = std::fs::metadata(path) else {
724                    continue;
725                };
726                if meta.len() > 50_000_000 {
727                    continue;
728                }
729
730                let mtime = meta
731                    .modified()
732                    .map(|t| {
733                        t.duration_since(std::time::UNIX_EPOCH)
734                            .unwrap_or_default()
735                            .as_secs() as i64
736                    })
737                    .unwrap_or(0);
738
739                let rel = path.strip_prefix(workspace_root).unwrap_or(path);
740                let rel_str = rel.to_string_lossy().replace('\\', "/");
741                desired_paths.insert(rel_str.clone());
742
743                let content = if ext == "pdf" {
744                    extract_pdf_text(path).ok().flatten()
745                } else {
746                    std::fs::read_to_string(path).ok()
747                };
748
749                if let Some(text) = content {
750                    if text.trim().is_empty() {
751                        continue;
752                    }
753                    match self.index_document(&rel_str, mtime, &text) {
754                        Ok(new_chunks) if !new_chunks.is_empty() => {
755                            count += 1;
756                        }
757                        Ok(_) => {}
758                        Err(_) => {}
759                    }
760                }
761            }
762        }
763
764        self.prune_indexed_prefix(".hematite/docs/", &desired_paths);
765        count
766    }
767
768    /// Index the most recent local session reports by exchange pair so prior
769    /// decisions remain searchable across launches without flooding the vein.
770    pub fn index_recent_session_reports(&mut self, workspace_root: &std::path::Path) -> usize {
771        let reports_dir = workspace_root.join(".hematite").join("reports");
772        let mut count = 0usize;
773        let mut desired_paths = HashSet::new();
774
775        if reports_dir.exists() {
776            let mut reports: Vec<std::path::PathBuf> = std::fs::read_dir(&reports_dir)
777                .ok()
778                .into_iter()
779                .flat_map(|entries| entries.filter_map(|entry| entry.ok()))
780                .map(|entry| entry.path())
781                .filter(|path| {
782                    path.is_file()
783                        && path.extension().and_then(|ext| ext.to_str()) == Some("json")
784                        && path
785                            .file_stem()
786                            .and_then(|stem| stem.to_str())
787                            .map(|stem| stem.starts_with("session_"))
788                            .unwrap_or(false)
789                })
790                .collect();
791
792            reports.sort_by(|a, b| {
793                let a_name = a
794                    .file_name()
795                    .and_then(|name| name.to_str())
796                    .unwrap_or_default();
797                let b_name = b
798                    .file_name()
799                    .and_then(|name| name.to_str())
800                    .unwrap_or_default();
801                b_name.cmp(a_name)
802            });
803            reports.truncate(Self::SESSION_REPORT_LIMIT);
804
805            for report_path in reports {
806                let Ok(meta) = std::fs::metadata(&report_path) else {
807                    continue;
808                };
809                let mtime = meta
810                    .modified()
811                    .map(|t| {
812                        t.duration_since(std::time::UNIX_EPOCH)
813                            .unwrap_or_default()
814                            .as_secs() as i64
815                    })
816                    .unwrap_or(0);
817
818                for exchange in load_session_exchanges(&report_path, mtime) {
819                    desired_paths.insert(exchange.path.clone());
820                    match self.index_chunks_with_room(
821                        &exchange.path,
822                        exchange.last_modified,
823                        "session",
824                        std::slice::from_ref(&exchange.content),
825                    ) {
826                        Ok(new_chunks) if !new_chunks.is_empty() => {
827                            count += 1;
828                        }
829                        Ok(_) => {}
830                        Err(_) => {}
831                    }
832                }
833            }
834        }
835
836        self.prune_indexed_prefix("session/", &desired_paths);
837        count
838    }
839
840    /// Index imported chat exports from `.hematite/imports/`.
841    /// Supported inputs include already-normalized `>` transcripts, Claude Code
842    /// JSONL, Codex CLI JSONL, simple role/content JSON exports, ChatGPT
843    /// `mapping` exports, and Hematite session-report JSON.
844    pub fn index_imported_session_exports(&mut self, workspace_root: &std::path::Path) -> usize {
845        let imports_dir = workspace_root.join(".hematite").join("imports");
846        let mut count = 0usize;
847        let mut desired_paths = HashSet::new();
848
849        if imports_dir.exists() {
850            let mut imports: Vec<(std::path::PathBuf, i64)> = walkdir::WalkDir::new(&imports_dir)
851                .max_depth(4)
852                .follow_links(false)
853                .into_iter()
854                .filter_map(|entry| entry.ok())
855                .filter(|entry| entry.file_type().is_file())
856                .filter_map(|entry| {
857                    let path = entry.into_path();
858                    let ext = path
859                        .extension()
860                        .and_then(|ext| ext.to_str())
861                        .unwrap_or("")
862                        .to_ascii_lowercase();
863                    if !matches!(ext.as_str(), "json" | "jsonl" | "md" | "txt") {
864                        return None;
865                    }
866                    let meta = std::fs::metadata(&path).ok()?;
867                    if meta.len() > Self::IMPORT_MAX_BYTES {
868                        return None;
869                    }
870                    let mtime = meta
871                        .modified()
872                        .map(|t| {
873                            t.duration_since(std::time::UNIX_EPOCH)
874                                .unwrap_or_default()
875                                .as_secs() as i64
876                        })
877                        .unwrap_or(0);
878                    Some((path, mtime))
879                })
880                .collect();
881
882            imports.sort_by(|(a_path, a_mtime), (b_path, b_mtime)| {
883                b_mtime
884                    .cmp(a_mtime)
885                    .then_with(|| a_path.to_string_lossy().cmp(&b_path.to_string_lossy()))
886            });
887            imports.truncate(Self::IMPORT_FILE_LIMIT);
888
889            for (import_path, mtime) in imports {
890                for exchange in load_imported_session_exchanges(&import_path, &imports_dir, mtime) {
891                    desired_paths.insert(exchange.path.clone());
892                    match self.index_chunks_with_room(
893                        &exchange.path,
894                        exchange.last_modified,
895                        "session",
896                        std::slice::from_ref(&exchange.content),
897                    ) {
898                        Ok(new_chunks) if !new_chunks.is_empty() => {
899                            count += 1;
900                        }
901                        Ok(_) => {}
902                        Err(_) => {}
903                    }
904                }
905            }
906        }
907
908        self.prune_indexed_prefix("session/imports/", &desired_paths);
909        count
910    }
911
912    /// Embed any FTS chunks that don't yet have a vector in chunks_vec.
913    /// Called at the end of index_project so that loading the embedding model
914    /// after the initial index automatically triggers a semantic upgrade on the
915    /// next agent turn — no /forget or file-touch required.
916    fn backfill_missing_embeddings(&self) {
917        // Fast path: if chunk counts match, nothing to do.
918        let (fts_count, vec_count) = {
919            let db = self.db.lock().unwrap();
920            let fts: i64 = db
921                .query_row("SELECT COUNT(*) FROM chunks_fts", [], |r| r.get(0))
922                .unwrap_or(0);
923            let vec: i64 = db
924                .query_row("SELECT COUNT(*) FROM chunks_vec", [], |r| r.get(0))
925                .unwrap_or(0);
926            (fts, vec)
927        };
928        if fts_count == 0 || fts_count == vec_count {
929            return;
930        }
931
932        // Fetch (path, chunk_idx, content) for chunks with no embedding.
933        // chunks_fts rowid serves as chunk_idx (1-based → convert to 0-based).
934        let missing: Vec<(String, i64, String)> = {
935            let db = self.db.lock().unwrap();
936            let mut stmt = db
937                .prepare(
938                    "SELECT f.path, (f.rowid - 1) AS chunk_idx, f.content
939                     FROM chunks_fts f
940                     LEFT JOIN chunks_vec v ON f.path = v.path AND (f.rowid - 1) = v.chunk_idx
941                     WHERE v.path IS NULL
942                     ORDER BY CASE
943                         WHEN f.path LIKE '%.rs' THEN 0
944                         WHEN f.path LIKE '%.toml' THEN 1
945                         WHEN f.path LIKE '%.json' THEN 2
946                         ELSE 3
947                     END, f.path
948                     LIMIT 20",
949                )
950                .unwrap();
951            stmt.query_map([], |r| {
952                Ok((
953                    r.get::<_, String>(0)?,
954                    r.get::<_, i64>(1)?,
955                    r.get::<_, String>(2)?,
956                ))
957            })
958            .unwrap()
959            .filter_map(|r| r.ok())
960            .collect()
961        };
962
963        for (path, idx, content) in missing {
964            if let Some(vec) = embed_text_blocking(&content, &self.base_url) {
965                let blob = floats_to_blob(&vec);
966                let db = self.db.lock().unwrap();
967                let _ = db.execute(
968                    "INSERT OR REPLACE INTO chunks_vec (path, chunk_idx, embedding) VALUES (?1, ?2, ?3)",
969                    params![path, idx, blob],
970                );
971            } else {
972                // Embedding model not available — stop trying for this pass.
973                break;
974            }
975        }
976    }
977
978    /// Total number of unique files currently indexed.
979    /// Session exchange chunks are excluded so status counts stay source/doc centric.
980    pub fn file_count(&self) -> usize {
981        let db = self.db.lock().unwrap();
982        db.query_row(
983            "SELECT COUNT(*) FROM chunks_meta WHERE path NOT LIKE 'session/%'",
984            [],
985            |r| r.get::<_, i64>(0),
986        )
987        .unwrap_or(0) as usize
988    }
989
990    /// Number of source/doc chunks that have semantic embedding vectors stored.
991    /// Session exchange chunks are excluded so status counts stay source/doc centric.
992    pub fn embedded_chunk_count(&self) -> usize {
993        let db = self.db.lock().unwrap();
994        db.query_row(
995            "SELECT COUNT(*) FROM chunks_vec WHERE path NOT LIKE 'session/%'",
996            [],
997            |r| r.get::<_, i64>(0),
998        )
999        .unwrap_or(0) as usize
1000    }
1001
1002    /// True when any chunk type currently has embeddings available.
1003    pub fn has_any_embeddings(&self) -> bool {
1004        let db = self.db.lock().unwrap();
1005        db.query_row("SELECT EXISTS(SELECT 1 FROM chunks_vec LIMIT 1)", [], |r| {
1006            r.get::<_, i64>(0)
1007        })
1008        .unwrap_or(0)
1009            != 0
1010    }
1011
1012    /// Wipe all indexed data. The DB file stays on disk; next index_project()
1013    /// call rebuilds from scratch (re-reads all files, re-embeds all chunks).
1014    pub fn reset(&self) {
1015        let db = self.db.lock().unwrap();
1016        let _ = db.execute_batch(
1017            "DELETE FROM chunks_fts;
1018             DELETE FROM chunks_vec;
1019             DELETE FROM chunks_meta;",
1020        );
1021    }
1022
1023    /// Return a compact operator-facing snapshot of what The Vein currently knows.
1024    /// Intended for trust/debug surfaces like `/vein-inspect`.
1025    pub fn inspect_snapshot(&self, hot_limit: usize) -> VeinInspectionSnapshot {
1026        let db = self.db.lock().unwrap();
1027        let indexed_source_files = db
1028            .query_row(
1029                "SELECT COUNT(*) FROM chunks_meta
1030                 WHERE path NOT LIKE 'session/%'
1031                   AND path NOT LIKE '.hematite/docs/%'",
1032                [],
1033                |r| r.get::<_, i64>(0),
1034            )
1035            .unwrap_or(0) as usize;
1036        let indexed_docs = db
1037            .query_row(
1038                "SELECT COUNT(*) FROM chunks_meta WHERE path LIKE '.hematite/docs/%'",
1039                [],
1040                |r| r.get::<_, i64>(0),
1041            )
1042            .unwrap_or(0) as usize;
1043        let indexed_session_exchanges = db
1044            .query_row(
1045                "SELECT COUNT(*) FROM chunks_meta WHERE path LIKE 'session/%'",
1046                [],
1047                |r| r.get::<_, i64>(0),
1048            )
1049            .unwrap_or(0) as usize;
1050        let embedded_source_doc_chunks = db
1051            .query_row(
1052                "SELECT COUNT(*) FROM chunks_vec WHERE path NOT LIKE 'session/%'",
1053                [],
1054                |r| r.get::<_, i64>(0),
1055            )
1056            .unwrap_or(0) as usize;
1057        let has_any_embeddings = db
1058            .query_row("SELECT EXISTS(SELECT 1 FROM chunks_vec LIMIT 1)", [], |r| {
1059                r.get::<_, i64>(0)
1060            })
1061            .unwrap_or(0)
1062            != 0;
1063        drop(db);
1064
1065        let hot_files = self
1066            .hot_files(hot_limit.max(1))
1067            .into_iter()
1068            .map(|(path, heat, last_modified, room)| VeinHotFile {
1069                path,
1070                heat,
1071                last_modified,
1072                room,
1073            })
1074            .collect::<Vec<_>>();
1075
1076        VeinInspectionSnapshot {
1077            indexed_source_files,
1078            indexed_docs,
1079            indexed_session_exchanges,
1080            embedded_source_doc_chunks,
1081            has_any_embeddings,
1082            active_room: self.active_room(),
1083            l1_ready: !hot_files.is_empty(),
1084            hot_files,
1085        }
1086    }
1087
1088    // ── L1 heat tracking ──────────────────────────────────────────────────────
1089
1090    /// Record an edit to a file. Increments its heat score in file_heat.
1091    /// Called from the tool dispatch after a successful edit_file / write_file /
1092    /// patch_hunk / multi_search_replace so the L1 context stays current.
1093    pub fn bump_heat(&self, path: &str) {
1094        if path.is_empty() {
1095            return;
1096        }
1097        let now = std::time::SystemTime::now()
1098            .duration_since(std::time::UNIX_EPOCH)
1099            .unwrap_or_default()
1100            .as_secs() as i64;
1101        let db = self.db.lock().unwrap();
1102        let _ = db.execute(
1103            "INSERT INTO file_heat (path, heat, last_edit) VALUES (?1, 1, ?2)
1104             ON CONFLICT(path) DO UPDATE SET heat = heat + 1, last_edit = ?2",
1105            params![path, now],
1106        );
1107    }
1108
1109    /// Return the top N hot files ranked by edit count (heat) then recency.
1110    /// Joins file_heat with chunks_meta so only indexed files are included.
1111    /// Returns (path, heat, mtime, room).
1112    fn hot_files(&self, n: usize) -> Vec<(String, i64, i64, String)> {
1113        let db = self.db.lock().unwrap();
1114        let mut stmt = match db.prepare(
1115            "SELECT fh.path, fh.heat, cm.last_modified, cm.room
1116             FROM file_heat fh
1117             JOIN chunks_meta cm ON cm.path = fh.path
1118             ORDER BY fh.heat DESC, cm.last_modified DESC
1119             LIMIT ?1",
1120        ) {
1121            Ok(s) => s,
1122            Err(_) => return vec![],
1123        };
1124        stmt.query_map(params![n as i64], |row| {
1125            Ok((
1126                row.get::<_, String>(0)?,
1127                row.get::<_, i64>(1)?,
1128                row.get::<_, i64>(2)?,
1129                row.get::<_, String>(3)?,
1130            ))
1131        })
1132        .map(|rows| rows.filter_map(|r| r.ok()).collect())
1133        .unwrap_or_default()
1134    }
1135
1136    /// Return the paths of the top hot files (most edited).
1137    /// Used by RepoMapGenerator to bias PageRank toward active files.
1138    pub fn hot_file_paths(&self, n: usize) -> Vec<String> {
1139        self.hot_files(n)
1140            .into_iter()
1141            .map(|(path, _, _, _)| path)
1142            .collect()
1143    }
1144
1145    /// Return hot files with normalized heat weights in [0.0, 1.0].
1146    /// The hottest file gets weight 1.0; others are scaled proportionally.
1147    /// Used by RepoMapGenerator to apply heat-weighted PageRank personalization.
1148    pub fn hot_files_weighted(&self, n: usize) -> Vec<(String, f64)> {
1149        let files = self.hot_files(n);
1150        if files.is_empty() {
1151            return vec![];
1152        }
1153        let max_heat = files.iter().map(|(_, h, _, _)| *h).max().unwrap_or(1).max(1) as f64;
1154        files
1155            .into_iter()
1156            .map(|(path, heat, _, _)| {
1157                let weight = (heat as f64) / max_heat;
1158                (path, weight)
1159            })
1160            .collect()
1161    }
1162
1163    /// Build the L1 context block — a compact "hot files" summary injected into
1164    /// the system prompt at session start. Capped at ~150 tokens.
1165    /// Files are grouped by room so the model sees subsystem structure at a glance.
1166    /// Returns None when there are no heat records yet (fresh project).
1167    pub fn l1_context(&self) -> Option<String> {
1168        let files = self.hot_files(8);
1169        if files.is_empty() {
1170            return None;
1171        }
1172        let now = std::time::SystemTime::now()
1173            .duration_since(std::time::UNIX_EPOCH)
1174            .unwrap_or_default()
1175            .as_secs() as i64;
1176
1177        // Group by room for readability.
1178        let mut by_room: std::collections::BTreeMap<String, Vec<(String, i64, i64)>> =
1179            std::collections::BTreeMap::new();
1180        for (path, heat, mtime, room) in &files {
1181            by_room
1182                .entry(room.clone())
1183                .or_default()
1184                .push((path.clone(), *heat, *mtime));
1185        }
1186
1187        let mut out = String::from("# Hot Files (most edited — grouped by subsystem)\n");
1188        for (room, entries) in &by_room {
1189            out.push_str(&format!("[{}]\n", room));
1190            for (path, heat, mtime) in entries {
1191                let age_secs = now - mtime;
1192                let age = if age_secs < 3600 {
1193                    "just now".to_string()
1194                } else if age_secs < 86400 {
1195                    format!("{}h ago", age_secs / 3600)
1196                } else {
1197                    format!("{}d ago", age_secs / 86400)
1198                };
1199                out.push_str(&format!(
1200                    "  - {} [{} edit{}, {}]\n",
1201                    path,
1202                    heat,
1203                    if *heat == 1 { "" } else { "s" },
1204                    age
1205                ));
1206            }
1207        }
1208        Some(out)
1209    }
1210
1211    fn prune_indexed_prefix(&self, prefix: &str, desired_paths: &HashSet<String>) {
1212        let pattern = format!("{}%", prefix);
1213        let existing_paths: Vec<String> = {
1214            let db = self.db.lock().unwrap();
1215            let mut stmt = match db.prepare("SELECT path FROM chunks_meta WHERE path LIKE ?1") {
1216                Ok(stmt) => stmt,
1217                Err(_) => return,
1218            };
1219            stmt.query_map(params![pattern], |row| row.get::<_, String>(0))
1220                .map(|rows| rows.filter_map(|row| row.ok()).collect())
1221                .unwrap_or_default()
1222        };
1223
1224        if existing_paths.is_empty() {
1225            return;
1226        }
1227
1228        let db = self.db.lock().unwrap();
1229        for path in existing_paths {
1230            if desired_paths.contains(&path) {
1231                continue;
1232            }
1233            let _ = db.execute("DELETE FROM chunks_fts WHERE path = ?1", params![path]);
1234            let _ = db.execute("DELETE FROM chunks_vec WHERE path = ?1", params![path]);
1235            let _ = db.execute("DELETE FROM chunks_meta WHERE path = ?1", params![path]);
1236        }
1237    }
1238}
1239
1240impl QuerySignals {
1241    fn from_query(query: &str) -> Self {
1242        let lower = query.to_ascii_lowercase();
1243        let historical_memory_hint = [
1244            "remember",
1245            "earlier",
1246            "previous",
1247            "last time",
1248            "what did we decide",
1249            "why did we decide",
1250            "what did we say",
1251            "why did we change",
1252        ]
1253        .iter()
1254        .any(|needle| lower.contains(needle));
1255
1256        Self {
1257            exact_phrases: extract_exact_phrases(query),
1258            standout_terms: extract_standout_terms(query),
1259            historical_memory_hint,
1260            temporal_reference: extract_temporal_reference(query),
1261        }
1262    }
1263}
1264
1265fn merge_scored_result(
1266    merged_by_path: &mut HashMap<String, SearchResult>,
1267    candidate: SearchResult,
1268) {
1269    match merged_by_path.get_mut(&candidate.path) {
1270        Some(existing) if candidate.score > existing.score => *existing = candidate,
1271        Some(_) => {}
1272        None => {
1273            merged_by_path.insert(candidate.path.clone(), candidate);
1274        }
1275    }
1276}
1277
1278fn reranked_score(
1279    signals: &QuerySignals,
1280    active_room: Option<&str>,
1281    result: &SearchResult,
1282    is_semantic: bool,
1283) -> f32 {
1284    let base = if is_semantic {
1285        1.0 + result.score.clamp(0.0, 1.0)
1286    } else {
1287        (result.score / 10.0).clamp(0.0, 1.0)
1288    };
1289    base + room_bias(active_room, result)
1290        + retrieval_signal_boost(signals, result)
1291        + temporal_memory_boost(signals, result)
1292}
1293
1294fn room_bias(active_room: Option<&str>, result: &SearchResult) -> f32 {
1295    if active_room == Some(result.room.as_str()) {
1296        0.15
1297    } else {
1298        0.0
1299    }
1300}
1301
1302fn retrieval_signal_boost(signals: &QuerySignals, result: &SearchResult) -> f32 {
1303    let mut boost = 0.0f32;
1304    let haystack = format!(
1305        "{}\n{}",
1306        result.path.to_ascii_lowercase(),
1307        result.content.to_ascii_lowercase()
1308    );
1309
1310    let phrase_matches = signals
1311        .exact_phrases
1312        .iter()
1313        .filter(|phrase| haystack.contains(phrase.as_str()))
1314        .count();
1315    if phrase_matches > 0 {
1316        boost += 0.35 + ((phrase_matches.saturating_sub(1)) as f32 * 0.1);
1317    }
1318
1319    let mut standout_matches = 0;
1320    for term in &signals.standout_terms {
1321        if result.path.to_ascii_lowercase().contains(term.as_str()) {
1322            boost += 0.40;
1323            standout_matches += 1;
1324        } else if result.content.to_ascii_lowercase().contains(term.as_str()) {
1325            boost += 0.12;
1326            standout_matches += 1;
1327        }
1328        if standout_matches >= 3 {
1329            break;
1330        }
1331    }
1332
1333    if signals.historical_memory_hint && result.room == "session" {
1334        boost += 0.45;
1335    }
1336
1337    boost
1338}
1339
1340fn temporal_memory_boost(signals: &QuerySignals, result: &SearchResult) -> f32 {
1341    if result.room != "session" {
1342        return 0.0;
1343    }
1344    let Some(reference) = signals.temporal_reference else {
1345        return 0.0;
1346    };
1347    let Some(memory_ts) = session_memory_timestamp(result) else {
1348        return 0.0;
1349    };
1350
1351    let span = reference.window_secs.max(86_400);
1352    let full_fade = span.saturating_mul(8);
1353    if full_fade <= 0 {
1354        return 0.0;
1355    }
1356
1357    let distance = (memory_ts - reference.target_ts).abs();
1358    let closeness = 1.0 - (distance as f32 / full_fade as f32).min(1.0);
1359    if closeness <= 0.0 {
1360        0.0
1361    } else {
1362        0.22 * closeness
1363    }
1364}
1365
/// Collect the phrases a query wraps in `"…"`, `'…'` or `` `…` ``.
///
/// Each phrase is trimmed and ASCII-lowercased; phrases shorter than three
/// bytes and duplicates are dropped. Order of first appearance is kept.
///
/// A phrase is only recognized when its opening delimiter has a matching
/// closing delimiter. A `'` sitting between two alphanumeric characters is an
/// apostrophe (`don't`, `it's`) and neither opens nor closes a phrase, so
/// contractions cannot swallow the rest of the query or hide a later quoted
/// phrase.
fn extract_exact_phrases(query: &str) -> Vec<String> {
    let chars: Vec<char> = query.chars().collect();

    // True when the `'` at `idx` is inside a word rather than a delimiter.
    let is_apostrophe = |idx: usize| -> bool {
        chars[idx] == '\''
            && idx > 0
            && chars[idx - 1].is_alphanumeric()
            && chars
                .get(idx + 1)
                .map_or(false, |next| next.is_alphanumeric())
    };

    let mut phrases: Vec<String> = Vec::new();
    let mut i = 0usize;

    while i < chars.len() {
        let quote = chars[i];
        if !matches!(quote, '"' | '\'' | '`') || is_apostrophe(i) {
            i += 1;
            continue;
        }

        let start = i + 1;
        let closing = (start..chars.len()).find(|&idx| chars[idx] == quote && !is_apostrophe(idx));
        let Some(end) = closing else {
            // Unterminated delimiter: treat it as ordinary text and keep
            // scanning so later, properly closed phrases are still found.
            i = start;
            continue;
        };

        if end > start {
            let phrase = chars[start..end]
                .iter()
                .collect::<String>()
                .trim()
                .to_ascii_lowercase();
            if phrase.len() >= 3 && !phrases.contains(&phrase) {
                phrases.push(phrase);
            }
        }
        i = end + 1;
    }

    phrases
}
1397
/// Pick out the distinctive tokens of a query, lower-cased and de-duplicated.
///
/// Tokens are runs of ASCII alphanumerics plus `_ - . / :`. A token is kept
/// when it is at least four bytes long, is not a stopword, and looks
/// distinctive: it contains a digit, an uppercase letter or one of the
/// punctuation characters above, or is at least nine bytes long.
fn extract_standout_terms(query: &str) -> Vec<String> {
    const STOPWORDS: &[&str] = &[
        "about", "after", "before", "change", "changed", "decide", "decided", "does", "earlier",
        "flow", "from", "have", "into", "just", "last", "local", "make", "more", "remember",
        "should", "that", "their", "there", "these", "they", "this", "those", "what", "when",
        "where", "which", "why", "with", "work",
    ];

    // Punctuation that may appear inside identifiers, paths and URLs.
    fn is_term_punct(ch: char) -> bool {
        matches!(ch, '_' | '-' | '.' | '/' | ':')
    }

    let mut terms: Vec<String> = Vec::new();
    let tokens = query.split(|ch: char| !(ch.is_ascii_alphanumeric() || is_term_punct(ch)));

    for raw in tokens.map(str::trim).filter(|token| token.len() >= 4) {
        let lowered = raw.to_ascii_lowercase();
        if STOPWORDS.contains(&lowered.as_str()) {
            continue;
        }

        let distinctive = raw.len() >= 9
            || raw
                .chars()
                .any(|ch| ch.is_ascii_digit() || ch.is_ascii_uppercase() || is_term_punct(ch));

        if distinctive && !terms.contains(&lowered) {
            terms.push(lowered);
        }
    }

    terms
}
1433
1434fn extract_temporal_reference(query: &str) -> Option<TemporalReference> {
1435    if let Some(ts) = extract_iso_date_from_query(query) {
1436        return Some(TemporalReference {
1437            target_ts: ts,
1438            window_secs: 86_400,
1439        });
1440    }
1441
1442    let now = current_unix_timestamp();
1443    let lower = query.to_ascii_lowercase();
1444    if lower.contains("yesterday") {
1445        Some(TemporalReference {
1446            target_ts: now.saturating_sub(86_400),
1447            window_secs: 86_400,
1448        })
1449    } else if lower.contains("today") || lower.contains("earlier today") {
1450        Some(TemporalReference {
1451            target_ts: now,
1452            window_secs: 86_400,
1453        })
1454    } else if lower.contains("last week") {
1455        Some(TemporalReference {
1456            target_ts: now.saturating_sub(7 * 86_400),
1457            window_secs: 7 * 86_400,
1458        })
1459    } else if lower.contains("last month") {
1460        Some(TemporalReference {
1461            target_ts: now.saturating_sub(30 * 86_400),
1462            window_secs: 30 * 86_400,
1463        })
1464    } else {
1465        None
1466    }
1467}
1468
1469fn extract_iso_date_from_query(query: &str) -> Option<i64> {
1470    query
1471        .split(|ch: char| !(ch.is_ascii_digit() || ch == '-'))
1472        .find_map(parse_iso_date_token)
1473}
1474
/// Parse a strict `YYYY-MM-DD` token into a Unix timestamp in seconds
/// (the start of that day, counted from 1970-01-01).
///
/// Returns `None` unless the token is exactly ten bytes of the form
/// `dddd-dd-dd` (ASCII digits only, so signed years are rejected) and names a
/// real calendar date: month 1–12 and a day that exists in that month, with
/// February 29 accepted only in leap years.
fn parse_iso_date_token(token: &str) -> Option<i64> {
    // Number of days in `month` of `year` (proleptic Gregorian calendar).
    fn days_in_month(year: i32, month: u32) -> u32 {
        match month {
            4 | 6 | 9 | 11 => 30,
            2 => {
                let leap = (year % 4 == 0 && year % 100 != 0) || year % 400 == 0;
                if leap {
                    29
                } else {
                    28
                }
            }
            _ => 31,
        }
    }

    let bytes = token.as_bytes();
    if bytes.len() != 10 {
        return None;
    }
    let well_formed = bytes.iter().enumerate().all(|(idx, byte)| match idx {
        4 | 7 => *byte == b'-',
        _ => byte.is_ascii_digit(),
    });
    if !well_formed {
        return None;
    }

    // The token is pure ASCII at this point, so byte-range slicing is safe.
    let year = token[0..4].parse::<i32>().ok()?;
    let month = token[5..7].parse::<u32>().ok()?;
    let day = token[8..10].parse::<u32>().ok()?;
    if !(1..=12).contains(&month) || day == 0 || day > days_in_month(year, month) {
        return None;
    }

    Some(days_from_civil(year, month, day).saturating_mul(86_400))
}

/// Days between 1970-01-01 and the given civil date (negative before the epoch).
///
/// Works on 400-year eras with the year shifted to start in March, so the
/// leap day falls at the end of the shifted year. `month` is expected to be
/// 1–12 and `day` 1–31; the values are not validated here.
fn days_from_civil(year: i32, month: u32, day: u32) -> i64 {
    // January and February belong to the previous March-based year.
    let year = if month <= 2 { year - 1 } else { year };
    let era = year.div_euclid(400);
    let yoe = year.rem_euclid(400);
    // Month index within the March-based year: March = 0 … February = 11.
    let month_prime = if month > 2 {
        month as i32 - 3
    } else {
        month as i32 + 9
    };
    let doy = (153 * month_prime + 2) / 5 + day as i32 - 1;
    let doe = yoe * 365 + yoe / 4 - yoe / 100 + doy;
    // 719_468 is the day count from 0000-03-01 to 1970-01-01.
    era as i64 * 146_097 + doe as i64 - 719_468
}
1503
/// Current wall-clock time as whole seconds since the Unix epoch.
/// A system clock set before the epoch yields 0.
fn current_unix_timestamp() -> i64 {
    match std::time::SystemTime::now().duration_since(std::time::UNIX_EPOCH) {
        Ok(elapsed) => elapsed.as_secs() as i64,
        Err(_) => 0,
    }
}
1510
1511fn session_memory_timestamp(result: &SearchResult) -> Option<i64> {
1512    extract_session_path_timestamp(&result.path).or_else(|| {
1513        if result.last_modified > 0 {
1514            Some(result.last_modified)
1515        } else {
1516            None
1517        }
1518    })
1519}
1520
1521fn extract_session_path_timestamp(path: &str) -> Option<i64> {
1522    let normalized = path.replace('\\', "/");
1523    let mut parts = normalized.split('/');
1524    if parts.next()? != "session" {
1525        return None;
1526    }
1527    parse_iso_date_token(parts.next()?)
1528}
1529
1530fn session_speaker_kind(speaker: &str) -> SessionSpeakerKind {
1531    let normalized = speaker.trim().to_ascii_lowercase();
1532    match normalized.as_str() {
1533        "you" | "user" => SessionSpeakerKind::User,
1534        "" | "system" | "tool" => SessionSpeakerKind::Ignore,
1535        _ => SessionSpeakerKind::Assistant,
1536    }
1537}
1538
1539fn load_session_exchanges(report_path: &Path, last_modified: i64) -> Vec<SessionExchange> {
1540    let Ok(raw) = std::fs::read_to_string(report_path) else {
1541        return Vec::new();
1542    };
1543    let Ok(report) = serde_json::from_str::<SessionReport>(&raw) else {
1544        return Vec::new();
1545    };
1546
1547    let session_key = report_path
1548        .file_stem()
1549        .and_then(|stem| stem.to_str())
1550        .and_then(|stem| stem.strip_prefix("session_").or(Some(stem)))
1551        .unwrap_or("unknown-session")
1552        .to_string();
1553    let session_date = report
1554        .session_start
1555        .split('_')
1556        .next()
1557        .filter(|date| !date.is_empty())
1558        .unwrap_or_else(|| session_key.split('_').next().unwrap_or("unknown-date"))
1559        .to_string();
1560
1561    let mut exchanges = Vec::new();
1562    let mut pending_user: Option<String> = None;
1563    let mut turn_index = 0usize;
1564
1565    for entry in report.transcript {
1566        match session_speaker_kind(&entry.speaker) {
1567            SessionSpeakerKind::User => {
1568                let text = entry.text.trim();
1569                if !text.is_empty() {
1570                    pending_user = Some(text.to_string());
1571                }
1572            }
1573            SessionSpeakerKind::Assistant => {
1574                let text = entry.text.trim();
1575                if text.is_empty() {
1576                    continue;
1577                }
1578                let Some(user_text) = pending_user.take() else {
1579                    continue;
1580                };
1581                turn_index += 1;
1582                exchanges.push(SessionExchange {
1583                    path: format!(
1584                        "session/{}/{}/turn-{}",
1585                        session_date, session_key, turn_index
1586                    ),
1587                    last_modified,
1588                    content: format!(
1589                        "Earlier session exchange\nUser:\n{}\n\nAssistant:\n{}",
1590                        user_text, text
1591                    ),
1592                });
1593            }
1594            SessionSpeakerKind::Ignore => {}
1595        }
1596    }
1597
1598    if exchanges.len() > Vein::SESSION_TURN_LIMIT {
1599        let keep_from = exchanges.len() - Vein::SESSION_TURN_LIMIT;
1600        exchanges = exchanges.into_iter().skip(keep_from).collect();
1601    }
1602
1603    exchanges
1604}
1605
1606fn load_imported_session_exchanges(
1607    import_path: &Path,
1608    imports_root: &Path,
1609    last_modified: i64,
1610) -> Vec<SessionExchange> {
1611    let Ok(raw) = std::fs::read_to_string(import_path) else {
1612        return Vec::new();
1613    };
1614
1615    let messages = normalize_import_messages(&raw, import_path);
1616    if messages.is_empty() {
1617        return Vec::new();
1618    }
1619
1620    let rel = import_path
1621        .strip_prefix(imports_root)
1622        .unwrap_or(import_path);
1623    let rel_slug = slugify_import_path(rel);
1624    let mut exchanges = Vec::new();
1625    let mut pending_user: Option<String> = None;
1626    let mut turn_index = 0usize;
1627
1628    for (role, text) in messages {
1629        let cleaned = text.trim();
1630        if cleaned.is_empty() {
1631            continue;
1632        }
1633        match role.as_str() {
1634            "user" => pending_user = Some(cleaned.to_string()),
1635            "assistant" => {
1636                let Some(user_text) = pending_user.take() else {
1637                    continue;
1638                };
1639                turn_index += 1;
1640                exchanges.push(SessionExchange {
1641                    path: format!("session/imports/{}/turn-{}", rel_slug, turn_index),
1642                    last_modified,
1643                    content: format!(
1644                        "Imported session exchange\nSource: .hematite/imports/{}\n\nUser:\n{}\n\nAssistant:\n{}",
1645                        rel.to_string_lossy().replace('\\', "/"),
1646                        user_text,
1647                        cleaned
1648                    ),
1649                });
1650            }
1651            _ => {}
1652        }
1653    }
1654
1655    if exchanges.len() > Vein::SESSION_TURN_LIMIT {
1656        let keep_from = exchanges.len() - Vein::SESSION_TURN_LIMIT;
1657        exchanges = exchanges.into_iter().skip(keep_from).collect();
1658    }
1659
1660    exchanges
1661}
1662
1663fn normalize_import_messages(raw: &str, import_path: &Path) -> Vec<(String, String)> {
1664    if raw.trim().is_empty() {
1665        return Vec::new();
1666    }
1667
1668    if let Some(messages) = parse_marker_transcript(raw) {
1669        return messages;
1670    }
1671
1672    let ext = import_path
1673        .extension()
1674        .and_then(|ext| ext.to_str())
1675        .unwrap_or("")
1676        .to_ascii_lowercase();
1677
1678    if matches!(ext.as_str(), "json" | "jsonl")
1679        || matches!(raw.trim().chars().next(), Some('{') | Some('['))
1680    {
1681        if let Some(messages) = parse_jsonl_messages(raw) {
1682            if !messages.is_empty() {
1683                return messages;
1684            }
1685        }
1686
1687        if let Ok(value) = serde_json::from_str::<Value>(raw) {
1688            if let Some(messages) = parse_session_report_messages(&value) {
1689                return messages;
1690            }
1691            if let Some(messages) = parse_simple_role_messages(&value) {
1692                return messages;
1693            }
1694            if let Some(messages) = parse_chatgpt_mapping_messages(&value) {
1695                return messages;
1696            }
1697        }
1698    }
1699
1700    Vec::new()
1701}
1702
/// Parse a plain-text transcript in which user prompts are lines starting with `> `.
///
/// Returns `None` unless at least two marker lines are present. Each marker line becomes
/// a `"user"` message (text after the marker, trimmed). The non-blank lines that follow,
/// up to the next marker and excluding `---` separators, are trimmed and joined with
/// newlines into one `"assistant"` message; no assistant message is emitted when there
/// are no such lines. Anything before the first marker is ignored.
fn parse_marker_transcript(raw: &str) -> Option<Vec<(String, String)>> {
    /// Text following the `> ` marker, if `line` is a prompt line.
    fn prompt_text(line: &str) -> Option<&str> {
        line.trim_start().strip_prefix("> ")
    }

    let marker_count = raw
        .lines()
        .filter(|line| prompt_text(line).is_some())
        .count();
    if marker_count < 2 {
        return None;
    }

    let mut turns: Vec<(String, String)> = Vec::new();
    let mut reply: Vec<&str> = Vec::new();
    let mut inside_turn = false;

    for line in raw.lines() {
        match prompt_text(line) {
            Some(prompt) => {
                // A new prompt closes the reply collected for the previous one.
                if !reply.is_empty() {
                    turns.push(("assistant".to_string(), reply.join("\n")));
                    reply.clear();
                }
                turns.push(("user".to_string(), prompt.trim().to_string()));
                inside_turn = true;
            }
            None if inside_turn => {
                let content = line.trim();
                if !content.is_empty() && content != "---" {
                    reply.push(content);
                }
            }
            None => {}
        }
    }

    if !reply.is_empty() {
        turns.push(("assistant".to_string(), reply.join("\n")));
    }

    (!turns.is_empty()).then_some(turns)
}
1743
1744fn parse_jsonl_messages(raw: &str) -> Option<Vec<(String, String)>> {
1745    let mut messages = Vec::new();
1746    let mut has_codex_session_meta = false;
1747    let mut saw_jsonl = false;
1748
1749    for line in raw.lines() {
1750        let trimmed = line.trim();
1751        if trimmed.is_empty() {
1752            continue;
1753        }
1754        let Ok(value) = serde_json::from_str::<Value>(trimmed) else {
1755            continue;
1756        };
1757        saw_jsonl = true;
1758        let Some(object) = value.as_object() else {
1759            continue;
1760        };
1761
1762        match object.get("type").and_then(|v| v.as_str()).unwrap_or("") {
1763            "session_meta" => {
1764                has_codex_session_meta = true;
1765            }
1766            "event_msg" => {
1767                let Some(payload) = object.get("payload").and_then(|v| v.as_object()) else {
1768                    continue;
1769                };
1770                let Some(text) = payload.get("message").and_then(|v| v.as_str()) else {
1771                    continue;
1772                };
1773                match payload.get("type").and_then(|v| v.as_str()).unwrap_or("") {
1774                    "user_message" => messages.push(("user".to_string(), text.trim().to_string())),
1775                    "agent_message" => {
1776                        messages.push(("assistant".to_string(), text.trim().to_string()))
1777                    }
1778                    _ => {}
1779                }
1780            }
1781            "human" | "user" => {
1782                if let Some(text) = extract_text_content(object.get("message").unwrap_or(&value)) {
1783                    messages.push(("user".to_string(), text));
1784                }
1785            }
1786            "assistant" => {
1787                if let Some(text) = extract_text_content(object.get("message").unwrap_or(&value)) {
1788                    messages.push(("assistant".to_string(), text));
1789                }
1790            }
1791            _ => {
1792                if let Some(role) = object.get("role").and_then(|v| v.as_str()) {
1793                    if let Some(text) = extract_text_content(&value) {
1794                        match role {
1795                            "user" | "human" => messages.push(("user".to_string(), text)),
1796                            "assistant" | "ai" => messages.push(("assistant".to_string(), text)),
1797                            _ => {}
1798                        }
1799                    }
1800                }
1801            }
1802        }
1803    }
1804
1805    if !saw_jsonl {
1806        return None;
1807    }
1808
1809    if has_codex_session_meta || !messages.is_empty() {
1810        return Some(messages);
1811    }
1812
1813    None
1814}
1815
1816fn parse_session_report_messages(value: &Value) -> Option<Vec<(String, String)>> {
1817    let report = value.as_object()?;
1818    let transcript = report.get("transcript")?.as_array()?;
1819    let mut messages = Vec::new();
1820
1821    for entry in transcript {
1822        let Some(obj) = entry.as_object() else {
1823            continue;
1824        };
1825        let speaker = obj
1826            .get("speaker")
1827            .and_then(|v| v.as_str())
1828            .unwrap_or_default();
1829        let text = obj
1830            .get("text")
1831            .and_then(|v| v.as_str())
1832            .unwrap_or_default()
1833            .trim()
1834            .to_string();
1835        if text.is_empty() {
1836            continue;
1837        }
1838        match session_speaker_kind(speaker) {
1839            SessionSpeakerKind::User => messages.push(("user".to_string(), text)),
1840            SessionSpeakerKind::Assistant => messages.push(("assistant".to_string(), text)),
1841            SessionSpeakerKind::Ignore => {}
1842        }
1843    }
1844
1845    (!messages.is_empty()).then_some(messages)
1846}
1847
1848fn parse_simple_role_messages(value: &Value) -> Option<Vec<(String, String)>> {
1849    if let Some(array) = value.as_array() {
1850        let messages = collect_role_messages(array);
1851        return (!messages.is_empty()).then_some(messages);
1852    }
1853
1854    let obj = value.as_object()?;
1855    if let Some(messages_value) = obj.get("messages").or_else(|| obj.get("chat_messages")) {
1856        let array = messages_value.as_array()?;
1857        let messages = collect_role_messages(array);
1858        return (!messages.is_empty()).then_some(messages);
1859    }
1860
1861    None
1862}
1863
1864fn collect_role_messages(items: &[Value]) -> Vec<(String, String)> {
1865    let mut messages = Vec::new();
1866    for item in items {
1867        let Some(obj) = item.as_object() else {
1868            continue;
1869        };
1870        let Some(role) = obj.get("role").and_then(|v| v.as_str()) else {
1871            continue;
1872        };
1873        let Some(text) = extract_text_content(item) else {
1874            continue;
1875        };
1876        match role {
1877            "user" | "human" => messages.push(("user".to_string(), text)),
1878            "assistant" | "ai" => messages.push(("assistant".to_string(), text)),
1879            _ => {}
1880        }
1881    }
1882    messages
1883}
1884
1885fn parse_chatgpt_mapping_messages(value: &Value) -> Option<Vec<(String, String)>> {
1886    let mapping = value.get("mapping")?.as_object()?;
1887    let mut current_id = mapping.iter().find_map(|(node_id, node)| {
1888        let obj = node.as_object()?;
1889        (obj.get("parent").is_some_and(|parent| parent.is_null())).then_some(node_id.clone())
1890    })?;
1891
1892    let mut messages = Vec::new();
1893    let mut visited = std::collections::HashSet::new();
1894
1895    while visited.insert(current_id.clone()) {
1896        let Some(node) = mapping.get(&current_id).and_then(|v| v.as_object()) else {
1897            break;
1898        };
1899
1900        if let Some(message) = node.get("message") {
1901            let role = message
1902                .get("author")
1903                .and_then(|author| author.get("role"))
1904                .and_then(|v| v.as_str())
1905                .unwrap_or("");
1906            if let Some(text) = extract_text_content(message) {
1907                match role {
1908                    "user" => messages.push(("user".to_string(), text)),
1909                    "assistant" => messages.push(("assistant".to_string(), text)),
1910                    _ => {}
1911                }
1912            }
1913        }
1914
1915        let Some(next_id) = node
1916            .get("children")
1917            .and_then(|children| children.as_array())
1918            .and_then(|children| children.first())
1919            .and_then(|child| child.as_str())
1920        else {
1921            break;
1922        };
1923        current_id = next_id.to_string();
1924    }
1925
1926    (!messages.is_empty()).then_some(messages)
1927}
1928
1929fn extract_text_content(value: &Value) -> Option<String> {
1930    if let Some(text) = value.as_str() {
1931        let trimmed = text.trim();
1932        return (!trimmed.is_empty()).then_some(trimmed.to_string());
1933    }
1934
1935    if let Some(array) = value.as_array() {
1936        let joined = array
1937            .iter()
1938            .filter_map(extract_text_content)
1939            .filter(|part| !part.is_empty())
1940            .collect::<Vec<_>>()
1941            .join("\n");
1942        return (!joined.is_empty()).then_some(joined);
1943    }
1944
1945    let obj = value.as_object()?;
1946
1947    if let Some(content) = obj.get("content") {
1948        if let Some(text) = extract_text_content(content) {
1949            return Some(text);
1950        }
1951    }
1952
1953    if let Some(text) = obj.get("text").and_then(|v| v.as_str()) {
1954        let trimmed = text.trim();
1955        if !trimmed.is_empty() {
1956            return Some(trimmed.to_string());
1957        }
1958    }
1959
1960    if let Some(parts) = obj.get("parts").and_then(|v| v.as_array()) {
1961        let joined = parts
1962            .iter()
1963            .filter_map(|part| part.as_str().map(|s| s.trim().to_string()))
1964            .filter(|part| !part.is_empty())
1965            .collect::<Vec<_>>()
1966            .join("\n");
1967        if !joined.is_empty() {
1968            return Some(joined);
1969        }
1970    }
1971
1972    None
1973}
1974
/// Flatten a path into a single slug.
///
/// Backslashes are treated as `/`; every character other than ASCII letters, digits,
/// `/`, `-` and `_` becomes `_`; leading and trailing separators are dropped; and each
/// remaining `/` is replaced by a double underscore.
fn slugify_import_path(path: &Path) -> String {
    let mut flattened = String::new();
    for ch in path.to_string_lossy().chars() {
        let mapped = match ch {
            '\\' | '/' => '/',
            c if c.is_ascii_alphanumeric() || c == '-' || c == '_' => c,
            _ => '_',
        };
        flattened.push(mapped);
    }

    flattened
        .trim_matches('/')
        .split('/')
        .collect::<Vec<_>>()
        .join("__")
}
1990
1991// ── Embedding API ─────────────────────────────────────────────────────────────
1992
1993/// Call LM Studio's `/v1/embeddings` endpoint synchronously.
1994///
1995/// Uses nomic-embed-text-v2 MoE. Nomic v2 requires task instruction prefixes:
1996/// - Chunks stored in the index use `"search_document: "` prefix
1997/// - Queries at search time use `"search_query: "` prefix
1998/// LM Studio matches loaded models by substring so the quant suffix doesn't matter.
1999///
2000/// Returns `None` if:
2001/// - No embedding model is loaded in LM Studio
2002/// - LM Studio is not running
2003/// - Any network or parse error occurs
2004///
2005/// Callers must tolerate `None` and fall back to BM25-only search.
2006fn embed_text_blocking(text: &str, base_url: &str) -> Option<Vec<f32>> {
2007    embed_text_with_prefix(text, "search_document", base_url)
2008}
2009
2010fn embed_query_blocking(text: &str, base_url: &str) -> Option<Vec<f32>> {
2011    embed_text_with_prefix(text, "search_query", base_url)
2012}
2013
2014fn embed_text_with_prefix(text: &str, task: &str, base_url: &str) -> Option<Vec<f32>> {
2015    // Nomic v2 task instruction prefix format: "<task>: <text>"
2016    let prefixed = format!("{}: {}", task, text);
2017    // Truncate to ~8000 chars to stay within typical embedding model limits.
2018    let input = if prefixed.len() > 8000 {
2019        &prefixed[..8000]
2020    } else {
2021        &prefixed
2022    };
2023
2024    let client = reqwest::blocking::Client::builder()
2025        .timeout(std::time::Duration::from_secs(10))
2026        .build()
2027        .ok()?;
2028
2029    let body = serde_json::json!({
2030        "model": "nomic-embed-text-v2",
2031        "input": input
2032    });
2033
2034    let url = format!("{}/v1/embeddings", base_url);
2035    let resp = client.post(&url).json(&body).send().ok()?;
2036
2037    if !resp.status().is_success() {
2038        return None;
2039    }
2040
2041    let json: serde_json::Value = resp.json().ok()?;
2042    let embedding = json["data"][0]["embedding"].as_array()?;
2043    let vec: Vec<f32> = embedding
2044        .iter()
2045        .filter_map(|v| v.as_f64().map(|f| f as f32))
2046        .collect();
2047
2048    if vec.is_empty() {
2049        None
2050    } else {
2051        Some(vec)
2052    }
2053}
2054
2055// ── Vector math ───────────────────────────────────────────────────────────────
2056
/// Cosine of the angle between two vectors.
///
/// Returns `0.0` when the slices are empty, differ in length, or either has zero
/// magnitude.
fn cosine_similarity(a: &[f32], b: &[f32]) -> f32 {
    if a.is_empty() || a.len() != b.len() {
        return 0.0;
    }

    let magnitude = |v: &[f32]| v.iter().map(|x| x * x).sum::<f32>().sqrt();
    let (mag_a, mag_b) = (magnitude(a), magnitude(b));
    if mag_a == 0.0 || mag_b == 0.0 {
        return 0.0;
    }

    let dot = a.iter().zip(b).map(|(x, y)| x * y).sum::<f32>();
    dot / (mag_a * mag_b)
}
2070
/// Serialise floats as consecutive little-endian 4-byte groups.
fn floats_to_blob(floats: &[f32]) -> Vec<u8> {
    let mut blob = Vec::with_capacity(floats.len() * std::mem::size_of::<f32>());
    for value in floats {
        blob.extend_from_slice(&value.to_le_bytes());
    }
    blob
}
2074
/// Decode a byte blob of little-endian 4-byte groups back into floats.
///
/// Trailing bytes that do not fill a whole 4-byte group are ignored.
fn blob_to_floats(blob: &[u8]) -> Vec<f32> {
    let mut floats = Vec::with_capacity(blob.len() / 4);
    for word in blob.chunks_exact(4) {
        let mut raw = [0u8; 4];
        raw.copy_from_slice(word);
        floats.push(f32::from_le_bytes(raw));
    }
    floats
}
2080
2081// ── Document extraction ───────────────────────────────────────────────────────
2082
/// Normalise text produced by a document extractor.
///
/// Converts `\r\n` and lone `\r` line endings to `\n`, then strips leading and trailing
/// whitespace and NUL characters. Returns `None` when nothing is left.
fn normalize_extracted_document_text(text: String) -> Option<String> {
    let unified = text.replace("\r\n", "\n").replace('\r', "\n");
    let core = unified.trim_matches(|c: char| c == '\0' || c.is_whitespace());
    (!core.is_empty()).then(|| core.to_string())
}
2095
2096fn extract_pdf_text_with_pdf_extract(path: &std::path::Path) -> Result<Option<String>, String> {
2097    let previous_hook = std::panic::take_hook();
2098    std::panic::set_hook(Box::new(|_| {}));
2099    let result = std::panic::catch_unwind(std::panic::AssertUnwindSafe(|| {
2100        pdf_extract::extract_text(path)
2101    }));
2102    std::panic::set_hook(previous_hook);
2103
2104    match result {
2105        Ok(Ok(text)) => Ok(normalize_extracted_document_text(text)),
2106        Ok(Err(e)) => Err(format!("pdf-extract failed: {}", e)),
2107        Err(payload) => {
2108            let panic_text = if let Some(msg) = payload.downcast_ref::<&str>() {
2109                (*msg).to_string()
2110            } else if let Some(msg) = payload.downcast_ref::<String>() {
2111                msg.clone()
2112            } else {
2113                "unknown parser panic".to_string()
2114            };
2115            Err(format!("pdf-extract panicked: {}", panic_text))
2116        }
2117    }
2118}
2119
2120fn extract_pdf_text_with_lopdf(path: &std::path::Path) -> Result<Option<String>, String> {
2121    let mut doc =
2122        lopdf::Document::load(path).map_err(|e| format!("lopdf could not open PDF: {}", e))?;
2123
2124    if doc.is_encrypted() {
2125        doc.decrypt("")
2126            .map_err(|e| format!("PDF is encrypted and could not be decrypted: {}", e))?;
2127    }
2128
2129    let page_numbers: Vec<u32> = doc.get_pages().keys().copied().collect();
2130    if page_numbers.is_empty() {
2131        return Ok(None);
2132    }
2133
2134    let mut extracted_pages = Vec::new();
2135    let mut page_errors = Vec::new();
2136
2137    for page_number in page_numbers {
2138        match doc.extract_text(&[page_number]) {
2139            Ok(text) => {
2140                if let Some(page_text) = normalize_extracted_document_text(text) {
2141                    extracted_pages.push(page_text);
2142                }
2143            }
2144            Err(e) => page_errors.push(format!("page {page_number}: {e}")),
2145        }
2146    }
2147
2148    if !extracted_pages.is_empty() {
2149        return Ok(Some(extracted_pages.join("\n\n")));
2150    }
2151
2152    if !page_errors.is_empty() {
2153        let sample_errors = page_errors
2154            .into_iter()
2155            .take(3)
2156            .collect::<Vec<_>>()
2157            .join("; ");
2158        return Err(format!(
2159            "lopdf could not extract usable page text ({sample_errors})"
2160        ));
2161    }
2162
2163    Ok(None)
2164}
2165
2166fn extract_pdf_text_inside_helper(path: &std::path::Path) -> Result<Option<String>, String> {
2167    let mut failures = Vec::new();
2168
2169    match extract_pdf_text_with_pdf_extract(path) {
2170        Ok(Some(text)) => return Ok(Some(text)),
2171        Ok(None) => failures.push("pdf-extract found no usable text".to_string()),
2172        Err(e) => failures.push(e),
2173    }
2174
2175    match extract_pdf_text_with_lopdf(path) {
2176        Ok(Some(text)) => return Ok(Some(text)),
2177        Ok(None) => failures.push("lopdf found no usable text".to_string()),
2178        Err(e) => failures.push(e),
2179    }
2180
2181    let detail = failures.into_iter().take(2).collect::<Vec<_>>().join("; ");
2182    Err(format!(
2183        "Could not extract text from PDF. Hematite keeps PDF parsing best-effort so it can stay a lightweight single-binary local coding harness. The file may be scanned/image-only, encrypted, or use unsupported font encoding. Try exporting it to text/markdown or attach page images instead. Detail: {}",
2184        detail
2185    ))
2186}
2187
/// Extract PDF text by re-running the current executable with `--pdf-extract-helper`.
///
/// The child gets no stdin; its stdout is taken as the extracted text and its stderr as
/// the failure message. Returns:
/// - `Ok(Some(text))`: the helper exited successfully with non-blank output;
/// - `Ok(None)`: the helper exited successfully but printed only whitespace;
/// - `Err(..)`: the executable could not be located or launched, the helper exited with
///   a failure status, or its output was not valid UTF-8.
fn extract_pdf_text(path: &std::path::Path) -> Result<Option<String>, String> {
    let helper_exe = match std::env::current_exe() {
        Ok(exe) => exe,
        Err(e) => {
            return Err(format!(
                "Could not locate Hematite executable for PDF helper: {}",
                e
            ))
        }
    };

    let mut helper = std::process::Command::new(helper_exe);
    helper
        .arg("--pdf-extract-helper")
        .arg(path)
        .stdin(std::process::Stdio::null())
        .stdout(std::process::Stdio::piped())
        .stderr(std::process::Stdio::piped());

    let finished = helper
        .output()
        .map_err(|e| format!("Could not launch PDF helper: {}", e))?;

    if !finished.status.success() {
        let complaint = String::from_utf8_lossy(&finished.stderr).trim().to_string();
        if complaint.is_empty() {
            return Err("PDF extraction failed.".to_string());
        }
        return Err(complaint);
    }

    match String::from_utf8(finished.stdout) {
        Ok(text) if text.trim().is_empty() => Ok(None),
        Ok(text) => Ok(Some(text)),
        Err(e) => Err(format!("PDF helper returned non-UTF8 text: {}", e)),
    }
}
2217
2218pub fn run_pdf_extract_helper(path: &std::path::Path) -> i32 {
2219    match extract_pdf_text_inside_helper(path) {
2220        Ok(Some(text)) => {
2221            use std::io::Write;
2222            let mut stdout = std::io::stdout();
2223            if stdout.write_all(text.as_bytes()).is_ok() {
2224                0
2225            } else {
2226                let _ = writeln!(
2227                    std::io::stderr(),
2228                    "PDF helper could not write extracted text."
2229                );
2230                1
2231            }
2232        }
2233        Ok(None) => {
2234            eprintln!(
2235                "Could not extract text from PDF. Hematite keeps PDF parsing best-effort so it can stay a lightweight single-binary local coding harness. The file appears to contain no usable embedded text. Try exporting it to text/markdown or attach page images instead."
2236            );
2237            1
2238        }
2239        Err(e) => {
2240            eprintln!("{}", e);
2241            1
2242        }
2243    }
2244}
2245
2246/// Extract text from any supported document type (PDF, markdown, plain text).
2247/// Used by /attach for one-shot context injection.
2248pub fn extract_document_text(path: &std::path::Path) -> Result<String, String> {
2249    let ext = path
2250        .extension()
2251        .and_then(|e| e.to_str())
2252        .unwrap_or("")
2253        .to_lowercase();
2254    match ext.as_str() {
2255        "pdf" => {
2256            let text = extract_pdf_text(path)?.ok_or_else(|| {
2257                "PDF contains no extractable text — it may be scanned/image-only. \
2258                     Try attaching page screenshots with /image instead."
2259                    .to_string()
2260            })?;
2261            pdf_quality_check(text)
2262        }
2263        _ => std::fs::read_to_string(path).map_err(|e| format!("Could not read file: {e}")),
2264    }
2265}
2266
/// Detect unusable PDF extraction output — common with academic publisher PDFs that use
/// custom embedded fonts with non-standard glyph mappings.
///
/// Two checks are applied to the trimmed text:
/// 1. it must contain at least 150 characters (counted as Unicode scalar values, not
///    bytes, so the threshold and the reported figure are right for non-ASCII text);
/// 2. at least 4% of its non-newline characters must be spaces — lower ratios mean
///    words were merged because glyphs are not mapping to spaces.
///
/// Returns the original (untrimmed) text if it looks usable, or an informative error.
fn pdf_quality_check(text: String) -> Result<String, String> {
    // Too little content to be useful below this many characters.
    const MIN_CHARS: usize = 150;
    // Normal prose is ~15–20% spaces.
    const MIN_SPACE_RATIO: f32 = 0.04;

    // Single pass over the trimmed text gathers every count needed below.
    let mut total_chars = 0usize;
    let mut non_newline = 0usize;
    let mut spaces = 0usize;
    for c in text.trim().chars() {
        total_chars += 1;
        match c {
            '\n' | '\r' => {}
            ' ' => {
                spaces += 1;
                non_newline += 1;
            }
            _ => non_newline += 1,
        }
    }

    if total_chars < MIN_CHARS {
        return Err(format!(
            "PDF extracted only {} characters — likely a scanned or image-only PDF, \
             or uses unsupported custom fonts. Try attaching page screenshots with /image instead.",
            total_chars
        ));
    }

    let space_ratio = if non_newline > 0 {
        spaces as f32 / non_newline as f32
    } else {
        0.0
    };

    if space_ratio < MIN_SPACE_RATIO {
        return Err(
            "PDF text extraction produced garbled output — words are merged with no spaces. \
             This usually means the PDF uses custom embedded fonts (common with academic publishers \
             like EBSCO, Elsevier, Springer). \
             Try a PDF exported from Word, Google Docs, or LaTeX, \
             or attach page screenshots with /image instead."
                .to_string(),
        );
    }

    Ok(text)
}
2305
2306// ── Chunking strategies ───────────────────────────────────────────────────────
2307
2308/// Dispatch to the correct chunking strategy based on file extension.
2309fn chunk_by_symbols(ext: &str, text: &str) -> Vec<String> {
2310    if ext == "rs" {
2311        chunk_rust_symbols(text)
2312    } else {
2313        chunk_paragraphs(text)
2314    }
2315}
2316
2317/// Chunk Rust source at top-level item boundaries.
2318///
2319/// Detects lines at column 0 that start a Rust declaration keyword, flushes
2320/// the accumulated buffer, then moves any trailing doc-comments / attributes
2321/// forward so they stay with the item they annotate.
2322///
2323/// Items larger than 3000 chars (e.g. large impl blocks) are further split
2324/// by sliding window so no single chunk blows the retrieval budget.
2325fn chunk_rust_symbols(text: &str) -> Vec<String> {
2326    const ITEM_STARTS: &[&str] = &[
2327        "pub fn ",
2328        "pub async fn ",
2329        "pub unsafe fn ",
2330        "async fn ",
2331        "unsafe fn ",
2332        "fn ",
2333        "pub impl",
2334        "impl ",
2335        "pub struct ",
2336        "struct ",
2337        "pub enum ",
2338        "enum ",
2339        "pub trait ",
2340        "trait ",
2341        "pub mod ",
2342        "mod ",
2343        "pub type ",
2344        "type ",
2345        "pub const ",
2346        "const ",
2347        "pub static ",
2348        "static ",
2349    ];
2350
2351    let lines: Vec<&str> = text.lines().collect();
2352    let mut chunks: Vec<String> = Vec::new();
2353    let mut current: Vec<&str> = Vec::new();
2354
2355    for &line in &lines {
2356        let top_level = !line.starts_with(' ') && !line.starts_with('\t');
2357        let is_item = top_level && ITEM_STARTS.iter().any(|s| line.starts_with(s));
2358
2359        if is_item && !current.is_empty() {
2360            // Scan backward to find where trailing doc-comments / attributes start —
2361            // move them to the new chunk so they land with their item.
2362            let mut split = current.len();
2363            while split > 0 {
2364                let prev = current[split - 1].trim();
2365                if prev.starts_with("///")
2366                    || prev.starts_with("//!")
2367                    || prev.starts_with("#[")
2368                    || prev.is_empty()
2369                {
2370                    split -= 1;
2371                } else {
2372                    break;
2373                }
2374            }
2375            let body = current[..split].join("\n");
2376            if !body.trim().is_empty() {
2377                chunks.push(body);
2378            }
2379            current = current[split..].to_vec();
2380        }
2381        current.push(line);
2382    }
2383    if !current.is_empty() {
2384        let body = current.join("\n");
2385        if !body.trim().is_empty() {
2386            chunks.push(body);
2387        }
2388    }
2389
2390    // Subdivide any oversized blocks (e.g. long impl blocks with many methods).
2391    let mut result = Vec::new();
2392    for chunk in chunks {
2393        if chunk.len() > 3000 {
2394            result.extend(sliding_window_chunks(&chunk, 2000, 200));
2395        } else {
2396            result.push(chunk);
2397        }
2398    }
2399    result
2400}
2401
2402/// Chunk non-Rust text at paragraph boundaries (double newline).
2403fn chunk_paragraphs(text: &str) -> Vec<String> {
2404    let mut result: Vec<String> = Vec::new();
2405    let mut current = String::new();
2406
2407    for para in text.split("\n\n") {
2408        if current.len() + para.len() + 2 > 2000 {
2409            if !current.trim().is_empty() {
2410                result.push(current.clone());
2411            }
2412            current = para.to_string();
2413        } else {
2414            if !current.is_empty() {
2415                current.push_str("\n\n");
2416            }
2417            current.push_str(para);
2418        }
2419    }
2420    if !current.trim().is_empty() {
2421        result.push(current);
2422    }
2423
2424    let mut final_result = Vec::new();
2425    for chunk in result {
2426        if chunk.len() > 2000 {
2427            final_result.extend(sliding_window_chunks(&chunk, 2000, 200));
2428        } else {
2429            final_result.push(chunk);
2430        }
2431    }
2432    final_result
2433}
2434
/// Classic sliding-window fallback for oversized blocks.
///
/// Splits `text` into windows of `chunk_size` characters (Unicode scalar values), each
/// window starting `chunk_size - overlap` characters after the previous one, so
/// consecutive chunks share `overlap` characters. The final window may be shorter.
///
/// Degenerate parameters are handled instead of hanging or panicking:
/// - if `overlap >= chunk_size` the window still advances by one character per step;
/// - if `chunk_size` is 0 the whole text is returned as a single chunk.
///
/// Empty input yields an empty vector.
fn sliding_window_chunks(text: &str, chunk_size: usize, overlap: usize) -> Vec<String> {
    if text.is_empty() {
        return Vec::new();
    }
    if chunk_size == 0 {
        // A zero-width window cannot make progress; keep the content in one piece.
        return vec![text.to_string()];
    }

    // Always advance by at least one character so the loop terminates.
    let step = chunk_size.saturating_sub(overlap).max(1);

    // Byte offset of every character boundary, plus the end of the string, so windows
    // measured in characters can be sliced directly out of `text`.
    let bounds: Vec<usize> = text
        .char_indices()
        .map(|(at, _)| at)
        .chain(std::iter::once(text.len()))
        .collect();
    let char_count = bounds.len() - 1;

    let mut result = Vec::new();
    let mut start = 0usize;
    while start < char_count {
        let end = start.saturating_add(chunk_size).min(char_count);
        result.push(text[bounds[start]..bounds[end]].to_string());
        if end == char_count {
            break;
        }
        start += step;
    }
    result
}