hematite/memory/
vein.rs

1use rusqlite::{params, Connection};
2use serde::Deserialize;
3use serde_json::Value;
4use std::collections::{HashMap, HashSet};
5use std::path::Path;
6
/// "The Vein" — local RAG memory engine backed by SQLite FTS5 + semantic embeddings.
///
/// Two retrieval modes, used together:
///
/// **BM25 (always available)**
/// Full-text search via SQLite FTS5 with Porter-stemming. Fast, zero extra GPU cost,
/// works as the fallback when the embedding model isn't loaded.
///
/// **Semantic (when LM Studio has an embedding model loaded)**
/// Calls `/v1/embeddings` (nomic-embed-text-v1.5 or similar) to produce 768-dim float
/// vectors for each chunk. At search time the query is embedded and cosine similarity
/// selects the most conceptually relevant chunks — even when no keywords match.
///
/// Hybrid search runs the BM25 query and then the semantic query, deduplicates by
/// path, and returns the top-k results ranked by combined score. Semantic results
/// score higher when the embedding model is available; BM25 fills the gap when it isn't.
///
/// Indexing is incremental: a file is re-indexed only when its mtime is newer than the
/// one recorded in `chunks_meta`. Embedding vectors are stored in a separate
/// `chunks_vec` SQLite table so they survive re-runs without hitting the embedding
/// API again.
pub struct Vein {
    /// Shared SQLite connection; every access in this type goes through the mutex.
    db: std::sync::Arc<std::sync::Mutex<Connection>>,
    /// Base URL of the LLM provider, used for the embeddings endpoint.
    base_url: String,
    /// Id of the embedding model to use. `None` (the initial value) makes
    /// `embed_and_store_chunks` and `search_semantic` return without doing anything.
    embed_model: std::sync::Arc<std::sync::RwLock<Option<String>>>,
}
33
// SAFETY: the only non-trivially-shareable field is the `Connection`, and every
// access to it goes through the `Mutex` in `Vein::db`, so it is never used by two
// threads at the same time.
// NOTE(review): rusqlite documents `Connection` as `Send` (but not `Sync`). If that
// holds for the version pinned by this crate, `Arc<Mutex<Connection>>` is already
// `Send + Sync` and both impls below are redundant — confirm, then delete them so
// the compiler checks this instead of an `unsafe` assertion.
unsafe impl Send for Vein {}
unsafe impl Sync for Vein {}
38
/// One retrieved chunk, as produced by the BM25, semantic, and hybrid search methods.
#[derive(Debug, Clone)]
pub struct SearchResult {
    /// Indexed path of the file the chunk belongs to (the `path` column of `chunks_fts`).
    pub path: String,
    /// Text of the chunk itself.
    pub content: String,
    /// Combined relevance score (higher = more relevant).
    pub score: f32,
    /// Subsystem room derived from the file path (e.g. "agent", "ui", "tools").
    pub room: String,
    /// Last-modified timestamp from chunks_meta (unix seconds).
    pub last_modified: i64,
    /// Semantic memory type tagged at index time: "decision", "problem",
    /// "milestone", "preference", or "" for unclassified/source/doc chunks.
    pub memory_type: String,
}
53
/// A file together with its edit-heat counter.
/// NOTE(review): presumably assembled from the `file_heat` and `chunks_meta`
/// tables; the query that builds it is outside this view — confirm.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct VeinHotFile {
    /// Indexed path of the file.
    pub path: String,
    /// Heat counter for the file (presumably `file_heat.heat` — confirm).
    pub heat: i64,
    /// Timestamp associated with the file (presumably unix seconds — confirm source column).
    pub last_modified: i64,
    /// Subsystem room of the file.
    pub room: String,
}
61
/// Summary of what the vein currently holds, for inspection/diagnostics.
/// NOTE(review): the code that fills this struct is outside this view; the field
/// notes below are read off the names and types only — confirm against the producer.
#[derive(Debug, Clone)]
pub struct VeinInspectionSnapshot {
    /// Count of indexed source files.
    pub indexed_source_files: usize,
    /// Count of indexed reference documents.
    pub indexed_docs: usize,
    /// Count of indexed session exchanges.
    pub indexed_session_exchanges: usize,
    /// Count of source/doc chunks that have an embedding.
    pub embedded_source_doc_chunks: usize,
    /// Whether at least one embedding is stored.
    pub has_any_embeddings: bool,
    /// Currently hottest room, if one could be determined.
    pub active_room: Option<String>,
    /// Hottest files, see [`VeinHotFile`].
    pub hot_files: Vec<VeinHotFile>,
    /// NOTE(review): meaning of "L1" is not visible from this chunk — document at the producer.
    pub l1_ready: bool,
}
73
/// Hints extracted from a search query. Built by `QuerySignals::from_query` and
/// passed to `reranked_score` for every candidate during hybrid search.
/// NOTE(review): the extraction and scoring code is outside this view; the
/// per-field notes are read off the names — confirm.
#[derive(Debug, Default)]
struct QuerySignals {
    /// Phrases from the query that should match verbatim.
    exact_phrases: Vec<String>,
    /// Distinctive single terms from the query.
    standout_terms: Vec<String>,
    /// Set when the query appears to ask about past sessions/memory.
    historical_memory_hint: bool,
    /// Point in time the query refers to, if any.
    temporal_reference: Option<TemporalReference>,
    /// Memory type the query is asking about — boosts matching chunks.
    /// e.g. "what did we decide" → "decision", "what was the bug" → "problem"
    query_memory_type: Option<&'static str>,
}
84
/// A point in time referenced by a query, with a tolerance window.
/// NOTE(review): units are presumably unix seconds, matching `last_modified`
/// elsewhere in this file — confirm where the value is constructed.
#[derive(Debug, Clone, Copy)]
struct TemporalReference {
    /// Timestamp the query points at.
    target_ts: i64,
    /// Width of the window around `target_ts`, in seconds.
    window_secs: i64,
}
90
/// Deserialized form of a session report; only the fields read by the vein are
/// declared. Both fields use `#[serde(default)]`, so a report missing either one
/// still parses (empty string / empty transcript).
/// NOTE(review): presumably parsed from the `.json` files under
/// `.hematite/reports/` — the parsing code is outside this view; confirm.
#[derive(Debug, Deserialize)]
struct SessionReport {
    /// Session start marker as written in the report (format not visible here).
    #[serde(default)]
    session_start: String,
    /// Ordered transcript entries of the session.
    #[serde(default)]
    transcript: Vec<SessionTranscriptEntry>,
}
98
/// One transcript entry inside a [`SessionReport`]. Missing fields deserialize to
/// empty strings because of `#[serde(default)]`.
#[derive(Debug, Deserialize)]
struct SessionTranscriptEntry {
    /// Label of who produced the entry, as written in the report.
    #[serde(default)]
    speaker: String,
    /// Text of the entry.
    #[serde(default)]
    text: String,
}
106
/// A unit of session memory ready to be indexed.
/// NOTE(review): presumably one user/assistant exchange pair extracted from a
/// report or import; the code that builds it is outside this view — confirm.
#[derive(Debug)]
struct SessionExchange {
    /// Path under which the exchange is indexed.
    path: String,
    /// Timestamp stored alongside the exchange (presumably unix seconds — confirm).
    last_modified: i64,
    /// Text that gets indexed.
    content: String,
}
113
/// Classification of a transcript speaker label.
/// NOTE(review): the mapping from `SessionTranscriptEntry::speaker` strings to these
/// variants is outside this view — confirm which labels end up as `Ignore`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum SessionSpeakerKind {
    /// Entry written by the user.
    User,
    /// Entry written by the assistant.
    Assistant,
    /// Entry from any other speaker; presumably skipped when building exchanges.
    Ignore,
}
120
/// Derive a subsystem room label from a file path.
///
/// The path is lowercased and `\` is normalised to `/` before matching. Every rule
/// below that matches nominates a room with a fixed priority and the highest
/// priority wins (for example a `*_test.rs` file inside `tools/` lands in "tests",
/// not "tools").
///
/// When no rule matches, the first directory component is used if it is non-empty
/// and contains no `.`; otherwise the room is "root". A bare file name with no
/// directory part therefore always falls back to "root".
pub fn detect_room(path: &str) -> String {
    let lower = path.to_lowercase().replace('\\', "/");
    let segments: Vec<&str> = lower.split('/').collect();
    // `split` always yields at least one item, so the fallbacks below are never hit.
    let filename = segments.last().copied().unwrap_or("");
    let dirs = &segments[..segments.len().saturating_sub(1)];
    // Only the text after the final dot is an extension. A name without any dot has
    // no extension, so a file literally called "md" or "iss" is not misclassified.
    let ext = filename.rsplit_once('.').map_or("", |(_, e)| e);

    // True when `segment` is a directory component of the path (any segment except
    // the file name), or when the whole path is exactly `segment`.
    let is_component =
        |segment: &str| lower == segment || dirs.iter().any(|dir| *dir == segment);

    // (room, priority, rule matched). Priorities are all distinct, so the winner
    // does not depend on the order of this table.
    let candidates = [
        (
            "session",
            100,
            lower.starts_with("session/")
                || lower.starts_with(".hematite/reports/")
                || lower.starts_with(".hematite/imports/")
                || is_component("reports")
                || is_component("imports"),
        ),
        (
            "tests",
            85,
            is_component("tests")
                || filename.contains("diagnostic")
                || filename.ends_with("_test.rs")
                || filename.ends_with(".test.ts"),
        ),
        (
            "automation",
            84,
            lower.starts_with(".github/workflows/")
                || is_component("workflows")
                || filename == ".pre-commit-config.yaml"
                || filename == ".pre-commit-config.yml"
                || filename.contains("hook"),
        ),
        (
            "release",
            82,
            lower.starts_with("installer/")
                || lower.starts_with("dist/")
                || lower.starts_with("scripts/package-")
                || filename.contains("release")
                || filename.contains("bump-version")
                || ext == "iss",
        ),
        (
            "docs",
            80,
            lower.starts_with(".hematite/docs/")
                || is_component("docs")
                || matches!(filename, "readme.md" | "claude.md" | ".hematite.md")
                || matches!(ext, "md" | "markdown" | "pdf" | "rst"),
        ),
        (
            "config",
            76,
            matches!(
                filename,
                "cargo.toml"
                    | "cargo.lock"
                    | "package.json"
                    | "pnpm-lock.yaml"
                    | "yarn.lock"
                    | "bun.lock"
                    | "bun.lockb"
                    | "pyproject.toml"
                    | "setup.py"
                    | "go.mod"
                    | "pom.xml"
                    | "build.gradle"
                    | "build.gradle.kts"
                    | "cmakelists.txt"
                    | ".gitignore"
                    | "settings.json"
                    | "mcp_servers.json"
            ) || filename.ends_with(".sln")
                || filename.ends_with(".csproj")
                || filename.contains("config"),
        ),
        (
            "memory",
            72,
            is_component("memory") || matches!(filename, "vein.rs" | "deep_reflect.rs"),
        ),
        (
            "ui",
            70,
            is_component("ui")
                || matches!(
                    filename,
                    "tui.rs" | "voice.rs" | "hatch.rs" | "gpu_monitor.rs"
                ),
        ),
        (
            "tools",
            68,
            is_component("tools")
                || matches!(
                    filename,
                    "verify_build.rs"
                        | "host_inspect.rs"
                        | "shell.rs"
                        | "code_sandbox.rs"
                        | "project_map.rs"
                        | "runtime_trace.rs"
                ),
        ),
        (
            "integration",
            67,
            filename.contains("mcp")
                || filename.contains("lsp")
                || lower.contains("/mcp/")
                || lower.contains("/lsp/"),
        ),
        (
            "runtime",
            66,
            matches!(filename, "main.rs" | "runtime.rs" | "inference.rs")
                || filename.contains("startup")
                || filename.contains("runtime"),
        ),
        ("agent", 60, is_component("agent")),
        ("libs", 58, lower.starts_with("libs/") || is_component("libs")),
        (
            "scripts",
            55,
            lower.starts_with("scripts/") || is_component("scripts"),
        ),
    ];

    let winner = candidates
        .iter()
        .filter(|candidate| candidate.2)
        .max_by_key(|candidate| candidate.1);
    if let Some(&(room, _, _)) = winner {
        return room.to_string();
    }

    // Fall back to the first directory component. `split_once` only succeeds when
    // the path has a directory part, so a bare file name is never used as a room.
    match lower.split_once('/') {
        Some((first, _)) if !first.is_empty() && !first.contains('.') => first.to_string(),
        _ => "root".to_string(),
    }
}
281
/// Classify session memory text into a semantic type using cheap marker matching
/// (plain substring / whole-word checks on the lowercased text — no regex engine).
/// Returns one of: "decision", "problem", "milestone", "preference", or "" (unclassified).
///
/// Categories are tried in that order and the first one with any marker wins, so a
/// text containing both a decision marker and a problem marker is a "decision".
///
/// Applied only to session/import chunks at index time. Source and doc chunks always get "".
/// Used by reranking to boost chunks whose type matches the query's implied intent.
pub fn detect_memory_type(text: &str) -> &'static str {
    // (label, substring markers, whole-word markers), in priority order.
    // Whole-word markers are for short tokens that would otherwise fire inside
    // unrelated words: "oom" as a substring matches "room", "zoom", "bloom", ...
    const MARKERS: [(&str, &[&str], &[&str]); 4] = [
        (
            // Architectural choices, agreed approaches, "let's use X".
            "decision",
            &[
                "let's use ",
                "we'll use ",
                "decided to ",
                "going with ",
                "we agreed ",
                "the plan is",
                "we're going to",
                "switching to",
                "we chose",
                "final decision",
                "we settled on",
                "agreed on",
                "we decided",
            ],
            &[],
        ),
        (
            // Bugs, errors, failures, blockers.
            "problem",
            &[
                "bug fixed",
                "bug was",
                "the issue was",
                "root cause",
                "error was",
                "turned out to be",
                "the fix was",
                "was caused by",
                "broken because",
                "fixed by",
                "the problem was",
                "found the bug",
                "port conflict",
                "crash",
                "panicked",
                "segfault",
                "out of memory",
            ],
            &["oom"],
        ),
        (
            // Shipped, completed, working.
            "milestone",
            &[
                "now working",
                "successfully",
                "shipped",
                "deployed",
                "it works",
                "tests pass",
                "all green",
                "breakthrough",
                "finally got",
                "got it working",
                "completed",
                "finished",
                "done with",
                "landed",
            ],
            &[],
        ),
        (
            // Personal/operator preferences for style or workflow.
            "preference",
            &[
                "i prefer",
                "i like",
                "i don't like",
                "i want",
                "always use",
                "never use",
                "i usually",
                "my preference",
                "keep it",
                "avoid using",
            ],
            &[],
        ),
    ];

    let lower = text.to_lowercase();
    // A "word" is a maximal run of alphanumeric characters.
    let has_whole_word = |word: &str| {
        lower
            .split(|c: char| !c.is_alphanumeric())
            .any(|token| token == word)
    };

    for &(label, substrings, whole_words) in MARKERS.iter() {
        let matched = substrings.iter().any(|marker| lower.contains(*marker))
            || whole_words.iter().any(|word| has_whole_word(word));
        if matched {
            return label;
        }
    }

    ""
}
383
384impl Vein {
    // Caps used by the session-report / import indexing code.
    // NOTE(review): the code that applies these limits is outside this view; the
    // descriptions are read off the names — confirm against the call sites.
    /// Presumably the number of most recent session reports that get indexed.
    const SESSION_REPORT_LIMIT: usize = 5;
    /// Presumably the maximum number of transcript turns taken from one session.
    const SESSION_TURN_LIMIT: usize = 50;
    /// Presumably the maximum number of import files that get indexed.
    const IMPORT_FILE_LIMIT: usize = 12;
    /// 10 MiB; presumably the largest import file that is still read.
    const IMPORT_MAX_BYTES: u64 = 10 * 1024 * 1024;
389
390    pub fn new<P: AsRef<Path>>(
391        db_path: P,
392        base_url: String,
393    ) -> Result<Self, Box<dyn std::error::Error>> {
394        let db = Connection::open(db_path)?;
395
396        // WAL mode for better concurrent read performance.
397        db.execute_batch("PRAGMA journal_mode=WAL; PRAGMA synchronous=NORMAL;")?;
398
399        // chunks_meta: tracks last-modified time per path for incremental indexing.
400        // chunks_fts:  BM25 full-text index of all code chunks.
401        // chunks_vec:  semantic embedding vectors, keyed by (path, chunk_idx).
402        db.execute_batch(
403            "CREATE TABLE IF NOT EXISTS chunks_meta (
404                path TEXT PRIMARY KEY,
405                last_modified INTEGER NOT NULL,
406                room TEXT NOT NULL DEFAULT 'root'
407            );
408            CREATE VIRTUAL TABLE IF NOT EXISTS chunks_fts USING fts5(
409                path UNINDEXED,
410                content,
411                tokenize='porter ascii'
412            );
413            CREATE TABLE IF NOT EXISTS chunks_vec (
414                path TEXT NOT NULL,
415                chunk_idx INTEGER NOT NULL,
416                embedding BLOB NOT NULL,
417                PRIMARY KEY (path, chunk_idx)
418            );
419            CREATE TABLE IF NOT EXISTS file_heat (
420                path TEXT PRIMARY KEY,
421                heat INTEGER NOT NULL DEFAULT 0,
422                last_edit INTEGER NOT NULL DEFAULT 0
423            );",
424        )?;
425
426        // Schema migrations — safe to run on every open (IF NOT EXISTS / ignored if col exists).
427        let _ = db
428            .execute_batch("ALTER TABLE chunks_meta ADD COLUMN room TEXT NOT NULL DEFAULT 'root';");
429        let _ = db.execute_batch(
430            "ALTER TABLE file_heat ADD COLUMN last_edit INTEGER NOT NULL DEFAULT 0;",
431        );
432        let _ = db.execute_batch(
433            "ALTER TABLE chunks_meta ADD COLUMN memory_type TEXT NOT NULL DEFAULT '';",
434        );
435
436        Ok(Self {
437            db: std::sync::Arc::new(std::sync::Mutex::new(db)),
438            base_url,
439            embed_model: std::sync::Arc::new(std::sync::RwLock::new(None)),
440        })
441    }
442
443    pub fn set_embed_model(&self, model_id: Option<String>) {
444        if let Ok(mut guard) = self.embed_model.write() {
445            *guard = model_id;
446        }
447    }
448
449    fn current_embed_model(&self) -> Option<String> {
450        self.embed_model.read().ok().and_then(|guard| guard.clone())
451    }
452
453    // ── Indexing ──────────────────────────────────────────────────────────────
454
455    /// Index a single file for BM25 search. Skip if mtime hasn't changed.
456    /// Returns the chunks that were written (empty if file was unchanged).
457    pub fn index_document(
458        &mut self,
459        path: &str,
460        last_modified: i64,
461        full_text: &str,
462    ) -> Result<Vec<String>, Box<dyn std::error::Error>> {
463        let room = detect_room(path);
464        let ext = std::path::Path::new(path)
465            .extension()
466            .and_then(|e| e.to_str())
467            .unwrap_or("");
468        let chunks = chunk_by_symbols(ext, full_text);
469        // Tag session memory with semantic type; source/doc chunks leave it empty.
470        let memory_type = if room == "session" {
471            detect_memory_type(full_text)
472        } else {
473            ""
474        };
475        self.index_chunks_with_room_and_type(path, last_modified, &room, memory_type, &chunks)
476    }
477
478    fn index_chunks_with_room_and_type(
479        &mut self,
480        path: &str,
481        last_modified: i64,
482        room: &str,
483        memory_type: &str,
484        chunks: &[String],
485    ) -> Result<Vec<String>, Box<dyn std::error::Error>> {
486        let db = self.db.lock().unwrap();
487        let existing: Option<i64> = db
488            .query_row(
489                "SELECT last_modified FROM chunks_meta WHERE path = ?1",
490                params![path],
491                |r| r.get(0),
492            )
493            .ok();
494
495        if let Some(ts) = existing {
496            if ts >= last_modified {
497                return Ok(Vec::new()); // unchanged — skip
498            }
499        }
500
501        // Evict stale BM25 chunks, stale embedding vectors, then update metadata.
502        db.execute("DELETE FROM chunks_fts WHERE path = ?1", params![path])?;
503        db.execute("DELETE FROM chunks_vec WHERE path = ?1", params![path])?;
504        db.execute(
505            "INSERT OR REPLACE INTO chunks_meta (path, last_modified, room, memory_type) VALUES (?1, ?2, ?3, ?4)",
506            params![path, last_modified, room, memory_type],
507        )?;
508
509        drop(db);
510
511        let mut db = self.db.lock().unwrap();
512        let tx = db.transaction()?;
513        {
514            let mut stmt = tx.prepare("INSERT INTO chunks_fts (path, content) VALUES (?1, ?2)")?;
515            for chunk in chunks {
516                stmt.execute(params![path, chunk.as_str()])?;
517            }
518        }
519        tx.commit()?;
520
521        Ok(chunks.to_vec())
522    }
523
524    /// Embed a set of chunks for one file and store the vectors.
525    /// Called after `index_document` returns new chunks.
526    /// Silently skips if the embedding model is unavailable.
527    pub fn embed_and_store_chunks(&self, path: &str, chunks: &[String]) {
528        let Some(embed_model) = self.current_embed_model() else {
529            return;
530        };
531        for (idx, chunk) in chunks.iter().enumerate() {
532            if let Some(vec) = embed_text_blocking(chunk, &self.base_url, &embed_model) {
533                let blob = floats_to_blob(&vec);
534                let db = self.db.lock().unwrap();
535                let _ = db.execute(
536                    "INSERT OR REPLACE INTO chunks_vec (path, chunk_idx, embedding) VALUES (?1, ?2, ?3)",
537                    params![path, idx as i64, blob],
538                );
539            }
540        }
541    }
542
543    // ── Search ────────────────────────────────────────────────────────────────
544
545    /// BM25-ranked full-text search via FTS5 MATCH.
546    pub fn search_bm25(
547        &self,
548        query: &str,
549        limit: usize,
550    ) -> Result<Vec<SearchResult>, Box<dyn std::error::Error>> {
551        // Strip common English stopwords so FTS5 MATCH gets meaningful tokens only.
552        // FTS5 uses implicit AND by default — passing stopwords like "how", "does",
553        // "the" causes zero results because source code never contains those phrases.
554        const STOPWORDS: &[&str] = &[
555            "how", "does", "do", "did", "what", "where", "when", "why", "which", "who", "is",
556            "are", "was", "were", "be", "been", "being", "have", "has", "had", "a", "an", "the",
557            "and", "or", "but", "in", "on", "at", "to", "for", "of", "with", "by", "from", "get",
558            "gets", "got", "work", "works", "make", "makes", "use", "uses", "into", "that", "this",
559            "it", "its",
560        ];
561
562        let safe_query: String = query
563            .chars()
564            .map(|c| {
565                if c.is_alphanumeric() || c == ' ' || c == '_' {
566                    c
567                } else {
568                    ' '
569                }
570            })
571            .collect();
572
573        // Build an OR query from non-stopword tokens so any relevant term matches.
574        let fts_query = safe_query
575            .split_whitespace()
576            .filter(|w| w.len() >= 3 && !STOPWORDS.contains(&w.to_lowercase().as_str()))
577            .collect::<Vec<_>>()
578            .join(" OR ");
579
580        if fts_query.is_empty() {
581            return Ok(Vec::new());
582        }
583
584        let db = self.db.lock().unwrap();
585        let mut stmt = db.prepare(
586            "SELECT chunks_fts.path, chunks_fts.content, rank, cm.last_modified, cm.room, cm.memory_type
587             FROM chunks_fts
588             JOIN chunks_meta cm ON cm.path = chunks_fts.path
589             WHERE chunks_fts MATCH ?1
590             ORDER BY rank
591             LIMIT ?2",
592        )?;
593
594        let results: Vec<SearchResult> = stmt
595            .query_map(params![fts_query, limit as i64], |row| {
596                Ok(SearchResult {
597                    path: row.get(0)?,
598                    content: row.get(1)?,
599                    score: -(row.get::<_, f64>(2).unwrap_or(0.0) as f32),
600                    last_modified: row.get(3)?,
601                    room: row.get(4)?,
602                    memory_type: row.get::<_, String>(5).unwrap_or_default(),
603                })
604            })?
605            .filter_map(|r| r.ok())
606            .collect();
607
608        Ok(results)
609    }
610
611    /// Semantic search: embed the query, cosine-similarity against all stored vectors.
612    /// Returns empty if the embedding model isn't loaded.
613    pub fn search_semantic(&self, query: &str, limit: usize) -> Vec<SearchResult> {
614        let Some(embed_model) = self.current_embed_model() else {
615            return Vec::new();
616        };
617        let query_vec = match embed_query_blocking(query, &self.base_url, &embed_model) {
618            Some(v) => v,
619            None => return Vec::new(),
620        };
621
622        // Load all stored embeddings.
623        let rows: Vec<(String, i64, Vec<u8>, i64, String, String)> = {
624            let db = self.db.lock().unwrap();
625            let mut stmt = match db.prepare(
626                "SELECT cv.path, cv.chunk_idx, cv.embedding, cm.last_modified, cm.room, cm.memory_type
627                 FROM chunks_vec cv
628                 JOIN chunks_meta cm ON cm.path = cv.path",
629            ) {
630                Ok(s) => s,
631                Err(_) => return Vec::new(),
632            };
633            stmt.query_map([], |row| {
634                Ok((
635                    row.get::<_, String>(0)?,
636                    row.get::<_, i64>(1)?,
637                    row.get::<_, Vec<u8>>(2)?,
638                    row.get::<_, i64>(3)?,
639                    row.get::<_, String>(4)?,
640                    row.get::<_, String>(5).unwrap_or_default(),
641                ))
642            })
643            .ok()
644            .map(|rows| rows.filter_map(|r| r.ok()).collect())
645            .unwrap_or_default()
646        };
647
648        if rows.is_empty() {
649            return Vec::new();
650        }
651
652        // Score each chunk.
653        let mut scored: Vec<(f32, String, i64, i64, String, String)> = rows
654            .into_iter()
655            .filter_map(|(path, idx, blob, last_modified, room, memory_type)| {
656                let vec = blob_to_floats(&blob);
657                let sim = cosine_similarity(&query_vec, &vec);
658                Some((sim, path, idx, last_modified, room, memory_type))
659            })
660            .collect();
661
662        scored.sort_by(|a, b| b.0.partial_cmp(&a.0).unwrap_or(std::cmp::Ordering::Equal));
663        scored.truncate(limit);
664
665        // Fetch the content for the top chunks.
666        let db = self.db.lock().unwrap();
667        scored
668            .into_iter()
669            .filter_map(|(score, path, idx, last_modified, room, memory_type)| {
670                let content: Option<String> = db
671                    .query_row(
672                        "SELECT content FROM chunks_fts WHERE path = ?1 LIMIT 1 OFFSET ?2",
673                        params![path, idx],
674                        |r| r.get(0),
675                    )
676                    .ok();
677                content.map(|c| SearchResult {
678                    path,
679                    content: c,
680                    score,
681                    room,
682                    last_modified,
683                    memory_type,
684                })
685            })
686            .collect()
687    }
688
689    /// Hybrid search: BM25 + semantic, deduplicated and re-ranked.
690    ///
691    /// Semantic results are preferred (they score higher) when the embedding model
692    /// is available. BM25 fills in or takes over when it isn't.
693    /// Results from the active room (hottest subsystem by edit count) get a
694    /// small boost so the model gravitates toward what's currently being worked on.
695    pub fn search_context(
696        &self,
697        query: &str,
698        limit: usize,
699    ) -> Result<Vec<SearchResult>, Box<dyn std::error::Error>> {
700        let candidate_limit = (limit.max(1) * 4).max(12);
701        let bm25 = self.search_bm25(query, candidate_limit).unwrap_or_default();
702        let semantic = self.search_semantic(query, candidate_limit);
703        let signals = QuerySignals::from_query(query);
704
705        // Determine the active room from heat scores.
706        let active_room = self.active_room();
707
708        // Merge: semantic results win ties (scored 1.0–2.0 range after boost).
709        // BM25 results land in 0.0–1.0 range.
710        let mut merged_by_path: HashMap<String, SearchResult> = HashMap::new();
711
712        for r in semantic {
713            let score = reranked_score(&signals, active_room.as_deref(), &r, true);
714            merge_scored_result(&mut merged_by_path, SearchResult { score, ..r });
715        }
716
717        for r in bm25 {
718            let score = reranked_score(&signals, active_room.as_deref(), &r, false);
719            merge_scored_result(&mut merged_by_path, SearchResult { score, ..r });
720        }
721
722        let mut merged: Vec<SearchResult> = merged_by_path.into_values().collect();
723        merged.sort_by(|a, b| {
724            b.score
725                .partial_cmp(&a.score)
726                .unwrap_or(std::cmp::Ordering::Equal)
727        });
728        merged.truncate(limit);
729        Ok(merged)
730    }
731
732    /// Returns the room with the highest total heat (most edited subsystem).
733    /// Used to bias retrieval toward what the user is actively working on.
734    fn active_room(&self) -> Option<String> {
735        let db = self.db.lock().unwrap();
736        db.query_row(
737            "SELECT cm.room, SUM(fh.heat) as total
738             FROM file_heat fh
739             JOIN chunks_meta cm ON cm.path = fh.path
740             GROUP BY cm.room
741             ORDER BY total DESC
742             LIMIT 1",
743            [],
744            |row| row.get::<_, String>(0),
745        )
746        .ok()
747    }
748
749    // ── Project Indexing ──────────────────────────────────────────────────────
750
751    /// Walk the entire project and index all source files (BM25 + embeddings).
752    ///
753    /// Skips: `target/`, `.git/`, `node_modules/`, `.hematite/`, files > 512 KB.
754    /// Also indexes `.hematite/docs/` — the designated reference document drop folder.
755    /// Returns the number of files processed (unchanged files are fast-pathed).
756    pub fn index_project(&mut self) -> usize {
757        let root = crate::tools::file_ops::workspace_root();
758        let mut count = 0usize;
759
760        const INDEXABLE: &[&str] = &[
761            "rs", "toml", "md", "json", "ts", "tsx", "js", "py", "go", "c", "cpp", "h", "yaml",
762            "yml", "txt",
763        ];
764        const SKIP_DIRS: &[&str] = &["target", ".git", "node_modules", ".hematite"];
765
766        for entry in walkdir::WalkDir::new(&root)
767            .follow_links(false)
768            .into_iter()
769            .filter_entry(|e| {
770                if e.file_type().is_dir() {
771                    let name = e.file_name().to_string_lossy();
772                    return !SKIP_DIRS.contains(&name.as_ref());
773                }
774                true
775            })
776            .filter_map(|e| e.ok())
777            .filter(|e| e.file_type().is_file())
778        {
779            let path = entry.path();
780            let ext = path.extension().and_then(|e| e.to_str()).unwrap_or("");
781            if !INDEXABLE.contains(&ext) {
782                continue;
783            }
784
785            let Ok(meta) = std::fs::metadata(path) else {
786                continue;
787            };
788            if meta.len() > 512_000 {
789                continue;
790            }
791
792            let mtime = meta
793                .modified()
794                .map(|t| {
795                    t.duration_since(std::time::UNIX_EPOCH)
796                        .unwrap_or_default()
797                        .as_secs() as i64
798                })
799                .unwrap_or(0);
800
801            let rel = path.strip_prefix(&root).unwrap_or(path);
802            let rel_str = rel.to_string_lossy().replace('\\', "/");
803
804            if let Ok(content) = std::fs::read_to_string(path) {
805                match self.index_document(&rel_str, mtime, &content) {
806                    Ok(new_chunks) if !new_chunks.is_empty() => {
807                        count += 1;
808                    }
809                    Ok(_) => {}
810                    Err(_) => {}
811                }
812            }
813        }
814
815        count += self.index_workspace_artifacts(&crate::tools::file_ops::hematite_dir());
816
817        count
818    }
819
820    /// Index workspace-local supporting context that should be available even
821    /// outside a real project workspace: `.hematite/docs/`, recent session
822    /// reports stored in `.hematite/reports/`, and imported chat exports in
823    /// `.hematite/imports/`.
824    pub fn index_workspace_artifacts(&mut self, workspace_root: &std::path::Path) -> usize {
825        let mut count = self.index_docs_folder(workspace_root);
826        count += self.index_recent_session_reports(workspace_root);
827        count += self.index_imported_session_exports(workspace_root);
828        self.backfill_missing_embeddings();
829        count
830    }
831
832    /// Index reference documents in `.hematite/docs/`.
833    /// Supports PDF (text extraction), markdown, and plain text.
834    /// Documents are stored with path prefix `docs/filename` so they are
835    /// distinguishable from source files in retrieval results.
836    fn index_docs_folder(&mut self, workspace_root: &std::path::Path) -> usize {
837        let docs_dir = workspace_root.join(".hematite").join("docs");
838        const DOCS_INDEXABLE: &[&str] = &["pdf", "md", "txt", "markdown"];
839        let mut count = 0usize;
840        let mut desired_paths = HashSet::new();
841
842        if docs_dir.exists() {
843            for entry in walkdir::WalkDir::new(&docs_dir)
844                .max_depth(3)
845                .follow_links(false)
846                .into_iter()
847                .filter_map(|e| e.ok())
848                .filter(|e| e.file_type().is_file())
849            {
850                let path = entry.path();
851                let ext = path
852                    .extension()
853                    .and_then(|e| e.to_str())
854                    .unwrap_or("")
855                    .to_lowercase();
856                if !DOCS_INDEXABLE.contains(&ext.as_str()) {
857                    continue;
858                }
859
860                let Ok(meta) = std::fs::metadata(path) else {
861                    continue;
862                };
863                if meta.len() > 50_000_000 {
864                    continue;
865                }
866
867                let mtime = meta
868                    .modified()
869                    .map(|t| {
870                        t.duration_since(std::time::UNIX_EPOCH)
871                            .unwrap_or_default()
872                            .as_secs() as i64
873                    })
874                    .unwrap_or(0);
875
876                let rel = path.strip_prefix(workspace_root).unwrap_or(path);
877                let rel_str = rel.to_string_lossy().replace('\\', "/");
878                desired_paths.insert(rel_str.clone());
879
880                let content = if ext == "pdf" {
881                    extract_pdf_text(path).ok().flatten()
882                } else {
883                    std::fs::read_to_string(path).ok()
884                };
885
886                if let Some(text) = content {
887                    if text.trim().is_empty() {
888                        continue;
889                    }
890                    match self.index_document(&rel_str, mtime, &text) {
891                        Ok(new_chunks) if !new_chunks.is_empty() => {
892                            count += 1;
893                        }
894                        Ok(_) => {}
895                        Err(_) => {}
896                    }
897                }
898            }
899        }
900
901        self.prune_indexed_prefix(".hematite/docs/", &desired_paths);
902        count
903    }
904
905    /// Index the most recent local session reports by exchange pair so prior
906    /// decisions remain searchable across launches without flooding the vein.
907    pub fn index_recent_session_reports(&mut self, workspace_root: &std::path::Path) -> usize {
908        let reports_dir = workspace_root.join(".hematite").join("reports");
909        let mut count = 0usize;
910        let mut desired_paths = HashSet::new();
911
912        if reports_dir.exists() {
913            let mut reports: Vec<std::path::PathBuf> = std::fs::read_dir(&reports_dir)
914                .ok()
915                .into_iter()
916                .flat_map(|entries| entries.filter_map(|entry| entry.ok()))
917                .map(|entry| entry.path())
918                .filter(|path| {
919                    path.is_file()
920                        && path.extension().and_then(|ext| ext.to_str()) == Some("json")
921                        && path
922                            .file_stem()
923                            .and_then(|stem| stem.to_str())
924                            .map(|stem| stem.starts_with("session_"))
925                            .unwrap_or(false)
926                })
927                .collect();
928
929            reports.sort_by(|a, b| {
930                let a_name = a
931                    .file_name()
932                    .and_then(|name| name.to_str())
933                    .unwrap_or_default();
934                let b_name = b
935                    .file_name()
936                    .and_then(|name| name.to_str())
937                    .unwrap_or_default();
938                b_name.cmp(a_name)
939            });
940            reports.truncate(Self::SESSION_REPORT_LIMIT);
941
942            for report_path in reports {
943                let Ok(meta) = std::fs::metadata(&report_path) else {
944                    continue;
945                };
946                let mtime = meta
947                    .modified()
948                    .map(|t| {
949                        t.duration_since(std::time::UNIX_EPOCH)
950                            .unwrap_or_default()
951                            .as_secs() as i64
952                    })
953                    .unwrap_or(0);
954
955                for exchange in load_session_exchanges(&report_path, mtime) {
956                    desired_paths.insert(exchange.path.clone());
957                    let mtype = detect_memory_type(&exchange.content);
958                    match self.index_chunks_with_room_and_type(
959                        &exchange.path,
960                        exchange.last_modified,
961                        "session",
962                        mtype,
963                        std::slice::from_ref(&exchange.content),
964                    ) {
965                        Ok(new_chunks) if !new_chunks.is_empty() => {
966                            count += 1;
967                        }
968                        Ok(_) => {}
969                        Err(_) => {}
970                    }
971                }
972            }
973        }
974
975        self.prune_indexed_prefix("session/", &desired_paths);
976        count
977    }
978
979    /// Index imported chat exports from `.hematite/imports/`.
980    /// Supported inputs include already-normalized `>` transcripts, Claude Code
981    /// JSONL, Codex CLI JSONL, simple role/content JSON exports, ChatGPT
982    /// `mapping` exports, and Hematite session-report JSON.
983    pub fn index_imported_session_exports(&mut self, workspace_root: &std::path::Path) -> usize {
984        let imports_dir = workspace_root.join(".hematite").join("imports");
985        let mut count = 0usize;
986        let mut desired_paths = HashSet::new();
987
988        if imports_dir.exists() {
989            let mut imports: Vec<(std::path::PathBuf, i64)> = walkdir::WalkDir::new(&imports_dir)
990                .max_depth(4)
991                .follow_links(false)
992                .into_iter()
993                .filter_map(|entry| entry.ok())
994                .filter(|entry| entry.file_type().is_file())
995                .filter_map(|entry| {
996                    let path = entry.into_path();
997                    let ext = path
998                        .extension()
999                        .and_then(|ext| ext.to_str())
1000                        .unwrap_or("")
1001                        .to_ascii_lowercase();
1002                    if !matches!(ext.as_str(), "json" | "jsonl" | "md" | "txt") {
1003                        return None;
1004                    }
1005                    let meta = std::fs::metadata(&path).ok()?;
1006                    if meta.len() > Self::IMPORT_MAX_BYTES {
1007                        return None;
1008                    }
1009                    let mtime = meta
1010                        .modified()
1011                        .map(|t| {
1012                            t.duration_since(std::time::UNIX_EPOCH)
1013                                .unwrap_or_default()
1014                                .as_secs() as i64
1015                        })
1016                        .unwrap_or(0);
1017                    Some((path, mtime))
1018                })
1019                .collect();
1020
1021            imports.sort_by(|(a_path, a_mtime), (b_path, b_mtime)| {
1022                b_mtime
1023                    .cmp(a_mtime)
1024                    .then_with(|| a_path.to_string_lossy().cmp(&b_path.to_string_lossy()))
1025            });
1026            imports.truncate(Self::IMPORT_FILE_LIMIT);
1027
1028            for (import_path, mtime) in imports {
1029                for exchange in load_imported_session_exchanges(&import_path, &imports_dir, mtime) {
1030                    desired_paths.insert(exchange.path.clone());
1031                    let mtype = detect_memory_type(&exchange.content);
1032                    match self.index_chunks_with_room_and_type(
1033                        &exchange.path,
1034                        exchange.last_modified,
1035                        "session",
1036                        mtype,
1037                        std::slice::from_ref(&exchange.content),
1038                    ) {
1039                        Ok(new_chunks) if !new_chunks.is_empty() => {
1040                            count += 1;
1041                        }
1042                        Ok(_) => {}
1043                        Err(_) => {}
1044                    }
1045                }
1046            }
1047        }
1048
1049        self.prune_indexed_prefix("session/imports/", &desired_paths);
1050        count
1051    }
1052
1053    /// Embed any FTS chunks that don't yet have a vector in chunks_vec.
1054    /// Called at the end of index_project so that loading the embedding model
1055    /// after the initial index automatically triggers a semantic upgrade on the
1056    /// next agent turn — no /forget or file-touch required.
1057    fn backfill_missing_embeddings(&self) {
1058        // Fast path: if chunk counts match, nothing to do.
1059        let (fts_count, vec_count) = {
1060            let db = self.db.lock().unwrap();
1061            let fts: i64 = db
1062                .query_row("SELECT COUNT(*) FROM chunks_fts", [], |r| r.get(0))
1063                .unwrap_or(0);
1064            let vec: i64 = db
1065                .query_row("SELECT COUNT(*) FROM chunks_vec", [], |r| r.get(0))
1066                .unwrap_or(0);
1067            (fts, vec)
1068        };
1069        if fts_count == 0 || fts_count == vec_count {
1070            return;
1071        }
1072
1073        // Fetch (path, chunk_idx, content) for chunks with no embedding.
1074        // chunks_fts rowid serves as chunk_idx (1-based → convert to 0-based).
1075        let missing: Vec<(String, i64, String)> = {
1076            let db = self.db.lock().unwrap();
1077            let mut stmt = db
1078                .prepare(
1079                    "SELECT f.path, (f.rowid - 1) AS chunk_idx, f.content
1080                     FROM chunks_fts f
1081                     LEFT JOIN chunks_vec v ON f.path = v.path AND (f.rowid - 1) = v.chunk_idx
1082                     WHERE v.path IS NULL
1083                     ORDER BY CASE
1084                         WHEN f.path LIKE '%.rs' THEN 0
1085                         WHEN f.path LIKE '%.toml' THEN 1
1086                         WHEN f.path LIKE '%.json' THEN 2
1087                         ELSE 3
1088                     END, f.path
1089                     LIMIT 20",
1090                )
1091                .unwrap();
1092            stmt.query_map([], |r| {
1093                Ok((
1094                    r.get::<_, String>(0)?,
1095                    r.get::<_, i64>(1)?,
1096                    r.get::<_, String>(2)?,
1097                ))
1098            })
1099            .unwrap()
1100            .filter_map(|r| r.ok())
1101            .collect()
1102        };
1103
1104        let Some(embed_model) = self.current_embed_model() else {
1105            return;
1106        };
1107
1108        for (path, idx, content) in missing {
1109            if let Some(vec) = embed_text_blocking(&content, &self.base_url, &embed_model) {
1110                let blob = floats_to_blob(&vec);
1111                let db = self.db.lock().unwrap();
1112                let _ = db.execute(
1113                    "INSERT OR REPLACE INTO chunks_vec (path, chunk_idx, embedding) VALUES (?1, ?2, ?3)",
1114                    params![path, idx, blob],
1115                );
1116            } else {
1117                // Embedding model not available — stop trying for this pass.
1118                break;
1119            }
1120        }
1121    }
1122
1123    /// Total number of unique files currently indexed.
1124    /// Session exchange chunks are excluded so status counts stay source/doc centric.
1125    pub fn file_count(&self) -> usize {
1126        let db = self.db.lock().unwrap();
1127        db.query_row(
1128            "SELECT COUNT(*) FROM chunks_meta WHERE path NOT LIKE 'session/%'",
1129            [],
1130            |r| r.get::<_, i64>(0),
1131        )
1132        .unwrap_or(0) as usize
1133    }
1134
1135    /// Number of source/doc chunks that have semantic embedding vectors stored.
1136    /// Session exchange chunks are excluded so status counts stay source/doc centric.
1137    pub fn embedded_chunk_count(&self) -> usize {
1138        let db = self.db.lock().unwrap();
1139        db.query_row(
1140            "SELECT COUNT(*) FROM chunks_vec WHERE path NOT LIKE 'session/%'",
1141            [],
1142            |r| r.get::<_, i64>(0),
1143        )
1144        .unwrap_or(0) as usize
1145    }
1146
1147    /// True when any chunk type currently has embeddings available.
1148    pub fn has_any_embeddings(&self) -> bool {
1149        let db = self.db.lock().unwrap();
1150        db.query_row("SELECT EXISTS(SELECT 1 FROM chunks_vec LIMIT 1)", [], |r| {
1151            r.get::<_, i64>(0)
1152        })
1153        .unwrap_or(0)
1154            != 0
1155    }
1156
1157    /// Wipe all indexed data. The DB file stays on disk; next index_project()
1158    /// call rebuilds from scratch (re-reads all files, re-embeds all chunks).
1159    pub fn reset(&self) {
1160        let db = self.db.lock().unwrap();
1161        let _ = db.execute_batch(
1162            "DELETE FROM chunks_fts;
1163             DELETE FROM chunks_vec;
1164             DELETE FROM chunks_meta;",
1165        );
1166    }
1167
1168    /// Return a compact operator-facing snapshot of what The Vein currently knows.
1169    /// Intended for trust/debug surfaces like `/vein-inspect`.
1170    pub fn inspect_snapshot(&self, hot_limit: usize) -> VeinInspectionSnapshot {
1171        let db = self.db.lock().unwrap();
1172        let indexed_source_files = db
1173            .query_row(
1174                "SELECT COUNT(*) FROM chunks_meta
1175                 WHERE path NOT LIKE 'session/%'
1176                   AND path NOT LIKE '.hematite/docs/%'",
1177                [],
1178                |r| r.get::<_, i64>(0),
1179            )
1180            .unwrap_or(0) as usize;
1181        let indexed_docs = db
1182            .query_row(
1183                "SELECT COUNT(*) FROM chunks_meta WHERE path LIKE '.hematite/docs/%'",
1184                [],
1185                |r| r.get::<_, i64>(0),
1186            )
1187            .unwrap_or(0) as usize;
1188        let indexed_session_exchanges = db
1189            .query_row(
1190                "SELECT COUNT(*) FROM chunks_meta WHERE path LIKE 'session/%'",
1191                [],
1192                |r| r.get::<_, i64>(0),
1193            )
1194            .unwrap_or(0) as usize;
1195        let embedded_source_doc_chunks = db
1196            .query_row(
1197                "SELECT COUNT(*) FROM chunks_vec WHERE path NOT LIKE 'session/%'",
1198                [],
1199                |r| r.get::<_, i64>(0),
1200            )
1201            .unwrap_or(0) as usize;
1202        let has_any_embeddings = db
1203            .query_row("SELECT EXISTS(SELECT 1 FROM chunks_vec LIMIT 1)", [], |r| {
1204                r.get::<_, i64>(0)
1205            })
1206            .unwrap_or(0)
1207            != 0;
1208        drop(db);
1209
1210        let hot_files = self
1211            .hot_files(hot_limit.max(1))
1212            .into_iter()
1213            .map(|(path, heat, last_modified, room)| VeinHotFile {
1214                path,
1215                heat,
1216                last_modified,
1217                room,
1218            })
1219            .collect::<Vec<_>>();
1220
1221        VeinInspectionSnapshot {
1222            indexed_source_files,
1223            indexed_docs,
1224            indexed_session_exchanges,
1225            embedded_source_doc_chunks,
1226            has_any_embeddings,
1227            active_room: self.active_room(),
1228            l1_ready: !hot_files.is_empty(),
1229            hot_files,
1230        }
1231    }
1232
1233    // ── L1 heat tracking ──────────────────────────────────────────────────────
1234
1235    /// Record an edit to a file. Increments its heat score in file_heat.
1236    /// Called from the tool dispatch after a successful edit_file / write_file /
1237    /// patch_hunk / multi_search_replace so the L1 context stays current.
1238    pub fn bump_heat(&self, path: &str) {
1239        if path.is_empty() {
1240            return;
1241        }
1242        let now = std::time::SystemTime::now()
1243            .duration_since(std::time::UNIX_EPOCH)
1244            .unwrap_or_default()
1245            .as_secs() as i64;
1246        let db = self.db.lock().unwrap();
1247        let _ = db.execute(
1248            "INSERT INTO file_heat (path, heat, last_edit) VALUES (?1, 1, ?2)
1249             ON CONFLICT(path) DO UPDATE SET heat = heat + 1, last_edit = ?2",
1250            params![path, now],
1251        );
1252    }
1253
1254    /// Return the top N hot files ranked by edit count (heat) then recency.
1255    /// Joins file_heat with chunks_meta so only indexed files are included.
1256    /// Returns (path, heat, mtime, room).
1257    fn hot_files(&self, n: usize) -> Vec<(String, i64, i64, String)> {
1258        let db = self.db.lock().unwrap();
1259        let mut stmt = match db.prepare(
1260            "SELECT fh.path, fh.heat, cm.last_modified, cm.room
1261             FROM file_heat fh
1262             JOIN chunks_meta cm ON cm.path = fh.path
1263             ORDER BY fh.heat DESC, cm.last_modified DESC
1264             LIMIT ?1",
1265        ) {
1266            Ok(s) => s,
1267            Err(_) => return vec![],
1268        };
1269        stmt.query_map(params![n as i64], |row| {
1270            Ok((
1271                row.get::<_, String>(0)?,
1272                row.get::<_, i64>(1)?,
1273                row.get::<_, i64>(2)?,
1274                row.get::<_, String>(3)?,
1275            ))
1276        })
1277        .map(|rows| rows.filter_map(|r| r.ok()).collect())
1278        .unwrap_or_default()
1279    }
1280
1281    /// Return the paths of the top hot files (most edited).
1282    /// Used by RepoMapGenerator to bias PageRank toward active files.
1283    pub fn hot_file_paths(&self, n: usize) -> Vec<String> {
1284        self.hot_files(n)
1285            .into_iter()
1286            .map(|(path, _, _, _)| path)
1287            .collect()
1288    }
1289
1290    /// Return hot files with normalized heat weights in [0.0, 1.0].
1291    /// The hottest file gets weight 1.0; others are scaled proportionally.
1292    /// Used by RepoMapGenerator to apply heat-weighted PageRank personalization.
1293    pub fn hot_files_weighted(&self, n: usize) -> Vec<(String, f64)> {
1294        let files = self.hot_files(n);
1295        if files.is_empty() {
1296            return vec![];
1297        }
1298        let max_heat = files
1299            .iter()
1300            .map(|(_, h, _, _)| *h)
1301            .max()
1302            .unwrap_or(1)
1303            .max(1) as f64;
1304        files
1305            .into_iter()
1306            .map(|(path, heat, _, _)| {
1307                let weight = (heat as f64) / max_heat;
1308                (path, weight)
1309            })
1310            .collect()
1311    }
1312
1313    /// Build the L1 context block — a compact "hot files" summary injected into
1314    /// the system prompt at session start. Capped at ~150 tokens.
1315    /// Files are grouped by room so the model sees subsystem structure at a glance.
1316    /// Returns None when there are no heat records yet (fresh project).
1317    pub fn l1_context(&self) -> Option<String> {
1318        let files = self.hot_files(8);
1319        if files.is_empty() {
1320            return None;
1321        }
1322        let now = std::time::SystemTime::now()
1323            .duration_since(std::time::UNIX_EPOCH)
1324            .unwrap_or_default()
1325            .as_secs() as i64;
1326
1327        // Group by room for readability.
1328        let mut by_room: std::collections::BTreeMap<String, Vec<(String, i64, i64)>> =
1329            std::collections::BTreeMap::new();
1330        for (path, heat, mtime, room) in &files {
1331            by_room
1332                .entry(room.clone())
1333                .or_default()
1334                .push((path.clone(), *heat, *mtime));
1335        }
1336
1337        let mut out = String::from("# Hot Files (most edited — grouped by subsystem)\n");
1338        for (room, entries) in &by_room {
1339            out.push_str(&format!("[{}]\n", room));
1340            for (path, heat, mtime) in entries {
1341                let age_secs = now - mtime;
1342                let age = if age_secs < 3600 {
1343                    "just now".to_string()
1344                } else if age_secs < 86400 {
1345                    format!("{}h ago", age_secs / 3600)
1346                } else {
1347                    format!("{}d ago", age_secs / 86400)
1348                };
1349                out.push_str(&format!(
1350                    "  - {} [{} edit{}, {}]\n",
1351                    path,
1352                    heat,
1353                    if *heat == 1 { "" } else { "s" },
1354                    age
1355                ));
1356            }
1357        }
1358        Some(out)
1359    }
1360
1361    fn prune_indexed_prefix(&self, prefix: &str, desired_paths: &HashSet<String>) {
1362        let pattern = format!("{}%", prefix);
1363        let existing_paths: Vec<String> = {
1364            let db = self.db.lock().unwrap();
1365            let mut stmt = match db.prepare("SELECT path FROM chunks_meta WHERE path LIKE ?1") {
1366                Ok(stmt) => stmt,
1367                Err(_) => return,
1368            };
1369            stmt.query_map(params![pattern], |row| row.get::<_, String>(0))
1370                .map(|rows| rows.filter_map(|row| row.ok()).collect())
1371                .unwrap_or_default()
1372        };
1373
1374        if existing_paths.is_empty() {
1375            return;
1376        }
1377
1378        let db = self.db.lock().unwrap();
1379        for path in existing_paths {
1380            if desired_paths.contains(&path) {
1381                continue;
1382            }
1383            let _ = db.execute("DELETE FROM chunks_fts WHERE path = ?1", params![path]);
1384            let _ = db.execute("DELETE FROM chunks_vec WHERE path = ?1", params![path]);
1385            let _ = db.execute("DELETE FROM chunks_meta WHERE path = ?1", params![path]);
1386        }
1387    }
1388}
1389
1390impl QuerySignals {
1391    fn from_query(query: &str) -> Self {
1392        let lower = query.to_ascii_lowercase();
1393        let historical_memory_hint = [
1394            "remember",
1395            "earlier",
1396            "previous",
1397            "last time",
1398            "what did we decide",
1399            "why did we decide",
1400            "what did we say",
1401            "why did we change",
1402        ]
1403        .iter()
1404        .any(|needle| lower.contains(needle));
1405
1406        // Detect what memory type the query is asking about.
1407        let query_memory_type = if lower.contains("decide")
1408            || lower.contains("decision")
1409            || lower.contains("we agreed")
1410            || lower.contains("we chose")
1411        {
1412            Some("decision")
1413        } else if lower.contains("bug")
1414            || lower.contains("error")
1415            || lower.contains("issue")
1416            || lower.contains("problem")
1417            || lower.contains("fix")
1418            || lower.contains("broken")
1419        {
1420            Some("problem")
1421        } else if lower.contains("shipped")
1422            || lower.contains("milestone")
1423            || lower.contains("finished")
1424            || lower.contains("working now")
1425        {
1426            Some("milestone")
1427        } else if lower.contains("prefer")
1428            || lower.contains("my preference")
1429            || lower.contains("i like")
1430            || lower.contains("i want")
1431        {
1432            Some("preference")
1433        } else {
1434            None
1435        };
1436
1437        Self {
1438            exact_phrases: extract_exact_phrases(query),
1439            standout_terms: extract_standout_terms(query),
1440            historical_memory_hint,
1441            temporal_reference: extract_temporal_reference(query),
1442            query_memory_type,
1443        }
1444    }
1445}
1446
1447fn merge_scored_result(
1448    merged_by_path: &mut HashMap<String, SearchResult>,
1449    candidate: SearchResult,
1450) {
1451    match merged_by_path.get_mut(&candidate.path) {
1452        Some(existing) if candidate.score > existing.score => *existing = candidate,
1453        Some(_) => {}
1454        None => {
1455            merged_by_path.insert(candidate.path.clone(), candidate);
1456        }
1457    }
1458}
1459
1460fn reranked_score(
1461    signals: &QuerySignals,
1462    active_room: Option<&str>,
1463    result: &SearchResult,
1464    is_semantic: bool,
1465) -> f32 {
1466    let base = if is_semantic {
1467        1.0 + result.score.clamp(0.0, 1.0)
1468    } else {
1469        (result.score / 10.0).clamp(0.0, 1.0)
1470    };
1471    base + room_bias(active_room, result)
1472        + retrieval_signal_boost(signals, result)
1473        + temporal_memory_boost(signals, result)
1474}
1475
1476fn room_bias(active_room: Option<&str>, result: &SearchResult) -> f32 {
1477    if active_room == Some(result.room.as_str()) {
1478        0.15
1479    } else {
1480        0.0
1481    }
1482}
1483
1484fn retrieval_signal_boost(signals: &QuerySignals, result: &SearchResult) -> f32 {
1485    let mut boost = 0.0f32;
1486    let haystack = format!(
1487        "{}\n{}",
1488        result.path.to_ascii_lowercase(),
1489        result.content.to_ascii_lowercase()
1490    );
1491
1492    let phrase_matches = signals
1493        .exact_phrases
1494        .iter()
1495        .filter(|phrase| haystack.contains(phrase.as_str()))
1496        .count();
1497    if phrase_matches > 0 {
1498        boost += 0.35 + ((phrase_matches.saturating_sub(1)) as f32 * 0.1);
1499    }
1500
1501    let mut standout_matches = 0;
1502    for term in &signals.standout_terms {
1503        if result.path.to_ascii_lowercase().contains(term.as_str()) {
1504            boost += 0.40;
1505            standout_matches += 1;
1506        } else if result.content.to_ascii_lowercase().contains(term.as_str()) {
1507            boost += 0.12;
1508            standout_matches += 1;
1509        }
1510        if standout_matches >= 3 {
1511            break;
1512        }
1513    }
1514
1515    if signals.historical_memory_hint && result.room == "session" {
1516        boost += 0.45;
1517    }
1518
1519    // Boost session chunks whose tagged memory type matches the query's intent.
1520    if let Some(qtype) = signals.query_memory_type {
1521        if !result.memory_type.is_empty() && result.memory_type == qtype {
1522            boost += 0.35;
1523        }
1524    }
1525
1526    boost
1527}
1528
1529fn temporal_memory_boost(signals: &QuerySignals, result: &SearchResult) -> f32 {
1530    if result.room != "session" {
1531        return 0.0;
1532    }
1533    let Some(reference) = signals.temporal_reference else {
1534        return 0.0;
1535    };
1536    let Some(memory_ts) = session_memory_timestamp(result) else {
1537        return 0.0;
1538    };
1539
1540    let span = reference.window_secs.max(86_400);
1541    let full_fade = span.saturating_mul(8);
1542    if full_fade <= 0 {
1543        return 0.0;
1544    }
1545
1546    let distance = (memory_ts - reference.target_ts).abs();
1547    let closeness = 1.0 - (distance as f32 / full_fade as f32).min(1.0);
1548    if closeness <= 0.0 {
1549        0.0
1550    } else {
1551        0.22 * closeness
1552    }
1553}
1554
/// Extract quoted phrases from a query, lowercased (ASCII) and de-duplicated in
/// order of first appearance.
///
/// Phrases may be delimited by `"`, `` ` `` or `'`. Phrases shorter than three
/// bytes after trimming are dropped.
///
/// Apostrophes get special handling so contractions and possessives are not
/// mistaken for quotes:
/// - a `'` only opens a phrase when it is not directly preceded by an
///   alphanumeric character (so the one in `what's` is ignored);
/// - a `'` only closes a phrase when it is not directly followed by an
///   alphanumeric character (so `'don't stop'` stays one phrase);
/// - a single quote with no closing partner is skipped.
///
/// An unterminated `"` or `` ` `` still captures the rest of the query.
fn extract_exact_phrases(query: &str) -> Vec<String> {
    let chars: Vec<char> = query.chars().collect();
    let mut phrases: Vec<String> = Vec::new();
    let mut i = 0usize;

    while i < chars.len() {
        let quote = chars[i];
        if !matches!(quote, '"' | '\'' | '`') {
            i += 1;
            continue;
        }

        let start = i + 1;
        let end = if quote == '\'' {
            let opens_phrase = i == 0 || !chars[i - 1].is_alphanumeric();
            let closing = if opens_phrase {
                (start..chars.len()).find(|&j| {
                    chars[j] == '\''
                        && chars
                            .get(j + 1)
                            .map_or(true, |next| !next.is_alphanumeric())
                })
            } else {
                None
            };
            match closing {
                Some(j) => j,
                None => {
                    // Apostrophe or stray quote: not a phrase delimiter.
                    i += 1;
                    continue;
                }
            }
        } else {
            (start..chars.len())
                .find(|&j| chars[j] == quote)
                .unwrap_or(chars.len())
        };

        if end > start {
            let phrase = chars[start..end]
                .iter()
                .collect::<String>()
                .trim()
                .to_ascii_lowercase();
            if phrase.len() >= 3 && !phrases.contains(&phrase) {
                phrases.push(phrase);
            }
        }
        i = end.saturating_add(1);
    }

    phrases
}
1586
/// Extract "standout" terms from a query: tokens distinctive enough to be worth
/// matching literally against paths and content.
///
/// The query is split on every character that is not ASCII alphanumeric or one
/// of `_ - . / :`. Trailing `.` and `:` are then stripped from each token so
/// sentence punctuation ("decide.", "note:") neither defeats the stopword list
/// nor makes an ordinary word look like an identifier. A token is kept when it
/// is at least four bytes long, is not a stopword, and is "interesting": it
/// contains a digit, an uppercase letter or one of `_ - . / :`, or is at least
/// nine bytes long.
///
/// Terms are returned ASCII-lowercased, de-duplicated, in order of first appearance.
fn extract_standout_terms(query: &str) -> Vec<String> {
    const STOPWORDS: &[&str] = &[
        "about", "after", "before", "change", "changed", "decide", "decided", "does", "earlier",
        "flow", "from", "have", "into", "just", "last", "local", "make", "more", "remember",
        "should", "that", "their", "there", "these", "they", "this", "those", "what", "when",
        "where", "which", "why", "with", "work",
    ];

    /// Punctuation that may appear inside identifiers, paths and versions.
    fn is_term_punct(ch: char) -> bool {
        matches!(ch, '_' | '-' | '.' | '/' | ':')
    }

    let mut standout: Vec<String> = Vec::new();
    for raw in query.split(|ch: char| !(ch.is_ascii_alphanumeric() || is_term_punct(ch))) {
        // Drop sentence-final punctuation; leading punctuation (".hematite",
        // "/command", "--flag") is meaningful and kept.
        let token = raw.trim_end_matches(|ch: char| matches!(ch, '.' | ':'));
        if token.len() < 4 {
            continue;
        }
        let lower = token.to_ascii_lowercase();
        if STOPWORDS.contains(&lower.as_str()) {
            continue;
        }

        let interesting = token.len() >= 9
            || token
                .chars()
                .any(|ch| ch.is_ascii_digit() || ch.is_ascii_uppercase() || is_term_punct(ch));

        if interesting && !standout.contains(&lower) {
            standout.push(lower);
        }
    }

    standout
}
1622
1623fn extract_temporal_reference(query: &str) -> Option<TemporalReference> {
1624    if let Some(ts) = extract_iso_date_from_query(query) {
1625        return Some(TemporalReference {
1626            target_ts: ts,
1627            window_secs: 86_400,
1628        });
1629    }
1630
1631    let now = current_unix_timestamp();
1632    let lower = query.to_ascii_lowercase();
1633    if lower.contains("yesterday") {
1634        Some(TemporalReference {
1635            target_ts: now.saturating_sub(86_400),
1636            window_secs: 86_400,
1637        })
1638    } else if lower.contains("today") || lower.contains("earlier today") {
1639        Some(TemporalReference {
1640            target_ts: now,
1641            window_secs: 86_400,
1642        })
1643    } else if lower.contains("last week") {
1644        Some(TemporalReference {
1645            target_ts: now.saturating_sub(7 * 86_400),
1646            window_secs: 7 * 86_400,
1647        })
1648    } else if lower.contains("last month") {
1649        Some(TemporalReference {
1650            target_ts: now.saturating_sub(30 * 86_400),
1651            window_secs: 30 * 86_400,
1652        })
1653    } else {
1654        None
1655    }
1656}
1657
1658fn extract_iso_date_from_query(query: &str) -> Option<i64> {
1659    query
1660        .split(|ch: char| !(ch.is_ascii_digit() || ch == '-'))
1661        .find_map(parse_iso_date_token)
1662}
1663
/// Parse a strict `YYYY-MM-DD` token into a Unix timestamp (seconds) for
/// midnight of that day.
///
/// The token must be exactly ten bytes: ASCII digits everywhere except the
/// dashes at positions 4 and 7. Signed or padded fields such as `2024-+1-01`
/// are rejected. The month must be 1–12 and the day must exist in that month,
/// with leap years following the Gregorian rule, so `2023-02-29` is `None`.
fn parse_iso_date_token(token: &str) -> Option<i64> {
    let bytes = token.as_bytes();
    if bytes.len() != 10 {
        return None;
    }
    let well_formed = bytes.iter().enumerate().all(|(idx, byte)| match idx {
        4 | 7 => *byte == b'-',
        _ => byte.is_ascii_digit(),
    });
    if !well_formed {
        return None;
    }

    // Every byte is ASCII at this point, so these ranges are valid char boundaries.
    let year = token.get(0..4)?.parse::<i32>().ok()?;
    let month = token.get(5..7)?.parse::<u32>().ok()?;
    let day = token.get(8..10)?.parse::<u32>().ok()?;

    let is_leap_year = year % 4 == 0 && (year % 100 != 0 || year % 400 == 0);
    let days_in_month = match month {
        1 | 3 | 5 | 7 | 8 | 10 | 12 => 31,
        4 | 6 | 9 | 11 => 30,
        2 if is_leap_year => 29,
        2 => 28,
        _ => return None,
    };
    if !(1..=days_in_month).contains(&day) {
        return None;
    }

    Some(days_from_civil(year, month, day).saturating_mul(86_400))
}

/// Number of days between 1970-01-01 and the given proleptic Gregorian date
/// (negative for earlier dates).
///
/// The year is shifted to start in March so the leap day falls at the end, then
/// days are counted within 400-year eras of 146,097 days each.
fn days_from_civil(year: i32, month: u32, day: u32) -> i64 {
    // January and February belong to the previous March-based year.
    let year = if month <= 2 { year - 1 } else { year };
    // Floor division by 400 that also works for negative years.
    let floor_base = if year >= 0 { year } else { year - 399 };
    let era = floor_base / 400;
    let year_of_era = year - era * 400;
    // March = 0, ..., February = 11.
    let shifted_month = if month > 2 {
        month as i32 - 3
    } else {
        month as i32 + 9
    };
    let day_of_year = (153 * shifted_month + 2) / 5 + day as i32 - 1;
    let day_of_era = year_of_era * 365 + year_of_era / 4 - year_of_era / 100 + day_of_year;
    // 719,468 is the day count from 0000-03-01 to 1970-01-01.
    era as i64 * 146_097 + day_of_era as i64 - 719_468
}
1692
/// Current wall-clock time as whole seconds since the Unix epoch.
///
/// Returns 0 if the system clock reports a time before the epoch.
fn current_unix_timestamp() -> i64 {
    match std::time::SystemTime::now().duration_since(std::time::UNIX_EPOCH) {
        Ok(elapsed) => elapsed.as_secs() as i64,
        Err(_) => 0,
    }
}
1699
1700fn session_memory_timestamp(result: &SearchResult) -> Option<i64> {
1701    extract_session_path_timestamp(&result.path).or_else(|| {
1702        if result.last_modified > 0 {
1703            Some(result.last_modified)
1704        } else {
1705            None
1706        }
1707    })
1708}
1709
1710fn extract_session_path_timestamp(path: &str) -> Option<i64> {
1711    let normalized = path.replace('\\', "/");
1712    let mut parts = normalized.split('/');
1713    if parts.next()? != "session" {
1714        return None;
1715    }
1716    parse_iso_date_token(parts.next()?)
1717}
1718
1719fn session_speaker_kind(speaker: &str) -> SessionSpeakerKind {
1720    let normalized = speaker.trim().to_ascii_lowercase();
1721    match normalized.as_str() {
1722        "you" | "user" => SessionSpeakerKind::User,
1723        "" | "system" | "tool" => SessionSpeakerKind::Ignore,
1724        _ => SessionSpeakerKind::Assistant,
1725    }
1726}
1727
1728fn load_session_exchanges(report_path: &Path, last_modified: i64) -> Vec<SessionExchange> {
1729    let Ok(raw) = std::fs::read_to_string(report_path) else {
1730        return Vec::new();
1731    };
1732    let Ok(report) = serde_json::from_str::<SessionReport>(&raw) else {
1733        return Vec::new();
1734    };
1735
1736    let session_key = report_path
1737        .file_stem()
1738        .and_then(|stem| stem.to_str())
1739        .and_then(|stem| stem.strip_prefix("session_").or(Some(stem)))
1740        .unwrap_or("unknown-session")
1741        .to_string();
1742    let session_date = report
1743        .session_start
1744        .split('_')
1745        .next()
1746        .filter(|date| !date.is_empty())
1747        .unwrap_or_else(|| session_key.split('_').next().unwrap_or("unknown-date"))
1748        .to_string();
1749
1750    let mut exchanges = Vec::new();
1751    let mut pending_user: Option<String> = None;
1752    let mut turn_index = 0usize;
1753
1754    for entry in report.transcript {
1755        match session_speaker_kind(&entry.speaker) {
1756            SessionSpeakerKind::User => {
1757                let text = entry.text.trim();
1758                if !text.is_empty() {
1759                    pending_user = Some(text.to_string());
1760                }
1761            }
1762            SessionSpeakerKind::Assistant => {
1763                let text = entry.text.trim();
1764                if text.is_empty() {
1765                    continue;
1766                }
1767                let Some(user_text) = pending_user.take() else {
1768                    continue;
1769                };
1770                turn_index += 1;
1771                exchanges.push(SessionExchange {
1772                    path: format!(
1773                        "session/{}/{}/turn-{}",
1774                        session_date, session_key, turn_index
1775                    ),
1776                    last_modified,
1777                    content: format!(
1778                        "Earlier session exchange\nUser:\n{}\n\nAssistant:\n{}",
1779                        user_text, text
1780                    ),
1781                });
1782            }
1783            SessionSpeakerKind::Ignore => {}
1784        }
1785    }
1786
1787    if exchanges.len() > Vein::SESSION_TURN_LIMIT {
1788        let keep_from = exchanges.len() - Vein::SESSION_TURN_LIMIT;
1789        exchanges = exchanges.into_iter().skip(keep_from).collect();
1790    }
1791
1792    exchanges
1793}
1794
1795fn load_imported_session_exchanges(
1796    import_path: &Path,
1797    imports_root: &Path,
1798    last_modified: i64,
1799) -> Vec<SessionExchange> {
1800    let Ok(raw) = std::fs::read_to_string(import_path) else {
1801        return Vec::new();
1802    };
1803
1804    let messages = normalize_import_messages(&raw, import_path);
1805    if messages.is_empty() {
1806        return Vec::new();
1807    }
1808
1809    let rel = import_path
1810        .strip_prefix(imports_root)
1811        .unwrap_or(import_path);
1812    let rel_slug = slugify_import_path(rel);
1813    let mut exchanges = Vec::new();
1814    let mut pending_user: Option<String> = None;
1815    let mut turn_index = 0usize;
1816
1817    for (role, text) in messages {
1818        let cleaned = text.trim();
1819        if cleaned.is_empty() {
1820            continue;
1821        }
1822        match role.as_str() {
1823            "user" => pending_user = Some(cleaned.to_string()),
1824            "assistant" => {
1825                let Some(user_text) = pending_user.take() else {
1826                    continue;
1827                };
1828                turn_index += 1;
1829                exchanges.push(SessionExchange {
1830                    path: format!("session/imports/{}/turn-{}", rel_slug, turn_index),
1831                    last_modified,
1832                    content: format!(
1833                        "Imported session exchange\nSource: .hematite/imports/{}\n\nUser:\n{}\n\nAssistant:\n{}",
1834                        rel.to_string_lossy().replace('\\', "/"),
1835                        user_text,
1836                        cleaned
1837                    ),
1838                });
1839            }
1840            _ => {}
1841        }
1842    }
1843
1844    if exchanges.len() > Vein::SESSION_TURN_LIMIT {
1845        let keep_from = exchanges.len() - Vein::SESSION_TURN_LIMIT;
1846        exchanges = exchanges.into_iter().skip(keep_from).collect();
1847    }
1848
1849    exchanges
1850}
1851
1852fn normalize_import_messages(raw: &str, import_path: &Path) -> Vec<(String, String)> {
1853    if raw.trim().is_empty() {
1854        return Vec::new();
1855    }
1856
1857    if let Some(messages) = parse_marker_transcript(raw) {
1858        return messages;
1859    }
1860
1861    let ext = import_path
1862        .extension()
1863        .and_then(|ext| ext.to_str())
1864        .unwrap_or("")
1865        .to_ascii_lowercase();
1866
1867    if matches!(ext.as_str(), "json" | "jsonl")
1868        || matches!(raw.trim().chars().next(), Some('{') | Some('['))
1869    {
1870        if let Some(messages) = parse_jsonl_messages(raw) {
1871            if !messages.is_empty() {
1872                return messages;
1873            }
1874        }
1875
1876        if let Ok(value) = serde_json::from_str::<Value>(raw) {
1877            if let Some(messages) = parse_session_report_messages(&value) {
1878                return messages;
1879            }
1880            if let Some(messages) = parse_simple_role_messages(&value) {
1881                return messages;
1882            }
1883            if let Some(messages) = parse_chatgpt_mapping_messages(&value) {
1884                return messages;
1885            }
1886        }
1887    }
1888
1889    Vec::new()
1890}
1891
/// Parses a plain-text transcript in which user prompts are lines starting
/// with `> ` (after optional leading whitespace).
///
/// Every prompt line becomes a `user` message. The lines that follow, up to
/// the next prompt, are trimmed, stripped of blank lines and `---` rules, and
/// joined with newlines into one `assistant` message (omitted when nothing is
/// left). Text before the first prompt is ignored.
///
/// Returns `None` when fewer than two prompt lines are present.
fn parse_marker_transcript(raw: &str) -> Option<Vec<(String, String)>> {
    // The prompt text following the "> " marker, if this line is a prompt.
    fn prompt_of(line: &str) -> Option<&str> {
        line.trim_start().strip_prefix("> ")
    }

    let prompt_count = raw.lines().filter(|line| prompt_of(line).is_some()).count();
    if prompt_count < 2 {
        return None;
    }

    let mut messages: Vec<(String, String)> = Vec::new();
    let mut reply_lines: Vec<&str> = Vec::new();
    let mut seen_prompt = false;

    for line in raw.lines() {
        match prompt_of(line) {
            Some(prompt) => {
                // A new prompt closes the reply collected for the previous one.
                if !reply_lines.is_empty() {
                    messages.push(("assistant".to_string(), reply_lines.join("\n")));
                    reply_lines.clear();
                }
                messages.push(("user".to_string(), prompt.trim().to_string()));
                seen_prompt = true;
            }
            None if seen_prompt => {
                let content = line.trim();
                if !content.is_empty() && content != "---" {
                    reply_lines.push(content);
                }
            }
            None => {}
        }
    }

    if !reply_lines.is_empty() {
        messages.push(("assistant".to_string(), reply_lines.join("\n")));
    }

    (!messages.is_empty()).then_some(messages)
}
1932
1933fn parse_jsonl_messages(raw: &str) -> Option<Vec<(String, String)>> {
1934    let mut messages = Vec::new();
1935    let mut has_codex_session_meta = false;
1936    let mut saw_jsonl = false;
1937
1938    for line in raw.lines() {
1939        let trimmed = line.trim();
1940        if trimmed.is_empty() {
1941            continue;
1942        }
1943        let Ok(value) = serde_json::from_str::<Value>(trimmed) else {
1944            continue;
1945        };
1946        saw_jsonl = true;
1947        let Some(object) = value.as_object() else {
1948            continue;
1949        };
1950
1951        match object.get("type").and_then(|v| v.as_str()).unwrap_or("") {
1952            "session_meta" => {
1953                has_codex_session_meta = true;
1954            }
1955            "event_msg" => {
1956                let Some(payload) = object.get("payload").and_then(|v| v.as_object()) else {
1957                    continue;
1958                };
1959                let Some(text) = payload.get("message").and_then(|v| v.as_str()) else {
1960                    continue;
1961                };
1962                match payload.get("type").and_then(|v| v.as_str()).unwrap_or("") {
1963                    "user_message" => messages.push(("user".to_string(), text.trim().to_string())),
1964                    "agent_message" => {
1965                        messages.push(("assistant".to_string(), text.trim().to_string()))
1966                    }
1967                    _ => {}
1968                }
1969            }
1970            "human" | "user" => {
1971                if let Some(text) = extract_text_content(object.get("message").unwrap_or(&value)) {
1972                    messages.push(("user".to_string(), text));
1973                }
1974            }
1975            "assistant" => {
1976                if let Some(text) = extract_text_content(object.get("message").unwrap_or(&value)) {
1977                    messages.push(("assistant".to_string(), text));
1978                }
1979            }
1980            _ => {
1981                if let Some(role) = object.get("role").and_then(|v| v.as_str()) {
1982                    if let Some(text) = extract_text_content(&value) {
1983                        match role {
1984                            "user" | "human" => messages.push(("user".to_string(), text)),
1985                            "assistant" | "ai" => messages.push(("assistant".to_string(), text)),
1986                            _ => {}
1987                        }
1988                    }
1989                }
1990            }
1991        }
1992    }
1993
1994    if !saw_jsonl {
1995        return None;
1996    }
1997
1998    if has_codex_session_meta || !messages.is_empty() {
1999        return Some(messages);
2000    }
2001
2002    None
2003}
2004
2005fn parse_session_report_messages(value: &Value) -> Option<Vec<(String, String)>> {
2006    let report = value.as_object()?;
2007    let transcript = report.get("transcript")?.as_array()?;
2008    let mut messages = Vec::new();
2009
2010    for entry in transcript {
2011        let Some(obj) = entry.as_object() else {
2012            continue;
2013        };
2014        let speaker = obj
2015            .get("speaker")
2016            .and_then(|v| v.as_str())
2017            .unwrap_or_default();
2018        let text = obj
2019            .get("text")
2020            .and_then(|v| v.as_str())
2021            .unwrap_or_default()
2022            .trim()
2023            .to_string();
2024        if text.is_empty() {
2025            continue;
2026        }
2027        match session_speaker_kind(speaker) {
2028            SessionSpeakerKind::User => messages.push(("user".to_string(), text)),
2029            SessionSpeakerKind::Assistant => messages.push(("assistant".to_string(), text)),
2030            SessionSpeakerKind::Ignore => {}
2031        }
2032    }
2033
2034    (!messages.is_empty()).then_some(messages)
2035}
2036
2037fn parse_simple_role_messages(value: &Value) -> Option<Vec<(String, String)>> {
2038    if let Some(array) = value.as_array() {
2039        let messages = collect_role_messages(array);
2040        return (!messages.is_empty()).then_some(messages);
2041    }
2042
2043    let obj = value.as_object()?;
2044    if let Some(messages_value) = obj.get("messages").or_else(|| obj.get("chat_messages")) {
2045        let array = messages_value.as_array()?;
2046        let messages = collect_role_messages(array);
2047        return (!messages.is_empty()).then_some(messages);
2048    }
2049
2050    None
2051}
2052
2053fn collect_role_messages(items: &[Value]) -> Vec<(String, String)> {
2054    let mut messages = Vec::new();
2055    for item in items {
2056        let Some(obj) = item.as_object() else {
2057            continue;
2058        };
2059        let Some(role) = obj.get("role").and_then(|v| v.as_str()) else {
2060            continue;
2061        };
2062        let Some(text) = extract_text_content(item) else {
2063            continue;
2064        };
2065        match role {
2066            "user" | "human" => messages.push(("user".to_string(), text)),
2067            "assistant" | "ai" => messages.push(("assistant".to_string(), text)),
2068            _ => {}
2069        }
2070    }
2071    messages
2072}
2073
2074fn parse_chatgpt_mapping_messages(value: &Value) -> Option<Vec<(String, String)>> {
2075    let mapping = value.get("mapping")?.as_object()?;
2076    let mut current_id = mapping.iter().find_map(|(node_id, node)| {
2077        let obj = node.as_object()?;
2078        (obj.get("parent").is_some_and(|parent| parent.is_null())).then_some(node_id.clone())
2079    })?;
2080
2081    let mut messages = Vec::new();
2082    let mut visited = std::collections::HashSet::new();
2083
2084    while visited.insert(current_id.clone()) {
2085        let Some(node) = mapping.get(&current_id).and_then(|v| v.as_object()) else {
2086            break;
2087        };
2088
2089        if let Some(message) = node.get("message") {
2090            let role = message
2091                .get("author")
2092                .and_then(|author| author.get("role"))
2093                .and_then(|v| v.as_str())
2094                .unwrap_or("");
2095            if let Some(text) = extract_text_content(message) {
2096                match role {
2097                    "user" => messages.push(("user".to_string(), text)),
2098                    "assistant" => messages.push(("assistant".to_string(), text)),
2099                    _ => {}
2100                }
2101            }
2102        }
2103
2104        let Some(next_id) = node
2105            .get("children")
2106            .and_then(|children| children.as_array())
2107            .and_then(|children| children.first())
2108            .and_then(|child| child.as_str())
2109        else {
2110            break;
2111        };
2112        current_id = next_id.to_string();
2113    }
2114
2115    (!messages.is_empty()).then_some(messages)
2116}
2117
2118fn extract_text_content(value: &Value) -> Option<String> {
2119    if let Some(text) = value.as_str() {
2120        let trimmed = text.trim();
2121        return (!trimmed.is_empty()).then_some(trimmed.to_string());
2122    }
2123
2124    if let Some(array) = value.as_array() {
2125        let joined = array
2126            .iter()
2127            .filter_map(extract_text_content)
2128            .filter(|part| !part.is_empty())
2129            .collect::<Vec<_>>()
2130            .join("\n");
2131        return (!joined.is_empty()).then_some(joined);
2132    }
2133
2134    let obj = value.as_object()?;
2135
2136    if let Some(content) = obj.get("content") {
2137        if let Some(text) = extract_text_content(content) {
2138            return Some(text);
2139        }
2140    }
2141
2142    if let Some(text) = obj.get("text").and_then(|v| v.as_str()) {
2143        let trimmed = text.trim();
2144        if !trimmed.is_empty() {
2145            return Some(trimmed.to_string());
2146        }
2147    }
2148
2149    if let Some(parts) = obj.get("parts").and_then(|v| v.as_array()) {
2150        let joined = parts
2151            .iter()
2152            .filter_map(|part| part.as_str().map(|s| s.trim().to_string()))
2153            .filter(|part| !part.is_empty())
2154            .collect::<Vec<_>>()
2155            .join("\n");
2156        if !joined.is_empty() {
2157            return Some(joined);
2158        }
2159    }
2160
2161    None
2162}
2163
/// Flattens a path into a single slug.
///
/// Backslashes count as separators. Every character other than ASCII
/// alphanumerics, `-`, `_` and separators becomes `_`. Leading and trailing
/// separators are dropped and each remaining separator becomes `__`.
fn slugify_import_path(path: &Path) -> String {
    let lossy = path.to_string_lossy();
    let mut sanitized = String::with_capacity(lossy.len());
    for ch in lossy.chars() {
        let mapped = match ch {
            '\\' | '/' => '/',
            keep if keep.is_ascii_alphanumeric() || keep == '-' || keep == '_' => keep,
            _ => '_',
        };
        sanitized.push(mapped);
    }

    sanitized
        .trim_matches('/')
        .split('/')
        .collect::<Vec<_>>()
        .join("__")
}
2179
2180// ── Embedding API ─────────────────────────────────────────────────────────────
2181
2182/// Call the active provider's embedding endpoint synchronously.
2183///
2184/// Nomic-style models use task instruction prefixes:
2185/// - Chunks stored in the index use `"search_document: "` prefix
2186/// - Queries at search time use `"search_query: "` prefix
2187///
2188/// Callers must tolerate `None` and fall back to BM25-only search.
2189fn embed_text_blocking(text: &str, base_url: &str, embed_model: &str) -> Option<Vec<f32>> {
2190    embed_text_with_prefix(text, "search_document", base_url, embed_model)
2191}
2192
2193fn embed_query_blocking(text: &str, base_url: &str, embed_model: &str) -> Option<Vec<f32>> {
2194    embed_text_with_prefix(text, "search_query", base_url, embed_model)
2195}
2196
2197fn embed_text_with_prefix(
2198    text: &str,
2199    task: &str,
2200    base_url: &str,
2201    embed_model: &str,
2202) -> Option<Vec<f32>> {
2203    // Nomic v2 task instruction prefix format: "<task>: <text>"
2204    let prefixed = format!("{}: {}", task, text);
2205    // Truncate to ~8000 chars to stay within typical embedding model limits.
2206    let input = if prefixed.len() > 8000 {
2207        &prefixed[..8000]
2208    } else {
2209        &prefixed
2210    };
2211
2212    let client = reqwest::blocking::Client::builder()
2213        .timeout(std::time::Duration::from_secs(10))
2214        .build()
2215        .ok()?;
2216
2217    let body = serde_json::json!({
2218        "model": embed_model,
2219        "input": input
2220    });
2221
2222    let trimmed = base_url.trim_end_matches('/');
2223    let is_ollama = trimmed.contains("11434");
2224    let url = if is_ollama {
2225        format!("{}/api/embed", trimmed)
2226    } else {
2227        format!("{}/v1/embeddings", trimmed)
2228    };
2229    let resp = client.post(&url).json(&body).send().ok()?;
2230
2231    if !resp.status().is_success() {
2232        return None;
2233    }
2234
2235    let json: serde_json::Value = resp.json().ok()?;
2236    let embedding = if is_ollama {
2237        json["embeddings"][0].as_array()?
2238    } else {
2239        json["data"][0]["embedding"].as_array()?
2240    };
2241    let vec: Vec<f32> = embedding
2242        .iter()
2243        .filter_map(|v| v.as_f64().map(|f| f as f32))
2244        .collect();
2245
2246    if vec.is_empty() {
2247        None
2248    } else {
2249        Some(vec)
2250    }
2251}
2252
2253// ── Vector math ───────────────────────────────────────────────────────────────
2254
/// Computes the cosine similarity of two vectors.
///
/// Returns `0.0` when the slices are empty, differ in length, or either one
/// has zero magnitude.
fn cosine_similarity(a: &[f32], b: &[f32]) -> f32 {
    if a.is_empty() || a.len() != b.len() {
        return 0.0;
    }

    let magnitude = |v: &[f32]| v.iter().map(|x| x * x).sum::<f32>().sqrt();
    let (norm_a, norm_b) = (magnitude(a), magnitude(b));
    if norm_a == 0.0 || norm_b == 0.0 {
        return 0.0;
    }

    let dot: f32 = a.iter().zip(b).map(|(x, y)| x * y).sum();
    dot / (norm_a * norm_b)
}
2268
/// Serialises floats into a byte blob, four little-endian bytes per value.
fn floats_to_blob(floats: &[f32]) -> Vec<u8> {
    let mut blob = Vec::with_capacity(floats.len() * 4);
    for value in floats {
        blob.extend_from_slice(&value.to_le_bytes());
    }
    blob
}
2272
/// Decodes a byte blob into floats, reading four little-endian bytes per
/// value. Trailing bytes that do not fill a whole value are ignored.
fn blob_to_floats(blob: &[u8]) -> Vec<f32> {
    let mut floats = Vec::with_capacity(blob.len() / 4);
    for word in blob.chunks_exact(4) {
        let mut raw = [0u8; 4];
        raw.copy_from_slice(word);
        floats.push(f32::from_le_bytes(raw));
    }
    floats
}
2278
2279// ── Document extraction ───────────────────────────────────────────────────────
2280
/// Cleans up text produced by a document extractor.
///
/// Line endings are unified to `\n` (`\r\n` and lone `\r` both become `\n`),
/// then leading and trailing whitespace and NUL characters are removed.
/// Returns `None` when nothing is left.
fn normalize_extracted_document_text(text: String) -> Option<String> {
    let unified = text.replace("\r\n", "\n").replace('\r', "\n");
    let core = unified.trim_matches(|c: char| c == '\0' || c.is_whitespace());
    (!core.is_empty()).then(|| core.to_string())
}
2293
2294fn extract_pdf_text_with_pdf_extract(path: &std::path::Path) -> Result<Option<String>, String> {
2295    let previous_hook = std::panic::take_hook();
2296    std::panic::set_hook(Box::new(|_| {}));
2297    let result = std::panic::catch_unwind(std::panic::AssertUnwindSafe(|| {
2298        pdf_extract::extract_text(path)
2299    }));
2300    std::panic::set_hook(previous_hook);
2301
2302    match result {
2303        Ok(Ok(text)) => Ok(normalize_extracted_document_text(text)),
2304        Ok(Err(e)) => Err(format!("pdf-extract failed: {}", e)),
2305        Err(payload) => {
2306            let panic_text = if let Some(msg) = payload.downcast_ref::<&str>() {
2307                (*msg).to_string()
2308            } else if let Some(msg) = payload.downcast_ref::<String>() {
2309                msg.clone()
2310            } else {
2311                "unknown parser panic".to_string()
2312            };
2313            Err(format!("pdf-extract panicked: {}", panic_text))
2314        }
2315    }
2316}
2317
2318fn extract_pdf_text_with_lopdf(path: &std::path::Path) -> Result<Option<String>, String> {
2319    let mut doc =
2320        lopdf::Document::load(path).map_err(|e| format!("lopdf could not open PDF: {}", e))?;
2321
2322    if doc.is_encrypted() {
2323        doc.decrypt("")
2324            .map_err(|e| format!("PDF is encrypted and could not be decrypted: {}", e))?;
2325    }
2326
2327    let page_numbers: Vec<u32> = doc.get_pages().keys().copied().collect();
2328    if page_numbers.is_empty() {
2329        return Ok(None);
2330    }
2331
2332    let mut extracted_pages = Vec::new();
2333    let mut page_errors = Vec::new();
2334
2335    for page_number in page_numbers {
2336        match doc.extract_text(&[page_number]) {
2337            Ok(text) => {
2338                if let Some(page_text) = normalize_extracted_document_text(text) {
2339                    extracted_pages.push(page_text);
2340                }
2341            }
2342            Err(e) => page_errors.push(format!("page {page_number}: {e}")),
2343        }
2344    }
2345
2346    if !extracted_pages.is_empty() {
2347        return Ok(Some(extracted_pages.join("\n\n")));
2348    }
2349
2350    if !page_errors.is_empty() {
2351        let sample_errors = page_errors
2352            .into_iter()
2353            .take(3)
2354            .collect::<Vec<_>>()
2355            .join("; ");
2356        return Err(format!(
2357            "lopdf could not extract usable page text ({sample_errors})"
2358        ));
2359    }
2360
2361    Ok(None)
2362}
2363
2364fn extract_pdf_text_inside_helper(path: &std::path::Path) -> Result<Option<String>, String> {
2365    let mut failures = Vec::new();
2366
2367    match extract_pdf_text_with_pdf_extract(path) {
2368        Ok(Some(text)) => return Ok(Some(text)),
2369        Ok(None) => failures.push("pdf-extract found no usable text".to_string()),
2370        Err(e) => failures.push(e),
2371    }
2372
2373    match extract_pdf_text_with_lopdf(path) {
2374        Ok(Some(text)) => return Ok(Some(text)),
2375        Ok(None) => failures.push("lopdf found no usable text".to_string()),
2376        Err(e) => failures.push(e),
2377    }
2378
2379    let detail = failures.into_iter().take(2).collect::<Vec<_>>().join("; ");
2380    Err(format!(
2381        "Could not extract text from PDF. Hematite keeps PDF parsing best-effort so it can stay a lightweight single-binary local coding harness. The file may be scanned/image-only, encrypted, or use unsupported font encoding. Try exporting it to text/markdown or attach page images instead. Detail: {}",
2382        detail
2383    ))
2384}
2385
/// Extracts PDF text by re-running the current executable as a helper
/// process with `--pdf-extract-helper <path>`.
///
/// The helper's stdout is the extracted text and its stderr is the failure
/// message. Returns `Ok(None)` when the helper succeeds but prints only
/// whitespace, and `Err` when the helper cannot be located or launched, exits
/// unsuccessfully, or prints non-UTF-8 output.
fn extract_pdf_text(path: &std::path::Path) -> Result<Option<String>, String> {
    use std::process::{Command, Stdio};

    let helper_exe = std::env::current_exe()
        .map_err(|e| format!("Could not locate Hematite executable for PDF helper: {}", e))?;

    let mut helper = Command::new(helper_exe);
    helper
        .arg("--pdf-extract-helper")
        .arg(path)
        .stdin(Stdio::null())
        .stdout(Stdio::piped())
        .stderr(Stdio::piped());
    let output = helper
        .output()
        .map_err(|e| format!("Could not launch PDF helper: {}", e))?;

    if !output.status.success() {
        let message = String::from_utf8_lossy(&output.stderr).trim().to_string();
        if message.is_empty() {
            return Err("PDF extraction failed.".to_string());
        }
        return Err(message);
    }

    match String::from_utf8(output.stdout) {
        Ok(text) if text.trim().is_empty() => Ok(None),
        Ok(text) => Ok(Some(text)),
        Err(e) => Err(format!("PDF helper returned non-UTF8 text: {}", e)),
    }
}
2415
2416pub fn run_pdf_extract_helper(path: &std::path::Path) -> i32 {
2417    match extract_pdf_text_inside_helper(path) {
2418        Ok(Some(text)) => {
2419            use std::io::Write;
2420            let mut stdout = std::io::stdout();
2421            if stdout.write_all(text.as_bytes()).is_ok() {
2422                0
2423            } else {
2424                let _ = writeln!(
2425                    std::io::stderr(),
2426                    "PDF helper could not write extracted text."
2427                );
2428                1
2429            }
2430        }
2431        Ok(None) => {
2432            eprintln!(
2433                "Could not extract text from PDF. Hematite keeps PDF parsing best-effort so it can stay a lightweight single-binary local coding harness. The file appears to contain no usable embedded text. Try exporting it to text/markdown or attach page images instead."
2434            );
2435            1
2436        }
2437        Err(e) => {
2438            eprintln!("{}", e);
2439            1
2440        }
2441    }
2442}
2443
2444/// Extract text from any supported document type (PDF, markdown, plain text).
2445/// Used by /attach for one-shot context injection.
2446pub fn extract_document_text(path: &std::path::Path) -> Result<String, String> {
2447    let ext = path
2448        .extension()
2449        .and_then(|e| e.to_str())
2450        .unwrap_or("")
2451        .to_lowercase();
2452    match ext.as_str() {
2453        "pdf" => {
2454            let text = extract_pdf_text(path)?.ok_or_else(|| {
2455                "PDF contains no extractable text — it may be scanned/image-only. \
2456                     Try attaching page screenshots with /image instead."
2457                    .to_string()
2458            })?;
2459            pdf_quality_check(text)
2460        }
2461        _ => std::fs::read_to_string(path).map_err(|e| format!("Could not read file: {e}")),
2462    }
2463}
2464
/// Rejects PDF extractions that are too short or look garbled.
///
/// Garbling is common with academic publisher PDFs whose custom embedded
/// fonts use non-standard glyph mappings, which shows up as words merged
/// together without spaces.
///
/// Returns the original `text` unchanged when it looks usable, otherwise an
/// explanatory error message.
fn pdf_quality_check(text: String) -> Result<String, String> {
    // Below this many bytes the extraction is too small to be useful.
    const MIN_USEFUL_LEN: usize = 150;
    // Normal prose is ~15–20% spaces; below 4% glyphs aren't mapping to spaces.
    const MIN_SPACE_RATIO: f32 = 0.04;

    let body = text.trim();
    let extracted_len = body.len();
    if extracted_len < MIN_USEFUL_LEN {
        return Err(format!(
            "PDF extracted only {} characters — likely a scanned or image-only PDF, \
             or uses unsupported custom fonts. Try attaching page screenshots with /image instead.",
            extracted_len
        ));
    }

    // One pass: count characters other than line breaks, and spaces among them.
    let (visible, spaces) = body
        .chars()
        .filter(|c| *c != '\n' && *c != '\r')
        .fold((0usize, 0usize), |(visible, spaces), c| {
            (visible + 1, spaces + usize::from(c == ' '))
        });
    let space_ratio = if visible == 0 {
        0.0
    } else {
        spaces as f32 / visible as f32
    };

    if space_ratio < MIN_SPACE_RATIO {
        return Err(
            "PDF text extraction produced garbled output — words are merged with no spaces. \
             This usually means the PDF uses custom embedded fonts (common with academic publishers \
             like EBSCO, Elsevier, Springer). \
             Try a PDF exported from Word, Google Docs, or LaTeX, \
             or attach page screenshots with /image instead."
                .to_string(),
        );
    }

    Ok(text)
}
2503
2504// ── Chunking strategies ───────────────────────────────────────────────────────
2505
2506/// Dispatch to the correct chunking strategy based on file extension.
2507fn chunk_by_symbols(ext: &str, text: &str) -> Vec<String> {
2508    if ext == "rs" {
2509        chunk_rust_symbols(text)
2510    } else {
2511        chunk_paragraphs(text)
2512    }
2513}
2514
2515/// Chunk Rust source at top-level item boundaries.
2516///
2517/// Detects lines at column 0 that start a Rust declaration keyword, flushes
2518/// the accumulated buffer, then moves any trailing doc-comments / attributes
2519/// forward so they stay with the item they annotate.
2520///
2521/// Items larger than 3000 chars (e.g. large impl blocks) are further split
2522/// by sliding window so no single chunk blows the retrieval budget.
2523fn chunk_rust_symbols(text: &str) -> Vec<String> {
2524    const ITEM_STARTS: &[&str] = &[
2525        "pub fn ",
2526        "pub async fn ",
2527        "pub unsafe fn ",
2528        "async fn ",
2529        "unsafe fn ",
2530        "fn ",
2531        "pub impl",
2532        "impl ",
2533        "pub struct ",
2534        "struct ",
2535        "pub enum ",
2536        "enum ",
2537        "pub trait ",
2538        "trait ",
2539        "pub mod ",
2540        "mod ",
2541        "pub type ",
2542        "type ",
2543        "pub const ",
2544        "const ",
2545        "pub static ",
2546        "static ",
2547    ];
2548
2549    let lines: Vec<&str> = text.lines().collect();
2550    let mut chunks: Vec<String> = Vec::new();
2551    let mut current: Vec<&str> = Vec::new();
2552
2553    for &line in &lines {
2554        let top_level = !line.starts_with(' ') && !line.starts_with('\t');
2555        let is_item = top_level && ITEM_STARTS.iter().any(|s| line.starts_with(s));
2556
2557        if is_item && !current.is_empty() {
2558            // Scan backward to find where trailing doc-comments / attributes start —
2559            // move them to the new chunk so they land with their item.
2560            let mut split = current.len();
2561            while split > 0 {
2562                let prev = current[split - 1].trim();
2563                if prev.starts_with("///")
2564                    || prev.starts_with("//!")
2565                    || prev.starts_with("#[")
2566                    || prev.is_empty()
2567                {
2568                    split -= 1;
2569                } else {
2570                    break;
2571                }
2572            }
2573            let body = current[..split].join("\n");
2574            if !body.trim().is_empty() {
2575                chunks.push(body);
2576            }
2577            current = current[split..].to_vec();
2578        }
2579        current.push(line);
2580    }
2581    if !current.is_empty() {
2582        let body = current.join("\n");
2583        if !body.trim().is_empty() {
2584            chunks.push(body);
2585        }
2586    }
2587
2588    // Subdivide any oversized blocks (e.g. long impl blocks with many methods).
2589    let mut result = Vec::new();
2590    for chunk in chunks {
2591        if chunk.len() > 3000 {
2592            result.extend(sliding_window_chunks(&chunk, 2000, 200));
2593        } else {
2594            result.push(chunk);
2595        }
2596    }
2597    result
2598}
2599
2600/// Chunk non-Rust text at paragraph boundaries (double newline).
2601fn chunk_paragraphs(text: &str) -> Vec<String> {
2602    let mut result: Vec<String> = Vec::new();
2603    let mut current = String::new();
2604
2605    for para in text.split("\n\n") {
2606        if current.len() + para.len() + 2 > 2000 {
2607            if !current.trim().is_empty() {
2608                result.push(current.clone());
2609            }
2610            current = para.to_string();
2611        } else {
2612            if !current.is_empty() {
2613                current.push_str("\n\n");
2614            }
2615            current.push_str(para);
2616        }
2617    }
2618    if !current.trim().is_empty() {
2619        result.push(current);
2620    }
2621
2622    let mut final_result = Vec::new();
2623    for chunk in result {
2624        if chunk.len() > 2000 {
2625            final_result.extend(sliding_window_chunks(&chunk, 2000, 200));
2626        } else {
2627            final_result.push(chunk);
2628        }
2629    }
2630    final_result
2631}
2632
/// Classic sliding-window fallback for oversized blocks.
///
/// Splits `text` into windows of at most `chunk_size` characters (Unicode
/// scalar values, not bytes). Each window starts `chunk_size - overlap`
/// characters after the previous one, so neighbouring windows share `overlap`
/// characters. The last window ends exactly at the end of `text` and may be
/// shorter than `chunk_size`. Empty input yields an empty vector.
///
/// Degenerate arguments are clamped rather than allowed to hang or panic:
/// a `chunk_size` of 0 is treated as 1, and when `overlap >= chunk_size` the
/// window advances by a single character per step.
fn sliding_window_chunks(text: &str, chunk_size: usize, overlap: usize) -> Vec<String> {
    let chars: Vec<char> = text.chars().collect();

    // Both must be at least 1, otherwise the loop below never advances
    // (or `chunk_size - overlap` underflows).
    let window = chunk_size.max(1);
    let step = window.saturating_sub(overlap).max(1);

    let mut result = Vec::new();
    let mut start = 0;
    while start < chars.len() {
        let end = start.saturating_add(window).min(chars.len());
        result.push(chars[start..end].iter().collect());
        if end == chars.len() {
            break;
        }
        start += step;
    }
    result
}
hematite/memory/vein.rs

hematite/memory/
vein.rs