hematite/memory/
vein.rs

1use rusqlite::{params, Connection};
2use serde::Deserialize;
3use serde_json::Value;
4use std::collections::{HashMap, HashSet};
5use std::path::Path;
6
/// "The Vein" — local RAG memory engine backed by SQLite FTS5 + semantic embeddings.
///
/// Two retrieval modes, used together:
///
/// **BM25 (always available)**
/// Full-text search via SQLite FTS5 with Porter-stemming. Fast, zero extra GPU cost,
/// works as the fallback when the embedding model isn't loaded.
///
/// **Semantic (when LM Studio has an embedding model loaded)**
/// Calls `/v1/embeddings` (nomic-embed-text-v1.5 or similar) to produce 768-dim float
/// vectors for each chunk. At search time the query is embedded and cosine similarity
/// selects the most conceptually relevant chunks — even when no keywords match.
///
/// Hybrid search runs both BM25 and semantic retrieval (one after the other),
/// deduplicates by path, and returns the top-k results ranked by combined score.
/// Semantic results score higher when the embedding model is available; BM25 fills
/// the gap when it isn't.
///
/// Indexing is incremental: a file is re-indexed only when its mtime is newer than the
/// one recorded in `chunks_meta`. Embedding vectors are stored in a separate
/// `chunks_vec` SQLite table so they survive re-runs without hitting the embedding
/// API again.
pub struct Vein {
    /// Shared SQLite connection; every access goes through this mutex.
    db: std::sync::Arc<std::sync::Mutex<Connection>>,
    /// Base URL of the LLM provider, used for the embeddings endpoint.
    base_url: String,
    /// Identifier of the embedding model to use, or `None` when no embedding
    /// model is available (embedding and semantic search are then skipped).
    embed_model: std::sync::Arc<std::sync::RwLock<Option<String>>>,
}
33
// SAFETY: every access to the SQLite connection is serialized through the
// `Mutex` held in `Vein::db`; the remaining fields are a `String` and an
// `Arc<RwLock<Option<String>>>`.
// NOTE(review): the original rationale said rusqlite::Connection is !Send.
// The rusqlite API docs list `Send` for `Connection` (it is only `!Sync`); if
// that holds for the pinned version, `Arc<Mutex<Connection>>` is already
// `Send + Sync` and these manual impls are redundant — confirm and consider
// removing them so the compiler checks the auto traits instead.
unsafe impl Send for Vein {}
unsafe impl Sync for Vein {}
38
/// One retrieved chunk, as returned by the BM25, semantic, and hybrid searches.
#[derive(Debug, Clone)]
pub struct SearchResult {
    /// Path the chunk was indexed under (the key used in `chunks_meta`).
    pub path: String,
    /// Text of the matching chunk.
    pub content: String,
    /// Combined relevance score (higher = more relevant).
    pub score: f32,
    /// Subsystem room derived from the file path (e.g. "agent", "ui", "tools").
    pub room: String,
    /// Last-modified timestamp from chunks_meta (unix seconds).
    pub last_modified: i64,
    /// Semantic memory type tagged at index time: "decision", "problem",
    /// "milestone", "preference", or "" for unclassified/source/doc chunks.
    pub memory_type: String,
}
53
/// A file together with its edit "heat".
///
/// NOTE(review): the code that builds this is outside this section; the field
/// notes below are inferred from the `file_heat` / `chunks_meta` schema — confirm.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct VeinHotFile {
    /// File path (presumably the `file_heat.path` key).
    pub path: String,
    /// Heat counter (presumably `file_heat.heat`).
    pub heat: i64,
    /// Last-modified timestamp (presumably unix seconds, as in `chunks_meta`).
    pub last_modified: i64,
    /// Subsystem room the file belongs to.
    pub room: String,
}
61
/// Summary of the index state, exposed for inspection.
///
/// NOTE(review): the code that fills this in is outside this section; the
/// field meanings are taken from their names only — confirm against it.
#[derive(Debug, Clone)]
pub struct VeinInspectionSnapshot {
    pub indexed_source_files: usize,
    pub indexed_docs: usize,
    pub indexed_session_exchanges: usize,
    pub embedded_source_doc_chunks: usize,
    pub has_any_embeddings: bool,
    /// Presumably the room reported by the heat query used during hybrid search.
    pub active_room: Option<String>,
    pub hot_files: Vec<VeinHotFile>,
    pub l1_ready: bool,
}
73
/// Hints extracted from a search query (built via `QuerySignals::from_query`)
/// and passed to the reranking step of hybrid search.
///
/// NOTE(review): extraction and scoring live outside this section; the
/// per-field notes without an original comment are inferred from names — confirm.
#[derive(Debug, Default)]
struct QuerySignals {
    /// Presumably phrases from the query that should match verbatim.
    exact_phrases: Vec<String>,
    /// Presumably the distinctive terms of the query.
    standout_terms: Vec<String>,
    /// Presumably set when the query asks about past sessions/memory.
    historical_memory_hint: bool,
    /// Point in time the query refers to, if any.
    temporal_reference: Option<TemporalReference>,
    /// Memory type the query is asking about — boosts matching chunks.
    /// e.g. "what did we decide" → "decision", "what was the bug" → "problem"
    query_memory_type: Option<&'static str>,
}
84
/// A point in time referenced by a query, with a tolerance window.
///
/// NOTE(review): units are presumably unix seconds, matching `last_modified`
/// elsewhere in this file — confirm against the code that produces it.
#[derive(Debug, Clone, Copy)]
struct TemporalReference {
    /// Timestamp the query points at.
    target_ts: i64,
    /// Width of the window around `target_ts`, in seconds.
    window_secs: i64,
}
90
/// Deserialized session report. Both fields fall back to their `Default`
/// (empty string / empty vector) when absent from the input.
#[derive(Debug, Deserialize)]
struct SessionReport {
    /// Session start marker as stored in the report.
    // NOTE(review): the string format is not visible here — confirm before parsing.
    #[serde(default)]
    session_start: String,
    /// Ordered transcript entries of the session.
    #[serde(default)]
    transcript: Vec<SessionTranscriptEntry>,
}
98
/// One transcript line of a session report. Missing fields deserialize to
/// empty strings.
#[derive(Debug, Deserialize)]
struct SessionTranscriptEntry {
    /// Label of who produced the text.
    #[serde(default)]
    speaker: String,
    /// The text of the entry.
    #[serde(default)]
    text: String,
}
106
/// A unit of session memory prepared for indexing.
///
/// NOTE(review): the code that builds this is outside this section; it
/// presumably pairs a user turn with the assistant reply — confirm.
#[derive(Debug)]
struct SessionExchange {
    /// Path key the exchange is indexed under.
    path: String,
    /// Timestamp stored as the exchange's last-modified value.
    last_modified: i64,
    /// Text to index.
    content: String,
}
113
/// Classification of a transcript speaker.
///
/// NOTE(review): the mapping from speaker labels to variants is outside this
/// section; `Ignore` presumably marks entries to leave out — confirm.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum SessionSpeakerKind {
    User,
    Assistant,
    Ignore,
}
120
/// Derive a subsystem room label from a file path.
///
/// The path is lowercased and backslashes are normalized to `/`. A series of
/// rules (path segments, well-known filenames, extensions, repo-role hints)
/// is then evaluated; each rule carries a fixed priority score and the
/// highest-scoring matching rule decides the room.
///
/// When no rule matches, the first path component is returned if it is
/// non-empty and contains no `.`; otherwise the result is `"root"`.
///
/// A filename without a `.` has no extension, so extensionless files named
/// e.g. `md` or `pdf` are not treated as documents.
pub fn detect_room(path: &str) -> String {
    let lower = path.to_lowercase().replace('\\', "/");
    let filename = lower.rsplit('/').next().unwrap_or(&lower);
    // Only text after the last '.' counts as an extension; `rsplit('.').next()`
    // would hand back the whole filename when there is no dot at all.
    let ext = filename.rsplit_once('.').map_or("", |(_, e)| e);

    let mut best_room = None::<&str>;
    let mut best_score = 0i32;
    // Keep the highest-scoring room seen so far (strictly greater wins).
    let mut consider = |room: &'static str, score: i32| {
        if score > best_score {
            best_score = score;
            best_room = Some(room);
        }
    };

    // True when `segment` is the whole path or one of its directory
    // components (i.e. it is followed by a '/').
    let is_component = |segment: &str| {
        lower == segment
            || lower.starts_with(&format!("{segment}/"))
            || lower.contains(&format!("/{segment}/"))
    };

    if lower.starts_with("session/")
        || lower.starts_with(".hematite/reports/")
        || lower.starts_with(".hematite/imports/")
        || is_component("reports")
        || is_component("imports")
    {
        consider("session", 100);
    }

    if lower.starts_with(".hematite/docs/")
        || is_component("docs")
        || matches!(filename, "readme.md" | "claude.md" | ".hematite.md")
        || matches!(ext, "md" | "markdown" | "pdf" | "rst")
    {
        consider("docs", 80);
    }

    if is_component("tests")
        || filename.contains("diagnostic")
        || filename.ends_with("_test.rs")
        || filename.ends_with(".test.ts")
    {
        consider("tests", 85);
    }

    if lower.starts_with(".github/workflows/")
        || is_component("workflows")
        || filename == ".pre-commit-config.yaml"
        || filename == ".pre-commit-config.yml"
        || filename.contains("hook")
    {
        consider("automation", 84);
    }

    if lower.starts_with("installer/")
        || lower.starts_with("dist/")
        || lower.starts_with("scripts/package-")
        || filename.contains("release")
        || filename.contains("bump-version")
        || ext == "iss"
    {
        consider("release", 82);
    }

    if matches!(
        filename,
        "cargo.toml"
            | "cargo.lock"
            | "package.json"
            | "pnpm-lock.yaml"
            | "yarn.lock"
            | "bun.lock"
            | "bun.lockb"
            | "pyproject.toml"
            | "setup.py"
            | "go.mod"
            | "pom.xml"
            | "build.gradle"
            | "build.gradle.kts"
            | "cmakelists.txt"
            | ".gitignore"
            | "settings.json"
            | "mcp_servers.json"
    ) || filename.ends_with(".sln")
        || filename.ends_with(".csproj")
        || filename.contains("config")
    {
        consider("config", 76);
    }

    if is_component("ui")
        || matches!(
            filename,
            "tui.rs" | "voice.rs" | "hatch.rs" | "gpu_monitor.rs"
        )
    {
        consider("ui", 70);
    }

    if is_component("memory") || matches!(filename, "vein.rs" | "deep_reflect.rs") {
        consider("memory", 72);
    }

    if is_component("tools")
        || matches!(
            filename,
            "verify_build.rs"
                | "host_inspect.rs"
                | "shell.rs"
                | "code_sandbox.rs"
                | "project_map.rs"
                | "runtime_trace.rs"
        )
    {
        consider("tools", 68);
    }

    if filename.contains("mcp")
        || filename.contains("lsp")
        || lower.contains("/mcp/")
        || lower.contains("/lsp/")
    {
        consider("integration", 67);
    }

    if matches!(filename, "main.rs" | "runtime.rs" | "inference.rs")
        || filename.contains("startup")
        || filename.contains("runtime")
    {
        consider("runtime", 66);
    }

    if is_component("agent") {
        consider("agent", 60);
    }

    if lower.starts_with("libs/") || is_component("libs") {
        consider("libs", 58);
    }

    if lower.starts_with("scripts/") || is_component("scripts") {
        consider("scripts", 55);
    }

    if let Some(room) = best_room {
        return room.to_string();
    }

    // Fall back to the first path component, unless it is empty (absolute
    // path) or looks like a file / relative marker (contains a '.').
    lower
        .split('/')
        .next()
        .filter(|s| !s.is_empty() && !s.contains('.'))
        .unwrap_or("root")
        .to_string()
}
281
/// Classify session memory text into a semantic type using cheap,
/// case-insensitive substring markers (no regex engine involved).
/// Returns one of: "decision", "problem", "milestone", "preference", or "" (unclassified).
///
/// Categories are checked in that order and the first category with a
/// matching marker wins, so text containing both a decision and a problem
/// marker is reported as "decision".
///
/// The short marker "oom" is only recognized at the start of a word
/// ("OOM", "oom-killer", "OOMed"); as a plain substring it would also fire on
/// ordinary words such as "room" or "zoom".
///
/// Applied only to session/import chunks at index time. Source and doc chunks always get "".
/// Used by reranking to boost chunks whose type matches the query's implied intent.
pub fn detect_memory_type(text: &str) -> &'static str {
    // Decision markers — architectural choices, agreed approaches, "let's use X"
    const DECISION_MARKERS: &[&str] = &[
        "let's use ",
        "we'll use ",
        "decided to ",
        "going with ",
        "we agreed ",
        "the plan is",
        "we're going to",
        "switching to",
        "we chose",
        "final decision",
        "we settled on",
        "agreed on",
        "we decided",
    ];
    // Problem markers — bugs, errors, failures, blockers ("oom" handled below)
    const PROBLEM_MARKERS: &[&str] = &[
        "bug fixed",
        "bug was",
        "the issue was",
        "root cause",
        "error was",
        "turned out to be",
        "the fix was",
        "was caused by",
        "broken because",
        "fixed by",
        "the problem was",
        "found the bug",
        "port conflict",
        "crash",
        "panicked",
        "segfault",
        "out of memory",
    ];
    // Milestone markers — shipped, completed, working
    const MILESTONE_MARKERS: &[&str] = &[
        "now working",
        "successfully",
        "shipped",
        "deployed",
        "it works",
        "tests pass",
        "all green",
        "breakthrough",
        "finally got",
        "got it working",
        "completed",
        "finished",
        "done with",
        "landed",
    ];
    // Preference markers — personal/operator preferences for style or workflow
    const PREFERENCE_MARKERS: &[&str] = &[
        "i prefer",
        "i like",
        "i don't like",
        "i want",
        "always use",
        "never use",
        "i usually",
        "my preference",
        "keep it",
        "avoid using",
    ];

    let lower = text.to_lowercase();
    let has_any = |markers: &[&str]| markers.iter().any(|m| lower.contains(*m));
    // "oom" must begin a word; words are maximal alphanumeric runs.
    let mentions_oom = || {
        lower
            .split(|c: char| !c.is_alphanumeric())
            .any(|word| word.starts_with("oom"))
    };

    if has_any(DECISION_MARKERS) {
        "decision"
    } else if has_any(PROBLEM_MARKERS) || mentions_oom() {
        "problem"
    } else if has_any(MILESTONE_MARKERS) {
        "milestone"
    } else if has_any(PREFERENCE_MARKERS) {
        "preference"
    } else {
        ""
    }
}
383
384impl Vein {
    // NOTE(review): the code consuming these limits is outside this section;
    // the descriptions below are inferred from the constant names — confirm.
    /// Presumably the maximum number of recent session reports to index.
    const SESSION_REPORT_LIMIT: usize = 5;
    /// Presumably the maximum number of transcript turns taken per session.
    const SESSION_TURN_LIMIT: usize = 50;
    /// Presumably the maximum number of imported export files to index.
    const IMPORT_FILE_LIMIT: usize = 12;
    /// 10 MiB; presumably the size cap for a single imported file.
    const IMPORT_MAX_BYTES: u64 = 10 * 1024 * 1024;
389
390    pub fn new<P: AsRef<Path>>(
391        db_path: P,
392        base_url: String,
393    ) -> Result<Self, Box<dyn std::error::Error>> {
394        let db = Connection::open(db_path)?;
395
396        // WAL mode for better concurrent read performance.
397        db.execute_batch("PRAGMA journal_mode=WAL; PRAGMA synchronous=NORMAL;")?;
398
399        // chunks_meta: tracks last-modified time per path for incremental indexing.
400        // chunks_fts:  BM25 full-text index of all code chunks.
401        // chunks_vec:  semantic embedding vectors, keyed by (path, chunk_idx).
402        db.execute_batch(
403            "CREATE TABLE IF NOT EXISTS chunks_meta (
404                path TEXT PRIMARY KEY,
405                last_modified INTEGER NOT NULL,
406                room TEXT NOT NULL DEFAULT 'root'
407            );
408            CREATE VIRTUAL TABLE IF NOT EXISTS chunks_fts USING fts5(
409                path UNINDEXED,
410                content,
411                tokenize='porter ascii'
412            );
413            CREATE TABLE IF NOT EXISTS chunks_vec (
414                path TEXT NOT NULL,
415                chunk_idx INTEGER NOT NULL,
416                embedding BLOB NOT NULL,
417                PRIMARY KEY (path, chunk_idx)
418            );
419            CREATE TABLE IF NOT EXISTS file_heat (
420                path TEXT PRIMARY KEY,
421                heat INTEGER NOT NULL DEFAULT 0,
422                last_edit INTEGER NOT NULL DEFAULT 0
423            );",
424        )?;
425
426        // Schema migrations — safe to run on every open (IF NOT EXISTS / ignored if col exists).
427        let _ = db
428            .execute_batch("ALTER TABLE chunks_meta ADD COLUMN room TEXT NOT NULL DEFAULT 'root';");
429        let _ = db.execute_batch(
430            "ALTER TABLE file_heat ADD COLUMN last_edit INTEGER NOT NULL DEFAULT 0;",
431        );
432        let _ = db.execute_batch(
433            "ALTER TABLE chunks_meta ADD COLUMN memory_type TEXT NOT NULL DEFAULT '';",
434        );
435
436        Ok(Self {
437            db: std::sync::Arc::new(std::sync::Mutex::new(db)),
438            base_url,
439            embed_model: std::sync::Arc::new(std::sync::RwLock::new(None)),
440        })
441    }
442
443    pub fn set_embed_model(&self, model_id: Option<String>) {
444        if let Ok(mut guard) = self.embed_model.write() {
445            *guard = model_id;
446        }
447    }
448
449    fn current_embed_model(&self) -> Option<String> {
450        self.embed_model.read().ok().and_then(|guard| guard.clone())
451    }
452
453    // ── Indexing ──────────────────────────────────────────────────────────────
454
455    /// Index a single file for BM25 search. Skip if mtime hasn't changed.
456    /// Returns the chunks that were written (empty if file was unchanged).
457    pub fn index_document(
458        &mut self,
459        path: &str,
460        last_modified: i64,
461        full_text: &str,
462    ) -> Result<Vec<String>, Box<dyn std::error::Error>> {
463        let room = detect_room(path);
464        let ext = std::path::Path::new(path)
465            .extension()
466            .and_then(|e| e.to_str())
467            .unwrap_or("");
468        let chunks = chunk_by_symbols(ext, full_text);
469        // Tag session memory with semantic type; source/doc chunks leave it empty.
470        let memory_type = if room == "session" {
471            detect_memory_type(full_text)
472        } else {
473            ""
474        };
475        self.index_chunks_with_room_and_type(path, last_modified, &room, memory_type, &chunks)
476    }
477
478    fn index_chunks_with_room_and_type(
479        &mut self,
480        path: &str,
481        last_modified: i64,
482        room: &str,
483        memory_type: &str,
484        chunks: &[String],
485    ) -> Result<Vec<String>, Box<dyn std::error::Error>> {
486        let db = self.db.lock().unwrap();
487        let existing: Option<i64> = db
488            .query_row(
489                "SELECT last_modified FROM chunks_meta WHERE path = ?1",
490                params![path],
491                |r| r.get(0),
492            )
493            .ok();
494
495        if let Some(ts) = existing {
496            if ts >= last_modified {
497                return Ok(Vec::new()); // unchanged — skip
498            }
499        }
500
501        // Evict stale BM25 chunks, stale embedding vectors, then update metadata.
502        db.execute("DELETE FROM chunks_fts WHERE path = ?1", params![path])?;
503        db.execute("DELETE FROM chunks_vec WHERE path = ?1", params![path])?;
504        db.execute(
505            "INSERT OR REPLACE INTO chunks_meta (path, last_modified, room, memory_type) VALUES (?1, ?2, ?3, ?4)",
506            params![path, last_modified, room, memory_type],
507        )?;
508
509        drop(db);
510
511        let mut db = self.db.lock().unwrap();
512        let tx = db.transaction()?;
513        {
514            let mut stmt = tx.prepare("INSERT INTO chunks_fts (path, content) VALUES (?1, ?2)")?;
515            for chunk in chunks {
516                stmt.execute(params![path, chunk.as_str()])?;
517            }
518        }
519        tx.commit()?;
520
521        Ok(chunks.to_vec())
522    }
523
524    /// Embed a set of chunks for one file and store the vectors.
525    /// Called after `index_document` returns new chunks.
526    /// Silently skips if the embedding model is unavailable.
527    pub fn embed_and_store_chunks(&self, path: &str, chunks: &[String]) {
528        let Some(embed_model) = self.current_embed_model() else {
529            return;
530        };
531        for (idx, chunk) in chunks.iter().enumerate() {
532            if let Some(vec) = embed_text_blocking(chunk, &self.base_url, &embed_model) {
533                let blob = floats_to_blob(&vec);
534                let db = self.db.lock().unwrap();
535                let _ = db.execute(
536                    "INSERT OR REPLACE INTO chunks_vec (path, chunk_idx, embedding) VALUES (?1, ?2, ?3)",
537                    params![path, idx as i64, blob],
538                );
539            }
540        }
541    }
542
543    // ── Search ────────────────────────────────────────────────────────────────
544
545    /// BM25-ranked full-text search via FTS5 MATCH.
546    pub fn search_bm25(
547        &self,
548        query: &str,
549        limit: usize,
550    ) -> Result<Vec<SearchResult>, Box<dyn std::error::Error>> {
551        // Strip common English stopwords so FTS5 MATCH gets meaningful tokens only.
552        // FTS5 uses implicit AND by default — passing stopwords like "how", "does",
553        // "the" causes zero results because source code never contains those phrases.
554        static STOPWORDS: std::sync::OnceLock<HashSet<&'static str>> = std::sync::OnceLock::new();
555        let stopwords = STOPWORDS.get_or_init(|| {
556            [
557                "how", "does", "do", "did", "what", "where", "when", "why", "which", "who", "is",
558                "are", "was", "were", "be", "been", "being", "have", "has", "had", "a", "an",
559                "the", "and", "or", "but", "in", "on", "at", "to", "for", "of", "with", "by",
560                "from", "get", "gets", "got", "work", "works", "make", "makes", "use", "uses",
561                "into", "that", "this", "it", "its",
562            ]
563            .iter()
564            .copied()
565            .collect()
566        });
567
568        // Lowercase once so stopword matching below needs no per-token allocation.
569        // FTS5 is case-insensitive, so lowercased tokens match correctly.
570        let safe_query: String = query
571            .chars()
572            .map(|c| {
573                if c.is_alphanumeric() || c == ' ' || c == '_' {
574                    c.to_ascii_lowercase()
575                } else {
576                    ' '
577                }
578            })
579            .collect();
580
581        // Build an OR query from non-stopword tokens so any relevant term matches.
582        let fts_query = safe_query
583            .split_whitespace()
584            .filter(|w| w.len() >= 3 && !stopwords.contains(*w))
585            .collect::<Vec<_>>()
586            .join(" OR ");
587
588        if fts_query.is_empty() {
589            return Ok(Vec::new());
590        }
591
592        let db = self.db.lock().unwrap();
593        let mut stmt = db.prepare(
594            "SELECT chunks_fts.path, chunks_fts.content, rank, cm.last_modified, cm.room, cm.memory_type
595             FROM chunks_fts
596             JOIN chunks_meta cm ON cm.path = chunks_fts.path
597             WHERE chunks_fts MATCH ?1
598             ORDER BY rank
599             LIMIT ?2",
600        )?;
601
602        let results: Vec<SearchResult> = stmt
603            .query_map(params![fts_query, limit as i64], |row| {
604                Ok(SearchResult {
605                    path: row.get(0)?,
606                    content: row.get(1)?,
607                    score: -(row.get::<_, f64>(2).unwrap_or(0.0) as f32),
608                    last_modified: row.get(3)?,
609                    room: row.get(4)?,
610                    memory_type: row.get::<_, String>(5).unwrap_or_default(),
611                })
612            })?
613            .filter_map(|r| r.ok())
614            .collect();
615
616        Ok(results)
617    }
618
619    /// Semantic search: embed the query, cosine-similarity against all stored vectors.
620    /// Returns empty if the embedding model isn't loaded.
621    pub fn search_semantic(&self, query: &str, limit: usize) -> Vec<SearchResult> {
622        let Some(embed_model) = self.current_embed_model() else {
623            return Vec::new();
624        };
625        let query_vec = match embed_query_blocking(query, &self.base_url, &embed_model) {
626            Some(v) => v,
627            None => return Vec::new(),
628        };
629
630        // Load all stored embeddings.
631        let rows: Vec<(String, i64, Vec<u8>, i64, String, String)> = {
632            let db = self.db.lock().unwrap();
633            let mut stmt = match db.prepare(
634                "SELECT cv.path, cv.chunk_idx, cv.embedding, cm.last_modified, cm.room, cm.memory_type
635                 FROM chunks_vec cv
636                 JOIN chunks_meta cm ON cm.path = cv.path",
637            ) {
638                Ok(s) => s,
639                Err(_) => return Vec::new(),
640            };
641            stmt.query_map([], |row| {
642                Ok((
643                    row.get::<_, String>(0)?,
644                    row.get::<_, i64>(1)?,
645                    row.get::<_, Vec<u8>>(2)?,
646                    row.get::<_, i64>(3)?,
647                    row.get::<_, String>(4)?,
648                    row.get::<_, String>(5).unwrap_or_default(),
649                ))
650            })
651            .ok()
652            .map(|rows| rows.filter_map(|r| r.ok()).collect())
653            .unwrap_or_default()
654        };
655
656        if rows.is_empty() {
657            return Vec::new();
658        }
659
660        // Score each chunk.
661        let mut scored: Vec<(f32, String, i64, i64, String, String)> = rows
662            .into_iter()
663            .filter_map(|(path, idx, blob, last_modified, room, memory_type)| {
664                let vec = blob_to_floats(&blob);
665                let sim = cosine_similarity(&query_vec, &vec);
666                Some((sim, path, idx, last_modified, room, memory_type))
667            })
668            .collect();
669
670        scored.sort_by(|a, b| b.0.partial_cmp(&a.0).unwrap_or(std::cmp::Ordering::Equal));
671        scored.truncate(limit);
672
673        // Fetch the content for the top chunks.
674        let db = self.db.lock().unwrap();
675        scored
676            .into_iter()
677            .filter_map(|(score, path, idx, last_modified, room, memory_type)| {
678                let content: Option<String> = db
679                    .query_row(
680                        "SELECT content FROM chunks_fts WHERE path = ?1 LIMIT 1 OFFSET ?2",
681                        params![path, idx],
682                        |r| r.get(0),
683                    )
684                    .ok();
685                content.map(|c| SearchResult {
686                    path,
687                    content: c,
688                    score,
689                    room,
690                    last_modified,
691                    memory_type,
692                })
693            })
694            .collect()
695    }
696
697    /// Hybrid search: BM25 + semantic, deduplicated and re-ranked.
698    ///
699    /// Semantic results are preferred (they score higher) when the embedding model
700    /// is available. BM25 fills in or takes over when it isn't.
701    /// Results from the active room (hottest subsystem by edit count) get a
702    /// small boost so the model gravitates toward what's currently being worked on.
703    pub fn search_context(
704        &self,
705        query: &str,
706        limit: usize,
707    ) -> Result<Vec<SearchResult>, Box<dyn std::error::Error>> {
708        let candidate_limit = (limit.max(1) * 4).max(12);
709        let bm25 = self.search_bm25(query, candidate_limit).unwrap_or_default();
710        let semantic = self.search_semantic(query, candidate_limit);
711        let signals = QuerySignals::from_query(query);
712
713        // Determine the active room from heat scores.
714        let active_room = self.active_room();
715
716        // Merge: semantic results win ties (scored 1.0–2.0 range after boost).
717        // BM25 results land in 0.0–1.0 range.
718        let mut merged_by_path: HashMap<String, SearchResult> = HashMap::new();
719
720        for r in semantic {
721            let score = reranked_score(&signals, active_room.as_deref(), &r, true);
722            merge_scored_result(&mut merged_by_path, SearchResult { score, ..r });
723        }
724
725        for r in bm25 {
726            let score = reranked_score(&signals, active_room.as_deref(), &r, false);
727            merge_scored_result(&mut merged_by_path, SearchResult { score, ..r });
728        }
729
730        let mut merged: Vec<SearchResult> = merged_by_path.into_values().collect();
731        merged.sort_by(|a, b| {
732            b.score
733                .partial_cmp(&a.score)
734                .unwrap_or(std::cmp::Ordering::Equal)
735        });
736        merged.truncate(limit);
737        Ok(merged)
738    }
739
740    /// Returns the room with the highest total heat (most edited subsystem).
741    /// Used to bias retrieval toward what the user is actively working on.
742    fn active_room(&self) -> Option<String> {
743        let db = self.db.lock().unwrap();
744        db.query_row(
745            "SELECT cm.room, SUM(fh.heat) as total
746             FROM file_heat fh
747             JOIN chunks_meta cm ON cm.path = fh.path
748             GROUP BY cm.room
749             ORDER BY total DESC
750             LIMIT 1",
751            [],
752            |row| row.get::<_, String>(0),
753        )
754        .ok()
755    }
756
757    // ── Project Indexing ──────────────────────────────────────────────────────
758
759    /// Walk the entire project and index all source files (BM25 + embeddings).
760    ///
761    /// Skips: `target/`, `.git/`, `node_modules/`, `.hematite/`, files > 512 KB.
762    /// Also indexes `.hematite/docs/` — the designated reference document drop folder.
763    /// Returns the number of files processed (unchanged files are fast-pathed).
764    pub fn index_project(&mut self) -> usize {
765        let root = crate::tools::file_ops::workspace_root();
766        let mut count = 0usize;
767
768        const INDEXABLE: &[&str] = &[
769            "rs", "toml", "md", "json", "ts", "tsx", "js", "py", "go", "c", "cpp", "h", "yaml",
770            "yml", "txt",
771        ];
772        const SKIP_DIRS: &[&str] = &["target", ".git", "node_modules", ".hematite"];
773
774        // O(1) membership tests for the per-file extension check and the per-dir
775        // name check inside the walkdir iterator, built once via OnceLock.
776        static INDEXABLE_SET: std::sync::OnceLock<std::collections::HashSet<&'static str>> =
777            std::sync::OnceLock::new();
778        static SKIP_DIRS_SET: std::sync::OnceLock<std::collections::HashSet<&'static str>> =
779            std::sync::OnceLock::new();
780        let indexable = INDEXABLE_SET.get_or_init(|| INDEXABLE.iter().copied().collect());
781        let skip_dirs = SKIP_DIRS_SET.get_or_init(|| SKIP_DIRS.iter().copied().collect());
782
783        for entry in walkdir::WalkDir::new(&root)
784            .follow_links(false)
785            .into_iter()
786            .filter_entry(|e| {
787                if e.file_type().is_dir() {
788                    let name = e.file_name().to_string_lossy();
789                    return !skip_dirs.contains(name.as_ref());
790                }
791                true
792            })
793            .filter_map(|e| e.ok())
794            .filter(|e| e.file_type().is_file())
795        {
796            let path = entry.path();
797            let ext = path.extension().and_then(|e| e.to_str()).unwrap_or("");
798            if !indexable.contains(ext) {
799                continue;
800            }
801
802            let Ok(meta) = std::fs::metadata(path) else {
803                continue;
804            };
805            if meta.len() > 512_000 {
806                continue;
807            }
808
809            let mtime = meta
810                .modified()
811                .map(|t| {
812                    t.duration_since(std::time::UNIX_EPOCH)
813                        .unwrap_or_default()
814                        .as_secs() as i64
815                })
816                .unwrap_or(0);
817
818            let rel = path.strip_prefix(&root).unwrap_or(path);
819            let rel_str = rel.to_string_lossy().replace('\\', "/");
820
821            if let Ok(content) = std::fs::read_to_string(path) {
822                match self.index_document(&rel_str, mtime, &content) {
823                    Ok(new_chunks) if !new_chunks.is_empty() => {
824                        count += 1;
825                    }
826                    Ok(_) => {}
827                    Err(_) => {}
828                }
829            }
830        }
831
832        count += self.index_workspace_artifacts(&crate::tools::file_ops::hematite_dir());
833
834        count
835    }
836
837    /// Index workspace-local supporting context that should be available even
838    /// outside a real project workspace: `.hematite/docs/`, recent session
839    /// reports stored in `.hematite/reports/`, and imported chat exports in
840    /// `.hematite/imports/`.
841    pub fn index_workspace_artifacts(&mut self, workspace_root: &std::path::Path) -> usize {
842        let mut count = self.index_docs_folder(workspace_root);
843        count += self.index_recent_session_reports(workspace_root);
844        count += self.index_imported_session_exports(workspace_root);
845        self.backfill_missing_embeddings();
846        count
847    }
848
849    /// Index reference documents in `.hematite/docs/`.
850    /// Supports PDF (text extraction), markdown, and plain text.
851    /// Documents are stored with path prefix `docs/filename` so they are
852    /// distinguishable from source files in retrieval results.
853    fn index_docs_folder(&mut self, workspace_root: &std::path::Path) -> usize {
854        let docs_dir = workspace_root.join(".hematite").join("docs");
855        const DOCS_INDEXABLE: &[&str] = &["pdf", "md", "txt", "markdown"];
856        let mut count = 0usize;
857        let mut desired_paths = HashSet::new();
858
859        if docs_dir.exists() {
860            for entry in walkdir::WalkDir::new(&docs_dir)
861                .max_depth(3)
862                .follow_links(false)
863                .into_iter()
864                .filter_map(|e| e.ok())
865                .filter(|e| e.file_type().is_file())
866            {
867                let path = entry.path();
868                let ext = path
869                    .extension()
870                    .and_then(|e| e.to_str())
871                    .unwrap_or("")
872                    .to_lowercase();
873                if !DOCS_INDEXABLE.contains(&ext.as_str()) {
874                    continue;
875                }
876
877                let Ok(meta) = std::fs::metadata(path) else {
878                    continue;
879                };
880                if meta.len() > 50_000_000 {
881                    continue;
882                }
883
884                let mtime = meta
885                    .modified()
886                    .map(|t| {
887                        t.duration_since(std::time::UNIX_EPOCH)
888                            .unwrap_or_default()
889                            .as_secs() as i64
890                    })
891                    .unwrap_or(0);
892
893                let rel = path.strip_prefix(workspace_root).unwrap_or(path);
894                let rel_str = rel.to_string_lossy().replace('\\', "/");
895                desired_paths.insert(rel_str.clone());
896
897                let content = if ext == "pdf" {
898                    extract_pdf_text(path).ok().flatten()
899                } else {
900                    std::fs::read_to_string(path).ok()
901                };
902
903                if let Some(text) = content {
904                    if text.trim().is_empty() {
905                        continue;
906                    }
907                    match self.index_document(&rel_str, mtime, &text) {
908                        Ok(new_chunks) if !new_chunks.is_empty() => {
909                            count += 1;
910                        }
911                        Ok(_) => {}
912                        Err(_) => {}
913                    }
914                }
915            }
916        }
917
918        self.prune_indexed_prefix(".hematite/docs/", &desired_paths);
919        count
920    }
921
922    /// Index the most recent local session reports by exchange pair so prior
923    /// decisions remain searchable across launches without flooding the vein.
924    pub fn index_recent_session_reports(&mut self, workspace_root: &std::path::Path) -> usize {
925        let reports_dir = workspace_root.join(".hematite").join("reports");
926        let mut count = 0usize;
927        let mut desired_paths = HashSet::new();
928
929        if reports_dir.exists() {
930            let mut reports: Vec<std::path::PathBuf> = std::fs::read_dir(&reports_dir)
931                .ok()
932                .into_iter()
933                .flat_map(|entries| entries.filter_map(|entry| entry.ok()))
934                .map(|entry| entry.path())
935                .filter(|path| {
936                    path.is_file()
937                        && path.extension().and_then(|ext| ext.to_str()) == Some("json")
938                        && path
939                            .file_stem()
940                            .and_then(|stem| stem.to_str())
941                            .map(|stem| stem.starts_with("session_"))
942                            .unwrap_or(false)
943                })
944                .collect();
945
946            reports.sort_by(|a, b| {
947                let a_name = a
948                    .file_name()
949                    .and_then(|name| name.to_str())
950                    .unwrap_or_default();
951                let b_name = b
952                    .file_name()
953                    .and_then(|name| name.to_str())
954                    .unwrap_or_default();
955                b_name.cmp(a_name)
956            });
957            reports.truncate(Self::SESSION_REPORT_LIMIT);
958
959            for report_path in reports {
960                let Ok(meta) = std::fs::metadata(&report_path) else {
961                    continue;
962                };
963                let mtime = meta
964                    .modified()
965                    .map(|t| {
966                        t.duration_since(std::time::UNIX_EPOCH)
967                            .unwrap_or_default()
968                            .as_secs() as i64
969                    })
970                    .unwrap_or(0);
971
972                for exchange in load_session_exchanges(&report_path, mtime) {
973                    desired_paths.insert(exchange.path.clone());
974                    let mtype = detect_memory_type(&exchange.content);
975                    match self.index_chunks_with_room_and_type(
976                        &exchange.path,
977                        exchange.last_modified,
978                        "session",
979                        mtype,
980                        std::slice::from_ref(&exchange.content),
981                    ) {
982                        Ok(new_chunks) if !new_chunks.is_empty() => {
983                            count += 1;
984                        }
985                        Ok(_) => {}
986                        Err(_) => {}
987                    }
988                }
989            }
990        }
991
992        self.prune_indexed_prefix("session/", &desired_paths);
993        count
994    }
995
996    /// Index imported chat exports from `.hematite/imports/`.
997    /// Supported inputs include already-normalized `>` transcripts, Claude Code
998    /// JSONL, Codex CLI JSONL, simple role/content JSON exports, ChatGPT
999    /// `mapping` exports, and Hematite session-report JSON.
1000    pub fn index_imported_session_exports(&mut self, workspace_root: &std::path::Path) -> usize {
1001        let imports_dir = workspace_root.join(".hematite").join("imports");
1002        let mut count = 0usize;
1003        let mut desired_paths = HashSet::new();
1004
1005        if imports_dir.exists() {
1006            let mut imports: Vec<(std::path::PathBuf, i64)> = walkdir::WalkDir::new(&imports_dir)
1007                .max_depth(4)
1008                .follow_links(false)
1009                .into_iter()
1010                .filter_map(|entry| entry.ok())
1011                .filter(|entry| entry.file_type().is_file())
1012                .filter_map(|entry| {
1013                    let path = entry.into_path();
1014                    let ext = path
1015                        .extension()
1016                        .and_then(|ext| ext.to_str())
1017                        .unwrap_or("")
1018                        .to_ascii_lowercase();
1019                    if !matches!(ext.as_str(), "json" | "jsonl" | "md" | "txt") {
1020                        return None;
1021                    }
1022                    let meta = std::fs::metadata(&path).ok()?;
1023                    if meta.len() > Self::IMPORT_MAX_BYTES {
1024                        return None;
1025                    }
1026                    let mtime = meta
1027                        .modified()
1028                        .map(|t| {
1029                            t.duration_since(std::time::UNIX_EPOCH)
1030                                .unwrap_or_default()
1031                                .as_secs() as i64
1032                        })
1033                        .unwrap_or(0);
1034                    Some((path, mtime))
1035                })
1036                .collect();
1037
1038            imports.sort_by(|(a_path, a_mtime), (b_path, b_mtime)| {
1039                b_mtime
1040                    .cmp(a_mtime)
1041                    .then_with(|| a_path.to_string_lossy().cmp(&b_path.to_string_lossy()))
1042            });
1043            imports.truncate(Self::IMPORT_FILE_LIMIT);
1044
1045            for (import_path, mtime) in imports {
1046                for exchange in load_imported_session_exchanges(&import_path, &imports_dir, mtime) {
1047                    desired_paths.insert(exchange.path.clone());
1048                    let mtype = detect_memory_type(&exchange.content);
1049                    match self.index_chunks_with_room_and_type(
1050                        &exchange.path,
1051                        exchange.last_modified,
1052                        "session",
1053                        mtype,
1054                        std::slice::from_ref(&exchange.content),
1055                    ) {
1056                        Ok(new_chunks) if !new_chunks.is_empty() => {
1057                            count += 1;
1058                        }
1059                        Ok(_) => {}
1060                        Err(_) => {}
1061                    }
1062                }
1063            }
1064        }
1065
1066        self.prune_indexed_prefix("session/imports/", &desired_paths);
1067        count
1068    }
1069
1070    /// Embed any FTS chunks that don't yet have a vector in chunks_vec.
1071    /// Called at the end of index_project so that loading the embedding model
1072    /// after the initial index automatically triggers a semantic upgrade on the
1073    /// next agent turn — no /forget or file-touch required.
1074    fn backfill_missing_embeddings(&self) {
1075        // Fast path: if chunk counts match, nothing to do.
1076        let (fts_count, vec_count) = {
1077            let db = self.db.lock().unwrap();
1078            let fts: i64 = db
1079                .query_row("SELECT COUNT(*) FROM chunks_fts", [], |r| r.get(0))
1080                .unwrap_or(0);
1081            let vec: i64 = db
1082                .query_row("SELECT COUNT(*) FROM chunks_vec", [], |r| r.get(0))
1083                .unwrap_or(0);
1084            (fts, vec)
1085        };
1086        if fts_count == 0 || fts_count == vec_count {
1087            return;
1088        }
1089
1090        // Fetch (path, chunk_idx, content) for chunks with no embedding.
1091        // chunks_fts rowid serves as chunk_idx (1-based → convert to 0-based).
1092        let missing: Vec<(String, i64, String)> = {
1093            let db = self.db.lock().unwrap();
1094            let mut stmt = db
1095                .prepare(
1096                    "SELECT f.path, (f.rowid - 1) AS chunk_idx, f.content
1097                     FROM chunks_fts f
1098                     LEFT JOIN chunks_vec v ON f.path = v.path AND (f.rowid - 1) = v.chunk_idx
1099                     WHERE v.path IS NULL
1100                     ORDER BY CASE
1101                         WHEN f.path LIKE '%.rs' THEN 0
1102                         WHEN f.path LIKE '%.toml' THEN 1
1103                         WHEN f.path LIKE '%.json' THEN 2
1104                         ELSE 3
1105                     END, f.path
1106                     LIMIT 20",
1107                )
1108                .unwrap();
1109            stmt.query_map([], |r| {
1110                Ok((
1111                    r.get::<_, String>(0)?,
1112                    r.get::<_, i64>(1)?,
1113                    r.get::<_, String>(2)?,
1114                ))
1115            })
1116            .unwrap()
1117            .filter_map(|r| r.ok())
1118            .collect()
1119        };
1120
1121        let Some(embed_model) = self.current_embed_model() else {
1122            return;
1123        };
1124
1125        for (path, idx, content) in missing {
1126            if let Some(vec) = embed_text_blocking(&content, &self.base_url, &embed_model) {
1127                let blob = floats_to_blob(&vec);
1128                let db = self.db.lock().unwrap();
1129                let _ = db.execute(
1130                    "INSERT OR REPLACE INTO chunks_vec (path, chunk_idx, embedding) VALUES (?1, ?2, ?3)",
1131                    params![path, idx, blob],
1132                );
1133            } else {
1134                // Embedding model not available — stop trying for this pass.
1135                break;
1136            }
1137        }
1138    }
1139
1140    /// Total number of unique files currently indexed.
1141    /// Session exchange chunks are excluded so status counts stay source/doc centric.
1142    pub fn file_count(&self) -> usize {
1143        let db = self.db.lock().unwrap();
1144        db.query_row(
1145            "SELECT COUNT(*) FROM chunks_meta WHERE path NOT LIKE 'session/%'",
1146            [],
1147            |r| r.get::<_, i64>(0),
1148        )
1149        .unwrap_or(0) as usize
1150    }
1151
1152    /// Number of source/doc chunks that have semantic embedding vectors stored.
1153    /// Session exchange chunks are excluded so status counts stay source/doc centric.
1154    pub fn embedded_chunk_count(&self) -> usize {
1155        let db = self.db.lock().unwrap();
1156        db.query_row(
1157            "SELECT COUNT(*) FROM chunks_vec WHERE path NOT LIKE 'session/%'",
1158            [],
1159            |r| r.get::<_, i64>(0),
1160        )
1161        .unwrap_or(0) as usize
1162    }
1163
1164    /// True when any chunk type currently has embeddings available.
1165    pub fn has_any_embeddings(&self) -> bool {
1166        let db = self.db.lock().unwrap();
1167        db.query_row("SELECT EXISTS(SELECT 1 FROM chunks_vec LIMIT 1)", [], |r| {
1168            r.get::<_, i64>(0)
1169        })
1170        .unwrap_or(0)
1171            != 0
1172    }
1173
1174    /// Wipe all indexed data. The DB file stays on disk; next index_project()
1175    /// call rebuilds from scratch (re-reads all files, re-embeds all chunks).
1176    pub fn reset(&self) {
1177        let db = self.db.lock().unwrap();
1178        let _ = db.execute_batch(
1179            "DELETE FROM chunks_fts;
1180             DELETE FROM chunks_vec;
1181             DELETE FROM chunks_meta;",
1182        );
1183    }
1184
1185    /// Return a compact operator-facing snapshot of what The Vein currently knows.
1186    /// Intended for trust/debug surfaces like `/vein-inspect`.
1187    pub fn inspect_snapshot(&self, hot_limit: usize) -> VeinInspectionSnapshot {
1188        let db = self.db.lock().unwrap();
1189        let indexed_source_files = db
1190            .query_row(
1191                "SELECT COUNT(*) FROM chunks_meta
1192                 WHERE path NOT LIKE 'session/%'
1193                   AND path NOT LIKE '.hematite/docs/%'",
1194                [],
1195                |r| r.get::<_, i64>(0),
1196            )
1197            .unwrap_or(0) as usize;
1198        let indexed_docs = db
1199            .query_row(
1200                "SELECT COUNT(*) FROM chunks_meta WHERE path LIKE '.hematite/docs/%'",
1201                [],
1202                |r| r.get::<_, i64>(0),
1203            )
1204            .unwrap_or(0) as usize;
1205        let indexed_session_exchanges = db
1206            .query_row(
1207                "SELECT COUNT(*) FROM chunks_meta WHERE path LIKE 'session/%'",
1208                [],
1209                |r| r.get::<_, i64>(0),
1210            )
1211            .unwrap_or(0) as usize;
1212        let embedded_source_doc_chunks = db
1213            .query_row(
1214                "SELECT COUNT(*) FROM chunks_vec WHERE path NOT LIKE 'session/%'",
1215                [],
1216                |r| r.get::<_, i64>(0),
1217            )
1218            .unwrap_or(0) as usize;
1219        let has_any_embeddings = db
1220            .query_row("SELECT EXISTS(SELECT 1 FROM chunks_vec LIMIT 1)", [], |r| {
1221                r.get::<_, i64>(0)
1222            })
1223            .unwrap_or(0)
1224            != 0;
1225        drop(db);
1226
1227        let hot_files = self
1228            .hot_files(hot_limit.max(1))
1229            .into_iter()
1230            .map(|(path, heat, last_modified, room)| VeinHotFile {
1231                path,
1232                heat,
1233                last_modified,
1234                room,
1235            })
1236            .collect::<Vec<_>>();
1237
1238        VeinInspectionSnapshot {
1239            indexed_source_files,
1240            indexed_docs,
1241            indexed_session_exchanges,
1242            embedded_source_doc_chunks,
1243            has_any_embeddings,
1244            active_room: self.active_room(),
1245            l1_ready: !hot_files.is_empty(),
1246            hot_files,
1247        }
1248    }
1249
1250    // ── L1 heat tracking ──────────────────────────────────────────────────────
1251
1252    /// Record an edit to a file. Increments its heat score in file_heat.
1253    /// Called from the tool dispatch after a successful edit_file / write_file /
1254    /// patch_hunk / multi_search_replace so the L1 context stays current.
1255    pub fn bump_heat(&self, path: &str) {
1256        if path.is_empty() {
1257            return;
1258        }
1259        let now = std::time::SystemTime::now()
1260            .duration_since(std::time::UNIX_EPOCH)
1261            .unwrap_or_default()
1262            .as_secs() as i64;
1263        let db = self.db.lock().unwrap();
1264        let _ = db.execute(
1265            "INSERT INTO file_heat (path, heat, last_edit) VALUES (?1, 1, ?2)
1266             ON CONFLICT(path) DO UPDATE SET heat = heat + 1, last_edit = ?2",
1267            params![path, now],
1268        );
1269    }
1270
1271    /// Return the top N hot files ranked by edit count (heat) then recency.
1272    /// Joins file_heat with chunks_meta so only indexed files are included.
1273    /// Returns (path, heat, mtime, room).
1274    fn hot_files(&self, n: usize) -> Vec<(String, i64, i64, String)> {
1275        let db = self.db.lock().unwrap();
1276        let mut stmt = match db.prepare(
1277            "SELECT fh.path, fh.heat, cm.last_modified, cm.room
1278             FROM file_heat fh
1279             JOIN chunks_meta cm ON cm.path = fh.path
1280             ORDER BY fh.heat DESC, cm.last_modified DESC
1281             LIMIT ?1",
1282        ) {
1283            Ok(s) => s,
1284            Err(_) => return vec![],
1285        };
1286        stmt.query_map(params![n as i64], |row| {
1287            Ok((
1288                row.get::<_, String>(0)?,
1289                row.get::<_, i64>(1)?,
1290                row.get::<_, i64>(2)?,
1291                row.get::<_, String>(3)?,
1292            ))
1293        })
1294        .map(|rows| rows.filter_map(|r| r.ok()).collect())
1295        .unwrap_or_default()
1296    }
1297
1298    /// Return the paths of the top hot files (most edited).
1299    /// Used by RepoMapGenerator to bias PageRank toward active files.
1300    pub fn hot_file_paths(&self, n: usize) -> Vec<String> {
1301        self.hot_files(n)
1302            .into_iter()
1303            .map(|(path, _, _, _)| path)
1304            .collect()
1305    }
1306
1307    /// Return hot files with normalized heat weights in [0.0, 1.0].
1308    /// The hottest file gets weight 1.0; others are scaled proportionally.
1309    /// Used by RepoMapGenerator to apply heat-weighted PageRank personalization.
1310    pub fn hot_files_weighted(&self, n: usize) -> Vec<(String, f64)> {
1311        let files = self.hot_files(n);
1312        if files.is_empty() {
1313            return vec![];
1314        }
1315        let max_heat = files
1316            .iter()
1317            .map(|(_, h, _, _)| *h)
1318            .max()
1319            .unwrap_or(1)
1320            .max(1) as f64;
1321        files
1322            .into_iter()
1323            .map(|(path, heat, _, _)| {
1324                let weight = (heat as f64) / max_heat;
1325                (path, weight)
1326            })
1327            .collect()
1328    }
1329
1330    /// Build the L1 context block — a compact "hot files" summary injected into
1331    /// the system prompt at session start. Capped at ~150 tokens.
1332    /// Files are grouped by room so the model sees subsystem structure at a glance.
1333    /// Returns None when there are no heat records yet (fresh project).
1334    pub fn l1_context(&self) -> Option<String> {
1335        let files = self.hot_files(8);
1336        if files.is_empty() {
1337            return None;
1338        }
1339        let now = std::time::SystemTime::now()
1340            .duration_since(std::time::UNIX_EPOCH)
1341            .unwrap_or_default()
1342            .as_secs() as i64;
1343
1344        // Group by room for readability.
1345        let mut by_room: std::collections::BTreeMap<String, Vec<(String, i64, i64)>> =
1346            std::collections::BTreeMap::new();
1347        for (path, heat, mtime, room) in &files {
1348            by_room
1349                .entry(room.clone())
1350                .or_default()
1351                .push((path.clone(), *heat, *mtime));
1352        }
1353
1354        let mut out = String::from("# Hot Files (most edited — grouped by subsystem)\n");
1355        for (room, entries) in &by_room {
1356            out.push_str(&format!("[{}]\n", room));
1357            for (path, heat, mtime) in entries {
1358                let age_secs = now - mtime;
1359                let age = if age_secs < 3600 {
1360                    "just now".to_string()
1361                } else if age_secs < 86400 {
1362                    format!("{}h ago", age_secs / 3600)
1363                } else {
1364                    format!("{}d ago", age_secs / 86400)
1365                };
1366                out.push_str(&format!(
1367                    "  - {} [{} edit{}, {}]\n",
1368                    path,
1369                    heat,
1370                    if *heat == 1 { "" } else { "s" },
1371                    age
1372                ));
1373            }
1374        }
1375        Some(out)
1376    }
1377
1378    fn prune_indexed_prefix(&self, prefix: &str, desired_paths: &HashSet<String>) {
1379        let pattern = format!("{}%", prefix);
1380        let existing_paths: Vec<String> = {
1381            let db = self.db.lock().unwrap();
1382            let mut stmt = match db.prepare("SELECT path FROM chunks_meta WHERE path LIKE ?1") {
1383                Ok(stmt) => stmt,
1384                Err(_) => return,
1385            };
1386            stmt.query_map(params![pattern], |row| row.get::<_, String>(0))
1387                .map(|rows| rows.filter_map(|row| row.ok()).collect())
1388                .unwrap_or_default()
1389        };
1390
1391        if existing_paths.is_empty() {
1392            return;
1393        }
1394
1395        let db = self.db.lock().unwrap();
1396        for path in existing_paths {
1397            if desired_paths.contains(&path) {
1398                continue;
1399            }
1400            let _ = db.execute("DELETE FROM chunks_fts WHERE path = ?1", params![path]);
1401            let _ = db.execute("DELETE FROM chunks_vec WHERE path = ?1", params![path]);
1402            let _ = db.execute("DELETE FROM chunks_meta WHERE path = ?1", params![path]);
1403        }
1404    }
1405}
1406
1407impl QuerySignals {
1408    fn from_query(query: &str) -> Self {
1409        let lower = query.to_ascii_lowercase();
1410        let historical_memory_hint = [
1411            "remember",
1412            "earlier",
1413            "previous",
1414            "last time",
1415            "what did we decide",
1416            "why did we decide",
1417            "what did we say",
1418            "why did we change",
1419        ]
1420        .iter()
1421        .any(|needle| lower.contains(needle));
1422
1423        // Detect what memory type the query is asking about.
1424        let query_memory_type = if lower.contains("decide")
1425            || lower.contains("decision")
1426            || lower.contains("we agreed")
1427            || lower.contains("we chose")
1428        {
1429            Some("decision")
1430        } else if lower.contains("bug")
1431            || lower.contains("error")
1432            || lower.contains("issue")
1433            || lower.contains("problem")
1434            || lower.contains("fix")
1435            || lower.contains("broken")
1436        {
1437            Some("problem")
1438        } else if lower.contains("shipped")
1439            || lower.contains("milestone")
1440            || lower.contains("finished")
1441            || lower.contains("working now")
1442        {
1443            Some("milestone")
1444        } else if lower.contains("prefer")
1445            || lower.contains("my preference")
1446            || lower.contains("i like")
1447            || lower.contains("i want")
1448        {
1449            Some("preference")
1450        } else {
1451            None
1452        };
1453
1454        Self {
1455            exact_phrases: extract_exact_phrases(query),
1456            standout_terms: extract_standout_terms(query),
1457            historical_memory_hint,
1458            temporal_reference: extract_temporal_reference(query),
1459            query_memory_type,
1460        }
1461    }
1462}
1463
1464fn merge_scored_result(
1465    merged_by_path: &mut HashMap<String, SearchResult>,
1466    candidate: SearchResult,
1467) {
1468    match merged_by_path.get_mut(&candidate.path) {
1469        Some(existing) if candidate.score > existing.score => *existing = candidate,
1470        Some(_) => {}
1471        None => {
1472            merged_by_path.insert(candidate.path.clone(), candidate);
1473        }
1474    }
1475}
1476
1477fn reranked_score(
1478    signals: &QuerySignals,
1479    active_room: Option<&str>,
1480    result: &SearchResult,
1481    is_semantic: bool,
1482) -> f32 {
1483    let base = if is_semantic {
1484        1.0 + result.score.clamp(0.0, 1.0)
1485    } else {
1486        (result.score / 10.0).clamp(0.0, 1.0)
1487    };
1488    base + room_bias(active_room, result)
1489        + retrieval_signal_boost(signals, result)
1490        + temporal_memory_boost(signals, result)
1491}
1492
1493fn room_bias(active_room: Option<&str>, result: &SearchResult) -> f32 {
1494    if active_room == Some(result.room.as_str()) {
1495        0.15
1496    } else {
1497        0.0
1498    }
1499}
1500
1501fn retrieval_signal_boost(signals: &QuerySignals, result: &SearchResult) -> f32 {
1502    let mut boost = 0.0f32;
1503
1504    // Lowercase once; reuse for both the phrase-match haystack and the
1505    // per-term standout loop — avoids one allocation per standout term.
1506    let lower_path = result.path.to_ascii_lowercase();
1507    let lower_content = result.content.to_ascii_lowercase();
1508    let haystack = format!("{}\n{}", lower_path, lower_content);
1509
1510    let phrase_matches = signals
1511        .exact_phrases
1512        .iter()
1513        .filter(|phrase| haystack.contains(phrase.as_str()))
1514        .count();
1515    if phrase_matches > 0 {
1516        boost += 0.35 + ((phrase_matches.saturating_sub(1)) as f32 * 0.1);
1517    }
1518
1519    let mut standout_matches = 0;
1520    for term in &signals.standout_terms {
1521        if lower_path.contains(term.as_str()) {
1522            boost += 0.40;
1523            standout_matches += 1;
1524        } else if lower_content.contains(term.as_str()) {
1525            boost += 0.12;
1526            standout_matches += 1;
1527        }
1528        if standout_matches >= 3 {
1529            break;
1530        }
1531    }
1532
1533    if signals.historical_memory_hint && result.room == "session" {
1534        boost += 0.45;
1535    }
1536
1537    // Boost session chunks whose tagged memory type matches the query's intent.
1538    if let Some(qtype) = signals.query_memory_type {
1539        if !result.memory_type.is_empty() && result.memory_type == qtype {
1540            boost += 0.35;
1541        }
1542    }
1543
1544    boost
1545}
1546
1547fn temporal_memory_boost(signals: &QuerySignals, result: &SearchResult) -> f32 {
1548    if result.room != "session" {
1549        return 0.0;
1550    }
1551    let Some(reference) = signals.temporal_reference else {
1552        return 0.0;
1553    };
1554    let Some(memory_ts) = session_memory_timestamp(result) else {
1555        return 0.0;
1556    };
1557
1558    let span = reference.window_secs.max(86_400);
1559    let full_fade = span.saturating_mul(8);
1560    if full_fade <= 0 {
1561        return 0.0;
1562    }
1563
1564    let distance = (memory_ts - reference.target_ts).abs();
1565    let closeness = 1.0 - (distance as f32 / full_fade as f32).min(1.0);
1566    if closeness <= 0.0 {
1567        0.0
1568    } else {
1569        0.22 * closeness
1570    }
1571}
1572
/// Collect the quoted phrases of a query, lowercased (ASCII) and
/// de-duplicated, in order of first appearance.
///
/// A phrase is the text between a pair of matching `"`, `'` or `` ` ``
/// characters, trimmed, and kept only when it is at least three bytes long.
/// An opening quote without a closing partner captures the rest of the query.
///
/// Single quotes double as apostrophes, so they get word-boundary rules:
/// - a `'` directly after an alphanumeric character (`what's`, `users'`) is
///   never treated as an opening quote;
/// - a `'` directly before an alphanumeric character (`don't`) is never
///   treated as a closing quote.
/// Without these rules contractions and possessives would produce bogus
/// "exact phrases" made of whatever happens to sit between two apostrophes.
fn extract_exact_phrases(query: &str) -> Vec<String> {
    let chars: Vec<char> = query.chars().collect();
    let mut phrases: Vec<String> = Vec::new();
    let mut i = 0usize;

    while i < chars.len() {
        let quote = chars[i];
        if !matches!(quote, '"' | '\'' | '`') {
            i += 1;
            continue;
        }
        // Apostrophe inside or at the end of a word — not an opening quote.
        if quote == '\'' && i > 0 && chars[i - 1].is_alphanumeric() {
            i += 1;
            continue;
        }

        let start = i + 1;
        let mut end = start;
        while end < chars.len() {
            if chars[end] == quote {
                // An apostrophe followed by a letter/digit is still inside a
                // word (e.g. "don't"), so keep scanning for the real close.
                let mid_word = quote == '\''
                    && chars
                        .get(end + 1)
                        .map_or(false, |next| next.is_alphanumeric());
                if !mid_word {
                    break;
                }
            }
            end += 1;
        }

        if end > start {
            let phrase = chars[start..end]
                .iter()
                .collect::<String>()
                .trim()
                .to_ascii_lowercase();
            if phrase.len() >= 3 && !phrases.contains(&phrase) {
                phrases.push(phrase);
            }
        }
        // Resume after the closing quote (or past the end when unterminated).
        i = end.saturating_add(1);
    }

    phrases
}
1604
/// Pick the "standout" terms of a query: identifier-like or otherwise
/// distinctive tokens that deserve extra weight during reranking.
///
/// The query is split on every character that is not ASCII alphanumeric or
/// one of `_ - . / :`. Sentence punctuation is then stripped from the end of
/// each token — trailing `.` characters and a single trailing `:` (a trailing
/// `::` is kept, since it is part of a path such as `std::`). Without that
/// step an ordinary word at the end of a sentence ("work.") would count as
/// standout merely because it carries a dot, and would slip past the stopword
/// filter.
///
/// A token is kept when it is at least four bytes long, is not a stopword,
/// and is "interesting": it contains a digit, one of `_ - . / :`, an
/// uppercase letter, or is at least nine bytes long. Terms are returned
/// ASCII-lowercased, de-duplicated, in order of first appearance.
fn extract_standout_terms(query: &str) -> Vec<String> {
    fn is_joiner(ch: char) -> bool {
        matches!(ch, '_' | '-' | '.' | '/' | ':')
    }

    static STANDOUT_SW: std::sync::OnceLock<HashSet<&'static str>> = std::sync::OnceLock::new();
    let stopwords = STANDOUT_SW.get_or_init(|| {
        [
            "about", "after", "before", "change", "changed", "decide", "decided", "does",
            "earlier", "flow", "from", "have", "into", "just", "last", "local", "make", "more",
            "remember", "should", "that", "their", "there", "these", "they", "this", "those",
            "what", "when", "where", "which", "why", "with", "work",
        ]
        .iter()
        .copied()
        .collect()
    });

    let mut standout: Vec<String> = Vec::new();
    for token in query.split(|ch: char| !(ch.is_ascii_alphanumeric() || is_joiner(ch))) {
        // Drop sentence-final dots and a lone trailing colon ("remember:"),
        // but keep a trailing "::" which belongs to a path-like token.
        let mut term = token.trim().trim_end_matches('.');
        if !term.ends_with("::") {
            if let Some(stripped) = term.strip_suffix(':') {
                term = stripped;
            }
        }

        if term.len() < 4 {
            continue;
        }
        let lower = term.to_ascii_lowercase();
        if stopwords.contains(lower.as_str()) {
            continue;
        }

        let interesting = term.len() >= 9
            || term
                .chars()
                .any(|ch| ch.is_ascii_digit() || ch.is_ascii_uppercase() || is_joiner(ch));

        if interesting && !standout.contains(&lower) {
            standout.push(lower);
        }
    }

    standout
}
1646
1647fn extract_temporal_reference(query: &str) -> Option<TemporalReference> {
1648    if let Some(ts) = extract_iso_date_from_query(query) {
1649        return Some(TemporalReference {
1650            target_ts: ts,
1651            window_secs: 86_400,
1652        });
1653    }
1654
1655    let now = current_unix_timestamp();
1656    let lower = query.to_ascii_lowercase();
1657    if lower.contains("yesterday") {
1658        Some(TemporalReference {
1659            target_ts: now.saturating_sub(86_400),
1660            window_secs: 86_400,
1661        })
1662    } else if lower.contains("today") || lower.contains("earlier today") {
1663        Some(TemporalReference {
1664            target_ts: now,
1665            window_secs: 86_400,
1666        })
1667    } else if lower.contains("last week") {
1668        Some(TemporalReference {
1669            target_ts: now.saturating_sub(7 * 86_400),
1670            window_secs: 7 * 86_400,
1671        })
1672    } else if lower.contains("last month") {
1673        Some(TemporalReference {
1674            target_ts: now.saturating_sub(30 * 86_400),
1675            window_secs: 30 * 86_400,
1676        })
1677    } else {
1678        None
1679    }
1680}
1681
1682fn extract_iso_date_from_query(query: &str) -> Option<i64> {
1683    query
1684        .split(|ch: char| !(ch.is_ascii_digit() || ch == '-'))
1685        .find_map(parse_iso_date_token)
1686}
1687
1688fn parse_iso_date_token(token: &str) -> Option<i64> {
1689    if token.len() != 10 {
1690        return None;
1691    }
1692    let bytes = token.as_bytes();
1693    if bytes.get(4) != Some(&b'-') || bytes.get(7) != Some(&b'-') {
1694        return None;
1695    }
1696
1697    let year = token.get(0..4)?.parse::<i32>().ok()?;
1698    let month = token.get(5..7)?.parse::<u32>().ok()?;
1699    let day = token.get(8..10)?.parse::<u32>().ok()?;
1700    if !(1..=12).contains(&month) || !(1..=31).contains(&day) {
1701        return None;
1702    }
1703
1704    Some(days_from_civil(year, month, day).saturating_mul(86_400))
1705}
1706
/// Convert a Gregorian calendar date into a day count relative to 1970-01-01.
///
/// The year is treated as starting in March so that the leap day falls at the
/// end of the (shifted) year; the date is then located inside a 400-year era
/// of 146 097 days. Dates before the epoch yield negative counts.
fn days_from_civil(year: i32, month: u32, day: u32) -> i64 {
    // January and February belong to the previous March-based year.
    let shifted_year = if month <= 2 { year - 1 } else { year };
    let era = shifted_year.div_euclid(400);
    let year_of_era = shifted_year.rem_euclid(400);

    // Month index counted from March (March = 0 ... February = 11).
    let shifted_month = if month > 2 {
        month as i32 - 3
    } else {
        month as i32 + 9
    };
    let day_of_year = (153 * shifted_month + 2) / 5 + day as i32 - 1;
    let day_of_era = year_of_era * 365 + year_of_era / 4 - year_of_era / 100 + day_of_year;

    i64::from(era) * 146_097 + i64::from(day_of_era) - 719_468
}
1716
/// Current wall-clock time as whole seconds since the Unix epoch.
///
/// Falls back to `0` if the system clock reports a time before the epoch.
fn current_unix_timestamp() -> i64 {
    match std::time::SystemTime::now().duration_since(std::time::UNIX_EPOCH) {
        Ok(since_epoch) => since_epoch.as_secs() as i64,
        Err(_) => 0,
    }
}
1723
1724fn session_memory_timestamp(result: &SearchResult) -> Option<i64> {
1725    extract_session_path_timestamp(&result.path).or_else(|| {
1726        if result.last_modified > 0 {
1727            Some(result.last_modified)
1728        } else {
1729            None
1730        }
1731    })
1732}
1733
1734fn extract_session_path_timestamp(path: &str) -> Option<i64> {
1735    let normalized = path.replace('\\', "/");
1736    let mut parts = normalized.split('/');
1737    if parts.next()? != "session" {
1738        return None;
1739    }
1740    parse_iso_date_token(parts.next()?)
1741}
1742
1743fn session_speaker_kind(speaker: &str) -> SessionSpeakerKind {
1744    let normalized = speaker.trim().to_ascii_lowercase();
1745    match normalized.as_str() {
1746        "you" | "user" => SessionSpeakerKind::User,
1747        "" | "system" | "tool" => SessionSpeakerKind::Ignore,
1748        _ => SessionSpeakerKind::Assistant,
1749    }
1750}
1751
1752fn load_session_exchanges(report_path: &Path, last_modified: i64) -> Vec<SessionExchange> {
1753    let Ok(raw) = std::fs::read_to_string(report_path) else {
1754        return Vec::new();
1755    };
1756    let Ok(report) = serde_json::from_str::<SessionReport>(&raw) else {
1757        return Vec::new();
1758    };
1759
1760    let session_key = report_path
1761        .file_stem()
1762        .and_then(|stem| stem.to_str())
1763        .and_then(|stem| stem.strip_prefix("session_").or(Some(stem)))
1764        .unwrap_or("unknown-session")
1765        .to_string();
1766    let session_date = report
1767        .session_start
1768        .split('_')
1769        .next()
1770        .filter(|date| !date.is_empty())
1771        .unwrap_or_else(|| session_key.split('_').next().unwrap_or("unknown-date"))
1772        .to_string();
1773
1774    let mut exchanges = Vec::new();
1775    let mut pending_user: Option<String> = None;
1776    let mut turn_index = 0usize;
1777
1778    for entry in report.transcript {
1779        match session_speaker_kind(&entry.speaker) {
1780            SessionSpeakerKind::User => {
1781                let text = entry.text.trim();
1782                if !text.is_empty() {
1783                    pending_user = Some(text.to_string());
1784                }
1785            }
1786            SessionSpeakerKind::Assistant => {
1787                let text = entry.text.trim();
1788                if text.is_empty() {
1789                    continue;
1790                }
1791                let Some(user_text) = pending_user.take() else {
1792                    continue;
1793                };
1794                turn_index += 1;
1795                exchanges.push(SessionExchange {
1796                    path: format!(
1797                        "session/{}/{}/turn-{}",
1798                        session_date, session_key, turn_index
1799                    ),
1800                    last_modified,
1801                    content: format!(
1802                        "Earlier session exchange\nUser:\n{}\n\nAssistant:\n{}",
1803                        user_text, text
1804                    ),
1805                });
1806            }
1807            SessionSpeakerKind::Ignore => {}
1808        }
1809    }
1810
1811    if exchanges.len() > Vein::SESSION_TURN_LIMIT {
1812        let keep_from = exchanges.len() - Vein::SESSION_TURN_LIMIT;
1813        exchanges = exchanges.into_iter().skip(keep_from).collect();
1814    }
1815
1816    exchanges
1817}
1818
1819fn load_imported_session_exchanges(
1820    import_path: &Path,
1821    imports_root: &Path,
1822    last_modified: i64,
1823) -> Vec<SessionExchange> {
1824    let Ok(raw) = std::fs::read_to_string(import_path) else {
1825        return Vec::new();
1826    };
1827
1828    let messages = normalize_import_messages(&raw, import_path);
1829    if messages.is_empty() {
1830        return Vec::new();
1831    }
1832
1833    let rel = import_path
1834        .strip_prefix(imports_root)
1835        .unwrap_or(import_path);
1836    let rel_slug = slugify_import_path(rel);
1837    let mut exchanges = Vec::new();
1838    let mut pending_user: Option<String> = None;
1839    let mut turn_index = 0usize;
1840
1841    for (role, text) in messages {
1842        let cleaned = text.trim();
1843        if cleaned.is_empty() {
1844            continue;
1845        }
1846        match role.as_str() {
1847            "user" => pending_user = Some(cleaned.to_string()),
1848            "assistant" => {
1849                let Some(user_text) = pending_user.take() else {
1850                    continue;
1851                };
1852                turn_index += 1;
1853                exchanges.push(SessionExchange {
1854                    path: format!("session/imports/{}/turn-{}", rel_slug, turn_index),
1855                    last_modified,
1856                    content: format!(
1857                        "Imported session exchange\nSource: .hematite/imports/{}\n\nUser:\n{}\n\nAssistant:\n{}",
1858                        rel.to_string_lossy().replace('\\', "/"),
1859                        user_text,
1860                        cleaned
1861                    ),
1862                });
1863            }
1864            _ => {}
1865        }
1866    }
1867
1868    if exchanges.len() > Vein::SESSION_TURN_LIMIT {
1869        let keep_from = exchanges.len() - Vein::SESSION_TURN_LIMIT;
1870        exchanges = exchanges.into_iter().skip(keep_from).collect();
1871    }
1872
1873    exchanges
1874}
1875
1876fn normalize_import_messages(raw: &str, import_path: &Path) -> Vec<(String, String)> {
1877    if raw.trim().is_empty() {
1878        return Vec::new();
1879    }
1880
1881    if let Some(messages) = parse_marker_transcript(raw) {
1882        return messages;
1883    }
1884
1885    let ext = import_path
1886        .extension()
1887        .and_then(|ext| ext.to_str())
1888        .unwrap_or("")
1889        .to_ascii_lowercase();
1890
1891    if matches!(ext.as_str(), "json" | "jsonl")
1892        || matches!(raw.trim().chars().next(), Some('{') | Some('['))
1893    {
1894        if let Some(messages) = parse_jsonl_messages(raw) {
1895            if !messages.is_empty() {
1896                return messages;
1897            }
1898        }
1899
1900        if let Ok(value) = serde_json::from_str::<Value>(raw) {
1901            if let Some(messages) = parse_session_report_messages(&value) {
1902                return messages;
1903            }
1904            if let Some(messages) = parse_simple_role_messages(&value) {
1905                return messages;
1906            }
1907            if let Some(messages) = parse_chatgpt_mapping_messages(&value) {
1908                return messages;
1909            }
1910        }
1911    }
1912
1913    Vec::new()
1914}
1915
/// Parse a plain-text transcript in which user prompts are lines starting with `> `.
///
/// A line whose first non-whitespace characters are `> ` opens a user message
/// (the remainder of that line, trimmed). All following lines up to the next
/// marker form the assistant reply: each is trimmed, blank lines and `---`
/// rules are dropped, and the rest are joined with newlines. Text before the
/// first marker is ignored.
///
/// Returns `None` when fewer than two marker lines are present.
fn parse_marker_transcript(raw: &str) -> Option<Vec<(String, String)>> {
    let marker_total = raw
        .lines()
        .filter(|line| line.trim_start().starts_with("> "))
        .count();
    if marker_total < 2 {
        return None;
    }

    let mut messages: Vec<(String, String)> = Vec::new();
    let mut reply_lines: Vec<&str> = Vec::new();
    let mut inside_turn = false;

    for line in raw.lines() {
        match line.trim_start().strip_prefix("> ") {
            Some(prompt) => {
                // A new prompt closes whatever reply was being collected.
                if !reply_lines.is_empty() {
                    let reply = std::mem::take(&mut reply_lines).join("\n");
                    messages.push(("assistant".to_string(), reply));
                }
                messages.push(("user".to_string(), prompt.trim().to_string()));
                inside_turn = true;
            }
            None if inside_turn => {
                let body = line.trim();
                if !body.is_empty() && body != "---" {
                    reply_lines.push(body);
                }
            }
            None => {}
        }
    }

    if !reply_lines.is_empty() {
        messages.push(("assistant".to_string(), reply_lines.join("\n")));
    }

    (!messages.is_empty()).then_some(messages)
}
1956
1957fn parse_jsonl_messages(raw: &str) -> Option<Vec<(String, String)>> {
1958    let mut messages = Vec::new();
1959    let mut has_codex_session_meta = false;
1960    let mut saw_jsonl = false;
1961
1962    for line in raw.lines() {
1963        let trimmed = line.trim();
1964        if trimmed.is_empty() {
1965            continue;
1966        }
1967        let Ok(value) = serde_json::from_str::<Value>(trimmed) else {
1968            continue;
1969        };
1970        saw_jsonl = true;
1971        let Some(object) = value.as_object() else {
1972            continue;
1973        };
1974
1975        match object.get("type").and_then(|v| v.as_str()).unwrap_or("") {
1976            "session_meta" => {
1977                has_codex_session_meta = true;
1978            }
1979            "event_msg" => {
1980                let Some(payload) = object.get("payload").and_then(|v| v.as_object()) else {
1981                    continue;
1982                };
1983                let Some(text) = payload.get("message").and_then(|v| v.as_str()) else {
1984                    continue;
1985                };
1986                match payload.get("type").and_then(|v| v.as_str()).unwrap_or("") {
1987                    "user_message" => messages.push(("user".to_string(), text.trim().to_string())),
1988                    "agent_message" => {
1989                        messages.push(("assistant".to_string(), text.trim().to_string()))
1990                    }
1991                    _ => {}
1992                }
1993            }
1994            "human" | "user" => {
1995                if let Some(text) = extract_text_content(object.get("message").unwrap_or(&value)) {
1996                    messages.push(("user".to_string(), text));
1997                }
1998            }
1999            "assistant" => {
2000                if let Some(text) = extract_text_content(object.get("message").unwrap_or(&value)) {
2001                    messages.push(("assistant".to_string(), text));
2002                }
2003            }
2004            _ => {
2005                if let Some(role) = object.get("role").and_then(|v| v.as_str()) {
2006                    if let Some(text) = extract_text_content(&value) {
2007                        match role {
2008                            "user" | "human" => messages.push(("user".to_string(), text)),
2009                            "assistant" | "ai" => messages.push(("assistant".to_string(), text)),
2010                            _ => {}
2011                        }
2012                    }
2013                }
2014            }
2015        }
2016    }
2017
2018    if !saw_jsonl {
2019        return None;
2020    }
2021
2022    if has_codex_session_meta || !messages.is_empty() {
2023        return Some(messages);
2024    }
2025
2026    None
2027}
2028
2029fn parse_session_report_messages(value: &Value) -> Option<Vec<(String, String)>> {
2030    let report = value.as_object()?;
2031    let transcript = report.get("transcript")?.as_array()?;
2032    let mut messages = Vec::new();
2033
2034    for entry in transcript {
2035        let Some(obj) = entry.as_object() else {
2036            continue;
2037        };
2038        let speaker = obj
2039            .get("speaker")
2040            .and_then(|v| v.as_str())
2041            .unwrap_or_default();
2042        let text = obj
2043            .get("text")
2044            .and_then(|v| v.as_str())
2045            .unwrap_or_default()
2046            .trim()
2047            .to_string();
2048        if text.is_empty() {
2049            continue;
2050        }
2051        match session_speaker_kind(speaker) {
2052            SessionSpeakerKind::User => messages.push(("user".to_string(), text)),
2053            SessionSpeakerKind::Assistant => messages.push(("assistant".to_string(), text)),
2054            SessionSpeakerKind::Ignore => {}
2055        }
2056    }
2057
2058    (!messages.is_empty()).then_some(messages)
2059}
2060
2061fn parse_simple_role_messages(value: &Value) -> Option<Vec<(String, String)>> {
2062    if let Some(array) = value.as_array() {
2063        let messages = collect_role_messages(array);
2064        return (!messages.is_empty()).then_some(messages);
2065    }
2066
2067    let obj = value.as_object()?;
2068    if let Some(messages_value) = obj.get("messages").or_else(|| obj.get("chat_messages")) {
2069        let array = messages_value.as_array()?;
2070        let messages = collect_role_messages(array);
2071        return (!messages.is_empty()).then_some(messages);
2072    }
2073
2074    None
2075}
2076
2077fn collect_role_messages(items: &[Value]) -> Vec<(String, String)> {
2078    let mut messages = Vec::new();
2079    for item in items {
2080        let Some(obj) = item.as_object() else {
2081            continue;
2082        };
2083        let Some(role) = obj.get("role").and_then(|v| v.as_str()) else {
2084            continue;
2085        };
2086        let Some(text) = extract_text_content(item) else {
2087            continue;
2088        };
2089        match role {
2090            "user" | "human" => messages.push(("user".to_string(), text)),
2091            "assistant" | "ai" => messages.push(("assistant".to_string(), text)),
2092            _ => {}
2093        }
2094    }
2095    messages
2096}
2097
2098fn parse_chatgpt_mapping_messages(value: &Value) -> Option<Vec<(String, String)>> {
2099    let mapping = value.get("mapping")?.as_object()?;
2100    let mut current_id = mapping.iter().find_map(|(node_id, node)| {
2101        let obj = node.as_object()?;
2102        (obj.get("parent").is_some_and(|parent| parent.is_null())).then_some(node_id.clone())
2103    })?;
2104
2105    let mut messages = Vec::new();
2106    let mut visited = std::collections::HashSet::new();
2107
2108    while visited.insert(current_id.clone()) {
2109        let Some(node) = mapping.get(&current_id).and_then(|v| v.as_object()) else {
2110            break;
2111        };
2112
2113        if let Some(message) = node.get("message") {
2114            let role = message
2115                .get("author")
2116                .and_then(|author| author.get("role"))
2117                .and_then(|v| v.as_str())
2118                .unwrap_or("");
2119            if let Some(text) = extract_text_content(message) {
2120                match role {
2121                    "user" => messages.push(("user".to_string(), text)),
2122                    "assistant" => messages.push(("assistant".to_string(), text)),
2123                    _ => {}
2124                }
2125            }
2126        }
2127
2128        let Some(next_id) = node
2129            .get("children")
2130            .and_then(|children| children.as_array())
2131            .and_then(|children| children.first())
2132            .and_then(|child| child.as_str())
2133        else {
2134            break;
2135        };
2136        current_id = next_id.to_string();
2137    }
2138
2139    (!messages.is_empty()).then_some(messages)
2140}
2141
2142fn extract_text_content(value: &Value) -> Option<String> {
2143    if let Some(text) = value.as_str() {
2144        let trimmed = text.trim();
2145        return (!trimmed.is_empty()).then_some(trimmed.to_string());
2146    }
2147
2148    if let Some(array) = value.as_array() {
2149        let joined = array
2150            .iter()
2151            .filter_map(extract_text_content)
2152            .filter(|part| !part.is_empty())
2153            .collect::<Vec<_>>()
2154            .join("\n");
2155        return (!joined.is_empty()).then_some(joined);
2156    }
2157
2158    let obj = value.as_object()?;
2159
2160    if let Some(content) = obj.get("content") {
2161        if let Some(text) = extract_text_content(content) {
2162            return Some(text);
2163        }
2164    }
2165
2166    if let Some(text) = obj.get("text").and_then(|v| v.as_str()) {
2167        let trimmed = text.trim();
2168        if !trimmed.is_empty() {
2169            return Some(trimmed.to_string());
2170        }
2171    }
2172
2173    if let Some(parts) = obj.get("parts").and_then(|v| v.as_array()) {
2174        let joined = parts
2175            .iter()
2176            .filter_map(|part| part.as_str().map(|s| s.trim().to_string()))
2177            .filter(|part| !part.is_empty())
2178            .collect::<Vec<_>>()
2179            .join("\n");
2180        if !joined.is_empty() {
2181            return Some(joined);
2182        }
2183    }
2184
2185    None
2186}
2187
/// Turn a relative import path into a flat, identifier-like slug.
///
/// Backslashes count as separators. Every character other than ASCII letters,
/// digits, `-`, `_` and the separator becomes `_`; leading and trailing
/// separators are removed and each remaining separator becomes `__`.
fn slugify_import_path(path: &Path) -> String {
    let display = path.to_string_lossy();
    let mut sanitized = String::with_capacity(display.len());

    for ch in display.chars() {
        let mapped = match ch {
            '\\' | '/' => '/',
            '-' | '_' => ch,
            _ if ch.is_ascii_alphanumeric() => ch,
            _ => '_',
        };
        sanitized.push(mapped);
    }

    sanitized.trim_matches('/').replace('/', "__")
}
2203
2204// ── Embedding API ─────────────────────────────────────────────────────────────
2205
2206/// Call the active provider's embedding endpoint synchronously.
2207///
2208/// Nomic-style models use task instruction prefixes:
2209/// - Chunks stored in the index use `"search_document: "` prefix
2210/// - Queries at search time use `"search_query: "` prefix
2211///
2212/// Callers must tolerate `None` and fall back to BM25-only search.
2213fn embed_text_blocking(text: &str, base_url: &str, embed_model: &str) -> Option<Vec<f32>> {
2214    embed_text_with_prefix(text, "search_document", base_url, embed_model)
2215}
2216
2217fn embed_query_blocking(text: &str, base_url: &str, embed_model: &str) -> Option<Vec<f32>> {
2218    embed_text_with_prefix(text, "search_query", base_url, embed_model)
2219}
2220
2221fn embed_text_with_prefix(
2222    text: &str,
2223    task: &str,
2224    base_url: &str,
2225    embed_model: &str,
2226) -> Option<Vec<f32>> {
2227    // Nomic v2 task instruction prefix format: "<task>: <text>"
2228    let prefixed = format!("{}: {}", task, text);
2229    // Truncate to ~8000 chars to stay within typical embedding model limits.
2230    let input = if prefixed.len() > 8000 {
2231        &prefixed[..8000]
2232    } else {
2233        &prefixed
2234    };
2235
2236    let client = reqwest::blocking::Client::builder()
2237        .timeout(std::time::Duration::from_secs(10))
2238        .build()
2239        .ok()?;
2240
2241    let body = serde_json::json!({
2242        "model": embed_model,
2243        "input": input
2244    });
2245
2246    let trimmed = base_url.trim_end_matches('/');
2247    let is_ollama = trimmed.contains("11434");
2248    let url = if is_ollama {
2249        format!("{}/api/embed", trimmed)
2250    } else {
2251        format!("{}/v1/embeddings", trimmed)
2252    };
2253    let resp = client.post(&url).json(&body).send().ok()?;
2254
2255    if !resp.status().is_success() {
2256        return None;
2257    }
2258
2259    let json: serde_json::Value = resp.json().ok()?;
2260    let embedding = if is_ollama {
2261        json["embeddings"][0].as_array()?
2262    } else {
2263        json["data"][0]["embedding"].as_array()?
2264    };
2265    let vec: Vec<f32> = embedding
2266        .iter()
2267        .filter_map(|v| v.as_f64().map(|f| f as f32))
2268        .collect();
2269
2270    if vec.is_empty() {
2271        None
2272    } else {
2273        Some(vec)
2274    }
2275}
2276
2277// ── Vector math ───────────────────────────────────────────────────────────────
2278
/// Cosine of the angle between two equally sized vectors.
///
/// Returns `0.0` when the slices are empty, differ in length, or either one
/// has zero magnitude.
fn cosine_similarity(a: &[f32], b: &[f32]) -> f32 {
    if a.is_empty() || a.len() != b.len() {
        return 0.0;
    }

    let inner_product = |u: &[f32], v: &[f32]| -> f32 {
        u.iter().zip(v.iter()).map(|(x, y)| x * y).sum::<f32>()
    };

    let magnitude_a = inner_product(a, a).sqrt();
    let magnitude_b = inner_product(b, b).sqrt();
    if magnitude_a == 0.0 || magnitude_b == 0.0 {
        return 0.0;
    }

    inner_product(a, b) / (magnitude_a * magnitude_b)
}
2292
/// Serialise a float slice into bytes: four little-endian bytes per value, in order.
fn floats_to_blob(floats: &[f32]) -> Vec<u8> {
    let mut blob = Vec::with_capacity(floats.len() * 4);
    for value in floats {
        blob.extend_from_slice(&value.to_le_bytes());
    }
    blob
}
2296
/// Decode bytes into floats, reading four little-endian bytes per value.
///
/// Trailing bytes that do not fill a complete group of four are ignored.
fn blob_to_floats(blob: &[u8]) -> Vec<f32> {
    let mut floats = Vec::with_capacity(blob.len() / 4);
    for quad in blob.chunks_exact(4) {
        let mut raw = [0u8; 4];
        raw.copy_from_slice(quad);
        floats.push(f32::from_le_bytes(raw));
    }
    floats
}
2302
2303// ── Document extraction ───────────────────────────────────────────────────────
2304
/// Clean up raw text produced by a document extractor.
///
/// Line endings are unified (`\r\n` and lone `\r` become `\n`), then leading
/// and trailing whitespace and NUL characters are stripped.
/// Returns `None` when nothing remains.
fn normalize_extracted_document_text(text: String) -> Option<String> {
    let mut unified = String::with_capacity(text.len());
    let mut chars = text.chars().peekable();
    while let Some(ch) = chars.next() {
        if ch == '\r' {
            // "\r\n" collapses to a single newline; a lone '\r' becomes one too.
            if chars.peek() == Some(&'\n') {
                chars.next();
            }
            unified.push('\n');
        } else {
            unified.push(ch);
        }
    }

    let body = unified.trim_matches(|c: char| c == '\0' || c.is_whitespace());
    (!body.is_empty()).then(|| body.to_string())
}
2317
2318fn extract_pdf_text_with_pdf_extract(path: &std::path::Path) -> Result<Option<String>, String> {
2319    let previous_hook = std::panic::take_hook();
2320    std::panic::set_hook(Box::new(|_| {}));
2321    let result = std::panic::catch_unwind(std::panic::AssertUnwindSafe(|| {
2322        pdf_extract::extract_text(path)
2323    }));
2324    std::panic::set_hook(previous_hook);
2325
2326    match result {
2327        Ok(Ok(text)) => Ok(normalize_extracted_document_text(text)),
2328        Ok(Err(e)) => Err(format!("pdf-extract failed: {}", e)),
2329        Err(payload) => {
2330            let panic_text = if let Some(msg) = payload.downcast_ref::<&str>() {
2331                (*msg).to_string()
2332            } else if let Some(msg) = payload.downcast_ref::<String>() {
2333                msg.clone()
2334            } else {
2335                "unknown parser panic".to_string()
2336            };
2337            Err(format!("pdf-extract panicked: {}", panic_text))
2338        }
2339    }
2340}
2341
2342fn extract_pdf_text_with_lopdf(path: &std::path::Path) -> Result<Option<String>, String> {
2343    let mut doc =
2344        lopdf::Document::load(path).map_err(|e| format!("lopdf could not open PDF: {}", e))?;
2345
2346    if doc.is_encrypted() {
2347        doc.decrypt("")
2348            .map_err(|e| format!("PDF is encrypted and could not be decrypted: {}", e))?;
2349    }
2350
2351    let page_numbers: Vec<u32> = doc.get_pages().keys().copied().collect();
2352    if page_numbers.is_empty() {
2353        return Ok(None);
2354    }
2355
2356    let mut extracted_pages = Vec::new();
2357    let mut page_errors = Vec::new();
2358
2359    for page_number in page_numbers {
2360        match doc.extract_text(&[page_number]) {
2361            Ok(text) => {
2362                if let Some(page_text) = normalize_extracted_document_text(text) {
2363                    extracted_pages.push(page_text);
2364                }
2365            }
2366            Err(e) => page_errors.push(format!("page {page_number}: {e}")),
2367        }
2368    }
2369
2370    if !extracted_pages.is_empty() {
2371        return Ok(Some(extracted_pages.join("\n\n")));
2372    }
2373
2374    if !page_errors.is_empty() {
2375        let sample_errors = page_errors
2376            .into_iter()
2377            .take(3)
2378            .collect::<Vec<_>>()
2379            .join("; ");
2380        return Err(format!(
2381            "lopdf could not extract usable page text ({sample_errors})"
2382        ));
2383    }
2384
2385    Ok(None)
2386}
2387
2388fn extract_pdf_text_inside_helper(path: &std::path::Path) -> Result<Option<String>, String> {
2389    let mut failures = Vec::new();
2390
2391    match extract_pdf_text_with_pdf_extract(path) {
2392        Ok(Some(text)) => return Ok(Some(text)),
2393        Ok(None) => failures.push("pdf-extract found no usable text".to_string()),
2394        Err(e) => failures.push(e),
2395    }
2396
2397    match extract_pdf_text_with_lopdf(path) {
2398        Ok(Some(text)) => return Ok(Some(text)),
2399        Ok(None) => failures.push("lopdf found no usable text".to_string()),
2400        Err(e) => failures.push(e),
2401    }
2402
2403    let detail = failures.into_iter().take(2).collect::<Vec<_>>().join("; ");
2404    Err(format!(
2405        "Could not extract text from PDF. Hematite keeps PDF parsing best-effort so it can stay a lightweight single-binary local coding harness. The file may be scanned/image-only, encrypted, or use unsupported font encoding. Try exporting it to text/markdown or attach page images instead. Detail: {}",
2406        detail
2407    ))
2408}
2409
/// Extract PDF text by re-running the current executable as a helper process.
///
/// The helper is invoked as `<current exe> --pdf-extract-helper <path>` with
/// stdin closed and stdout/stderr captured. On a successful exit its stdout is
/// the extracted text (`Ok(None)` when that is blank). On a failing exit its
/// trimmed stderr becomes the error, or a generic message when stderr is empty.
fn extract_pdf_text(path: &std::path::Path) -> Result<Option<String>, String> {
    let helper_exe = match std::env::current_exe() {
        Ok(exe) => exe,
        Err(e) => {
            return Err(format!(
                "Could not locate Hematite executable for PDF helper: {}",
                e
            ))
        }
    };

    let mut command = std::process::Command::new(helper_exe);
    command
        .arg("--pdf-extract-helper")
        .arg(path)
        .stdin(std::process::Stdio::null())
        .stdout(std::process::Stdio::piped())
        .stderr(std::process::Stdio::piped());
    let output = command
        .output()
        .map_err(|e| format!("Could not launch PDF helper: {}", e))?;

    if output.status.success() {
        let text = String::from_utf8(output.stdout)
            .map_err(|e| format!("PDF helper returned non-UTF8 text: {}", e))?;
        let has_content = !text.trim().is_empty();
        return Ok(has_content.then_some(text));
    }

    let stderr = String::from_utf8_lossy(&output.stderr);
    let message = stderr.trim();
    Err(if message.is_empty() {
        "PDF extraction failed.".to_string()
    } else {
        message.to_string()
    })
}
2439
2440pub fn run_pdf_extract_helper(path: &std::path::Path) -> i32 {
2441    match extract_pdf_text_inside_helper(path) {
2442        Ok(Some(text)) => {
2443            use std::io::Write;
2444            let mut stdout = std::io::stdout();
2445            if stdout.write_all(text.as_bytes()).is_ok() {
2446                0
2447            } else {
2448                let _ = writeln!(
2449                    std::io::stderr(),
2450                    "PDF helper could not write extracted text."
2451                );
2452                1
2453            }
2454        }
2455        Ok(None) => {
2456            eprintln!(
2457                "Could not extract text from PDF. Hematite keeps PDF parsing best-effort so it can stay a lightweight single-binary local coding harness. The file appears to contain no usable embedded text. Try exporting it to text/markdown or attach page images instead."
2458            );
2459            1
2460        }
2461        Err(e) => {
2462            eprintln!("{}", e);
2463            1
2464        }
2465    }
2466}
2467
2468/// Extract text from any supported document type (PDF, markdown, plain text).
2469/// Used by /attach for one-shot context injection.
2470pub fn extract_document_text(path: &std::path::Path) -> Result<String, String> {
2471    let ext = path
2472        .extension()
2473        .and_then(|e| e.to_str())
2474        .unwrap_or("")
2475        .to_lowercase();
2476    match ext.as_str() {
2477        "pdf" => {
2478            let text = extract_pdf_text(path)?.ok_or_else(|| {
2479                "PDF contains no extractable text — it may be scanned/image-only. \
2480                     Try attaching page screenshots with /image instead."
2481                    .to_string()
2482            })?;
2483            pdf_quality_check(text)
2484        }
2485        _ => std::fs::read_to_string(path).map_err(|e| format!("Could not read file: {e}")),
2486    }
2487}
2488
/// Sanity-check text extracted from a PDF before it is handed to the model.
///
/// Two heuristics are applied to the trimmed text:
/// - fewer than 150 bytes is treated as "nothing useful extracted";
/// - a space share below 4% of all non-newline characters is treated as
///   garbled output with merged words.
///
/// Returns the original (untrimmed) text when both checks pass, otherwise an
/// explanatory error message for the user.
fn pdf_quality_check(text: String) -> Result<String, String> {
    let body = text.trim();

    let extracted_len = body.len();
    if extracted_len < 150 {
        return Err(format!(
            "PDF extracted only {} characters — likely a scanned or image-only PDF, \
             or uses unsupported custom fonts. Try attaching page screenshots with /image instead.",
            extracted_len
        ));
    }

    // One pass: count spaces and all characters that are not line breaks.
    let mut counted = 0usize;
    let mut blanks = 0usize;
    for ch in body.chars() {
        match ch {
            '\n' | '\r' => {}
            ' ' => {
                counted += 1;
                blanks += 1;
            }
            _ => counted += 1,
        }
    }
    let blank_share = if counted == 0 {
        0.0
    } else {
        blanks as f32 / counted as f32
    };

    // Ordinary prose sits well above this threshold.
    if blank_share < 0.04 {
        return Err(
            "PDF text extraction produced garbled output — words are merged with no spaces. \
             This usually means the PDF uses custom embedded fonts (common with academic publishers \
             like EBSCO, Elsevier, Springer). \
             Try a PDF exported from Word, Google Docs, or LaTeX, \
             or attach page screenshots with /image instead.".to_string()
        );
    }

    Ok(text)
}
2527
2528// ── Chunking strategies ───────────────────────────────────────────────────────
2529
2530/// Dispatch to the correct chunking strategy based on file extension.
2531fn chunk_by_symbols(ext: &str, text: &str) -> Vec<String> {
2532    if ext == "rs" {
2533        chunk_rust_symbols(text)
2534    } else {
2535        chunk_paragraphs(text)
2536    }
2537}
2538
2539/// Chunk Rust source at top-level item boundaries.
2540///
2541/// Detects lines at column 0 that start a Rust declaration keyword, flushes
2542/// the accumulated buffer, then moves any trailing doc-comments / attributes
2543/// forward so they stay with the item they annotate.
2544///
2545/// Items larger than 3000 chars (e.g. large impl blocks) are further split
2546/// by sliding window so no single chunk blows the retrieval budget.
2547fn chunk_rust_symbols(text: &str) -> Vec<String> {
2548    const ITEM_STARTS: &[&str] = &[
2549        "pub fn ",
2550        "pub async fn ",
2551        "pub unsafe fn ",
2552        "async fn ",
2553        "unsafe fn ",
2554        "fn ",
2555        "pub impl",
2556        "impl ",
2557        "pub struct ",
2558        "struct ",
2559        "pub enum ",
2560        "enum ",
2561        "pub trait ",
2562        "trait ",
2563        "pub mod ",
2564        "mod ",
2565        "pub type ",
2566        "type ",
2567        "pub const ",
2568        "const ",
2569        "pub static ",
2570        "static ",
2571    ];
2572
2573    let lines: Vec<&str> = text.lines().collect();
2574    let mut chunks: Vec<String> = Vec::new();
2575    let mut current: Vec<&str> = Vec::new();
2576
2577    for &line in &lines {
2578        let top_level = !line.starts_with(' ') && !line.starts_with('\t');
2579        let is_item = top_level && ITEM_STARTS.iter().any(|s| line.starts_with(s));
2580
2581        if is_item && !current.is_empty() {
2582            // Scan backward to find where trailing doc-comments / attributes start —
2583            // move them to the new chunk so they land with their item.
2584            let mut split = current.len();
2585            while split > 0 {
2586                let prev = current[split - 1].trim();
2587                if prev.starts_with("///")
2588                    || prev.starts_with("//!")
2589                    || prev.starts_with("#[")
2590                    || prev.is_empty()
2591                {
2592                    split -= 1;
2593                } else {
2594                    break;
2595                }
2596            }
2597            let body = current[..split].join("\n");
2598            if !body.trim().is_empty() {
2599                chunks.push(body);
2600            }
2601            current = current[split..].to_vec();
2602        }
2603        current.push(line);
2604    }
2605    if !current.is_empty() {
2606        let body = current.join("\n");
2607        if !body.trim().is_empty() {
2608            chunks.push(body);
2609        }
2610    }
2611
2612    // Subdivide any oversized blocks (e.g. long impl blocks with many methods).
2613    let mut result = Vec::new();
2614    for chunk in chunks {
2615        if chunk.len() > 3000 {
2616            result.extend(sliding_window_chunks(&chunk, 2000, 200));
2617        } else {
2618            result.push(chunk);
2619        }
2620    }
2621    result
2622}
2623
2624/// Chunk non-Rust text at paragraph boundaries (double newline).
2625fn chunk_paragraphs(text: &str) -> Vec<String> {
2626    let mut result: Vec<String> = Vec::new();
2627    let mut current = String::new();
2628
2629    for para in text.split("\n\n") {
2630        if current.len() + para.len() + 2 > 2000 {
2631            if !current.trim().is_empty() {
2632                result.push(current.clone());
2633            }
2634            current = para.to_string();
2635        } else {
2636            if !current.is_empty() {
2637                current.push_str("\n\n");
2638            }
2639            current.push_str(para);
2640        }
2641    }
2642    if !current.trim().is_empty() {
2643        result.push(current);
2644    }
2645
2646    let mut final_result = Vec::new();
2647    for chunk in result {
2648        if chunk.len() > 2000 {
2649            final_result.extend(sliding_window_chunks(&chunk, 2000, 200));
2650        } else {
2651            final_result.push(chunk);
2652        }
2653    }
2654    final_result
2655}
2656
/// Classic sliding-window fallback for oversized blocks.
///
/// Cuts `text` into windows of at most `chunk_size` characters (Unicode scalar
/// values, not bytes), so a window never splits a multi-byte character. Each
/// window starts `chunk_size - overlap` characters after the previous one,
/// meaning consecutive windows share `overlap` characters. The final window
/// ends exactly at the end of the text and may be shorter than `chunk_size`.
///
/// Degenerate parameters are clamped instead of panicking or looping forever:
/// a `chunk_size` of 0 is treated as 1, and when `overlap >= chunk_size` the
/// window still advances by one character per step.
///
/// Returns an empty vector for empty `text`.
fn sliding_window_chunks(text: &str, chunk_size: usize, overlap: usize) -> Vec<String> {
    let chars: Vec<char> = text.chars().collect();

    // Guarantee forward progress: the window holds at least one character and
    // the start index moves by at least one on every iteration.
    let window = chunk_size.max(1);
    let step = window.saturating_sub(overlap).max(1);

    let mut result = Vec::new();
    let mut start = 0;
    while start < chars.len() {
        let end = start.saturating_add(window).min(chars.len());
        result.push(chars[start..end].iter().collect::<String>());
        if end == chars.len() {
            break;
        }
        start += step;
    }
    result
}
hematite/memory/vein.rs

hematite/memory/
vein.rs