hematite/memory/
vein.rs

1use rusqlite::{params, Connection};
2use serde::Deserialize;
3use serde_json::Value;
4use std::collections::{HashMap, HashSet};
5use std::path::Path;
6
7/// "The Vein" — local RAG memory engine backed by SQLite FTS5 + semantic embeddings.
8///
9/// Two retrieval modes, used together:
10///
11/// **BM25 (always available)**
12/// Full-text search via SQLite FTS5 with Porter-stemming. Fast, zero extra GPU cost,
13/// works as the fallback when the embedding model isn't loaded.
14///
15/// **Semantic (when LM Studio has an embedding model loaded)**
16/// Calls `/v1/embeddings` (nomic-embed-text-v1.5 or similar) to produce 768-dim float
17/// vectors for each chunk. At search time the query is embedded and cosine similarity
18/// selects the most conceptually relevant chunks — even when no keywords match.
19///
20/// Hybrid search runs BM25 and semantic in parallel, deduplicates by path, and returns
21/// the top-k results ranked by combined score. Semantic results score higher when the
22/// embedding model is available; BM25 fills the gap when it isn't.
23///
24/// Indexing is incremental: files are re-indexed only when their mtime changes. Embedding
25/// vectors are stored in a separate `chunks_vec` SQLite table so they survive re-runs
26/// without hitting the embedding API again.
27pub struct Vein {
28    db: std::sync::Arc<std::sync::Mutex<Connection>>,
29    /// Base URL of the LLM provider, used for the embeddings endpoint.
30    base_url: String,
31}
32
33// SAFETY: rusqlite::Connection is !Send by default, but we wrap it in Arc<Mutex>
34// and ensure all accesses are serialized by the mutex.
35unsafe impl Send for Vein {}
36unsafe impl Sync for Vein {}
37
/// One retrieved chunk together with the metadata used for ranking.
#[derive(Debug, Clone)]
pub struct SearchResult {
    /// Path the chunk was indexed under.
    pub path: String,
    /// Text of the chunk.
    pub content: String,
    /// Relevance score; a higher value means a more relevant chunk.
    pub score: f32,
    /// Subsystem room recorded for the path (e.g. "agent", "ui", "tools").
    pub room: String,
    /// Last-modified timestamp stored in `chunks_meta` (unix seconds).
    pub last_modified: i64,
    /// Memory type assigned at index time: "decision", "problem", "milestone",
    /// "preference", or "" when the chunk is unclassified (source/doc chunks).
    pub memory_type: String,
}
52
/// A file entry paired with its heat counter.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct VeinHotFile {
    /// Path of the file.
    pub path: String,
    /// Heat counter — presumably the `heat` column of the `file_heat` table;
    /// NOTE(review): confirm against the code that builds this value.
    pub heat: i64,
    /// Last-modified timestamp of the file.
    pub last_modified: i64,
    /// Subsystem room the file belongs to.
    pub room: String,
}
60
61#[derive(Debug, Clone)]
62pub struct VeinInspectionSnapshot {
63    pub indexed_source_files: usize,
64    pub indexed_docs: usize,
65    pub indexed_session_exchanges: usize,
66    pub embedded_source_doc_chunks: usize,
67    pub has_any_embeddings: bool,
68    pub active_room: Option<String>,
69    pub hot_files: Vec<VeinHotFile>,
70    pub l1_ready: bool,
71}
72
73#[derive(Debug, Default)]
74struct QuerySignals {
75    exact_phrases: Vec<String>,
76    standout_terms: Vec<String>,
77    historical_memory_hint: bool,
78    temporal_reference: Option<TemporalReference>,
79    /// Memory type the query is asking about — boosts matching chunks.
80    /// e.g. "what did we decide" → "decision", "what was the bug" → "problem"
81    query_memory_type: Option<&'static str>,
82}
83
/// A point in time a query refers to, with a tolerance window around it.
///
/// NOTE(review): units are presumably unix seconds, matching the other
/// timestamps in this file — confirm where the value is produced.
#[derive(Debug, Clone, Copy)]
struct TemporalReference {
    /// Timestamp the query points at.
    target_ts: i64,
    /// Width of the window around `target_ts`, in seconds.
    window_secs: i64,
}
89
90#[derive(Debug, Deserialize)]
91struct SessionReport {
92    #[serde(default)]
93    session_start: String,
94    #[serde(default)]
95    transcript: Vec<SessionTranscriptEntry>,
96}
97
98#[derive(Debug, Deserialize)]
99struct SessionTranscriptEntry {
100    #[serde(default)]
101    speaker: String,
102    #[serde(default)]
103    text: String,
104}
105
/// One session exchange prepared for indexing: the path it is stored under,
/// its timestamp, and the text to index.
#[derive(Debug)]
struct SessionExchange {
    /// Index path assigned to this exchange.
    path: String,
    /// Timestamp recorded for this exchange.
    last_modified: i64,
    /// Text content of the exchange.
    content: String,
}
112
/// Classification of a transcript speaker.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum SessionSpeakerKind {
    /// The human side of the conversation.
    User,
    /// The assistant side of the conversation.
    Assistant,
    /// A speaker whose lines are to be ignored.
    Ignore,
}
119
/// Derive a subsystem "room" label from a file path.
///
/// The path is lower-cased and backslashes are normalised to `/` before
/// matching. Every rule in the table below proposes a room with a fixed
/// priority; among the rules that match, the one with the highest priority
/// wins. When no rule matches, the first directory component is used, and a
/// path without any directory component (a bare file name, with or without an
/// extension) maps to `"root"`.
///
/// Returns the room name as an owned `String`.
pub fn detect_room(path: &str) -> String {
    let lower = path.to_lowercase().replace('\\', "/");
    let filename = lower.rsplit('/').next().unwrap_or(&lower);
    // Only the text after the last dot is an extension; a dot-less name such as
    // "makefile" has none (it must not be mistaken for its own extension).
    let ext = filename.rsplit_once('.').map_or("", |(_, e)| e);

    // True when `segment` is the whole path or one of its directory components.
    let is_component = |segment: &str| {
        lower == segment
            || lower.starts_with(&format!("{segment}/"))
            || lower.contains(&format!("/{segment}/"))
    };

    // (room, priority, rule matched)
    let rules = [
        (
            "session",
            100,
            lower.starts_with("session/")
                || lower.starts_with(".hematite/reports/")
                || lower.starts_with(".hematite/imports/")
                || is_component("reports")
                || is_component("imports"),
        ),
        (
            "docs",
            80,
            lower.starts_with(".hematite/docs/")
                || is_component("docs")
                || matches!(filename, "readme.md" | "claude.md" | ".hematite.md")
                || matches!(ext, "md" | "markdown" | "pdf" | "rst"),
        ),
        (
            "tests",
            85,
            is_component("tests")
                || filename.contains("diagnostic")
                || filename.ends_with("_test.rs")
                || filename.ends_with(".test.ts"),
        ),
        (
            "automation",
            84,
            lower.starts_with(".github/workflows/")
                || is_component("workflows")
                || filename == ".pre-commit-config.yaml"
                || filename == ".pre-commit-config.yml"
                || filename.contains("hook"),
        ),
        (
            "release",
            82,
            lower.starts_with("installer/")
                || lower.starts_with("dist/")
                || lower.starts_with("scripts/package-")
                || filename.contains("release")
                || filename.contains("bump-version")
                || ext == "iss",
        ),
        (
            "config",
            76,
            matches!(
                filename,
                "cargo.toml"
                    | "cargo.lock"
                    | "package.json"
                    | "pnpm-lock.yaml"
                    | "yarn.lock"
                    | "bun.lock"
                    | "bun.lockb"
                    | "pyproject.toml"
                    | "setup.py"
                    | "go.mod"
                    | "pom.xml"
                    | "build.gradle"
                    | "build.gradle.kts"
                    | "cmakelists.txt"
                    | ".gitignore"
                    | "settings.json"
                    | "mcp_servers.json"
            ) || filename.ends_with(".sln")
                || filename.ends_with(".csproj")
                || filename.contains("config"),
        ),
        (
            "ui",
            70,
            is_component("ui")
                || matches!(
                    filename,
                    "tui.rs" | "voice.rs" | "hatch.rs" | "gpu_monitor.rs"
                ),
        ),
        (
            "memory",
            72,
            is_component("memory") || matches!(filename, "vein.rs" | "deep_reflect.rs"),
        ),
        (
            "tools",
            68,
            is_component("tools")
                || matches!(
                    filename,
                    "verify_build.rs"
                        | "host_inspect.rs"
                        | "shell.rs"
                        | "code_sandbox.rs"
                        | "project_map.rs"
                        | "runtime_trace.rs"
                ),
        ),
        (
            "integration",
            67,
            filename.contains("mcp")
                || filename.contains("lsp")
                || lower.contains("/mcp/")
                || lower.contains("/lsp/"),
        ),
        (
            "runtime",
            66,
            matches!(filename, "main.rs" | "runtime.rs" | "inference.rs")
                || filename.contains("startup")
                || filename.contains("runtime"),
        ),
        ("agent", 60, is_component("agent")),
        ("libs", 58, lower.starts_with("libs/") || is_component("libs")),
        (
            "scripts",
            55,
            lower.starts_with("scripts/") || is_component("scripts"),
        ),
    ];

    // Keep the matching rule with the strictly highest priority (first one wins a tie).
    let mut best: Option<(&str, i32)> = None;
    for (room, score, matched) in rules {
        if matched && best.map_or(true, |(_, top)| score > top) {
            best = Some((room, score));
        }
    }
    if let Some((room, _)) = best {
        return room.to_string();
    }

    // No rule matched: fall back to the first *directory* component. A path with
    // no '/' is a root-level file, so its own name must not become the room.
    lower
        .split_once('/')
        .map(|(first, _)| first)
        .filter(|dir| !dir.is_empty() && !dir.contains('.'))
        .unwrap_or("root")
        .to_string()
}
280
/// Classify session memory text into a semantic type using cheap,
/// case-insensitive substring checks (no regex engine involved).
///
/// Returns one of `"decision"`, `"problem"`, `"milestone"`, `"preference"`, or
/// `""` when nothing matches. Categories are tested in that order and the first
/// category with a hit wins.
///
/// Applied only to session/import chunks at index time; source and doc chunks
/// always get `""`. Re-ranking uses the type to boost chunks whose type matches
/// the intent implied by the query.
pub fn detect_memory_type(text: &str) -> &'static str {
    // Decision markers — architectural choices, agreed approaches, "let's use X".
    const DECISION: &[&str] = &[
        "let's use ",
        "we'll use ",
        "decided to ",
        "going with ",
        "we agreed ",
        "the plan is",
        "we're going to",
        "switching to",
        "we chose",
        "final decision",
        "we settled on",
        "agreed on",
        "we decided",
    ];
    // Problem markers — bugs, errors, failures, blockers.
    const PROBLEM: &[&str] = &[
        "bug fixed",
        "bug was",
        "the issue was",
        "root cause",
        "error was",
        "turned out to be",
        "the fix was",
        "was caused by",
        "broken because",
        "fixed by",
        "the problem was",
        "found the bug",
        "port conflict",
        "crash",
        "panicked",
        "segfault",
        "out of memory",
    ];
    // Milestone markers — shipped, completed, working.
    const MILESTONE: &[&str] = &[
        "now working",
        "successfully",
        "shipped",
        "deployed",
        "it works",
        "tests pass",
        "all green",
        "breakthrough",
        "finally got",
        "got it working",
        "completed",
        "finished",
        "done with",
        "landed",
    ];
    // Preference markers — personal/operator preferences for style or workflow.
    const PREFERENCE: &[&str] = &[
        "i prefer",
        "i like",
        "i don't like",
        "i want",
        "always use",
        "never use",
        "i usually",
        "my preference",
        "keep it",
        "avoid using",
    ];

    let lower = text.to_lowercase();
    let has_any = |patterns: &[&str]| patterns.iter().any(|p| lower.contains(*p));
    // "oom" is matched only at the start of a word ("oom", "oom-killer",
    // "oomkilled"); a plain substring test would also fire on "room", "zoom", …
    let mentions_oom = || {
        lower
            .split(|c: char| !c.is_alphanumeric())
            .any(|word| word.starts_with("oom"))
    };

    if has_any(DECISION) {
        "decision"
    } else if has_any(PROBLEM) || mentions_oom() {
        "problem"
    } else if has_any(MILESTONE) {
        "milestone"
    } else if has_any(PREFERENCE) {
        "preference"
    } else {
        ""
    }
}
382
383impl Vein {
    // NOTE(review): the code using these limits is outside this section; the
    // descriptions below are read off the names and should be confirmed there.

    /// Presumably the maximum number of recent session reports to index.
    const SESSION_REPORT_LIMIT: usize = 5;
    /// Presumably the maximum number of turns taken from one session.
    const SESSION_TURN_LIMIT: usize = 50;
    /// Presumably the maximum number of imported export files to index.
    const IMPORT_FILE_LIMIT: usize = 12;
    /// Presumably the size cap for an imported file: 10 * 1024 * 1024 bytes.
    const IMPORT_MAX_BYTES: u64 = 10 * 1024 * 1024;
388
389    pub fn new<P: AsRef<Path>>(
390        db_path: P,
391        base_url: String,
392    ) -> Result<Self, Box<dyn std::error::Error>> {
393        let db = Connection::open(db_path)?;
394
395        // WAL mode for better concurrent read performance.
396        db.execute_batch("PRAGMA journal_mode=WAL; PRAGMA synchronous=NORMAL;")?;
397
398        // chunks_meta: tracks last-modified time per path for incremental indexing.
399        // chunks_fts:  BM25 full-text index of all code chunks.
400        // chunks_vec:  semantic embedding vectors, keyed by (path, chunk_idx).
401        db.execute_batch(
402            "CREATE TABLE IF NOT EXISTS chunks_meta (
403                path TEXT PRIMARY KEY,
404                last_modified INTEGER NOT NULL,
405                room TEXT NOT NULL DEFAULT 'root'
406            );
407            CREATE VIRTUAL TABLE IF NOT EXISTS chunks_fts USING fts5(
408                path UNINDEXED,
409                content,
410                tokenize='porter ascii'
411            );
412            CREATE TABLE IF NOT EXISTS chunks_vec (
413                path TEXT NOT NULL,
414                chunk_idx INTEGER NOT NULL,
415                embedding BLOB NOT NULL,
416                PRIMARY KEY (path, chunk_idx)
417            );
418            CREATE TABLE IF NOT EXISTS file_heat (
419                path TEXT PRIMARY KEY,
420                heat INTEGER NOT NULL DEFAULT 0,
421                last_edit INTEGER NOT NULL DEFAULT 0
422            );",
423        )?;
424
425        // Schema migrations — safe to run on every open (IF NOT EXISTS / ignored if col exists).
426        let _ = db
427            .execute_batch("ALTER TABLE chunks_meta ADD COLUMN room TEXT NOT NULL DEFAULT 'root';");
428        let _ = db.execute_batch(
429            "ALTER TABLE file_heat ADD COLUMN last_edit INTEGER NOT NULL DEFAULT 0;",
430        );
431        let _ = db.execute_batch(
432            "ALTER TABLE chunks_meta ADD COLUMN memory_type TEXT NOT NULL DEFAULT '';",
433        );
434
435        Ok(Self {
436            db: std::sync::Arc::new(std::sync::Mutex::new(db)),
437            base_url,
438        })
439    }
440
441    // ── Indexing ──────────────────────────────────────────────────────────────
442
443    /// Index a single file for BM25 search. Skip if mtime hasn't changed.
444    /// Returns the chunks that were written (empty if file was unchanged).
445    pub fn index_document(
446        &mut self,
447        path: &str,
448        last_modified: i64,
449        full_text: &str,
450    ) -> Result<Vec<String>, Box<dyn std::error::Error>> {
451        let room = detect_room(path);
452        let ext = std::path::Path::new(path)
453            .extension()
454            .and_then(|e| e.to_str())
455            .unwrap_or("");
456        let chunks = chunk_by_symbols(ext, full_text);
457        // Tag session memory with semantic type; source/doc chunks leave it empty.
458        let memory_type = if room == "session" {
459            detect_memory_type(full_text)
460        } else {
461            ""
462        };
463        self.index_chunks_with_room_and_type(path, last_modified, &room, memory_type, &chunks)
464    }
465
466    fn index_chunks_with_room_and_type(
467        &mut self,
468        path: &str,
469        last_modified: i64,
470        room: &str,
471        memory_type: &str,
472        chunks: &[String],
473    ) -> Result<Vec<String>, Box<dyn std::error::Error>> {
474        let db = self.db.lock().unwrap();
475        let existing: Option<i64> = db
476            .query_row(
477                "SELECT last_modified FROM chunks_meta WHERE path = ?1",
478                params![path],
479                |r| r.get(0),
480            )
481            .ok();
482
483        if let Some(ts) = existing {
484            if ts >= last_modified {
485                return Ok(Vec::new()); // unchanged — skip
486            }
487        }
488
489        // Evict stale BM25 chunks, stale embedding vectors, then update metadata.
490        db.execute("DELETE FROM chunks_fts WHERE path = ?1", params![path])?;
491        db.execute("DELETE FROM chunks_vec WHERE path = ?1", params![path])?;
492        db.execute(
493            "INSERT OR REPLACE INTO chunks_meta (path, last_modified, room, memory_type) VALUES (?1, ?2, ?3, ?4)",
494            params![path, last_modified, room, memory_type],
495        )?;
496
497        drop(db);
498
499        let mut db = self.db.lock().unwrap();
500        let tx = db.transaction()?;
501        {
502            let mut stmt = tx.prepare("INSERT INTO chunks_fts (path, content) VALUES (?1, ?2)")?;
503            for chunk in chunks {
504                stmt.execute(params![path, chunk.as_str()])?;
505            }
506        }
507        tx.commit()?;
508
509        Ok(chunks.to_vec())
510    }
511
512    /// Embed a set of chunks for one file and store the vectors.
513    /// Called after `index_document` returns new chunks.
514    /// Silently skips if the embedding model is unavailable.
515    pub fn embed_and_store_chunks(&self, path: &str, chunks: &[String]) {
516        for (idx, chunk) in chunks.iter().enumerate() {
517            if let Some(vec) = embed_text_blocking(chunk, &self.base_url) {
518                let blob = floats_to_blob(&vec);
519                let db = self.db.lock().unwrap();
520                let _ = db.execute(
521                    "INSERT OR REPLACE INTO chunks_vec (path, chunk_idx, embedding) VALUES (?1, ?2, ?3)",
522                    params![path, idx as i64, blob],
523                );
524            }
525        }
526    }
527
528    // ── Search ────────────────────────────────────────────────────────────────
529
530    /// BM25-ranked full-text search via FTS5 MATCH.
531    pub fn search_bm25(
532        &self,
533        query: &str,
534        limit: usize,
535    ) -> Result<Vec<SearchResult>, Box<dyn std::error::Error>> {
536        // Strip common English stopwords so FTS5 MATCH gets meaningful tokens only.
537        // FTS5 uses implicit AND by default — passing stopwords like "how", "does",
538        // "the" causes zero results because source code never contains those phrases.
539        const STOPWORDS: &[&str] = &[
540            "how", "does", "do", "did", "what", "where", "when", "why", "which", "who", "is",
541            "are", "was", "were", "be", "been", "being", "have", "has", "had", "a", "an", "the",
542            "and", "or", "but", "in", "on", "at", "to", "for", "of", "with", "by", "from", "get",
543            "gets", "got", "work", "works", "make", "makes", "use", "uses", "into", "that", "this",
544            "it", "its",
545        ];
546
547        let safe_query: String = query
548            .chars()
549            .map(|c| {
550                if c.is_alphanumeric() || c == ' ' || c == '_' {
551                    c
552                } else {
553                    ' '
554                }
555            })
556            .collect();
557
558        // Build an OR query from non-stopword tokens so any relevant term matches.
559        let fts_query = safe_query
560            .split_whitespace()
561            .filter(|w| w.len() >= 3 && !STOPWORDS.contains(&w.to_lowercase().as_str()))
562            .collect::<Vec<_>>()
563            .join(" OR ");
564
565        if fts_query.is_empty() {
566            return Ok(Vec::new());
567        }
568
569        let db = self.db.lock().unwrap();
570        let mut stmt = db.prepare(
571            "SELECT chunks_fts.path, chunks_fts.content, rank, cm.last_modified, cm.room, cm.memory_type
572             FROM chunks_fts
573             JOIN chunks_meta cm ON cm.path = chunks_fts.path
574             WHERE chunks_fts MATCH ?1
575             ORDER BY rank
576             LIMIT ?2",
577        )?;
578
579        let results: Vec<SearchResult> = stmt
580            .query_map(params![fts_query, limit as i64], |row| {
581                Ok(SearchResult {
582                    path: row.get(0)?,
583                    content: row.get(1)?,
584                    score: -(row.get::<_, f64>(2).unwrap_or(0.0) as f32),
585                    last_modified: row.get(3)?,
586                    room: row.get(4)?,
587                    memory_type: row.get::<_, String>(5).unwrap_or_default(),
588                })
589            })?
590            .filter_map(|r| r.ok())
591            .collect();
592
593        Ok(results)
594    }
595
596    /// Semantic search: embed the query, cosine-similarity against all stored vectors.
597    /// Returns empty if the embedding model isn't loaded.
598    pub fn search_semantic(&self, query: &str, limit: usize) -> Vec<SearchResult> {
599        let query_vec = match embed_query_blocking(query, &self.base_url) {
600            Some(v) => v,
601            None => return Vec::new(),
602        };
603
604        // Load all stored embeddings.
605        let rows: Vec<(String, i64, Vec<u8>, i64, String, String)> = {
606            let db = self.db.lock().unwrap();
607            let mut stmt = match db.prepare(
608                "SELECT cv.path, cv.chunk_idx, cv.embedding, cm.last_modified, cm.room, cm.memory_type
609                 FROM chunks_vec cv
610                 JOIN chunks_meta cm ON cm.path = cv.path",
611            ) {
612                Ok(s) => s,
613                Err(_) => return Vec::new(),
614            };
615            stmt.query_map([], |row| {
616                Ok((
617                    row.get::<_, String>(0)?,
618                    row.get::<_, i64>(1)?,
619                    row.get::<_, Vec<u8>>(2)?,
620                    row.get::<_, i64>(3)?,
621                    row.get::<_, String>(4)?,
622                    row.get::<_, String>(5).unwrap_or_default(),
623                ))
624            })
625            .ok()
626            .map(|rows| rows.filter_map(|r| r.ok()).collect())
627            .unwrap_or_default()
628        };
629
630        if rows.is_empty() {
631            return Vec::new();
632        }
633
634        // Score each chunk.
635        let mut scored: Vec<(f32, String, i64, i64, String, String)> = rows
636            .into_iter()
637            .filter_map(|(path, idx, blob, last_modified, room, memory_type)| {
638                let vec = blob_to_floats(&blob);
639                let sim = cosine_similarity(&query_vec, &vec);
640                Some((sim, path, idx, last_modified, room, memory_type))
641            })
642            .collect();
643
644        scored.sort_by(|a, b| b.0.partial_cmp(&a.0).unwrap_or(std::cmp::Ordering::Equal));
645        scored.truncate(limit);
646
647        // Fetch the content for the top chunks.
648        let db = self.db.lock().unwrap();
649        scored
650            .into_iter()
651            .filter_map(|(score, path, idx, last_modified, room, memory_type)| {
652                let content: Option<String> = db
653                    .query_row(
654                        "SELECT content FROM chunks_fts WHERE path = ?1 LIMIT 1 OFFSET ?2",
655                        params![path, idx],
656                        |r| r.get(0),
657                    )
658                    .ok();
659                content.map(|c| SearchResult {
660                    path,
661                    content: c,
662                    score,
663                    room,
664                    last_modified,
665                    memory_type,
666                })
667            })
668            .collect()
669    }
670
671    /// Hybrid search: BM25 + semantic, deduplicated and re-ranked.
672    ///
673    /// Semantic results are preferred (they score higher) when the embedding model
674    /// is available. BM25 fills in or takes over when it isn't.
675    /// Results from the active room (hottest subsystem by edit count) get a
676    /// small boost so the model gravitates toward what's currently being worked on.
677    pub fn search_context(
678        &self,
679        query: &str,
680        limit: usize,
681    ) -> Result<Vec<SearchResult>, Box<dyn std::error::Error>> {
682        let candidate_limit = (limit.max(1) * 4).max(12);
683        let bm25 = self.search_bm25(query, candidate_limit).unwrap_or_default();
684        let semantic = self.search_semantic(query, candidate_limit);
685        let signals = QuerySignals::from_query(query);
686
687        // Determine the active room from heat scores.
688        let active_room = self.active_room();
689
690        // Merge: semantic results win ties (scored 1.0–2.0 range after boost).
691        // BM25 results land in 0.0–1.0 range.
692        let mut merged_by_path: HashMap<String, SearchResult> = HashMap::new();
693
694        for r in semantic {
695            let score = reranked_score(&signals, active_room.as_deref(), &r, true);
696            merge_scored_result(&mut merged_by_path, SearchResult { score, ..r });
697        }
698
699        for r in bm25 {
700            let score = reranked_score(&signals, active_room.as_deref(), &r, false);
701            merge_scored_result(&mut merged_by_path, SearchResult { score, ..r });
702        }
703
704        let mut merged: Vec<SearchResult> = merged_by_path.into_values().collect();
705        merged.sort_by(|a, b| {
706            b.score
707                .partial_cmp(&a.score)
708                .unwrap_or(std::cmp::Ordering::Equal)
709        });
710        merged.truncate(limit);
711        Ok(merged)
712    }
713
714    /// Returns the room with the highest total heat (most edited subsystem).
715    /// Used to bias retrieval toward what the user is actively working on.
716    fn active_room(&self) -> Option<String> {
717        let db = self.db.lock().unwrap();
718        db.query_row(
719            "SELECT cm.room, SUM(fh.heat) as total
720             FROM file_heat fh
721             JOIN chunks_meta cm ON cm.path = fh.path
722             GROUP BY cm.room
723             ORDER BY total DESC
724             LIMIT 1",
725            [],
726            |row| row.get::<_, String>(0),
727        )
728        .ok()
729    }
730
731    // ── Project Indexing ──────────────────────────────────────────────────────
732
733    /// Walk the entire project and index all source files (BM25 + embeddings).
734    ///
735    /// Skips: `target/`, `.git/`, `node_modules/`, `.hematite/`, files > 512 KB.
736    /// Also indexes `.hematite/docs/` — the designated reference document drop folder.
737    /// Returns the number of files processed (unchanged files are fast-pathed).
738    pub fn index_project(&mut self) -> usize {
739        let root = crate::tools::file_ops::workspace_root();
740        let mut count = 0usize;
741
742        const INDEXABLE: &[&str] = &[
743            "rs", "toml", "md", "json", "ts", "tsx", "js", "py", "go", "c", "cpp", "h", "yaml",
744            "yml", "txt",
745        ];
746        const SKIP_DIRS: &[&str] = &[
747            "target",
748            ".git",
749            "node_modules",
750            ".hematite",
751            ".hematite_logs",
752        ];
753
754        for entry in walkdir::WalkDir::new(&root)
755            .follow_links(false)
756            .into_iter()
757            .filter_entry(|e| {
758                if e.file_type().is_dir() {
759                    let name = e.file_name().to_string_lossy();
760                    return !SKIP_DIRS.contains(&name.as_ref());
761                }
762                true
763            })
764            .filter_map(|e| e.ok())
765            .filter(|e| e.file_type().is_file())
766        {
767            let path = entry.path();
768            let ext = path.extension().and_then(|e| e.to_str()).unwrap_or("");
769            if !INDEXABLE.contains(&ext) {
770                continue;
771            }
772
773            let Ok(meta) = std::fs::metadata(path) else {
774                continue;
775            };
776            if meta.len() > 512_000 {
777                continue;
778            }
779
780            let mtime = meta
781                .modified()
782                .map(|t| {
783                    t.duration_since(std::time::UNIX_EPOCH)
784                        .unwrap_or_default()
785                        .as_secs() as i64
786                })
787                .unwrap_or(0);
788
789            let rel = path.strip_prefix(&root).unwrap_or(path);
790            let rel_str = rel.to_string_lossy().replace('\\', "/");
791
792            if let Ok(content) = std::fs::read_to_string(path) {
793                match self.index_document(&rel_str, mtime, &content) {
794                    Ok(new_chunks) if !new_chunks.is_empty() => {
795                        count += 1;
796                    }
797                    Ok(_) => {}
798                    Err(_) => {}
799                }
800            }
801        }
802
803        count += self.index_workspace_artifacts(&root);
804
805        count
806    }
807
808    /// Index workspace-local supporting context that should be available even
809    /// outside a real project workspace: `.hematite/docs/`, recent session
810    /// reports stored in `.hematite/reports/`, and imported chat exports in
811    /// `.hematite/imports/`.
812    pub fn index_workspace_artifacts(&mut self, workspace_root: &std::path::Path) -> usize {
813        let mut count = self.index_docs_folder(workspace_root);
814        count += self.index_recent_session_reports(workspace_root);
815        count += self.index_imported_session_exports(workspace_root);
816        self.backfill_missing_embeddings();
817        count
818    }
819
820    /// Index reference documents in `.hematite/docs/`.
821    /// Supports PDF (text extraction), markdown, and plain text.
822    /// Documents are stored with path prefix `docs/filename` so they are
823    /// distinguishable from source files in retrieval results.
824    fn index_docs_folder(&mut self, workspace_root: &std::path::Path) -> usize {
825        let docs_dir = workspace_root.join(".hematite").join("docs");
826        const DOCS_INDEXABLE: &[&str] = &["pdf", "md", "txt", "markdown"];
827        let mut count = 0usize;
828        let mut desired_paths = HashSet::new();
829
830        if docs_dir.exists() {
831            for entry in walkdir::WalkDir::new(&docs_dir)
832                .max_depth(3)
833                .follow_links(false)
834                .into_iter()
835                .filter_map(|e| e.ok())
836                .filter(|e| e.file_type().is_file())
837            {
838                let path = entry.path();
839                let ext = path
840                    .extension()
841                    .and_then(|e| e.to_str())
842                    .unwrap_or("")
843                    .to_lowercase();
844                if !DOCS_INDEXABLE.contains(&ext.as_str()) {
845                    continue;
846                }
847
848                let Ok(meta) = std::fs::metadata(path) else {
849                    continue;
850                };
851                if meta.len() > 50_000_000 {
852                    continue;
853                }
854
855                let mtime = meta
856                    .modified()
857                    .map(|t| {
858                        t.duration_since(std::time::UNIX_EPOCH)
859                            .unwrap_or_default()
860                            .as_secs() as i64
861                    })
862                    .unwrap_or(0);
863
864                let rel = path.strip_prefix(workspace_root).unwrap_or(path);
865                let rel_str = rel.to_string_lossy().replace('\\', "/");
866                desired_paths.insert(rel_str.clone());
867
868                let content = if ext == "pdf" {
869                    extract_pdf_text(path).ok().flatten()
870                } else {
871                    std::fs::read_to_string(path).ok()
872                };
873
874                if let Some(text) = content {
875                    if text.trim().is_empty() {
876                        continue;
877                    }
878                    match self.index_document(&rel_str, mtime, &text) {
879                        Ok(new_chunks) if !new_chunks.is_empty() => {
880                            count += 1;
881                        }
882                        Ok(_) => {}
883                        Err(_) => {}
884                    }
885                }
886            }
887        }
888
889        self.prune_indexed_prefix(".hematite/docs/", &desired_paths);
890        count
891    }
892
893    /// Index the most recent local session reports by exchange pair so prior
894    /// decisions remain searchable across launches without flooding the vein.
895    pub fn index_recent_session_reports(&mut self, workspace_root: &std::path::Path) -> usize {
896        let reports_dir = workspace_root.join(".hematite").join("reports");
897        let mut count = 0usize;
898        let mut desired_paths = HashSet::new();
899
900        if reports_dir.exists() {
901            let mut reports: Vec<std::path::PathBuf> = std::fs::read_dir(&reports_dir)
902                .ok()
903                .into_iter()
904                .flat_map(|entries| entries.filter_map(|entry| entry.ok()))
905                .map(|entry| entry.path())
906                .filter(|path| {
907                    path.is_file()
908                        && path.extension().and_then(|ext| ext.to_str()) == Some("json")
909                        && path
910                            .file_stem()
911                            .and_then(|stem| stem.to_str())
912                            .map(|stem| stem.starts_with("session_"))
913                            .unwrap_or(false)
914                })
915                .collect();
916
917            reports.sort_by(|a, b| {
918                let a_name = a
919                    .file_name()
920                    .and_then(|name| name.to_str())
921                    .unwrap_or_default();
922                let b_name = b
923                    .file_name()
924                    .and_then(|name| name.to_str())
925                    .unwrap_or_default();
926                b_name.cmp(a_name)
927            });
928            reports.truncate(Self::SESSION_REPORT_LIMIT);
929
930            for report_path in reports {
931                let Ok(meta) = std::fs::metadata(&report_path) else {
932                    continue;
933                };
934                let mtime = meta
935                    .modified()
936                    .map(|t| {
937                        t.duration_since(std::time::UNIX_EPOCH)
938                            .unwrap_or_default()
939                            .as_secs() as i64
940                    })
941                    .unwrap_or(0);
942
943                for exchange in load_session_exchanges(&report_path, mtime) {
944                    desired_paths.insert(exchange.path.clone());
945                    let mtype = detect_memory_type(&exchange.content);
946                    match self.index_chunks_with_room_and_type(
947                        &exchange.path,
948                        exchange.last_modified,
949                        "session",
950                        mtype,
951                        std::slice::from_ref(&exchange.content),
952                    ) {
953                        Ok(new_chunks) if !new_chunks.is_empty() => {
954                            count += 1;
955                        }
956                        Ok(_) => {}
957                        Err(_) => {}
958                    }
959                }
960            }
961        }
962
963        self.prune_indexed_prefix("session/", &desired_paths);
964        count
965    }
966
967    /// Index imported chat exports from `.hematite/imports/`.
968    /// Supported inputs include already-normalized `>` transcripts, Claude Code
969    /// JSONL, Codex CLI JSONL, simple role/content JSON exports, ChatGPT
970    /// `mapping` exports, and Hematite session-report JSON.
971    pub fn index_imported_session_exports(&mut self, workspace_root: &std::path::Path) -> usize {
972        let imports_dir = workspace_root.join(".hematite").join("imports");
973        let mut count = 0usize;
974        let mut desired_paths = HashSet::new();
975
976        if imports_dir.exists() {
977            let mut imports: Vec<(std::path::PathBuf, i64)> = walkdir::WalkDir::new(&imports_dir)
978                .max_depth(4)
979                .follow_links(false)
980                .into_iter()
981                .filter_map(|entry| entry.ok())
982                .filter(|entry| entry.file_type().is_file())
983                .filter_map(|entry| {
984                    let path = entry.into_path();
985                    let ext = path
986                        .extension()
987                        .and_then(|ext| ext.to_str())
988                        .unwrap_or("")
989                        .to_ascii_lowercase();
990                    if !matches!(ext.as_str(), "json" | "jsonl" | "md" | "txt") {
991                        return None;
992                    }
993                    let meta = std::fs::metadata(&path).ok()?;
994                    if meta.len() > Self::IMPORT_MAX_BYTES {
995                        return None;
996                    }
997                    let mtime = meta
998                        .modified()
999                        .map(|t| {
1000                            t.duration_since(std::time::UNIX_EPOCH)
1001                                .unwrap_or_default()
1002                                .as_secs() as i64
1003                        })
1004                        .unwrap_or(0);
1005                    Some((path, mtime))
1006                })
1007                .collect();
1008
1009            imports.sort_by(|(a_path, a_mtime), (b_path, b_mtime)| {
1010                b_mtime
1011                    .cmp(a_mtime)
1012                    .then_with(|| a_path.to_string_lossy().cmp(&b_path.to_string_lossy()))
1013            });
1014            imports.truncate(Self::IMPORT_FILE_LIMIT);
1015
1016            for (import_path, mtime) in imports {
1017                for exchange in load_imported_session_exchanges(&import_path, &imports_dir, mtime) {
1018                    desired_paths.insert(exchange.path.clone());
1019                    let mtype = detect_memory_type(&exchange.content);
1020                    match self.index_chunks_with_room_and_type(
1021                        &exchange.path,
1022                        exchange.last_modified,
1023                        "session",
1024                        mtype,
1025                        std::slice::from_ref(&exchange.content),
1026                    ) {
1027                        Ok(new_chunks) if !new_chunks.is_empty() => {
1028                            count += 1;
1029                        }
1030                        Ok(_) => {}
1031                        Err(_) => {}
1032                    }
1033                }
1034            }
1035        }
1036
1037        self.prune_indexed_prefix("session/imports/", &desired_paths);
1038        count
1039    }
1040
1041    /// Embed any FTS chunks that don't yet have a vector in chunks_vec.
1042    /// Called at the end of index_project so that loading the embedding model
1043    /// after the initial index automatically triggers a semantic upgrade on the
1044    /// next agent turn — no /forget or file-touch required.
1045    fn backfill_missing_embeddings(&self) {
1046        // Fast path: if chunk counts match, nothing to do.
1047        let (fts_count, vec_count) = {
1048            let db = self.db.lock().unwrap();
1049            let fts: i64 = db
1050                .query_row("SELECT COUNT(*) FROM chunks_fts", [], |r| r.get(0))
1051                .unwrap_or(0);
1052            let vec: i64 = db
1053                .query_row("SELECT COUNT(*) FROM chunks_vec", [], |r| r.get(0))
1054                .unwrap_or(0);
1055            (fts, vec)
1056        };
1057        if fts_count == 0 || fts_count == vec_count {
1058            return;
1059        }
1060
1061        // Fetch (path, chunk_idx, content) for chunks with no embedding.
1062        // chunks_fts rowid serves as chunk_idx (1-based → convert to 0-based).
1063        let missing: Vec<(String, i64, String)> = {
1064            let db = self.db.lock().unwrap();
1065            let mut stmt = db
1066                .prepare(
1067                    "SELECT f.path, (f.rowid - 1) AS chunk_idx, f.content
1068                     FROM chunks_fts f
1069                     LEFT JOIN chunks_vec v ON f.path = v.path AND (f.rowid - 1) = v.chunk_idx
1070                     WHERE v.path IS NULL
1071                     ORDER BY CASE
1072                         WHEN f.path LIKE '%.rs' THEN 0
1073                         WHEN f.path LIKE '%.toml' THEN 1
1074                         WHEN f.path LIKE '%.json' THEN 2
1075                         ELSE 3
1076                     END, f.path
1077                     LIMIT 20",
1078                )
1079                .unwrap();
1080            stmt.query_map([], |r| {
1081                Ok((
1082                    r.get::<_, String>(0)?,
1083                    r.get::<_, i64>(1)?,
1084                    r.get::<_, String>(2)?,
1085                ))
1086            })
1087            .unwrap()
1088            .filter_map(|r| r.ok())
1089            .collect()
1090        };
1091
1092        for (path, idx, content) in missing {
1093            if let Some(vec) = embed_text_blocking(&content, &self.base_url) {
1094                let blob = floats_to_blob(&vec);
1095                let db = self.db.lock().unwrap();
1096                let _ = db.execute(
1097                    "INSERT OR REPLACE INTO chunks_vec (path, chunk_idx, embedding) VALUES (?1, ?2, ?3)",
1098                    params![path, idx, blob],
1099                );
1100            } else {
1101                // Embedding model not available — stop trying for this pass.
1102                break;
1103            }
1104        }
1105    }
1106
1107    /// Total number of unique files currently indexed.
1108    /// Session exchange chunks are excluded so status counts stay source/doc centric.
1109    pub fn file_count(&self) -> usize {
1110        let db = self.db.lock().unwrap();
1111        db.query_row(
1112            "SELECT COUNT(*) FROM chunks_meta WHERE path NOT LIKE 'session/%'",
1113            [],
1114            |r| r.get::<_, i64>(0),
1115        )
1116        .unwrap_or(0) as usize
1117    }
1118
1119    /// Number of source/doc chunks that have semantic embedding vectors stored.
1120    /// Session exchange chunks are excluded so status counts stay source/doc centric.
1121    pub fn embedded_chunk_count(&self) -> usize {
1122        let db = self.db.lock().unwrap();
1123        db.query_row(
1124            "SELECT COUNT(*) FROM chunks_vec WHERE path NOT LIKE 'session/%'",
1125            [],
1126            |r| r.get::<_, i64>(0),
1127        )
1128        .unwrap_or(0) as usize
1129    }
1130
1131    /// True when any chunk type currently has embeddings available.
1132    pub fn has_any_embeddings(&self) -> bool {
1133        let db = self.db.lock().unwrap();
1134        db.query_row("SELECT EXISTS(SELECT 1 FROM chunks_vec LIMIT 1)", [], |r| {
1135            r.get::<_, i64>(0)
1136        })
1137        .unwrap_or(0)
1138            != 0
1139    }
1140
1141    /// Wipe all indexed data. The DB file stays on disk; next index_project()
1142    /// call rebuilds from scratch (re-reads all files, re-embeds all chunks).
1143    pub fn reset(&self) {
1144        let db = self.db.lock().unwrap();
1145        let _ = db.execute_batch(
1146            "DELETE FROM chunks_fts;
1147             DELETE FROM chunks_vec;
1148             DELETE FROM chunks_meta;",
1149        );
1150    }
1151
1152    /// Return a compact operator-facing snapshot of what The Vein currently knows.
1153    /// Intended for trust/debug surfaces like `/vein-inspect`.
1154    pub fn inspect_snapshot(&self, hot_limit: usize) -> VeinInspectionSnapshot {
1155        let db = self.db.lock().unwrap();
1156        let indexed_source_files = db
1157            .query_row(
1158                "SELECT COUNT(*) FROM chunks_meta
1159                 WHERE path NOT LIKE 'session/%'
1160                   AND path NOT LIKE '.hematite/docs/%'",
1161                [],
1162                |r| r.get::<_, i64>(0),
1163            )
1164            .unwrap_or(0) as usize;
1165        let indexed_docs = db
1166            .query_row(
1167                "SELECT COUNT(*) FROM chunks_meta WHERE path LIKE '.hematite/docs/%'",
1168                [],
1169                |r| r.get::<_, i64>(0),
1170            )
1171            .unwrap_or(0) as usize;
1172        let indexed_session_exchanges = db
1173            .query_row(
1174                "SELECT COUNT(*) FROM chunks_meta WHERE path LIKE 'session/%'",
1175                [],
1176                |r| r.get::<_, i64>(0),
1177            )
1178            .unwrap_or(0) as usize;
1179        let embedded_source_doc_chunks = db
1180            .query_row(
1181                "SELECT COUNT(*) FROM chunks_vec WHERE path NOT LIKE 'session/%'",
1182                [],
1183                |r| r.get::<_, i64>(0),
1184            )
1185            .unwrap_or(0) as usize;
1186        let has_any_embeddings = db
1187            .query_row("SELECT EXISTS(SELECT 1 FROM chunks_vec LIMIT 1)", [], |r| {
1188                r.get::<_, i64>(0)
1189            })
1190            .unwrap_or(0)
1191            != 0;
1192        drop(db);
1193
1194        let hot_files = self
1195            .hot_files(hot_limit.max(1))
1196            .into_iter()
1197            .map(|(path, heat, last_modified, room)| VeinHotFile {
1198                path,
1199                heat,
1200                last_modified,
1201                room,
1202            })
1203            .collect::<Vec<_>>();
1204
1205        VeinInspectionSnapshot {
1206            indexed_source_files,
1207            indexed_docs,
1208            indexed_session_exchanges,
1209            embedded_source_doc_chunks,
1210            has_any_embeddings,
1211            active_room: self.active_room(),
1212            l1_ready: !hot_files.is_empty(),
1213            hot_files,
1214        }
1215    }
1216
1217    // ── L1 heat tracking ──────────────────────────────────────────────────────
1218
1219    /// Record an edit to a file. Increments its heat score in file_heat.
1220    /// Called from the tool dispatch after a successful edit_file / write_file /
1221    /// patch_hunk / multi_search_replace so the L1 context stays current.
1222    pub fn bump_heat(&self, path: &str) {
1223        if path.is_empty() {
1224            return;
1225        }
1226        let now = std::time::SystemTime::now()
1227            .duration_since(std::time::UNIX_EPOCH)
1228            .unwrap_or_default()
1229            .as_secs() as i64;
1230        let db = self.db.lock().unwrap();
1231        let _ = db.execute(
1232            "INSERT INTO file_heat (path, heat, last_edit) VALUES (?1, 1, ?2)
1233             ON CONFLICT(path) DO UPDATE SET heat = heat + 1, last_edit = ?2",
1234            params![path, now],
1235        );
1236    }
1237
1238    /// Return the top N hot files ranked by edit count (heat) then recency.
1239    /// Joins file_heat with chunks_meta so only indexed files are included.
1240    /// Returns (path, heat, mtime, room).
1241    fn hot_files(&self, n: usize) -> Vec<(String, i64, i64, String)> {
1242        let db = self.db.lock().unwrap();
1243        let mut stmt = match db.prepare(
1244            "SELECT fh.path, fh.heat, cm.last_modified, cm.room
1245             FROM file_heat fh
1246             JOIN chunks_meta cm ON cm.path = fh.path
1247             ORDER BY fh.heat DESC, cm.last_modified DESC
1248             LIMIT ?1",
1249        ) {
1250            Ok(s) => s,
1251            Err(_) => return vec![],
1252        };
1253        stmt.query_map(params![n as i64], |row| {
1254            Ok((
1255                row.get::<_, String>(0)?,
1256                row.get::<_, i64>(1)?,
1257                row.get::<_, i64>(2)?,
1258                row.get::<_, String>(3)?,
1259            ))
1260        })
1261        .map(|rows| rows.filter_map(|r| r.ok()).collect())
1262        .unwrap_or_default()
1263    }
1264
1265    /// Return the paths of the top hot files (most edited).
1266    /// Used by RepoMapGenerator to bias PageRank toward active files.
1267    pub fn hot_file_paths(&self, n: usize) -> Vec<String> {
1268        self.hot_files(n)
1269            .into_iter()
1270            .map(|(path, _, _, _)| path)
1271            .collect()
1272    }
1273
1274    /// Return hot files with normalized heat weights in [0.0, 1.0].
1275    /// The hottest file gets weight 1.0; others are scaled proportionally.
1276    /// Used by RepoMapGenerator to apply heat-weighted PageRank personalization.
1277    pub fn hot_files_weighted(&self, n: usize) -> Vec<(String, f64)> {
1278        let files = self.hot_files(n);
1279        if files.is_empty() {
1280            return vec![];
1281        }
1282        let max_heat = files
1283            .iter()
1284            .map(|(_, h, _, _)| *h)
1285            .max()
1286            .unwrap_or(1)
1287            .max(1) as f64;
1288        files
1289            .into_iter()
1290            .map(|(path, heat, _, _)| {
1291                let weight = (heat as f64) / max_heat;
1292                (path, weight)
1293            })
1294            .collect()
1295    }
1296
1297    /// Build the L1 context block — a compact "hot files" summary injected into
1298    /// the system prompt at session start. Capped at ~150 tokens.
1299    /// Files are grouped by room so the model sees subsystem structure at a glance.
1300    /// Returns None when there are no heat records yet (fresh project).
1301    pub fn l1_context(&self) -> Option<String> {
1302        let files = self.hot_files(8);
1303        if files.is_empty() {
1304            return None;
1305        }
1306        let now = std::time::SystemTime::now()
1307            .duration_since(std::time::UNIX_EPOCH)
1308            .unwrap_or_default()
1309            .as_secs() as i64;
1310
1311        // Group by room for readability.
1312        let mut by_room: std::collections::BTreeMap<String, Vec<(String, i64, i64)>> =
1313            std::collections::BTreeMap::new();
1314        for (path, heat, mtime, room) in &files {
1315            by_room
1316                .entry(room.clone())
1317                .or_default()
1318                .push((path.clone(), *heat, *mtime));
1319        }
1320
1321        let mut out = String::from("# Hot Files (most edited — grouped by subsystem)\n");
1322        for (room, entries) in &by_room {
1323            out.push_str(&format!("[{}]\n", room));
1324            for (path, heat, mtime) in entries {
1325                let age_secs = now - mtime;
1326                let age = if age_secs < 3600 {
1327                    "just now".to_string()
1328                } else if age_secs < 86400 {
1329                    format!("{}h ago", age_secs / 3600)
1330                } else {
1331                    format!("{}d ago", age_secs / 86400)
1332                };
1333                out.push_str(&format!(
1334                    "  - {} [{} edit{}, {}]\n",
1335                    path,
1336                    heat,
1337                    if *heat == 1 { "" } else { "s" },
1338                    age
1339                ));
1340            }
1341        }
1342        Some(out)
1343    }
1344
1345    fn prune_indexed_prefix(&self, prefix: &str, desired_paths: &HashSet<String>) {
1346        let pattern = format!("{}%", prefix);
1347        let existing_paths: Vec<String> = {
1348            let db = self.db.lock().unwrap();
1349            let mut stmt = match db.prepare("SELECT path FROM chunks_meta WHERE path LIKE ?1") {
1350                Ok(stmt) => stmt,
1351                Err(_) => return,
1352            };
1353            stmt.query_map(params![pattern], |row| row.get::<_, String>(0))
1354                .map(|rows| rows.filter_map(|row| row.ok()).collect())
1355                .unwrap_or_default()
1356        };
1357
1358        if existing_paths.is_empty() {
1359            return;
1360        }
1361
1362        let db = self.db.lock().unwrap();
1363        for path in existing_paths {
1364            if desired_paths.contains(&path) {
1365                continue;
1366            }
1367            let _ = db.execute("DELETE FROM chunks_fts WHERE path = ?1", params![path]);
1368            let _ = db.execute("DELETE FROM chunks_vec WHERE path = ?1", params![path]);
1369            let _ = db.execute("DELETE FROM chunks_meta WHERE path = ?1", params![path]);
1370        }
1371    }
1372}
1373
1374impl QuerySignals {
1375    fn from_query(query: &str) -> Self {
1376        let lower = query.to_ascii_lowercase();
1377        let historical_memory_hint = [
1378            "remember",
1379            "earlier",
1380            "previous",
1381            "last time",
1382            "what did we decide",
1383            "why did we decide",
1384            "what did we say",
1385            "why did we change",
1386        ]
1387        .iter()
1388        .any(|needle| lower.contains(needle));
1389
1390        // Detect what memory type the query is asking about.
1391        let query_memory_type = if lower.contains("decide")
1392            || lower.contains("decision")
1393            || lower.contains("we agreed")
1394            || lower.contains("we chose")
1395        {
1396            Some("decision")
1397        } else if lower.contains("bug")
1398            || lower.contains("error")
1399            || lower.contains("issue")
1400            || lower.contains("problem")
1401            || lower.contains("fix")
1402            || lower.contains("broken")
1403        {
1404            Some("problem")
1405        } else if lower.contains("shipped")
1406            || lower.contains("milestone")
1407            || lower.contains("finished")
1408            || lower.contains("working now")
1409        {
1410            Some("milestone")
1411        } else if lower.contains("prefer")
1412            || lower.contains("my preference")
1413            || lower.contains("i like")
1414            || lower.contains("i want")
1415        {
1416            Some("preference")
1417        } else {
1418            None
1419        };
1420
1421        Self {
1422            exact_phrases: extract_exact_phrases(query),
1423            standout_terms: extract_standout_terms(query),
1424            historical_memory_hint,
1425            temporal_reference: extract_temporal_reference(query),
1426            query_memory_type,
1427        }
1428    }
1429}
1430
1431fn merge_scored_result(
1432    merged_by_path: &mut HashMap<String, SearchResult>,
1433    candidate: SearchResult,
1434) {
1435    match merged_by_path.get_mut(&candidate.path) {
1436        Some(existing) if candidate.score > existing.score => *existing = candidate,
1437        Some(_) => {}
1438        None => {
1439            merged_by_path.insert(candidate.path.clone(), candidate);
1440        }
1441    }
1442}
1443
1444fn reranked_score(
1445    signals: &QuerySignals,
1446    active_room: Option<&str>,
1447    result: &SearchResult,
1448    is_semantic: bool,
1449) -> f32 {
1450    let base = if is_semantic {
1451        1.0 + result.score.clamp(0.0, 1.0)
1452    } else {
1453        (result.score / 10.0).clamp(0.0, 1.0)
1454    };
1455    base + room_bias(active_room, result)
1456        + retrieval_signal_boost(signals, result)
1457        + temporal_memory_boost(signals, result)
1458}
1459
1460fn room_bias(active_room: Option<&str>, result: &SearchResult) -> f32 {
1461    if active_room == Some(result.room.as_str()) {
1462        0.15
1463    } else {
1464        0.0
1465    }
1466}
1467
1468fn retrieval_signal_boost(signals: &QuerySignals, result: &SearchResult) -> f32 {
1469    let mut boost = 0.0f32;
1470    let haystack = format!(
1471        "{}\n{}",
1472        result.path.to_ascii_lowercase(),
1473        result.content.to_ascii_lowercase()
1474    );
1475
1476    let phrase_matches = signals
1477        .exact_phrases
1478        .iter()
1479        .filter(|phrase| haystack.contains(phrase.as_str()))
1480        .count();
1481    if phrase_matches > 0 {
1482        boost += 0.35 + ((phrase_matches.saturating_sub(1)) as f32 * 0.1);
1483    }
1484
1485    let mut standout_matches = 0;
1486    for term in &signals.standout_terms {
1487        if result.path.to_ascii_lowercase().contains(term.as_str()) {
1488            boost += 0.40;
1489            standout_matches += 1;
1490        } else if result.content.to_ascii_lowercase().contains(term.as_str()) {
1491            boost += 0.12;
1492            standout_matches += 1;
1493        }
1494        if standout_matches >= 3 {
1495            break;
1496        }
1497    }
1498
1499    if signals.historical_memory_hint && result.room == "session" {
1500        boost += 0.45;
1501    }
1502
1503    // Boost session chunks whose tagged memory type matches the query's intent.
1504    if let Some(qtype) = signals.query_memory_type {
1505        if !result.memory_type.is_empty() && result.memory_type == qtype {
1506            boost += 0.35;
1507        }
1508    }
1509
1510    boost
1511}
1512
1513fn temporal_memory_boost(signals: &QuerySignals, result: &SearchResult) -> f32 {
1514    if result.room != "session" {
1515        return 0.0;
1516    }
1517    let Some(reference) = signals.temporal_reference else {
1518        return 0.0;
1519    };
1520    let Some(memory_ts) = session_memory_timestamp(result) else {
1521        return 0.0;
1522    };
1523
1524    let span = reference.window_secs.max(86_400);
1525    let full_fade = span.saturating_mul(8);
1526    if full_fade <= 0 {
1527        return 0.0;
1528    }
1529
1530    let distance = (memory_ts - reference.target_ts).abs();
1531    let closeness = 1.0 - (distance as f32 / full_fade as f32).min(1.0);
1532    if closeness <= 0.0 {
1533        0.0
1534    } else {
1535        0.22 * closeness
1536    }
1537}
1538
/// Extract quoted phrases (`"..."`, `'...'` or `` `...` ``) from a query.
///
/// Each phrase is trimmed and ASCII-lowercased; phrases shorter than three
/// bytes and duplicates are dropped. Order of first appearance is preserved.
///
/// Two situations are deliberately NOT treated as an opening quote:
/// - an apostrophe directly after a letter or digit (`don't`, `users'`), which
///   is a contraction/possessive rather than a quotation mark;
/// - a quote character with no matching closing quote later in the query.
/// In both cases scanning resumes at the next character, so later, properly
/// quoted phrases are still found.
fn extract_exact_phrases(query: &str) -> Vec<String> {
    let chars: Vec<char> = query.chars().collect();
    let mut phrases: Vec<String> = Vec::new();
    let mut i = 0usize;

    while i < chars.len() {
        let quote = chars[i];
        if !matches!(quote, '"' | '\'' | '`') {
            i += 1;
            continue;
        }
        // Apostrophe inside or at the end of a word: not a quotation mark.
        if quote == '\'' && i > 0 && chars[i - 1].is_alphanumeric() {
            i += 1;
            continue;
        }

        let start = i + 1;
        let Some(offset) = chars[start..].iter().position(|&ch| ch == quote) else {
            // Unterminated quote: skip just this character instead of
            // swallowing the rest of the query.
            i += 1;
            continue;
        };
        let end = start + offset;

        if end > start {
            let phrase = chars[start..end]
                .iter()
                .collect::<String>()
                .trim()
                .to_ascii_lowercase();
            if phrase.len() >= 3 && !phrases.contains(&phrase) {
                phrases.push(phrase);
            }
        }
        i = end + 1;
    }

    phrases
}
1570
/// Pick out query tokens that look like identifiers, paths or other distinctive
/// terms worth boosting.
///
/// The query is split on every character that is not ASCII alphanumeric or one
/// of `_ - . / :`. Trailing `.` and `:` are then stripped from each token so
/// sentence punctuation ("vein.rs.", "remember:") does not stick to the term.
/// A token is kept (ASCII-lowercased, de-duplicated, in order) when it:
/// - is at least four bytes long and contains an alphanumeric character,
/// - is not a stopword, and
/// - contains a digit, one of `_ - . / :`, an uppercase letter, or is at least
///   nine bytes long.
fn extract_standout_terms(query: &str) -> Vec<String> {
    const STOPWORDS: &[&str] = &[
        "about", "after", "before", "change", "changed", "decide", "decided", "does", "earlier",
        "flow", "from", "have", "into", "just", "last", "local", "make", "more", "remember",
        "should", "that", "their", "there", "these", "they", "this", "those", "what", "when",
        "where", "which", "why", "with", "work",
    ];

    /// Characters that join identifier/path segments and therefore stay in a token.
    fn is_joiner(ch: char) -> bool {
        matches!(ch, '_' | '-' | '.' | '/' | ':')
    }

    let mut standout: Vec<String> = Vec::new();
    for token in query.split(|ch: char| !(ch.is_ascii_alphanumeric() || is_joiner(ch))) {
        // Drop sentence-final punctuation that would otherwise defeat both the
        // stopword check and substring matching against paths/content.
        let trimmed = token.trim_end_matches(|ch: char| matches!(ch, '.' | ':'));
        if trimmed.len() < 4 {
            continue;
        }
        // Pure punctuation runs ("----") carry no signal.
        if !trimmed.chars().any(|ch| ch.is_ascii_alphanumeric()) {
            continue;
        }
        let lower = trimmed.to_ascii_lowercase();
        if STOPWORDS.contains(&lower.as_str()) {
            continue;
        }

        let interesting = trimmed.chars().any(|ch| ch.is_ascii_digit())
            || trimmed.chars().any(is_joiner)
            || trimmed.chars().any(|ch| ch.is_ascii_uppercase())
            || trimmed.len() >= 9;

        if interesting && !standout.contains(&lower) {
            standout.push(lower);
        }
    }

    standout
}
1606
1607fn extract_temporal_reference(query: &str) -> Option<TemporalReference> {
1608    if let Some(ts) = extract_iso_date_from_query(query) {
1609        return Some(TemporalReference {
1610            target_ts: ts,
1611            window_secs: 86_400,
1612        });
1613    }
1614
1615    let now = current_unix_timestamp();
1616    let lower = query.to_ascii_lowercase();
1617    if lower.contains("yesterday") {
1618        Some(TemporalReference {
1619            target_ts: now.saturating_sub(86_400),
1620            window_secs: 86_400,
1621        })
1622    } else if lower.contains("today") || lower.contains("earlier today") {
1623        Some(TemporalReference {
1624            target_ts: now,
1625            window_secs: 86_400,
1626        })
1627    } else if lower.contains("last week") {
1628        Some(TemporalReference {
1629            target_ts: now.saturating_sub(7 * 86_400),
1630            window_secs: 7 * 86_400,
1631        })
1632    } else if lower.contains("last month") {
1633        Some(TemporalReference {
1634            target_ts: now.saturating_sub(30 * 86_400),
1635            window_secs: 30 * 86_400,
1636        })
1637    } else {
1638        None
1639    }
1640}
1641
1642fn extract_iso_date_from_query(query: &str) -> Option<i64> {
1643    query
1644        .split(|ch: char| !(ch.is_ascii_digit() || ch == '-'))
1645        .find_map(parse_iso_date_token)
1646}
1647
/// Parse a strict `YYYY-MM-DD` token into a Unix timestamp (seconds at
/// 00:00:00 of that day, no timezone adjustment applied).
///
/// Returns `None` unless the token is exactly ten bytes, has `-` at positions
/// 4 and 7, ASCII digits everywhere else, and names a real calendar date
/// (month 1–12, day valid for that month, leap years honoured). Requiring
/// plain digits rejects signed fields such as `-123` or `+1`, which
/// `str::parse` would otherwise accept.
fn parse_iso_date_token(token: &str) -> Option<i64> {
    let bytes = token.as_bytes();
    if bytes.len() != 10 || bytes[4] != b'-' || bytes[7] != b'-' {
        return None;
    }
    let digits_only = bytes
        .iter()
        .enumerate()
        .all(|(idx, byte)| idx == 4 || idx == 7 || byte.is_ascii_digit());
    if !digits_only {
        return None;
    }

    let year = token.get(0..4)?.parse::<i32>().ok()?;
    let month = token.get(5..7)?.parse::<u32>().ok()?;
    let day = token.get(8..10)?.parse::<u32>().ok()?;

    let is_leap_year = (year % 4 == 0 && year % 100 != 0) || year % 400 == 0;
    let days_in_month = match month {
        1 | 3 | 5 | 7 | 8 | 10 | 12 => 31,
        4 | 6 | 9 | 11 => 30,
        2 if is_leap_year => 29,
        2 => 28,
        _ => return None,
    };
    if !(1..=days_in_month).contains(&day) {
        return None;
    }

    Some(days_from_civil(year, month, day).saturating_mul(86_400))
}

/// Number of days between 1970-01-01 and the given proleptic Gregorian date
/// (negative for earlier dates).
///
/// The year is treated as starting in March so the leap day falls at the end,
/// and days are counted within 400-year eras of 146 097 days each. `month` is
/// expected in 1..=12 and `day` in 1..=31; the values are not validated here.
fn days_from_civil(year: i32, month: u32, day: u32) -> i64 {
    // January and February belong to the previous March-based year.
    let year = year - if month <= 2 { 1 } else { 0 };
    // Floor division by 400 that also works for negative years.
    let era = if year >= 0 { year } else { year - 399 } / 400;
    let yoe = year - era * 400;
    // Month index with March = 0 … February = 11.
    let month_prime = month as i32 + if month > 2 { -3 } else { 9 };
    let doy = (153 * month_prime + 2) / 5 + day as i32 - 1;
    let doe = yoe * 365 + yoe / 4 - yoe / 100 + doy;
    // 719 468 is the day count from 0000-03-01 to 1970-01-01.
    era as i64 * 146_097 + doe as i64 - 719_468
}
1676
/// Current wall-clock time as whole seconds since the Unix epoch.
/// Yields 0 if the system clock reports a time before the epoch.
fn current_unix_timestamp() -> i64 {
    let since_epoch = std::time::SystemTime::now().duration_since(std::time::UNIX_EPOCH);
    match since_epoch {
        Ok(elapsed) => elapsed.as_secs() as i64,
        Err(_) => 0,
    }
}
1683
1684fn session_memory_timestamp(result: &SearchResult) -> Option<i64> {
1685    extract_session_path_timestamp(&result.path).or_else(|| {
1686        if result.last_modified > 0 {
1687            Some(result.last_modified)
1688        } else {
1689            None
1690        }
1691    })
1692}
1693
1694fn extract_session_path_timestamp(path: &str) -> Option<i64> {
1695    let normalized = path.replace('\\', "/");
1696    let mut parts = normalized.split('/');
1697    if parts.next()? != "session" {
1698        return None;
1699    }
1700    parse_iso_date_token(parts.next()?)
1701}
1702
1703fn session_speaker_kind(speaker: &str) -> SessionSpeakerKind {
1704    let normalized = speaker.trim().to_ascii_lowercase();
1705    match normalized.as_str() {
1706        "you" | "user" => SessionSpeakerKind::User,
1707        "" | "system" | "tool" => SessionSpeakerKind::Ignore,
1708        _ => SessionSpeakerKind::Assistant,
1709    }
1710}
1711
1712fn load_session_exchanges(report_path: &Path, last_modified: i64) -> Vec<SessionExchange> {
1713    let Ok(raw) = std::fs::read_to_string(report_path) else {
1714        return Vec::new();
1715    };
1716    let Ok(report) = serde_json::from_str::<SessionReport>(&raw) else {
1717        return Vec::new();
1718    };
1719
1720    let session_key = report_path
1721        .file_stem()
1722        .and_then(|stem| stem.to_str())
1723        .and_then(|stem| stem.strip_prefix("session_").or(Some(stem)))
1724        .unwrap_or("unknown-session")
1725        .to_string();
1726    let session_date = report
1727        .session_start
1728        .split('_')
1729        .next()
1730        .filter(|date| !date.is_empty())
1731        .unwrap_or_else(|| session_key.split('_').next().unwrap_or("unknown-date"))
1732        .to_string();
1733
1734    let mut exchanges = Vec::new();
1735    let mut pending_user: Option<String> = None;
1736    let mut turn_index = 0usize;
1737
1738    for entry in report.transcript {
1739        match session_speaker_kind(&entry.speaker) {
1740            SessionSpeakerKind::User => {
1741                let text = entry.text.trim();
1742                if !text.is_empty() {
1743                    pending_user = Some(text.to_string());
1744                }
1745            }
1746            SessionSpeakerKind::Assistant => {
1747                let text = entry.text.trim();
1748                if text.is_empty() {
1749                    continue;
1750                }
1751                let Some(user_text) = pending_user.take() else {
1752                    continue;
1753                };
1754                turn_index += 1;
1755                exchanges.push(SessionExchange {
1756                    path: format!(
1757                        "session/{}/{}/turn-{}",
1758                        session_date, session_key, turn_index
1759                    ),
1760                    last_modified,
1761                    content: format!(
1762                        "Earlier session exchange\nUser:\n{}\n\nAssistant:\n{}",
1763                        user_text, text
1764                    ),
1765                });
1766            }
1767            SessionSpeakerKind::Ignore => {}
1768        }
1769    }
1770
1771    if exchanges.len() > Vein::SESSION_TURN_LIMIT {
1772        let keep_from = exchanges.len() - Vein::SESSION_TURN_LIMIT;
1773        exchanges = exchanges.into_iter().skip(keep_from).collect();
1774    }
1775
1776    exchanges
1777}
1778
1779fn load_imported_session_exchanges(
1780    import_path: &Path,
1781    imports_root: &Path,
1782    last_modified: i64,
1783) -> Vec<SessionExchange> {
1784    let Ok(raw) = std::fs::read_to_string(import_path) else {
1785        return Vec::new();
1786    };
1787
1788    let messages = normalize_import_messages(&raw, import_path);
1789    if messages.is_empty() {
1790        return Vec::new();
1791    }
1792
1793    let rel = import_path
1794        .strip_prefix(imports_root)
1795        .unwrap_or(import_path);
1796    let rel_slug = slugify_import_path(rel);
1797    let mut exchanges = Vec::new();
1798    let mut pending_user: Option<String> = None;
1799    let mut turn_index = 0usize;
1800
1801    for (role, text) in messages {
1802        let cleaned = text.trim();
1803        if cleaned.is_empty() {
1804            continue;
1805        }
1806        match role.as_str() {
1807            "user" => pending_user = Some(cleaned.to_string()),
1808            "assistant" => {
1809                let Some(user_text) = pending_user.take() else {
1810                    continue;
1811                };
1812                turn_index += 1;
1813                exchanges.push(SessionExchange {
1814                    path: format!("session/imports/{}/turn-{}", rel_slug, turn_index),
1815                    last_modified,
1816                    content: format!(
1817                        "Imported session exchange\nSource: .hematite/imports/{}\n\nUser:\n{}\n\nAssistant:\n{}",
1818                        rel.to_string_lossy().replace('\\', "/"),
1819                        user_text,
1820                        cleaned
1821                    ),
1822                });
1823            }
1824            _ => {}
1825        }
1826    }
1827
1828    if exchanges.len() > Vein::SESSION_TURN_LIMIT {
1829        let keep_from = exchanges.len() - Vein::SESSION_TURN_LIMIT;
1830        exchanges = exchanges.into_iter().skip(keep_from).collect();
1831    }
1832
1833    exchanges
1834}
1835
1836fn normalize_import_messages(raw: &str, import_path: &Path) -> Vec<(String, String)> {
1837    if raw.trim().is_empty() {
1838        return Vec::new();
1839    }
1840
1841    if let Some(messages) = parse_marker_transcript(raw) {
1842        return messages;
1843    }
1844
1845    let ext = import_path
1846        .extension()
1847        .and_then(|ext| ext.to_str())
1848        .unwrap_or("")
1849        .to_ascii_lowercase();
1850
1851    if matches!(ext.as_str(), "json" | "jsonl")
1852        || matches!(raw.trim().chars().next(), Some('{') | Some('['))
1853    {
1854        if let Some(messages) = parse_jsonl_messages(raw) {
1855            if !messages.is_empty() {
1856                return messages;
1857            }
1858        }
1859
1860        if let Ok(value) = serde_json::from_str::<Value>(raw) {
1861            if let Some(messages) = parse_session_report_messages(&value) {
1862                return messages;
1863            }
1864            if let Some(messages) = parse_simple_role_messages(&value) {
1865                return messages;
1866            }
1867            if let Some(messages) = parse_chatgpt_mapping_messages(&value) {
1868                return messages;
1869            }
1870        }
1871    }
1872
1873    Vec::new()
1874}
1875
/// Parse a plain-text transcript in which user prompts are lines starting
/// with `> ` (after optional leading whitespace).
///
/// Requires at least two such prompt lines, otherwise returns `None`. Each
/// prompt becomes a `("user", …)` entry. The lines that follow it, up to the
/// next prompt, are trimmed, stripped of blank lines and `---` separators, and
/// joined with `\n` into one `("assistant", …)` entry (omitted when nothing
/// remains). Text before the first prompt is ignored.
fn parse_marker_transcript(raw: &str) -> Option<Vec<(String, String)>> {
    let prompt_count = raw
        .lines()
        .filter(|line| line.trim_start().starts_with("> "))
        .take(2)
        .count();
    if prompt_count < 2 {
        return None;
    }

    let mut messages: Vec<(String, String)> = Vec::new();
    let mut reply_lines: Vec<&str> = Vec::new();
    let mut seen_prompt = false;

    for line in raw.lines() {
        if let Some(prompt) = line.trim_start().strip_prefix("> ") {
            // A new prompt closes the reply that was being collected.
            if !reply_lines.is_empty() {
                messages.push(("assistant".to_string(), reply_lines.join("\n")));
                reply_lines.clear();
            }
            messages.push(("user".to_string(), prompt.trim().to_string()));
            seen_prompt = true;
        } else if seen_prompt {
            let content = line.trim();
            if !content.is_empty() && content != "---" {
                reply_lines.push(content);
            }
        }
    }

    if !reply_lines.is_empty() {
        messages.push(("assistant".to_string(), reply_lines.join("\n")));
    }

    (!messages.is_empty()).then_some(messages)
}
1916
1917fn parse_jsonl_messages(raw: &str) -> Option<Vec<(String, String)>> {
1918    let mut messages = Vec::new();
1919    let mut has_codex_session_meta = false;
1920    let mut saw_jsonl = false;
1921
1922    for line in raw.lines() {
1923        let trimmed = line.trim();
1924        if trimmed.is_empty() {
1925            continue;
1926        }
1927        let Ok(value) = serde_json::from_str::<Value>(trimmed) else {
1928            continue;
1929        };
1930        saw_jsonl = true;
1931        let Some(object) = value.as_object() else {
1932            continue;
1933        };
1934
1935        match object.get("type").and_then(|v| v.as_str()).unwrap_or("") {
1936            "session_meta" => {
1937                has_codex_session_meta = true;
1938            }
1939            "event_msg" => {
1940                let Some(payload) = object.get("payload").and_then(|v| v.as_object()) else {
1941                    continue;
1942                };
1943                let Some(text) = payload.get("message").and_then(|v| v.as_str()) else {
1944                    continue;
1945                };
1946                match payload.get("type").and_then(|v| v.as_str()).unwrap_or("") {
1947                    "user_message" => messages.push(("user".to_string(), text.trim().to_string())),
1948                    "agent_message" => {
1949                        messages.push(("assistant".to_string(), text.trim().to_string()))
1950                    }
1951                    _ => {}
1952                }
1953            }
1954            "human" | "user" => {
1955                if let Some(text) = extract_text_content(object.get("message").unwrap_or(&value)) {
1956                    messages.push(("user".to_string(), text));
1957                }
1958            }
1959            "assistant" => {
1960                if let Some(text) = extract_text_content(object.get("message").unwrap_or(&value)) {
1961                    messages.push(("assistant".to_string(), text));
1962                }
1963            }
1964            _ => {
1965                if let Some(role) = object.get("role").and_then(|v| v.as_str()) {
1966                    if let Some(text) = extract_text_content(&value) {
1967                        match role {
1968                            "user" | "human" => messages.push(("user".to_string(), text)),
1969                            "assistant" | "ai" => messages.push(("assistant".to_string(), text)),
1970                            _ => {}
1971                        }
1972                    }
1973                }
1974            }
1975        }
1976    }
1977
1978    if !saw_jsonl {
1979        return None;
1980    }
1981
1982    if has_codex_session_meta || !messages.is_empty() {
1983        return Some(messages);
1984    }
1985
1986    None
1987}
1988
1989fn parse_session_report_messages(value: &Value) -> Option<Vec<(String, String)>> {
1990    let report = value.as_object()?;
1991    let transcript = report.get("transcript")?.as_array()?;
1992    let mut messages = Vec::new();
1993
1994    for entry in transcript {
1995        let Some(obj) = entry.as_object() else {
1996            continue;
1997        };
1998        let speaker = obj
1999            .get("speaker")
2000            .and_then(|v| v.as_str())
2001            .unwrap_or_default();
2002        let text = obj
2003            .get("text")
2004            .and_then(|v| v.as_str())
2005            .unwrap_or_default()
2006            .trim()
2007            .to_string();
2008        if text.is_empty() {
2009            continue;
2010        }
2011        match session_speaker_kind(speaker) {
2012            SessionSpeakerKind::User => messages.push(("user".to_string(), text)),
2013            SessionSpeakerKind::Assistant => messages.push(("assistant".to_string(), text)),
2014            SessionSpeakerKind::Ignore => {}
2015        }
2016    }
2017
2018    (!messages.is_empty()).then_some(messages)
2019}
2020
2021fn parse_simple_role_messages(value: &Value) -> Option<Vec<(String, String)>> {
2022    if let Some(array) = value.as_array() {
2023        let messages = collect_role_messages(array);
2024        return (!messages.is_empty()).then_some(messages);
2025    }
2026
2027    let obj = value.as_object()?;
2028    if let Some(messages_value) = obj.get("messages").or_else(|| obj.get("chat_messages")) {
2029        let array = messages_value.as_array()?;
2030        let messages = collect_role_messages(array);
2031        return (!messages.is_empty()).then_some(messages);
2032    }
2033
2034    None
2035}
2036
2037fn collect_role_messages(items: &[Value]) -> Vec<(String, String)> {
2038    let mut messages = Vec::new();
2039    for item in items {
2040        let Some(obj) = item.as_object() else {
2041            continue;
2042        };
2043        let Some(role) = obj.get("role").and_then(|v| v.as_str()) else {
2044            continue;
2045        };
2046        let Some(text) = extract_text_content(item) else {
2047            continue;
2048        };
2049        match role {
2050            "user" | "human" => messages.push(("user".to_string(), text)),
2051            "assistant" | "ai" => messages.push(("assistant".to_string(), text)),
2052            _ => {}
2053        }
2054    }
2055    messages
2056}
2057
2058fn parse_chatgpt_mapping_messages(value: &Value) -> Option<Vec<(String, String)>> {
2059    let mapping = value.get("mapping")?.as_object()?;
2060    let mut current_id = mapping.iter().find_map(|(node_id, node)| {
2061        let obj = node.as_object()?;
2062        (obj.get("parent").is_some_and(|parent| parent.is_null())).then_some(node_id.clone())
2063    })?;
2064
2065    let mut messages = Vec::new();
2066    let mut visited = std::collections::HashSet::new();
2067
2068    while visited.insert(current_id.clone()) {
2069        let Some(node) = mapping.get(&current_id).and_then(|v| v.as_object()) else {
2070            break;
2071        };
2072
2073        if let Some(message) = node.get("message") {
2074            let role = message
2075                .get("author")
2076                .and_then(|author| author.get("role"))
2077                .and_then(|v| v.as_str())
2078                .unwrap_or("");
2079            if let Some(text) = extract_text_content(message) {
2080                match role {
2081                    "user" => messages.push(("user".to_string(), text)),
2082                    "assistant" => messages.push(("assistant".to_string(), text)),
2083                    _ => {}
2084                }
2085            }
2086        }
2087
2088        let Some(next_id) = node
2089            .get("children")
2090            .and_then(|children| children.as_array())
2091            .and_then(|children| children.first())
2092            .and_then(|child| child.as_str())
2093        else {
2094            break;
2095        };
2096        current_id = next_id.to_string();
2097    }
2098
2099    (!messages.is_empty()).then_some(messages)
2100}
2101
2102fn extract_text_content(value: &Value) -> Option<String> {
2103    if let Some(text) = value.as_str() {
2104        let trimmed = text.trim();
2105        return (!trimmed.is_empty()).then_some(trimmed.to_string());
2106    }
2107
2108    if let Some(array) = value.as_array() {
2109        let joined = array
2110            .iter()
2111            .filter_map(extract_text_content)
2112            .filter(|part| !part.is_empty())
2113            .collect::<Vec<_>>()
2114            .join("\n");
2115        return (!joined.is_empty()).then_some(joined);
2116    }
2117
2118    let obj = value.as_object()?;
2119
2120    if let Some(content) = obj.get("content") {
2121        if let Some(text) = extract_text_content(content) {
2122            return Some(text);
2123        }
2124    }
2125
2126    if let Some(text) = obj.get("text").and_then(|v| v.as_str()) {
2127        let trimmed = text.trim();
2128        if !trimmed.is_empty() {
2129            return Some(trimmed.to_string());
2130        }
2131    }
2132
2133    if let Some(parts) = obj.get("parts").and_then(|v| v.as_array()) {
2134        let joined = parts
2135            .iter()
2136            .filter_map(|part| part.as_str().map(|s| s.trim().to_string()))
2137            .filter(|part| !part.is_empty())
2138            .collect::<Vec<_>>()
2139            .join("\n");
2140        if !joined.is_empty() {
2141            return Some(joined);
2142        }
2143    }
2144
2145    None
2146}
2147
/// Turn a relative import path into a flat identifier.
///
/// Backslashes count as path separators. ASCII letters, digits, `-` and `_`
/// are kept; every other non-separator character becomes `_`. Leading and
/// trailing separators are dropped and each remaining separator is written
/// as `__`.
fn slugify_import_path(path: &Path) -> String {
    let sanitized: String = path
        .to_string_lossy()
        .chars()
        .map(|ch| match ch {
            '/' | '\\' => '/',
            keep if keep.is_ascii_alphanumeric() || keep == '-' || keep == '_' => keep,
            _ => '_',
        })
        .collect();

    sanitized.trim_matches('/').replace('/', "__")
}
2163
2164// ── Embedding API ─────────────────────────────────────────────────────────────
2165
/// Embed a chunk of indexed content by calling LM Studio's `/v1/embeddings`
/// endpoint synchronously.
///
/// Delegates to `embed_text_with_prefix` with the `search_document` task.
/// The request names the `nomic-embed-text-v2` model, and Nomic v2 requires
/// task instruction prefixes:
/// - Chunks stored in the index use the `"search_document: "` prefix
/// - Queries at search time use the `"search_query: "` prefix
///
/// LM Studio matches loaded models by substring so the quant suffix doesn't matter.
///
/// Returns `None` if:
/// - No embedding model is loaded in LM Studio
/// - LM Studio is not running
/// - Any network or parse error occurs
///
/// Callers must tolerate `None` and fall back to BM25-only search.
fn embed_text_blocking(text: &str, base_url: &str) -> Option<Vec<f32>> {
    embed_text_with_prefix(text, "search_document", base_url)
}
2182
/// Embed a search query synchronously.
///
/// Delegates to `embed_text_with_prefix` with the `search_query` task, the
/// query-side counterpart of the `search_document` task used for stored
/// chunks. Returns `None` whenever no embedding could be obtained.
fn embed_query_blocking(text: &str, base_url: &str) -> Option<Vec<f32>> {
    embed_text_with_prefix(text, "search_query", base_url)
}
2186
2187fn embed_text_with_prefix(text: &str, task: &str, base_url: &str) -> Option<Vec<f32>> {
2188    // Nomic v2 task instruction prefix format: "<task>: <text>"
2189    let prefixed = format!("{}: {}", task, text);
2190    // Truncate to ~8000 chars to stay within typical embedding model limits.
2191    let input = if prefixed.len() > 8000 {
2192        &prefixed[..8000]
2193    } else {
2194        &prefixed
2195    };
2196
2197    let client = reqwest::blocking::Client::builder()
2198        .timeout(std::time::Duration::from_secs(10))
2199        .build()
2200        .ok()?;
2201
2202    let body = serde_json::json!({
2203        "model": "nomic-embed-text-v2",
2204        "input": input
2205    });
2206
2207    let url = format!("{}/v1/embeddings", base_url);
2208    let resp = client.post(&url).json(&body).send().ok()?;
2209
2210    if !resp.status().is_success() {
2211        return None;
2212    }
2213
2214    let json: serde_json::Value = resp.json().ok()?;
2215    let embedding = json["data"][0]["embedding"].as_array()?;
2216    let vec: Vec<f32> = embedding
2217        .iter()
2218        .filter_map(|v| v.as_f64().map(|f| f as f32))
2219        .collect();
2220
2221    if vec.is_empty() {
2222        None
2223    } else {
2224        Some(vec)
2225    }
2226}
2227
2228// ── Vector math ───────────────────────────────────────────────────────────────
2229
/// Cosine similarity of two equally sized vectors.
///
/// Returns `0.0` when the slices are empty, differ in length, or either has
/// zero magnitude; otherwise the dot product divided by the product of the
/// Euclidean norms.
fn cosine_similarity(a: &[f32], b: &[f32]) -> f32 {
    if a.is_empty() || a.len() != b.len() {
        return 0.0;
    }

    let magnitude = |v: &[f32]| v.iter().map(|x| x * x).sum::<f32>().sqrt();
    let (magnitude_a, magnitude_b) = (magnitude(a), magnitude(b));
    if magnitude_a == 0.0 || magnitude_b == 0.0 {
        return 0.0;
    }

    let dot: f32 = a.iter().zip(b).map(|(x, y)| x * y).sum();
    dot / (magnitude_a * magnitude_b)
}
2243
/// Serialize floats as a flat byte buffer, four little-endian bytes per value.
fn floats_to_blob(floats: &[f32]) -> Vec<u8> {
    let mut blob = Vec::with_capacity(floats.len() * 4);
    for value in floats {
        blob.extend_from_slice(&value.to_le_bytes());
    }
    blob
}
2247
/// Decode a byte buffer of little-endian `f32` values.
///
/// Bytes are consumed four at a time; a trailing remainder shorter than four
/// bytes is ignored.
fn blob_to_floats(blob: &[u8]) -> Vec<f32> {
    let mut floats = Vec::with_capacity(blob.len() / 4);
    for word in blob.chunks_exact(4) {
        let mut raw = [0u8; 4];
        raw.copy_from_slice(word);
        floats.push(f32::from_le_bytes(raw));
    }
    floats
}
2253
2254// ── Document extraction ───────────────────────────────────────────────────────
2255
/// Clean up text produced by a document extractor.
///
/// Line endings are unified to `\n` (`\r\n` and lone `\r` both become one
/// newline), then leading and trailing whitespace and NUL characters are
/// removed. Returns `None` when nothing is left.
fn normalize_extracted_document_text(text: String) -> Option<String> {
    let mut unified = String::with_capacity(text.len());
    let mut chars = text.chars().peekable();
    while let Some(ch) = chars.next() {
        if ch == '\r' {
            // Swallow the `\n` of a `\r\n` pair so it collapses to one newline.
            if chars.peek() == Some(&'\n') {
                chars.next();
            }
            unified.push('\n');
        } else {
            unified.push(ch);
        }
    }

    let core = unified.trim_matches(|c: char| c.is_whitespace() || c == '\0');
    (!core.is_empty()).then(|| core.to_string())
}
2268
2269fn extract_pdf_text_with_pdf_extract(path: &std::path::Path) -> Result<Option<String>, String> {
2270    let previous_hook = std::panic::take_hook();
2271    std::panic::set_hook(Box::new(|_| {}));
2272    let result = std::panic::catch_unwind(std::panic::AssertUnwindSafe(|| {
2273        pdf_extract::extract_text(path)
2274    }));
2275    std::panic::set_hook(previous_hook);
2276
2277    match result {
2278        Ok(Ok(text)) => Ok(normalize_extracted_document_text(text)),
2279        Ok(Err(e)) => Err(format!("pdf-extract failed: {}", e)),
2280        Err(payload) => {
2281            let panic_text = if let Some(msg) = payload.downcast_ref::<&str>() {
2282                (*msg).to_string()
2283            } else if let Some(msg) = payload.downcast_ref::<String>() {
2284                msg.clone()
2285            } else {
2286                "unknown parser panic".to_string()
2287            };
2288            Err(format!("pdf-extract panicked: {}", panic_text))
2289        }
2290    }
2291}
2292
2293fn extract_pdf_text_with_lopdf(path: &std::path::Path) -> Result<Option<String>, String> {
2294    let mut doc =
2295        lopdf::Document::load(path).map_err(|e| format!("lopdf could not open PDF: {}", e))?;
2296
2297    if doc.is_encrypted() {
2298        doc.decrypt("")
2299            .map_err(|e| format!("PDF is encrypted and could not be decrypted: {}", e))?;
2300    }
2301
2302    let page_numbers: Vec<u32> = doc.get_pages().keys().copied().collect();
2303    if page_numbers.is_empty() {
2304        return Ok(None);
2305    }
2306
2307    let mut extracted_pages = Vec::new();
2308    let mut page_errors = Vec::new();
2309
2310    for page_number in page_numbers {
2311        match doc.extract_text(&[page_number]) {
2312            Ok(text) => {
2313                if let Some(page_text) = normalize_extracted_document_text(text) {
2314                    extracted_pages.push(page_text);
2315                }
2316            }
2317            Err(e) => page_errors.push(format!("page {page_number}: {e}")),
2318        }
2319    }
2320
2321    if !extracted_pages.is_empty() {
2322        return Ok(Some(extracted_pages.join("\n\n")));
2323    }
2324
2325    if !page_errors.is_empty() {
2326        let sample_errors = page_errors
2327            .into_iter()
2328            .take(3)
2329            .collect::<Vec<_>>()
2330            .join("; ");
2331        return Err(format!(
2332            "lopdf could not extract usable page text ({sample_errors})"
2333        ));
2334    }
2335
2336    Ok(None)
2337}
2338
2339fn extract_pdf_text_inside_helper(path: &std::path::Path) -> Result<Option<String>, String> {
2340    let mut failures = Vec::new();
2341
2342    match extract_pdf_text_with_pdf_extract(path) {
2343        Ok(Some(text)) => return Ok(Some(text)),
2344        Ok(None) => failures.push("pdf-extract found no usable text".to_string()),
2345        Err(e) => failures.push(e),
2346    }
2347
2348    match extract_pdf_text_with_lopdf(path) {
2349        Ok(Some(text)) => return Ok(Some(text)),
2350        Ok(None) => failures.push("lopdf found no usable text".to_string()),
2351        Err(e) => failures.push(e),
2352    }
2353
2354    let detail = failures.into_iter().take(2).collect::<Vec<_>>().join("; ");
2355    Err(format!(
2356        "Could not extract text from PDF. Hematite keeps PDF parsing best-effort so it can stay a lightweight single-binary local coding harness. The file may be scanned/image-only, encrypted, or use unsupported font encoding. Try exporting it to text/markdown or attach page images instead. Detail: {}",
2357        detail
2358    ))
2359}
2360
/// Extract PDF text by re-running the current executable as a helper process.
///
/// The child is started with `--pdf-extract-helper <path>`, no stdin, and
/// piped stdout/stderr. A non-zero exit becomes `Err` carrying the child's
/// trimmed stderr (or a generic message when stderr is blank). On success the
/// child's stdout is returned as `Ok(Some(text))`, or `Ok(None)` when it is
/// only whitespace; stdout that is not valid UTF-8 is an error.
fn extract_pdf_text(path: &std::path::Path) -> Result<Option<String>, String> {
    let helper_exe = match std::env::current_exe() {
        Ok(exe) => exe,
        Err(e) => {
            return Err(format!(
                "Could not locate Hematite executable for PDF helper: {}",
                e
            ))
        }
    };

    let mut helper = std::process::Command::new(helper_exe);
    helper
        .arg("--pdf-extract-helper")
        .arg(path)
        .stdin(std::process::Stdio::null())
        .stdout(std::process::Stdio::piped())
        .stderr(std::process::Stdio::piped());
    let finished = helper
        .output()
        .map_err(|e| format!("Could not launch PDF helper: {}", e))?;

    if !finished.status.success() {
        let complaint = String::from_utf8_lossy(&finished.stderr);
        let complaint = complaint.trim();
        return Err(if complaint.is_empty() {
            "PDF extraction failed.".to_string()
        } else {
            complaint.to_string()
        });
    }

    match String::from_utf8(finished.stdout) {
        Ok(text) if text.trim().is_empty() => Ok(None),
        Ok(text) => Ok(Some(text)),
        Err(e) => Err(format!("PDF helper returned non-UTF8 text: {}", e)),
    }
}
2390
2391pub fn run_pdf_extract_helper(path: &std::path::Path) -> i32 {
2392    match extract_pdf_text_inside_helper(path) {
2393        Ok(Some(text)) => {
2394            use std::io::Write;
2395            let mut stdout = std::io::stdout();
2396            if stdout.write_all(text.as_bytes()).is_ok() {
2397                0
2398            } else {
2399                let _ = writeln!(
2400                    std::io::stderr(),
2401                    "PDF helper could not write extracted text."
2402                );
2403                1
2404            }
2405        }
2406        Ok(None) => {
2407            eprintln!(
2408                "Could not extract text from PDF. Hematite keeps PDF parsing best-effort so it can stay a lightweight single-binary local coding harness. The file appears to contain no usable embedded text. Try exporting it to text/markdown or attach page images instead."
2409            );
2410            1
2411        }
2412        Err(e) => {
2413            eprintln!("{}", e);
2414            1
2415        }
2416    }
2417}
2418
2419/// Extract text from any supported document type (PDF, markdown, plain text).
2420/// Used by /attach for one-shot context injection.
2421pub fn extract_document_text(path: &std::path::Path) -> Result<String, String> {
2422    let ext = path
2423        .extension()
2424        .and_then(|e| e.to_str())
2425        .unwrap_or("")
2426        .to_lowercase();
2427    match ext.as_str() {
2428        "pdf" => {
2429            let text = extract_pdf_text(path)?.ok_or_else(|| {
2430                "PDF contains no extractable text — it may be scanned/image-only. \
2431                     Try attaching page screenshots with /image instead."
2432                    .to_string()
2433            })?;
2434            pdf_quality_check(text)
2435        }
2436        _ => std::fs::read_to_string(path).map_err(|e| format!("Could not read file: {e}")),
2437    }
2438}
2439
/// Detect garbled PDF extraction — common with academic publisher PDFs that use
/// custom embedded fonts with non-standard glyph mappings.
///
/// Two heuristics are applied to the trimmed text:
/// 1. it must be at least 150 bytes long;
/// 2. ASCII spaces must make up at least 4% of its characters, not counting
///    `\n` and `\r`.
///
/// Returns the original, untrimmed text if both checks pass, or an informative
/// error describing which check failed.
fn pdf_quality_check(text: String) -> Result<String, String> {
    let body = text.trim();

    // Too little content to be useful.
    let body_len = body.len();
    if body_len < 150 {
        return Err(format!(
            "PDF extracted only {} characters — likely a scanned or image-only PDF, \
             or uses unsupported custom fonts. Try attaching page screenshots with /image instead.",
            body_len
        ));
    }

    // Normal prose is ~15–20% spaces; far fewer means glyphs are not mapping
    // to spaces and the words have been smashed together.
    let (visible, spaces) = body
        .chars()
        .fold((0usize, 0usize), |(visible, spaces), ch| match ch {
            '\n' | '\r' => (visible, spaces),
            ' ' => (visible + 1, spaces + 1),
            _ => (visible + 1, spaces),
        });
    let space_ratio = if visible == 0 {
        0.0
    } else {
        spaces as f32 / visible as f32
    };

    if space_ratio >= 0.04 {
        return Ok(text);
    }

    Err(
        "PDF text extraction produced garbled output — words are merged with no spaces. \
             This usually means the PDF uses custom embedded fonts (common with academic publishers \
             like EBSCO, Elsevier, Springer). \
             Try a PDF exported from Word, Google Docs, or LaTeX, \
             or attach page screenshots with /image instead.".to_string()
    )
}
2478
2479// ── Chunking strategies ───────────────────────────────────────────────────────
2480
2481/// Dispatch to the correct chunking strategy based on file extension.
2482fn chunk_by_symbols(ext: &str, text: &str) -> Vec<String> {
2483    if ext == "rs" {
2484        chunk_rust_symbols(text)
2485    } else {
2486        chunk_paragraphs(text)
2487    }
2488}
2489
2490/// Chunk Rust source at top-level item boundaries.
2491///
2492/// Detects lines at column 0 that start a Rust declaration keyword, flushes
2493/// the accumulated buffer, then moves any trailing doc-comments / attributes
2494/// forward so they stay with the item they annotate.
2495///
2496/// Items larger than 3000 chars (e.g. large impl blocks) are further split
2497/// by sliding window so no single chunk blows the retrieval budget.
2498fn chunk_rust_symbols(text: &str) -> Vec<String> {
2499    const ITEM_STARTS: &[&str] = &[
2500        "pub fn ",
2501        "pub async fn ",
2502        "pub unsafe fn ",
2503        "async fn ",
2504        "unsafe fn ",
2505        "fn ",
2506        "pub impl",
2507        "impl ",
2508        "pub struct ",
2509        "struct ",
2510        "pub enum ",
2511        "enum ",
2512        "pub trait ",
2513        "trait ",
2514        "pub mod ",
2515        "mod ",
2516        "pub type ",
2517        "type ",
2518        "pub const ",
2519        "const ",
2520        "pub static ",
2521        "static ",
2522    ];
2523
2524    let lines: Vec<&str> = text.lines().collect();
2525    let mut chunks: Vec<String> = Vec::new();
2526    let mut current: Vec<&str> = Vec::new();
2527
2528    for &line in &lines {
2529        let top_level = !line.starts_with(' ') && !line.starts_with('\t');
2530        let is_item = top_level && ITEM_STARTS.iter().any(|s| line.starts_with(s));
2531
2532        if is_item && !current.is_empty() {
2533            // Scan backward to find where trailing doc-comments / attributes start —
2534            // move them to the new chunk so they land with their item.
2535            let mut split = current.len();
2536            while split > 0 {
2537                let prev = current[split - 1].trim();
2538                if prev.starts_with("///")
2539                    || prev.starts_with("//!")
2540                    || prev.starts_with("#[")
2541                    || prev.is_empty()
2542                {
2543                    split -= 1;
2544                } else {
2545                    break;
2546                }
2547            }
2548            let body = current[..split].join("\n");
2549            if !body.trim().is_empty() {
2550                chunks.push(body);
2551            }
2552            current = current[split..].to_vec();
2553        }
2554        current.push(line);
2555    }
2556    if !current.is_empty() {
2557        let body = current.join("\n");
2558        if !body.trim().is_empty() {
2559            chunks.push(body);
2560        }
2561    }
2562
2563    // Subdivide any oversized blocks (e.g. long impl blocks with many methods).
2564    let mut result = Vec::new();
2565    for chunk in chunks {
2566        if chunk.len() > 3000 {
2567            result.extend(sliding_window_chunks(&chunk, 2000, 200));
2568        } else {
2569            result.push(chunk);
2570        }
2571    }
2572    result
2573}
2574
2575/// Chunk non-Rust text at paragraph boundaries (double newline).
2576fn chunk_paragraphs(text: &str) -> Vec<String> {
2577    let mut result: Vec<String> = Vec::new();
2578    let mut current = String::new();
2579
2580    for para in text.split("\n\n") {
2581        if current.len() + para.len() + 2 > 2000 {
2582            if !current.trim().is_empty() {
2583                result.push(current.clone());
2584            }
2585            current = para.to_string();
2586        } else {
2587            if !current.is_empty() {
2588                current.push_str("\n\n");
2589            }
2590            current.push_str(para);
2591        }
2592    }
2593    if !current.trim().is_empty() {
2594        result.push(current);
2595    }
2596
2597    let mut final_result = Vec::new();
2598    for chunk in result {
2599        if chunk.len() > 2000 {
2600            final_result.extend(sliding_window_chunks(&chunk, 2000, 200));
2601        } else {
2602            final_result.push(chunk);
2603        }
2604    }
2605    final_result
2606}
2607
/// Classic sliding-window fallback for oversized blocks.
///
/// Splits `text` into windows of at most `chunk_size` characters (not bytes),
/// where each window starts `chunk_size - overlap` characters after the
/// previous one. The last window ends exactly at the end of the text. Empty
/// input yields no chunks.
///
/// Degenerate parameters are clamped so the function always terminates: a
/// `chunk_size` of 0 is treated as 1, and when `overlap >= chunk_size` the
/// window advances by one character per step.
fn sliding_window_chunks(text: &str, chunk_size: usize, overlap: usize) -> Vec<String> {
    let chunk_size = chunk_size.max(1);
    // A zero (or underflowing) step would never advance the window.
    let step = chunk_size.saturating_sub(overlap).max(1);

    // Byte offset of every character start, plus the end of the text, so
    // windows can be sliced out of `text` without copying it into a Vec<char>.
    let mut bounds: Vec<usize> = text.char_indices().map(|(offset, _)| offset).collect();
    bounds.push(text.len());
    let char_count = bounds.len() - 1;

    let mut result = Vec::new();
    let mut start = 0usize;
    while start < char_count {
        let end = start.saturating_add(chunk_size).min(char_count);
        result.push(text[bounds[start]..bounds[end]].to_string());
        if end == char_count {
            break;
        }
        start += step;
    }
    result
}
hematite/memory/vein.rs

hematite/memory/
vein.rs