Skip to main content

hematite/memory/
vein.rs

1use rusqlite::{params, Connection};
2use serde::Deserialize;
3use serde_json::Value;
4use std::collections::{HashMap, HashSet};
5use std::path::Path;
6
7/// "The Vein" — local RAG memory engine backed by SQLite FTS5 + semantic embeddings.
8///
9/// Two retrieval modes, used together:
10///
11/// **BM25 (always available)**
12/// Full-text search via SQLite FTS5 with Porter-stemming. Fast, zero extra GPU cost,
13/// works as the fallback when the embedding model isn't loaded.
14///
15/// **Semantic (when LM Studio has an embedding model loaded)**
16/// Calls `/v1/embeddings` (nomic-embed-text-v1.5 or similar) to produce 768-dim float
17/// vectors for each chunk. At search time the query is embedded and cosine similarity
18/// selects the most conceptually relevant chunks — even when no keywords match.
19///
20/// Hybrid search runs BM25 and semantic in parallel, deduplicates by path, and returns
21/// the top-k results ranked by combined score. Semantic results score higher when the
22/// embedding model is available; BM25 fills the gap when it isn't.
23///
24/// Indexing is incremental: files are re-indexed only when their mtime changes. Embedding
25/// vectors are stored in a separate `chunks_vec` SQLite table so they survive re-runs
26/// without hitting the embedding API again.
27pub struct Vein {
28    db: std::sync::Arc<std::sync::Mutex<Connection>>,
29    /// Base URL of the LLM provider, used for the embeddings endpoint.
30    base_url: String,
31}
32
33// SAFETY: rusqlite::Connection is !Send by default, but we wrap it in Arc<Mutex>
34// and ensure all accesses are serialized by the mutex.
35unsafe impl Send for Vein {}
36unsafe impl Sync for Vein {}
37
/// A single retrieval hit returned by the BM25, semantic, or hybrid search paths.
#[derive(Debug, Clone)]
pub struct SearchResult {
    /// Indexed path of the document the chunk belongs to.
    pub path: String,
    /// Text of the matching chunk.
    pub content: String,
    /// Relevance score; larger means more relevant.
    pub score: f32,
    /// Subsystem room stored for the path (e.g. "agent", "ui", "tools").
    pub room: String,
    /// Last-modified timestamp recorded in `chunks_meta` (unix seconds).
    pub last_modified: i64,
    /// Memory type tagged at index time: "decision", "problem", "milestone",
    /// "preference", or "" for unclassified/source/doc chunks.
    pub memory_type: String,
}
52
/// A file together with its edit-heat counter.
///
/// NOTE(review): presumably populated from the `file_heat` table joined with
/// `chunks_meta`; the code that builds it is outside this view — confirm.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct VeinHotFile {
    /// Indexed path of the file.
    pub path: String,
    /// Heat counter for the file (higher = hotter).
    pub heat: i64,
    /// Last-modified timestamp (presumably unix seconds, as in `chunks_meta`).
    pub last_modified: i64,
    /// Subsystem room label for the file.
    pub room: String,
}
60
61#[derive(Debug, Clone)]
62pub struct VeinInspectionSnapshot {
63    pub indexed_source_files: usize,
64    pub indexed_docs: usize,
65    pub indexed_session_exchanges: usize,
66    pub embedded_source_doc_chunks: usize,
67    pub has_any_embeddings: bool,
68    pub active_room: Option<String>,
69    pub hot_files: Vec<VeinHotFile>,
70    pub l1_ready: bool,
71}
72
73#[derive(Debug, Default)]
74struct QuerySignals {
75    exact_phrases: Vec<String>,
76    standout_terms: Vec<String>,
77    historical_memory_hint: bool,
78    temporal_reference: Option<TemporalReference>,
79    /// Memory type the query is asking about — boosts matching chunks.
80    /// e.g. "what did we decide" → "decision", "what was the bug" → "problem"
81    query_memory_type: Option<&'static str>,
82}
83
/// A point in time referenced by a query, with a tolerance window around it.
#[derive(Debug, Clone, Copy)]
struct TemporalReference {
    /// Target timestamp (presumably unix seconds, like `last_modified` — confirm).
    target_ts: i64,
    /// Width of the accepted window around `target_ts`, in seconds.
    window_secs: i64,
}
89
90#[derive(Debug, Deserialize)]
91struct SessionReport {
92    #[serde(default)]
93    session_start: String,
94    #[serde(default)]
95    transcript: Vec<SessionTranscriptEntry>,
96}
97
98#[derive(Debug, Deserialize)]
99struct SessionTranscriptEntry {
100    #[serde(default)]
101    speaker: String,
102    #[serde(default)]
103    text: String,
104}
105
/// A session exchange prepared for indexing as its own document.
#[derive(Debug)]
struct SessionExchange {
    /// Path under which the exchange is indexed.
    path: String,
    /// Timestamp recorded for the exchange (presumably unix seconds — confirm).
    last_modified: i64,
    /// Text of the exchange.
    content: String,
}
112
/// Coarse classification of a transcript speaker.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum SessionSpeakerKind {
    /// The human operator.
    User,
    /// The assistant/model.
    Assistant,
    /// Any other speaker; the name suggests such entries are skipped — confirm at use site.
    Ignore,
}
119
/// Derive a subsystem room label from a file path.
///
/// The path is lower-cased and backslashes are normalized to `/`. A fixed set of
/// heuristics (directory components, well-known filenames, extensions) each propose a
/// room with a priority score; the highest-scoring proposal wins.
///
/// When no heuristic fires, the first directory component is returned. If the path
/// has no directory component, or that component is empty or contains a `.`, the
/// result is `"root"`.
///
/// Only text after the last `.` of the filename counts as an extension, so an
/// extension-less file that happens to be named `md` or `pdf` is not treated as a
/// document.
pub fn detect_room(path: &str) -> String {
    let lower = path.to_lowercase().replace('\\', "/");
    let filename = lower.rsplit('/').next().unwrap_or(&lower);
    // `rsplit('.').next()` would yield the whole filename when there is no dot and
    // misread e.g. a file called "md" as having the extension "md".
    let ext = filename.rsplit_once('.').map_or("", |(_, ext)| ext);

    let mut best_room = None::<&str>;
    let mut best_score = 0i32;
    // Keep the highest-priority proposal seen so far (all scores are positive).
    let mut consider = |room: &'static str, score: i32| {
        if score > best_score {
            best_score = score;
            best_room = Some(room);
        }
    };

    // True when `segment` is the whole path, its leading directory, or any inner
    // directory component.
    let is_component = |segment: &str| {
        lower == segment
            || lower.starts_with(&format!("{segment}/"))
            || lower.contains(&format!("/{segment}/"))
    };

    if lower.starts_with("session/")
        || lower.starts_with(".hematite/reports/")
        || lower.starts_with(".hematite/imports/")
        || is_component("reports")
        || is_component("imports")
    {
        consider("session", 100);
    }

    if lower.starts_with(".hematite/docs/")
        || is_component("docs")
        || matches!(filename, "readme.md" | "claude.md" | ".hematite.md")
        || matches!(ext, "md" | "markdown" | "pdf" | "rst")
    {
        consider("docs", 80);
    }

    if is_component("tests")
        || filename.contains("diagnostic")
        || filename.ends_with("_test.rs")
        || filename.ends_with(".test.ts")
    {
        consider("tests", 85);
    }

    if lower.starts_with(".github/workflows/")
        || is_component("workflows")
        || filename == ".pre-commit-config.yaml"
        || filename == ".pre-commit-config.yml"
        || filename.contains("hook")
    {
        consider("automation", 84);
    }

    if lower.starts_with("installer/")
        || lower.starts_with("dist/")
        || lower.starts_with("scripts/package-")
        || filename.contains("release")
        || filename.contains("bump-version")
        || ext == "iss"
    {
        consider("release", 82);
    }

    if matches!(
        filename,
        "cargo.toml"
            | "cargo.lock"
            | "package.json"
            | "pnpm-lock.yaml"
            | "yarn.lock"
            | "bun.lock"
            | "bun.lockb"
            | "pyproject.toml"
            | "setup.py"
            | "go.mod"
            | "pom.xml"
            | "build.gradle"
            | "build.gradle.kts"
            | "cmakelists.txt"
            | ".gitignore"
            | "settings.json"
            | "mcp_servers.json"
    ) || filename.ends_with(".sln")
        || filename.ends_with(".csproj")
        || filename.contains("config")
    {
        consider("config", 76);
    }

    if is_component("ui")
        || matches!(
            filename,
            "tui.rs" | "voice.rs" | "hatch.rs" | "gpu_monitor.rs"
        )
    {
        consider("ui", 70);
    }

    if is_component("memory") || matches!(filename, "vein.rs" | "deep_reflect.rs") {
        consider("memory", 72);
    }

    if is_component("tools")
        || matches!(
            filename,
            "verify_build.rs"
                | "host_inspect.rs"
                | "shell.rs"
                | "code_sandbox.rs"
                | "project_map.rs"
                | "runtime_trace.rs"
        )
    {
        consider("tools", 68);
    }

    if filename.contains("mcp")
        || filename.contains("lsp")
        || lower.contains("/mcp/")
        || lower.contains("/lsp/")
    {
        consider("integration", 67);
    }

    if matches!(filename, "main.rs" | "runtime.rs" | "inference.rs")
        || filename.contains("startup")
        || filename.contains("runtime")
    {
        consider("runtime", 66);
    }

    if is_component("agent") {
        consider("agent", 60);
    }

    if lower.starts_with("libs/") || is_component("libs") {
        consider("libs", 58);
    }

    if lower.starts_with("scripts/") || is_component("scripts") {
        consider("scripts", 55);
    }

    if let Some(room) = best_room {
        return room.to_string();
    }

    // Fall back to the first *directory* component. A bare filename has no
    // directory, so it belongs to "root" rather than becoming its own room.
    match lower.split_once('/') {
        Some((first, _)) if !first.is_empty() && !first.contains('.') => first.to_string(),
        _ => "root".to_string(),
    }
}
280
/// Classify session memory text into a semantic type using cheap substring checks
/// on the lower-cased text (no regex involved).
///
/// Returns one of `"decision"`, `"problem"`, `"milestone"`, `"preference"`, or `""`
/// when nothing matches. Categories are tested in that order and the first hit wins.
///
/// Applied only to session/import chunks at index time. Source and doc chunks always get "".
/// Used by reranking to boost chunks whose type matches the query's implied intent.
pub fn detect_memory_type(text: &str) -> &'static str {
    // Decision markers — architectural choices, agreed approaches, "let's use X"
    const DECISION_PATTERNS: &[&str] = &[
        "let's use ",
        "we'll use ",
        "decided to ",
        "going with ",
        "we agreed ",
        "the plan is",
        "we're going to",
        "switching to",
        "we chose",
        "final decision",
        "we settled on",
        "agreed on",
        "we decided",
    ];
    // Problem markers — bugs, errors, failures, blockers
    const PROBLEM_PATTERNS: &[&str] = &[
        "bug fixed",
        "bug was",
        "the issue was",
        "root cause",
        "error was",
        "turned out to be",
        "the fix was",
        "was caused by",
        "broken because",
        "fixed by",
        "the problem was",
        "found the bug",
        "port conflict",
        "crash",
        "panicked",
        "segfault",
        "out of memory",
    ];
    // Milestone markers — shipped, completed, working
    const MILESTONE_PATTERNS: &[&str] = &[
        "now working",
        "successfully",
        "shipped",
        "deployed",
        "it works",
        "tests pass",
        "all green",
        "breakthrough",
        "finally got",
        "got it working",
        "completed",
        "finished",
        "done with",
        "landed",
    ];
    // Preference markers — personal/operator preferences for style or workflow
    const PREFERENCE_PATTERNS: &[&str] = &[
        "i prefer",
        "i like",
        "i don't like",
        "i want",
        "always use",
        "never use",
        "i usually",
        "my preference",
        "keep it",
        "avoid using",
    ];

    let lower = text.to_lowercase();
    let contains_any = |patterns: &[&str]| patterns.iter().any(|p| lower.contains(*p));
    // "oom" must start a word ("oom", "oomkilled", "oom-killer"); a plain substring
    // test would also fire on "room", "zoom" or "bloom".
    let mentions_oom = || {
        lower
            .split(|c: char| !c.is_alphanumeric())
            .any(|word| word.starts_with("oom"))
    };

    if contains_any(DECISION_PATTERNS) {
        return "decision";
    }
    if contains_any(PROBLEM_PATTERNS) || mentions_oom() {
        return "problem";
    }
    if contains_any(MILESTONE_PATTERNS) {
        return "milestone";
    }
    if contains_any(PREFERENCE_PATTERNS) {
        return "preference";
    }
    ""
}
382
383impl Vein {
384    const SESSION_REPORT_LIMIT: usize = 5;
385    const SESSION_TURN_LIMIT: usize = 50;
386    const IMPORT_FILE_LIMIT: usize = 12;
387    const IMPORT_MAX_BYTES: u64 = 10 * 1024 * 1024;
388
389    pub fn new<P: AsRef<Path>>(
390        db_path: P,
391        base_url: String,
392    ) -> Result<Self, Box<dyn std::error::Error>> {
393        let db = Connection::open(db_path)?;
394
395        // WAL mode for better concurrent read performance.
396        db.execute_batch("PRAGMA journal_mode=WAL; PRAGMA synchronous=NORMAL;")?;
397
398        // chunks_meta: tracks last-modified time per path for incremental indexing.
399        // chunks_fts:  BM25 full-text index of all code chunks.
400        // chunks_vec:  semantic embedding vectors, keyed by (path, chunk_idx).
401        db.execute_batch(
402            "CREATE TABLE IF NOT EXISTS chunks_meta (
403                path TEXT PRIMARY KEY,
404                last_modified INTEGER NOT NULL,
405                room TEXT NOT NULL DEFAULT 'root'
406            );
407            CREATE VIRTUAL TABLE IF NOT EXISTS chunks_fts USING fts5(
408                path UNINDEXED,
409                content,
410                tokenize='porter ascii'
411            );
412            CREATE TABLE IF NOT EXISTS chunks_vec (
413                path TEXT NOT NULL,
414                chunk_idx INTEGER NOT NULL,
415                embedding BLOB NOT NULL,
416                PRIMARY KEY (path, chunk_idx)
417            );
418            CREATE TABLE IF NOT EXISTS file_heat (
419                path TEXT PRIMARY KEY,
420                heat INTEGER NOT NULL DEFAULT 0,
421                last_edit INTEGER NOT NULL DEFAULT 0
422            );",
423        )?;
424
425        // Schema migrations — safe to run on every open (IF NOT EXISTS / ignored if col exists).
426        let _ = db
427            .execute_batch("ALTER TABLE chunks_meta ADD COLUMN room TEXT NOT NULL DEFAULT 'root';");
428        let _ = db.execute_batch(
429            "ALTER TABLE file_heat ADD COLUMN last_edit INTEGER NOT NULL DEFAULT 0;",
430        );
431        let _ = db.execute_batch(
432            "ALTER TABLE chunks_meta ADD COLUMN memory_type TEXT NOT NULL DEFAULT '';",
433        );
434
435        Ok(Self {
436            db: std::sync::Arc::new(std::sync::Mutex::new(db)),
437            base_url,
438        })
439    }
440
441    // ── Indexing ──────────────────────────────────────────────────────────────
442
443    /// Index a single file for BM25 search. Skip if mtime hasn't changed.
444    /// Returns the chunks that were written (empty if file was unchanged).
445    pub fn index_document(
446        &mut self,
447        path: &str,
448        last_modified: i64,
449        full_text: &str,
450    ) -> Result<Vec<String>, Box<dyn std::error::Error>> {
451        let room = detect_room(path);
452        let ext = std::path::Path::new(path)
453            .extension()
454            .and_then(|e| e.to_str())
455            .unwrap_or("");
456        let chunks = chunk_by_symbols(ext, full_text);
457        // Tag session memory with semantic type; source/doc chunks leave it empty.
458        let memory_type = if room == "session" {
459            detect_memory_type(full_text)
460        } else {
461            ""
462        };
463        self.index_chunks_with_room_and_type(path, last_modified, &room, memory_type, &chunks)
464    }
465
466    fn index_chunks_with_room_and_type(
467        &mut self,
468        path: &str,
469        last_modified: i64,
470        room: &str,
471        memory_type: &str,
472        chunks: &[String],
473    ) -> Result<Vec<String>, Box<dyn std::error::Error>> {
474        let db = self.db.lock().unwrap();
475        let existing: Option<i64> = db
476            .query_row(
477                "SELECT last_modified FROM chunks_meta WHERE path = ?1",
478                params![path],
479                |r| r.get(0),
480            )
481            .ok();
482
483        if let Some(ts) = existing {
484            if ts >= last_modified {
485                return Ok(Vec::new()); // unchanged — skip
486            }
487        }
488
489        // Evict stale BM25 chunks, stale embedding vectors, then update metadata.
490        db.execute("DELETE FROM chunks_fts WHERE path = ?1", params![path])?;
491        db.execute("DELETE FROM chunks_vec WHERE path = ?1", params![path])?;
492        db.execute(
493            "INSERT OR REPLACE INTO chunks_meta (path, last_modified, room, memory_type) VALUES (?1, ?2, ?3, ?4)",
494            params![path, last_modified, room, memory_type],
495        )?;
496
497        drop(db);
498
499        let mut db = self.db.lock().unwrap();
500        let tx = db.transaction()?;
501        {
502            let mut stmt = tx.prepare("INSERT INTO chunks_fts (path, content) VALUES (?1, ?2)")?;
503            for chunk in chunks {
504                stmt.execute(params![path, chunk.as_str()])?;
505            }
506        }
507        tx.commit()?;
508
509        Ok(chunks.to_vec())
510    }
511
512    /// Embed a set of chunks for one file and store the vectors.
513    /// Called after `index_document` returns new chunks.
514    /// Silently skips if the embedding model is unavailable.
515    pub fn embed_and_store_chunks(&self, path: &str, chunks: &[String]) {
516        for (idx, chunk) in chunks.iter().enumerate() {
517            if let Some(vec) = embed_text_blocking(chunk, &self.base_url) {
518                let blob = floats_to_blob(&vec);
519                let db = self.db.lock().unwrap();
520                let _ = db.execute(
521                    "INSERT OR REPLACE INTO chunks_vec (path, chunk_idx, embedding) VALUES (?1, ?2, ?3)",
522                    params![path, idx as i64, blob],
523                );
524            }
525        }
526    }
527
528    // ── Search ────────────────────────────────────────────────────────────────
529
530    /// BM25-ranked full-text search via FTS5 MATCH.
531    pub fn search_bm25(
532        &self,
533        query: &str,
534        limit: usize,
535    ) -> Result<Vec<SearchResult>, Box<dyn std::error::Error>> {
536        // Strip common English stopwords so FTS5 MATCH gets meaningful tokens only.
537        // FTS5 uses implicit AND by default — passing stopwords like "how", "does",
538        // "the" causes zero results because source code never contains those phrases.
539        const STOPWORDS: &[&str] = &[
540            "how", "does", "do", "did", "what", "where", "when", "why", "which", "who", "is",
541            "are", "was", "were", "be", "been", "being", "have", "has", "had", "a", "an", "the",
542            "and", "or", "but", "in", "on", "at", "to", "for", "of", "with", "by", "from", "get",
543            "gets", "got", "work", "works", "make", "makes", "use", "uses", "into", "that", "this",
544            "it", "its",
545        ];
546
547        let safe_query: String = query
548            .chars()
549            .map(|c| {
550                if c.is_alphanumeric() || c == ' ' || c == '_' {
551                    c
552                } else {
553                    ' '
554                }
555            })
556            .collect();
557
558        // Build an OR query from non-stopword tokens so any relevant term matches.
559        let fts_query = safe_query
560            .split_whitespace()
561            .filter(|w| w.len() >= 3 && !STOPWORDS.contains(&w.to_lowercase().as_str()))
562            .collect::<Vec<_>>()
563            .join(" OR ");
564
565        if fts_query.is_empty() {
566            return Ok(Vec::new());
567        }
568
569        let db = self.db.lock().unwrap();
570        let mut stmt = db.prepare(
571            "SELECT chunks_fts.path, chunks_fts.content, rank, cm.last_modified, cm.room, cm.memory_type
572             FROM chunks_fts
573             JOIN chunks_meta cm ON cm.path = chunks_fts.path
574             WHERE chunks_fts MATCH ?1
575             ORDER BY rank
576             LIMIT ?2",
577        )?;
578
579        let results: Vec<SearchResult> = stmt
580            .query_map(params![fts_query, limit as i64], |row| {
581                Ok(SearchResult {
582                    path: row.get(0)?,
583                    content: row.get(1)?,
584                    score: -(row.get::<_, f64>(2).unwrap_or(0.0) as f32),
585                    last_modified: row.get(3)?,
586                    room: row.get(4)?,
587                    memory_type: row.get::<_, String>(5).unwrap_or_default(),
588                })
589            })?
590            .filter_map(|r| r.ok())
591            .collect();
592
593        Ok(results)
594    }
595
596    /// Semantic search: embed the query, cosine-similarity against all stored vectors.
597    /// Returns empty if the embedding model isn't loaded.
598    pub fn search_semantic(&self, query: &str, limit: usize) -> Vec<SearchResult> {
599        let query_vec = match embed_query_blocking(query, &self.base_url) {
600            Some(v) => v,
601            None => return Vec::new(),
602        };
603
604        // Load all stored embeddings.
605        let rows: Vec<(String, i64, Vec<u8>, i64, String, String)> = {
606            let db = self.db.lock().unwrap();
607            let mut stmt = match db.prepare(
608                "SELECT cv.path, cv.chunk_idx, cv.embedding, cm.last_modified, cm.room, cm.memory_type
609                 FROM chunks_vec cv
610                 JOIN chunks_meta cm ON cm.path = cv.path",
611            ) {
612                Ok(s) => s,
613                Err(_) => return Vec::new(),
614            };
615            stmt.query_map([], |row| {
616                Ok((
617                    row.get::<_, String>(0)?,
618                    row.get::<_, i64>(1)?,
619                    row.get::<_, Vec<u8>>(2)?,
620                    row.get::<_, i64>(3)?,
621                    row.get::<_, String>(4)?,
622                    row.get::<_, String>(5).unwrap_or_default(),
623                ))
624            })
625            .ok()
626            .map(|rows| rows.filter_map(|r| r.ok()).collect())
627            .unwrap_or_default()
628        };
629
630        if rows.is_empty() {
631            return Vec::new();
632        }
633
634        // Score each chunk.
635        let mut scored: Vec<(f32, String, i64, i64, String, String)> = rows
636            .into_iter()
637            .filter_map(|(path, idx, blob, last_modified, room, memory_type)| {
638                let vec = blob_to_floats(&blob);
639                let sim = cosine_similarity(&query_vec, &vec);
640                Some((sim, path, idx, last_modified, room, memory_type))
641            })
642            .collect();
643
644        scored.sort_by(|a, b| b.0.partial_cmp(&a.0).unwrap_or(std::cmp::Ordering::Equal));
645        scored.truncate(limit);
646
647        // Fetch the content for the top chunks.
648        let db = self.db.lock().unwrap();
649        scored
650            .into_iter()
651            .filter_map(|(score, path, idx, last_modified, room, memory_type)| {
652                let content: Option<String> = db
653                    .query_row(
654                        "SELECT content FROM chunks_fts WHERE path = ?1 LIMIT 1 OFFSET ?2",
655                        params![path, idx],
656                        |r| r.get(0),
657                    )
658                    .ok();
659                content.map(|c| SearchResult {
660                    path,
661                    content: c,
662                    score,
663                    room,
664                    last_modified,
665                    memory_type,
666                })
667            })
668            .collect()
669    }
670
671    /// Hybrid search: BM25 + semantic, deduplicated and re-ranked.
672    ///
673    /// Semantic results are preferred (they score higher) when the embedding model
674    /// is available. BM25 fills in or takes over when it isn't.
675    /// Results from the active room (hottest subsystem by edit count) get a
676    /// small boost so the model gravitates toward what's currently being worked on.
677    pub fn search_context(
678        &self,
679        query: &str,
680        limit: usize,
681    ) -> Result<Vec<SearchResult>, Box<dyn std::error::Error>> {
682        let candidate_limit = (limit.max(1) * 4).max(12);
683        let bm25 = self.search_bm25(query, candidate_limit).unwrap_or_default();
684        let semantic = self.search_semantic(query, candidate_limit);
685        let signals = QuerySignals::from_query(query);
686
687        // Determine the active room from heat scores.
688        let active_room = self.active_room();
689
690        // Merge: semantic results win ties (scored 1.0–2.0 range after boost).
691        // BM25 results land in 0.0–1.0 range.
692        let mut merged_by_path: HashMap<String, SearchResult> = HashMap::new();
693
694        for r in semantic {
695            let score = reranked_score(&signals, active_room.as_deref(), &r, true);
696            merge_scored_result(&mut merged_by_path, SearchResult { score, ..r });
697        }
698
699        for r in bm25 {
700            let score = reranked_score(&signals, active_room.as_deref(), &r, false);
701            merge_scored_result(&mut merged_by_path, SearchResult { score, ..r });
702        }
703
704        let mut merged: Vec<SearchResult> = merged_by_path.into_values().collect();
705        merged.sort_by(|a, b| {
706            b.score
707                .partial_cmp(&a.score)
708                .unwrap_or(std::cmp::Ordering::Equal)
709        });
710        merged.truncate(limit);
711        Ok(merged)
712    }
713
714    /// Returns the room with the highest total heat (most edited subsystem).
715    /// Used to bias retrieval toward what the user is actively working on.
716    fn active_room(&self) -> Option<String> {
717        let db = self.db.lock().unwrap();
718        db.query_row(
719            "SELECT cm.room, SUM(fh.heat) as total
720             FROM file_heat fh
721             JOIN chunks_meta cm ON cm.path = fh.path
722             GROUP BY cm.room
723             ORDER BY total DESC
724             LIMIT 1",
725            [],
726            |row| row.get::<_, String>(0),
727        )
728        .ok()
729    }
730
731    // ── Project Indexing ──────────────────────────────────────────────────────
732
733    /// Walk the entire project and index all source files (BM25 + embeddings).
734    ///
735    /// Skips: `target/`, `.git/`, `node_modules/`, `.hematite/`, files > 512 KB.
736    /// Also indexes `.hematite/docs/` — the designated reference document drop folder.
737    /// Returns the number of files processed (unchanged files are fast-pathed).
738    pub fn index_project(&mut self) -> usize {
739        let root = crate::tools::file_ops::workspace_root();
740        let mut count = 0usize;
741
742        const INDEXABLE: &[&str] = &[
743            "rs", "toml", "md", "json", "ts", "tsx", "js", "py", "go", "c", "cpp", "h", "yaml",
744            "yml", "txt",
745        ];
746        const SKIP_DIRS: &[&str] = &["target", ".git", "node_modules", ".hematite"];
747
748        for entry in walkdir::WalkDir::new(&root)
749            .follow_links(false)
750            .into_iter()
751            .filter_entry(|e| {
752                if e.file_type().is_dir() {
753                    let name = e.file_name().to_string_lossy();
754                    return !SKIP_DIRS.contains(&name.as_ref());
755                }
756                true
757            })
758            .filter_map(|e| e.ok())
759            .filter(|e| e.file_type().is_file())
760        {
761            let path = entry.path();
762            let ext = path.extension().and_then(|e| e.to_str()).unwrap_or("");
763            if !INDEXABLE.contains(&ext) {
764                continue;
765            }
766
767            let Ok(meta) = std::fs::metadata(path) else {
768                continue;
769            };
770            if meta.len() > 512_000 {
771                continue;
772            }
773
774            let mtime = meta
775                .modified()
776                .map(|t| {
777                    t.duration_since(std::time::UNIX_EPOCH)
778                        .unwrap_or_default()
779                        .as_secs() as i64
780                })
781                .unwrap_or(0);
782
783            let rel = path.strip_prefix(&root).unwrap_or(path);
784            let rel_str = rel.to_string_lossy().replace('\\', "/");
785
786            if let Ok(content) = std::fs::read_to_string(path) {
787                match self.index_document(&rel_str, mtime, &content) {
788                    Ok(new_chunks) if !new_chunks.is_empty() => {
789                        count += 1;
790                    }
791                    Ok(_) => {}
792                    Err(_) => {}
793                }
794            }
795        }
796
797        count += self.index_workspace_artifacts(&root);
798
799        count
800    }
801
802    /// Index workspace-local supporting context that should be available even
803    /// outside a real project workspace: `.hematite/docs/`, recent session
804    /// reports stored in `.hematite/reports/`, and imported chat exports in
805    /// `.hematite/imports/`.
806    pub fn index_workspace_artifacts(&mut self, workspace_root: &std::path::Path) -> usize {
807        let mut count = self.index_docs_folder(workspace_root);
808        count += self.index_recent_session_reports(workspace_root);
809        count += self.index_imported_session_exports(workspace_root);
810        self.backfill_missing_embeddings();
811        count
812    }
813
814    /// Index reference documents in `.hematite/docs/`.
815    /// Supports PDF (text extraction), markdown, and plain text.
816    /// Documents are stored with path prefix `docs/filename` so they are
817    /// distinguishable from source files in retrieval results.
818    fn index_docs_folder(&mut self, workspace_root: &std::path::Path) -> usize {
819        let docs_dir = workspace_root.join(".hematite").join("docs");
820        const DOCS_INDEXABLE: &[&str] = &["pdf", "md", "txt", "markdown"];
821        let mut count = 0usize;
822        let mut desired_paths = HashSet::new();
823
824        if docs_dir.exists() {
825            for entry in walkdir::WalkDir::new(&docs_dir)
826                .max_depth(3)
827                .follow_links(false)
828                .into_iter()
829                .filter_map(|e| e.ok())
830                .filter(|e| e.file_type().is_file())
831            {
832                let path = entry.path();
833                let ext = path
834                    .extension()
835                    .and_then(|e| e.to_str())
836                    .unwrap_or("")
837                    .to_lowercase();
838                if !DOCS_INDEXABLE.contains(&ext.as_str()) {
839                    continue;
840                }
841
842                let Ok(meta) = std::fs::metadata(path) else {
843                    continue;
844                };
845                if meta.len() > 50_000_000 {
846                    continue;
847                }
848
849                let mtime = meta
850                    .modified()
851                    .map(|t| {
852                        t.duration_since(std::time::UNIX_EPOCH)
853                            .unwrap_or_default()
854                            .as_secs() as i64
855                    })
856                    .unwrap_or(0);
857
858                let rel = path.strip_prefix(workspace_root).unwrap_or(path);
859                let rel_str = rel.to_string_lossy().replace('\\', "/");
860                desired_paths.insert(rel_str.clone());
861
862                let content = if ext == "pdf" {
863                    extract_pdf_text(path).ok().flatten()
864                } else {
865                    std::fs::read_to_string(path).ok()
866                };
867
868                if let Some(text) = content {
869                    if text.trim().is_empty() {
870                        continue;
871                    }
872                    match self.index_document(&rel_str, mtime, &text) {
873                        Ok(new_chunks) if !new_chunks.is_empty() => {
874                            count += 1;
875                        }
876                        Ok(_) => {}
877                        Err(_) => {}
878                    }
879                }
880            }
881        }
882
883        self.prune_indexed_prefix(".hematite/docs/", &desired_paths);
884        count
885    }
886
887    /// Index the most recent local session reports by exchange pair so prior
888    /// decisions remain searchable across launches without flooding the vein.
889    pub fn index_recent_session_reports(&mut self, workspace_root: &std::path::Path) -> usize {
890        let reports_dir = workspace_root.join(".hematite").join("reports");
891        let mut count = 0usize;
892        let mut desired_paths = HashSet::new();
893
894        if reports_dir.exists() {
895            let mut reports: Vec<std::path::PathBuf> = std::fs::read_dir(&reports_dir)
896                .ok()
897                .into_iter()
898                .flat_map(|entries| entries.filter_map(|entry| entry.ok()))
899                .map(|entry| entry.path())
900                .filter(|path| {
901                    path.is_file()
902                        && path.extension().and_then(|ext| ext.to_str()) == Some("json")
903                        && path
904                            .file_stem()
905                            .and_then(|stem| stem.to_str())
906                            .map(|stem| stem.starts_with("session_"))
907                            .unwrap_or(false)
908                })
909                .collect();
910
911            reports.sort_by(|a, b| {
912                let a_name = a
913                    .file_name()
914                    .and_then(|name| name.to_str())
915                    .unwrap_or_default();
916                let b_name = b
917                    .file_name()
918                    .and_then(|name| name.to_str())
919                    .unwrap_or_default();
920                b_name.cmp(a_name)
921            });
922            reports.truncate(Self::SESSION_REPORT_LIMIT);
923
924            for report_path in reports {
925                let Ok(meta) = std::fs::metadata(&report_path) else {
926                    continue;
927                };
928                let mtime = meta
929                    .modified()
930                    .map(|t| {
931                        t.duration_since(std::time::UNIX_EPOCH)
932                            .unwrap_or_default()
933                            .as_secs() as i64
934                    })
935                    .unwrap_or(0);
936
937                for exchange in load_session_exchanges(&report_path, mtime) {
938                    desired_paths.insert(exchange.path.clone());
939                    let mtype = detect_memory_type(&exchange.content);
940                    match self.index_chunks_with_room_and_type(
941                        &exchange.path,
942                        exchange.last_modified,
943                        "session",
944                        mtype,
945                        std::slice::from_ref(&exchange.content),
946                    ) {
947                        Ok(new_chunks) if !new_chunks.is_empty() => {
948                            count += 1;
949                        }
950                        Ok(_) => {}
951                        Err(_) => {}
952                    }
953                }
954            }
955        }
956
957        self.prune_indexed_prefix("session/", &desired_paths);
958        count
959    }
960
961    /// Index imported chat exports from `.hematite/imports/`.
962    /// Supported inputs include already-normalized `>` transcripts, Claude Code
963    /// JSONL, Codex CLI JSONL, simple role/content JSON exports, ChatGPT
964    /// `mapping` exports, and Hematite session-report JSON.
965    pub fn index_imported_session_exports(&mut self, workspace_root: &std::path::Path) -> usize {
966        let imports_dir = workspace_root.join(".hematite").join("imports");
967        let mut count = 0usize;
968        let mut desired_paths = HashSet::new();
969
970        if imports_dir.exists() {
971            let mut imports: Vec<(std::path::PathBuf, i64)> = walkdir::WalkDir::new(&imports_dir)
972                .max_depth(4)
973                .follow_links(false)
974                .into_iter()
975                .filter_map(|entry| entry.ok())
976                .filter(|entry| entry.file_type().is_file())
977                .filter_map(|entry| {
978                    let path = entry.into_path();
979                    let ext = path
980                        .extension()
981                        .and_then(|ext| ext.to_str())
982                        .unwrap_or("")
983                        .to_ascii_lowercase();
984                    if !matches!(ext.as_str(), "json" | "jsonl" | "md" | "txt") {
985                        return None;
986                    }
987                    let meta = std::fs::metadata(&path).ok()?;
988                    if meta.len() > Self::IMPORT_MAX_BYTES {
989                        return None;
990                    }
991                    let mtime = meta
992                        .modified()
993                        .map(|t| {
994                            t.duration_since(std::time::UNIX_EPOCH)
995                                .unwrap_or_default()
996                                .as_secs() as i64
997                        })
998                        .unwrap_or(0);
999                    Some((path, mtime))
1000                })
1001                .collect();
1002
1003            imports.sort_by(|(a_path, a_mtime), (b_path, b_mtime)| {
1004                b_mtime
1005                    .cmp(a_mtime)
1006                    .then_with(|| a_path.to_string_lossy().cmp(&b_path.to_string_lossy()))
1007            });
1008            imports.truncate(Self::IMPORT_FILE_LIMIT);
1009
1010            for (import_path, mtime) in imports {
1011                for exchange in load_imported_session_exchanges(&import_path, &imports_dir, mtime) {
1012                    desired_paths.insert(exchange.path.clone());
1013                    let mtype = detect_memory_type(&exchange.content);
1014                    match self.index_chunks_with_room_and_type(
1015                        &exchange.path,
1016                        exchange.last_modified,
1017                        "session",
1018                        mtype,
1019                        std::slice::from_ref(&exchange.content),
1020                    ) {
1021                        Ok(new_chunks) if !new_chunks.is_empty() => {
1022                            count += 1;
1023                        }
1024                        Ok(_) => {}
1025                        Err(_) => {}
1026                    }
1027                }
1028            }
1029        }
1030
1031        self.prune_indexed_prefix("session/imports/", &desired_paths);
1032        count
1033    }
1034
1035    /// Embed any FTS chunks that don't yet have a vector in chunks_vec.
1036    /// Called at the end of index_project so that loading the embedding model
1037    /// after the initial index automatically triggers a semantic upgrade on the
1038    /// next agent turn — no /forget or file-touch required.
1039    fn backfill_missing_embeddings(&self) {
1040        // Fast path: if chunk counts match, nothing to do.
1041        let (fts_count, vec_count) = {
1042            let db = self.db.lock().unwrap();
1043            let fts: i64 = db
1044                .query_row("SELECT COUNT(*) FROM chunks_fts", [], |r| r.get(0))
1045                .unwrap_or(0);
1046            let vec: i64 = db
1047                .query_row("SELECT COUNT(*) FROM chunks_vec", [], |r| r.get(0))
1048                .unwrap_or(0);
1049            (fts, vec)
1050        };
1051        if fts_count == 0 || fts_count == vec_count {
1052            return;
1053        }
1054
1055        // Fetch (path, chunk_idx, content) for chunks with no embedding.
1056        // chunks_fts rowid serves as chunk_idx (1-based → convert to 0-based).
1057        let missing: Vec<(String, i64, String)> = {
1058            let db = self.db.lock().unwrap();
1059            let mut stmt = db
1060                .prepare(
1061                    "SELECT f.path, (f.rowid - 1) AS chunk_idx, f.content
1062                     FROM chunks_fts f
1063                     LEFT JOIN chunks_vec v ON f.path = v.path AND (f.rowid - 1) = v.chunk_idx
1064                     WHERE v.path IS NULL
1065                     ORDER BY CASE
1066                         WHEN f.path LIKE '%.rs' THEN 0
1067                         WHEN f.path LIKE '%.toml' THEN 1
1068                         WHEN f.path LIKE '%.json' THEN 2
1069                         ELSE 3
1070                     END, f.path
1071                     LIMIT 20",
1072                )
1073                .unwrap();
1074            stmt.query_map([], |r| {
1075                Ok((
1076                    r.get::<_, String>(0)?,
1077                    r.get::<_, i64>(1)?,
1078                    r.get::<_, String>(2)?,
1079                ))
1080            })
1081            .unwrap()
1082            .filter_map(|r| r.ok())
1083            .collect()
1084        };
1085
1086        for (path, idx, content) in missing {
1087            if let Some(vec) = embed_text_blocking(&content, &self.base_url) {
1088                let blob = floats_to_blob(&vec);
1089                let db = self.db.lock().unwrap();
1090                let _ = db.execute(
1091                    "INSERT OR REPLACE INTO chunks_vec (path, chunk_idx, embedding) VALUES (?1, ?2, ?3)",
1092                    params![path, idx, blob],
1093                );
1094            } else {
1095                // Embedding model not available — stop trying for this pass.
1096                break;
1097            }
1098        }
1099    }
1100
1101    /// Total number of unique files currently indexed.
1102    /// Session exchange chunks are excluded so status counts stay source/doc centric.
1103    pub fn file_count(&self) -> usize {
1104        let db = self.db.lock().unwrap();
1105        db.query_row(
1106            "SELECT COUNT(*) FROM chunks_meta WHERE path NOT LIKE 'session/%'",
1107            [],
1108            |r| r.get::<_, i64>(0),
1109        )
1110        .unwrap_or(0) as usize
1111    }
1112
1113    /// Number of source/doc chunks that have semantic embedding vectors stored.
1114    /// Session exchange chunks are excluded so status counts stay source/doc centric.
1115    pub fn embedded_chunk_count(&self) -> usize {
1116        let db = self.db.lock().unwrap();
1117        db.query_row(
1118            "SELECT COUNT(*) FROM chunks_vec WHERE path NOT LIKE 'session/%'",
1119            [],
1120            |r| r.get::<_, i64>(0),
1121        )
1122        .unwrap_or(0) as usize
1123    }
1124
1125    /// True when any chunk type currently has embeddings available.
1126    pub fn has_any_embeddings(&self) -> bool {
1127        let db = self.db.lock().unwrap();
1128        db.query_row("SELECT EXISTS(SELECT 1 FROM chunks_vec LIMIT 1)", [], |r| {
1129            r.get::<_, i64>(0)
1130        })
1131        .unwrap_or(0)
1132            != 0
1133    }
1134
1135    /// Wipe all indexed data. The DB file stays on disk; next index_project()
1136    /// call rebuilds from scratch (re-reads all files, re-embeds all chunks).
1137    pub fn reset(&self) {
1138        let db = self.db.lock().unwrap();
1139        let _ = db.execute_batch(
1140            "DELETE FROM chunks_fts;
1141             DELETE FROM chunks_vec;
1142             DELETE FROM chunks_meta;",
1143        );
1144    }
1145
1146    /// Return a compact operator-facing snapshot of what The Vein currently knows.
1147    /// Intended for trust/debug surfaces like `/vein-inspect`.
1148    pub fn inspect_snapshot(&self, hot_limit: usize) -> VeinInspectionSnapshot {
1149        let db = self.db.lock().unwrap();
1150        let indexed_source_files = db
1151            .query_row(
1152                "SELECT COUNT(*) FROM chunks_meta
1153                 WHERE path NOT LIKE 'session/%'
1154                   AND path NOT LIKE '.hematite/docs/%'",
1155                [],
1156                |r| r.get::<_, i64>(0),
1157            )
1158            .unwrap_or(0) as usize;
1159        let indexed_docs = db
1160            .query_row(
1161                "SELECT COUNT(*) FROM chunks_meta WHERE path LIKE '.hematite/docs/%'",
1162                [],
1163                |r| r.get::<_, i64>(0),
1164            )
1165            .unwrap_or(0) as usize;
1166        let indexed_session_exchanges = db
1167            .query_row(
1168                "SELECT COUNT(*) FROM chunks_meta WHERE path LIKE 'session/%'",
1169                [],
1170                |r| r.get::<_, i64>(0),
1171            )
1172            .unwrap_or(0) as usize;
1173        let embedded_source_doc_chunks = db
1174            .query_row(
1175                "SELECT COUNT(*) FROM chunks_vec WHERE path NOT LIKE 'session/%'",
1176                [],
1177                |r| r.get::<_, i64>(0),
1178            )
1179            .unwrap_or(0) as usize;
1180        let has_any_embeddings = db
1181            .query_row("SELECT EXISTS(SELECT 1 FROM chunks_vec LIMIT 1)", [], |r| {
1182                r.get::<_, i64>(0)
1183            })
1184            .unwrap_or(0)
1185            != 0;
1186        drop(db);
1187
1188        let hot_files = self
1189            .hot_files(hot_limit.max(1))
1190            .into_iter()
1191            .map(|(path, heat, last_modified, room)| VeinHotFile {
1192                path,
1193                heat,
1194                last_modified,
1195                room,
1196            })
1197            .collect::<Vec<_>>();
1198
1199        VeinInspectionSnapshot {
1200            indexed_source_files,
1201            indexed_docs,
1202            indexed_session_exchanges,
1203            embedded_source_doc_chunks,
1204            has_any_embeddings,
1205            active_room: self.active_room(),
1206            l1_ready: !hot_files.is_empty(),
1207            hot_files,
1208        }
1209    }
1210
1211    // ── L1 heat tracking ──────────────────────────────────────────────────────
1212
1213    /// Record an edit to a file. Increments its heat score in file_heat.
1214    /// Called from the tool dispatch after a successful edit_file / write_file /
1215    /// patch_hunk / multi_search_replace so the L1 context stays current.
1216    pub fn bump_heat(&self, path: &str) {
1217        if path.is_empty() {
1218            return;
1219        }
1220        let now = std::time::SystemTime::now()
1221            .duration_since(std::time::UNIX_EPOCH)
1222            .unwrap_or_default()
1223            .as_secs() as i64;
1224        let db = self.db.lock().unwrap();
1225        let _ = db.execute(
1226            "INSERT INTO file_heat (path, heat, last_edit) VALUES (?1, 1, ?2)
1227             ON CONFLICT(path) DO UPDATE SET heat = heat + 1, last_edit = ?2",
1228            params![path, now],
1229        );
1230    }
1231
1232    /// Return the top N hot files ranked by edit count (heat) then recency.
1233    /// Joins file_heat with chunks_meta so only indexed files are included.
1234    /// Returns (path, heat, mtime, room).
1235    fn hot_files(&self, n: usize) -> Vec<(String, i64, i64, String)> {
1236        let db = self.db.lock().unwrap();
1237        let mut stmt = match db.prepare(
1238            "SELECT fh.path, fh.heat, cm.last_modified, cm.room
1239             FROM file_heat fh
1240             JOIN chunks_meta cm ON cm.path = fh.path
1241             ORDER BY fh.heat DESC, cm.last_modified DESC
1242             LIMIT ?1",
1243        ) {
1244            Ok(s) => s,
1245            Err(_) => return vec![],
1246        };
1247        stmt.query_map(params![n as i64], |row| {
1248            Ok((
1249                row.get::<_, String>(0)?,
1250                row.get::<_, i64>(1)?,
1251                row.get::<_, i64>(2)?,
1252                row.get::<_, String>(3)?,
1253            ))
1254        })
1255        .map(|rows| rows.filter_map(|r| r.ok()).collect())
1256        .unwrap_or_default()
1257    }
1258
1259    /// Return the paths of the top hot files (most edited).
1260    /// Used by RepoMapGenerator to bias PageRank toward active files.
1261    pub fn hot_file_paths(&self, n: usize) -> Vec<String> {
1262        self.hot_files(n)
1263            .into_iter()
1264            .map(|(path, _, _, _)| path)
1265            .collect()
1266    }
1267
1268    /// Return hot files with normalized heat weights in [0.0, 1.0].
1269    /// The hottest file gets weight 1.0; others are scaled proportionally.
1270    /// Used by RepoMapGenerator to apply heat-weighted PageRank personalization.
1271    pub fn hot_files_weighted(&self, n: usize) -> Vec<(String, f64)> {
1272        let files = self.hot_files(n);
1273        if files.is_empty() {
1274            return vec![];
1275        }
1276        let max_heat = files
1277            .iter()
1278            .map(|(_, h, _, _)| *h)
1279            .max()
1280            .unwrap_or(1)
1281            .max(1) as f64;
1282        files
1283            .into_iter()
1284            .map(|(path, heat, _, _)| {
1285                let weight = (heat as f64) / max_heat;
1286                (path, weight)
1287            })
1288            .collect()
1289    }
1290
1291    /// Build the L1 context block — a compact "hot files" summary injected into
1292    /// the system prompt at session start. Capped at ~150 tokens.
1293    /// Files are grouped by room so the model sees subsystem structure at a glance.
1294    /// Returns None when there are no heat records yet (fresh project).
1295    pub fn l1_context(&self) -> Option<String> {
1296        let files = self.hot_files(8);
1297        if files.is_empty() {
1298            return None;
1299        }
1300        let now = std::time::SystemTime::now()
1301            .duration_since(std::time::UNIX_EPOCH)
1302            .unwrap_or_default()
1303            .as_secs() as i64;
1304
1305        // Group by room for readability.
1306        let mut by_room: std::collections::BTreeMap<String, Vec<(String, i64, i64)>> =
1307            std::collections::BTreeMap::new();
1308        for (path, heat, mtime, room) in &files {
1309            by_room
1310                .entry(room.clone())
1311                .or_default()
1312                .push((path.clone(), *heat, *mtime));
1313        }
1314
1315        let mut out = String::from("# Hot Files (most edited — grouped by subsystem)\n");
1316        for (room, entries) in &by_room {
1317            out.push_str(&format!("[{}]\n", room));
1318            for (path, heat, mtime) in entries {
1319                let age_secs = now - mtime;
1320                let age = if age_secs < 3600 {
1321                    "just now".to_string()
1322                } else if age_secs < 86400 {
1323                    format!("{}h ago", age_secs / 3600)
1324                } else {
1325                    format!("{}d ago", age_secs / 86400)
1326                };
1327                out.push_str(&format!(
1328                    "  - {} [{} edit{}, {}]\n",
1329                    path,
1330                    heat,
1331                    if *heat == 1 { "" } else { "s" },
1332                    age
1333                ));
1334            }
1335        }
1336        Some(out)
1337    }
1338
1339    fn prune_indexed_prefix(&self, prefix: &str, desired_paths: &HashSet<String>) {
1340        let pattern = format!("{}%", prefix);
1341        let existing_paths: Vec<String> = {
1342            let db = self.db.lock().unwrap();
1343            let mut stmt = match db.prepare("SELECT path FROM chunks_meta WHERE path LIKE ?1") {
1344                Ok(stmt) => stmt,
1345                Err(_) => return,
1346            };
1347            stmt.query_map(params![pattern], |row| row.get::<_, String>(0))
1348                .map(|rows| rows.filter_map(|row| row.ok()).collect())
1349                .unwrap_or_default()
1350        };
1351
1352        if existing_paths.is_empty() {
1353            return;
1354        }
1355
1356        let db = self.db.lock().unwrap();
1357        for path in existing_paths {
1358            if desired_paths.contains(&path) {
1359                continue;
1360            }
1361            let _ = db.execute("DELETE FROM chunks_fts WHERE path = ?1", params![path]);
1362            let _ = db.execute("DELETE FROM chunks_vec WHERE path = ?1", params![path]);
1363            let _ = db.execute("DELETE FROM chunks_meta WHERE path = ?1", params![path]);
1364        }
1365    }
1366}
1367
1368impl QuerySignals {
1369    fn from_query(query: &str) -> Self {
1370        let lower = query.to_ascii_lowercase();
1371        let historical_memory_hint = [
1372            "remember",
1373            "earlier",
1374            "previous",
1375            "last time",
1376            "what did we decide",
1377            "why did we decide",
1378            "what did we say",
1379            "why did we change",
1380        ]
1381        .iter()
1382        .any(|needle| lower.contains(needle));
1383
1384        // Detect what memory type the query is asking about.
1385        let query_memory_type = if lower.contains("decide")
1386            || lower.contains("decision")
1387            || lower.contains("we agreed")
1388            || lower.contains("we chose")
1389        {
1390            Some("decision")
1391        } else if lower.contains("bug")
1392            || lower.contains("error")
1393            || lower.contains("issue")
1394            || lower.contains("problem")
1395            || lower.contains("fix")
1396            || lower.contains("broken")
1397        {
1398            Some("problem")
1399        } else if lower.contains("shipped")
1400            || lower.contains("milestone")
1401            || lower.contains("finished")
1402            || lower.contains("working now")
1403        {
1404            Some("milestone")
1405        } else if lower.contains("prefer")
1406            || lower.contains("my preference")
1407            || lower.contains("i like")
1408            || lower.contains("i want")
1409        {
1410            Some("preference")
1411        } else {
1412            None
1413        };
1414
1415        Self {
1416            exact_phrases: extract_exact_phrases(query),
1417            standout_terms: extract_standout_terms(query),
1418            historical_memory_hint,
1419            temporal_reference: extract_temporal_reference(query),
1420            query_memory_type,
1421        }
1422    }
1423}
1424
1425fn merge_scored_result(
1426    merged_by_path: &mut HashMap<String, SearchResult>,
1427    candidate: SearchResult,
1428) {
1429    match merged_by_path.get_mut(&candidate.path) {
1430        Some(existing) if candidate.score > existing.score => *existing = candidate,
1431        Some(_) => {}
1432        None => {
1433            merged_by_path.insert(candidate.path.clone(), candidate);
1434        }
1435    }
1436}
1437
1438fn reranked_score(
1439    signals: &QuerySignals,
1440    active_room: Option<&str>,
1441    result: &SearchResult,
1442    is_semantic: bool,
1443) -> f32 {
1444    let base = if is_semantic {
1445        1.0 + result.score.clamp(0.0, 1.0)
1446    } else {
1447        (result.score / 10.0).clamp(0.0, 1.0)
1448    };
1449    base + room_bias(active_room, result)
1450        + retrieval_signal_boost(signals, result)
1451        + temporal_memory_boost(signals, result)
1452}
1453
1454fn room_bias(active_room: Option<&str>, result: &SearchResult) -> f32 {
1455    if active_room == Some(result.room.as_str()) {
1456        0.15
1457    } else {
1458        0.0
1459    }
1460}
1461
1462fn retrieval_signal_boost(signals: &QuerySignals, result: &SearchResult) -> f32 {
1463    let mut boost = 0.0f32;
1464    let haystack = format!(
1465        "{}\n{}",
1466        result.path.to_ascii_lowercase(),
1467        result.content.to_ascii_lowercase()
1468    );
1469
1470    let phrase_matches = signals
1471        .exact_phrases
1472        .iter()
1473        .filter(|phrase| haystack.contains(phrase.as_str()))
1474        .count();
1475    if phrase_matches > 0 {
1476        boost += 0.35 + ((phrase_matches.saturating_sub(1)) as f32 * 0.1);
1477    }
1478
1479    let mut standout_matches = 0;
1480    for term in &signals.standout_terms {
1481        if result.path.to_ascii_lowercase().contains(term.as_str()) {
1482            boost += 0.40;
1483            standout_matches += 1;
1484        } else if result.content.to_ascii_lowercase().contains(term.as_str()) {
1485            boost += 0.12;
1486            standout_matches += 1;
1487        }
1488        if standout_matches >= 3 {
1489            break;
1490        }
1491    }
1492
1493    if signals.historical_memory_hint && result.room == "session" {
1494        boost += 0.45;
1495    }
1496
1497    // Boost session chunks whose tagged memory type matches the query's intent.
1498    if let Some(qtype) = signals.query_memory_type {
1499        if !result.memory_type.is_empty() && result.memory_type == qtype {
1500            boost += 0.35;
1501        }
1502    }
1503
1504    boost
1505}
1506
1507fn temporal_memory_boost(signals: &QuerySignals, result: &SearchResult) -> f32 {
1508    if result.room != "session" {
1509        return 0.0;
1510    }
1511    let Some(reference) = signals.temporal_reference else {
1512        return 0.0;
1513    };
1514    let Some(memory_ts) = session_memory_timestamp(result) else {
1515        return 0.0;
1516    };
1517
1518    let span = reference.window_secs.max(86_400);
1519    let full_fade = span.saturating_mul(8);
1520    if full_fade <= 0 {
1521        return 0.0;
1522    }
1523
1524    let distance = (memory_ts - reference.target_ts).abs();
1525    let closeness = 1.0 - (distance as f32 / full_fade as f32).min(1.0);
1526    if closeness <= 0.0 {
1527        0.0
1528    } else {
1529        0.22 * closeness
1530    }
1531}
1532
/// Extract quoted phrases from a query.
///
/// A phrase is the text between a pair of matching `"`, `'` or `` ` ``
/// characters; an opening quote without a closing partner extends to the end
/// of the query. Phrases are trimmed, ASCII-lowercased, de-duplicated, and
/// dropped when shorter than three bytes. They are returned in order of
/// appearance.
///
/// An apostrophe inside a word (`what's`, `don't`) is never treated as a
/// quote, so contractions neither produce junk phrases nor cut a quoted
/// phrase short.
fn extract_exact_phrases(query: &str) -> Vec<String> {
    /// True when `chars[idx]` is a `'` with an alphanumeric on both sides.
    fn is_intra_word_apostrophe(chars: &[char], idx: usize) -> bool {
        chars[idx] == '\''
            && idx > 0
            && chars[idx - 1].is_alphanumeric()
            && chars
                .get(idx + 1)
                .map_or(false, |next| next.is_alphanumeric())
    }

    let mut phrases = Vec::new();
    let chars: Vec<char> = query.chars().collect();
    let mut i = 0usize;

    while i < chars.len() {
        let quote = chars[i];
        let opens_phrase =
            matches!(quote, '"' | '\'' | '`') && !is_intra_word_apostrophe(&chars, i);
        if !opens_phrase {
            i += 1;
            continue;
        }

        // Scan to the matching closing quote, skipping apostrophes inside words.
        let start = i + 1;
        let mut end = start;
        while end < chars.len()
            && (chars[end] != quote || is_intra_word_apostrophe(&chars, end))
        {
            end += 1;
        }

        if end > start {
            let phrase = chars[start..end]
                .iter()
                .collect::<String>()
                .trim()
                .to_ascii_lowercase();
            if phrase.len() >= 3 && !phrases.contains(&phrase) {
                phrases.push(phrase);
            }
        }
        // Resume after the closing quote (or past the end if there was none).
        i = end.saturating_add(1);
    }

    phrases
}
1564
/// Pick the distinctive terms of a query: identifiers, paths, numbers and
/// long words that are worth matching literally against paths and content.
///
/// The query is split on every character that is not ASCII alphanumeric or
/// one of `_ - . / :`. Trailing `.` and `:` are then stripped, because at the
/// end of a token they are sentence punctuation rather than part of a path or
/// identifier (`"... in vein.rs."` should yield `vein.rs`). A token is kept
/// when it has at least four bytes, is not a stopword, and looks
/// "interesting": it contains a digit, one of `_ - . / :`, an uppercase
/// letter, or has nine or more bytes. Terms are returned ASCII-lowercased,
/// de-duplicated, in order of appearance.
fn extract_standout_terms(query: &str) -> Vec<String> {
    const STOPWORDS: &[&str] = &[
        "about", "after", "before", "change", "changed", "decide", "decided", "does", "earlier",
        "flow", "from", "have", "into", "just", "last", "local", "make", "more", "remember",
        "should", "that", "their", "there", "these", "they", "this", "those", "what", "when",
        "where", "which", "why", "with", "work",
    ];

    let mut standout = Vec::new();
    for token in query.split(|ch: char| {
        !(ch.is_ascii_alphanumeric() || matches!(ch, '_' | '-' | '.' | '/' | ':'))
    }) {
        // Drop sentence punctuation glued to the end of the token so that
        // "vein.rs." matches paths and "work." is still seen as a stopword.
        let trimmed = token.trim_end_matches(|ch: char| matches!(ch, '.' | ':'));
        if trimmed.len() < 4 {
            continue;
        }
        let lower = trimmed.to_ascii_lowercase();
        if STOPWORDS.contains(&lower.as_str()) {
            continue;
        }

        let interesting = trimmed.chars().any(|ch| ch.is_ascii_digit())
            || trimmed
                .chars()
                .any(|ch| matches!(ch, '_' | '-' | '.' | '/' | ':'))
            || trimmed.chars().any(|ch| ch.is_ascii_uppercase())
            || trimmed.len() >= 9;

        if interesting && !standout.contains(&lower) {
            standout.push(lower);
        }
    }

    standout
}
1600
1601fn extract_temporal_reference(query: &str) -> Option<TemporalReference> {
1602    if let Some(ts) = extract_iso_date_from_query(query) {
1603        return Some(TemporalReference {
1604            target_ts: ts,
1605            window_secs: 86_400,
1606        });
1607    }
1608
1609    let now = current_unix_timestamp();
1610    let lower = query.to_ascii_lowercase();
1611    if lower.contains("yesterday") {
1612        Some(TemporalReference {
1613            target_ts: now.saturating_sub(86_400),
1614            window_secs: 86_400,
1615        })
1616    } else if lower.contains("today") || lower.contains("earlier today") {
1617        Some(TemporalReference {
1618            target_ts: now,
1619            window_secs: 86_400,
1620        })
1621    } else if lower.contains("last week") {
1622        Some(TemporalReference {
1623            target_ts: now.saturating_sub(7 * 86_400),
1624            window_secs: 7 * 86_400,
1625        })
1626    } else if lower.contains("last month") {
1627        Some(TemporalReference {
1628            target_ts: now.saturating_sub(30 * 86_400),
1629            window_secs: 30 * 86_400,
1630        })
1631    } else {
1632        None
1633    }
1634}
1635
1636fn extract_iso_date_from_query(query: &str) -> Option<i64> {
1637    query
1638        .split(|ch: char| !(ch.is_ascii_digit() || ch == '-'))
1639        .find_map(parse_iso_date_token)
1640}
1641
1642fn parse_iso_date_token(token: &str) -> Option<i64> {
1643    if token.len() != 10 {
1644        return None;
1645    }
1646    let bytes = token.as_bytes();
1647    if bytes.get(4) != Some(&b'-') || bytes.get(7) != Some(&b'-') {
1648        return None;
1649    }
1650
1651    let year = token.get(0..4)?.parse::<i32>().ok()?;
1652    let month = token.get(5..7)?.parse::<u32>().ok()?;
1653    let day = token.get(8..10)?.parse::<u32>().ok()?;
1654    if !(1..=12).contains(&month) || !(1..=31).contains(&day) {
1655        return None;
1656    }
1657
1658    Some(days_from_civil(year, month, day).saturating_mul(86_400))
1659}
1660
/// Number of days between 1970-01-01 and the given proleptic Gregorian date
/// (negative for earlier dates).
fn days_from_civil(year: i32, month: u32, day: u32) -> i64 {
    // Count years from March so that the leap day is the last day of the
    // shifted year: March becomes month 0, February month 11 of the prior year.
    let (shifted_year, shifted_month) = if month > 2 {
        (year, month as i32 - 3)
    } else {
        (year - 1, month as i32 + 9)
    };

    // 400-year eras, rounding toward negative infinity for years before 0.
    let era = (if shifted_year >= 0 {
        shifted_year
    } else {
        shifted_year - 399
    }) / 400;
    let year_of_era = shifted_year - era * 400;
    let day_of_year = (153 * shifted_month + 2) / 5 + day as i32 - 1;
    let day_of_era = year_of_era * 365 + year_of_era / 4 - year_of_era / 100 + day_of_year;

    // 146_097 days per era; 719_468 shifts the origin from 0000-03-01 to 1970-01-01.
    i64::from(era) * 146_097 + i64::from(day_of_era) - 719_468
}
1670
/// Current wall-clock time as whole seconds since the Unix epoch.
/// A clock set before the epoch yields 0.
fn current_unix_timestamp() -> i64 {
    let since_epoch = match std::time::SystemTime::now().duration_since(std::time::UNIX_EPOCH) {
        Ok(elapsed) => elapsed,
        Err(_) => std::time::Duration::default(),
    };
    since_epoch.as_secs() as i64
}
1677
1678fn session_memory_timestamp(result: &SearchResult) -> Option<i64> {
1679    extract_session_path_timestamp(&result.path).or_else(|| {
1680        if result.last_modified > 0 {
1681            Some(result.last_modified)
1682        } else {
1683            None
1684        }
1685    })
1686}
1687
1688fn extract_session_path_timestamp(path: &str) -> Option<i64> {
1689    let normalized = path.replace('\\', "/");
1690    let mut parts = normalized.split('/');
1691    if parts.next()? != "session" {
1692        return None;
1693    }
1694    parse_iso_date_token(parts.next()?)
1695}
1696
1697fn session_speaker_kind(speaker: &str) -> SessionSpeakerKind {
1698    let normalized = speaker.trim().to_ascii_lowercase();
1699    match normalized.as_str() {
1700        "you" | "user" => SessionSpeakerKind::User,
1701        "" | "system" | "tool" => SessionSpeakerKind::Ignore,
1702        _ => SessionSpeakerKind::Assistant,
1703    }
1704}
1705
1706fn load_session_exchanges(report_path: &Path, last_modified: i64) -> Vec<SessionExchange> {
1707    let Ok(raw) = std::fs::read_to_string(report_path) else {
1708        return Vec::new();
1709    };
1710    let Ok(report) = serde_json::from_str::<SessionReport>(&raw) else {
1711        return Vec::new();
1712    };
1713
1714    let session_key = report_path
1715        .file_stem()
1716        .and_then(|stem| stem.to_str())
1717        .and_then(|stem| stem.strip_prefix("session_").or(Some(stem)))
1718        .unwrap_or("unknown-session")
1719        .to_string();
1720    let session_date = report
1721        .session_start
1722        .split('_')
1723        .next()
1724        .filter(|date| !date.is_empty())
1725        .unwrap_or_else(|| session_key.split('_').next().unwrap_or("unknown-date"))
1726        .to_string();
1727
1728    let mut exchanges = Vec::new();
1729    let mut pending_user: Option<String> = None;
1730    let mut turn_index = 0usize;
1731
1732    for entry in report.transcript {
1733        match session_speaker_kind(&entry.speaker) {
1734            SessionSpeakerKind::User => {
1735                let text = entry.text.trim();
1736                if !text.is_empty() {
1737                    pending_user = Some(text.to_string());
1738                }
1739            }
1740            SessionSpeakerKind::Assistant => {
1741                let text = entry.text.trim();
1742                if text.is_empty() {
1743                    continue;
1744                }
1745                let Some(user_text) = pending_user.take() else {
1746                    continue;
1747                };
1748                turn_index += 1;
1749                exchanges.push(SessionExchange {
1750                    path: format!(
1751                        "session/{}/{}/turn-{}",
1752                        session_date, session_key, turn_index
1753                    ),
1754                    last_modified,
1755                    content: format!(
1756                        "Earlier session exchange\nUser:\n{}\n\nAssistant:\n{}",
1757                        user_text, text
1758                    ),
1759                });
1760            }
1761            SessionSpeakerKind::Ignore => {}
1762        }
1763    }
1764
1765    if exchanges.len() > Vein::SESSION_TURN_LIMIT {
1766        let keep_from = exchanges.len() - Vein::SESSION_TURN_LIMIT;
1767        exchanges = exchanges.into_iter().skip(keep_from).collect();
1768    }
1769
1770    exchanges
1771}
1772
1773fn load_imported_session_exchanges(
1774    import_path: &Path,
1775    imports_root: &Path,
1776    last_modified: i64,
1777) -> Vec<SessionExchange> {
1778    let Ok(raw) = std::fs::read_to_string(import_path) else {
1779        return Vec::new();
1780    };
1781
1782    let messages = normalize_import_messages(&raw, import_path);
1783    if messages.is_empty() {
1784        return Vec::new();
1785    }
1786
1787    let rel = import_path
1788        .strip_prefix(imports_root)
1789        .unwrap_or(import_path);
1790    let rel_slug = slugify_import_path(rel);
1791    let mut exchanges = Vec::new();
1792    let mut pending_user: Option<String> = None;
1793    let mut turn_index = 0usize;
1794
1795    for (role, text) in messages {
1796        let cleaned = text.trim();
1797        if cleaned.is_empty() {
1798            continue;
1799        }
1800        match role.as_str() {
1801            "user" => pending_user = Some(cleaned.to_string()),
1802            "assistant" => {
1803                let Some(user_text) = pending_user.take() else {
1804                    continue;
1805                };
1806                turn_index += 1;
1807                exchanges.push(SessionExchange {
1808                    path: format!("session/imports/{}/turn-{}", rel_slug, turn_index),
1809                    last_modified,
1810                    content: format!(
1811                        "Imported session exchange\nSource: .hematite/imports/{}\n\nUser:\n{}\n\nAssistant:\n{}",
1812                        rel.to_string_lossy().replace('\\', "/"),
1813                        user_text,
1814                        cleaned
1815                    ),
1816                });
1817            }
1818            _ => {}
1819        }
1820    }
1821
1822    if exchanges.len() > Vein::SESSION_TURN_LIMIT {
1823        let keep_from = exchanges.len() - Vein::SESSION_TURN_LIMIT;
1824        exchanges = exchanges.into_iter().skip(keep_from).collect();
1825    }
1826
1827    exchanges
1828}
1829
1830fn normalize_import_messages(raw: &str, import_path: &Path) -> Vec<(String, String)> {
1831    if raw.trim().is_empty() {
1832        return Vec::new();
1833    }
1834
1835    if let Some(messages) = parse_marker_transcript(raw) {
1836        return messages;
1837    }
1838
1839    let ext = import_path
1840        .extension()
1841        .and_then(|ext| ext.to_str())
1842        .unwrap_or("")
1843        .to_ascii_lowercase();
1844
1845    if matches!(ext.as_str(), "json" | "jsonl")
1846        || matches!(raw.trim().chars().next(), Some('{') | Some('['))
1847    {
1848        if let Some(messages) = parse_jsonl_messages(raw) {
1849            if !messages.is_empty() {
1850                return messages;
1851            }
1852        }
1853
1854        if let Ok(value) = serde_json::from_str::<Value>(raw) {
1855            if let Some(messages) = parse_session_report_messages(&value) {
1856                return messages;
1857            }
1858            if let Some(messages) = parse_simple_role_messages(&value) {
1859                return messages;
1860            }
1861            if let Some(messages) = parse_chatgpt_mapping_messages(&value) {
1862                return messages;
1863            }
1864        }
1865    }
1866
1867    Vec::new()
1868}
1869
/// Parses a plain-text transcript in which user prompts are lines beginning
/// with `> ` (after optional leading whitespace).
///
/// Returns `None` when fewer than two such marker lines exist. Otherwise each
/// marker line becomes a `user` message, and the non-blank lines that follow
/// it (excluding `---` separators) are joined with newlines into one
/// `assistant` message. Lines before the first marker are ignored.
fn parse_marker_transcript(raw: &str) -> Option<Vec<(String, String)>> {
    /// Returns the prompt text when `line` is a `> ` marker line.
    fn prompt_of(line: &str) -> Option<&str> {
        line.trim_start().strip_prefix("> ")
    }

    /// Emits the collected reply lines (if any) as one assistant message.
    fn flush_reply(reply: &mut Vec<&str>, messages: &mut Vec<(String, String)>) {
        if !reply.is_empty() {
            messages.push(("assistant".to_string(), reply.join("\n")));
            reply.clear();
        }
    }

    let marker_count = raw.lines().filter(|line| prompt_of(line).is_some()).count();
    if marker_count < 2 {
        return None;
    }

    let mut messages: Vec<(String, String)> = Vec::new();
    let mut reply: Vec<&str> = Vec::new();
    let mut seen_prompt = false;

    for line in raw.lines() {
        if let Some(prompt) = prompt_of(line) {
            flush_reply(&mut reply, &mut messages);
            messages.push(("user".to_string(), prompt.trim().to_string()));
            seen_prompt = true;
        } else if seen_prompt {
            let trimmed = line.trim();
            if !trimmed.is_empty() && trimmed != "---" {
                reply.push(trimmed);
            }
        }
    }
    flush_reply(&mut reply, &mut messages);

    if messages.is_empty() {
        None
    } else {
        Some(messages)
    }
}
1910
1911fn parse_jsonl_messages(raw: &str) -> Option<Vec<(String, String)>> {
1912    let mut messages = Vec::new();
1913    let mut has_codex_session_meta = false;
1914    let mut saw_jsonl = false;
1915
1916    for line in raw.lines() {
1917        let trimmed = line.trim();
1918        if trimmed.is_empty() {
1919            continue;
1920        }
1921        let Ok(value) = serde_json::from_str::<Value>(trimmed) else {
1922            continue;
1923        };
1924        saw_jsonl = true;
1925        let Some(object) = value.as_object() else {
1926            continue;
1927        };
1928
1929        match object.get("type").and_then(|v| v.as_str()).unwrap_or("") {
1930            "session_meta" => {
1931                has_codex_session_meta = true;
1932            }
1933            "event_msg" => {
1934                let Some(payload) = object.get("payload").and_then(|v| v.as_object()) else {
1935                    continue;
1936                };
1937                let Some(text) = payload.get("message").and_then(|v| v.as_str()) else {
1938                    continue;
1939                };
1940                match payload.get("type").and_then(|v| v.as_str()).unwrap_or("") {
1941                    "user_message" => messages.push(("user".to_string(), text.trim().to_string())),
1942                    "agent_message" => {
1943                        messages.push(("assistant".to_string(), text.trim().to_string()))
1944                    }
1945                    _ => {}
1946                }
1947            }
1948            "human" | "user" => {
1949                if let Some(text) = extract_text_content(object.get("message").unwrap_or(&value)) {
1950                    messages.push(("user".to_string(), text));
1951                }
1952            }
1953            "assistant" => {
1954                if let Some(text) = extract_text_content(object.get("message").unwrap_or(&value)) {
1955                    messages.push(("assistant".to_string(), text));
1956                }
1957            }
1958            _ => {
1959                if let Some(role) = object.get("role").and_then(|v| v.as_str()) {
1960                    if let Some(text) = extract_text_content(&value) {
1961                        match role {
1962                            "user" | "human" => messages.push(("user".to_string(), text)),
1963                            "assistant" | "ai" => messages.push(("assistant".to_string(), text)),
1964                            _ => {}
1965                        }
1966                    }
1967                }
1968            }
1969        }
1970    }
1971
1972    if !saw_jsonl {
1973        return None;
1974    }
1975
1976    if has_codex_session_meta || !messages.is_empty() {
1977        return Some(messages);
1978    }
1979
1980    None
1981}
1982
1983fn parse_session_report_messages(value: &Value) -> Option<Vec<(String, String)>> {
1984    let report = value.as_object()?;
1985    let transcript = report.get("transcript")?.as_array()?;
1986    let mut messages = Vec::new();
1987
1988    for entry in transcript {
1989        let Some(obj) = entry.as_object() else {
1990            continue;
1991        };
1992        let speaker = obj
1993            .get("speaker")
1994            .and_then(|v| v.as_str())
1995            .unwrap_or_default();
1996        let text = obj
1997            .get("text")
1998            .and_then(|v| v.as_str())
1999            .unwrap_or_default()
2000            .trim()
2001            .to_string();
2002        if text.is_empty() {
2003            continue;
2004        }
2005        match session_speaker_kind(speaker) {
2006            SessionSpeakerKind::User => messages.push(("user".to_string(), text)),
2007            SessionSpeakerKind::Assistant => messages.push(("assistant".to_string(), text)),
2008            SessionSpeakerKind::Ignore => {}
2009        }
2010    }
2011
2012    (!messages.is_empty()).then_some(messages)
2013}
2014
2015fn parse_simple_role_messages(value: &Value) -> Option<Vec<(String, String)>> {
2016    if let Some(array) = value.as_array() {
2017        let messages = collect_role_messages(array);
2018        return (!messages.is_empty()).then_some(messages);
2019    }
2020
2021    let obj = value.as_object()?;
2022    if let Some(messages_value) = obj.get("messages").or_else(|| obj.get("chat_messages")) {
2023        let array = messages_value.as_array()?;
2024        let messages = collect_role_messages(array);
2025        return (!messages.is_empty()).then_some(messages);
2026    }
2027
2028    None
2029}
2030
2031fn collect_role_messages(items: &[Value]) -> Vec<(String, String)> {
2032    let mut messages = Vec::new();
2033    for item in items {
2034        let Some(obj) = item.as_object() else {
2035            continue;
2036        };
2037        let Some(role) = obj.get("role").and_then(|v| v.as_str()) else {
2038            continue;
2039        };
2040        let Some(text) = extract_text_content(item) else {
2041            continue;
2042        };
2043        match role {
2044            "user" | "human" => messages.push(("user".to_string(), text)),
2045            "assistant" | "ai" => messages.push(("assistant".to_string(), text)),
2046            _ => {}
2047        }
2048    }
2049    messages
2050}
2051
2052fn parse_chatgpt_mapping_messages(value: &Value) -> Option<Vec<(String, String)>> {
2053    let mapping = value.get("mapping")?.as_object()?;
2054    let mut current_id = mapping.iter().find_map(|(node_id, node)| {
2055        let obj = node.as_object()?;
2056        (obj.get("parent").is_some_and(|parent| parent.is_null())).then_some(node_id.clone())
2057    })?;
2058
2059    let mut messages = Vec::new();
2060    let mut visited = std::collections::HashSet::new();
2061
2062    while visited.insert(current_id.clone()) {
2063        let Some(node) = mapping.get(&current_id).and_then(|v| v.as_object()) else {
2064            break;
2065        };
2066
2067        if let Some(message) = node.get("message") {
2068            let role = message
2069                .get("author")
2070                .and_then(|author| author.get("role"))
2071                .and_then(|v| v.as_str())
2072                .unwrap_or("");
2073            if let Some(text) = extract_text_content(message) {
2074                match role {
2075                    "user" => messages.push(("user".to_string(), text)),
2076                    "assistant" => messages.push(("assistant".to_string(), text)),
2077                    _ => {}
2078                }
2079            }
2080        }
2081
2082        let Some(next_id) = node
2083            .get("children")
2084            .and_then(|children| children.as_array())
2085            .and_then(|children| children.first())
2086            .and_then(|child| child.as_str())
2087        else {
2088            break;
2089        };
2090        current_id = next_id.to_string();
2091    }
2092
2093    (!messages.is_empty()).then_some(messages)
2094}
2095
2096fn extract_text_content(value: &Value) -> Option<String> {
2097    if let Some(text) = value.as_str() {
2098        let trimmed = text.trim();
2099        return (!trimmed.is_empty()).then_some(trimmed.to_string());
2100    }
2101
2102    if let Some(array) = value.as_array() {
2103        let joined = array
2104            .iter()
2105            .filter_map(extract_text_content)
2106            .filter(|part| !part.is_empty())
2107            .collect::<Vec<_>>()
2108            .join("\n");
2109        return (!joined.is_empty()).then_some(joined);
2110    }
2111
2112    let obj = value.as_object()?;
2113
2114    if let Some(content) = obj.get("content") {
2115        if let Some(text) = extract_text_content(content) {
2116            return Some(text);
2117        }
2118    }
2119
2120    if let Some(text) = obj.get("text").and_then(|v| v.as_str()) {
2121        let trimmed = text.trim();
2122        if !trimmed.is_empty() {
2123            return Some(trimmed.to_string());
2124        }
2125    }
2126
2127    if let Some(parts) = obj.get("parts").and_then(|v| v.as_array()) {
2128        let joined = parts
2129            .iter()
2130            .filter_map(|part| part.as_str().map(|s| s.trim().to_string()))
2131            .filter(|part| !part.is_empty())
2132            .collect::<Vec<_>>()
2133            .join("\n");
2134        if !joined.is_empty() {
2135            return Some(joined);
2136        }
2137    }
2138
2139    None
2140}
2141
/// Flattens a path into a single slug.
///
/// Backslashes count as separators. Every character other than ASCII
/// letters, ASCII digits, `-`, `_` and the separator becomes `_`; leading and
/// trailing separators are dropped and each remaining separator is written
/// as `__`.
fn slugify_import_path(path: &Path) -> String {
    let lossy = path.to_string_lossy();
    let sanitized: String = lossy
        .chars()
        .map(|ch| match ch {
            '/' | '\\' => '/',
            '-' | '_' => ch,
            _ if ch.is_ascii_alphanumeric() => ch,
            _ => '_',
        })
        .collect();

    sanitized
        .trim_matches('/')
        .split('/')
        .collect::<Vec<_>>()
        .join("__")
}
2157
2158// ── Embedding API ─────────────────────────────────────────────────────────────
2159
2160/// Call LM Studio's `/v1/embeddings` endpoint synchronously.
2161///
2162/// Uses nomic-embed-text-v2 MoE. Nomic v2 requires task instruction prefixes:
2163/// - Chunks stored in the index use `"search_document: "` prefix
2164/// - Queries at search time use `"search_query: "` prefix
2165/// LM Studio matches loaded models by substring so the quant suffix doesn't matter.
2166///
2167/// Returns `None` if:
2168/// - No embedding model is loaded in LM Studio
2169/// - LM Studio is not running
2170/// - Any network or parse error occurs
2171///
2172/// Callers must tolerate `None` and fall back to BM25-only search.
2173fn embed_text_blocking(text: &str, base_url: &str) -> Option<Vec<f32>> {
2174    embed_text_with_prefix(text, "search_document", base_url)
2175}
2176
2177fn embed_query_blocking(text: &str, base_url: &str) -> Option<Vec<f32>> {
2178    embed_text_with_prefix(text, "search_query", base_url)
2179}
2180
2181fn embed_text_with_prefix(text: &str, task: &str, base_url: &str) -> Option<Vec<f32>> {
2182    // Nomic v2 task instruction prefix format: "<task>: <text>"
2183    let prefixed = format!("{}: {}", task, text);
2184    // Truncate to ~8000 chars to stay within typical embedding model limits.
2185    let input = if prefixed.len() > 8000 {
2186        &prefixed[..8000]
2187    } else {
2188        &prefixed
2189    };
2190
2191    let client = reqwest::blocking::Client::builder()
2192        .timeout(std::time::Duration::from_secs(10))
2193        .build()
2194        .ok()?;
2195
2196    let body = serde_json::json!({
2197        "model": "nomic-embed-text-v2",
2198        "input": input
2199    });
2200
2201    let url = format!("{}/v1/embeddings", base_url);
2202    let resp = client.post(&url).json(&body).send().ok()?;
2203
2204    if !resp.status().is_success() {
2205        return None;
2206    }
2207
2208    let json: serde_json::Value = resp.json().ok()?;
2209    let embedding = json["data"][0]["embedding"].as_array()?;
2210    let vec: Vec<f32> = embedding
2211        .iter()
2212        .filter_map(|v| v.as_f64().map(|f| f as f32))
2213        .collect();
2214
2215    if vec.is_empty() {
2216        None
2217    } else {
2218        Some(vec)
2219    }
2220}
2221
2222// ── Vector math ───────────────────────────────────────────────────────────────
2223
/// Cosine similarity of two vectors.
///
/// Returns `0.0` when the slices differ in length, are empty, or either one
/// has zero magnitude.
fn cosine_similarity(a: &[f32], b: &[f32]) -> f32 {
    if a.is_empty() || a.len() != b.len() {
        return 0.0;
    }

    let magnitude = |v: &[f32]| v.iter().map(|x| x * x).sum::<f32>().sqrt();

    let dot = a.iter().zip(b).map(|(x, y)| x * y).sum::<f32>();
    let (norm_a, norm_b) = (magnitude(a), magnitude(b));
    if norm_a == 0.0 || norm_b == 0.0 {
        return 0.0;
    }

    dot / (norm_a * norm_b)
}
2237
/// Serialises a float slice as consecutive little-endian 4-byte values.
fn floats_to_blob(floats: &[f32]) -> Vec<u8> {
    let mut blob = Vec::with_capacity(floats.len() * 4);
    for value in floats {
        blob.extend_from_slice(&value.to_le_bytes());
    }
    blob
}
2241
/// Decodes consecutive little-endian 4-byte groups into floats.
///
/// Trailing bytes that do not fill a complete 4-byte group are ignored.
fn blob_to_floats(blob: &[u8]) -> Vec<f32> {
    let mut floats = Vec::with_capacity(blob.len() / 4);
    for quad in blob.chunks_exact(4) {
        let mut raw = [0u8; 4];
        raw.copy_from_slice(quad);
        floats.push(f32::from_le_bytes(raw));
    }
    floats
}
2247
2248// ── Document extraction ───────────────────────────────────────────────────────
2249
/// Cleans up text produced by a document extractor.
///
/// Line endings are unified to `\n` (both `\r\n` and lone `\r`), then leading
/// and trailing whitespace and NUL characters are stripped. Returns `None`
/// when nothing remains.
fn normalize_extracted_document_text(text: String) -> Option<String> {
    let unified = text.replace("\r\n", "\n").replace('\r', "\n");
    let is_padding = |c: char| c == '\0' || c.is_whitespace();

    match unified.trim_matches(is_padding) {
        "" => None,
        body => Some(body.to_string()),
    }
}
2262
2263fn extract_pdf_text_with_pdf_extract(path: &std::path::Path) -> Result<Option<String>, String> {
2264    let previous_hook = std::panic::take_hook();
2265    std::panic::set_hook(Box::new(|_| {}));
2266    let result = std::panic::catch_unwind(std::panic::AssertUnwindSafe(|| {
2267        pdf_extract::extract_text(path)
2268    }));
2269    std::panic::set_hook(previous_hook);
2270
2271    match result {
2272        Ok(Ok(text)) => Ok(normalize_extracted_document_text(text)),
2273        Ok(Err(e)) => Err(format!("pdf-extract failed: {}", e)),
2274        Err(payload) => {
2275            let panic_text = if let Some(msg) = payload.downcast_ref::<&str>() {
2276                (*msg).to_string()
2277            } else if let Some(msg) = payload.downcast_ref::<String>() {
2278                msg.clone()
2279            } else {
2280                "unknown parser panic".to_string()
2281            };
2282            Err(format!("pdf-extract panicked: {}", panic_text))
2283        }
2284    }
2285}
2286
2287fn extract_pdf_text_with_lopdf(path: &std::path::Path) -> Result<Option<String>, String> {
2288    let mut doc =
2289        lopdf::Document::load(path).map_err(|e| format!("lopdf could not open PDF: {}", e))?;
2290
2291    if doc.is_encrypted() {
2292        doc.decrypt("")
2293            .map_err(|e| format!("PDF is encrypted and could not be decrypted: {}", e))?;
2294    }
2295
2296    let page_numbers: Vec<u32> = doc.get_pages().keys().copied().collect();
2297    if page_numbers.is_empty() {
2298        return Ok(None);
2299    }
2300
2301    let mut extracted_pages = Vec::new();
2302    let mut page_errors = Vec::new();
2303
2304    for page_number in page_numbers {
2305        match doc.extract_text(&[page_number]) {
2306            Ok(text) => {
2307                if let Some(page_text) = normalize_extracted_document_text(text) {
2308                    extracted_pages.push(page_text);
2309                }
2310            }
2311            Err(e) => page_errors.push(format!("page {page_number}: {e}")),
2312        }
2313    }
2314
2315    if !extracted_pages.is_empty() {
2316        return Ok(Some(extracted_pages.join("\n\n")));
2317    }
2318
2319    if !page_errors.is_empty() {
2320        let sample_errors = page_errors
2321            .into_iter()
2322            .take(3)
2323            .collect::<Vec<_>>()
2324            .join("; ");
2325        return Err(format!(
2326            "lopdf could not extract usable page text ({sample_errors})"
2327        ));
2328    }
2329
2330    Ok(None)
2331}
2332
2333fn extract_pdf_text_inside_helper(path: &std::path::Path) -> Result<Option<String>, String> {
2334    let mut failures = Vec::new();
2335
2336    match extract_pdf_text_with_pdf_extract(path) {
2337        Ok(Some(text)) => return Ok(Some(text)),
2338        Ok(None) => failures.push("pdf-extract found no usable text".to_string()),
2339        Err(e) => failures.push(e),
2340    }
2341
2342    match extract_pdf_text_with_lopdf(path) {
2343        Ok(Some(text)) => return Ok(Some(text)),
2344        Ok(None) => failures.push("lopdf found no usable text".to_string()),
2345        Err(e) => failures.push(e),
2346    }
2347
2348    let detail = failures.into_iter().take(2).collect::<Vec<_>>().join("; ");
2349    Err(format!(
2350        "Could not extract text from PDF. Hematite keeps PDF parsing best-effort so it can stay a lightweight single-binary local coding harness. The file may be scanned/image-only, encrypted, or use unsupported font encoding. Try exporting it to text/markdown or attach page images instead. Detail: {}",
2351        detail
2352    ))
2353}
2354
/// Extracts PDF text by re-running the current executable with
/// `--pdf-extract-helper <path>` and capturing its output.
///
/// Returns `Ok(None)` when the helper succeeds but prints only whitespace.
/// Returns `Err` when the executable cannot be located or launched, when the
/// helper exits unsuccessfully (using its trimmed stderr as the message, or
/// a generic message if stderr is empty), or when its stdout is not UTF-8.
fn extract_pdf_text(path: &std::path::Path) -> Result<Option<String>, String> {
    let exe = match std::env::current_exe() {
        Ok(exe) => exe,
        Err(e) => {
            return Err(format!(
                "Could not locate Hematite executable for PDF helper: {}",
                e
            ))
        }
    };

    let mut helper = std::process::Command::new(exe);
    helper
        .arg("--pdf-extract-helper")
        .arg(path)
        .stdin(std::process::Stdio::null())
        .stdout(std::process::Stdio::piped())
        .stderr(std::process::Stdio::piped());

    let output = match helper.output() {
        Ok(output) => output,
        Err(e) => return Err(format!("Could not launch PDF helper: {}", e)),
    };

    if !output.status.success() {
        let stderr = String::from_utf8_lossy(&output.stderr);
        let message = stderr.trim();
        return Err(if message.is_empty() {
            "PDF extraction failed.".to_string()
        } else {
            message.to_string()
        });
    }

    match String::from_utf8(output.stdout) {
        Err(e) => Err(format!("PDF helper returned non-UTF8 text: {}", e)),
        Ok(text) if text.trim().is_empty() => Ok(None),
        Ok(text) => Ok(Some(text)),
    }
}
2384
2385pub fn run_pdf_extract_helper(path: &std::path::Path) -> i32 {
2386    match extract_pdf_text_inside_helper(path) {
2387        Ok(Some(text)) => {
2388            use std::io::Write;
2389            let mut stdout = std::io::stdout();
2390            if stdout.write_all(text.as_bytes()).is_ok() {
2391                0
2392            } else {
2393                let _ = writeln!(
2394                    std::io::stderr(),
2395                    "PDF helper could not write extracted text."
2396                );
2397                1
2398            }
2399        }
2400        Ok(None) => {
2401            eprintln!(
2402                "Could not extract text from PDF. Hematite keeps PDF parsing best-effort so it can stay a lightweight single-binary local coding harness. The file appears to contain no usable embedded text. Try exporting it to text/markdown or attach page images instead."
2403            );
2404            1
2405        }
2406        Err(e) => {
2407            eprintln!("{}", e);
2408            1
2409        }
2410    }
2411}
2412
2413/// Extract text from any supported document type (PDF, markdown, plain text).
2414/// Used by /attach for one-shot context injection.
2415pub fn extract_document_text(path: &std::path::Path) -> Result<String, String> {
2416    let ext = path
2417        .extension()
2418        .and_then(|e| e.to_str())
2419        .unwrap_or("")
2420        .to_lowercase();
2421    match ext.as_str() {
2422        "pdf" => {
2423            let text = extract_pdf_text(path)?.ok_or_else(|| {
2424                "PDF contains no extractable text — it may be scanned/image-only. \
2425                     Try attaching page screenshots with /image instead."
2426                    .to_string()
2427            })?;
2428            pdf_quality_check(text)
2429        }
2430        _ => std::fs::read_to_string(path).map_err(|e| format!("Could not read file: {e}")),
2431    }
2432}
2433
/// Sanity-checks text extracted from a PDF.
///
/// Garbled output is common with PDFs that embed custom fonts using
/// non-standard glyph mappings. Two heuristics are applied to the trimmed
/// text:
/// - fewer than 150 bytes is treated as too little content to be useful;
/// - a space ratio below 4% (spaces over all non-newline characters) is
///   treated as words merged together.
///
/// Returns the original, untrimmed text when it looks usable, otherwise an
/// explanatory error message.
fn pdf_quality_check(text: String) -> Result<String, String> {
    let body = text.trim();

    if body.len() < 150 {
        return Err(format!(
            "PDF extracted only {} characters — likely a scanned or image-only PDF, \
             or uses unsupported custom fonts. Try attaching page screenshots with /image instead.",
            body.len()
        ));
    }

    // Count spaces against every character that is not a line break.
    // Normal prose is ~15–20% spaces.
    let (visible, spaces) = body
        .chars()
        .fold((0usize, 0usize), |(visible, spaces), ch| match ch {
            '\n' | '\r' => (visible, spaces),
            ' ' => (visible + 1, spaces + 1),
            _ => (visible + 1, spaces),
        });
    let space_ratio = if visible == 0 {
        0.0
    } else {
        spaces as f32 / visible as f32
    };

    if space_ratio < 0.04 {
        return Err(
            "PDF text extraction produced garbled output — words are merged with no spaces. \
             This usually means the PDF uses custom embedded fonts (common with academic publishers \
             like EBSCO, Elsevier, Springer). \
             Try a PDF exported from Word, Google Docs, or LaTeX, \
             or attach page screenshots with /image instead.".to_string()
        );
    }

    Ok(text)
}
2472
2473// ── Chunking strategies ───────────────────────────────────────────────────────
2474
2475/// Dispatch to the correct chunking strategy based on file extension.
2476fn chunk_by_symbols(ext: &str, text: &str) -> Vec<String> {
2477    if ext == "rs" {
2478        chunk_rust_symbols(text)
2479    } else {
2480        chunk_paragraphs(text)
2481    }
2482}
2483
2484/// Chunk Rust source at top-level item boundaries.
2485///
2486/// Detects lines at column 0 that start a Rust declaration keyword, flushes
2487/// the accumulated buffer, then moves any trailing doc-comments / attributes
2488/// forward so they stay with the item they annotate.
2489///
2490/// Items larger than 3000 chars (e.g. large impl blocks) are further split
2491/// by sliding window so no single chunk blows the retrieval budget.
2492fn chunk_rust_symbols(text: &str) -> Vec<String> {
2493    const ITEM_STARTS: &[&str] = &[
2494        "pub fn ",
2495        "pub async fn ",
2496        "pub unsafe fn ",
2497        "async fn ",
2498        "unsafe fn ",
2499        "fn ",
2500        "pub impl",
2501        "impl ",
2502        "pub struct ",
2503        "struct ",
2504        "pub enum ",
2505        "enum ",
2506        "pub trait ",
2507        "trait ",
2508        "pub mod ",
2509        "mod ",
2510        "pub type ",
2511        "type ",
2512        "pub const ",
2513        "const ",
2514        "pub static ",
2515        "static ",
2516    ];
2517
2518    let lines: Vec<&str> = text.lines().collect();
2519    let mut chunks: Vec<String> = Vec::new();
2520    let mut current: Vec<&str> = Vec::new();
2521
2522    for &line in &lines {
2523        let top_level = !line.starts_with(' ') && !line.starts_with('\t');
2524        let is_item = top_level && ITEM_STARTS.iter().any(|s| line.starts_with(s));
2525
2526        if is_item && !current.is_empty() {
2527            // Scan backward to find where trailing doc-comments / attributes start —
2528            // move them to the new chunk so they land with their item.
2529            let mut split = current.len();
2530            while split > 0 {
2531                let prev = current[split - 1].trim();
2532                if prev.starts_with("///")
2533                    || prev.starts_with("//!")
2534                    || prev.starts_with("#[")
2535                    || prev.is_empty()
2536                {
2537                    split -= 1;
2538                } else {
2539                    break;
2540                }
2541            }
2542            let body = current[..split].join("\n");
2543            if !body.trim().is_empty() {
2544                chunks.push(body);
2545            }
2546            current = current[split..].to_vec();
2547        }
2548        current.push(line);
2549    }
2550    if !current.is_empty() {
2551        let body = current.join("\n");
2552        if !body.trim().is_empty() {
2553            chunks.push(body);
2554        }
2555    }
2556
2557    // Subdivide any oversized blocks (e.g. long impl blocks with many methods).
2558    let mut result = Vec::new();
2559    for chunk in chunks {
2560        if chunk.len() > 3000 {
2561            result.extend(sliding_window_chunks(&chunk, 2000, 200));
2562        } else {
2563            result.push(chunk);
2564        }
2565    }
2566    result
2567}
2568
2569/// Chunk non-Rust text at paragraph boundaries (double newline).
2570fn chunk_paragraphs(text: &str) -> Vec<String> {
2571    let mut result: Vec<String> = Vec::new();
2572    let mut current = String::new();
2573
2574    for para in text.split("\n\n") {
2575        if current.len() + para.len() + 2 > 2000 {
2576            if !current.trim().is_empty() {
2577                result.push(current.clone());
2578            }
2579            current = para.to_string();
2580        } else {
2581            if !current.is_empty() {
2582                current.push_str("\n\n");
2583            }
2584            current.push_str(para);
2585        }
2586    }
2587    if !current.trim().is_empty() {
2588        result.push(current);
2589    }
2590
2591    let mut final_result = Vec::new();
2592    for chunk in result {
2593        if chunk.len() > 2000 {
2594            final_result.extend(sliding_window_chunks(&chunk, 2000, 200));
2595        } else {
2596            final_result.push(chunk);
2597        }
2598    }
2599    final_result
2600}
2601
/// Classic sliding-window fallback for oversized blocks.
///
/// Splits `text` into windows of `chunk_size` characters (not bytes), where
/// each window starts `chunk_size - overlap` characters after the previous
/// one. The final window may be shorter. Empty input yields no chunks.
///
/// Degenerate parameters are clamped instead of panicking or looping
/// forever: a `chunk_size` of zero is treated as one, and the window always
/// advances by at least one character even when `overlap >= chunk_size`.
fn sliding_window_chunks(text: &str, chunk_size: usize, overlap: usize) -> Vec<String> {
    let chars: Vec<char> = text.chars().collect();
    let window = chunk_size.max(1);
    // `chunk_size - overlap` would underflow (or be zero) for large overlaps.
    let step = window.saturating_sub(overlap).max(1);

    let mut result = Vec::new();
    let mut start = 0;
    while start < chars.len() {
        let end = start.saturating_add(window).min(chars.len());
        result.push(chars[start..end].iter().collect());
        if end == chars.len() {
            break;
        }
        start += step;
    }
    result
}