Skip to main content

hematite/memory/
vein.rs

1use crate::agent::truncation::safe_head;
2use rusqlite::{params, Connection};
3use serde::Deserialize;
4use serde_json::Value;
5use std::collections::{HashMap, HashSet};
6use std::fmt::Write as _;
7use std::path::Path;
8
/// "The Vein" — local RAG memory engine backed by SQLite FTS5 + semantic embeddings.
///
/// Two retrieval modes, used together:
///
/// **BM25 (always available)**
/// Full-text search via SQLite FTS5 with Porter-stemming. Fast, zero extra GPU cost,
/// works as the fallback when the embedding model isn't loaded.
///
/// **Semantic (when LM Studio has an embedding model loaded)**
/// Calls `/v1/embeddings` (nomic-embed-text-v1.5 or similar) to produce 768-dim float
/// vectors for each chunk. At search time the query is embedded and cosine similarity
/// selects the most conceptually relevant chunks — even when no keywords match.
///
/// Hybrid search runs BM25 and then semantic search (one after the other, on the
/// calling thread), deduplicates by path, and returns the top-k results ranked by
/// combined score. Semantic results score higher when the embedding model is
/// available; BM25 fills the gap when it isn't.
///
/// Indexing is incremental: files are re-indexed only when their mtime changes. Embedding
/// vectors are stored in a separate `chunks_vec` SQLite table so they survive re-runs
/// without hitting the embedding API again.
pub struct Vein {
    /// The single SQLite connection; every access goes through this mutex.
    db: std::sync::Arc<std::sync::Mutex<Connection>>,
    /// Base URL of the LLM provider, used for the embeddings endpoint.
    base_url: String,
    /// Identifier of the embedding model to use, or `None` when no model is set.
    /// Semantic indexing and semantic search are skipped while this is `None`.
    embed_model: std::sync::Arc<std::sync::RwLock<Option<String>>>,
    /// In-memory cache of decoded embedding vectors. Avoids per-turn full-table-scan
    /// of chunks_vec + blob deserialization. Populated at open time, updated
    /// incrementally as new embeddings are stored or paths are evicted.
    embedding_cache: std::sync::RwLock<Vec<EmbeddedChunk>>,
}
39
/// One cached embedding entry — the decoded float vector plus retrieval metadata.
///
/// Mirrors one row of `chunks_vec` joined with its `chunks_meta` row.
struct EmbeddedChunk {
    /// File path the chunk belongs to (key shared with `chunks_meta` / `chunks_fts`).
    path: String,
    /// Zero-based position of the chunk within its file.
    chunk_idx: i64,
    /// Decoded embedding vector for the chunk.
    embedding: Vec<f32>,
    /// `chunks_meta.last_modified` for the owning path.
    last_modified: i64,
    /// `chunks_meta.room` for the owning path.
    room: String,
    /// `chunks_meta.memory_type` for the owning path ("" when unclassified).
    memory_type: String,
}
49
// SAFETY: every field of `Vein` that holds shared state is behind a lock — the
// SQLite `Connection` sits in an `Arc<Mutex<_>>` and both the embed-model slot and
// the embedding cache are wrapped in `RwLock` — so all accesses are serialized.
//
// NOTE(review): rusqlite documents `Connection` as `Send` (it is only `!Sync`). If
// that holds for the pinned rusqlite version, `Arc<Mutex<Connection>>` is already
// `Send + Sync` and these manual impls are redundant — verify and consider removing
// them so the compiler checks thread-safety for any field added later.
unsafe impl Send for Vein {}
unsafe impl Sync for Vein {}
54
/// One retrieved chunk, as returned by the BM25, semantic and hybrid search methods.
#[derive(Debug, Clone)]
pub struct SearchResult {
    /// Path of the file (or session/import entry) the chunk came from.
    pub path: String,
    /// Text of the matching chunk, read from `chunks_fts.content`.
    pub content: String,
    /// Combined relevance score (higher = more relevant).
    pub score: f32,
    /// Subsystem room derived from the file path (e.g. "agent", "ui", "tools").
    pub room: String,
    /// Last-modified timestamp from chunks_meta (unix seconds).
    pub last_modified: i64,
    /// Semantic memory type tagged at index time: "decision", "problem",
    /// "milestone", "preference", or "" for unclassified/source/doc chunks.
    pub memory_type: String,
}
69
/// A file together with its edit "heat".
///
/// NOTE(review): presumably built from the `file_heat` table joined with
/// `chunks_meta`; the constructing code is not in this part of the file — confirm.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct VeinHotFile {
    /// Path of the file.
    pub path: String,
    /// Heat counter for the file (higher = edited more, per `active_room`'s usage).
    pub heat: i64,
    /// Last-modified timestamp of the file — presumably unix seconds; confirm.
    pub last_modified: i64,
    /// Subsystem room the file belongs to.
    pub room: String,
}
77
/// Point-in-time summary of the index contents.
///
/// NOTE(review): the code that fills this struct is outside the visible part of the
/// file; field descriptions below are inferred from names and should be confirmed.
#[derive(Debug, Clone)]
pub struct VeinInspectionSnapshot {
    /// Number of indexed source files.
    pub indexed_source_files: usize,
    /// Number of indexed documentation files.
    pub indexed_docs: usize,
    /// Number of indexed session exchanges.
    pub indexed_session_exchanges: usize,
    /// Number of source/doc chunks that have a stored embedding.
    pub embedded_source_doc_chunks: usize,
    /// Whether at least one embedding exists.
    pub has_any_embeddings: bool,
    /// Room currently considered active, if any.
    pub active_room: Option<String>,
    /// Hottest files at snapshot time.
    pub hot_files: Vec<VeinHotFile>,
    /// Presumably whether the "L1" layer is ready — TODO confirm what L1 denotes.
    pub l1_ready: bool,
}
89
/// Hints extracted from the raw query text. `search_context` builds one via
/// `QuerySignals::from_query` and hands it to `reranked_score` for every candidate.
#[derive(Debug, Default)]
struct QuerySignals {
    /// Phrases that should match verbatim — presumably quoted spans of the query; confirm.
    exact_phrases: Vec<String>,
    /// Distinctive query terms — presumably identifiers or rare words; confirm.
    standout_terms: Vec<String>,
    /// True when the query appears to ask about past sessions rather than code — confirm.
    historical_memory_hint: bool,
    /// Time window the query refers to, if one was recognised.
    temporal_reference: Option<TemporalReference>,
    /// Memory type the query is asking about — boosts matching chunks.
    /// e.g. "what did we decide" → "decision", "what was the bug" → "problem"
    query_memory_type: Option<&'static str>,
}
100
/// A point in time referenced by a query, with a tolerance window around it.
#[derive(Debug, Clone, Copy)]
struct TemporalReference {
    /// Target timestamp — presumably unix seconds, like `last_modified`; confirm.
    target_ts: i64,
    /// Width of the acceptance window in seconds.
    window_secs: i64,
}
106
/// Deserialized session report. Both fields use `#[serde(default)]`, so a report
/// missing either key still parses (empty string / empty transcript).
#[derive(Debug, Deserialize)]
struct SessionReport {
    /// Session start marker as written in the report (kept as a raw string).
    #[serde(default)]
    session_start: String,
    /// Ordered transcript entries of the session.
    #[serde(default)]
    transcript: Vec<SessionTranscriptEntry>,
}
114
/// One transcript line of a session report. Missing keys default to empty strings.
#[derive(Debug, Deserialize)]
struct SessionTranscriptEntry {
    /// Speaker label as written in the report.
    #[serde(default)]
    speaker: String,
    /// What the speaker said.
    #[serde(default)]
    text: String,
}
122
/// An indexable unit of session memory.
///
/// NOTE(review): presumably one user/assistant exchange extracted from a
/// `SessionReport`; the producing code is not visible here — confirm.
#[derive(Debug)]
struct SessionExchange {
    /// Synthetic path under which the exchange is indexed.
    path: String,
    /// Timestamp stored as the exchange's `last_modified`.
    last_modified: i64,
    /// Text content to be chunked and indexed.
    content: String,
}
129
/// Classification of a transcript speaker label.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum SessionSpeakerKind {
    /// The human operator.
    User,
    /// The assistant / model.
    Assistant,
    /// Any other speaker — presumably skipped when building exchanges; confirm.
    Ignore,
}
136
/// Derive a subsystem room label from a file path.
///
/// The path is lowercased and backslashes are normalised to `/` before matching.
/// Each rule below proposes a room with a fixed priority; the highest-priority
/// proposal wins. When no rule matches, the first path component is returned, or
/// `"root"` when that component is empty or looks like a file name (contains `.`).
///
/// Only text after the last `.` of the file name counts as an extension, so an
/// extension-less file that happens to be called `md` or `iss` is not mistaken
/// for a document or an installer script.
pub fn detect_room(path: &str) -> String {
    let lower = path.to_lowercase().replace('\\', "/");
    let filename = lower.rsplit('/').next().unwrap_or(&lower);
    // `rsplit('.').next()` would yield the whole file name when there is no dot;
    // `rsplit_once` yields an extension only when a dot is actually present.
    let ext = filename.rsplit_once('.').map_or("", |(_, e)| e);

    let mut best_room = None::<&str>;
    let mut best_score = 0i32;
    // Keep the proposal with the strictly highest priority seen so far.
    let mut consider = |room: &'static str, score: i32| {
        if score > best_score {
            best_score = score;
            best_room = Some(room);
        }
    };

    // True when `segment` is the whole path or one of its directory components
    // (i.e. the path starts with "{segment}/" or contains "/{segment}/").
    let is_component = |segment: &str| -> bool {
        if lower == segment {
            return true;
        }
        let mut parts = lower.split('/');
        // The final component is the file name, never a directory.
        parts.next_back();
        parts.any(|part| part == segment)
    };

    if lower.starts_with("session/")
        || lower.starts_with(".hematite/reports/")
        || lower.starts_with(".hematite/imports/")
        || is_component("reports")
        || is_component("imports")
    {
        consider("session", 100);
    }

    if lower.starts_with(".hematite/docs/")
        || is_component("docs")
        || matches!(filename, "readme.md" | "claude.md" | ".hematite.md")
        || matches!(ext, "md" | "markdown" | "pdf" | "rst")
    {
        consider("docs", 80);
    }

    if is_component("tests")
        || filename.contains("diagnostic")
        || filename.ends_with("_test.rs")
        || filename.ends_with(".test.ts")
    {
        consider("tests", 85);
    }

    if lower.starts_with(".github/workflows/")
        || is_component("workflows")
        || filename == ".pre-commit-config.yaml"
        || filename == ".pre-commit-config.yml"
        || filename.contains("hook")
    {
        consider("automation", 84);
    }

    if lower.starts_with("installer/")
        || lower.starts_with("dist/")
        || lower.starts_with("scripts/package-")
        || filename.contains("release")
        || filename.contains("bump-version")
        || ext == "iss"
    {
        consider("release", 82);
    }

    if matches!(
        filename,
        "cargo.toml"
            | "cargo.lock"
            | "package.json"
            | "pnpm-lock.yaml"
            | "yarn.lock"
            | "bun.lock"
            | "bun.lockb"
            | "pyproject.toml"
            | "setup.py"
            | "go.mod"
            | "pom.xml"
            | "build.gradle"
            | "build.gradle.kts"
            | "cmakelists.txt"
            | ".gitignore"
            | "settings.json"
            | "mcp_servers.json"
    ) || filename.ends_with(".sln")
        || filename.ends_with(".csproj")
        || filename.contains("config")
    {
        consider("config", 76);
    }

    if is_component("ui")
        || matches!(
            filename,
            "tui.rs" | "voice.rs" | "hatch.rs" | "gpu_monitor.rs"
        )
    {
        consider("ui", 70);
    }

    if is_component("memory") || matches!(filename, "vein.rs" | "deep_reflect.rs") {
        consider("memory", 72);
    }

    if is_component("tools")
        || matches!(
            filename,
            "verify_build.rs"
                | "host_inspect.rs"
                | "shell.rs"
                | "code_sandbox.rs"
                | "project_map.rs"
                | "runtime_trace.rs"
        )
    {
        consider("tools", 68);
    }

    if filename.contains("mcp")
        || filename.contains("lsp")
        || lower.contains("/mcp/")
        || lower.contains("/lsp/")
    {
        consider("integration", 67);
    }

    if matches!(filename, "main.rs" | "runtime.rs" | "inference.rs")
        || filename.contains("startup")
        || filename.contains("runtime")
    {
        consider("runtime", 66);
    }

    if is_component("agent") {
        consider("agent", 60);
    }

    if lower.starts_with("libs/") || is_component("libs") {
        consider("libs", 58);
    }

    if lower.starts_with("scripts/") || is_component("scripts") {
        consider("scripts", 55);
    }

    if let Some(room) = best_room {
        return room.to_string();
    }

    // Fall back to the first directory component; a leading component that is
    // empty or contains a dot (a bare file name) maps to "root".
    lower
        .split('/')
        .next()
        .filter(|s| !s.is_empty() && !s.contains('.'))
        .unwrap_or("root")
        .to_string()
}
309
/// Classify session memory text into a semantic type using cheap, case-insensitive
/// substring matching (no regex engine involved).
/// Returns one of: "decision", "problem", "milestone", "preference", or "" (unclassified).
///
/// Categories are tested in that order and the first category with a matching
/// marker wins. The short marker "oom" is only accepted as a standalone word so
/// that ordinary words such as "room" or "zoom" do not classify text as a problem.
///
/// Applied only to session/import chunks at index time. Source and doc chunks always get "".
/// Used by reranking to boost chunks whose type matches the query's implied intent.
pub fn detect_memory_type(text: &str) -> &'static str {
    // Decision markers — architectural choices, agreed approaches, "let's use X"
    const DECISION_MARKERS: &[&str] = &[
        "let's use ",
        "we'll use ",
        "decided to ",
        "going with ",
        "we agreed ",
        "the plan is",
        "we're going to",
        "switching to",
        "we chose",
        "final decision",
        "we settled on",
        "agreed on",
        "we decided",
    ];
    // Problem markers — bugs, errors, failures, blockers ("oom" is handled separately)
    const PROBLEM_MARKERS: &[&str] = &[
        "bug fixed",
        "bug was",
        "the issue was",
        "root cause",
        "error was",
        "turned out to be",
        "the fix was",
        "was caused by",
        "broken because",
        "fixed by",
        "the problem was",
        "found the bug",
        "port conflict",
        "crash",
        "panicked",
        "segfault",
        "out of memory",
    ];
    // Milestone markers — shipped, completed, working
    const MILESTONE_MARKERS: &[&str] = &[
        "now working",
        "successfully",
        "shipped",
        "deployed",
        "it works",
        "tests pass",
        "all green",
        "breakthrough",
        "finally got",
        "got it working",
        "completed",
        "finished",
        "done with",
        "landed",
    ];
    // Preference markers — personal/operator preferences for style or workflow
    const PREFERENCE_MARKERS: &[&str] = &[
        "i prefer",
        "i like",
        "i don't like",
        "i want",
        "always use",
        "never use",
        "i usually",
        "my preference",
        "keep it",
        "avoid using",
    ];

    /// True when `word` occurs in `haystack` with no alphanumeric character
    /// directly before or after it.
    fn is_standalone_word(haystack: &str, word: &str) -> bool {
        haystack.match_indices(word).any(|(start, hit)| {
            let end = start + hit.len();
            let clear_before = haystack[..start]
                .chars()
                .next_back()
                .map_or(true, |c| !c.is_alphanumeric());
            let clear_after = haystack[end..]
                .chars()
                .next()
                .map_or(true, |c| !c.is_alphanumeric());
            clear_before && clear_after
        })
    }

    let lower = text.to_lowercase();
    let mentions_any = |markers: &[&str]| markers.iter().any(|m| lower.contains(*m));

    if mentions_any(DECISION_MARKERS) {
        return "decision";
    }
    if mentions_any(PROBLEM_MARKERS) || is_standalone_word(&lower, "oom") {
        return "problem";
    }
    if mentions_any(MILESTONE_MARKERS) {
        return "milestone";
    }
    if mentions_any(PREFERENCE_MARKERS) {
        return "preference";
    }

    ""
}
411
412impl Vein {
    // NOTE(review): the code using these limits is outside the visible part of the
    // file; the descriptions below are inferred from the names — confirm at use sites.

    /// Presumably the maximum number of session reports considered for indexing.
    const SESSION_REPORT_LIMIT: usize = 5;
    /// Presumably the maximum number of transcript turns taken from a session.
    const SESSION_TURN_LIMIT: usize = 50;
    /// Presumably the maximum number of import files considered for indexing.
    const IMPORT_FILE_LIMIT: usize = 12;
    /// Size cap of 10 MiB (10 * 1024 * 1024 bytes), presumably per import file.
    const IMPORT_MAX_BYTES: u64 = 10 * 1024 * 1024;
417
418    pub fn new<P: AsRef<Path>>(
419        db_path: P,
420        base_url: String,
421    ) -> Result<Self, Box<dyn std::error::Error>> {
422        let db = Connection::open(db_path)?;
423
424        // WAL mode for better concurrent read performance.
425        db.execute_batch("PRAGMA journal_mode=WAL; PRAGMA synchronous=NORMAL;")?;
426
427        // chunks_meta: tracks last-modified time per path for incremental indexing.
428        // chunks_fts:  BM25 full-text index of all code chunks.
429        // chunks_vec:  semantic embedding vectors, keyed by (path, chunk_idx).
430        db.execute_batch(
431            "CREATE TABLE IF NOT EXISTS chunks_meta (
432                path TEXT PRIMARY KEY,
433                last_modified INTEGER NOT NULL,
434                room TEXT NOT NULL DEFAULT 'root'
435            );
436            CREATE VIRTUAL TABLE IF NOT EXISTS chunks_fts USING fts5(
437                path UNINDEXED,
438                content,
439                tokenize='porter ascii'
440            );
441            CREATE TABLE IF NOT EXISTS chunks_vec (
442                path TEXT NOT NULL,
443                chunk_idx INTEGER NOT NULL,
444                embedding BLOB NOT NULL,
445                PRIMARY KEY (path, chunk_idx)
446            );
447            CREATE TABLE IF NOT EXISTS file_heat (
448                path TEXT PRIMARY KEY,
449                heat INTEGER NOT NULL DEFAULT 0,
450                last_edit INTEGER NOT NULL DEFAULT 0
451            );",
452        )?;
453
454        // Schema migrations — safe to run on every open (IF NOT EXISTS / ignored if col exists).
455        let _ = db
456            .execute_batch("ALTER TABLE chunks_meta ADD COLUMN room TEXT NOT NULL DEFAULT 'root';");
457        let _ = db.execute_batch(
458            "ALTER TABLE file_heat ADD COLUMN last_edit INTEGER NOT NULL DEFAULT 0;",
459        );
460        let _ = db.execute_batch(
461            "ALTER TABLE chunks_meta ADD COLUMN memory_type TEXT NOT NULL DEFAULT '';",
462        );
463
464        // Pre-load all stored embeddings into RAM so search_semantic never needs
465        // to do a full table scan. blob_to_floats decodes each BLOB once here.
466        let embedding_cache: Vec<EmbeddedChunk> = {
467            let mut stmt = db.prepare(
468                "SELECT cv.path, cv.chunk_idx, cv.embedding,
469                        cm.last_modified,
470                        COALESCE(cm.room, 'root'),
471                        COALESCE(cm.memory_type, '')
472                 FROM chunks_vec cv
473                 JOIN chunks_meta cm ON cm.path = cv.path",
474            )?;
475            // Collect raw rows first so stmt is dropped before the block ends.
476            let raw: Vec<(String, i64, Vec<u8>, i64, String, String)> = stmt
477                .query_map([], |row| {
478                    Ok((
479                        row.get::<_, String>(0)?,
480                        row.get::<_, i64>(1)?,
481                        row.get::<_, Vec<u8>>(2)?,
482                        row.get::<_, i64>(3)?,
483                        row.get::<_, String>(4)?,
484                        row.get::<_, String>(5)?,
485                    ))
486                })?
487                .filter_map(|r| r.ok())
488                .collect();
489            raw.into_iter()
490                .map(
491                    |(path, chunk_idx, blob, last_modified, room, memory_type)| EmbeddedChunk {
492                        path,
493                        chunk_idx,
494                        embedding: blob_to_floats(&blob),
495                        last_modified,
496                        room,
497                        memory_type,
498                    },
499                )
500                .collect()
501        };
502
503        Ok(Self {
504            db: std::sync::Arc::new(std::sync::Mutex::new(db)),
505            base_url,
506            embed_model: std::sync::Arc::new(std::sync::RwLock::new(None)),
507            embedding_cache: std::sync::RwLock::new(embedding_cache),
508        })
509    }
510
511    pub fn set_embed_model(&self, model_id: Option<String>) {
512        if let Ok(mut guard) = self.embed_model.write() {
513            *guard = model_id;
514        }
515    }
516
517    fn current_embed_model(&self) -> Option<String> {
518        self.embed_model.read().ok().and_then(|guard| guard.clone())
519    }
520
521    // ── Indexing ──────────────────────────────────────────────────────────────
522
523    /// Index a single file for BM25 search. Skip if mtime hasn't changed.
524    /// Returns the number of chunks written (0 if file was unchanged).
525    pub fn index_document(
526        &mut self,
527        path: &str,
528        last_modified: i64,
529        full_text: &str,
530    ) -> Result<usize, Box<dyn std::error::Error>> {
531        let room = detect_room(path);
532        let ext = std::path::Path::new(path)
533            .extension()
534            .and_then(|e| e.to_str())
535            .unwrap_or("");
536        let chunks = chunk_by_symbols(ext, full_text);
537        // Tag session memory with semantic type; source/doc chunks leave it empty.
538        let memory_type = if room == "session" {
539            detect_memory_type(full_text)
540        } else {
541            ""
542        };
543        self.index_chunks_with_room_and_type(path, last_modified, &room, memory_type, &chunks)
544    }
545
546    fn index_chunks_with_room_and_type(
547        &mut self,
548        path: &str,
549        last_modified: i64,
550        room: &str,
551        memory_type: &str,
552        chunks: &[String],
553    ) -> Result<usize, Box<dyn std::error::Error>> {
554        let db = self.db.lock().unwrap();
555        let existing: Option<i64> = db
556            .query_row(
557                "SELECT last_modified FROM chunks_meta WHERE path = ?1",
558                params![path],
559                |r| r.get(0),
560            )
561            .ok();
562
563        if let Some(ts) = existing {
564            if ts >= last_modified {
565                return Ok(0); // unchanged — skip
566            }
567        }
568
569        // Evict stale BM25 chunks, stale embedding vectors, then update metadata.
570        db.execute("DELETE FROM chunks_fts WHERE path = ?1", params![path])?;
571        db.execute("DELETE FROM chunks_vec WHERE path = ?1", params![path])?;
572        // Evict the in-memory cache for this path as well.
573        if let Ok(mut cache) = self.embedding_cache.write() {
574            cache.retain(|e| e.path != path);
575        }
576        db.execute(
577            "INSERT OR REPLACE INTO chunks_meta (path, last_modified, room, memory_type) VALUES (?1, ?2, ?3, ?4)",
578            params![path, last_modified, room, memory_type],
579        )?;
580
581        drop(db);
582
583        let mut db = self.db.lock().unwrap();
584        let tx = db.transaction()?;
585        {
586            let mut stmt = tx.prepare("INSERT INTO chunks_fts (path, content) VALUES (?1, ?2)")?;
587            for chunk in chunks {
588                stmt.execute(params![path, chunk.as_str()])?;
589            }
590        }
591        tx.commit()?;
592
593        Ok(chunks.len())
594    }
595
596    /// Embed a set of chunks for one file and store the vectors.
597    /// Called after `index_document` returns new chunks.
598    /// Silently skips if the embedding model is unavailable.
599    pub fn embed_and_store_chunks(&self, path: &str, chunks: &[String]) {
600        let Some(embed_model) = self.current_embed_model() else {
601            return;
602        };
603        // Fetch room + last_modified for this path once (needed for cache entry).
604        let (last_modified, room, memory_type) = {
605            let db = self.db.lock().unwrap();
606            db.query_row(
607                "SELECT last_modified, room, memory_type FROM chunks_meta WHERE path = ?1",
608                params![path],
609                |r| {
610                    Ok((
611                        r.get::<_, i64>(0)?,
612                        r.get::<_, String>(1)?,
613                        r.get::<_, String>(2).unwrap_or_default(),
614                    ))
615                },
616            )
617            .unwrap_or((0, "root".to_string(), String::new()))
618        };
619
620        // Build prefixed inputs and send all chunks in a single HTTP round-trip.
621        let prefixed: Vec<String> = chunks
622            .iter()
623            .map(|c| {
624                let p = format!("search_document: {}", c);
625                if p.len() > 8000 {
626                    safe_head(&p, 8000).to_string()
627                } else {
628                    p
629                }
630            })
631            .collect();
632
633        let embeddings = embed_batch(&prefixed, &self.base_url, &embed_model);
634
635        for (idx, maybe_vec) in embeddings.into_iter().enumerate() {
636            if let Some(vec) = maybe_vec {
637                let blob = floats_to_blob(&vec);
638                let db = self.db.lock().unwrap();
639                let _ = db.execute(
640                    "INSERT OR REPLACE INTO chunks_vec (path, chunk_idx, embedding) VALUES (?1, ?2, ?3)",
641                    params![path, idx as i64, blob],
642                );
643                drop(db);
644                // Mirror into in-memory cache — avoids per-turn full scan in search_semantic.
645                if let Ok(mut cache) = self.embedding_cache.write() {
646                    cache.retain(|e| !(e.path == path && e.chunk_idx == idx as i64));
647                    cache.push(EmbeddedChunk {
648                        path: path.to_string(),
649                        chunk_idx: idx as i64,
650                        embedding: vec,
651                        last_modified,
652                        room: room.clone(),
653                        memory_type: memory_type.clone(),
654                    });
655                }
656            }
657        }
658    }
659
660    // ── Search ────────────────────────────────────────────────────────────────
661
662    /// BM25-ranked full-text search via FTS5 MATCH.
663    pub fn search_bm25(
664        &self,
665        query: &str,
666        limit: usize,
667    ) -> Result<Vec<SearchResult>, Box<dyn std::error::Error>> {
668        // Strip common English stopwords so FTS5 MATCH gets meaningful tokens only.
669        // FTS5 uses implicit AND by default — passing stopwords like "how", "does",
670        // "the" causes zero results because source code never contains those phrases.
671        static STOPWORDS: std::sync::OnceLock<HashSet<&'static str>> = std::sync::OnceLock::new();
672        let stopwords = STOPWORDS.get_or_init(|| {
673            [
674                "how", "does", "do", "did", "what", "where", "when", "why", "which", "who", "is",
675                "are", "was", "were", "be", "been", "being", "have", "has", "had", "a", "an",
676                "the", "and", "or", "but", "in", "on", "at", "to", "for", "of", "with", "by",
677                "from", "get", "gets", "got", "work", "works", "make", "makes", "use", "uses",
678                "into", "that", "this", "it", "its",
679            ]
680            .iter()
681            .copied()
682            .collect()
683        });
684
685        // Lowercase once so stopword matching below needs no per-token allocation.
686        // FTS5 is case-insensitive, so lowercased tokens match correctly.
687        let safe_query: String = {
688            let mut s = String::with_capacity(query.len());
689            s.extend(query.chars().map(|c| {
690                if c.is_alphanumeric() || c == ' ' || c == '_' {
691                    c.to_ascii_lowercase()
692                } else {
693                    ' '
694                }
695            }));
696            s
697        };
698
699        // Build an OR query from non-stopword tokens so any relevant term matches.
700        let fts_query = {
701            let mut q = String::with_capacity(safe_query.len() + 16);
702            for w in safe_query
703                .split_whitespace()
704                .filter(|w| w.len() >= 3 && !stopwords.contains(*w))
705            {
706                if !q.is_empty() {
707                    q.push_str(" OR ");
708                }
709                q.push_str(w);
710            }
711            q
712        };
713
714        if fts_query.is_empty() {
715            return Ok(Vec::new());
716        }
717
718        let db = self.db.lock().unwrap();
719        let mut stmt = db.prepare_cached(
720            "SELECT chunks_fts.path, chunks_fts.content, rank, cm.last_modified, cm.room, cm.memory_type
721             FROM chunks_fts
722             JOIN chunks_meta cm ON cm.path = chunks_fts.path
723             WHERE chunks_fts MATCH ?1
724             ORDER BY rank
725             LIMIT ?2",
726        )?;
727
728        let results: Vec<SearchResult> = stmt
729            .query_map(params![fts_query, limit as i64], |row| {
730                Ok(SearchResult {
731                    path: row.get(0)?,
732                    content: row.get(1)?,
733                    score: -(row.get::<_, f64>(2).unwrap_or(0.0) as f32),
734                    last_modified: row.get(3)?,
735                    room: row.get(4)?,
736                    memory_type: row.get::<_, String>(5).unwrap_or_default(),
737                })
738            })?
739            .filter_map(|r| r.ok())
740            .collect();
741
742        Ok(results)
743    }
744
745    /// Semantic search: embed the query, cosine-similarity against all stored vectors.
746    /// Returns empty if the embedding model isn't loaded.
747    pub fn search_semantic(&self, query: &str, limit: usize) -> Vec<SearchResult> {
748        let Some(embed_model) = self.current_embed_model() else {
749            return Vec::new();
750        };
751        let query_vec = match embed_query_blocking(query, &self.base_url, &embed_model) {
752            Some(v) => v,
753            None => return Vec::new(),
754        };
755
756        // Precompute the query vector's squared norm once — avoids recomputing it
757        // for every chunk in the scoring loop below (typically ~1000 comparisons).
758        let query_norm_sq: f32 = query_vec.iter().map(|x| x * x).sum();
759
760        // Score every chunk using indices only — no String clones yet.
761        // Sort and truncate to top-`limit` before paying the clone cost.
762        // Old approach cloned 3 Strings per chunk × ~1000 chunks, discarding ~990 of them.
763        let top_indices: Vec<(f32, usize)> = {
764            let cache = match self.embedding_cache.read() {
765                Ok(g) => g,
766                Err(_) => return Vec::new(),
767            };
768            if cache.is_empty() {
769                return Vec::new();
770            }
771            let mut iv: Vec<(f32, usize)> = cache
772                .iter()
773                .enumerate()
774                .map(|(i, e)| {
775                    let sim =
776                        cosine_similarity_precomputed(&query_vec, query_norm_sq, &e.embedding);
777                    (sim, i)
778                })
779                .collect();
780            iv.sort_unstable_by(|a, b| b.0.partial_cmp(&a.0).unwrap_or(std::cmp::Ordering::Equal));
781            iv.truncate(limit);
782            iv
783        };
784
785        // Clone Strings only for the top-N survivors, then fetch their content.
786        let scored: Vec<(f32, String, i64, i64, String, String)> = {
787            let cache = match self.embedding_cache.read() {
788                Ok(g) => g,
789                Err(_) => return Vec::new(),
790            };
791            top_indices
792                .iter()
793                .map(|&(score, i)| {
794                    let e = &cache[i];
795                    (
796                        score,
797                        e.path.clone(),
798                        e.chunk_idx,
799                        e.last_modified,
800                        e.room.clone(),
801                        e.memory_type.clone(),
802                    )
803                })
804                .collect()
805        };
806
807        let db = self.db.lock().unwrap();
808        // prepare_cached reuses the compiled statement across calls — avoids recompiling
809        // the same SQL for every chunk (up to candidate_limit times per turn).
810        let mut stmt = match db
811            .prepare_cached("SELECT content FROM chunks_fts WHERE path = ?1 LIMIT 1 OFFSET ?2")
812        {
813            Ok(s) => s,
814            Err(_) => return Vec::new(),
815        };
816        scored
817            .into_iter()
818            .filter_map(|(score, path, idx, last_modified, room, memory_type)| {
819                let content: Option<String> = stmt.query_row(params![path, idx], |r| r.get(0)).ok();
820                content.map(|c| SearchResult {
821                    path,
822                    content: c,
823                    score,
824                    room,
825                    last_modified,
826                    memory_type,
827                })
828            })
829            .collect()
830    }
831
832    /// Hybrid search: BM25 + semantic, deduplicated and re-ranked.
833    ///
834    /// Semantic results are preferred (they score higher) when the embedding model
835    /// is available. BM25 fills in or takes over when it isn't.
836    /// Results from the active room (hottest subsystem by edit count) get a
837    /// small boost so the model gravitates toward what's currently being worked on.
838    pub fn search_context(
839        &self,
840        query: &str,
841        limit: usize,
842    ) -> Result<Vec<SearchResult>, Box<dyn std::error::Error>> {
843        let candidate_limit = (limit.max(1) * 4).max(12);
844        let bm25 = self.search_bm25(query, candidate_limit).unwrap_or_default();
845        let semantic = self.search_semantic(query, candidate_limit);
846        let signals = QuerySignals::from_query(query);
847
848        // Determine the active room from heat scores.
849        let active_room = self.active_room();
850
851        // Merge: semantic results win ties (scored 1.0–2.0 range after boost).
852        // BM25 results land in 0.0–1.0 range.
853        let mut merged_by_path: HashMap<String, SearchResult> =
854            HashMap::with_capacity(semantic.len() + bm25.len());
855
856        for r in semantic {
857            let score = reranked_score(&signals, active_room.as_deref(), &r, true);
858            merge_scored_result(&mut merged_by_path, SearchResult { score, ..r });
859        }
860
861        for r in bm25 {
862            let score = reranked_score(&signals, active_room.as_deref(), &r, false);
863            merge_scored_result(&mut merged_by_path, SearchResult { score, ..r });
864        }
865
866        let mut merged: Vec<SearchResult> = merged_by_path.into_values().collect();
867        merged.sort_unstable_by(|a, b| {
868            b.score
869                .partial_cmp(&a.score)
870                .unwrap_or(std::cmp::Ordering::Equal)
871        });
872        merged.truncate(limit);
873        Ok(merged)
874    }
875
876    /// Returns the room with the highest total heat (most edited subsystem).
877    /// Used to bias retrieval toward what the user is actively working on.
878    fn active_room(&self) -> Option<String> {
879        let db = self.db.lock().unwrap();
880        db.prepare_cached(
881            "SELECT cm.room, SUM(fh.heat) as total
882             FROM file_heat fh
883             JOIN chunks_meta cm ON cm.path = fh.path
884             GROUP BY cm.room
885             ORDER BY total DESC
886             LIMIT 1",
887        )
888        .ok()
889        .and_then(|mut stmt| stmt.query_row([], |row| row.get::<_, String>(0)).ok())
890    }
891
892    // ── Project Indexing ──────────────────────────────────────────────────────
893
894    /// Walk the entire project and index all source files (BM25 + embeddings).
895    ///
896    /// Skips: `target/`, `.git/`, `node_modules/`, `.hematite/`, files > 512 KB.
897    /// Also indexes `.hematite/docs/` — the designated reference document drop folder.
898    /// Returns the number of files processed (unchanged files are fast-pathed).
899    pub fn index_project(&mut self) -> usize {
900        let root = crate::tools::file_ops::workspace_root();
901        let mut count = 0usize;
902
903        const INDEXABLE: &[&str] = &[
904            "rs", "toml", "md", "json", "ts", "tsx", "js", "py", "go", "c", "cpp", "h", "yaml",
905            "yml", "txt",
906        ];
907        const SKIP_DIRS: &[&str] = &["target", ".git", "node_modules", ".hematite"];
908
909        // O(1) membership tests for the per-file extension check and the per-dir
910        // name check inside the walkdir iterator, built once via OnceLock.
911        static INDEXABLE_SET: std::sync::OnceLock<std::collections::HashSet<&'static str>> =
912            std::sync::OnceLock::new();
913        static SKIP_DIRS_SET: std::sync::OnceLock<std::collections::HashSet<&'static str>> =
914            std::sync::OnceLock::new();
915        let indexable = INDEXABLE_SET.get_or_init(|| INDEXABLE.iter().copied().collect());
916        let skip_dirs = SKIP_DIRS_SET.get_or_init(|| SKIP_DIRS.iter().copied().collect());
917
918        for entry in walkdir::WalkDir::new(&root)
919            .follow_links(false)
920            .into_iter()
921            .filter_entry(|e| {
922                if e.file_type().is_dir() {
923                    let name = e.file_name().to_string_lossy();
924                    return !skip_dirs.contains(name.as_ref());
925                }
926                true
927            })
928            .filter_map(|e| e.ok())
929            .filter(|e| e.file_type().is_file())
930        {
931            let path = entry.path();
932            let ext = path.extension().and_then(|e| e.to_str()).unwrap_or("");
933            if !indexable.contains(ext) {
934                continue;
935            }
936
937            // entry.metadata() reuses WIN32_FIND_DATA cached by walkdir on Windows
938            // (no extra stat syscall), falling back to symlink_metadata on Unix.
939            let Ok(meta) = entry.metadata() else {
940                continue;
941            };
942            if meta.len() > 512_000 {
943                continue;
944            }
945
946            let mtime = meta
947                .modified()
948                .map(|t| {
949                    t.duration_since(std::time::UNIX_EPOCH)
950                        .unwrap_or_default()
951                        .as_secs() as i64
952                })
953                .unwrap_or(0);
954
955            let rel = path.strip_prefix(&root).unwrap_or(path);
956            let rel_str = rel.to_string_lossy().replace('\\', "/");
957
958            if let Ok(content) = std::fs::read_to_string(path) {
959                match self.index_document(&rel_str, mtime, &content) {
960                    Ok(n) if n > 0 => {
961                        count += 1;
962                    }
963                    Ok(_) => {}
964                    Err(_) => {}
965                }
966            }
967        }
968
969        count += self.index_workspace_artifacts(&crate::tools::file_ops::hematite_dir());
970
971        count
972    }
973
974    /// Index workspace-local supporting context that should be available even
975    /// outside a real project workspace: `.hematite/docs/`, recent session
976    /// reports stored in `.hematite/reports/`, and imported chat exports in
977    /// `.hematite/imports/`.
978    pub fn index_workspace_artifacts(&mut self, workspace_root: &std::path::Path) -> usize {
979        let mut count = self.index_docs_folder(workspace_root);
980        count += self.index_recent_session_reports(workspace_root);
981        count += self.index_imported_session_exports(workspace_root);
982        self.backfill_missing_embeddings();
983        count
984    }
985
986    /// Index reference documents in `.hematite/docs/`.
987    /// Supports PDF (text extraction), markdown, and plain text.
988    /// Documents are stored with path prefix `docs/filename` so they are
989    /// distinguishable from source files in retrieval results.
990    fn index_docs_folder(&mut self, workspace_root: &std::path::Path) -> usize {
991        let docs_dir = workspace_root.join(".hematite").join("docs");
992        const DOCS_INDEXABLE: &[&str] = &["pdf", "md", "txt", "markdown"];
993        let mut count = 0usize;
994        let mut desired_paths = HashSet::new();
995
996        if docs_dir.exists() {
997            for entry in walkdir::WalkDir::new(&docs_dir)
998                .max_depth(3)
999                .follow_links(false)
1000                .into_iter()
1001                .filter_map(|e| e.ok())
1002                .filter(|e| e.file_type().is_file())
1003            {
1004                let path = entry.path();
1005                let ext = path
1006                    .extension()
1007                    .and_then(|e| e.to_str())
1008                    .unwrap_or("")
1009                    .to_lowercase();
1010                if !DOCS_INDEXABLE.contains(&ext.as_str()) {
1011                    continue;
1012                }
1013
1014                let Ok(meta) = entry.metadata() else {
1015                    continue;
1016                };
1017                if meta.len() > 50_000_000 {
1018                    continue;
1019                }
1020
1021                let mtime = meta
1022                    .modified()
1023                    .map(|t| {
1024                        t.duration_since(std::time::UNIX_EPOCH)
1025                            .unwrap_or_default()
1026                            .as_secs() as i64
1027                    })
1028                    .unwrap_or(0);
1029
1030                let rel = path.strip_prefix(workspace_root).unwrap_or(path);
1031                let rel_str = rel.to_string_lossy().replace('\\', "/");
1032                desired_paths.insert(rel_str.clone());
1033
1034                let content = if ext == "pdf" {
1035                    extract_pdf_text(path).ok().flatten()
1036                } else {
1037                    std::fs::read_to_string(path).ok()
1038                };
1039
1040                if let Some(text) = content {
1041                    if text.trim().is_empty() {
1042                        continue;
1043                    }
1044                    match self.index_document(&rel_str, mtime, &text) {
1045                        Ok(n) if n > 0 => {
1046                            count += 1;
1047                        }
1048                        Ok(_) => {}
1049                        Err(_) => {}
1050                    }
1051                }
1052            }
1053        }
1054
1055        self.prune_indexed_prefix(".hematite/docs/", &desired_paths);
1056        count
1057    }
1058
1059    /// Index the most recent local session reports by exchange pair so prior
1060    /// decisions remain searchable across launches without flooding the vein.
1061    pub fn index_recent_session_reports(&mut self, workspace_root: &std::path::Path) -> usize {
1062        let reports_dir = workspace_root.join(".hematite").join("reports");
1063        let mut count = 0usize;
1064        let mut desired_paths = HashSet::new();
1065
1066        if reports_dir.exists() {
1067            let mut reports: Vec<std::path::PathBuf> = std::fs::read_dir(&reports_dir)
1068                .ok()
1069                .into_iter()
1070                .flat_map(|entries| entries.filter_map(|entry| entry.ok()))
1071                .map(|entry| entry.path())
1072                .filter(|path| {
1073                    path.is_file()
1074                        && path.extension().and_then(|ext| ext.to_str()) == Some("json")
1075                        && path
1076                            .file_stem()
1077                            .and_then(|stem| stem.to_str())
1078                            .map(|stem| stem.starts_with("session_"))
1079                            .unwrap_or(false)
1080                })
1081                .collect();
1082
1083            reports.sort_by(|a, b| {
1084                let a_name = a
1085                    .file_name()
1086                    .and_then(|name| name.to_str())
1087                    .unwrap_or_default();
1088                let b_name = b
1089                    .file_name()
1090                    .and_then(|name| name.to_str())
1091                    .unwrap_or_default();
1092                b_name.cmp(a_name)
1093            });
1094            reports.truncate(Self::SESSION_REPORT_LIMIT);
1095
1096            for report_path in reports {
1097                let Ok(meta) = std::fs::metadata(&report_path) else {
1098                    continue;
1099                };
1100                let mtime = meta
1101                    .modified()
1102                    .map(|t| {
1103                        t.duration_since(std::time::UNIX_EPOCH)
1104                            .unwrap_or_default()
1105                            .as_secs() as i64
1106                    })
1107                    .unwrap_or(0);
1108
1109                for exchange in load_session_exchanges(&report_path, mtime) {
1110                    desired_paths.insert(exchange.path.clone());
1111                    let mtype = detect_memory_type(&exchange.content);
1112                    match self.index_chunks_with_room_and_type(
1113                        &exchange.path,
1114                        exchange.last_modified,
1115                        "session",
1116                        mtype,
1117                        std::slice::from_ref(&exchange.content),
1118                    ) {
1119                        Ok(n) if n > 0 => {
1120                            count += 1;
1121                        }
1122                        Ok(_) => {}
1123                        Err(_) => {}
1124                    }
1125                }
1126            }
1127        }
1128
1129        self.prune_indexed_prefix("session/", &desired_paths);
1130        count
1131    }
1132
1133    /// Index imported chat exports from `.hematite/imports/`.
1134    /// Supported inputs include already-normalized `>` transcripts, Claude Code
1135    /// JSONL, Codex CLI JSONL, simple role/content JSON exports, ChatGPT
1136    /// `mapping` exports, and Hematite session-report JSON.
1137    pub fn index_imported_session_exports(&mut self, workspace_root: &std::path::Path) -> usize {
1138        let imports_dir = workspace_root.join(".hematite").join("imports");
1139        let mut count = 0usize;
1140        let mut desired_paths = HashSet::new();
1141
1142        if imports_dir.exists() {
1143            let mut imports: Vec<(std::path::PathBuf, i64)> = walkdir::WalkDir::new(&imports_dir)
1144                .max_depth(4)
1145                .follow_links(false)
1146                .into_iter()
1147                .filter_map(|entry| entry.ok())
1148                .filter(|entry| entry.file_type().is_file())
1149                .filter_map(|entry| {
1150                    let path = entry.into_path();
1151                    let ext = path
1152                        .extension()
1153                        .and_then(|ext| ext.to_str())
1154                        .unwrap_or("")
1155                        .to_ascii_lowercase();
1156                    if !matches!(ext.as_str(), "json" | "jsonl" | "md" | "txt") {
1157                        return None;
1158                    }
1159                    let meta = std::fs::metadata(&path).ok()?;
1160                    if meta.len() > Self::IMPORT_MAX_BYTES {
1161                        return None;
1162                    }
1163                    let mtime = meta
1164                        .modified()
1165                        .map(|t| {
1166                            t.duration_since(std::time::UNIX_EPOCH)
1167                                .unwrap_or_default()
1168                                .as_secs() as i64
1169                        })
1170                        .unwrap_or(0);
1171                    Some((path, mtime))
1172                })
1173                .collect();
1174
1175            imports.sort_by(|(a_path, a_mtime), (b_path, b_mtime)| {
1176                b_mtime
1177                    .cmp(a_mtime)
1178                    .then_with(|| a_path.to_string_lossy().cmp(&b_path.to_string_lossy()))
1179            });
1180            imports.truncate(Self::IMPORT_FILE_LIMIT);
1181
1182            for (import_path, mtime) in imports {
1183                for exchange in load_imported_session_exchanges(&import_path, &imports_dir, mtime) {
1184                    desired_paths.insert(exchange.path.clone());
1185                    let mtype = detect_memory_type(&exchange.content);
1186                    match self.index_chunks_with_room_and_type(
1187                        &exchange.path,
1188                        exchange.last_modified,
1189                        "session",
1190                        mtype,
1191                        std::slice::from_ref(&exchange.content),
1192                    ) {
1193                        Ok(n) if n > 0 => {
1194                            count += 1;
1195                        }
1196                        Ok(_) => {}
1197                        Err(_) => {}
1198                    }
1199                }
1200            }
1201        }
1202
1203        self.prune_indexed_prefix("session/imports/", &desired_paths);
1204        count
1205    }
1206
1207    /// Embed any FTS chunks that don't yet have a vector in chunks_vec.
1208    /// Called at the end of index_project so that loading the embedding model
1209    /// after the initial index automatically triggers a semantic upgrade on the
1210    /// next agent turn — no /forget or file-touch required.
1211    fn backfill_missing_embeddings(&self) {
1212        // Fast path: if chunk counts match, nothing to do.
1213        let (fts_count, vec_count) = {
1214            let db = self.db.lock().unwrap();
1215            let fts: i64 = db
1216                .query_row("SELECT COUNT(*) FROM chunks_fts", [], |r| r.get(0))
1217                .unwrap_or(0);
1218            let vec: i64 = db
1219                .query_row("SELECT COUNT(*) FROM chunks_vec", [], |r| r.get(0))
1220                .unwrap_or(0);
1221            (fts, vec)
1222        };
1223        if fts_count == 0 || fts_count == vec_count {
1224            return;
1225        }
1226
1227        // Fetch (path, chunk_idx, content, last_modified, room, memory_type) for chunks with no embedding.
1228        // chunks_fts rowid serves as chunk_idx (1-based → convert to 0-based).
1229        let missing: Vec<(String, i64, String, i64, String, String)> = {
1230            let db = self.db.lock().unwrap();
1231            let mut stmt = match db.prepare_cached(
1232                "SELECT f.path, (f.rowid - 1) AS chunk_idx, f.content,
1233                        COALESCE(cm.last_modified, 0),
1234                        COALESCE(cm.room, 'root'),
1235                        COALESCE(cm.memory_type, '')
1236                 FROM chunks_fts f
1237                 LEFT JOIN chunks_vec v ON f.path = v.path AND (f.rowid - 1) = v.chunk_idx
1238                 LEFT JOIN chunks_meta cm ON cm.path = f.path
1239                 WHERE v.path IS NULL
1240                 ORDER BY CASE
1241                     WHEN f.path LIKE '%.rs' THEN 0
1242                     WHEN f.path LIKE '%.toml' THEN 1
1243                     WHEN f.path LIKE '%.json' THEN 2
1244                     ELSE 3
1245                 END, f.path
1246                 LIMIT 20",
1247            ) {
1248                Ok(s) => s,
1249                Err(_) => return,
1250            };
1251            let Ok(rows) = stmt.query_map([], |r| {
1252                Ok((
1253                    r.get::<_, String>(0)?,
1254                    r.get::<_, i64>(1)?,
1255                    r.get::<_, String>(2)?,
1256                    r.get::<_, i64>(3)?,
1257                    r.get::<_, String>(4)?,
1258                    r.get::<_, String>(5)?,
1259                ))
1260            }) else {
1261                return;
1262            };
1263            rows.filter_map(|r| r.ok()).collect()
1264        };
1265
1266        let Some(embed_model) = self.current_embed_model() else {
1267            return;
1268        };
1269
1270        // Send all missing chunks in one batch request instead of N sequential calls.
1271        let prefixed: Vec<String> = missing
1272            .iter()
1273            .map(|(_, _, content, _, _, _)| {
1274                let p = format!("search_document: {}", content);
1275                if p.len() > 8000 {
1276                    safe_head(&p, 8000).to_string()
1277                } else {
1278                    p
1279                }
1280            })
1281            .collect();
1282
1283        let embeddings = embed_batch(&prefixed, &self.base_url, &embed_model);
1284
1285        for (i, maybe_vec) in embeddings.into_iter().enumerate() {
1286            let Some(vec) = maybe_vec else {
1287                // Embedding model not available — stop trying for this pass.
1288                break;
1289            };
1290            let (path, idx, _, last_modified, room, memory_type) = &missing[i];
1291            let blob = floats_to_blob(&vec);
1292            let db = self.db.lock().unwrap();
1293            let _ = db.execute(
1294                "INSERT OR REPLACE INTO chunks_vec (path, chunk_idx, embedding) VALUES (?1, ?2, ?3)",
1295                params![path, idx, blob],
1296            );
1297            drop(db);
1298            // Mirror into in-memory cache.
1299            if let Ok(mut cache) = self.embedding_cache.write() {
1300                cache.retain(|e| !(e.path == *path && e.chunk_idx == *idx));
1301                cache.push(EmbeddedChunk {
1302                    path: path.clone(),
1303                    chunk_idx: *idx,
1304                    embedding: vec,
1305                    last_modified: *last_modified,
1306                    room: room.clone(),
1307                    memory_type: memory_type.clone(),
1308                });
1309            }
1310        }
1311    }
1312
1313    /// Total number of unique files currently indexed.
1314    /// Session exchange chunks are excluded so status counts stay source/doc centric.
1315    pub fn file_count(&self) -> usize {
1316        let db = self.db.lock().unwrap();
1317        db.query_row(
1318            "SELECT COUNT(*) FROM chunks_meta WHERE path NOT LIKE 'session/%'",
1319            [],
1320            |r| r.get::<_, i64>(0),
1321        )
1322        .unwrap_or(0) as usize
1323    }
1324
1325    /// Number of source/doc chunks that have semantic embedding vectors stored.
1326    /// Session exchange chunks are excluded so status counts stay source/doc centric.
1327    pub fn embedded_chunk_count(&self) -> usize {
1328        let db = self.db.lock().unwrap();
1329        db.query_row(
1330            "SELECT COUNT(*) FROM chunks_vec WHERE path NOT LIKE 'session/%'",
1331            [],
1332            |r| r.get::<_, i64>(0),
1333        )
1334        .unwrap_or(0) as usize
1335    }
1336
1337    /// True when any chunk type currently has embeddings available.
1338    pub fn has_any_embeddings(&self) -> bool {
1339        let db = self.db.lock().unwrap();
1340        db.query_row("SELECT EXISTS(SELECT 1 FROM chunks_vec LIMIT 1)", [], |r| {
1341            r.get::<_, i64>(0)
1342        })
1343        .unwrap_or(0)
1344            != 0
1345    }
1346
1347    /// Wipe all indexed data. The DB file stays on disk; next index_project()
1348    /// call rebuilds from scratch (re-reads all files, re-embeds all chunks).
1349    pub fn reset(&self) {
1350        let db = self.db.lock().unwrap();
1351        let _ = db.execute_batch(
1352            "DELETE FROM chunks_fts;
1353             DELETE FROM chunks_vec;
1354             DELETE FROM chunks_meta;",
1355        );
1356    }
1357
1358    /// Return a compact operator-facing snapshot of what The Vein currently knows.
1359    /// Intended for trust/debug surfaces like `/vein-inspect`.
1360    pub fn inspect_snapshot(&self, hot_limit: usize) -> VeinInspectionSnapshot {
1361        let db = self.db.lock().unwrap();
1362        let indexed_source_files = db
1363            .query_row(
1364                "SELECT COUNT(*) FROM chunks_meta
1365                 WHERE path NOT LIKE 'session/%'
1366                   AND path NOT LIKE '.hematite/docs/%'",
1367                [],
1368                |r| r.get::<_, i64>(0),
1369            )
1370            .unwrap_or(0) as usize;
1371        let indexed_docs = db
1372            .query_row(
1373                "SELECT COUNT(*) FROM chunks_meta WHERE path LIKE '.hematite/docs/%'",
1374                [],
1375                |r| r.get::<_, i64>(0),
1376            )
1377            .unwrap_or(0) as usize;
1378        let indexed_session_exchanges = db
1379            .query_row(
1380                "SELECT COUNT(*) FROM chunks_meta WHERE path LIKE 'session/%'",
1381                [],
1382                |r| r.get::<_, i64>(0),
1383            )
1384            .unwrap_or(0) as usize;
1385        let embedded_source_doc_chunks = db
1386            .query_row(
1387                "SELECT COUNT(*) FROM chunks_vec WHERE path NOT LIKE 'session/%'",
1388                [],
1389                |r| r.get::<_, i64>(0),
1390            )
1391            .unwrap_or(0) as usize;
1392        let has_any_embeddings = db
1393            .query_row("SELECT EXISTS(SELECT 1 FROM chunks_vec LIMIT 1)", [], |r| {
1394                r.get::<_, i64>(0)
1395            })
1396            .unwrap_or(0)
1397            != 0;
1398        drop(db);
1399
1400        let hot_files = self
1401            .hot_files(hot_limit.max(1))
1402            .into_iter()
1403            .map(|(path, heat, last_modified, room)| VeinHotFile {
1404                path,
1405                heat,
1406                last_modified,
1407                room,
1408            })
1409            .collect::<Vec<_>>();
1410
1411        VeinInspectionSnapshot {
1412            indexed_source_files,
1413            indexed_docs,
1414            indexed_session_exchanges,
1415            embedded_source_doc_chunks,
1416            has_any_embeddings,
1417            active_room: self.active_room(),
1418            l1_ready: !hot_files.is_empty(),
1419            hot_files,
1420        }
1421    }
1422
1423    // ── L1 heat tracking ──────────────────────────────────────────────────────
1424
1425    /// Record an edit to a file. Increments its heat score in file_heat.
1426    /// Called from the tool dispatch after a successful edit_file / write_file /
1427    /// patch_hunk / multi_search_replace so the L1 context stays current.
1428    pub fn bump_heat(&self, path: &str) {
1429        if path.is_empty() {
1430            return;
1431        }
1432        let now = std::time::SystemTime::now()
1433            .duration_since(std::time::UNIX_EPOCH)
1434            .unwrap_or_default()
1435            .as_secs() as i64;
1436        let db = self.db.lock().unwrap();
1437        let _ = db.execute(
1438            "INSERT INTO file_heat (path, heat, last_edit) VALUES (?1, 1, ?2)
1439             ON CONFLICT(path) DO UPDATE SET heat = heat + 1, last_edit = ?2",
1440            params![path, now],
1441        );
1442    }
1443
1444    /// Return the top N hot files ranked by edit count (heat) then recency.
1445    /// Joins file_heat with chunks_meta so only indexed files are included.
1446    /// Returns (path, heat, mtime, room).
1447    fn hot_files(&self, n: usize) -> Vec<(String, i64, i64, String)> {
1448        let db = self.db.lock().unwrap();
1449        let mut stmt = match db.prepare_cached(
1450            "SELECT fh.path, fh.heat, cm.last_modified, cm.room
1451             FROM file_heat fh
1452             JOIN chunks_meta cm ON cm.path = fh.path
1453             ORDER BY fh.heat DESC, cm.last_modified DESC
1454             LIMIT ?1",
1455        ) {
1456            Ok(s) => s,
1457            Err(_) => return vec![],
1458        };
1459        stmt.query_map(params![n as i64], |row| {
1460            Ok((
1461                row.get::<_, String>(0)?,
1462                row.get::<_, i64>(1)?,
1463                row.get::<_, i64>(2)?,
1464                row.get::<_, String>(3)?,
1465            ))
1466        })
1467        .map(|rows| rows.filter_map(|r| r.ok()).collect())
1468        .unwrap_or_default()
1469    }
1470
1471    /// Return the paths of the top hot files (most edited).
1472    /// Used by RepoMapGenerator to bias PageRank toward active files.
1473    pub fn hot_file_paths(&self, n: usize) -> Vec<String> {
1474        self.hot_files(n)
1475            .into_iter()
1476            .map(|(path, _, _, _)| path)
1477            .collect()
1478    }
1479
1480    /// Return hot files with normalized heat weights in [0.0, 1.0].
1481    /// The hottest file gets weight 1.0; others are scaled proportionally.
1482    /// Used by RepoMapGenerator to apply heat-weighted PageRank personalization.
1483    pub fn hot_files_weighted(&self, n: usize) -> Vec<(String, f64)> {
1484        let files = self.hot_files(n);
1485        if files.is_empty() {
1486            return vec![];
1487        }
1488        let max_heat = files
1489            .iter()
1490            .map(|(_, h, _, _)| *h)
1491            .max()
1492            .unwrap_or(1)
1493            .max(1) as f64;
1494        files
1495            .into_iter()
1496            .map(|(path, heat, _, _)| {
1497                let weight = (heat as f64) / max_heat;
1498                (path, weight)
1499            })
1500            .collect()
1501    }
1502
1503    /// Build the L1 context block — a compact "hot files" summary injected into
1504    /// the system prompt at session start. Capped at ~150 tokens.
1505    /// Files are grouped by room so the model sees subsystem structure at a glance.
1506    /// Returns None when there are no heat records yet (fresh project).
1507    pub fn l1_context(&self) -> Option<String> {
1508        let files = self.hot_files(8);
1509        if files.is_empty() {
1510            return None;
1511        }
1512        let now = std::time::SystemTime::now()
1513            .duration_since(std::time::UNIX_EPOCH)
1514            .unwrap_or_default()
1515            .as_secs() as i64;
1516
1517        // Group by room for readability.
1518        let mut by_room: std::collections::BTreeMap<String, Vec<(String, i64, i64)>> =
1519            std::collections::BTreeMap::new();
1520        for (path, heat, mtime, room) in &files {
1521            by_room
1522                .entry(room.clone())
1523                .or_default()
1524                .push((path.clone(), *heat, *mtime));
1525        }
1526
1527        let mut out = String::from("# Hot Files (most edited — grouped by subsystem)\n");
1528        for (room, entries) in &by_room {
1529            let _ = writeln!(out, "[{}]", room);
1530            for (path, heat, mtime) in entries {
1531                let age_secs = now - mtime;
1532                let age = if age_secs < 3600 {
1533                    "just now".to_string()
1534                } else if age_secs < 86400 {
1535                    format!("{}h ago", age_secs / 3600)
1536                } else {
1537                    format!("{}d ago", age_secs / 86400)
1538                };
1539                let _ = writeln!(
1540                    out,
1541                    "  - {} [{} edit{}, {}]",
1542                    path,
1543                    heat,
1544                    if *heat == 1 { "" } else { "s" },
1545                    age
1546                );
1547            }
1548        }
1549        Some(out)
1550    }
1551
1552    fn prune_indexed_prefix(&self, prefix: &str, desired_paths: &HashSet<String>) {
1553        let pattern = format!("{}%", prefix);
1554        let existing_paths: Vec<String> = {
1555            let db = self.db.lock().unwrap();
1556            let mut stmt =
1557                match db.prepare_cached("SELECT path FROM chunks_meta WHERE path LIKE ?1") {
1558                    Ok(stmt) => stmt,
1559                    Err(_) => return,
1560                };
1561            stmt.query_map(params![pattern], |row| row.get::<_, String>(0))
1562                .map(|rows| rows.filter_map(|row| row.ok()).collect())
1563                .unwrap_or_default()
1564        };
1565
1566        if existing_paths.is_empty() {
1567            return;
1568        }
1569
1570        let db = self.db.lock().unwrap();
1571        for path in existing_paths {
1572            if desired_paths.contains(&path) {
1573                continue;
1574            }
1575            let _ = db.execute("DELETE FROM chunks_fts WHERE path = ?1", params![path]);
1576            let _ = db.execute("DELETE FROM chunks_vec WHERE path = ?1", params![path]);
1577            let _ = db.execute("DELETE FROM chunks_meta WHERE path = ?1", params![path]);
1578        }
1579    }
1580}
1581
1582impl QuerySignals {
1583    fn from_query(query: &str) -> Self {
1584        let lower = query.to_ascii_lowercase();
1585        let historical_memory_hint = [
1586            "remember",
1587            "earlier",
1588            "previous",
1589            "last time",
1590            "what did we decide",
1591            "why did we decide",
1592            "what did we say",
1593            "why did we change",
1594        ]
1595        .iter()
1596        .any(|needle| lower.contains(needle));
1597
1598        // Detect what memory type the query is asking about.
1599        let query_memory_type = if lower.contains("decide")
1600            || lower.contains("decision")
1601            || lower.contains("we agreed")
1602            || lower.contains("we chose")
1603        {
1604            Some("decision")
1605        } else if lower.contains("bug")
1606            || lower.contains("error")
1607            || lower.contains("issue")
1608            || lower.contains("problem")
1609            || lower.contains("fix")
1610            || lower.contains("broken")
1611        {
1612            Some("problem")
1613        } else if lower.contains("shipped")
1614            || lower.contains("milestone")
1615            || lower.contains("finished")
1616            || lower.contains("working now")
1617        {
1618            Some("milestone")
1619        } else if lower.contains("prefer")
1620            || lower.contains("my preference")
1621            || lower.contains("i like")
1622            || lower.contains("i want")
1623        {
1624            Some("preference")
1625        } else {
1626            None
1627        };
1628
1629        Self {
1630            exact_phrases: extract_exact_phrases(query),
1631            standout_terms: extract_standout_terms(query),
1632            historical_memory_hint,
1633            temporal_reference: extract_temporal_reference(query),
1634            query_memory_type,
1635        }
1636    }
1637}
1638
1639fn merge_scored_result(
1640    merged_by_path: &mut HashMap<String, SearchResult>,
1641    candidate: SearchResult,
1642) {
1643    match merged_by_path.get_mut(&candidate.path) {
1644        Some(existing) if candidate.score > existing.score => *existing = candidate,
1645        Some(_) => {}
1646        None => {
1647            merged_by_path.insert(candidate.path.clone(), candidate);
1648        }
1649    }
1650}
1651
1652fn reranked_score(
1653    signals: &QuerySignals,
1654    active_room: Option<&str>,
1655    result: &SearchResult,
1656    is_semantic: bool,
1657) -> f32 {
1658    let base = if is_semantic {
1659        1.0 + result.score.clamp(0.0, 1.0)
1660    } else {
1661        (result.score / 10.0).clamp(0.0, 1.0)
1662    };
1663    base + room_bias(active_room, result)
1664        + retrieval_signal_boost(signals, result)
1665        + temporal_memory_boost(signals, result)
1666}
1667
1668fn room_bias(active_room: Option<&str>, result: &SearchResult) -> f32 {
1669    if active_room == Some(result.room.as_str()) {
1670        0.15
1671    } else {
1672        0.0
1673    }
1674}
1675
1676fn retrieval_signal_boost(signals: &QuerySignals, result: &SearchResult) -> f32 {
1677    let mut boost = 0.0f32;
1678
1679    // Lowercase once; reuse for both the phrase-match haystack and the
1680    // per-term standout loop — avoids one allocation per standout term.
1681    let lower_path = result.path.to_ascii_lowercase();
1682    let lower_content = result.content.to_ascii_lowercase();
1683    let mut haystack = String::with_capacity(lower_path.len() + 1 + lower_content.len());
1684    haystack.push_str(&lower_path);
1685    haystack.push('\n');
1686    haystack.push_str(&lower_content);
1687
1688    let phrase_matches = signals
1689        .exact_phrases
1690        .iter()
1691        .filter(|phrase| haystack.contains(phrase.as_str()))
1692        .count();
1693    if phrase_matches > 0 {
1694        boost += 0.35 + ((phrase_matches.saturating_sub(1)) as f32 * 0.1);
1695    }
1696
1697    let mut standout_matches = 0;
1698    for term in &signals.standout_terms {
1699        if lower_path.contains(term.as_str()) {
1700            boost += 0.40;
1701            standout_matches += 1;
1702        } else if lower_content.contains(term.as_str()) {
1703            boost += 0.12;
1704            standout_matches += 1;
1705        }
1706        if standout_matches >= 3 {
1707            break;
1708        }
1709    }
1710
1711    if signals.historical_memory_hint && result.room == "session" {
1712        boost += 0.45;
1713    }
1714
1715    // Boost session chunks whose tagged memory type matches the query's intent.
1716    if let Some(qtype) = signals.query_memory_type {
1717        if !result.memory_type.is_empty() && result.memory_type == qtype {
1718            boost += 0.35;
1719        }
1720    }
1721
1722    boost
1723}
1724
1725fn temporal_memory_boost(signals: &QuerySignals, result: &SearchResult) -> f32 {
1726    if result.room != "session" {
1727        return 0.0;
1728    }
1729    let Some(reference) = signals.temporal_reference else {
1730        return 0.0;
1731    };
1732    let Some(memory_ts) = session_memory_timestamp(result) else {
1733        return 0.0;
1734    };
1735
1736    let span = reference.window_secs.max(86_400);
1737    let full_fade = span.saturating_mul(8);
1738    if full_fade <= 0 {
1739        return 0.0;
1740    }
1741
1742    let distance = (memory_ts - reference.target_ts).abs();
1743    let closeness = 1.0 - (distance as f32 / full_fade as f32).min(1.0);
1744    if closeness <= 0.0 {
1745        0.0
1746    } else {
1747        0.22 * closeness
1748    }
1749}
1750
/// Extracts quoted phrases (`"..."`, `'...'` or `` `...` ``) from a query.
///
/// Each phrase is trimmed and ASCII-lowercased; phrases shorter than three
/// bytes and duplicates are dropped. Order of first appearance is preserved.
///
/// Single quotes double as apostrophes in ordinary prose, so they get extra
/// care to avoid inventing phrases out of contractions and possessives:
/// * a `'` directly after an alphanumeric character ("what's", "users'") is
///   never treated as an opening quote;
/// * a `'` directly before an alphanumeric character ("don't") is never
///   treated as a closing quote;
/// * an opening `'` with no closing partner yields no phrase.
///
/// Double quotes and backticks keep the lenient behaviour: an unterminated
/// quote captures everything up to the end of the query.
fn extract_exact_phrases(query: &str) -> Vec<String> {
    let chars: Vec<char> = query.chars().collect();
    let mut phrases: Vec<String> = Vec::new();
    let mut i = 0usize;

    while i < chars.len() {
        let quote = chars[i];
        if !matches!(quote, '"' | '\'' | '`') {
            i += 1;
            continue;
        }

        let is_single = quote == '\'';
        // Apostrophe inside or at the end of a word, not an opening quote.
        if is_single && i > 0 && chars[i - 1].is_alphanumeric() {
            i += 1;
            continue;
        }

        let start = i + 1;
        let closing = (start..chars.len()).find(|&idx| {
            chars[idx] == quote
                && !(is_single
                    && chars
                        .get(idx + 1)
                        .is_some_and(|next| next.is_alphanumeric()))
        });
        let end = match closing {
            Some(idx) => idx,
            None if is_single => {
                // Unbalanced single quote: skip it and keep scanning.
                i += 1;
                continue;
            }
            None => chars.len(),
        };

        if end > start {
            let phrase = chars[start..end]
                .iter()
                .collect::<String>()
                .trim()
                .to_ascii_lowercase();
            if phrase.len() >= 3 && !phrases.contains(&phrase) {
                phrases.push(phrase);
            }
        }
        i = end.saturating_add(1);
    }

    phrases
}
1782
/// Picks out the "standout" tokens of a query: identifiers, paths, file
/// names and other unusual words.
///
/// The query is split on every character that is not ASCII alphanumeric or
/// one of `_ - . / :`. Trailing `.` and `:` are then stripped from each token,
/// because those are sentence punctuation when they end a word ("this.",
/// "vein.rs:"); leaving them attached would let stopwords slip past the
/// stopword list and make plain words look like identifiers.
///
/// A token is kept when it is at least four bytes long, is not a stopword,
/// and is "interesting": it contains a digit, one of `_ - . / :`, an
/// uppercase ASCII letter, or is at least nine bytes long. Kept tokens are
/// returned ASCII-lowercased, deduplicated, in order of first appearance.
fn extract_standout_terms(query: &str) -> Vec<String> {
    static STANDOUT_SW: std::sync::OnceLock<HashSet<&'static str>> = std::sync::OnceLock::new();
    let stopwords = STANDOUT_SW.get_or_init(|| {
        [
            "about", "after", "before", "change", "changed", "decide", "decided", "does",
            "earlier", "flow", "from", "have", "into", "just", "last", "local", "make", "more",
            "remember", "should", "that", "their", "there", "these", "they", "this", "those",
            "what", "when", "where", "which", "why", "with", "work",
        ]
        .iter()
        .copied()
        .collect()
    });

    let is_term_punct = |ch: char| matches!(ch, '_' | '-' | '.' | '/' | ':');

    let mut standout: Vec<String> = Vec::new();
    for token in query.split(|ch: char| !(ch.is_ascii_alphanumeric() || is_term_punct(ch))) {
        // Drop sentence punctuation glued to the end of the word.
        let trimmed = token.trim_end_matches(|ch: char| matches!(ch, '.' | ':'));
        if trimmed.len() < 4 {
            continue;
        }
        let lower = trimmed.to_ascii_lowercase();
        if stopwords.contains(lower.as_str()) {
            continue;
        }

        let interesting = trimmed.len() >= 9
            || trimmed
                .chars()
                .any(|ch| ch.is_ascii_digit() || ch.is_ascii_uppercase() || is_term_punct(ch));

        if interesting && !standout.contains(&lower) {
            standout.push(lower);
        }
    }

    standout
}
1824
1825fn extract_temporal_reference(query: &str) -> Option<TemporalReference> {
1826    if let Some(ts) = extract_iso_date_from_query(query) {
1827        return Some(TemporalReference {
1828            target_ts: ts,
1829            window_secs: 86_400,
1830        });
1831    }
1832
1833    let now = current_unix_timestamp();
1834    let lower = query.to_ascii_lowercase();
1835    if lower.contains("yesterday") {
1836        Some(TemporalReference {
1837            target_ts: now.saturating_sub(86_400),
1838            window_secs: 86_400,
1839        })
1840    } else if lower.contains("today") || lower.contains("earlier today") {
1841        Some(TemporalReference {
1842            target_ts: now,
1843            window_secs: 86_400,
1844        })
1845    } else if lower.contains("last week") {
1846        Some(TemporalReference {
1847            target_ts: now.saturating_sub(7 * 86_400),
1848            window_secs: 7 * 86_400,
1849        })
1850    } else if lower.contains("last month") {
1851        Some(TemporalReference {
1852            target_ts: now.saturating_sub(30 * 86_400),
1853            window_secs: 30 * 86_400,
1854        })
1855    } else {
1856        None
1857    }
1858}
1859
1860fn extract_iso_date_from_query(query: &str) -> Option<i64> {
1861    query
1862        .split(|ch: char| !(ch.is_ascii_digit() || ch == '-'))
1863        .find_map(parse_iso_date_token)
1864}
1865
/// Parses a strict `YYYY-MM-DD` token into seconds since the Unix epoch for
/// the start of that day (no time-zone adjustment is applied).
///
/// Returns `None` unless the token is exactly ten bytes laid out as four
/// digits, `-`, two digits, `-`, two digits, and names a real calendar date:
/// the month must be 1-12 and the day must exist in that month, with
/// February 29 accepted only in leap years. Signed years such as `+024` are
/// rejected by the digit check.
fn parse_iso_date_token(token: &str) -> Option<i64> {
    let bytes = token.as_bytes();
    if bytes.len() != 10 {
        return None;
    }
    let well_formed = bytes.iter().enumerate().all(|(idx, byte)| match idx {
        4 | 7 => *byte == b'-',
        _ => byte.is_ascii_digit(),
    });
    if !well_formed {
        return None;
    }

    // The token is pure ASCII at this point, so these ranges are valid.
    let year = token.get(0..4)?.parse::<i32>().ok()?;
    let month = token.get(5..7)?.parse::<u32>().ok()?;
    let day = token.get(8..10)?.parse::<u32>().ok()?;

    if !(1..=12).contains(&month) {
        return None;
    }
    let is_leap_year = year % 4 == 0 && (year % 100 != 0 || year % 400 == 0);
    let days_in_month = match month {
        2 if is_leap_year => 29,
        2 => 28,
        4 | 6 | 9 | 11 => 30,
        _ => 31,
    };
    if !(1..=days_in_month).contains(&day) {
        return None;
    }

    Some(days_from_civil(year, month, day).saturating_mul(86_400))
}

/// Number of days from 1970-01-01 to the given proleptic Gregorian date
/// (negative for earlier dates).
///
/// The year is shifted so that it starts in March, which puts the leap day
/// at the end of the shifted year; the date is then located within a
/// 400-year era of 146,097 days. The caller is expected to pass a valid
/// month (1-12) and day.
fn days_from_civil(year: i32, month: u32, day: u32) -> i64 {
    // January and February belong to the previous March-based year.
    let year = year - if month <= 2 { 1 } else { 0 };
    // Floor division by 400, also for negative years.
    let era = (if year >= 0 { year } else { year - 399 }) / 400;
    let yoe = year - era * 400; // year of era, 0..=399
    let month_prime = month as i32 + if month > 2 { -3 } else { 9 }; // March = 0
    let doy = (153 * month_prime + 2) / 5 + day as i32 - 1; // day of shifted year
    let doe = yoe * 365 + yoe / 4 - yoe / 100 + doy; // day of era
    // 719,468 is the day offset between 0000-03-01 and 1970-01-01.
    era as i64 * 146_097 + doe as i64 - 719_468
}
1894
/// Current wall-clock time as whole seconds since the Unix epoch.
///
/// Yields 0 if the system clock reports a time before the epoch.
fn current_unix_timestamp() -> i64 {
    use std::time::{SystemTime, UNIX_EPOCH};

    match SystemTime::now().duration_since(UNIX_EPOCH) {
        Ok(elapsed) => elapsed.as_secs() as i64,
        Err(_) => 0,
    }
}
1901
1902fn session_memory_timestamp(result: &SearchResult) -> Option<i64> {
1903    extract_session_path_timestamp(&result.path).or({
1904        if result.last_modified > 0 {
1905            Some(result.last_modified)
1906        } else {
1907            None
1908        }
1909    })
1910}
1911
1912fn extract_session_path_timestamp(path: &str) -> Option<i64> {
1913    let normalized = path.replace('\\', "/");
1914    let mut parts = normalized.split('/');
1915    if parts.next()? != "session" {
1916        return None;
1917    }
1918    parse_iso_date_token(parts.next()?)
1919}
1920
1921fn session_speaker_kind(speaker: &str) -> SessionSpeakerKind {
1922    let normalized = speaker.trim().to_ascii_lowercase();
1923    match normalized.as_str() {
1924        "you" | "user" => SessionSpeakerKind::User,
1925        "" | "system" | "tool" => SessionSpeakerKind::Ignore,
1926        _ => SessionSpeakerKind::Assistant,
1927    }
1928}
1929
1930fn load_session_exchanges(report_path: &Path, last_modified: i64) -> Vec<SessionExchange> {
1931    let Ok(raw) = std::fs::read_to_string(report_path) else {
1932        return Vec::new();
1933    };
1934    let Ok(report) = serde_json::from_str::<SessionReport>(&raw) else {
1935        return Vec::new();
1936    };
1937
1938    let session_key = report_path
1939        .file_stem()
1940        .and_then(|stem| stem.to_str())
1941        .and_then(|stem| stem.strip_prefix("session_").or(Some(stem)))
1942        .unwrap_or("unknown-session")
1943        .to_string();
1944    let session_date = report
1945        .session_start
1946        .split('_')
1947        .next()
1948        .filter(|date| !date.is_empty())
1949        .unwrap_or_else(|| session_key.split('_').next().unwrap_or("unknown-date"))
1950        .to_string();
1951
1952    let mut exchanges = Vec::with_capacity(report.transcript.len() / 2);
1953    let mut pending_user: Option<String> = None;
1954    let mut turn_index = 0usize;
1955
1956    for entry in report.transcript {
1957        match session_speaker_kind(&entry.speaker) {
1958            SessionSpeakerKind::User => {
1959                let text = entry.text.trim();
1960                if !text.is_empty() {
1961                    pending_user = Some(text.to_string());
1962                }
1963            }
1964            SessionSpeakerKind::Assistant => {
1965                let text = entry.text.trim();
1966                if text.is_empty() {
1967                    continue;
1968                }
1969                let Some(user_text) = pending_user.take() else {
1970                    continue;
1971                };
1972                turn_index += 1;
1973                exchanges.push(SessionExchange {
1974                    path: format!(
1975                        "session/{}/{}/turn-{}",
1976                        session_date, session_key, turn_index
1977                    ),
1978                    last_modified,
1979                    content: format!(
1980                        "Earlier session exchange\nUser:\n{}\n\nAssistant:\n{}",
1981                        user_text, text
1982                    ),
1983                });
1984            }
1985            SessionSpeakerKind::Ignore => {}
1986        }
1987    }
1988
1989    if exchanges.len() > Vein::SESSION_TURN_LIMIT {
1990        let keep_from = exchanges.len() - Vein::SESSION_TURN_LIMIT;
1991        exchanges = exchanges.into_iter().skip(keep_from).collect();
1992    }
1993
1994    exchanges
1995}
1996
1997fn load_imported_session_exchanges(
1998    import_path: &Path,
1999    imports_root: &Path,
2000    last_modified: i64,
2001) -> Vec<SessionExchange> {
2002    let Ok(raw) = std::fs::read_to_string(import_path) else {
2003        return Vec::new();
2004    };
2005
2006    let messages = normalize_import_messages(&raw, import_path);
2007    if messages.is_empty() {
2008        return Vec::new();
2009    }
2010
2011    let rel = import_path
2012        .strip_prefix(imports_root)
2013        .unwrap_or(import_path);
2014    let rel_slug = slugify_import_path(rel);
2015    let mut exchanges = Vec::with_capacity(messages.len() / 2);
2016    let mut pending_user: Option<String> = None;
2017    let mut turn_index = 0usize;
2018
2019    for (role, text) in messages {
2020        let cleaned = text.trim();
2021        if cleaned.is_empty() {
2022            continue;
2023        }
2024        match role.as_str() {
2025            "user" => pending_user = Some(cleaned.to_string()),
2026            "assistant" => {
2027                let Some(user_text) = pending_user.take() else {
2028                    continue;
2029                };
2030                turn_index += 1;
2031                exchanges.push(SessionExchange {
2032                    path: format!("session/imports/{}/turn-{}", rel_slug, turn_index),
2033                    last_modified,
2034                    content: format!(
2035                        "Imported session exchange\nSource: .hematite/imports/{}\n\nUser:\n{}\n\nAssistant:\n{}",
2036                        rel.to_string_lossy().replace('\\', "/"),
2037                        user_text,
2038                        cleaned
2039                    ),
2040                });
2041            }
2042            _ => {}
2043        }
2044    }
2045
2046    if exchanges.len() > Vein::SESSION_TURN_LIMIT {
2047        let keep_from = exchanges.len() - Vein::SESSION_TURN_LIMIT;
2048        exchanges = exchanges.into_iter().skip(keep_from).collect();
2049    }
2050
2051    exchanges
2052}
2053
2054fn normalize_import_messages(raw: &str, import_path: &Path) -> Vec<(String, String)> {
2055    if raw.trim().is_empty() {
2056        return Vec::new();
2057    }
2058
2059    if let Some(messages) = parse_marker_transcript(raw) {
2060        return messages;
2061    }
2062
2063    let ext = import_path
2064        .extension()
2065        .and_then(|ext| ext.to_str())
2066        .unwrap_or("")
2067        .to_ascii_lowercase();
2068
2069    if matches!(ext.as_str(), "json" | "jsonl")
2070        || matches!(raw.trim().chars().next(), Some('{') | Some('['))
2071    {
2072        if let Some(messages) = parse_jsonl_messages(raw) {
2073            if !messages.is_empty() {
2074                return messages;
2075            }
2076        }
2077
2078        if let Ok(value) = serde_json::from_str::<Value>(raw) {
2079            if let Some(messages) = parse_session_report_messages(&value) {
2080                return messages;
2081            }
2082            if let Some(messages) = parse_simple_role_messages(&value) {
2083                return messages;
2084            }
2085            if let Some(messages) = parse_chatgpt_mapping_messages(&value) {
2086                return messages;
2087            }
2088        }
2089    }
2090
2091    Vec::new()
2092}
2093
/// Parses a plain-text transcript in which user prompts are lines starting
/// with `> ` (after optional leading whitespace).
///
/// Returns `None` unless at least two such marker lines exist. Each marker
/// line becomes one `user` message (text after the marker, trimmed). The
/// lines that follow, up to the next marker, form one `assistant` message:
/// each is trimmed, blank lines and `---` separators are dropped, and the
/// rest are joined with newlines. Text before the first marker is ignored,
/// and no assistant message is emitted when a prompt has no reply lines.
fn parse_marker_transcript(raw: &str) -> Option<Vec<(String, String)>> {
    let marker_count = raw
        .lines()
        .filter(|line| line.trim_start().starts_with("> "))
        .count();
    if marker_count < 2 {
        return None;
    }

    let mut messages: Vec<(String, String)> = Vec::new();
    let mut reply_lines: Vec<&str> = Vec::new();
    let mut seen_prompt = false;

    for line in raw.lines() {
        if let Some(prompt) = line.trim_start().strip_prefix("> ") {
            // A new prompt closes the reply that was being collected.
            if !reply_lines.is_empty() {
                messages.push(("assistant".to_string(), reply_lines.join("\n")));
                reply_lines.clear();
            }
            messages.push(("user".to_string(), prompt.trim().to_string()));
            seen_prompt = true;
        } else if seen_prompt {
            let content = line.trim();
            if !content.is_empty() && content != "---" {
                reply_lines.push(content);
            }
        }
    }
    if !reply_lines.is_empty() {
        messages.push(("assistant".to_string(), reply_lines.join("\n")));
    }

    (!messages.is_empty()).then_some(messages)
}
2134
2135fn parse_jsonl_messages(raw: &str) -> Option<Vec<(String, String)>> {
2136    let mut messages = Vec::new();
2137    let mut has_codex_session_meta = false;
2138    let mut saw_jsonl = false;
2139
2140    for line in raw.lines() {
2141        let trimmed = line.trim();
2142        if trimmed.is_empty() {
2143            continue;
2144        }
2145        let Ok(value) = serde_json::from_str::<Value>(trimmed) else {
2146            continue;
2147        };
2148        saw_jsonl = true;
2149        let Some(object) = value.as_object() else {
2150            continue;
2151        };
2152
2153        match object.get("type").and_then(|v| v.as_str()).unwrap_or("") {
2154            "session_meta" => {
2155                has_codex_session_meta = true;
2156            }
2157            "event_msg" => {
2158                let Some(payload) = object.get("payload").and_then(|v| v.as_object()) else {
2159                    continue;
2160                };
2161                let Some(text) = payload.get("message").and_then(|v| v.as_str()) else {
2162                    continue;
2163                };
2164                match payload.get("type").and_then(|v| v.as_str()).unwrap_or("") {
2165                    "user_message" => messages.push(("user".to_string(), text.trim().to_string())),
2166                    "agent_message" => {
2167                        messages.push(("assistant".to_string(), text.trim().to_string()))
2168                    }
2169                    _ => {}
2170                }
2171            }
2172            "human" | "user" => {
2173                if let Some(text) = extract_text_content(object.get("message").unwrap_or(&value)) {
2174                    messages.push(("user".to_string(), text));
2175                }
2176            }
2177            "assistant" => {
2178                if let Some(text) = extract_text_content(object.get("message").unwrap_or(&value)) {
2179                    messages.push(("assistant".to_string(), text));
2180                }
2181            }
2182            _ => {
2183                if let Some(role) = object.get("role").and_then(|v| v.as_str()) {
2184                    if let Some(text) = extract_text_content(&value) {
2185                        match role {
2186                            "user" | "human" => messages.push(("user".to_string(), text)),
2187                            "assistant" | "ai" => messages.push(("assistant".to_string(), text)),
2188                            _ => {}
2189                        }
2190                    }
2191                }
2192            }
2193        }
2194    }
2195
2196    if !saw_jsonl {
2197        return None;
2198    }
2199
2200    if has_codex_session_meta || !messages.is_empty() {
2201        return Some(messages);
2202    }
2203
2204    None
2205}
2206
2207fn parse_session_report_messages(value: &Value) -> Option<Vec<(String, String)>> {
2208    let report = value.as_object()?;
2209    let transcript = report.get("transcript")?.as_array()?;
2210    let mut messages = Vec::with_capacity(transcript.len());
2211
2212    for entry in transcript {
2213        let Some(obj) = entry.as_object() else {
2214            continue;
2215        };
2216        let speaker = obj
2217            .get("speaker")
2218            .and_then(|v| v.as_str())
2219            .unwrap_or_default();
2220        let text = obj
2221            .get("text")
2222            .and_then(|v| v.as_str())
2223            .unwrap_or_default()
2224            .trim()
2225            .to_string();
2226        if text.is_empty() {
2227            continue;
2228        }
2229        match session_speaker_kind(speaker) {
2230            SessionSpeakerKind::User => messages.push(("user".to_string(), text)),
2231            SessionSpeakerKind::Assistant => messages.push(("assistant".to_string(), text)),
2232            SessionSpeakerKind::Ignore => {}
2233        }
2234    }
2235
2236    (!messages.is_empty()).then_some(messages)
2237}
2238
2239fn parse_simple_role_messages(value: &Value) -> Option<Vec<(String, String)>> {
2240    if let Some(array) = value.as_array() {
2241        let messages = collect_role_messages(array);
2242        return (!messages.is_empty()).then_some(messages);
2243    }
2244
2245    let obj = value.as_object()?;
2246    if let Some(messages_value) = obj.get("messages").or_else(|| obj.get("chat_messages")) {
2247        let array = messages_value.as_array()?;
2248        let messages = collect_role_messages(array);
2249        return (!messages.is_empty()).then_some(messages);
2250    }
2251
2252    None
2253}
2254
2255fn collect_role_messages(items: &[Value]) -> Vec<(String, String)> {
2256    let mut messages = Vec::with_capacity(items.len());
2257    for item in items {
2258        let Some(obj) = item.as_object() else {
2259            continue;
2260        };
2261        let Some(role) = obj.get("role").and_then(|v| v.as_str()) else {
2262            continue;
2263        };
2264        let Some(text) = extract_text_content(item) else {
2265            continue;
2266        };
2267        match role {
2268            "user" | "human" => messages.push(("user".to_string(), text)),
2269            "assistant" | "ai" => messages.push(("assistant".to_string(), text)),
2270            _ => {}
2271        }
2272    }
2273    messages
2274}
2275
2276fn parse_chatgpt_mapping_messages(value: &Value) -> Option<Vec<(String, String)>> {
2277    let mapping = value.get("mapping")?.as_object()?;
2278    let mut current_id = mapping.iter().find_map(|(node_id, node)| {
2279        let obj = node.as_object()?;
2280        (obj.get("parent").is_some_and(|parent| parent.is_null())).then_some(node_id.clone())
2281    })?;
2282
2283    let mut messages = Vec::with_capacity(mapping.len());
2284    let mut visited = std::collections::HashSet::new();
2285
2286    while visited.insert(current_id.clone()) {
2287        let Some(node) = mapping.get(&current_id).and_then(|v| v.as_object()) else {
2288            break;
2289        };
2290
2291        if let Some(message) = node.get("message") {
2292            let role = message
2293                .get("author")
2294                .and_then(|author| author.get("role"))
2295                .and_then(|v| v.as_str())
2296                .unwrap_or("");
2297            if let Some(text) = extract_text_content(message) {
2298                match role {
2299                    "user" => messages.push(("user".to_string(), text)),
2300                    "assistant" => messages.push(("assistant".to_string(), text)),
2301                    _ => {}
2302                }
2303            }
2304        }
2305
2306        let Some(next_id) = node
2307            .get("children")
2308            .and_then(|children| children.as_array())
2309            .and_then(|children| children.first())
2310            .and_then(|child| child.as_str())
2311        else {
2312            break;
2313        };
2314        current_id = next_id.to_string();
2315    }
2316
2317    (!messages.is_empty()).then_some(messages)
2318}
2319
2320fn extract_text_content(value: &Value) -> Option<String> {
2321    if let Some(text) = value.as_str() {
2322        let trimmed = text.trim();
2323        return (!trimmed.is_empty()).then_some(trimmed.to_string());
2324    }
2325
2326    if let Some(array) = value.as_array() {
2327        let mut joined = String::new();
2328        for part in array
2329            .iter()
2330            .filter_map(extract_text_content)
2331            .filter(|p| !p.is_empty())
2332        {
2333            if !joined.is_empty() {
2334                joined.push('\n');
2335            }
2336            joined.push_str(&part);
2337        }
2338        return (!joined.is_empty()).then_some(joined);
2339    }
2340
2341    let obj = value.as_object()?;
2342
2343    if let Some(content) = obj.get("content") {
2344        if let Some(text) = extract_text_content(content) {
2345            return Some(text);
2346        }
2347    }
2348
2349    if let Some(text) = obj.get("text").and_then(|v| v.as_str()) {
2350        let trimmed = text.trim();
2351        if !trimmed.is_empty() {
2352            return Some(trimmed.to_string());
2353        }
2354    }
2355
2356    if let Some(parts) = obj.get("parts").and_then(|v| v.as_array()) {
2357        let mut joined = String::new();
2358        for part in parts
2359            .iter()
2360            .filter_map(|p| p.as_str().map(|s| s.trim().to_string()))
2361            .filter(|p| !p.is_empty())
2362        {
2363            if !joined.is_empty() {
2364                joined.push('\n');
2365            }
2366            joined.push_str(&part);
2367        }
2368        if !joined.is_empty() {
2369            return Some(joined);
2370        }
2371    }
2372
2373    None
2374}
2375
/// Flattens a relative path into a single identifier-like slug.
///
/// Backslashes count as separators. Every character other than ASCII
/// letters, digits, `-`, `_` and separators becomes `_`; leading and
/// trailing separators are dropped and each remaining separator becomes `__`.
fn slugify_import_path(path: &Path) -> String {
    let lossy = path.to_string_lossy();
    let sanitized: String = lossy
        .chars()
        .map(|ch| match ch {
            '/' | '\\' => '/',
            keep if keep.is_ascii_alphanumeric() || keep == '-' || keep == '_' => keep,
            _ => '_',
        })
        .collect();

    sanitized
        .trim_matches('/')
        .split('/')
        .collect::<Vec<_>>()
        .join("__")
}
2391
2392// ── Embedding API ─────────────────────────────────────────────────────────────
2393
2394// Shared HTTP client — built once, reuses TCP keep-alive connections.
2395// Embedding calls previously created a new client on every request, which
2396// incurred a fresh TCP handshake each time (even to localhost).
2397static EMBED_CLIENT: std::sync::OnceLock<reqwest::blocking::Client> = std::sync::OnceLock::new();
2398
2399fn embed_client() -> &'static reqwest::blocking::Client {
2400    EMBED_CLIENT.get_or_init(|| {
2401        reqwest::blocking::Client::builder()
2402            .timeout(std::time::Duration::from_secs(30))
2403            .build()
2404            .expect("failed to build embedding HTTP client")
2405    })
2406}
2407
2408fn embed_query_blocking(text: &str, base_url: &str, embed_model: &str) -> Option<Vec<f32>> {
2409    embed_text_with_prefix(text, "search_query", base_url, embed_model)
2410}
2411
2412/// Embed a batch of pre-prefixed texts in a single HTTP round-trip.
2413/// Returns embeddings in the same order as `inputs`; missing slots are `None`.
2414fn embed_batch(inputs: &[String], base_url: &str, embed_model: &str) -> Vec<Option<Vec<f32>>> {
2415    if inputs.is_empty() {
2416        return Vec::new();
2417    }
2418
2419    let client = embed_client();
2420    let trimmed = base_url.trim_end_matches('/');
2421    let is_ollama = trimmed.contains("11434");
2422    let url = if is_ollama {
2423        format!("{}/api/embed", trimmed)
2424    } else {
2425        format!("{}/v1/embeddings", trimmed)
2426    };
2427
2428    let body = serde_json::json!({
2429        "model": embed_model,
2430        "input": inputs
2431    });
2432
2433    let resp = match client.post(&url).json(&body).send() {
2434        Ok(r) if r.status().is_success() => r,
2435        _ => return vec![None; inputs.len()],
2436    };
2437    let json: serde_json::Value = match resp.json() {
2438        Ok(j) => j,
2439        Err(_) => return vec![None; inputs.len()],
2440    };
2441
2442    if is_ollama {
2443        // Ollama: {"embeddings": [[...], [...], ...]}
2444        match json["embeddings"].as_array() {
2445            Some(arr) => arr
2446                .iter()
2447                .map(|e| {
2448                    e.as_array().and_then(|v| {
2449                        let floats: Vec<f32> = v
2450                            .iter()
2451                            .filter_map(|x| x.as_f64().map(|f| f as f32))
2452                            .collect();
2453                        if floats.is_empty() {
2454                            None
2455                        } else {
2456                            Some(floats)
2457                        }
2458                    })
2459                })
2460                .collect(),
2461            None => vec![None; inputs.len()],
2462        }
2463    } else {
2464        // OpenAI: {"data": [{"index": N, "embedding": [...]}, ...]}
2465        let mut out = vec![None; inputs.len()];
2466        if let Some(arr) = json["data"].as_array() {
2467            for item in arr {
2468                let idx = item["index"].as_u64().unwrap_or(0) as usize;
2469                if idx < out.len() {
2470                    if let Some(emb) = item["embedding"].as_array() {
2471                        let floats: Vec<f32> = emb
2472                            .iter()
2473                            .filter_map(|x| x.as_f64().map(|f| f as f32))
2474                            .collect();
2475                        if !floats.is_empty() {
2476                            out[idx] = Some(floats);
2477                        }
2478                    }
2479                }
2480            }
2481        }
2482        out
2483    }
2484}
2485
2486fn embed_text_with_prefix(
2487    text: &str,
2488    task: &str,
2489    base_url: &str,
2490    embed_model: &str,
2491) -> Option<Vec<f32>> {
2492    // Nomic v2 task instruction prefix format: "<task>: <text>"
2493    let prefixed = format!("{}: {}", task, text);
2494    // Truncate to ~8000 chars to stay within typical embedding model limits.
2495    let input = if prefixed.len() > 8000 {
2496        safe_head(&prefixed, 8000).to_string()
2497    } else {
2498        prefixed
2499    };
2500
2501    let client = embed_client();
2502    let body = serde_json::json!({ "model": embed_model, "input": input });
2503
2504    let trimmed = base_url.trim_end_matches('/');
2505    let is_ollama = trimmed.contains("11434");
2506    let url = if is_ollama {
2507        format!("{}/api/embed", trimmed)
2508    } else {
2509        format!("{}/v1/embeddings", trimmed)
2510    };
2511    let resp = client.post(&url).json(&body).send().ok()?;
2512
2513    if !resp.status().is_success() {
2514        return None;
2515    }
2516
2517    let json: serde_json::Value = resp.json().ok()?;
2518    let embedding = if is_ollama {
2519        json["embeddings"][0].as_array()?
2520    } else {
2521        json["data"][0]["embedding"].as_array()?
2522    };
2523    let vec: Vec<f32> = embedding
2524        .iter()
2525        .filter_map(|v| v.as_f64().map(|f| f as f32))
2526        .collect();
2527
2528    if vec.is_empty() {
2529        None
2530    } else {
2531        Some(vec)
2532    }
2533}
2534
2535// ── Vector math ───────────────────────────────────────────────────────────────
2536
/// Cosine similarity between `a` and `b`, given `a`'s squared norm up front.
///
/// Passing `a_norm_sq` avoids recomputing it when one query vector is scored
/// against many chunks. Returns `0.0` when the lengths differ or when either
/// vector has a zero squared norm.
#[inline]
fn cosine_similarity_precomputed(a: &[f32], a_norm_sq: f32, b: &[f32]) -> f32 {
    if a_norm_sq == 0.0 || a.len() != b.len() {
        return 0.0;
    }

    // Single pass: accumulate the dot product and `b`'s squared norm together.
    let mut dot = 0.0f32;
    let mut b_norm_sq = 0.0f32;
    for (x, y) in a.iter().zip(b.iter()) {
        dot += x * y;
        b_norm_sq += y * y;
    }

    if b_norm_sq == 0.0 {
        return 0.0;
    }
    dot / (a_norm_sq.sqrt() * b_norm_sq.sqrt())
}
2557
/// Serialize a float slice as a byte blob: each `f32` becomes its four
/// little-endian bytes, in order.
fn floats_to_blob(floats: &[f32]) -> Vec<u8> {
    let mut blob = Vec::with_capacity(floats.len() * 4);
    blob.extend(floats.iter().flat_map(|value| value.to_le_bytes()));
    blob
}
2565
/// Decode a byte blob into floats, reading each complete 4-byte group as a
/// little-endian `f32`. Trailing bytes that do not fill a group are ignored.
fn blob_to_floats(blob: &[u8]) -> Vec<f32> {
    blob.chunks_exact(4)
        .map(|word| f32::from_le_bytes([word[0], word[1], word[2], word[3]]))
        .collect()
}
2573
2574// ── Document extraction ───────────────────────────────────────────────────────
2575
/// Normalize raw extracted document text.
///
/// Converts `\r\n` and lone `\r` line endings to `\n`, then strips leading and
/// trailing whitespace and NUL characters. Returns `None` when nothing is left
/// after stripping.
fn normalize_extracted_document_text(text: String) -> Option<String> {
    let unified = text.replace("\r\n", "\n").replace('\r', "\n");
    match unified.trim_matches(|c: char| c == '\0' || c.is_whitespace()) {
        "" => None,
        core => Some(core.to_string()),
    }
}
2588
2589fn extract_pdf_text_with_pdf_extract(path: &std::path::Path) -> Result<Option<String>, String> {
2590    let previous_hook = std::panic::take_hook();
2591    std::panic::set_hook(Box::new(|_| {}));
2592    let result = std::panic::catch_unwind(std::panic::AssertUnwindSafe(|| {
2593        pdf_extract::extract_text(path)
2594    }));
2595    std::panic::set_hook(previous_hook);
2596
2597    match result {
2598        Ok(Ok(text)) => Ok(normalize_extracted_document_text(text)),
2599        Ok(Err(e)) => Err(format!("pdf-extract failed: {}", e)),
2600        Err(payload) => {
2601            let panic_text = if let Some(msg) = payload.downcast_ref::<&str>() {
2602                (*msg).to_string()
2603            } else if let Some(msg) = payload.downcast_ref::<String>() {
2604                msg.clone()
2605            } else {
2606                "unknown parser panic".to_string()
2607            };
2608            Err(format!("pdf-extract panicked: {}", panic_text))
2609        }
2610    }
2611}
2612
2613fn extract_pdf_text_with_lopdf(path: &std::path::Path) -> Result<Option<String>, String> {
2614    let mut doc =
2615        lopdf::Document::load(path).map_err(|e| format!("lopdf could not open PDF: {}", e))?;
2616
2617    if doc.is_encrypted() {
2618        doc.decrypt("")
2619            .map_err(|e| format!("PDF is encrypted and could not be decrypted: {}", e))?;
2620    }
2621
2622    let page_numbers: Vec<u32> = doc.get_pages().keys().copied().collect();
2623    if page_numbers.is_empty() {
2624        return Ok(None);
2625    }
2626
2627    let mut extracted_pages = Vec::with_capacity(page_numbers.len());
2628    let mut page_errors = Vec::with_capacity(page_numbers.len());
2629
2630    for page_number in page_numbers {
2631        match doc.extract_text(&[page_number]) {
2632            Ok(text) => {
2633                if let Some(page_text) = normalize_extracted_document_text(text) {
2634                    extracted_pages.push(page_text);
2635                }
2636            }
2637            Err(e) => page_errors.push(format!("page {page_number}: {e}")),
2638        }
2639    }
2640
2641    if !extracted_pages.is_empty() {
2642        return Ok(Some(extracted_pages.join("\n\n")));
2643    }
2644
2645    if !page_errors.is_empty() {
2646        let sample_errors = {
2647            let mut s = String::with_capacity(160);
2648            for e in page_errors.into_iter().take(3) {
2649                if !s.is_empty() {
2650                    s.push_str("; ");
2651                }
2652                s.push_str(&e);
2653            }
2654            s
2655        };
2656        return Err(format!(
2657            "lopdf could not extract usable page text ({sample_errors})"
2658        ));
2659    }
2660
2661    Ok(None)
2662}
2663
2664fn extract_pdf_text_inside_helper(path: &std::path::Path) -> Result<Option<String>, String> {
2665    let mut failures = Vec::with_capacity(2);
2666
2667    match extract_pdf_text_with_pdf_extract(path) {
2668        Ok(Some(text)) => return Ok(Some(text)),
2669        Ok(None) => failures.push("pdf-extract found no usable text".to_string()),
2670        Err(e) => failures.push(e),
2671    }
2672
2673    match extract_pdf_text_with_lopdf(path) {
2674        Ok(Some(text)) => return Ok(Some(text)),
2675        Ok(None) => failures.push("lopdf found no usable text".to_string()),
2676        Err(e) => failures.push(e),
2677    }
2678
2679    let detail = {
2680        let mut d = String::with_capacity(160);
2681        for (i, f) in failures.into_iter().take(2).enumerate() {
2682            if i > 0 {
2683                d.push_str("; ");
2684            }
2685            d.push_str(&f);
2686        }
2687        d
2688    };
2689    Err(format!(
2690        "Could not extract text from PDF. Hematite keeps PDF parsing best-effort so it can stay a lightweight single-binary local coding harness. The file may be scanned/image-only, encrypted, or use unsupported font encoding. Try exporting it to text/markdown or attach page images instead. Detail: {}",
2691        detail
2692    ))
2693}
2694
/// Extract PDF text by re-running the current executable as a helper process.
///
/// The child is launched with `--pdf-extract-helper <path>`, no stdin, and
/// piped stdout/stderr. On a non-success exit status the child's trimmed
/// stderr becomes the error (or a generic message when stderr is empty).
/// On success, stdout is returned as the text, or `Ok(None)` when it is blank.
fn extract_pdf_text(path: &std::path::Path) -> Result<Option<String>, String> {
    use std::process::{Command, Stdio};

    let exe = std::env::current_exe()
        .map_err(|e| format!("Could not locate Hematite executable for PDF helper: {}", e))?;

    let mut helper = Command::new(exe);
    helper
        .arg("--pdf-extract-helper")
        .arg(path)
        .stdin(Stdio::null())
        .stdout(Stdio::piped())
        .stderr(Stdio::piped());
    let output = helper
        .output()
        .map_err(|e| format!("Could not launch PDF helper: {}", e))?;

    if !output.status.success() {
        let stderr = String::from_utf8_lossy(&output.stderr);
        return Err(match stderr.trim() {
            "" => "PDF extraction failed.".to_string(),
            message => message.to_string(),
        });
    }

    let text = String::from_utf8(output.stdout)
        .map_err(|e| format!("PDF helper returned non-UTF8 text: {}", e))?;
    Ok(if text.trim().is_empty() {
        None
    } else {
        Some(text)
    })
}
2724
2725pub fn run_pdf_extract_helper(path: &std::path::Path) -> i32 {
2726    match extract_pdf_text_inside_helper(path) {
2727        Ok(Some(text)) => {
2728            use std::io::Write;
2729            let mut stdout = std::io::stdout();
2730            if stdout.write_all(text.as_bytes()).is_ok() {
2731                0
2732            } else {
2733                let _ = writeln!(
2734                    std::io::stderr(),
2735                    "PDF helper could not write extracted text."
2736                );
2737                1
2738            }
2739        }
2740        Ok(None) => {
2741            eprintln!(
2742                "Could not extract text from PDF. Hematite keeps PDF parsing best-effort so it can stay a lightweight single-binary local coding harness. The file appears to contain no usable embedded text. Try exporting it to text/markdown or attach page images instead."
2743            );
2744            1
2745        }
2746        Err(e) => {
2747            eprintln!("{}", e);
2748            1
2749        }
2750    }
2751}
2752
2753/// Extract text from any supported document type (PDF, markdown, plain text).
2754/// Used by /attach for one-shot context injection.
2755pub fn extract_document_text(path: &std::path::Path) -> Result<String, String> {
2756    let ext = path
2757        .extension()
2758        .and_then(|e| e.to_str())
2759        .unwrap_or("")
2760        .to_lowercase();
2761    match ext.as_str() {
2762        "pdf" => {
2763            let text = extract_pdf_text(path)?.ok_or_else(|| {
2764                "PDF contains no extractable text — it may be scanned/image-only. \
2765                     Try attaching page screenshots with /image instead."
2766                    .to_string()
2767            })?;
2768            pdf_quality_check(text)
2769        }
2770        _ => std::fs::read_to_string(path).map_err(|e| format!("Could not read file: {e}")),
2771    }
2772}
2773
/// Reject PDF text that is too short or looks garbled.
///
/// Garbling is common with academic publisher PDFs that embed custom fonts
/// with non-standard glyph mappings, which makes words run together.
///
/// Returns the text unchanged when it looks usable. Returns an explanatory
/// error when the trimmed text is under 150 bytes, or when spaces make up less
/// than 4% of its non-newline characters.
fn pdf_quality_check(text: String) -> Result<String, String> {
    let body = text.trim();
    let body_len = body.len();

    // Too little content to be useful.
    if body_len < 150 {
        return Err(format!(
            "PDF extracted only {} characters — likely a scanned or image-only PDF, \
             or uses unsupported custom fonts. Try attaching page screenshots with /image instead.",
            body_len
        ));
    }

    // Normal prose is ~15–20% spaces; far fewer means glyphs aren't mapping to spaces.
    // Count spaces and non-newline characters in one pass.
    let (visible, spaces) = body
        .chars()
        .fold((0usize, 0usize), |(visible, spaces), c| match c {
            '\n' | '\r' => (visible, spaces),
            ' ' => (visible + 1, spaces + 1),
            _ => (visible + 1, spaces),
        });
    let space_ratio = if visible == 0 {
        0.0
    } else {
        spaces as f32 / visible as f32
    };

    if space_ratio < 0.04 {
        return Err(
            "PDF text extraction produced garbled output — words are merged with no spaces. \
             This usually means the PDF uses custom embedded fonts (common with academic publishers \
             like EBSCO, Elsevier, Springer). \
             Try a PDF exported from Word, Google Docs, or LaTeX, \
             or attach page screenshots with /image instead.".to_string()
        );
    }

    Ok(text)
}
2812
2813// ── Chunking strategies ───────────────────────────────────────────────────────
2814
2815/// Dispatch to the correct chunking strategy based on file extension.
2816fn chunk_by_symbols(ext: &str, text: &str) -> Vec<String> {
2817    if ext == "rs" {
2818        chunk_rust_symbols(text)
2819    } else {
2820        chunk_paragraphs(text)
2821    }
2822}
2823
2824/// Chunk Rust source at top-level item boundaries.
2825///
2826/// Detects lines at column 0 that start a Rust declaration keyword, flushes
2827/// the accumulated buffer, then moves any trailing doc-comments / attributes
2828/// forward so they stay with the item they annotate.
2829///
2830/// Items larger than 3000 chars (e.g. large impl blocks) are further split
2831/// by sliding window so no single chunk blows the retrieval budget.
2832fn chunk_rust_symbols(text: &str) -> Vec<String> {
2833    const ITEM_STARTS: &[&str] = &[
2834        "pub fn ",
2835        "pub async fn ",
2836        "pub unsafe fn ",
2837        "async fn ",
2838        "unsafe fn ",
2839        "fn ",
2840        "pub impl",
2841        "impl ",
2842        "pub struct ",
2843        "struct ",
2844        "pub enum ",
2845        "enum ",
2846        "pub trait ",
2847        "trait ",
2848        "pub mod ",
2849        "mod ",
2850        "pub type ",
2851        "type ",
2852        "pub const ",
2853        "const ",
2854        "pub static ",
2855        "static ",
2856    ];
2857
2858    let lines: Vec<&str> = text.lines().collect();
2859    let mut chunks: Vec<String> = Vec::with_capacity(lines.len() / 20 + 1);
2860    let mut current: Vec<&str> = Vec::with_capacity(64);
2861
2862    for &line in &lines {
2863        let top_level = !line.starts_with(' ') && !line.starts_with('\t');
2864        let is_item = top_level && ITEM_STARTS.iter().any(|s| line.starts_with(s));
2865
2866        if is_item && !current.is_empty() {
2867            // Scan backward to find where trailing doc-comments / attributes start —
2868            // move them to the new chunk so they land with their item.
2869            let mut split = current.len();
2870            while split > 0 {
2871                let prev = current[split - 1].trim();
2872                if prev.starts_with("///")
2873                    || prev.starts_with("//!")
2874                    || prev.starts_with("#[")
2875                    || prev.is_empty()
2876                {
2877                    split -= 1;
2878                } else {
2879                    break;
2880                }
2881            }
2882            let body = current[..split].join("\n");
2883            if !body.trim().is_empty() {
2884                chunks.push(body);
2885            }
2886            current.drain(..split);
2887        }
2888        current.push(line);
2889    }
2890    if !current.is_empty() {
2891        let body = current.join("\n");
2892        if !body.trim().is_empty() {
2893            chunks.push(body);
2894        }
2895    }
2896
2897    // Subdivide any oversized blocks (e.g. long impl blocks with many methods).
2898    let mut result = Vec::with_capacity(chunks.len());
2899    for chunk in chunks {
2900        if chunk.len() > 3000 {
2901            result.extend(sliding_window_chunks(&chunk, 2000, 200));
2902        } else {
2903            result.push(chunk);
2904        }
2905    }
2906    result
2907}
2908
2909/// Chunk non-Rust text at paragraph boundaries (double newline).
2910fn chunk_paragraphs(text: &str) -> Vec<String> {
2911    let mut result: Vec<String> = Vec::new();
2912    let mut current = String::with_capacity(2000);
2913
2914    for para in text.split("\n\n") {
2915        if current.len() + para.len() + 2 > 2000 {
2916            if !current.trim().is_empty() {
2917                result.push(std::mem::replace(&mut current, para.to_string()));
2918            } else {
2919                current = para.to_string();
2920            }
2921        } else {
2922            if !current.is_empty() {
2923                current.push_str("\n\n");
2924            }
2925            current.push_str(para);
2926        }
2927    }
2928    if !current.trim().is_empty() {
2929        result.push(current);
2930    }
2931
2932    let mut final_result = Vec::with_capacity(result.len());
2933    for chunk in result {
2934        if chunk.len() > 2000 {
2935            final_result.extend(sliding_window_chunks(&chunk, 2000, 200));
2936        } else {
2937            final_result.push(chunk);
2938        }
2939    }
2940    final_result
2941}
2942
/// Classic sliding-window fallback for oversized blocks.
///
/// Splits `text` into windows of at most `chunk_size` characters (not bytes),
/// where each window starts `chunk_size - overlap` characters after the
/// previous one. The last window ends exactly at the end of the text.
///
/// The step between windows is clamped to at least one character, so an
/// `overlap` equal to or larger than `chunk_size` still makes progress instead
/// of looping forever or underflowing. Returns an empty vector when `text` is
/// empty or `chunk_size` is zero.
fn sliding_window_chunks(text: &str, chunk_size: usize, overlap: usize) -> Vec<String> {
    // A zero-width window can never cover any text.
    if chunk_size == 0 {
        return Vec::new();
    }

    let chars: Vec<char> = text.chars().collect();
    let stride = chunk_size.saturating_sub(overlap).max(1);
    let mut result = Vec::with_capacity(chars.len() / stride + 1);

    let mut start = 0;
    while start < chars.len() {
        let end = start.saturating_add(chunk_size).min(chars.len());
        result.push(chars[start..end].iter().collect());
        if end == chars.len() {
            break;
        }
        // Advance by the clamped stride, never by the raw `chunk_size - overlap`.
        start += stride;
    }
    result
}