1use crate::agent::truncation::safe_head;
2use rusqlite::{params, Connection};
3use serde::Deserialize;
4use serde_json::Value;
5use std::collections::{HashMap, HashSet};
6use std::fmt::Write as _;
7use std::path::Path;
8
/// Handle to the on-disk search index together with its in-memory embedding cache.
pub struct Vein {
    /// SQLite connection holding the `chunks_meta`, `chunks_fts`, `chunks_vec`
    /// and `file_heat` tables; every access goes through this mutex.
    db: std::sync::Arc<std::sync::Mutex<Connection>>,
    /// Base URL passed to the embedding helpers (`embed_batch`, `embed_query_blocking`).
    base_url: String,
    /// Id of the embedding model to use; while `None`, embedding and semantic
    /// search are skipped.
    embed_model: std::sync::Arc<std::sync::RwLock<Option<String>>>,
    /// In-memory copy of the stored embeddings, loaded in `Vein::new` and kept
    /// in step with `chunks_vec` by the indexing methods.
    embedding_cache: std::sync::RwLock<Vec<EmbeddedChunk>>,
}
39
/// One cached embedding row: a `chunks_vec` entry joined with its `chunks_meta` fields.
struct EmbeddedChunk {
    /// Indexed path the chunk belongs to.
    path: String,
    /// Position of the chunk within its path (used as an `OFFSET` when the
    /// chunk text is fetched back from `chunks_fts`).
    chunk_idx: i64,
    /// Decoded embedding vector.
    embedding: Vec<f32>,
    /// `last_modified` value recorded for the path in `chunks_meta`.
    last_modified: i64,
    /// Room label recorded for the path (`'root'` when absent).
    room: String,
    /// Memory-type label recorded for the path (empty when absent).
    memory_type: String,
}
49
// SAFETY: all shared state in `Vein` sits behind `Mutex`/`RwLock` (the SQLite
// connection, the model id and the embedding cache); `base_url` is a plain `String`.
// NOTE(review): if `rusqlite::Connection` is `Send`, every field is already
// `Send + Sync` and these manual impls are redundant — confirm and consider removing.
unsafe impl Send for Vein {}
unsafe impl Sync for Vein {}
54
/// A single hit returned by the BM25, semantic or combined search methods.
#[derive(Debug, Clone)]
pub struct SearchResult {
    /// Indexed path the matching chunk belongs to.
    pub path: String,
    /// Text of the matching chunk.
    pub content: String,
    /// Relevance score; larger is better. The scale depends on which search
    /// produced the hit (negated FTS rank, cosine similarity, or a reranked value).
    pub score: f32,
    /// Room label stored for the path.
    pub room: String,
    /// `last_modified` value stored for the path.
    pub last_modified: i64,
    /// Memory-type label stored for the path (may be empty).
    pub memory_type: String,
}
69
/// A row of the "hot files" listing: a path from `file_heat` joined with its metadata.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct VeinHotFile {
    /// Indexed path of the file.
    pub path: String,
    /// Heat counter for the path (incremented by `Vein::bump_heat`).
    pub heat: i64,
    /// `last_modified` value stored for the path.
    pub last_modified: i64,
    /// Room label stored for the path.
    pub room: String,
}
77
/// Point-in-time summary of the index, produced by `Vein::inspect_snapshot`.
#[derive(Debug, Clone)]
pub struct VeinInspectionSnapshot {
    /// Indexed paths that are neither under `session/` nor under `.hematite/docs/`.
    pub indexed_source_files: usize,
    /// Indexed paths under `.hematite/docs/`.
    pub indexed_docs: usize,
    /// Indexed paths under `session/`.
    pub indexed_session_exchanges: usize,
    /// Stored embeddings whose path is not under `session/`.
    pub embedded_source_doc_chunks: usize,
    /// Whether at least one embedding row exists.
    pub has_any_embeddings: bool,
    /// Room with the highest summed heat, if any file has heat.
    pub active_room: Option<String>,
    /// Hottest files, most heat first.
    pub hot_files: Vec<VeinHotFile>,
    /// `true` when `hot_files` is non-empty.
    pub l1_ready: bool,
}
89
/// Signals extracted from a search query and handed to the reranker.
///
/// Built by `QuerySignals::from_query`, which is not visible in this part of
/// the file; the field notes below are inferred from names and types only.
#[derive(Debug, Default)]
struct QuerySignals {
    /// Presumably quoted/exact phrases found in the query — TODO confirm.
    exact_phrases: Vec<String>,
    /// Presumably distinctive query terms — TODO confirm.
    standout_terms: Vec<String>,
    /// Presumably set when the query asks about past sessions — TODO confirm.
    historical_memory_hint: bool,
    /// Optional time window the query refers to.
    temporal_reference: Option<TemporalReference>,
    /// Optional memory-type label the query appears to target; presumably one of
    /// the labels returned by `detect_memory_type` — TODO confirm.
    query_memory_type: Option<&'static str>,
}
100
/// A point in time plus a tolerance window, attached to a query.
#[derive(Debug, Clone, Copy)]
struct TemporalReference {
    /// Target timestamp; presumably Unix seconds like the stored
    /// `last_modified` values — TODO confirm.
    target_ts: i64,
    /// Width of the window around `target_ts`, in seconds.
    window_secs: i64,
}
106
/// Deserialized shape of a session report file; missing fields fall back to
/// their defaults instead of failing deserialization.
#[derive(Debug, Deserialize)]
struct SessionReport {
    /// Session start marker as written in the report (empty when absent).
    #[serde(default)]
    session_start: String,
    /// Ordered transcript entries (empty when absent).
    #[serde(default)]
    transcript: Vec<SessionTranscriptEntry>,
}
114
/// One entry of a session transcript; both fields default to empty strings.
#[derive(Debug, Deserialize)]
struct SessionTranscriptEntry {
    /// Speaker label as written in the report.
    #[serde(default)]
    speaker: String,
    /// Text of the entry.
    #[serde(default)]
    text: String,
}
122
/// A session exchange prepared for indexing as a single chunk in the `session` room.
#[derive(Debug)]
struct SessionExchange {
    /// Index path under which the exchange is stored.
    path: String,
    /// Timestamp recorded as the exchange's `last_modified`.
    last_modified: i64,
    /// Text that is indexed for the exchange.
    content: String,
}
129
/// Classification of a transcript speaker label.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum SessionSpeakerKind {
    /// The entry was spoken by the user.
    User,
    /// The entry was spoken by the assistant.
    Assistant,
    /// Any other speaker; presumably skipped when exchanges are built — TODO confirm.
    Ignore,
}
136
/// Classifies a path into a coarse "room" label used to group indexed files.
///
/// Matching is case-insensitive and treats `\` as `/`. Each room has a fixed
/// priority and the highest-priority matching room wins:
/// `session` (100), `tests` (85), `automation` (84), `release` (82), `docs` (80),
/// `config` (76), `memory` (72), `ui` (70), `tools` (68), `integration` (67),
/// `runtime` (66), `agent` (60), `libs` (58), `scripts` (55).
///
/// When no rule matches, the first path component is returned if it is
/// non-empty and contains no dot; otherwise the result is `"root"`.
pub fn detect_room(path: &str) -> String {
    // Normalise once so every rule below can use plain lowercase, `/`-separated text.
    let lower = path.to_lowercase().replace('\\', "/");
    let filename = lower.rsplit('/').next().unwrap_or(&lower);
    // Only the text after the last dot is an extension. A dotless name such as
    // `md` or `pdf` has no extension and must not be mistaken for a document.
    let ext = filename.rsplit_once('.').map_or("", |(_, e)| e);

    // True when `segment` is the whole path or names a directory component,
    // i.e. any component other than the final one.
    let is_component = |segment: &str| -> bool {
        if lower == segment {
            return true;
        }
        let mut parts = lower.split('/');
        parts.next_back(); // the last component is the file name, not a directory
        parts.any(|part| part == segment)
    };

    // Rules in descending priority; the first hit is the answer.
    let rules: [(&'static str, bool); 14] = [
        (
            "session",
            lower.starts_with("session/")
                || lower.starts_with(".hematite/reports/")
                || lower.starts_with(".hematite/imports/")
                || is_component("reports")
                || is_component("imports"),
        ),
        (
            "tests",
            is_component("tests")
                || filename.contains("diagnostic")
                || filename.ends_with("_test.rs")
                || filename.ends_with(".test.ts"),
        ),
        (
            "automation",
            lower.starts_with(".github/workflows/")
                || is_component("workflows")
                || filename == ".pre-commit-config.yaml"
                || filename == ".pre-commit-config.yml"
                || filename.contains("hook"),
        ),
        (
            "release",
            lower.starts_with("installer/")
                || lower.starts_with("dist/")
                || lower.starts_with("scripts/package-")
                || filename.contains("release")
                || filename.contains("bump-version")
                || ext == "iss",
        ),
        (
            "docs",
            lower.starts_with(".hematite/docs/")
                || is_component("docs")
                || matches!(filename, "readme.md" | "claude.md" | ".hematite.md")
                || matches!(ext, "md" | "markdown" | "pdf" | "rst"),
        ),
        (
            "config",
            matches!(
                filename,
                "cargo.toml"
                    | "cargo.lock"
                    | "package.json"
                    | "pnpm-lock.yaml"
                    | "yarn.lock"
                    | "bun.lock"
                    | "bun.lockb"
                    | "pyproject.toml"
                    | "setup.py"
                    | "go.mod"
                    | "pom.xml"
                    | "build.gradle"
                    | "build.gradle.kts"
                    | "cmakelists.txt"
                    | ".gitignore"
                    | "settings.json"
                    | "mcp_servers.json"
            ) || filename.ends_with(".sln")
                || filename.ends_with(".csproj")
                || filename.contains("config"),
        ),
        (
            "memory",
            is_component("memory") || matches!(filename, "vein.rs" | "deep_reflect.rs"),
        ),
        (
            "ui",
            is_component("ui")
                || matches!(
                    filename,
                    "tui.rs" | "voice.rs" | "hatch.rs" | "gpu_monitor.rs"
                ),
        ),
        (
            "tools",
            is_component("tools")
                || matches!(
                    filename,
                    "verify_build.rs"
                        | "host_inspect.rs"
                        | "shell.rs"
                        | "code_sandbox.rs"
                        | "project_map.rs"
                        | "runtime_trace.rs"
                ),
        ),
        (
            "integration",
            filename.contains("mcp")
                || filename.contains("lsp")
                || lower.contains("/mcp/")
                || lower.contains("/lsp/"),
        ),
        (
            "runtime",
            matches!(filename, "main.rs" | "runtime.rs" | "inference.rs")
                || filename.contains("startup")
                || filename.contains("runtime"),
        ),
        ("agent", is_component("agent")),
        ("libs", lower.starts_with("libs/") || is_component("libs")),
        (
            "scripts",
            lower.starts_with("scripts/") || is_component("scripts"),
        ),
    ];

    if let Some(rule) = rules.iter().find(|rule| rule.1) {
        return rule.0.to_string();
    }

    // Fallback: name the room after the top-level directory, or "root" for
    // files that sit directly at the top (their first component contains a dot).
    lower
        .split('/')
        .next()
        .filter(|s| !s.is_empty() && !s.contains('.'))
        .unwrap_or("root")
        .to_string()
}
309
/// Labels a piece of text with a coarse memory type based on phrase matching.
///
/// Matching is case-insensitive. Categories are checked in a fixed order and
/// the first one with a hit wins: `"decision"`, `"problem"`, `"milestone"`,
/// `"preference"`. Returns `""` when nothing matches.
pub fn detect_memory_type(text: &str) -> &'static str {
    // Phrases matched as plain substrings.
    const DECISION: &[&str] = &[
        "let's use ",
        "we'll use ",
        "decided to ",
        "going with ",
        "we agreed ",
        "the plan is",
        "we're going to",
        "switching to",
        "we chose",
        "final decision",
        "we settled on",
        "agreed on",
        "we decided",
    ];
    const PROBLEM: &[&str] = &[
        "bug fixed",
        "bug was",
        "the issue was",
        "root cause",
        "error was",
        "turned out to be",
        "the fix was",
        "was caused by",
        "broken because",
        "fixed by",
        "the problem was",
        "found the bug",
        "port conflict",
        "crash",
        "panicked",
        "segfault",
        "out of memory",
    ];
    // Short tokens that must match as whole words: as a substring, "oom" also
    // hits "room", "zoom", "bloom", ... and would mislabel unrelated text.
    const PROBLEM_WORDS: &[&str] = &["oom"];
    const MILESTONE: &[&str] = &[
        "now working",
        "successfully",
        "shipped",
        "deployed",
        "it works",
        "tests pass",
        "all green",
        "breakthrough",
        "finally got",
        "got it working",
        "completed",
        "finished",
        "done with",
        "landed",
    ];
    const PREFERENCE: &[&str] = &[
        "i prefer",
        "i like",
        "i don't like",
        "i want",
        "always use",
        "never use",
        "i usually",
        "my preference",
        "keep it",
        "avoid using",
    ];

    // True when `word` occurs in `haystack` with no alphanumeric character
    // directly before or after it.
    fn contains_word(haystack: &str, word: &str) -> bool {
        haystack.match_indices(word).any(|(start, matched)| {
            let end = start + matched.len();
            let boundary_before = haystack[..start]
                .chars()
                .next_back()
                .map_or(true, |c| !c.is_alphanumeric());
            let boundary_after = haystack[end..]
                .chars()
                .next()
                .map_or(true, |c| !c.is_alphanumeric());
            boundary_before && boundary_after
        })
    }

    let lower = text.to_lowercase();
    let has_any = |patterns: &[&str]| patterns.iter().any(|p| lower.contains(*p));

    if has_any(DECISION) {
        return "decision";
    }
    if has_any(PROBLEM) || PROBLEM_WORDS.iter().any(|w| contains_word(&lower, w)) {
        return "problem";
    }
    if has_any(MILESTONE) {
        return "milestone";
    }
    if has_any(PREFERENCE) {
        return "preference";
    }
    ""
}
411
412impl Vein {
    /// Maximum number of `session_*.json` reports considered per pass by
    /// `index_recent_session_reports`.
    const SESSION_REPORT_LIMIT: usize = 5;
    /// NOTE(review): not referenced in the visible part of this file; presumably
    /// caps the turns taken from one session when exchanges are loaded — confirm.
    const SESSION_TURN_LIMIT: usize = 50;
    /// Maximum number of import files considered per pass by
    /// `index_imported_session_exports`.
    const IMPORT_FILE_LIMIT: usize = 12;
    /// Import files larger than this many bytes (10 MiB) are skipped.
    const IMPORT_MAX_BYTES: u64 = 10 * 1024 * 1024;
417
418 pub fn new<P: AsRef<Path>>(
419 db_path: P,
420 base_url: String,
421 ) -> Result<Self, Box<dyn std::error::Error>> {
422 let db = Connection::open(db_path)?;
423
424 db.execute_batch("PRAGMA journal_mode=WAL; PRAGMA synchronous=NORMAL;")?;
426
427 db.execute_batch(
431 "CREATE TABLE IF NOT EXISTS chunks_meta (
432 path TEXT PRIMARY KEY,
433 last_modified INTEGER NOT NULL,
434 room TEXT NOT NULL DEFAULT 'root'
435 );
436 CREATE VIRTUAL TABLE IF NOT EXISTS chunks_fts USING fts5(
437 path UNINDEXED,
438 content,
439 tokenize='porter ascii'
440 );
441 CREATE TABLE IF NOT EXISTS chunks_vec (
442 path TEXT NOT NULL,
443 chunk_idx INTEGER NOT NULL,
444 embedding BLOB NOT NULL,
445 PRIMARY KEY (path, chunk_idx)
446 );
447 CREATE TABLE IF NOT EXISTS file_heat (
448 path TEXT PRIMARY KEY,
449 heat INTEGER NOT NULL DEFAULT 0,
450 last_edit INTEGER NOT NULL DEFAULT 0
451 );",
452 )?;
453
454 let _ = db
456 .execute_batch("ALTER TABLE chunks_meta ADD COLUMN room TEXT NOT NULL DEFAULT 'root';");
457 let _ = db.execute_batch(
458 "ALTER TABLE file_heat ADD COLUMN last_edit INTEGER NOT NULL DEFAULT 0;",
459 );
460 let _ = db.execute_batch(
461 "ALTER TABLE chunks_meta ADD COLUMN memory_type TEXT NOT NULL DEFAULT '';",
462 );
463
464 let embedding_cache: Vec<EmbeddedChunk> = {
467 let mut stmt = db.prepare(
468 "SELECT cv.path, cv.chunk_idx, cv.embedding,
469 cm.last_modified,
470 COALESCE(cm.room, 'root'),
471 COALESCE(cm.memory_type, '')
472 FROM chunks_vec cv
473 JOIN chunks_meta cm ON cm.path = cv.path",
474 )?;
475 let raw: Vec<(String, i64, Vec<u8>, i64, String, String)> = stmt
477 .query_map([], |row| {
478 Ok((
479 row.get::<_, String>(0)?,
480 row.get::<_, i64>(1)?,
481 row.get::<_, Vec<u8>>(2)?,
482 row.get::<_, i64>(3)?,
483 row.get::<_, String>(4)?,
484 row.get::<_, String>(5)?,
485 ))
486 })?
487 .filter_map(|r| r.ok())
488 .collect();
489 raw.into_iter()
490 .map(
491 |(path, chunk_idx, blob, last_modified, room, memory_type)| EmbeddedChunk {
492 path,
493 chunk_idx,
494 embedding: blob_to_floats(&blob),
495 last_modified,
496 room,
497 memory_type,
498 },
499 )
500 .collect()
501 };
502
503 Ok(Self {
504 db: std::sync::Arc::new(std::sync::Mutex::new(db)),
505 base_url,
506 embed_model: std::sync::Arc::new(std::sync::RwLock::new(None)),
507 embedding_cache: std::sync::RwLock::new(embedding_cache),
508 })
509 }
510
511 pub fn set_embed_model(&self, model_id: Option<String>) {
512 if let Ok(mut guard) = self.embed_model.write() {
513 *guard = model_id;
514 }
515 }
516
517 fn current_embed_model(&self) -> Option<String> {
518 self.embed_model.read().ok().and_then(|guard| guard.clone())
519 }
520
521 pub fn index_document(
526 &mut self,
527 path: &str,
528 last_modified: i64,
529 full_text: &str,
530 ) -> Result<usize, Box<dyn std::error::Error>> {
531 let room = detect_room(path);
532 let ext = std::path::Path::new(path)
533 .extension()
534 .and_then(|e| e.to_str())
535 .unwrap_or("");
536 let chunks = chunk_by_symbols(ext, full_text);
537 let memory_type = if room == "session" {
539 detect_memory_type(full_text)
540 } else {
541 ""
542 };
543 self.index_chunks_with_room_and_type(path, last_modified, &room, memory_type, &chunks)
544 }
545
546 fn index_chunks_with_room_and_type(
547 &mut self,
548 path: &str,
549 last_modified: i64,
550 room: &str,
551 memory_type: &str,
552 chunks: &[String],
553 ) -> Result<usize, Box<dyn std::error::Error>> {
554 let db = self.db.lock().unwrap();
555 let existing: Option<i64> = db
556 .query_row(
557 "SELECT last_modified FROM chunks_meta WHERE path = ?1",
558 params![path],
559 |r| r.get(0),
560 )
561 .ok();
562
563 if let Some(ts) = existing {
564 if ts >= last_modified {
565 return Ok(0); }
567 }
568
569 db.execute("DELETE FROM chunks_fts WHERE path = ?1", params![path])?;
571 db.execute("DELETE FROM chunks_vec WHERE path = ?1", params![path])?;
572 if let Ok(mut cache) = self.embedding_cache.write() {
574 cache.retain(|e| e.path != path);
575 }
576 db.execute(
577 "INSERT OR REPLACE INTO chunks_meta (path, last_modified, room, memory_type) VALUES (?1, ?2, ?3, ?4)",
578 params![path, last_modified, room, memory_type],
579 )?;
580
581 drop(db);
582
583 let mut db = self.db.lock().unwrap();
584 let tx = db.transaction()?;
585 {
586 let mut stmt = tx.prepare("INSERT INTO chunks_fts (path, content) VALUES (?1, ?2)")?;
587 for chunk in chunks {
588 stmt.execute(params![path, chunk.as_str()])?;
589 }
590 }
591 tx.commit()?;
592
593 Ok(chunks.len())
594 }
595
596 pub fn embed_and_store_chunks(&self, path: &str, chunks: &[String]) {
600 let Some(embed_model) = self.current_embed_model() else {
601 return;
602 };
603 let (last_modified, room, memory_type) = {
605 let db = self.db.lock().unwrap();
606 db.query_row(
607 "SELECT last_modified, room, memory_type FROM chunks_meta WHERE path = ?1",
608 params![path],
609 |r| {
610 Ok((
611 r.get::<_, i64>(0)?,
612 r.get::<_, String>(1)?,
613 r.get::<_, String>(2).unwrap_or_default(),
614 ))
615 },
616 )
617 .unwrap_or((0, "root".to_string(), String::new()))
618 };
619
620 let prefixed: Vec<String> = chunks
622 .iter()
623 .map(|c| {
624 let p = format!("search_document: {}", c);
625 if p.len() > 8000 {
626 safe_head(&p, 8000).to_string()
627 } else {
628 p
629 }
630 })
631 .collect();
632
633 let embeddings = embed_batch(&prefixed, &self.base_url, &embed_model);
634
635 for (idx, maybe_vec) in embeddings.into_iter().enumerate() {
636 if let Some(vec) = maybe_vec {
637 let blob = floats_to_blob(&vec);
638 let db = self.db.lock().unwrap();
639 let _ = db.execute(
640 "INSERT OR REPLACE INTO chunks_vec (path, chunk_idx, embedding) VALUES (?1, ?2, ?3)",
641 params![path, idx as i64, blob],
642 );
643 drop(db);
644 if let Ok(mut cache) = self.embedding_cache.write() {
646 cache.retain(|e| !(e.path == path && e.chunk_idx == idx as i64));
647 cache.push(EmbeddedChunk {
648 path: path.to_string(),
649 chunk_idx: idx as i64,
650 embedding: vec,
651 last_modified,
652 room: room.clone(),
653 memory_type: memory_type.clone(),
654 });
655 }
656 }
657 }
658 }
659
660 pub fn search_bm25(
664 &self,
665 query: &str,
666 limit: usize,
667 ) -> Result<Vec<SearchResult>, Box<dyn std::error::Error>> {
668 static STOPWORDS: std::sync::OnceLock<HashSet<&'static str>> = std::sync::OnceLock::new();
672 let stopwords = STOPWORDS.get_or_init(|| {
673 [
674 "how", "does", "do", "did", "what", "where", "when", "why", "which", "who", "is",
675 "are", "was", "were", "be", "been", "being", "have", "has", "had", "a", "an",
676 "the", "and", "or", "but", "in", "on", "at", "to", "for", "of", "with", "by",
677 "from", "get", "gets", "got", "work", "works", "make", "makes", "use", "uses",
678 "into", "that", "this", "it", "its",
679 ]
680 .iter()
681 .copied()
682 .collect()
683 });
684
685 let safe_query: String = {
688 let mut s = String::with_capacity(query.len());
689 s.extend(query.chars().map(|c| {
690 if c.is_alphanumeric() || c == ' ' || c == '_' {
691 c.to_ascii_lowercase()
692 } else {
693 ' '
694 }
695 }));
696 s
697 };
698
699 let fts_query = {
701 let mut q = String::with_capacity(safe_query.len() + 16);
702 for w in safe_query
703 .split_whitespace()
704 .filter(|w| w.len() >= 3 && !stopwords.contains(*w))
705 {
706 if !q.is_empty() {
707 q.push_str(" OR ");
708 }
709 q.push_str(w);
710 }
711 q
712 };
713
714 if fts_query.is_empty() {
715 return Ok(Vec::new());
716 }
717
718 let db = self.db.lock().unwrap();
719 let mut stmt = db.prepare_cached(
720 "SELECT chunks_fts.path, chunks_fts.content, rank, cm.last_modified, cm.room, cm.memory_type
721 FROM chunks_fts
722 JOIN chunks_meta cm ON cm.path = chunks_fts.path
723 WHERE chunks_fts MATCH ?1
724 ORDER BY rank
725 LIMIT ?2",
726 )?;
727
728 let results: Vec<SearchResult> = stmt
729 .query_map(params![fts_query, limit as i64], |row| {
730 Ok(SearchResult {
731 path: row.get(0)?,
732 content: row.get(1)?,
733 score: -(row.get::<_, f64>(2).unwrap_or(0.0) as f32),
734 last_modified: row.get(3)?,
735 room: row.get(4)?,
736 memory_type: row.get::<_, String>(5).unwrap_or_default(),
737 })
738 })?
739 .filter_map(|r| r.ok())
740 .collect();
741
742 Ok(results)
743 }
744
745 pub fn search_semantic(&self, query: &str, limit: usize) -> Vec<SearchResult> {
748 let Some(embed_model) = self.current_embed_model() else {
749 return Vec::new();
750 };
751 let query_vec = match embed_query_blocking(query, &self.base_url, &embed_model) {
752 Some(v) => v,
753 None => return Vec::new(),
754 };
755
756 let query_norm_sq: f32 = query_vec.iter().map(|x| x * x).sum();
759
760 let top_indices: Vec<(f32, usize)> = {
764 let cache = match self.embedding_cache.read() {
765 Ok(g) => g,
766 Err(_) => return Vec::new(),
767 };
768 if cache.is_empty() {
769 return Vec::new();
770 }
771 let mut iv: Vec<(f32, usize)> = cache
772 .iter()
773 .enumerate()
774 .map(|(i, e)| {
775 let sim =
776 cosine_similarity_precomputed(&query_vec, query_norm_sq, &e.embedding);
777 (sim, i)
778 })
779 .collect();
780 iv.sort_unstable_by(|a, b| b.0.partial_cmp(&a.0).unwrap_or(std::cmp::Ordering::Equal));
781 iv.truncate(limit);
782 iv
783 };
784
785 let scored: Vec<(f32, String, i64, i64, String, String)> = {
787 let cache = match self.embedding_cache.read() {
788 Ok(g) => g,
789 Err(_) => return Vec::new(),
790 };
791 top_indices
792 .iter()
793 .map(|&(score, i)| {
794 let e = &cache[i];
795 (
796 score,
797 e.path.clone(),
798 e.chunk_idx,
799 e.last_modified,
800 e.room.clone(),
801 e.memory_type.clone(),
802 )
803 })
804 .collect()
805 };
806
807 let db = self.db.lock().unwrap();
808 let mut stmt = match db
811 .prepare_cached("SELECT content FROM chunks_fts WHERE path = ?1 LIMIT 1 OFFSET ?2")
812 {
813 Ok(s) => s,
814 Err(_) => return Vec::new(),
815 };
816 scored
817 .into_iter()
818 .filter_map(|(score, path, idx, last_modified, room, memory_type)| {
819 let content: Option<String> = stmt.query_row(params![path, idx], |r| r.get(0)).ok();
820 content.map(|c| SearchResult {
821 path,
822 content: c,
823 score,
824 room,
825 last_modified,
826 memory_type,
827 })
828 })
829 .collect()
830 }
831
832 pub fn search_context(
839 &self,
840 query: &str,
841 limit: usize,
842 ) -> Result<Vec<SearchResult>, Box<dyn std::error::Error>> {
843 let candidate_limit = (limit.max(1) * 4).max(12);
844 let bm25 = self.search_bm25(query, candidate_limit).unwrap_or_default();
845 let semantic = self.search_semantic(query, candidate_limit);
846 let signals = QuerySignals::from_query(query);
847
848 let active_room = self.active_room();
850
851 let mut merged_by_path: HashMap<String, SearchResult> =
854 HashMap::with_capacity(semantic.len() + bm25.len());
855
856 for r in semantic {
857 let score = reranked_score(&signals, active_room.as_deref(), &r, true);
858 merge_scored_result(&mut merged_by_path, SearchResult { score, ..r });
859 }
860
861 for r in bm25 {
862 let score = reranked_score(&signals, active_room.as_deref(), &r, false);
863 merge_scored_result(&mut merged_by_path, SearchResult { score, ..r });
864 }
865
866 let mut merged: Vec<SearchResult> = merged_by_path.into_values().collect();
867 merged.sort_unstable_by(|a, b| {
868 b.score
869 .partial_cmp(&a.score)
870 .unwrap_or(std::cmp::Ordering::Equal)
871 });
872 merged.truncate(limit);
873 Ok(merged)
874 }
875
876 fn active_room(&self) -> Option<String> {
879 let db = self.db.lock().unwrap();
880 db.prepare_cached(
881 "SELECT cm.room, SUM(fh.heat) as total
882 FROM file_heat fh
883 JOIN chunks_meta cm ON cm.path = fh.path
884 GROUP BY cm.room
885 ORDER BY total DESC
886 LIMIT 1",
887 )
888 .ok()
889 .and_then(|mut stmt| stmt.query_row([], |row| row.get::<_, String>(0)).ok())
890 }
891
892 pub fn index_project(&mut self) -> usize {
900 let root = crate::tools::file_ops::workspace_root();
901 let mut count = 0usize;
902
903 const INDEXABLE: &[&str] = &[
904 "rs", "toml", "md", "json", "ts", "tsx", "js", "py", "go", "c", "cpp", "h", "yaml",
905 "yml", "txt",
906 ];
907 const SKIP_DIRS: &[&str] = &["target", ".git", "node_modules", ".hematite"];
908
909 static INDEXABLE_SET: std::sync::OnceLock<std::collections::HashSet<&'static str>> =
912 std::sync::OnceLock::new();
913 static SKIP_DIRS_SET: std::sync::OnceLock<std::collections::HashSet<&'static str>> =
914 std::sync::OnceLock::new();
915 let indexable = INDEXABLE_SET.get_or_init(|| INDEXABLE.iter().copied().collect());
916 let skip_dirs = SKIP_DIRS_SET.get_or_init(|| SKIP_DIRS.iter().copied().collect());
917
918 for entry in walkdir::WalkDir::new(&root)
919 .follow_links(false)
920 .into_iter()
921 .filter_entry(|e| {
922 if e.file_type().is_dir() {
923 let name = e.file_name().to_string_lossy();
924 return !skip_dirs.contains(name.as_ref());
925 }
926 true
927 })
928 .filter_map(|e| e.ok())
929 .filter(|e| e.file_type().is_file())
930 {
931 let path = entry.path();
932 let ext = path.extension().and_then(|e| e.to_str()).unwrap_or("");
933 if !indexable.contains(ext) {
934 continue;
935 }
936
937 let Ok(meta) = entry.metadata() else {
940 continue;
941 };
942 if meta.len() > 512_000 {
943 continue;
944 }
945
946 let mtime = meta
947 .modified()
948 .map(|t| {
949 t.duration_since(std::time::UNIX_EPOCH)
950 .unwrap_or_default()
951 .as_secs() as i64
952 })
953 .unwrap_or(0);
954
955 let rel = path.strip_prefix(&root).unwrap_or(path);
956 let rel_str = rel.to_string_lossy().replace('\\', "/");
957
958 if let Ok(content) = std::fs::read_to_string(path) {
959 match self.index_document(&rel_str, mtime, &content) {
960 Ok(n) if n > 0 => {
961 count += 1;
962 }
963 Ok(_) => {}
964 Err(_) => {}
965 }
966 }
967 }
968
969 count += self.index_workspace_artifacts(&crate::tools::file_ops::hematite_dir());
970
971 count
972 }
973
974 pub fn index_workspace_artifacts(&mut self, workspace_root: &std::path::Path) -> usize {
979 let mut count = self.index_docs_folder(workspace_root);
980 count += self.index_recent_session_reports(workspace_root);
981 count += self.index_imported_session_exports(workspace_root);
982 self.backfill_missing_embeddings();
983 count
984 }
985
986 fn index_docs_folder(&mut self, workspace_root: &std::path::Path) -> usize {
991 let docs_dir = workspace_root.join(".hematite").join("docs");
992 const DOCS_INDEXABLE: &[&str] = &["pdf", "md", "txt", "markdown"];
993 let mut count = 0usize;
994 let mut desired_paths = HashSet::new();
995
996 if docs_dir.exists() {
997 for entry in walkdir::WalkDir::new(&docs_dir)
998 .max_depth(3)
999 .follow_links(false)
1000 .into_iter()
1001 .filter_map(|e| e.ok())
1002 .filter(|e| e.file_type().is_file())
1003 {
1004 let path = entry.path();
1005 let ext = path
1006 .extension()
1007 .and_then(|e| e.to_str())
1008 .unwrap_or("")
1009 .to_lowercase();
1010 if !DOCS_INDEXABLE.contains(&ext.as_str()) {
1011 continue;
1012 }
1013
1014 let Ok(meta) = entry.metadata() else {
1015 continue;
1016 };
1017 if meta.len() > 50_000_000 {
1018 continue;
1019 }
1020
1021 let mtime = meta
1022 .modified()
1023 .map(|t| {
1024 t.duration_since(std::time::UNIX_EPOCH)
1025 .unwrap_or_default()
1026 .as_secs() as i64
1027 })
1028 .unwrap_or(0);
1029
1030 let rel = path.strip_prefix(workspace_root).unwrap_or(path);
1031 let rel_str = rel.to_string_lossy().replace('\\', "/");
1032 desired_paths.insert(rel_str.clone());
1033
1034 let content = if ext == "pdf" {
1035 extract_pdf_text(path).ok().flatten()
1036 } else {
1037 std::fs::read_to_string(path).ok()
1038 };
1039
1040 if let Some(text) = content {
1041 if text.trim().is_empty() {
1042 continue;
1043 }
1044 match self.index_document(&rel_str, mtime, &text) {
1045 Ok(n) if n > 0 => {
1046 count += 1;
1047 }
1048 Ok(_) => {}
1049 Err(_) => {}
1050 }
1051 }
1052 }
1053 }
1054
1055 self.prune_indexed_prefix(".hematite/docs/", &desired_paths);
1056 count
1057 }
1058
1059 pub fn index_recent_session_reports(&mut self, workspace_root: &std::path::Path) -> usize {
1062 let reports_dir = workspace_root.join(".hematite").join("reports");
1063 let mut count = 0usize;
1064 let mut desired_paths = HashSet::new();
1065
1066 if reports_dir.exists() {
1067 let mut reports: Vec<std::path::PathBuf> = std::fs::read_dir(&reports_dir)
1068 .ok()
1069 .into_iter()
1070 .flat_map(|entries| entries.filter_map(|entry| entry.ok()))
1071 .map(|entry| entry.path())
1072 .filter(|path| {
1073 path.is_file()
1074 && path.extension().and_then(|ext| ext.to_str()) == Some("json")
1075 && path
1076 .file_stem()
1077 .and_then(|stem| stem.to_str())
1078 .map(|stem| stem.starts_with("session_"))
1079 .unwrap_or(false)
1080 })
1081 .collect();
1082
1083 reports.sort_by(|a, b| {
1084 let a_name = a
1085 .file_name()
1086 .and_then(|name| name.to_str())
1087 .unwrap_or_default();
1088 let b_name = b
1089 .file_name()
1090 .and_then(|name| name.to_str())
1091 .unwrap_or_default();
1092 b_name.cmp(a_name)
1093 });
1094 reports.truncate(Self::SESSION_REPORT_LIMIT);
1095
1096 for report_path in reports {
1097 let Ok(meta) = std::fs::metadata(&report_path) else {
1098 continue;
1099 };
1100 let mtime = meta
1101 .modified()
1102 .map(|t| {
1103 t.duration_since(std::time::UNIX_EPOCH)
1104 .unwrap_or_default()
1105 .as_secs() as i64
1106 })
1107 .unwrap_or(0);
1108
1109 for exchange in load_session_exchanges(&report_path, mtime) {
1110 desired_paths.insert(exchange.path.clone());
1111 let mtype = detect_memory_type(&exchange.content);
1112 match self.index_chunks_with_room_and_type(
1113 &exchange.path,
1114 exchange.last_modified,
1115 "session",
1116 mtype,
1117 std::slice::from_ref(&exchange.content),
1118 ) {
1119 Ok(n) if n > 0 => {
1120 count += 1;
1121 }
1122 Ok(_) => {}
1123 Err(_) => {}
1124 }
1125 }
1126 }
1127 }
1128
1129 self.prune_indexed_prefix("session/", &desired_paths);
1130 count
1131 }
1132
1133 pub fn index_imported_session_exports(&mut self, workspace_root: &std::path::Path) -> usize {
1138 let imports_dir = workspace_root.join(".hematite").join("imports");
1139 let mut count = 0usize;
1140 let mut desired_paths = HashSet::new();
1141
1142 if imports_dir.exists() {
1143 let mut imports: Vec<(std::path::PathBuf, i64)> = walkdir::WalkDir::new(&imports_dir)
1144 .max_depth(4)
1145 .follow_links(false)
1146 .into_iter()
1147 .filter_map(|entry| entry.ok())
1148 .filter(|entry| entry.file_type().is_file())
1149 .filter_map(|entry| {
1150 let path = entry.into_path();
1151 let ext = path
1152 .extension()
1153 .and_then(|ext| ext.to_str())
1154 .unwrap_or("")
1155 .to_ascii_lowercase();
1156 if !matches!(ext.as_str(), "json" | "jsonl" | "md" | "txt") {
1157 return None;
1158 }
1159 let meta = std::fs::metadata(&path).ok()?;
1160 if meta.len() > Self::IMPORT_MAX_BYTES {
1161 return None;
1162 }
1163 let mtime = meta
1164 .modified()
1165 .map(|t| {
1166 t.duration_since(std::time::UNIX_EPOCH)
1167 .unwrap_or_default()
1168 .as_secs() as i64
1169 })
1170 .unwrap_or(0);
1171 Some((path, mtime))
1172 })
1173 .collect();
1174
1175 imports.sort_by(|(a_path, a_mtime), (b_path, b_mtime)| {
1176 b_mtime
1177 .cmp(a_mtime)
1178 .then_with(|| a_path.to_string_lossy().cmp(&b_path.to_string_lossy()))
1179 });
1180 imports.truncate(Self::IMPORT_FILE_LIMIT);
1181
1182 for (import_path, mtime) in imports {
1183 for exchange in load_imported_session_exchanges(&import_path, &imports_dir, mtime) {
1184 desired_paths.insert(exchange.path.clone());
1185 let mtype = detect_memory_type(&exchange.content);
1186 match self.index_chunks_with_room_and_type(
1187 &exchange.path,
1188 exchange.last_modified,
1189 "session",
1190 mtype,
1191 std::slice::from_ref(&exchange.content),
1192 ) {
1193 Ok(n) if n > 0 => {
1194 count += 1;
1195 }
1196 Ok(_) => {}
1197 Err(_) => {}
1198 }
1199 }
1200 }
1201 }
1202
1203 self.prune_indexed_prefix("session/imports/", &desired_paths);
1204 count
1205 }
1206
1207 fn backfill_missing_embeddings(&self) {
1212 let (fts_count, vec_count) = {
1214 let db = self.db.lock().unwrap();
1215 let fts: i64 = db
1216 .query_row("SELECT COUNT(*) FROM chunks_fts", [], |r| r.get(0))
1217 .unwrap_or(0);
1218 let vec: i64 = db
1219 .query_row("SELECT COUNT(*) FROM chunks_vec", [], |r| r.get(0))
1220 .unwrap_or(0);
1221 (fts, vec)
1222 };
1223 if fts_count == 0 || fts_count == vec_count {
1224 return;
1225 }
1226
1227 let missing: Vec<(String, i64, String, i64, String, String)> = {
1230 let db = self.db.lock().unwrap();
1231 let mut stmt = match db.prepare_cached(
1232 "SELECT f.path, (f.rowid - 1) AS chunk_idx, f.content,
1233 COALESCE(cm.last_modified, 0),
1234 COALESCE(cm.room, 'root'),
1235 COALESCE(cm.memory_type, '')
1236 FROM chunks_fts f
1237 LEFT JOIN chunks_vec v ON f.path = v.path AND (f.rowid - 1) = v.chunk_idx
1238 LEFT JOIN chunks_meta cm ON cm.path = f.path
1239 WHERE v.path IS NULL
1240 ORDER BY CASE
1241 WHEN f.path LIKE '%.rs' THEN 0
1242 WHEN f.path LIKE '%.toml' THEN 1
1243 WHEN f.path LIKE '%.json' THEN 2
1244 ELSE 3
1245 END, f.path
1246 LIMIT 20",
1247 ) {
1248 Ok(s) => s,
1249 Err(_) => return,
1250 };
1251 let Ok(rows) = stmt.query_map([], |r| {
1252 Ok((
1253 r.get::<_, String>(0)?,
1254 r.get::<_, i64>(1)?,
1255 r.get::<_, String>(2)?,
1256 r.get::<_, i64>(3)?,
1257 r.get::<_, String>(4)?,
1258 r.get::<_, String>(5)?,
1259 ))
1260 }) else {
1261 return;
1262 };
1263 rows.filter_map(|r| r.ok()).collect()
1264 };
1265
1266 let Some(embed_model) = self.current_embed_model() else {
1267 return;
1268 };
1269
1270 let prefixed: Vec<String> = missing
1272 .iter()
1273 .map(|(_, _, content, _, _, _)| {
1274 let p = format!("search_document: {}", content);
1275 if p.len() > 8000 {
1276 safe_head(&p, 8000).to_string()
1277 } else {
1278 p
1279 }
1280 })
1281 .collect();
1282
1283 let embeddings = embed_batch(&prefixed, &self.base_url, &embed_model);
1284
1285 for (i, maybe_vec) in embeddings.into_iter().enumerate() {
1286 let Some(vec) = maybe_vec else {
1287 break;
1289 };
1290 let (path, idx, _, last_modified, room, memory_type) = &missing[i];
1291 let blob = floats_to_blob(&vec);
1292 let db = self.db.lock().unwrap();
1293 let _ = db.execute(
1294 "INSERT OR REPLACE INTO chunks_vec (path, chunk_idx, embedding) VALUES (?1, ?2, ?3)",
1295 params![path, idx, blob],
1296 );
1297 drop(db);
1298 if let Ok(mut cache) = self.embedding_cache.write() {
1300 cache.retain(|e| !(e.path == *path && e.chunk_idx == *idx));
1301 cache.push(EmbeddedChunk {
1302 path: path.clone(),
1303 chunk_idx: *idx,
1304 embedding: vec,
1305 last_modified: *last_modified,
1306 room: room.clone(),
1307 memory_type: memory_type.clone(),
1308 });
1309 }
1310 }
1311 }
1312
1313 pub fn file_count(&self) -> usize {
1316 let db = self.db.lock().unwrap();
1317 db.query_row(
1318 "SELECT COUNT(*) FROM chunks_meta WHERE path NOT LIKE 'session/%'",
1319 [],
1320 |r| r.get::<_, i64>(0),
1321 )
1322 .unwrap_or(0) as usize
1323 }
1324
1325 pub fn embedded_chunk_count(&self) -> usize {
1328 let db = self.db.lock().unwrap();
1329 db.query_row(
1330 "SELECT COUNT(*) FROM chunks_vec WHERE path NOT LIKE 'session/%'",
1331 [],
1332 |r| r.get::<_, i64>(0),
1333 )
1334 .unwrap_or(0) as usize
1335 }
1336
1337 pub fn has_any_embeddings(&self) -> bool {
1339 let db = self.db.lock().unwrap();
1340 db.query_row("SELECT EXISTS(SELECT 1 FROM chunks_vec LIMIT 1)", [], |r| {
1341 r.get::<_, i64>(0)
1342 })
1343 .unwrap_or(0)
1344 != 0
1345 }
1346
1347 pub fn reset(&self) {
1350 let db = self.db.lock().unwrap();
1351 let _ = db.execute_batch(
1352 "DELETE FROM chunks_fts;
1353 DELETE FROM chunks_vec;
1354 DELETE FROM chunks_meta;",
1355 );
1356 }
1357
1358 pub fn inspect_snapshot(&self, hot_limit: usize) -> VeinInspectionSnapshot {
1361 let db = self.db.lock().unwrap();
1362 let indexed_source_files = db
1363 .query_row(
1364 "SELECT COUNT(*) FROM chunks_meta
1365 WHERE path NOT LIKE 'session/%'
1366 AND path NOT LIKE '.hematite/docs/%'",
1367 [],
1368 |r| r.get::<_, i64>(0),
1369 )
1370 .unwrap_or(0) as usize;
1371 let indexed_docs = db
1372 .query_row(
1373 "SELECT COUNT(*) FROM chunks_meta WHERE path LIKE '.hematite/docs/%'",
1374 [],
1375 |r| r.get::<_, i64>(0),
1376 )
1377 .unwrap_or(0) as usize;
1378 let indexed_session_exchanges = db
1379 .query_row(
1380 "SELECT COUNT(*) FROM chunks_meta WHERE path LIKE 'session/%'",
1381 [],
1382 |r| r.get::<_, i64>(0),
1383 )
1384 .unwrap_or(0) as usize;
1385 let embedded_source_doc_chunks = db
1386 .query_row(
1387 "SELECT COUNT(*) FROM chunks_vec WHERE path NOT LIKE 'session/%'",
1388 [],
1389 |r| r.get::<_, i64>(0),
1390 )
1391 .unwrap_or(0) as usize;
1392 let has_any_embeddings = db
1393 .query_row("SELECT EXISTS(SELECT 1 FROM chunks_vec LIMIT 1)", [], |r| {
1394 r.get::<_, i64>(0)
1395 })
1396 .unwrap_or(0)
1397 != 0;
1398 drop(db);
1399
1400 let hot_files = self
1401 .hot_files(hot_limit.max(1))
1402 .into_iter()
1403 .map(|(path, heat, last_modified, room)| VeinHotFile {
1404 path,
1405 heat,
1406 last_modified,
1407 room,
1408 })
1409 .collect::<Vec<_>>();
1410
1411 VeinInspectionSnapshot {
1412 indexed_source_files,
1413 indexed_docs,
1414 indexed_session_exchanges,
1415 embedded_source_doc_chunks,
1416 has_any_embeddings,
1417 active_room: self.active_room(),
1418 l1_ready: !hot_files.is_empty(),
1419 hot_files,
1420 }
1421 }
1422
1423 pub fn bump_heat(&self, path: &str) {
1429 if path.is_empty() {
1430 return;
1431 }
1432 let now = std::time::SystemTime::now()
1433 .duration_since(std::time::UNIX_EPOCH)
1434 .unwrap_or_default()
1435 .as_secs() as i64;
1436 let db = self.db.lock().unwrap();
1437 let _ = db.execute(
1438 "INSERT INTO file_heat (path, heat, last_edit) VALUES (?1, 1, ?2)
1439 ON CONFLICT(path) DO UPDATE SET heat = heat + 1, last_edit = ?2",
1440 params![path, now],
1441 );
1442 }
1443
1444 fn hot_files(&self, n: usize) -> Vec<(String, i64, i64, String)> {
1448 let db = self.db.lock().unwrap();
1449 let mut stmt = match db.prepare_cached(
1450 "SELECT fh.path, fh.heat, cm.last_modified, cm.room
1451 FROM file_heat fh
1452 JOIN chunks_meta cm ON cm.path = fh.path
1453 ORDER BY fh.heat DESC, cm.last_modified DESC
1454 LIMIT ?1",
1455 ) {
1456 Ok(s) => s,
1457 Err(_) => return vec![],
1458 };
1459 stmt.query_map(params![n as i64], |row| {
1460 Ok((
1461 row.get::<_, String>(0)?,
1462 row.get::<_, i64>(1)?,
1463 row.get::<_, i64>(2)?,
1464 row.get::<_, String>(3)?,
1465 ))
1466 })
1467 .map(|rows| rows.filter_map(|r| r.ok()).collect())
1468 .unwrap_or_default()
1469 }
1470
1471 pub fn hot_file_paths(&self, n: usize) -> Vec<String> {
1474 self.hot_files(n)
1475 .into_iter()
1476 .map(|(path, _, _, _)| path)
1477 .collect()
1478 }
1479
1480 pub fn hot_files_weighted(&self, n: usize) -> Vec<(String, f64)> {
1484 let files = self.hot_files(n);
1485 if files.is_empty() {
1486 return vec![];
1487 }
1488 let max_heat = files
1489 .iter()
1490 .map(|(_, h, _, _)| *h)
1491 .max()
1492 .unwrap_or(1)
1493 .max(1) as f64;
1494 files
1495 .into_iter()
1496 .map(|(path, heat, _, _)| {
1497 let weight = (heat as f64) / max_heat;
1498 (path, weight)
1499 })
1500 .collect()
1501 }
1502
1503 pub fn l1_context(&self) -> Option<String> {
1508 let files = self.hot_files(8);
1509 if files.is_empty() {
1510 return None;
1511 }
1512 let now = std::time::SystemTime::now()
1513 .duration_since(std::time::UNIX_EPOCH)
1514 .unwrap_or_default()
1515 .as_secs() as i64;
1516
1517 let mut by_room: std::collections::BTreeMap<String, Vec<(String, i64, i64)>> =
1519 std::collections::BTreeMap::new();
1520 for (path, heat, mtime, room) in &files {
1521 by_room
1522 .entry(room.clone())
1523 .or_default()
1524 .push((path.clone(), *heat, *mtime));
1525 }
1526
1527 let mut out = String::from("# Hot Files (most edited — grouped by subsystem)\n");
1528 for (room, entries) in &by_room {
1529 let _ = writeln!(out, "[{}]", room);
1530 for (path, heat, mtime) in entries {
1531 let age_secs = now - mtime;
1532 let age = if age_secs < 3600 {
1533 "just now".to_string()
1534 } else if age_secs < 86400 {
1535 format!("{}h ago", age_secs / 3600)
1536 } else {
1537 format!("{}d ago", age_secs / 86400)
1538 };
1539 let _ = writeln!(
1540 out,
1541 " - {} [{} edit{}, {}]",
1542 path,
1543 heat,
1544 if *heat == 1 { "" } else { "s" },
1545 age
1546 );
1547 }
1548 }
1549 Some(out)
1550 }
1551
1552 fn prune_indexed_prefix(&self, prefix: &str, desired_paths: &HashSet<String>) {
1553 let pattern = format!("{}%", prefix);
1554 let existing_paths: Vec<String> = {
1555 let db = self.db.lock().unwrap();
1556 let mut stmt =
1557 match db.prepare_cached("SELECT path FROM chunks_meta WHERE path LIKE ?1") {
1558 Ok(stmt) => stmt,
1559 Err(_) => return,
1560 };
1561 stmt.query_map(params![pattern], |row| row.get::<_, String>(0))
1562 .map(|rows| rows.filter_map(|row| row.ok()).collect())
1563 .unwrap_or_default()
1564 };
1565
1566 if existing_paths.is_empty() {
1567 return;
1568 }
1569
1570 let db = self.db.lock().unwrap();
1571 for path in existing_paths {
1572 if desired_paths.contains(&path) {
1573 continue;
1574 }
1575 let _ = db.execute("DELETE FROM chunks_fts WHERE path = ?1", params![path]);
1576 let _ = db.execute("DELETE FROM chunks_vec WHERE path = ?1", params![path]);
1577 let _ = db.execute("DELETE FROM chunks_meta WHERE path = ?1", params![path]);
1578 }
1579 }
1580}
1581
1582impl QuerySignals {
1583 fn from_query(query: &str) -> Self {
1584 let lower = query.to_ascii_lowercase();
1585 let historical_memory_hint = [
1586 "remember",
1587 "earlier",
1588 "previous",
1589 "last time",
1590 "what did we decide",
1591 "why did we decide",
1592 "what did we say",
1593 "why did we change",
1594 ]
1595 .iter()
1596 .any(|needle| lower.contains(needle));
1597
1598 let query_memory_type = if lower.contains("decide")
1600 || lower.contains("decision")
1601 || lower.contains("we agreed")
1602 || lower.contains("we chose")
1603 {
1604 Some("decision")
1605 } else if lower.contains("bug")
1606 || lower.contains("error")
1607 || lower.contains("issue")
1608 || lower.contains("problem")
1609 || lower.contains("fix")
1610 || lower.contains("broken")
1611 {
1612 Some("problem")
1613 } else if lower.contains("shipped")
1614 || lower.contains("milestone")
1615 || lower.contains("finished")
1616 || lower.contains("working now")
1617 {
1618 Some("milestone")
1619 } else if lower.contains("prefer")
1620 || lower.contains("my preference")
1621 || lower.contains("i like")
1622 || lower.contains("i want")
1623 {
1624 Some("preference")
1625 } else {
1626 None
1627 };
1628
1629 Self {
1630 exact_phrases: extract_exact_phrases(query),
1631 standout_terms: extract_standout_terms(query),
1632 historical_memory_hint,
1633 temporal_reference: extract_temporal_reference(query),
1634 query_memory_type,
1635 }
1636 }
1637}
1638
1639fn merge_scored_result(
1640 merged_by_path: &mut HashMap<String, SearchResult>,
1641 candidate: SearchResult,
1642) {
1643 match merged_by_path.get_mut(&candidate.path) {
1644 Some(existing) if candidate.score > existing.score => *existing = candidate,
1645 Some(_) => {}
1646 None => {
1647 merged_by_path.insert(candidate.path.clone(), candidate);
1648 }
1649 }
1650}
1651
1652fn reranked_score(
1653 signals: &QuerySignals,
1654 active_room: Option<&str>,
1655 result: &SearchResult,
1656 is_semantic: bool,
1657) -> f32 {
1658 let base = if is_semantic {
1659 1.0 + result.score.clamp(0.0, 1.0)
1660 } else {
1661 (result.score / 10.0).clamp(0.0, 1.0)
1662 };
1663 base + room_bias(active_room, result)
1664 + retrieval_signal_boost(signals, result)
1665 + temporal_memory_boost(signals, result)
1666}
1667
1668fn room_bias(active_room: Option<&str>, result: &SearchResult) -> f32 {
1669 if active_room == Some(result.room.as_str()) {
1670 0.15
1671 } else {
1672 0.0
1673 }
1674}
1675
1676fn retrieval_signal_boost(signals: &QuerySignals, result: &SearchResult) -> f32 {
1677 let mut boost = 0.0f32;
1678
1679 let lower_path = result.path.to_ascii_lowercase();
1682 let lower_content = result.content.to_ascii_lowercase();
1683 let mut haystack = String::with_capacity(lower_path.len() + 1 + lower_content.len());
1684 haystack.push_str(&lower_path);
1685 haystack.push('\n');
1686 haystack.push_str(&lower_content);
1687
1688 let phrase_matches = signals
1689 .exact_phrases
1690 .iter()
1691 .filter(|phrase| haystack.contains(phrase.as_str()))
1692 .count();
1693 if phrase_matches > 0 {
1694 boost += 0.35 + ((phrase_matches.saturating_sub(1)) as f32 * 0.1);
1695 }
1696
1697 let mut standout_matches = 0;
1698 for term in &signals.standout_terms {
1699 if lower_path.contains(term.as_str()) {
1700 boost += 0.40;
1701 standout_matches += 1;
1702 } else if lower_content.contains(term.as_str()) {
1703 boost += 0.12;
1704 standout_matches += 1;
1705 }
1706 if standout_matches >= 3 {
1707 break;
1708 }
1709 }
1710
1711 if signals.historical_memory_hint && result.room == "session" {
1712 boost += 0.45;
1713 }
1714
1715 if let Some(qtype) = signals.query_memory_type {
1717 if !result.memory_type.is_empty() && result.memory_type == qtype {
1718 boost += 0.35;
1719 }
1720 }
1721
1722 boost
1723}
1724
1725fn temporal_memory_boost(signals: &QuerySignals, result: &SearchResult) -> f32 {
1726 if result.room != "session" {
1727 return 0.0;
1728 }
1729 let Some(reference) = signals.temporal_reference else {
1730 return 0.0;
1731 };
1732 let Some(memory_ts) = session_memory_timestamp(result) else {
1733 return 0.0;
1734 };
1735
1736 let span = reference.window_secs.max(86_400);
1737 let full_fade = span.saturating_mul(8);
1738 if full_fade <= 0 {
1739 return 0.0;
1740 }
1741
1742 let distance = (memory_ts - reference.target_ts).abs();
1743 let closeness = 1.0 - (distance as f32 / full_fade as f32).min(1.0);
1744 if closeness <= 0.0 {
1745 0.0
1746 } else {
1747 0.22 * closeness
1748 }
1749}
1750
/// Extracts quoted phrases from a query, lowercased and de-duplicated.
///
/// A phrase is the text between a pair of matching delimiters (`"`, `'` or
/// `` ` ``). The text is trimmed and ASCII-lowercased; phrases shorter than
/// three bytes are dropped. Phrases are returned in order of first appearance.
///
/// Two inputs that used to produce bogus phrases are now ignored:
/// - an apostrophe directly after a letter or digit (`what's`, `users'`) is
///   treated as part of the word, not as an opening quote;
/// - an opening delimiter with no matching closer no longer swallows the rest
///   of the query as a phrase.
fn extract_exact_phrases(query: &str) -> Vec<String> {
    let chars: Vec<char> = query.chars().collect();
    let mut phrases: Vec<String> = Vec::new();
    let mut i = 0usize;

    while i < chars.len() {
        let quote = chars[i];
        let is_delimiter = matches!(quote, '"' | '\'' | '`');
        let in_word_apostrophe = quote == '\'' && i > 0 && chars[i - 1].is_alphanumeric();
        if !is_delimiter || in_word_apostrophe {
            i += 1;
            continue;
        }

        let start = i + 1;
        // Require a real closing delimiter; otherwise skip just this character
        // so later, properly closed quotes are still found.
        let Some(offset) = chars[start..].iter().position(|&ch| ch == quote) else {
            i += 1;
            continue;
        };
        let end = start + offset;

        if end > start {
            let phrase = chars[start..end]
                .iter()
                .collect::<String>()
                .trim()
                .to_ascii_lowercase();
            if phrase.len() >= 3 && !phrases.contains(&phrase) {
                phrases.push(phrase);
            }
        }
        i = end + 1;
    }

    phrases
}
1782
/// Picks the distinctive tokens of a query, lowercased and de-duplicated.
///
/// The query is split on every character that is not ASCII alphanumeric or one
/// of `_ - . / :`. A token is kept when it is at least four bytes long, is not
/// a stopword, and looks distinctive: it contains a digit, an uppercase letter
/// or one of the marker characters, or is at least nine bytes long.
fn extract_standout_terms(query: &str) -> Vec<String> {
    static STANDOUT_SW: std::sync::OnceLock<HashSet<&'static str>> = std::sync::OnceLock::new();

    fn is_marker(ch: char) -> bool {
        matches!(ch, '_' | '-' | '.' | '/' | ':')
    }

    let stopwords = STANDOUT_SW.get_or_init(|| {
        const WORDS: &[&str] = &[
            "about", "after", "before", "change", "changed", "decide", "decided", "does",
            "earlier", "flow", "from", "have", "into", "just", "last", "local", "make", "more",
            "remember", "should", "that", "their", "there", "these", "they", "this", "those",
            "what", "when", "where", "which", "why", "with", "work",
        ];
        WORDS.iter().copied().collect()
    });

    let tokens = query
        .split(|ch: char| !(ch.is_ascii_alphanumeric() || is_marker(ch)))
        .map(str::trim)
        .filter(|token| token.len() >= 4);

    let mut standout: Vec<String> = Vec::new();
    for token in tokens {
        let lower = token.to_ascii_lowercase();
        if stopwords.contains(lower.as_str()) {
            continue;
        }
        let distinctive = token.len() >= 9
            || token
                .chars()
                .any(|ch| ch.is_ascii_digit() || ch.is_ascii_uppercase() || is_marker(ch));
        if distinctive && !standout.contains(&lower) {
            standout.push(lower);
        }
    }
    standout
}
1824
1825fn extract_temporal_reference(query: &str) -> Option<TemporalReference> {
1826 if let Some(ts) = extract_iso_date_from_query(query) {
1827 return Some(TemporalReference {
1828 target_ts: ts,
1829 window_secs: 86_400,
1830 });
1831 }
1832
1833 let now = current_unix_timestamp();
1834 let lower = query.to_ascii_lowercase();
1835 if lower.contains("yesterday") {
1836 Some(TemporalReference {
1837 target_ts: now.saturating_sub(86_400),
1838 window_secs: 86_400,
1839 })
1840 } else if lower.contains("today") || lower.contains("earlier today") {
1841 Some(TemporalReference {
1842 target_ts: now,
1843 window_secs: 86_400,
1844 })
1845 } else if lower.contains("last week") {
1846 Some(TemporalReference {
1847 target_ts: now.saturating_sub(7 * 86_400),
1848 window_secs: 7 * 86_400,
1849 })
1850 } else if lower.contains("last month") {
1851 Some(TemporalReference {
1852 target_ts: now.saturating_sub(30 * 86_400),
1853 window_secs: 30 * 86_400,
1854 })
1855 } else {
1856 None
1857 }
1858}
1859
1860fn extract_iso_date_from_query(query: &str) -> Option<i64> {
1861 query
1862 .split(|ch: char| !(ch.is_ascii_digit() || ch == '-'))
1863 .find_map(parse_iso_date_token)
1864}
1865
/// Parses a strict `YYYY-MM-DD` token into a Unix timestamp (seconds) at
/// midnight of that day.
///
/// Returns `None` unless the token is exactly ten bytes, has `-` at positions
/// 4 and 7 and ASCII digits everywhere else, and names a real calendar date
/// (month 1–12, day within that month, leap years honoured). Previously a
/// signed year such as `-123-01-01` and impossible dates such as `2024-02-31`
/// were accepted.
fn parse_iso_date_token(token: &str) -> Option<i64> {
    let bytes = token.as_bytes();
    if bytes.len() != 10 {
        return None;
    }
    let well_formed = bytes.iter().enumerate().all(|(idx, byte)| match idx {
        4 | 7 => *byte == b'-',
        _ => byte.is_ascii_digit(),
    });
    if !well_formed {
        return None;
    }

    let year = token.get(0..4)?.parse::<i32>().ok()?;
    let month = token.get(5..7)?.parse::<u32>().ok()?;
    let day = token.get(8..10)?.parse::<u32>().ok()?;

    let is_leap_year = (year % 4 == 0 && year % 100 != 0) || year % 400 == 0;
    let days_in_month = match month {
        1 | 3 | 5 | 7 | 8 | 10 | 12 => 31,
        4 | 6 | 9 | 11 => 30,
        2 if is_leap_year => 29,
        2 => 28,
        _ => return None,
    };
    if !(1..=days_in_month).contains(&day) {
        return None;
    }

    Some(days_from_civil(year, month, day).saturating_mul(86_400))
}

/// Converts a proleptic Gregorian date to the number of days since 1970-01-01.
///
/// The year is shifted so that it starts in March, which puts the leap day at
/// the end of the shifted year; days are then counted within 400-year eras of
/// 146 097 days each. Dates before the epoch produce negative values.
fn days_from_civil(year: i32, month: u32, day: u32) -> i64 {
    // January and February belong to the previous March-based year.
    let year = year - if month <= 2 { 1 } else { 0 };
    // Floor division by 400, written out for negative years.
    let era = if year >= 0 { year } else { year - 399 } / 400;
    let yoe = year - era * 400;
    // Month index with March = 0 ... February = 11.
    let month_prime = month as i32 + if month > 2 { -3 } else { 9 };
    let doy = (153 * month_prime + 2) / 5 + day as i32 - 1;
    let doe = yoe * 365 + yoe / 4 - yoe / 100 + doy;
    // 719 468 is the day count from 0000-03-01 to 1970-01-01.
    era as i64 * 146_097 + doe as i64 - 719_468
}
1894
/// Returns the current time as whole seconds since the Unix epoch.
///
/// A system clock set before the epoch yields 0.
fn current_unix_timestamp() -> i64 {
    let since_epoch = std::time::SystemTime::now().duration_since(std::time::UNIX_EPOCH);
    match since_epoch {
        Ok(elapsed) => elapsed.as_secs() as i64,
        Err(_) => 0,
    }
}
1901
1902fn session_memory_timestamp(result: &SearchResult) -> Option<i64> {
1903 extract_session_path_timestamp(&result.path).or({
1904 if result.last_modified > 0 {
1905 Some(result.last_modified)
1906 } else {
1907 None
1908 }
1909 })
1910}
1911
1912fn extract_session_path_timestamp(path: &str) -> Option<i64> {
1913 let normalized = path.replace('\\', "/");
1914 let mut parts = normalized.split('/');
1915 if parts.next()? != "session" {
1916 return None;
1917 }
1918 parse_iso_date_token(parts.next()?)
1919}
1920
1921fn session_speaker_kind(speaker: &str) -> SessionSpeakerKind {
1922 let normalized = speaker.trim().to_ascii_lowercase();
1923 match normalized.as_str() {
1924 "you" | "user" => SessionSpeakerKind::User,
1925 "" | "system" | "tool" => SessionSpeakerKind::Ignore,
1926 _ => SessionSpeakerKind::Assistant,
1927 }
1928}
1929
1930fn load_session_exchanges(report_path: &Path, last_modified: i64) -> Vec<SessionExchange> {
1931 let Ok(raw) = std::fs::read_to_string(report_path) else {
1932 return Vec::new();
1933 };
1934 let Ok(report) = serde_json::from_str::<SessionReport>(&raw) else {
1935 return Vec::new();
1936 };
1937
1938 let session_key = report_path
1939 .file_stem()
1940 .and_then(|stem| stem.to_str())
1941 .and_then(|stem| stem.strip_prefix("session_").or(Some(stem)))
1942 .unwrap_or("unknown-session")
1943 .to_string();
1944 let session_date = report
1945 .session_start
1946 .split('_')
1947 .next()
1948 .filter(|date| !date.is_empty())
1949 .unwrap_or_else(|| session_key.split('_').next().unwrap_or("unknown-date"))
1950 .to_string();
1951
1952 let mut exchanges = Vec::with_capacity(report.transcript.len() / 2);
1953 let mut pending_user: Option<String> = None;
1954 let mut turn_index = 0usize;
1955
1956 for entry in report.transcript {
1957 match session_speaker_kind(&entry.speaker) {
1958 SessionSpeakerKind::User => {
1959 let text = entry.text.trim();
1960 if !text.is_empty() {
1961 pending_user = Some(text.to_string());
1962 }
1963 }
1964 SessionSpeakerKind::Assistant => {
1965 let text = entry.text.trim();
1966 if text.is_empty() {
1967 continue;
1968 }
1969 let Some(user_text) = pending_user.take() else {
1970 continue;
1971 };
1972 turn_index += 1;
1973 exchanges.push(SessionExchange {
1974 path: format!(
1975 "session/{}/{}/turn-{}",
1976 session_date, session_key, turn_index
1977 ),
1978 last_modified,
1979 content: format!(
1980 "Earlier session exchange\nUser:\n{}\n\nAssistant:\n{}",
1981 user_text, text
1982 ),
1983 });
1984 }
1985 SessionSpeakerKind::Ignore => {}
1986 }
1987 }
1988
1989 if exchanges.len() > Vein::SESSION_TURN_LIMIT {
1990 let keep_from = exchanges.len() - Vein::SESSION_TURN_LIMIT;
1991 exchanges = exchanges.into_iter().skip(keep_from).collect();
1992 }
1993
1994 exchanges
1995}
1996
1997fn load_imported_session_exchanges(
1998 import_path: &Path,
1999 imports_root: &Path,
2000 last_modified: i64,
2001) -> Vec<SessionExchange> {
2002 let Ok(raw) = std::fs::read_to_string(import_path) else {
2003 return Vec::new();
2004 };
2005
2006 let messages = normalize_import_messages(&raw, import_path);
2007 if messages.is_empty() {
2008 return Vec::new();
2009 }
2010
2011 let rel = import_path
2012 .strip_prefix(imports_root)
2013 .unwrap_or(import_path);
2014 let rel_slug = slugify_import_path(rel);
2015 let mut exchanges = Vec::with_capacity(messages.len() / 2);
2016 let mut pending_user: Option<String> = None;
2017 let mut turn_index = 0usize;
2018
2019 for (role, text) in messages {
2020 let cleaned = text.trim();
2021 if cleaned.is_empty() {
2022 continue;
2023 }
2024 match role.as_str() {
2025 "user" => pending_user = Some(cleaned.to_string()),
2026 "assistant" => {
2027 let Some(user_text) = pending_user.take() else {
2028 continue;
2029 };
2030 turn_index += 1;
2031 exchanges.push(SessionExchange {
2032 path: format!("session/imports/{}/turn-{}", rel_slug, turn_index),
2033 last_modified,
2034 content: format!(
2035 "Imported session exchange\nSource: .hematite/imports/{}\n\nUser:\n{}\n\nAssistant:\n{}",
2036 rel.to_string_lossy().replace('\\', "/"),
2037 user_text,
2038 cleaned
2039 ),
2040 });
2041 }
2042 _ => {}
2043 }
2044 }
2045
2046 if exchanges.len() > Vein::SESSION_TURN_LIMIT {
2047 let keep_from = exchanges.len() - Vein::SESSION_TURN_LIMIT;
2048 exchanges = exchanges.into_iter().skip(keep_from).collect();
2049 }
2050
2051 exchanges
2052}
2053
2054fn normalize_import_messages(raw: &str, import_path: &Path) -> Vec<(String, String)> {
2055 if raw.trim().is_empty() {
2056 return Vec::new();
2057 }
2058
2059 if let Some(messages) = parse_marker_transcript(raw) {
2060 return messages;
2061 }
2062
2063 let ext = import_path
2064 .extension()
2065 .and_then(|ext| ext.to_str())
2066 .unwrap_or("")
2067 .to_ascii_lowercase();
2068
2069 if matches!(ext.as_str(), "json" | "jsonl")
2070 || matches!(raw.trim().chars().next(), Some('{') | Some('['))
2071 {
2072 if let Some(messages) = parse_jsonl_messages(raw) {
2073 if !messages.is_empty() {
2074 return messages;
2075 }
2076 }
2077
2078 if let Ok(value) = serde_json::from_str::<Value>(raw) {
2079 if let Some(messages) = parse_session_report_messages(&value) {
2080 return messages;
2081 }
2082 if let Some(messages) = parse_simple_role_messages(&value) {
2083 return messages;
2084 }
2085 if let Some(messages) = parse_chatgpt_mapping_messages(&value) {
2086 return messages;
2087 }
2088 }
2089 }
2090
2091 Vec::new()
2092}
2093
/// Parses a plain-text transcript in which user turns start with `> `.
///
/// Leading whitespace before the marker is allowed. The lines following a
/// marker, up to the next marker, form the assistant reply: each is trimmed,
/// blank lines and `---` separators are dropped, and the rest are joined with
/// newlines. A marker with no reply lines produces only the user message.
/// Lines before the first marker are ignored. Returns `None` when fewer than
/// two marker lines are present.
fn parse_marker_transcript(raw: &str) -> Option<Vec<(String, String)>> {
    fn is_marker(line: &str) -> bool {
        line.trim_start().starts_with("> ")
    }

    let has_two_markers = raw.lines().filter(|line| is_marker(*line)).take(2).count() == 2;
    if !has_two_markers {
        return None;
    }

    let mut messages: Vec<(String, String)> = Vec::new();
    let mut lines = raw.lines().peekable();
    while let Some(line) = lines.next() {
        let Some(prompt) = line.trim_start().strip_prefix("> ") else {
            continue;
        };
        messages.push(("user".to_string(), prompt.trim().to_string()));

        // Consume everything up to (but not including) the next marker line.
        let mut reply: Vec<&str> = Vec::new();
        while let Some(next) = lines.next_if(|candidate| !is_marker(*candidate)) {
            let trimmed = next.trim();
            if !trimmed.is_empty() && trimmed != "---" {
                reply.push(trimmed);
            }
        }
        if !reply.is_empty() {
            messages.push(("assistant".to_string(), reply.join("\n")));
        }
    }

    if messages.is_empty() {
        None
    } else {
        Some(messages)
    }
}
2134
2135fn parse_jsonl_messages(raw: &str) -> Option<Vec<(String, String)>> {
2136 let mut messages = Vec::new();
2137 let mut has_codex_session_meta = false;
2138 let mut saw_jsonl = false;
2139
2140 for line in raw.lines() {
2141 let trimmed = line.trim();
2142 if trimmed.is_empty() {
2143 continue;
2144 }
2145 let Ok(value) = serde_json::from_str::<Value>(trimmed) else {
2146 continue;
2147 };
2148 saw_jsonl = true;
2149 let Some(object) = value.as_object() else {
2150 continue;
2151 };
2152
2153 match object.get("type").and_then(|v| v.as_str()).unwrap_or("") {
2154 "session_meta" => {
2155 has_codex_session_meta = true;
2156 }
2157 "event_msg" => {
2158 let Some(payload) = object.get("payload").and_then(|v| v.as_object()) else {
2159 continue;
2160 };
2161 let Some(text) = payload.get("message").and_then(|v| v.as_str()) else {
2162 continue;
2163 };
2164 match payload.get("type").and_then(|v| v.as_str()).unwrap_or("") {
2165 "user_message" => messages.push(("user".to_string(), text.trim().to_string())),
2166 "agent_message" => {
2167 messages.push(("assistant".to_string(), text.trim().to_string()))
2168 }
2169 _ => {}
2170 }
2171 }
2172 "human" | "user" => {
2173 if let Some(text) = extract_text_content(object.get("message").unwrap_or(&value)) {
2174 messages.push(("user".to_string(), text));
2175 }
2176 }
2177 "assistant" => {
2178 if let Some(text) = extract_text_content(object.get("message").unwrap_or(&value)) {
2179 messages.push(("assistant".to_string(), text));
2180 }
2181 }
2182 _ => {
2183 if let Some(role) = object.get("role").and_then(|v| v.as_str()) {
2184 if let Some(text) = extract_text_content(&value) {
2185 match role {
2186 "user" | "human" => messages.push(("user".to_string(), text)),
2187 "assistant" | "ai" => messages.push(("assistant".to_string(), text)),
2188 _ => {}
2189 }
2190 }
2191 }
2192 }
2193 }
2194 }
2195
2196 if !saw_jsonl {
2197 return None;
2198 }
2199
2200 if has_codex_session_meta || !messages.is_empty() {
2201 return Some(messages);
2202 }
2203
2204 None
2205}
2206
2207fn parse_session_report_messages(value: &Value) -> Option<Vec<(String, String)>> {
2208 let report = value.as_object()?;
2209 let transcript = report.get("transcript")?.as_array()?;
2210 let mut messages = Vec::with_capacity(transcript.len());
2211
2212 for entry in transcript {
2213 let Some(obj) = entry.as_object() else {
2214 continue;
2215 };
2216 let speaker = obj
2217 .get("speaker")
2218 .and_then(|v| v.as_str())
2219 .unwrap_or_default();
2220 let text = obj
2221 .get("text")
2222 .and_then(|v| v.as_str())
2223 .unwrap_or_default()
2224 .trim()
2225 .to_string();
2226 if text.is_empty() {
2227 continue;
2228 }
2229 match session_speaker_kind(speaker) {
2230 SessionSpeakerKind::User => messages.push(("user".to_string(), text)),
2231 SessionSpeakerKind::Assistant => messages.push(("assistant".to_string(), text)),
2232 SessionSpeakerKind::Ignore => {}
2233 }
2234 }
2235
2236 (!messages.is_empty()).then_some(messages)
2237}
2238
2239fn parse_simple_role_messages(value: &Value) -> Option<Vec<(String, String)>> {
2240 if let Some(array) = value.as_array() {
2241 let messages = collect_role_messages(array);
2242 return (!messages.is_empty()).then_some(messages);
2243 }
2244
2245 let obj = value.as_object()?;
2246 if let Some(messages_value) = obj.get("messages").or_else(|| obj.get("chat_messages")) {
2247 let array = messages_value.as_array()?;
2248 let messages = collect_role_messages(array);
2249 return (!messages.is_empty()).then_some(messages);
2250 }
2251
2252 None
2253}
2254
2255fn collect_role_messages(items: &[Value]) -> Vec<(String, String)> {
2256 let mut messages = Vec::with_capacity(items.len());
2257 for item in items {
2258 let Some(obj) = item.as_object() else {
2259 continue;
2260 };
2261 let Some(role) = obj.get("role").and_then(|v| v.as_str()) else {
2262 continue;
2263 };
2264 let Some(text) = extract_text_content(item) else {
2265 continue;
2266 };
2267 match role {
2268 "user" | "human" => messages.push(("user".to_string(), text)),
2269 "assistant" | "ai" => messages.push(("assistant".to_string(), text)),
2270 _ => {}
2271 }
2272 }
2273 messages
2274}
2275
2276fn parse_chatgpt_mapping_messages(value: &Value) -> Option<Vec<(String, String)>> {
2277 let mapping = value.get("mapping")?.as_object()?;
2278 let mut current_id = mapping.iter().find_map(|(node_id, node)| {
2279 let obj = node.as_object()?;
2280 (obj.get("parent").is_some_and(|parent| parent.is_null())).then_some(node_id.clone())
2281 })?;
2282
2283 let mut messages = Vec::with_capacity(mapping.len());
2284 let mut visited = std::collections::HashSet::new();
2285
2286 while visited.insert(current_id.clone()) {
2287 let Some(node) = mapping.get(¤t_id).and_then(|v| v.as_object()) else {
2288 break;
2289 };
2290
2291 if let Some(message) = node.get("message") {
2292 let role = message
2293 .get("author")
2294 .and_then(|author| author.get("role"))
2295 .and_then(|v| v.as_str())
2296 .unwrap_or("");
2297 if let Some(text) = extract_text_content(message) {
2298 match role {
2299 "user" => messages.push(("user".to_string(), text)),
2300 "assistant" => messages.push(("assistant".to_string(), text)),
2301 _ => {}
2302 }
2303 }
2304 }
2305
2306 let Some(next_id) = node
2307 .get("children")
2308 .and_then(|children| children.as_array())
2309 .and_then(|children| children.first())
2310 .and_then(|child| child.as_str())
2311 else {
2312 break;
2313 };
2314 current_id = next_id.to_string();
2315 }
2316
2317 (!messages.is_empty()).then_some(messages)
2318}
2319
2320fn extract_text_content(value: &Value) -> Option<String> {
2321 if let Some(text) = value.as_str() {
2322 let trimmed = text.trim();
2323 return (!trimmed.is_empty()).then_some(trimmed.to_string());
2324 }
2325
2326 if let Some(array) = value.as_array() {
2327 let mut joined = String::new();
2328 for part in array
2329 .iter()
2330 .filter_map(extract_text_content)
2331 .filter(|p| !p.is_empty())
2332 {
2333 if !joined.is_empty() {
2334 joined.push('\n');
2335 }
2336 joined.push_str(&part);
2337 }
2338 return (!joined.is_empty()).then_some(joined);
2339 }
2340
2341 let obj = value.as_object()?;
2342
2343 if let Some(content) = obj.get("content") {
2344 if let Some(text) = extract_text_content(content) {
2345 return Some(text);
2346 }
2347 }
2348
2349 if let Some(text) = obj.get("text").and_then(|v| v.as_str()) {
2350 let trimmed = text.trim();
2351 if !trimmed.is_empty() {
2352 return Some(trimmed.to_string());
2353 }
2354 }
2355
2356 if let Some(parts) = obj.get("parts").and_then(|v| v.as_array()) {
2357 let mut joined = String::new();
2358 for part in parts
2359 .iter()
2360 .filter_map(|p| p.as_str().map(|s| s.trim().to_string()))
2361 .filter(|p| !p.is_empty())
2362 {
2363 if !joined.is_empty() {
2364 joined.push('\n');
2365 }
2366 joined.push_str(&part);
2367 }
2368 if !joined.is_empty() {
2369 return Some(joined);
2370 }
2371 }
2372
2373 None
2374}
2375
/// Turns a relative import path into a flat identifier.
///
/// Backslashes are treated as `/`. Every character other than ASCII letters,
/// digits, `/`, `-` and `_` becomes `_`. Leading and trailing `/` are removed
/// and each remaining `/` becomes `__`.
fn slugify_import_path(path: &Path) -> String {
    let unified = path.to_string_lossy().replace('\\', "/");

    let mut sanitized = String::with_capacity(unified.len());
    for ch in unified.chars() {
        let keep = ch.is_ascii_alphanumeric() || matches!(ch, '/' | '-' | '_');
        sanitized.push(if keep { ch } else { '_' });
    }

    sanitized.trim_matches('/').replace('/', "__")
}
2391
2392static EMBED_CLIENT: std::sync::OnceLock<reqwest::blocking::Client> = std::sync::OnceLock::new();
2398
2399fn embed_client() -> &'static reqwest::blocking::Client {
2400 EMBED_CLIENT.get_or_init(|| {
2401 reqwest::blocking::Client::builder()
2402 .timeout(std::time::Duration::from_secs(30))
2403 .build()
2404 .expect("failed to build embedding HTTP client")
2405 })
2406}
2407
2408fn embed_query_blocking(text: &str, base_url: &str, embed_model: &str) -> Option<Vec<f32>> {
2409 embed_text_with_prefix(text, "search_query", base_url, embed_model)
2410}
2411
2412fn embed_batch(inputs: &[String], base_url: &str, embed_model: &str) -> Vec<Option<Vec<f32>>> {
2415 if inputs.is_empty() {
2416 return Vec::new();
2417 }
2418
2419 let client = embed_client();
2420 let trimmed = base_url.trim_end_matches('/');
2421 let is_ollama = trimmed.contains("11434");
2422 let url = if is_ollama {
2423 format!("{}/api/embed", trimmed)
2424 } else {
2425 format!("{}/v1/embeddings", trimmed)
2426 };
2427
2428 let body = serde_json::json!({
2429 "model": embed_model,
2430 "input": inputs
2431 });
2432
2433 let resp = match client.post(&url).json(&body).send() {
2434 Ok(r) if r.status().is_success() => r,
2435 _ => return vec![None; inputs.len()],
2436 };
2437 let json: serde_json::Value = match resp.json() {
2438 Ok(j) => j,
2439 Err(_) => return vec![None; inputs.len()],
2440 };
2441
2442 if is_ollama {
2443 match json["embeddings"].as_array() {
2445 Some(arr) => arr
2446 .iter()
2447 .map(|e| {
2448 e.as_array().and_then(|v| {
2449 let floats: Vec<f32> = v
2450 .iter()
2451 .filter_map(|x| x.as_f64().map(|f| f as f32))
2452 .collect();
2453 if floats.is_empty() {
2454 None
2455 } else {
2456 Some(floats)
2457 }
2458 })
2459 })
2460 .collect(),
2461 None => vec![None; inputs.len()],
2462 }
2463 } else {
2464 let mut out = vec![None; inputs.len()];
2466 if let Some(arr) = json["data"].as_array() {
2467 for item in arr {
2468 let idx = item["index"].as_u64().unwrap_or(0) as usize;
2469 if idx < out.len() {
2470 if let Some(emb) = item["embedding"].as_array() {
2471 let floats: Vec<f32> = emb
2472 .iter()
2473 .filter_map(|x| x.as_f64().map(|f| f as f32))
2474 .collect();
2475 if !floats.is_empty() {
2476 out[idx] = Some(floats);
2477 }
2478 }
2479 }
2480 }
2481 }
2482 out
2483 }
2484}
2485
2486fn embed_text_with_prefix(
2487 text: &str,
2488 task: &str,
2489 base_url: &str,
2490 embed_model: &str,
2491) -> Option<Vec<f32>> {
2492 let prefixed = format!("{}: {}", task, text);
2494 let input = if prefixed.len() > 8000 {
2496 safe_head(&prefixed, 8000).to_string()
2497 } else {
2498 prefixed
2499 };
2500
2501 let client = embed_client();
2502 let body = serde_json::json!({ "model": embed_model, "input": input });
2503
2504 let trimmed = base_url.trim_end_matches('/');
2505 let is_ollama = trimmed.contains("11434");
2506 let url = if is_ollama {
2507 format!("{}/api/embed", trimmed)
2508 } else {
2509 format!("{}/v1/embeddings", trimmed)
2510 };
2511 let resp = client.post(&url).json(&body).send().ok()?;
2512
2513 if !resp.status().is_success() {
2514 return None;
2515 }
2516
2517 let json: serde_json::Value = resp.json().ok()?;
2518 let embedding = if is_ollama {
2519 json["embeddings"][0].as_array()?
2520 } else {
2521 json["data"][0]["embedding"].as_array()?
2522 };
2523 let vec: Vec<f32> = embedding
2524 .iter()
2525 .filter_map(|v| v.as_f64().map(|f| f as f32))
2526 .collect();
2527
2528 if vec.is_empty() {
2529 None
2530 } else {
2531 Some(vec)
2532 }
2533}
2534
/// Cosine similarity of `a` and `b`, given the squared norm of `a`.
///
/// `a_norm_sq` is supplied by the caller so it does not have to be recomputed
/// for every `b`. Returns 0.0 when the lengths differ or either vector has a
/// zero norm.
#[inline]
fn cosine_similarity_precomputed(a: &[f32], a_norm_sq: f32, b: &[f32]) -> f32 {
    if a_norm_sq == 0.0 || a.len() != b.len() {
        return 0.0;
    }

    // One pass accumulates both the dot product and the squared norm of `b`.
    let mut dot = 0.0f32;
    let mut b_norm_sq = 0.0f32;
    for (x, y) in a.iter().zip(b.iter()) {
        dot += x * y;
        b_norm_sq += y * y;
    }

    if b_norm_sq == 0.0 {
        return 0.0;
    }
    dot / (a_norm_sq.sqrt() * b_norm_sq.sqrt())
}
2557
/// Serializes floats as consecutive 4-byte little-endian values.
fn floats_to_blob(floats: &[f32]) -> Vec<u8> {
    floats
        .iter()
        .flat_map(|value| value.to_le_bytes())
        .collect()
}
2565
/// Decodes consecutive 4-byte little-endian values into floats.
///
/// Trailing bytes that do not fill a whole 4-byte group are ignored.
fn blob_to_floats(blob: &[u8]) -> Vec<f32> {
    blob.chunks_exact(4)
        .map(|group| f32::from_le_bytes([group[0], group[1], group[2], group[3]]))
        .collect()
}
2573
/// Cleans up text produced by a document extractor.
///
/// `\r\n` and lone `\r` become `\n`, then leading and trailing whitespace and
/// NUL characters are stripped. Interior characters are left untouched.
///
/// Returns `None` when nothing remains after stripping.
fn normalize_extracted_document_text(text: String) -> Option<String> {
    let unix_newlines = text.replace("\r\n", "\n").replace('\r', "\n");
    let core = unix_newlines.trim_matches(|c: char| c == '\0' || c.is_whitespace());
    if core.is_empty() {
        return None;
    }
    Some(core.to_owned())
}
2588
2589fn extract_pdf_text_with_pdf_extract(path: &std::path::Path) -> Result<Option<String>, String> {
2590 let previous_hook = std::panic::take_hook();
2591 std::panic::set_hook(Box::new(|_| {}));
2592 let result = std::panic::catch_unwind(std::panic::AssertUnwindSafe(|| {
2593 pdf_extract::extract_text(path)
2594 }));
2595 std::panic::set_hook(previous_hook);
2596
2597 match result {
2598 Ok(Ok(text)) => Ok(normalize_extracted_document_text(text)),
2599 Ok(Err(e)) => Err(format!("pdf-extract failed: {}", e)),
2600 Err(payload) => {
2601 let panic_text = if let Some(msg) = payload.downcast_ref::<&str>() {
2602 (*msg).to_string()
2603 } else if let Some(msg) = payload.downcast_ref::<String>() {
2604 msg.clone()
2605 } else {
2606 "unknown parser panic".to_string()
2607 };
2608 Err(format!("pdf-extract panicked: {}", panic_text))
2609 }
2610 }
2611}
2612
2613fn extract_pdf_text_with_lopdf(path: &std::path::Path) -> Result<Option<String>, String> {
2614 let mut doc =
2615 lopdf::Document::load(path).map_err(|e| format!("lopdf could not open PDF: {}", e))?;
2616
2617 if doc.is_encrypted() {
2618 doc.decrypt("")
2619 .map_err(|e| format!("PDF is encrypted and could not be decrypted: {}", e))?;
2620 }
2621
2622 let page_numbers: Vec<u32> = doc.get_pages().keys().copied().collect();
2623 if page_numbers.is_empty() {
2624 return Ok(None);
2625 }
2626
2627 let mut extracted_pages = Vec::with_capacity(page_numbers.len());
2628 let mut page_errors = Vec::with_capacity(page_numbers.len());
2629
2630 for page_number in page_numbers {
2631 match doc.extract_text(&[page_number]) {
2632 Ok(text) => {
2633 if let Some(page_text) = normalize_extracted_document_text(text) {
2634 extracted_pages.push(page_text);
2635 }
2636 }
2637 Err(e) => page_errors.push(format!("page {page_number}: {e}")),
2638 }
2639 }
2640
2641 if !extracted_pages.is_empty() {
2642 return Ok(Some(extracted_pages.join("\n\n")));
2643 }
2644
2645 if !page_errors.is_empty() {
2646 let sample_errors = {
2647 let mut s = String::with_capacity(160);
2648 for e in page_errors.into_iter().take(3) {
2649 if !s.is_empty() {
2650 s.push_str("; ");
2651 }
2652 s.push_str(&e);
2653 }
2654 s
2655 };
2656 return Err(format!(
2657 "lopdf could not extract usable page text ({sample_errors})"
2658 ));
2659 }
2660
2661 Ok(None)
2662}
2663
2664fn extract_pdf_text_inside_helper(path: &std::path::Path) -> Result<Option<String>, String> {
2665 let mut failures = Vec::with_capacity(2);
2666
2667 match extract_pdf_text_with_pdf_extract(path) {
2668 Ok(Some(text)) => return Ok(Some(text)),
2669 Ok(None) => failures.push("pdf-extract found no usable text".to_string()),
2670 Err(e) => failures.push(e),
2671 }
2672
2673 match extract_pdf_text_with_lopdf(path) {
2674 Ok(Some(text)) => return Ok(Some(text)),
2675 Ok(None) => failures.push("lopdf found no usable text".to_string()),
2676 Err(e) => failures.push(e),
2677 }
2678
2679 let detail = {
2680 let mut d = String::with_capacity(160);
2681 for (i, f) in failures.into_iter().take(2).enumerate() {
2682 if i > 0 {
2683 d.push_str("; ");
2684 }
2685 d.push_str(&f);
2686 }
2687 d
2688 };
2689 Err(format!(
2690 "Could not extract text from PDF. Hematite keeps PDF parsing best-effort so it can stay a lightweight single-binary local coding harness. The file may be scanned/image-only, encrypted, or use unsupported font encoding. Try exporting it to text/markdown or attach page images instead. Detail: {}",
2691 detail
2692 ))
2693}
2694
/// Extracts PDF text by re-running the current executable as a helper process.
///
/// The child is started with `--pdf-extract-helper <path>`, no stdin, and both
/// output streams captured. Running the parsers in a separate process
/// presumably keeps a parser crash from taking down the caller (confirm with
/// the CLI entry point that handles the flag).
///
/// Returns `Ok(Some(text))` with the child's stdout, `Ok(None)` when stdout is
/// blank, and `Err` when the executable cannot be located or launched, when the
/// child exits unsuccessfully (its trimmed stderr is the message, or a generic
/// one if stderr is empty), or when stdout is not valid UTF-8.
fn extract_pdf_text(path: &std::path::Path) -> Result<Option<String>, String> {
    use std::process::{Command, Stdio};

    let helper_exe = match std::env::current_exe() {
        Ok(exe) => exe,
        Err(e) => {
            return Err(format!(
                "Could not locate Hematite executable for PDF helper: {}",
                e
            ))
        }
    };

    let mut helper = Command::new(helper_exe);
    helper
        .arg("--pdf-extract-helper")
        .arg(path)
        .stdin(Stdio::null())
        .stdout(Stdio::piped())
        .stderr(Stdio::piped());
    let finished = helper
        .output()
        .map_err(|e| format!("Could not launch PDF helper: {}", e))?;

    if !finished.status.success() {
        let complaint = String::from_utf8_lossy(&finished.stderr)
            .trim()
            .to_string();
        return Err(if complaint.is_empty() {
            "PDF extraction failed.".to_string()
        } else {
            complaint
        });
    }

    match String::from_utf8(finished.stdout) {
        Err(e) => Err(format!("PDF helper returned non-UTF8 text: {}", e)),
        Ok(text) if text.trim().is_empty() => Ok(None),
        Ok(text) => Ok(Some(text)),
    }
}
2724
2725pub fn run_pdf_extract_helper(path: &std::path::Path) -> i32 {
2726 match extract_pdf_text_inside_helper(path) {
2727 Ok(Some(text)) => {
2728 use std::io::Write;
2729 let mut stdout = std::io::stdout();
2730 if stdout.write_all(text.as_bytes()).is_ok() {
2731 0
2732 } else {
2733 let _ = writeln!(
2734 std::io::stderr(),
2735 "PDF helper could not write extracted text."
2736 );
2737 1
2738 }
2739 }
2740 Ok(None) => {
2741 eprintln!(
2742 "Could not extract text from PDF. Hematite keeps PDF parsing best-effort so it can stay a lightweight single-binary local coding harness. The file appears to contain no usable embedded text. Try exporting it to text/markdown or attach page images instead."
2743 );
2744 1
2745 }
2746 Err(e) => {
2747 eprintln!("{}", e);
2748 1
2749 }
2750 }
2751}
2752
2753pub fn extract_document_text(path: &std::path::Path) -> Result<String, String> {
2756 let ext = path
2757 .extension()
2758 .and_then(|e| e.to_str())
2759 .unwrap_or("")
2760 .to_lowercase();
2761 match ext.as_str() {
2762 "pdf" => {
2763 let text = extract_pdf_text(path)?.ok_or_else(|| {
2764 "PDF contains no extractable text — it may be scanned/image-only. \
2765 Try attaching page screenshots with /image instead."
2766 .to_string()
2767 })?;
2768 pdf_quality_check(text)
2769 }
2770 _ => std::fs::read_to_string(path).map_err(|e| format!("Could not read file: {e}")),
2771 }
2772}
2773
/// Rejects PDF text that looks like a failed extraction.
///
/// Two heuristics are applied to the trimmed text:
/// 1. Fewer than 150 bytes is treated as "nothing useful extracted".
/// 2. If spaces make up less than 4% of the non-newline characters, the words
///    are assumed to have been merged together by a font-decoding failure.
///
/// Returns the original, untrimmed `text` when both checks pass; otherwise an
/// `Err` with a user-facing explanation.
fn pdf_quality_check(text: String) -> Result<String, String> {
    let body = text.trim();

    let byte_len = body.len();
    if byte_len < 150 {
        return Err(format!(
            "PDF extracted only {} characters — likely a scanned or image-only PDF, \
             or uses unsupported custom fonts. Try attaching page screenshots with /image instead.",
            byte_len
        ));
    }

    // One pass: count everything except line breaks, and how much of it is spaces.
    let mut visible = 0usize;
    let mut blanks = 0usize;
    for ch in body.chars() {
        match ch {
            '\n' | '\r' => {}
            ' ' => {
                visible += 1;
                blanks += 1;
            }
            _ => visible += 1,
        }
    }
    let space_ratio = if visible == 0 {
        0.0
    } else {
        blanks as f32 / visible as f32
    };

    if space_ratio >= 0.04 {
        return Ok(text);
    }

    Err(
        "PDF text extraction produced garbled output — words are merged with no spaces. \
         This usually means the PDF uses custom embedded fonts (common with academic publishers \
         like EBSCO, Elsevier, Springer). \
         Try a PDF exported from Word, Google Docs, or LaTeX, \
         or attach page screenshots with /image instead."
            .to_string(),
    )
}
2812
2813fn chunk_by_symbols(ext: &str, text: &str) -> Vec<String> {
2817 if ext == "rs" {
2818 chunk_rust_symbols(text)
2819 } else {
2820 chunk_paragraphs(text)
2821 }
2822}
2823
2824fn chunk_rust_symbols(text: &str) -> Vec<String> {
2833 const ITEM_STARTS: &[&str] = &[
2834 "pub fn ",
2835 "pub async fn ",
2836 "pub unsafe fn ",
2837 "async fn ",
2838 "unsafe fn ",
2839 "fn ",
2840 "pub impl",
2841 "impl ",
2842 "pub struct ",
2843 "struct ",
2844 "pub enum ",
2845 "enum ",
2846 "pub trait ",
2847 "trait ",
2848 "pub mod ",
2849 "mod ",
2850 "pub type ",
2851 "type ",
2852 "pub const ",
2853 "const ",
2854 "pub static ",
2855 "static ",
2856 ];
2857
2858 let lines: Vec<&str> = text.lines().collect();
2859 let mut chunks: Vec<String> = Vec::with_capacity(lines.len() / 20 + 1);
2860 let mut current: Vec<&str> = Vec::with_capacity(64);
2861
2862 for &line in &lines {
2863 let top_level = !line.starts_with(' ') && !line.starts_with('\t');
2864 let is_item = top_level && ITEM_STARTS.iter().any(|s| line.starts_with(s));
2865
2866 if is_item && !current.is_empty() {
2867 let mut split = current.len();
2870 while split > 0 {
2871 let prev = current[split - 1].trim();
2872 if prev.starts_with("///")
2873 || prev.starts_with("//!")
2874 || prev.starts_with("#[")
2875 || prev.is_empty()
2876 {
2877 split -= 1;
2878 } else {
2879 break;
2880 }
2881 }
2882 let body = current[..split].join("\n");
2883 if !body.trim().is_empty() {
2884 chunks.push(body);
2885 }
2886 current.drain(..split);
2887 }
2888 current.push(line);
2889 }
2890 if !current.is_empty() {
2891 let body = current.join("\n");
2892 if !body.trim().is_empty() {
2893 chunks.push(body);
2894 }
2895 }
2896
2897 let mut result = Vec::with_capacity(chunks.len());
2899 for chunk in chunks {
2900 if chunk.len() > 3000 {
2901 result.extend(sliding_window_chunks(&chunk, 2000, 200));
2902 } else {
2903 result.push(chunk);
2904 }
2905 }
2906 result
2907}
2908
2909fn chunk_paragraphs(text: &str) -> Vec<String> {
2911 let mut result: Vec<String> = Vec::new();
2912 let mut current = String::with_capacity(2000);
2913
2914 for para in text.split("\n\n") {
2915 if current.len() + para.len() + 2 > 2000 {
2916 if !current.trim().is_empty() {
2917 result.push(std::mem::replace(&mut current, para.to_string()));
2918 } else {
2919 current = para.to_string();
2920 }
2921 } else {
2922 if !current.is_empty() {
2923 current.push_str("\n\n");
2924 }
2925 current.push_str(para);
2926 }
2927 }
2928 if !current.trim().is_empty() {
2929 result.push(current);
2930 }
2931
2932 let mut final_result = Vec::with_capacity(result.len());
2933 for chunk in result {
2934 if chunk.len() > 2000 {
2935 final_result.extend(sliding_window_chunks(&chunk, 2000, 200));
2936 } else {
2937 final_result.push(chunk);
2938 }
2939 }
2940 final_result
2941}
2942
/// Splits `text` into overlapping windows of at most `chunk_size` characters.
///
/// Consecutive windows start `chunk_size - overlap` characters apart, so each
/// window repeats the last `overlap` characters of the one before it. Sizes are
/// counted in `char`s, not bytes, so a window never ends inside a multi-byte
/// code point. The final window may be shorter than `chunk_size`.
///
/// Edge cases:
/// - `overlap >= chunk_size` would give a zero or negative step; the step is
///   clamped to 1 so the scan always advances.
/// - `chunk_size == 0` returns no chunks, since a zero-width window holds no text.
/// - Empty `text` returns no chunks.
fn sliding_window_chunks(text: &str, chunk_size: usize, overlap: usize) -> Vec<String> {
    if chunk_size == 0 {
        return Vec::new();
    }

    let chars: Vec<char> = text.chars().collect();
    let stride = chunk_size.saturating_sub(overlap).max(1);
    let mut result = Vec::with_capacity(chars.len() / stride + 1);

    let mut start = 0;
    while start < chars.len() {
        let end = start.saturating_add(chunk_size).min(chars.len());
        result.push(chars[start..end].iter().collect());
        if end == chars.len() {
            break;
        }
        // Advance by the clamped stride, never by the raw `chunk_size - overlap`,
        // which underflows when `overlap > chunk_size`.
        start += stride;
    }
    result
}