1use rusqlite::{params, Connection};
2use serde::Deserialize;
3use serde_json::Value;
4use std::collections::{HashMap, HashSet};
5use std::path::Path;
6
/// Handle to the SQLite-backed index used by the indexing and search methods
/// in this file.
pub struct Vein {
    /// Shared SQLite connection; methods lock this mutex before every query.
    db: std::sync::Arc<std::sync::Mutex<Connection>>,
    /// Handed to the embedding helpers together with the active model id.
    base_url: String,
    /// Embedding model id currently in use; `None` disables embedding work.
    embed_model: std::sync::Arc<std::sync::RwLock<Option<String>>>,
}
33
// SAFETY: NOTE(review) — no justification is recorded in this file. Every
// field of `Vein` is an `Arc<Mutex<_>>`, a `String`, or an `Arc<RwLock<_>>`;
// if `rusqlite::Connection` is `Send`, `Vein` is presumably `Send + Sync`
// automatically and these manual impls could be dropped. Confirm before
// relying on them.
unsafe impl Send for Vein {}
unsafe impl Sync for Vein {}
38
/// One chunk returned by a search, with the metadata of the path it came from.
#[derive(Debug, Clone)]
pub struct SearchResult {
    /// Indexed path the chunk belongs to.
    pub path: String,
    /// Text of the matching chunk.
    pub content: String,
    /// Relevance score, higher is better. BM25 hits carry the negated FTS
    /// rank; semantic hits carry the cosine similarity.
    pub score: f32,
    /// Room stored in `chunks_meta` for this path.
    pub room: String,
    /// `last_modified` value stored in `chunks_meta` for this path.
    pub last_modified: i64,
    /// Memory type stored for this path; empty when none was recorded.
    pub memory_type: String,
}
53
/// A frequently edited file as reported by `inspect_snapshot`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct VeinHotFile {
    /// Path recorded in `file_heat`.
    pub path: String,
    /// Heat counter from `file_heat` (incremented by `bump_heat`).
    pub heat: i64,
    /// `last_modified` value from `chunks_meta`.
    pub last_modified: i64,
    /// Room from `chunks_meta`.
    pub room: String,
}
61
/// Summary of the index contents produced by `Vein::inspect_snapshot`.
#[derive(Debug, Clone)]
pub struct VeinInspectionSnapshot {
    /// `chunks_meta` rows outside `session/` and `.hematite/docs/`.
    pub indexed_source_files: usize,
    /// `chunks_meta` rows under `.hematite/docs/`.
    pub indexed_docs: usize,
    /// `chunks_meta` rows under `session/`.
    pub indexed_session_exchanges: usize,
    /// `chunks_vec` rows outside `session/`.
    pub embedded_source_doc_chunks: usize,
    /// Whether `chunks_vec` holds at least one row.
    pub has_any_embeddings: bool,
    /// Room with the highest summed heat, if any heat has been recorded.
    pub active_room: Option<String>,
    /// Hottest files, ordered by heat and then by modification time.
    pub hot_files: Vec<VeinHotFile>,
    /// `true` when `hot_files` is non-empty.
    pub l1_ready: bool,
}
73
/// Hints extracted from a search query and passed to the reranking step.
#[derive(Debug, Default)]
struct QuerySignals {
    // NOTE(review): presumably quoted phrases found in the query — confirm
    // against the part of `from_query` that fills it.
    exact_phrases: Vec<String>,
    // NOTE(review): presumably distinctive query terms — confirm.
    standout_terms: Vec<String>,
    /// Set when the query contains recall wording such as "remember",
    /// "earlier" or "last time".
    historical_memory_hint: bool,
    /// Time window the query refers to, if one was recognised.
    temporal_reference: Option<TemporalReference>,
    /// Memory type the query appears to ask about (for example `"decision"`).
    query_memory_type: Option<&'static str>,
}
84
/// A point in time plus a tolerance, attached to a query.
#[derive(Debug, Clone, Copy)]
struct TemporalReference {
    // NOTE(review): presumably Unix seconds, like the `last_modified` values
    // stored in the index — confirm.
    target_ts: i64,
    // NOTE(review): presumably the accepted distance from `target_ts`, in
    // seconds — confirm.
    window_secs: i64,
}
90
/// Deserialized session report file. Missing fields fall back to their
/// `Default` value because of `#[serde(default)]`.
#[derive(Debug, Deserialize)]
struct SessionReport {
    /// Session start marker as written in the report; empty when absent.
    #[serde(default)]
    session_start: String,
    /// Transcript entries in file order; empty when absent.
    #[serde(default)]
    transcript: Vec<SessionTranscriptEntry>,
}
98
/// One transcript line of a [`SessionReport`]. Missing fields deserialize to
/// empty strings.
#[derive(Debug, Deserialize)]
struct SessionTranscriptEntry {
    /// Speaker label as written in the report.
    #[serde(default)]
    speaker: String,
    /// Text of the entry.
    #[serde(default)]
    text: String,
}
106
/// A session exchange ready to be indexed as a single chunk in the
/// `session` room.
#[derive(Debug)]
struct SessionExchange {
    /// Index path under which the exchange is stored.
    path: String,
    /// Timestamp stored as the entry's `last_modified`.
    last_modified: i64,
    /// Text that is indexed for the exchange.
    content: String,
}
113
/// Classification of a transcript speaker.
// NOTE(review): the code that maps speaker labels to these variants is not
// visible here; the variant descriptions below are inferred from the names.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum SessionSpeakerKind {
    /// Presumably the human side of the conversation.
    User,
    /// Presumably the assistant side of the conversation.
    Assistant,
    /// Presumably any entry that should not be indexed.
    Ignore,
}
120
/// Classifies a workspace path into a coarse subsystem "room".
///
/// The path is lowercased and backslashes are normalised to `/` before any
/// rule is evaluated. Every matching rule proposes a room with a fixed
/// priority and the highest priority wins. When no rule matches, the first
/// path segment becomes the room if it is non-empty and contains no `.`;
/// otherwise the room is `"root"`.
///
/// Extension rules only apply to names that actually contain a `.`, so a bare
/// file called `pdf` or `md` is not mistaken for a document.
pub fn detect_room(path: &str) -> String {
    let lower = path.to_lowercase().replace('\\', "/");
    let filename = lower.rsplit('/').next().unwrap_or(&lower);
    // `rsplit('.').next()` would return the whole name when there is no dot,
    // which made extensionless files look like they had an extension.
    let ext = filename.rsplit_once('.').map_or("", |(_, ext)| ext);

    // True when `segment` is the whole path or any path segment other than
    // the last one (i.e. a directory component).
    let is_component = |segment: &str| {
        lower == segment || lower.rsplit('/').skip(1).any(|part| part == segment)
    };

    // (room, priority, matched). Priorities are unique, so the winner does not
    // depend on the order of this table.
    let rules: [(&str, i32, bool); 14] = [
        (
            "session",
            100,
            lower.starts_with("session/")
                || lower.starts_with(".hematite/reports/")
                || lower.starts_with(".hematite/imports/")
                || is_component("reports")
                || is_component("imports"),
        ),
        (
            "tests",
            85,
            is_component("tests")
                || filename.contains("diagnostic")
                || filename.ends_with("_test.rs")
                || filename.ends_with(".test.ts"),
        ),
        (
            "automation",
            84,
            lower.starts_with(".github/workflows/")
                || is_component("workflows")
                || filename == ".pre-commit-config.yaml"
                || filename == ".pre-commit-config.yml"
                || filename.contains("hook"),
        ),
        (
            "release",
            82,
            lower.starts_with("installer/")
                || lower.starts_with("dist/")
                || lower.starts_with("scripts/package-")
                || filename.contains("release")
                || filename.contains("bump-version")
                || ext == "iss",
        ),
        (
            "docs",
            80,
            lower.starts_with(".hematite/docs/")
                || is_component("docs")
                || matches!(filename, "readme.md" | "claude.md" | ".hematite.md")
                || matches!(ext, "md" | "markdown" | "pdf" | "rst"),
        ),
        (
            "config",
            76,
            matches!(
                filename,
                "cargo.toml"
                    | "cargo.lock"
                    | "package.json"
                    | "pnpm-lock.yaml"
                    | "yarn.lock"
                    | "bun.lock"
                    | "bun.lockb"
                    | "pyproject.toml"
                    | "setup.py"
                    | "go.mod"
                    | "pom.xml"
                    | "build.gradle"
                    | "build.gradle.kts"
                    | "cmakelists.txt"
                    | ".gitignore"
                    | "settings.json"
                    | "mcp_servers.json"
            ) || filename.ends_with(".sln")
                || filename.ends_with(".csproj")
                || filename.contains("config"),
        ),
        (
            "memory",
            72,
            is_component("memory") || matches!(filename, "vein.rs" | "deep_reflect.rs"),
        ),
        (
            "ui",
            70,
            is_component("ui")
                || matches!(
                    filename,
                    "tui.rs" | "voice.rs" | "hatch.rs" | "gpu_monitor.rs"
                ),
        ),
        (
            "tools",
            68,
            is_component("tools")
                || matches!(
                    filename,
                    "verify_build.rs"
                        | "host_inspect.rs"
                        | "shell.rs"
                        | "code_sandbox.rs"
                        | "project_map.rs"
                        | "runtime_trace.rs"
                ),
        ),
        (
            "integration",
            67,
            filename.contains("mcp")
                || filename.contains("lsp")
                || lower.contains("/mcp/")
                || lower.contains("/lsp/"),
        ),
        (
            "runtime",
            66,
            matches!(filename, "main.rs" | "runtime.rs" | "inference.rs")
                || filename.contains("startup")
                || filename.contains("runtime"),
        ),
        ("agent", 60, is_component("agent")),
        ("libs", 58, lower.starts_with("libs/") || is_component("libs")),
        (
            "scripts",
            55,
            lower.starts_with("scripts/") || is_component("scripts"),
        ),
    ];

    let winner = rules
        .iter()
        .filter(|(_, _, matched)| *matched)
        .max_by_key(|(_, priority, _)| *priority);
    if let Some((room, _, _)) = winner {
        return (*room).to_string();
    }

    // No rule matched: fall back to the top-level directory name.
    lower
        .split('/')
        .next()
        .filter(|s| !s.is_empty() && !s.contains('.'))
        .unwrap_or("root")
        .to_string()
}
281
/// Tags a piece of text with a memory type based on fixed phrase lists.
///
/// Matching is case-insensitive. Categories are tried in the order
/// `"decision"`, `"problem"`, `"milestone"`, `"preference"` and the first
/// category with a matching phrase is returned; an empty string means no
/// phrase matched.
///
/// The abbreviation `oom` is only recognised as a standalone word so that
/// ordinary words containing it ("room", "zoom") are not tagged as problems.
pub fn detect_memory_type(text: &str) -> &'static str {
    const DECISION_PHRASES: &[&str] = &[
        "let's use ",
        "we'll use ",
        "decided to ",
        "going with ",
        "we agreed ",
        "the plan is",
        "we're going to",
        "switching to",
        "we chose",
        "final decision",
        "we settled on",
        "agreed on",
        "we decided",
    ];
    const PROBLEM_PHRASES: &[&str] = &[
        "bug fixed",
        "bug was",
        "the issue was",
        "root cause",
        "error was",
        "turned out to be",
        "the fix was",
        "was caused by",
        "broken because",
        "fixed by",
        "the problem was",
        "found the bug",
        "port conflict",
        "crash",
        "panicked",
        "segfault",
        "out of memory",
    ];
    // Short tokens that would produce false positives as substrings.
    const PROBLEM_WORDS: &[&str] = &["oom"];
    const MILESTONE_PHRASES: &[&str] = &[
        "now working",
        "successfully",
        "shipped",
        "deployed",
        "it works",
        "tests pass",
        "all green",
        "breakthrough",
        "finally got",
        "got it working",
        "completed",
        "finished",
        "done with",
        "landed",
    ];
    const PREFERENCE_PHRASES: &[&str] = &[
        "i prefer",
        "i like",
        "i don't like",
        "i want",
        "always use",
        "never use",
        "i usually",
        "my preference",
        "keep it",
        "avoid using",
    ];

    let lower = text.to_lowercase();
    let contains_any = |phrases: &[&str]| phrases.iter().any(|phrase| lower.contains(*phrase));
    let has_word = |words: &[&str]| {
        lower
            .split(|c: char| !c.is_alphanumeric())
            .any(|token| words.contains(&token))
    };

    if contains_any(DECISION_PHRASES) {
        "decision"
    } else if contains_any(PROBLEM_PHRASES) || has_word(PROBLEM_WORDS) {
        "problem"
    } else if contains_any(MILESTONE_PHRASES) {
        "milestone"
    } else if contains_any(PREFERENCE_PHRASES) {
        "preference"
    } else {
        ""
    }
}
383
384impl Vein {
/// Number of session report files kept after sorting by name (descending).
const SESSION_REPORT_LIMIT: usize = 5;
// NOTE(review): not referenced in the visible part of this file; presumably
// caps the turns taken from one session — confirm at the use site.
const SESSION_TURN_LIMIT: usize = 50;
/// Number of import files kept after sorting by modification time.
const IMPORT_FILE_LIMIT: usize = 12;
/// Import files larger than this many bytes are skipped.
const IMPORT_MAX_BYTES: u64 = 10 * 1024 * 1024;
389
390 pub fn new<P: AsRef<Path>>(
391 db_path: P,
392 base_url: String,
393 ) -> Result<Self, Box<dyn std::error::Error>> {
394 let db = Connection::open(db_path)?;
395
396 db.execute_batch("PRAGMA journal_mode=WAL; PRAGMA synchronous=NORMAL;")?;
398
399 db.execute_batch(
403 "CREATE TABLE IF NOT EXISTS chunks_meta (
404 path TEXT PRIMARY KEY,
405 last_modified INTEGER NOT NULL,
406 room TEXT NOT NULL DEFAULT 'root'
407 );
408 CREATE VIRTUAL TABLE IF NOT EXISTS chunks_fts USING fts5(
409 path UNINDEXED,
410 content,
411 tokenize='porter ascii'
412 );
413 CREATE TABLE IF NOT EXISTS chunks_vec (
414 path TEXT NOT NULL,
415 chunk_idx INTEGER NOT NULL,
416 embedding BLOB NOT NULL,
417 PRIMARY KEY (path, chunk_idx)
418 );
419 CREATE TABLE IF NOT EXISTS file_heat (
420 path TEXT PRIMARY KEY,
421 heat INTEGER NOT NULL DEFAULT 0,
422 last_edit INTEGER NOT NULL DEFAULT 0
423 );",
424 )?;
425
426 let _ = db
428 .execute_batch("ALTER TABLE chunks_meta ADD COLUMN room TEXT NOT NULL DEFAULT 'root';");
429 let _ = db.execute_batch(
430 "ALTER TABLE file_heat ADD COLUMN last_edit INTEGER NOT NULL DEFAULT 0;",
431 );
432 let _ = db.execute_batch(
433 "ALTER TABLE chunks_meta ADD COLUMN memory_type TEXT NOT NULL DEFAULT '';",
434 );
435
436 Ok(Self {
437 db: std::sync::Arc::new(std::sync::Mutex::new(db)),
438 base_url,
439 embed_model: std::sync::Arc::new(std::sync::RwLock::new(None)),
440 })
441 }
442
443 pub fn set_embed_model(&self, model_id: Option<String>) {
444 if let Ok(mut guard) = self.embed_model.write() {
445 *guard = model_id;
446 }
447 }
448
449 fn current_embed_model(&self) -> Option<String> {
450 self.embed_model.read().ok().and_then(|guard| guard.clone())
451 }
452
453 pub fn index_document(
458 &mut self,
459 path: &str,
460 last_modified: i64,
461 full_text: &str,
462 ) -> Result<Vec<String>, Box<dyn std::error::Error>> {
463 let room = detect_room(path);
464 let ext = std::path::Path::new(path)
465 .extension()
466 .and_then(|e| e.to_str())
467 .unwrap_or("");
468 let chunks = chunk_by_symbols(ext, full_text);
469 let memory_type = if room == "session" {
471 detect_memory_type(full_text)
472 } else {
473 ""
474 };
475 self.index_chunks_with_room_and_type(path, last_modified, &room, memory_type, &chunks)
476 }
477
478 fn index_chunks_with_room_and_type(
479 &mut self,
480 path: &str,
481 last_modified: i64,
482 room: &str,
483 memory_type: &str,
484 chunks: &[String],
485 ) -> Result<Vec<String>, Box<dyn std::error::Error>> {
486 let db = self.db.lock().unwrap();
487 let existing: Option<i64> = db
488 .query_row(
489 "SELECT last_modified FROM chunks_meta WHERE path = ?1",
490 params![path],
491 |r| r.get(0),
492 )
493 .ok();
494
495 if let Some(ts) = existing {
496 if ts >= last_modified {
497 return Ok(Vec::new()); }
499 }
500
501 db.execute("DELETE FROM chunks_fts WHERE path = ?1", params![path])?;
503 db.execute("DELETE FROM chunks_vec WHERE path = ?1", params![path])?;
504 db.execute(
505 "INSERT OR REPLACE INTO chunks_meta (path, last_modified, room, memory_type) VALUES (?1, ?2, ?3, ?4)",
506 params![path, last_modified, room, memory_type],
507 )?;
508
509 drop(db);
510
511 let mut db = self.db.lock().unwrap();
512 let tx = db.transaction()?;
513 {
514 let mut stmt = tx.prepare("INSERT INTO chunks_fts (path, content) VALUES (?1, ?2)")?;
515 for chunk in chunks {
516 stmt.execute(params![path, chunk.as_str()])?;
517 }
518 }
519 tx.commit()?;
520
521 Ok(chunks.to_vec())
522 }
523
524 pub fn embed_and_store_chunks(&self, path: &str, chunks: &[String]) {
528 let Some(embed_model) = self.current_embed_model() else {
529 return;
530 };
531 for (idx, chunk) in chunks.iter().enumerate() {
532 if let Some(vec) = embed_text_blocking(chunk, &self.base_url, &embed_model) {
533 let blob = floats_to_blob(&vec);
534 let db = self.db.lock().unwrap();
535 let _ = db.execute(
536 "INSERT OR REPLACE INTO chunks_vec (path, chunk_idx, embedding) VALUES (?1, ?2, ?3)",
537 params![path, idx as i64, blob],
538 );
539 }
540 }
541 }
542
543 pub fn search_bm25(
547 &self,
548 query: &str,
549 limit: usize,
550 ) -> Result<Vec<SearchResult>, Box<dyn std::error::Error>> {
551 static STOPWORDS: std::sync::OnceLock<HashSet<&'static str>> = std::sync::OnceLock::new();
555 let stopwords = STOPWORDS.get_or_init(|| {
556 [
557 "how", "does", "do", "did", "what", "where", "when", "why", "which", "who", "is",
558 "are", "was", "were", "be", "been", "being", "have", "has", "had", "a", "an",
559 "the", "and", "or", "but", "in", "on", "at", "to", "for", "of", "with", "by",
560 "from", "get", "gets", "got", "work", "works", "make", "makes", "use", "uses",
561 "into", "that", "this", "it", "its",
562 ]
563 .iter()
564 .copied()
565 .collect()
566 });
567
568 let safe_query: String = query
571 .chars()
572 .map(|c| {
573 if c.is_alphanumeric() || c == ' ' || c == '_' {
574 c.to_ascii_lowercase()
575 } else {
576 ' '
577 }
578 })
579 .collect();
580
581 let fts_query = safe_query
583 .split_whitespace()
584 .filter(|w| w.len() >= 3 && !stopwords.contains(*w))
585 .collect::<Vec<_>>()
586 .join(" OR ");
587
588 if fts_query.is_empty() {
589 return Ok(Vec::new());
590 }
591
592 let db = self.db.lock().unwrap();
593 let mut stmt = db.prepare(
594 "SELECT chunks_fts.path, chunks_fts.content, rank, cm.last_modified, cm.room, cm.memory_type
595 FROM chunks_fts
596 JOIN chunks_meta cm ON cm.path = chunks_fts.path
597 WHERE chunks_fts MATCH ?1
598 ORDER BY rank
599 LIMIT ?2",
600 )?;
601
602 let results: Vec<SearchResult> = stmt
603 .query_map(params![fts_query, limit as i64], |row| {
604 Ok(SearchResult {
605 path: row.get(0)?,
606 content: row.get(1)?,
607 score: -(row.get::<_, f64>(2).unwrap_or(0.0) as f32),
608 last_modified: row.get(3)?,
609 room: row.get(4)?,
610 memory_type: row.get::<_, String>(5).unwrap_or_default(),
611 })
612 })?
613 .filter_map(|r| r.ok())
614 .collect();
615
616 Ok(results)
617 }
618
619 pub fn search_semantic(&self, query: &str, limit: usize) -> Vec<SearchResult> {
622 let Some(embed_model) = self.current_embed_model() else {
623 return Vec::new();
624 };
625 let query_vec = match embed_query_blocking(query, &self.base_url, &embed_model) {
626 Some(v) => v,
627 None => return Vec::new(),
628 };
629
630 let rows: Vec<(String, i64, Vec<u8>, i64, String, String)> = {
632 let db = self.db.lock().unwrap();
633 let mut stmt = match db.prepare(
634 "SELECT cv.path, cv.chunk_idx, cv.embedding, cm.last_modified, cm.room, cm.memory_type
635 FROM chunks_vec cv
636 JOIN chunks_meta cm ON cm.path = cv.path",
637 ) {
638 Ok(s) => s,
639 Err(_) => return Vec::new(),
640 };
641 stmt.query_map([], |row| {
642 Ok((
643 row.get::<_, String>(0)?,
644 row.get::<_, i64>(1)?,
645 row.get::<_, Vec<u8>>(2)?,
646 row.get::<_, i64>(3)?,
647 row.get::<_, String>(4)?,
648 row.get::<_, String>(5).unwrap_or_default(),
649 ))
650 })
651 .ok()
652 .map(|rows| rows.filter_map(|r| r.ok()).collect())
653 .unwrap_or_default()
654 };
655
656 if rows.is_empty() {
657 return Vec::new();
658 }
659
660 let mut scored: Vec<(f32, String, i64, i64, String, String)> = rows
662 .into_iter()
663 .filter_map(|(path, idx, blob, last_modified, room, memory_type)| {
664 let vec = blob_to_floats(&blob);
665 let sim = cosine_similarity(&query_vec, &vec);
666 Some((sim, path, idx, last_modified, room, memory_type))
667 })
668 .collect();
669
670 scored.sort_by(|a, b| b.0.partial_cmp(&a.0).unwrap_or(std::cmp::Ordering::Equal));
671 scored.truncate(limit);
672
673 let db = self.db.lock().unwrap();
675 scored
676 .into_iter()
677 .filter_map(|(score, path, idx, last_modified, room, memory_type)| {
678 let content: Option<String> = db
679 .query_row(
680 "SELECT content FROM chunks_fts WHERE path = ?1 LIMIT 1 OFFSET ?2",
681 params![path, idx],
682 |r| r.get(0),
683 )
684 .ok();
685 content.map(|c| SearchResult {
686 path,
687 content: c,
688 score,
689 room,
690 last_modified,
691 memory_type,
692 })
693 })
694 .collect()
695 }
696
697 pub fn search_context(
704 &self,
705 query: &str,
706 limit: usize,
707 ) -> Result<Vec<SearchResult>, Box<dyn std::error::Error>> {
708 let candidate_limit = (limit.max(1) * 4).max(12);
709 let bm25 = self.search_bm25(query, candidate_limit).unwrap_or_default();
710 let semantic = self.search_semantic(query, candidate_limit);
711 let signals = QuerySignals::from_query(query);
712
713 let active_room = self.active_room();
715
716 let mut merged_by_path: HashMap<String, SearchResult> = HashMap::new();
719
720 for r in semantic {
721 let score = reranked_score(&signals, active_room.as_deref(), &r, true);
722 merge_scored_result(&mut merged_by_path, SearchResult { score, ..r });
723 }
724
725 for r in bm25 {
726 let score = reranked_score(&signals, active_room.as_deref(), &r, false);
727 merge_scored_result(&mut merged_by_path, SearchResult { score, ..r });
728 }
729
730 let mut merged: Vec<SearchResult> = merged_by_path.into_values().collect();
731 merged.sort_by(|a, b| {
732 b.score
733 .partial_cmp(&a.score)
734 .unwrap_or(std::cmp::Ordering::Equal)
735 });
736 merged.truncate(limit);
737 Ok(merged)
738 }
739
740 fn active_room(&self) -> Option<String> {
743 let db = self.db.lock().unwrap();
744 db.query_row(
745 "SELECT cm.room, SUM(fh.heat) as total
746 FROM file_heat fh
747 JOIN chunks_meta cm ON cm.path = fh.path
748 GROUP BY cm.room
749 ORDER BY total DESC
750 LIMIT 1",
751 [],
752 |row| row.get::<_, String>(0),
753 )
754 .ok()
755 }
756
757 pub fn index_project(&mut self) -> usize {
765 let root = crate::tools::file_ops::workspace_root();
766 let mut count = 0usize;
767
768 const INDEXABLE: &[&str] = &[
769 "rs", "toml", "md", "json", "ts", "tsx", "js", "py", "go", "c", "cpp", "h", "yaml",
770 "yml", "txt",
771 ];
772 const SKIP_DIRS: &[&str] = &["target", ".git", "node_modules", ".hematite"];
773
774 static INDEXABLE_SET: std::sync::OnceLock<std::collections::HashSet<&'static str>> =
777 std::sync::OnceLock::new();
778 static SKIP_DIRS_SET: std::sync::OnceLock<std::collections::HashSet<&'static str>> =
779 std::sync::OnceLock::new();
780 let indexable = INDEXABLE_SET.get_or_init(|| INDEXABLE.iter().copied().collect());
781 let skip_dirs = SKIP_DIRS_SET.get_or_init(|| SKIP_DIRS.iter().copied().collect());
782
783 for entry in walkdir::WalkDir::new(&root)
784 .follow_links(false)
785 .into_iter()
786 .filter_entry(|e| {
787 if e.file_type().is_dir() {
788 let name = e.file_name().to_string_lossy();
789 return !skip_dirs.contains(name.as_ref());
790 }
791 true
792 })
793 .filter_map(|e| e.ok())
794 .filter(|e| e.file_type().is_file())
795 {
796 let path = entry.path();
797 let ext = path.extension().and_then(|e| e.to_str()).unwrap_or("");
798 if !indexable.contains(ext) {
799 continue;
800 }
801
802 let Ok(meta) = std::fs::metadata(path) else {
803 continue;
804 };
805 if meta.len() > 512_000 {
806 continue;
807 }
808
809 let mtime = meta
810 .modified()
811 .map(|t| {
812 t.duration_since(std::time::UNIX_EPOCH)
813 .unwrap_or_default()
814 .as_secs() as i64
815 })
816 .unwrap_or(0);
817
818 let rel = path.strip_prefix(&root).unwrap_or(path);
819 let rel_str = rel.to_string_lossy().replace('\\', "/");
820
821 if let Ok(content) = std::fs::read_to_string(path) {
822 match self.index_document(&rel_str, mtime, &content) {
823 Ok(new_chunks) if !new_chunks.is_empty() => {
824 count += 1;
825 }
826 Ok(_) => {}
827 Err(_) => {}
828 }
829 }
830 }
831
832 count += self.index_workspace_artifacts(&crate::tools::file_ops::hematite_dir());
833
834 count
835 }
836
837 pub fn index_workspace_artifacts(&mut self, workspace_root: &std::path::Path) -> usize {
842 let mut count = self.index_docs_folder(workspace_root);
843 count += self.index_recent_session_reports(workspace_root);
844 count += self.index_imported_session_exports(workspace_root);
845 self.backfill_missing_embeddings();
846 count
847 }
848
849 fn index_docs_folder(&mut self, workspace_root: &std::path::Path) -> usize {
854 let docs_dir = workspace_root.join(".hematite").join("docs");
855 const DOCS_INDEXABLE: &[&str] = &["pdf", "md", "txt", "markdown"];
856 let mut count = 0usize;
857 let mut desired_paths = HashSet::new();
858
859 if docs_dir.exists() {
860 for entry in walkdir::WalkDir::new(&docs_dir)
861 .max_depth(3)
862 .follow_links(false)
863 .into_iter()
864 .filter_map(|e| e.ok())
865 .filter(|e| e.file_type().is_file())
866 {
867 let path = entry.path();
868 let ext = path
869 .extension()
870 .and_then(|e| e.to_str())
871 .unwrap_or("")
872 .to_lowercase();
873 if !DOCS_INDEXABLE.contains(&ext.as_str()) {
874 continue;
875 }
876
877 let Ok(meta) = std::fs::metadata(path) else {
878 continue;
879 };
880 if meta.len() > 50_000_000 {
881 continue;
882 }
883
884 let mtime = meta
885 .modified()
886 .map(|t| {
887 t.duration_since(std::time::UNIX_EPOCH)
888 .unwrap_or_default()
889 .as_secs() as i64
890 })
891 .unwrap_or(0);
892
893 let rel = path.strip_prefix(workspace_root).unwrap_or(path);
894 let rel_str = rel.to_string_lossy().replace('\\', "/");
895 desired_paths.insert(rel_str.clone());
896
897 let content = if ext == "pdf" {
898 extract_pdf_text(path).ok().flatten()
899 } else {
900 std::fs::read_to_string(path).ok()
901 };
902
903 if let Some(text) = content {
904 if text.trim().is_empty() {
905 continue;
906 }
907 match self.index_document(&rel_str, mtime, &text) {
908 Ok(new_chunks) if !new_chunks.is_empty() => {
909 count += 1;
910 }
911 Ok(_) => {}
912 Err(_) => {}
913 }
914 }
915 }
916 }
917
918 self.prune_indexed_prefix(".hematite/docs/", &desired_paths);
919 count
920 }
921
922 pub fn index_recent_session_reports(&mut self, workspace_root: &std::path::Path) -> usize {
925 let reports_dir = workspace_root.join(".hematite").join("reports");
926 let mut count = 0usize;
927 let mut desired_paths = HashSet::new();
928
929 if reports_dir.exists() {
930 let mut reports: Vec<std::path::PathBuf> = std::fs::read_dir(&reports_dir)
931 .ok()
932 .into_iter()
933 .flat_map(|entries| entries.filter_map(|entry| entry.ok()))
934 .map(|entry| entry.path())
935 .filter(|path| {
936 path.is_file()
937 && path.extension().and_then(|ext| ext.to_str()) == Some("json")
938 && path
939 .file_stem()
940 .and_then(|stem| stem.to_str())
941 .map(|stem| stem.starts_with("session_"))
942 .unwrap_or(false)
943 })
944 .collect();
945
946 reports.sort_by(|a, b| {
947 let a_name = a
948 .file_name()
949 .and_then(|name| name.to_str())
950 .unwrap_or_default();
951 let b_name = b
952 .file_name()
953 .and_then(|name| name.to_str())
954 .unwrap_or_default();
955 b_name.cmp(a_name)
956 });
957 reports.truncate(Self::SESSION_REPORT_LIMIT);
958
959 for report_path in reports {
960 let Ok(meta) = std::fs::metadata(&report_path) else {
961 continue;
962 };
963 let mtime = meta
964 .modified()
965 .map(|t| {
966 t.duration_since(std::time::UNIX_EPOCH)
967 .unwrap_or_default()
968 .as_secs() as i64
969 })
970 .unwrap_or(0);
971
972 for exchange in load_session_exchanges(&report_path, mtime) {
973 desired_paths.insert(exchange.path.clone());
974 let mtype = detect_memory_type(&exchange.content);
975 match self.index_chunks_with_room_and_type(
976 &exchange.path,
977 exchange.last_modified,
978 "session",
979 mtype,
980 std::slice::from_ref(&exchange.content),
981 ) {
982 Ok(new_chunks) if !new_chunks.is_empty() => {
983 count += 1;
984 }
985 Ok(_) => {}
986 Err(_) => {}
987 }
988 }
989 }
990 }
991
992 self.prune_indexed_prefix("session/", &desired_paths);
993 count
994 }
995
996 pub fn index_imported_session_exports(&mut self, workspace_root: &std::path::Path) -> usize {
1001 let imports_dir = workspace_root.join(".hematite").join("imports");
1002 let mut count = 0usize;
1003 let mut desired_paths = HashSet::new();
1004
1005 if imports_dir.exists() {
1006 let mut imports: Vec<(std::path::PathBuf, i64)> = walkdir::WalkDir::new(&imports_dir)
1007 .max_depth(4)
1008 .follow_links(false)
1009 .into_iter()
1010 .filter_map(|entry| entry.ok())
1011 .filter(|entry| entry.file_type().is_file())
1012 .filter_map(|entry| {
1013 let path = entry.into_path();
1014 let ext = path
1015 .extension()
1016 .and_then(|ext| ext.to_str())
1017 .unwrap_or("")
1018 .to_ascii_lowercase();
1019 if !matches!(ext.as_str(), "json" | "jsonl" | "md" | "txt") {
1020 return None;
1021 }
1022 let meta = std::fs::metadata(&path).ok()?;
1023 if meta.len() > Self::IMPORT_MAX_BYTES {
1024 return None;
1025 }
1026 let mtime = meta
1027 .modified()
1028 .map(|t| {
1029 t.duration_since(std::time::UNIX_EPOCH)
1030 .unwrap_or_default()
1031 .as_secs() as i64
1032 })
1033 .unwrap_or(0);
1034 Some((path, mtime))
1035 })
1036 .collect();
1037
1038 imports.sort_by(|(a_path, a_mtime), (b_path, b_mtime)| {
1039 b_mtime
1040 .cmp(a_mtime)
1041 .then_with(|| a_path.to_string_lossy().cmp(&b_path.to_string_lossy()))
1042 });
1043 imports.truncate(Self::IMPORT_FILE_LIMIT);
1044
1045 for (import_path, mtime) in imports {
1046 for exchange in load_imported_session_exchanges(&import_path, &imports_dir, mtime) {
1047 desired_paths.insert(exchange.path.clone());
1048 let mtype = detect_memory_type(&exchange.content);
1049 match self.index_chunks_with_room_and_type(
1050 &exchange.path,
1051 exchange.last_modified,
1052 "session",
1053 mtype,
1054 std::slice::from_ref(&exchange.content),
1055 ) {
1056 Ok(new_chunks) if !new_chunks.is_empty() => {
1057 count += 1;
1058 }
1059 Ok(_) => {}
1060 Err(_) => {}
1061 }
1062 }
1063 }
1064 }
1065
1066 self.prune_indexed_prefix("session/imports/", &desired_paths);
1067 count
1068 }
1069
1070 fn backfill_missing_embeddings(&self) {
1075 let (fts_count, vec_count) = {
1077 let db = self.db.lock().unwrap();
1078 let fts: i64 = db
1079 .query_row("SELECT COUNT(*) FROM chunks_fts", [], |r| r.get(0))
1080 .unwrap_or(0);
1081 let vec: i64 = db
1082 .query_row("SELECT COUNT(*) FROM chunks_vec", [], |r| r.get(0))
1083 .unwrap_or(0);
1084 (fts, vec)
1085 };
1086 if fts_count == 0 || fts_count == vec_count {
1087 return;
1088 }
1089
1090 let missing: Vec<(String, i64, String)> = {
1093 let db = self.db.lock().unwrap();
1094 let mut stmt = db
1095 .prepare(
1096 "SELECT f.path, (f.rowid - 1) AS chunk_idx, f.content
1097 FROM chunks_fts f
1098 LEFT JOIN chunks_vec v ON f.path = v.path AND (f.rowid - 1) = v.chunk_idx
1099 WHERE v.path IS NULL
1100 ORDER BY CASE
1101 WHEN f.path LIKE '%.rs' THEN 0
1102 WHEN f.path LIKE '%.toml' THEN 1
1103 WHEN f.path LIKE '%.json' THEN 2
1104 ELSE 3
1105 END, f.path
1106 LIMIT 20",
1107 )
1108 .unwrap();
1109 stmt.query_map([], |r| {
1110 Ok((
1111 r.get::<_, String>(0)?,
1112 r.get::<_, i64>(1)?,
1113 r.get::<_, String>(2)?,
1114 ))
1115 })
1116 .unwrap()
1117 .filter_map(|r| r.ok())
1118 .collect()
1119 };
1120
1121 let Some(embed_model) = self.current_embed_model() else {
1122 return;
1123 };
1124
1125 for (path, idx, content) in missing {
1126 if let Some(vec) = embed_text_blocking(&content, &self.base_url, &embed_model) {
1127 let blob = floats_to_blob(&vec);
1128 let db = self.db.lock().unwrap();
1129 let _ = db.execute(
1130 "INSERT OR REPLACE INTO chunks_vec (path, chunk_idx, embedding) VALUES (?1, ?2, ?3)",
1131 params![path, idx, blob],
1132 );
1133 } else {
1134 break;
1136 }
1137 }
1138 }
1139
1140 pub fn file_count(&self) -> usize {
1143 let db = self.db.lock().unwrap();
1144 db.query_row(
1145 "SELECT COUNT(*) FROM chunks_meta WHERE path NOT LIKE 'session/%'",
1146 [],
1147 |r| r.get::<_, i64>(0),
1148 )
1149 .unwrap_or(0) as usize
1150 }
1151
1152 pub fn embedded_chunk_count(&self) -> usize {
1155 let db = self.db.lock().unwrap();
1156 db.query_row(
1157 "SELECT COUNT(*) FROM chunks_vec WHERE path NOT LIKE 'session/%'",
1158 [],
1159 |r| r.get::<_, i64>(0),
1160 )
1161 .unwrap_or(0) as usize
1162 }
1163
1164 pub fn has_any_embeddings(&self) -> bool {
1166 let db = self.db.lock().unwrap();
1167 db.query_row("SELECT EXISTS(SELECT 1 FROM chunks_vec LIMIT 1)", [], |r| {
1168 r.get::<_, i64>(0)
1169 })
1170 .unwrap_or(0)
1171 != 0
1172 }
1173
1174 pub fn reset(&self) {
1177 let db = self.db.lock().unwrap();
1178 let _ = db.execute_batch(
1179 "DELETE FROM chunks_fts;
1180 DELETE FROM chunks_vec;
1181 DELETE FROM chunks_meta;",
1182 );
1183 }
1184
1185 pub fn inspect_snapshot(&self, hot_limit: usize) -> VeinInspectionSnapshot {
1188 let db = self.db.lock().unwrap();
1189 let indexed_source_files = db
1190 .query_row(
1191 "SELECT COUNT(*) FROM chunks_meta
1192 WHERE path NOT LIKE 'session/%'
1193 AND path NOT LIKE '.hematite/docs/%'",
1194 [],
1195 |r| r.get::<_, i64>(0),
1196 )
1197 .unwrap_or(0) as usize;
1198 let indexed_docs = db
1199 .query_row(
1200 "SELECT COUNT(*) FROM chunks_meta WHERE path LIKE '.hematite/docs/%'",
1201 [],
1202 |r| r.get::<_, i64>(0),
1203 )
1204 .unwrap_or(0) as usize;
1205 let indexed_session_exchanges = db
1206 .query_row(
1207 "SELECT COUNT(*) FROM chunks_meta WHERE path LIKE 'session/%'",
1208 [],
1209 |r| r.get::<_, i64>(0),
1210 )
1211 .unwrap_or(0) as usize;
1212 let embedded_source_doc_chunks = db
1213 .query_row(
1214 "SELECT COUNT(*) FROM chunks_vec WHERE path NOT LIKE 'session/%'",
1215 [],
1216 |r| r.get::<_, i64>(0),
1217 )
1218 .unwrap_or(0) as usize;
1219 let has_any_embeddings = db
1220 .query_row("SELECT EXISTS(SELECT 1 FROM chunks_vec LIMIT 1)", [], |r| {
1221 r.get::<_, i64>(0)
1222 })
1223 .unwrap_or(0)
1224 != 0;
1225 drop(db);
1226
1227 let hot_files = self
1228 .hot_files(hot_limit.max(1))
1229 .into_iter()
1230 .map(|(path, heat, last_modified, room)| VeinHotFile {
1231 path,
1232 heat,
1233 last_modified,
1234 room,
1235 })
1236 .collect::<Vec<_>>();
1237
1238 VeinInspectionSnapshot {
1239 indexed_source_files,
1240 indexed_docs,
1241 indexed_session_exchanges,
1242 embedded_source_doc_chunks,
1243 has_any_embeddings,
1244 active_room: self.active_room(),
1245 l1_ready: !hot_files.is_empty(),
1246 hot_files,
1247 }
1248 }
1249
1250 pub fn bump_heat(&self, path: &str) {
1256 if path.is_empty() {
1257 return;
1258 }
1259 let now = std::time::SystemTime::now()
1260 .duration_since(std::time::UNIX_EPOCH)
1261 .unwrap_or_default()
1262 .as_secs() as i64;
1263 let db = self.db.lock().unwrap();
1264 let _ = db.execute(
1265 "INSERT INTO file_heat (path, heat, last_edit) VALUES (?1, 1, ?2)
1266 ON CONFLICT(path) DO UPDATE SET heat = heat + 1, last_edit = ?2",
1267 params![path, now],
1268 );
1269 }
1270
1271 fn hot_files(&self, n: usize) -> Vec<(String, i64, i64, String)> {
1275 let db = self.db.lock().unwrap();
1276 let mut stmt = match db.prepare(
1277 "SELECT fh.path, fh.heat, cm.last_modified, cm.room
1278 FROM file_heat fh
1279 JOIN chunks_meta cm ON cm.path = fh.path
1280 ORDER BY fh.heat DESC, cm.last_modified DESC
1281 LIMIT ?1",
1282 ) {
1283 Ok(s) => s,
1284 Err(_) => return vec![],
1285 };
1286 stmt.query_map(params![n as i64], |row| {
1287 Ok((
1288 row.get::<_, String>(0)?,
1289 row.get::<_, i64>(1)?,
1290 row.get::<_, i64>(2)?,
1291 row.get::<_, String>(3)?,
1292 ))
1293 })
1294 .map(|rows| rows.filter_map(|r| r.ok()).collect())
1295 .unwrap_or_default()
1296 }
1297
1298 pub fn hot_file_paths(&self, n: usize) -> Vec<String> {
1301 self.hot_files(n)
1302 .into_iter()
1303 .map(|(path, _, _, _)| path)
1304 .collect()
1305 }
1306
1307 pub fn hot_files_weighted(&self, n: usize) -> Vec<(String, f64)> {
1311 let files = self.hot_files(n);
1312 if files.is_empty() {
1313 return vec![];
1314 }
1315 let max_heat = files
1316 .iter()
1317 .map(|(_, h, _, _)| *h)
1318 .max()
1319 .unwrap_or(1)
1320 .max(1) as f64;
1321 files
1322 .into_iter()
1323 .map(|(path, heat, _, _)| {
1324 let weight = (heat as f64) / max_heat;
1325 (path, weight)
1326 })
1327 .collect()
1328 }
1329
1330 pub fn l1_context(&self) -> Option<String> {
1335 let files = self.hot_files(8);
1336 if files.is_empty() {
1337 return None;
1338 }
1339 let now = std::time::SystemTime::now()
1340 .duration_since(std::time::UNIX_EPOCH)
1341 .unwrap_or_default()
1342 .as_secs() as i64;
1343
1344 let mut by_room: std::collections::BTreeMap<String, Vec<(String, i64, i64)>> =
1346 std::collections::BTreeMap::new();
1347 for (path, heat, mtime, room) in &files {
1348 by_room
1349 .entry(room.clone())
1350 .or_default()
1351 .push((path.clone(), *heat, *mtime));
1352 }
1353
1354 let mut out = String::from("# Hot Files (most edited — grouped by subsystem)\n");
1355 for (room, entries) in &by_room {
1356 out.push_str(&format!("[{}]\n", room));
1357 for (path, heat, mtime) in entries {
1358 let age_secs = now - mtime;
1359 let age = if age_secs < 3600 {
1360 "just now".to_string()
1361 } else if age_secs < 86400 {
1362 format!("{}h ago", age_secs / 3600)
1363 } else {
1364 format!("{}d ago", age_secs / 86400)
1365 };
1366 out.push_str(&format!(
1367 " - {} [{} edit{}, {}]\n",
1368 path,
1369 heat,
1370 if *heat == 1 { "" } else { "s" },
1371 age
1372 ));
1373 }
1374 }
1375 Some(out)
1376 }
1377
1378 fn prune_indexed_prefix(&self, prefix: &str, desired_paths: &HashSet<String>) {
1379 let pattern = format!("{}%", prefix);
1380 let existing_paths: Vec<String> = {
1381 let db = self.db.lock().unwrap();
1382 let mut stmt = match db.prepare("SELECT path FROM chunks_meta WHERE path LIKE ?1") {
1383 Ok(stmt) => stmt,
1384 Err(_) => return,
1385 };
1386 stmt.query_map(params![pattern], |row| row.get::<_, String>(0))
1387 .map(|rows| rows.filter_map(|row| row.ok()).collect())
1388 .unwrap_or_default()
1389 };
1390
1391 if existing_paths.is_empty() {
1392 return;
1393 }
1394
1395 let db = self.db.lock().unwrap();
1396 for path in existing_paths {
1397 if desired_paths.contains(&path) {
1398 continue;
1399 }
1400 let _ = db.execute("DELETE FROM chunks_fts WHERE path = ?1", params![path]);
1401 let _ = db.execute("DELETE FROM chunks_vec WHERE path = ?1", params![path]);
1402 let _ = db.execute("DELETE FROM chunks_meta WHERE path = ?1", params![path]);
1403 }
1404 }
1405}
1406
1407impl QuerySignals {
1408 fn from_query(query: &str) -> Self {
1409 let lower = query.to_ascii_lowercase();
1410 let historical_memory_hint = [
1411 "remember",
1412 "earlier",
1413 "previous",
1414 "last time",
1415 "what did we decide",
1416 "why did we decide",
1417 "what did we say",
1418 "why did we change",
1419 ]
1420 .iter()
1421 .any(|needle| lower.contains(needle));
1422
1423 let query_memory_type = if lower.contains("decide")
1425 || lower.contains("decision")
1426 || lower.contains("we agreed")
1427 || lower.contains("we chose")
1428 {
1429 Some("decision")
1430 } else if lower.contains("bug")
1431 || lower.contains("error")
1432 || lower.contains("issue")
1433 || lower.contains("problem")
1434 || lower.contains("fix")
1435 || lower.contains("broken")
1436 {
1437 Some("problem")
1438 } else if lower.contains("shipped")
1439 || lower.contains("milestone")
1440 || lower.contains("finished")
1441 || lower.contains("working now")
1442 {
1443 Some("milestone")
1444 } else if lower.contains("prefer")
1445 || lower.contains("my preference")
1446 || lower.contains("i like")
1447 || lower.contains("i want")
1448 {
1449 Some("preference")
1450 } else {
1451 None
1452 };
1453
1454 Self {
1455 exact_phrases: extract_exact_phrases(query),
1456 standout_terms: extract_standout_terms(query),
1457 historical_memory_hint,
1458 temporal_reference: extract_temporal_reference(query),
1459 query_memory_type,
1460 }
1461 }
1462}
1463
1464fn merge_scored_result(
1465 merged_by_path: &mut HashMap<String, SearchResult>,
1466 candidate: SearchResult,
1467) {
1468 match merged_by_path.get_mut(&candidate.path) {
1469 Some(existing) if candidate.score > existing.score => *existing = candidate,
1470 Some(_) => {}
1471 None => {
1472 merged_by_path.insert(candidate.path.clone(), candidate);
1473 }
1474 }
1475}
1476
1477fn reranked_score(
1478 signals: &QuerySignals,
1479 active_room: Option<&str>,
1480 result: &SearchResult,
1481 is_semantic: bool,
1482) -> f32 {
1483 let base = if is_semantic {
1484 1.0 + result.score.clamp(0.0, 1.0)
1485 } else {
1486 (result.score / 10.0).clamp(0.0, 1.0)
1487 };
1488 base + room_bias(active_room, result)
1489 + retrieval_signal_boost(signals, result)
1490 + temporal_memory_boost(signals, result)
1491}
1492
1493fn room_bias(active_room: Option<&str>, result: &SearchResult) -> f32 {
1494 if active_room == Some(result.room.as_str()) {
1495 0.15
1496 } else {
1497 0.0
1498 }
1499}
1500
1501fn retrieval_signal_boost(signals: &QuerySignals, result: &SearchResult) -> f32 {
1502 let mut boost = 0.0f32;
1503
1504 let lower_path = result.path.to_ascii_lowercase();
1507 let lower_content = result.content.to_ascii_lowercase();
1508 let haystack = format!("{}\n{}", lower_path, lower_content);
1509
1510 let phrase_matches = signals
1511 .exact_phrases
1512 .iter()
1513 .filter(|phrase| haystack.contains(phrase.as_str()))
1514 .count();
1515 if phrase_matches > 0 {
1516 boost += 0.35 + ((phrase_matches.saturating_sub(1)) as f32 * 0.1);
1517 }
1518
1519 let mut standout_matches = 0;
1520 for term in &signals.standout_terms {
1521 if lower_path.contains(term.as_str()) {
1522 boost += 0.40;
1523 standout_matches += 1;
1524 } else if lower_content.contains(term.as_str()) {
1525 boost += 0.12;
1526 standout_matches += 1;
1527 }
1528 if standout_matches >= 3 {
1529 break;
1530 }
1531 }
1532
1533 if signals.historical_memory_hint && result.room == "session" {
1534 boost += 0.45;
1535 }
1536
1537 if let Some(qtype) = signals.query_memory_type {
1539 if !result.memory_type.is_empty() && result.memory_type == qtype {
1540 boost += 0.35;
1541 }
1542 }
1543
1544 boost
1545}
1546
1547fn temporal_memory_boost(signals: &QuerySignals, result: &SearchResult) -> f32 {
1548 if result.room != "session" {
1549 return 0.0;
1550 }
1551 let Some(reference) = signals.temporal_reference else {
1552 return 0.0;
1553 };
1554 let Some(memory_ts) = session_memory_timestamp(result) else {
1555 return 0.0;
1556 };
1557
1558 let span = reference.window_secs.max(86_400);
1559 let full_fade = span.saturating_mul(8);
1560 if full_fade <= 0 {
1561 return 0.0;
1562 }
1563
1564 let distance = (memory_ts - reference.target_ts).abs();
1565 let closeness = 1.0 - (distance as f32 / full_fade as f32).min(1.0);
1566 if closeness <= 0.0 {
1567 0.0
1568 } else {
1569 0.22 * closeness
1570 }
1571}
1572
/// Collects the phrases a query wraps in `"`, `'` or backticks.
///
/// A phrase runs from an opening quote to the next identical quote character (or to the
/// end of the query if it is never closed). Phrases are trimmed and ASCII-lowercased;
/// ones shorter than three bytes and duplicates are dropped.
fn extract_exact_phrases(query: &str) -> Vec<String> {
    let mut found: Vec<String> = Vec::new();
    let mut rest = query.chars();

    while let Some(opener) = rest.next() {
        if opener != '"' && opener != '\'' && opener != '`' {
            continue;
        }
        // `take_while` also consumes the closing quote, so scanning resumes after it.
        let quoted: String = rest.by_ref().take_while(|ch| *ch != opener).collect();
        let candidate = quoted.trim().to_ascii_lowercase();
        if candidate.len() >= 3 && !found.contains(&candidate) {
            found.push(candidate);
        }
    }

    found
}
1604
/// Picks out distinctive tokens from a query, lowercased and de-duplicated.
///
/// Tokens are runs of ASCII alphanumerics plus `_ - . / :`. A token is kept when it is at
/// least four bytes long, is not a stopword, and either contains a digit, an uppercase
/// letter or one of the listed punctuation characters, or is nine or more bytes long.
fn extract_standout_terms(query: &str) -> Vec<String> {
    const STOPWORDS: &[&str] = &[
        "about", "after", "before", "change", "changed", "decide", "decided", "does",
        "earlier", "flow", "from", "have", "into", "just", "last", "local", "make", "more",
        "remember", "should", "that", "their", "there", "these", "they", "this", "those",
        "what", "when", "where", "which", "why", "with", "work",
    ];

    fn is_term_punct(ch: char) -> bool {
        matches!(ch, '_' | '-' | '.' | '/' | ':')
    }

    fn is_term_char(ch: char) -> bool {
        ch.is_ascii_alphanumeric() || is_term_punct(ch)
    }

    let mut terms: Vec<String> = Vec::new();
    for raw in query.split(|ch: char| !is_term_char(ch)).map(str::trim) {
        if raw.len() < 4 {
            continue;
        }
        let folded = raw.to_ascii_lowercase();
        if STOPWORDS.contains(&folded.as_str()) {
            continue;
        }

        let distinctive = raw.len() >= 9
            || raw
                .chars()
                .any(|ch| ch.is_ascii_digit() || ch.is_ascii_uppercase() || is_term_punct(ch));

        if distinctive && !terms.contains(&folded) {
            terms.push(folded);
        }
    }

    terms
}
1646
1647fn extract_temporal_reference(query: &str) -> Option<TemporalReference> {
1648 if let Some(ts) = extract_iso_date_from_query(query) {
1649 return Some(TemporalReference {
1650 target_ts: ts,
1651 window_secs: 86_400,
1652 });
1653 }
1654
1655 let now = current_unix_timestamp();
1656 let lower = query.to_ascii_lowercase();
1657 if lower.contains("yesterday") {
1658 Some(TemporalReference {
1659 target_ts: now.saturating_sub(86_400),
1660 window_secs: 86_400,
1661 })
1662 } else if lower.contains("today") || lower.contains("earlier today") {
1663 Some(TemporalReference {
1664 target_ts: now,
1665 window_secs: 86_400,
1666 })
1667 } else if lower.contains("last week") {
1668 Some(TemporalReference {
1669 target_ts: now.saturating_sub(7 * 86_400),
1670 window_secs: 7 * 86_400,
1671 })
1672 } else if lower.contains("last month") {
1673 Some(TemporalReference {
1674 target_ts: now.saturating_sub(30 * 86_400),
1675 window_secs: 30 * 86_400,
1676 })
1677 } else {
1678 None
1679 }
1680}
1681
1682fn extract_iso_date_from_query(query: &str) -> Option<i64> {
1683 query
1684 .split(|ch: char| !(ch.is_ascii_digit() || ch == '-'))
1685 .find_map(parse_iso_date_token)
1686}
1687
1688fn parse_iso_date_token(token: &str) -> Option<i64> {
1689 if token.len() != 10 {
1690 return None;
1691 }
1692 let bytes = token.as_bytes();
1693 if bytes.get(4) != Some(&b'-') || bytes.get(7) != Some(&b'-') {
1694 return None;
1695 }
1696
1697 let year = token.get(0..4)?.parse::<i32>().ok()?;
1698 let month = token.get(5..7)?.parse::<u32>().ok()?;
1699 let day = token.get(8..10)?.parse::<u32>().ok()?;
1700 if !(1..=12).contains(&month) || !(1..=31).contains(&day) {
1701 return None;
1702 }
1703
1704 Some(days_from_civil(year, month, day).saturating_mul(86_400))
1705}
1706
/// Converts a proleptic Gregorian calendar date to days since 1970-01-01.
///
/// The year is treated as starting in March so the leap day falls at the end of it, and
/// days are counted within 400-year eras of 146 097 days each.
fn days_from_civil(year: i32, month: u32, day: u32) -> i64 {
    // January and February belong to the previous March-based year.
    let (shifted_year, shifted_month) = if month <= 2 {
        (year - 1, month as i32 + 9)
    } else {
        (year, month as i32 - 3)
    };

    // Floor division by 400 that also works for negative years.
    let era_numerator = if shifted_year >= 0 {
        shifted_year
    } else {
        shifted_year - 399
    };
    let era = era_numerator / 400;

    let year_of_era = shifted_year - era * 400;
    let day_of_year = (153 * shifted_month + 2) / 5 + day as i32 - 1;
    let day_of_era = year_of_era * 365 + year_of_era / 4 - year_of_era / 100 + day_of_year;

    i64::from(era) * 146_097 + i64::from(day_of_era) - 719_468
}
1716
/// Returns the current wall-clock time as whole seconds since the Unix epoch,
/// or 0 if the system clock reads earlier than the epoch.
fn current_unix_timestamp() -> i64 {
    let since_epoch = std::time::SystemTime::now().duration_since(std::time::UNIX_EPOCH);
    match since_epoch {
        Ok(elapsed) => elapsed.as_secs() as i64,
        Err(_) => 0,
    }
}
1723
1724fn session_memory_timestamp(result: &SearchResult) -> Option<i64> {
1725 extract_session_path_timestamp(&result.path).or_else(|| {
1726 if result.last_modified > 0 {
1727 Some(result.last_modified)
1728 } else {
1729 None
1730 }
1731 })
1732}
1733
1734fn extract_session_path_timestamp(path: &str) -> Option<i64> {
1735 let normalized = path.replace('\\', "/");
1736 let mut parts = normalized.split('/');
1737 if parts.next()? != "session" {
1738 return None;
1739 }
1740 parse_iso_date_token(parts.next()?)
1741}
1742
1743fn session_speaker_kind(speaker: &str) -> SessionSpeakerKind {
1744 let normalized = speaker.trim().to_ascii_lowercase();
1745 match normalized.as_str() {
1746 "you" | "user" => SessionSpeakerKind::User,
1747 "" | "system" | "tool" => SessionSpeakerKind::Ignore,
1748 _ => SessionSpeakerKind::Assistant,
1749 }
1750}
1751
1752fn load_session_exchanges(report_path: &Path, last_modified: i64) -> Vec<SessionExchange> {
1753 let Ok(raw) = std::fs::read_to_string(report_path) else {
1754 return Vec::new();
1755 };
1756 let Ok(report) = serde_json::from_str::<SessionReport>(&raw) else {
1757 return Vec::new();
1758 };
1759
1760 let session_key = report_path
1761 .file_stem()
1762 .and_then(|stem| stem.to_str())
1763 .and_then(|stem| stem.strip_prefix("session_").or(Some(stem)))
1764 .unwrap_or("unknown-session")
1765 .to_string();
1766 let session_date = report
1767 .session_start
1768 .split('_')
1769 .next()
1770 .filter(|date| !date.is_empty())
1771 .unwrap_or_else(|| session_key.split('_').next().unwrap_or("unknown-date"))
1772 .to_string();
1773
1774 let mut exchanges = Vec::new();
1775 let mut pending_user: Option<String> = None;
1776 let mut turn_index = 0usize;
1777
1778 for entry in report.transcript {
1779 match session_speaker_kind(&entry.speaker) {
1780 SessionSpeakerKind::User => {
1781 let text = entry.text.trim();
1782 if !text.is_empty() {
1783 pending_user = Some(text.to_string());
1784 }
1785 }
1786 SessionSpeakerKind::Assistant => {
1787 let text = entry.text.trim();
1788 if text.is_empty() {
1789 continue;
1790 }
1791 let Some(user_text) = pending_user.take() else {
1792 continue;
1793 };
1794 turn_index += 1;
1795 exchanges.push(SessionExchange {
1796 path: format!(
1797 "session/{}/{}/turn-{}",
1798 session_date, session_key, turn_index
1799 ),
1800 last_modified,
1801 content: format!(
1802 "Earlier session exchange\nUser:\n{}\n\nAssistant:\n{}",
1803 user_text, text
1804 ),
1805 });
1806 }
1807 SessionSpeakerKind::Ignore => {}
1808 }
1809 }
1810
1811 if exchanges.len() > Vein::SESSION_TURN_LIMIT {
1812 let keep_from = exchanges.len() - Vein::SESSION_TURN_LIMIT;
1813 exchanges = exchanges.into_iter().skip(keep_from).collect();
1814 }
1815
1816 exchanges
1817}
1818
1819fn load_imported_session_exchanges(
1820 import_path: &Path,
1821 imports_root: &Path,
1822 last_modified: i64,
1823) -> Vec<SessionExchange> {
1824 let Ok(raw) = std::fs::read_to_string(import_path) else {
1825 return Vec::new();
1826 };
1827
1828 let messages = normalize_import_messages(&raw, import_path);
1829 if messages.is_empty() {
1830 return Vec::new();
1831 }
1832
1833 let rel = import_path
1834 .strip_prefix(imports_root)
1835 .unwrap_or(import_path);
1836 let rel_slug = slugify_import_path(rel);
1837 let mut exchanges = Vec::new();
1838 let mut pending_user: Option<String> = None;
1839 let mut turn_index = 0usize;
1840
1841 for (role, text) in messages {
1842 let cleaned = text.trim();
1843 if cleaned.is_empty() {
1844 continue;
1845 }
1846 match role.as_str() {
1847 "user" => pending_user = Some(cleaned.to_string()),
1848 "assistant" => {
1849 let Some(user_text) = pending_user.take() else {
1850 continue;
1851 };
1852 turn_index += 1;
1853 exchanges.push(SessionExchange {
1854 path: format!("session/imports/{}/turn-{}", rel_slug, turn_index),
1855 last_modified,
1856 content: format!(
1857 "Imported session exchange\nSource: .hematite/imports/{}\n\nUser:\n{}\n\nAssistant:\n{}",
1858 rel.to_string_lossy().replace('\\', "/"),
1859 user_text,
1860 cleaned
1861 ),
1862 });
1863 }
1864 _ => {}
1865 }
1866 }
1867
1868 if exchanges.len() > Vein::SESSION_TURN_LIMIT {
1869 let keep_from = exchanges.len() - Vein::SESSION_TURN_LIMIT;
1870 exchanges = exchanges.into_iter().skip(keep_from).collect();
1871 }
1872
1873 exchanges
1874}
1875
1876fn normalize_import_messages(raw: &str, import_path: &Path) -> Vec<(String, String)> {
1877 if raw.trim().is_empty() {
1878 return Vec::new();
1879 }
1880
1881 if let Some(messages) = parse_marker_transcript(raw) {
1882 return messages;
1883 }
1884
1885 let ext = import_path
1886 .extension()
1887 .and_then(|ext| ext.to_str())
1888 .unwrap_or("")
1889 .to_ascii_lowercase();
1890
1891 if matches!(ext.as_str(), "json" | "jsonl")
1892 || matches!(raw.trim().chars().next(), Some('{') | Some('['))
1893 {
1894 if let Some(messages) = parse_jsonl_messages(raw) {
1895 if !messages.is_empty() {
1896 return messages;
1897 }
1898 }
1899
1900 if let Ok(value) = serde_json::from_str::<Value>(raw) {
1901 if let Some(messages) = parse_session_report_messages(&value) {
1902 return messages;
1903 }
1904 if let Some(messages) = parse_simple_role_messages(&value) {
1905 return messages;
1906 }
1907 if let Some(messages) = parse_chatgpt_mapping_messages(&value) {
1908 return messages;
1909 }
1910 }
1911 }
1912
1913 Vec::new()
1914}
1915
/// Parses a transcript in which user turns are lines starting with `> `.
///
/// Requires at least two marker lines, otherwise returns `None`. The non-blank lines
/// after a marker (other than `---`) are joined with newlines as the assistant reply.
/// Text before the first marker is ignored.
fn parse_marker_transcript(raw: &str) -> Option<Vec<(String, String)>> {
    let marker_count = raw
        .lines()
        .filter(|line| line.trim_start().starts_with("> "))
        .count();
    if marker_count < 2 {
        return None;
    }

    let mut messages: Vec<(String, String)> = Vec::new();
    let mut reply_lines: Vec<String> = Vec::new();
    let mut inside_turn = false;

    for line in raw.lines() {
        if let Some(prompt) = line.trim_start().strip_prefix("> ") {
            // A new user marker closes the reply collected for the previous turn.
            if !reply_lines.is_empty() {
                messages.push(("assistant".to_string(), reply_lines.join("\n")));
                reply_lines.clear();
            }
            messages.push(("user".to_string(), prompt.trim().to_string()));
            inside_turn = true;
        } else if inside_turn {
            let content = line.trim();
            if !content.is_empty() && content != "---" {
                reply_lines.push(content.to_string());
            }
        }
    }

    if !reply_lines.is_empty() {
        messages.push(("assistant".to_string(), reply_lines.join("\n")));
    }

    if messages.is_empty() {
        None
    } else {
        Some(messages)
    }
}
1956
1957fn parse_jsonl_messages(raw: &str) -> Option<Vec<(String, String)>> {
1958 let mut messages = Vec::new();
1959 let mut has_codex_session_meta = false;
1960 let mut saw_jsonl = false;
1961
1962 for line in raw.lines() {
1963 let trimmed = line.trim();
1964 if trimmed.is_empty() {
1965 continue;
1966 }
1967 let Ok(value) = serde_json::from_str::<Value>(trimmed) else {
1968 continue;
1969 };
1970 saw_jsonl = true;
1971 let Some(object) = value.as_object() else {
1972 continue;
1973 };
1974
1975 match object.get("type").and_then(|v| v.as_str()).unwrap_or("") {
1976 "session_meta" => {
1977 has_codex_session_meta = true;
1978 }
1979 "event_msg" => {
1980 let Some(payload) = object.get("payload").and_then(|v| v.as_object()) else {
1981 continue;
1982 };
1983 let Some(text) = payload.get("message").and_then(|v| v.as_str()) else {
1984 continue;
1985 };
1986 match payload.get("type").and_then(|v| v.as_str()).unwrap_or("") {
1987 "user_message" => messages.push(("user".to_string(), text.trim().to_string())),
1988 "agent_message" => {
1989 messages.push(("assistant".to_string(), text.trim().to_string()))
1990 }
1991 _ => {}
1992 }
1993 }
1994 "human" | "user" => {
1995 if let Some(text) = extract_text_content(object.get("message").unwrap_or(&value)) {
1996 messages.push(("user".to_string(), text));
1997 }
1998 }
1999 "assistant" => {
2000 if let Some(text) = extract_text_content(object.get("message").unwrap_or(&value)) {
2001 messages.push(("assistant".to_string(), text));
2002 }
2003 }
2004 _ => {
2005 if let Some(role) = object.get("role").and_then(|v| v.as_str()) {
2006 if let Some(text) = extract_text_content(&value) {
2007 match role {
2008 "user" | "human" => messages.push(("user".to_string(), text)),
2009 "assistant" | "ai" => messages.push(("assistant".to_string(), text)),
2010 _ => {}
2011 }
2012 }
2013 }
2014 }
2015 }
2016 }
2017
2018 if !saw_jsonl {
2019 return None;
2020 }
2021
2022 if has_codex_session_meta || !messages.is_empty() {
2023 return Some(messages);
2024 }
2025
2026 None
2027}
2028
2029fn parse_session_report_messages(value: &Value) -> Option<Vec<(String, String)>> {
2030 let report = value.as_object()?;
2031 let transcript = report.get("transcript")?.as_array()?;
2032 let mut messages = Vec::new();
2033
2034 for entry in transcript {
2035 let Some(obj) = entry.as_object() else {
2036 continue;
2037 };
2038 let speaker = obj
2039 .get("speaker")
2040 .and_then(|v| v.as_str())
2041 .unwrap_or_default();
2042 let text = obj
2043 .get("text")
2044 .and_then(|v| v.as_str())
2045 .unwrap_or_default()
2046 .trim()
2047 .to_string();
2048 if text.is_empty() {
2049 continue;
2050 }
2051 match session_speaker_kind(speaker) {
2052 SessionSpeakerKind::User => messages.push(("user".to_string(), text)),
2053 SessionSpeakerKind::Assistant => messages.push(("assistant".to_string(), text)),
2054 SessionSpeakerKind::Ignore => {}
2055 }
2056 }
2057
2058 (!messages.is_empty()).then_some(messages)
2059}
2060
2061fn parse_simple_role_messages(value: &Value) -> Option<Vec<(String, String)>> {
2062 if let Some(array) = value.as_array() {
2063 let messages = collect_role_messages(array);
2064 return (!messages.is_empty()).then_some(messages);
2065 }
2066
2067 let obj = value.as_object()?;
2068 if let Some(messages_value) = obj.get("messages").or_else(|| obj.get("chat_messages")) {
2069 let array = messages_value.as_array()?;
2070 let messages = collect_role_messages(array);
2071 return (!messages.is_empty()).then_some(messages);
2072 }
2073
2074 None
2075}
2076
2077fn collect_role_messages(items: &[Value]) -> Vec<(String, String)> {
2078 let mut messages = Vec::new();
2079 for item in items {
2080 let Some(obj) = item.as_object() else {
2081 continue;
2082 };
2083 let Some(role) = obj.get("role").and_then(|v| v.as_str()) else {
2084 continue;
2085 };
2086 let Some(text) = extract_text_content(item) else {
2087 continue;
2088 };
2089 match role {
2090 "user" | "human" => messages.push(("user".to_string(), text)),
2091 "assistant" | "ai" => messages.push(("assistant".to_string(), text)),
2092 _ => {}
2093 }
2094 }
2095 messages
2096}
2097
2098fn parse_chatgpt_mapping_messages(value: &Value) -> Option<Vec<(String, String)>> {
2099 let mapping = value.get("mapping")?.as_object()?;
2100 let mut current_id = mapping.iter().find_map(|(node_id, node)| {
2101 let obj = node.as_object()?;
2102 (obj.get("parent").is_some_and(|parent| parent.is_null())).then_some(node_id.clone())
2103 })?;
2104
2105 let mut messages = Vec::new();
2106 let mut visited = std::collections::HashSet::new();
2107
2108 while visited.insert(current_id.clone()) {
2109 let Some(node) = mapping.get(¤t_id).and_then(|v| v.as_object()) else {
2110 break;
2111 };
2112
2113 if let Some(message) = node.get("message") {
2114 let role = message
2115 .get("author")
2116 .and_then(|author| author.get("role"))
2117 .and_then(|v| v.as_str())
2118 .unwrap_or("");
2119 if let Some(text) = extract_text_content(message) {
2120 match role {
2121 "user" => messages.push(("user".to_string(), text)),
2122 "assistant" => messages.push(("assistant".to_string(), text)),
2123 _ => {}
2124 }
2125 }
2126 }
2127
2128 let Some(next_id) = node
2129 .get("children")
2130 .and_then(|children| children.as_array())
2131 .and_then(|children| children.first())
2132 .and_then(|child| child.as_str())
2133 else {
2134 break;
2135 };
2136 current_id = next_id.to_string();
2137 }
2138
2139 (!messages.is_empty()).then_some(messages)
2140}
2141
2142fn extract_text_content(value: &Value) -> Option<String> {
2143 if let Some(text) = value.as_str() {
2144 let trimmed = text.trim();
2145 return (!trimmed.is_empty()).then_some(trimmed.to_string());
2146 }
2147
2148 if let Some(array) = value.as_array() {
2149 let joined = array
2150 .iter()
2151 .filter_map(extract_text_content)
2152 .filter(|part| !part.is_empty())
2153 .collect::<Vec<_>>()
2154 .join("\n");
2155 return (!joined.is_empty()).then_some(joined);
2156 }
2157
2158 let obj = value.as_object()?;
2159
2160 if let Some(content) = obj.get("content") {
2161 if let Some(text) = extract_text_content(content) {
2162 return Some(text);
2163 }
2164 }
2165
2166 if let Some(text) = obj.get("text").and_then(|v| v.as_str()) {
2167 let trimmed = text.trim();
2168 if !trimmed.is_empty() {
2169 return Some(trimmed.to_string());
2170 }
2171 }
2172
2173 if let Some(parts) = obj.get("parts").and_then(|v| v.as_array()) {
2174 let joined = parts
2175 .iter()
2176 .filter_map(|part| part.as_str().map(|s| s.trim().to_string()))
2177 .filter(|part| !part.is_empty())
2178 .collect::<Vec<_>>()
2179 .join("\n");
2180 if !joined.is_empty() {
2181 return Some(joined);
2182 }
2183 }
2184
2185 None
2186}
2187
/// Flattens a relative path into a single slug.
///
/// Backslashes count as separators. Characters other than ASCII alphanumerics, `-`, `_`
/// and separators become `_`; leading and trailing separators are dropped and each
/// remaining separator becomes `__`.
fn slugify_import_path(path: &Path) -> String {
    let lossy = path.to_string_lossy();
    let mut sanitized = String::with_capacity(lossy.len());
    for ch in lossy.chars() {
        let mapped = match ch {
            '\\' | '/' => '/',
            keep if keep.is_ascii_alphanumeric() || keep == '-' || keep == '_' => keep,
            _ => '_',
        };
        sanitized.push(mapped);
    }

    sanitized
        .trim_matches('/')
        .split('/')
        .collect::<Vec<_>>()
        .join("__")
}
2203
2204fn embed_text_blocking(text: &str, base_url: &str, embed_model: &str) -> Option<Vec<f32>> {
2214 embed_text_with_prefix(text, "search_document", base_url, embed_model)
2215}
2216
2217fn embed_query_blocking(text: &str, base_url: &str, embed_model: &str) -> Option<Vec<f32>> {
2218 embed_text_with_prefix(text, "search_query", base_url, embed_model)
2219}
2220
2221fn embed_text_with_prefix(
2222 text: &str,
2223 task: &str,
2224 base_url: &str,
2225 embed_model: &str,
2226) -> Option<Vec<f32>> {
2227 let prefixed = format!("{}: {}", task, text);
2229 let input = if prefixed.len() > 8000 {
2231 &prefixed[..8000]
2232 } else {
2233 &prefixed
2234 };
2235
2236 let client = reqwest::blocking::Client::builder()
2237 .timeout(std::time::Duration::from_secs(10))
2238 .build()
2239 .ok()?;
2240
2241 let body = serde_json::json!({
2242 "model": embed_model,
2243 "input": input
2244 });
2245
2246 let trimmed = base_url.trim_end_matches('/');
2247 let is_ollama = trimmed.contains("11434");
2248 let url = if is_ollama {
2249 format!("{}/api/embed", trimmed)
2250 } else {
2251 format!("{}/v1/embeddings", trimmed)
2252 };
2253 let resp = client.post(&url).json(&body).send().ok()?;
2254
2255 if !resp.status().is_success() {
2256 return None;
2257 }
2258
2259 let json: serde_json::Value = resp.json().ok()?;
2260 let embedding = if is_ollama {
2261 json["embeddings"][0].as_array()?
2262 } else {
2263 json["data"][0]["embedding"].as_array()?
2264 };
2265 let vec: Vec<f32> = embedding
2266 .iter()
2267 .filter_map(|v| v.as_f64().map(|f| f as f32))
2268 .collect();
2269
2270 if vec.is_empty() {
2271 None
2272 } else {
2273 Some(vec)
2274 }
2275}
2276
/// Cosine similarity of two vectors.
///
/// Returns 0.0 when the lengths differ, either slice is empty, or either vector has
/// zero magnitude.
fn cosine_similarity(a: &[f32], b: &[f32]) -> f32 {
    let comparable = a.len() == b.len() && !a.is_empty();
    if !comparable {
        return 0.0;
    }

    let magnitude = |v: &[f32]| -> f32 { v.iter().map(|x| x * x).sum::<f32>().sqrt() };

    let dot: f32 = a.iter().zip(b.iter()).map(|(x, y)| x * y).sum();
    let norm_a = magnitude(a);
    let norm_b = magnitude(b);

    if norm_a == 0.0 || norm_b == 0.0 {
        return 0.0;
    }
    dot / (norm_a * norm_b)
}
2292
/// Serialises floats as consecutive 4-byte little-endian values.
fn floats_to_blob(floats: &[f32]) -> Vec<u8> {
    let mut blob = Vec::with_capacity(floats.len() * 4);
    for value in floats {
        blob.extend_from_slice(&value.to_le_bytes());
    }
    blob
}
2296
/// Decodes consecutive 4-byte little-endian floats; trailing bytes that do not fill a
/// whole float are ignored.
fn blob_to_floats(blob: &[u8]) -> Vec<f32> {
    let mut floats = Vec::with_capacity(blob.len() / 4);
    for word in blob.chunks_exact(4) {
        let mut raw = [0u8; 4];
        raw.copy_from_slice(word);
        floats.push(f32::from_le_bytes(raw));
    }
    floats
}
2302
/// Normalises line endings to `\n` and strips surrounding whitespace and NUL characters.
///
/// Returns `None` when nothing remains after stripping.
fn normalize_extracted_document_text(text: String) -> Option<String> {
    let mut unified = String::with_capacity(text.len());
    let mut chars = text.chars().peekable();
    while let Some(ch) = chars.next() {
        if ch == '\r' {
            // "\r\n" collapses to one newline; a lone '\r' becomes a newline too.
            if chars.peek() == Some(&'\n') {
                chars.next();
            }
            unified.push('\n');
        } else {
            unified.push(ch);
        }
    }

    let core = unified.trim_matches(|c: char| c == '\0' || c.is_whitespace());
    if core.is_empty() {
        return None;
    }
    Some(core.to_string())
}
2317
2318fn extract_pdf_text_with_pdf_extract(path: &std::path::Path) -> Result<Option<String>, String> {
2319 let previous_hook = std::panic::take_hook();
2320 std::panic::set_hook(Box::new(|_| {}));
2321 let result = std::panic::catch_unwind(std::panic::AssertUnwindSafe(|| {
2322 pdf_extract::extract_text(path)
2323 }));
2324 std::panic::set_hook(previous_hook);
2325
2326 match result {
2327 Ok(Ok(text)) => Ok(normalize_extracted_document_text(text)),
2328 Ok(Err(e)) => Err(format!("pdf-extract failed: {}", e)),
2329 Err(payload) => {
2330 let panic_text = if let Some(msg) = payload.downcast_ref::<&str>() {
2331 (*msg).to_string()
2332 } else if let Some(msg) = payload.downcast_ref::<String>() {
2333 msg.clone()
2334 } else {
2335 "unknown parser panic".to_string()
2336 };
2337 Err(format!("pdf-extract panicked: {}", panic_text))
2338 }
2339 }
2340}
2341
2342fn extract_pdf_text_with_lopdf(path: &std::path::Path) -> Result<Option<String>, String> {
2343 let mut doc =
2344 lopdf::Document::load(path).map_err(|e| format!("lopdf could not open PDF: {}", e))?;
2345
2346 if doc.is_encrypted() {
2347 doc.decrypt("")
2348 .map_err(|e| format!("PDF is encrypted and could not be decrypted: {}", e))?;
2349 }
2350
2351 let page_numbers: Vec<u32> = doc.get_pages().keys().copied().collect();
2352 if page_numbers.is_empty() {
2353 return Ok(None);
2354 }
2355
2356 let mut extracted_pages = Vec::new();
2357 let mut page_errors = Vec::new();
2358
2359 for page_number in page_numbers {
2360 match doc.extract_text(&[page_number]) {
2361 Ok(text) => {
2362 if let Some(page_text) = normalize_extracted_document_text(text) {
2363 extracted_pages.push(page_text);
2364 }
2365 }
2366 Err(e) => page_errors.push(format!("page {page_number}: {e}")),
2367 }
2368 }
2369
2370 if !extracted_pages.is_empty() {
2371 return Ok(Some(extracted_pages.join("\n\n")));
2372 }
2373
2374 if !page_errors.is_empty() {
2375 let sample_errors = page_errors
2376 .into_iter()
2377 .take(3)
2378 .collect::<Vec<_>>()
2379 .join("; ");
2380 return Err(format!(
2381 "lopdf could not extract usable page text ({sample_errors})"
2382 ));
2383 }
2384
2385 Ok(None)
2386}
2387
2388fn extract_pdf_text_inside_helper(path: &std::path::Path) -> Result<Option<String>, String> {
2389 let mut failures = Vec::new();
2390
2391 match extract_pdf_text_with_pdf_extract(path) {
2392 Ok(Some(text)) => return Ok(Some(text)),
2393 Ok(None) => failures.push("pdf-extract found no usable text".to_string()),
2394 Err(e) => failures.push(e),
2395 }
2396
2397 match extract_pdf_text_with_lopdf(path) {
2398 Ok(Some(text)) => return Ok(Some(text)),
2399 Ok(None) => failures.push("lopdf found no usable text".to_string()),
2400 Err(e) => failures.push(e),
2401 }
2402
2403 let detail = failures.into_iter().take(2).collect::<Vec<_>>().join("; ");
2404 Err(format!(
2405 "Could not extract text from PDF. Hematite keeps PDF parsing best-effort so it can stay a lightweight single-binary local coding harness. The file may be scanned/image-only, encrypted, or use unsupported font encoding. Try exporting it to text/markdown or attach page images instead. Detail: {}",
2406 detail
2407 ))
2408}
2409
/// Extracts PDF text by re-running the current executable with `--pdf-extract-helper`.
///
/// The child's stdout is the extracted text; on a non-zero exit its stderr (or a generic
/// message when stderr is empty) becomes the error. Returns `Ok(None)` when the child
/// succeeds but prints only whitespace.
fn extract_pdf_text(path: &std::path::Path) -> Result<Option<String>, String> {
    let exe = match std::env::current_exe() {
        Ok(exe) => exe,
        Err(e) => {
            return Err(format!(
                "Could not locate Hematite executable for PDF helper: {}",
                e
            ))
        }
    };

    let mut helper = std::process::Command::new(exe);
    helper
        .arg("--pdf-extract-helper")
        .arg(path)
        .stdin(std::process::Stdio::null())
        .stdout(std::process::Stdio::piped())
        .stderr(std::process::Stdio::piped());
    let output = match helper.output() {
        Ok(output) => output,
        Err(e) => return Err(format!("Could not launch PDF helper: {}", e)),
    };

    if !output.status.success() {
        let stderr = String::from_utf8_lossy(&output.stderr).trim().to_string();
        if stderr.is_empty() {
            return Err("PDF extraction failed.".to_string());
        }
        return Err(stderr);
    }

    match String::from_utf8(output.stdout) {
        Err(e) => Err(format!("PDF helper returned non-UTF8 text: {}", e)),
        Ok(text) if text.trim().is_empty() => Ok(None),
        Ok(text) => Ok(Some(text)),
    }
}
2439
2440pub fn run_pdf_extract_helper(path: &std::path::Path) -> i32 {
2441 match extract_pdf_text_inside_helper(path) {
2442 Ok(Some(text)) => {
2443 use std::io::Write;
2444 let mut stdout = std::io::stdout();
2445 if stdout.write_all(text.as_bytes()).is_ok() {
2446 0
2447 } else {
2448 let _ = writeln!(
2449 std::io::stderr(),
2450 "PDF helper could not write extracted text."
2451 );
2452 1
2453 }
2454 }
2455 Ok(None) => {
2456 eprintln!(
2457 "Could not extract text from PDF. Hematite keeps PDF parsing best-effort so it can stay a lightweight single-binary local coding harness. The file appears to contain no usable embedded text. Try exporting it to text/markdown or attach page images instead."
2458 );
2459 1
2460 }
2461 Err(e) => {
2462 eprintln!("{}", e);
2463 1
2464 }
2465 }
2466}
2467
2468pub fn extract_document_text(path: &std::path::Path) -> Result<String, String> {
2471 let ext = path
2472 .extension()
2473 .and_then(|e| e.to_str())
2474 .unwrap_or("")
2475 .to_lowercase();
2476 match ext.as_str() {
2477 "pdf" => {
2478 let text = extract_pdf_text(path)?.ok_or_else(|| {
2479 "PDF contains no extractable text — it may be scanned/image-only. \
2480 Try attaching page screenshots with /image instead."
2481 .to_string()
2482 })?;
2483 pdf_quality_check(text)
2484 }
2485 _ => std::fs::read_to_string(path).map_err(|e| format!("Could not read file: {e}")),
2486 }
2487}
2488
/// Rejects PDF text that is too short or looks garbled, otherwise returns it unchanged.
///
/// Text under 150 bytes after trimming is rejected as too short. Text in which spaces
/// make up less than 4% of the non-newline characters is rejected as garbled.
fn pdf_quality_check(text: String) -> Result<String, String> {
    let body = text.trim();
    let body_len = body.len();

    if body_len < 150 {
        return Err(format!(
            "PDF extracted only {} characters — likely a scanned or image-only PDF, \
             or uses unsupported custom fonts. Try attaching page screenshots with /image instead.",
            body_len
        ));
    }

    // One pass: count characters that are not line breaks, and how many of them are spaces.
    let mut visible: usize = 0;
    let mut blanks: usize = 0;
    for ch in body.chars() {
        match ch {
            '\n' | '\r' => {}
            ' ' => {
                visible += 1;
                blanks += 1;
            }
            _ => visible += 1,
        }
    }
    let space_ratio = if visible > 0 {
        blanks as f32 / visible as f32
    } else {
        0.0
    };

    if space_ratio < 0.04 {
        return Err(
            "PDF text extraction produced garbled output — words are merged with no spaces. \
             This usually means the PDF uses custom embedded fonts (common with academic publishers \
             like EBSCO, Elsevier, Springer). \
             Try a PDF exported from Word, Google Docs, or LaTeX, \
             or attach page screenshots with /image instead.".to_string()
        );
    }

    Ok(text)
}
2527
2528fn chunk_by_symbols(ext: &str, text: &str) -> Vec<String> {
2532 if ext == "rs" {
2533 chunk_rust_symbols(text)
2534 } else {
2535 chunk_paragraphs(text)
2536 }
2537}
2538
2539fn chunk_rust_symbols(text: &str) -> Vec<String> {
2548 const ITEM_STARTS: &[&str] = &[
2549 "pub fn ",
2550 "pub async fn ",
2551 "pub unsafe fn ",
2552 "async fn ",
2553 "unsafe fn ",
2554 "fn ",
2555 "pub impl",
2556 "impl ",
2557 "pub struct ",
2558 "struct ",
2559 "pub enum ",
2560 "enum ",
2561 "pub trait ",
2562 "trait ",
2563 "pub mod ",
2564 "mod ",
2565 "pub type ",
2566 "type ",
2567 "pub const ",
2568 "const ",
2569 "pub static ",
2570 "static ",
2571 ];
2572
2573 let lines: Vec<&str> = text.lines().collect();
2574 let mut chunks: Vec<String> = Vec::new();
2575 let mut current: Vec<&str> = Vec::new();
2576
2577 for &line in &lines {
2578 let top_level = !line.starts_with(' ') && !line.starts_with('\t');
2579 let is_item = top_level && ITEM_STARTS.iter().any(|s| line.starts_with(s));
2580
2581 if is_item && !current.is_empty() {
2582 let mut split = current.len();
2585 while split > 0 {
2586 let prev = current[split - 1].trim();
2587 if prev.starts_with("///")
2588 || prev.starts_with("//!")
2589 || prev.starts_with("#[")
2590 || prev.is_empty()
2591 {
2592 split -= 1;
2593 } else {
2594 break;
2595 }
2596 }
2597 let body = current[..split].join("\n");
2598 if !body.trim().is_empty() {
2599 chunks.push(body);
2600 }
2601 current = current[split..].to_vec();
2602 }
2603 current.push(line);
2604 }
2605 if !current.is_empty() {
2606 let body = current.join("\n");
2607 if !body.trim().is_empty() {
2608 chunks.push(body);
2609 }
2610 }
2611
2612 let mut result = Vec::new();
2614 for chunk in chunks {
2615 if chunk.len() > 3000 {
2616 result.extend(sliding_window_chunks(&chunk, 2000, 200));
2617 } else {
2618 result.push(chunk);
2619 }
2620 }
2621 result
2622}
2623
2624fn chunk_paragraphs(text: &str) -> Vec<String> {
2626 let mut result: Vec<String> = Vec::new();
2627 let mut current = String::new();
2628
2629 for para in text.split("\n\n") {
2630 if current.len() + para.len() + 2 > 2000 {
2631 if !current.trim().is_empty() {
2632 result.push(current.clone());
2633 }
2634 current = para.to_string();
2635 } else {
2636 if !current.is_empty() {
2637 current.push_str("\n\n");
2638 }
2639 current.push_str(para);
2640 }
2641 }
2642 if !current.trim().is_empty() {
2643 result.push(current);
2644 }
2645
2646 let mut final_result = Vec::new();
2647 for chunk in result {
2648 if chunk.len() > 2000 {
2649 final_result.extend(sliding_window_chunks(&chunk, 2000, 200));
2650 } else {
2651 final_result.push(chunk);
2652 }
2653 }
2654 final_result
2655}
2656
/// Cuts `text` into consecutive windows of at most `chunk_size` characters,
/// where each window starts `chunk_size - overlap` characters after the
/// previous one, so neighbouring windows share `overlap` characters.
///
/// Windows are measured in `char`s rather than bytes, so multi-byte characters
/// are never split. The final window ends exactly at the end of the text and
/// may be shorter than `chunk_size`.
///
/// Degenerate arguments are handled without panicking or looping forever:
/// * an empty `text` or a `chunk_size` of 0 yields no chunks;
/// * an `overlap` greater than or equal to `chunk_size` advances by one
///   character per window instead of stalling or underflowing.
fn sliding_window_chunks(text: &str, chunk_size: usize, overlap: usize) -> Vec<String> {
    if chunk_size == 0 {
        return Vec::new();
    }
    // Always move forward by at least one character so the loop terminates.
    let step = chunk_size.saturating_sub(overlap).max(1);

    let chars: Vec<char> = text.chars().collect();
    let mut result = Vec::new();
    let mut start = 0;
    while start < chars.len() {
        let end = start.saturating_add(chunk_size).min(chars.len());
        result.push(chars[start..end].iter().collect::<String>());
        if end == chars.len() {
            break;
        }
        start += step;
    }
    result
}