1use rusqlite::{params, Connection};
2use serde::Deserialize;
3use serde_json::Value;
4use std::collections::{HashMap, HashSet};
5use std::path::Path;
6
/// Handle to the SQLite-backed index together with the settings used when
/// requesting embeddings.
pub struct Vein {
    /// SQLite connection shared behind a mutex; methods lock it before running queries.
    db: std::sync::Arc<std::sync::Mutex<Connection>>,
    /// Base URL handed to the embedding helpers (`embed_text_blocking`, `embed_query_blocking`).
    base_url: String,
    /// Currently selected embedding model id; while `None`, the embedding paths return early.
    embed_model: std::sync::Arc<std::sync::RwLock<Option<String>>>,
}
33
// SAFETY: NOTE(review) — the connection is only reachable through `Arc<Mutex<_>>` and the
// model id through `Arc<RwLock<_>>`; the remaining field is an owned `String`. These manual
// impls assert thread-safety to the compiler; confirm they are still needed (and sound) for
// the `rusqlite` version in use.
unsafe impl Send for Vein {}
unsafe impl Sync for Vein {}
38
/// One chunk returned by a search, with the metadata stored for its path.
#[derive(Debug, Clone)]
pub struct SearchResult {
    /// Indexed path of the document the chunk belongs to.
    pub path: String,
    /// Text of the matching chunk, read from `chunks_fts`.
    pub content: String,
    /// Relevance score: negated FTS rank for BM25 hits, cosine similarity for semantic hits,
    /// and the reranked value once it has passed through `search_context`.
    pub score: f32,
    /// Room label stored in `chunks_meta` for the path.
    pub room: String,
    /// `last_modified` value stored in `chunks_meta` for the path.
    pub last_modified: i64,
    /// Memory-type label stored in `chunks_meta`; empty when none was recorded.
    pub memory_type: String,
}
53
/// A row of the hot-file ranking as exposed through `VeinInspectionSnapshot`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct VeinHotFile {
    /// Path recorded in `file_heat`.
    pub path: String,
    /// Heat counter from `file_heat` (incremented by `bump_heat`).
    pub heat: i64,
    /// `last_modified` value from `chunks_meta` for the same path.
    pub last_modified: i64,
    /// Room label from `chunks_meta` for the same path.
    pub room: String,
}
61
/// Point-in-time summary of the index produced by `Vein::inspect_snapshot`.
#[derive(Debug, Clone)]
pub struct VeinInspectionSnapshot {
    /// Count of `chunks_meta` rows outside `session/` and `.hematite/docs/`.
    pub indexed_source_files: usize,
    /// Count of `chunks_meta` rows under `.hematite/docs/`.
    pub indexed_docs: usize,
    /// Count of `chunks_meta` rows under `session/`.
    pub indexed_session_exchanges: usize,
    /// Count of `chunks_vec` rows outside `session/`.
    pub embedded_source_doc_chunks: usize,
    /// Whether `chunks_vec` contains at least one row.
    pub has_any_embeddings: bool,
    /// Room with the highest summed heat, if any heat has been recorded.
    pub active_room: Option<String>,
    /// Hottest files, limited by the `hot_limit` passed to `inspect_snapshot`.
    pub hot_files: Vec<VeinHotFile>,
    /// `true` when `hot_files` is non-empty.
    pub l1_ready: bool,
}
73
/// Signals derived from a raw query string by `QuerySignals::from_query`; passed to
/// `reranked_score` when merging search results.
#[derive(Debug, Default)]
struct QuerySignals {
    /// Output of `extract_exact_phrases(query)`.
    exact_phrases: Vec<String>,
    /// Output of `extract_standout_terms(query)`.
    standout_terms: Vec<String>,
    /// Set when the query contains wording such as "remember", "earlier" or "last time".
    historical_memory_hint: bool,
    /// Output of `extract_temporal_reference(query)`.
    temporal_reference: Option<TemporalReference>,
    /// Memory type the query appears to ask about ("decision", "problem", "milestone",
    /// "preference"), or `None` when no keyword matched.
    query_memory_type: Option<&'static str>,
}
84
/// A point in time referenced by a query, with a tolerance window.
#[derive(Debug, Clone, Copy)]
struct TemporalReference {
    /// Target timestamp. NOTE(review): presumably Unix seconds like the other timestamps
    /// in this file — confirm against `extract_temporal_reference`.
    target_ts: i64,
    /// Width of the window around `target_ts`, in seconds (per the field name).
    window_secs: i64,
}
90
/// Deserialized form of a session report. Both fields fall back to their defaults when
/// absent from the input.
/// NOTE(review): presumably the layout of `.hematite/reports/session_*.json` — confirm
/// against the loader.
#[derive(Debug, Deserialize)]
struct SessionReport {
    /// Session start marker as provided by the report; empty when missing.
    #[serde(default)]
    session_start: String,
    /// Ordered transcript entries; empty when missing.
    #[serde(default)]
    transcript: Vec<SessionTranscriptEntry>,
}
98
/// One entry of a `SessionReport` transcript. Both fields default to an empty string
/// when absent from the input.
#[derive(Debug, Deserialize)]
struct SessionTranscriptEntry {
    /// Speaker label as written in the report.
    #[serde(default)]
    speaker: String,
    /// Text of the entry.
    #[serde(default)]
    text: String,
}
106
/// A session exchange prepared for indexing; the indexers store each one as a
/// single-chunk document in the "session" room.
#[derive(Debug)]
struct SessionExchange {
    /// Path under which the exchange is indexed.
    path: String,
    /// Timestamp passed as `last_modified` when the exchange is indexed.
    last_modified: i64,
    /// Text that is indexed as the exchange's only chunk.
    content: String,
}
113
/// Classification of a transcript speaker.
/// NOTE(review): presumably used while pairing transcript entries into exchanges —
/// the consumer is not visible here; confirm.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum SessionSpeakerKind {
    /// Entry spoken by the user.
    User,
    /// Entry spoken by the assistant.
    Assistant,
    /// Entry that should be skipped.
    Ignore,
}
120
/// Classifies a path into a coarse "room" label.
///
/// The path is lowercased and backslashes are normalised to `/` before matching. Every rule
/// that matches proposes a room with a fixed score, and the highest score wins. When no rule
/// matches, the first path segment becomes the room unless it is empty or contains a `.`,
/// in which case `"root"` is returned.
///
/// The extension is only the text after the last `.` of the file name; a file name without
/// any `.` has no extension (so a file literally named `pdf` is not treated as a PDF).
pub fn detect_room(path: &str) -> String {
    let lower = path.to_lowercase().replace('\\', "/");
    let filename = lower.rsplit('/').next().unwrap_or(&lower);
    // `rsplit_once` yields `None` for dot-less names, so they get an empty extension
    // instead of being mistaken for one.
    let ext = filename.rsplit_once('.').map_or("", |(_, ext)| ext);

    let mut best_room = None::<&str>;
    let mut best_score = 0i32;
    // Keep the strictly highest-scoring proposal; all scores below are distinct.
    let mut consider = |room: &'static str, score: i32| {
        if score > best_score {
            best_score = score;
            best_room = Some(room);
        }
    };

    // True when `segment` appears as a whole directory component of the path.
    let is_component = |segment: &str| {
        lower == segment
            || lower.starts_with(&format!("{segment}/"))
            || lower.contains(&format!("/{segment}/"))
    };

    if lower.starts_with("session/")
        || lower.starts_with(".hematite/reports/")
        || lower.starts_with(".hematite/imports/")
        || is_component("reports")
        || is_component("imports")
    {
        consider("session", 100);
    }

    if lower.starts_with(".hematite/docs/")
        || is_component("docs")
        || matches!(filename, "readme.md" | "claude.md" | ".hematite.md")
        || matches!(ext, "md" | "markdown" | "pdf" | "rst")
    {
        consider("docs", 80);
    }

    if is_component("tests")
        || filename.contains("diagnostic")
        || filename.ends_with("_test.rs")
        || filename.ends_with(".test.ts")
    {
        consider("tests", 85);
    }

    if lower.starts_with(".github/workflows/")
        || is_component("workflows")
        || filename == ".pre-commit-config.yaml"
        || filename == ".pre-commit-config.yml"
        || filename.contains("hook")
    {
        consider("automation", 84);
    }

    if lower.starts_with("installer/")
        || lower.starts_with("dist/")
        || lower.starts_with("scripts/package-")
        || filename.contains("release")
        || filename.contains("bump-version")
        || ext == "iss"
    {
        consider("release", 82);
    }

    if matches!(
        filename,
        "cargo.toml"
            | "cargo.lock"
            | "package.json"
            | "pnpm-lock.yaml"
            | "yarn.lock"
            | "bun.lock"
            | "bun.lockb"
            | "pyproject.toml"
            | "setup.py"
            | "go.mod"
            | "pom.xml"
            | "build.gradle"
            | "build.gradle.kts"
            | "cmakelists.txt"
            | ".gitignore"
            | "settings.json"
            | "mcp_servers.json"
    ) || filename.ends_with(".sln")
        || filename.ends_with(".csproj")
        || filename.contains("config")
    {
        consider("config", 76);
    }

    if is_component("ui")
        || matches!(
            filename,
            "tui.rs" | "voice.rs" | "hatch.rs" | "gpu_monitor.rs"
        )
    {
        consider("ui", 70);
    }

    if is_component("memory") || matches!(filename, "vein.rs" | "deep_reflect.rs") {
        consider("memory", 72);
    }

    if is_component("tools")
        || matches!(
            filename,
            "verify_build.rs"
                | "host_inspect.rs"
                | "shell.rs"
                | "code_sandbox.rs"
                | "project_map.rs"
                | "runtime_trace.rs"
        )
    {
        consider("tools", 68);
    }

    if filename.contains("mcp")
        || filename.contains("lsp")
        || lower.contains("/mcp/")
        || lower.contains("/lsp/")
    {
        consider("integration", 67);
    }

    if matches!(filename, "main.rs" | "runtime.rs" | "inference.rs")
        || filename.contains("startup")
        || filename.contains("runtime")
    {
        consider("runtime", 66);
    }

    if is_component("agent") {
        consider("agent", 60);
    }

    if lower.starts_with("libs/") || is_component("libs") {
        consider("libs", 58);
    }

    if lower.starts_with("scripts/") || is_component("scripts") {
        consider("scripts", 55);
    }

    if let Some(room) = best_room {
        return room.to_string();
    }

    // Fallback: use the top-level directory name; bare top-level files map to "root".
    lower
        .split('/')
        .next()
        .filter(|s| !s.is_empty() && !s.contains('.'))
        .unwrap_or("root")
        .to_string()
}
281
/// Heuristically labels a piece of text as a `"decision"`, `"problem"`, `"milestone"` or
/// `"preference"`, returning `""` when nothing matches.
///
/// Matching is case-insensitive and the categories are checked in the order listed above,
/// so the first category with a hit wins. Most patterns are plain substrings; the short
/// token `oom` must appear as a standalone word so that words such as "room" or "zoom"
/// are not reported as problems.
pub fn detect_memory_type(text: &str) -> &'static str {
    const DECISION_PATTERNS: &[&str] = &[
        "let's use ",
        "we'll use ",
        "decided to ",
        "going with ",
        "we agreed ",
        "the plan is",
        "we're going to",
        "switching to",
        "we chose",
        "final decision",
        "we settled on",
        "agreed on",
        "we decided",
    ];
    const PROBLEM_PATTERNS: &[&str] = &[
        "bug fixed",
        "bug was",
        "the issue was",
        "root cause",
        "error was",
        "turned out to be",
        "the fix was",
        "was caused by",
        "broken because",
        "fixed by",
        "the problem was",
        "found the bug",
        "port conflict",
        "crash",
        "panicked",
        "segfault",
        // Keeps compound spellings such as "OOMKilled" recognised now that bare "oom"
        // is matched as a whole word only.
        "oomkill",
        "out of memory",
    ];
    // Tokens too short to match safely as substrings.
    const PROBLEM_WORDS: &[&str] = &["oom"];
    const MILESTONE_PATTERNS: &[&str] = &[
        "now working",
        "successfully",
        "shipped",
        "deployed",
        "it works",
        "tests pass",
        "all green",
        "breakthrough",
        "finally got",
        "got it working",
        "completed",
        "finished",
        "done with",
        "landed",
    ];
    const PREFERENCE_PATTERNS: &[&str] = &[
        "i prefer",
        "i like",
        "i don't like",
        "i want",
        "always use",
        "never use",
        "i usually",
        "my preference",
        "keep it",
        "avoid using",
    ];

    /// True when `word` occurs in `haystack` with no alphanumeric character directly
    /// before or after it.
    fn contains_word(haystack: &str, word: &str) -> bool {
        haystack.match_indices(word).any(|(start, matched)| {
            let before = haystack[..start].chars().next_back();
            let after = haystack[start + matched.len()..].chars().next();
            before.map_or(true, |c| !c.is_alphanumeric())
                && after.map_or(true, |c| !c.is_alphanumeric())
        })
    }

    let lower = text.to_lowercase();
    let has_any = |patterns: &[&str]| patterns.iter().any(|pat| lower.contains(pat));

    if has_any(DECISION_PATTERNS) {
        return "decision";
    }
    if has_any(PROBLEM_PATTERNS) || PROBLEM_WORDS.iter().any(|w| contains_word(&lower, w)) {
        return "problem";
    }
    if has_any(MILESTONE_PATTERNS) {
        return "milestone";
    }
    if has_any(PREFERENCE_PATTERNS) {
        return "preference";
    }

    ""
}
383
384impl Vein {
    /// Maximum number of session report files indexed per run (the list is truncated to
    /// this length after sorting by file name, descending).
    const SESSION_REPORT_LIMIT: usize = 5;
    /// NOTE(review): presumably caps the number of turns taken from one session — the
    /// consumer is not visible in this part of the file; confirm.
    const SESSION_TURN_LIMIT: usize = 50;
    /// Maximum number of import files indexed per run (newest first).
    const IMPORT_FILE_LIMIT: usize = 12;
    /// Import files larger than this many bytes are skipped.
    const IMPORT_MAX_BYTES: u64 = 10 * 1024 * 1024;
389
390 pub fn new<P: AsRef<Path>>(
391 db_path: P,
392 base_url: String,
393 ) -> Result<Self, Box<dyn std::error::Error>> {
394 let db = Connection::open(db_path)?;
395
396 db.execute_batch("PRAGMA journal_mode=WAL; PRAGMA synchronous=NORMAL;")?;
398
399 db.execute_batch(
403 "CREATE TABLE IF NOT EXISTS chunks_meta (
404 path TEXT PRIMARY KEY,
405 last_modified INTEGER NOT NULL,
406 room TEXT NOT NULL DEFAULT 'root'
407 );
408 CREATE VIRTUAL TABLE IF NOT EXISTS chunks_fts USING fts5(
409 path UNINDEXED,
410 content,
411 tokenize='porter ascii'
412 );
413 CREATE TABLE IF NOT EXISTS chunks_vec (
414 path TEXT NOT NULL,
415 chunk_idx INTEGER NOT NULL,
416 embedding BLOB NOT NULL,
417 PRIMARY KEY (path, chunk_idx)
418 );
419 CREATE TABLE IF NOT EXISTS file_heat (
420 path TEXT PRIMARY KEY,
421 heat INTEGER NOT NULL DEFAULT 0,
422 last_edit INTEGER NOT NULL DEFAULT 0
423 );",
424 )?;
425
426 let _ = db
428 .execute_batch("ALTER TABLE chunks_meta ADD COLUMN room TEXT NOT NULL DEFAULT 'root';");
429 let _ = db.execute_batch(
430 "ALTER TABLE file_heat ADD COLUMN last_edit INTEGER NOT NULL DEFAULT 0;",
431 );
432 let _ = db.execute_batch(
433 "ALTER TABLE chunks_meta ADD COLUMN memory_type TEXT NOT NULL DEFAULT '';",
434 );
435
436 Ok(Self {
437 db: std::sync::Arc::new(std::sync::Mutex::new(db)),
438 base_url,
439 embed_model: std::sync::Arc::new(std::sync::RwLock::new(None)),
440 })
441 }
442
443 pub fn set_embed_model(&self, model_id: Option<String>) {
444 if let Ok(mut guard) = self.embed_model.write() {
445 *guard = model_id;
446 }
447 }
448
449 fn current_embed_model(&self) -> Option<String> {
450 self.embed_model.read().ok().and_then(|guard| guard.clone())
451 }
452
453 pub fn index_document(
458 &mut self,
459 path: &str,
460 last_modified: i64,
461 full_text: &str,
462 ) -> Result<Vec<String>, Box<dyn std::error::Error>> {
463 let room = detect_room(path);
464 let ext = std::path::Path::new(path)
465 .extension()
466 .and_then(|e| e.to_str())
467 .unwrap_or("");
468 let chunks = chunk_by_symbols(ext, full_text);
469 let memory_type = if room == "session" {
471 detect_memory_type(full_text)
472 } else {
473 ""
474 };
475 self.index_chunks_with_room_and_type(path, last_modified, &room, memory_type, &chunks)
476 }
477
478 fn index_chunks_with_room_and_type(
479 &mut self,
480 path: &str,
481 last_modified: i64,
482 room: &str,
483 memory_type: &str,
484 chunks: &[String],
485 ) -> Result<Vec<String>, Box<dyn std::error::Error>> {
486 let db = self.db.lock().unwrap();
487 let existing: Option<i64> = db
488 .query_row(
489 "SELECT last_modified FROM chunks_meta WHERE path = ?1",
490 params![path],
491 |r| r.get(0),
492 )
493 .ok();
494
495 if let Some(ts) = existing {
496 if ts >= last_modified {
497 return Ok(Vec::new()); }
499 }
500
501 db.execute("DELETE FROM chunks_fts WHERE path = ?1", params![path])?;
503 db.execute("DELETE FROM chunks_vec WHERE path = ?1", params![path])?;
504 db.execute(
505 "INSERT OR REPLACE INTO chunks_meta (path, last_modified, room, memory_type) VALUES (?1, ?2, ?3, ?4)",
506 params![path, last_modified, room, memory_type],
507 )?;
508
509 drop(db);
510
511 let mut db = self.db.lock().unwrap();
512 let tx = db.transaction()?;
513 {
514 let mut stmt = tx.prepare("INSERT INTO chunks_fts (path, content) VALUES (?1, ?2)")?;
515 for chunk in chunks {
516 stmt.execute(params![path, chunk.as_str()])?;
517 }
518 }
519 tx.commit()?;
520
521 Ok(chunks.to_vec())
522 }
523
524 pub fn embed_and_store_chunks(&self, path: &str, chunks: &[String]) {
528 let Some(embed_model) = self.current_embed_model() else {
529 return;
530 };
531 for (idx, chunk) in chunks.iter().enumerate() {
532 if let Some(vec) = embed_text_blocking(chunk, &self.base_url, &embed_model) {
533 let blob = floats_to_blob(&vec);
534 let db = self.db.lock().unwrap();
535 let _ = db.execute(
536 "INSERT OR REPLACE INTO chunks_vec (path, chunk_idx, embedding) VALUES (?1, ?2, ?3)",
537 params![path, idx as i64, blob],
538 );
539 }
540 }
541 }
542
543 pub fn search_bm25(
547 &self,
548 query: &str,
549 limit: usize,
550 ) -> Result<Vec<SearchResult>, Box<dyn std::error::Error>> {
551 const STOPWORDS: &[&str] = &[
555 "how", "does", "do", "did", "what", "where", "when", "why", "which", "who", "is",
556 "are", "was", "were", "be", "been", "being", "have", "has", "had", "a", "an", "the",
557 "and", "or", "but", "in", "on", "at", "to", "for", "of", "with", "by", "from", "get",
558 "gets", "got", "work", "works", "make", "makes", "use", "uses", "into", "that", "this",
559 "it", "its",
560 ];
561
562 let safe_query: String = query
563 .chars()
564 .map(|c| {
565 if c.is_alphanumeric() || c == ' ' || c == '_' {
566 c
567 } else {
568 ' '
569 }
570 })
571 .collect();
572
573 let fts_query = safe_query
575 .split_whitespace()
576 .filter(|w| w.len() >= 3 && !STOPWORDS.contains(&w.to_lowercase().as_str()))
577 .collect::<Vec<_>>()
578 .join(" OR ");
579
580 if fts_query.is_empty() {
581 return Ok(Vec::new());
582 }
583
584 let db = self.db.lock().unwrap();
585 let mut stmt = db.prepare(
586 "SELECT chunks_fts.path, chunks_fts.content, rank, cm.last_modified, cm.room, cm.memory_type
587 FROM chunks_fts
588 JOIN chunks_meta cm ON cm.path = chunks_fts.path
589 WHERE chunks_fts MATCH ?1
590 ORDER BY rank
591 LIMIT ?2",
592 )?;
593
594 let results: Vec<SearchResult> = stmt
595 .query_map(params![fts_query, limit as i64], |row| {
596 Ok(SearchResult {
597 path: row.get(0)?,
598 content: row.get(1)?,
599 score: -(row.get::<_, f64>(2).unwrap_or(0.0) as f32),
600 last_modified: row.get(3)?,
601 room: row.get(4)?,
602 memory_type: row.get::<_, String>(5).unwrap_or_default(),
603 })
604 })?
605 .filter_map(|r| r.ok())
606 .collect();
607
608 Ok(results)
609 }
610
611 pub fn search_semantic(&self, query: &str, limit: usize) -> Vec<SearchResult> {
614 let Some(embed_model) = self.current_embed_model() else {
615 return Vec::new();
616 };
617 let query_vec = match embed_query_blocking(query, &self.base_url, &embed_model) {
618 Some(v) => v,
619 None => return Vec::new(),
620 };
621
622 let rows: Vec<(String, i64, Vec<u8>, i64, String, String)> = {
624 let db = self.db.lock().unwrap();
625 let mut stmt = match db.prepare(
626 "SELECT cv.path, cv.chunk_idx, cv.embedding, cm.last_modified, cm.room, cm.memory_type
627 FROM chunks_vec cv
628 JOIN chunks_meta cm ON cm.path = cv.path",
629 ) {
630 Ok(s) => s,
631 Err(_) => return Vec::new(),
632 };
633 stmt.query_map([], |row| {
634 Ok((
635 row.get::<_, String>(0)?,
636 row.get::<_, i64>(1)?,
637 row.get::<_, Vec<u8>>(2)?,
638 row.get::<_, i64>(3)?,
639 row.get::<_, String>(4)?,
640 row.get::<_, String>(5).unwrap_or_default(),
641 ))
642 })
643 .ok()
644 .map(|rows| rows.filter_map(|r| r.ok()).collect())
645 .unwrap_or_default()
646 };
647
648 if rows.is_empty() {
649 return Vec::new();
650 }
651
652 let mut scored: Vec<(f32, String, i64, i64, String, String)> = rows
654 .into_iter()
655 .filter_map(|(path, idx, blob, last_modified, room, memory_type)| {
656 let vec = blob_to_floats(&blob);
657 let sim = cosine_similarity(&query_vec, &vec);
658 Some((sim, path, idx, last_modified, room, memory_type))
659 })
660 .collect();
661
662 scored.sort_by(|a, b| b.0.partial_cmp(&a.0).unwrap_or(std::cmp::Ordering::Equal));
663 scored.truncate(limit);
664
665 let db = self.db.lock().unwrap();
667 scored
668 .into_iter()
669 .filter_map(|(score, path, idx, last_modified, room, memory_type)| {
670 let content: Option<String> = db
671 .query_row(
672 "SELECT content FROM chunks_fts WHERE path = ?1 LIMIT 1 OFFSET ?2",
673 params![path, idx],
674 |r| r.get(0),
675 )
676 .ok();
677 content.map(|c| SearchResult {
678 path,
679 content: c,
680 score,
681 room,
682 last_modified,
683 memory_type,
684 })
685 })
686 .collect()
687 }
688
689 pub fn search_context(
696 &self,
697 query: &str,
698 limit: usize,
699 ) -> Result<Vec<SearchResult>, Box<dyn std::error::Error>> {
700 let candidate_limit = (limit.max(1) * 4).max(12);
701 let bm25 = self.search_bm25(query, candidate_limit).unwrap_or_default();
702 let semantic = self.search_semantic(query, candidate_limit);
703 let signals = QuerySignals::from_query(query);
704
705 let active_room = self.active_room();
707
708 let mut merged_by_path: HashMap<String, SearchResult> = HashMap::new();
711
712 for r in semantic {
713 let score = reranked_score(&signals, active_room.as_deref(), &r, true);
714 merge_scored_result(&mut merged_by_path, SearchResult { score, ..r });
715 }
716
717 for r in bm25 {
718 let score = reranked_score(&signals, active_room.as_deref(), &r, false);
719 merge_scored_result(&mut merged_by_path, SearchResult { score, ..r });
720 }
721
722 let mut merged: Vec<SearchResult> = merged_by_path.into_values().collect();
723 merged.sort_by(|a, b| {
724 b.score
725 .partial_cmp(&a.score)
726 .unwrap_or(std::cmp::Ordering::Equal)
727 });
728 merged.truncate(limit);
729 Ok(merged)
730 }
731
732 fn active_room(&self) -> Option<String> {
735 let db = self.db.lock().unwrap();
736 db.query_row(
737 "SELECT cm.room, SUM(fh.heat) as total
738 FROM file_heat fh
739 JOIN chunks_meta cm ON cm.path = fh.path
740 GROUP BY cm.room
741 ORDER BY total DESC
742 LIMIT 1",
743 [],
744 |row| row.get::<_, String>(0),
745 )
746 .ok()
747 }
748
749 pub fn index_project(&mut self) -> usize {
757 let root = crate::tools::file_ops::workspace_root();
758 let mut count = 0usize;
759
760 const INDEXABLE: &[&str] = &[
761 "rs", "toml", "md", "json", "ts", "tsx", "js", "py", "go", "c", "cpp", "h", "yaml",
762 "yml", "txt",
763 ];
764 const SKIP_DIRS: &[&str] = &["target", ".git", "node_modules", ".hematite"];
765
766 for entry in walkdir::WalkDir::new(&root)
767 .follow_links(false)
768 .into_iter()
769 .filter_entry(|e| {
770 if e.file_type().is_dir() {
771 let name = e.file_name().to_string_lossy();
772 return !SKIP_DIRS.contains(&name.as_ref());
773 }
774 true
775 })
776 .filter_map(|e| e.ok())
777 .filter(|e| e.file_type().is_file())
778 {
779 let path = entry.path();
780 let ext = path.extension().and_then(|e| e.to_str()).unwrap_or("");
781 if !INDEXABLE.contains(&ext) {
782 continue;
783 }
784
785 let Ok(meta) = std::fs::metadata(path) else {
786 continue;
787 };
788 if meta.len() > 512_000 {
789 continue;
790 }
791
792 let mtime = meta
793 .modified()
794 .map(|t| {
795 t.duration_since(std::time::UNIX_EPOCH)
796 .unwrap_or_default()
797 .as_secs() as i64
798 })
799 .unwrap_or(0);
800
801 let rel = path.strip_prefix(&root).unwrap_or(path);
802 let rel_str = rel.to_string_lossy().replace('\\', "/");
803
804 if let Ok(content) = std::fs::read_to_string(path) {
805 match self.index_document(&rel_str, mtime, &content) {
806 Ok(new_chunks) if !new_chunks.is_empty() => {
807 count += 1;
808 }
809 Ok(_) => {}
810 Err(_) => {}
811 }
812 }
813 }
814
815 count += self.index_workspace_artifacts(&crate::tools::file_ops::hematite_dir());
816
817 count
818 }
819
820 pub fn index_workspace_artifacts(&mut self, workspace_root: &std::path::Path) -> usize {
825 let mut count = self.index_docs_folder(workspace_root);
826 count += self.index_recent_session_reports(workspace_root);
827 count += self.index_imported_session_exports(workspace_root);
828 self.backfill_missing_embeddings();
829 count
830 }
831
832 fn index_docs_folder(&mut self, workspace_root: &std::path::Path) -> usize {
837 let docs_dir = workspace_root.join(".hematite").join("docs");
838 const DOCS_INDEXABLE: &[&str] = &["pdf", "md", "txt", "markdown"];
839 let mut count = 0usize;
840 let mut desired_paths = HashSet::new();
841
842 if docs_dir.exists() {
843 for entry in walkdir::WalkDir::new(&docs_dir)
844 .max_depth(3)
845 .follow_links(false)
846 .into_iter()
847 .filter_map(|e| e.ok())
848 .filter(|e| e.file_type().is_file())
849 {
850 let path = entry.path();
851 let ext = path
852 .extension()
853 .and_then(|e| e.to_str())
854 .unwrap_or("")
855 .to_lowercase();
856 if !DOCS_INDEXABLE.contains(&ext.as_str()) {
857 continue;
858 }
859
860 let Ok(meta) = std::fs::metadata(path) else {
861 continue;
862 };
863 if meta.len() > 50_000_000 {
864 continue;
865 }
866
867 let mtime = meta
868 .modified()
869 .map(|t| {
870 t.duration_since(std::time::UNIX_EPOCH)
871 .unwrap_or_default()
872 .as_secs() as i64
873 })
874 .unwrap_or(0);
875
876 let rel = path.strip_prefix(workspace_root).unwrap_or(path);
877 let rel_str = rel.to_string_lossy().replace('\\', "/");
878 desired_paths.insert(rel_str.clone());
879
880 let content = if ext == "pdf" {
881 extract_pdf_text(path).ok().flatten()
882 } else {
883 std::fs::read_to_string(path).ok()
884 };
885
886 if let Some(text) = content {
887 if text.trim().is_empty() {
888 continue;
889 }
890 match self.index_document(&rel_str, mtime, &text) {
891 Ok(new_chunks) if !new_chunks.is_empty() => {
892 count += 1;
893 }
894 Ok(_) => {}
895 Err(_) => {}
896 }
897 }
898 }
899 }
900
901 self.prune_indexed_prefix(".hematite/docs/", &desired_paths);
902 count
903 }
904
905 pub fn index_recent_session_reports(&mut self, workspace_root: &std::path::Path) -> usize {
908 let reports_dir = workspace_root.join(".hematite").join("reports");
909 let mut count = 0usize;
910 let mut desired_paths = HashSet::new();
911
912 if reports_dir.exists() {
913 let mut reports: Vec<std::path::PathBuf> = std::fs::read_dir(&reports_dir)
914 .ok()
915 .into_iter()
916 .flat_map(|entries| entries.filter_map(|entry| entry.ok()))
917 .map(|entry| entry.path())
918 .filter(|path| {
919 path.is_file()
920 && path.extension().and_then(|ext| ext.to_str()) == Some("json")
921 && path
922 .file_stem()
923 .and_then(|stem| stem.to_str())
924 .map(|stem| stem.starts_with("session_"))
925 .unwrap_or(false)
926 })
927 .collect();
928
929 reports.sort_by(|a, b| {
930 let a_name = a
931 .file_name()
932 .and_then(|name| name.to_str())
933 .unwrap_or_default();
934 let b_name = b
935 .file_name()
936 .and_then(|name| name.to_str())
937 .unwrap_or_default();
938 b_name.cmp(a_name)
939 });
940 reports.truncate(Self::SESSION_REPORT_LIMIT);
941
942 for report_path in reports {
943 let Ok(meta) = std::fs::metadata(&report_path) else {
944 continue;
945 };
946 let mtime = meta
947 .modified()
948 .map(|t| {
949 t.duration_since(std::time::UNIX_EPOCH)
950 .unwrap_or_default()
951 .as_secs() as i64
952 })
953 .unwrap_or(0);
954
955 for exchange in load_session_exchanges(&report_path, mtime) {
956 desired_paths.insert(exchange.path.clone());
957 let mtype = detect_memory_type(&exchange.content);
958 match self.index_chunks_with_room_and_type(
959 &exchange.path,
960 exchange.last_modified,
961 "session",
962 mtype,
963 std::slice::from_ref(&exchange.content),
964 ) {
965 Ok(new_chunks) if !new_chunks.is_empty() => {
966 count += 1;
967 }
968 Ok(_) => {}
969 Err(_) => {}
970 }
971 }
972 }
973 }
974
975 self.prune_indexed_prefix("session/", &desired_paths);
976 count
977 }
978
979 pub fn index_imported_session_exports(&mut self, workspace_root: &std::path::Path) -> usize {
984 let imports_dir = workspace_root.join(".hematite").join("imports");
985 let mut count = 0usize;
986 let mut desired_paths = HashSet::new();
987
988 if imports_dir.exists() {
989 let mut imports: Vec<(std::path::PathBuf, i64)> = walkdir::WalkDir::new(&imports_dir)
990 .max_depth(4)
991 .follow_links(false)
992 .into_iter()
993 .filter_map(|entry| entry.ok())
994 .filter(|entry| entry.file_type().is_file())
995 .filter_map(|entry| {
996 let path = entry.into_path();
997 let ext = path
998 .extension()
999 .and_then(|ext| ext.to_str())
1000 .unwrap_or("")
1001 .to_ascii_lowercase();
1002 if !matches!(ext.as_str(), "json" | "jsonl" | "md" | "txt") {
1003 return None;
1004 }
1005 let meta = std::fs::metadata(&path).ok()?;
1006 if meta.len() > Self::IMPORT_MAX_BYTES {
1007 return None;
1008 }
1009 let mtime = meta
1010 .modified()
1011 .map(|t| {
1012 t.duration_since(std::time::UNIX_EPOCH)
1013 .unwrap_or_default()
1014 .as_secs() as i64
1015 })
1016 .unwrap_or(0);
1017 Some((path, mtime))
1018 })
1019 .collect();
1020
1021 imports.sort_by(|(a_path, a_mtime), (b_path, b_mtime)| {
1022 b_mtime
1023 .cmp(a_mtime)
1024 .then_with(|| a_path.to_string_lossy().cmp(&b_path.to_string_lossy()))
1025 });
1026 imports.truncate(Self::IMPORT_FILE_LIMIT);
1027
1028 for (import_path, mtime) in imports {
1029 for exchange in load_imported_session_exchanges(&import_path, &imports_dir, mtime) {
1030 desired_paths.insert(exchange.path.clone());
1031 let mtype = detect_memory_type(&exchange.content);
1032 match self.index_chunks_with_room_and_type(
1033 &exchange.path,
1034 exchange.last_modified,
1035 "session",
1036 mtype,
1037 std::slice::from_ref(&exchange.content),
1038 ) {
1039 Ok(new_chunks) if !new_chunks.is_empty() => {
1040 count += 1;
1041 }
1042 Ok(_) => {}
1043 Err(_) => {}
1044 }
1045 }
1046 }
1047 }
1048
1049 self.prune_indexed_prefix("session/imports/", &desired_paths);
1050 count
1051 }
1052
1053 fn backfill_missing_embeddings(&self) {
1058 let (fts_count, vec_count) = {
1060 let db = self.db.lock().unwrap();
1061 let fts: i64 = db
1062 .query_row("SELECT COUNT(*) FROM chunks_fts", [], |r| r.get(0))
1063 .unwrap_or(0);
1064 let vec: i64 = db
1065 .query_row("SELECT COUNT(*) FROM chunks_vec", [], |r| r.get(0))
1066 .unwrap_or(0);
1067 (fts, vec)
1068 };
1069 if fts_count == 0 || fts_count == vec_count {
1070 return;
1071 }
1072
1073 let missing: Vec<(String, i64, String)> = {
1076 let db = self.db.lock().unwrap();
1077 let mut stmt = db
1078 .prepare(
1079 "SELECT f.path, (f.rowid - 1) AS chunk_idx, f.content
1080 FROM chunks_fts f
1081 LEFT JOIN chunks_vec v ON f.path = v.path AND (f.rowid - 1) = v.chunk_idx
1082 WHERE v.path IS NULL
1083 ORDER BY CASE
1084 WHEN f.path LIKE '%.rs' THEN 0
1085 WHEN f.path LIKE '%.toml' THEN 1
1086 WHEN f.path LIKE '%.json' THEN 2
1087 ELSE 3
1088 END, f.path
1089 LIMIT 20",
1090 )
1091 .unwrap();
1092 stmt.query_map([], |r| {
1093 Ok((
1094 r.get::<_, String>(0)?,
1095 r.get::<_, i64>(1)?,
1096 r.get::<_, String>(2)?,
1097 ))
1098 })
1099 .unwrap()
1100 .filter_map(|r| r.ok())
1101 .collect()
1102 };
1103
1104 let Some(embed_model) = self.current_embed_model() else {
1105 return;
1106 };
1107
1108 for (path, idx, content) in missing {
1109 if let Some(vec) = embed_text_blocking(&content, &self.base_url, &embed_model) {
1110 let blob = floats_to_blob(&vec);
1111 let db = self.db.lock().unwrap();
1112 let _ = db.execute(
1113 "INSERT OR REPLACE INTO chunks_vec (path, chunk_idx, embedding) VALUES (?1, ?2, ?3)",
1114 params![path, idx, blob],
1115 );
1116 } else {
1117 break;
1119 }
1120 }
1121 }
1122
1123 pub fn file_count(&self) -> usize {
1126 let db = self.db.lock().unwrap();
1127 db.query_row(
1128 "SELECT COUNT(*) FROM chunks_meta WHERE path NOT LIKE 'session/%'",
1129 [],
1130 |r| r.get::<_, i64>(0),
1131 )
1132 .unwrap_or(0) as usize
1133 }
1134
1135 pub fn embedded_chunk_count(&self) -> usize {
1138 let db = self.db.lock().unwrap();
1139 db.query_row(
1140 "SELECT COUNT(*) FROM chunks_vec WHERE path NOT LIKE 'session/%'",
1141 [],
1142 |r| r.get::<_, i64>(0),
1143 )
1144 .unwrap_or(0) as usize
1145 }
1146
1147 pub fn has_any_embeddings(&self) -> bool {
1149 let db = self.db.lock().unwrap();
1150 db.query_row("SELECT EXISTS(SELECT 1 FROM chunks_vec LIMIT 1)", [], |r| {
1151 r.get::<_, i64>(0)
1152 })
1153 .unwrap_or(0)
1154 != 0
1155 }
1156
1157 pub fn reset(&self) {
1160 let db = self.db.lock().unwrap();
1161 let _ = db.execute_batch(
1162 "DELETE FROM chunks_fts;
1163 DELETE FROM chunks_vec;
1164 DELETE FROM chunks_meta;",
1165 );
1166 }
1167
1168 pub fn inspect_snapshot(&self, hot_limit: usize) -> VeinInspectionSnapshot {
1171 let db = self.db.lock().unwrap();
1172 let indexed_source_files = db
1173 .query_row(
1174 "SELECT COUNT(*) FROM chunks_meta
1175 WHERE path NOT LIKE 'session/%'
1176 AND path NOT LIKE '.hematite/docs/%'",
1177 [],
1178 |r| r.get::<_, i64>(0),
1179 )
1180 .unwrap_or(0) as usize;
1181 let indexed_docs = db
1182 .query_row(
1183 "SELECT COUNT(*) FROM chunks_meta WHERE path LIKE '.hematite/docs/%'",
1184 [],
1185 |r| r.get::<_, i64>(0),
1186 )
1187 .unwrap_or(0) as usize;
1188 let indexed_session_exchanges = db
1189 .query_row(
1190 "SELECT COUNT(*) FROM chunks_meta WHERE path LIKE 'session/%'",
1191 [],
1192 |r| r.get::<_, i64>(0),
1193 )
1194 .unwrap_or(0) as usize;
1195 let embedded_source_doc_chunks = db
1196 .query_row(
1197 "SELECT COUNT(*) FROM chunks_vec WHERE path NOT LIKE 'session/%'",
1198 [],
1199 |r| r.get::<_, i64>(0),
1200 )
1201 .unwrap_or(0) as usize;
1202 let has_any_embeddings = db
1203 .query_row("SELECT EXISTS(SELECT 1 FROM chunks_vec LIMIT 1)", [], |r| {
1204 r.get::<_, i64>(0)
1205 })
1206 .unwrap_or(0)
1207 != 0;
1208 drop(db);
1209
1210 let hot_files = self
1211 .hot_files(hot_limit.max(1))
1212 .into_iter()
1213 .map(|(path, heat, last_modified, room)| VeinHotFile {
1214 path,
1215 heat,
1216 last_modified,
1217 room,
1218 })
1219 .collect::<Vec<_>>();
1220
1221 VeinInspectionSnapshot {
1222 indexed_source_files,
1223 indexed_docs,
1224 indexed_session_exchanges,
1225 embedded_source_doc_chunks,
1226 has_any_embeddings,
1227 active_room: self.active_room(),
1228 l1_ready: !hot_files.is_empty(),
1229 hot_files,
1230 }
1231 }
1232
1233 pub fn bump_heat(&self, path: &str) {
1239 if path.is_empty() {
1240 return;
1241 }
1242 let now = std::time::SystemTime::now()
1243 .duration_since(std::time::UNIX_EPOCH)
1244 .unwrap_or_default()
1245 .as_secs() as i64;
1246 let db = self.db.lock().unwrap();
1247 let _ = db.execute(
1248 "INSERT INTO file_heat (path, heat, last_edit) VALUES (?1, 1, ?2)
1249 ON CONFLICT(path) DO UPDATE SET heat = heat + 1, last_edit = ?2",
1250 params![path, now],
1251 );
1252 }
1253
1254 fn hot_files(&self, n: usize) -> Vec<(String, i64, i64, String)> {
1258 let db = self.db.lock().unwrap();
1259 let mut stmt = match db.prepare(
1260 "SELECT fh.path, fh.heat, cm.last_modified, cm.room
1261 FROM file_heat fh
1262 JOIN chunks_meta cm ON cm.path = fh.path
1263 ORDER BY fh.heat DESC, cm.last_modified DESC
1264 LIMIT ?1",
1265 ) {
1266 Ok(s) => s,
1267 Err(_) => return vec![],
1268 };
1269 stmt.query_map(params![n as i64], |row| {
1270 Ok((
1271 row.get::<_, String>(0)?,
1272 row.get::<_, i64>(1)?,
1273 row.get::<_, i64>(2)?,
1274 row.get::<_, String>(3)?,
1275 ))
1276 })
1277 .map(|rows| rows.filter_map(|r| r.ok()).collect())
1278 .unwrap_or_default()
1279 }
1280
1281 pub fn hot_file_paths(&self, n: usize) -> Vec<String> {
1284 self.hot_files(n)
1285 .into_iter()
1286 .map(|(path, _, _, _)| path)
1287 .collect()
1288 }
1289
1290 pub fn hot_files_weighted(&self, n: usize) -> Vec<(String, f64)> {
1294 let files = self.hot_files(n);
1295 if files.is_empty() {
1296 return vec![];
1297 }
1298 let max_heat = files
1299 .iter()
1300 .map(|(_, h, _, _)| *h)
1301 .max()
1302 .unwrap_or(1)
1303 .max(1) as f64;
1304 files
1305 .into_iter()
1306 .map(|(path, heat, _, _)| {
1307 let weight = (heat as f64) / max_heat;
1308 (path, weight)
1309 })
1310 .collect()
1311 }
1312
1313 pub fn l1_context(&self) -> Option<String> {
1318 let files = self.hot_files(8);
1319 if files.is_empty() {
1320 return None;
1321 }
1322 let now = std::time::SystemTime::now()
1323 .duration_since(std::time::UNIX_EPOCH)
1324 .unwrap_or_default()
1325 .as_secs() as i64;
1326
1327 let mut by_room: std::collections::BTreeMap<String, Vec<(String, i64, i64)>> =
1329 std::collections::BTreeMap::new();
1330 for (path, heat, mtime, room) in &files {
1331 by_room
1332 .entry(room.clone())
1333 .or_default()
1334 .push((path.clone(), *heat, *mtime));
1335 }
1336
1337 let mut out = String::from("# Hot Files (most edited — grouped by subsystem)\n");
1338 for (room, entries) in &by_room {
1339 out.push_str(&format!("[{}]\n", room));
1340 for (path, heat, mtime) in entries {
1341 let age_secs = now - mtime;
1342 let age = if age_secs < 3600 {
1343 "just now".to_string()
1344 } else if age_secs < 86400 {
1345 format!("{}h ago", age_secs / 3600)
1346 } else {
1347 format!("{}d ago", age_secs / 86400)
1348 };
1349 out.push_str(&format!(
1350 " - {} [{} edit{}, {}]\n",
1351 path,
1352 heat,
1353 if *heat == 1 { "" } else { "s" },
1354 age
1355 ));
1356 }
1357 }
1358 Some(out)
1359 }
1360
1361 fn prune_indexed_prefix(&self, prefix: &str, desired_paths: &HashSet<String>) {
1362 let pattern = format!("{}%", prefix);
1363 let existing_paths: Vec<String> = {
1364 let db = self.db.lock().unwrap();
1365 let mut stmt = match db.prepare("SELECT path FROM chunks_meta WHERE path LIKE ?1") {
1366 Ok(stmt) => stmt,
1367 Err(_) => return,
1368 };
1369 stmt.query_map(params![pattern], |row| row.get::<_, String>(0))
1370 .map(|rows| rows.filter_map(|row| row.ok()).collect())
1371 .unwrap_or_default()
1372 };
1373
1374 if existing_paths.is_empty() {
1375 return;
1376 }
1377
1378 let db = self.db.lock().unwrap();
1379 for path in existing_paths {
1380 if desired_paths.contains(&path) {
1381 continue;
1382 }
1383 let _ = db.execute("DELETE FROM chunks_fts WHERE path = ?1", params![path]);
1384 let _ = db.execute("DELETE FROM chunks_vec WHERE path = ?1", params![path]);
1385 let _ = db.execute("DELETE FROM chunks_meta WHERE path = ?1", params![path]);
1386 }
1387 }
1388}
1389
1390impl QuerySignals {
1391 fn from_query(query: &str) -> Self {
1392 let lower = query.to_ascii_lowercase();
1393 let historical_memory_hint = [
1394 "remember",
1395 "earlier",
1396 "previous",
1397 "last time",
1398 "what did we decide",
1399 "why did we decide",
1400 "what did we say",
1401 "why did we change",
1402 ]
1403 .iter()
1404 .any(|needle| lower.contains(needle));
1405
1406 let query_memory_type = if lower.contains("decide")
1408 || lower.contains("decision")
1409 || lower.contains("we agreed")
1410 || lower.contains("we chose")
1411 {
1412 Some("decision")
1413 } else if lower.contains("bug")
1414 || lower.contains("error")
1415 || lower.contains("issue")
1416 || lower.contains("problem")
1417 || lower.contains("fix")
1418 || lower.contains("broken")
1419 {
1420 Some("problem")
1421 } else if lower.contains("shipped")
1422 || lower.contains("milestone")
1423 || lower.contains("finished")
1424 || lower.contains("working now")
1425 {
1426 Some("milestone")
1427 } else if lower.contains("prefer")
1428 || lower.contains("my preference")
1429 || lower.contains("i like")
1430 || lower.contains("i want")
1431 {
1432 Some("preference")
1433 } else {
1434 None
1435 };
1436
1437 Self {
1438 exact_phrases: extract_exact_phrases(query),
1439 standout_terms: extract_standout_terms(query),
1440 historical_memory_hint,
1441 temporal_reference: extract_temporal_reference(query),
1442 query_memory_type,
1443 }
1444 }
1445}
1446
1447fn merge_scored_result(
1448 merged_by_path: &mut HashMap<String, SearchResult>,
1449 candidate: SearchResult,
1450) {
1451 match merged_by_path.get_mut(&candidate.path) {
1452 Some(existing) if candidate.score > existing.score => *existing = candidate,
1453 Some(_) => {}
1454 None => {
1455 merged_by_path.insert(candidate.path.clone(), candidate);
1456 }
1457 }
1458}
1459
1460fn reranked_score(
1461 signals: &QuerySignals,
1462 active_room: Option<&str>,
1463 result: &SearchResult,
1464 is_semantic: bool,
1465) -> f32 {
1466 let base = if is_semantic {
1467 1.0 + result.score.clamp(0.0, 1.0)
1468 } else {
1469 (result.score / 10.0).clamp(0.0, 1.0)
1470 };
1471 base + room_bias(active_room, result)
1472 + retrieval_signal_boost(signals, result)
1473 + temporal_memory_boost(signals, result)
1474}
1475
1476fn room_bias(active_room: Option<&str>, result: &SearchResult) -> f32 {
1477 if active_room == Some(result.room.as_str()) {
1478 0.15
1479 } else {
1480 0.0
1481 }
1482}
1483
1484fn retrieval_signal_boost(signals: &QuerySignals, result: &SearchResult) -> f32 {
1485 let mut boost = 0.0f32;
1486 let haystack = format!(
1487 "{}\n{}",
1488 result.path.to_ascii_lowercase(),
1489 result.content.to_ascii_lowercase()
1490 );
1491
1492 let phrase_matches = signals
1493 .exact_phrases
1494 .iter()
1495 .filter(|phrase| haystack.contains(phrase.as_str()))
1496 .count();
1497 if phrase_matches > 0 {
1498 boost += 0.35 + ((phrase_matches.saturating_sub(1)) as f32 * 0.1);
1499 }
1500
1501 let mut standout_matches = 0;
1502 for term in &signals.standout_terms {
1503 if result.path.to_ascii_lowercase().contains(term.as_str()) {
1504 boost += 0.40;
1505 standout_matches += 1;
1506 } else if result.content.to_ascii_lowercase().contains(term.as_str()) {
1507 boost += 0.12;
1508 standout_matches += 1;
1509 }
1510 if standout_matches >= 3 {
1511 break;
1512 }
1513 }
1514
1515 if signals.historical_memory_hint && result.room == "session" {
1516 boost += 0.45;
1517 }
1518
1519 if let Some(qtype) = signals.query_memory_type {
1521 if !result.memory_type.is_empty() && result.memory_type == qtype {
1522 boost += 0.35;
1523 }
1524 }
1525
1526 boost
1527}
1528
1529fn temporal_memory_boost(signals: &QuerySignals, result: &SearchResult) -> f32 {
1530 if result.room != "session" {
1531 return 0.0;
1532 }
1533 let Some(reference) = signals.temporal_reference else {
1534 return 0.0;
1535 };
1536 let Some(memory_ts) = session_memory_timestamp(result) else {
1537 return 0.0;
1538 };
1539
1540 let span = reference.window_secs.max(86_400);
1541 let full_fade = span.saturating_mul(8);
1542 if full_fade <= 0 {
1543 return 0.0;
1544 }
1545
1546 let distance = (memory_ts - reference.target_ts).abs();
1547 let closeness = 1.0 - (distance as f32 / full_fade as f32).min(1.0);
1548 if closeness <= 0.0 {
1549 0.0
1550 } else {
1551 0.22 * closeness
1552 }
1553}
1554
/// Extracts quoted phrases from a query, lowercased and de-duplicated.
///
/// Recognised delimiters are `"`, `` ` `` and `'`. An apostrophe only opens a
/// phrase when it is not preceded by an alphanumeric character, and only
/// closes one when it is not followed by an alphanumeric character, so
/// contractions and possessives (`what's`, `don't`) are not mistaken for
/// quotes. A quote that is never closed yields the remainder of the query.
/// Phrases shorter than three bytes after trimming are dropped.
fn extract_exact_phrases(query: &str) -> Vec<String> {
    let mut phrases: Vec<String> = Vec::new();
    let chars: Vec<char> = query.chars().collect();
    let mut i = 0usize;

    while i < chars.len() {
        let quote = chars[i];
        let opens_phrase = match quote {
            '"' | '`' => true,
            // An apostrophe glued to the end of a word is part of that word.
            '\'' => i == 0 || !chars[i - 1].is_alphanumeric(),
            _ => false,
        };
        if !opens_phrase {
            i += 1;
            continue;
        }

        let start = i + 1;
        let mut end = start;
        while end < chars.len() {
            if chars[end] == quote {
                // Skip apostrophes inside a word, e.g. the one in `don't`.
                let inside_word = quote == '\''
                    && chars
                        .get(end + 1)
                        .is_some_and(|next| next.is_alphanumeric());
                if !inside_word {
                    break;
                }
            }
            end += 1;
        }

        if end > start {
            let phrase = chars[start..end]
                .iter()
                .collect::<String>()
                .trim()
                .to_ascii_lowercase();
            if phrase.len() >= 3 && !phrases.contains(&phrase) {
                phrases.push(phrase);
            }
        }
        i = end.saturating_add(1);
    }

    phrases
}
1586
/// Picks distinctive tokens out of a query, lowercased and de-duplicated.
///
/// Tokens are runs of ASCII alphanumerics and `_ - . / :`. A token is kept
/// when it is at least four bytes long, is not a stopword, and either
/// contains a digit, an uppercase letter or one of those symbols, or is at
/// least nine bytes long.
fn extract_standout_terms(query: &str) -> Vec<String> {
    const STOPWORDS: &[&str] = &[
        "about", "after", "before", "change", "changed", "decide", "decided", "does", "earlier",
        "flow", "from", "have", "into", "just", "last", "local", "make", "more", "remember",
        "should", "that", "their", "there", "these", "they", "this", "those", "what", "when",
        "where", "which", "why", "with", "work",
    ];

    let is_symbol = |ch: char| matches!(ch, '_' | '-' | '.' | '/' | ':');

    let tokens = query
        .split(|ch: char| !ch.is_ascii_alphanumeric() && !is_symbol(ch))
        .map(str::trim)
        .filter(|token| token.len() >= 4);

    let mut picked: Vec<String> = Vec::new();
    for token in tokens {
        let lowered = token.to_ascii_lowercase();
        if STOPWORDS.iter().any(|word| *word == lowered) {
            continue;
        }

        let distinctive = token.len() >= 9
            || token
                .chars()
                .any(|ch| ch.is_ascii_digit() || ch.is_ascii_uppercase() || is_symbol(ch));

        if distinctive && !picked.contains(&lowered) {
            picked.push(lowered);
        }
    }

    picked
}
1622
1623fn extract_temporal_reference(query: &str) -> Option<TemporalReference> {
1624 if let Some(ts) = extract_iso_date_from_query(query) {
1625 return Some(TemporalReference {
1626 target_ts: ts,
1627 window_secs: 86_400,
1628 });
1629 }
1630
1631 let now = current_unix_timestamp();
1632 let lower = query.to_ascii_lowercase();
1633 if lower.contains("yesterday") {
1634 Some(TemporalReference {
1635 target_ts: now.saturating_sub(86_400),
1636 window_secs: 86_400,
1637 })
1638 } else if lower.contains("today") || lower.contains("earlier today") {
1639 Some(TemporalReference {
1640 target_ts: now,
1641 window_secs: 86_400,
1642 })
1643 } else if lower.contains("last week") {
1644 Some(TemporalReference {
1645 target_ts: now.saturating_sub(7 * 86_400),
1646 window_secs: 7 * 86_400,
1647 })
1648 } else if lower.contains("last month") {
1649 Some(TemporalReference {
1650 target_ts: now.saturating_sub(30 * 86_400),
1651 window_secs: 30 * 86_400,
1652 })
1653 } else {
1654 None
1655 }
1656}
1657
1658fn extract_iso_date_from_query(query: &str) -> Option<i64> {
1659 query
1660 .split(|ch: char| !(ch.is_ascii_digit() || ch == '-'))
1661 .find_map(parse_iso_date_token)
1662}
1663
/// Parses a strict `YYYY-MM-DD` token into seconds since the Unix epoch for
/// the start of that day.
///
/// Returns `None` unless the token is exactly ten bytes, has `-` at positions
/// 4 and 7, has ASCII digits everywhere else, and names a real calendar date
/// (month 1–12, day valid for that month including leap-year February).
fn parse_iso_date_token(token: &str) -> Option<i64> {
    let bytes = token.as_bytes();
    if bytes.len() != 10 || bytes[4] != b'-' || bytes[7] != b'-' {
        return None;
    }
    // `str::parse` alone would accept a leading sign (`+123`, `+1`), so
    // require digits explicitly in every non-separator position.
    let digits_only = bytes
        .iter()
        .enumerate()
        .all(|(idx, byte)| matches!(idx, 4 | 7) || byte.is_ascii_digit());
    if !digits_only {
        return None;
    }

    let year = token.get(0..4)?.parse::<i32>().ok()?;
    let month = token.get(5..7)?.parse::<u32>().ok()?;
    let day = token.get(8..10)?.parse::<u32>().ok()?;
    if !(1..=12).contains(&month) {
        return None;
    }

    // Reject days past the end of the month; otherwise `days_from_civil`
    // would silently roll them over into the following month.
    let is_leap = year % 4 == 0 && (year % 100 != 0 || year % 400 == 0);
    let month_len = match month {
        2 => {
            if is_leap {
                29
            } else {
                28
            }
        }
        4 | 6 | 9 | 11 => 30,
        _ => 31,
    };
    if !(1..=month_len).contains(&day) {
        return None;
    }

    Some(days_from_civil(year, month, day).saturating_mul(86_400))
}

/// Converts a Gregorian calendar date to the number of days since
/// 1970-01-01 (negative for earlier dates).
///
/// Works on 400-year eras with the year shifted to start in March, so the
/// leap day falls at the end of the shifted year.
fn days_from_civil(year: i32, month: u32, day: u32) -> i64 {
    // January and February belong to the previous shifted year.
    let year = year - if month <= 2 { 1 } else { 0 };
    let era = if year >= 0 { year } else { year - 399 } / 400;
    let yoe = year - era * 400;
    // Month index counted from March (March = 0 ... February = 11).
    let month_prime = month as i32 + if month > 2 { -3 } else { 9 };
    let doy = (153 * month_prime + 2) / 5 + day as i32 - 1;
    let doe = yoe * 365 + yoe / 4 - yoe / 100 + doy;
    // 146_097 days per 400-year era; 719_468 days from 0000-03-01 to 1970-01-01.
    era as i64 * 146_097 + doe as i64 - 719_468
}
1692
/// Returns the current time as whole seconds since the Unix epoch, or `0`
/// when the system clock reports a time before the epoch.
fn current_unix_timestamp() -> i64 {
    match std::time::SystemTime::now().duration_since(std::time::UNIX_EPOCH) {
        Ok(elapsed) => elapsed.as_secs() as i64,
        Err(_) => 0,
    }
}
1699
1700fn session_memory_timestamp(result: &SearchResult) -> Option<i64> {
1701 extract_session_path_timestamp(&result.path).or_else(|| {
1702 if result.last_modified > 0 {
1703 Some(result.last_modified)
1704 } else {
1705 None
1706 }
1707 })
1708}
1709
1710fn extract_session_path_timestamp(path: &str) -> Option<i64> {
1711 let normalized = path.replace('\\', "/");
1712 let mut parts = normalized.split('/');
1713 if parts.next()? != "session" {
1714 return None;
1715 }
1716 parse_iso_date_token(parts.next()?)
1717}
1718
1719fn session_speaker_kind(speaker: &str) -> SessionSpeakerKind {
1720 let normalized = speaker.trim().to_ascii_lowercase();
1721 match normalized.as_str() {
1722 "you" | "user" => SessionSpeakerKind::User,
1723 "" | "system" | "tool" => SessionSpeakerKind::Ignore,
1724 _ => SessionSpeakerKind::Assistant,
1725 }
1726}
1727
1728fn load_session_exchanges(report_path: &Path, last_modified: i64) -> Vec<SessionExchange> {
1729 let Ok(raw) = std::fs::read_to_string(report_path) else {
1730 return Vec::new();
1731 };
1732 let Ok(report) = serde_json::from_str::<SessionReport>(&raw) else {
1733 return Vec::new();
1734 };
1735
1736 let session_key = report_path
1737 .file_stem()
1738 .and_then(|stem| stem.to_str())
1739 .and_then(|stem| stem.strip_prefix("session_").or(Some(stem)))
1740 .unwrap_or("unknown-session")
1741 .to_string();
1742 let session_date = report
1743 .session_start
1744 .split('_')
1745 .next()
1746 .filter(|date| !date.is_empty())
1747 .unwrap_or_else(|| session_key.split('_').next().unwrap_or("unknown-date"))
1748 .to_string();
1749
1750 let mut exchanges = Vec::new();
1751 let mut pending_user: Option<String> = None;
1752 let mut turn_index = 0usize;
1753
1754 for entry in report.transcript {
1755 match session_speaker_kind(&entry.speaker) {
1756 SessionSpeakerKind::User => {
1757 let text = entry.text.trim();
1758 if !text.is_empty() {
1759 pending_user = Some(text.to_string());
1760 }
1761 }
1762 SessionSpeakerKind::Assistant => {
1763 let text = entry.text.trim();
1764 if text.is_empty() {
1765 continue;
1766 }
1767 let Some(user_text) = pending_user.take() else {
1768 continue;
1769 };
1770 turn_index += 1;
1771 exchanges.push(SessionExchange {
1772 path: format!(
1773 "session/{}/{}/turn-{}",
1774 session_date, session_key, turn_index
1775 ),
1776 last_modified,
1777 content: format!(
1778 "Earlier session exchange\nUser:\n{}\n\nAssistant:\n{}",
1779 user_text, text
1780 ),
1781 });
1782 }
1783 SessionSpeakerKind::Ignore => {}
1784 }
1785 }
1786
1787 if exchanges.len() > Vein::SESSION_TURN_LIMIT {
1788 let keep_from = exchanges.len() - Vein::SESSION_TURN_LIMIT;
1789 exchanges = exchanges.into_iter().skip(keep_from).collect();
1790 }
1791
1792 exchanges
1793}
1794
1795fn load_imported_session_exchanges(
1796 import_path: &Path,
1797 imports_root: &Path,
1798 last_modified: i64,
1799) -> Vec<SessionExchange> {
1800 let Ok(raw) = std::fs::read_to_string(import_path) else {
1801 return Vec::new();
1802 };
1803
1804 let messages = normalize_import_messages(&raw, import_path);
1805 if messages.is_empty() {
1806 return Vec::new();
1807 }
1808
1809 let rel = import_path
1810 .strip_prefix(imports_root)
1811 .unwrap_or(import_path);
1812 let rel_slug = slugify_import_path(rel);
1813 let mut exchanges = Vec::new();
1814 let mut pending_user: Option<String> = None;
1815 let mut turn_index = 0usize;
1816
1817 for (role, text) in messages {
1818 let cleaned = text.trim();
1819 if cleaned.is_empty() {
1820 continue;
1821 }
1822 match role.as_str() {
1823 "user" => pending_user = Some(cleaned.to_string()),
1824 "assistant" => {
1825 let Some(user_text) = pending_user.take() else {
1826 continue;
1827 };
1828 turn_index += 1;
1829 exchanges.push(SessionExchange {
1830 path: format!("session/imports/{}/turn-{}", rel_slug, turn_index),
1831 last_modified,
1832 content: format!(
1833 "Imported session exchange\nSource: .hematite/imports/{}\n\nUser:\n{}\n\nAssistant:\n{}",
1834 rel.to_string_lossy().replace('\\', "/"),
1835 user_text,
1836 cleaned
1837 ),
1838 });
1839 }
1840 _ => {}
1841 }
1842 }
1843
1844 if exchanges.len() > Vein::SESSION_TURN_LIMIT {
1845 let keep_from = exchanges.len() - Vein::SESSION_TURN_LIMIT;
1846 exchanges = exchanges.into_iter().skip(keep_from).collect();
1847 }
1848
1849 exchanges
1850}
1851
1852fn normalize_import_messages(raw: &str, import_path: &Path) -> Vec<(String, String)> {
1853 if raw.trim().is_empty() {
1854 return Vec::new();
1855 }
1856
1857 if let Some(messages) = parse_marker_transcript(raw) {
1858 return messages;
1859 }
1860
1861 let ext = import_path
1862 .extension()
1863 .and_then(|ext| ext.to_str())
1864 .unwrap_or("")
1865 .to_ascii_lowercase();
1866
1867 if matches!(ext.as_str(), "json" | "jsonl")
1868 || matches!(raw.trim().chars().next(), Some('{') | Some('['))
1869 {
1870 if let Some(messages) = parse_jsonl_messages(raw) {
1871 if !messages.is_empty() {
1872 return messages;
1873 }
1874 }
1875
1876 if let Ok(value) = serde_json::from_str::<Value>(raw) {
1877 if let Some(messages) = parse_session_report_messages(&value) {
1878 return messages;
1879 }
1880 if let Some(messages) = parse_simple_role_messages(&value) {
1881 return messages;
1882 }
1883 if let Some(messages) = parse_chatgpt_mapping_messages(&value) {
1884 return messages;
1885 }
1886 }
1887 }
1888
1889 Vec::new()
1890}
1891
/// Parses a transcript in which user prompts are lines starting with `> `.
///
/// Returns `None` unless at least two prompt lines exist. Each prompt becomes
/// a `user` message; the non-empty lines that follow it (excluding `---`
/// separators) are joined with newlines into one `assistant` message. Lines
/// before the first prompt are ignored.
fn parse_marker_transcript(raw: &str) -> Option<Vec<(String, String)>> {
    let is_prompt = |line: &str| line.trim_start().starts_with("> ");
    if raw.lines().filter(|line| is_prompt(*line)).count() < 2 {
        return None;
    }

    let mut messages: Vec<(String, String)> = Vec::new();
    let mut reply: Vec<&str> = Vec::new();
    let mut seen_prompt = false;

    for line in raw.lines() {
        if let Some(prompt) = line.trim_start().strip_prefix("> ") {
            // A new prompt closes the reply collected for the previous one.
            if !reply.is_empty() {
                messages.push(("assistant".to_string(), reply.join("\n")));
                reply.clear();
            }
            messages.push(("user".to_string(), prompt.trim().to_string()));
            seen_prompt = true;
            continue;
        }
        if !seen_prompt {
            continue;
        }
        let text = line.trim();
        if !text.is_empty() && text != "---" {
            reply.push(text);
        }
    }
    if !reply.is_empty() {
        messages.push(("assistant".to_string(), reply.join("\n")));
    }

    (!messages.is_empty()).then_some(messages)
}
1932
1933fn parse_jsonl_messages(raw: &str) -> Option<Vec<(String, String)>> {
1934 let mut messages = Vec::new();
1935 let mut has_codex_session_meta = false;
1936 let mut saw_jsonl = false;
1937
1938 for line in raw.lines() {
1939 let trimmed = line.trim();
1940 if trimmed.is_empty() {
1941 continue;
1942 }
1943 let Ok(value) = serde_json::from_str::<Value>(trimmed) else {
1944 continue;
1945 };
1946 saw_jsonl = true;
1947 let Some(object) = value.as_object() else {
1948 continue;
1949 };
1950
1951 match object.get("type").and_then(|v| v.as_str()).unwrap_or("") {
1952 "session_meta" => {
1953 has_codex_session_meta = true;
1954 }
1955 "event_msg" => {
1956 let Some(payload) = object.get("payload").and_then(|v| v.as_object()) else {
1957 continue;
1958 };
1959 let Some(text) = payload.get("message").and_then(|v| v.as_str()) else {
1960 continue;
1961 };
1962 match payload.get("type").and_then(|v| v.as_str()).unwrap_or("") {
1963 "user_message" => messages.push(("user".to_string(), text.trim().to_string())),
1964 "agent_message" => {
1965 messages.push(("assistant".to_string(), text.trim().to_string()))
1966 }
1967 _ => {}
1968 }
1969 }
1970 "human" | "user" => {
1971 if let Some(text) = extract_text_content(object.get("message").unwrap_or(&value)) {
1972 messages.push(("user".to_string(), text));
1973 }
1974 }
1975 "assistant" => {
1976 if let Some(text) = extract_text_content(object.get("message").unwrap_or(&value)) {
1977 messages.push(("assistant".to_string(), text));
1978 }
1979 }
1980 _ => {
1981 if let Some(role) = object.get("role").and_then(|v| v.as_str()) {
1982 if let Some(text) = extract_text_content(&value) {
1983 match role {
1984 "user" | "human" => messages.push(("user".to_string(), text)),
1985 "assistant" | "ai" => messages.push(("assistant".to_string(), text)),
1986 _ => {}
1987 }
1988 }
1989 }
1990 }
1991 }
1992 }
1993
1994 if !saw_jsonl {
1995 return None;
1996 }
1997
1998 if has_codex_session_meta || !messages.is_empty() {
1999 return Some(messages);
2000 }
2001
2002 None
2003}
2004
2005fn parse_session_report_messages(value: &Value) -> Option<Vec<(String, String)>> {
2006 let report = value.as_object()?;
2007 let transcript = report.get("transcript")?.as_array()?;
2008 let mut messages = Vec::new();
2009
2010 for entry in transcript {
2011 let Some(obj) = entry.as_object() else {
2012 continue;
2013 };
2014 let speaker = obj
2015 .get("speaker")
2016 .and_then(|v| v.as_str())
2017 .unwrap_or_default();
2018 let text = obj
2019 .get("text")
2020 .and_then(|v| v.as_str())
2021 .unwrap_or_default()
2022 .trim()
2023 .to_string();
2024 if text.is_empty() {
2025 continue;
2026 }
2027 match session_speaker_kind(speaker) {
2028 SessionSpeakerKind::User => messages.push(("user".to_string(), text)),
2029 SessionSpeakerKind::Assistant => messages.push(("assistant".to_string(), text)),
2030 SessionSpeakerKind::Ignore => {}
2031 }
2032 }
2033
2034 (!messages.is_empty()).then_some(messages)
2035}
2036
2037fn parse_simple_role_messages(value: &Value) -> Option<Vec<(String, String)>> {
2038 if let Some(array) = value.as_array() {
2039 let messages = collect_role_messages(array);
2040 return (!messages.is_empty()).then_some(messages);
2041 }
2042
2043 let obj = value.as_object()?;
2044 if let Some(messages_value) = obj.get("messages").or_else(|| obj.get("chat_messages")) {
2045 let array = messages_value.as_array()?;
2046 let messages = collect_role_messages(array);
2047 return (!messages.is_empty()).then_some(messages);
2048 }
2049
2050 None
2051}
2052
2053fn collect_role_messages(items: &[Value]) -> Vec<(String, String)> {
2054 let mut messages = Vec::new();
2055 for item in items {
2056 let Some(obj) = item.as_object() else {
2057 continue;
2058 };
2059 let Some(role) = obj.get("role").and_then(|v| v.as_str()) else {
2060 continue;
2061 };
2062 let Some(text) = extract_text_content(item) else {
2063 continue;
2064 };
2065 match role {
2066 "user" | "human" => messages.push(("user".to_string(), text)),
2067 "assistant" | "ai" => messages.push(("assistant".to_string(), text)),
2068 _ => {}
2069 }
2070 }
2071 messages
2072}
2073
2074fn parse_chatgpt_mapping_messages(value: &Value) -> Option<Vec<(String, String)>> {
2075 let mapping = value.get("mapping")?.as_object()?;
2076 let mut current_id = mapping.iter().find_map(|(node_id, node)| {
2077 let obj = node.as_object()?;
2078 (obj.get("parent").is_some_and(|parent| parent.is_null())).then_some(node_id.clone())
2079 })?;
2080
2081 let mut messages = Vec::new();
2082 let mut visited = std::collections::HashSet::new();
2083
2084 while visited.insert(current_id.clone()) {
2085 let Some(node) = mapping.get(¤t_id).and_then(|v| v.as_object()) else {
2086 break;
2087 };
2088
2089 if let Some(message) = node.get("message") {
2090 let role = message
2091 .get("author")
2092 .and_then(|author| author.get("role"))
2093 .and_then(|v| v.as_str())
2094 .unwrap_or("");
2095 if let Some(text) = extract_text_content(message) {
2096 match role {
2097 "user" => messages.push(("user".to_string(), text)),
2098 "assistant" => messages.push(("assistant".to_string(), text)),
2099 _ => {}
2100 }
2101 }
2102 }
2103
2104 let Some(next_id) = node
2105 .get("children")
2106 .and_then(|children| children.as_array())
2107 .and_then(|children| children.first())
2108 .and_then(|child| child.as_str())
2109 else {
2110 break;
2111 };
2112 current_id = next_id.to_string();
2113 }
2114
2115 (!messages.is_empty()).then_some(messages)
2116}
2117
2118fn extract_text_content(value: &Value) -> Option<String> {
2119 if let Some(text) = value.as_str() {
2120 let trimmed = text.trim();
2121 return (!trimmed.is_empty()).then_some(trimmed.to_string());
2122 }
2123
2124 if let Some(array) = value.as_array() {
2125 let joined = array
2126 .iter()
2127 .filter_map(extract_text_content)
2128 .filter(|part| !part.is_empty())
2129 .collect::<Vec<_>>()
2130 .join("\n");
2131 return (!joined.is_empty()).then_some(joined);
2132 }
2133
2134 let obj = value.as_object()?;
2135
2136 if let Some(content) = obj.get("content") {
2137 if let Some(text) = extract_text_content(content) {
2138 return Some(text);
2139 }
2140 }
2141
2142 if let Some(text) = obj.get("text").and_then(|v| v.as_str()) {
2143 let trimmed = text.trim();
2144 if !trimmed.is_empty() {
2145 return Some(trimmed.to_string());
2146 }
2147 }
2148
2149 if let Some(parts) = obj.get("parts").and_then(|v| v.as_array()) {
2150 let joined = parts
2151 .iter()
2152 .filter_map(|part| part.as_str().map(|s| s.trim().to_string()))
2153 .filter(|part| !part.is_empty())
2154 .collect::<Vec<_>>()
2155 .join("\n");
2156 if !joined.is_empty() {
2157 return Some(joined);
2158 }
2159 }
2160
2161 None
2162}
2163
/// Turns a relative import path into a flat identifier.
///
/// Backslashes count as separators; every character other than ASCII
/// alphanumerics, `-`, `_` and separators becomes `_`; leading and trailing
/// separators are dropped and the remaining separators become `__`.
fn slugify_import_path(path: &Path) -> String {
    let mut sanitized = String::new();
    for ch in path.to_string_lossy().chars() {
        let ch = if ch == '\\' { '/' } else { ch };
        let keep = ch.is_ascii_alphanumeric() || matches!(ch, '/' | '-' | '_');
        sanitized.push(if keep { ch } else { '_' });
    }
    sanitized.trim_matches('/').replace('/', "__")
}
2179
2180fn embed_text_blocking(text: &str, base_url: &str, embed_model: &str) -> Option<Vec<f32>> {
2190 embed_text_with_prefix(text, "search_document", base_url, embed_model)
2191}
2192
2193fn embed_query_blocking(text: &str, base_url: &str, embed_model: &str) -> Option<Vec<f32>> {
2194 embed_text_with_prefix(text, "search_query", base_url, embed_model)
2195}
2196
2197fn embed_text_with_prefix(
2198 text: &str,
2199 task: &str,
2200 base_url: &str,
2201 embed_model: &str,
2202) -> Option<Vec<f32>> {
2203 let prefixed = format!("{}: {}", task, text);
2205 let input = if prefixed.len() > 8000 {
2207 &prefixed[..8000]
2208 } else {
2209 &prefixed
2210 };
2211
2212 let client = reqwest::blocking::Client::builder()
2213 .timeout(std::time::Duration::from_secs(10))
2214 .build()
2215 .ok()?;
2216
2217 let body = serde_json::json!({
2218 "model": embed_model,
2219 "input": input
2220 });
2221
2222 let trimmed = base_url.trim_end_matches('/');
2223 let is_ollama = trimmed.contains("11434");
2224 let url = if is_ollama {
2225 format!("{}/api/embed", trimmed)
2226 } else {
2227 format!("{}/v1/embeddings", trimmed)
2228 };
2229 let resp = client.post(&url).json(&body).send().ok()?;
2230
2231 if !resp.status().is_success() {
2232 return None;
2233 }
2234
2235 let json: serde_json::Value = resp.json().ok()?;
2236 let embedding = if is_ollama {
2237 json["embeddings"][0].as_array()?
2238 } else {
2239 json["data"][0]["embedding"].as_array()?
2240 };
2241 let vec: Vec<f32> = embedding
2242 .iter()
2243 .filter_map(|v| v.as_f64().map(|f| f as f32))
2244 .collect();
2245
2246 if vec.is_empty() {
2247 None
2248 } else {
2249 Some(vec)
2250 }
2251}
2252
/// Returns the cosine similarity of two vectors.
///
/// Yields `0.0` when the slices are empty, differ in length, or either has
/// zero magnitude.
fn cosine_similarity(a: &[f32], b: &[f32]) -> f32 {
    if a.is_empty() || a.len() != b.len() {
        return 0.0;
    }

    let magnitude = |v: &[f32]| v.iter().map(|x| x * x).sum::<f32>().sqrt();
    let (norm_a, norm_b) = (magnitude(a), magnitude(b));
    if norm_a == 0.0 || norm_b == 0.0 {
        return 0.0;
    }

    let dot: f32 = a.iter().zip(b.iter()).map(|(x, y)| x * y).sum();
    dot / (norm_a * norm_b)
}
2268
/// Serialises floats as consecutive 4-byte little-endian values.
fn floats_to_blob(floats: &[f32]) -> Vec<u8> {
    let mut blob = Vec::with_capacity(floats.len() * 4);
    for value in floats {
        blob.extend_from_slice(&value.to_le_bytes());
    }
    blob
}
2272
/// Decodes consecutive 4-byte little-endian floats; trailing bytes that do
/// not fill a whole group of four are ignored.
fn blob_to_floats(blob: &[u8]) -> Vec<f32> {
    let mut floats = Vec::with_capacity(blob.len() / 4);
    for word in blob.chunks_exact(4) {
        let mut bytes = [0u8; 4];
        bytes.copy_from_slice(word);
        floats.push(f32::from_le_bytes(bytes));
    }
    floats
}
2278
/// Normalises line endings to `\n` and strips surrounding whitespace and NUL
/// characters. Returns `None` when nothing is left.
fn normalize_extracted_document_text(text: String) -> Option<String> {
    let unified = text.replace("\r\n", "\n").replace('\r', "\n");
    let is_padding = |c: char| c == '\0' || c.is_whitespace();
    let core = unified.trim_matches(is_padding);
    (!core.is_empty()).then(|| core.to_string())
}
2293
2294fn extract_pdf_text_with_pdf_extract(path: &std::path::Path) -> Result<Option<String>, String> {
2295 let previous_hook = std::panic::take_hook();
2296 std::panic::set_hook(Box::new(|_| {}));
2297 let result = std::panic::catch_unwind(std::panic::AssertUnwindSafe(|| {
2298 pdf_extract::extract_text(path)
2299 }));
2300 std::panic::set_hook(previous_hook);
2301
2302 match result {
2303 Ok(Ok(text)) => Ok(normalize_extracted_document_text(text)),
2304 Ok(Err(e)) => Err(format!("pdf-extract failed: {}", e)),
2305 Err(payload) => {
2306 let panic_text = if let Some(msg) = payload.downcast_ref::<&str>() {
2307 (*msg).to_string()
2308 } else if let Some(msg) = payload.downcast_ref::<String>() {
2309 msg.clone()
2310 } else {
2311 "unknown parser panic".to_string()
2312 };
2313 Err(format!("pdf-extract panicked: {}", panic_text))
2314 }
2315 }
2316}
2317
2318fn extract_pdf_text_with_lopdf(path: &std::path::Path) -> Result<Option<String>, String> {
2319 let mut doc =
2320 lopdf::Document::load(path).map_err(|e| format!("lopdf could not open PDF: {}", e))?;
2321
2322 if doc.is_encrypted() {
2323 doc.decrypt("")
2324 .map_err(|e| format!("PDF is encrypted and could not be decrypted: {}", e))?;
2325 }
2326
2327 let page_numbers: Vec<u32> = doc.get_pages().keys().copied().collect();
2328 if page_numbers.is_empty() {
2329 return Ok(None);
2330 }
2331
2332 let mut extracted_pages = Vec::new();
2333 let mut page_errors = Vec::new();
2334
2335 for page_number in page_numbers {
2336 match doc.extract_text(&[page_number]) {
2337 Ok(text) => {
2338 if let Some(page_text) = normalize_extracted_document_text(text) {
2339 extracted_pages.push(page_text);
2340 }
2341 }
2342 Err(e) => page_errors.push(format!("page {page_number}: {e}")),
2343 }
2344 }
2345
2346 if !extracted_pages.is_empty() {
2347 return Ok(Some(extracted_pages.join("\n\n")));
2348 }
2349
2350 if !page_errors.is_empty() {
2351 let sample_errors = page_errors
2352 .into_iter()
2353 .take(3)
2354 .collect::<Vec<_>>()
2355 .join("; ");
2356 return Err(format!(
2357 "lopdf could not extract usable page text ({sample_errors})"
2358 ));
2359 }
2360
2361 Ok(None)
2362}
2363
2364fn extract_pdf_text_inside_helper(path: &std::path::Path) -> Result<Option<String>, String> {
2365 let mut failures = Vec::new();
2366
2367 match extract_pdf_text_with_pdf_extract(path) {
2368 Ok(Some(text)) => return Ok(Some(text)),
2369 Ok(None) => failures.push("pdf-extract found no usable text".to_string()),
2370 Err(e) => failures.push(e),
2371 }
2372
2373 match extract_pdf_text_with_lopdf(path) {
2374 Ok(Some(text)) => return Ok(Some(text)),
2375 Ok(None) => failures.push("lopdf found no usable text".to_string()),
2376 Err(e) => failures.push(e),
2377 }
2378
2379 let detail = failures.into_iter().take(2).collect::<Vec<_>>().join("; ");
2380 Err(format!(
2381 "Could not extract text from PDF. Hematite keeps PDF parsing best-effort so it can stay a lightweight single-binary local coding harness. The file may be scanned/image-only, encrypted, or use unsupported font encoding. Try exporting it to text/markdown or attach page images instead. Detail: {}",
2382 detail
2383 ))
2384}
2385
/// Extracts PDF text by re-running the current executable with
/// `--pdf-extract-helper <path>` and reading its stdout.
///
/// A non-zero exit becomes an error carrying the helper's stderr (or a
/// generic message when stderr is empty). Whitespace-only output yields
/// `Ok(None)`.
fn extract_pdf_text(path: &std::path::Path) -> Result<Option<String>, String> {
    let exe = match std::env::current_exe() {
        Ok(exe) => exe,
        Err(e) => {
            return Err(format!(
                "Could not locate Hematite executable for PDF helper: {}",
                e
            ))
        }
    };

    let mut helper = std::process::Command::new(exe);
    helper
        .arg("--pdf-extract-helper")
        .arg(path)
        .stdin(std::process::Stdio::null())
        .stdout(std::process::Stdio::piped())
        .stderr(std::process::Stdio::piped());
    let output = helper
        .output()
        .map_err(|e| format!("Could not launch PDF helper: {}", e))?;

    if output.status.success() {
        let text = String::from_utf8(output.stdout)
            .map_err(|e| format!("PDF helper returned non-UTF8 text: {}", e))?;
        return if text.trim().is_empty() {
            Ok(None)
        } else {
            Ok(Some(text))
        };
    }

    let stderr = String::from_utf8_lossy(&output.stderr).trim().to_string();
    if stderr.is_empty() {
        Err("PDF extraction failed.".to_string())
    } else {
        Err(stderr)
    }
}
2415
2416pub fn run_pdf_extract_helper(path: &std::path::Path) -> i32 {
2417 match extract_pdf_text_inside_helper(path) {
2418 Ok(Some(text)) => {
2419 use std::io::Write;
2420 let mut stdout = std::io::stdout();
2421 if stdout.write_all(text.as_bytes()).is_ok() {
2422 0
2423 } else {
2424 let _ = writeln!(
2425 std::io::stderr(),
2426 "PDF helper could not write extracted text."
2427 );
2428 1
2429 }
2430 }
2431 Ok(None) => {
2432 eprintln!(
2433 "Could not extract text from PDF. Hematite keeps PDF parsing best-effort so it can stay a lightweight single-binary local coding harness. The file appears to contain no usable embedded text. Try exporting it to text/markdown or attach page images instead."
2434 );
2435 1
2436 }
2437 Err(e) => {
2438 eprintln!("{}", e);
2439 1
2440 }
2441 }
2442}
2443
2444pub fn extract_document_text(path: &std::path::Path) -> Result<String, String> {
2447 let ext = path
2448 .extension()
2449 .and_then(|e| e.to_str())
2450 .unwrap_or("")
2451 .to_lowercase();
2452 match ext.as_str() {
2453 "pdf" => {
2454 let text = extract_pdf_text(path)?.ok_or_else(|| {
2455 "PDF contains no extractable text — it may be scanned/image-only. \
2456 Try attaching page screenshots with /image instead."
2457 .to_string()
2458 })?;
2459 pdf_quality_check(text)
2460 }
2461 _ => std::fs::read_to_string(path).map_err(|e| format!("Could not read file: {e}")),
2462 }
2463}
2464
/// Rejects PDF text that is too short or looks garbled.
///
/// Fails when the trimmed text is under 150 bytes, or when spaces make up
/// less than 4% of its characters (line breaks excluded). Otherwise returns
/// the original, untrimmed text.
fn pdf_quality_check(text: String) -> Result<String, String> {
    let trimmed = text.trim();
    let length = trimmed.len();

    if length < 150 {
        return Err(format!(
            "PDF extracted only {} characters — likely a scanned or image-only PDF, \
             or uses unsupported custom fonts. Try attaching page screenshots with /image instead.",
            length
        ));
    }

    // Count spaces and all non-line-break characters in a single pass.
    let (mut visible, mut spaces) = (0usize, 0usize);
    for ch in trimmed.chars() {
        match ch {
            '\n' | '\r' => {}
            ' ' => {
                visible += 1;
                spaces += 1;
            }
            _ => visible += 1,
        }
    }
    let space_ratio = if visible > 0 {
        spaces as f32 / visible as f32
    } else {
        0.0
    };

    if space_ratio < 0.04 {
        Err(
            "PDF text extraction produced garbled output — words are merged with no spaces. \
             This usually means the PDF uses custom embedded fonts (common with academic publishers \
             like EBSCO, Elsevier, Springer). \
             Try a PDF exported from Word, Google Docs, or LaTeX, \
             or attach page screenshots with /image instead.".to_string()
        )
    } else {
        Ok(text)
    }
}
2503
2504fn chunk_by_symbols(ext: &str, text: &str) -> Vec<String> {
2508 if ext == "rs" {
2509 chunk_rust_symbols(text)
2510 } else {
2511 chunk_paragraphs(text)
2512 }
2513}
2514
2515fn chunk_rust_symbols(text: &str) -> Vec<String> {
2524 const ITEM_STARTS: &[&str] = &[
2525 "pub fn ",
2526 "pub async fn ",
2527 "pub unsafe fn ",
2528 "async fn ",
2529 "unsafe fn ",
2530 "fn ",
2531 "pub impl",
2532 "impl ",
2533 "pub struct ",
2534 "struct ",
2535 "pub enum ",
2536 "enum ",
2537 "pub trait ",
2538 "trait ",
2539 "pub mod ",
2540 "mod ",
2541 "pub type ",
2542 "type ",
2543 "pub const ",
2544 "const ",
2545 "pub static ",
2546 "static ",
2547 ];
2548
2549 let lines: Vec<&str> = text.lines().collect();
2550 let mut chunks: Vec<String> = Vec::new();
2551 let mut current: Vec<&str> = Vec::new();
2552
2553 for &line in &lines {
2554 let top_level = !line.starts_with(' ') && !line.starts_with('\t');
2555 let is_item = top_level && ITEM_STARTS.iter().any(|s| line.starts_with(s));
2556
2557 if is_item && !current.is_empty() {
2558 let mut split = current.len();
2561 while split > 0 {
2562 let prev = current[split - 1].trim();
2563 if prev.starts_with("///")
2564 || prev.starts_with("//!")
2565 || prev.starts_with("#[")
2566 || prev.is_empty()
2567 {
2568 split -= 1;
2569 } else {
2570 break;
2571 }
2572 }
2573 let body = current[..split].join("\n");
2574 if !body.trim().is_empty() {
2575 chunks.push(body);
2576 }
2577 current = current[split..].to_vec();
2578 }
2579 current.push(line);
2580 }
2581 if !current.is_empty() {
2582 let body = current.join("\n");
2583 if !body.trim().is_empty() {
2584 chunks.push(body);
2585 }
2586 }
2587
2588 let mut result = Vec::new();
2590 for chunk in chunks {
2591 if chunk.len() > 3000 {
2592 result.extend(sliding_window_chunks(&chunk, 2000, 200));
2593 } else {
2594 result.push(chunk);
2595 }
2596 }
2597 result
2598}
2599
2600fn chunk_paragraphs(text: &str) -> Vec<String> {
2602 let mut result: Vec<String> = Vec::new();
2603 let mut current = String::new();
2604
2605 for para in text.split("\n\n") {
2606 if current.len() + para.len() + 2 > 2000 {
2607 if !current.trim().is_empty() {
2608 result.push(current.clone());
2609 }
2610 current = para.to_string();
2611 } else {
2612 if !current.is_empty() {
2613 current.push_str("\n\n");
2614 }
2615 current.push_str(para);
2616 }
2617 }
2618 if !current.trim().is_empty() {
2619 result.push(current);
2620 }
2621
2622 let mut final_result = Vec::new();
2623 for chunk in result {
2624 if chunk.len() > 2000 {
2625 final_result.extend(sliding_window_chunks(&chunk, 2000, 200));
2626 } else {
2627 final_result.push(chunk);
2628 }
2629 }
2630 final_result
2631}
2632
/// Splits `text` into overlapping windows measured in characters (Unicode
/// scalar values), so multi-byte characters are never cut in half.
///
/// Each chunk holds at most `chunk_size` characters, and each new chunk starts
/// `chunk_size - overlap` characters after the previous one. The last chunk
/// ends exactly at the end of the text. Empty input yields an empty vector.
///
/// Degenerate arguments are clamped instead of panicking or looping forever:
/// a `chunk_size` of 0 is treated as 1, and an `overlap` that is greater than
/// or equal to the window advances by a single character per chunk.
fn sliding_window_chunks(text: &str, chunk_size: usize, overlap: usize) -> Vec<String> {
    let chars: Vec<char> = text.chars().collect();

    // A zero-width window would emit empty chunks without ever reaching the end.
    let window = chunk_size.max(1);
    // `chunk_size - overlap` underflows when overlap > chunk_size and is zero
    // when they are equal; either way the loop must still make progress.
    let stride = window.saturating_sub(overlap).max(1);

    let mut result = Vec::new();
    let mut start = 0;
    while start < chars.len() {
        let end = start.saturating_add(window).min(chars.len());
        result.push(chars[start..end].iter().collect());
        if end == chars.len() {
            break;
        }
        start += stride;
    }
    result
}