1use rusqlite::{params, Connection};
2use serde::Deserialize;
3use serde_json::Value;
4use std::collections::{HashMap, HashSet};
5use std::path::Path;
6
/// Handle to the SQLite-backed index holding chunk metadata, full-text content,
/// embeddings and per-file heat counters.
pub struct Vein {
    /// Shared SQLite connection; every method locks this mutex before touching the database.
    db: std::sync::Arc<std::sync::Mutex<Connection>>,
    /// Base URL passed to the embedding helpers whenever text or a query is embedded.
    base_url: String,
}
32
// SAFETY: NOTE(review) — the only fields are `Arc<Mutex<Connection>>` and `String`, and in
// the code visible here the connection is reached exclusively through the mutex. These manual
// impls are presumably redundant if `Connection` is itself `Send` — confirm and consider
// removing them so the compiler derives the auto traits.
unsafe impl Send for Vein {}
unsafe impl Sync for Vein {}
37
/// One chunk returned by a search.
#[derive(Debug, Clone)]
pub struct SearchResult {
    /// Index path of the document the chunk belongs to.
    pub path: String,
    /// Text of the matching chunk as stored in `chunks_fts`.
    pub content: String,
    /// Relevance score, higher is better: negated FTS rank for BM25 hits, cosine
    /// similarity for semantic hits, the reranked value after `search_context`.
    pub score: f32,
    /// Room recorded for the document in `chunks_meta`.
    pub room: String,
    /// `last_modified` value recorded for the document in `chunks_meta`.
    pub last_modified: i64,
    /// Memory-type label recorded for the document; empty when none was detected.
    pub memory_type: String,
}
52
/// A frequently edited file as reported by `inspect_snapshot`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct VeinHotFile {
    /// Index path of the file.
    pub path: String,
    /// Heat counter from `file_heat` (incremented by `bump_heat`).
    pub heat: i64,
    /// `last_modified` value recorded for the file in `chunks_meta`.
    pub last_modified: i64,
    /// Room recorded for the file in `chunks_meta`.
    pub room: String,
}
60
/// Point-in-time summary of the index produced by `Vein::inspect_snapshot`.
#[derive(Debug, Clone)]
pub struct VeinInspectionSnapshot {
    /// Indexed paths that are neither under `session/` nor under `.hematite/docs/`.
    pub indexed_source_files: usize,
    /// Indexed paths under `.hematite/docs/`.
    pub indexed_docs: usize,
    /// Indexed paths under `session/`.
    pub indexed_session_exchanges: usize,
    /// Stored embeddings whose path is not under `session/`.
    pub embedded_source_doc_chunks: usize,
    /// Whether `chunks_vec` contains at least one row.
    pub has_any_embeddings: bool,
    /// Room with the highest summed heat, if any heat has been recorded.
    pub active_room: Option<String>,
    /// Hottest files, ordered by heat and then by modification time (both descending).
    pub hot_files: Vec<VeinHotFile>,
    /// `true` when `hot_files` is non-empty.
    pub l1_ready: bool,
}
72
/// Features extracted from a search query and handed to the reranker.
#[derive(Debug, Default)]
struct QuerySignals {
    /// Result of `extract_exact_phrases` for the query.
    exact_phrases: Vec<String>,
    /// Result of `extract_standout_terms` for the query.
    standout_terms: Vec<String>,
    /// Set when the query contains a recall cue such as "remember" or "last time".
    historical_memory_hint: bool,
    /// Result of `extract_temporal_reference` for the query.
    temporal_reference: Option<TemporalReference>,
    /// Memory-type label the query appears to ask about
    /// (`"decision"`, `"problem"`, `"milestone"` or `"preference"`).
    query_memory_type: Option<&'static str>,
}
83
/// A point in time referenced by a query, together with a tolerance.
///
/// NOTE(review): the units are not established in the visible part of this file —
/// presumably `target_ts` is a UNIX timestamp in seconds and `window_secs` the accepted
/// distance around it; confirm against `extract_temporal_reference`.
#[derive(Debug, Clone, Copy)]
struct TemporalReference {
    target_ts: i64,
    window_secs: i64,
}
89
/// Deserialized shape of a session report file; both fields fall back to their
/// default value when missing from the input.
#[derive(Debug, Deserialize)]
struct SessionReport {
    /// Session start marker exactly as written in the report (kept as a string).
    #[serde(default)]
    session_start: String,
    /// Transcript entries in the order they appear in the report.
    #[serde(default)]
    transcript: Vec<SessionTranscriptEntry>,
}
97
/// One transcript line of a session report; missing fields deserialize to empty strings.
#[derive(Debug, Deserialize)]
struct SessionTranscriptEntry {
    /// Speaker label as written in the report.
    #[serde(default)]
    speaker: String,
    /// Text spoken by `speaker`.
    #[serde(default)]
    text: String,
}
105
/// A unit of session history that is indexed as a single chunk in the `session` room.
#[derive(Debug)]
struct SessionExchange {
    /// Index path under which the exchange is stored.
    path: String,
    /// Timestamp stored as the exchange's `last_modified` in `chunks_meta`.
    last_modified: i64,
    /// Text indexed for the exchange.
    content: String,
}
112
/// Classification of a transcript speaker.
///
/// NOTE(review): not used in the visible part of this file — presumably `Ignore` marks
/// entries that are skipped when exchanges are assembled; confirm at the use site.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum SessionSpeakerKind {
    User,
    Assistant,
    Ignore,
}
119
/// Classifies a path into a coarse "room" label.
///
/// The path is lowercased and backslashes are normalised to `/` before matching. Every
/// rule that matches proposes a room with a fixed score and the highest score wins:
/// `session` 100, `tests` 85, `automation` 84, `release` 82, `docs` 80, `config` 76,
/// `memory` 72, `ui` 70, `tools` 68, `integration` 67, `runtime` 66, `agent` 60,
/// `libs` 58, `scripts` 55.
///
/// When no rule matches, the first path component is returned, unless it is empty or
/// contains a dot, in which case the room is `"root"`.
pub fn detect_room(path: &str) -> String {
    let lower = path.to_lowercase().replace('\\', "/");
    let filename = lower.rsplit('/').next().unwrap_or(&lower);
    // Only the text after the last dot is an extension. A dot-less file name such as `md`
    // or `iss` has no extension and must not be mistaken for a document or installer script.
    let ext = filename.rsplit_once('.').map_or("", |(_, ext)| ext);

    let mut best_room = None::<&str>;
    let mut best_score = 0i32;
    // Keeps the highest-scoring proposal seen so far.
    let mut consider = |room: &'static str, score: i32| {
        if score > best_score {
            best_score = score;
            best_room = Some(room);
        }
    };

    // True when `segment` is the whole path or one of its directory components.
    let is_component = |segment: &str| {
        lower == segment
            || lower.starts_with(&format!("{segment}/"))
            || lower.contains(&format!("/{segment}/"))
    };

    if lower.starts_with("session/")
        || lower.starts_with(".hematite/reports/")
        || lower.starts_with(".hematite/imports/")
        || is_component("reports")
        || is_component("imports")
    {
        consider("session", 100);
    }

    if lower.starts_with(".hematite/docs/")
        || is_component("docs")
        || matches!(filename, "readme.md" | "claude.md" | ".hematite.md")
        || matches!(ext, "md" | "markdown" | "pdf" | "rst")
    {
        consider("docs", 80);
    }

    if is_component("tests")
        || filename.contains("diagnostic")
        || filename.ends_with("_test.rs")
        || filename.ends_with(".test.ts")
    {
        consider("tests", 85);
    }

    if lower.starts_with(".github/workflows/")
        || is_component("workflows")
        || filename == ".pre-commit-config.yaml"
        || filename == ".pre-commit-config.yml"
        || filename.contains("hook")
    {
        consider("automation", 84);
    }

    if lower.starts_with("installer/")
        || lower.starts_with("dist/")
        || lower.starts_with("scripts/package-")
        || filename.contains("release")
        || filename.contains("bump-version")
        || ext == "iss"
    {
        consider("release", 82);
    }

    if matches!(
        filename,
        "cargo.toml"
            | "cargo.lock"
            | "package.json"
            | "pnpm-lock.yaml"
            | "yarn.lock"
            | "bun.lock"
            | "bun.lockb"
            | "pyproject.toml"
            | "setup.py"
            | "go.mod"
            | "pom.xml"
            | "build.gradle"
            | "build.gradle.kts"
            | "cmakelists.txt"
            | ".gitignore"
            | "settings.json"
            | "mcp_servers.json"
    ) || filename.ends_with(".sln")
        || filename.ends_with(".csproj")
        || filename.contains("config")
    {
        consider("config", 76);
    }

    if is_component("ui")
        || matches!(
            filename,
            "tui.rs" | "voice.rs" | "hatch.rs" | "gpu_monitor.rs"
        )
    {
        consider("ui", 70);
    }

    if is_component("memory") || matches!(filename, "vein.rs" | "deep_reflect.rs") {
        consider("memory", 72);
    }

    if is_component("tools")
        || matches!(
            filename,
            "verify_build.rs"
                | "host_inspect.rs"
                | "shell.rs"
                | "code_sandbox.rs"
                | "project_map.rs"
                | "runtime_trace.rs"
        )
    {
        consider("tools", 68);
    }

    if filename.contains("mcp")
        || filename.contains("lsp")
        || lower.contains("/mcp/")
        || lower.contains("/lsp/")
    {
        consider("integration", 67);
    }

    if matches!(filename, "main.rs" | "runtime.rs" | "inference.rs")
        || filename.contains("startup")
        || filename.contains("runtime")
    {
        consider("runtime", 66);
    }

    if is_component("agent") {
        consider("agent", 60);
    }

    if lower.starts_with("libs/") || is_component("libs") {
        consider("libs", 58);
    }

    if lower.starts_with("scripts/") || is_component("scripts") {
        consider("scripts", 55);
    }

    if let Some(room) = best_room {
        return room.to_string();
    }

    // No rule matched: fall back to the first path component, or "root" when that
    // component is empty or looks like a file name (contains a dot).
    lower
        .split('/')
        .next()
        .filter(|s| !s.is_empty() && !s.contains('.'))
        .unwrap_or("root")
        .to_string()
}
280
/// Labels a piece of text with a memory type based on case-insensitive cue phrases.
///
/// Cue groups are checked in priority order — `"decision"`, `"problem"`, `"milestone"`,
/// `"preference"` — and the first group with a hit wins. Returns `""` when nothing matches.
///
/// All cues are substring matches except `oom`, which must stand alone as a word so that
/// ordinary words containing it (`room`, `zoom`, `bloom`) are not reported as problems.
pub fn detect_memory_type(text: &str) -> &'static str {
    const DECISION_PATTERNS: &[&str] = &[
        "let's use ",
        "we'll use ",
        "decided to ",
        "going with ",
        "we agreed ",
        "the plan is",
        "we're going to",
        "switching to",
        "we chose",
        "final decision",
        "we settled on",
        "agreed on",
        "we decided",
    ];
    const PROBLEM_PATTERNS: &[&str] = &[
        "bug fixed",
        "bug was",
        "the issue was",
        "root cause",
        "error was",
        "turned out to be",
        "the fix was",
        "was caused by",
        "broken because",
        "fixed by",
        "the problem was",
        "found the bug",
        "port conflict",
        "crash",
        "panicked",
        "segfault",
        "out of memory",
    ];
    const MILESTONE_PATTERNS: &[&str] = &[
        "now working",
        "successfully",
        "shipped",
        "deployed",
        "it works",
        "tests pass",
        "all green",
        "breakthrough",
        "finally got",
        "got it working",
        "completed",
        "finished",
        "done with",
        "landed",
    ];
    const PREFERENCE_PATTERNS: &[&str] = &[
        "i prefer",
        "i like",
        "i don't like",
        "i want",
        "always use",
        "never use",
        "i usually",
        "my preference",
        "keep it",
        "avoid using",
    ];

    /// True when `word` occurs in `haystack` with no alphanumeric character directly
    /// before or after it.
    fn contains_word(haystack: &str, word: &str) -> bool {
        haystack.match_indices(word).any(|(start, _)| {
            let before = haystack[..start].chars().next_back();
            let after = haystack[start + word.len()..].chars().next();
            before.map_or(true, |c| !c.is_alphanumeric())
                && after.map_or(true, |c| !c.is_alphanumeric())
        })
    }

    let lower = text.to_lowercase();
    let mentions = |patterns: &[&str]| patterns.iter().any(|pat| lower.contains(pat));

    if mentions(DECISION_PATTERNS) {
        return "decision";
    }
    if mentions(PROBLEM_PATTERNS) || contains_word(&lower, "oom") {
        return "problem";
    }
    if mentions(MILESTONE_PATTERNS) {
        return "milestone";
    }
    if mentions(PREFERENCE_PATTERNS) {
        return "preference";
    }

    ""
}
382
383impl Vein {
    /// Maximum number of session report files considered per indexing pass.
    const SESSION_REPORT_LIMIT: usize = 5;
    /// NOTE(review): not referenced in the visible part of this file — presumably caps the
    /// number of turns taken from one session; confirm at the use site.
    const SESSION_TURN_LIMIT: usize = 50;
    /// Maximum number of imported export files considered per indexing pass.
    const IMPORT_FILE_LIMIT: usize = 12;
    /// Imported files larger than this many bytes (10 MiB) are skipped.
    const IMPORT_MAX_BYTES: u64 = 10 * 1024 * 1024;
388
389 pub fn new<P: AsRef<Path>>(
390 db_path: P,
391 base_url: String,
392 ) -> Result<Self, Box<dyn std::error::Error>> {
393 let db = Connection::open(db_path)?;
394
395 db.execute_batch("PRAGMA journal_mode=WAL; PRAGMA synchronous=NORMAL;")?;
397
398 db.execute_batch(
402 "CREATE TABLE IF NOT EXISTS chunks_meta (
403 path TEXT PRIMARY KEY,
404 last_modified INTEGER NOT NULL,
405 room TEXT NOT NULL DEFAULT 'root'
406 );
407 CREATE VIRTUAL TABLE IF NOT EXISTS chunks_fts USING fts5(
408 path UNINDEXED,
409 content,
410 tokenize='porter ascii'
411 );
412 CREATE TABLE IF NOT EXISTS chunks_vec (
413 path TEXT NOT NULL,
414 chunk_idx INTEGER NOT NULL,
415 embedding BLOB NOT NULL,
416 PRIMARY KEY (path, chunk_idx)
417 );
418 CREATE TABLE IF NOT EXISTS file_heat (
419 path TEXT PRIMARY KEY,
420 heat INTEGER NOT NULL DEFAULT 0,
421 last_edit INTEGER NOT NULL DEFAULT 0
422 );",
423 )?;
424
425 let _ = db
427 .execute_batch("ALTER TABLE chunks_meta ADD COLUMN room TEXT NOT NULL DEFAULT 'root';");
428 let _ = db.execute_batch(
429 "ALTER TABLE file_heat ADD COLUMN last_edit INTEGER NOT NULL DEFAULT 0;",
430 );
431 let _ = db.execute_batch(
432 "ALTER TABLE chunks_meta ADD COLUMN memory_type TEXT NOT NULL DEFAULT '';",
433 );
434
435 Ok(Self {
436 db: std::sync::Arc::new(std::sync::Mutex::new(db)),
437 base_url,
438 })
439 }
440
441 pub fn index_document(
446 &mut self,
447 path: &str,
448 last_modified: i64,
449 full_text: &str,
450 ) -> Result<Vec<String>, Box<dyn std::error::Error>> {
451 let room = detect_room(path);
452 let ext = std::path::Path::new(path)
453 .extension()
454 .and_then(|e| e.to_str())
455 .unwrap_or("");
456 let chunks = chunk_by_symbols(ext, full_text);
457 let memory_type = if room == "session" {
459 detect_memory_type(full_text)
460 } else {
461 ""
462 };
463 self.index_chunks_with_room_and_type(path, last_modified, &room, memory_type, &chunks)
464 }
465
466 fn index_chunks_with_room_and_type(
467 &mut self,
468 path: &str,
469 last_modified: i64,
470 room: &str,
471 memory_type: &str,
472 chunks: &[String],
473 ) -> Result<Vec<String>, Box<dyn std::error::Error>> {
474 let db = self.db.lock().unwrap();
475 let existing: Option<i64> = db
476 .query_row(
477 "SELECT last_modified FROM chunks_meta WHERE path = ?1",
478 params![path],
479 |r| r.get(0),
480 )
481 .ok();
482
483 if let Some(ts) = existing {
484 if ts >= last_modified {
485 return Ok(Vec::new()); }
487 }
488
489 db.execute("DELETE FROM chunks_fts WHERE path = ?1", params![path])?;
491 db.execute("DELETE FROM chunks_vec WHERE path = ?1", params![path])?;
492 db.execute(
493 "INSERT OR REPLACE INTO chunks_meta (path, last_modified, room, memory_type) VALUES (?1, ?2, ?3, ?4)",
494 params![path, last_modified, room, memory_type],
495 )?;
496
497 drop(db);
498
499 let mut db = self.db.lock().unwrap();
500 let tx = db.transaction()?;
501 {
502 let mut stmt = tx.prepare("INSERT INTO chunks_fts (path, content) VALUES (?1, ?2)")?;
503 for chunk in chunks {
504 stmt.execute(params![path, chunk.as_str()])?;
505 }
506 }
507 tx.commit()?;
508
509 Ok(chunks.to_vec())
510 }
511
512 pub fn embed_and_store_chunks(&self, path: &str, chunks: &[String]) {
516 for (idx, chunk) in chunks.iter().enumerate() {
517 if let Some(vec) = embed_text_blocking(chunk, &self.base_url) {
518 let blob = floats_to_blob(&vec);
519 let db = self.db.lock().unwrap();
520 let _ = db.execute(
521 "INSERT OR REPLACE INTO chunks_vec (path, chunk_idx, embedding) VALUES (?1, ?2, ?3)",
522 params![path, idx as i64, blob],
523 );
524 }
525 }
526 }
527
528 pub fn search_bm25(
532 &self,
533 query: &str,
534 limit: usize,
535 ) -> Result<Vec<SearchResult>, Box<dyn std::error::Error>> {
536 const STOPWORDS: &[&str] = &[
540 "how", "does", "do", "did", "what", "where", "when", "why", "which", "who", "is",
541 "are", "was", "were", "be", "been", "being", "have", "has", "had", "a", "an", "the",
542 "and", "or", "but", "in", "on", "at", "to", "for", "of", "with", "by", "from", "get",
543 "gets", "got", "work", "works", "make", "makes", "use", "uses", "into", "that", "this",
544 "it", "its",
545 ];
546
547 let safe_query: String = query
548 .chars()
549 .map(|c| {
550 if c.is_alphanumeric() || c == ' ' || c == '_' {
551 c
552 } else {
553 ' '
554 }
555 })
556 .collect();
557
558 let fts_query = safe_query
560 .split_whitespace()
561 .filter(|w| w.len() >= 3 && !STOPWORDS.contains(&w.to_lowercase().as_str()))
562 .collect::<Vec<_>>()
563 .join(" OR ");
564
565 if fts_query.is_empty() {
566 return Ok(Vec::new());
567 }
568
569 let db = self.db.lock().unwrap();
570 let mut stmt = db.prepare(
571 "SELECT chunks_fts.path, chunks_fts.content, rank, cm.last_modified, cm.room, cm.memory_type
572 FROM chunks_fts
573 JOIN chunks_meta cm ON cm.path = chunks_fts.path
574 WHERE chunks_fts MATCH ?1
575 ORDER BY rank
576 LIMIT ?2",
577 )?;
578
579 let results: Vec<SearchResult> = stmt
580 .query_map(params![fts_query, limit as i64], |row| {
581 Ok(SearchResult {
582 path: row.get(0)?,
583 content: row.get(1)?,
584 score: -(row.get::<_, f64>(2).unwrap_or(0.0) as f32),
585 last_modified: row.get(3)?,
586 room: row.get(4)?,
587 memory_type: row.get::<_, String>(5).unwrap_or_default(),
588 })
589 })?
590 .filter_map(|r| r.ok())
591 .collect();
592
593 Ok(results)
594 }
595
596 pub fn search_semantic(&self, query: &str, limit: usize) -> Vec<SearchResult> {
599 let query_vec = match embed_query_blocking(query, &self.base_url) {
600 Some(v) => v,
601 None => return Vec::new(),
602 };
603
604 let rows: Vec<(String, i64, Vec<u8>, i64, String, String)> = {
606 let db = self.db.lock().unwrap();
607 let mut stmt = match db.prepare(
608 "SELECT cv.path, cv.chunk_idx, cv.embedding, cm.last_modified, cm.room, cm.memory_type
609 FROM chunks_vec cv
610 JOIN chunks_meta cm ON cm.path = cv.path",
611 ) {
612 Ok(s) => s,
613 Err(_) => return Vec::new(),
614 };
615 stmt.query_map([], |row| {
616 Ok((
617 row.get::<_, String>(0)?,
618 row.get::<_, i64>(1)?,
619 row.get::<_, Vec<u8>>(2)?,
620 row.get::<_, i64>(3)?,
621 row.get::<_, String>(4)?,
622 row.get::<_, String>(5).unwrap_or_default(),
623 ))
624 })
625 .ok()
626 .map(|rows| rows.filter_map(|r| r.ok()).collect())
627 .unwrap_or_default()
628 };
629
630 if rows.is_empty() {
631 return Vec::new();
632 }
633
634 let mut scored: Vec<(f32, String, i64, i64, String, String)> = rows
636 .into_iter()
637 .filter_map(|(path, idx, blob, last_modified, room, memory_type)| {
638 let vec = blob_to_floats(&blob);
639 let sim = cosine_similarity(&query_vec, &vec);
640 Some((sim, path, idx, last_modified, room, memory_type))
641 })
642 .collect();
643
644 scored.sort_by(|a, b| b.0.partial_cmp(&a.0).unwrap_or(std::cmp::Ordering::Equal));
645 scored.truncate(limit);
646
647 let db = self.db.lock().unwrap();
649 scored
650 .into_iter()
651 .filter_map(|(score, path, idx, last_modified, room, memory_type)| {
652 let content: Option<String> = db
653 .query_row(
654 "SELECT content FROM chunks_fts WHERE path = ?1 LIMIT 1 OFFSET ?2",
655 params![path, idx],
656 |r| r.get(0),
657 )
658 .ok();
659 content.map(|c| SearchResult {
660 path,
661 content: c,
662 score,
663 room,
664 last_modified,
665 memory_type,
666 })
667 })
668 .collect()
669 }
670
671 pub fn search_context(
678 &self,
679 query: &str,
680 limit: usize,
681 ) -> Result<Vec<SearchResult>, Box<dyn std::error::Error>> {
682 let candidate_limit = (limit.max(1) * 4).max(12);
683 let bm25 = self.search_bm25(query, candidate_limit).unwrap_or_default();
684 let semantic = self.search_semantic(query, candidate_limit);
685 let signals = QuerySignals::from_query(query);
686
687 let active_room = self.active_room();
689
690 let mut merged_by_path: HashMap<String, SearchResult> = HashMap::new();
693
694 for r in semantic {
695 let score = reranked_score(&signals, active_room.as_deref(), &r, true);
696 merge_scored_result(&mut merged_by_path, SearchResult { score, ..r });
697 }
698
699 for r in bm25 {
700 let score = reranked_score(&signals, active_room.as_deref(), &r, false);
701 merge_scored_result(&mut merged_by_path, SearchResult { score, ..r });
702 }
703
704 let mut merged: Vec<SearchResult> = merged_by_path.into_values().collect();
705 merged.sort_by(|a, b| {
706 b.score
707 .partial_cmp(&a.score)
708 .unwrap_or(std::cmp::Ordering::Equal)
709 });
710 merged.truncate(limit);
711 Ok(merged)
712 }
713
714 fn active_room(&self) -> Option<String> {
717 let db = self.db.lock().unwrap();
718 db.query_row(
719 "SELECT cm.room, SUM(fh.heat) as total
720 FROM file_heat fh
721 JOIN chunks_meta cm ON cm.path = fh.path
722 GROUP BY cm.room
723 ORDER BY total DESC
724 LIMIT 1",
725 [],
726 |row| row.get::<_, String>(0),
727 )
728 .ok()
729 }
730
731 pub fn index_project(&mut self) -> usize {
739 let root = crate::tools::file_ops::workspace_root();
740 let mut count = 0usize;
741
742 const INDEXABLE: &[&str] = &[
743 "rs", "toml", "md", "json", "ts", "tsx", "js", "py", "go", "c", "cpp", "h", "yaml",
744 "yml", "txt",
745 ];
746 const SKIP_DIRS: &[&str] = &[
747 "target",
748 ".git",
749 "node_modules",
750 ".hematite",
751 ".hematite_logs",
752 ];
753
754 for entry in walkdir::WalkDir::new(&root)
755 .follow_links(false)
756 .into_iter()
757 .filter_entry(|e| {
758 if e.file_type().is_dir() {
759 let name = e.file_name().to_string_lossy();
760 return !SKIP_DIRS.contains(&name.as_ref());
761 }
762 true
763 })
764 .filter_map(|e| e.ok())
765 .filter(|e| e.file_type().is_file())
766 {
767 let path = entry.path();
768 let ext = path.extension().and_then(|e| e.to_str()).unwrap_or("");
769 if !INDEXABLE.contains(&ext) {
770 continue;
771 }
772
773 let Ok(meta) = std::fs::metadata(path) else {
774 continue;
775 };
776 if meta.len() > 512_000 {
777 continue;
778 }
779
780 let mtime = meta
781 .modified()
782 .map(|t| {
783 t.duration_since(std::time::UNIX_EPOCH)
784 .unwrap_or_default()
785 .as_secs() as i64
786 })
787 .unwrap_or(0);
788
789 let rel = path.strip_prefix(&root).unwrap_or(path);
790 let rel_str = rel.to_string_lossy().replace('\\', "/");
791
792 if let Ok(content) = std::fs::read_to_string(path) {
793 match self.index_document(&rel_str, mtime, &content) {
794 Ok(new_chunks) if !new_chunks.is_empty() => {
795 count += 1;
796 }
797 Ok(_) => {}
798 Err(_) => {}
799 }
800 }
801 }
802
803 count += self.index_workspace_artifacts(&root);
804
805 count
806 }
807
808 pub fn index_workspace_artifacts(&mut self, workspace_root: &std::path::Path) -> usize {
813 let mut count = self.index_docs_folder(workspace_root);
814 count += self.index_recent_session_reports(workspace_root);
815 count += self.index_imported_session_exports(workspace_root);
816 self.backfill_missing_embeddings();
817 count
818 }
819
820 fn index_docs_folder(&mut self, workspace_root: &std::path::Path) -> usize {
825 let docs_dir = workspace_root.join(".hematite").join("docs");
826 const DOCS_INDEXABLE: &[&str] = &["pdf", "md", "txt", "markdown"];
827 let mut count = 0usize;
828 let mut desired_paths = HashSet::new();
829
830 if docs_dir.exists() {
831 for entry in walkdir::WalkDir::new(&docs_dir)
832 .max_depth(3)
833 .follow_links(false)
834 .into_iter()
835 .filter_map(|e| e.ok())
836 .filter(|e| e.file_type().is_file())
837 {
838 let path = entry.path();
839 let ext = path
840 .extension()
841 .and_then(|e| e.to_str())
842 .unwrap_or("")
843 .to_lowercase();
844 if !DOCS_INDEXABLE.contains(&ext.as_str()) {
845 continue;
846 }
847
848 let Ok(meta) = std::fs::metadata(path) else {
849 continue;
850 };
851 if meta.len() > 50_000_000 {
852 continue;
853 }
854
855 let mtime = meta
856 .modified()
857 .map(|t| {
858 t.duration_since(std::time::UNIX_EPOCH)
859 .unwrap_or_default()
860 .as_secs() as i64
861 })
862 .unwrap_or(0);
863
864 let rel = path.strip_prefix(workspace_root).unwrap_or(path);
865 let rel_str = rel.to_string_lossy().replace('\\', "/");
866 desired_paths.insert(rel_str.clone());
867
868 let content = if ext == "pdf" {
869 extract_pdf_text(path).ok().flatten()
870 } else {
871 std::fs::read_to_string(path).ok()
872 };
873
874 if let Some(text) = content {
875 if text.trim().is_empty() {
876 continue;
877 }
878 match self.index_document(&rel_str, mtime, &text) {
879 Ok(new_chunks) if !new_chunks.is_empty() => {
880 count += 1;
881 }
882 Ok(_) => {}
883 Err(_) => {}
884 }
885 }
886 }
887 }
888
889 self.prune_indexed_prefix(".hematite/docs/", &desired_paths);
890 count
891 }
892
893 pub fn index_recent_session_reports(&mut self, workspace_root: &std::path::Path) -> usize {
896 let reports_dir = workspace_root.join(".hematite").join("reports");
897 let mut count = 0usize;
898 let mut desired_paths = HashSet::new();
899
900 if reports_dir.exists() {
901 let mut reports: Vec<std::path::PathBuf> = std::fs::read_dir(&reports_dir)
902 .ok()
903 .into_iter()
904 .flat_map(|entries| entries.filter_map(|entry| entry.ok()))
905 .map(|entry| entry.path())
906 .filter(|path| {
907 path.is_file()
908 && path.extension().and_then(|ext| ext.to_str()) == Some("json")
909 && path
910 .file_stem()
911 .and_then(|stem| stem.to_str())
912 .map(|stem| stem.starts_with("session_"))
913 .unwrap_or(false)
914 })
915 .collect();
916
917 reports.sort_by(|a, b| {
918 let a_name = a
919 .file_name()
920 .and_then(|name| name.to_str())
921 .unwrap_or_default();
922 let b_name = b
923 .file_name()
924 .and_then(|name| name.to_str())
925 .unwrap_or_default();
926 b_name.cmp(a_name)
927 });
928 reports.truncate(Self::SESSION_REPORT_LIMIT);
929
930 for report_path in reports {
931 let Ok(meta) = std::fs::metadata(&report_path) else {
932 continue;
933 };
934 let mtime = meta
935 .modified()
936 .map(|t| {
937 t.duration_since(std::time::UNIX_EPOCH)
938 .unwrap_or_default()
939 .as_secs() as i64
940 })
941 .unwrap_or(0);
942
943 for exchange in load_session_exchanges(&report_path, mtime) {
944 desired_paths.insert(exchange.path.clone());
945 let mtype = detect_memory_type(&exchange.content);
946 match self.index_chunks_with_room_and_type(
947 &exchange.path,
948 exchange.last_modified,
949 "session",
950 mtype,
951 std::slice::from_ref(&exchange.content),
952 ) {
953 Ok(new_chunks) if !new_chunks.is_empty() => {
954 count += 1;
955 }
956 Ok(_) => {}
957 Err(_) => {}
958 }
959 }
960 }
961 }
962
963 self.prune_indexed_prefix("session/", &desired_paths);
964 count
965 }
966
967 pub fn index_imported_session_exports(&mut self, workspace_root: &std::path::Path) -> usize {
972 let imports_dir = workspace_root.join(".hematite").join("imports");
973 let mut count = 0usize;
974 let mut desired_paths = HashSet::new();
975
976 if imports_dir.exists() {
977 let mut imports: Vec<(std::path::PathBuf, i64)> = walkdir::WalkDir::new(&imports_dir)
978 .max_depth(4)
979 .follow_links(false)
980 .into_iter()
981 .filter_map(|entry| entry.ok())
982 .filter(|entry| entry.file_type().is_file())
983 .filter_map(|entry| {
984 let path = entry.into_path();
985 let ext = path
986 .extension()
987 .and_then(|ext| ext.to_str())
988 .unwrap_or("")
989 .to_ascii_lowercase();
990 if !matches!(ext.as_str(), "json" | "jsonl" | "md" | "txt") {
991 return None;
992 }
993 let meta = std::fs::metadata(&path).ok()?;
994 if meta.len() > Self::IMPORT_MAX_BYTES {
995 return None;
996 }
997 let mtime = meta
998 .modified()
999 .map(|t| {
1000 t.duration_since(std::time::UNIX_EPOCH)
1001 .unwrap_or_default()
1002 .as_secs() as i64
1003 })
1004 .unwrap_or(0);
1005 Some((path, mtime))
1006 })
1007 .collect();
1008
1009 imports.sort_by(|(a_path, a_mtime), (b_path, b_mtime)| {
1010 b_mtime
1011 .cmp(a_mtime)
1012 .then_with(|| a_path.to_string_lossy().cmp(&b_path.to_string_lossy()))
1013 });
1014 imports.truncate(Self::IMPORT_FILE_LIMIT);
1015
1016 for (import_path, mtime) in imports {
1017 for exchange in load_imported_session_exchanges(&import_path, &imports_dir, mtime) {
1018 desired_paths.insert(exchange.path.clone());
1019 let mtype = detect_memory_type(&exchange.content);
1020 match self.index_chunks_with_room_and_type(
1021 &exchange.path,
1022 exchange.last_modified,
1023 "session",
1024 mtype,
1025 std::slice::from_ref(&exchange.content),
1026 ) {
1027 Ok(new_chunks) if !new_chunks.is_empty() => {
1028 count += 1;
1029 }
1030 Ok(_) => {}
1031 Err(_) => {}
1032 }
1033 }
1034 }
1035 }
1036
1037 self.prune_indexed_prefix("session/imports/", &desired_paths);
1038 count
1039 }
1040
1041 fn backfill_missing_embeddings(&self) {
1046 let (fts_count, vec_count) = {
1048 let db = self.db.lock().unwrap();
1049 let fts: i64 = db
1050 .query_row("SELECT COUNT(*) FROM chunks_fts", [], |r| r.get(0))
1051 .unwrap_or(0);
1052 let vec: i64 = db
1053 .query_row("SELECT COUNT(*) FROM chunks_vec", [], |r| r.get(0))
1054 .unwrap_or(0);
1055 (fts, vec)
1056 };
1057 if fts_count == 0 || fts_count == vec_count {
1058 return;
1059 }
1060
1061 let missing: Vec<(String, i64, String)> = {
1064 let db = self.db.lock().unwrap();
1065 let mut stmt = db
1066 .prepare(
1067 "SELECT f.path, (f.rowid - 1) AS chunk_idx, f.content
1068 FROM chunks_fts f
1069 LEFT JOIN chunks_vec v ON f.path = v.path AND (f.rowid - 1) = v.chunk_idx
1070 WHERE v.path IS NULL
1071 ORDER BY CASE
1072 WHEN f.path LIKE '%.rs' THEN 0
1073 WHEN f.path LIKE '%.toml' THEN 1
1074 WHEN f.path LIKE '%.json' THEN 2
1075 ELSE 3
1076 END, f.path
1077 LIMIT 20",
1078 )
1079 .unwrap();
1080 stmt.query_map([], |r| {
1081 Ok((
1082 r.get::<_, String>(0)?,
1083 r.get::<_, i64>(1)?,
1084 r.get::<_, String>(2)?,
1085 ))
1086 })
1087 .unwrap()
1088 .filter_map(|r| r.ok())
1089 .collect()
1090 };
1091
1092 for (path, idx, content) in missing {
1093 if let Some(vec) = embed_text_blocking(&content, &self.base_url) {
1094 let blob = floats_to_blob(&vec);
1095 let db = self.db.lock().unwrap();
1096 let _ = db.execute(
1097 "INSERT OR REPLACE INTO chunks_vec (path, chunk_idx, embedding) VALUES (?1, ?2, ?3)",
1098 params![path, idx, blob],
1099 );
1100 } else {
1101 break;
1103 }
1104 }
1105 }
1106
1107 pub fn file_count(&self) -> usize {
1110 let db = self.db.lock().unwrap();
1111 db.query_row(
1112 "SELECT COUNT(*) FROM chunks_meta WHERE path NOT LIKE 'session/%'",
1113 [],
1114 |r| r.get::<_, i64>(0),
1115 )
1116 .unwrap_or(0) as usize
1117 }
1118
1119 pub fn embedded_chunk_count(&self) -> usize {
1122 let db = self.db.lock().unwrap();
1123 db.query_row(
1124 "SELECT COUNT(*) FROM chunks_vec WHERE path NOT LIKE 'session/%'",
1125 [],
1126 |r| r.get::<_, i64>(0),
1127 )
1128 .unwrap_or(0) as usize
1129 }
1130
1131 pub fn has_any_embeddings(&self) -> bool {
1133 let db = self.db.lock().unwrap();
1134 db.query_row("SELECT EXISTS(SELECT 1 FROM chunks_vec LIMIT 1)", [], |r| {
1135 r.get::<_, i64>(0)
1136 })
1137 .unwrap_or(0)
1138 != 0
1139 }
1140
1141 pub fn reset(&self) {
1144 let db = self.db.lock().unwrap();
1145 let _ = db.execute_batch(
1146 "DELETE FROM chunks_fts;
1147 DELETE FROM chunks_vec;
1148 DELETE FROM chunks_meta;",
1149 );
1150 }
1151
1152 pub fn inspect_snapshot(&self, hot_limit: usize) -> VeinInspectionSnapshot {
1155 let db = self.db.lock().unwrap();
1156 let indexed_source_files = db
1157 .query_row(
1158 "SELECT COUNT(*) FROM chunks_meta
1159 WHERE path NOT LIKE 'session/%'
1160 AND path NOT LIKE '.hematite/docs/%'",
1161 [],
1162 |r| r.get::<_, i64>(0),
1163 )
1164 .unwrap_or(0) as usize;
1165 let indexed_docs = db
1166 .query_row(
1167 "SELECT COUNT(*) FROM chunks_meta WHERE path LIKE '.hematite/docs/%'",
1168 [],
1169 |r| r.get::<_, i64>(0),
1170 )
1171 .unwrap_or(0) as usize;
1172 let indexed_session_exchanges = db
1173 .query_row(
1174 "SELECT COUNT(*) FROM chunks_meta WHERE path LIKE 'session/%'",
1175 [],
1176 |r| r.get::<_, i64>(0),
1177 )
1178 .unwrap_or(0) as usize;
1179 let embedded_source_doc_chunks = db
1180 .query_row(
1181 "SELECT COUNT(*) FROM chunks_vec WHERE path NOT LIKE 'session/%'",
1182 [],
1183 |r| r.get::<_, i64>(0),
1184 )
1185 .unwrap_or(0) as usize;
1186 let has_any_embeddings = db
1187 .query_row("SELECT EXISTS(SELECT 1 FROM chunks_vec LIMIT 1)", [], |r| {
1188 r.get::<_, i64>(0)
1189 })
1190 .unwrap_or(0)
1191 != 0;
1192 drop(db);
1193
1194 let hot_files = self
1195 .hot_files(hot_limit.max(1))
1196 .into_iter()
1197 .map(|(path, heat, last_modified, room)| VeinHotFile {
1198 path,
1199 heat,
1200 last_modified,
1201 room,
1202 })
1203 .collect::<Vec<_>>();
1204
1205 VeinInspectionSnapshot {
1206 indexed_source_files,
1207 indexed_docs,
1208 indexed_session_exchanges,
1209 embedded_source_doc_chunks,
1210 has_any_embeddings,
1211 active_room: self.active_room(),
1212 l1_ready: !hot_files.is_empty(),
1213 hot_files,
1214 }
1215 }
1216
1217 pub fn bump_heat(&self, path: &str) {
1223 if path.is_empty() {
1224 return;
1225 }
1226 let now = std::time::SystemTime::now()
1227 .duration_since(std::time::UNIX_EPOCH)
1228 .unwrap_or_default()
1229 .as_secs() as i64;
1230 let db = self.db.lock().unwrap();
1231 let _ = db.execute(
1232 "INSERT INTO file_heat (path, heat, last_edit) VALUES (?1, 1, ?2)
1233 ON CONFLICT(path) DO UPDATE SET heat = heat + 1, last_edit = ?2",
1234 params![path, now],
1235 );
1236 }
1237
1238 fn hot_files(&self, n: usize) -> Vec<(String, i64, i64, String)> {
1242 let db = self.db.lock().unwrap();
1243 let mut stmt = match db.prepare(
1244 "SELECT fh.path, fh.heat, cm.last_modified, cm.room
1245 FROM file_heat fh
1246 JOIN chunks_meta cm ON cm.path = fh.path
1247 ORDER BY fh.heat DESC, cm.last_modified DESC
1248 LIMIT ?1",
1249 ) {
1250 Ok(s) => s,
1251 Err(_) => return vec![],
1252 };
1253 stmt.query_map(params![n as i64], |row| {
1254 Ok((
1255 row.get::<_, String>(0)?,
1256 row.get::<_, i64>(1)?,
1257 row.get::<_, i64>(2)?,
1258 row.get::<_, String>(3)?,
1259 ))
1260 })
1261 .map(|rows| rows.filter_map(|r| r.ok()).collect())
1262 .unwrap_or_default()
1263 }
1264
1265 pub fn hot_file_paths(&self, n: usize) -> Vec<String> {
1268 self.hot_files(n)
1269 .into_iter()
1270 .map(|(path, _, _, _)| path)
1271 .collect()
1272 }
1273
1274 pub fn hot_files_weighted(&self, n: usize) -> Vec<(String, f64)> {
1278 let files = self.hot_files(n);
1279 if files.is_empty() {
1280 return vec![];
1281 }
1282 let max_heat = files
1283 .iter()
1284 .map(|(_, h, _, _)| *h)
1285 .max()
1286 .unwrap_or(1)
1287 .max(1) as f64;
1288 files
1289 .into_iter()
1290 .map(|(path, heat, _, _)| {
1291 let weight = (heat as f64) / max_heat;
1292 (path, weight)
1293 })
1294 .collect()
1295 }
1296
1297 pub fn l1_context(&self) -> Option<String> {
1302 let files = self.hot_files(8);
1303 if files.is_empty() {
1304 return None;
1305 }
1306 let now = std::time::SystemTime::now()
1307 .duration_since(std::time::UNIX_EPOCH)
1308 .unwrap_or_default()
1309 .as_secs() as i64;
1310
1311 let mut by_room: std::collections::BTreeMap<String, Vec<(String, i64, i64)>> =
1313 std::collections::BTreeMap::new();
1314 for (path, heat, mtime, room) in &files {
1315 by_room
1316 .entry(room.clone())
1317 .or_default()
1318 .push((path.clone(), *heat, *mtime));
1319 }
1320
1321 let mut out = String::from("# Hot Files (most edited — grouped by subsystem)\n");
1322 for (room, entries) in &by_room {
1323 out.push_str(&format!("[{}]\n", room));
1324 for (path, heat, mtime) in entries {
1325 let age_secs = now - mtime;
1326 let age = if age_secs < 3600 {
1327 "just now".to_string()
1328 } else if age_secs < 86400 {
1329 format!("{}h ago", age_secs / 3600)
1330 } else {
1331 format!("{}d ago", age_secs / 86400)
1332 };
1333 out.push_str(&format!(
1334 " - {} [{} edit{}, {}]\n",
1335 path,
1336 heat,
1337 if *heat == 1 { "" } else { "s" },
1338 age
1339 ));
1340 }
1341 }
1342 Some(out)
1343 }
1344
1345 fn prune_indexed_prefix(&self, prefix: &str, desired_paths: &HashSet<String>) {
1346 let pattern = format!("{}%", prefix);
1347 let existing_paths: Vec<String> = {
1348 let db = self.db.lock().unwrap();
1349 let mut stmt = match db.prepare("SELECT path FROM chunks_meta WHERE path LIKE ?1") {
1350 Ok(stmt) => stmt,
1351 Err(_) => return,
1352 };
1353 stmt.query_map(params![pattern], |row| row.get::<_, String>(0))
1354 .map(|rows| rows.filter_map(|row| row.ok()).collect())
1355 .unwrap_or_default()
1356 };
1357
1358 if existing_paths.is_empty() {
1359 return;
1360 }
1361
1362 let db = self.db.lock().unwrap();
1363 for path in existing_paths {
1364 if desired_paths.contains(&path) {
1365 continue;
1366 }
1367 let _ = db.execute("DELETE FROM chunks_fts WHERE path = ?1", params![path]);
1368 let _ = db.execute("DELETE FROM chunks_vec WHERE path = ?1", params![path]);
1369 let _ = db.execute("DELETE FROM chunks_meta WHERE path = ?1", params![path]);
1370 }
1371 }
1372}
1373
1374impl QuerySignals {
1375 fn from_query(query: &str) -> Self {
1376 let lower = query.to_ascii_lowercase();
1377 let historical_memory_hint = [
1378 "remember",
1379 "earlier",
1380 "previous",
1381 "last time",
1382 "what did we decide",
1383 "why did we decide",
1384 "what did we say",
1385 "why did we change",
1386 ]
1387 .iter()
1388 .any(|needle| lower.contains(needle));
1389
1390 let query_memory_type = if lower.contains("decide")
1392 || lower.contains("decision")
1393 || lower.contains("we agreed")
1394 || lower.contains("we chose")
1395 {
1396 Some("decision")
1397 } else if lower.contains("bug")
1398 || lower.contains("error")
1399 || lower.contains("issue")
1400 || lower.contains("problem")
1401 || lower.contains("fix")
1402 || lower.contains("broken")
1403 {
1404 Some("problem")
1405 } else if lower.contains("shipped")
1406 || lower.contains("milestone")
1407 || lower.contains("finished")
1408 || lower.contains("working now")
1409 {
1410 Some("milestone")
1411 } else if lower.contains("prefer")
1412 || lower.contains("my preference")
1413 || lower.contains("i like")
1414 || lower.contains("i want")
1415 {
1416 Some("preference")
1417 } else {
1418 None
1419 };
1420
1421 Self {
1422 exact_phrases: extract_exact_phrases(query),
1423 standout_terms: extract_standout_terms(query),
1424 historical_memory_hint,
1425 temporal_reference: extract_temporal_reference(query),
1426 query_memory_type,
1427 }
1428 }
1429}
1430
1431fn merge_scored_result(
1432 merged_by_path: &mut HashMap<String, SearchResult>,
1433 candidate: SearchResult,
1434) {
1435 match merged_by_path.get_mut(&candidate.path) {
1436 Some(existing) if candidate.score > existing.score => *existing = candidate,
1437 Some(_) => {}
1438 None => {
1439 merged_by_path.insert(candidate.path.clone(), candidate);
1440 }
1441 }
1442}
1443
1444fn reranked_score(
1445 signals: &QuerySignals,
1446 active_room: Option<&str>,
1447 result: &SearchResult,
1448 is_semantic: bool,
1449) -> f32 {
1450 let base = if is_semantic {
1451 1.0 + result.score.clamp(0.0, 1.0)
1452 } else {
1453 (result.score / 10.0).clamp(0.0, 1.0)
1454 };
1455 base + room_bias(active_room, result)
1456 + retrieval_signal_boost(signals, result)
1457 + temporal_memory_boost(signals, result)
1458}
1459
1460fn room_bias(active_room: Option<&str>, result: &SearchResult) -> f32 {
1461 if active_room == Some(result.room.as_str()) {
1462 0.15
1463 } else {
1464 0.0
1465 }
1466}
1467
1468fn retrieval_signal_boost(signals: &QuerySignals, result: &SearchResult) -> f32 {
1469 let mut boost = 0.0f32;
1470 let haystack = format!(
1471 "{}\n{}",
1472 result.path.to_ascii_lowercase(),
1473 result.content.to_ascii_lowercase()
1474 );
1475
1476 let phrase_matches = signals
1477 .exact_phrases
1478 .iter()
1479 .filter(|phrase| haystack.contains(phrase.as_str()))
1480 .count();
1481 if phrase_matches > 0 {
1482 boost += 0.35 + ((phrase_matches.saturating_sub(1)) as f32 * 0.1);
1483 }
1484
1485 let mut standout_matches = 0;
1486 for term in &signals.standout_terms {
1487 if result.path.to_ascii_lowercase().contains(term.as_str()) {
1488 boost += 0.40;
1489 standout_matches += 1;
1490 } else if result.content.to_ascii_lowercase().contains(term.as_str()) {
1491 boost += 0.12;
1492 standout_matches += 1;
1493 }
1494 if standout_matches >= 3 {
1495 break;
1496 }
1497 }
1498
1499 if signals.historical_memory_hint && result.room == "session" {
1500 boost += 0.45;
1501 }
1502
1503 if let Some(qtype) = signals.query_memory_type {
1505 if !result.memory_type.is_empty() && result.memory_type == qtype {
1506 boost += 0.35;
1507 }
1508 }
1509
1510 boost
1511}
1512
1513fn temporal_memory_boost(signals: &QuerySignals, result: &SearchResult) -> f32 {
1514 if result.room != "session" {
1515 return 0.0;
1516 }
1517 let Some(reference) = signals.temporal_reference else {
1518 return 0.0;
1519 };
1520 let Some(memory_ts) = session_memory_timestamp(result) else {
1521 return 0.0;
1522 };
1523
1524 let span = reference.window_secs.max(86_400);
1525 let full_fade = span.saturating_mul(8);
1526 if full_fade <= 0 {
1527 return 0.0;
1528 }
1529
1530 let distance = (memory_ts - reference.target_ts).abs();
1531 let closeness = 1.0 - (distance as f32 / full_fade as f32).min(1.0);
1532 if closeness <= 0.0 {
1533 0.0
1534 } else {
1535 0.22 * closeness
1536 }
1537}
1538
/// Collects the quoted phrases of `query`, lowercased and de-duplicated.
///
/// A phrase is the text between a pair of matching `"`, `'` or `` ` ``
/// characters; an unterminated quote runs to the end of the query. Phrases
/// are trimmed, ASCII-lowercased and kept only when at least three bytes
/// long.
///
/// Apostrophes that belong to a word are not delimiters: a `'` directly
/// after an alphanumeric character never opens a phrase, and a `'` with
/// alphanumeric characters on both sides never closes one. This keeps
/// contractions such as "what's" or "don't" from yielding bogus phrases.
fn extract_exact_phrases(query: &str) -> Vec<String> {
    let chars: Vec<char> = query.chars().collect();
    let is_word_char = |idx: usize| chars.get(idx).is_some_and(|ch| ch.is_alphanumeric());

    let mut phrases: Vec<String> = Vec::new();
    let mut i = 0usize;

    while i < chars.len() {
        let quote = chars[i];
        let is_delimiter = matches!(quote, '"' | '\'' | '`');
        let apostrophe_after_word = quote == '\'' && i > 0 && is_word_char(i - 1);
        if !is_delimiter || apostrophe_after_word {
            i += 1;
            continue;
        }

        let start = i + 1;
        let mut end = start;
        while end < chars.len() {
            if chars[end] == quote {
                // An apostrophe inside a word ("don't") is part of the phrase.
                let inside_word =
                    quote == '\'' && is_word_char(end - 1) && is_word_char(end + 1);
                if !inside_word {
                    break;
                }
            }
            end += 1;
        }

        if end > start {
            let phrase = chars[start..end]
                .iter()
                .collect::<String>()
                .trim()
                .to_ascii_lowercase();
            if phrase.len() >= 3 && !phrases.contains(&phrase) {
                phrases.push(phrase);
            }
        }
        // Resume after the closing quote (or past the end when unterminated).
        i = end.saturating_add(1);
    }

    phrases
}
1570
/// Picks the distinctive tokens of `query`, lowercased and de-duplicated.
///
/// Tokens are runs of ASCII alphanumerics and `_ - . / :`. A token is kept
/// when it is at least four bytes long, is not a stopword, and looks
/// distinctive: it contains a digit, an uppercase letter or one of the
/// symbol characters, or it is at least nine bytes long.
fn extract_standout_terms(query: &str) -> Vec<String> {
    const STOPWORDS: &[&str] = &[
        "about", "after", "before", "change", "changed", "decide", "decided", "does", "earlier",
        "flow", "from", "have", "into", "just", "last", "local", "make", "more", "remember",
        "should", "that", "their", "there", "these", "they", "this", "those", "what", "when",
        "where", "which", "why", "with", "work",
    ];

    fn is_symbol(ch: char) -> bool {
        matches!(ch, '_' | '-' | '.' | '/' | ':')
    }

    let candidates = query
        .split(|ch: char| !ch.is_ascii_alphanumeric() && !is_symbol(ch))
        .map(str::trim)
        .filter(|token| token.len() >= 4);

    let mut terms: Vec<String> = Vec::new();
    for token in candidates {
        let lowered = token.to_ascii_lowercase();
        if STOPWORDS.contains(&lowered.as_str()) {
            continue;
        }

        let distinctive = token.len() >= 9
            || token
                .chars()
                .any(|ch| ch.is_ascii_digit() || ch.is_ascii_uppercase() || is_symbol(ch));

        if distinctive && !terms.contains(&lowered) {
            terms.push(lowered);
        }
    }

    terms
}
1606
1607fn extract_temporal_reference(query: &str) -> Option<TemporalReference> {
1608 if let Some(ts) = extract_iso_date_from_query(query) {
1609 return Some(TemporalReference {
1610 target_ts: ts,
1611 window_secs: 86_400,
1612 });
1613 }
1614
1615 let now = current_unix_timestamp();
1616 let lower = query.to_ascii_lowercase();
1617 if lower.contains("yesterday") {
1618 Some(TemporalReference {
1619 target_ts: now.saturating_sub(86_400),
1620 window_secs: 86_400,
1621 })
1622 } else if lower.contains("today") || lower.contains("earlier today") {
1623 Some(TemporalReference {
1624 target_ts: now,
1625 window_secs: 86_400,
1626 })
1627 } else if lower.contains("last week") {
1628 Some(TemporalReference {
1629 target_ts: now.saturating_sub(7 * 86_400),
1630 window_secs: 7 * 86_400,
1631 })
1632 } else if lower.contains("last month") {
1633 Some(TemporalReference {
1634 target_ts: now.saturating_sub(30 * 86_400),
1635 window_secs: 30 * 86_400,
1636 })
1637 } else {
1638 None
1639 }
1640}
1641
1642fn extract_iso_date_from_query(query: &str) -> Option<i64> {
1643 query
1644 .split(|ch: char| !(ch.is_ascii_digit() || ch == '-'))
1645 .find_map(parse_iso_date_token)
1646}
1647
1648fn parse_iso_date_token(token: &str) -> Option<i64> {
1649 if token.len() != 10 {
1650 return None;
1651 }
1652 let bytes = token.as_bytes();
1653 if bytes.get(4) != Some(&b'-') || bytes.get(7) != Some(&b'-') {
1654 return None;
1655 }
1656
1657 let year = token.get(0..4)?.parse::<i32>().ok()?;
1658 let month = token.get(5..7)?.parse::<u32>().ok()?;
1659 let day = token.get(8..10)?.parse::<u32>().ok()?;
1660 if !(1..=12).contains(&month) || !(1..=31).contains(&day) {
1661 return None;
1662 }
1663
1664 Some(days_from_civil(year, month, day).saturating_mul(86_400))
1665}
1666
/// Converts a proleptic Gregorian calendar date into the number of days
/// since 1970-01-01 (negative for earlier dates).
///
/// The year is treated as starting in March so the leap day falls at the end
/// of the year, and is then split into 400-year eras of 146 097 days each.
fn days_from_civil(year: i32, month: u32, day: u32) -> i64 {
    // January and February count as months 10 and 11 of the previous year.
    let shifted_year = if month <= 2 { year - 1 } else { year };
    let era = shifted_year.div_euclid(400);
    let year_of_era = shifted_year.rem_euclid(400);

    let shifted_month = if month > 2 {
        month as i32 - 3
    } else {
        month as i32 + 9
    };
    let day_of_year = (153 * shifted_month + 2) / 5 + day as i32 - 1;
    let day_of_era = year_of_era * 365 + year_of_era / 4 - year_of_era / 100 + day_of_year;

    i64::from(era) * 146_097 + i64::from(day_of_era) - 719_468
}
1676
/// Returns the current time as whole seconds since the Unix epoch, or `0`
/// when the system clock reports a time before the epoch.
fn current_unix_timestamp() -> i64 {
    use std::time::{SystemTime, UNIX_EPOCH};

    match SystemTime::now().duration_since(UNIX_EPOCH) {
        Ok(elapsed) => elapsed.as_secs() as i64,
        Err(_) => 0,
    }
}
1683
1684fn session_memory_timestamp(result: &SearchResult) -> Option<i64> {
1685 extract_session_path_timestamp(&result.path).or_else(|| {
1686 if result.last_modified > 0 {
1687 Some(result.last_modified)
1688 } else {
1689 None
1690 }
1691 })
1692}
1693
1694fn extract_session_path_timestamp(path: &str) -> Option<i64> {
1695 let normalized = path.replace('\\', "/");
1696 let mut parts = normalized.split('/');
1697 if parts.next()? != "session" {
1698 return None;
1699 }
1700 parse_iso_date_token(parts.next()?)
1701}
1702
1703fn session_speaker_kind(speaker: &str) -> SessionSpeakerKind {
1704 let normalized = speaker.trim().to_ascii_lowercase();
1705 match normalized.as_str() {
1706 "you" | "user" => SessionSpeakerKind::User,
1707 "" | "system" | "tool" => SessionSpeakerKind::Ignore,
1708 _ => SessionSpeakerKind::Assistant,
1709 }
1710}
1711
1712fn load_session_exchanges(report_path: &Path, last_modified: i64) -> Vec<SessionExchange> {
1713 let Ok(raw) = std::fs::read_to_string(report_path) else {
1714 return Vec::new();
1715 };
1716 let Ok(report) = serde_json::from_str::<SessionReport>(&raw) else {
1717 return Vec::new();
1718 };
1719
1720 let session_key = report_path
1721 .file_stem()
1722 .and_then(|stem| stem.to_str())
1723 .and_then(|stem| stem.strip_prefix("session_").or(Some(stem)))
1724 .unwrap_or("unknown-session")
1725 .to_string();
1726 let session_date = report
1727 .session_start
1728 .split('_')
1729 .next()
1730 .filter(|date| !date.is_empty())
1731 .unwrap_or_else(|| session_key.split('_').next().unwrap_or("unknown-date"))
1732 .to_string();
1733
1734 let mut exchanges = Vec::new();
1735 let mut pending_user: Option<String> = None;
1736 let mut turn_index = 0usize;
1737
1738 for entry in report.transcript {
1739 match session_speaker_kind(&entry.speaker) {
1740 SessionSpeakerKind::User => {
1741 let text = entry.text.trim();
1742 if !text.is_empty() {
1743 pending_user = Some(text.to_string());
1744 }
1745 }
1746 SessionSpeakerKind::Assistant => {
1747 let text = entry.text.trim();
1748 if text.is_empty() {
1749 continue;
1750 }
1751 let Some(user_text) = pending_user.take() else {
1752 continue;
1753 };
1754 turn_index += 1;
1755 exchanges.push(SessionExchange {
1756 path: format!(
1757 "session/{}/{}/turn-{}",
1758 session_date, session_key, turn_index
1759 ),
1760 last_modified,
1761 content: format!(
1762 "Earlier session exchange\nUser:\n{}\n\nAssistant:\n{}",
1763 user_text, text
1764 ),
1765 });
1766 }
1767 SessionSpeakerKind::Ignore => {}
1768 }
1769 }
1770
1771 if exchanges.len() > Vein::SESSION_TURN_LIMIT {
1772 let keep_from = exchanges.len() - Vein::SESSION_TURN_LIMIT;
1773 exchanges = exchanges.into_iter().skip(keep_from).collect();
1774 }
1775
1776 exchanges
1777}
1778
1779fn load_imported_session_exchanges(
1780 import_path: &Path,
1781 imports_root: &Path,
1782 last_modified: i64,
1783) -> Vec<SessionExchange> {
1784 let Ok(raw) = std::fs::read_to_string(import_path) else {
1785 return Vec::new();
1786 };
1787
1788 let messages = normalize_import_messages(&raw, import_path);
1789 if messages.is_empty() {
1790 return Vec::new();
1791 }
1792
1793 let rel = import_path
1794 .strip_prefix(imports_root)
1795 .unwrap_or(import_path);
1796 let rel_slug = slugify_import_path(rel);
1797 let mut exchanges = Vec::new();
1798 let mut pending_user: Option<String> = None;
1799 let mut turn_index = 0usize;
1800
1801 for (role, text) in messages {
1802 let cleaned = text.trim();
1803 if cleaned.is_empty() {
1804 continue;
1805 }
1806 match role.as_str() {
1807 "user" => pending_user = Some(cleaned.to_string()),
1808 "assistant" => {
1809 let Some(user_text) = pending_user.take() else {
1810 continue;
1811 };
1812 turn_index += 1;
1813 exchanges.push(SessionExchange {
1814 path: format!("session/imports/{}/turn-{}", rel_slug, turn_index),
1815 last_modified,
1816 content: format!(
1817 "Imported session exchange\nSource: .hematite/imports/{}\n\nUser:\n{}\n\nAssistant:\n{}",
1818 rel.to_string_lossy().replace('\\', "/"),
1819 user_text,
1820 cleaned
1821 ),
1822 });
1823 }
1824 _ => {}
1825 }
1826 }
1827
1828 if exchanges.len() > Vein::SESSION_TURN_LIMIT {
1829 let keep_from = exchanges.len() - Vein::SESSION_TURN_LIMIT;
1830 exchanges = exchanges.into_iter().skip(keep_from).collect();
1831 }
1832
1833 exchanges
1834}
1835
1836fn normalize_import_messages(raw: &str, import_path: &Path) -> Vec<(String, String)> {
1837 if raw.trim().is_empty() {
1838 return Vec::new();
1839 }
1840
1841 if let Some(messages) = parse_marker_transcript(raw) {
1842 return messages;
1843 }
1844
1845 let ext = import_path
1846 .extension()
1847 .and_then(|ext| ext.to_str())
1848 .unwrap_or("")
1849 .to_ascii_lowercase();
1850
1851 if matches!(ext.as_str(), "json" | "jsonl")
1852 || matches!(raw.trim().chars().next(), Some('{') | Some('['))
1853 {
1854 if let Some(messages) = parse_jsonl_messages(raw) {
1855 if !messages.is_empty() {
1856 return messages;
1857 }
1858 }
1859
1860 if let Ok(value) = serde_json::from_str::<Value>(raw) {
1861 if let Some(messages) = parse_session_report_messages(&value) {
1862 return messages;
1863 }
1864 if let Some(messages) = parse_simple_role_messages(&value) {
1865 return messages;
1866 }
1867 if let Some(messages) = parse_chatgpt_mapping_messages(&value) {
1868 return messages;
1869 }
1870 }
1871 }
1872
1873 Vec::new()
1874}
1875
/// Parses a transcript in which user prompts are lines starting with `> `.
///
/// Returns `None` unless at least two such lines exist. Each marker line
/// becomes a `user` message; the non-empty lines that follow it (excluding
/// `---` separators) up to the next marker are joined with newlines into one
/// `assistant` message. Text before the first marker is ignored.
fn parse_marker_transcript(raw: &str) -> Option<Vec<(String, String)>> {
    fn prompt_of(line: &str) -> Option<&str> {
        line.trim_start().strip_prefix("> ")
    }

    fn flush_reply(reply: &mut Vec<&str>, messages: &mut Vec<(String, String)>) {
        if !reply.is_empty() {
            messages.push(("assistant".to_string(), reply.join("\n")));
            reply.clear();
        }
    }

    let marker_count = raw.lines().filter(|line| prompt_of(line).is_some()).count();
    if marker_count < 2 {
        return None;
    }

    let mut messages: Vec<(String, String)> = Vec::new();
    let mut reply: Vec<&str> = Vec::new();
    let mut seen_prompt = false;

    for line in raw.lines() {
        if let Some(prompt) = prompt_of(line) {
            // A new prompt closes the reply collected for the previous one.
            flush_reply(&mut reply, &mut messages);
            messages.push(("user".to_string(), prompt.trim().to_string()));
            seen_prompt = true;
        } else if seen_prompt {
            let content = line.trim();
            if !content.is_empty() && content != "---" {
                reply.push(content);
            }
        }
    }
    flush_reply(&mut reply, &mut messages);

    (!messages.is_empty()).then_some(messages)
}
1916
1917fn parse_jsonl_messages(raw: &str) -> Option<Vec<(String, String)>> {
1918 let mut messages = Vec::new();
1919 let mut has_codex_session_meta = false;
1920 let mut saw_jsonl = false;
1921
1922 for line in raw.lines() {
1923 let trimmed = line.trim();
1924 if trimmed.is_empty() {
1925 continue;
1926 }
1927 let Ok(value) = serde_json::from_str::<Value>(trimmed) else {
1928 continue;
1929 };
1930 saw_jsonl = true;
1931 let Some(object) = value.as_object() else {
1932 continue;
1933 };
1934
1935 match object.get("type").and_then(|v| v.as_str()).unwrap_or("") {
1936 "session_meta" => {
1937 has_codex_session_meta = true;
1938 }
1939 "event_msg" => {
1940 let Some(payload) = object.get("payload").and_then(|v| v.as_object()) else {
1941 continue;
1942 };
1943 let Some(text) = payload.get("message").and_then(|v| v.as_str()) else {
1944 continue;
1945 };
1946 match payload.get("type").and_then(|v| v.as_str()).unwrap_or("") {
1947 "user_message" => messages.push(("user".to_string(), text.trim().to_string())),
1948 "agent_message" => {
1949 messages.push(("assistant".to_string(), text.trim().to_string()))
1950 }
1951 _ => {}
1952 }
1953 }
1954 "human" | "user" => {
1955 if let Some(text) = extract_text_content(object.get("message").unwrap_or(&value)) {
1956 messages.push(("user".to_string(), text));
1957 }
1958 }
1959 "assistant" => {
1960 if let Some(text) = extract_text_content(object.get("message").unwrap_or(&value)) {
1961 messages.push(("assistant".to_string(), text));
1962 }
1963 }
1964 _ => {
1965 if let Some(role) = object.get("role").and_then(|v| v.as_str()) {
1966 if let Some(text) = extract_text_content(&value) {
1967 match role {
1968 "user" | "human" => messages.push(("user".to_string(), text)),
1969 "assistant" | "ai" => messages.push(("assistant".to_string(), text)),
1970 _ => {}
1971 }
1972 }
1973 }
1974 }
1975 }
1976 }
1977
1978 if !saw_jsonl {
1979 return None;
1980 }
1981
1982 if has_codex_session_meta || !messages.is_empty() {
1983 return Some(messages);
1984 }
1985
1986 None
1987}
1988
1989fn parse_session_report_messages(value: &Value) -> Option<Vec<(String, String)>> {
1990 let report = value.as_object()?;
1991 let transcript = report.get("transcript")?.as_array()?;
1992 let mut messages = Vec::new();
1993
1994 for entry in transcript {
1995 let Some(obj) = entry.as_object() else {
1996 continue;
1997 };
1998 let speaker = obj
1999 .get("speaker")
2000 .and_then(|v| v.as_str())
2001 .unwrap_or_default();
2002 let text = obj
2003 .get("text")
2004 .and_then(|v| v.as_str())
2005 .unwrap_or_default()
2006 .trim()
2007 .to_string();
2008 if text.is_empty() {
2009 continue;
2010 }
2011 match session_speaker_kind(speaker) {
2012 SessionSpeakerKind::User => messages.push(("user".to_string(), text)),
2013 SessionSpeakerKind::Assistant => messages.push(("assistant".to_string(), text)),
2014 SessionSpeakerKind::Ignore => {}
2015 }
2016 }
2017
2018 (!messages.is_empty()).then_some(messages)
2019}
2020
2021fn parse_simple_role_messages(value: &Value) -> Option<Vec<(String, String)>> {
2022 if let Some(array) = value.as_array() {
2023 let messages = collect_role_messages(array);
2024 return (!messages.is_empty()).then_some(messages);
2025 }
2026
2027 let obj = value.as_object()?;
2028 if let Some(messages_value) = obj.get("messages").or_else(|| obj.get("chat_messages")) {
2029 let array = messages_value.as_array()?;
2030 let messages = collect_role_messages(array);
2031 return (!messages.is_empty()).then_some(messages);
2032 }
2033
2034 None
2035}
2036
2037fn collect_role_messages(items: &[Value]) -> Vec<(String, String)> {
2038 let mut messages = Vec::new();
2039 for item in items {
2040 let Some(obj) = item.as_object() else {
2041 continue;
2042 };
2043 let Some(role) = obj.get("role").and_then(|v| v.as_str()) else {
2044 continue;
2045 };
2046 let Some(text) = extract_text_content(item) else {
2047 continue;
2048 };
2049 match role {
2050 "user" | "human" => messages.push(("user".to_string(), text)),
2051 "assistant" | "ai" => messages.push(("assistant".to_string(), text)),
2052 _ => {}
2053 }
2054 }
2055 messages
2056}
2057
2058fn parse_chatgpt_mapping_messages(value: &Value) -> Option<Vec<(String, String)>> {
2059 let mapping = value.get("mapping")?.as_object()?;
2060 let mut current_id = mapping.iter().find_map(|(node_id, node)| {
2061 let obj = node.as_object()?;
2062 (obj.get("parent").is_some_and(|parent| parent.is_null())).then_some(node_id.clone())
2063 })?;
2064
2065 let mut messages = Vec::new();
2066 let mut visited = std::collections::HashSet::new();
2067
2068 while visited.insert(current_id.clone()) {
2069 let Some(node) = mapping.get(¤t_id).and_then(|v| v.as_object()) else {
2070 break;
2071 };
2072
2073 if let Some(message) = node.get("message") {
2074 let role = message
2075 .get("author")
2076 .and_then(|author| author.get("role"))
2077 .and_then(|v| v.as_str())
2078 .unwrap_or("");
2079 if let Some(text) = extract_text_content(message) {
2080 match role {
2081 "user" => messages.push(("user".to_string(), text)),
2082 "assistant" => messages.push(("assistant".to_string(), text)),
2083 _ => {}
2084 }
2085 }
2086 }
2087
2088 let Some(next_id) = node
2089 .get("children")
2090 .and_then(|children| children.as_array())
2091 .and_then(|children| children.first())
2092 .and_then(|child| child.as_str())
2093 else {
2094 break;
2095 };
2096 current_id = next_id.to_string();
2097 }
2098
2099 (!messages.is_empty()).then_some(messages)
2100}
2101
2102fn extract_text_content(value: &Value) -> Option<String> {
2103 if let Some(text) = value.as_str() {
2104 let trimmed = text.trim();
2105 return (!trimmed.is_empty()).then_some(trimmed.to_string());
2106 }
2107
2108 if let Some(array) = value.as_array() {
2109 let joined = array
2110 .iter()
2111 .filter_map(extract_text_content)
2112 .filter(|part| !part.is_empty())
2113 .collect::<Vec<_>>()
2114 .join("\n");
2115 return (!joined.is_empty()).then_some(joined);
2116 }
2117
2118 let obj = value.as_object()?;
2119
2120 if let Some(content) = obj.get("content") {
2121 if let Some(text) = extract_text_content(content) {
2122 return Some(text);
2123 }
2124 }
2125
2126 if let Some(text) = obj.get("text").and_then(|v| v.as_str()) {
2127 let trimmed = text.trim();
2128 if !trimmed.is_empty() {
2129 return Some(trimmed.to_string());
2130 }
2131 }
2132
2133 if let Some(parts) = obj.get("parts").and_then(|v| v.as_array()) {
2134 let joined = parts
2135 .iter()
2136 .filter_map(|part| part.as_str().map(|s| s.trim().to_string()))
2137 .filter(|part| !part.is_empty())
2138 .collect::<Vec<_>>()
2139 .join("\n");
2140 if !joined.is_empty() {
2141 return Some(joined);
2142 }
2143 }
2144
2145 None
2146}
2147
/// Turns a relative import path into a flat slug.
///
/// Backslashes count as path separators. ASCII alphanumerics, `-` and `_`
/// are kept, every other non-separator character becomes `_`, leading and
/// trailing separators are removed, and each remaining separator becomes
/// `__`.
fn slugify_import_path(path: &Path) -> String {
    let mut cleaned = String::new();
    for ch in path.to_string_lossy().chars() {
        match ch {
            '\\' | '/' => cleaned.push('/'),
            keep if keep.is_ascii_alphanumeric() || keep == '-' || keep == '_' => {
                cleaned.push(keep)
            }
            _ => cleaned.push('_'),
        }
    }
    cleaned.trim_matches('/').replace('/', "__")
}
2163
2164fn embed_text_blocking(text: &str, base_url: &str) -> Option<Vec<f32>> {
2180 embed_text_with_prefix(text, "search_document", base_url)
2181}
2182
2183fn embed_query_blocking(text: &str, base_url: &str) -> Option<Vec<f32>> {
2184 embed_text_with_prefix(text, "search_query", base_url)
2185}
2186
2187fn embed_text_with_prefix(text: &str, task: &str, base_url: &str) -> Option<Vec<f32>> {
2188 let prefixed = format!("{}: {}", task, text);
2190 let input = if prefixed.len() > 8000 {
2192 &prefixed[..8000]
2193 } else {
2194 &prefixed
2195 };
2196
2197 let client = reqwest::blocking::Client::builder()
2198 .timeout(std::time::Duration::from_secs(10))
2199 .build()
2200 .ok()?;
2201
2202 let body = serde_json::json!({
2203 "model": "nomic-embed-text-v2",
2204 "input": input
2205 });
2206
2207 let url = format!("{}/v1/embeddings", base_url);
2208 let resp = client.post(&url).json(&body).send().ok()?;
2209
2210 if !resp.status().is_success() {
2211 return None;
2212 }
2213
2214 let json: serde_json::Value = resp.json().ok()?;
2215 let embedding = json["data"][0]["embedding"].as_array()?;
2216 let vec: Vec<f32> = embedding
2217 .iter()
2218 .filter_map(|v| v.as_f64().map(|f| f as f32))
2219 .collect();
2220
2221 if vec.is_empty() {
2222 None
2223 } else {
2224 Some(vec)
2225 }
2226}
2227
/// Cosine similarity of two vectors.
///
/// Returns `0.0` when the vectors differ in length, are empty, or either has
/// zero magnitude.
fn cosine_similarity(a: &[f32], b: &[f32]) -> f32 {
    if a.is_empty() || a.len() != b.len() {
        return 0.0;
    }

    let magnitude = |v: &[f32]| v.iter().map(|x| x * x).sum::<f32>().sqrt();

    let dot: f32 = a.iter().zip(b.iter()).map(|(x, y)| x * y).sum();
    let norm_a = magnitude(a);
    let norm_b = magnitude(b);
    if norm_a == 0.0 || norm_b == 0.0 {
        return 0.0;
    }
    dot / (norm_a * norm_b)
}
2243
/// Serializes floats as consecutive little-endian 4-byte groups.
fn floats_to_blob(floats: &[f32]) -> Vec<u8> {
    let mut blob = Vec::with_capacity(floats.len() * 4);
    for value in floats {
        blob.extend_from_slice(&value.to_le_bytes());
    }
    blob
}
2247
/// Decodes little-endian 4-byte groups into floats; trailing bytes that do
/// not fill a whole group are ignored.
fn blob_to_floats(blob: &[u8]) -> Vec<f32> {
    let mut floats = Vec::with_capacity(blob.len() / 4);
    for group in blob.chunks_exact(4) {
        let mut raw = [0u8; 4];
        raw.copy_from_slice(group);
        floats.push(f32::from_le_bytes(raw));
    }
    floats
}
2253
/// Cleans up text returned by a document extractor.
///
/// Line endings are unified to `\n`, then whitespace and NUL characters are
/// stripped from both ends. Returns `None` when nothing remains.
fn normalize_extracted_document_text(text: String) -> Option<String> {
    let unified = text.replace("\r\n", "\n").replace('\r', "\n");
    let is_padding = |c: char| c == '\0' || c.is_whitespace();
    match unified.trim_matches(is_padding) {
        "" => None,
        kept => Some(kept.to_string()),
    }
}
2268
2269fn extract_pdf_text_with_pdf_extract(path: &std::path::Path) -> Result<Option<String>, String> {
2270 let previous_hook = std::panic::take_hook();
2271 std::panic::set_hook(Box::new(|_| {}));
2272 let result = std::panic::catch_unwind(std::panic::AssertUnwindSafe(|| {
2273 pdf_extract::extract_text(path)
2274 }));
2275 std::panic::set_hook(previous_hook);
2276
2277 match result {
2278 Ok(Ok(text)) => Ok(normalize_extracted_document_text(text)),
2279 Ok(Err(e)) => Err(format!("pdf-extract failed: {}", e)),
2280 Err(payload) => {
2281 let panic_text = if let Some(msg) = payload.downcast_ref::<&str>() {
2282 (*msg).to_string()
2283 } else if let Some(msg) = payload.downcast_ref::<String>() {
2284 msg.clone()
2285 } else {
2286 "unknown parser panic".to_string()
2287 };
2288 Err(format!("pdf-extract panicked: {}", panic_text))
2289 }
2290 }
2291}
2292
2293fn extract_pdf_text_with_lopdf(path: &std::path::Path) -> Result<Option<String>, String> {
2294 let mut doc =
2295 lopdf::Document::load(path).map_err(|e| format!("lopdf could not open PDF: {}", e))?;
2296
2297 if doc.is_encrypted() {
2298 doc.decrypt("")
2299 .map_err(|e| format!("PDF is encrypted and could not be decrypted: {}", e))?;
2300 }
2301
2302 let page_numbers: Vec<u32> = doc.get_pages().keys().copied().collect();
2303 if page_numbers.is_empty() {
2304 return Ok(None);
2305 }
2306
2307 let mut extracted_pages = Vec::new();
2308 let mut page_errors = Vec::new();
2309
2310 for page_number in page_numbers {
2311 match doc.extract_text(&[page_number]) {
2312 Ok(text) => {
2313 if let Some(page_text) = normalize_extracted_document_text(text) {
2314 extracted_pages.push(page_text);
2315 }
2316 }
2317 Err(e) => page_errors.push(format!("page {page_number}: {e}")),
2318 }
2319 }
2320
2321 if !extracted_pages.is_empty() {
2322 return Ok(Some(extracted_pages.join("\n\n")));
2323 }
2324
2325 if !page_errors.is_empty() {
2326 let sample_errors = page_errors
2327 .into_iter()
2328 .take(3)
2329 .collect::<Vec<_>>()
2330 .join("; ");
2331 return Err(format!(
2332 "lopdf could not extract usable page text ({sample_errors})"
2333 ));
2334 }
2335
2336 Ok(None)
2337}
2338
2339fn extract_pdf_text_inside_helper(path: &std::path::Path) -> Result<Option<String>, String> {
2340 let mut failures = Vec::new();
2341
2342 match extract_pdf_text_with_pdf_extract(path) {
2343 Ok(Some(text)) => return Ok(Some(text)),
2344 Ok(None) => failures.push("pdf-extract found no usable text".to_string()),
2345 Err(e) => failures.push(e),
2346 }
2347
2348 match extract_pdf_text_with_lopdf(path) {
2349 Ok(Some(text)) => return Ok(Some(text)),
2350 Ok(None) => failures.push("lopdf found no usable text".to_string()),
2351 Err(e) => failures.push(e),
2352 }
2353
2354 let detail = failures.into_iter().take(2).collect::<Vec<_>>().join("; ");
2355 Err(format!(
2356 "Could not extract text from PDF. Hematite keeps PDF parsing best-effort so it can stay a lightweight single-binary local coding harness. The file may be scanned/image-only, encrypted, or use unsupported font encoding. Try exporting it to text/markdown or attach page images instead. Detail: {}",
2357 detail
2358 ))
2359}
2360
/// Extracts PDF text by re-running the current executable as a helper.
///
/// The child is started with `--pdf-extract-helper <path>`, no stdin and
/// captured stdout/stderr. A non-success exit status becomes `Err` carrying
/// the helper's trimmed stderr (or a generic message when stderr is empty);
/// blank output becomes `Ok(None)`.
fn extract_pdf_text(path: &std::path::Path) -> Result<Option<String>, String> {
    use std::process::{Command, Stdio};

    let exe = match std::env::current_exe() {
        Ok(exe) => exe,
        Err(e) => {
            return Err(format!(
                "Could not locate Hematite executable for PDF helper: {}",
                e
            ))
        }
    };

    let mut helper = Command::new(exe);
    helper
        .arg("--pdf-extract-helper")
        .arg(path)
        .stdin(Stdio::null())
        .stdout(Stdio::piped())
        .stderr(Stdio::piped());
    let output = helper
        .output()
        .map_err(|e| format!("Could not launch PDF helper: {}", e))?;

    if !output.status.success() {
        let stderr = String::from_utf8_lossy(&output.stderr);
        let message = stderr.trim();
        return Err(if message.is_empty() {
            "PDF extraction failed.".to_string()
        } else {
            message.to_string()
        });
    }

    let text = String::from_utf8(output.stdout)
        .map_err(|e| format!("PDF helper returned non-UTF8 text: {}", e))?;
    let is_blank = text.trim().is_empty();
    Ok((!is_blank).then_some(text))
}
2390
2391pub fn run_pdf_extract_helper(path: &std::path::Path) -> i32 {
2392 match extract_pdf_text_inside_helper(path) {
2393 Ok(Some(text)) => {
2394 use std::io::Write;
2395 let mut stdout = std::io::stdout();
2396 if stdout.write_all(text.as_bytes()).is_ok() {
2397 0
2398 } else {
2399 let _ = writeln!(
2400 std::io::stderr(),
2401 "PDF helper could not write extracted text."
2402 );
2403 1
2404 }
2405 }
2406 Ok(None) => {
2407 eprintln!(
2408 "Could not extract text from PDF. Hematite keeps PDF parsing best-effort so it can stay a lightweight single-binary local coding harness. The file appears to contain no usable embedded text. Try exporting it to text/markdown or attach page images instead."
2409 );
2410 1
2411 }
2412 Err(e) => {
2413 eprintln!("{}", e);
2414 1
2415 }
2416 }
2417}
2418
2419pub fn extract_document_text(path: &std::path::Path) -> Result<String, String> {
2422 let ext = path
2423 .extension()
2424 .and_then(|e| e.to_str())
2425 .unwrap_or("")
2426 .to_lowercase();
2427 match ext.as_str() {
2428 "pdf" => {
2429 let text = extract_pdf_text(path)?.ok_or_else(|| {
2430 "PDF contains no extractable text — it may be scanned/image-only. \
2431 Try attaching page screenshots with /image instead."
2432 .to_string()
2433 })?;
2434 pdf_quality_check(text)
2435 }
2436 _ => std::fs::read_to_string(path).map_err(|e| format!("Could not read file: {e}")),
2437 }
2438}
2439
/// Rejects PDF text that is too short or looks garbled.
///
/// The trimmed text must be at least 150 bytes long, and spaces must make up
/// at least 4% of its non-newline characters. On success the original,
/// untrimmed text is returned.
fn pdf_quality_check(text: String) -> Result<String, String> {
    let trimmed = text.trim();
    let length = trimmed.len();

    if length < 150 {
        return Err(format!(
            "PDF extracted only {} characters — likely a scanned or image-only PDF, or uses unsupported custom fonts. Try attaching page screenshots with /image instead.",
            length
        ));
    }

    // Count non-newline characters and spaces in a single pass.
    let (visible, spaces) = trimmed
        .chars()
        .fold((0usize, 0usize), |(visible, spaces), ch| match ch {
            '\n' | '\r' => (visible, spaces),
            ' ' => (visible + 1, spaces + 1),
            _ => (visible + 1, spaces),
        });
    let space_ratio = if visible > 0 {
        spaces as f32 / visible as f32
    } else {
        0.0
    };

    if space_ratio < 0.04 {
        return Err(
            "PDF text extraction produced garbled output — words are merged with no spaces. This usually means the PDF uses custom embedded fonts (common with academic publishers like EBSCO, Elsevier, Springer). Try a PDF exported from Word, Google Docs, or LaTeX, or attach page screenshots with /image instead."
                .to_string(),
        );
    }

    Ok(text)
}
2478
2479fn chunk_by_symbols(ext: &str, text: &str) -> Vec<String> {
2483 if ext == "rs" {
2484 chunk_rust_symbols(text)
2485 } else {
2486 chunk_paragraphs(text)
2487 }
2488}
2489
2490fn chunk_rust_symbols(text: &str) -> Vec<String> {
2499 const ITEM_STARTS: &[&str] = &[
2500 "pub fn ",
2501 "pub async fn ",
2502 "pub unsafe fn ",
2503 "async fn ",
2504 "unsafe fn ",
2505 "fn ",
2506 "pub impl",
2507 "impl ",
2508 "pub struct ",
2509 "struct ",
2510 "pub enum ",
2511 "enum ",
2512 "pub trait ",
2513 "trait ",
2514 "pub mod ",
2515 "mod ",
2516 "pub type ",
2517 "type ",
2518 "pub const ",
2519 "const ",
2520 "pub static ",
2521 "static ",
2522 ];
2523
2524 let lines: Vec<&str> = text.lines().collect();
2525 let mut chunks: Vec<String> = Vec::new();
2526 let mut current: Vec<&str> = Vec::new();
2527
2528 for &line in &lines {
2529 let top_level = !line.starts_with(' ') && !line.starts_with('\t');
2530 let is_item = top_level && ITEM_STARTS.iter().any(|s| line.starts_with(s));
2531
2532 if is_item && !current.is_empty() {
2533 let mut split = current.len();
2536 while split > 0 {
2537 let prev = current[split - 1].trim();
2538 if prev.starts_with("///")
2539 || prev.starts_with("//!")
2540 || prev.starts_with("#[")
2541 || prev.is_empty()
2542 {
2543 split -= 1;
2544 } else {
2545 break;
2546 }
2547 }
2548 let body = current[..split].join("\n");
2549 if !body.trim().is_empty() {
2550 chunks.push(body);
2551 }
2552 current = current[split..].to_vec();
2553 }
2554 current.push(line);
2555 }
2556 if !current.is_empty() {
2557 let body = current.join("\n");
2558 if !body.trim().is_empty() {
2559 chunks.push(body);
2560 }
2561 }
2562
2563 let mut result = Vec::new();
2565 for chunk in chunks {
2566 if chunk.len() > 3000 {
2567 result.extend(sliding_window_chunks(&chunk, 2000, 200));
2568 } else {
2569 result.push(chunk);
2570 }
2571 }
2572 result
2573}
2574
2575fn chunk_paragraphs(text: &str) -> Vec<String> {
2577 let mut result: Vec<String> = Vec::new();
2578 let mut current = String::new();
2579
2580 for para in text.split("\n\n") {
2581 if current.len() + para.len() + 2 > 2000 {
2582 if !current.trim().is_empty() {
2583 result.push(current.clone());
2584 }
2585 current = para.to_string();
2586 } else {
2587 if !current.is_empty() {
2588 current.push_str("\n\n");
2589 }
2590 current.push_str(para);
2591 }
2592 }
2593 if !current.trim().is_empty() {
2594 result.push(current);
2595 }
2596
2597 let mut final_result = Vec::new();
2598 for chunk in result {
2599 if chunk.len() > 2000 {
2600 final_result.extend(sliding_window_chunks(&chunk, 2000, 200));
2601 } else {
2602 final_result.push(chunk);
2603 }
2604 }
2605 final_result
2606}
2607
/// Splits `text` into overlapping windows measured in characters (not bytes).
///
/// Each window holds up to `chunk_size` characters, and consecutive windows
/// start `chunk_size - overlap` characters apart, so neighbouring chunks share
/// `overlap` characters. The final window ends exactly at the end of `text`
/// and may be shorter than `chunk_size`. Empty input yields an empty vector.
///
/// Degenerate parameters are clamped instead of panicking or looping forever:
/// a `chunk_size` of 0 is treated as 1, and when `overlap >= chunk_size` the
/// window advances by a single character per step.
fn sliding_window_chunks(text: &str, chunk_size: usize, overlap: usize) -> Vec<String> {
    // A zero-width window could never make progress through the text.
    let chunk_size = chunk_size.max(1);
    // `chunk_size - overlap` would underflow for overlap > chunk_size and
    // stall for overlap == chunk_size; always advance by at least one char.
    let step = chunk_size.saturating_sub(overlap).max(1);

    // Index by char so multi-byte UTF-8 sequences are never cut in half.
    let chars: Vec<char> = text.chars().collect();
    let mut result = Vec::new();
    let mut start = 0;
    while start < chars.len() {
        let end = start.saturating_add(chunk_size).min(chars.len());
        result.push(chars[start..end].iter().collect());
        if end == chars.len() {
            break;
        }
        start += step;
    }
    result
}