1use rusqlite::{params, Connection};
2use serde::Deserialize;
3use serde_json::Value;
4use std::collections::{HashMap, HashSet};
5use std::path::Path;
6
/// Handle to the SQLite-backed index used for lexical (FTS) and embedding-based retrieval.
pub struct Vein {
    /// Shared SQLite connection. Every method locks this mutex before touching the database.
    db: std::sync::Arc<std::sync::Mutex<Connection>>,
    /// Base URL handed to the embedding helpers (`embed_text_blocking` / `embed_query_blocking`).
    base_url: String,
}

// SAFETY: the SQLite connection is only reachable through the `Mutex`, so access to it is
// serialized; `base_url` is a plain `String`.
// NOTE(review): `Arc<Mutex<Connection>>` is normally auto-`Send`/`Sync` when `Connection: Send`;
// confirm whether these manual impls are still required.
unsafe impl Send for Vein {}
unsafe impl Sync for Vein {}
37
/// One retrieved chunk, as returned by the `search_*` methods.
#[derive(Debug, Clone)]
pub struct SearchResult {
    /// Path key of the indexed document the chunk belongs to.
    pub path: String,
    /// Text of the matching chunk.
    pub content: String,
    /// Relevance score; results are sorted so that larger scores come first.
    pub score: f32,
    /// Room stored for the document in `chunks_meta`.
    pub room: String,
    /// `last_modified` value stored for the document in `chunks_meta`.
    pub last_modified: i64,
    /// Memory-type label stored for the document; empty when none was recorded.
    pub memory_type: String,
}
52
/// A frequently edited file, built from the `file_heat` and `chunks_meta` tables.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct VeinHotFile {
    /// Path key of the file.
    pub path: String,
    /// Heat counter; `bump_heat` increments it by one per call.
    pub heat: i64,
    /// `last_modified` value stored for the file in `chunks_meta`.
    pub last_modified: i64,
    /// Room stored for the file in `chunks_meta`.
    pub room: String,
}
60
/// Point-in-time summary of the index, produced by `Vein::inspect_snapshot`.
#[derive(Debug, Clone)]
pub struct VeinInspectionSnapshot {
    /// Rows in `chunks_meta` whose path is under neither `session/` nor `.hematite/docs/`.
    pub indexed_source_files: usize,
    /// Rows in `chunks_meta` whose path is under `.hematite/docs/`.
    pub indexed_docs: usize,
    /// Rows in `chunks_meta` whose path is under `session/`.
    pub indexed_session_exchanges: usize,
    /// Rows in `chunks_vec` whose path is not under `session/`.
    pub embedded_source_doc_chunks: usize,
    /// Whether `chunks_vec` holds at least one row.
    pub has_any_embeddings: bool,
    /// Room with the largest summed heat, if any heat has been recorded.
    pub active_room: Option<String>,
    /// Hottest files, most heated first.
    pub hot_files: Vec<VeinHotFile>,
    /// True when `hot_files` is non-empty.
    pub l1_ready: bool,
}
72
/// Features extracted from a search query by `QuerySignals::from_query`, consumed by reranking.
#[derive(Debug, Default)]
struct QuerySignals {
    /// Output of `extract_exact_phrases` for the query.
    exact_phrases: Vec<String>,
    /// Output of `extract_standout_terms` for the query.
    standout_terms: Vec<String>,
    /// True when the query contains a cue such as "remember", "earlier" or "last time".
    historical_memory_hint: bool,
    /// Output of `extract_temporal_reference` for the query.
    temporal_reference: Option<TemporalReference>,
    /// Memory-type label the query appears to ask about ("decision", "problem", ...), if any.
    query_memory_type: Option<&'static str>,
}
83
/// A point in time referenced by a query, with a tolerance window.
#[derive(Debug, Clone, Copy)]
struct TemporalReference {
    /// Target timestamp. NOTE(review): presumably Unix seconds, like `last_modified` — confirm.
    target_ts: i64,
    /// Width of the tolerance window, in seconds (per the field name).
    window_secs: i64,
}
89
/// Deserialized session report. Both fields fall back to their default when absent.
#[derive(Debug, Deserialize)]
struct SessionReport {
    /// Session start marker as found in the report; empty when missing.
    #[serde(default)]
    session_start: String,
    /// Transcript entries in report order; empty when missing.
    #[serde(default)]
    transcript: Vec<SessionTranscriptEntry>,
}
97
/// One entry of a session transcript. Both fields fall back to an empty string when absent.
#[derive(Debug, Deserialize)]
struct SessionTranscriptEntry {
    /// Speaker label as written in the report.
    #[serde(default)]
    speaker: String,
    /// Text of the entry.
    #[serde(default)]
    text: String,
}
105
/// A session exchange ready to be indexed as a single chunk.
#[derive(Debug)]
struct SessionExchange {
    /// Path key under which the exchange is stored in the index.
    path: String,
    /// Timestamp passed to the indexer as the exchange's `last_modified`.
    last_modified: i64,
    /// Text that is indexed for the exchange.
    content: String,
}
112
/// Classification of a transcript speaker.
/// NOTE(review): the code that assigns these variants is outside this view.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum SessionSpeakerKind {
    /// The entry was spoken by the user.
    User,
    /// The entry was spoken by the assistant.
    Assistant,
    /// The entry should be skipped.
    Ignore,
}
119
/// Classifies a path into a "room" (a coarse subsystem label such as `"docs"`, `"tests"`
/// or `"ui"`).
///
/// The path is lower-cased and backslashes are converted to `/` before matching. Every rule
/// that matches proposes a room with a fixed priority; the highest priority wins. When no rule
/// matches, the first path component is used if it is non-empty and contains no `.`;
/// otherwise the result is `"root"`.
///
/// Only the text after the last `.` of the filename counts as an extension, so a dot-less
/// file that happens to be called `md` or `iss` is not treated as Markdown or an installer
/// script.
pub fn detect_room(path: &str) -> String {
    let lower = path.to_lowercase().replace('\\', "/");
    let filename = lower.rsplit('/').next().unwrap_or(&lower);
    // `rsplit('.').next()` would hand back the whole filename when there is no dot at all.
    let ext = filename.rsplit_once('.').map_or("", |(_, e)| e);

    let mut best_room = None::<&str>;
    let mut best_score = 0i32;
    // Strictly-greater comparison: the first rule to reach a given priority keeps it.
    let mut consider = |room: &'static str, score: i32| {
        if score > best_score {
            best_score = score;
            best_room = Some(room);
        }
    };

    // True when `segment` is a directory component of the path (or the entire path).
    let is_component = |segment: &str| {
        lower == segment
            || lower.starts_with(&format!("{segment}/"))
            || lower.contains(&format!("/{segment}/"))
    };

    if lower.starts_with("session/")
        || lower.starts_with(".hematite/reports/")
        || lower.starts_with(".hematite/imports/")
        || is_component("reports")
        || is_component("imports")
    {
        consider("session", 100);
    }

    if lower.starts_with(".hematite/docs/")
        || is_component("docs")
        || matches!(filename, "readme.md" | "claude.md" | ".hematite.md")
        || matches!(ext, "md" | "markdown" | "pdf" | "rst")
    {
        consider("docs", 80);
    }

    if is_component("tests")
        || filename.contains("diagnostic")
        || filename.ends_with("_test.rs")
        || filename.ends_with(".test.ts")
    {
        consider("tests", 85);
    }

    if lower.starts_with(".github/workflows/")
        || is_component("workflows")
        || filename == ".pre-commit-config.yaml"
        || filename == ".pre-commit-config.yml"
        || filename.contains("hook")
    {
        consider("automation", 84);
    }

    if lower.starts_with("installer/")
        || lower.starts_with("dist/")
        || lower.starts_with("scripts/package-")
        || filename.contains("release")
        || filename.contains("bump-version")
        || ext == "iss"
    {
        consider("release", 82);
    }

    if matches!(
        filename,
        "cargo.toml"
            | "cargo.lock"
            | "package.json"
            | "pnpm-lock.yaml"
            | "yarn.lock"
            | "bun.lock"
            | "bun.lockb"
            | "pyproject.toml"
            | "setup.py"
            | "go.mod"
            | "pom.xml"
            | "build.gradle"
            | "build.gradle.kts"
            | "cmakelists.txt"
            | ".gitignore"
            | "settings.json"
            | "mcp_servers.json"
    ) || filename.ends_with(".sln")
        || filename.ends_with(".csproj")
        || filename.contains("config")
    {
        consider("config", 76);
    }

    if is_component("ui")
        || matches!(
            filename,
            "tui.rs" | "voice.rs" | "hatch.rs" | "gpu_monitor.rs"
        )
    {
        consider("ui", 70);
    }

    if is_component("memory") || matches!(filename, "vein.rs" | "deep_reflect.rs") {
        consider("memory", 72);
    }

    if is_component("tools")
        || matches!(
            filename,
            "verify_build.rs"
                | "host_inspect.rs"
                | "shell.rs"
                | "code_sandbox.rs"
                | "project_map.rs"
                | "runtime_trace.rs"
        )
    {
        consider("tools", 68);
    }

    if filename.contains("mcp")
        || filename.contains("lsp")
        || lower.contains("/mcp/")
        || lower.contains("/lsp/")
    {
        consider("integration", 67);
    }

    if matches!(filename, "main.rs" | "runtime.rs" | "inference.rs")
        || filename.contains("startup")
        || filename.contains("runtime")
    {
        consider("runtime", 66);
    }

    if is_component("agent") {
        consider("agent", 60);
    }

    if lower.starts_with("libs/") || is_component("libs") {
        consider("libs", 58);
    }

    if lower.starts_with("scripts/") || is_component("scripts") {
        consider("scripts", 55);
    }

    if let Some(room) = best_room {
        return room.to_string();
    }

    // No rule matched: fall back to the top-level directory name, or "root" for a bare file.
    lower
        .split('/')
        .next()
        .filter(|s| !s.is_empty() && !s.contains('.'))
        .unwrap_or("root")
        .to_string()
}
280
/// Labels a piece of text with a memory type based on tell-tale phrases.
///
/// Matching is case-insensitive. Categories are checked in the order decision → problem →
/// milestone → preference and the first category with a hit wins. Returns `""` when nothing
/// matches.
///
/// The acronym `oom` is only recognized at the start of a word; as a plain substring it would
/// also fire on everyday words such as "room" or "zoom".
pub fn detect_memory_type(text: &str) -> &'static str {
    /// True when `needle` occurs in `haystack` and is not preceded by an alphanumeric char.
    fn contains_at_word_start(haystack: &str, needle: &str) -> bool {
        haystack.match_indices(needle).any(|(at, _)| {
            haystack[..at]
                .chars()
                .next_back()
                .map_or(true, |prev| !prev.is_alphanumeric())
        })
    }

    const DECISION_PATTERNS: &[&str] = &[
        "let's use ",
        "we'll use ",
        "decided to ",
        "going with ",
        "we agreed ",
        "the plan is",
        "we're going to",
        "switching to",
        "we chose",
        "final decision",
        "we settled on",
        "agreed on",
        "we decided",
    ];
    const PROBLEM_PATTERNS: &[&str] = &[
        "bug fixed",
        "bug was",
        "the issue was",
        "root cause",
        "error was",
        "turned out to be",
        "the fix was",
        "was caused by",
        "broken because",
        "fixed by",
        "the problem was",
        "found the bug",
        "port conflict",
        "crash",
        "panicked",
        "segfault",
        "out of memory",
    ];
    // Short acronyms that need a word boundary in front to avoid false positives.
    const PROBLEM_WORD_PATTERNS: &[&str] = &["oom"];
    const MILESTONE_PATTERNS: &[&str] = &[
        "now working",
        "successfully",
        "shipped",
        "deployed",
        "it works",
        "tests pass",
        "all green",
        "breakthrough",
        "finally got",
        "got it working",
        "completed",
        "finished",
        "done with",
        "landed",
    ];
    const PREFERENCE_PATTERNS: &[&str] = &[
        "i prefer",
        "i like",
        "i don't like",
        "i want",
        "always use",
        "never use",
        "i usually",
        "my preference",
        "keep it",
        "avoid using",
    ];

    let lower = text.to_lowercase();
    let has_any = |patterns: &[&str]| patterns.iter().any(|pat| lower.contains(*pat));

    if has_any(DECISION_PATTERNS) {
        return "decision";
    }
    if has_any(PROBLEM_PATTERNS)
        || PROBLEM_WORD_PATTERNS
            .iter()
            .any(|pat| contains_at_word_start(&lower, pat))
    {
        return "problem";
    }
    if has_any(MILESTONE_PATTERNS) {
        return "milestone";
    }
    if has_any(PREFERENCE_PATTERNS) {
        return "preference";
    }

    ""
}
382
383impl Vein {
    /// Maximum number of session report files indexed per pass (newest names first).
    const SESSION_REPORT_LIMIT: usize = 5;
    /// NOTE(review): not referenced in the visible code; presumably caps the turns taken from
    /// one session — confirm against the session loaders.
    const SESSION_TURN_LIMIT: usize = 50;
    /// Maximum number of import files indexed per pass (most recently modified first).
    const IMPORT_FILE_LIMIT: usize = 12;
    /// Import files larger than this many bytes (10 MiB) are skipped.
    const IMPORT_MAX_BYTES: u64 = 10 * 1024 * 1024;
388
389 pub fn new<P: AsRef<Path>>(
390 db_path: P,
391 base_url: String,
392 ) -> Result<Self, Box<dyn std::error::Error>> {
393 let db = Connection::open(db_path)?;
394
395 db.execute_batch("PRAGMA journal_mode=WAL; PRAGMA synchronous=NORMAL;")?;
397
398 db.execute_batch(
402 "CREATE TABLE IF NOT EXISTS chunks_meta (
403 path TEXT PRIMARY KEY,
404 last_modified INTEGER NOT NULL,
405 room TEXT NOT NULL DEFAULT 'root'
406 );
407 CREATE VIRTUAL TABLE IF NOT EXISTS chunks_fts USING fts5(
408 path UNINDEXED,
409 content,
410 tokenize='porter ascii'
411 );
412 CREATE TABLE IF NOT EXISTS chunks_vec (
413 path TEXT NOT NULL,
414 chunk_idx INTEGER NOT NULL,
415 embedding BLOB NOT NULL,
416 PRIMARY KEY (path, chunk_idx)
417 );
418 CREATE TABLE IF NOT EXISTS file_heat (
419 path TEXT PRIMARY KEY,
420 heat INTEGER NOT NULL DEFAULT 0,
421 last_edit INTEGER NOT NULL DEFAULT 0
422 );",
423 )?;
424
425 let _ = db
427 .execute_batch("ALTER TABLE chunks_meta ADD COLUMN room TEXT NOT NULL DEFAULT 'root';");
428 let _ = db.execute_batch(
429 "ALTER TABLE file_heat ADD COLUMN last_edit INTEGER NOT NULL DEFAULT 0;",
430 );
431 let _ = db.execute_batch(
432 "ALTER TABLE chunks_meta ADD COLUMN memory_type TEXT NOT NULL DEFAULT '';",
433 );
434
435 Ok(Self {
436 db: std::sync::Arc::new(std::sync::Mutex::new(db)),
437 base_url,
438 })
439 }
440
441 pub fn index_document(
446 &mut self,
447 path: &str,
448 last_modified: i64,
449 full_text: &str,
450 ) -> Result<Vec<String>, Box<dyn std::error::Error>> {
451 let room = detect_room(path);
452 let ext = std::path::Path::new(path)
453 .extension()
454 .and_then(|e| e.to_str())
455 .unwrap_or("");
456 let chunks = chunk_by_symbols(ext, full_text);
457 let memory_type = if room == "session" {
459 detect_memory_type(full_text)
460 } else {
461 ""
462 };
463 self.index_chunks_with_room_and_type(path, last_modified, &room, memory_type, &chunks)
464 }
465
466 fn index_chunks_with_room_and_type(
467 &mut self,
468 path: &str,
469 last_modified: i64,
470 room: &str,
471 memory_type: &str,
472 chunks: &[String],
473 ) -> Result<Vec<String>, Box<dyn std::error::Error>> {
474 let db = self.db.lock().unwrap();
475 let existing: Option<i64> = db
476 .query_row(
477 "SELECT last_modified FROM chunks_meta WHERE path = ?1",
478 params![path],
479 |r| r.get(0),
480 )
481 .ok();
482
483 if let Some(ts) = existing {
484 if ts >= last_modified {
485 return Ok(Vec::new()); }
487 }
488
489 db.execute("DELETE FROM chunks_fts WHERE path = ?1", params![path])?;
491 db.execute("DELETE FROM chunks_vec WHERE path = ?1", params![path])?;
492 db.execute(
493 "INSERT OR REPLACE INTO chunks_meta (path, last_modified, room, memory_type) VALUES (?1, ?2, ?3, ?4)",
494 params![path, last_modified, room, memory_type],
495 )?;
496
497 drop(db);
498
499 let mut db = self.db.lock().unwrap();
500 let tx = db.transaction()?;
501 {
502 let mut stmt = tx.prepare("INSERT INTO chunks_fts (path, content) VALUES (?1, ?2)")?;
503 for chunk in chunks {
504 stmt.execute(params![path, chunk.as_str()])?;
505 }
506 }
507 tx.commit()?;
508
509 Ok(chunks.to_vec())
510 }
511
512 pub fn embed_and_store_chunks(&self, path: &str, chunks: &[String]) {
516 for (idx, chunk) in chunks.iter().enumerate() {
517 if let Some(vec) = embed_text_blocking(chunk, &self.base_url) {
518 let blob = floats_to_blob(&vec);
519 let db = self.db.lock().unwrap();
520 let _ = db.execute(
521 "INSERT OR REPLACE INTO chunks_vec (path, chunk_idx, embedding) VALUES (?1, ?2, ?3)",
522 params![path, idx as i64, blob],
523 );
524 }
525 }
526 }
527
528 pub fn search_bm25(
532 &self,
533 query: &str,
534 limit: usize,
535 ) -> Result<Vec<SearchResult>, Box<dyn std::error::Error>> {
536 const STOPWORDS: &[&str] = &[
540 "how", "does", "do", "did", "what", "where", "when", "why", "which", "who", "is",
541 "are", "was", "were", "be", "been", "being", "have", "has", "had", "a", "an", "the",
542 "and", "or", "but", "in", "on", "at", "to", "for", "of", "with", "by", "from", "get",
543 "gets", "got", "work", "works", "make", "makes", "use", "uses", "into", "that", "this",
544 "it", "its",
545 ];
546
547 let safe_query: String = query
548 .chars()
549 .map(|c| {
550 if c.is_alphanumeric() || c == ' ' || c == '_' {
551 c
552 } else {
553 ' '
554 }
555 })
556 .collect();
557
558 let fts_query = safe_query
560 .split_whitespace()
561 .filter(|w| w.len() >= 3 && !STOPWORDS.contains(&w.to_lowercase().as_str()))
562 .collect::<Vec<_>>()
563 .join(" OR ");
564
565 if fts_query.is_empty() {
566 return Ok(Vec::new());
567 }
568
569 let db = self.db.lock().unwrap();
570 let mut stmt = db.prepare(
571 "SELECT chunks_fts.path, chunks_fts.content, rank, cm.last_modified, cm.room, cm.memory_type
572 FROM chunks_fts
573 JOIN chunks_meta cm ON cm.path = chunks_fts.path
574 WHERE chunks_fts MATCH ?1
575 ORDER BY rank
576 LIMIT ?2",
577 )?;
578
579 let results: Vec<SearchResult> = stmt
580 .query_map(params![fts_query, limit as i64], |row| {
581 Ok(SearchResult {
582 path: row.get(0)?,
583 content: row.get(1)?,
584 score: -(row.get::<_, f64>(2).unwrap_or(0.0) as f32),
585 last_modified: row.get(3)?,
586 room: row.get(4)?,
587 memory_type: row.get::<_, String>(5).unwrap_or_default(),
588 })
589 })?
590 .filter_map(|r| r.ok())
591 .collect();
592
593 Ok(results)
594 }
595
596 pub fn search_semantic(&self, query: &str, limit: usize) -> Vec<SearchResult> {
599 let query_vec = match embed_query_blocking(query, &self.base_url) {
600 Some(v) => v,
601 None => return Vec::new(),
602 };
603
604 let rows: Vec<(String, i64, Vec<u8>, i64, String, String)> = {
606 let db = self.db.lock().unwrap();
607 let mut stmt = match db.prepare(
608 "SELECT cv.path, cv.chunk_idx, cv.embedding, cm.last_modified, cm.room, cm.memory_type
609 FROM chunks_vec cv
610 JOIN chunks_meta cm ON cm.path = cv.path",
611 ) {
612 Ok(s) => s,
613 Err(_) => return Vec::new(),
614 };
615 stmt.query_map([], |row| {
616 Ok((
617 row.get::<_, String>(0)?,
618 row.get::<_, i64>(1)?,
619 row.get::<_, Vec<u8>>(2)?,
620 row.get::<_, i64>(3)?,
621 row.get::<_, String>(4)?,
622 row.get::<_, String>(5).unwrap_or_default(),
623 ))
624 })
625 .ok()
626 .map(|rows| rows.filter_map(|r| r.ok()).collect())
627 .unwrap_or_default()
628 };
629
630 if rows.is_empty() {
631 return Vec::new();
632 }
633
634 let mut scored: Vec<(f32, String, i64, i64, String, String)> = rows
636 .into_iter()
637 .filter_map(|(path, idx, blob, last_modified, room, memory_type)| {
638 let vec = blob_to_floats(&blob);
639 let sim = cosine_similarity(&query_vec, &vec);
640 Some((sim, path, idx, last_modified, room, memory_type))
641 })
642 .collect();
643
644 scored.sort_by(|a, b| b.0.partial_cmp(&a.0).unwrap_or(std::cmp::Ordering::Equal));
645 scored.truncate(limit);
646
647 let db = self.db.lock().unwrap();
649 scored
650 .into_iter()
651 .filter_map(|(score, path, idx, last_modified, room, memory_type)| {
652 let content: Option<String> = db
653 .query_row(
654 "SELECT content FROM chunks_fts WHERE path = ?1 LIMIT 1 OFFSET ?2",
655 params![path, idx],
656 |r| r.get(0),
657 )
658 .ok();
659 content.map(|c| SearchResult {
660 path,
661 content: c,
662 score,
663 room,
664 last_modified,
665 memory_type,
666 })
667 })
668 .collect()
669 }
670
671 pub fn search_context(
678 &self,
679 query: &str,
680 limit: usize,
681 ) -> Result<Vec<SearchResult>, Box<dyn std::error::Error>> {
682 let candidate_limit = (limit.max(1) * 4).max(12);
683 let bm25 = self.search_bm25(query, candidate_limit).unwrap_or_default();
684 let semantic = self.search_semantic(query, candidate_limit);
685 let signals = QuerySignals::from_query(query);
686
687 let active_room = self.active_room();
689
690 let mut merged_by_path: HashMap<String, SearchResult> = HashMap::new();
693
694 for r in semantic {
695 let score = reranked_score(&signals, active_room.as_deref(), &r, true);
696 merge_scored_result(&mut merged_by_path, SearchResult { score, ..r });
697 }
698
699 for r in bm25 {
700 let score = reranked_score(&signals, active_room.as_deref(), &r, false);
701 merge_scored_result(&mut merged_by_path, SearchResult { score, ..r });
702 }
703
704 let mut merged: Vec<SearchResult> = merged_by_path.into_values().collect();
705 merged.sort_by(|a, b| {
706 b.score
707 .partial_cmp(&a.score)
708 .unwrap_or(std::cmp::Ordering::Equal)
709 });
710 merged.truncate(limit);
711 Ok(merged)
712 }
713
714 fn active_room(&self) -> Option<String> {
717 let db = self.db.lock().unwrap();
718 db.query_row(
719 "SELECT cm.room, SUM(fh.heat) as total
720 FROM file_heat fh
721 JOIN chunks_meta cm ON cm.path = fh.path
722 GROUP BY cm.room
723 ORDER BY total DESC
724 LIMIT 1",
725 [],
726 |row| row.get::<_, String>(0),
727 )
728 .ok()
729 }
730
731 pub fn index_project(&mut self) -> usize {
739 let root = crate::tools::file_ops::workspace_root();
740 let mut count = 0usize;
741
742 const INDEXABLE: &[&str] = &[
743 "rs", "toml", "md", "json", "ts", "tsx", "js", "py", "go", "c", "cpp", "h", "yaml",
744 "yml", "txt",
745 ];
746 const SKIP_DIRS: &[&str] = &["target", ".git", "node_modules", ".hematite"];
747
748 for entry in walkdir::WalkDir::new(&root)
749 .follow_links(false)
750 .into_iter()
751 .filter_entry(|e| {
752 if e.file_type().is_dir() {
753 let name = e.file_name().to_string_lossy();
754 return !SKIP_DIRS.contains(&name.as_ref());
755 }
756 true
757 })
758 .filter_map(|e| e.ok())
759 .filter(|e| e.file_type().is_file())
760 {
761 let path = entry.path();
762 let ext = path.extension().and_then(|e| e.to_str()).unwrap_or("");
763 if !INDEXABLE.contains(&ext) {
764 continue;
765 }
766
767 let Ok(meta) = std::fs::metadata(path) else {
768 continue;
769 };
770 if meta.len() > 512_000 {
771 continue;
772 }
773
774 let mtime = meta
775 .modified()
776 .map(|t| {
777 t.duration_since(std::time::UNIX_EPOCH)
778 .unwrap_or_default()
779 .as_secs() as i64
780 })
781 .unwrap_or(0);
782
783 let rel = path.strip_prefix(&root).unwrap_or(path);
784 let rel_str = rel.to_string_lossy().replace('\\', "/");
785
786 if let Ok(content) = std::fs::read_to_string(path) {
787 match self.index_document(&rel_str, mtime, &content) {
788 Ok(new_chunks) if !new_chunks.is_empty() => {
789 count += 1;
790 }
791 Ok(_) => {}
792 Err(_) => {}
793 }
794 }
795 }
796
797 count += self.index_workspace_artifacts(&root);
798
799 count
800 }
801
802 pub fn index_workspace_artifacts(&mut self, workspace_root: &std::path::Path) -> usize {
807 let mut count = self.index_docs_folder(workspace_root);
808 count += self.index_recent_session_reports(workspace_root);
809 count += self.index_imported_session_exports(workspace_root);
810 self.backfill_missing_embeddings();
811 count
812 }
813
814 fn index_docs_folder(&mut self, workspace_root: &std::path::Path) -> usize {
819 let docs_dir = workspace_root.join(".hematite").join("docs");
820 const DOCS_INDEXABLE: &[&str] = &["pdf", "md", "txt", "markdown"];
821 let mut count = 0usize;
822 let mut desired_paths = HashSet::new();
823
824 if docs_dir.exists() {
825 for entry in walkdir::WalkDir::new(&docs_dir)
826 .max_depth(3)
827 .follow_links(false)
828 .into_iter()
829 .filter_map(|e| e.ok())
830 .filter(|e| e.file_type().is_file())
831 {
832 let path = entry.path();
833 let ext = path
834 .extension()
835 .and_then(|e| e.to_str())
836 .unwrap_or("")
837 .to_lowercase();
838 if !DOCS_INDEXABLE.contains(&ext.as_str()) {
839 continue;
840 }
841
842 let Ok(meta) = std::fs::metadata(path) else {
843 continue;
844 };
845 if meta.len() > 50_000_000 {
846 continue;
847 }
848
849 let mtime = meta
850 .modified()
851 .map(|t| {
852 t.duration_since(std::time::UNIX_EPOCH)
853 .unwrap_or_default()
854 .as_secs() as i64
855 })
856 .unwrap_or(0);
857
858 let rel = path.strip_prefix(workspace_root).unwrap_or(path);
859 let rel_str = rel.to_string_lossy().replace('\\', "/");
860 desired_paths.insert(rel_str.clone());
861
862 let content = if ext == "pdf" {
863 extract_pdf_text(path).ok().flatten()
864 } else {
865 std::fs::read_to_string(path).ok()
866 };
867
868 if let Some(text) = content {
869 if text.trim().is_empty() {
870 continue;
871 }
872 match self.index_document(&rel_str, mtime, &text) {
873 Ok(new_chunks) if !new_chunks.is_empty() => {
874 count += 1;
875 }
876 Ok(_) => {}
877 Err(_) => {}
878 }
879 }
880 }
881 }
882
883 self.prune_indexed_prefix(".hematite/docs/", &desired_paths);
884 count
885 }
886
887 pub fn index_recent_session_reports(&mut self, workspace_root: &std::path::Path) -> usize {
890 let reports_dir = workspace_root.join(".hematite").join("reports");
891 let mut count = 0usize;
892 let mut desired_paths = HashSet::new();
893
894 if reports_dir.exists() {
895 let mut reports: Vec<std::path::PathBuf> = std::fs::read_dir(&reports_dir)
896 .ok()
897 .into_iter()
898 .flat_map(|entries| entries.filter_map(|entry| entry.ok()))
899 .map(|entry| entry.path())
900 .filter(|path| {
901 path.is_file()
902 && path.extension().and_then(|ext| ext.to_str()) == Some("json")
903 && path
904 .file_stem()
905 .and_then(|stem| stem.to_str())
906 .map(|stem| stem.starts_with("session_"))
907 .unwrap_or(false)
908 })
909 .collect();
910
911 reports.sort_by(|a, b| {
912 let a_name = a
913 .file_name()
914 .and_then(|name| name.to_str())
915 .unwrap_or_default();
916 let b_name = b
917 .file_name()
918 .and_then(|name| name.to_str())
919 .unwrap_or_default();
920 b_name.cmp(a_name)
921 });
922 reports.truncate(Self::SESSION_REPORT_LIMIT);
923
924 for report_path in reports {
925 let Ok(meta) = std::fs::metadata(&report_path) else {
926 continue;
927 };
928 let mtime = meta
929 .modified()
930 .map(|t| {
931 t.duration_since(std::time::UNIX_EPOCH)
932 .unwrap_or_default()
933 .as_secs() as i64
934 })
935 .unwrap_or(0);
936
937 for exchange in load_session_exchanges(&report_path, mtime) {
938 desired_paths.insert(exchange.path.clone());
939 let mtype = detect_memory_type(&exchange.content);
940 match self.index_chunks_with_room_and_type(
941 &exchange.path,
942 exchange.last_modified,
943 "session",
944 mtype,
945 std::slice::from_ref(&exchange.content),
946 ) {
947 Ok(new_chunks) if !new_chunks.is_empty() => {
948 count += 1;
949 }
950 Ok(_) => {}
951 Err(_) => {}
952 }
953 }
954 }
955 }
956
957 self.prune_indexed_prefix("session/", &desired_paths);
958 count
959 }
960
961 pub fn index_imported_session_exports(&mut self, workspace_root: &std::path::Path) -> usize {
966 let imports_dir = workspace_root.join(".hematite").join("imports");
967 let mut count = 0usize;
968 let mut desired_paths = HashSet::new();
969
970 if imports_dir.exists() {
971 let mut imports: Vec<(std::path::PathBuf, i64)> = walkdir::WalkDir::new(&imports_dir)
972 .max_depth(4)
973 .follow_links(false)
974 .into_iter()
975 .filter_map(|entry| entry.ok())
976 .filter(|entry| entry.file_type().is_file())
977 .filter_map(|entry| {
978 let path = entry.into_path();
979 let ext = path
980 .extension()
981 .and_then(|ext| ext.to_str())
982 .unwrap_or("")
983 .to_ascii_lowercase();
984 if !matches!(ext.as_str(), "json" | "jsonl" | "md" | "txt") {
985 return None;
986 }
987 let meta = std::fs::metadata(&path).ok()?;
988 if meta.len() > Self::IMPORT_MAX_BYTES {
989 return None;
990 }
991 let mtime = meta
992 .modified()
993 .map(|t| {
994 t.duration_since(std::time::UNIX_EPOCH)
995 .unwrap_or_default()
996 .as_secs() as i64
997 })
998 .unwrap_or(0);
999 Some((path, mtime))
1000 })
1001 .collect();
1002
1003 imports.sort_by(|(a_path, a_mtime), (b_path, b_mtime)| {
1004 b_mtime
1005 .cmp(a_mtime)
1006 .then_with(|| a_path.to_string_lossy().cmp(&b_path.to_string_lossy()))
1007 });
1008 imports.truncate(Self::IMPORT_FILE_LIMIT);
1009
1010 for (import_path, mtime) in imports {
1011 for exchange in load_imported_session_exchanges(&import_path, &imports_dir, mtime) {
1012 desired_paths.insert(exchange.path.clone());
1013 let mtype = detect_memory_type(&exchange.content);
1014 match self.index_chunks_with_room_and_type(
1015 &exchange.path,
1016 exchange.last_modified,
1017 "session",
1018 mtype,
1019 std::slice::from_ref(&exchange.content),
1020 ) {
1021 Ok(new_chunks) if !new_chunks.is_empty() => {
1022 count += 1;
1023 }
1024 Ok(_) => {}
1025 Err(_) => {}
1026 }
1027 }
1028 }
1029 }
1030
1031 self.prune_indexed_prefix("session/imports/", &desired_paths);
1032 count
1033 }
1034
1035 fn backfill_missing_embeddings(&self) {
1040 let (fts_count, vec_count) = {
1042 let db = self.db.lock().unwrap();
1043 let fts: i64 = db
1044 .query_row("SELECT COUNT(*) FROM chunks_fts", [], |r| r.get(0))
1045 .unwrap_or(0);
1046 let vec: i64 = db
1047 .query_row("SELECT COUNT(*) FROM chunks_vec", [], |r| r.get(0))
1048 .unwrap_or(0);
1049 (fts, vec)
1050 };
1051 if fts_count == 0 || fts_count == vec_count {
1052 return;
1053 }
1054
1055 let missing: Vec<(String, i64, String)> = {
1058 let db = self.db.lock().unwrap();
1059 let mut stmt = db
1060 .prepare(
1061 "SELECT f.path, (f.rowid - 1) AS chunk_idx, f.content
1062 FROM chunks_fts f
1063 LEFT JOIN chunks_vec v ON f.path = v.path AND (f.rowid - 1) = v.chunk_idx
1064 WHERE v.path IS NULL
1065 ORDER BY CASE
1066 WHEN f.path LIKE '%.rs' THEN 0
1067 WHEN f.path LIKE '%.toml' THEN 1
1068 WHEN f.path LIKE '%.json' THEN 2
1069 ELSE 3
1070 END, f.path
1071 LIMIT 20",
1072 )
1073 .unwrap();
1074 stmt.query_map([], |r| {
1075 Ok((
1076 r.get::<_, String>(0)?,
1077 r.get::<_, i64>(1)?,
1078 r.get::<_, String>(2)?,
1079 ))
1080 })
1081 .unwrap()
1082 .filter_map(|r| r.ok())
1083 .collect()
1084 };
1085
1086 for (path, idx, content) in missing {
1087 if let Some(vec) = embed_text_blocking(&content, &self.base_url) {
1088 let blob = floats_to_blob(&vec);
1089 let db = self.db.lock().unwrap();
1090 let _ = db.execute(
1091 "INSERT OR REPLACE INTO chunks_vec (path, chunk_idx, embedding) VALUES (?1, ?2, ?3)",
1092 params![path, idx, blob],
1093 );
1094 } else {
1095 break;
1097 }
1098 }
1099 }
1100
1101 pub fn file_count(&self) -> usize {
1104 let db = self.db.lock().unwrap();
1105 db.query_row(
1106 "SELECT COUNT(*) FROM chunks_meta WHERE path NOT LIKE 'session/%'",
1107 [],
1108 |r| r.get::<_, i64>(0),
1109 )
1110 .unwrap_or(0) as usize
1111 }
1112
1113 pub fn embedded_chunk_count(&self) -> usize {
1116 let db = self.db.lock().unwrap();
1117 db.query_row(
1118 "SELECT COUNT(*) FROM chunks_vec WHERE path NOT LIKE 'session/%'",
1119 [],
1120 |r| r.get::<_, i64>(0),
1121 )
1122 .unwrap_or(0) as usize
1123 }
1124
1125 pub fn has_any_embeddings(&self) -> bool {
1127 let db = self.db.lock().unwrap();
1128 db.query_row("SELECT EXISTS(SELECT 1 FROM chunks_vec LIMIT 1)", [], |r| {
1129 r.get::<_, i64>(0)
1130 })
1131 .unwrap_or(0)
1132 != 0
1133 }
1134
1135 pub fn reset(&self) {
1138 let db = self.db.lock().unwrap();
1139 let _ = db.execute_batch(
1140 "DELETE FROM chunks_fts;
1141 DELETE FROM chunks_vec;
1142 DELETE FROM chunks_meta;",
1143 );
1144 }
1145
1146 pub fn inspect_snapshot(&self, hot_limit: usize) -> VeinInspectionSnapshot {
1149 let db = self.db.lock().unwrap();
1150 let indexed_source_files = db
1151 .query_row(
1152 "SELECT COUNT(*) FROM chunks_meta
1153 WHERE path NOT LIKE 'session/%'
1154 AND path NOT LIKE '.hematite/docs/%'",
1155 [],
1156 |r| r.get::<_, i64>(0),
1157 )
1158 .unwrap_or(0) as usize;
1159 let indexed_docs = db
1160 .query_row(
1161 "SELECT COUNT(*) FROM chunks_meta WHERE path LIKE '.hematite/docs/%'",
1162 [],
1163 |r| r.get::<_, i64>(0),
1164 )
1165 .unwrap_or(0) as usize;
1166 let indexed_session_exchanges = db
1167 .query_row(
1168 "SELECT COUNT(*) FROM chunks_meta WHERE path LIKE 'session/%'",
1169 [],
1170 |r| r.get::<_, i64>(0),
1171 )
1172 .unwrap_or(0) as usize;
1173 let embedded_source_doc_chunks = db
1174 .query_row(
1175 "SELECT COUNT(*) FROM chunks_vec WHERE path NOT LIKE 'session/%'",
1176 [],
1177 |r| r.get::<_, i64>(0),
1178 )
1179 .unwrap_or(0) as usize;
1180 let has_any_embeddings = db
1181 .query_row("SELECT EXISTS(SELECT 1 FROM chunks_vec LIMIT 1)", [], |r| {
1182 r.get::<_, i64>(0)
1183 })
1184 .unwrap_or(0)
1185 != 0;
1186 drop(db);
1187
1188 let hot_files = self
1189 .hot_files(hot_limit.max(1))
1190 .into_iter()
1191 .map(|(path, heat, last_modified, room)| VeinHotFile {
1192 path,
1193 heat,
1194 last_modified,
1195 room,
1196 })
1197 .collect::<Vec<_>>();
1198
1199 VeinInspectionSnapshot {
1200 indexed_source_files,
1201 indexed_docs,
1202 indexed_session_exchanges,
1203 embedded_source_doc_chunks,
1204 has_any_embeddings,
1205 active_room: self.active_room(),
1206 l1_ready: !hot_files.is_empty(),
1207 hot_files,
1208 }
1209 }
1210
1211 pub fn bump_heat(&self, path: &str) {
1217 if path.is_empty() {
1218 return;
1219 }
1220 let now = std::time::SystemTime::now()
1221 .duration_since(std::time::UNIX_EPOCH)
1222 .unwrap_or_default()
1223 .as_secs() as i64;
1224 let db = self.db.lock().unwrap();
1225 let _ = db.execute(
1226 "INSERT INTO file_heat (path, heat, last_edit) VALUES (?1, 1, ?2)
1227 ON CONFLICT(path) DO UPDATE SET heat = heat + 1, last_edit = ?2",
1228 params![path, now],
1229 );
1230 }
1231
1232 fn hot_files(&self, n: usize) -> Vec<(String, i64, i64, String)> {
1236 let db = self.db.lock().unwrap();
1237 let mut stmt = match db.prepare(
1238 "SELECT fh.path, fh.heat, cm.last_modified, cm.room
1239 FROM file_heat fh
1240 JOIN chunks_meta cm ON cm.path = fh.path
1241 ORDER BY fh.heat DESC, cm.last_modified DESC
1242 LIMIT ?1",
1243 ) {
1244 Ok(s) => s,
1245 Err(_) => return vec![],
1246 };
1247 stmt.query_map(params![n as i64], |row| {
1248 Ok((
1249 row.get::<_, String>(0)?,
1250 row.get::<_, i64>(1)?,
1251 row.get::<_, i64>(2)?,
1252 row.get::<_, String>(3)?,
1253 ))
1254 })
1255 .map(|rows| rows.filter_map(|r| r.ok()).collect())
1256 .unwrap_or_default()
1257 }
1258
1259 pub fn hot_file_paths(&self, n: usize) -> Vec<String> {
1262 self.hot_files(n)
1263 .into_iter()
1264 .map(|(path, _, _, _)| path)
1265 .collect()
1266 }
1267
1268 pub fn hot_files_weighted(&self, n: usize) -> Vec<(String, f64)> {
1272 let files = self.hot_files(n);
1273 if files.is_empty() {
1274 return vec![];
1275 }
1276 let max_heat = files
1277 .iter()
1278 .map(|(_, h, _, _)| *h)
1279 .max()
1280 .unwrap_or(1)
1281 .max(1) as f64;
1282 files
1283 .into_iter()
1284 .map(|(path, heat, _, _)| {
1285 let weight = (heat as f64) / max_heat;
1286 (path, weight)
1287 })
1288 .collect()
1289 }
1290
1291 pub fn l1_context(&self) -> Option<String> {
1296 let files = self.hot_files(8);
1297 if files.is_empty() {
1298 return None;
1299 }
1300 let now = std::time::SystemTime::now()
1301 .duration_since(std::time::UNIX_EPOCH)
1302 .unwrap_or_default()
1303 .as_secs() as i64;
1304
1305 let mut by_room: std::collections::BTreeMap<String, Vec<(String, i64, i64)>> =
1307 std::collections::BTreeMap::new();
1308 for (path, heat, mtime, room) in &files {
1309 by_room
1310 .entry(room.clone())
1311 .or_default()
1312 .push((path.clone(), *heat, *mtime));
1313 }
1314
1315 let mut out = String::from("# Hot Files (most edited — grouped by subsystem)\n");
1316 for (room, entries) in &by_room {
1317 out.push_str(&format!("[{}]\n", room));
1318 for (path, heat, mtime) in entries {
1319 let age_secs = now - mtime;
1320 let age = if age_secs < 3600 {
1321 "just now".to_string()
1322 } else if age_secs < 86400 {
1323 format!("{}h ago", age_secs / 3600)
1324 } else {
1325 format!("{}d ago", age_secs / 86400)
1326 };
1327 out.push_str(&format!(
1328 " - {} [{} edit{}, {}]\n",
1329 path,
1330 heat,
1331 if *heat == 1 { "" } else { "s" },
1332 age
1333 ));
1334 }
1335 }
1336 Some(out)
1337 }
1338
1339 fn prune_indexed_prefix(&self, prefix: &str, desired_paths: &HashSet<String>) {
1340 let pattern = format!("{}%", prefix);
1341 let existing_paths: Vec<String> = {
1342 let db = self.db.lock().unwrap();
1343 let mut stmt = match db.prepare("SELECT path FROM chunks_meta WHERE path LIKE ?1") {
1344 Ok(stmt) => stmt,
1345 Err(_) => return,
1346 };
1347 stmt.query_map(params![pattern], |row| row.get::<_, String>(0))
1348 .map(|rows| rows.filter_map(|row| row.ok()).collect())
1349 .unwrap_or_default()
1350 };
1351
1352 if existing_paths.is_empty() {
1353 return;
1354 }
1355
1356 let db = self.db.lock().unwrap();
1357 for path in existing_paths {
1358 if desired_paths.contains(&path) {
1359 continue;
1360 }
1361 let _ = db.execute("DELETE FROM chunks_fts WHERE path = ?1", params![path]);
1362 let _ = db.execute("DELETE FROM chunks_vec WHERE path = ?1", params![path]);
1363 let _ = db.execute("DELETE FROM chunks_meta WHERE path = ?1", params![path]);
1364 }
1365 }
1366}
1367
1368impl QuerySignals {
1369 fn from_query(query: &str) -> Self {
1370 let lower = query.to_ascii_lowercase();
1371 let historical_memory_hint = [
1372 "remember",
1373 "earlier",
1374 "previous",
1375 "last time",
1376 "what did we decide",
1377 "why did we decide",
1378 "what did we say",
1379 "why did we change",
1380 ]
1381 .iter()
1382 .any(|needle| lower.contains(needle));
1383
1384 let query_memory_type = if lower.contains("decide")
1386 || lower.contains("decision")
1387 || lower.contains("we agreed")
1388 || lower.contains("we chose")
1389 {
1390 Some("decision")
1391 } else if lower.contains("bug")
1392 || lower.contains("error")
1393 || lower.contains("issue")
1394 || lower.contains("problem")
1395 || lower.contains("fix")
1396 || lower.contains("broken")
1397 {
1398 Some("problem")
1399 } else if lower.contains("shipped")
1400 || lower.contains("milestone")
1401 || lower.contains("finished")
1402 || lower.contains("working now")
1403 {
1404 Some("milestone")
1405 } else if lower.contains("prefer")
1406 || lower.contains("my preference")
1407 || lower.contains("i like")
1408 || lower.contains("i want")
1409 {
1410 Some("preference")
1411 } else {
1412 None
1413 };
1414
1415 Self {
1416 exact_phrases: extract_exact_phrases(query),
1417 standout_terms: extract_standout_terms(query),
1418 historical_memory_hint,
1419 temporal_reference: extract_temporal_reference(query),
1420 query_memory_type,
1421 }
1422 }
1423}
1424
1425fn merge_scored_result(
1426 merged_by_path: &mut HashMap<String, SearchResult>,
1427 candidate: SearchResult,
1428) {
1429 match merged_by_path.get_mut(&candidate.path) {
1430 Some(existing) if candidate.score > existing.score => *existing = candidate,
1431 Some(_) => {}
1432 None => {
1433 merged_by_path.insert(candidate.path.clone(), candidate);
1434 }
1435 }
1436}
1437
1438fn reranked_score(
1439 signals: &QuerySignals,
1440 active_room: Option<&str>,
1441 result: &SearchResult,
1442 is_semantic: bool,
1443) -> f32 {
1444 let base = if is_semantic {
1445 1.0 + result.score.clamp(0.0, 1.0)
1446 } else {
1447 (result.score / 10.0).clamp(0.0, 1.0)
1448 };
1449 base + room_bias(active_room, result)
1450 + retrieval_signal_boost(signals, result)
1451 + temporal_memory_boost(signals, result)
1452}
1453
1454fn room_bias(active_room: Option<&str>, result: &SearchResult) -> f32 {
1455 if active_room == Some(result.room.as_str()) {
1456 0.15
1457 } else {
1458 0.0
1459 }
1460}
1461
1462fn retrieval_signal_boost(signals: &QuerySignals, result: &SearchResult) -> f32 {
1463 let mut boost = 0.0f32;
1464 let haystack = format!(
1465 "{}\n{}",
1466 result.path.to_ascii_lowercase(),
1467 result.content.to_ascii_lowercase()
1468 );
1469
1470 let phrase_matches = signals
1471 .exact_phrases
1472 .iter()
1473 .filter(|phrase| haystack.contains(phrase.as_str()))
1474 .count();
1475 if phrase_matches > 0 {
1476 boost += 0.35 + ((phrase_matches.saturating_sub(1)) as f32 * 0.1);
1477 }
1478
1479 let mut standout_matches = 0;
1480 for term in &signals.standout_terms {
1481 if result.path.to_ascii_lowercase().contains(term.as_str()) {
1482 boost += 0.40;
1483 standout_matches += 1;
1484 } else if result.content.to_ascii_lowercase().contains(term.as_str()) {
1485 boost += 0.12;
1486 standout_matches += 1;
1487 }
1488 if standout_matches >= 3 {
1489 break;
1490 }
1491 }
1492
1493 if signals.historical_memory_hint && result.room == "session" {
1494 boost += 0.45;
1495 }
1496
1497 if let Some(qtype) = signals.query_memory_type {
1499 if !result.memory_type.is_empty() && result.memory_type == qtype {
1500 boost += 0.35;
1501 }
1502 }
1503
1504 boost
1505}
1506
1507fn temporal_memory_boost(signals: &QuerySignals, result: &SearchResult) -> f32 {
1508 if result.room != "session" {
1509 return 0.0;
1510 }
1511 let Some(reference) = signals.temporal_reference else {
1512 return 0.0;
1513 };
1514 let Some(memory_ts) = session_memory_timestamp(result) else {
1515 return 0.0;
1516 };
1517
1518 let span = reference.window_secs.max(86_400);
1519 let full_fade = span.saturating_mul(8);
1520 if full_fade <= 0 {
1521 return 0.0;
1522 }
1523
1524 let distance = (memory_ts - reference.target_ts).abs();
1525 let closeness = 1.0 - (distance as f32 / full_fade as f32).min(1.0);
1526 if closeness <= 0.0 {
1527 0.0
1528 } else {
1529 0.22 * closeness
1530 }
1531}
1532
/// Extracts quoted phrases from `query` for exact-match boosting.
///
/// A phrase is text enclosed in a matching pair of `"`, `'`, or backtick
/// characters. Each phrase is trimmed and ASCII-lowercased; phrases shorter
/// than three bytes and duplicates are dropped. Order of first appearance is
/// preserved.
///
/// Two cases are deliberately NOT treated as quoting, so that ordinary prose
/// does not swallow the rest of the query:
///
/// * an apostrophe sitting between two alphanumeric characters (as in
///   `don't` or `what's`) is part of a word, not a delimiter;
/// * an opening quote with no matching closing quote is skipped, and scanning
///   resumes right after it.
fn extract_exact_phrases(query: &str) -> Vec<String> {
    let chars: Vec<char> = query.chars().collect();

    // Decides whether the character at `idx` can open or close a phrase.
    let is_delimiter = |idx: usize| -> bool {
        match chars[idx] {
            '"' | '`' => true,
            '\'' => {
                let prev_in_word = idx > 0 && chars[idx - 1].is_alphanumeric();
                let next_in_word = chars
                    .get(idx + 1)
                    .is_some_and(|ch| ch.is_alphanumeric());
                !(prev_in_word && next_in_word)
            }
            _ => false,
        }
    };

    let mut phrases: Vec<String> = Vec::new();
    let mut i = 0usize;

    while i < chars.len() {
        if !is_delimiter(i) {
            i += 1;
            continue;
        }
        let quote = chars[i];
        let start = i + 1;
        let closing = (start..chars.len()).find(|&j| chars[j] == quote && is_delimiter(j));
        let Some(end) = closing else {
            // Unterminated quote: ignore the opener only, so that properly
            // quoted phrases later in the query are still discovered.
            i = start;
            continue;
        };
        if end > start {
            let phrase = chars[start..end]
                .iter()
                .collect::<String>()
                .trim()
                .to_ascii_lowercase();
            if phrase.len() >= 3 && !phrases.contains(&phrase) {
                phrases.push(phrase);
            }
        }
        i = end + 1;
    }

    phrases
}
1564
/// Picks the distinctive tokens out of `query`, lowercased and de-duplicated.
///
/// The query is split on every character that is not ASCII alphanumeric or
/// one of `_ - . / :`. A token is kept when it is at least four bytes long,
/// is not a stopword, and looks specific: it contains a digit, an uppercase
/// letter, one of the joiner characters, or is at least nine bytes long.
fn extract_standout_terms(query: &str) -> Vec<String> {
    const STOPWORDS: &[&str] = &[
        "about", "after", "before", "change", "changed", "decide", "decided", "does", "earlier",
        "flow", "from", "have", "into", "just", "last", "local", "make", "more", "remember",
        "should", "that", "their", "there", "these", "they", "this", "those", "what", "when",
        "where", "which", "why", "with", "work",
    ];

    let is_joiner = |ch: char| matches!(ch, '_' | '-' | '.' | '/' | ':');

    let candidates = query
        .split(|ch: char| !ch.is_ascii_alphanumeric() && !is_joiner(ch))
        .map(str::trim)
        .filter(|token| token.len() >= 4);

    let mut standout: Vec<String> = Vec::new();
    for token in candidates {
        let folded = token.to_ascii_lowercase();
        if STOPWORDS.contains(&folded.as_str()) {
            continue;
        }

        let looks_specific = token.len() >= 9
            || token
                .chars()
                .any(|ch| ch.is_ascii_digit() || ch.is_ascii_uppercase() || is_joiner(ch));

        if looks_specific && !standout.contains(&folded) {
            standout.push(folded);
        }
    }

    standout
}
1600
1601fn extract_temporal_reference(query: &str) -> Option<TemporalReference> {
1602 if let Some(ts) = extract_iso_date_from_query(query) {
1603 return Some(TemporalReference {
1604 target_ts: ts,
1605 window_secs: 86_400,
1606 });
1607 }
1608
1609 let now = current_unix_timestamp();
1610 let lower = query.to_ascii_lowercase();
1611 if lower.contains("yesterday") {
1612 Some(TemporalReference {
1613 target_ts: now.saturating_sub(86_400),
1614 window_secs: 86_400,
1615 })
1616 } else if lower.contains("today") || lower.contains("earlier today") {
1617 Some(TemporalReference {
1618 target_ts: now,
1619 window_secs: 86_400,
1620 })
1621 } else if lower.contains("last week") {
1622 Some(TemporalReference {
1623 target_ts: now.saturating_sub(7 * 86_400),
1624 window_secs: 7 * 86_400,
1625 })
1626 } else if lower.contains("last month") {
1627 Some(TemporalReference {
1628 target_ts: now.saturating_sub(30 * 86_400),
1629 window_secs: 30 * 86_400,
1630 })
1631 } else {
1632 None
1633 }
1634}
1635
1636fn extract_iso_date_from_query(query: &str) -> Option<i64> {
1637 query
1638 .split(|ch: char| !(ch.is_ascii_digit() || ch == '-'))
1639 .find_map(parse_iso_date_token)
1640}
1641
1642fn parse_iso_date_token(token: &str) -> Option<i64> {
1643 if token.len() != 10 {
1644 return None;
1645 }
1646 let bytes = token.as_bytes();
1647 if bytes.get(4) != Some(&b'-') || bytes.get(7) != Some(&b'-') {
1648 return None;
1649 }
1650
1651 let year = token.get(0..4)?.parse::<i32>().ok()?;
1652 let month = token.get(5..7)?.parse::<u32>().ok()?;
1653 let day = token.get(8..10)?.parse::<u32>().ok()?;
1654 if !(1..=12).contains(&month) || !(1..=31).contains(&day) {
1655 return None;
1656 }
1657
1658 Some(days_from_civil(year, month, day).saturating_mul(86_400))
1659}
1660
/// Converts a proleptic Gregorian civil date to days since 1970-01-01.
///
/// Dates before the epoch yield negative values. The computation counts
/// years from March so that the leap day is the last day of the shifted
/// year, and works in 400-year eras of 146,097 days each.
fn days_from_civil(year: i32, month: u32, day: u32) -> i64 {
    // January and February belong to the previous March-based year.
    let shifted_year = if month <= 2 { year - 1 } else { year };
    // Floor division by 400 (also correct for negative years).
    let era = match shifted_year {
        y if y >= 0 => y / 400,
        y => (y - 399) / 400,
    };
    let year_of_era = shifted_year - era * 400;
    // Month index with March = 0 ... February = 11.
    let shifted_month = if month > 2 {
        month as i32 - 3
    } else {
        month as i32 + 9
    };
    let day_of_year = (153 * shifted_month + 2) / 5 + day as i32 - 1;
    let day_of_era = year_of_era * 365 + year_of_era / 4 - year_of_era / 100 + day_of_year;
    era as i64 * 146_097 + day_of_era as i64 - 719_468
}
1670
/// Returns the current wall-clock time as whole seconds since the Unix epoch.
///
/// Yields `0` if the system clock reports a time before the epoch.
fn current_unix_timestamp() -> i64 {
    match std::time::SystemTime::now().duration_since(std::time::UNIX_EPOCH) {
        Ok(elapsed) => elapsed.as_secs() as i64,
        Err(_) => 0,
    }
}
1677
1678fn session_memory_timestamp(result: &SearchResult) -> Option<i64> {
1679 extract_session_path_timestamp(&result.path).or_else(|| {
1680 if result.last_modified > 0 {
1681 Some(result.last_modified)
1682 } else {
1683 None
1684 }
1685 })
1686}
1687
1688fn extract_session_path_timestamp(path: &str) -> Option<i64> {
1689 let normalized = path.replace('\\', "/");
1690 let mut parts = normalized.split('/');
1691 if parts.next()? != "session" {
1692 return None;
1693 }
1694 parse_iso_date_token(parts.next()?)
1695}
1696
1697fn session_speaker_kind(speaker: &str) -> SessionSpeakerKind {
1698 let normalized = speaker.trim().to_ascii_lowercase();
1699 match normalized.as_str() {
1700 "you" | "user" => SessionSpeakerKind::User,
1701 "" | "system" | "tool" => SessionSpeakerKind::Ignore,
1702 _ => SessionSpeakerKind::Assistant,
1703 }
1704}
1705
1706fn load_session_exchanges(report_path: &Path, last_modified: i64) -> Vec<SessionExchange> {
1707 let Ok(raw) = std::fs::read_to_string(report_path) else {
1708 return Vec::new();
1709 };
1710 let Ok(report) = serde_json::from_str::<SessionReport>(&raw) else {
1711 return Vec::new();
1712 };
1713
1714 let session_key = report_path
1715 .file_stem()
1716 .and_then(|stem| stem.to_str())
1717 .and_then(|stem| stem.strip_prefix("session_").or(Some(stem)))
1718 .unwrap_or("unknown-session")
1719 .to_string();
1720 let session_date = report
1721 .session_start
1722 .split('_')
1723 .next()
1724 .filter(|date| !date.is_empty())
1725 .unwrap_or_else(|| session_key.split('_').next().unwrap_or("unknown-date"))
1726 .to_string();
1727
1728 let mut exchanges = Vec::new();
1729 let mut pending_user: Option<String> = None;
1730 let mut turn_index = 0usize;
1731
1732 for entry in report.transcript {
1733 match session_speaker_kind(&entry.speaker) {
1734 SessionSpeakerKind::User => {
1735 let text = entry.text.trim();
1736 if !text.is_empty() {
1737 pending_user = Some(text.to_string());
1738 }
1739 }
1740 SessionSpeakerKind::Assistant => {
1741 let text = entry.text.trim();
1742 if text.is_empty() {
1743 continue;
1744 }
1745 let Some(user_text) = pending_user.take() else {
1746 continue;
1747 };
1748 turn_index += 1;
1749 exchanges.push(SessionExchange {
1750 path: format!(
1751 "session/{}/{}/turn-{}",
1752 session_date, session_key, turn_index
1753 ),
1754 last_modified,
1755 content: format!(
1756 "Earlier session exchange\nUser:\n{}\n\nAssistant:\n{}",
1757 user_text, text
1758 ),
1759 });
1760 }
1761 SessionSpeakerKind::Ignore => {}
1762 }
1763 }
1764
1765 if exchanges.len() > Vein::SESSION_TURN_LIMIT {
1766 let keep_from = exchanges.len() - Vein::SESSION_TURN_LIMIT;
1767 exchanges = exchanges.into_iter().skip(keep_from).collect();
1768 }
1769
1770 exchanges
1771}
1772
1773fn load_imported_session_exchanges(
1774 import_path: &Path,
1775 imports_root: &Path,
1776 last_modified: i64,
1777) -> Vec<SessionExchange> {
1778 let Ok(raw) = std::fs::read_to_string(import_path) else {
1779 return Vec::new();
1780 };
1781
1782 let messages = normalize_import_messages(&raw, import_path);
1783 if messages.is_empty() {
1784 return Vec::new();
1785 }
1786
1787 let rel = import_path
1788 .strip_prefix(imports_root)
1789 .unwrap_or(import_path);
1790 let rel_slug = slugify_import_path(rel);
1791 let mut exchanges = Vec::new();
1792 let mut pending_user: Option<String> = None;
1793 let mut turn_index = 0usize;
1794
1795 for (role, text) in messages {
1796 let cleaned = text.trim();
1797 if cleaned.is_empty() {
1798 continue;
1799 }
1800 match role.as_str() {
1801 "user" => pending_user = Some(cleaned.to_string()),
1802 "assistant" => {
1803 let Some(user_text) = pending_user.take() else {
1804 continue;
1805 };
1806 turn_index += 1;
1807 exchanges.push(SessionExchange {
1808 path: format!("session/imports/{}/turn-{}", rel_slug, turn_index),
1809 last_modified,
1810 content: format!(
1811 "Imported session exchange\nSource: .hematite/imports/{}\n\nUser:\n{}\n\nAssistant:\n{}",
1812 rel.to_string_lossy().replace('\\', "/"),
1813 user_text,
1814 cleaned
1815 ),
1816 });
1817 }
1818 _ => {}
1819 }
1820 }
1821
1822 if exchanges.len() > Vein::SESSION_TURN_LIMIT {
1823 let keep_from = exchanges.len() - Vein::SESSION_TURN_LIMIT;
1824 exchanges = exchanges.into_iter().skip(keep_from).collect();
1825 }
1826
1827 exchanges
1828}
1829
1830fn normalize_import_messages(raw: &str, import_path: &Path) -> Vec<(String, String)> {
1831 if raw.trim().is_empty() {
1832 return Vec::new();
1833 }
1834
1835 if let Some(messages) = parse_marker_transcript(raw) {
1836 return messages;
1837 }
1838
1839 let ext = import_path
1840 .extension()
1841 .and_then(|ext| ext.to_str())
1842 .unwrap_or("")
1843 .to_ascii_lowercase();
1844
1845 if matches!(ext.as_str(), "json" | "jsonl")
1846 || matches!(raw.trim().chars().next(), Some('{') | Some('['))
1847 {
1848 if let Some(messages) = parse_jsonl_messages(raw) {
1849 if !messages.is_empty() {
1850 return messages;
1851 }
1852 }
1853
1854 if let Ok(value) = serde_json::from_str::<Value>(raw) {
1855 if let Some(messages) = parse_session_report_messages(&value) {
1856 return messages;
1857 }
1858 if let Some(messages) = parse_simple_role_messages(&value) {
1859 return messages;
1860 }
1861 if let Some(messages) = parse_chatgpt_mapping_messages(&value) {
1862 return messages;
1863 }
1864 }
1865 }
1866
1867 Vec::new()
1868}
1869
/// Parses a transcript in which user prompts are lines starting with `> `.
///
/// Requires at least two marker lines, otherwise returns `None`. Each marker
/// line becomes a `user` message; the lines that follow, up to the next
/// marker, are trimmed, stripped of blank lines and `---` separators, and
/// joined with newlines into one `assistant` message (omitted when nothing
/// remains). Lines before the first marker are ignored.
fn parse_marker_transcript(raw: &str) -> Option<Vec<(String, String)>> {
    /// Returns the text after the `> ` marker, if the line carries one.
    fn prompt_of(line: &str) -> Option<&str> {
        line.trim_start().strip_prefix("> ")
    }

    let marker_count = raw.lines().filter(|line| prompt_of(line).is_some()).count();
    if marker_count < 2 {
        return None;
    }

    let mut messages: Vec<(String, String)> = Vec::new();
    let mut lines = raw.lines().peekable();

    while let Some(line) = lines.next() {
        let Some(prompt) = prompt_of(line) else {
            continue;
        };
        messages.push(("user".to_string(), prompt.trim().to_string()));

        // Consume everything up to (but not including) the next marker line.
        let mut reply_lines: Vec<&str> = Vec::new();
        while let Some(following) = lines.next_if(|candidate| prompt_of(candidate).is_none()) {
            let trimmed = following.trim();
            if !trimmed.is_empty() && trimmed != "---" {
                reply_lines.push(trimmed);
            }
        }
        if !reply_lines.is_empty() {
            messages.push(("assistant".to_string(), reply_lines.join("\n")));
        }
    }

    (!messages.is_empty()).then_some(messages)
}
1910
1911fn parse_jsonl_messages(raw: &str) -> Option<Vec<(String, String)>> {
1912 let mut messages = Vec::new();
1913 let mut has_codex_session_meta = false;
1914 let mut saw_jsonl = false;
1915
1916 for line in raw.lines() {
1917 let trimmed = line.trim();
1918 if trimmed.is_empty() {
1919 continue;
1920 }
1921 let Ok(value) = serde_json::from_str::<Value>(trimmed) else {
1922 continue;
1923 };
1924 saw_jsonl = true;
1925 let Some(object) = value.as_object() else {
1926 continue;
1927 };
1928
1929 match object.get("type").and_then(|v| v.as_str()).unwrap_or("") {
1930 "session_meta" => {
1931 has_codex_session_meta = true;
1932 }
1933 "event_msg" => {
1934 let Some(payload) = object.get("payload").and_then(|v| v.as_object()) else {
1935 continue;
1936 };
1937 let Some(text) = payload.get("message").and_then(|v| v.as_str()) else {
1938 continue;
1939 };
1940 match payload.get("type").and_then(|v| v.as_str()).unwrap_or("") {
1941 "user_message" => messages.push(("user".to_string(), text.trim().to_string())),
1942 "agent_message" => {
1943 messages.push(("assistant".to_string(), text.trim().to_string()))
1944 }
1945 _ => {}
1946 }
1947 }
1948 "human" | "user" => {
1949 if let Some(text) = extract_text_content(object.get("message").unwrap_or(&value)) {
1950 messages.push(("user".to_string(), text));
1951 }
1952 }
1953 "assistant" => {
1954 if let Some(text) = extract_text_content(object.get("message").unwrap_or(&value)) {
1955 messages.push(("assistant".to_string(), text));
1956 }
1957 }
1958 _ => {
1959 if let Some(role) = object.get("role").and_then(|v| v.as_str()) {
1960 if let Some(text) = extract_text_content(&value) {
1961 match role {
1962 "user" | "human" => messages.push(("user".to_string(), text)),
1963 "assistant" | "ai" => messages.push(("assistant".to_string(), text)),
1964 _ => {}
1965 }
1966 }
1967 }
1968 }
1969 }
1970 }
1971
1972 if !saw_jsonl {
1973 return None;
1974 }
1975
1976 if has_codex_session_meta || !messages.is_empty() {
1977 return Some(messages);
1978 }
1979
1980 None
1981}
1982
1983fn parse_session_report_messages(value: &Value) -> Option<Vec<(String, String)>> {
1984 let report = value.as_object()?;
1985 let transcript = report.get("transcript")?.as_array()?;
1986 let mut messages = Vec::new();
1987
1988 for entry in transcript {
1989 let Some(obj) = entry.as_object() else {
1990 continue;
1991 };
1992 let speaker = obj
1993 .get("speaker")
1994 .and_then(|v| v.as_str())
1995 .unwrap_or_default();
1996 let text = obj
1997 .get("text")
1998 .and_then(|v| v.as_str())
1999 .unwrap_or_default()
2000 .trim()
2001 .to_string();
2002 if text.is_empty() {
2003 continue;
2004 }
2005 match session_speaker_kind(speaker) {
2006 SessionSpeakerKind::User => messages.push(("user".to_string(), text)),
2007 SessionSpeakerKind::Assistant => messages.push(("assistant".to_string(), text)),
2008 SessionSpeakerKind::Ignore => {}
2009 }
2010 }
2011
2012 (!messages.is_empty()).then_some(messages)
2013}
2014
2015fn parse_simple_role_messages(value: &Value) -> Option<Vec<(String, String)>> {
2016 if let Some(array) = value.as_array() {
2017 let messages = collect_role_messages(array);
2018 return (!messages.is_empty()).then_some(messages);
2019 }
2020
2021 let obj = value.as_object()?;
2022 if let Some(messages_value) = obj.get("messages").or_else(|| obj.get("chat_messages")) {
2023 let array = messages_value.as_array()?;
2024 let messages = collect_role_messages(array);
2025 return (!messages.is_empty()).then_some(messages);
2026 }
2027
2028 None
2029}
2030
2031fn collect_role_messages(items: &[Value]) -> Vec<(String, String)> {
2032 let mut messages = Vec::new();
2033 for item in items {
2034 let Some(obj) = item.as_object() else {
2035 continue;
2036 };
2037 let Some(role) = obj.get("role").and_then(|v| v.as_str()) else {
2038 continue;
2039 };
2040 let Some(text) = extract_text_content(item) else {
2041 continue;
2042 };
2043 match role {
2044 "user" | "human" => messages.push(("user".to_string(), text)),
2045 "assistant" | "ai" => messages.push(("assistant".to_string(), text)),
2046 _ => {}
2047 }
2048 }
2049 messages
2050}
2051
2052fn parse_chatgpt_mapping_messages(value: &Value) -> Option<Vec<(String, String)>> {
2053 let mapping = value.get("mapping")?.as_object()?;
2054 let mut current_id = mapping.iter().find_map(|(node_id, node)| {
2055 let obj = node.as_object()?;
2056 (obj.get("parent").is_some_and(|parent| parent.is_null())).then_some(node_id.clone())
2057 })?;
2058
2059 let mut messages = Vec::new();
2060 let mut visited = std::collections::HashSet::new();
2061
2062 while visited.insert(current_id.clone()) {
2063 let Some(node) = mapping.get(¤t_id).and_then(|v| v.as_object()) else {
2064 break;
2065 };
2066
2067 if let Some(message) = node.get("message") {
2068 let role = message
2069 .get("author")
2070 .and_then(|author| author.get("role"))
2071 .and_then(|v| v.as_str())
2072 .unwrap_or("");
2073 if let Some(text) = extract_text_content(message) {
2074 match role {
2075 "user" => messages.push(("user".to_string(), text)),
2076 "assistant" => messages.push(("assistant".to_string(), text)),
2077 _ => {}
2078 }
2079 }
2080 }
2081
2082 let Some(next_id) = node
2083 .get("children")
2084 .and_then(|children| children.as_array())
2085 .and_then(|children| children.first())
2086 .and_then(|child| child.as_str())
2087 else {
2088 break;
2089 };
2090 current_id = next_id.to_string();
2091 }
2092
2093 (!messages.is_empty()).then_some(messages)
2094}
2095
2096fn extract_text_content(value: &Value) -> Option<String> {
2097 if let Some(text) = value.as_str() {
2098 let trimmed = text.trim();
2099 return (!trimmed.is_empty()).then_some(trimmed.to_string());
2100 }
2101
2102 if let Some(array) = value.as_array() {
2103 let joined = array
2104 .iter()
2105 .filter_map(extract_text_content)
2106 .filter(|part| !part.is_empty())
2107 .collect::<Vec<_>>()
2108 .join("\n");
2109 return (!joined.is_empty()).then_some(joined);
2110 }
2111
2112 let obj = value.as_object()?;
2113
2114 if let Some(content) = obj.get("content") {
2115 if let Some(text) = extract_text_content(content) {
2116 return Some(text);
2117 }
2118 }
2119
2120 if let Some(text) = obj.get("text").and_then(|v| v.as_str()) {
2121 let trimmed = text.trim();
2122 if !trimmed.is_empty() {
2123 return Some(trimmed.to_string());
2124 }
2125 }
2126
2127 if let Some(parts) = obj.get("parts").and_then(|v| v.as_array()) {
2128 let joined = parts
2129 .iter()
2130 .filter_map(|part| part.as_str().map(|s| s.trim().to_string()))
2131 .filter(|part| !part.is_empty())
2132 .collect::<Vec<_>>()
2133 .join("\n");
2134 if !joined.is_empty() {
2135 return Some(joined);
2136 }
2137 }
2138
2139 None
2140}
2141
/// Turns a relative import path into a flat, index-safe identifier.
///
/// Backslashes become `/`; every character other than ASCII alphanumerics,
/// `/`, `-`, and `_` becomes `_`; leading and trailing `/` are removed; and
/// each remaining `/` is replaced by `__`.
fn slugify_import_path(path: &Path) -> String {
    let unified = path.to_string_lossy().replace('\\', "/");
    let sanitized: String = unified
        .chars()
        .map(|ch| match ch {
            '/' | '-' | '_' => ch,
            other if other.is_ascii_alphanumeric() => other,
            _ => '_',
        })
        .collect();
    sanitized.trim_matches('/').replace('/', "__")
}
2157
2158fn embed_text_blocking(text: &str, base_url: &str) -> Option<Vec<f32>> {
2174 embed_text_with_prefix(text, "search_document", base_url)
2175}
2176
2177fn embed_query_blocking(text: &str, base_url: &str) -> Option<Vec<f32>> {
2178 embed_text_with_prefix(text, "search_query", base_url)
2179}
2180
2181fn embed_text_with_prefix(text: &str, task: &str, base_url: &str) -> Option<Vec<f32>> {
2182 let prefixed = format!("{}: {}", task, text);
2184 let input = if prefixed.len() > 8000 {
2186 &prefixed[..8000]
2187 } else {
2188 &prefixed
2189 };
2190
2191 let client = reqwest::blocking::Client::builder()
2192 .timeout(std::time::Duration::from_secs(10))
2193 .build()
2194 .ok()?;
2195
2196 let body = serde_json::json!({
2197 "model": "nomic-embed-text-v2",
2198 "input": input
2199 });
2200
2201 let url = format!("{}/v1/embeddings", base_url);
2202 let resp = client.post(&url).json(&body).send().ok()?;
2203
2204 if !resp.status().is_success() {
2205 return None;
2206 }
2207
2208 let json: serde_json::Value = resp.json().ok()?;
2209 let embedding = json["data"][0]["embedding"].as_array()?;
2210 let vec: Vec<f32> = embedding
2211 .iter()
2212 .filter_map(|v| v.as_f64().map(|f| f as f32))
2213 .collect();
2214
2215 if vec.is_empty() {
2216 None
2217 } else {
2218 Some(vec)
2219 }
2220}
2221
/// Computes the cosine similarity of two equally sized vectors.
///
/// Returns `0.0` when the slices are empty, differ in length, or either has
/// zero magnitude.
fn cosine_similarity(a: &[f32], b: &[f32]) -> f32 {
    if a.is_empty() || a.len() != b.len() {
        return 0.0;
    }

    let magnitude = |v: &[f32]| v.iter().map(|x| x * x).sum::<f32>().sqrt();
    let (norm_a, norm_b) = (magnitude(a), magnitude(b));
    if norm_a == 0.0 || norm_b == 0.0 {
        return 0.0;
    }

    let dot = a.iter().zip(b.iter()).map(|(x, y)| x * y).sum::<f32>();
    dot / (norm_a * norm_b)
}
2237
/// Serializes floats as consecutive little-endian 4-byte values.
fn floats_to_blob(floats: &[f32]) -> Vec<u8> {
    let mut blob = Vec::with_capacity(floats.len() * 4);
    for value in floats {
        blob.extend_from_slice(&value.to_le_bytes());
    }
    blob
}
2241
/// Decodes a blob of little-endian 4-byte values back into floats.
///
/// Trailing bytes that do not form a complete 4-byte group are ignored.
fn blob_to_floats(blob: &[u8]) -> Vec<f32> {
    let mut floats = Vec::with_capacity(blob.len() / 4);
    for word in blob.chunks_exact(4) {
        let mut raw = [0u8; 4];
        raw.copy_from_slice(word);
        floats.push(f32::from_le_bytes(raw));
    }
    floats
}
2247
/// Cleans up text produced by a document extractor.
///
/// Converts `\r\n` and lone `\r` to `\n`, then strips leading and trailing
/// whitespace and NUL characters. Returns `None` if nothing is left.
fn normalize_extracted_document_text(text: String) -> Option<String> {
    let unix_newlines = text.replace("\r\n", "\n").replace('\r', "\n");
    let is_padding = |c: char| c == '\0' || c.is_whitespace();
    match unix_newlines.trim_matches(is_padding) {
        "" => None,
        body => Some(body.to_string()),
    }
}
2262
2263fn extract_pdf_text_with_pdf_extract(path: &std::path::Path) -> Result<Option<String>, String> {
2264 let previous_hook = std::panic::take_hook();
2265 std::panic::set_hook(Box::new(|_| {}));
2266 let result = std::panic::catch_unwind(std::panic::AssertUnwindSafe(|| {
2267 pdf_extract::extract_text(path)
2268 }));
2269 std::panic::set_hook(previous_hook);
2270
2271 match result {
2272 Ok(Ok(text)) => Ok(normalize_extracted_document_text(text)),
2273 Ok(Err(e)) => Err(format!("pdf-extract failed: {}", e)),
2274 Err(payload) => {
2275 let panic_text = if let Some(msg) = payload.downcast_ref::<&str>() {
2276 (*msg).to_string()
2277 } else if let Some(msg) = payload.downcast_ref::<String>() {
2278 msg.clone()
2279 } else {
2280 "unknown parser panic".to_string()
2281 };
2282 Err(format!("pdf-extract panicked: {}", panic_text))
2283 }
2284 }
2285}
2286
2287fn extract_pdf_text_with_lopdf(path: &std::path::Path) -> Result<Option<String>, String> {
2288 let mut doc =
2289 lopdf::Document::load(path).map_err(|e| format!("lopdf could not open PDF: {}", e))?;
2290
2291 if doc.is_encrypted() {
2292 doc.decrypt("")
2293 .map_err(|e| format!("PDF is encrypted and could not be decrypted: {}", e))?;
2294 }
2295
2296 let page_numbers: Vec<u32> = doc.get_pages().keys().copied().collect();
2297 if page_numbers.is_empty() {
2298 return Ok(None);
2299 }
2300
2301 let mut extracted_pages = Vec::new();
2302 let mut page_errors = Vec::new();
2303
2304 for page_number in page_numbers {
2305 match doc.extract_text(&[page_number]) {
2306 Ok(text) => {
2307 if let Some(page_text) = normalize_extracted_document_text(text) {
2308 extracted_pages.push(page_text);
2309 }
2310 }
2311 Err(e) => page_errors.push(format!("page {page_number}: {e}")),
2312 }
2313 }
2314
2315 if !extracted_pages.is_empty() {
2316 return Ok(Some(extracted_pages.join("\n\n")));
2317 }
2318
2319 if !page_errors.is_empty() {
2320 let sample_errors = page_errors
2321 .into_iter()
2322 .take(3)
2323 .collect::<Vec<_>>()
2324 .join("; ");
2325 return Err(format!(
2326 "lopdf could not extract usable page text ({sample_errors})"
2327 ));
2328 }
2329
2330 Ok(None)
2331}
2332
2333fn extract_pdf_text_inside_helper(path: &std::path::Path) -> Result<Option<String>, String> {
2334 let mut failures = Vec::new();
2335
2336 match extract_pdf_text_with_pdf_extract(path) {
2337 Ok(Some(text)) => return Ok(Some(text)),
2338 Ok(None) => failures.push("pdf-extract found no usable text".to_string()),
2339 Err(e) => failures.push(e),
2340 }
2341
2342 match extract_pdf_text_with_lopdf(path) {
2343 Ok(Some(text)) => return Ok(Some(text)),
2344 Ok(None) => failures.push("lopdf found no usable text".to_string()),
2345 Err(e) => failures.push(e),
2346 }
2347
2348 let detail = failures.into_iter().take(2).collect::<Vec<_>>().join("; ");
2349 Err(format!(
2350 "Could not extract text from PDF. Hematite keeps PDF parsing best-effort so it can stay a lightweight single-binary local coding harness. The file may be scanned/image-only, encrypted, or use unsupported font encoding. Try exporting it to text/markdown or attach page images instead. Detail: {}",
2351 detail
2352 ))
2353}
2354
/// Extracts PDF text by running this executable as a helper subprocess.
///
/// The current executable is re-invoked with `--pdf-extract-helper <path>`,
/// with stdin closed and stdout/stderr captured. On a successful exit the
/// helper's stdout is the extracted text (`Ok(None)` if it is blank); on
/// failure the helper's stderr, or a generic message, becomes the error.
fn extract_pdf_text(path: &std::path::Path) -> Result<Option<String>, String> {
    let exe = std::env::current_exe()
        .map_err(|e| format!("Could not locate Hematite executable for PDF helper: {}", e))?;

    let mut helper = std::process::Command::new(exe);
    helper
        .arg("--pdf-extract-helper")
        .arg(path)
        .stdin(std::process::Stdio::null())
        .stdout(std::process::Stdio::piped())
        .stderr(std::process::Stdio::piped());
    let output = helper
        .output()
        .map_err(|e| format!("Could not launch PDF helper: {}", e))?;

    if output.status.success() {
        let text = String::from_utf8(output.stdout)
            .map_err(|e| format!("PDF helper returned non-UTF8 text: {}", e))?;
        let has_text = !text.trim().is_empty();
        return Ok(has_text.then_some(text));
    }

    let stderr = String::from_utf8_lossy(&output.stderr).trim().to_string();
    Err(if stderr.is_empty() {
        "PDF extraction failed.".to_string()
    } else {
        stderr
    })
}
2384
2385pub fn run_pdf_extract_helper(path: &std::path::Path) -> i32 {
2386 match extract_pdf_text_inside_helper(path) {
2387 Ok(Some(text)) => {
2388 use std::io::Write;
2389 let mut stdout = std::io::stdout();
2390 if stdout.write_all(text.as_bytes()).is_ok() {
2391 0
2392 } else {
2393 let _ = writeln!(
2394 std::io::stderr(),
2395 "PDF helper could not write extracted text."
2396 );
2397 1
2398 }
2399 }
2400 Ok(None) => {
2401 eprintln!(
2402 "Could not extract text from PDF. Hematite keeps PDF parsing best-effort so it can stay a lightweight single-binary local coding harness. The file appears to contain no usable embedded text. Try exporting it to text/markdown or attach page images instead."
2403 );
2404 1
2405 }
2406 Err(e) => {
2407 eprintln!("{}", e);
2408 1
2409 }
2410 }
2411}
2412
2413pub fn extract_document_text(path: &std::path::Path) -> Result<String, String> {
2416 let ext = path
2417 .extension()
2418 .and_then(|e| e.to_str())
2419 .unwrap_or("")
2420 .to_lowercase();
2421 match ext.as_str() {
2422 "pdf" => {
2423 let text = extract_pdf_text(path)?.ok_or_else(|| {
2424 "PDF contains no extractable text — it may be scanned/image-only. \
2425 Try attaching page screenshots with /image instead."
2426 .to_string()
2427 })?;
2428 pdf_quality_check(text)
2429 }
2430 _ => std::fs::read_to_string(path).map_err(|e| format!("Could not read file: {e}")),
2431 }
2432}
2433
/// Rejects PDF text that is too short or looks garbled.
///
/// The trimmed text must be at least 150 bytes long, and spaces must make up
/// at least 4% of its characters (line breaks excluded). On success the
/// original, untrimmed text is returned unchanged.
///
/// # Errors
///
/// Returns a user-facing explanation for whichever check failed.
fn pdf_quality_check(text: String) -> Result<String, String> {
    let body = text.trim();

    if body.len() < 150 {
        return Err(format!(
            "PDF extracted only {} characters — likely a scanned or image-only PDF, \
             or uses unsupported custom fonts. Try attaching page screenshots with /image instead.",
            body.len()
        ));
    }

    // One pass: count spaces and all characters other than line breaks.
    let (mut spaces, mut visible) = (0usize, 0usize);
    for ch in body.chars() {
        match ch {
            '\n' | '\r' => {}
            ' ' => {
                spaces += 1;
                visible += 1;
            }
            _ => visible += 1,
        }
    }
    let space_ratio = if visible == 0 {
        0.0
    } else {
        spaces as f32 / visible as f32
    };

    if space_ratio < 0.04 {
        return Err(
            "PDF text extraction produced garbled output — words are merged with no spaces. \
             This usually means the PDF uses custom embedded fonts (common with academic publishers \
             like EBSCO, Elsevier, Springer). \
             Try a PDF exported from Word, Google Docs, or LaTeX, \
             or attach page screenshots with /image instead."
                .to_string(),
        );
    }

    Ok(text)
}
2472
2473fn chunk_by_symbols(ext: &str, text: &str) -> Vec<String> {
2477 if ext == "rs" {
2478 chunk_rust_symbols(text)
2479 } else {
2480 chunk_paragraphs(text)
2481 }
2482}
2483
2484fn chunk_rust_symbols(text: &str) -> Vec<String> {
2493 const ITEM_STARTS: &[&str] = &[
2494 "pub fn ",
2495 "pub async fn ",
2496 "pub unsafe fn ",
2497 "async fn ",
2498 "unsafe fn ",
2499 "fn ",
2500 "pub impl",
2501 "impl ",
2502 "pub struct ",
2503 "struct ",
2504 "pub enum ",
2505 "enum ",
2506 "pub trait ",
2507 "trait ",
2508 "pub mod ",
2509 "mod ",
2510 "pub type ",
2511 "type ",
2512 "pub const ",
2513 "const ",
2514 "pub static ",
2515 "static ",
2516 ];
2517
2518 let lines: Vec<&str> = text.lines().collect();
2519 let mut chunks: Vec<String> = Vec::new();
2520 let mut current: Vec<&str> = Vec::new();
2521
2522 for &line in &lines {
2523 let top_level = !line.starts_with(' ') && !line.starts_with('\t');
2524 let is_item = top_level && ITEM_STARTS.iter().any(|s| line.starts_with(s));
2525
2526 if is_item && !current.is_empty() {
2527 let mut split = current.len();
2530 while split > 0 {
2531 let prev = current[split - 1].trim();
2532 if prev.starts_with("///")
2533 || prev.starts_with("//!")
2534 || prev.starts_with("#[")
2535 || prev.is_empty()
2536 {
2537 split -= 1;
2538 } else {
2539 break;
2540 }
2541 }
2542 let body = current[..split].join("\n");
2543 if !body.trim().is_empty() {
2544 chunks.push(body);
2545 }
2546 current = current[split..].to_vec();
2547 }
2548 current.push(line);
2549 }
2550 if !current.is_empty() {
2551 let body = current.join("\n");
2552 if !body.trim().is_empty() {
2553 chunks.push(body);
2554 }
2555 }
2556
2557 let mut result = Vec::new();
2559 for chunk in chunks {
2560 if chunk.len() > 3000 {
2561 result.extend(sliding_window_chunks(&chunk, 2000, 200));
2562 } else {
2563 result.push(chunk);
2564 }
2565 }
2566 result
2567}
2568
2569fn chunk_paragraphs(text: &str) -> Vec<String> {
2571 let mut result: Vec<String> = Vec::new();
2572 let mut current = String::new();
2573
2574 for para in text.split("\n\n") {
2575 if current.len() + para.len() + 2 > 2000 {
2576 if !current.trim().is_empty() {
2577 result.push(current.clone());
2578 }
2579 current = para.to_string();
2580 } else {
2581 if !current.is_empty() {
2582 current.push_str("\n\n");
2583 }
2584 current.push_str(para);
2585 }
2586 }
2587 if !current.trim().is_empty() {
2588 result.push(current);
2589 }
2590
2591 let mut final_result = Vec::new();
2592 for chunk in result {
2593 if chunk.len() > 2000 {
2594 final_result.extend(sliding_window_chunks(&chunk, 2000, 200));
2595 } else {
2596 final_result.push(chunk);
2597 }
2598 }
2599 final_result
2600}
2601
2602fn sliding_window_chunks(text: &str, chunk_size: usize, overlap: usize) -> Vec<String> {
2604 let chars: Vec<char> = text.chars().collect();
2605 let mut result = Vec::new();
2606 let mut i = 0;
2607 while i < chars.len() {
2608 let end = (i + chunk_size).min(chars.len());
2609 result.push(chars[i..end].iter().collect());
2610 if end == chars.len() {
2611 break;
2612 }
2613 i += chunk_size - overlap;
2614 }
2615 result
2616}