1use rusqlite::{params, Connection};
2use serde::Deserialize;
3use serde_json::Value;
4use std::collections::{HashMap, HashSet};
5use std::path::Path;
6
/// Handle to the SQLite-backed retrieval index (FTS text chunks, embeddings, file heat).
pub struct Vein {
    /// Shared SQLite connection; every method locks the mutex before touching the database.
    db: std::sync::Arc<std::sync::Mutex<Connection>>,
    /// Base URL handed to the embedding helpers whenever a chunk or a query is embedded.
    base_url: String,
}
32
// SAFETY / NOTE(review): the only fields are `Arc<Mutex<Connection>>` and `String`.
// Presumably `Connection` is `Send`, in which case `Vein` is already `Send + Sync`
// automatically and these manual impls are redundant — confirm against rusqlite and
// consider deleting them so the compiler keeps checking future fields.
unsafe impl Send for Vein {}
unsafe impl Sync for Vein {}
37
/// One retrieved chunk together with the metadata of the document it belongs to.
#[derive(Debug, Clone)]
pub struct SearchResult {
    /// Indexed path of the document the chunk came from.
    pub path: String,
    /// Text of the matching chunk.
    pub content: String,
    /// Relevance, higher is better: negated FTS rank from `search_bm25`, cosine similarity
    /// from `search_semantic`, and the reranked value from `search_context`.
    pub score: f32,
    /// Room label stored for the document in `chunks_meta`.
    pub room: String,
    /// `last_modified` value stored for the document in `chunks_meta`.
    pub last_modified: i64,
}
49
/// A frequently edited file as reported by `Vein::inspect_snapshot`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct VeinHotFile {
    /// Indexed path of the file.
    pub path: String,
    /// Edit counter read from the `file_heat` table.
    pub heat: i64,
    /// `last_modified` value stored for the file in `chunks_meta`.
    pub last_modified: i64,
    /// Room label stored for the file in `chunks_meta`.
    pub room: String,
}
57
/// Point-in-time summary of the index produced by `Vein::inspect_snapshot`.
#[derive(Debug, Clone)]
pub struct VeinInspectionSnapshot {
    /// Indexed documents whose path is under neither `session/` nor `.hematite/docs/`.
    pub indexed_source_files: usize,
    /// Indexed documents whose path is under `.hematite/docs/`.
    pub indexed_docs: usize,
    /// Indexed documents whose path is under `session/`.
    pub indexed_session_exchanges: usize,
    /// Stored embeddings whose path is not under `session/`.
    pub embedded_source_doc_chunks: usize,
    /// Whether at least one embedding row exists at all.
    pub has_any_embeddings: bool,
    /// Room with the highest summed heat, if any file heat has been recorded.
    pub active_room: Option<String>,
    /// Hottest files, ordered by heat and then by modification time (both descending).
    pub hot_files: Vec<VeinHotFile>,
    /// `true` when `hot_files` is non-empty.
    pub l1_ready: bool,
}
69
/// Hints extracted from the raw query text and consumed by the reranking helpers.
#[derive(Debug, Default)]
struct QuerySignals {
    /// Lowercased phrases the user wrapped in quotes or backticks.
    exact_phrases: Vec<String>,
    /// Terms produced by `extract_standout_terms`; matched against result paths and content.
    standout_terms: Vec<String>,
    /// Set when the query contains wording such as "remember", "earlier" or "last time".
    historical_memory_hint: bool,
    /// Point in time the query refers to, if `extract_temporal_reference` found one.
    temporal_reference: Option<TemporalReference>,
}
77
/// A time the query points at, used to favour session memories recorded near it.
#[derive(Debug, Clone, Copy)]
struct TemporalReference {
    /// Target timestamp; compared against the session memory timestamp
    /// (presumably Unix seconds — TODO confirm against `extract_temporal_reference`).
    target_ts: i64,
    /// Width of the reference window in seconds; `temporal_memory_boost` raises it to at
    /// least 86_400 and fades the boost out over eight times that span.
    window_secs: i64,
}
83
/// Deserialized session report; every field falls back to its default when missing.
#[derive(Debug, Deserialize)]
struct SessionReport {
    /// Session start marker as written in the report (format not visible here).
    #[serde(default)]
    session_start: String,
    /// Transcript entries in the order they appear in the report.
    #[serde(default)]
    transcript: Vec<SessionTranscriptEntry>,
}
91
/// One transcript line of a session report; missing fields deserialize to empty strings.
#[derive(Debug, Deserialize)]
struct SessionTranscriptEntry {
    /// Label of whoever produced the text.
    #[serde(default)]
    speaker: String,
    /// Text of the entry.
    #[serde(default)]
    text: String,
}
99
/// A session exchange ready to be indexed as a single chunk in the `session` room.
#[derive(Debug)]
struct SessionExchange {
    /// Index path under which the exchange is stored and later pruned.
    path: String,
    /// Timestamp passed to the indexer for freshness checks.
    last_modified: i64,
    /// Text indexed for the exchange.
    content: String,
}
106
/// Classification of a transcript speaker.
/// NOTE(review): the code that assigns these variants is outside the visible part of the
/// file; presumably `Ignore` marks entries that are skipped when building exchanges — confirm.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum SessionSpeakerKind {
    User,
    Assistant,
    Ignore,
}
113
/// Classifies a path into a coarse "room" label used to group indexed files.
///
/// The path is lowercased and backslashes are normalised to `/` before matching. Rules are
/// checked from highest to lowest priority and the first match wins:
/// `session` > `tests` > `automation` > `release` > `docs` > `config` > `memory` > `ui` >
/// `tools` > `integration` > `runtime` > `agent` > `libs` > `scripts`.
///
/// When no rule matches, the first path segment is returned if it is non-empty and contains
/// no dot (i.e. looks like a directory); otherwise the room is `"root"`.
pub fn detect_room(path: &str) -> String {
    let lower = path.to_lowercase().replace('\\', "/");
    let filename = lower.rsplit('/').next().unwrap_or(&lower);
    // Only the text after the last dot is an extension. A dot-less name such as `md` has
    // none, so it must not be mistaken for a Markdown document.
    let ext = filename.rsplit_once('.').map_or("", |(_, suffix)| suffix);

    // True when `segment` is a directory component of the path (or the whole path).
    let has_dir = |segment: &str| {
        lower == segment
            || lower.starts_with(&format!("{segment}/"))
            || lower.contains(&format!("/{segment}/"))
    };

    let room: &str = if lower.starts_with("session/")
        || lower.starts_with(".hematite/reports/")
        || lower.starts_with(".hematite/imports/")
        || has_dir("reports")
        || has_dir("imports")
    {
        "session"
    } else if has_dir("tests")
        || filename.contains("diagnostic")
        || filename.ends_with("_test.rs")
        || filename.ends_with(".test.ts")
    {
        "tests"
    } else if lower.starts_with(".github/workflows/")
        || has_dir("workflows")
        || filename == ".pre-commit-config.yaml"
        || filename == ".pre-commit-config.yml"
        || filename.contains("hook")
    {
        "automation"
    } else if lower.starts_with("installer/")
        || lower.starts_with("dist/")
        || lower.starts_with("scripts/package-")
        || filename.contains("release")
        || filename.contains("bump-version")
        || ext == "iss"
    {
        "release"
    } else if lower.starts_with(".hematite/docs/")
        || has_dir("docs")
        || matches!(filename, "readme.md" | "claude.md" | ".hematite.md")
        || matches!(ext, "md" | "markdown" | "pdf" | "rst")
    {
        "docs"
    } else if matches!(
        filename,
        "cargo.toml"
            | "cargo.lock"
            | "package.json"
            | "pnpm-lock.yaml"
            | "yarn.lock"
            | "bun.lock"
            | "bun.lockb"
            | "pyproject.toml"
            | "setup.py"
            | "go.mod"
            | "pom.xml"
            | "build.gradle"
            | "build.gradle.kts"
            | "cmakelists.txt"
            | ".gitignore"
            | "settings.json"
            | "mcp_servers.json"
    ) || filename.ends_with(".sln")
        || filename.ends_with(".csproj")
        || filename.contains("config")
    {
        "config"
    } else if has_dir("memory") || matches!(filename, "vein.rs" | "deep_reflect.rs") {
        "memory"
    } else if has_dir("ui")
        || matches!(
            filename,
            "tui.rs" | "voice.rs" | "hatch.rs" | "gpu_monitor.rs"
        )
    {
        "ui"
    } else if has_dir("tools")
        || matches!(
            filename,
            "verify_build.rs"
                | "host_inspect.rs"
                | "shell.rs"
                | "code_sandbox.rs"
                | "project_map.rs"
                | "runtime_trace.rs"
        )
    {
        "tools"
    } else if filename.contains("mcp")
        || filename.contains("lsp")
        || lower.contains("/mcp/")
        || lower.contains("/lsp/")
    {
        "integration"
    } else if matches!(filename, "main.rs" | "runtime.rs" | "inference.rs")
        || filename.contains("startup")
        || filename.contains("runtime")
    {
        "runtime"
    } else if has_dir("agent") {
        "agent"
    } else if lower.starts_with("libs/") || has_dir("libs") {
        "libs"
    } else if lower.starts_with("scripts/") || has_dir("scripts") {
        "scripts"
    } else {
        // No rule matched: fall back to the top-level directory name, if there is one.
        lower
            .split('/')
            .next()
            .filter(|first| !first.is_empty() && !first.contains('.'))
            .unwrap_or("root")
    };

    room.to_string()
}
274
275impl Vein {
    /// How many of the newest session reports `index_recent_session_reports` keeps.
    const SESSION_REPORT_LIMIT: usize = 5;
    /// NOTE(review): not referenced in the visible part of the file; presumably caps the
    /// number of turns taken from one session — confirm at the use site.
    const SESSION_TURN_LIMIT: usize = 50;
    /// How many of the newest import files `index_imported_session_exports` keeps.
    const IMPORT_FILE_LIMIT: usize = 12;
    /// Import files larger than this many bytes (10 MiB) are skipped.
    const IMPORT_MAX_BYTES: u64 = 10 * 1024 * 1024;
280
281 pub fn new<P: AsRef<Path>>(
282 db_path: P,
283 base_url: String,
284 ) -> Result<Self, Box<dyn std::error::Error>> {
285 let db = Connection::open(db_path)?;
286
287 db.execute_batch("PRAGMA journal_mode=WAL; PRAGMA synchronous=NORMAL;")?;
289
290 db.execute_batch(
294 "CREATE TABLE IF NOT EXISTS chunks_meta (
295 path TEXT PRIMARY KEY,
296 last_modified INTEGER NOT NULL,
297 room TEXT NOT NULL DEFAULT 'root'
298 );
299 CREATE VIRTUAL TABLE IF NOT EXISTS chunks_fts USING fts5(
300 path UNINDEXED,
301 content,
302 tokenize='porter ascii'
303 );
304 CREATE TABLE IF NOT EXISTS chunks_vec (
305 path TEXT NOT NULL,
306 chunk_idx INTEGER NOT NULL,
307 embedding BLOB NOT NULL,
308 PRIMARY KEY (path, chunk_idx)
309 );
310 CREATE TABLE IF NOT EXISTS file_heat (
311 path TEXT PRIMARY KEY,
312 heat INTEGER NOT NULL DEFAULT 0,
313 last_edit INTEGER NOT NULL DEFAULT 0
314 );",
315 )?;
316
317 let _ = db
319 .execute_batch("ALTER TABLE chunks_meta ADD COLUMN room TEXT NOT NULL DEFAULT 'root';");
320 let _ = db.execute_batch(
321 "ALTER TABLE file_heat ADD COLUMN last_edit INTEGER NOT NULL DEFAULT 0;",
322 );
323
324 Ok(Self {
325 db: std::sync::Arc::new(std::sync::Mutex::new(db)),
326 base_url,
327 })
328 }
329
330 pub fn index_document(
335 &mut self,
336 path: &str,
337 last_modified: i64,
338 full_text: &str,
339 ) -> Result<Vec<String>, Box<dyn std::error::Error>> {
340 let room = detect_room(path);
341 let ext = std::path::Path::new(path)
342 .extension()
343 .and_then(|e| e.to_str())
344 .unwrap_or("");
345 let chunks = chunk_by_symbols(ext, full_text);
346 self.index_chunks_with_room(path, last_modified, &room, &chunks)
347 }
348
349 fn index_chunks_with_room(
350 &mut self,
351 path: &str,
352 last_modified: i64,
353 room: &str,
354 chunks: &[String],
355 ) -> Result<Vec<String>, Box<dyn std::error::Error>> {
356 let db = self.db.lock().unwrap();
357 let existing: Option<i64> = db
358 .query_row(
359 "SELECT last_modified FROM chunks_meta WHERE path = ?1",
360 params![path],
361 |r| r.get(0),
362 )
363 .ok();
364
365 if let Some(ts) = existing {
366 if ts >= last_modified {
367 return Ok(Vec::new()); }
369 }
370
371 db.execute("DELETE FROM chunks_fts WHERE path = ?1", params![path])?;
373 db.execute("DELETE FROM chunks_vec WHERE path = ?1", params![path])?;
374 db.execute(
375 "INSERT OR REPLACE INTO chunks_meta (path, last_modified, room) VALUES (?1, ?2, ?3)",
376 params![path, last_modified, room],
377 )?;
378
379 drop(db);
380
381 let mut db = self.db.lock().unwrap();
382 let tx = db.transaction()?;
383 {
384 let mut stmt = tx.prepare("INSERT INTO chunks_fts (path, content) VALUES (?1, ?2)")?;
385 for chunk in chunks {
386 stmt.execute(params![path, chunk.as_str()])?;
387 }
388 }
389 tx.commit()?;
390
391 Ok(chunks.to_vec())
392 }
393
394 pub fn embed_and_store_chunks(&self, path: &str, chunks: &[String]) {
398 for (idx, chunk) in chunks.iter().enumerate() {
399 if let Some(vec) = embed_text_blocking(chunk, &self.base_url) {
400 let blob = floats_to_blob(&vec);
401 let db = self.db.lock().unwrap();
402 let _ = db.execute(
403 "INSERT OR REPLACE INTO chunks_vec (path, chunk_idx, embedding) VALUES (?1, ?2, ?3)",
404 params![path, idx as i64, blob],
405 );
406 }
407 }
408 }
409
410 pub fn search_bm25(
414 &self,
415 query: &str,
416 limit: usize,
417 ) -> Result<Vec<SearchResult>, Box<dyn std::error::Error>> {
418 const STOPWORDS: &[&str] = &[
422 "how", "does", "do", "did", "what", "where", "when", "why", "which", "who", "is",
423 "are", "was", "were", "be", "been", "being", "have", "has", "had", "a", "an", "the",
424 "and", "or", "but", "in", "on", "at", "to", "for", "of", "with", "by", "from", "get",
425 "gets", "got", "work", "works", "make", "makes", "use", "uses", "into", "that", "this",
426 "it", "its",
427 ];
428
429 let safe_query: String = query
430 .chars()
431 .map(|c| {
432 if c.is_alphanumeric() || c == ' ' || c == '_' {
433 c
434 } else {
435 ' '
436 }
437 })
438 .collect();
439
440 let fts_query = safe_query
442 .split_whitespace()
443 .filter(|w| w.len() >= 3 && !STOPWORDS.contains(&w.to_lowercase().as_str()))
444 .collect::<Vec<_>>()
445 .join(" OR ");
446
447 if fts_query.is_empty() {
448 return Ok(Vec::new());
449 }
450
451 let db = self.db.lock().unwrap();
452 let mut stmt = db.prepare(
453 "SELECT chunks_fts.path, chunks_fts.content, rank, cm.last_modified, cm.room
454 FROM chunks_fts
455 JOIN chunks_meta cm ON cm.path = chunks_fts.path
456 WHERE chunks_fts MATCH ?1
457 ORDER BY rank
458 LIMIT ?2",
459 )?;
460
461 let results: Vec<SearchResult> = stmt
462 .query_map(params![fts_query, limit as i64], |row| {
463 Ok(SearchResult {
464 path: row.get(0)?,
465 content: row.get(1)?,
466 score: -(row.get::<_, f64>(2).unwrap_or(0.0) as f32),
467 last_modified: row.get(3)?,
468 room: row.get(4)?,
469 })
470 })?
471 .filter_map(|r| r.ok())
472 .collect();
473
474 Ok(results)
475 }
476
477 pub fn search_semantic(&self, query: &str, limit: usize) -> Vec<SearchResult> {
480 let query_vec = match embed_query_blocking(query, &self.base_url) {
481 Some(v) => v,
482 None => return Vec::new(),
483 };
484
485 let rows: Vec<(String, i64, Vec<u8>, i64, String)> = {
487 let db = self.db.lock().unwrap();
488 let mut stmt = match db.prepare(
489 "SELECT cv.path, cv.chunk_idx, cv.embedding, cm.last_modified, cm.room
490 FROM chunks_vec cv
491 JOIN chunks_meta cm ON cm.path = cv.path",
492 ) {
493 Ok(s) => s,
494 Err(_) => return Vec::new(),
495 };
496 stmt.query_map([], |row| {
497 Ok((
498 row.get::<_, String>(0)?,
499 row.get::<_, i64>(1)?,
500 row.get::<_, Vec<u8>>(2)?,
501 row.get::<_, i64>(3)?,
502 row.get::<_, String>(4)?,
503 ))
504 })
505 .ok()
506 .map(|rows| rows.filter_map(|r| r.ok()).collect())
507 .unwrap_or_default()
508 };
509
510 if rows.is_empty() {
511 return Vec::new();
512 }
513
514 let mut scored: Vec<(f32, String, i64, i64, String)> = rows
516 .into_iter()
517 .filter_map(|(path, idx, blob, last_modified, room)| {
518 let vec = blob_to_floats(&blob);
519 let sim = cosine_similarity(&query_vec, &vec);
520 Some((sim, path, idx, last_modified, room))
521 })
522 .collect();
523
524 scored.sort_by(|a, b| b.0.partial_cmp(&a.0).unwrap_or(std::cmp::Ordering::Equal));
525 scored.truncate(limit);
526
527 let db = self.db.lock().unwrap();
529 scored
530 .into_iter()
531 .filter_map(|(score, path, idx, last_modified, room)| {
532 let content: Option<String> = db
535 .query_row(
536 "SELECT content FROM chunks_fts WHERE path = ?1 LIMIT 1 OFFSET ?2",
537 params![path, idx],
538 |r| r.get(0),
539 )
540 .ok();
541 content.map(|c| SearchResult {
542 path,
543 content: c,
544 score,
545 room,
546 last_modified,
547 })
548 })
549 .collect()
550 }
551
552 pub fn search_context(
559 &self,
560 query: &str,
561 limit: usize,
562 ) -> Result<Vec<SearchResult>, Box<dyn std::error::Error>> {
563 let candidate_limit = (limit.max(1) * 4).max(12);
564 let bm25 = self.search_bm25(query, candidate_limit).unwrap_or_default();
565 let semantic = self.search_semantic(query, candidate_limit);
566 let signals = QuerySignals::from_query(query);
567
568 let active_room = self.active_room();
570
571 let mut merged_by_path: HashMap<String, SearchResult> = HashMap::new();
574
575 for r in semantic {
576 let score = reranked_score(&signals, active_room.as_deref(), &r, true);
577 merge_scored_result(&mut merged_by_path, SearchResult { score, ..r });
578 }
579
580 for r in bm25 {
581 let score = reranked_score(&signals, active_room.as_deref(), &r, false);
582 merge_scored_result(&mut merged_by_path, SearchResult { score, ..r });
583 }
584
585 let mut merged: Vec<SearchResult> = merged_by_path.into_values().collect();
586 merged.sort_by(|a, b| {
587 b.score
588 .partial_cmp(&a.score)
589 .unwrap_or(std::cmp::Ordering::Equal)
590 });
591 merged.truncate(limit);
592 Ok(merged)
593 }
594
595 fn active_room(&self) -> Option<String> {
598 let db = self.db.lock().unwrap();
599 db.query_row(
600 "SELECT cm.room, SUM(fh.heat) as total
601 FROM file_heat fh
602 JOIN chunks_meta cm ON cm.path = fh.path
603 GROUP BY cm.room
604 ORDER BY total DESC
605 LIMIT 1",
606 [],
607 |row| row.get::<_, String>(0),
608 )
609 .ok()
610 }
611
612 pub fn index_project(&mut self) -> usize {
620 let root = crate::tools::file_ops::workspace_root();
621 let mut count = 0usize;
622
623 const INDEXABLE: &[&str] = &[
624 "rs", "toml", "md", "json", "ts", "tsx", "js", "py", "go", "c", "cpp", "h", "yaml",
625 "yml", "txt",
626 ];
627 const SKIP_DIRS: &[&str] = &["target", ".git", "node_modules", ".hematite"];
628
629 for entry in walkdir::WalkDir::new(&root)
630 .follow_links(false)
631 .into_iter()
632 .filter_entry(|e| {
633 if e.file_type().is_dir() {
634 let name = e.file_name().to_string_lossy();
635 return !SKIP_DIRS.contains(&name.as_ref());
636 }
637 true
638 })
639 .filter_map(|e| e.ok())
640 .filter(|e| e.file_type().is_file())
641 {
642 let path = entry.path();
643 let ext = path.extension().and_then(|e| e.to_str()).unwrap_or("");
644 if !INDEXABLE.contains(&ext) {
645 continue;
646 }
647
648 let Ok(meta) = std::fs::metadata(path) else {
649 continue;
650 };
651 if meta.len() > 512_000 {
652 continue;
653 }
654
655 let mtime = meta
656 .modified()
657 .map(|t| {
658 t.duration_since(std::time::UNIX_EPOCH)
659 .unwrap_or_default()
660 .as_secs() as i64
661 })
662 .unwrap_or(0);
663
664 let rel = path.strip_prefix(&root).unwrap_or(path);
665 let rel_str = rel.to_string_lossy().replace('\\', "/");
666
667 if let Ok(content) = std::fs::read_to_string(path) {
668 match self.index_document(&rel_str, mtime, &content) {
669 Ok(new_chunks) if !new_chunks.is_empty() => {
670 count += 1;
671 }
672 Ok(_) => {}
673 Err(_) => {}
674 }
675 }
676 }
677
678 count += self.index_workspace_artifacts(&root);
679
680 count
681 }
682
683 pub fn index_workspace_artifacts(&mut self, workspace_root: &std::path::Path) -> usize {
688 let mut count = self.index_docs_folder(workspace_root);
689 count += self.index_recent_session_reports(workspace_root);
690 count += self.index_imported_session_exports(workspace_root);
691 self.backfill_missing_embeddings();
692 count
693 }
694
695 fn index_docs_folder(&mut self, workspace_root: &std::path::Path) -> usize {
700 let docs_dir = workspace_root.join(".hematite").join("docs");
701 const DOCS_INDEXABLE: &[&str] = &["pdf", "md", "txt", "markdown"];
702 let mut count = 0usize;
703 let mut desired_paths = HashSet::new();
704
705 if docs_dir.exists() {
706 for entry in walkdir::WalkDir::new(&docs_dir)
707 .max_depth(3)
708 .follow_links(false)
709 .into_iter()
710 .filter_map(|e| e.ok())
711 .filter(|e| e.file_type().is_file())
712 {
713 let path = entry.path();
714 let ext = path
715 .extension()
716 .and_then(|e| e.to_str())
717 .unwrap_or("")
718 .to_lowercase();
719 if !DOCS_INDEXABLE.contains(&ext.as_str()) {
720 continue;
721 }
722
723 let Ok(meta) = std::fs::metadata(path) else {
724 continue;
725 };
726 if meta.len() > 50_000_000 {
727 continue;
728 }
729
730 let mtime = meta
731 .modified()
732 .map(|t| {
733 t.duration_since(std::time::UNIX_EPOCH)
734 .unwrap_or_default()
735 .as_secs() as i64
736 })
737 .unwrap_or(0);
738
739 let rel = path.strip_prefix(workspace_root).unwrap_or(path);
740 let rel_str = rel.to_string_lossy().replace('\\', "/");
741 desired_paths.insert(rel_str.clone());
742
743 let content = if ext == "pdf" {
744 extract_pdf_text(path).ok().flatten()
745 } else {
746 std::fs::read_to_string(path).ok()
747 };
748
749 if let Some(text) = content {
750 if text.trim().is_empty() {
751 continue;
752 }
753 match self.index_document(&rel_str, mtime, &text) {
754 Ok(new_chunks) if !new_chunks.is_empty() => {
755 count += 1;
756 }
757 Ok(_) => {}
758 Err(_) => {}
759 }
760 }
761 }
762 }
763
764 self.prune_indexed_prefix(".hematite/docs/", &desired_paths);
765 count
766 }
767
768 pub fn index_recent_session_reports(&mut self, workspace_root: &std::path::Path) -> usize {
771 let reports_dir = workspace_root.join(".hematite").join("reports");
772 let mut count = 0usize;
773 let mut desired_paths = HashSet::new();
774
775 if reports_dir.exists() {
776 let mut reports: Vec<std::path::PathBuf> = std::fs::read_dir(&reports_dir)
777 .ok()
778 .into_iter()
779 .flat_map(|entries| entries.filter_map(|entry| entry.ok()))
780 .map(|entry| entry.path())
781 .filter(|path| {
782 path.is_file()
783 && path.extension().and_then(|ext| ext.to_str()) == Some("json")
784 && path
785 .file_stem()
786 .and_then(|stem| stem.to_str())
787 .map(|stem| stem.starts_with("session_"))
788 .unwrap_or(false)
789 })
790 .collect();
791
792 reports.sort_by(|a, b| {
793 let a_name = a
794 .file_name()
795 .and_then(|name| name.to_str())
796 .unwrap_or_default();
797 let b_name = b
798 .file_name()
799 .and_then(|name| name.to_str())
800 .unwrap_or_default();
801 b_name.cmp(a_name)
802 });
803 reports.truncate(Self::SESSION_REPORT_LIMIT);
804
805 for report_path in reports {
806 let Ok(meta) = std::fs::metadata(&report_path) else {
807 continue;
808 };
809 let mtime = meta
810 .modified()
811 .map(|t| {
812 t.duration_since(std::time::UNIX_EPOCH)
813 .unwrap_or_default()
814 .as_secs() as i64
815 })
816 .unwrap_or(0);
817
818 for exchange in load_session_exchanges(&report_path, mtime) {
819 desired_paths.insert(exchange.path.clone());
820 match self.index_chunks_with_room(
821 &exchange.path,
822 exchange.last_modified,
823 "session",
824 std::slice::from_ref(&exchange.content),
825 ) {
826 Ok(new_chunks) if !new_chunks.is_empty() => {
827 count += 1;
828 }
829 Ok(_) => {}
830 Err(_) => {}
831 }
832 }
833 }
834 }
835
836 self.prune_indexed_prefix("session/", &desired_paths);
837 count
838 }
839
840 pub fn index_imported_session_exports(&mut self, workspace_root: &std::path::Path) -> usize {
845 let imports_dir = workspace_root.join(".hematite").join("imports");
846 let mut count = 0usize;
847 let mut desired_paths = HashSet::new();
848
849 if imports_dir.exists() {
850 let mut imports: Vec<(std::path::PathBuf, i64)> = walkdir::WalkDir::new(&imports_dir)
851 .max_depth(4)
852 .follow_links(false)
853 .into_iter()
854 .filter_map(|entry| entry.ok())
855 .filter(|entry| entry.file_type().is_file())
856 .filter_map(|entry| {
857 let path = entry.into_path();
858 let ext = path
859 .extension()
860 .and_then(|ext| ext.to_str())
861 .unwrap_or("")
862 .to_ascii_lowercase();
863 if !matches!(ext.as_str(), "json" | "jsonl" | "md" | "txt") {
864 return None;
865 }
866 let meta = std::fs::metadata(&path).ok()?;
867 if meta.len() > Self::IMPORT_MAX_BYTES {
868 return None;
869 }
870 let mtime = meta
871 .modified()
872 .map(|t| {
873 t.duration_since(std::time::UNIX_EPOCH)
874 .unwrap_or_default()
875 .as_secs() as i64
876 })
877 .unwrap_or(0);
878 Some((path, mtime))
879 })
880 .collect();
881
882 imports.sort_by(|(a_path, a_mtime), (b_path, b_mtime)| {
883 b_mtime
884 .cmp(a_mtime)
885 .then_with(|| a_path.to_string_lossy().cmp(&b_path.to_string_lossy()))
886 });
887 imports.truncate(Self::IMPORT_FILE_LIMIT);
888
889 for (import_path, mtime) in imports {
890 for exchange in load_imported_session_exchanges(&import_path, &imports_dir, mtime) {
891 desired_paths.insert(exchange.path.clone());
892 match self.index_chunks_with_room(
893 &exchange.path,
894 exchange.last_modified,
895 "session",
896 std::slice::from_ref(&exchange.content),
897 ) {
898 Ok(new_chunks) if !new_chunks.is_empty() => {
899 count += 1;
900 }
901 Ok(_) => {}
902 Err(_) => {}
903 }
904 }
905 }
906 }
907
908 self.prune_indexed_prefix("session/imports/", &desired_paths);
909 count
910 }
911
912 fn backfill_missing_embeddings(&self) {
917 let (fts_count, vec_count) = {
919 let db = self.db.lock().unwrap();
920 let fts: i64 = db
921 .query_row("SELECT COUNT(*) FROM chunks_fts", [], |r| r.get(0))
922 .unwrap_or(0);
923 let vec: i64 = db
924 .query_row("SELECT COUNT(*) FROM chunks_vec", [], |r| r.get(0))
925 .unwrap_or(0);
926 (fts, vec)
927 };
928 if fts_count == 0 || fts_count == vec_count {
929 return;
930 }
931
932 let missing: Vec<(String, i64, String)> = {
935 let db = self.db.lock().unwrap();
936 let mut stmt = db
937 .prepare(
938 "SELECT f.path, (f.rowid - 1) AS chunk_idx, f.content
939 FROM chunks_fts f
940 LEFT JOIN chunks_vec v ON f.path = v.path AND (f.rowid - 1) = v.chunk_idx
941 WHERE v.path IS NULL
942 ORDER BY CASE
943 WHEN f.path LIKE '%.rs' THEN 0
944 WHEN f.path LIKE '%.toml' THEN 1
945 WHEN f.path LIKE '%.json' THEN 2
946 ELSE 3
947 END, f.path
948 LIMIT 20",
949 )
950 .unwrap();
951 stmt.query_map([], |r| {
952 Ok((
953 r.get::<_, String>(0)?,
954 r.get::<_, i64>(1)?,
955 r.get::<_, String>(2)?,
956 ))
957 })
958 .unwrap()
959 .filter_map(|r| r.ok())
960 .collect()
961 };
962
963 for (path, idx, content) in missing {
964 if let Some(vec) = embed_text_blocking(&content, &self.base_url) {
965 let blob = floats_to_blob(&vec);
966 let db = self.db.lock().unwrap();
967 let _ = db.execute(
968 "INSERT OR REPLACE INTO chunks_vec (path, chunk_idx, embedding) VALUES (?1, ?2, ?3)",
969 params![path, idx, blob],
970 );
971 } else {
972 break;
974 }
975 }
976 }
977
978 pub fn file_count(&self) -> usize {
981 let db = self.db.lock().unwrap();
982 db.query_row(
983 "SELECT COUNT(*) FROM chunks_meta WHERE path NOT LIKE 'session/%'",
984 [],
985 |r| r.get::<_, i64>(0),
986 )
987 .unwrap_or(0) as usize
988 }
989
990 pub fn embedded_chunk_count(&self) -> usize {
993 let db = self.db.lock().unwrap();
994 db.query_row(
995 "SELECT COUNT(*) FROM chunks_vec WHERE path NOT LIKE 'session/%'",
996 [],
997 |r| r.get::<_, i64>(0),
998 )
999 .unwrap_or(0) as usize
1000 }
1001
1002 pub fn has_any_embeddings(&self) -> bool {
1004 let db = self.db.lock().unwrap();
1005 db.query_row("SELECT EXISTS(SELECT 1 FROM chunks_vec LIMIT 1)", [], |r| {
1006 r.get::<_, i64>(0)
1007 })
1008 .unwrap_or(0)
1009 != 0
1010 }
1011
1012 pub fn reset(&self) {
1015 let db = self.db.lock().unwrap();
1016 let _ = db.execute_batch(
1017 "DELETE FROM chunks_fts;
1018 DELETE FROM chunks_vec;
1019 DELETE FROM chunks_meta;",
1020 );
1021 }
1022
1023 pub fn inspect_snapshot(&self, hot_limit: usize) -> VeinInspectionSnapshot {
1026 let db = self.db.lock().unwrap();
1027 let indexed_source_files = db
1028 .query_row(
1029 "SELECT COUNT(*) FROM chunks_meta
1030 WHERE path NOT LIKE 'session/%'
1031 AND path NOT LIKE '.hematite/docs/%'",
1032 [],
1033 |r| r.get::<_, i64>(0),
1034 )
1035 .unwrap_or(0) as usize;
1036 let indexed_docs = db
1037 .query_row(
1038 "SELECT COUNT(*) FROM chunks_meta WHERE path LIKE '.hematite/docs/%'",
1039 [],
1040 |r| r.get::<_, i64>(0),
1041 )
1042 .unwrap_or(0) as usize;
1043 let indexed_session_exchanges = db
1044 .query_row(
1045 "SELECT COUNT(*) FROM chunks_meta WHERE path LIKE 'session/%'",
1046 [],
1047 |r| r.get::<_, i64>(0),
1048 )
1049 .unwrap_or(0) as usize;
1050 let embedded_source_doc_chunks = db
1051 .query_row(
1052 "SELECT COUNT(*) FROM chunks_vec WHERE path NOT LIKE 'session/%'",
1053 [],
1054 |r| r.get::<_, i64>(0),
1055 )
1056 .unwrap_or(0) as usize;
1057 let has_any_embeddings = db
1058 .query_row("SELECT EXISTS(SELECT 1 FROM chunks_vec LIMIT 1)", [], |r| {
1059 r.get::<_, i64>(0)
1060 })
1061 .unwrap_or(0)
1062 != 0;
1063 drop(db);
1064
1065 let hot_files = self
1066 .hot_files(hot_limit.max(1))
1067 .into_iter()
1068 .map(|(path, heat, last_modified, room)| VeinHotFile {
1069 path,
1070 heat,
1071 last_modified,
1072 room,
1073 })
1074 .collect::<Vec<_>>();
1075
1076 VeinInspectionSnapshot {
1077 indexed_source_files,
1078 indexed_docs,
1079 indexed_session_exchanges,
1080 embedded_source_doc_chunks,
1081 has_any_embeddings,
1082 active_room: self.active_room(),
1083 l1_ready: !hot_files.is_empty(),
1084 hot_files,
1085 }
1086 }
1087
1088 pub fn bump_heat(&self, path: &str) {
1094 if path.is_empty() {
1095 return;
1096 }
1097 let now = std::time::SystemTime::now()
1098 .duration_since(std::time::UNIX_EPOCH)
1099 .unwrap_or_default()
1100 .as_secs() as i64;
1101 let db = self.db.lock().unwrap();
1102 let _ = db.execute(
1103 "INSERT INTO file_heat (path, heat, last_edit) VALUES (?1, 1, ?2)
1104 ON CONFLICT(path) DO UPDATE SET heat = heat + 1, last_edit = ?2",
1105 params![path, now],
1106 );
1107 }
1108
1109 fn hot_files(&self, n: usize) -> Vec<(String, i64, i64, String)> {
1113 let db = self.db.lock().unwrap();
1114 let mut stmt = match db.prepare(
1115 "SELECT fh.path, fh.heat, cm.last_modified, cm.room
1116 FROM file_heat fh
1117 JOIN chunks_meta cm ON cm.path = fh.path
1118 ORDER BY fh.heat DESC, cm.last_modified DESC
1119 LIMIT ?1",
1120 ) {
1121 Ok(s) => s,
1122 Err(_) => return vec![],
1123 };
1124 stmt.query_map(params![n as i64], |row| {
1125 Ok((
1126 row.get::<_, String>(0)?,
1127 row.get::<_, i64>(1)?,
1128 row.get::<_, i64>(2)?,
1129 row.get::<_, String>(3)?,
1130 ))
1131 })
1132 .map(|rows| rows.filter_map(|r| r.ok()).collect())
1133 .unwrap_or_default()
1134 }
1135
1136 pub fn hot_file_paths(&self, n: usize) -> Vec<String> {
1139 self.hot_files(n)
1140 .into_iter()
1141 .map(|(path, _, _, _)| path)
1142 .collect()
1143 }
1144
1145 pub fn hot_files_weighted(&self, n: usize) -> Vec<(String, f64)> {
1149 let files = self.hot_files(n);
1150 if files.is_empty() {
1151 return vec![];
1152 }
1153 let max_heat = files.iter().map(|(_, h, _, _)| *h).max().unwrap_or(1).max(1) as f64;
1154 files
1155 .into_iter()
1156 .map(|(path, heat, _, _)| {
1157 let weight = (heat as f64) / max_heat;
1158 (path, weight)
1159 })
1160 .collect()
1161 }
1162
1163 pub fn l1_context(&self) -> Option<String> {
1168 let files = self.hot_files(8);
1169 if files.is_empty() {
1170 return None;
1171 }
1172 let now = std::time::SystemTime::now()
1173 .duration_since(std::time::UNIX_EPOCH)
1174 .unwrap_or_default()
1175 .as_secs() as i64;
1176
1177 let mut by_room: std::collections::BTreeMap<String, Vec<(String, i64, i64)>> =
1179 std::collections::BTreeMap::new();
1180 for (path, heat, mtime, room) in &files {
1181 by_room
1182 .entry(room.clone())
1183 .or_default()
1184 .push((path.clone(), *heat, *mtime));
1185 }
1186
1187 let mut out = String::from("# Hot Files (most edited — grouped by subsystem)\n");
1188 for (room, entries) in &by_room {
1189 out.push_str(&format!("[{}]\n", room));
1190 for (path, heat, mtime) in entries {
1191 let age_secs = now - mtime;
1192 let age = if age_secs < 3600 {
1193 "just now".to_string()
1194 } else if age_secs < 86400 {
1195 format!("{}h ago", age_secs / 3600)
1196 } else {
1197 format!("{}d ago", age_secs / 86400)
1198 };
1199 out.push_str(&format!(
1200 " - {} [{} edit{}, {}]\n",
1201 path,
1202 heat,
1203 if *heat == 1 { "" } else { "s" },
1204 age
1205 ));
1206 }
1207 }
1208 Some(out)
1209 }
1210
1211 fn prune_indexed_prefix(&self, prefix: &str, desired_paths: &HashSet<String>) {
1212 let pattern = format!("{}%", prefix);
1213 let existing_paths: Vec<String> = {
1214 let db = self.db.lock().unwrap();
1215 let mut stmt = match db.prepare("SELECT path FROM chunks_meta WHERE path LIKE ?1") {
1216 Ok(stmt) => stmt,
1217 Err(_) => return,
1218 };
1219 stmt.query_map(params![pattern], |row| row.get::<_, String>(0))
1220 .map(|rows| rows.filter_map(|row| row.ok()).collect())
1221 .unwrap_or_default()
1222 };
1223
1224 if existing_paths.is_empty() {
1225 return;
1226 }
1227
1228 let db = self.db.lock().unwrap();
1229 for path in existing_paths {
1230 if desired_paths.contains(&path) {
1231 continue;
1232 }
1233 let _ = db.execute("DELETE FROM chunks_fts WHERE path = ?1", params![path]);
1234 let _ = db.execute("DELETE FROM chunks_vec WHERE path = ?1", params![path]);
1235 let _ = db.execute("DELETE FROM chunks_meta WHERE path = ?1", params![path]);
1236 }
1237 }
1238}
1239
1240impl QuerySignals {
1241 fn from_query(query: &str) -> Self {
1242 let lower = query.to_ascii_lowercase();
1243 let historical_memory_hint = [
1244 "remember",
1245 "earlier",
1246 "previous",
1247 "last time",
1248 "what did we decide",
1249 "why did we decide",
1250 "what did we say",
1251 "why did we change",
1252 ]
1253 .iter()
1254 .any(|needle| lower.contains(needle));
1255
1256 Self {
1257 exact_phrases: extract_exact_phrases(query),
1258 standout_terms: extract_standout_terms(query),
1259 historical_memory_hint,
1260 temporal_reference: extract_temporal_reference(query),
1261 }
1262 }
1263}
1264
1265fn merge_scored_result(
1266 merged_by_path: &mut HashMap<String, SearchResult>,
1267 candidate: SearchResult,
1268) {
1269 match merged_by_path.get_mut(&candidate.path) {
1270 Some(existing) if candidate.score > existing.score => *existing = candidate,
1271 Some(_) => {}
1272 None => {
1273 merged_by_path.insert(candidate.path.clone(), candidate);
1274 }
1275 }
1276}
1277
1278fn reranked_score(
1279 signals: &QuerySignals,
1280 active_room: Option<&str>,
1281 result: &SearchResult,
1282 is_semantic: bool,
1283) -> f32 {
1284 let base = if is_semantic {
1285 1.0 + result.score.clamp(0.0, 1.0)
1286 } else {
1287 (result.score / 10.0).clamp(0.0, 1.0)
1288 };
1289 base + room_bias(active_room, result)
1290 + retrieval_signal_boost(signals, result)
1291 + temporal_memory_boost(signals, result)
1292}
1293
1294fn room_bias(active_room: Option<&str>, result: &SearchResult) -> f32 {
1295 if active_room == Some(result.room.as_str()) {
1296 0.15
1297 } else {
1298 0.0
1299 }
1300}
1301
1302fn retrieval_signal_boost(signals: &QuerySignals, result: &SearchResult) -> f32 {
1303 let mut boost = 0.0f32;
1304 let haystack = format!(
1305 "{}\n{}",
1306 result.path.to_ascii_lowercase(),
1307 result.content.to_ascii_lowercase()
1308 );
1309
1310 let phrase_matches = signals
1311 .exact_phrases
1312 .iter()
1313 .filter(|phrase| haystack.contains(phrase.as_str()))
1314 .count();
1315 if phrase_matches > 0 {
1316 boost += 0.35 + ((phrase_matches.saturating_sub(1)) as f32 * 0.1);
1317 }
1318
1319 let mut standout_matches = 0;
1320 for term in &signals.standout_terms {
1321 if result.path.to_ascii_lowercase().contains(term.as_str()) {
1322 boost += 0.40;
1323 standout_matches += 1;
1324 } else if result.content.to_ascii_lowercase().contains(term.as_str()) {
1325 boost += 0.12;
1326 standout_matches += 1;
1327 }
1328 if standout_matches >= 3 {
1329 break;
1330 }
1331 }
1332
1333 if signals.historical_memory_hint && result.room == "session" {
1334 boost += 0.45;
1335 }
1336
1337 boost
1338}
1339
1340fn temporal_memory_boost(signals: &QuerySignals, result: &SearchResult) -> f32 {
1341 if result.room != "session" {
1342 return 0.0;
1343 }
1344 let Some(reference) = signals.temporal_reference else {
1345 return 0.0;
1346 };
1347 let Some(memory_ts) = session_memory_timestamp(result) else {
1348 return 0.0;
1349 };
1350
1351 let span = reference.window_secs.max(86_400);
1352 let full_fade = span.saturating_mul(8);
1353 if full_fade <= 0 {
1354 return 0.0;
1355 }
1356
1357 let distance = (memory_ts - reference.target_ts).abs();
1358 let closeness = 1.0 - (distance as f32 / full_fade as f32).min(1.0);
1359 if closeness <= 0.0 {
1360 0.0
1361 } else {
1362 0.22 * closeness
1363 }
1364}
1365
/// Collects the quoted phrases in `query`, lowercased and de-duplicated.
///
/// A phrase starts at a `"`, `'` or `` ` `` and runs to the next occurrence of
/// the same character, or to the end of the query when it is never closed.
/// Phrases are trimmed, ASCII-lowercased, and kept only when at least three
/// bytes long. First-seen order is preserved.
fn extract_exact_phrases(query: &str) -> Vec<String> {
    let symbols: Vec<char> = query.chars().collect();
    let mut found: Vec<String> = Vec::new();
    let mut cursor = 0usize;

    while let Some(&opener) = symbols.get(cursor) {
        if opener != '"' && opener != '\'' && opener != '`' {
            cursor += 1;
            continue;
        }

        let body_start = cursor + 1;
        // An unterminated quote extends to the end of the query.
        let body_end = symbols[body_start..]
            .iter()
            .position(|&ch| ch == opener)
            .map_or(symbols.len(), |offset| body_start + offset);

        if body_end > body_start {
            let raw: String = symbols[body_start..body_end].iter().collect();
            let phrase = raw.trim().to_ascii_lowercase();
            if phrase.len() >= 3 && !found.contains(&phrase) {
                found.push(phrase);
            }
        }

        // Resume scanning just past the closing quote.
        cursor = body_end.saturating_add(1);
    }

    found
}
1397
/// Picks the distinctive tokens out of `query`, lowercased and de-duplicated.
///
/// Tokens are runs of ASCII alphanumerics and `_ - . / :`. A token is kept
/// when it is at least four bytes long, is not a stopword, and looks
/// distinctive: it contains a digit, an uppercase letter or one of the
/// punctuation characters above, or it is at least nine bytes long.
fn extract_standout_terms(query: &str) -> Vec<String> {
    const STOPWORDS: &[&str] = &[
        "about", "after", "before", "change", "changed", "decide", "decided", "does", "earlier",
        "flow", "from", "have", "into", "just", "last", "local", "make", "more", "remember",
        "should", "that", "their", "there", "these", "they", "this", "those", "what", "when",
        "where", "which", "why", "with", "work",
    ];

    fn is_term_punct(ch: char) -> bool {
        matches!(ch, '_' | '-' | '.' | '/' | ':')
    }

    let mut picked: Vec<String> = Vec::new();
    let tokens = query
        .split(|ch: char| !ch.is_ascii_alphanumeric() && !is_term_punct(ch))
        .filter(|token| token.len() >= 4);

    for token in tokens {
        let folded = token.to_ascii_lowercase();
        if STOPWORDS.contains(&folded.as_str()) || picked.contains(&folded) {
            continue;
        }

        // Distinctiveness is judged on the original casing of the token.
        let distinctive = token.len() >= 9
            || token
                .chars()
                .any(|ch| ch.is_ascii_digit() || ch.is_ascii_uppercase() || is_term_punct(ch));
        if distinctive {
            picked.push(folded);
        }
    }

    picked
}
1433
1434fn extract_temporal_reference(query: &str) -> Option<TemporalReference> {
1435 if let Some(ts) = extract_iso_date_from_query(query) {
1436 return Some(TemporalReference {
1437 target_ts: ts,
1438 window_secs: 86_400,
1439 });
1440 }
1441
1442 let now = current_unix_timestamp();
1443 let lower = query.to_ascii_lowercase();
1444 if lower.contains("yesterday") {
1445 Some(TemporalReference {
1446 target_ts: now.saturating_sub(86_400),
1447 window_secs: 86_400,
1448 })
1449 } else if lower.contains("today") || lower.contains("earlier today") {
1450 Some(TemporalReference {
1451 target_ts: now,
1452 window_secs: 86_400,
1453 })
1454 } else if lower.contains("last week") {
1455 Some(TemporalReference {
1456 target_ts: now.saturating_sub(7 * 86_400),
1457 window_secs: 7 * 86_400,
1458 })
1459 } else if lower.contains("last month") {
1460 Some(TemporalReference {
1461 target_ts: now.saturating_sub(30 * 86_400),
1462 window_secs: 30 * 86_400,
1463 })
1464 } else {
1465 None
1466 }
1467}
1468
1469fn extract_iso_date_from_query(query: &str) -> Option<i64> {
1470 query
1471 .split(|ch: char| !(ch.is_ascii_digit() || ch == '-'))
1472 .find_map(parse_iso_date_token)
1473}
1474
1475fn parse_iso_date_token(token: &str) -> Option<i64> {
1476 if token.len() != 10 {
1477 return None;
1478 }
1479 let bytes = token.as_bytes();
1480 if bytes.get(4) != Some(&b'-') || bytes.get(7) != Some(&b'-') {
1481 return None;
1482 }
1483
1484 let year = token.get(0..4)?.parse::<i32>().ok()?;
1485 let month = token.get(5..7)?.parse::<u32>().ok()?;
1486 let day = token.get(8..10)?.parse::<u32>().ok()?;
1487 if !(1..=12).contains(&month) || !(1..=31).contains(&day) {
1488 return None;
1489 }
1490
1491 Some(days_from_civil(year, month, day).saturating_mul(86_400))
1492}
1493
/// Converts a proleptic Gregorian calendar date to days since 1970-01-01.
///
/// The year is shifted to start in March so the leap day falls at the end of
/// the shifted year; the date is then located within its 400-year era
/// (146 097 days each).
fn days_from_civil(year: i32, month: u32, day: u32) -> i64 {
    // January and February belong to the previous March-based year.
    let (shifted_year, shifted_month) = if month <= 2 {
        (year - 1, month as i32 + 9)
    } else {
        (year, month as i32 - 3)
    };

    // Floor division by 400, including for negative years.
    let era_numerator = if shifted_year >= 0 {
        shifted_year
    } else {
        shifted_year - 399
    };
    let era = era_numerator / 400;

    let year_of_era = shifted_year - era * 400;
    let day_of_year = (153 * shifted_month + 2) / 5 + day as i32 - 1;
    let day_of_era = year_of_era * 365 + year_of_era / 4 - year_of_era / 100 + day_of_year;

    // 719 468 is the day count from 0000-03-01 to 1970-01-01.
    i64::from(era) * 146_097 + i64::from(day_of_era) - 719_468
}
1503
/// Returns the current wall-clock time as whole seconds since the Unix epoch,
/// or `0` when the system clock reports a time before the epoch.
fn current_unix_timestamp() -> i64 {
    match std::time::SystemTime::now().duration_since(std::time::UNIX_EPOCH) {
        Ok(since_epoch) => since_epoch.as_secs() as i64,
        Err(_) => 0,
    }
}
1510
1511fn session_memory_timestamp(result: &SearchResult) -> Option<i64> {
1512 extract_session_path_timestamp(&result.path).or_else(|| {
1513 if result.last_modified > 0 {
1514 Some(result.last_modified)
1515 } else {
1516 None
1517 }
1518 })
1519}
1520
1521fn extract_session_path_timestamp(path: &str) -> Option<i64> {
1522 let normalized = path.replace('\\', "/");
1523 let mut parts = normalized.split('/');
1524 if parts.next()? != "session" {
1525 return None;
1526 }
1527 parse_iso_date_token(parts.next()?)
1528}
1529
1530fn session_speaker_kind(speaker: &str) -> SessionSpeakerKind {
1531 let normalized = speaker.trim().to_ascii_lowercase();
1532 match normalized.as_str() {
1533 "you" | "user" => SessionSpeakerKind::User,
1534 "" | "system" | "tool" => SessionSpeakerKind::Ignore,
1535 _ => SessionSpeakerKind::Assistant,
1536 }
1537}
1538
1539fn load_session_exchanges(report_path: &Path, last_modified: i64) -> Vec<SessionExchange> {
1540 let Ok(raw) = std::fs::read_to_string(report_path) else {
1541 return Vec::new();
1542 };
1543 let Ok(report) = serde_json::from_str::<SessionReport>(&raw) else {
1544 return Vec::new();
1545 };
1546
1547 let session_key = report_path
1548 .file_stem()
1549 .and_then(|stem| stem.to_str())
1550 .and_then(|stem| stem.strip_prefix("session_").or(Some(stem)))
1551 .unwrap_or("unknown-session")
1552 .to_string();
1553 let session_date = report
1554 .session_start
1555 .split('_')
1556 .next()
1557 .filter(|date| !date.is_empty())
1558 .unwrap_or_else(|| session_key.split('_').next().unwrap_or("unknown-date"))
1559 .to_string();
1560
1561 let mut exchanges = Vec::new();
1562 let mut pending_user: Option<String> = None;
1563 let mut turn_index = 0usize;
1564
1565 for entry in report.transcript {
1566 match session_speaker_kind(&entry.speaker) {
1567 SessionSpeakerKind::User => {
1568 let text = entry.text.trim();
1569 if !text.is_empty() {
1570 pending_user = Some(text.to_string());
1571 }
1572 }
1573 SessionSpeakerKind::Assistant => {
1574 let text = entry.text.trim();
1575 if text.is_empty() {
1576 continue;
1577 }
1578 let Some(user_text) = pending_user.take() else {
1579 continue;
1580 };
1581 turn_index += 1;
1582 exchanges.push(SessionExchange {
1583 path: format!(
1584 "session/{}/{}/turn-{}",
1585 session_date, session_key, turn_index
1586 ),
1587 last_modified,
1588 content: format!(
1589 "Earlier session exchange\nUser:\n{}\n\nAssistant:\n{}",
1590 user_text, text
1591 ),
1592 });
1593 }
1594 SessionSpeakerKind::Ignore => {}
1595 }
1596 }
1597
1598 if exchanges.len() > Vein::SESSION_TURN_LIMIT {
1599 let keep_from = exchanges.len() - Vein::SESSION_TURN_LIMIT;
1600 exchanges = exchanges.into_iter().skip(keep_from).collect();
1601 }
1602
1603 exchanges
1604}
1605
1606fn load_imported_session_exchanges(
1607 import_path: &Path,
1608 imports_root: &Path,
1609 last_modified: i64,
1610) -> Vec<SessionExchange> {
1611 let Ok(raw) = std::fs::read_to_string(import_path) else {
1612 return Vec::new();
1613 };
1614
1615 let messages = normalize_import_messages(&raw, import_path);
1616 if messages.is_empty() {
1617 return Vec::new();
1618 }
1619
1620 let rel = import_path
1621 .strip_prefix(imports_root)
1622 .unwrap_or(import_path);
1623 let rel_slug = slugify_import_path(rel);
1624 let mut exchanges = Vec::new();
1625 let mut pending_user: Option<String> = None;
1626 let mut turn_index = 0usize;
1627
1628 for (role, text) in messages {
1629 let cleaned = text.trim();
1630 if cleaned.is_empty() {
1631 continue;
1632 }
1633 match role.as_str() {
1634 "user" => pending_user = Some(cleaned.to_string()),
1635 "assistant" => {
1636 let Some(user_text) = pending_user.take() else {
1637 continue;
1638 };
1639 turn_index += 1;
1640 exchanges.push(SessionExchange {
1641 path: format!("session/imports/{}/turn-{}", rel_slug, turn_index),
1642 last_modified,
1643 content: format!(
1644 "Imported session exchange\nSource: .hematite/imports/{}\n\nUser:\n{}\n\nAssistant:\n{}",
1645 rel.to_string_lossy().replace('\\', "/"),
1646 user_text,
1647 cleaned
1648 ),
1649 });
1650 }
1651 _ => {}
1652 }
1653 }
1654
1655 if exchanges.len() > Vein::SESSION_TURN_LIMIT {
1656 let keep_from = exchanges.len() - Vein::SESSION_TURN_LIMIT;
1657 exchanges = exchanges.into_iter().skip(keep_from).collect();
1658 }
1659
1660 exchanges
1661}
1662
1663fn normalize_import_messages(raw: &str, import_path: &Path) -> Vec<(String, String)> {
1664 if raw.trim().is_empty() {
1665 return Vec::new();
1666 }
1667
1668 if let Some(messages) = parse_marker_transcript(raw) {
1669 return messages;
1670 }
1671
1672 let ext = import_path
1673 .extension()
1674 .and_then(|ext| ext.to_str())
1675 .unwrap_or("")
1676 .to_ascii_lowercase();
1677
1678 if matches!(ext.as_str(), "json" | "jsonl")
1679 || matches!(raw.trim().chars().next(), Some('{') | Some('['))
1680 {
1681 if let Some(messages) = parse_jsonl_messages(raw) {
1682 if !messages.is_empty() {
1683 return messages;
1684 }
1685 }
1686
1687 if let Ok(value) = serde_json::from_str::<Value>(raw) {
1688 if let Some(messages) = parse_session_report_messages(&value) {
1689 return messages;
1690 }
1691 if let Some(messages) = parse_simple_role_messages(&value) {
1692 return messages;
1693 }
1694 if let Some(messages) = parse_chatgpt_mapping_messages(&value) {
1695 return messages;
1696 }
1697 }
1698 }
1699
1700 Vec::new()
1701}
1702
/// Parses a plain-text transcript in which user prompts are lines starting
/// with `> ` (after optional leading whitespace).
///
/// Returns `None` when fewer than two such lines exist. Every prompt becomes
/// a `"user"` message; the non-empty lines that follow it, up to the next
/// prompt and excluding `---` separators, are trimmed and joined with
/// newlines into one `"assistant"` message. Lines before the first prompt are
/// ignored.
fn parse_marker_transcript(raw: &str) -> Option<Vec<(String, String)>> {
    fn user_prompt(line: &str) -> Option<&str> {
        line.trim_start().strip_prefix("> ")
    }

    let prompt_count = raw
        .lines()
        .filter(|line| user_prompt(line).is_some())
        .count();
    if prompt_count < 2 {
        return None;
    }

    let mut messages: Vec<(String, String)> = Vec::new();
    let mut lines = raw.lines().peekable();

    while let Some(line) = lines.next() {
        let Some(prompt) = user_prompt(line) else {
            continue;
        };
        messages.push(("user".to_string(), prompt.trim().to_string()));

        // Consume everything up to (but not including) the next prompt.
        let mut reply: Vec<String> = Vec::new();
        while let Some(&following) = lines.peek() {
            if user_prompt(following).is_some() {
                break;
            }
            let text = following.trim();
            if !text.is_empty() && text != "---" {
                reply.push(text.to_string());
            }
            lines.next();
        }
        if !reply.is_empty() {
            messages.push(("assistant".to_string(), reply.join("\n")));
        }
    }

    if messages.is_empty() {
        None
    } else {
        Some(messages)
    }
}
1743
1744fn parse_jsonl_messages(raw: &str) -> Option<Vec<(String, String)>> {
1745 let mut messages = Vec::new();
1746 let mut has_codex_session_meta = false;
1747 let mut saw_jsonl = false;
1748
1749 for line in raw.lines() {
1750 let trimmed = line.trim();
1751 if trimmed.is_empty() {
1752 continue;
1753 }
1754 let Ok(value) = serde_json::from_str::<Value>(trimmed) else {
1755 continue;
1756 };
1757 saw_jsonl = true;
1758 let Some(object) = value.as_object() else {
1759 continue;
1760 };
1761
1762 match object.get("type").and_then(|v| v.as_str()).unwrap_or("") {
1763 "session_meta" => {
1764 has_codex_session_meta = true;
1765 }
1766 "event_msg" => {
1767 let Some(payload) = object.get("payload").and_then(|v| v.as_object()) else {
1768 continue;
1769 };
1770 let Some(text) = payload.get("message").and_then(|v| v.as_str()) else {
1771 continue;
1772 };
1773 match payload.get("type").and_then(|v| v.as_str()).unwrap_or("") {
1774 "user_message" => messages.push(("user".to_string(), text.trim().to_string())),
1775 "agent_message" => {
1776 messages.push(("assistant".to_string(), text.trim().to_string()))
1777 }
1778 _ => {}
1779 }
1780 }
1781 "human" | "user" => {
1782 if let Some(text) = extract_text_content(object.get("message").unwrap_or(&value)) {
1783 messages.push(("user".to_string(), text));
1784 }
1785 }
1786 "assistant" => {
1787 if let Some(text) = extract_text_content(object.get("message").unwrap_or(&value)) {
1788 messages.push(("assistant".to_string(), text));
1789 }
1790 }
1791 _ => {
1792 if let Some(role) = object.get("role").and_then(|v| v.as_str()) {
1793 if let Some(text) = extract_text_content(&value) {
1794 match role {
1795 "user" | "human" => messages.push(("user".to_string(), text)),
1796 "assistant" | "ai" => messages.push(("assistant".to_string(), text)),
1797 _ => {}
1798 }
1799 }
1800 }
1801 }
1802 }
1803 }
1804
1805 if !saw_jsonl {
1806 return None;
1807 }
1808
1809 if has_codex_session_meta || !messages.is_empty() {
1810 return Some(messages);
1811 }
1812
1813 None
1814}
1815
1816fn parse_session_report_messages(value: &Value) -> Option<Vec<(String, String)>> {
1817 let report = value.as_object()?;
1818 let transcript = report.get("transcript")?.as_array()?;
1819 let mut messages = Vec::new();
1820
1821 for entry in transcript {
1822 let Some(obj) = entry.as_object() else {
1823 continue;
1824 };
1825 let speaker = obj
1826 .get("speaker")
1827 .and_then(|v| v.as_str())
1828 .unwrap_or_default();
1829 let text = obj
1830 .get("text")
1831 .and_then(|v| v.as_str())
1832 .unwrap_or_default()
1833 .trim()
1834 .to_string();
1835 if text.is_empty() {
1836 continue;
1837 }
1838 match session_speaker_kind(speaker) {
1839 SessionSpeakerKind::User => messages.push(("user".to_string(), text)),
1840 SessionSpeakerKind::Assistant => messages.push(("assistant".to_string(), text)),
1841 SessionSpeakerKind::Ignore => {}
1842 }
1843 }
1844
1845 (!messages.is_empty()).then_some(messages)
1846}
1847
1848fn parse_simple_role_messages(value: &Value) -> Option<Vec<(String, String)>> {
1849 if let Some(array) = value.as_array() {
1850 let messages = collect_role_messages(array);
1851 return (!messages.is_empty()).then_some(messages);
1852 }
1853
1854 let obj = value.as_object()?;
1855 if let Some(messages_value) = obj.get("messages").or_else(|| obj.get("chat_messages")) {
1856 let array = messages_value.as_array()?;
1857 let messages = collect_role_messages(array);
1858 return (!messages.is_empty()).then_some(messages);
1859 }
1860
1861 None
1862}
1863
1864fn collect_role_messages(items: &[Value]) -> Vec<(String, String)> {
1865 let mut messages = Vec::new();
1866 for item in items {
1867 let Some(obj) = item.as_object() else {
1868 continue;
1869 };
1870 let Some(role) = obj.get("role").and_then(|v| v.as_str()) else {
1871 continue;
1872 };
1873 let Some(text) = extract_text_content(item) else {
1874 continue;
1875 };
1876 match role {
1877 "user" | "human" => messages.push(("user".to_string(), text)),
1878 "assistant" | "ai" => messages.push(("assistant".to_string(), text)),
1879 _ => {}
1880 }
1881 }
1882 messages
1883}
1884
1885fn parse_chatgpt_mapping_messages(value: &Value) -> Option<Vec<(String, String)>> {
1886 let mapping = value.get("mapping")?.as_object()?;
1887 let mut current_id = mapping.iter().find_map(|(node_id, node)| {
1888 let obj = node.as_object()?;
1889 (obj.get("parent").is_some_and(|parent| parent.is_null())).then_some(node_id.clone())
1890 })?;
1891
1892 let mut messages = Vec::new();
1893 let mut visited = std::collections::HashSet::new();
1894
1895 while visited.insert(current_id.clone()) {
1896 let Some(node) = mapping.get(¤t_id).and_then(|v| v.as_object()) else {
1897 break;
1898 };
1899
1900 if let Some(message) = node.get("message") {
1901 let role = message
1902 .get("author")
1903 .and_then(|author| author.get("role"))
1904 .and_then(|v| v.as_str())
1905 .unwrap_or("");
1906 if let Some(text) = extract_text_content(message) {
1907 match role {
1908 "user" => messages.push(("user".to_string(), text)),
1909 "assistant" => messages.push(("assistant".to_string(), text)),
1910 _ => {}
1911 }
1912 }
1913 }
1914
1915 let Some(next_id) = node
1916 .get("children")
1917 .and_then(|children| children.as_array())
1918 .and_then(|children| children.first())
1919 .and_then(|child| child.as_str())
1920 else {
1921 break;
1922 };
1923 current_id = next_id.to_string();
1924 }
1925
1926 (!messages.is_empty()).then_some(messages)
1927}
1928
1929fn extract_text_content(value: &Value) -> Option<String> {
1930 if let Some(text) = value.as_str() {
1931 let trimmed = text.trim();
1932 return (!trimmed.is_empty()).then_some(trimmed.to_string());
1933 }
1934
1935 if let Some(array) = value.as_array() {
1936 let joined = array
1937 .iter()
1938 .filter_map(extract_text_content)
1939 .filter(|part| !part.is_empty())
1940 .collect::<Vec<_>>()
1941 .join("\n");
1942 return (!joined.is_empty()).then_some(joined);
1943 }
1944
1945 let obj = value.as_object()?;
1946
1947 if let Some(content) = obj.get("content") {
1948 if let Some(text) = extract_text_content(content) {
1949 return Some(text);
1950 }
1951 }
1952
1953 if let Some(text) = obj.get("text").and_then(|v| v.as_str()) {
1954 let trimmed = text.trim();
1955 if !trimmed.is_empty() {
1956 return Some(trimmed.to_string());
1957 }
1958 }
1959
1960 if let Some(parts) = obj.get("parts").and_then(|v| v.as_array()) {
1961 let joined = parts
1962 .iter()
1963 .filter_map(|part| part.as_str().map(|s| s.trim().to_string()))
1964 .filter(|part| !part.is_empty())
1965 .collect::<Vec<_>>()
1966 .join("\n");
1967 if !joined.is_empty() {
1968 return Some(joined);
1969 }
1970 }
1971
1972 None
1973}
1974
/// Flattens a relative import path into a single slug.
///
/// Backslashes count as separators. Every character other than an ASCII
/// alphanumeric, `-`, `_` or a separator becomes `_`; leading and trailing
/// separators are dropped and the remaining ones become `__`.
fn slugify_import_path(path: &Path) -> String {
    let mut sanitized = String::new();
    for ch in path.to_string_lossy().chars() {
        let mapped = match ch {
            '\\' | '/' => '/',
            keep if keep.is_ascii_alphanumeric() || keep == '-' || keep == '_' => keep,
            _ => '_',
        };
        sanitized.push(mapped);
    }
    sanitized.trim_matches('/').replace('/', "__")
}
1990
1991fn embed_text_blocking(text: &str, base_url: &str) -> Option<Vec<f32>> {
2007 embed_text_with_prefix(text, "search_document", base_url)
2008}
2009
2010fn embed_query_blocking(text: &str, base_url: &str) -> Option<Vec<f32>> {
2011 embed_text_with_prefix(text, "search_query", base_url)
2012}
2013
2014fn embed_text_with_prefix(text: &str, task: &str, base_url: &str) -> Option<Vec<f32>> {
2015 let prefixed = format!("{}: {}", task, text);
2017 let input = if prefixed.len() > 8000 {
2019 &prefixed[..8000]
2020 } else {
2021 &prefixed
2022 };
2023
2024 let client = reqwest::blocking::Client::builder()
2025 .timeout(std::time::Duration::from_secs(10))
2026 .build()
2027 .ok()?;
2028
2029 let body = serde_json::json!({
2030 "model": "nomic-embed-text-v2",
2031 "input": input
2032 });
2033
2034 let url = format!("{}/v1/embeddings", base_url);
2035 let resp = client.post(&url).json(&body).send().ok()?;
2036
2037 if !resp.status().is_success() {
2038 return None;
2039 }
2040
2041 let json: serde_json::Value = resp.json().ok()?;
2042 let embedding = json["data"][0]["embedding"].as_array()?;
2043 let vec: Vec<f32> = embedding
2044 .iter()
2045 .filter_map(|v| v.as_f64().map(|f| f as f32))
2046 .collect();
2047
2048 if vec.is_empty() {
2049 None
2050 } else {
2051 Some(vec)
2052 }
2053}
2054
/// Computes the cosine similarity of two vectors.
///
/// Returns `0.0` when the vectors are empty, differ in length, or either one
/// has zero magnitude.
fn cosine_similarity(a: &[f32], b: &[f32]) -> f32 {
    fn magnitude(components: &[f32]) -> f32 {
        components.iter().map(|c| c * c).sum::<f32>().sqrt()
    }

    if a.is_empty() || a.len() != b.len() {
        return 0.0;
    }

    let (norm_a, norm_b) = (magnitude(a), magnitude(b));
    if norm_a == 0.0 || norm_b == 0.0 {
        return 0.0;
    }

    let dot: f32 = a.iter().zip(b.iter()).map(|(x, y)| x * y).sum();
    dot / (norm_a * norm_b)
}
2070
/// Serializes floats into a byte blob, four little-endian bytes per value.
fn floats_to_blob(floats: &[f32]) -> Vec<u8> {
    let mut blob = Vec::with_capacity(floats.len() * 4);
    for value in floats {
        blob.extend_from_slice(&value.to_le_bytes());
    }
    blob
}
2074
/// Decodes a byte blob into floats, reading four little-endian bytes per
/// value. Trailing bytes that do not fill a whole value are ignored.
fn blob_to_floats(blob: &[u8]) -> Vec<f32> {
    let mut floats = Vec::with_capacity(blob.len() / 4);
    for group in blob.chunks_exact(4) {
        let mut raw = [0u8; 4];
        raw.copy_from_slice(group);
        floats.push(f32::from_le_bytes(raw));
    }
    floats
}
2080
/// Normalizes extracted document text.
///
/// Converts `\r\n` and lone `\r` to `\n`, then strips leading and trailing
/// whitespace and NUL characters. Returns `None` when nothing is left.
fn normalize_extracted_document_text(text: String) -> Option<String> {
    let unix_newlines = text.replace("\r\n", "\n").replace('\r', "\n");
    let is_padding = |c: char| c == '\0' || c.is_whitespace();
    match unix_newlines.trim_matches(is_padding) {
        "" => None,
        body => Some(body.to_owned()),
    }
}
2095
/// Extracts text from the PDF at `path` using `pdf_extract`, turning a parser
/// panic into an `Err` instead of unwinding into the caller.
///
/// Returns `Ok(Some(text))` with normalized text, `Ok(None)` when
/// normalization leaves nothing, and `Err` with a `pdf-extract failed: ...`
/// or `pdf-extract panicked: ...` message otherwise.
fn extract_pdf_text_with_pdf_extract(path: &std::path::Path) -> Result<Option<String>, String> {
    // Install a no-op panic hook for the duration of the call so a parser
    // panic is not reported through the previously installed hook.
    // NOTE(review): the panic hook is process-global; presumably this only
    // runs inside the dedicated PDF helper process — confirm.
    let previous_hook = std::panic::take_hook();
    std::panic::set_hook(Box::new(|_| {}));
    let result = std::panic::catch_unwind(std::panic::AssertUnwindSafe(|| {
        pdf_extract::extract_text(path)
    }));
    // Put back the hook that was active before the call.
    std::panic::set_hook(previous_hook);

    match result {
        Ok(Ok(text)) => Ok(normalize_extracted_document_text(text)),
        Ok(Err(e)) => Err(format!("pdf-extract failed: {}", e)),
        Err(payload) => {
            // Panic payloads are usually `&str` or `String`; anything else is
            // reported generically.
            let panic_text = if let Some(msg) = payload.downcast_ref::<&str>() {
                (*msg).to_string()
            } else if let Some(msg) = payload.downcast_ref::<String>() {
                msg.clone()
            } else {
                "unknown parser panic".to_string()
            };
            Err(format!("pdf-extract panicked: {}", panic_text))
        }
    }
}
2119
2120fn extract_pdf_text_with_lopdf(path: &std::path::Path) -> Result<Option<String>, String> {
2121 let mut doc =
2122 lopdf::Document::load(path).map_err(|e| format!("lopdf could not open PDF: {}", e))?;
2123
2124 if doc.is_encrypted() {
2125 doc.decrypt("")
2126 .map_err(|e| format!("PDF is encrypted and could not be decrypted: {}", e))?;
2127 }
2128
2129 let page_numbers: Vec<u32> = doc.get_pages().keys().copied().collect();
2130 if page_numbers.is_empty() {
2131 return Ok(None);
2132 }
2133
2134 let mut extracted_pages = Vec::new();
2135 let mut page_errors = Vec::new();
2136
2137 for page_number in page_numbers {
2138 match doc.extract_text(&[page_number]) {
2139 Ok(text) => {
2140 if let Some(page_text) = normalize_extracted_document_text(text) {
2141 extracted_pages.push(page_text);
2142 }
2143 }
2144 Err(e) => page_errors.push(format!("page {page_number}: {e}")),
2145 }
2146 }
2147
2148 if !extracted_pages.is_empty() {
2149 return Ok(Some(extracted_pages.join("\n\n")));
2150 }
2151
2152 if !page_errors.is_empty() {
2153 let sample_errors = page_errors
2154 .into_iter()
2155 .take(3)
2156 .collect::<Vec<_>>()
2157 .join("; ");
2158 return Err(format!(
2159 "lopdf could not extract usable page text ({sample_errors})"
2160 ));
2161 }
2162
2163 Ok(None)
2164}
2165
2166fn extract_pdf_text_inside_helper(path: &std::path::Path) -> Result<Option<String>, String> {
2167 let mut failures = Vec::new();
2168
2169 match extract_pdf_text_with_pdf_extract(path) {
2170 Ok(Some(text)) => return Ok(Some(text)),
2171 Ok(None) => failures.push("pdf-extract found no usable text".to_string()),
2172 Err(e) => failures.push(e),
2173 }
2174
2175 match extract_pdf_text_with_lopdf(path) {
2176 Ok(Some(text)) => return Ok(Some(text)),
2177 Ok(None) => failures.push("lopdf found no usable text".to_string()),
2178 Err(e) => failures.push(e),
2179 }
2180
2181 let detail = failures.into_iter().take(2).collect::<Vec<_>>().join("; ");
2182 Err(format!(
2183 "Could not extract text from PDF. Hematite keeps PDF parsing best-effort so it can stay a lightweight single-binary local coding harness. The file may be scanned/image-only, encrypted, or use unsupported font encoding. Try exporting it to text/markdown or attach page images instead. Detail: {}",
2184 detail
2185 ))
2186}
2187
/// Extracts PDF text by re-running the current executable as a helper
/// process with `--pdf-extract-helper <path>`.
///
/// The helper's stdout is the extracted text. On a non-zero exit status the
/// helper's trimmed stderr becomes the error (or a generic message when it is
/// empty). Returns `Ok(None)` when the helper printed only whitespace.
fn extract_pdf_text(path: &std::path::Path) -> Result<Option<String>, String> {
    use std::process::{Command, Stdio};

    let exe = match std::env::current_exe() {
        Ok(exe) => exe,
        Err(e) => {
            return Err(format!(
                "Could not locate Hematite executable for PDF helper: {}",
                e
            ))
        }
    };

    let mut helper = Command::new(exe);
    helper
        .arg("--pdf-extract-helper")
        .arg(path)
        .stdin(Stdio::null())
        .stdout(Stdio::piped())
        .stderr(Stdio::piped());
    let output = helper
        .output()
        .map_err(|e| format!("Could not launch PDF helper: {}", e))?;

    if !output.status.success() {
        let stderr = String::from_utf8_lossy(&output.stderr);
        let reason = match stderr.trim() {
            "" => "PDF extraction failed.",
            reported => reported,
        };
        return Err(reason.to_string());
    }

    match String::from_utf8(output.stdout) {
        Err(e) => Err(format!("PDF helper returned non-UTF8 text: {}", e)),
        Ok(text) if text.trim().is_empty() => Ok(None),
        Ok(text) => Ok(Some(text)),
    }
}
2217
2218pub fn run_pdf_extract_helper(path: &std::path::Path) -> i32 {
2219 match extract_pdf_text_inside_helper(path) {
2220 Ok(Some(text)) => {
2221 use std::io::Write;
2222 let mut stdout = std::io::stdout();
2223 if stdout.write_all(text.as_bytes()).is_ok() {
2224 0
2225 } else {
2226 let _ = writeln!(
2227 std::io::stderr(),
2228 "PDF helper could not write extracted text."
2229 );
2230 1
2231 }
2232 }
2233 Ok(None) => {
2234 eprintln!(
2235 "Could not extract text from PDF. Hematite keeps PDF parsing best-effort so it can stay a lightweight single-binary local coding harness. The file appears to contain no usable embedded text. Try exporting it to text/markdown or attach page images instead."
2236 );
2237 1
2238 }
2239 Err(e) => {
2240 eprintln!("{}", e);
2241 1
2242 }
2243 }
2244}
2245
2246pub fn extract_document_text(path: &std::path::Path) -> Result<String, String> {
2249 let ext = path
2250 .extension()
2251 .and_then(|e| e.to_str())
2252 .unwrap_or("")
2253 .to_lowercase();
2254 match ext.as_str() {
2255 "pdf" => {
2256 let text = extract_pdf_text(path)?.ok_or_else(|| {
2257 "PDF contains no extractable text — it may be scanned/image-only. \
2258 Try attaching page screenshots with /image instead."
2259 .to_string()
2260 })?;
2261 pdf_quality_check(text)
2262 }
2263 _ => std::fs::read_to_string(path).map_err(|e| format!("Could not read file: {e}")),
2264 }
2265}
2266
/// Rejects extracted PDF text that is too short or looks garbled.
///
/// The trimmed text must be at least 150 bytes long, and spaces must make up
/// at least 4% of its characters (newlines and carriage returns excluded).
/// On success the original, untrimmed text is returned.
fn pdf_quality_check(text: String) -> Result<String, String> {
    let body = text.trim();

    let byte_len = body.len();
    if byte_len < 150 {
        return Err(format!(
            "PDF extracted only {} characters — likely a scanned or image-only PDF, \
             or uses unsupported custom fonts. Try attaching page screenshots with /image instead.",
            byte_len
        ));
    }

    // One pass: count characters that are not line breaks, and the spaces among them.
    let (non_newline, spaces) =
        body.chars()
            .fold((0usize, 0usize), |(non_newline, spaces), ch| match ch {
                '\n' | '\r' => (non_newline, spaces),
                ' ' => (non_newline + 1, spaces + 1),
                _ => (non_newline + 1, spaces),
            });
    let space_ratio = match non_newline {
        0 => 0.0,
        total => spaces as f32 / total as f32,
    };

    if space_ratio < 0.04 {
        return Err(String::from(
            "PDF text extraction produced garbled output — words are merged with no spaces. \
             This usually means the PDF uses custom embedded fonts (common with academic publishers \
             like EBSCO, Elsevier, Springer). \
             Try a PDF exported from Word, Google Docs, or LaTeX, \
             or attach page screenshots with /image instead.",
        ));
    }

    Ok(text)
}
2305
2306fn chunk_by_symbols(ext: &str, text: &str) -> Vec<String> {
2310 if ext == "rs" {
2311 chunk_rust_symbols(text)
2312 } else {
2313 chunk_paragraphs(text)
2314 }
2315}
2316
2317fn chunk_rust_symbols(text: &str) -> Vec<String> {
2326 const ITEM_STARTS: &[&str] = &[
2327 "pub fn ",
2328 "pub async fn ",
2329 "pub unsafe fn ",
2330 "async fn ",
2331 "unsafe fn ",
2332 "fn ",
2333 "pub impl",
2334 "impl ",
2335 "pub struct ",
2336 "struct ",
2337 "pub enum ",
2338 "enum ",
2339 "pub trait ",
2340 "trait ",
2341 "pub mod ",
2342 "mod ",
2343 "pub type ",
2344 "type ",
2345 "pub const ",
2346 "const ",
2347 "pub static ",
2348 "static ",
2349 ];
2350
2351 let lines: Vec<&str> = text.lines().collect();
2352 let mut chunks: Vec<String> = Vec::new();
2353 let mut current: Vec<&str> = Vec::new();
2354
2355 for &line in &lines {
2356 let top_level = !line.starts_with(' ') && !line.starts_with('\t');
2357 let is_item = top_level && ITEM_STARTS.iter().any(|s| line.starts_with(s));
2358
2359 if is_item && !current.is_empty() {
2360 let mut split = current.len();
2363 while split > 0 {
2364 let prev = current[split - 1].trim();
2365 if prev.starts_with("///")
2366 || prev.starts_with("//!")
2367 || prev.starts_with("#[")
2368 || prev.is_empty()
2369 {
2370 split -= 1;
2371 } else {
2372 break;
2373 }
2374 }
2375 let body = current[..split].join("\n");
2376 if !body.trim().is_empty() {
2377 chunks.push(body);
2378 }
2379 current = current[split..].to_vec();
2380 }
2381 current.push(line);
2382 }
2383 if !current.is_empty() {
2384 let body = current.join("\n");
2385 if !body.trim().is_empty() {
2386 chunks.push(body);
2387 }
2388 }
2389
2390 let mut result = Vec::new();
2392 for chunk in chunks {
2393 if chunk.len() > 3000 {
2394 result.extend(sliding_window_chunks(&chunk, 2000, 200));
2395 } else {
2396 result.push(chunk);
2397 }
2398 }
2399 result
2400}
2401
2402fn chunk_paragraphs(text: &str) -> Vec<String> {
2404 let mut result: Vec<String> = Vec::new();
2405 let mut current = String::new();
2406
2407 for para in text.split("\n\n") {
2408 if current.len() + para.len() + 2 > 2000 {
2409 if !current.trim().is_empty() {
2410 result.push(current.clone());
2411 }
2412 current = para.to_string();
2413 } else {
2414 if !current.is_empty() {
2415 current.push_str("\n\n");
2416 }
2417 current.push_str(para);
2418 }
2419 }
2420 if !current.trim().is_empty() {
2421 result.push(current);
2422 }
2423
2424 let mut final_result = Vec::new();
2425 for chunk in result {
2426 if chunk.len() > 2000 {
2427 final_result.extend(sliding_window_chunks(&chunk, 2000, 200));
2428 } else {
2429 final_result.push(chunk);
2430 }
2431 }
2432 final_result
2433}
2434
/// Splits `text` into windows of `chunk_size` characters, where consecutive
/// windows share `overlap` characters.
///
/// Windows are measured in `char`s, not bytes, so multi-byte characters are
/// never split. The last window may be shorter than `chunk_size`.
///
/// Degenerate arguments are handled instead of panicking or looping forever:
/// a `chunk_size` of zero yields no chunks, and an `overlap` that is not
/// smaller than `chunk_size` advances by one character per window.
fn sliding_window_chunks(text: &str, chunk_size: usize, overlap: usize) -> Vec<String> {
    if chunk_size == 0 {
        return Vec::new();
    }

    let chars: Vec<char> = text.chars().collect();
    // Always make forward progress, even when overlap >= chunk_size.
    let step = chunk_size.saturating_sub(overlap).max(1);

    let mut result = Vec::new();
    let mut start = 0usize;
    while start < chars.len() {
        let end = start.saturating_add(chunk_size).min(chars.len());
        result.push(chars[start..end].iter().collect());
        if end == chars.len() {
            break;
        }
        start += step;
    }
    result
}