1use rusqlite::{params, Connection};
2use serde::Deserialize;
3use serde_json::Value;
4use std::collections::{HashMap, HashSet};
5use std::path::Path;
6
7pub struct Vein {
28 db: std::sync::Arc<std::sync::Mutex<Connection>>,
29 base_url: String,
31}
32
33unsafe impl Send for Vein {}
36unsafe impl Sync for Vein {}
37
/// One retrieved chunk together with its ranking metadata.
#[derive(Debug, Clone)]
pub struct SearchResult {
    /// Index key of the document the chunk belongs to.
    pub path: String,
    /// Text of the matching chunk.
    pub content: String,
    /// Relevance score; larger values rank first.
    pub score: f32,
    /// Room label stored for the document in `chunks_meta`.
    pub room: String,
    /// `last_modified` value stored for the document in `chunks_meta`.
    pub last_modified: i64,
}
49
/// A frequently edited file as reported by the heat table.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct VeinHotFile {
    /// Index key of the file.
    pub path: String,
    /// Accumulated edit counter from `file_heat`.
    pub heat: i64,
    /// `last_modified` value stored for the file in `chunks_meta`.
    pub last_modified: i64,
    /// Room label stored for the file in `chunks_meta`.
    pub room: String,
}
57
58#[derive(Debug, Clone)]
59pub struct VeinInspectionSnapshot {
60 pub indexed_source_files: usize,
61 pub indexed_docs: usize,
62 pub indexed_session_exchanges: usize,
63 pub embedded_source_doc_chunks: usize,
64 pub has_any_embeddings: bool,
65 pub active_room: Option<String>,
66 pub hot_files: Vec<VeinHotFile>,
67 pub l1_ready: bool,
68}
69
70#[derive(Debug, Default)]
71struct QuerySignals {
72 exact_phrases: Vec<String>,
73 standout_terms: Vec<String>,
74 historical_memory_hint: bool,
75 temporal_reference: Option<TemporalReference>,
76}
77
/// A moment referenced by a query plus the tolerance around it.
#[derive(Debug, Clone, Copy)]
struct TemporalReference {
    /// Target timestamp; compared against session memory timestamps.
    target_ts: i64,
    /// Width of the reference window, in seconds.
    window_secs: i64,
}
83
84#[derive(Debug, Deserialize)]
85struct SessionReport {
86 #[serde(default)]
87 session_start: String,
88 #[serde(default)]
89 transcript: Vec<SessionTranscriptEntry>,
90}
91
92#[derive(Debug, Deserialize)]
93struct SessionTranscriptEntry {
94 #[serde(default)]
95 speaker: String,
96 #[serde(default)]
97 text: String,
98}
99
/// A session exchange prepared for indexing as a single chunk.
#[derive(Debug)]
struct SessionExchange {
    /// Index key under which the exchange is stored.
    path: String,
    /// Timestamp recorded for the exchange in `chunks_meta`.
    last_modified: i64,
    /// Text that gets indexed.
    content: String,
}
106
/// Classification of a transcript speaker label.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum SessionSpeakerKind {
    /// Entry attributed to the user.
    User,
    /// Entry attributed to the assistant.
    Assistant,
    /// Entry that is neither of the above.
    Ignore,
}
113
/// Classifies a workspace-relative path into a "room" (subsystem label).
///
/// The path is lower-cased and back-slashes are normalised to `/` before matching.
/// Every rule that matches contributes its room with a fixed priority; the room with
/// the highest priority wins. When no rule matches, the first path component is used
/// if it is non-empty and contains no `.`, otherwise `"root"`.
///
/// The extension is the text after the last `.` of the file name and is empty when
/// the file name has no dot, so an extension-less file called `md` or `iss` is not
/// mistaken for a Markdown or installer file.
pub fn detect_room(path: &str) -> String {
    let lower = path.to_lowercase().replace('\\', "/");
    let filename = lower.rsplit('/').next().unwrap_or(&lower);
    // `rsplit('.').next()` would hand back the whole name for dot-less files.
    let ext = filename.rsplit_once('.').map_or("", |(_, ext)| ext);

    // True when `segment` is the whole path or one of its directory components.
    let is_component = |segment: &str| {
        lower == segment
            || lower.starts_with(&format!("{segment}/"))
            || lower.contains(&format!("/{segment}/"))
    };

    // (room, priority, rule matched). Priorities are all distinct, so the winner is
    // unambiguous regardless of evaluation order.
    let candidates: [(&str, i32, bool); 14] = [
        (
            "session",
            100,
            lower.starts_with("session/")
                || lower.starts_with(".hematite/reports/")
                || lower.starts_with(".hematite/imports/")
                || is_component("reports")
                || is_component("imports"),
        ),
        (
            "docs",
            80,
            lower.starts_with(".hematite/docs/")
                || is_component("docs")
                || matches!(filename, "readme.md" | "claude.md" | ".hematite.md")
                || matches!(ext, "md" | "markdown" | "pdf" | "rst"),
        ),
        (
            "tests",
            85,
            is_component("tests")
                || filename.contains("diagnostic")
                || filename.ends_with("_test.rs")
                || filename.ends_with(".test.ts"),
        ),
        (
            "automation",
            78,
            lower.starts_with(".github/workflows/")
                || is_component("workflows")
                || filename == ".pre-commit-config.yaml"
                || filename == ".pre-commit-config.yml"
                || filename.contains("hook"),
        ),
        (
            "release",
            82,
            lower.starts_with("installer/")
                || lower.starts_with("dist/")
                || lower.starts_with("scripts/package-")
                || filename.contains("release")
                || filename.contains("bump-version")
                || ext == "iss",
        ),
        (
            "config",
            76,
            matches!(
                filename,
                "cargo.toml"
                    | "cargo.lock"
                    | "package.json"
                    | "pnpm-lock.yaml"
                    | "yarn.lock"
                    | "bun.lock"
                    | "bun.lockb"
                    | "pyproject.toml"
                    | "setup.py"
                    | "go.mod"
                    | "pom.xml"
                    | "build.gradle"
                    | "build.gradle.kts"
                    | "cmakelists.txt"
                    | ".gitignore"
                    | "settings.json"
                    | "mcp_servers.json"
            ) || filename.ends_with(".sln")
                || filename.ends_with(".csproj")
                || filename.contains("config"),
        ),
        (
            "ui",
            70,
            is_component("ui")
                || matches!(
                    filename,
                    "tui.rs" | "voice.rs" | "hatch.rs" | "gpu_monitor.rs"
                ),
        ),
        (
            "memory",
            72,
            is_component("memory") || matches!(filename, "vein.rs" | "deep_reflect.rs"),
        ),
        (
            "tools",
            68,
            is_component("tools")
                || matches!(
                    filename,
                    "verify_build.rs"
                        | "host_inspect.rs"
                        | "shell.rs"
                        | "code_sandbox.rs"
                        | "project_map.rs"
                        | "runtime_trace.rs"
                ),
        ),
        (
            "integration",
            67,
            filename.contains("mcp")
                || filename.contains("lsp")
                || lower.contains("/mcp/")
                || lower.contains("/lsp/"),
        ),
        (
            "runtime",
            66,
            matches!(filename, "main.rs" | "runtime.rs" | "inference.rs")
                || filename.contains("startup")
                || filename.contains("runtime"),
        ),
        ("agent", 60, is_component("agent")),
        ("libs", 58, lower.starts_with("libs/") || is_component("libs")),
        (
            "scripts",
            55,
            lower.starts_with("scripts/") || is_component("scripts"),
        ),
    ];

    let best = candidates
        .iter()
        .filter(|(_, _, matched)| *matched)
        .max_by_key(|(_, priority, _)| *priority);
    if let Some((room, _, _)) = best {
        return room.to_string();
    }

    // Fallback: top-level directory name, unless it looks like a file name.
    lower
        .split('/')
        .next()
        .filter(|s| !s.is_empty() && !s.contains('.'))
        .unwrap_or("root")
        .to_string()
}
274
275impl Vein {
276 const SESSION_REPORT_LIMIT: usize = 5;
277 const SESSION_TURN_LIMIT: usize = 50;
278 const IMPORT_FILE_LIMIT: usize = 12;
279 const IMPORT_MAX_BYTES: u64 = 10 * 1024 * 1024;
280
281 pub fn new<P: AsRef<Path>>(
282 db_path: P,
283 base_url: String,
284 ) -> Result<Self, Box<dyn std::error::Error>> {
285 let db = Connection::open(db_path)?;
286
287 db.execute_batch("PRAGMA journal_mode=WAL; PRAGMA synchronous=NORMAL;")?;
289
290 db.execute_batch(
294 "CREATE TABLE IF NOT EXISTS chunks_meta (
295 path TEXT PRIMARY KEY,
296 last_modified INTEGER NOT NULL,
297 room TEXT NOT NULL DEFAULT 'root'
298 );
299 CREATE VIRTUAL TABLE IF NOT EXISTS chunks_fts USING fts5(
300 path UNINDEXED,
301 content,
302 tokenize='porter ascii'
303 );
304 CREATE TABLE IF NOT EXISTS chunks_vec (
305 path TEXT NOT NULL,
306 chunk_idx INTEGER NOT NULL,
307 embedding BLOB NOT NULL,
308 PRIMARY KEY (path, chunk_idx)
309 );
310 CREATE TABLE IF NOT EXISTS file_heat (
311 path TEXT PRIMARY KEY,
312 heat INTEGER NOT NULL DEFAULT 0,
313 last_edit INTEGER NOT NULL DEFAULT 0
314 );",
315 )?;
316
317 let _ = db
319 .execute_batch("ALTER TABLE chunks_meta ADD COLUMN room TEXT NOT NULL DEFAULT 'root';");
320 let _ = db.execute_batch(
321 "ALTER TABLE file_heat ADD COLUMN last_edit INTEGER NOT NULL DEFAULT 0;",
322 );
323
324 Ok(Self {
325 db: std::sync::Arc::new(std::sync::Mutex::new(db)),
326 base_url,
327 })
328 }
329
330 pub fn index_document(
335 &mut self,
336 path: &str,
337 last_modified: i64,
338 full_text: &str,
339 ) -> Result<Vec<String>, Box<dyn std::error::Error>> {
340 let room = detect_room(path);
341 let ext = std::path::Path::new(path)
342 .extension()
343 .and_then(|e| e.to_str())
344 .unwrap_or("");
345 let chunks = chunk_by_symbols(ext, full_text);
346 self.index_chunks_with_room(path, last_modified, &room, &chunks)
347 }
348
349 fn index_chunks_with_room(
350 &mut self,
351 path: &str,
352 last_modified: i64,
353 room: &str,
354 chunks: &[String],
355 ) -> Result<Vec<String>, Box<dyn std::error::Error>> {
356 let db = self.db.lock().unwrap();
357 let existing: Option<i64> = db
358 .query_row(
359 "SELECT last_modified FROM chunks_meta WHERE path = ?1",
360 params![path],
361 |r| r.get(0),
362 )
363 .ok();
364
365 if let Some(ts) = existing {
366 if ts >= last_modified {
367 return Ok(Vec::new()); }
369 }
370
371 db.execute("DELETE FROM chunks_fts WHERE path = ?1", params![path])?;
373 db.execute("DELETE FROM chunks_vec WHERE path = ?1", params![path])?;
374 db.execute(
375 "INSERT OR REPLACE INTO chunks_meta (path, last_modified, room) VALUES (?1, ?2, ?3)",
376 params![path, last_modified, room],
377 )?;
378
379 drop(db);
380
381 let mut db = self.db.lock().unwrap();
382 let tx = db.transaction()?;
383 {
384 let mut stmt = tx.prepare("INSERT INTO chunks_fts (path, content) VALUES (?1, ?2)")?;
385 for chunk in chunks {
386 stmt.execute(params![path, chunk.as_str()])?;
387 }
388 }
389 tx.commit()?;
390
391 Ok(chunks.to_vec())
392 }
393
394 pub fn embed_and_store_chunks(&self, path: &str, chunks: &[String]) {
398 for (idx, chunk) in chunks.iter().enumerate() {
399 if let Some(vec) = embed_text_blocking(chunk, &self.base_url) {
400 let blob = floats_to_blob(&vec);
401 let db = self.db.lock().unwrap();
402 let _ = db.execute(
403 "INSERT OR REPLACE INTO chunks_vec (path, chunk_idx, embedding) VALUES (?1, ?2, ?3)",
404 params![path, idx as i64, blob],
405 );
406 }
407 }
408 }
409
410 pub fn search_bm25(
414 &self,
415 query: &str,
416 limit: usize,
417 ) -> Result<Vec<SearchResult>, Box<dyn std::error::Error>> {
418 const STOPWORDS: &[&str] = &[
422 "how", "does", "do", "did", "what", "where", "when", "why", "which", "who", "is",
423 "are", "was", "were", "be", "been", "being", "have", "has", "had", "a", "an", "the",
424 "and", "or", "but", "in", "on", "at", "to", "for", "of", "with", "by", "from", "get",
425 "gets", "got", "work", "works", "make", "makes", "use", "uses", "into", "that", "this",
426 "it", "its",
427 ];
428
429 let safe_query: String = query
430 .chars()
431 .map(|c| {
432 if c.is_alphanumeric() || c == ' ' || c == '_' {
433 c
434 } else {
435 ' '
436 }
437 })
438 .collect();
439
440 let fts_query = safe_query
442 .split_whitespace()
443 .filter(|w| w.len() >= 3 && !STOPWORDS.contains(&w.to_lowercase().as_str()))
444 .collect::<Vec<_>>()
445 .join(" OR ");
446
447 if fts_query.is_empty() {
448 return Ok(Vec::new());
449 }
450
451 let db = self.db.lock().unwrap();
452 let mut stmt = db.prepare(
453 "SELECT chunks_fts.path, chunks_fts.content, rank, cm.last_modified, cm.room
454 FROM chunks_fts
455 JOIN chunks_meta cm ON cm.path = chunks_fts.path
456 WHERE chunks_fts MATCH ?1
457 ORDER BY rank
458 LIMIT ?2",
459 )?;
460
461 let results: Vec<SearchResult> = stmt
462 .query_map(params![fts_query, limit as i64], |row| {
463 Ok(SearchResult {
464 path: row.get(0)?,
465 content: row.get(1)?,
466 score: -(row.get::<_, f64>(2).unwrap_or(0.0) as f32),
467 last_modified: row.get(3)?,
468 room: row.get(4)?,
469 })
470 })?
471 .filter_map(|r| r.ok())
472 .collect();
473
474 Ok(results)
475 }
476
477 pub fn search_semantic(&self, query: &str, limit: usize) -> Vec<SearchResult> {
480 let query_vec = match embed_query_blocking(query, &self.base_url) {
481 Some(v) => v,
482 None => return Vec::new(),
483 };
484
485 let rows: Vec<(String, i64, Vec<u8>, i64, String)> = {
487 let db = self.db.lock().unwrap();
488 let mut stmt = match db.prepare(
489 "SELECT cv.path, cv.chunk_idx, cv.embedding, cm.last_modified, cm.room
490 FROM chunks_vec cv
491 JOIN chunks_meta cm ON cm.path = cv.path",
492 ) {
493 Ok(s) => s,
494 Err(_) => return Vec::new(),
495 };
496 stmt.query_map([], |row| {
497 Ok((
498 row.get::<_, String>(0)?,
499 row.get::<_, i64>(1)?,
500 row.get::<_, Vec<u8>>(2)?,
501 row.get::<_, i64>(3)?,
502 row.get::<_, String>(4)?,
503 ))
504 })
505 .ok()
506 .map(|rows| rows.filter_map(|r| r.ok()).collect())
507 .unwrap_or_default()
508 };
509
510 if rows.is_empty() {
511 return Vec::new();
512 }
513
514 let mut scored: Vec<(f32, String, i64, i64, String)> = rows
516 .into_iter()
517 .filter_map(|(path, idx, blob, last_modified, room)| {
518 let vec = blob_to_floats(&blob);
519 let sim = cosine_similarity(&query_vec, &vec);
520 Some((sim, path, idx, last_modified, room))
521 })
522 .collect();
523
524 scored.sort_by(|a, b| b.0.partial_cmp(&a.0).unwrap_or(std::cmp::Ordering::Equal));
525 scored.truncate(limit);
526
527 let db = self.db.lock().unwrap();
529 scored
530 .into_iter()
531 .filter_map(|(score, path, idx, last_modified, room)| {
532 let content: Option<String> = db
535 .query_row(
536 "SELECT content FROM chunks_fts WHERE path = ?1 LIMIT 1 OFFSET ?2",
537 params![path, idx],
538 |r| r.get(0),
539 )
540 .ok();
541 content.map(|c| SearchResult {
542 path,
543 content: c,
544 score,
545 room,
546 last_modified,
547 })
548 })
549 .collect()
550 }
551
552 pub fn search_context(
559 &self,
560 query: &str,
561 limit: usize,
562 ) -> Result<Vec<SearchResult>, Box<dyn std::error::Error>> {
563 let candidate_limit = (limit.max(1) * 4).max(12);
564 let bm25 = self.search_bm25(query, candidate_limit).unwrap_or_default();
565 let semantic = self.search_semantic(query, candidate_limit);
566 let signals = QuerySignals::from_query(query);
567
568 let active_room = self.active_room();
570
571 let mut merged_by_path: HashMap<String, SearchResult> = HashMap::new();
574
575 for r in semantic {
576 let score = reranked_score(&signals, active_room.as_deref(), &r, true);
577 merge_scored_result(&mut merged_by_path, SearchResult { score, ..r });
578 }
579
580 for r in bm25 {
581 let score = reranked_score(&signals, active_room.as_deref(), &r, false);
582 merge_scored_result(&mut merged_by_path, SearchResult { score, ..r });
583 }
584
585 let mut merged: Vec<SearchResult> = merged_by_path.into_values().collect();
586 merged.sort_by(|a, b| {
587 b.score
588 .partial_cmp(&a.score)
589 .unwrap_or(std::cmp::Ordering::Equal)
590 });
591 merged.truncate(limit);
592 Ok(merged)
593 }
594
595 fn active_room(&self) -> Option<String> {
598 let db = self.db.lock().unwrap();
599 db.query_row(
600 "SELECT cm.room, SUM(fh.heat) as total
601 FROM file_heat fh
602 JOIN chunks_meta cm ON cm.path = fh.path
603 GROUP BY cm.room
604 ORDER BY total DESC
605 LIMIT 1",
606 [],
607 |row| row.get::<_, String>(0),
608 )
609 .ok()
610 }
611
612 pub fn index_project(&mut self) -> usize {
620 let root = crate::tools::file_ops::workspace_root();
621 let mut count = 0usize;
622
623 const INDEXABLE: &[&str] = &[
624 "rs", "toml", "md", "json", "ts", "tsx", "js", "py", "go", "c", "cpp", "h", "yaml",
625 "yml", "txt",
626 ];
627 const SKIP_DIRS: &[&str] = &["target", ".git", "node_modules", ".hematite"];
628
629 for entry in walkdir::WalkDir::new(&root)
630 .follow_links(false)
631 .into_iter()
632 .filter_entry(|e| {
633 if e.file_type().is_dir() {
634 let name = e.file_name().to_string_lossy();
635 return !SKIP_DIRS.contains(&name.as_ref());
636 }
637 true
638 })
639 .filter_map(|e| e.ok())
640 .filter(|e| e.file_type().is_file())
641 {
642 let path = entry.path();
643 let ext = path.extension().and_then(|e| e.to_str()).unwrap_or("");
644 if !INDEXABLE.contains(&ext) {
645 continue;
646 }
647
648 let Ok(meta) = std::fs::metadata(path) else {
649 continue;
650 };
651 if meta.len() > 512_000 {
652 continue;
653 }
654
655 let mtime = meta
656 .modified()
657 .map(|t| {
658 t.duration_since(std::time::UNIX_EPOCH)
659 .unwrap_or_default()
660 .as_secs() as i64
661 })
662 .unwrap_or(0);
663
664 let rel = path.strip_prefix(&root).unwrap_or(path);
665 let rel_str = rel.to_string_lossy().replace('\\', "/");
666
667 if let Ok(content) = std::fs::read_to_string(path) {
668 match self.index_document(&rel_str, mtime, &content) {
669 Ok(new_chunks) if !new_chunks.is_empty() => {
670 count += 1;
671 }
672 Ok(_) => {}
673 Err(_) => {}
674 }
675 }
676 }
677
678 count += self.index_workspace_artifacts(&root);
679
680 count
681 }
682
683 pub fn index_workspace_artifacts(&mut self, workspace_root: &std::path::Path) -> usize {
688 let mut count = self.index_docs_folder(workspace_root);
689 count += self.index_recent_session_reports(workspace_root);
690 count += self.index_imported_session_exports(workspace_root);
691 self.backfill_missing_embeddings();
692 count
693 }
694
695 fn index_docs_folder(&mut self, workspace_root: &std::path::Path) -> usize {
700 let docs_dir = workspace_root.join(".hematite").join("docs");
701 const DOCS_INDEXABLE: &[&str] = &["pdf", "md", "txt", "markdown"];
702 let mut count = 0usize;
703 let mut desired_paths = HashSet::new();
704
705 if docs_dir.exists() {
706 for entry in walkdir::WalkDir::new(&docs_dir)
707 .max_depth(3)
708 .follow_links(false)
709 .into_iter()
710 .filter_map(|e| e.ok())
711 .filter(|e| e.file_type().is_file())
712 {
713 let path = entry.path();
714 let ext = path
715 .extension()
716 .and_then(|e| e.to_str())
717 .unwrap_or("")
718 .to_lowercase();
719 if !DOCS_INDEXABLE.contains(&ext.as_str()) {
720 continue;
721 }
722
723 let Ok(meta) = std::fs::metadata(path) else {
724 continue;
725 };
726 if meta.len() > 50_000_000 {
727 continue;
728 }
729
730 let mtime = meta
731 .modified()
732 .map(|t| {
733 t.duration_since(std::time::UNIX_EPOCH)
734 .unwrap_or_default()
735 .as_secs() as i64
736 })
737 .unwrap_or(0);
738
739 let rel = path.strip_prefix(workspace_root).unwrap_or(path);
740 let rel_str = rel.to_string_lossy().replace('\\', "/");
741 desired_paths.insert(rel_str.clone());
742
743 let content = if ext == "pdf" {
744 extract_pdf_text(path).ok().flatten()
745 } else {
746 std::fs::read_to_string(path).ok()
747 };
748
749 if let Some(text) = content {
750 if text.trim().is_empty() {
751 continue;
752 }
753 match self.index_document(&rel_str, mtime, &text) {
754 Ok(new_chunks) if !new_chunks.is_empty() => {
755 count += 1;
756 }
757 Ok(_) => {}
758 Err(_) => {}
759 }
760 }
761 }
762 }
763
764 self.prune_indexed_prefix(".hematite/docs/", &desired_paths);
765 count
766 }
767
768 pub fn index_recent_session_reports(&mut self, workspace_root: &std::path::Path) -> usize {
771 let reports_dir = workspace_root.join(".hematite").join("reports");
772 let mut count = 0usize;
773 let mut desired_paths = HashSet::new();
774
775 if reports_dir.exists() {
776 let mut reports: Vec<std::path::PathBuf> = std::fs::read_dir(&reports_dir)
777 .ok()
778 .into_iter()
779 .flat_map(|entries| entries.filter_map(|entry| entry.ok()))
780 .map(|entry| entry.path())
781 .filter(|path| {
782 path.is_file()
783 && path.extension().and_then(|ext| ext.to_str()) == Some("json")
784 && path
785 .file_stem()
786 .and_then(|stem| stem.to_str())
787 .map(|stem| stem.starts_with("session_"))
788 .unwrap_or(false)
789 })
790 .collect();
791
792 reports.sort_by(|a, b| {
793 let a_name = a
794 .file_name()
795 .and_then(|name| name.to_str())
796 .unwrap_or_default();
797 let b_name = b
798 .file_name()
799 .and_then(|name| name.to_str())
800 .unwrap_or_default();
801 b_name.cmp(a_name)
802 });
803 reports.truncate(Self::SESSION_REPORT_LIMIT);
804
805 for report_path in reports {
806 let Ok(meta) = std::fs::metadata(&report_path) else {
807 continue;
808 };
809 let mtime = meta
810 .modified()
811 .map(|t| {
812 t.duration_since(std::time::UNIX_EPOCH)
813 .unwrap_or_default()
814 .as_secs() as i64
815 })
816 .unwrap_or(0);
817
818 for exchange in load_session_exchanges(&report_path, mtime) {
819 desired_paths.insert(exchange.path.clone());
820 match self.index_chunks_with_room(
821 &exchange.path,
822 exchange.last_modified,
823 "session",
824 std::slice::from_ref(&exchange.content),
825 ) {
826 Ok(new_chunks) if !new_chunks.is_empty() => {
827 count += 1;
828 }
829 Ok(_) => {}
830 Err(_) => {}
831 }
832 }
833 }
834 }
835
836 self.prune_indexed_prefix("session/", &desired_paths);
837 count
838 }
839
840 pub fn index_imported_session_exports(&mut self, workspace_root: &std::path::Path) -> usize {
845 let imports_dir = workspace_root.join(".hematite").join("imports");
846 let mut count = 0usize;
847 let mut desired_paths = HashSet::new();
848
849 if imports_dir.exists() {
850 let mut imports: Vec<(std::path::PathBuf, i64)> = walkdir::WalkDir::new(&imports_dir)
851 .max_depth(4)
852 .follow_links(false)
853 .into_iter()
854 .filter_map(|entry| entry.ok())
855 .filter(|entry| entry.file_type().is_file())
856 .filter_map(|entry| {
857 let path = entry.into_path();
858 let ext = path
859 .extension()
860 .and_then(|ext| ext.to_str())
861 .unwrap_or("")
862 .to_ascii_lowercase();
863 if !matches!(ext.as_str(), "json" | "jsonl" | "md" | "txt") {
864 return None;
865 }
866 let meta = std::fs::metadata(&path).ok()?;
867 if meta.len() > Self::IMPORT_MAX_BYTES {
868 return None;
869 }
870 let mtime = meta
871 .modified()
872 .map(|t| {
873 t.duration_since(std::time::UNIX_EPOCH)
874 .unwrap_or_default()
875 .as_secs() as i64
876 })
877 .unwrap_or(0);
878 Some((path, mtime))
879 })
880 .collect();
881
882 imports.sort_by(|(a_path, a_mtime), (b_path, b_mtime)| {
883 b_mtime
884 .cmp(a_mtime)
885 .then_with(|| a_path.to_string_lossy().cmp(&b_path.to_string_lossy()))
886 });
887 imports.truncate(Self::IMPORT_FILE_LIMIT);
888
889 for (import_path, mtime) in imports {
890 for exchange in load_imported_session_exchanges(&import_path, &imports_dir, mtime) {
891 desired_paths.insert(exchange.path.clone());
892 match self.index_chunks_with_room(
893 &exchange.path,
894 exchange.last_modified,
895 "session",
896 std::slice::from_ref(&exchange.content),
897 ) {
898 Ok(new_chunks) if !new_chunks.is_empty() => {
899 count += 1;
900 }
901 Ok(_) => {}
902 Err(_) => {}
903 }
904 }
905 }
906 }
907
908 self.prune_indexed_prefix("session/imports/", &desired_paths);
909 count
910 }
911
912 fn backfill_missing_embeddings(&self) {
917 let (fts_count, vec_count) = {
919 let db = self.db.lock().unwrap();
920 let fts: i64 = db
921 .query_row("SELECT COUNT(*) FROM chunks_fts", [], |r| r.get(0))
922 .unwrap_or(0);
923 let vec: i64 = db
924 .query_row("SELECT COUNT(*) FROM chunks_vec", [], |r| r.get(0))
925 .unwrap_or(0);
926 (fts, vec)
927 };
928 if fts_count == 0 || fts_count == vec_count {
929 return;
930 }
931
932 let missing: Vec<(String, i64, String)> = {
935 let db = self.db.lock().unwrap();
936 let mut stmt = db
937 .prepare(
938 "SELECT f.path, (f.rowid - 1) AS chunk_idx, f.content
939 FROM chunks_fts f
940 LEFT JOIN chunks_vec v ON f.path = v.path AND (f.rowid - 1) = v.chunk_idx
941 WHERE v.path IS NULL
942 ORDER BY CASE
943 WHEN f.path LIKE '%.rs' THEN 0
944 WHEN f.path LIKE '%.toml' THEN 1
945 WHEN f.path LIKE '%.json' THEN 2
946 ELSE 3
947 END, f.path
948 LIMIT 20",
949 )
950 .unwrap();
951 stmt.query_map([], |r| {
952 Ok((
953 r.get::<_, String>(0)?,
954 r.get::<_, i64>(1)?,
955 r.get::<_, String>(2)?,
956 ))
957 })
958 .unwrap()
959 .filter_map(|r| r.ok())
960 .collect()
961 };
962
963 for (path, idx, content) in missing {
964 if let Some(vec) = embed_text_blocking(&content, &self.base_url) {
965 let blob = floats_to_blob(&vec);
966 let db = self.db.lock().unwrap();
967 let _ = db.execute(
968 "INSERT OR REPLACE INTO chunks_vec (path, chunk_idx, embedding) VALUES (?1, ?2, ?3)",
969 params![path, idx, blob],
970 );
971 } else {
972 break;
974 }
975 }
976 }
977
978 pub fn file_count(&self) -> usize {
981 let db = self.db.lock().unwrap();
982 db.query_row(
983 "SELECT COUNT(*) FROM chunks_meta WHERE path NOT LIKE 'session/%'",
984 [],
985 |r| r.get::<_, i64>(0),
986 )
987 .unwrap_or(0) as usize
988 }
989
990 pub fn embedded_chunk_count(&self) -> usize {
993 let db = self.db.lock().unwrap();
994 db.query_row(
995 "SELECT COUNT(*) FROM chunks_vec WHERE path NOT LIKE 'session/%'",
996 [],
997 |r| r.get::<_, i64>(0),
998 )
999 .unwrap_or(0) as usize
1000 }
1001
1002 pub fn has_any_embeddings(&self) -> bool {
1004 let db = self.db.lock().unwrap();
1005 db.query_row("SELECT EXISTS(SELECT 1 FROM chunks_vec LIMIT 1)", [], |r| {
1006 r.get::<_, i64>(0)
1007 })
1008 .unwrap_or(0)
1009 != 0
1010 }
1011
1012 pub fn reset(&self) {
1015 let db = self.db.lock().unwrap();
1016 let _ = db.execute_batch(
1017 "DELETE FROM chunks_fts;
1018 DELETE FROM chunks_vec;
1019 DELETE FROM chunks_meta;",
1020 );
1021 }
1022
1023 pub fn inspect_snapshot(&self, hot_limit: usize) -> VeinInspectionSnapshot {
1026 let db = self.db.lock().unwrap();
1027 let indexed_source_files = db
1028 .query_row(
1029 "SELECT COUNT(*) FROM chunks_meta
1030 WHERE path NOT LIKE 'session/%'
1031 AND path NOT LIKE '.hematite/docs/%'",
1032 [],
1033 |r| r.get::<_, i64>(0),
1034 )
1035 .unwrap_or(0) as usize;
1036 let indexed_docs = db
1037 .query_row(
1038 "SELECT COUNT(*) FROM chunks_meta WHERE path LIKE '.hematite/docs/%'",
1039 [],
1040 |r| r.get::<_, i64>(0),
1041 )
1042 .unwrap_or(0) as usize;
1043 let indexed_session_exchanges = db
1044 .query_row(
1045 "SELECT COUNT(*) FROM chunks_meta WHERE path LIKE 'session/%'",
1046 [],
1047 |r| r.get::<_, i64>(0),
1048 )
1049 .unwrap_or(0) as usize;
1050 let embedded_source_doc_chunks = db
1051 .query_row(
1052 "SELECT COUNT(*) FROM chunks_vec WHERE path NOT LIKE 'session/%'",
1053 [],
1054 |r| r.get::<_, i64>(0),
1055 )
1056 .unwrap_or(0) as usize;
1057 let has_any_embeddings = db
1058 .query_row("SELECT EXISTS(SELECT 1 FROM chunks_vec LIMIT 1)", [], |r| {
1059 r.get::<_, i64>(0)
1060 })
1061 .unwrap_or(0)
1062 != 0;
1063 drop(db);
1064
1065 let hot_files = self
1066 .hot_files(hot_limit.max(1))
1067 .into_iter()
1068 .map(|(path, heat, last_modified, room)| VeinHotFile {
1069 path,
1070 heat,
1071 last_modified,
1072 room,
1073 })
1074 .collect::<Vec<_>>();
1075
1076 VeinInspectionSnapshot {
1077 indexed_source_files,
1078 indexed_docs,
1079 indexed_session_exchanges,
1080 embedded_source_doc_chunks,
1081 has_any_embeddings,
1082 active_room: self.active_room(),
1083 l1_ready: !hot_files.is_empty(),
1084 hot_files,
1085 }
1086 }
1087
1088 pub fn bump_heat(&self, path: &str) {
1094 if path.is_empty() {
1095 return;
1096 }
1097 let now = std::time::SystemTime::now()
1098 .duration_since(std::time::UNIX_EPOCH)
1099 .unwrap_or_default()
1100 .as_secs() as i64;
1101 let db = self.db.lock().unwrap();
1102 let _ = db.execute(
1103 "INSERT INTO file_heat (path, heat, last_edit) VALUES (?1, 1, ?2)
1104 ON CONFLICT(path) DO UPDATE SET heat = heat + 1, last_edit = ?2",
1105 params![path, now],
1106 );
1107 }
1108
1109 fn hot_files(&self, n: usize) -> Vec<(String, i64, i64, String)> {
1113 let db = self.db.lock().unwrap();
1114 let mut stmt = match db.prepare(
1115 "SELECT fh.path, fh.heat, cm.last_modified, cm.room
1116 FROM file_heat fh
1117 JOIN chunks_meta cm ON cm.path = fh.path
1118 ORDER BY fh.heat DESC, cm.last_modified DESC
1119 LIMIT ?1",
1120 ) {
1121 Ok(s) => s,
1122 Err(_) => return vec![],
1123 };
1124 stmt.query_map(params![n as i64], |row| {
1125 Ok((
1126 row.get::<_, String>(0)?,
1127 row.get::<_, i64>(1)?,
1128 row.get::<_, i64>(2)?,
1129 row.get::<_, String>(3)?,
1130 ))
1131 })
1132 .map(|rows| rows.filter_map(|r| r.ok()).collect())
1133 .unwrap_or_default()
1134 }
1135
1136 pub fn l1_context(&self) -> Option<String> {
1141 let files = self.hot_files(8);
1142 if files.is_empty() {
1143 return None;
1144 }
1145 let now = std::time::SystemTime::now()
1146 .duration_since(std::time::UNIX_EPOCH)
1147 .unwrap_or_default()
1148 .as_secs() as i64;
1149
1150 let mut by_room: std::collections::BTreeMap<String, Vec<(String, i64, i64)>> =
1152 std::collections::BTreeMap::new();
1153 for (path, heat, mtime, room) in &files {
1154 by_room
1155 .entry(room.clone())
1156 .or_default()
1157 .push((path.clone(), *heat, *mtime));
1158 }
1159
1160 let mut out = String::from("# Hot Files (most edited — grouped by subsystem)\n");
1161 for (room, entries) in &by_room {
1162 out.push_str(&format!("[{}]\n", room));
1163 for (path, heat, mtime) in entries {
1164 let age_secs = now - mtime;
1165 let age = if age_secs < 3600 {
1166 "just now".to_string()
1167 } else if age_secs < 86400 {
1168 format!("{}h ago", age_secs / 3600)
1169 } else {
1170 format!("{}d ago", age_secs / 86400)
1171 };
1172 out.push_str(&format!(
1173 " - {} [{} edit{}, {}]\n",
1174 path,
1175 heat,
1176 if *heat == 1 { "" } else { "s" },
1177 age
1178 ));
1179 }
1180 }
1181 Some(out)
1182 }
1183
1184 fn prune_indexed_prefix(&self, prefix: &str, desired_paths: &HashSet<String>) {
1185 let pattern = format!("{}%", prefix);
1186 let existing_paths: Vec<String> = {
1187 let db = self.db.lock().unwrap();
1188 let mut stmt = match db.prepare("SELECT path FROM chunks_meta WHERE path LIKE ?1") {
1189 Ok(stmt) => stmt,
1190 Err(_) => return,
1191 };
1192 stmt.query_map(params![pattern], |row| row.get::<_, String>(0))
1193 .map(|rows| rows.filter_map(|row| row.ok()).collect())
1194 .unwrap_or_default()
1195 };
1196
1197 if existing_paths.is_empty() {
1198 return;
1199 }
1200
1201 let db = self.db.lock().unwrap();
1202 for path in existing_paths {
1203 if desired_paths.contains(&path) {
1204 continue;
1205 }
1206 let _ = db.execute("DELETE FROM chunks_fts WHERE path = ?1", params![path]);
1207 let _ = db.execute("DELETE FROM chunks_vec WHERE path = ?1", params![path]);
1208 let _ = db.execute("DELETE FROM chunks_meta WHERE path = ?1", params![path]);
1209 }
1210 }
1211}
1212
1213impl QuerySignals {
1214 fn from_query(query: &str) -> Self {
1215 let lower = query.to_ascii_lowercase();
1216 let historical_memory_hint = [
1217 "remember",
1218 "earlier",
1219 "previous",
1220 "last time",
1221 "what did we decide",
1222 "why did we decide",
1223 "what did we say",
1224 "why did we change",
1225 ]
1226 .iter()
1227 .any(|needle| lower.contains(needle));
1228
1229 Self {
1230 exact_phrases: extract_exact_phrases(query),
1231 standout_terms: extract_standout_terms(query),
1232 historical_memory_hint,
1233 temporal_reference: extract_temporal_reference(query),
1234 }
1235 }
1236}
1237
1238fn merge_scored_result(
1239 merged_by_path: &mut HashMap<String, SearchResult>,
1240 candidate: SearchResult,
1241) {
1242 match merged_by_path.get_mut(&candidate.path) {
1243 Some(existing) if candidate.score > existing.score => *existing = candidate,
1244 Some(_) => {}
1245 None => {
1246 merged_by_path.insert(candidate.path.clone(), candidate);
1247 }
1248 }
1249}
1250
1251fn reranked_score(
1252 signals: &QuerySignals,
1253 active_room: Option<&str>,
1254 result: &SearchResult,
1255 is_semantic: bool,
1256) -> f32 {
1257 let base = if is_semantic {
1258 1.0 + result.score.clamp(0.0, 1.0)
1259 } else {
1260 (result.score / 10.0).clamp(0.0, 1.0)
1261 };
1262 base + room_bias(active_room, result)
1263 + retrieval_signal_boost(signals, result)
1264 + temporal_memory_boost(signals, result)
1265}
1266
1267fn room_bias(active_room: Option<&str>, result: &SearchResult) -> f32 {
1268 if active_room == Some(result.room.as_str()) {
1269 0.15
1270 } else {
1271 0.0
1272 }
1273}
1274
1275fn retrieval_signal_boost(signals: &QuerySignals, result: &SearchResult) -> f32 {
1276 let mut boost = 0.0f32;
1277 let haystack = format!(
1278 "{}\n{}",
1279 result.path.to_ascii_lowercase(),
1280 result.content.to_ascii_lowercase()
1281 );
1282
1283 let phrase_matches = signals
1284 .exact_phrases
1285 .iter()
1286 .filter(|phrase| haystack.contains(phrase.as_str()))
1287 .count();
1288 if phrase_matches > 0 {
1289 boost += 0.35 + ((phrase_matches.saturating_sub(1)) as f32 * 0.1);
1290 }
1291
1292 let standout_matches = signals
1293 .standout_terms
1294 .iter()
1295 .filter(|term| haystack.contains(term.as_str()))
1296 .count()
1297 .min(3);
1298 boost += standout_matches as f32 * 0.12;
1299
1300 if signals.historical_memory_hint && result.room == "session" {
1301 boost += 0.1;
1302 }
1303
1304 boost
1305}
1306
1307fn temporal_memory_boost(signals: &QuerySignals, result: &SearchResult) -> f32 {
1308 if result.room != "session" {
1309 return 0.0;
1310 }
1311 let Some(reference) = signals.temporal_reference else {
1312 return 0.0;
1313 };
1314 let Some(memory_ts) = session_memory_timestamp(result) else {
1315 return 0.0;
1316 };
1317
1318 let span = reference.window_secs.max(86_400);
1319 let full_fade = span.saturating_mul(8);
1320 if full_fade <= 0 {
1321 return 0.0;
1322 }
1323
1324 let distance = (memory_ts - reference.target_ts).abs();
1325 let closeness = 1.0 - (distance as f32 / full_fade as f32).min(1.0);
1326 if closeness <= 0.0 {
1327 0.0
1328 } else {
1329 0.22 * closeness
1330 }
1331}
1332
/// Extracts quoted phrases from `query`.
///
/// A phrase is the text between a pair of identical quote characters (`"`, `'` or
/// a back-tick). Each phrase is trimmed and ASCII-lower-cased; phrases shorter than
/// three bytes and duplicates are dropped. Order of first appearance is kept.
///
/// Two inputs are deliberately *not* treated as phrases:
/// * an opening quote with no matching closing quote, and
/// * a `'` that directly follows a letter or digit, i.e. an apostrophe as in
///   `what's` or `don't`.
fn extract_exact_phrases(query: &str) -> Vec<String> {
    let chars: Vec<char> = query.chars().collect();
    let mut phrases: Vec<String> = Vec::new();
    let mut i = 0usize;

    while i < chars.len() {
        let quote = chars[i];
        let is_quote = matches!(quote, '"' | '\'' | '`');
        let is_apostrophe = quote == '\'' && i > 0 && chars[i - 1].is_alphanumeric();
        if !is_quote || is_apostrophe {
            i += 1;
            continue;
        }

        let start = i + 1;
        // Without a closing quote this character is just punctuation; move on.
        let Some(len) = chars[start..].iter().position(|&c| c == quote) else {
            i += 1;
            continue;
        };
        let end = start + len;

        let phrase = chars[start..end]
            .iter()
            .collect::<String>()
            .trim()
            .to_ascii_lowercase();
        if phrase.len() >= 3 && !phrases.contains(&phrase) {
            phrases.push(phrase);
        }
        i = end + 1;
    }

    phrases
}
1364
/// Extracts distinctive tokens from a query (identifiers, paths, versions,
/// long words) for lexical boosting.
///
/// The query is split on every character that is not ASCII alphanumeric or
/// one of `_ - . / :`. Trailing `.` and `:` are then stripped from each token
/// so sentence punctuation glued to a term (`vein.rs.`, `when:`) neither
/// prevents a match nor sneaks a stopword past the filter. A token is kept
/// (ASCII-lowercased, deduplicated, in order of appearance) when it is at
/// least four bytes long, is not a stopword, and looks "interesting": it
/// contains a digit, an uppercase letter, one of the joiner characters, or is
/// at least nine bytes long.
fn extract_standout_terms(query: &str) -> Vec<String> {
    const STOPWORDS: &[&str] = &[
        "about", "after", "before", "change", "changed", "decide", "decided", "does", "earlier",
        "flow", "from", "have", "into", "just", "last", "local", "make", "more", "remember",
        "should", "that", "their", "there", "these", "they", "this", "those", "what", "when",
        "where", "which", "why", "with", "work",
    ];

    // Characters allowed inside a token besides ASCII alphanumerics.
    fn is_joiner(ch: char) -> bool {
        matches!(ch, '_' | '-' | '.' | '/' | ':')
    }

    let mut standout: Vec<String> = Vec::new();
    for token in query.split(|ch: char| !(ch.is_ascii_alphanumeric() || is_joiner(ch))) {
        // Drop sentence punctuation that happens to be a joiner character.
        let trimmed = token.trim_end_matches(|ch: char| matches!(ch, '.' | ':'));
        if trimmed.len() < 4 {
            continue;
        }
        let lower = trimmed.to_ascii_lowercase();
        if STOPWORDS.contains(&lower.as_str()) {
            continue;
        }

        let interesting = trimmed.len() >= 9
            || trimmed
                .chars()
                .any(|ch| ch.is_ascii_digit() || ch.is_ascii_uppercase() || is_joiner(ch));

        if interesting && !standout.contains(&lower) {
            standout.push(lower);
        }
    }

    standout
}
1400
1401fn extract_temporal_reference(query: &str) -> Option<TemporalReference> {
1402 if let Some(ts) = extract_iso_date_from_query(query) {
1403 return Some(TemporalReference {
1404 target_ts: ts,
1405 window_secs: 86_400,
1406 });
1407 }
1408
1409 let now = current_unix_timestamp();
1410 let lower = query.to_ascii_lowercase();
1411 if lower.contains("yesterday") {
1412 Some(TemporalReference {
1413 target_ts: now.saturating_sub(86_400),
1414 window_secs: 86_400,
1415 })
1416 } else if lower.contains("today") || lower.contains("earlier today") {
1417 Some(TemporalReference {
1418 target_ts: now,
1419 window_secs: 86_400,
1420 })
1421 } else if lower.contains("last week") {
1422 Some(TemporalReference {
1423 target_ts: now.saturating_sub(7 * 86_400),
1424 window_secs: 7 * 86_400,
1425 })
1426 } else if lower.contains("last month") {
1427 Some(TemporalReference {
1428 target_ts: now.saturating_sub(30 * 86_400),
1429 window_secs: 30 * 86_400,
1430 })
1431 } else {
1432 None
1433 }
1434}
1435
1436fn extract_iso_date_from_query(query: &str) -> Option<i64> {
1437 query
1438 .split(|ch: char| !(ch.is_ascii_digit() || ch == '-'))
1439 .find_map(parse_iso_date_token)
1440}
1441
/// Parses a strict `YYYY-MM-DD` token into seconds since the Unix epoch
/// (midnight of that day, no time-zone adjustment).
///
/// Returns `None` unless the token is exactly ten bytes long, has `-` at
/// positions 4 and 7, ASCII digits everywhere else, and names a real calendar
/// day (month 1–12, day valid for that month, leap years honoured).
///
/// The digit check matters because `str::parse` accepts a leading sign, so
/// `-123-01-01` or `2024-+1-+1` would otherwise be read as dates. The
/// per-month day check stops `2023-02-30` from silently turning into a day in
/// March.
fn parse_iso_date_token(token: &str) -> Option<i64> {
    let bytes = token.as_bytes();
    if bytes.len() != 10 || bytes[4] != b'-' || bytes[7] != b'-' {
        return None;
    }
    let digits_only = bytes
        .iter()
        .enumerate()
        .all(|(idx, byte)| matches!(idx, 4 | 7) || byte.is_ascii_digit());
    if !digits_only {
        return None;
    }

    // The token is pure ASCII at this point, so byte-range slicing is safe.
    let year = token[0..4].parse::<i32>().ok()?;
    let month = token[5..7].parse::<u32>().ok()?;
    let day = token[8..10].parse::<u32>().ok()?;

    let is_leap_year = year % 4 == 0 && (year % 100 != 0 || year % 400 == 0);
    let days_in_month = match month {
        1 | 3 | 5 | 7 | 8 | 10 | 12 => 31,
        4 | 6 | 9 | 11 => 30,
        2 if is_leap_year => 29,
        2 => 28,
        _ => return None,
    };
    if !(1..=days_in_month).contains(&day) {
        return None;
    }

    Some(days_from_civil(year, month, day).saturating_mul(86_400))
}

/// Number of days between 1970-01-01 and the given proleptic Gregorian date
/// (negative for earlier dates).
///
/// The year is shifted so that it starts in March, which puts the leap day at
/// the end of the shifted year; 400-year eras (146 097 days each) are then
/// counted, followed by the day offset within the era. `month` is expected to
/// be 1–12 and `day` 1–31; the inputs are not validated here.
fn days_from_civil(year: i32, month: u32, day: u32) -> i64 {
    // January and February belong to the previous March-based year.
    let year = year - if month <= 2 { 1 } else { 0 };
    // Floor division by 400 that also works for negative years.
    let era = (if year >= 0 { year } else { year - 399 }) / 400;
    let yoe = year - era * 400; // year of era, 0..=399
    let month_prime = month as i32 + if month > 2 { -3 } else { 9 }; // March = 0
    let doy = (153 * month_prime + 2) / 5 + day as i32 - 1; // day of shifted year
    let doe = yoe * 365 + yoe / 4 - yoe / 100 + doy; // day of era
    // 719_468 is the day count from 0000-03-01 to 1970-01-01.
    era as i64 * 146_097 + doe as i64 - 719_468
}
1470
/// Current wall-clock time as whole seconds since the Unix epoch.
///
/// Falls back to `0` if the system clock reports a time before the epoch.
fn current_unix_timestamp() -> i64 {
    use std::time::{SystemTime, UNIX_EPOCH};

    match SystemTime::now().duration_since(UNIX_EPOCH) {
        Ok(elapsed) => elapsed.as_secs() as i64,
        Err(_) => 0,
    }
}
1477
1478fn session_memory_timestamp(result: &SearchResult) -> Option<i64> {
1479 extract_session_path_timestamp(&result.path).or_else(|| {
1480 if result.last_modified > 0 {
1481 Some(result.last_modified)
1482 } else {
1483 None
1484 }
1485 })
1486}
1487
1488fn extract_session_path_timestamp(path: &str) -> Option<i64> {
1489 let normalized = path.replace('\\', "/");
1490 let mut parts = normalized.split('/');
1491 if parts.next()? != "session" {
1492 return None;
1493 }
1494 parse_iso_date_token(parts.next()?)
1495}
1496
1497fn session_speaker_kind(speaker: &str) -> SessionSpeakerKind {
1498 let normalized = speaker.trim().to_ascii_lowercase();
1499 match normalized.as_str() {
1500 "you" | "user" => SessionSpeakerKind::User,
1501 "" | "system" | "tool" => SessionSpeakerKind::Ignore,
1502 _ => SessionSpeakerKind::Assistant,
1503 }
1504}
1505
1506fn load_session_exchanges(report_path: &Path, last_modified: i64) -> Vec<SessionExchange> {
1507 let Ok(raw) = std::fs::read_to_string(report_path) else {
1508 return Vec::new();
1509 };
1510 let Ok(report) = serde_json::from_str::<SessionReport>(&raw) else {
1511 return Vec::new();
1512 };
1513
1514 let session_key = report_path
1515 .file_stem()
1516 .and_then(|stem| stem.to_str())
1517 .and_then(|stem| stem.strip_prefix("session_").or(Some(stem)))
1518 .unwrap_or("unknown-session")
1519 .to_string();
1520 let session_date = report
1521 .session_start
1522 .split('_')
1523 .next()
1524 .filter(|date| !date.is_empty())
1525 .unwrap_or_else(|| session_key.split('_').next().unwrap_or("unknown-date"))
1526 .to_string();
1527
1528 let mut exchanges = Vec::new();
1529 let mut pending_user: Option<String> = None;
1530 let mut turn_index = 0usize;
1531
1532 for entry in report.transcript {
1533 match session_speaker_kind(&entry.speaker) {
1534 SessionSpeakerKind::User => {
1535 let text = entry.text.trim();
1536 if !text.is_empty() {
1537 pending_user = Some(text.to_string());
1538 }
1539 }
1540 SessionSpeakerKind::Assistant => {
1541 let text = entry.text.trim();
1542 if text.is_empty() {
1543 continue;
1544 }
1545 let Some(user_text) = pending_user.take() else {
1546 continue;
1547 };
1548 turn_index += 1;
1549 exchanges.push(SessionExchange {
1550 path: format!(
1551 "session/{}/{}/turn-{}",
1552 session_date, session_key, turn_index
1553 ),
1554 last_modified,
1555 content: format!(
1556 "Earlier session exchange\nUser:\n{}\n\nAssistant:\n{}",
1557 user_text, text
1558 ),
1559 });
1560 }
1561 SessionSpeakerKind::Ignore => {}
1562 }
1563 }
1564
1565 if exchanges.len() > Vein::SESSION_TURN_LIMIT {
1566 let keep_from = exchanges.len() - Vein::SESSION_TURN_LIMIT;
1567 exchanges = exchanges.into_iter().skip(keep_from).collect();
1568 }
1569
1570 exchanges
1571}
1572
1573fn load_imported_session_exchanges(
1574 import_path: &Path,
1575 imports_root: &Path,
1576 last_modified: i64,
1577) -> Vec<SessionExchange> {
1578 let Ok(raw) = std::fs::read_to_string(import_path) else {
1579 return Vec::new();
1580 };
1581
1582 let messages = normalize_import_messages(&raw, import_path);
1583 if messages.is_empty() {
1584 return Vec::new();
1585 }
1586
1587 let rel = import_path
1588 .strip_prefix(imports_root)
1589 .unwrap_or(import_path);
1590 let rel_slug = slugify_import_path(rel);
1591 let mut exchanges = Vec::new();
1592 let mut pending_user: Option<String> = None;
1593 let mut turn_index = 0usize;
1594
1595 for (role, text) in messages {
1596 let cleaned = text.trim();
1597 if cleaned.is_empty() {
1598 continue;
1599 }
1600 match role.as_str() {
1601 "user" => pending_user = Some(cleaned.to_string()),
1602 "assistant" => {
1603 let Some(user_text) = pending_user.take() else {
1604 continue;
1605 };
1606 turn_index += 1;
1607 exchanges.push(SessionExchange {
1608 path: format!("session/imports/{}/turn-{}", rel_slug, turn_index),
1609 last_modified,
1610 content: format!(
1611 "Imported session exchange\nSource: .hematite/imports/{}\n\nUser:\n{}\n\nAssistant:\n{}",
1612 rel.to_string_lossy().replace('\\', "/"),
1613 user_text,
1614 cleaned
1615 ),
1616 });
1617 }
1618 _ => {}
1619 }
1620 }
1621
1622 if exchanges.len() > Vein::SESSION_TURN_LIMIT {
1623 let keep_from = exchanges.len() - Vein::SESSION_TURN_LIMIT;
1624 exchanges = exchanges.into_iter().skip(keep_from).collect();
1625 }
1626
1627 exchanges
1628}
1629
1630fn normalize_import_messages(raw: &str, import_path: &Path) -> Vec<(String, String)> {
1631 if raw.trim().is_empty() {
1632 return Vec::new();
1633 }
1634
1635 if let Some(messages) = parse_marker_transcript(raw) {
1636 return messages;
1637 }
1638
1639 let ext = import_path
1640 .extension()
1641 .and_then(|ext| ext.to_str())
1642 .unwrap_or("")
1643 .to_ascii_lowercase();
1644
1645 if matches!(ext.as_str(), "json" | "jsonl")
1646 || matches!(raw.trim().chars().next(), Some('{') | Some('['))
1647 {
1648 if let Some(messages) = parse_jsonl_messages(raw) {
1649 if !messages.is_empty() {
1650 return messages;
1651 }
1652 }
1653
1654 if let Ok(value) = serde_json::from_str::<Value>(raw) {
1655 if let Some(messages) = parse_session_report_messages(&value) {
1656 return messages;
1657 }
1658 if let Some(messages) = parse_simple_role_messages(&value) {
1659 return messages;
1660 }
1661 if let Some(messages) = parse_chatgpt_mapping_messages(&value) {
1662 return messages;
1663 }
1664 }
1665 }
1666
1667 Vec::new()
1668}
1669
/// Parses a plain-text transcript in which user prompts are lines starting
/// with `> ` (leading whitespace allowed).
///
/// Returns `None` unless at least two prompt lines exist. Text before the
/// first prompt is ignored. Each prompt becomes a `"user"` message; the
/// non-blank lines up to the next prompt (trimmed, `---` separators skipped)
/// are joined with newlines into one `"assistant"` message, which is omitted
/// when no such lines exist.
fn parse_marker_transcript(raw: &str) -> Option<Vec<(String, String)>> {
    // Text after the "> " marker, if the line is a prompt line.
    fn prompt_text(line: &str) -> Option<&str> {
        line.trim_start().strip_prefix("> ")
    }

    // Emits the collected reply lines as one assistant message, if any.
    fn flush_reply(messages: &mut Vec<(String, String)>, reply: Option<Vec<&str>>) {
        if let Some(lines) = reply {
            if !lines.is_empty() {
                messages.push(("assistant".to_string(), lines.join("\n")));
            }
        }
    }

    let prompt_count = raw
        .lines()
        .filter(|line| prompt_text(line).is_some())
        .count();
    if prompt_count < 2 {
        return None;
    }

    let mut messages: Vec<(String, String)> = Vec::new();
    // `None` until the first prompt has been seen.
    let mut reply: Option<Vec<&str>> = None;

    for line in raw.lines() {
        match prompt_text(line) {
            Some(prompt) => {
                flush_reply(&mut messages, reply.take());
                messages.push(("user".to_string(), prompt.trim().to_string()));
                reply = Some(Vec::new());
            }
            None => {
                if let Some(buffer) = reply.as_mut() {
                    let text = line.trim();
                    if !text.is_empty() && text != "---" {
                        buffer.push(text);
                    }
                }
            }
        }
    }
    flush_reply(&mut messages, reply.take());

    (!messages.is_empty()).then_some(messages)
}
1710
1711fn parse_jsonl_messages(raw: &str) -> Option<Vec<(String, String)>> {
1712 let mut messages = Vec::new();
1713 let mut has_codex_session_meta = false;
1714 let mut saw_jsonl = false;
1715
1716 for line in raw.lines() {
1717 let trimmed = line.trim();
1718 if trimmed.is_empty() {
1719 continue;
1720 }
1721 let Ok(value) = serde_json::from_str::<Value>(trimmed) else {
1722 continue;
1723 };
1724 saw_jsonl = true;
1725 let Some(object) = value.as_object() else {
1726 continue;
1727 };
1728
1729 match object.get("type").and_then(|v| v.as_str()).unwrap_or("") {
1730 "session_meta" => {
1731 has_codex_session_meta = true;
1732 }
1733 "event_msg" => {
1734 let Some(payload) = object.get("payload").and_then(|v| v.as_object()) else {
1735 continue;
1736 };
1737 let Some(text) = payload.get("message").and_then(|v| v.as_str()) else {
1738 continue;
1739 };
1740 match payload.get("type").and_then(|v| v.as_str()).unwrap_or("") {
1741 "user_message" => messages.push(("user".to_string(), text.trim().to_string())),
1742 "agent_message" => {
1743 messages.push(("assistant".to_string(), text.trim().to_string()))
1744 }
1745 _ => {}
1746 }
1747 }
1748 "human" | "user" => {
1749 if let Some(text) = extract_text_content(object.get("message").unwrap_or(&value)) {
1750 messages.push(("user".to_string(), text));
1751 }
1752 }
1753 "assistant" => {
1754 if let Some(text) = extract_text_content(object.get("message").unwrap_or(&value)) {
1755 messages.push(("assistant".to_string(), text));
1756 }
1757 }
1758 _ => {
1759 if let Some(role) = object.get("role").and_then(|v| v.as_str()) {
1760 if let Some(text) = extract_text_content(&value) {
1761 match role {
1762 "user" | "human" => messages.push(("user".to_string(), text)),
1763 "assistant" | "ai" => messages.push(("assistant".to_string(), text)),
1764 _ => {}
1765 }
1766 }
1767 }
1768 }
1769 }
1770 }
1771
1772 if !saw_jsonl {
1773 return None;
1774 }
1775
1776 if has_codex_session_meta || !messages.is_empty() {
1777 return Some(messages);
1778 }
1779
1780 None
1781}
1782
1783fn parse_session_report_messages(value: &Value) -> Option<Vec<(String, String)>> {
1784 let report = value.as_object()?;
1785 let transcript = report.get("transcript")?.as_array()?;
1786 let mut messages = Vec::new();
1787
1788 for entry in transcript {
1789 let Some(obj) = entry.as_object() else {
1790 continue;
1791 };
1792 let speaker = obj
1793 .get("speaker")
1794 .and_then(|v| v.as_str())
1795 .unwrap_or_default();
1796 let text = obj
1797 .get("text")
1798 .and_then(|v| v.as_str())
1799 .unwrap_or_default()
1800 .trim()
1801 .to_string();
1802 if text.is_empty() {
1803 continue;
1804 }
1805 match session_speaker_kind(speaker) {
1806 SessionSpeakerKind::User => messages.push(("user".to_string(), text)),
1807 SessionSpeakerKind::Assistant => messages.push(("assistant".to_string(), text)),
1808 SessionSpeakerKind::Ignore => {}
1809 }
1810 }
1811
1812 (!messages.is_empty()).then_some(messages)
1813}
1814
1815fn parse_simple_role_messages(value: &Value) -> Option<Vec<(String, String)>> {
1816 if let Some(array) = value.as_array() {
1817 let messages = collect_role_messages(array);
1818 return (!messages.is_empty()).then_some(messages);
1819 }
1820
1821 let obj = value.as_object()?;
1822 if let Some(messages_value) = obj.get("messages").or_else(|| obj.get("chat_messages")) {
1823 let array = messages_value.as_array()?;
1824 let messages = collect_role_messages(array);
1825 return (!messages.is_empty()).then_some(messages);
1826 }
1827
1828 None
1829}
1830
1831fn collect_role_messages(items: &[Value]) -> Vec<(String, String)> {
1832 let mut messages = Vec::new();
1833 for item in items {
1834 let Some(obj) = item.as_object() else {
1835 continue;
1836 };
1837 let Some(role) = obj.get("role").and_then(|v| v.as_str()) else {
1838 continue;
1839 };
1840 let Some(text) = extract_text_content(item) else {
1841 continue;
1842 };
1843 match role {
1844 "user" | "human" => messages.push(("user".to_string(), text)),
1845 "assistant" | "ai" => messages.push(("assistant".to_string(), text)),
1846 _ => {}
1847 }
1848 }
1849 messages
1850}
1851
1852fn parse_chatgpt_mapping_messages(value: &Value) -> Option<Vec<(String, String)>> {
1853 let mapping = value.get("mapping")?.as_object()?;
1854 let mut current_id = mapping.iter().find_map(|(node_id, node)| {
1855 let obj = node.as_object()?;
1856 (obj.get("parent").is_some_and(|parent| parent.is_null())).then_some(node_id.clone())
1857 })?;
1858
1859 let mut messages = Vec::new();
1860 let mut visited = std::collections::HashSet::new();
1861
1862 while visited.insert(current_id.clone()) {
1863 let Some(node) = mapping.get(¤t_id).and_then(|v| v.as_object()) else {
1864 break;
1865 };
1866
1867 if let Some(message) = node.get("message") {
1868 let role = message
1869 .get("author")
1870 .and_then(|author| author.get("role"))
1871 .and_then(|v| v.as_str())
1872 .unwrap_or("");
1873 if let Some(text) = extract_text_content(message) {
1874 match role {
1875 "user" => messages.push(("user".to_string(), text)),
1876 "assistant" => messages.push(("assistant".to_string(), text)),
1877 _ => {}
1878 }
1879 }
1880 }
1881
1882 let Some(next_id) = node
1883 .get("children")
1884 .and_then(|children| children.as_array())
1885 .and_then(|children| children.first())
1886 .and_then(|child| child.as_str())
1887 else {
1888 break;
1889 };
1890 current_id = next_id.to_string();
1891 }
1892
1893 (!messages.is_empty()).then_some(messages)
1894}
1895
1896fn extract_text_content(value: &Value) -> Option<String> {
1897 if let Some(text) = value.as_str() {
1898 let trimmed = text.trim();
1899 return (!trimmed.is_empty()).then_some(trimmed.to_string());
1900 }
1901
1902 if let Some(array) = value.as_array() {
1903 let joined = array
1904 .iter()
1905 .filter_map(extract_text_content)
1906 .filter(|part| !part.is_empty())
1907 .collect::<Vec<_>>()
1908 .join("\n");
1909 return (!joined.is_empty()).then_some(joined);
1910 }
1911
1912 let obj = value.as_object()?;
1913
1914 if let Some(content) = obj.get("content") {
1915 if let Some(text) = extract_text_content(content) {
1916 return Some(text);
1917 }
1918 }
1919
1920 if let Some(text) = obj.get("text").and_then(|v| v.as_str()) {
1921 let trimmed = text.trim();
1922 if !trimmed.is_empty() {
1923 return Some(trimmed.to_string());
1924 }
1925 }
1926
1927 if let Some(parts) = obj.get("parts").and_then(|v| v.as_array()) {
1928 let joined = parts
1929 .iter()
1930 .filter_map(|part| part.as_str().map(|s| s.trim().to_string()))
1931 .filter(|part| !part.is_empty())
1932 .collect::<Vec<_>>()
1933 .join("\n");
1934 if !joined.is_empty() {
1935 return Some(joined);
1936 }
1937 }
1938
1939 None
1940}
1941
/// Turns a relative import path into a flat identifier.
///
/// Backslashes count as separators. Every character other than ASCII
/// alphanumerics, `-`, `_` and the separator becomes `_`; leading and trailing
/// separators are dropped and each remaining separator becomes `__`.
fn slugify_import_path(path: &Path) -> String {
    let sanitized: String = path
        .to_string_lossy()
        .chars()
        .map(|ch| match ch {
            '\\' | '/' => '/',
            '-' | '_' => ch,
            other if other.is_ascii_alphanumeric() => other,
            _ => '_',
        })
        .collect();

    sanitized
        .trim_matches('/')
        .split('/')
        .collect::<Vec<_>>()
        .join("__")
}
1957
1958fn embed_text_blocking(text: &str, base_url: &str) -> Option<Vec<f32>> {
1974 embed_text_with_prefix(text, "search_document", base_url)
1975}
1976
1977fn embed_query_blocking(text: &str, base_url: &str) -> Option<Vec<f32>> {
1978 embed_text_with_prefix(text, "search_query", base_url)
1979}
1980
1981fn embed_text_with_prefix(text: &str, task: &str, base_url: &str) -> Option<Vec<f32>> {
1982 let prefixed = format!("{}: {}", task, text);
1984 let input = if prefixed.len() > 8000 {
1986 &prefixed[..8000]
1987 } else {
1988 &prefixed
1989 };
1990
1991 let client = reqwest::blocking::Client::builder()
1992 .timeout(std::time::Duration::from_secs(10))
1993 .build()
1994 .ok()?;
1995
1996 let body = serde_json::json!({
1997 "model": "nomic-embed-text-v2",
1998 "input": input
1999 });
2000
2001 let url = format!("{}/v1/embeddings", base_url);
2002 let resp = client.post(&url).json(&body).send().ok()?;
2003
2004 if !resp.status().is_success() {
2005 return None;
2006 }
2007
2008 let json: serde_json::Value = resp.json().ok()?;
2009 let embedding = json["data"][0]["embedding"].as_array()?;
2010 let vec: Vec<f32> = embedding
2011 .iter()
2012 .filter_map(|v| v.as_f64().map(|f| f as f32))
2013 .collect();
2014
2015 if vec.is_empty() {
2016 None
2017 } else {
2018 Some(vec)
2019 }
2020}
2021
/// Cosine similarity of two vectors.
///
/// Returns `0.0` when the vectors are empty, differ in length, or either one
/// has zero magnitude.
fn cosine_similarity(a: &[f32], b: &[f32]) -> f32 {
    if a.is_empty() || a.len() != b.len() {
        return 0.0;
    }

    let magnitude = |v: &[f32]| -> f32 { v.iter().map(|x| x * x).sum::<f32>().sqrt() };
    let norm_a = magnitude(a);
    let norm_b = magnitude(b);
    if norm_a == 0.0 || norm_b == 0.0 {
        return 0.0;
    }

    let dot: f32 = a.iter().zip(b).map(|(x, y)| x * y).sum();
    dot / (norm_a * norm_b)
}
2037
/// Serialises floats as consecutive little-endian 4-byte groups.
fn floats_to_blob(floats: &[f32]) -> Vec<u8> {
    let mut blob = Vec::with_capacity(floats.len() * 4);
    for value in floats {
        blob.extend_from_slice(&value.to_le_bytes());
    }
    blob
}
2041
/// Decodes consecutive little-endian 4-byte groups into floats.
///
/// Trailing bytes that do not fill a whole group are ignored.
fn blob_to_floats(blob: &[u8]) -> Vec<f32> {
    let mut floats = Vec::with_capacity(blob.len() / 4);
    for group in blob.chunks_exact(4) {
        let mut word = [0u8; 4];
        word.copy_from_slice(group);
        floats.push(f32::from_le_bytes(word));
    }
    floats
}
2047
/// Cleans up text extracted from a document.
///
/// Converts `\r\n` and lone `\r` to `\n`, then strips whitespace and NUL
/// characters from both ends. Returns `None` when nothing is left.
fn normalize_extracted_document_text(text: String) -> Option<String> {
    let unified = text.replace("\r\n", "\n").replace('\r', "\n");
    let body = unified.trim_matches(|c: char| c == '\0' || c.is_whitespace());
    (!body.is_empty()).then(|| body.to_string())
}
2062
2063fn extract_pdf_text_with_pdf_extract(path: &std::path::Path) -> Result<Option<String>, String> {
2064 let previous_hook = std::panic::take_hook();
2065 std::panic::set_hook(Box::new(|_| {}));
2066 let result = std::panic::catch_unwind(std::panic::AssertUnwindSafe(|| {
2067 pdf_extract::extract_text(path)
2068 }));
2069 std::panic::set_hook(previous_hook);
2070
2071 match result {
2072 Ok(Ok(text)) => Ok(normalize_extracted_document_text(text)),
2073 Ok(Err(e)) => Err(format!("pdf-extract failed: {}", e)),
2074 Err(payload) => {
2075 let panic_text = if let Some(msg) = payload.downcast_ref::<&str>() {
2076 (*msg).to_string()
2077 } else if let Some(msg) = payload.downcast_ref::<String>() {
2078 msg.clone()
2079 } else {
2080 "unknown parser panic".to_string()
2081 };
2082 Err(format!("pdf-extract panicked: {}", panic_text))
2083 }
2084 }
2085}
2086
2087fn extract_pdf_text_with_lopdf(path: &std::path::Path) -> Result<Option<String>, String> {
2088 let mut doc =
2089 lopdf::Document::load(path).map_err(|e| format!("lopdf could not open PDF: {}", e))?;
2090
2091 if doc.is_encrypted() {
2092 doc.decrypt("")
2093 .map_err(|e| format!("PDF is encrypted and could not be decrypted: {}", e))?;
2094 }
2095
2096 let page_numbers: Vec<u32> = doc.get_pages().keys().copied().collect();
2097 if page_numbers.is_empty() {
2098 return Ok(None);
2099 }
2100
2101 let mut extracted_pages = Vec::new();
2102 let mut page_errors = Vec::new();
2103
2104 for page_number in page_numbers {
2105 match doc.extract_text(&[page_number]) {
2106 Ok(text) => {
2107 if let Some(page_text) = normalize_extracted_document_text(text) {
2108 extracted_pages.push(page_text);
2109 }
2110 }
2111 Err(e) => page_errors.push(format!("page {page_number}: {e}")),
2112 }
2113 }
2114
2115 if !extracted_pages.is_empty() {
2116 return Ok(Some(extracted_pages.join("\n\n")));
2117 }
2118
2119 if !page_errors.is_empty() {
2120 let sample_errors = page_errors
2121 .into_iter()
2122 .take(3)
2123 .collect::<Vec<_>>()
2124 .join("; ");
2125 return Err(format!(
2126 "lopdf could not extract usable page text ({sample_errors})"
2127 ));
2128 }
2129
2130 Ok(None)
2131}
2132
2133fn extract_pdf_text_inside_helper(path: &std::path::Path) -> Result<Option<String>, String> {
2134 let mut failures = Vec::new();
2135
2136 match extract_pdf_text_with_pdf_extract(path) {
2137 Ok(Some(text)) => return Ok(Some(text)),
2138 Ok(None) => failures.push("pdf-extract found no usable text".to_string()),
2139 Err(e) => failures.push(e),
2140 }
2141
2142 match extract_pdf_text_with_lopdf(path) {
2143 Ok(Some(text)) => return Ok(Some(text)),
2144 Ok(None) => failures.push("lopdf found no usable text".to_string()),
2145 Err(e) => failures.push(e),
2146 }
2147
2148 let detail = failures.into_iter().take(2).collect::<Vec<_>>().join("; ");
2149 Err(format!(
2150 "Could not extract text from PDF. Hematite keeps PDF parsing best-effort so it can stay a lightweight single-binary local coding harness. The file may be scanned/image-only, encrypted, or use unsupported font encoding. Try exporting it to text/markdown or attach page images instead. Detail: {}",
2151 detail
2152 ))
2153}
2154
/// Extracts PDF text in a child process by re-running the current executable
/// with `--pdf-extract-helper <path>`.
///
/// The child's stdout is taken as the extracted text and its stderr as the
/// error message.
///
/// Returns `Ok(Some(text))` on success, `Ok(None)` when the helper printed
/// only whitespace, and `Err` when the executable cannot be located, the
/// helper cannot be launched, it exits unsuccessfully, or its output is not
/// valid UTF-8.
fn extract_pdf_text(path: &std::path::Path) -> Result<Option<String>, String> {
    use std::process::{Command, Stdio};

    let exe = match std::env::current_exe() {
        Ok(exe) => exe,
        Err(e) => {
            return Err(format!(
                "Could not locate Hematite executable for PDF helper: {}",
                e
            ))
        }
    };

    let mut helper = Command::new(exe);
    helper
        .arg("--pdf-extract-helper")
        .arg(path)
        .stdin(Stdio::null())
        .stdout(Stdio::piped())
        .stderr(Stdio::piped());
    let output = helper
        .output()
        .map_err(|e| format!("Could not launch PDF helper: {}", e))?;

    if !output.status.success() {
        let stderr = String::from_utf8_lossy(&output.stderr);
        let message = stderr.trim();
        return Err(if message.is_empty() {
            "PDF extraction failed.".to_string()
        } else {
            message.to_string()
        });
    }

    match String::from_utf8(output.stdout) {
        Err(e) => Err(format!("PDF helper returned non-UTF8 text: {}", e)),
        Ok(text) if text.trim().is_empty() => Ok(None),
        Ok(text) => Ok(Some(text)),
    }
}
2184
2185pub fn run_pdf_extract_helper(path: &std::path::Path) -> i32 {
2186 match extract_pdf_text_inside_helper(path) {
2187 Ok(Some(text)) => {
2188 use std::io::Write;
2189 let mut stdout = std::io::stdout();
2190 if stdout.write_all(text.as_bytes()).is_ok() {
2191 0
2192 } else {
2193 let _ = writeln!(
2194 std::io::stderr(),
2195 "PDF helper could not write extracted text."
2196 );
2197 1
2198 }
2199 }
2200 Ok(None) => {
2201 eprintln!(
2202 "Could not extract text from PDF. Hematite keeps PDF parsing best-effort so it can stay a lightweight single-binary local coding harness. The file appears to contain no usable embedded text. Try exporting it to text/markdown or attach page images instead."
2203 );
2204 1
2205 }
2206 Err(e) => {
2207 eprintln!("{}", e);
2208 1
2209 }
2210 }
2211}
2212
2213pub fn extract_document_text(path: &std::path::Path) -> Result<String, String> {
2216 let ext = path
2217 .extension()
2218 .and_then(|e| e.to_str())
2219 .unwrap_or("")
2220 .to_lowercase();
2221 match ext.as_str() {
2222 "pdf" => {
2223 let text = extract_pdf_text(path)?.ok_or_else(|| {
2224 "PDF contains no extractable text — it may be scanned/image-only. \
2225 Try attaching page screenshots with /image instead."
2226 .to_string()
2227 })?;
2228 pdf_quality_check(text)
2229 }
2230 _ => std::fs::read_to_string(path).map_err(|e| format!("Could not read file: {e}")),
2231 }
2232}
2233
/// Rejects PDF text that looks unusable and otherwise returns it unchanged.
///
/// Two heuristics are applied to the trimmed text:
/// * fewer than 150 bytes is treated as a scanned or image-only PDF;
/// * a space-to-character ratio below 4% (newlines and carriage returns
///   excluded from the character count) is treated as garbled output with
///   merged words.
fn pdf_quality_check(text: String) -> Result<String, String> {
    const MIN_TEXT_BYTES: usize = 150;
    const MIN_SPACE_RATIO: f32 = 0.04;

    let body = text.trim();
    if body.len() < MIN_TEXT_BYTES {
        return Err(format!(
            "PDF extracted only {} characters — likely a scanned or image-only PDF, \
             or uses unsupported custom fonts. Try attaching page screenshots with /image instead.",
            body.len()
        ));
    }

    // One pass: count spaces and all characters other than line breaks.
    let mut spaces = 0usize;
    let mut visible = 0usize;
    for ch in body.chars() {
        match ch {
            '\n' | '\r' => {}
            ' ' => {
                spaces += 1;
                visible += 1;
            }
            _ => visible += 1,
        }
    }
    let space_ratio = if visible == 0 {
        0.0
    } else {
        spaces as f32 / visible as f32
    };

    if space_ratio < MIN_SPACE_RATIO {
        return Err(
            "PDF text extraction produced garbled output — words are merged with no spaces. \
             This usually means the PDF uses custom embedded fonts (common with academic publishers \
             like EBSCO, Elsevier, Springer). \
             Try a PDF exported from Word, Google Docs, or LaTeX, \
             or attach page screenshots with /image instead."
                .to_string(),
        );
    }

    Ok(text)
}
2272
2273fn chunk_by_symbols(ext: &str, text: &str) -> Vec<String> {
2277 if ext == "rs" {
2278 chunk_rust_symbols(text)
2279 } else {
2280 chunk_paragraphs(text)
2281 }
2282}
2283
2284fn chunk_rust_symbols(text: &str) -> Vec<String> {
2293 const ITEM_STARTS: &[&str] = &[
2294 "pub fn ",
2295 "pub async fn ",
2296 "pub unsafe fn ",
2297 "async fn ",
2298 "unsafe fn ",
2299 "fn ",
2300 "pub impl",
2301 "impl ",
2302 "pub struct ",
2303 "struct ",
2304 "pub enum ",
2305 "enum ",
2306 "pub trait ",
2307 "trait ",
2308 "pub mod ",
2309 "mod ",
2310 "pub type ",
2311 "type ",
2312 "pub const ",
2313 "const ",
2314 "pub static ",
2315 "static ",
2316 ];
2317
2318 let lines: Vec<&str> = text.lines().collect();
2319 let mut chunks: Vec<String> = Vec::new();
2320 let mut current: Vec<&str> = Vec::new();
2321
2322 for &line in &lines {
2323 let top_level = !line.starts_with(' ') && !line.starts_with('\t');
2324 let is_item = top_level && ITEM_STARTS.iter().any(|s| line.starts_with(s));
2325
2326 if is_item && !current.is_empty() {
2327 let mut split = current.len();
2330 while split > 0 {
2331 let prev = current[split - 1].trim();
2332 if prev.starts_with("///")
2333 || prev.starts_with("//!")
2334 || prev.starts_with("#[")
2335 || prev.is_empty()
2336 {
2337 split -= 1;
2338 } else {
2339 break;
2340 }
2341 }
2342 let body = current[..split].join("\n");
2343 if !body.trim().is_empty() {
2344 chunks.push(body);
2345 }
2346 current = current[split..].to_vec();
2347 }
2348 current.push(line);
2349 }
2350 if !current.is_empty() {
2351 let body = current.join("\n");
2352 if !body.trim().is_empty() {
2353 chunks.push(body);
2354 }
2355 }
2356
2357 let mut result = Vec::new();
2359 for chunk in chunks {
2360 if chunk.len() > 3000 {
2361 result.extend(sliding_window_chunks(&chunk, 2000, 200));
2362 } else {
2363 result.push(chunk);
2364 }
2365 }
2366 result
2367}
2368
2369fn chunk_paragraphs(text: &str) -> Vec<String> {
2371 let mut result: Vec<String> = Vec::new();
2372 let mut current = String::new();
2373
2374 for para in text.split("\n\n") {
2375 if current.len() + para.len() + 2 > 2000 {
2376 if !current.trim().is_empty() {
2377 result.push(current.clone());
2378 }
2379 current = para.to_string();
2380 } else {
2381 if !current.is_empty() {
2382 current.push_str("\n\n");
2383 }
2384 current.push_str(para);
2385 }
2386 }
2387 if !current.trim().is_empty() {
2388 result.push(current);
2389 }
2390
2391 let mut final_result = Vec::new();
2392 for chunk in result {
2393 if chunk.len() > 2000 {
2394 final_result.extend(sliding_window_chunks(&chunk, 2000, 200));
2395 } else {
2396 final_result.push(chunk);
2397 }
2398 }
2399 final_result
2400}
2401
/// Splits `text` into windows of `chunk_size` characters, where each window
/// starts `chunk_size - overlap` characters after the previous one.
///
/// Sizes are counted in `char`s, so multi-byte characters are never split.
/// The last window may be shorter, and empty input yields no windows.
///
/// Degenerate arguments are clamped so the function always terminates: a
/// `chunk_size` of 0 is treated as 1, and when `overlap >= chunk_size` the
/// window advances by one character per step. Without the clamp the start
/// index would never advance, or the subtraction would underflow.
fn sliding_window_chunks(text: &str, chunk_size: usize, overlap: usize) -> Vec<String> {
    let chars: Vec<char> = text.chars().collect();
    let window = chunk_size.max(1);
    let step = window.saturating_sub(overlap).max(1);

    let mut result = Vec::new();
    let mut start = 0usize;
    while start < chars.len() {
        let end = start.saturating_add(window).min(chars.len());
        result.push(chars[start..end].iter().collect());
        if end == chars.len() {
            break;
        }
        start += step;
    }
    result
}