1use crate::collections::{find_context_for_path, list_collections as yaml_list_collections};
7use crate::config::{EXCLUDE_DIRS, get_default_db_path};
8use crate::error::{QmdError, Result};
9use rusqlite::{Connection, OptionalExtension, params};
10use sha2::{Digest, Sha256};
11use std::fs;
12use std::path::{Path, PathBuf};
13
/// Returns `path` with every backslash replaced by a forward slash.
///
/// All other characters are copied through unchanged.
#[must_use]
pub fn normalize_path_separators(path: &str) -> String {
    path.chars()
        .map(|ch| if ch == '\\' { '/' } else { ch })
        .collect()
}
20
21#[must_use]
24pub fn convert_git_bash_path(path: &str) -> String {
25 let normalized = normalize_path_separators(path);
26
27 if normalized.len() >= 3
29 && normalized.starts_with('/')
30 && normalized
31 .chars()
32 .nth(1)
33 .map_or(false, |c| c.is_ascii_alphabetic())
34 && normalized.chars().nth(2) == Some('/')
35 {
36 let drive_letter = normalized.chars().nth(1).unwrap().to_ascii_uppercase();
37 return format!("{}:{}", drive_letter, &normalized[2..]);
38 }
39
40 normalized
41}
42
43#[must_use]
46pub fn normalize_filesystem_path(path: &str) -> String {
47 let converted = convert_git_bash_path(path);
48 normalize_path_separators(&converted)
49}
50
/// Reports whether `path` looks like an absolute path.
///
/// A path is considered absolute when it either
/// * starts with a separator (`/` or `\`), or
/// * starts with an ASCII drive letter followed by `:` and a separator
///   (for example `C:/` or `C:\`).
///
/// Both separator styles are accepted directly, so no normalized copy of the
/// input has to be allocated.
#[must_use]
pub fn is_absolute_path(path: &str) -> bool {
    let is_separator = |byte: u8| byte == b'/' || byte == b'\\';

    match path.as_bytes() {
        [first, ..] if is_separator(*first) => true,
        // Every byte tested here is ASCII, so byte positions equal char positions.
        [drive, b':', sep, ..] => drive.is_ascii_alphabetic() && is_separator(*sep),
        _ => false,
    }
}
74
/// A single indexed document as returned by lookups and searches in this module.
#[derive(Debug, Clone)]
pub struct DocumentResult {
    /// Virtual path; the constructors in this file build it as `qmd://{collection}/{path}`.
    pub filepath: String,
    /// Path for display. `Store` queries fill it with `{collection}/{path}`;
    /// `match_files_by_glob` fills it with the `qmd://` form instead.
    pub display_path: String,
    /// Title stored in the `documents` table.
    pub title: String,
    /// Optional context looked up via `find_context_for_path`.
    pub context: Option<String>,
    /// Content hash stored for the document.
    pub hash: String,
    /// Short identifier derived from `hash` by `Store::get_docid` (its first six characters).
    pub docid: String,
    /// Name of the collection the document belongs to.
    pub collection_name: String,
    /// Document path within its collection.
    pub path: String,
    /// `modified_at` value stored for the document.
    pub modified_at: String,
    /// Body length as reported by SQL `LENGTH(doc)`.
    pub body_length: usize,
    /// Document body; `Store::get_document` sets it, the search and glob helpers leave it `None`.
    pub body: Option<String>,
}
101
/// One scored hit produced by `Store::search_fts` or `Store::search_vec`.
#[derive(Debug, Clone)]
pub struct SearchResult {
    /// The matched document.
    pub doc: DocumentResult,
    /// Relevance score. `search_fts` stores the negated `bm25` value;
    /// `search_vec` stores the cosine similarity.
    pub score: f64,
    /// Which search backend produced this hit.
    pub source: SearchSource,
    /// Chunk position; `search_fts` leaves it `None`, `search_vec` sets `Some(0)`.
    pub chunk_pos: Option<usize>,
}
114
/// Identifies the search backend that produced a [`SearchResult`].
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum SearchSource {
    /// Full-text search (set by `Store::search_fts`).
    Fts,
    /// Vector similarity search (set by `Store::search_vec`).
    Vec,
}
123
/// Summary of one configured collection, as assembled by `Store::list_collections`.
#[derive(Debug, Clone)]
pub struct CollectionInfo {
    /// Collection name.
    pub name: String,
    /// Taken from the configured collection's `path` field.
    pub pwd: String,
    /// Taken from the configured collection's `pattern` field.
    pub glob_pattern: String,
    /// Number of active documents stored for this collection.
    pub active_count: usize,
    /// Largest `modified_at` value among the collection's active documents, if any.
    pub last_modified: Option<String>,
}
138
/// Overall index statistics returned by `Store::get_status`.
#[derive(Debug, Clone)]
pub struct IndexStatus {
    /// Number of active documents.
    pub total_documents: usize,
    /// Number of distinct active content hashes without a `content_vectors` row at `seq = 0`.
    pub needs_embedding: usize,
    /// Whether a table named `vectors_vec` exists in the database.
    pub has_vector_index: bool,
    /// Per-collection summaries.
    pub collections: Vec<CollectionInfo>,
}
151
/// Handle to the SQLite database that backs the document index.
#[derive(Debug)]
pub struct Store {
    /// Open SQLite connection used by every query in this type.
    conn: Connection,
    /// Path the connection was opened with.
    db_path: PathBuf,
}
160
161impl Store {
162 pub fn new() -> Result<Self> {
164 let db_path = get_default_db_path("index")
165 .ok_or_else(|| QmdError::Config("Could not determine database path".to_string()))?;
166 Self::open(&db_path)
167 }
168
169 pub fn open(db_path: &Path) -> Result<Self> {
171 if let Some(parent) = db_path.parent() {
173 fs::create_dir_all(parent)?;
174 }
175
176 let conn = Connection::open(db_path)?;
177 let mut store = Self {
178 conn,
179 db_path: db_path.to_path_buf(),
180 };
181 store.initialize()?;
182 Ok(store)
183 }
184
185 #[must_use]
187 pub fn db_path(&self) -> &Path {
188 &self.db_path
189 }
190
191 fn initialize(&mut self) -> Result<()> {
193 self.conn.execute_batch(
194 r"
195 PRAGMA journal_mode = WAL;
196 PRAGMA foreign_keys = ON;
197
198 -- Content-addressable storage
199 CREATE TABLE IF NOT EXISTS content (
200 hash TEXT PRIMARY KEY,
201 doc TEXT NOT NULL,
202 created_at TEXT NOT NULL
203 );
204
205 -- Documents table
206 CREATE TABLE IF NOT EXISTS documents (
207 id INTEGER PRIMARY KEY AUTOINCREMENT,
208 collection TEXT NOT NULL,
209 path TEXT NOT NULL,
210 title TEXT NOT NULL,
211 hash TEXT NOT NULL,
212 created_at TEXT NOT NULL,
213 modified_at TEXT NOT NULL,
214 active INTEGER NOT NULL DEFAULT 1,
215 FOREIGN KEY (hash) REFERENCES content(hash) ON DELETE CASCADE,
216 UNIQUE(collection, path)
217 );
218
219 CREATE INDEX IF NOT EXISTS idx_documents_collection ON documents(collection, active);
220 CREATE INDEX IF NOT EXISTS idx_documents_hash ON documents(hash);
221 CREATE INDEX IF NOT EXISTS idx_documents_path ON documents(path, active);
222
223 -- FTS index
224 CREATE VIRTUAL TABLE IF NOT EXISTS documents_fts USING fts5(
225 filepath, title, body,
226 tokenize='porter unicode61'
227 );
228
229 -- LLM cache
230 CREATE TABLE IF NOT EXISTS llm_cache (
231 hash TEXT PRIMARY KEY,
232 result TEXT NOT NULL,
233 created_at TEXT NOT NULL
234 );
235
236 -- Content vectors metadata
237 CREATE TABLE IF NOT EXISTS content_vectors (
238 hash TEXT NOT NULL,
239 seq INTEGER NOT NULL DEFAULT 0,
240 pos INTEGER NOT NULL DEFAULT 0,
241 model TEXT NOT NULL,
242 embedded_at TEXT NOT NULL,
243 PRIMARY KEY (hash, seq)
244 );
245 ",
246 )?;
247
248 self.create_fts_triggers()?;
250
251 Ok(())
252 }
253
254 fn create_fts_triggers(&self) -> Result<()> {
256 let trigger_exists: bool = self
258 .conn
259 .query_row(
260 "SELECT 1 FROM sqlite_master WHERE type='trigger' AND name='documents_ai'",
261 [],
262 |_| Ok(true),
263 )
264 .unwrap_or(false);
265
266 if !trigger_exists {
267 self.conn.execute_batch(
268 r"
269 CREATE TRIGGER IF NOT EXISTS documents_ai AFTER INSERT ON documents
270 WHEN new.active = 1
271 BEGIN
272 INSERT INTO documents_fts(rowid, filepath, title, body)
273 SELECT
274 new.id,
275 new.collection || '/' || new.path,
276 new.title,
277 (SELECT doc FROM content WHERE hash = new.hash)
278 WHERE new.active = 1;
279 END;
280
281 CREATE TRIGGER IF NOT EXISTS documents_ad AFTER DELETE ON documents BEGIN
282 DELETE FROM documents_fts WHERE rowid = old.id;
283 END;
284
285 CREATE TRIGGER IF NOT EXISTS documents_au AFTER UPDATE ON documents
286 BEGIN
287 DELETE FROM documents_fts WHERE rowid = old.id AND new.active = 0;
288 INSERT OR REPLACE INTO documents_fts(rowid, filepath, title, body)
289 SELECT
290 new.id,
291 new.collection || '/' || new.path,
292 new.title,
293 (SELECT doc FROM content WHERE hash = new.hash)
294 WHERE new.active = 1;
295 END;
296 ",
297 )?;
298 }
299
300 Ok(())
301 }
302
303 #[must_use]
305 pub fn hash_content(content: &str) -> String {
306 let mut hasher = Sha256::new();
307 hasher.update(content.as_bytes());
308 format!("{:x}", hasher.finalize())
309 }
310
311 #[must_use]
313 pub fn get_docid(hash: &str) -> String {
314 hash.chars().take(6).collect()
315 }
316
317 #[must_use]
319 pub fn extract_title(content: &str) -> String {
320 for line in content.lines() {
321 let trimmed = line.trim();
322 if trimmed.starts_with("# ") {
323 return trimmed[2..].trim().to_string();
324 }
325 if trimmed.starts_with("## ") {
326 return trimmed[3..].trim().to_string();
327 }
328 }
329 String::new()
330 }
331
332 #[must_use]
334 pub fn handelize(path: &str) -> String {
335 path.replace("___", "/")
336 .to_lowercase()
337 .split('/')
338 .filter(|s| !s.is_empty())
339 .map(|segment| {
340 let cleaned: String = segment
341 .chars()
342 .map(|c| if c.is_alphanumeric() { c } else { '-' })
343 .collect();
344 cleaned.trim_matches('-').to_string()
345 })
346 .filter(|s| !s.is_empty())
347 .collect::<Vec<_>>()
348 .join("/")
349 }
350
351 pub fn insert_content(&self, hash: &str, content: &str, created_at: &str) -> Result<()> {
353 self.conn.execute(
354 "INSERT OR IGNORE INTO content (hash, doc, created_at) VALUES (?1, ?2, ?3)",
355 params![hash, content, created_at],
356 )?;
357 Ok(())
358 }
359
360 pub fn insert_document(
362 &self,
363 collection: &str,
364 path: &str,
365 title: &str,
366 hash: &str,
367 created_at: &str,
368 modified_at: &str,
369 ) -> Result<()> {
370 self.conn.execute(
371 r"
372 INSERT INTO documents (collection, path, title, hash, created_at, modified_at, active)
373 VALUES (?1, ?2, ?3, ?4, ?5, ?6, 1)
374 ON CONFLICT(collection, path) DO UPDATE SET
375 title = excluded.title,
376 hash = excluded.hash,
377 modified_at = excluded.modified_at,
378 active = 1
379 ",
380 params![collection, path, title, hash, created_at, modified_at],
381 )?;
382 Ok(())
383 }
384
385 pub fn find_active_document(
387 &self,
388 collection: &str,
389 path: &str,
390 ) -> Result<Option<(i64, String, String)>> {
391 let result = self
392 .conn
393 .query_row(
394 "SELECT id, hash, title FROM documents WHERE collection = ?1 AND path = ?2 AND active = 1",
395 params![collection, path],
396 |row| Ok((row.get(0)?, row.get(1)?, row.get(2)?)),
397 )
398 .optional()?;
399 Ok(result)
400 }
401
402 pub fn update_document_title(
404 &self,
405 document_id: i64,
406 title: &str,
407 modified_at: &str,
408 ) -> Result<()> {
409 self.conn.execute(
410 "UPDATE documents SET title = ?1, modified_at = ?2 WHERE id = ?3",
411 params![title, modified_at, document_id],
412 )?;
413 Ok(())
414 }
415
416 pub fn update_document(
418 &self,
419 document_id: i64,
420 title: &str,
421 hash: &str,
422 modified_at: &str,
423 ) -> Result<()> {
424 self.conn.execute(
425 "UPDATE documents SET title = ?1, hash = ?2, modified_at = ?3 WHERE id = ?4",
426 params![title, hash, modified_at, document_id],
427 )?;
428 Ok(())
429 }
430
431 pub fn deactivate_document(&self, collection: &str, path: &str) -> Result<()> {
433 self.conn.execute(
434 "UPDATE documents SET active = 0 WHERE collection = ?1 AND path = ?2",
435 params![collection, path],
436 )?;
437 Ok(())
438 }
439
440 pub fn get_active_document_paths(&self, collection: &str) -> Result<Vec<String>> {
442 let mut stmt = self
443 .conn
444 .prepare("SELECT path FROM documents WHERE collection = ?1 AND active = 1")?;
445 let paths = stmt
446 .query_map(params![collection], |row| row.get(0))?
447 .collect::<std::result::Result<Vec<String>, _>>()?;
448 Ok(paths)
449 }
450
451 pub fn search_fts(
453 &self,
454 query: &str,
455 limit: usize,
456 collection: Option<&str>,
457 ) -> Result<Vec<SearchResult>> {
458 let sql = if collection.is_some() {
459 r"
460 SELECT
461 d.collection,
462 d.path,
463 d.title,
464 d.hash,
465 d.modified_at,
466 bm25(documents_fts) as score,
467 LENGTH(c.doc) as body_length
468 FROM documents_fts fts
469 JOIN documents d ON d.id = fts.rowid
470 JOIN content c ON c.hash = d.hash
471 WHERE documents_fts MATCH ?1
472 AND d.collection = ?2
473 AND d.active = 1
474 ORDER BY score
475 LIMIT ?3
476 "
477 } else {
478 r"
479 SELECT
480 d.collection,
481 d.path,
482 d.title,
483 d.hash,
484 d.modified_at,
485 bm25(documents_fts) as score,
486 LENGTH(c.doc) as body_length
487 FROM documents_fts fts
488 JOIN documents d ON d.id = fts.rowid
489 JOIN content c ON c.hash = d.hash
490 WHERE documents_fts MATCH ?1
491 AND d.active = 1
492 ORDER BY score
493 LIMIT ?2
494 "
495 };
496
497 let mut stmt = self.conn.prepare(sql)?;
498
499 let results: Vec<SearchResult> = if let Some(coll) = collection {
500 stmt.query_map(params![query, coll, limit as i64], |row| {
501 let collection_name: String = row.get(0)?;
502 let path: String = row.get(1)?;
503 let title: String = row.get(2)?;
504 let hash: String = row.get(3)?;
505 let modified_at: String = row.get(4)?;
506 let score: f64 = row.get(5)?;
507 let body_length: i64 = row.get(6)?;
508 let body_length = body_length as usize;
509
510 Ok(SearchResult {
511 doc: DocumentResult {
512 filepath: format!("qmd://{collection_name}/{path}"),
513 display_path: format!("{collection_name}/{path}"),
514 title,
515 context: None,
516 hash: hash.clone(),
517 docid: Self::get_docid(&hash),
518 collection_name,
519 path,
520 modified_at,
521 body_length,
522 body: None,
523 },
524 score: -score, source: SearchSource::Fts,
526 chunk_pos: None,
527 })
528 })?
529 .collect::<std::result::Result<Vec<_>, _>>()?
530 } else {
531 stmt.query_map(params![query, limit as i64], |row| {
532 let collection_name: String = row.get(0)?;
533 let path: String = row.get(1)?;
534 let title: String = row.get(2)?;
535 let hash: String = row.get(3)?;
536 let modified_at: String = row.get(4)?;
537 let score: f64 = row.get(5)?;
538 let body_length: i64 = row.get(6)?;
539 let body_length = body_length as usize;
540
541 Ok(SearchResult {
542 doc: DocumentResult {
543 filepath: format!("qmd://{collection_name}/{path}"),
544 display_path: format!("{collection_name}/{path}"),
545 title,
546 context: None,
547 hash: hash.clone(),
548 docid: Self::get_docid(&hash),
549 collection_name,
550 path,
551 modified_at,
552 body_length,
553 body: None,
554 },
555 score: -score,
556 source: SearchSource::Fts,
557 chunk_pos: None,
558 })
559 })?
560 .collect::<std::result::Result<Vec<_>, _>>()?
561 };
562
563 let results_with_context: Vec<SearchResult> = results
565 .into_iter()
566 .map(|mut r| {
567 r.doc.context =
568 find_context_for_path(&r.doc.collection_name, &r.doc.path).unwrap_or(None);
569 r
570 })
571 .collect();
572
573 Ok(results_with_context)
574 }
575
576 pub fn get_document(&self, collection: &str, path: &str) -> Result<Option<DocumentResult>> {
578 let result = self
579 .conn
580 .query_row(
581 r"
582 SELECT
583 d.title,
584 d.hash,
585 d.modified_at,
586 c.doc,
587 LENGTH(c.doc) as body_length
588 FROM documents d
589 JOIN content c ON c.hash = d.hash
590 WHERE d.collection = ?1 AND d.path = ?2 AND d.active = 1
591 ",
592 params![collection, path],
593 |row| {
594 let title: String = row.get(0)?;
595 let hash: String = row.get(1)?;
596 let modified_at: String = row.get(2)?;
597 let body: String = row.get(3)?;
598 let body_length: i64 = row.get(4)?;
599 let body_length = body_length as usize;
600
601 Ok(DocumentResult {
602 filepath: format!("qmd://{collection}/{path}"),
603 display_path: format!("{collection}/{path}"),
604 title,
605 context: None,
606 hash: hash.clone(),
607 docid: Self::get_docid(&hash),
608 collection_name: collection.to_string(),
609 path: path.to_string(),
610 modified_at,
611 body_length,
612 body: Some(body),
613 })
614 },
615 )
616 .optional()?;
617
618 let result = result.map(|mut doc| {
620 doc.context = find_context_for_path(collection, path).unwrap_or(None);
621 doc
622 });
623
624 Ok(result)
625 }
626
627 pub fn find_document_by_docid(&self, docid: &str) -> Result<Option<(String, String)>> {
629 let clean_docid = docid.trim_start_matches('#');
630 let result = self
631 .conn
632 .query_row(
633 r"
634 SELECT d.collection, d.path
635 FROM documents d
636 WHERE d.hash LIKE ?1 || '%' AND d.active = 1
637 LIMIT 1
638 ",
639 params![clean_docid],
640 |row| Ok((row.get(0)?, row.get(1)?)),
641 )
642 .optional()?;
643 Ok(result)
644 }
645
646 pub fn list_collections(&self) -> Result<Vec<CollectionInfo>> {
648 let yaml_collections = yaml_list_collections()?;
649
650 let mut collections = Vec::new();
651
652 for coll in yaml_collections {
653 let stats: (i64, Option<String>) = self
654 .conn
655 .query_row(
656 r"
657 SELECT COUNT(*) as count, MAX(modified_at) as last_modified
658 FROM documents
659 WHERE collection = ?1 AND active = 1
660 ",
661 params![coll.name],
662 |row| Ok((row.get(0)?, row.get(1)?)),
663 )
664 .unwrap_or((0, None));
665
666 collections.push(CollectionInfo {
667 name: coll.name,
668 pwd: coll.path,
669 glob_pattern: coll.pattern,
670 active_count: stats.0 as usize,
671 last_modified: stats.1,
672 });
673 }
674
675 Ok(collections)
676 }
677
678 pub fn get_status(&self) -> Result<IndexStatus> {
680 let total_documents: i64 = self.conn.query_row(
681 "SELECT COUNT(*) FROM documents WHERE active = 1",
682 [],
683 |row| row.get(0),
684 )?;
685 let total_documents = total_documents as usize;
686
687 let needs_embedding: i64 = self.conn.query_row(
688 r"
689 SELECT COUNT(DISTINCT d.hash)
690 FROM documents d
691 LEFT JOIN content_vectors v ON d.hash = v.hash AND v.seq = 0
692 WHERE d.active = 1 AND v.hash IS NULL
693 ",
694 [],
695 |row| row.get(0),
696 )?;
697 let needs_embedding = needs_embedding as usize;
698
699 let has_vector_index: bool = self
700 .conn
701 .query_row(
702 "SELECT 1 FROM sqlite_master WHERE type='table' AND name='vectors_vec'",
703 [],
704 |_| Ok(true),
705 )
706 .unwrap_or(false);
707
708 let collections = self.list_collections()?;
709
710 Ok(IndexStatus {
711 total_documents,
712 needs_embedding,
713 has_vector_index,
714 collections,
715 })
716 }
717
718 pub fn remove_collection_documents(&self, name: &str) -> Result<(usize, usize)> {
720 let doc_count: i64 = self.conn.query_row(
722 "SELECT COUNT(*) FROM documents WHERE collection = ?1",
723 params![name],
724 |row| row.get(0),
725 )?;
726 let doc_count = doc_count as usize;
727
728 self.conn
730 .execute("DELETE FROM documents WHERE collection = ?1", params![name])?;
731
732 let cleaned = self.cleanup_orphaned_content()?;
734
735 Ok((doc_count, cleaned))
736 }
737
738 pub fn rename_collection_documents(&self, old_name: &str, new_name: &str) -> Result<()> {
740 self.conn.execute(
741 "UPDATE documents SET collection = ?1 WHERE collection = ?2",
742 params![new_name, old_name],
743 )?;
744 Ok(())
745 }
746
747 pub fn cleanup_orphaned_content(&self) -> Result<usize> {
749 let changes = self.conn.execute(
750 "DELETE FROM content WHERE hash NOT IN (SELECT DISTINCT hash FROM documents WHERE active = 1)",
751 [],
752 )?;
753 Ok(changes)
754 }
755
756 pub fn cleanup_orphaned_vectors(&self) -> Result<usize> {
758 let changes = self.conn.execute(
759 r"
760 DELETE FROM content_vectors
761 WHERE hash NOT IN (SELECT DISTINCT hash FROM documents WHERE active = 1)
762 ",
763 [],
764 )?;
765 Ok(changes)
766 }
767
768 pub fn delete_inactive_documents(&self) -> Result<usize> {
770 let changes = self
771 .conn
772 .execute("DELETE FROM documents WHERE active = 0", [])?;
773 Ok(changes)
774 }
775
776 pub fn clear_cache(&self) -> Result<usize> {
778 let changes = self.conn.execute("DELETE FROM llm_cache", [])?;
779 Ok(changes)
780 }
781
782 pub fn vacuum(&self) -> Result<()> {
784 self.conn.execute("VACUUM", [])?;
785 Ok(())
786 }
787
788 pub fn ensure_vector_table(&self, _dimensions: usize) -> Result<()> {
790 self.conn.execute(
792 r"
793 CREATE TABLE IF NOT EXISTS vectors_vec (
794 hash_seq TEXT PRIMARY KEY,
795 embedding BLOB NOT NULL
796 )
797 ",
798 [],
799 )?;
800 Ok(())
801 }
802
803 pub fn insert_embedding(
805 &self,
806 hash: &str,
807 seq: usize,
808 pos: usize,
809 embedding: &[f32],
810 model: &str,
811 embedded_at: &str,
812 ) -> Result<()> {
813 self.conn.execute(
815 r"
816 INSERT OR REPLACE INTO content_vectors (hash, seq, pos, model, embedded_at)
817 VALUES (?1, ?2, ?3, ?4, ?5)
818 ",
819 params![hash, seq as i64, pos as i64, model, embedded_at],
820 )?;
821
822 let hash_seq = format!("{hash}_{seq}");
824 let embedding_bytes: Vec<u8> = embedding.iter().flat_map(|f| f.to_le_bytes()).collect();
825
826 self.conn.execute(
827 "INSERT OR REPLACE INTO vectors_vec (hash_seq, embedding) VALUES (?1, ?2)",
828 params![hash_seq, embedding_bytes],
829 )?;
830
831 Ok(())
832 }
833
834 pub fn get_hashes_needing_embedding(&self) -> Result<Vec<(String, String, String)>> {
836 let mut stmt = self.conn.prepare(
837 r"
838 SELECT DISTINCT d.hash, d.path, c.doc
839 FROM documents d
840 JOIN content c ON c.hash = d.hash
841 LEFT JOIN content_vectors v ON d.hash = v.hash AND v.seq = 0
842 WHERE d.active = 1 AND v.hash IS NULL
843 ",
844 )?;
845
846 let results = stmt
847 .query_map([], |row| Ok((row.get(0)?, row.get(1)?, row.get(2)?)))?
848 .collect::<std::result::Result<Vec<_>, _>>()?;
849
850 Ok(results)
851 }
852
853 pub fn get_embedding(&self, hash: &str, seq: usize) -> Result<Option<Vec<f32>>> {
855 let hash_seq = format!("{hash}_{seq}");
856 let result: Option<Vec<u8>> = self
857 .conn
858 .query_row(
859 "SELECT embedding FROM vectors_vec WHERE hash_seq = ?1",
860 params![hash_seq],
861 |row| row.get(0),
862 )
863 .optional()?;
864
865 Ok(result.map(|bytes| {
866 bytes
867 .chunks_exact(4)
868 .map(|chunk| f32::from_le_bytes([chunk[0], chunk[1], chunk[2], chunk[3]]))
869 .collect()
870 }))
871 }
872
873 pub fn search_vec(
875 &self,
876 query_embedding: &[f32],
877 limit: usize,
878 collection: Option<&str>,
879 ) -> Result<Vec<SearchResult>> {
880 let sql = if collection.is_some() {
882 r"
883 SELECT DISTINCT
884 d.collection,
885 d.path,
886 d.title,
887 d.hash,
888 d.modified_at,
889 LENGTH(c.doc) as body_length,
890 v.hash_seq
891 FROM documents d
892 JOIN content c ON c.hash = d.hash
893 JOIN vectors_vec v ON v.hash_seq = d.hash || '_0'
894 WHERE d.active = 1 AND d.collection = ?1
895 "
896 } else {
897 r"
898 SELECT DISTINCT
899 d.collection,
900 d.path,
901 d.title,
902 d.hash,
903 d.modified_at,
904 LENGTH(c.doc) as body_length,
905 v.hash_seq
906 FROM documents d
907 JOIN content c ON c.hash = d.hash
908 JOIN vectors_vec v ON v.hash_seq = d.hash || '_0'
909 WHERE d.active = 1
910 "
911 };
912
913 let mut stmt = self.conn.prepare(sql)?;
914
915 let rows: Vec<(String, String, String, String, String, usize, String)> =
916 if let Some(coll) = collection {
917 stmt.query_map(params![coll], |row| {
918 let body_length: i64 = row.get(5)?;
919 Ok((
920 row.get(0)?,
921 row.get(1)?,
922 row.get(2)?,
923 row.get(3)?,
924 row.get(4)?,
925 body_length as usize,
926 row.get(6)?,
927 ))
928 })?
929 .collect::<std::result::Result<Vec<_>, _>>()?
930 } else {
931 stmt.query_map([], |row| {
932 let body_length: i64 = row.get(5)?;
933 Ok((
934 row.get(0)?,
935 row.get(1)?,
936 row.get(2)?,
937 row.get(3)?,
938 row.get(4)?,
939 body_length as usize,
940 row.get(6)?,
941 ))
942 })?
943 .collect::<std::result::Result<Vec<_>, _>>()?
944 };
945
946 let mut results: Vec<SearchResult> = Vec::new();
948
949 for (collection_name, path, title, hash, modified_at, body_length, _hash_seq) in rows {
950 if let Some(doc_embedding) = self.get_embedding(&hash, 0)? {
951 let similarity = crate::llm::cosine_similarity(query_embedding, &doc_embedding);
952
953 results.push(SearchResult {
954 doc: DocumentResult {
955 filepath: format!("qmd://{collection_name}/{path}"),
956 display_path: format!("{collection_name}/{path}"),
957 title,
958 context: None,
959 hash: hash.clone(),
960 docid: Self::get_docid(&hash),
961 collection_name: collection_name.clone(),
962 path: path.clone(),
963 modified_at,
964 body_length,
965 body: None,
966 },
967 score: f64::from(similarity),
968 source: SearchSource::Vec,
969 chunk_pos: Some(0), });
971 }
972 }
973
974 results.sort_by(|a, b| {
976 b.score
977 .partial_cmp(&a.score)
978 .unwrap_or(std::cmp::Ordering::Equal)
979 });
980 results.truncate(limit);
981
982 let results_with_context: Vec<SearchResult> = results
984 .into_iter()
985 .map(|mut r| {
986 r.doc.context =
987 find_context_for_path(&r.doc.collection_name, &r.doc.path).unwrap_or(None);
988 r
989 })
990 .collect();
991
992 Ok(results_with_context)
993 }
994
995 pub fn clear_embeddings(&self) -> Result<usize> {
997 let changes1 = self.conn.execute("DELETE FROM content_vectors", [])?;
998 let _ = self.conn.execute("DELETE FROM vectors_vec", []);
999 Ok(changes1)
1000 }
1001
1002 pub fn list_files(
1004 &self,
1005 collection: &str,
1006 path_prefix: Option<&str>,
1007 ) -> Result<Vec<(String, String, String, usize)>> {
1008 let mut stmt;
1009 let files: Vec<(String, String, String, usize)> = if let Some(prefix) = path_prefix {
1010 let prefix_pattern = format!("{prefix}%");
1011 stmt = self.conn.prepare(
1012 r"
1013 SELECT d.path, d.title, d.modified_at, LENGTH(c.doc) as size
1014 FROM documents d
1015 JOIN content c ON d.hash = c.hash
1016 WHERE d.collection = ?1 AND d.path LIKE ?2 AND d.active = 1
1017 ORDER BY d.path
1018 ",
1019 )?;
1020 stmt.query_map(params![collection, prefix_pattern], |row| {
1021 let size: i64 = row.get(3)?;
1022 Ok((row.get(0)?, row.get(1)?, row.get(2)?, size as usize))
1023 })?
1024 .collect::<std::result::Result<Vec<_>, _>>()?
1025 } else {
1026 stmt = self.conn.prepare(
1027 r"
1028 SELECT d.path, d.title, d.modified_at, LENGTH(c.doc) as size
1029 FROM documents d
1030 JOIN content c ON d.hash = c.hash
1031 WHERE d.collection = ?1 AND d.active = 1
1032 ORDER BY d.path
1033 ",
1034 )?;
1035 stmt.query_map(params![collection], |row| {
1036 let size: i64 = row.get(3)?;
1037 Ok((row.get(0)?, row.get(1)?, row.get(2)?, size as usize))
1038 })?
1039 .collect::<std::result::Result<Vec<_>, _>>()?
1040 };
1041
1042 Ok(files)
1043 }
1044
1045 pub fn get_index_health(&self) -> Result<crate::llm::IndexHealth> {
1047 let total_docs: usize = self.conn.query_row(
1049 "SELECT COUNT(*) FROM documents WHERE active = 1",
1050 [],
1051 |row| row.get::<_, i64>(0).map(|v| v as usize),
1052 )?;
1053
1054 let needs_embedding: usize = self.conn.query_row(
1056 r"
1057 SELECT COUNT(DISTINCT d.hash)
1058 FROM documents d
1059 LEFT JOIN content_vectors cv ON d.hash = cv.hash
1060 WHERE d.active = 1 AND cv.hash IS NULL
1061 ",
1062 [],
1063 |row| row.get::<_, i64>(0).map(|v| v as usize),
1064 )?;
1065
1066 let days_stale: Option<u64> = self
1068 .conn
1069 .query_row(
1070 "SELECT MAX(modified_at) FROM documents WHERE active = 1",
1071 [],
1072 |row| row.get::<_, Option<String>>(0),
1073 )
1074 .ok()
1075 .flatten()
1076 .and_then(|ts| {
1077 chrono::DateTime::parse_from_rfc3339(&ts).ok().map(|dt| {
1078 let now = chrono::Utc::now();
1079 let duration = now.signed_duration_since(dt);
1080 duration.num_days().max(0) as u64
1081 })
1082 });
1083
1084 Ok(crate::llm::IndexHealth {
1085 needs_embedding,
1086 total_docs,
1087 days_stale,
1088 })
1089 }
1090
1091 pub fn check_and_warn_health(&self) {
1093 if let Ok(health) = self.get_index_health()
1094 && let Some(msg) = health.warning_message()
1095 {
1096 eprintln!("{}", colored::Colorize::yellow(msg.as_str()));
1097 }
1098 }
1099
1100 pub fn get_document_count(&self) -> Result<usize> {
1102 let count: i64 = self.conn.query_row(
1103 "SELECT COUNT(*) FROM documents WHERE active = 1",
1104 [],
1105 |row| row.get(0),
1106 )?;
1107 Ok(count as usize)
1108 }
1109
1110 pub fn get_unique_hash_count(&self) -> Result<usize> {
1112 let count: i64 = self.conn.query_row(
1113 "SELECT COUNT(DISTINCT hash) FROM documents WHERE active = 1",
1114 [],
1115 |row| row.get(0),
1116 )?;
1117 Ok(count as usize)
1118 }
1119
1120 pub fn get_embedded_hash_count(&self) -> Result<usize> {
1122 let count: i64 = self.conn.query_row(
1123 "SELECT COUNT(DISTINCT hash) FROM content_vectors",
1124 [],
1125 |row| row.get(0),
1126 )?;
1127 Ok(count as usize)
1128 }
1129}
1130
1131#[must_use]
1133pub fn should_exclude(path: &Path) -> bool {
1134 for component in path.components() {
1135 if let std::path::Component::Normal(name) = component {
1136 let name_str = name.to_string_lossy();
1137 if name_str.starts_with('.') || EXCLUDE_DIRS.contains(&name_str.as_ref()) {
1138 return true;
1139 }
1140 }
1141 }
1142 false
1143}
1144
/// Reports whether `s` has the shape of a document id: optional leading `#`
/// characters followed by exactly six ASCII hexadecimal digits.
#[must_use]
pub fn is_docid(s: &str) -> bool {
    let digits = s.trim_start_matches('#');
    if digits.len() != 6 {
        return false;
    }
    // Bytes of multi-byte characters are never ASCII hex digits, so checking
    // bytes is equivalent to checking characters here.
    digits.bytes().all(|b| b.is_ascii_hexdigit())
}
1151
1152#[must_use]
1154pub fn parse_virtual_path(path: &str) -> Option<(String, String)> {
1155 let normalized = normalize_virtual_path(path);
1156 let stripped = normalized.strip_prefix("qmd://")?;
1157 let mut parts = stripped.splitn(2, '/');
1158 let collection = parts.next()?.to_string();
1159 let file_path = parts.next().unwrap_or("").to_string();
1160 Some((collection, file_path))
1161}
1162
/// Builds the virtual path `qmd://{collection}/{path}`.
#[must_use]
pub fn build_virtual_path(collection: &str, path: &str) -> String {
    ["qmd://", collection, "/", path].concat()
}
1168
/// Reports whether `path`, ignoring surrounding whitespace, starts with
/// `qmd:` or `//`.
#[must_use]
pub fn is_virtual_path(path: &str) -> bool {
    let candidate = path.trim();
    ["qmd:", "//"]
        .iter()
        .any(|prefix| candidate.starts_with(*prefix))
}
1175
/// Brings a virtual path into the canonical `qmd://...` form.
///
/// Surrounding whitespace is trimmed. Input starting with `qmd:` or with `//`
/// has that prefix and any following `/` characters replaced by `qmd://`.
/// Anything else is returned trimmed but otherwise unchanged.
#[must_use]
pub fn normalize_virtual_path(input: &str) -> String {
    let trimmed = input.trim();

    let remainder = match trimmed.strip_prefix("qmd:") {
        Some(after_scheme) => after_scheme,
        None if trimmed.starts_with("//") => trimmed,
        None => return trimmed.to_string(),
    };

    format!("qmd://{}", remainder.trim_start_matches('/'))
}
1193
1194pub fn find_similar_files(
1204 store: &Store,
1205 query: &str,
1206 _max_distance: usize,
1207 limit: usize,
1208) -> Result<Vec<(String, String, i64)>> {
1209 use fuzzy_matcher::FuzzyMatcher;
1210 use fuzzy_matcher::skim::SkimMatcherV2;
1211
1212 let matcher = SkimMatcherV2::default();
1213 let query_lower = query.to_lowercase();
1214
1215 let mut stmt = store.conn.prepare(
1217 r"
1218 SELECT collection, path
1219 FROM documents
1220 WHERE active = 1
1221 ",
1222 )?;
1223
1224 let files: Vec<(String, String)> = stmt
1225 .query_map([], |row| Ok((row.get(0)?, row.get(1)?)))?
1226 .filter_map(std::result::Result::ok)
1227 .collect();
1228
1229 let mut scored: Vec<(String, String, i64)> = files
1231 .into_iter()
1232 .filter_map(|(collection, path)| {
1233 let display_path = build_virtual_path(&collection, &path);
1234 let path_lower = path.to_lowercase();
1235
1236 matcher
1238 .fuzzy_match(&path_lower, &query_lower)
1239 .map(|score| (display_path, path, score))
1240 })
1241 .collect();
1242
1243 scored.sort_by(|a, b| b.2.cmp(&a.2));
1245 scored.truncate(limit);
1246
1247 Ok(scored)
1248}
1249
1250pub fn match_files_by_glob(store: &Store, pattern: &str) -> Result<Vec<DocumentResult>> {
1252 let glob_pattern = glob::Pattern::new(pattern).map_err(|e| QmdError::Config(e.to_string()))?;
1253
1254 let mut stmt = store.conn.prepare(
1255 r"
1256 SELECT d.collection, d.path, d.title, d.hash, d.modified_at, LENGTH(c.doc)
1257 FROM documents d
1258 JOIN content c ON d.hash = c.hash
1259 WHERE d.active = 1
1260 ",
1261 )?;
1262
1263 let results: Vec<DocumentResult> = stmt
1264 .query_map([], |row| {
1265 let collection: String = row.get(0)?;
1266 let path: String = row.get(1)?;
1267 let title: String = row.get(2)?;
1268 let hash: String = row.get(3)?;
1269 let modified_at: String = row.get(4)?;
1270 let body_length: i64 = row.get(5)?;
1271
1272 Ok((collection, path, title, hash, modified_at, body_length))
1273 })?
1274 .filter_map(std::result::Result::ok)
1275 .filter(|(_, path, _, _, _, _)| glob_pattern.matches(path))
1276 .map(
1277 |(collection, path, title, hash, modified_at, body_length)| {
1278 let display_path = build_virtual_path(&collection, &path);
1279 let docid = Store::get_docid(&hash);
1280 let context = find_context_for_path(&collection, &path).ok().flatten();
1281
1282 DocumentResult {
1283 filepath: display_path.clone(),
1284 display_path,
1285 title,
1286 context,
1287 hash,
1288 docid,
1289 collection_name: collection,
1290 path,
1291 modified_at,
1292 body_length: body_length as usize,
1293 body: None,
1294 }
1295 },
1296 )
1297 .collect();
1298
1299 Ok(results)
1300}
1301
#[cfg(test)]
mod path_tests {
    use super::*;

    /// Backslashes become forward slashes; other input is untouched.
    #[test]
    fn test_normalize_path_separators() {
        let cases = [
            (r"C:\Users\test", "C:/Users/test"),
            ("C:/Users/test", "C:/Users/test"),
            ("/home/user", "/home/user"),
        ];
        for (input, expected) in cases {
            assert_eq!(normalize_path_separators(input), expected);
        }
    }

    /// `/x/...` with a single-letter first segment is rewritten to a drive path.
    #[test]
    fn test_convert_git_bash_path() {
        let cases = [
            ("/c/Users/test", "C:/Users/test"),
            ("/d/Projects/app", "D:/Projects/app"),
            ("/home/user", "/home/user"),
            ("C:/Users/test", "C:/Users/test"),
        ];
        for (input, expected) in cases {
            assert_eq!(convert_git_bash_path(input), expected);
        }
    }

    /// Windows and Git-Bash spellings normalize to the same drive path.
    #[test]
    fn test_normalize_filesystem_path() {
        let cases = [
            (r"C:\Users\test\file.md", "C:/Users/test/file.md"),
            ("/c/Users/test/file.md", "C:/Users/test/file.md"),
        ];
        for (input, expected) in cases {
            assert_eq!(normalize_filesystem_path(input), expected);
        }
    }

    /// Rooted and drive-letter paths are absolute; relative ones are not.
    #[test]
    fn test_is_absolute_path() {
        for absolute in [
            "/home/user",
            "C:/Users/test",
            r"C:\Users\test",
            "/c/Users/test",
        ] {
            assert!(is_absolute_path(absolute));
        }
        for relative in ["relative/path", "./local"] {
            assert!(!is_absolute_path(relative));
        }
    }
}