qmd/
store.rs

1//! Database store for document indexing and retrieval.
2//!
3//! This module provides all database operations, search functions, and document
4//! retrieval for QMD.
5
6use crate::collections::{find_context_for_path, list_collections as yaml_list_collections};
7use crate::config::{EXCLUDE_DIRS, get_default_db_path};
8use crate::error::{QmdError, Result};
9use rusqlite::{Connection, OptionalExtension, params};
10use sha2::{Digest, Sha256};
11use std::fs;
12use std::path::{Path, PathBuf};
13
/// Return `path` with every backslash replaced by a forward slash.
///
/// All other characters are copied through unchanged, so a path that already
/// uses forward slashes comes back as an identical string.
#[must_use]
pub fn normalize_path_separators(path: &str) -> String {
    path.chars()
        .map(|ch| if ch == '\\' { '/' } else { ch })
        .collect()
}
20
/// Rewrite a Git Bash style path (`/c/Users/...`) as a drive path (`C:/Users/...`).
///
/// Backslashes are first turned into forward slashes. If the result starts
/// with `/`, a single ASCII letter and another `/`, the leading `/x` is
/// replaced by the upper-cased drive letter followed by `:`. Any other input
/// is returned in its forward-slash form.
#[must_use]
pub fn convert_git_bash_path(path: &str) -> String {
    let unified = path.replace('\\', "/");

    // The three leading characters of interest are all ASCII, so inspecting
    // bytes is equivalent to inspecting chars here.
    let drive = match unified.as_bytes() {
        [b'/', letter, b'/', ..] if letter.is_ascii_alphabetic() => {
            Some(char::from(letter.to_ascii_uppercase()))
        }
        _ => None,
    };

    match drive {
        // Keep everything from the second slash onwards ("/Users/...").
        Some(letter) => format!("{}:{}", letter, &unified[2..]),
        None => unified,
    }
}
42
43/// Normalize a filesystem path for cross-platform compatibility.
44/// Handles Windows backslashes and Git Bash paths.
45#[must_use]
46pub fn normalize_filesystem_path(path: &str) -> String {
47    let converted = convert_git_bash_path(path);
48    normalize_path_separators(&converted)
49}
50
/// Report whether `path` looks absolute under Unix or Windows conventions.
///
/// A path is treated as absolute when it begins with a separator (`/` or
/// `\`), or when it begins with an ASCII drive letter, a colon and a
/// separator (for example `C:/` or `c:\`).
#[must_use]
pub fn is_absolute_path(path: &str) -> bool {
    let is_separator = |byte: u8| byte == b'/' || byte == b'\\';

    // Every character that can satisfy these patterns is ASCII, so matching
    // on raw bytes gives the same answer as matching on chars.
    match path.as_bytes() {
        [first, ..] if is_separator(*first) => true,
        [drive, b':', sep, ..] => drive.is_ascii_alphabetic() && is_separator(*sep),
        _ => false,
    }
}
74
/// A single document together with its metadata.
#[derive(Clone, Debug)]
pub struct DocumentResult {
    /// Location of the document. The queries in this module fill it with a
    /// `qmd://<collection>/<path>` URI.
    pub filepath: String,
    /// Short path for display; filled as `<collection>/<path>` by the queries
    /// in this module.
    pub display_path: String,
    /// Document title.
    pub title: String,
    /// Context description configured for the document's folder, if any.
    pub context: Option<String>,
    /// Hash of the document content.
    pub hash: String,
    /// Short document id: the first 6 characters of `hash`.
    pub docid: String,
    /// Name of the collection the document belongs to.
    pub collection_name: String,
    /// Path of the document relative to its collection.
    pub path: String,
    /// Timestamp of the last modification.
    pub modified_at: String,
    /// Length of the body, populated from SQL `LENGTH(c.doc)`.
    // NOTE(review): SQLite's LENGTH() on TEXT counts characters rather than
    // bytes — confirm which unit consumers expect.
    pub body_length: usize,
    /// Document body, when it was loaded.
    pub body: Option<String>,
}
101
102/// Search result with score.
103#[derive(Debug, Clone)]
104pub struct SearchResult {
105    /// The document result.
106    pub doc: DocumentResult,
107    /// Relevance score.
108    pub score: f64,
109    /// Source of the result.
110    pub source: SearchSource,
111    /// Chunk position for vector search results (0-indexed).
112    pub chunk_pos: Option<usize>,
113}
114
/// Identifies the search backend that produced a [`SearchResult`].
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum SearchSource {
    /// Result of a full-text (FTS) query.
    Fts,
    /// Result of a vector similarity query.
    Vec,
}
123
/// A collection together with statistics read from the database.
#[derive(Clone, Debug)]
pub struct CollectionInfo {
    /// Name of the collection.
    pub name: String,
    /// Working directory path of the collection.
    pub pwd: String,
    /// Glob pattern of the collection.
    pub glob_pattern: String,
    /// Number of active documents in the collection.
    pub active_count: usize,
    /// Most recent `modified_at` among the active documents, if there are any.
    pub last_modified: Option<String>,
}
138
139/// Index status information.
140#[derive(Debug, Clone)]
141pub struct IndexStatus {
142    /// Total active documents.
143    pub total_documents: usize,
144    /// Documents needing embedding.
145    pub needs_embedding: usize,
146    /// Whether vector index exists.
147    pub has_vector_index: bool,
148    /// Collection information.
149    pub collections: Vec<CollectionInfo>,
150}
151
/// The database store.
///
/// Owns the SQLite connection used by the store's methods and remembers the
/// path the database was opened from.
#[derive(Debug)]
pub struct Store {
    /// Open database connection; the store's methods run their SQL through it.
    conn: Connection,
    /// Path the database was opened from, as returned by `db_path()`.
    db_path: PathBuf,
}
160
161impl Store {
162    /// Create a new store with default database path.
163    pub fn new() -> Result<Self> {
164        let db_path = get_default_db_path("index")
165            .ok_or_else(|| QmdError::Config("Could not determine database path".to_string()))?;
166        Self::open(&db_path)
167    }
168
169    /// Create a new store with explicit database path.
170    pub fn open(db_path: &Path) -> Result<Self> {
171        // Ensure parent directory exists.
172        if let Some(parent) = db_path.parent() {
173            fs::create_dir_all(parent)?;
174        }
175
176        let conn = Connection::open(db_path)?;
177        let mut store = Self {
178            conn,
179            db_path: db_path.to_path_buf(),
180        };
181        store.initialize()?;
182        Ok(store)
183    }
184
185    /// Get the database path.
186    #[must_use]
187    pub fn db_path(&self) -> &Path {
188        &self.db_path
189    }
190
191    /// Initialize database schema.
192    fn initialize(&mut self) -> Result<()> {
193        self.conn.execute_batch(
194            r"
195            PRAGMA journal_mode = WAL;
196            PRAGMA foreign_keys = ON;
197
198            -- Content-addressable storage
199            CREATE TABLE IF NOT EXISTS content (
200                hash TEXT PRIMARY KEY,
201                doc TEXT NOT NULL,
202                created_at TEXT NOT NULL
203            );
204
205            -- Documents table
206            CREATE TABLE IF NOT EXISTS documents (
207                id INTEGER PRIMARY KEY AUTOINCREMENT,
208                collection TEXT NOT NULL,
209                path TEXT NOT NULL,
210                title TEXT NOT NULL,
211                hash TEXT NOT NULL,
212                created_at TEXT NOT NULL,
213                modified_at TEXT NOT NULL,
214                active INTEGER NOT NULL DEFAULT 1,
215                FOREIGN KEY (hash) REFERENCES content(hash) ON DELETE CASCADE,
216                UNIQUE(collection, path)
217            );
218
219            CREATE INDEX IF NOT EXISTS idx_documents_collection ON documents(collection, active);
220            CREATE INDEX IF NOT EXISTS idx_documents_hash ON documents(hash);
221            CREATE INDEX IF NOT EXISTS idx_documents_path ON documents(path, active);
222
223            -- FTS index
224            CREATE VIRTUAL TABLE IF NOT EXISTS documents_fts USING fts5(
225                filepath, title, body,
226                tokenize='porter unicode61'
227            );
228
229            -- LLM cache
230            CREATE TABLE IF NOT EXISTS llm_cache (
231                hash TEXT PRIMARY KEY,
232                result TEXT NOT NULL,
233                created_at TEXT NOT NULL
234            );
235
236            -- Content vectors metadata
237            CREATE TABLE IF NOT EXISTS content_vectors (
238                hash TEXT NOT NULL,
239                seq INTEGER NOT NULL DEFAULT 0,
240                pos INTEGER NOT NULL DEFAULT 0,
241                model TEXT NOT NULL,
242                embedded_at TEXT NOT NULL,
243                PRIMARY KEY (hash, seq)
244            );
245            ",
246        )?;
247
248        // Create FTS triggers.
249        self.create_fts_triggers()?;
250
251        Ok(())
252    }
253
254    /// Create FTS synchronization triggers.
255    fn create_fts_triggers(&self) -> Result<()> {
256        // Check if triggers exist.
257        let trigger_exists: bool = self
258            .conn
259            .query_row(
260                "SELECT 1 FROM sqlite_master WHERE type='trigger' AND name='documents_ai'",
261                [],
262                |_| Ok(true),
263            )
264            .unwrap_or(false);
265
266        if !trigger_exists {
267            self.conn.execute_batch(
268                r"
269                CREATE TRIGGER IF NOT EXISTS documents_ai AFTER INSERT ON documents
270                WHEN new.active = 1
271                BEGIN
272                    INSERT INTO documents_fts(rowid, filepath, title, body)
273                    SELECT
274                        new.id,
275                        new.collection || '/' || new.path,
276                        new.title,
277                        (SELECT doc FROM content WHERE hash = new.hash)
278                    WHERE new.active = 1;
279                END;
280
281                CREATE TRIGGER IF NOT EXISTS documents_ad AFTER DELETE ON documents BEGIN
282                    DELETE FROM documents_fts WHERE rowid = old.id;
283                END;
284
285                CREATE TRIGGER IF NOT EXISTS documents_au AFTER UPDATE ON documents
286                BEGIN
287                    DELETE FROM documents_fts WHERE rowid = old.id AND new.active = 0;
288                    INSERT OR REPLACE INTO documents_fts(rowid, filepath, title, body)
289                    SELECT
290                        new.id,
291                        new.collection || '/' || new.path,
292                        new.title,
293                        (SELECT doc FROM content WHERE hash = new.hash)
294                    WHERE new.active = 1;
295                END;
296                ",
297            )?;
298        }
299
300        Ok(())
301    }
302
303    /// Hash content using SHA256.
304    #[must_use]
305    pub fn hash_content(content: &str) -> String {
306        let mut hasher = Sha256::new();
307        hasher.update(content.as_bytes());
308        format!("{:x}", hasher.finalize())
309    }
310
311    /// Get short docid from hash (first 6 characters).
312    #[must_use]
313    pub fn get_docid(hash: &str) -> String {
314        hash.chars().take(6).collect()
315    }
316
317    /// Extract title from markdown content.
318    #[must_use]
319    pub fn extract_title(content: &str) -> String {
320        for line in content.lines() {
321            let trimmed = line.trim();
322            if trimmed.starts_with("# ") {
323                return trimmed[2..].trim().to_string();
324            }
325            if trimmed.starts_with("## ") {
326                return trimmed[3..].trim().to_string();
327            }
328        }
329        String::new()
330    }
331
332    /// Handelize a path to be more token-friendly.
333    #[must_use]
334    pub fn handelize(path: &str) -> String {
335        path.replace("___", "/")
336            .to_lowercase()
337            .split('/')
338            .filter(|s| !s.is_empty())
339            .map(|segment| {
340                let cleaned: String = segment
341                    .chars()
342                    .map(|c| if c.is_alphanumeric() { c } else { '-' })
343                    .collect();
344                cleaned.trim_matches('-').to_string()
345            })
346            .filter(|s| !s.is_empty())
347            .collect::<Vec<_>>()
348            .join("/")
349    }
350
351    /// Insert content into content-addressable storage.
352    pub fn insert_content(&self, hash: &str, content: &str, created_at: &str) -> Result<()> {
353        self.conn.execute(
354            "INSERT OR IGNORE INTO content (hash, doc, created_at) VALUES (?1, ?2, ?3)",
355            params![hash, content, created_at],
356        )?;
357        Ok(())
358    }
359
360    /// Insert a document record.
361    pub fn insert_document(
362        &self,
363        collection: &str,
364        path: &str,
365        title: &str,
366        hash: &str,
367        created_at: &str,
368        modified_at: &str,
369    ) -> Result<()> {
370        self.conn.execute(
371            r"
372            INSERT INTO documents (collection, path, title, hash, created_at, modified_at, active)
373            VALUES (?1, ?2, ?3, ?4, ?5, ?6, 1)
374            ON CONFLICT(collection, path) DO UPDATE SET
375                title = excluded.title,
376                hash = excluded.hash,
377                modified_at = excluded.modified_at,
378                active = 1
379            ",
380            params![collection, path, title, hash, created_at, modified_at],
381        )?;
382        Ok(())
383    }
384
385    /// Find an active document by collection and path.
386    pub fn find_active_document(
387        &self,
388        collection: &str,
389        path: &str,
390    ) -> Result<Option<(i64, String, String)>> {
391        let result = self
392            .conn
393            .query_row(
394                "SELECT id, hash, title FROM documents WHERE collection = ?1 AND path = ?2 AND active = 1",
395                params![collection, path],
396                |row| Ok((row.get(0)?, row.get(1)?, row.get(2)?)),
397            )
398            .optional()?;
399        Ok(result)
400    }
401
402    /// Update document title.
403    pub fn update_document_title(
404        &self,
405        document_id: i64,
406        title: &str,
407        modified_at: &str,
408    ) -> Result<()> {
409        self.conn.execute(
410            "UPDATE documents SET title = ?1, modified_at = ?2 WHERE id = ?3",
411            params![title, modified_at, document_id],
412        )?;
413        Ok(())
414    }
415
416    /// Update document hash and title.
417    pub fn update_document(
418        &self,
419        document_id: i64,
420        title: &str,
421        hash: &str,
422        modified_at: &str,
423    ) -> Result<()> {
424        self.conn.execute(
425            "UPDATE documents SET title = ?1, hash = ?2, modified_at = ?3 WHERE id = ?4",
426            params![title, hash, modified_at, document_id],
427        )?;
428        Ok(())
429    }
430
431    /// Deactivate a document.
432    pub fn deactivate_document(&self, collection: &str, path: &str) -> Result<()> {
433        self.conn.execute(
434            "UPDATE documents SET active = 0 WHERE collection = ?1 AND path = ?2",
435            params![collection, path],
436        )?;
437        Ok(())
438    }
439
440    /// Get all active document paths for a collection.
441    pub fn get_active_document_paths(&self, collection: &str) -> Result<Vec<String>> {
442        let mut stmt = self
443            .conn
444            .prepare("SELECT path FROM documents WHERE collection = ?1 AND active = 1")?;
445        let paths = stmt
446            .query_map(params![collection], |row| row.get(0))?
447            .collect::<std::result::Result<Vec<String>, _>>()?;
448        Ok(paths)
449    }
450
451    /// Full-text search using FTS5.
452    pub fn search_fts(
453        &self,
454        query: &str,
455        limit: usize,
456        collection: Option<&str>,
457    ) -> Result<Vec<SearchResult>> {
458        let sql = if collection.is_some() {
459            r"
460            SELECT
461                d.collection,
462                d.path,
463                d.title,
464                d.hash,
465                d.modified_at,
466                bm25(documents_fts) as score,
467                LENGTH(c.doc) as body_length
468            FROM documents_fts fts
469            JOIN documents d ON d.id = fts.rowid
470            JOIN content c ON c.hash = d.hash
471            WHERE documents_fts MATCH ?1
472              AND d.collection = ?2
473              AND d.active = 1
474            ORDER BY score
475            LIMIT ?3
476            "
477        } else {
478            r"
479            SELECT
480                d.collection,
481                d.path,
482                d.title,
483                d.hash,
484                d.modified_at,
485                bm25(documents_fts) as score,
486                LENGTH(c.doc) as body_length
487            FROM documents_fts fts
488            JOIN documents d ON d.id = fts.rowid
489            JOIN content c ON c.hash = d.hash
490            WHERE documents_fts MATCH ?1
491              AND d.active = 1
492            ORDER BY score
493            LIMIT ?2
494            "
495        };
496
497        let mut stmt = self.conn.prepare(sql)?;
498
499        let results: Vec<SearchResult> = if let Some(coll) = collection {
500            stmt.query_map(params![query, coll, limit as i64], |row| {
501                let collection_name: String = row.get(0)?;
502                let path: String = row.get(1)?;
503                let title: String = row.get(2)?;
504                let hash: String = row.get(3)?;
505                let modified_at: String = row.get(4)?;
506                let score: f64 = row.get(5)?;
507                let body_length: i64 = row.get(6)?;
508                let body_length = body_length as usize;
509
510                Ok(SearchResult {
511                    doc: DocumentResult {
512                        filepath: format!("qmd://{collection_name}/{path}"),
513                        display_path: format!("{collection_name}/{path}"),
514                        title,
515                        context: None,
516                        hash: hash.clone(),
517                        docid: Self::get_docid(&hash),
518                        collection_name,
519                        path,
520                        modified_at,
521                        body_length,
522                        body: None,
523                    },
524                    score: -score, // BM25 returns negative scores, higher is better.
525                    source: SearchSource::Fts,
526                    chunk_pos: None,
527                })
528            })?
529            .collect::<std::result::Result<Vec<_>, _>>()?
530        } else {
531            stmt.query_map(params![query, limit as i64], |row| {
532                let collection_name: String = row.get(0)?;
533                let path: String = row.get(1)?;
534                let title: String = row.get(2)?;
535                let hash: String = row.get(3)?;
536                let modified_at: String = row.get(4)?;
537                let score: f64 = row.get(5)?;
538                let body_length: i64 = row.get(6)?;
539                let body_length = body_length as usize;
540
541                Ok(SearchResult {
542                    doc: DocumentResult {
543                        filepath: format!("qmd://{collection_name}/{path}"),
544                        display_path: format!("{collection_name}/{path}"),
545                        title,
546                        context: None,
547                        hash: hash.clone(),
548                        docid: Self::get_docid(&hash),
549                        collection_name,
550                        path,
551                        modified_at,
552                        body_length,
553                        body: None,
554                    },
555                    score: -score,
556                    source: SearchSource::Fts,
557                    chunk_pos: None,
558                })
559            })?
560            .collect::<std::result::Result<Vec<_>, _>>()?
561        };
562
563        // Add context to results.
564        let results_with_context: Vec<SearchResult> = results
565            .into_iter()
566            .map(|mut r| {
567                r.doc.context =
568                    find_context_for_path(&r.doc.collection_name, &r.doc.path).unwrap_or(None);
569                r
570            })
571            .collect();
572
573        Ok(results_with_context)
574    }
575
576    /// Get document by collection and path.
577    pub fn get_document(&self, collection: &str, path: &str) -> Result<Option<DocumentResult>> {
578        let result = self
579            .conn
580            .query_row(
581                r"
582                SELECT
583                    d.title,
584                    d.hash,
585                    d.modified_at,
586                    c.doc,
587                    LENGTH(c.doc) as body_length
588                FROM documents d
589                JOIN content c ON c.hash = d.hash
590                WHERE d.collection = ?1 AND d.path = ?2 AND d.active = 1
591                ",
592                params![collection, path],
593                |row| {
594                    let title: String = row.get(0)?;
595                    let hash: String = row.get(1)?;
596                    let modified_at: String = row.get(2)?;
597                    let body: String = row.get(3)?;
598                    let body_length: i64 = row.get(4)?;
599                    let body_length = body_length as usize;
600
601                    Ok(DocumentResult {
602                        filepath: format!("qmd://{collection}/{path}"),
603                        display_path: format!("{collection}/{path}"),
604                        title,
605                        context: None,
606                        hash: hash.clone(),
607                        docid: Self::get_docid(&hash),
608                        collection_name: collection.to_string(),
609                        path: path.to_string(),
610                        modified_at,
611                        body_length,
612                        body: Some(body),
613                    })
614                },
615            )
616            .optional()?;
617
618        // Add context if document found.
619        let result = result.map(|mut doc| {
620            doc.context = find_context_for_path(collection, path).unwrap_or(None);
621            doc
622        });
623
624        Ok(result)
625    }
626
627    /// Get document by docid (first 6 chars of hash).
628    pub fn find_document_by_docid(&self, docid: &str) -> Result<Option<(String, String)>> {
629        let clean_docid = docid.trim_start_matches('#');
630        let result = self
631            .conn
632            .query_row(
633                r"
634                SELECT d.collection, d.path
635                FROM documents d
636                WHERE d.hash LIKE ?1 || '%' AND d.active = 1
637                LIMIT 1
638                ",
639                params![clean_docid],
640                |row| Ok((row.get(0)?, row.get(1)?)),
641            )
642            .optional()?;
643        Ok(result)
644    }
645
646    /// List collections with stats from database.
647    pub fn list_collections(&self) -> Result<Vec<CollectionInfo>> {
648        let yaml_collections = yaml_list_collections()?;
649
650        let mut collections = Vec::new();
651
652        for coll in yaml_collections {
653            let stats: (i64, Option<String>) = self
654                .conn
655                .query_row(
656                    r"
657                    SELECT COUNT(*) as count, MAX(modified_at) as last_modified
658                    FROM documents
659                    WHERE collection = ?1 AND active = 1
660                    ",
661                    params![coll.name],
662                    |row| Ok((row.get(0)?, row.get(1)?)),
663                )
664                .unwrap_or((0, None));
665
666            collections.push(CollectionInfo {
667                name: coll.name,
668                pwd: coll.path,
669                glob_pattern: coll.pattern,
670                active_count: stats.0 as usize,
671                last_modified: stats.1,
672            });
673        }
674
675        Ok(collections)
676    }
677
678    /// Get index status.
679    pub fn get_status(&self) -> Result<IndexStatus> {
680        let total_documents: i64 = self.conn.query_row(
681            "SELECT COUNT(*) FROM documents WHERE active = 1",
682            [],
683            |row| row.get(0),
684        )?;
685        let total_documents = total_documents as usize;
686
687        let needs_embedding: i64 = self.conn.query_row(
688            r"
689            SELECT COUNT(DISTINCT d.hash)
690            FROM documents d
691            LEFT JOIN content_vectors v ON d.hash = v.hash AND v.seq = 0
692            WHERE d.active = 1 AND v.hash IS NULL
693            ",
694            [],
695            |row| row.get(0),
696        )?;
697        let needs_embedding = needs_embedding as usize;
698
699        let has_vector_index: bool = self
700            .conn
701            .query_row(
702                "SELECT 1 FROM sqlite_master WHERE type='table' AND name='vectors_vec'",
703                [],
704                |_| Ok(true),
705            )
706            .unwrap_or(false);
707
708        let collections = self.list_collections()?;
709
710        Ok(IndexStatus {
711            total_documents,
712            needs_embedding,
713            has_vector_index,
714            collections,
715        })
716    }
717
718    /// Remove a collection and its documents from the database.
719    pub fn remove_collection_documents(&self, name: &str) -> Result<(usize, usize)> {
720        // Get count before deletion.
721        let doc_count: i64 = self.conn.query_row(
722            "SELECT COUNT(*) FROM documents WHERE collection = ?1",
723            params![name],
724            |row| row.get(0),
725        )?;
726        let doc_count = doc_count as usize;
727
728        // Delete documents.
729        self.conn
730            .execute("DELETE FROM documents WHERE collection = ?1", params![name])?;
731
732        // Cleanup orphaned content.
733        let cleaned = self.cleanup_orphaned_content()?;
734
735        Ok((doc_count, cleaned))
736    }
737
738    /// Rename collection in database.
739    pub fn rename_collection_documents(&self, old_name: &str, new_name: &str) -> Result<()> {
740        self.conn.execute(
741            "UPDATE documents SET collection = ?1 WHERE collection = ?2",
742            params![new_name, old_name],
743        )?;
744        Ok(())
745    }
746
747    /// Cleanup orphaned content (not referenced by any active document).
748    pub fn cleanup_orphaned_content(&self) -> Result<usize> {
749        let changes = self.conn.execute(
750            "DELETE FROM content WHERE hash NOT IN (SELECT DISTINCT hash FROM documents WHERE active = 1)",
751            [],
752        )?;
753        Ok(changes)
754    }
755
756    /// Cleanup orphaned vectors.
757    pub fn cleanup_orphaned_vectors(&self) -> Result<usize> {
758        let changes = self.conn.execute(
759            r"
760            DELETE FROM content_vectors
761            WHERE hash NOT IN (SELECT DISTINCT hash FROM documents WHERE active = 1)
762            ",
763            [],
764        )?;
765        Ok(changes)
766    }
767
768    /// Delete inactive documents.
769    pub fn delete_inactive_documents(&self) -> Result<usize> {
770        let changes = self
771            .conn
772            .execute("DELETE FROM documents WHERE active = 0", [])?;
773        Ok(changes)
774    }
775
776    /// Clear LLM cache.
777    pub fn clear_cache(&self) -> Result<usize> {
778        let changes = self.conn.execute("DELETE FROM llm_cache", [])?;
779        Ok(changes)
780    }
781
782    /// Vacuum database.
783    pub fn vacuum(&self) -> Result<()> {
784        self.conn.execute("VACUUM", [])?;
785        Ok(())
786    }
787
788    /// Ensure the vector table exists with the correct dimensions.
789    pub fn ensure_vector_table(&self, _dimensions: usize) -> Result<()> {
790        // Create vectors_vec table for storing embeddings
791        self.conn.execute(
792            r"
793                CREATE TABLE IF NOT EXISTS vectors_vec (
794                    hash_seq TEXT PRIMARY KEY,
795                    embedding BLOB NOT NULL
796                )
797                ",
798            [],
799        )?;
800        Ok(())
801    }
802
803    /// Insert an embedding for a content hash.
804    pub fn insert_embedding(
805        &self,
806        hash: &str,
807        seq: usize,
808        pos: usize,
809        embedding: &[f32],
810        model: &str,
811        embedded_at: &str,
812    ) -> Result<()> {
813        // Insert metadata
814        self.conn.execute(
815            r"
816            INSERT OR REPLACE INTO content_vectors (hash, seq, pos, model, embedded_at)
817            VALUES (?1, ?2, ?3, ?4, ?5)
818            ",
819            params![hash, seq as i64, pos as i64, model, embedded_at],
820        )?;
821
822        // Insert vector data
823        let hash_seq = format!("{hash}_{seq}");
824        let embedding_bytes: Vec<u8> = embedding.iter().flat_map(|f| f.to_le_bytes()).collect();
825
826        self.conn.execute(
827            "INSERT OR REPLACE INTO vectors_vec (hash_seq, embedding) VALUES (?1, ?2)",
828            params![hash_seq, embedding_bytes],
829        )?;
830
831        Ok(())
832    }
833
834    /// Get hashes that need embedding.
835    pub fn get_hashes_needing_embedding(&self) -> Result<Vec<(String, String, String)>> {
836        let mut stmt = self.conn.prepare(
837            r"
838            SELECT DISTINCT d.hash, d.path, c.doc
839            FROM documents d
840            JOIN content c ON c.hash = d.hash
841            LEFT JOIN content_vectors v ON d.hash = v.hash AND v.seq = 0
842            WHERE d.active = 1 AND v.hash IS NULL
843            ",
844        )?;
845
846        let results = stmt
847            .query_map([], |row| Ok((row.get(0)?, row.get(1)?, row.get(2)?)))?
848            .collect::<std::result::Result<Vec<_>, _>>()?;
849
850        Ok(results)
851    }
852
853    /// Get embedding for a hash.
854    pub fn get_embedding(&self, hash: &str, seq: usize) -> Result<Option<Vec<f32>>> {
855        let hash_seq = format!("{hash}_{seq}");
856        let result: Option<Vec<u8>> = self
857            .conn
858            .query_row(
859                "SELECT embedding FROM vectors_vec WHERE hash_seq = ?1",
860                params![hash_seq],
861                |row| row.get(0),
862            )
863            .optional()?;
864
865        Ok(result.map(|bytes| {
866            bytes
867                .chunks_exact(4)
868                .map(|chunk| f32::from_le_bytes([chunk[0], chunk[1], chunk[2], chunk[3]]))
869                .collect()
870        }))
871    }
872
873    /// Vector similarity search.
874    pub fn search_vec(
875        &self,
876        query_embedding: &[f32],
877        limit: usize,
878        collection: Option<&str>,
879    ) -> Result<Vec<SearchResult>> {
880        // Get all embeddings and compute similarity
881        let sql = if collection.is_some() {
882            r"
883            SELECT DISTINCT
884                d.collection,
885                d.path,
886                d.title,
887                d.hash,
888                d.modified_at,
889                LENGTH(c.doc) as body_length,
890                v.hash_seq
891            FROM documents d
892            JOIN content c ON c.hash = d.hash
893            JOIN vectors_vec v ON v.hash_seq = d.hash || '_0'
894            WHERE d.active = 1 AND d.collection = ?1
895            "
896        } else {
897            r"
898            SELECT DISTINCT
899                d.collection,
900                d.path,
901                d.title,
902                d.hash,
903                d.modified_at,
904                LENGTH(c.doc) as body_length,
905                v.hash_seq
906            FROM documents d
907            JOIN content c ON c.hash = d.hash
908            JOIN vectors_vec v ON v.hash_seq = d.hash || '_0'
909            WHERE d.active = 1
910            "
911        };
912
913        let mut stmt = self.conn.prepare(sql)?;
914
915        let rows: Vec<(String, String, String, String, String, usize, String)> =
916            if let Some(coll) = collection {
917                stmt.query_map(params![coll], |row| {
918                    let body_length: i64 = row.get(5)?;
919                    Ok((
920                        row.get(0)?,
921                        row.get(1)?,
922                        row.get(2)?,
923                        row.get(3)?,
924                        row.get(4)?,
925                        body_length as usize,
926                        row.get(6)?,
927                    ))
928                })?
929                .collect::<std::result::Result<Vec<_>, _>>()?
930            } else {
931                stmt.query_map([], |row| {
932                    let body_length: i64 = row.get(5)?;
933                    Ok((
934                        row.get(0)?,
935                        row.get(1)?,
936                        row.get(2)?,
937                        row.get(3)?,
938                        row.get(4)?,
939                        body_length as usize,
940                        row.get(6)?,
941                    ))
942                })?
943                .collect::<std::result::Result<Vec<_>, _>>()?
944            };
945
946        // Compute similarities
947        let mut results: Vec<SearchResult> = Vec::new();
948
949        for (collection_name, path, title, hash, modified_at, body_length, _hash_seq) in rows {
950            if let Some(doc_embedding) = self.get_embedding(&hash, 0)? {
951                let similarity = crate::llm::cosine_similarity(query_embedding, &doc_embedding);
952
953                results.push(SearchResult {
954                    doc: DocumentResult {
955                        filepath: format!("qmd://{collection_name}/{path}"),
956                        display_path: format!("{collection_name}/{path}"),
957                        title,
958                        context: None,
959                        hash: hash.clone(),
960                        docid: Self::get_docid(&hash),
961                        collection_name: collection_name.clone(),
962                        path: path.clone(),
963                        modified_at,
964                        body_length,
965                        body: None,
966                    },
967                    score: f64::from(similarity),
968                    source: SearchSource::Vec,
969                    chunk_pos: Some(0), // First chunk (chunk position tracking)
970                });
971            }
972        }
973
974        // Sort by similarity (descending) and limit
975        results.sort_by(|a, b| {
976            b.score
977                .partial_cmp(&a.score)
978                .unwrap_or(std::cmp::Ordering::Equal)
979        });
980        results.truncate(limit);
981
982        // Add context
983        let results_with_context: Vec<SearchResult> = results
984            .into_iter()
985            .map(|mut r| {
986                r.doc.context =
987                    find_context_for_path(&r.doc.collection_name, &r.doc.path).unwrap_or(None);
988                r
989            })
990            .collect();
991
992        Ok(results_with_context)
993    }
994
995    /// Clear all embeddings.
996    pub fn clear_embeddings(&self) -> Result<usize> {
997        let changes1 = self.conn.execute("DELETE FROM content_vectors", [])?;
998        let _ = self.conn.execute("DELETE FROM vectors_vec", []);
999        Ok(changes1)
1000    }
1001
1002    /// List files in a collection.
1003    pub fn list_files(
1004        &self,
1005        collection: &str,
1006        path_prefix: Option<&str>,
1007    ) -> Result<Vec<(String, String, String, usize)>> {
1008        let mut stmt;
1009        let files: Vec<(String, String, String, usize)> = if let Some(prefix) = path_prefix {
1010            let prefix_pattern = format!("{prefix}%");
1011            stmt = self.conn.prepare(
1012                r"
1013                SELECT d.path, d.title, d.modified_at, LENGTH(c.doc) as size
1014                FROM documents d
1015                JOIN content c ON d.hash = c.hash
1016                WHERE d.collection = ?1 AND d.path LIKE ?2 AND d.active = 1
1017                ORDER BY d.path
1018                ",
1019            )?;
1020            stmt.query_map(params![collection, prefix_pattern], |row| {
1021                let size: i64 = row.get(3)?;
1022                Ok((row.get(0)?, row.get(1)?, row.get(2)?, size as usize))
1023            })?
1024            .collect::<std::result::Result<Vec<_>, _>>()?
1025        } else {
1026            stmt = self.conn.prepare(
1027                r"
1028                SELECT d.path, d.title, d.modified_at, LENGTH(c.doc) as size
1029                FROM documents d
1030                JOIN content c ON d.hash = c.hash
1031                WHERE d.collection = ?1 AND d.active = 1
1032                ORDER BY d.path
1033                ",
1034            )?;
1035            stmt.query_map(params![collection], |row| {
1036                let size: i64 = row.get(3)?;
1037                Ok((row.get(0)?, row.get(1)?, row.get(2)?, size as usize))
1038            })?
1039            .collect::<std::result::Result<Vec<_>, _>>()?
1040        };
1041
1042        Ok(files)
1043    }
1044
1045    /// Get index health information.
1046    pub fn get_index_health(&self) -> Result<crate::llm::IndexHealth> {
1047        // Total documents
1048        let total_docs: usize = self.conn.query_row(
1049            "SELECT COUNT(*) FROM documents WHERE active = 1",
1050            [],
1051            |row| row.get::<_, i64>(0).map(|v| v as usize),
1052        )?;
1053
1054        // Hashes needing embedding
1055        let needs_embedding: usize = self.conn.query_row(
1056            r"
1057                SELECT COUNT(DISTINCT d.hash)
1058                FROM documents d
1059                LEFT JOIN content_vectors cv ON d.hash = cv.hash
1060                WHERE d.active = 1 AND cv.hash IS NULL
1061                ",
1062            [],
1063            |row| row.get::<_, i64>(0).map(|v| v as usize),
1064        )?;
1065
1066        // Days since last update
1067        let days_stale: Option<u64> = self
1068            .conn
1069            .query_row(
1070                "SELECT MAX(modified_at) FROM documents WHERE active = 1",
1071                [],
1072                |row| row.get::<_, Option<String>>(0),
1073            )
1074            .ok()
1075            .flatten()
1076            .and_then(|ts| {
1077                chrono::DateTime::parse_from_rfc3339(&ts).ok().map(|dt| {
1078                    let now = chrono::Utc::now();
1079                    let duration = now.signed_duration_since(dt);
1080                    duration.num_days().max(0) as u64
1081                })
1082            });
1083
1084        Ok(crate::llm::IndexHealth {
1085            needs_embedding,
1086            total_docs,
1087            days_stale,
1088        })
1089    }
1090
1091    /// Check index health and print warnings if needed.
1092    pub fn check_and_warn_health(&self) {
1093        if let Ok(health) = self.get_index_health()
1094            && let Some(msg) = health.warning_message()
1095        {
1096            eprintln!("{}", colored::Colorize::yellow(msg.as_str()));
1097        }
1098    }
1099
1100    /// Get total document count.
1101    pub fn get_document_count(&self) -> Result<usize> {
1102        let count: i64 = self.conn.query_row(
1103            "SELECT COUNT(*) FROM documents WHERE active = 1",
1104            [],
1105            |row| row.get(0),
1106        )?;
1107        Ok(count as usize)
1108    }
1109
1110    /// Get total unique hash count.
1111    pub fn get_unique_hash_count(&self) -> Result<usize> {
1112        let count: i64 = self.conn.query_row(
1113            "SELECT COUNT(DISTINCT hash) FROM documents WHERE active = 1",
1114            [],
1115            |row| row.get(0),
1116        )?;
1117        Ok(count as usize)
1118    }
1119
1120    /// Get embedded hash count.
1121    pub fn get_embedded_hash_count(&self) -> Result<usize> {
1122        let count: i64 = self.conn.query_row(
1123            "SELECT COUNT(DISTINCT hash) FROM content_vectors",
1124            [],
1125            |row| row.get(0),
1126        )?;
1127        Ok(count as usize)
1128    }
1129}
1130
1131/// Check if a path should be excluded from indexing.
1132#[must_use]
1133pub fn should_exclude(path: &Path) -> bool {
1134    for component in path.components() {
1135        if let std::path::Component::Normal(name) = component {
1136            let name_str = name.to_string_lossy();
1137            if name_str.starts_with('.') || EXCLUDE_DIRS.contains(&name_str.as_ref()) {
1138                return true;
1139            }
1140        }
1141    }
1142    false
1143}
1144
/// Check if a string looks like a docid.
///
/// After removing any leading `#` characters, a docid is exactly six ASCII
/// hexadecimal digits.
#[must_use]
pub fn is_docid(s: &str) -> bool {
    let digits = s.trim_start_matches('#');
    if digits.len() != 6 {
        return false;
    }
    // Six bytes that are all ASCII hex digits implies six ASCII characters.
    digits.bytes().all(|b| b.is_ascii_hexdigit())
}
1151
1152/// Parse a virtual path like "<qmd://collection/path>".
1153#[must_use]
1154pub fn parse_virtual_path(path: &str) -> Option<(String, String)> {
1155    let normalized = normalize_virtual_path(path);
1156    let stripped = normalized.strip_prefix("qmd://")?;
1157    let mut parts = stripped.splitn(2, '/');
1158    let collection = parts.next()?.to_string();
1159    let file_path = parts.next().unwrap_or("").to_string();
1160    Some((collection, file_path))
1161}
1162
/// Build a virtual path from collection and path.
///
/// The result has the form `qmd://<collection>/<path>`; both parts are
/// inserted verbatim.
#[must_use]
pub fn build_virtual_path(collection: &str, path: &str) -> String {
    ["qmd://", collection, "/", path].concat()
}
1168
/// Check if a path is a virtual path.
///
/// After trimming surrounding whitespace, a path counts as virtual when it
/// starts with `qmd:` or with `//`.
#[must_use]
pub fn is_virtual_path(path: &str) -> bool {
    let candidate = path.trim();
    ["qmd:", "//"]
        .iter()
        .any(|prefix| candidate.starts_with(*prefix))
}
1175
/// Normalize virtual path format.
///
/// Surrounding whitespace is trimmed. Inputs starting with `qmd:` (followed by
/// any number of slashes) or with `//` are rewritten to the canonical
/// `qmd://<rest>` form, where `<rest>` has no leading slashes. Any other
/// input is returned trimmed but otherwise unchanged.
#[must_use]
pub fn normalize_virtual_path(input: &str) -> String {
    let trimmed = input.trim();

    // Pick the part that follows the scheme, or bail out for non-virtual input.
    let after_scheme = match trimmed.strip_prefix("qmd:") {
        Some(tail) => tail,
        None if trimmed.starts_with("//") => trimmed,
        None => return trimmed.to_string(),
    };

    let rest = after_scheme.trim_start_matches('/');
    format!("qmd://{rest}")
}
1193
1194/// Find files similar to a query using fuzzy matching.
1195///
1196/// Uses the `SkimMatcherV2` algorithm for fuzzy string matching.
1197///
1198/// # Arguments
1199/// * `store` - Store instance
1200/// * `query` - Search query
1201/// * `max_distance` - Maximum edit distance (unused, for API compat)
1202/// * `limit` - Maximum results to return
1203pub fn find_similar_files(
1204    store: &Store,
1205    query: &str,
1206    _max_distance: usize,
1207    limit: usize,
1208) -> Result<Vec<(String, String, i64)>> {
1209    use fuzzy_matcher::FuzzyMatcher;
1210    use fuzzy_matcher::skim::SkimMatcherV2;
1211
1212    let matcher = SkimMatcherV2::default();
1213    let query_lower = query.to_lowercase();
1214
1215    // Get all active file paths
1216    let mut stmt = store.conn.prepare(
1217        r"
1218        SELECT collection, path
1219        FROM documents
1220        WHERE active = 1
1221        ",
1222    )?;
1223
1224    let files: Vec<(String, String)> = stmt
1225        .query_map([], |row| Ok((row.get(0)?, row.get(1)?)))?
1226        .filter_map(std::result::Result::ok)
1227        .collect();
1228
1229    // Score each file
1230    let mut scored: Vec<(String, String, i64)> = files
1231        .into_iter()
1232        .filter_map(|(collection, path)| {
1233            let display_path = build_virtual_path(&collection, &path);
1234            let path_lower = path.to_lowercase();
1235
1236            // Match against path
1237            matcher
1238                .fuzzy_match(&path_lower, &query_lower)
1239                .map(|score| (display_path, path, score))
1240        })
1241        .collect();
1242
1243    // Sort by score descending
1244    scored.sort_by(|a, b| b.2.cmp(&a.2));
1245    scored.truncate(limit);
1246
1247    Ok(scored)
1248}
1249
1250/// Match files using glob pattern.
1251pub fn match_files_by_glob(store: &Store, pattern: &str) -> Result<Vec<DocumentResult>> {
1252    let glob_pattern = glob::Pattern::new(pattern).map_err(|e| QmdError::Config(e.to_string()))?;
1253
1254    let mut stmt = store.conn.prepare(
1255        r"
1256        SELECT d.collection, d.path, d.title, d.hash, d.modified_at, LENGTH(c.doc)
1257        FROM documents d
1258        JOIN content c ON d.hash = c.hash
1259        WHERE d.active = 1
1260        ",
1261    )?;
1262
1263    let results: Vec<DocumentResult> = stmt
1264        .query_map([], |row| {
1265            let collection: String = row.get(0)?;
1266            let path: String = row.get(1)?;
1267            let title: String = row.get(2)?;
1268            let hash: String = row.get(3)?;
1269            let modified_at: String = row.get(4)?;
1270            let body_length: i64 = row.get(5)?;
1271
1272            Ok((collection, path, title, hash, modified_at, body_length))
1273        })?
1274        .filter_map(std::result::Result::ok)
1275        .filter(|(_, path, _, _, _, _)| glob_pattern.matches(path))
1276        .map(
1277            |(collection, path, title, hash, modified_at, body_length)| {
1278                let display_path = build_virtual_path(&collection, &path);
1279                let docid = Store::get_docid(&hash);
1280                let context = find_context_for_path(&collection, &path).ok().flatten();
1281
1282                DocumentResult {
1283                    filepath: display_path.clone(),
1284                    display_path,
1285                    title,
1286                    context,
1287                    hash,
1288                    docid,
1289                    collection_name: collection,
1290                    path,
1291                    modified_at,
1292                    body_length: body_length as usize,
1293                    body: None,
1294                }
1295            },
1296        )
1297        .collect();
1298
1299    Ok(results)
1300}
1301
#[cfg(test)]
mod path_tests {
    use super::*;

    /// Backslashes become forward slashes; already-normalized paths are unchanged.
    #[test]
    fn test_normalize_path_separators() {
        let cases = [
            (r"C:\Users\test", "C:/Users/test"),
            ("C:/Users/test", "C:/Users/test"),
            ("/home/user", "/home/user"),
        ];
        for (input, expected) in cases {
            assert_eq!(normalize_path_separators(input), expected);
        }
    }

    /// `/c/...` style paths gain a drive letter; everything else passes through.
    #[test]
    fn test_convert_git_bash_path() {
        let cases = [
            ("/c/Users/test", "C:/Users/test"),
            ("/d/Projects/app", "D:/Projects/app"),
            ("/home/user", "/home/user"),
            ("C:/Users/test", "C:/Users/test"),
        ];
        for (input, expected) in cases {
            assert_eq!(convert_git_bash_path(input), expected);
        }
    }

    /// Windows and Git Bash spellings of the same file normalize identically.
    #[test]
    fn test_normalize_filesystem_path() {
        let cases = [
            (r"C:\Users\test\file.md", "C:/Users/test/file.md"),
            ("/c/Users/test/file.md", "C:/Users/test/file.md"),
        ];
        for (input, expected) in cases {
            assert_eq!(normalize_filesystem_path(input), expected);
        }
    }

    /// Unix, Windows and Git Bash absolute paths are recognized; relative ones are not.
    #[test]
    fn test_is_absolute_path() {
        let absolute = [
            "/home/user",
            "C:/Users/test",
            r"C:\Users\test",
            "/c/Users/test",
        ];
        for path in absolute {
            assert!(is_absolute_path(path));
        }

        let relative = ["relative/path", "./local"];
        for path in relative {
            assert!(!is_absolute_path(path));
        }
    }
}
qmd/store.rs

qmd/
store.rs