agentroot_core/db/
documents.rs

1//! Document operations
2
3use super::content::docid_from_hash;
4use super::Database;
5use crate::config::virtual_path::{is_virtual_path, parse_virtual_path};
6use crate::error::Result;
7use rusqlite::params;
8use std::collections::HashMap;
9use std::path::PathBuf;
10
/// One row of the `documents` table, as loaded by `Database::find_active_document`.
#[derive(Debug, Clone)]
pub struct Document {
    /// Value of the `id` column.
    pub id: i64,
    /// Name of the collection the document belongs to.
    pub collection: String,
    /// Path of the document inside its collection.
    pub path: String,
    /// Stored document title.
    pub title: String,
    /// Content hash; used as the join key into the `content` table.
    pub hash: String,
    /// Creation timestamp, kept as the string stored in the database.
    pub created_at: String,
    /// Last-modification timestamp, kept as the string stored in the database.
    pub modified_at: String,
    /// `true` when the `active` column equals `1` (i.e. the row is not soft-deleted).
    pub active: bool,
    /// Value of the `source_type` column.
    pub source_type: String,
    /// Value of the nullable `source_uri` column.
    pub source_uri: Option<String>,
}
25
/// A document as returned by the lookup helpers, optionally carrying its body.
#[derive(Debug, Clone)]
pub struct DocumentResult {
    /// Virtual path of the form `agentroot://{collection}/{path}`.
    pub filepath: String,
    /// Short path of the form `{collection}/{path}`.
    pub display_path: String,
    /// Stored document title.
    pub title: String,
    /// Optional context string; the constructors in this file always leave it `None`.
    pub context: Option<String>,
    /// Full content hash.
    pub hash: String,
    /// Short id derived from `hash` via `docid_from_hash`.
    pub docid: String,
    /// Name of the collection the document belongs to.
    pub collection_name: String,
    /// Last-modification timestamp as stored in the database.
    pub modified_at: String,
    /// Length of the body: SQL `LENGTH(c.doc)` on the docid/fuzzy paths, the byte
    /// length of the loaded string when built from a `Document`.
    pub body_length: usize,
    /// Document content, when it was loaded.
    pub body: Option<String>,
}
40
41impl Database {
42    /// Insert new document using struct parameters
43    pub fn insert_doc(&self, doc: &DocumentInsert) -> Result<i64> {
44        self.conn.execute(
45            "INSERT INTO documents (
46                collection, path, title, hash, created_at, modified_at, active, source_type, source_uri,
47                llm_summary, llm_title, llm_keywords, llm_category, llm_intent, llm_concepts,
48                llm_difficulty, llm_queries, llm_metadata_generated_at, llm_model
49             )
50             VALUES (?1, ?2, ?3, ?4, ?5, ?6, 1, ?7, ?8, ?9, ?10, ?11, ?12, ?13, ?14, ?15, ?16, ?17, ?18)",
51            params![
52                doc.collection,
53                doc.path,
54                doc.title,
55                doc.hash,
56                doc.created_at,
57                doc.modified_at,
58                doc.source_type,
59                doc.source_uri,
60                doc.llm_summary,
61                doc.llm_title,
62                doc.llm_keywords,
63                doc.llm_category,
64                doc.llm_intent,
65                doc.llm_concepts,
66                doc.llm_difficulty,
67                doc.llm_queries,
68                doc.llm_metadata_generated_at,
69                doc.llm_model,
70            ],
71        )?;
72        Ok(self.conn.last_insert_rowid())
73    }
74
75    /// Insert new document (legacy method)
76    #[allow(clippy::too_many_arguments)]
77    pub fn insert_document(
78        &self,
79        collection: &str,
80        path: &str,
81        title: &str,
82        hash: &str,
83        created_at: &str,
84        modified_at: &str,
85        source_type: &str,
86        source_uri: Option<&str>,
87    ) -> Result<i64> {
88        let doc = DocumentInsert {
89            collection,
90            path,
91            title,
92            hash,
93            created_at,
94            modified_at,
95            source_type,
96            source_uri,
97            llm_summary: None,
98            llm_title: None,
99            llm_keywords: None,
100            llm_category: None,
101            llm_intent: None,
102            llm_concepts: None,
103            llm_difficulty: None,
104            llm_queries: None,
105            llm_metadata_generated_at: None,
106            llm_model: None,
107        };
108        self.insert_doc(&doc)
109    }
110
111    /// Update existing document (new content hash)
112    pub fn update_document(
113        &self,
114        id: i64,
115        title: &str,
116        hash: &str,
117        modified_at: &str,
118    ) -> Result<()> {
119        self.conn.execute(
120            "UPDATE documents SET title = ?2, hash = ?3, modified_at = ?4 WHERE id = ?1",
121            params![id, title, hash, modified_at],
122        )?;
123        Ok(())
124    }
125
126    /// Update document title only
127    pub fn update_document_title(&self, id: i64, title: &str, modified_at: &str) -> Result<()> {
128        self.conn.execute(
129            "UPDATE documents SET title = ?2, modified_at = ?3 WHERE id = ?1",
130            params![id, title, modified_at],
131        )?;
132        Ok(())
133    }
134
135    /// Soft-delete document (set active = 0)
136    pub fn deactivate_document(&self, collection: &str, path: &str) -> Result<bool> {
137        let rows = self.conn.execute(
138            "UPDATE documents SET active = 0 WHERE collection = ?1 AND path = ?2",
139            params![collection, path],
140        )?;
141        Ok(rows > 0)
142    }
143
144    /// Find active document by collection and path
145    pub fn find_active_document(&self, collection: &str, path: &str) -> Result<Option<Document>> {
146        let result = self.conn.query_row(
147            "SELECT id, collection, path, title, hash, created_at, modified_at, active, source_type, source_uri
148             FROM documents WHERE collection = ?1 AND path = ?2 AND active = 1",
149            params![collection, path],
150            |row| {
151                Ok(Document {
152                    id: row.get(0)?,
153                    collection: row.get(1)?,
154                    path: row.get(2)?,
155                    title: row.get(3)?,
156                    hash: row.get(4)?,
157                    created_at: row.get(5)?,
158                    modified_at: row.get(6)?,
159                    active: row.get::<_, i32>(7)? == 1,
160                    source_type: row.get(8)?,
161                    source_uri: row.get(9)?,
162                })
163            },
164        );
165        match result {
166            Ok(doc) => Ok(Some(doc)),
167            Err(rusqlite::Error::QueryReturnedNoRows) => Ok(None),
168            Err(e) => Err(e.into()),
169        }
170    }
171
172    /// Get all active document paths in collection
173    pub fn get_active_document_paths(&self, collection: &str) -> Result<Vec<String>> {
174        let mut stmt = self
175            .conn
176            .prepare("SELECT path FROM documents WHERE collection = ?1 AND active = 1")?;
177        let paths = stmt
178            .query_map(params![collection], |row| row.get(0))?
179            .collect::<std::result::Result<Vec<_>, _>>()?;
180        Ok(paths)
181    }
182
183    /// Find document by docid (hash prefix)
184    pub fn find_by_docid(&self, docid: &str) -> Result<Option<DocumentResult>> {
185        let docid = docid.trim_start_matches('#');
186        let result = self.conn.query_row(
187            "SELECT d.id, d.collection, d.path, d.title, d.hash, d.modified_at,
188                    c.doc, LENGTH(c.doc)
189             FROM documents d
190             JOIN content c ON c.hash = d.hash
191             WHERE d.hash LIKE ?1 || '%' AND d.active = 1
192             LIMIT 1",
193            params![docid],
194            |row| {
195                Ok(DocumentResult {
196                    filepath: format!(
197                        "agentroot://{}/{}",
198                        row.get::<_, String>(1)?,
199                        row.get::<_, String>(2)?
200                    ),
201                    display_path: format!(
202                        "{}/{}",
203                        row.get::<_, String>(1)?,
204                        row.get::<_, String>(2)?
205                    ),
206                    title: row.get(3)?,
207                    context: None,
208                    hash: row.get(4)?,
209                    docid: docid_from_hash(&row.get::<_, String>(4)?),
210                    collection_name: row.get(1)?,
211                    modified_at: row.get(5)?,
212                    body: Some(row.get(6)?),
213                    body_length: row.get(7)?,
214                })
215            },
216        );
217        match result {
218            Ok(doc) => Ok(Some(doc)),
219            Err(rusqlite::Error::QueryReturnedNoRows) => Ok(None),
220            Err(e) => Err(e.into()),
221        }
222    }
223
224    /// Hard delete inactive documents
225    pub fn delete_inactive_documents(&self) -> Result<usize> {
226        let rows = self
227            .conn
228            .execute("DELETE FROM documents WHERE active = 0", [])?;
229        Ok(rows)
230    }
231
232    /// Multi-lookup with fallback chain
233    pub fn lookup_document(
234        &self,
235        query: &str,
236        collections: &HashMap<String, PathBuf>,
237    ) -> Result<Option<DocumentResult>> {
238        let query = query.trim();
239
240        // 1. Docid lookup
241        if query.starts_with('#')
242            || (query.len() == 6 && query.chars().all(|c| c.is_ascii_hexdigit()))
243        {
244            if let Some(doc) = self.find_by_docid(query)? {
245                return Ok(Some(doc));
246            }
247        }
248
249        // 2. Virtual path lookup
250        if is_virtual_path(query) {
251            if let Ok((collection, path)) = parse_virtual_path(query) {
252                if let Some(doc) = self.find_active_document(&collection, &path)? {
253                    return Ok(Some(self.document_to_result(&doc)?));
254                }
255            }
256        }
257
258        // 3. Absolute path with collection lookup
259        let expanded = if query.starts_with("~/") {
260            dirs::home_dir()
261                .map(|home| home.join(&query[2..]).to_string_lossy().to_string())
262                .unwrap_or_else(|| query.to_string())
263        } else {
264            query.to_string()
265        };
266
267        let abs_path = std::path::Path::new(&expanded);
268        if abs_path.is_absolute() {
269            for (coll_name, coll_path) in collections {
270                if let Ok(rel_path) = abs_path.strip_prefix(coll_path) {
271                    let path = rel_path.to_string_lossy().to_string();
272                    if let Some(doc) = self.find_active_document(coll_name, &path)? {
273                        return Ok(Some(self.document_to_result(&doc)?));
274                    }
275                }
276            }
277        }
278
279        // 4. Fuzzy matching fallback
280        let candidates = self.fuzzy_find_documents(query, 1)?;
281        Ok(candidates.into_iter().next())
282    }
283
284    /// Fuzzy matching using simple contains + length
285    pub fn fuzzy_find_documents(&self, query: &str, limit: usize) -> Result<Vec<DocumentResult>> {
286        let query_lower = query.to_lowercase();
287        let mut stmt = self.conn.prepare(
288            "SELECT d.collection, d.path, d.title, d.hash, d.modified_at, c.doc, LENGTH(c.doc)
289             FROM documents d
290             JOIN content c ON c.hash = d.hash
291             WHERE d.active = 1 AND (LOWER(d.path) LIKE '%' || ?1 || '%' OR LOWER(d.title) LIKE '%' || ?1 || '%')
292             ORDER BY LENGTH(d.path)
293             LIMIT ?2"
294        )?;
295
296        let results = stmt
297            .query_map(params![query_lower, limit as i64], |row| {
298                Ok(DocumentResult {
299                    filepath: format!(
300                        "agentroot://{}/{}",
301                        row.get::<_, String>(0)?,
302                        row.get::<_, String>(1)?
303                    ),
304                    display_path: format!(
305                        "{}/{}",
306                        row.get::<_, String>(0)?,
307                        row.get::<_, String>(1)?
308                    ),
309                    title: row.get(2)?,
310                    context: None,
311                    hash: row.get(3)?,
312                    docid: docid_from_hash(&row.get::<_, String>(3)?),
313                    collection_name: row.get(0)?,
314                    modified_at: row.get(4)?,
315                    body: Some(row.get(5)?),
316                    body_length: row.get(6)?,
317                })
318            })?
319            .collect::<std::result::Result<Vec<_>, _>>()?;
320
321        Ok(results)
322    }
323
324    fn document_to_result(&self, doc: &Document) -> Result<DocumentResult> {
325        let body = self.get_content(&doc.hash)?;
326        let body_length = body.as_ref().map(|b| b.len()).unwrap_or(0);
327
328        Ok(DocumentResult {
329            filepath: format!("agentroot://{}/{}", doc.collection, doc.path),
330            display_path: format!("{}/{}", doc.collection, doc.path),
331            title: doc.title.clone(),
332            context: None,
333            hash: doc.hash.clone(),
334            docid: docid_from_hash(&doc.hash),
335            collection_name: doc.collection.clone(),
336            modified_at: doc.modified_at.clone(),
337            body_length,
338            body,
339        })
340    }
341
342    /// Get document content by query (docid, virtual path, etc)
343    pub fn get_document(&self, query: &str) -> Result<String> {
344        let query = query.trim();
345
346        // Docid lookup
347        if query.starts_with('#')
348            || (query.len() == 6 && query.chars().all(|c| c.is_ascii_hexdigit()))
349        {
350            if let Some(doc) = self.find_by_docid(query)? {
351                return doc.body.ok_or_else(|| {
352                    crate::error::AgentRootError::DocumentNotFound(query.to_string())
353                });
354            }
355        }
356
357        // Virtual path lookup
358        if is_virtual_path(query) {
359            if let Ok((collection, path)) = parse_virtual_path(query) {
360                if let Some(doc) = self.find_active_document(&collection, &path)? {
361                    if let Some(content) = self.get_content(&doc.hash)? {
362                        return Ok(content);
363                    }
364                }
365            }
366        }
367
368        // Path prefix lookup (collection/path)
369        if query.contains('/') {
370            let parts: Vec<&str> = query.splitn(2, '/').collect();
371            if parts.len() == 2 {
372                if let Some(doc) = self.find_active_document(parts[0], parts[1])? {
373                    if let Some(content) = self.get_content(&doc.hash)? {
374                        return Ok(content);
375                    }
376                }
377            }
378        }
379
380        Err(crate::error::AgentRootError::DocumentNotFound(
381            query.to_string(),
382        ))
383    }
384
385    /// List documents by prefix
386    pub fn list_documents_by_prefix(&self, prefix: &str) -> Result<Vec<DocumentListItem>> {
387        let prefix = prefix.trim_start_matches("agentroot://");
388        let like_pattern = format!("{}%", prefix);
389
390        let mut stmt = self.conn.prepare(
391            "SELECT d.collection, d.path, d.title, d.hash
392             FROM documents d
393             WHERE d.active = 1 AND (d.collection || '/' || d.path) LIKE ?1
394             ORDER BY d.collection, d.path",
395        )?;
396
397        let results = stmt
398            .query_map(params![like_pattern], |row| {
399                Ok(DocumentListItem {
400                    path: format!("{}/{}", row.get::<_, String>(0)?, row.get::<_, String>(1)?),
401                    title: row.get(2)?,
402                    docid: docid_from_hash(&row.get::<_, String>(3)?),
403                })
404            })?
405            .collect::<std::result::Result<Vec<_>, _>>()?;
406
407        Ok(results)
408    }
409
410    /// Get multiple documents by pattern
411    pub fn get_documents_by_pattern(&self, pattern: &str) -> Result<Vec<DocumentContent>> {
412        // Handle comma-separated list of docids
413        if pattern.contains(',') {
414            let mut results = Vec::new();
415            for part in pattern.split(',') {
416                let part = part.trim();
417                if let Ok(content) = self.get_document(part) {
418                    results.push(DocumentContent {
419                        path: part.to_string(),
420                        content,
421                    });
422                }
423            }
424            return Ok(results);
425        }
426
427        // Glob pattern matching
428        let pattern = glob::Pattern::new(pattern)?;
429        let mut stmt = self.conn.prepare(
430            "SELECT d.collection, d.path, c.doc
431             FROM documents d
432             JOIN content c ON c.hash = d.hash
433             WHERE d.active = 1",
434        )?;
435
436        let results = stmt
437            .query_map([], |row| {
438                let path = format!("{}/{}", row.get::<_, String>(0)?, row.get::<_, String>(1)?);
439                Ok((path, row.get::<_, String>(2)?))
440            })?
441            .filter_map(|r| r.ok())
442            .filter(|(path, _)| pattern.matches(path))
443            .map(|(path, content)| DocumentContent { path, content })
444            .collect();
445
446        Ok(results)
447    }
448}
449
450/// Document list item (for ls command)
451#[derive(Debug, Clone, serde::Serialize)]
452pub struct DocumentListItem {
453    pub path: String,
454    pub title: String,
455    pub docid: String,
456}
457
/// A document path paired with its content (for multi-get).
#[derive(Debug, Clone)]
pub struct DocumentContent {
    /// `{collection}/{path}` for glob matches, or the caller's query text for
    /// comma-separated lookups.
    pub path: String,
    /// Full document content.
    pub content: String,
}
464
/// Borrowed parameter set for inserting one row into `documents`.
///
/// All fields borrow from the caller for `'a`; nothing is copied until the
/// values are bound to the INSERT statement.
#[derive(Debug, Clone)]
pub struct DocumentInsert<'a> {
    /// Collection name.
    pub collection: &'a str,
    /// Path of the document inside its collection.
    pub path: &'a str,
    /// Document title.
    pub title: &'a str,
    /// Content hash.
    pub hash: &'a str,
    /// Creation timestamp string.
    pub created_at: &'a str,
    /// Last-modification timestamp string.
    pub modified_at: &'a str,
    /// Source type; `DocumentInsert::new` defaults it to `"file"`.
    pub source_type: &'a str,
    /// Optional source URI.
    pub source_uri: Option<&'a str>,
    /// Optional `llm_summary` column value.
    pub llm_summary: Option<&'a str>,
    /// Optional `llm_title` column value.
    pub llm_title: Option<&'a str>,
    /// Optional `llm_keywords` column value.
    pub llm_keywords: Option<&'a str>,
    /// Optional `llm_category` column value.
    pub llm_category: Option<&'a str>,
    /// Optional `llm_intent` column value.
    pub llm_intent: Option<&'a str>,
    /// Optional `llm_concepts` column value.
    pub llm_concepts: Option<&'a str>,
    /// Optional `llm_difficulty` column value.
    pub llm_difficulty: Option<&'a str>,
    /// Optional `llm_queries` column value.
    pub llm_queries: Option<&'a str>,
    /// Optional `llm_metadata_generated_at` column value.
    pub llm_metadata_generated_at: Option<&'a str>,
    /// Optional `llm_model` column value.
    pub llm_model: Option<&'a str>,
}
487
488impl<'a> DocumentInsert<'a> {
489    /// Create new document insert parameters
490    pub fn new(
491        collection: &'a str,
492        path: &'a str,
493        title: &'a str,
494        hash: &'a str,
495        created_at: &'a str,
496        modified_at: &'a str,
497    ) -> Self {
498        Self {
499            collection,
500            path,
501            title,
502            hash,
503            created_at,
504            modified_at,
505            source_type: "file",
506            source_uri: None,
507            llm_summary: None,
508            llm_title: None,
509            llm_keywords: None,
510            llm_category: None,
511            llm_intent: None,
512            llm_concepts: None,
513            llm_difficulty: None,
514            llm_queries: None,
515            llm_metadata_generated_at: None,
516            llm_model: None,
517        }
518    }
519
520    /// Set source type
521    pub fn with_source_type(mut self, source_type: &'a str) -> Self {
522        self.source_type = source_type;
523        self
524    }
525
526    /// Set source URI
527    pub fn with_source_uri(mut self, source_uri: &'a str) -> Self {
528        self.source_uri = Some(source_uri);
529        self
530    }
531
532    /// Set LLM metadata fields from DocumentMetadata
533    pub fn with_llm_metadata(
534        mut self,
535        metadata: &'a crate::llm::DocumentMetadata,
536        _metadata_json: &'a str,
537        model_name: &'a str,
538        generated_at: &'a str,
539    ) -> Self {
540        self.llm_summary = Some(&metadata.summary);
541        self.llm_title = Some(&metadata.semantic_title);
542        self.llm_category = Some(&metadata.category);
543        self.llm_intent = Some(&metadata.intent);
544        self.llm_difficulty = Some(&metadata.difficulty);
545        self.llm_metadata_generated_at = Some(generated_at);
546        self.llm_model = Some(model_name);
547        self
548    }
549
550    /// Set LLM metadata JSON strings (pre-serialized)
551    pub fn with_llm_metadata_strings(
552        mut self,
553        summary: &'a str,
554        title: &'a str,
555        keywords: &'a str,
556        category: &'a str,
557        intent: &'a str,
558        concepts: &'a str,
559        difficulty: &'a str,
560        queries: &'a str,
561        model_name: &'a str,
562        generated_at: &'a str,
563    ) -> Self {
564        self.llm_summary = Some(summary);
565        self.llm_title = Some(title);
566        self.llm_keywords = Some(keywords);
567        self.llm_category = Some(category);
568        self.llm_intent = Some(intent);
569        self.llm_concepts = Some(concepts);
570        self.llm_difficulty = Some(difficulty);
571        self.llm_queries = Some(queries);
572        self.llm_metadata_generated_at = Some(generated_at);
573        self.llm_model = Some(model_name);
574        self
575    }
576}