manx_cli/rag/
mod.rs

1//! Local RAG (Retrieval-Augmented Generation) system for Manx
2//!
3//! Provides document indexing, semantic search, and LLM integration
4//! for enhanced documentation discovery and AI synthesis.
5
6use crate::rag::embeddings::EmbeddingModel;
7use crate::rag::indexer::Indexer;
8use anyhow::Result;
9use serde::{Deserialize, Serialize};
10use std::path::PathBuf;
11
12pub mod benchmarks;
13pub mod embeddings;
14pub mod indexer;
15pub mod llm;
16pub mod model_metadata;
17pub mod providers;
18
19/// Embedding provider types
20#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Default)]
21pub enum EmbeddingProvider {
22    #[default]
23    Hash, // Default hash-based embeddings (current implementation)
24    Onnx(String),        // Local ONNX model path
25    Ollama(String),      // Ollama model name
26    OpenAI(String),      // OpenAI model name (requires API key)
27    HuggingFace(String), // HuggingFace model name (requires API key)
28    Custom(String),      // Custom endpoint URL
29}
30
31/// Configuration for embedding generation
32#[derive(Debug, Clone, Serialize, Deserialize)]
33pub struct EmbeddingConfig {
34    pub provider: EmbeddingProvider,
35    pub dimension: usize,
36    pub model_path: Option<PathBuf>, // For local models
37    pub api_key: Option<String>,     // For API providers
38    pub endpoint: Option<String>,    // For custom endpoints
39    pub timeout_seconds: u64,
40    pub batch_size: usize,
41}
42
43impl Default for EmbeddingConfig {
44    fn default() -> Self {
45        Self {
46            provider: EmbeddingProvider::Hash,
47            dimension: 384, // Hash provider default (will be updated dynamically for others)
48            model_path: None,
49            api_key: None,
50            endpoint: None,
51            timeout_seconds: 30,
52            batch_size: 32,
53        }
54    }
55}
56
57impl EmbeddingConfig {
58    /// Update dimension from actual provider detection
59    pub async fn detect_and_update_dimension(&mut self) -> Result<()> {
60        use crate::rag::embeddings::EmbeddingModel;
61
62        let model = EmbeddingModel::new_with_config(self.clone()).await?;
63        let detected_dimension = model.get_dimension().await?;
64
65        if self.dimension != detected_dimension {
66            log::info!(
67                "Updating dimension from {} to {} for provider {:?}",
68                self.dimension,
69                detected_dimension,
70                self.provider
71            );
72            self.dimension = detected_dimension;
73        }
74
75        Ok(())
76    }
77}
78
79/// Configuration for the RAG system
80#[derive(Debug, Clone, Serialize, Deserialize)]
81pub struct RagConfig {
82    pub enabled: bool,
83    pub index_path: PathBuf,
84    pub max_results: usize,
85    pub similarity_threshold: f32,
86    pub allow_pdf_processing: bool,
87    pub embedding: EmbeddingConfig,
88}
89
90impl Default for RagConfig {
91    fn default() -> Self {
92        Self {
93            enabled: true, // Enabled by default
94            index_path: PathBuf::from("~/.cache/manx/rag_index"),
95            max_results: 10,
96            similarity_threshold: 0.6,
97            allow_pdf_processing: false, // Disabled by default for security
98            embedding: EmbeddingConfig::default(),
99        }
100    }
101}
102
103/// Document chunk for indexing
104#[derive(Debug, Clone, Serialize, Deserialize)]
105pub struct DocumentChunk {
106    pub id: String,
107    pub content: String,
108    pub source_path: PathBuf,
109    pub source_type: SourceType,
110    pub title: Option<String>,
111    pub section: Option<String>,
112    pub chunk_index: usize,
113    pub metadata: DocumentMetadata,
114}
115
116/// Type of document source
117#[derive(Debug, Clone, Serialize, Deserialize)]
118pub enum SourceType {
119    Local,
120    Remote,
121    Curated,
122    Web,
123}
124
125/// Document metadata
126#[derive(Debug, Clone, Serialize, Deserialize)]
127pub struct DocumentMetadata {
128    pub file_type: String,
129    pub size: u64,
130    pub modified: chrono::DateTime<chrono::Utc>,
131    pub tags: Vec<String>,
132    pub language: Option<String>,
133}
134
135/// Search result from RAG
136#[derive(Debug, Clone, Serialize, Deserialize)]
137pub struct RagSearchResult {
138    pub id: String,
139    pub content: String,
140    pub source_path: PathBuf,
141    pub source_type: SourceType,
142    pub title: Option<String>,
143    pub section: Option<String>,
144    pub score: f32,
145    pub metadata: DocumentMetadata,
146}
147
148/// RAG system stats
149#[derive(Debug, Serialize, Deserialize)]
150pub struct RagStats {
151    pub total_documents: usize,
152    pub total_chunks: usize,
153    pub index_size_mb: f64,
154    pub last_updated: chrono::DateTime<chrono::Utc>,
155    pub sources: Vec<String>,
156}
157
158/// Stored chunk with embedding for file-based vector storage
159#[derive(Debug, Clone, Serialize, Deserialize)]
160struct StoredChunk {
161    pub id: String,
162    pub content: String,
163    pub source_path: PathBuf,
164    pub source_type: SourceType,
165    pub title: Option<String>,
166    pub section: Option<String>,
167    pub chunk_index: usize,
168    pub metadata: DocumentMetadata,
169    pub embedding: Vec<f32>,
170}
171
172/// Local file-based RAG system
173pub struct RagSystem {
174    #[allow(dead_code)]
175    config: RagConfig,
176}
177
178impl RagSystem {
179    pub async fn new(config: RagConfig) -> Result<Self> {
180        if !config.enabled {
181            return Err(anyhow::anyhow!("RAG system is disabled"));
182        }
183
184        // Initialize the local vector storage system
185        let indexer = Indexer::new(&config)?;
186        let index_path = indexer.get_index_path();
187
188        // Create index directory if it doesn't exist
189        std::fs::create_dir_all(index_path)?;
190
191        log::info!(
192            "RAG system initialized with local vector storage at {:?}",
193            index_path
194        );
195        Ok(Self { config })
196    }
197
198    pub async fn index_document(&mut self, path: PathBuf) -> Result<usize> {
199        if !self.config.enabled {
200            return Err(anyhow::anyhow!("RAG system is disabled"));
201        }
202
203        let indexer = Indexer::new(&self.config)?;
204        let chunks = indexer.index_document(path)?;
205        let chunk_count = chunks.len();
206
207        // Store chunks in local vector storage
208        self.store_chunks_locally(&chunks).await?;
209
210        log::info!("Successfully indexed and stored {} chunks", chunk_count);
211        Ok(chunk_count)
212    }
213
214    pub async fn index_directory(&mut self, path: PathBuf) -> Result<usize> {
215        if !self.config.enabled {
216            return Err(anyhow::anyhow!("RAG system is disabled"));
217        }
218
219        let indexer = Indexer::new(&self.config)?;
220        let chunks = indexer.index_directory(path)?;
221        let chunk_count = chunks.len();
222
223        // Store chunks in local vector storage
224        self.store_chunks_locally(&chunks).await?;
225
226        log::info!(
227            "Successfully indexed and stored {} chunks from directory",
228            chunk_count
229        );
230        Ok(chunk_count)
231    }
232
233    pub async fn index_url(&mut self, url: &str) -> Result<usize> {
234        if !self.config.enabled {
235            return Err(anyhow::anyhow!("RAG system is disabled"));
236        }
237
238        log::info!("Indexing URL: {}", url);
239
240        let indexer = Indexer::new(&self.config)?;
241        let chunks = indexer.index_url(url.to_string()).await?;
242        let chunk_count = chunks.len();
243
244        // Store chunks in local vector storage
245        self.store_chunks_locally(&chunks).await?;
246
247        log::info!(
248            "Successfully indexed and stored {} chunks from URL",
249            chunk_count
250        );
251        Ok(chunk_count)
252    }
253
254    pub async fn index_url_deep(
255        &mut self,
256        url: &str,
257        max_depth: Option<u32>,
258        max_pages: Option<u32>,
259    ) -> Result<usize> {
260        if !self.config.enabled {
261            return Err(anyhow::anyhow!("RAG system is disabled"));
262        }
263
264        log::info!(
265            "Deep indexing URL: {} (depth: {:?}, pages: {:?})",
266            url,
267            max_depth,
268            max_pages
269        );
270
271        let indexer = Indexer::new(&self.config)?;
272        let chunks = indexer
273            .index_url_deep(url.to_string(), max_depth, max_pages)
274            .await?;
275        let chunk_count = chunks.len();
276
277        // Store chunks in local vector storage
278        self.store_chunks_locally(&chunks).await?;
279
280        log::info!(
281            "Successfully deep indexed and stored {} chunks from URL",
282            chunk_count
283        );
284        Ok(chunk_count)
285    }
286
287    pub async fn search(
288        &self,
289        query: &str,
290        max_results: Option<usize>,
291    ) -> Result<Vec<RagSearchResult>> {
292        if !self.config.enabled {
293            return Err(anyhow::anyhow!("RAG system is disabled"));
294        }
295
296        let limit = max_results.unwrap_or(10);
297
298        log::info!("Searching local vector storage for: '{}'", query);
299
300        // Load the embedding model for semantic search
301        let embedding_model =
302            EmbeddingModel::new_with_config(self.config.embedding.clone()).await?;
303        let query_embedding = embedding_model.embed_text(query).await?;
304
305        // Search through stored embeddings
306        let indexer = Indexer::new(&self.config)?;
307        let index_path = indexer.get_index_path();
308
309        if !index_path.exists() {
310            log::info!("No indexed documents found yet");
311            return Ok(vec![]);
312        }
313
314        let mut results = Vec::new();
315        let embedding_dir = index_path.join("embeddings");
316
317        if !embedding_dir.exists() {
318            log::info!("No embeddings directory found yet");
319            return Ok(vec![]);
320        }
321
322        // Read all embedding files
323        let entries = std::fs::read_dir(embedding_dir)?;
324
325        for entry in entries.flatten() {
326            if let Some(file_name) = entry.file_name().to_str() {
327                if file_name.ends_with(".json") {
328                    match self
329                        .load_and_score_embedding(&entry.path(), &query_embedding, &embedding_model)
330                        .await
331                    {
332                        Ok(Some(result)) => {
333                            if result.score >= self.config.similarity_threshold {
334                                results.push(result);
335                            }
336                        }
337                        Ok(None) => continue,
338                        Err(e) => {
339                            log::warn!(
340                                "Failed to process embedding file {:?}: {}",
341                                entry.path(),
342                                e
343                            );
344                            continue;
345                        }
346                    }
347                }
348            }
349        }
350
351        // Sort by similarity score (highest first)
352        results.sort_by(|a, b| {
353            b.score
354                .partial_cmp(&a.score)
355                .unwrap_or(std::cmp::Ordering::Equal)
356        });
357        results.truncate(limit);
358
359        log::info!("Found {} results from local vector storage", results.len());
360        Ok(results)
361    }
362
363    /// Load and score an embedding file against the query
364    async fn load_and_score_embedding(
365        &self,
366        embedding_path: &std::path::Path,
367        query_embedding: &[f32],
368        _embedding_model: &EmbeddingModel,
369    ) -> Result<Option<RagSearchResult>> {
370        // Read the stored chunk data
371        let content = std::fs::read_to_string(embedding_path)?;
372        let chunk_data: StoredChunk = serde_json::from_str(&content)
373            .map_err(|e| anyhow::anyhow!("Failed to parse chunk data: {}", e))?;
374
375        // Calculate similarity score
376        let score = EmbeddingModel::cosine_similarity(query_embedding, &chunk_data.embedding);
377
378        // Convert to RagSearchResult
379        Ok(Some(RagSearchResult {
380            id: chunk_data.id,
381            content: chunk_data.content,
382            source_path: chunk_data.source_path,
383            source_type: chunk_data.source_type,
384            title: chunk_data.title,
385            section: chunk_data.section,
386            score,
387            metadata: chunk_data.metadata,
388        }))
389    }
390
391    pub async fn get_stats(&self) -> Result<RagStats> {
392        if !self.config.enabled {
393            return Err(anyhow::anyhow!("RAG system is disabled"));
394        }
395
396        let indexer = Indexer::new(&self.config)?;
397        let index_path = indexer.get_index_path();
398        let embedding_dir = index_path.join("embeddings");
399
400        if !embedding_dir.exists() {
401            return Ok(RagStats {
402                total_documents: 0,
403                total_chunks: 0,
404                index_size_mb: 0.0,
405                last_updated: chrono::Utc::now(),
406                sources: vec![],
407            });
408        }
409
410        // Count chunks and calculate size
411        let mut total_chunks = 0;
412        let mut total_size = 0u64;
413        let mut sources = std::collections::HashSet::new();
414        let mut last_modified = std::time::UNIX_EPOCH;
415
416        let entries = std::fs::read_dir(&embedding_dir)?;
417        for entry in entries.flatten() {
418            if let Some(file_name) = entry.file_name().to_str() {
419                if file_name.ends_with(".json") {
420                    total_chunks += 1;
421
422                    if let Ok(metadata) = entry.metadata() {
423                        total_size += metadata.len();
424
425                        // Track most recent modification
426                        if let Ok(modified) = metadata.modified() {
427                            if modified > last_modified {
428                                last_modified = modified;
429                            }
430                        }
431                    }
432
433                    // Try to extract source info from chunk data
434                    if let Ok(content) = std::fs::read_to_string(entry.path()) {
435                        if let Ok(chunk_data) = serde_json::from_str::<StoredChunk>(&content) {
436                            if let Some(source_str) = chunk_data.source_path.to_str() {
437                                sources.insert(source_str.to_string());
438                            }
439                        }
440                    }
441                }
442            }
443        }
444
445        // Convert sources to unique document count estimate
446        let total_documents = sources.len();
447        let index_size_mb = total_size as f64 / (1024.0 * 1024.0);
448
449        let last_updated = chrono::DateTime::<chrono::Utc>::from(last_modified);
450
451        let sources_vec: Vec<String> = sources.into_iter().collect();
452
453        Ok(RagStats {
454            total_documents,
455            total_chunks,
456            index_size_mb,
457            last_updated,
458            sources: sources_vec,
459        })
460    }
461
462    pub async fn clear_index(&self) -> Result<()> {
463        if !self.config.enabled {
464            return Err(anyhow::anyhow!("RAG system is disabled"));
465        }
466
467        log::info!("Clearing local vector storage");
468
469        // Get index path and embeddings directory
470        let indexer = Indexer::new(&self.config)?;
471        let index_path = indexer.get_index_path();
472        let embedding_dir = index_path.join("embeddings");
473
474        if embedding_dir.exists() {
475            // Remove all embedding files
476            let entries = std::fs::read_dir(&embedding_dir)?;
477            let mut cleared_count = 0;
478
479            for entry in entries.flatten() {
480                if let Some(file_name) = entry.file_name().to_str() {
481                    if file_name.ends_with(".json") {
482                        if let Err(e) = std::fs::remove_file(entry.path()) {
483                            log::warn!("Failed to remove embedding file {:?}: {}", entry.path(), e);
484                        } else {
485                            cleared_count += 1;
486                        }
487                    }
488                }
489            }
490
491            log::info!(
492                "Successfully cleared {} embedding files from local vector storage",
493                cleared_count
494            );
495        } else {
496            log::info!("Local vector storage directory does not exist, nothing to clear");
497        }
498
499        Ok(())
500    }
501
502    pub async fn health_check(&self) -> Result<()> {
503        if !self.config.enabled {
504            return Err(anyhow::anyhow!("RAG system is disabled"));
505        }
506
507        log::info!("Running RAG system health check...");
508
509        // Check if embedding model can be loaded
510        let _embedding_model = EmbeddingModel::new_with_config(self.config.embedding.clone())
511            .await
512            .map_err(|e| anyhow::anyhow!("Embedding model unavailable: {}", e))?;
513        log::info!("✓ Embedding model loaded successfully");
514
515        // Check if index directory exists and is accessible
516        let indexer = Indexer::new(&self.config)?;
517        let index_path = indexer.get_index_path();
518
519        if index_path.exists() {
520            log::info!("✓ Local index directory exists: {:?}", index_path);
521
522            // Check embeddings directory
523            let embedding_dir = index_path.join("embeddings");
524            if embedding_dir.exists() {
525                // Count existing embeddings
526                match std::fs::read_dir(&embedding_dir) {
527                    Ok(entries) => {
528                        let count = entries.filter_map(|e| e.ok()).count();
529                        log::info!(
530                            "✓ Local vector storage accessible with {} embedding files",
531                            count
532                        );
533                    }
534                    Err(e) => {
535                        log::warn!(
536                            "⚠ Local vector storage directory exists but cannot read contents: {}",
537                            e
538                        );
539                    }
540                }
541            } else {
542                log::info!("✓ Local vector storage will be created when needed");
543            }
544        } else {
545            log::info!("✓ Local index directory will be created: {:?}", index_path);
546        }
547
548        // Test file system write access
549        let test_file = index_path.join(".health_check");
550        match std::fs::create_dir_all(index_path) {
551            Ok(_) => {
552                match std::fs::write(&test_file, "health_check") {
553                    Ok(_) => {
554                        log::info!("✓ File system write access confirmed");
555                        let _ = std::fs::remove_file(&test_file); // Clean up test file
556                    }
557                    Err(e) => {
558                        return Err(anyhow::anyhow!("File system write access failed: {}", e));
559                    }
560                }
561            }
562            Err(e) => {
563                return Err(anyhow::anyhow!("Cannot create index directory: {}", e));
564            }
565        }
566
567        log::info!("RAG system health check: All systems operational");
568        Ok(())
569    }
570
571    /// Store document chunks in local file-based vector storage
572    async fn store_chunks_locally(&self, chunks: &[DocumentChunk]) -> Result<()> {
573        use uuid::Uuid;
574
575        if chunks.is_empty() {
576            log::info!("No chunks to store locally");
577            return Ok(());
578        }
579
580        log::info!("Storing {} chunks in local vector storage", chunks.len());
581
582        // Initialize embedding model
583        let embedding_model =
584            EmbeddingModel::new_with_config(self.config.embedding.clone()).await?;
585
586        // Get index path and create embeddings directory
587        let indexer = Indexer::new(&self.config)?;
588        let index_path = indexer.get_index_path();
589        let embedding_dir = index_path.join("embeddings");
590
591        // Create directories if they don't exist
592        std::fs::create_dir_all(&embedding_dir)?;
593
594        // Process chunks and store with embeddings
595        let mut stored_count = 0;
596
597        for chunk in chunks {
598            // Generate embedding for chunk content
599            let embedding = match embedding_model.embed_text(&chunk.content).await {
600                Ok(embedding) => embedding,
601                Err(e) => {
602                    log::warn!("Failed to generate embedding for chunk {}: {}", chunk.id, e);
603                    continue;
604                }
605            };
606
607            // Create stored chunk with embedding
608            let stored_chunk = StoredChunk {
609                id: chunk.id.clone(),
610                content: chunk.content.clone(),
611                source_path: chunk.source_path.clone(),
612                source_type: chunk.source_type.clone(),
613                title: chunk.title.clone(),
614                section: chunk.section.clone(),
615                chunk_index: chunk.chunk_index,
616                metadata: chunk.metadata.clone(),
617                embedding,
618            };
619
620            // Save to JSON file
621            let file_id = Uuid::new_v4().to_string();
622            let file_path = embedding_dir.join(format!("{}.json", file_id));
623
624            let json_content = serde_json::to_string_pretty(&stored_chunk)?;
625            std::fs::write(&file_path, json_content)?;
626
627            stored_count += 1;
628            log::debug!("Stored chunk {} to {:?}", chunk.id, file_path);
629        }
630
631        log::info!(
632            "Successfully stored {} chunks in local vector storage",
633            stored_count
634        );
635        Ok(())
636    }
637}