// manx_cli/rag/mod.rs

1//! Local RAG (Retrieval-Augmented Generation) system for Manx
2//!
3//! Provides document indexing, semantic search, and LLM integration
4//! for enhanced documentation discovery and AI synthesis.
5
use crate::rag::embeddings::EmbeddingModel;
use crate::rag::indexer::Indexer;
use crate::rag::llm::LlmClient;
use crate::rag::search_engine::SmartSearchEngine;
use anyhow::Result;
use docrawl::{crawl, Config as DocrawlConfig, CrawlConfig};
use indicatif::{ProgressBar, ProgressStyle};
use serde::{Deserialize, Serialize};
use std::path::PathBuf;
use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering};
use walkdir::WalkDir;
// gag disabled: let docrawl manage its own spinner.
// NOTE: a stray `#[cfg(unix)]` previously sat here with nothing but comments
// under it, so it attached to the following `use serde::...` item and
// silently cfg-gated the serde imports to unix targets. Removed.
24
25pub mod benchmarks;
26pub mod embeddings;
27pub mod indexer;
28pub mod llm;
29pub mod model_metadata;
30pub mod providers;
31pub mod query_enhancer;
32pub mod result_verifier;
33pub mod search_engine;
34
/// Embedding provider types
///
/// Selects which backend generates embedding vectors. The `String` payload
/// carries the provider-specific identifier (model path, model name, or
/// endpoint URL), as noted on each variant.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Default)]
pub enum EmbeddingProvider {
    #[default]
    Hash, // Default hash-based embeddings (current implementation)
    Onnx(String),        // Local ONNX model path
    Ollama(String),      // Ollama model name
    OpenAI(String),      // OpenAI model name (requires API key)
    HuggingFace(String), // HuggingFace model name (requires API key)
    Custom(String),      // Custom endpoint URL
}
46
/// Configuration for embedding generation
///
/// `dimension` starts at the hash-provider default (384) and may be corrected
/// at runtime via [`EmbeddingConfig::detect_and_update_dimension`] for other
/// providers.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct EmbeddingConfig {
    /// Which embedding backend to use.
    pub provider: EmbeddingProvider,
    /// Embedding vector dimension (provider-dependent).
    pub dimension: usize,
    pub model_path: Option<PathBuf>, // For local models
    pub api_key: Option<String>,     // For API providers
    pub endpoint: Option<String>,    // For custom endpoints
    /// Timeout in seconds — presumably for network-backed providers; confirm at call sites.
    pub timeout_seconds: u64,
    /// Number of texts embedded per batch.
    pub batch_size: usize,
}
58
59impl Default for EmbeddingConfig {
60    fn default() -> Self {
61        Self {
62            provider: EmbeddingProvider::Hash,
63            dimension: 384, // Hash provider default (will be updated dynamically for others)
64            model_path: None,
65            api_key: None,
66            endpoint: None,
67            timeout_seconds: 30,
68            batch_size: 32,
69        }
70    }
71}
72
73impl EmbeddingConfig {
74    /// Update dimension from actual provider detection
75    pub async fn detect_and_update_dimension(&mut self) -> Result<()> {
76        use crate::rag::embeddings::EmbeddingModel;
77
78        let model = EmbeddingModel::new_with_config(self.clone()).await?;
79        let detected_dimension = model.get_dimension().await?;
80
81        if self.dimension != detected_dimension {
82            log::info!(
83                "Updating dimension from {} to {} for provider {:?}",
84                self.dimension,
85                detected_dimension,
86                self.provider
87            );
88            self.dimension = detected_dimension;
89        }
90
91        Ok(())
92    }
93}
94
95/// Security level for code processing
96#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
97pub enum CodeSecurityLevel {
98    /// Strict: Reject files with any suspicious patterns
99    Strict,
100    /// Moderate: Log warnings but allow most files
101    Moderate,
102    /// Permissive: Minimal security checks
103    Permissive,
104}
105
106impl Default for CodeSecurityLevel {
107    fn default() -> Self {
108        Self::Moderate
109    }
110}
111
/// Configuration for smart search capabilities
///
/// Toggles and thresholds for the multi-stage search pipeline implemented by
/// the `search_engine` module; LLM-backed options only take effect when an
/// LLM client is configured (presumably — confirm in `SmartSearchEngine`).
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SmartSearchConfig {
    pub prefer_semantic: bool,            // Use ONNX over hash when available
    pub enable_query_enhancement: bool,   // Use LLM for query expansion
    pub enable_result_verification: bool, // Use LLM for relevance checking
    pub min_confidence_score: f32,        // Minimum relevance threshold
    pub max_query_variations: usize,      // Number of query variations to try
    pub enable_multi_stage: bool,         // Enable multi-stage search strategy
    pub adaptive_chunking: bool,          // Use smart code-aware chunking
}
123
124impl Default for SmartSearchConfig {
125    fn default() -> Self {
126        Self {
127            prefer_semantic: true,
128            enable_query_enhancement: true,
129            enable_result_verification: true,
130            min_confidence_score: 0.7,
131            max_query_variations: 3,
132            enable_multi_stage: true,
133            adaptive_chunking: true,
134        }
135    }
136}
137
/// Configuration for the RAG system
///
/// Top-level knobs: storage location, result limits, file-processing security
/// policy, plus nested embedding and smart-search configuration.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct RagConfig {
    /// Master switch; most entry points return an error when this is false.
    pub enabled: bool,
    /// Directory holding the on-disk vector index.
    pub index_path: PathBuf,
    /// Maximum number of search results to return.
    pub max_results: usize,
    /// Minimum similarity score for a result to be included.
    pub similarity_threshold: f32,
    /// Whether PDF files may be processed (off by default for security).
    pub allow_pdf_processing: bool,
    /// Whether source-code files may be processed.
    pub allow_code_processing: bool,
    /// How aggressively suspicious code files are rejected.
    pub code_security_level: CodeSecurityLevel,
    /// Whether detected secrets are masked in indexed content.
    pub mask_secrets: bool,
    /// Files larger than this (in MB) are skipped.
    pub max_file_size_mb: u64,
    /// Embedding backend configuration.
    pub embedding: EmbeddingConfig,
    /// Smart-search pipeline configuration.
    pub smart_search: SmartSearchConfig,
}
153
154impl Default for RagConfig {
155    fn default() -> Self {
156        Self {
157            enabled: true, // Enabled by default
158            index_path: PathBuf::from("~/.cache/manx/rag_index"),
159            max_results: 10,
160            similarity_threshold: 0.6,
161            allow_pdf_processing: false, // Disabled by default for security
162            allow_code_processing: true, // Enabled by default with security checks
163            code_security_level: CodeSecurityLevel::Moderate,
164            mask_secrets: true,    // Mask secrets by default
165            max_file_size_mb: 100, // 100MB default limit
166            embedding: EmbeddingConfig::default(),
167            smart_search: SmartSearchConfig::default(),
168        }
169    }
170}
171
/// Document chunk for indexing
///
/// A slice of a source document small enough to embed as a single vector;
/// `chunk_index` records its position within the original document.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DocumentChunk {
    /// Unique identifier for this chunk.
    pub id: String,
    /// The chunk's text content.
    pub content: String,
    /// Path (or URL-derived path) of the originating document.
    pub source_path: PathBuf,
    /// Where the document came from (local, remote, curated, web).
    pub source_type: SourceType,
    /// Document title, when one could be extracted.
    pub title: Option<String>,
    /// Section heading the chunk belongs to, when known.
    pub section: Option<String>,
    /// Zero-based position of this chunk within its document — presumed; confirm in indexer.
    pub chunk_index: usize,
    /// File-level metadata shared by all chunks of the document.
    pub metadata: DocumentMetadata,
}
184
/// Type of document source
#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum SourceType {
    /// Indexed from the local filesystem.
    Local,
    /// Fetched from a remote location.
    Remote,
    /// Part of a curated documentation set.
    Curated,
    /// Obtained via web crawling.
    Web,
}
193
/// Document metadata
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DocumentMetadata {
    /// File type/extension label (e.g. "md" — presumed; confirm in indexer).
    pub file_type: String,
    /// Document size — presumably in bytes; confirm where it is populated.
    pub size: u64,
    /// Last-modified timestamp, in UTC.
    pub modified: chrono::DateTime<chrono::Utc>,
    /// Free-form tags attached to the document.
    pub tags: Vec<String>,
    /// Detected language, when known.
    pub language: Option<String>,
}
203
/// Search result from RAG
///
/// Mirrors [`DocumentChunk`] plus a relevance `score`; returned by the
/// search path rather than stored.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct RagSearchResult {
    /// Identifier of the matching chunk.
    pub id: String,
    /// Matching chunk text.
    pub content: String,
    /// Path of the originating document.
    pub source_path: PathBuf,
    /// Origin of the document.
    pub source_type: SourceType,
    /// Document title, when known.
    pub title: Option<String>,
    /// Section heading, when known.
    pub section: Option<String>,
    /// Relevance/similarity score — higher is presumably more relevant; confirm in search engine.
    pub score: f32,
    /// Position of the chunk within its document.
    pub chunk_index: usize,
    /// Metadata of the originating document.
    pub metadata: DocumentMetadata,
}
217
/// RAG system stats
///
/// Aggregate statistics about the on-disk index, suitable for status output.
#[derive(Debug, Serialize, Deserialize)]
pub struct RagStats {
    /// Number of distinct documents indexed.
    pub total_documents: usize,
    /// Number of stored chunks across all documents.
    pub total_chunks: usize,
    /// Approximate index size on disk, in megabytes.
    pub index_size_mb: f64,
    /// When the index was last modified, in UTC.
    pub last_updated: chrono::DateTime<chrono::Utc>,
    /// Source identifiers contributing to the index.
    pub sources: Vec<String>,
}
227
/// Stored chunk with embedding for file-based vector storage
///
/// The persisted form of [`DocumentChunk`]: identical fields plus the
/// precomputed `embedding` vector.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct StoredChunk {
    /// Unique identifier for this chunk.
    pub id: String,
    /// The chunk's text content.
    pub content: String,
    /// Path of the originating document.
    pub source_path: PathBuf,
    /// Origin of the document.
    pub source_type: SourceType,
    /// Document title, when known.
    pub title: Option<String>,
    /// Section heading, when known.
    pub section: Option<String>,
    /// Position of the chunk within its document.
    pub chunk_index: usize,
    /// Metadata of the originating document.
    pub metadata: DocumentMetadata,
    /// Embedding vector; length should match the configured dimension — confirm at storage site.
    pub embedding: Vec<f32>,
}
241
/// Local file-based RAG system
///
/// Owns the runtime configuration and an optional LLM client used by the
/// LLM-assisted features.
pub struct RagSystem {
    // System-wide RAG configuration (paths, limits, embedding setup).
    config: RagConfig,
    // Optional LLM client; `None` when running without LLM integration.
    llm_client: Option<LlmClient>,
}
247
248impl RagSystem {
    /// Construct a RAG system without an attached LLM client.
    ///
    /// Equivalent to [`RagSystem::new_with_llm`] with `llm_client = None`.
    pub async fn new(config: RagConfig) -> Result<Self> {
        Self::new_with_llm(config, None).await
    }
252
253    pub async fn new_with_llm(config: RagConfig, llm_client: Option<LlmClient>) -> Result<Self> {
254        if !config.enabled {
255            return Err(anyhow::anyhow!("RAG system is disabled"));
256        }
257
258        // Initialize the local vector storage system
259        let indexer = Indexer::new(&config)?;
260        let index_path = indexer.get_index_path();
261
262        // Create index directory if it doesn't exist
263        std::fs::create_dir_all(index_path)?;
264
265        log::info!(
266            "RAG system initialized with local vector storage at {:?}",
267            index_path
268        );
269        Ok(Self { config, llm_client })
270    }
271
272    pub async fn index_document(&mut self, path: PathBuf) -> Result<usize> {
273        if !self.config.enabled {
274            return Err(anyhow::anyhow!("RAG system is disabled"));
275        }
276
277        let indexer = Indexer::new(&self.config)?;
278        let chunks = indexer.index_document(path)?;
279        let chunk_count = chunks.len();
280
281        // Store chunks in local vector storage
282        self.store_chunks_locally(&chunks).await?;
283
284        log::info!("Successfully indexed and stored {} chunks", chunk_count);
285        Ok(chunk_count)
286    }
287
288    pub async fn index_directory(&mut self, path: PathBuf) -> Result<usize> {
289        if !self.config.enabled {
290            return Err(anyhow::anyhow!("RAG system is disabled"));
291        }
292
293        let indexer = Indexer::new(&self.config)?;
294        let chunks = indexer.index_directory(path)?;
295        let chunk_count = chunks.len();
296
297        // Store chunks in local vector storage
298        self.store_chunks_locally(&chunks).await?;
299
300        log::info!(
301            "Successfully indexed and stored {} chunks from directory",
302            chunk_count
303        );
304        Ok(chunk_count)
305    }
306
307    #[allow(dead_code)]
308    pub async fn index_url(&mut self, url: &str) -> Result<usize> {
309        if !self.config.enabled {
310            return Err(anyhow::anyhow!("RAG system is disabled"));
311        }
312
313        log::info!("Indexing URL: {}", url);
314
315        let indexer = Indexer::new(&self.config)?;
316        let chunks = indexer.index_url(url.to_string()).await?;
317        let chunk_count = chunks.len();
318
319        // Store chunks in local vector storage
320        self.store_chunks_locally(&chunks).await?;
321
322        log::info!(
323            "Successfully indexed and stored {} chunks from URL",
324            chunk_count
325        );
326        Ok(chunk_count)
327    }
328
    /// Deep-crawl `url` and index the result (non-streaming variant).
    ///
    /// NOTE(review): `max_pages` is only used to decide whether to crawl
    /// without a limit (`None` ⇒ crawl all); a concrete `Some(n)` page cap is
    /// currently not forwarded to the crawler — confirm this is intended.
    #[allow(dead_code)]
    pub async fn index_url_deep(
        &mut self,
        url: &str,
        max_depth: Option<u32>,
        max_pages: Option<u32>,
    ) -> Result<usize> {
        if !self.config.enabled {
            return Err(anyhow::anyhow!("RAG system is disabled"));
        }

        log::info!(
            "Deep indexing URL: {} (depth: {:?}, pages: {:?})",
            url,
            max_depth,
            max_pages
        );

        let indexer = Indexer::new(&self.config)?;
        // Convert old parameters to new docrawl-based parameters
        let crawl_all = max_pages.is_none(); // If no page limit, crawl all
        let chunks = indexer
            .index_url_deep(url.to_string(), max_depth, crawl_all)
            .await?;
        let chunk_count = chunks.len();

        // Store chunks in local vector storage
        self.store_chunks_locally(&chunks).await?;

        log::info!(
            "Successfully deep indexed and stored {} chunks from URL",
            chunk_count
        );
        Ok(chunk_count)
    }
364
    /// Streamed deep indexing: overlaps crawling and embedding using Tokio for speed
    ///
    /// Pipeline: docrawl writes one markdown file per crawled page into a temp
    /// directory; a scanner task polls that directory and feeds newly found
    /// `.md` paths over an mpsc channel to a pool of worker tasks that chunk,
    /// embed, and store them. Returns the total number of chunks stored.
    ///
    /// Special cases that bypass docrawl entirely:
    /// * `max_depth == Some(0)` and not `crawl_all` — fetch only `url` itself.
    /// * `max_depth == Some(1)` and not `crawl_all` — manual shallow crawl.
    pub async fn index_url_deep_stream(
        &self,
        url: &str,
        max_depth: Option<u32>,
        crawl_all: bool,
        embed_concurrency: Option<usize>,
        crawl_max_pages: Option<usize>,
    ) -> Result<usize> {
        use std::collections::HashSet;
        use std::sync::Arc;
        use tokio::sync::mpsc;
        use tokio::time::{interval, Duration};

        // If explicitly depth 0 and not crawl-all, do single-page fetch without the crawler
        if matches!(max_depth, Some(0)) && !crawl_all {
            eprintln!("\n🧭 Indexing single page (no crawl): {}", url);
            let embedding_model = std::sync::Arc::new(
                EmbeddingModel::new_with_config(self.config.embedding.clone()).await?,
            );
            let indexer = Indexer::new(&self.config)?;
            let chunks = indexer.index_single_url_no_crawl(url).await?;
            let total_stored =
                store_chunks_with_model_config(&self.config, &chunks, &embedding_model).await?;

            let index_path = indexer.get_index_path();
            eprintln!("\n==== Manx Index Summary ====");
            eprintln!("Mode: Single page (no crawl)");
            eprintln!("Chunks created: {}", chunks.len());
            eprintln!("Chunks stored: {}", total_stored);
            eprintln!("Index path: {}", index_path.display());
            return Ok(total_stored);
        }

        // If depth is 1 (shallow), prefer our manual shallow crawler to avoid docrawl host-scope quirks
        if matches!(max_depth, Some(1)) && !crawl_all {
            eprintln!("\n🧭 Shallow crawl (depth 1) for: {}", url);
            let embedding_model = std::sync::Arc::new(
                EmbeddingModel::new_with_config(self.config.embedding.clone()).await?,
            );
            let indexer = Indexer::new(&self.config)?;
            let chunks = indexer.index_shallow_url(url, crawl_max_pages).await?;
            let total_stored =
                store_chunks_with_model_config(&self.config, &chunks, &embedding_model).await?;

            let index_path = indexer.get_index_path();
            eprintln!("\n==== Manx Index Summary ====");
            eprintln!("Mode: Shallow crawl (depth 1)");
            eprintln!("Chunks created: {}", chunks.len());
            eprintln!("Chunks stored: {}", total_stored);
            eprintln!("Index path: {}", index_path.display());
            return Ok(total_stored);
        }

        // Crawler output lands in a unique temp directory, removed at the end.
        let temp_dir = std::env::temp_dir().join(format!("manx_crawl_{}", uuid::Uuid::new_v4()));
        std::fs::create_dir_all(&temp_dir)?;

        // Show initial status
        eprintln!("\n🌐 Starting document crawl for: {}", url);
        eprintln!("   This will: 1) Crawl pages → 2) Chunk content → 3) Create embeddings");
        log::debug!("Temp directory: {}", temp_dir.display());
        eprintln!();

        // Resolve potential redirects to get canonical host (e.g., kali.org -> www.kali.org)
        let base_url = if let Ok(resp) = reqwest::Client::new().get(url).send().await {
            resp.url().clone()
        } else {
            url::Url::parse(url)?
        };
        // Configure docrawl secondary options (use defaults)
        let crawl_config = CrawlConfig {
            base_url,
            output_dir: temp_dir.clone(),
            user_agent: "Manx/0.5.0 (Documentation Crawler)".to_string(),
            max_depth: if let Some(d) = max_depth {
                Some(d as usize)
            } else if crawl_all {
                None
            } else {
                Some(3)
            },
            silence: true, // Silence docrawl; Manx renders its own progress UI
            rate_limit_per_sec: 20, // Reasonable rate limit for documentation sites
            follow_sitemaps: true,
            concurrency: std::cmp::max(8, num_cpus::get()), // Use more threads for faster crawling
            timeout: Some(std::time::Duration::from_secs(30)),
            resume: false,
            // Additional crawler behavior configuration
            config: DocrawlConfig::default(),
        };

        // Create embedding model once
        let embedding_model = Arc::new(EmbeddingModel::new_with_config(self.config.embedding.clone()).await?);

        // Channel for discovered markdown files
        // (receiver is shared by all workers behind an async Mutex)
        let (tx, rx) = mpsc::channel::<PathBuf>(200);
        let rx = std::sync::Arc::new(tokio::sync::Mutex::new(rx));
        let crawl_max_pages = crawl_max_pages.unwrap_or(usize::MAX);

        // Counters for summary (no live prints during crawl)
        let pages_counter = Arc::new(AtomicUsize::new(0));
        let processed_pages_counter = Arc::new(AtomicUsize::new(0));
        let chunks_counter = Arc::new(AtomicUsize::new(0));

        // Track when crawl is done
        let crawl_done = Arc::new(AtomicBool::new(false));
        let crawl_done_clone = crawl_done.clone();

        // Spawn crawler; suppress its stdout to avoid competing spinner
        let crawl_handle = tokio::spawn(async move {
            let result = crawl(crawl_config).await;
            crawl_done_clone.store(true, Ordering::Relaxed);
            // Convert the error to a string to make it Send
            result.map_err(|e| e.to_string())
        });

        // Spawn scanner: discover new markdown files while crawler runs
        let temp_dir_clone = temp_dir.clone();
        let scanner_tx = tx.clone();
        let pc = pages_counter.clone();
        let crawl_done_scanner = crawl_done.clone();
        let scanner_handle = tokio::spawn(async move {
            // Give docrawl a head start before we start scanning
            tokio::time::sleep(Duration::from_secs(3)).await;

            // Start with longer interval during active crawling, speed up when crawl is done
            let mut scan_interval_ms = 1000; // Start with 1 second interval to reduce overhead
            let mut ticker = interval(Duration::from_millis(scan_interval_ms));
            let mut seen: HashSet<PathBuf> = HashSet::new();
            let mut idle_ticks = 0u32;
            let mut total_files_scanned;
            // Reduced verbosity - only show important messages
            log::debug!("Scanner: Starting to monitor directory: {}", temp_dir_clone.display());

            loop {
                ticker.tick().await;
                let mut new_found = 0usize;
                let mut current_scan_count = 0usize;

                // Re-walk the whole temp dir each tick; `seen` de-duplicates.
                for entry in WalkDir::new(&temp_dir_clone)
                    .into_iter()
                    .filter_map(|e| e.ok())
                {
                    let path = entry.path();
                    current_scan_count += 1;

                    if path.is_file() {
                        if let Some(ext) = path.extension().and_then(|s| s.to_str()) {
                            if ext == "md" {
                                let pb = path.to_path_buf();
                                if !seen.contains(&pb) {
                                    log::debug!("Scanner: Found new markdown file: {}", pb.file_name().unwrap_or_default().to_string_lossy());
                                    seen.insert(pb.clone());
                                    if scanner_tx.send(pb).await.is_err() {
                                        break;
                                    }
                                    new_found += 1;
                                    pc.fetch_add(1, Ordering::Relaxed);
                                    if seen.len() >= crawl_max_pages {
                                        break;
                                    }
                                }
                            }
                        }
                    }
                }

                total_files_scanned = current_scan_count;

                if new_found > 0 {
                    log::debug!("Scanner: Found {} new markdown files this scan (total: {})", new_found, seen.len());
                    idle_ticks = 0;
                    // Speed up scanning when we're finding files
                    if scan_interval_ms > 300 {
                        scan_interval_ms = 300;
                        ticker = interval(Duration::from_millis(scan_interval_ms));
                    }
                } else {
                    idle_ticks += 1;

                    // If crawl is done and we're idle, speed up scanning to finish quickly
                    if crawl_done_scanner.load(Ordering::Relaxed) && scan_interval_ms > 200 {
                        scan_interval_ms = 200;
                        ticker = interval(Duration::from_millis(scan_interval_ms));
                    }

                    if idle_ticks % 10 == 0 {
                        log::debug!("Scanner: No new files found for {} ticks (scanned {} total files in directory)", idle_ticks, total_files_scanned);
                    }
                }

                if seen.len() >= crawl_max_pages {
                    eprintln!("Scanner: Reached max pages limit ({})", crawl_max_pages);
                    break;
                }

                // Only exit on idle if crawl is done
                if idle_ticks > 20 && crawl_done_scanner.load(Ordering::Relaxed) {
                    log::debug!("Scanner: Crawl is done and no new files for {} ticks, exiting", idle_ticks);
                    break;
                }

                // If crawl is still running but we've been idle for a long time, keep waiting
                if idle_ticks > 100 {
                    log::debug!("Scanner: Safety exit after {} ticks of no activity", idle_ticks);
                    break;
                }
            }
            let files_found = seen.len();
            // Final count will be shown by the main thread
            drop(scanner_tx);
            files_found
        });

        // Worker pool
        // Each worker owns its own Indexer and pulls paths off the shared receiver.
        let workers = embed_concurrency.unwrap_or_else(|| std::cmp::max(4, num_cpus::get()));
        let mut joins = Vec::new();
        let config_clone = self.config.clone();
        let url_for_worker = url.to_string();
        for _ in 0..workers {
            let rx = rx.clone();
            let embedding_model = embedding_model.clone();
            let config_clone = config_clone.clone();
            let url_clone = url_for_worker.clone();
            let chunks_counter = chunks_counter.clone();
            let processed_pages_counter = processed_pages_counter.clone();
            let join = tokio::spawn(async move {
                let mut stored = 0usize;
                let idx = match Indexer::new(&config_clone) {
                    Ok(i) => i,
                    Err(_) => return 0usize,
                };
                loop {
                    // Lock only long enough to receive one path, then release.
                    let opt_path = { rx.lock().await.recv().await };
                    let Some(md_path) = opt_path else { break };
                    if let Ok(chunks) = idx.process_markdown_file(&md_path, &url_clone).await {
                        if let Ok(count) =
                            store_chunks_with_model_config(&config_clone, &chunks, &embedding_model)
                                .await
                        {
                            stored += count;
                            chunks_counter.fetch_add(count, Ordering::Relaxed);
                            // count this page as processed after storing its chunks
                            processed_pages_counter.fetch_add(1, Ordering::Relaxed);
                        }
                    }
                }
                stored
            });
            joins.push(join);
        }

        // Wait for crawl to complete first
        let crawl_result = crawl_handle.await;
        let crawled_pages = if let Ok(Ok(stats)) = &crawl_result {
            eprintln!("\n✓ Crawl completed: {} pages crawled", stats.pages);
            stats.pages
        } else if let Ok(Err(e)) = &crawl_result {
            eprintln!("\n⚠️ Crawl completed with error: {}", e);
            0
        } else {
            eprintln!("\n⚠️ Crawl status unknown");
            0
        };

        // Monitor file discovery with a proper progress spinner
        eprintln!("\n📂 Scanning for markdown files...");

        let pb = ProgressBar::new_spinner();
        pb.set_style(
            ProgressStyle::default_spinner()
                .template("{spinner:.green} {msg}")
                .unwrap()
        );
        pb.enable_steady_tick(std::time::Duration::from_millis(200));

        // Create a monitoring task that updates the progress bar
        let pages_counter_clone = pages_counter.clone();
        let crawl_done_clone = crawl_done.clone();
        let pb_clone = pb.clone();
        let start_time = std::time::Instant::now();
        let monitor_handle = tokio::spawn(async move {
            let mut last_count = 0usize;
            let mut stable_cycles = 0;

            loop {
                tokio::time::sleep(Duration::from_millis(500)).await;
                let current_count = pages_counter_clone.load(Ordering::Relaxed);
                let is_crawl_done = crawl_done_clone.load(Ordering::Relaxed);
                let elapsed = start_time.elapsed().as_secs_f32().max(0.1);
                let rate = (current_count as f32) / elapsed;

                let message = if current_count != last_count {
                    last_count = current_count;
                    stable_cycles = 0;
                    format!("Found {} files | {:.1}/s", current_count, rate)
                } else {
                    stable_cycles += 1;
                    if stable_cycles < 4 {
                        format!("Found {} files (scanning...) | {:.1}/s", current_count, rate)
                    } else {
                        format!("Found {} files (finalizing...) | {:.1}/s", current_count, rate)
                    }
                };

                pb_clone.set_message(message);

                // Stop monitoring once crawl is done and we've been stable for a bit
                if is_crawl_done && stable_cycles > 6 {
                    break;
                }
                // Safety exit after 30 seconds
                if stable_cycles > 60 {
                    break;
                }
            }
        });

        // Wait for scanner to finish
        let scanner_result = scanner_handle.await;
        let _scanner_files = scanner_result.unwrap_or(0);

        // Stop the monitor and finish the progress bar
        monitor_handle.abort();
        let final_count = pages_counter.load(Ordering::Relaxed);
        pb.finish_with_message(format!("✓ Found {} markdown files", final_count));

        // Dropping the last sender closes the channel, so workers exit their loops.
        drop(tx);

        // If scanner already printed the count, we don't need to print it again
        // but we'll use the value for logic below

        // Get final count for processing phase
        let total_pages_found = pages_counter.load(Ordering::Relaxed);

        // Show status for chunking phase
        let mut processed_so_far = processed_pages_counter.load(Ordering::Relaxed);

        // If we have pages to process, show chunking progress with a progress bar
        if total_pages_found > 0 {
            eprintln!("\n📄 Processing {} markdown files...", total_pages_found);

            // Create a progress bar for chunking
            let pb = ProgressBar::new(total_pages_found as u64);
            pb.set_style(
                ProgressStyle::default_bar()
                    .template("{spinner:.green} [{elapsed_precise}] [{bar:40.cyan/blue}] {pos}/{len} files ({percent}%) | {msg}")
                    .unwrap()
                    .progress_chars("█▉▊▋▌▍▎▏  ")
            );
            pb.set_message("Chunking... 0 chunks created".to_string());

            // Monitor chunking progress
            // (stall_counter * 200 ms ≈ 12 s with no progress before we bail out)
            let mut last_processed = processed_so_far;
            let mut stall_counter = 0;
            while processed_so_far < total_pages_found && stall_counter < 60 {
                tokio::time::sleep(Duration::from_millis(200)).await;
                processed_so_far = processed_pages_counter.load(Ordering::Relaxed);
                let chunks_so_far = chunks_counter.load(Ordering::Relaxed);

                if processed_so_far != last_processed {
                    pb.set_position(processed_so_far as u64);
                    pb.set_message(format!("{} chunks created", chunks_so_far));
                    last_processed = processed_so_far;
                    stall_counter = 0;
                } else {
                    stall_counter += 1;
                    // Update spinner even when stalled to show activity
                    pb.tick();
                }
            }

            // Ensure we show completion
            pb.set_position(processed_so_far as u64);

            if processed_so_far == total_pages_found {
                pb.finish_with_message("✓ All files processed");
            } else {
                pb.abandon_with_message("Processing incomplete - some files may have failed");
            }
        } else {
            eprintln!("\n⚠️  No markdown files found to process");
            eprintln!("   The crawler processed {} pages but docrawl generated no markdown files.", crawled_pages);
            eprintln!("   This can happen when:");
            eprintln!("   • The site uses JavaScript rendering that docrawl can't parse");
            eprintln!("   • The pages contain mostly non-text content (images, PDFs, etc.)");
            eprintln!("   • The site structure isn't compatible with the crawler");
            // Extra newline intentionally removed to satisfy clippy
            eprintln!("   Try:");
            eprintln!("   • Using a different URL that points to documentation pages");
            eprintln!("   • Indexing local files instead if you have them downloaded");
        }

        // Wait for all workers to complete
        let mut total_stored = 0usize;

        if total_pages_found > 0 {
            // Only show spinner if we had files to process
            eprintln!("\n⏳ Waiting for workers to finish...");
            let pb_final = ProgressBar::new_spinner();
            pb_final.set_style(
                ProgressStyle::default_spinner()
                    .template("{spinner:.green} {msg}")
                    .unwrap()
            );
            pb_final.set_message("Finalizing embeddings and storing to database...");
            pb_final.enable_steady_tick(std::time::Duration::from_millis(100));

            for j in joins {
                pb_final.tick();
                if let Ok(count) = j.await {
                    total_stored += count;
                }
            }

            pb_final.finish_with_message("✓ Index finalized");
        } else {
            // Just wait for workers without showing spinner
            for j in joins {
                if let Ok(count) = j.await {
                    total_stored += count;
                }
            }
        }

        // Clean up temp directory (silently)
        let _ = std::fs::remove_dir_all(&temp_dir);

        // Final summary (always show, even if no files found)
        let total_pages = pages_counter.load(Ordering::Relaxed);
        let final_processed = processed_pages_counter.load(Ordering::Relaxed);
        let final_chunks = chunks_counter.load(Ordering::Relaxed);
        let indexer = Indexer::new(&self.config)?;
        let index_path = indexer.get_index_path();

        eprintln!();
        eprintln!("==== Manx Index Summary ====");
        eprintln!("Markdown files found: {}", total_pages);
        eprintln!("Files processed: {}", final_processed);
        eprintln!("Chunks created: {}", final_chunks);
        eprintln!("Chunks stored: {}", total_stored);
        eprintln!("Index path: {}", index_path.display());

        if total_pages == 0 {
            eprintln!();
            eprintln!("⚠️  No markdown files were found. Docrawl may not have generated any content.");
            eprintln!("   This could mean the site structure is not compatible with crawling.");
        } else if total_stored == 0 {
            eprintln!();
            eprintln!("⚠️  No chunks were stored. The markdown files may have been empty.");
        }

        Ok(total_stored)
    }
819
820    pub async fn search(
821        &self,
822        query: &str,
823        max_results: Option<usize>,
824    ) -> Result<Vec<RagSearchResult>> {
825        if !self.config.enabled {
826            return Err(anyhow::anyhow!("RAG system is disabled"));
827        }
828
829        log::info!("Starting intelligent search for: '{}'", query);
830
831        // Create smart search engine
832        let search_engine =
833            SmartSearchEngine::new(self.config.clone(), self.llm_client.clone()).await?;
834
835        // Perform intelligent search
836        let verified_results = search_engine.search(query, max_results).await?;
837
838        // Convert VerifiedResult back to RagSearchResult for compatibility
839        let results: Vec<RagSearchResult> = verified_results
840            .into_iter()
841            .map(|verified| RagSearchResult {
842                id: verified.result.id,
843                content: verified.result.content,
844                source_path: verified.result.source_path,
845                source_type: verified.result.source_type,
846                title: verified.result.title,
847                section: verified.result.section,
848                score: verified.confidence_score, // Use the verified confidence score
849                chunk_index: verified.result.chunk_index,
850                metadata: verified.result.metadata,
851            })
852            .collect();
853
854        log::info!(
855            "Intelligent search completed with {} results",
856            results.len()
857        );
858        Ok(results)
859    }
860
861    pub async fn get_stats(&self) -> Result<RagStats> {
862        if !self.config.enabled {
863            return Err(anyhow::anyhow!("RAG system is disabled"));
864        }
865
866        let indexer = Indexer::new(&self.config)?;
867        let index_path = indexer.get_index_path();
868        let embedding_dir = index_path.join("embeddings");
869
870        if !embedding_dir.exists() {
871            return Ok(RagStats {
872                total_documents: 0,
873                total_chunks: 0,
874                index_size_mb: 0.0,
875                last_updated: chrono::Utc::now(),
876                sources: vec![],
877            });
878        }
879
880        // Count chunks and calculate size
881        let mut total_chunks = 0;
882        let mut total_size = 0u64;
883        let mut sources = std::collections::HashSet::new();
884        let mut last_modified = std::time::UNIX_EPOCH;
885
886        let entries = std::fs::read_dir(&embedding_dir)?;
887        for entry in entries.flatten() {
888            if let Some(file_name) = entry.file_name().to_str() {
889                if file_name.ends_with(".json") {
890                    total_chunks += 1;
891
892                    if let Ok(metadata) = entry.metadata() {
893                        total_size += metadata.len();
894
895                        // Track most recent modification
896                        if let Ok(modified) = metadata.modified() {
897                            if modified > last_modified {
898                                last_modified = modified;
899                            }
900                        }
901                    }
902
903                    // Try to extract source info from chunk data
904                    if let Ok(content) = std::fs::read_to_string(entry.path()) {
905                        if let Ok(chunk_data) = serde_json::from_str::<StoredChunk>(&content) {
906                            if let Some(source_str) = chunk_data.source_path.to_str() {
907                                sources.insert(source_str.to_string());
908                            }
909                        }
910                    }
911                }
912            }
913        }
914
915        // Convert sources to unique document count estimate
916        let total_documents = sources.len();
917        let index_size_mb = total_size as f64 / (1024.0 * 1024.0);
918
919        let last_updated = chrono::DateTime::<chrono::Utc>::from(last_modified);
920
921        let sources_vec: Vec<String> = sources.into_iter().collect();
922
923        Ok(RagStats {
924            total_documents,
925            total_chunks,
926            index_size_mb,
927            last_updated,
928            sources: sources_vec,
929        })
930    }
931
932    pub async fn clear_index(&self) -> Result<()> {
933        if !self.config.enabled {
934            return Err(anyhow::anyhow!("RAG system is disabled"));
935        }
936
937        log::info!("Clearing local vector storage");
938
939        // Get index path and embeddings directory
940        let indexer = Indexer::new(&self.config)?;
941        let index_path = indexer.get_index_path();
942        let embedding_dir = index_path.join("embeddings");
943
944        if embedding_dir.exists() {
945            // Remove all embedding files
946            let entries = std::fs::read_dir(&embedding_dir)?;
947            let mut cleared_count = 0;
948
949            for entry in entries.flatten() {
950                if let Some(file_name) = entry.file_name().to_str() {
951                    if file_name.ends_with(".json") {
952                        if let Err(e) = std::fs::remove_file(entry.path()) {
953                            log::warn!("Failed to remove embedding file {:?}: {}", entry.path(), e);
954                        } else {
955                            cleared_count += 1;
956                        }
957                    }
958                }
959            }
960
961            log::info!(
962                "Successfully cleared {} embedding files from local vector storage",
963                cleared_count
964            );
965        } else {
966            log::info!("Local vector storage directory does not exist, nothing to clear");
967        }
968
969        Ok(())
970    }
971
972    pub async fn health_check(&self) -> Result<()> {
973        if !self.config.enabled {
974            return Err(anyhow::anyhow!("RAG system is disabled"));
975        }
976
977        log::info!("Running RAG system health check...");
978
979        // Check if embedding model can be loaded
980        let _embedding_model = EmbeddingModel::new_with_config(self.config.embedding.clone())
981            .await
982            .map_err(|e| anyhow::anyhow!("Embedding model unavailable: {}", e))?;
983        log::info!("✓ Embedding model loaded successfully");
984
985        // Check if index directory exists and is accessible
986        let indexer = Indexer::new(&self.config)?;
987        let index_path = indexer.get_index_path();
988
989        if index_path.exists() {
990            log::info!("✓ Local index directory exists: {:?}", index_path);
991
992            // Check embeddings directory
993            let embedding_dir = index_path.join("embeddings");
994            if embedding_dir.exists() {
995                // Count existing embeddings
996                match std::fs::read_dir(&embedding_dir) {
997                    Ok(entries) => {
998                        let count = entries.filter_map(|e| e.ok()).count();
999                        log::info!(
1000                            "✓ Local vector storage accessible with {} embedding files",
1001                            count
1002                        );
1003                    }
1004                    Err(e) => {
1005                        log::warn!(
1006                            "⚠ Local vector storage directory exists but cannot read contents: {}",
1007                            e
1008                        );
1009                    }
1010                }
1011            } else {
1012                log::info!("✓ Local vector storage will be created when needed");
1013            }
1014        } else {
1015            log::info!("✓ Local index directory will be created: {:?}", index_path);
1016        }
1017
1018        // Test file system write access
1019        let test_file = index_path.join(".health_check");
1020        match std::fs::create_dir_all(index_path) {
1021            Ok(_) => {
1022                match std::fs::write(&test_file, "health_check") {
1023                    Ok(_) => {
1024                        log::info!("✓ File system write access confirmed");
1025                        let _ = std::fs::remove_file(&test_file); // Clean up test file
1026                    }
1027                    Err(e) => {
1028                        return Err(anyhow::anyhow!("File system write access failed: {}", e));
1029                    }
1030                }
1031            }
1032            Err(e) => {
1033                return Err(anyhow::anyhow!("Cannot create index directory: {}", e));
1034            }
1035        }
1036
1037        log::info!("RAG system health check: All systems operational");
1038        Ok(())
1039    }
1040
1041    /// Store document chunks in local file-based vector storage
1042    async fn store_chunks_locally(&self, chunks: &[DocumentChunk]) -> Result<()> {
1043        use uuid::Uuid;
1044
1045        if chunks.is_empty() {
1046            log::info!("No chunks to store locally");
1047            return Ok(());
1048        }
1049
1050        log::info!("Storing {} chunks in local vector storage", chunks.len());
1051
1052        // Initialize embedding model
1053        let embedding_model =
1054            EmbeddingModel::new_with_config(self.config.embedding.clone()).await?;
1055
1056        // Get index path and create embeddings directory
1057        let indexer = Indexer::new(&self.config)?;
1058        let index_path = indexer.get_index_path();
1059        let embedding_dir = index_path.join("embeddings");
1060
1061        // Create directories if they don't exist
1062        std::fs::create_dir_all(&embedding_dir)?;
1063
1064        // Process chunks and store with embeddings
1065        let mut stored_count = 0;
1066
1067        for (i, chunk) in chunks.iter().enumerate() {
1068            // Generate embedding for chunk content
1069            let embedding = match embedding_model.embed_text(&chunk.content).await {
1070                Ok(embedding) => embedding,
1071                Err(e) => {
1072                    log::warn!("Failed to generate embedding for chunk {}: {}", chunk.id, e);
1073                    continue;
1074                }
1075            };
1076
1077            // Create stored chunk with embedding
1078            let stored_chunk = StoredChunk {
1079                id: chunk.id.clone(),
1080                content: chunk.content.clone(),
1081                source_path: chunk.source_path.clone(),
1082                source_type: chunk.source_type.clone(),
1083                title: chunk.title.clone(),
1084                section: chunk.section.clone(),
1085                chunk_index: chunk.chunk_index,
1086                metadata: chunk.metadata.clone(),
1087                embedding,
1088            };
1089
1090            // Save to JSON file
1091            let file_id = Uuid::new_v4().to_string();
1092            let file_path = embedding_dir.join(format!("{}.json", file_id));
1093
1094            let json_content = serde_json::to_string_pretty(&stored_chunk)?;
1095            std::fs::write(&file_path, json_content)?;
1096
1097            stored_count += 1;
1098            log::debug!("Stored chunk {} to {:?}", chunk.id, file_path);
1099            if (i + 1) % 100 == 0 || i + 1 == chunks.len() {
1100                println!("Stored {}/{} chunks...", i + 1, chunks.len());
1101            }
1102        }
1103
1104        log::info!(
1105            "Successfully stored {} chunks in local vector storage",
1106            stored_count
1107        );
1108        Ok(())
1109    }
1110}
1111
1112/// Store chunks using a shared embedding model (config-based helper)
1113pub async fn store_chunks_with_model_config(
1114    config: &RagConfig,
1115    chunks: &[DocumentChunk],
1116    embedding_model: &EmbeddingModel,
1117) -> Result<usize> {
1118    use uuid::Uuid;
1119    if chunks.is_empty() {
1120        return Ok(0);
1121    }
1122
1123    let indexer = Indexer::new(config)?;
1124    let index_path = indexer.get_index_path();
1125    let embedding_dir = index_path.join("embeddings");
1126    std::fs::create_dir_all(&embedding_dir)?;
1127
1128    let mut stored_count = 0usize;
1129    for chunk in chunks {
1130        let embedding = match embedding_model.embed_text(&chunk.content).await {
1131            Ok(embedding) => embedding,
1132            Err(_) => continue,
1133        };
1134
1135        let stored_chunk = StoredChunk {
1136            id: chunk.id.clone(),
1137            content: chunk.content.clone(),
1138            source_path: chunk.source_path.clone(),
1139            source_type: chunk.source_type.clone(),
1140            title: chunk.title.clone(),
1141            section: chunk.section.clone(),
1142            chunk_index: chunk.chunk_index,
1143            metadata: chunk.metadata.clone(),
1144            embedding,
1145        };
1146
1147        let file_id = Uuid::new_v4().to_string();
1148        let file_path = embedding_dir.join(format!("{}.json", file_id));
1149        let json_content = serde_json::to_string_pretty(&stored_chunk)?;
1150        std::fs::write(&file_path, json_content)?;
1151        stored_count += 1;
1152    }
1153
1154    Ok(stored_count)
1155}