// manx_cli/rag/mod.rs
//! Local RAG (Retrieval-Augmented Generation) system for Manx
//!
//! Provides document indexing, semantic search, and LLM integration
//! for enhanced documentation discovery and AI synthesis.

use crate::rag::embeddings::EmbeddingModel;
use crate::rag::indexer::Indexer;
use crate::rag::llm::LlmClient;
use crate::rag::search_engine::SmartSearchEngine;

use anyhow::Result;
use docrawl::{crawl, Config as DocrawlConfig, CrawlConfig};
use indicatif::{ProgressBar, ProgressStyle};
use serde::{Deserialize, Serialize};
use walkdir::WalkDir;

use std::path::PathBuf;
use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering};

// gag disabled: let docrawl manage its own spinner
// removed libc imports; gag handles stdout/stderr capture cross-platform
// no longer used: previous attempt to redirect crawler output
// #[cfg(unix)]
// use libc::{close, dup, dup2, open, O_WRONLY};
// no need for Write trait; summary prints are plain
// NOTE(review): the previous stray `#[cfg(unix)]` attribute (left behind when
// the libc imports were commented out) attached itself to the next real item,
// silently gating the serde import on unix; it has been removed.
24
// Public submodules composing the RAG pipeline.
pub mod benchmarks;
pub mod embeddings;
pub mod indexer;
pub mod llm;
pub mod model_metadata;
pub mod providers;
pub mod query_enhancer;
pub mod result_verifier;
pub mod search_engine;
34
/// Embedding provider types.
///
/// Selects the backend used to turn text into vectors; variants carrying a
/// `String` identify the model name, path, or endpoint for that backend.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Default)]
pub enum EmbeddingProvider {
    /// Default hash-based embeddings (current implementation).
    #[default]
    Hash,
    /// Local ONNX model path.
    Onnx(String),
    /// Ollama model name.
    Ollama(String),
    /// OpenAI model name (requires API key).
    OpenAI(String),
    /// HuggingFace model name (requires API key).
    HuggingFace(String),
    /// Custom endpoint URL.
    Custom(String),
}
46
/// Configuration for embedding generation.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct EmbeddingConfig {
    /// Backend used to produce embeddings.
    pub provider: EmbeddingProvider,
    /// Vector dimension; 384 for the hash provider, updated dynamically for
    /// others via `detect_and_update_dimension`.
    pub dimension: usize,
    /// For local models.
    pub model_path: Option<PathBuf>,
    /// For API providers.
    pub api_key: Option<String>,
    /// For custom endpoints.
    pub endpoint: Option<String>,
    /// Timeout in seconds (exact use depends on the provider implementation).
    pub timeout_seconds: u64,
    /// Number of texts embedded per batch.
    pub batch_size: usize,
}
58
59impl Default for EmbeddingConfig {
60    fn default() -> Self {
61        Self {
62            provider: EmbeddingProvider::Hash,
63            dimension: 384, // Hash provider default (will be updated dynamically for others)
64            model_path: None,
65            api_key: None,
66            endpoint: None,
67            timeout_seconds: 30,
68            batch_size: 32,
69        }
70    }
71}
72
73impl EmbeddingConfig {
74    /// Update dimension from actual provider detection
75    pub async fn detect_and_update_dimension(&mut self) -> Result<()> {
76        use crate::rag::embeddings::EmbeddingModel;
77
78        let model = EmbeddingModel::new_with_config(self.clone()).await?;
79        let detected_dimension = model.get_dimension().await?;
80
81        if self.dimension != detected_dimension {
82            log::info!(
83                "Updating dimension from {} to {} for provider {:?}",
84                self.dimension,
85                detected_dimension,
86                self.provider
87            );
88            self.dimension = detected_dimension;
89        }
90
91        Ok(())
92    }
93}
94
/// Security level for code processing.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Default)]
pub enum CodeSecurityLevel {
    /// Strict: Reject files with any suspicious patterns.
    Strict,
    /// Moderate: Log warnings but allow most files.
    #[default]
    Moderate,
    /// Permissive: Minimal security checks.
    Permissive,
}
106
/// Configuration for smart search capabilities.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SmartSearchConfig {
    /// Use ONNX over hash when available.
    pub prefer_semantic: bool,
    /// Use LLM for query expansion.
    pub enable_query_enhancement: bool,
    /// Use LLM for relevance checking.
    pub enable_result_verification: bool,
    /// Minimum relevance threshold.
    pub min_confidence_score: f32,
    /// Number of query variations to try.
    pub max_query_variations: usize,
    /// Enable multi-stage search strategy.
    pub enable_multi_stage: bool,
    /// Use smart code-aware chunking.
    pub adaptive_chunking: bool,
}
118
119impl Default for SmartSearchConfig {
120    fn default() -> Self {
121        Self {
122            prefer_semantic: true,
123            enable_query_enhancement: true,
124            enable_result_verification: true,
125            min_confidence_score: 0.7,
126            max_query_variations: 3,
127            enable_multi_stage: true,
128            adaptive_chunking: true,
129        }
130    }
131}
132
/// Configuration for the RAG system.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct RagConfig {
    /// Master switch; most `RagSystem` operations error out when false.
    pub enabled: bool,
    /// On-disk location of the vector index.
    pub index_path: PathBuf,
    /// Result-count cap (presumably for searches — confirm against consumers).
    pub max_results: usize,
    /// Similarity cutoff (presumably applied when ranking results).
    pub similarity_threshold: f32,
    /// Allow PDF ingestion; disabled by default for security.
    pub allow_pdf_processing: bool,
    /// Allow source-code ingestion, screened per `code_security_level`.
    pub allow_code_processing: bool,
    /// How strictly code files are screened before indexing.
    pub code_security_level: CodeSecurityLevel,
    /// Mask detected secrets in indexed content.
    pub mask_secrets: bool,
    /// Per-file size limit, in megabytes.
    pub max_file_size_mb: u64,
    /// Embedding backend configuration.
    pub embedding: EmbeddingConfig,
    /// Smart-search behavior configuration.
    pub smart_search: SmartSearchConfig,
}
148
impl Default for RagConfig {
    fn default() -> Self {
        Self {
            enabled: true, // Enabled by default
            // NOTE(review): `~` is not expanded by Rust/PathBuf itself —
            // presumably the Indexer expands it; confirm before relying on
            // this literal path.
            index_path: PathBuf::from("~/.cache/manx/rag_index"),
            max_results: 10,
            similarity_threshold: 0.6,
            allow_pdf_processing: false, // Disabled by default for security
            allow_code_processing: true, // Enabled by default with security checks
            code_security_level: CodeSecurityLevel::Moderate,
            mask_secrets: true,    // Mask secrets by default
            max_file_size_mb: 100, // 100MB default limit
            embedding: EmbeddingConfig::default(),
            smart_search: SmartSearchConfig::default(),
        }
    }
}
166
/// Document chunk for indexing.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DocumentChunk {
    /// Unique chunk identifier.
    pub id: String,
    /// Raw text content of the chunk.
    pub content: String,
    /// Path of the document this chunk came from.
    pub source_path: PathBuf,
    /// Origin category of the source document.
    pub source_type: SourceType,
    /// Document title, when one was extracted.
    pub title: Option<String>,
    /// Section heading the chunk belongs to, when known.
    pub section: Option<String>,
    /// Position of this chunk within its source document.
    pub chunk_index: usize,
    /// File-level metadata of the source document.
    pub metadata: DocumentMetadata,
}
179
/// Type of document source.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum SourceType {
    /// Local filesystem document.
    Local,
    /// Remote document.
    Remote,
    /// Curated documentation source.
    Curated,
    /// Crawled web page.
    Web,
}
188
/// Document metadata.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DocumentMetadata {
    /// File type (presumably an extension or MIME-like label — confirm
    /// against the indexer).
    pub file_type: String,
    /// Size in bytes.
    pub size: u64,
    /// Last-modified timestamp (UTC).
    pub modified: chrono::DateTime<chrono::Utc>,
    /// Free-form tags attached to the document.
    pub tags: Vec<String>,
    /// Detected language, when known.
    pub language: Option<String>,
}
198
/// Search result from RAG.
///
/// Mirrors `DocumentChunk` plus a relevance `score`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct RagSearchResult {
    /// Identifier of the matched chunk.
    pub id: String,
    /// Text content of the matched chunk.
    pub content: String,
    /// Path of the source document.
    pub source_path: PathBuf,
    /// Origin category of the source document.
    pub source_type: SourceType,
    /// Document title, when available.
    pub title: Option<String>,
    /// Section heading, when available.
    pub section: Option<String>,
    /// Relevance score (presumably higher is better — confirm with the
    /// search engine's ranking).
    pub score: f32,
    /// Position of the chunk within its source document.
    pub chunk_index: usize,
    /// File-level metadata of the source document.
    pub metadata: DocumentMetadata,
}
212
/// RAG system stats.
#[derive(Debug, Serialize, Deserialize)]
pub struct RagStats {
    /// Number of distinct documents indexed.
    pub total_documents: usize,
    /// Number of chunks across all documents.
    pub total_chunks: usize,
    /// On-disk index size in megabytes.
    pub index_size_mb: f64,
    /// When the index was last updated (UTC).
    pub last_updated: chrono::DateTime<chrono::Utc>,
    /// Source identifiers contributing to the index.
    pub sources: Vec<String>,
}
222
/// Stored chunk with embedding for file-based vector storage.
///
/// Same shape as `DocumentChunk` with the embedding vector attached.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct StoredChunk {
    /// Unique chunk identifier.
    pub id: String,
    /// Raw text content of the chunk.
    pub content: String,
    /// Path of the source document.
    pub source_path: PathBuf,
    /// Origin category of the source document.
    pub source_type: SourceType,
    /// Document title, when available.
    pub title: Option<String>,
    /// Section heading, when available.
    pub section: Option<String>,
    /// Position of the chunk within its source document.
    pub chunk_index: usize,
    /// File-level metadata of the source document.
    pub metadata: DocumentMetadata,
    /// Embedding vector for this chunk.
    pub embedding: Vec<f32>,
}
236
/// Local file-based RAG system.
pub struct RagSystem {
    // Runtime configuration (index location, security, embeddings, search).
    config: RagConfig,
    // Optional LLM client, attached via `new_with_llm`.
    llm_client: Option<LlmClient>,
}
242
243impl RagSystem {
    /// Create a RAG system with no LLM client attached.
    ///
    /// Delegates to [`Self::new_with_llm`].
    pub async fn new(config: RagConfig) -> Result<Self> {
        Self::new_with_llm(config, None).await
    }
247
248    pub async fn new_with_llm(config: RagConfig, llm_client: Option<LlmClient>) -> Result<Self> {
249        if !config.enabled {
250            return Err(anyhow::anyhow!("RAG system is disabled"));
251        }
252
253        // Initialize the local vector storage system
254        let indexer = Indexer::new(&config)?;
255        let index_path = indexer.get_index_path();
256
257        // Create index directory if it doesn't exist
258        std::fs::create_dir_all(index_path)?;
259
260        log::info!(
261            "RAG system initialized with local vector storage at {:?}",
262            index_path
263        );
264        Ok(Self { config, llm_client })
265    }
266
267    pub async fn index_document(&mut self, path: PathBuf) -> Result<usize> {
268        if !self.config.enabled {
269            return Err(anyhow::anyhow!("RAG system is disabled"));
270        }
271
272        let indexer = Indexer::new(&self.config)?;
273        let chunks = indexer.index_document(path)?;
274        let chunk_count = chunks.len();
275
276        // Store chunks in local vector storage
277        self.store_chunks_locally(&chunks).await?;
278
279        log::info!("Successfully indexed and stored {} chunks", chunk_count);
280        Ok(chunk_count)
281    }
282
283    pub async fn index_directory(&mut self, path: PathBuf) -> Result<usize> {
284        if !self.config.enabled {
285            return Err(anyhow::anyhow!("RAG system is disabled"));
286        }
287
288        let indexer = Indexer::new(&self.config)?;
289        let chunks = indexer.index_directory(path)?;
290        let chunk_count = chunks.len();
291
292        // Store chunks in local vector storage
293        self.store_chunks_locally(&chunks).await?;
294
295        log::info!(
296            "Successfully indexed and stored {} chunks from directory",
297            chunk_count
298        );
299        Ok(chunk_count)
300    }
301
302    #[allow(dead_code)]
303    pub async fn index_url(&mut self, url: &str) -> Result<usize> {
304        if !self.config.enabled {
305            return Err(anyhow::anyhow!("RAG system is disabled"));
306        }
307
308        log::info!("Indexing URL: {}", url);
309
310        let indexer = Indexer::new(&self.config)?;
311        let chunks = indexer.index_url(url.to_string()).await?;
312        let chunk_count = chunks.len();
313
314        // Store chunks in local vector storage
315        self.store_chunks_locally(&chunks).await?;
316
317        log::info!(
318            "Successfully indexed and stored {} chunks from URL",
319            chunk_count
320        );
321        Ok(chunk_count)
322    }
323
324    #[allow(dead_code)]
325    pub async fn index_url_deep(
326        &mut self,
327        url: &str,
328        max_depth: Option<u32>,
329        max_pages: Option<u32>,
330    ) -> Result<usize> {
331        if !self.config.enabled {
332            return Err(anyhow::anyhow!("RAG system is disabled"));
333        }
334
335        log::info!(
336            "Deep indexing URL: {} (depth: {:?}, pages: {:?})",
337            url,
338            max_depth,
339            max_pages
340        );
341
342        let indexer = Indexer::new(&self.config)?;
343        // Convert old parameters to new docrawl-based parameters
344        let crawl_all = max_pages.is_none(); // If no page limit, crawl all
345        let chunks = indexer
346            .index_url_deep(url.to_string(), max_depth, crawl_all)
347            .await?;
348        let chunk_count = chunks.len();
349
350        // Store chunks in local vector storage
351        self.store_chunks_locally(&chunks).await?;
352
353        log::info!(
354            "Successfully deep indexed and stored {} chunks from URL",
355            chunk_count
356        );
357        Ok(chunk_count)
358    }
359
360    /// Streamed deep indexing: overlaps crawling and embedding using Tokio for speed
361    pub async fn index_url_deep_stream(
362        &self,
363        url: &str,
364        max_depth: Option<u32>,
365        crawl_all: bool,
366        embed_concurrency: Option<usize>,
367        crawl_max_pages: Option<usize>,
368    ) -> Result<usize> {
369        use std::collections::HashSet;
370        use std::sync::Arc;
371        use tokio::sync::mpsc;
372        use tokio::time::{interval, Duration};
373
374        // If explicitly depth 0 and not crawl-all, do single-page fetch without the crawler
375        if matches!(max_depth, Some(0)) && !crawl_all {
376            eprintln!("\nIndexing single page (no crawl): {}", url);
377            let embedding_model = std::sync::Arc::new(
378                EmbeddingModel::new_with_config(self.config.embedding.clone()).await?,
379            );
380            let indexer = Indexer::new(&self.config)?;
381            let chunks = indexer.index_single_url_no_crawl(url).await?;
382            let total_stored =
383                store_chunks_with_model_config(&self.config, &chunks, &embedding_model).await?;
384
385            let index_path = indexer.get_index_path();
386            eprintln!("\n==== Manx Index Summary ====");
387            eprintln!("Mode: Single page (no crawl)");
388            eprintln!("Chunks created: {}", chunks.len());
389            eprintln!("Chunks stored: {}", total_stored);
390            eprintln!("Index path: {}", index_path.display());
391            return Ok(total_stored);
392        }
393
394        // If depth is 1 (shallow), prefer our manual shallow crawler to avoid docrawl host-scope quirks
395        if matches!(max_depth, Some(1)) && !crawl_all {
396            eprintln!("\nShallow crawl (depth 1) for: {}", url);
397            let embedding_model = std::sync::Arc::new(
398                EmbeddingModel::new_with_config(self.config.embedding.clone()).await?,
399            );
400            let indexer = Indexer::new(&self.config)?;
401            let chunks = indexer.index_shallow_url(url, crawl_max_pages).await?;
402            let total_stored =
403                store_chunks_with_model_config(&self.config, &chunks, &embedding_model).await?;
404
405            let index_path = indexer.get_index_path();
406            eprintln!("\n==== Manx Index Summary ====");
407            eprintln!("Mode: Shallow crawl (depth 1)");
408            eprintln!("Chunks created: {}", chunks.len());
409            eprintln!("Chunks stored: {}", total_stored);
410            eprintln!("Index path: {}", index_path.display());
411            return Ok(total_stored);
412        }
413
414        let temp_dir = std::env::temp_dir().join(format!("manx_crawl_{}", uuid::Uuid::new_v4()));
415        std::fs::create_dir_all(&temp_dir)?;
416
417        // Show initial status
418        eprintln!("\nStarting document crawl for: {}", url);
419        eprintln!("   This will: 1) Crawl pages -> 2) Chunk content -> 3) Create embeddings");
420        log::debug!("Temp directory: {}", temp_dir.display());
421        eprintln!();
422
423        // Resolve potential redirects to get canonical host (e.g., kali.org -> www.kali.org)
424        let base_url = if let Ok(resp) = reqwest::Client::new().get(url).send().await {
425            resp.url().clone()
426        } else {
427            url::Url::parse(url)?
428        };
429        // Configure docrawl secondary options (use defaults)
430        let crawl_config = CrawlConfig {
431            base_url,
432            output_dir: temp_dir.clone(),
433            user_agent: "Manx/0.5.0 (Documentation Crawler)".to_string(),
434            max_depth: if let Some(d) = max_depth {
435                Some(d as usize)
436            } else if crawl_all {
437                None
438            } else {
439                Some(3)
440            },
441            silence: true,          // Silence docrawl; Manx renders its own progress UI
442            rate_limit_per_sec: 20, // Reasonable rate limit for documentation sites
443            follow_sitemaps: true,
444            concurrency: std::cmp::max(8, num_cpus::get()), // Use more threads for faster crawling
445            timeout: Some(std::time::Duration::from_secs(30)),
446            resume: false,
447            // Additional crawler behavior configuration
448            config: DocrawlConfig::default(),
449        };
450
451        // Create embedding model once
452        let embedding_model =
453            Arc::new(EmbeddingModel::new_with_config(self.config.embedding.clone()).await?);
454
455        // Channel for discovered markdown files
456        let (tx, rx) = mpsc::channel::<PathBuf>(200);
457        let rx = std::sync::Arc::new(tokio::sync::Mutex::new(rx));
458        let crawl_max_pages = crawl_max_pages.unwrap_or(usize::MAX);
459
460        // Counters for summary (no live prints during crawl)
461        let pages_counter = Arc::new(AtomicUsize::new(0));
462        let processed_pages_counter = Arc::new(AtomicUsize::new(0));
463        let chunks_counter = Arc::new(AtomicUsize::new(0));
464
465        // Track when crawl is done
466        let crawl_done = Arc::new(AtomicBool::new(false));
467        let crawl_done_clone = crawl_done.clone();
468
469        // Spawn crawler; suppress its stdout to avoid competing spinner
470        let crawl_handle = tokio::spawn(async move {
471            let result = crawl(crawl_config).await;
472            crawl_done_clone.store(true, Ordering::Relaxed);
473            // Convert the error to a string to make it Send
474            result.map_err(|e| e.to_string())
475        });
476
477        // Spawn scanner: discover new markdown files while crawler runs
478        let temp_dir_clone = temp_dir.clone();
479        let scanner_tx = tx.clone();
480        let pc = pages_counter.clone();
481        let crawl_done_scanner = crawl_done.clone();
482        let scanner_handle = tokio::spawn(async move {
483            // Give docrawl a head start before we start scanning
484            tokio::time::sleep(Duration::from_secs(3)).await;
485
486            // Start with longer interval during active crawling, speed up when crawl is done
487            let mut scan_interval_ms = 1000; // Start with 1 second interval to reduce overhead
488            let mut ticker = interval(Duration::from_millis(scan_interval_ms));
489            let mut seen: HashSet<PathBuf> = HashSet::new();
490            let mut idle_ticks = 0u32;
491            let mut total_files_scanned;
492            // Reduced verbosity - only show important messages
493            log::debug!(
494                "Scanner: Starting to monitor directory: {}",
495                temp_dir_clone.display()
496            );
497
498            loop {
499                ticker.tick().await;
500                let mut new_found = 0usize;
501                let mut current_scan_count = 0usize;
502
503                for entry in WalkDir::new(&temp_dir_clone)
504                    .into_iter()
505                    .filter_map(|e| e.ok())
506                {
507                    let path = entry.path();
508                    current_scan_count += 1;
509
510                    if path.is_file() {
511                        if let Some(ext) = path.extension().and_then(|s| s.to_str()) {
512                            if ext == "md" {
513                                let pb = path.to_path_buf();
514                                if !seen.contains(&pb) {
515                                    log::debug!(
516                                        "Scanner: Found new markdown file: {}",
517                                        pb.file_name().unwrap_or_default().to_string_lossy()
518                                    );
519                                    seen.insert(pb.clone());
520                                    if scanner_tx.send(pb).await.is_err() {
521                                        break;
522                                    }
523                                    new_found += 1;
524                                    pc.fetch_add(1, Ordering::Relaxed);
525                                    if seen.len() >= crawl_max_pages {
526                                        break;
527                                    }
528                                }
529                            }
530                        }
531                    }
532                }
533
534                total_files_scanned = current_scan_count;
535
536                if new_found > 0 {
537                    log::debug!(
538                        "Scanner: Found {} new markdown files this scan (total: {})",
539                        new_found,
540                        seen.len()
541                    );
542                    idle_ticks = 0;
543                    // Speed up scanning when we're finding files
544                    if scan_interval_ms > 300 {
545                        scan_interval_ms = 300;
546                        ticker = interval(Duration::from_millis(scan_interval_ms));
547                    }
548                } else {
549                    idle_ticks += 1;
550
551                    // If crawl is done and we're idle, speed up scanning to finish quickly
552                    if crawl_done_scanner.load(Ordering::Relaxed) && scan_interval_ms > 200 {
553                        scan_interval_ms = 200;
554                        ticker = interval(Duration::from_millis(scan_interval_ms));
555                    }
556
557                    if idle_ticks.is_multiple_of(10) {
558                        log::debug!("Scanner: No new files found for {} ticks (scanned {} total files in directory)", idle_ticks, total_files_scanned);
559                    }
560                }
561
562                if seen.len() >= crawl_max_pages {
563                    eprintln!("Scanner: Reached max pages limit ({})", crawl_max_pages);
564                    break;
565                }
566
567                // Only exit on idle if crawl is done
568                if idle_ticks > 20 && crawl_done_scanner.load(Ordering::Relaxed) {
569                    log::debug!(
570                        "Scanner: Crawl is done and no new files for {} ticks, exiting",
571                        idle_ticks
572                    );
573                    break;
574                }
575
576                // If crawl is still running but we've been idle for a long time, keep waiting
577                if idle_ticks > 100 {
578                    log::debug!(
579                        "Scanner: Safety exit after {} ticks of no activity",
580                        idle_ticks
581                    );
582                    break;
583                }
584            }
585            let files_found = seen.len();
586            // Final count will be shown by the main thread
587            drop(scanner_tx);
588            files_found
589        });
590
591        // Worker pool
592        let workers = embed_concurrency.unwrap_or_else(|| std::cmp::max(4, num_cpus::get()));
593        let mut joins = Vec::new();
594        let config_clone = self.config.clone();
595        let url_for_worker = url.to_string();
596        for _ in 0..workers {
597            let rx = rx.clone();
598            let embedding_model = embedding_model.clone();
599            let config_clone = config_clone.clone();
600            let url_clone = url_for_worker.clone();
601            let chunks_counter = chunks_counter.clone();
602            let processed_pages_counter = processed_pages_counter.clone();
603            let join = tokio::spawn(async move {
604                let mut stored = 0usize;
605                let idx = match Indexer::new(&config_clone) {
606                    Ok(i) => i,
607                    Err(_) => return 0usize,
608                };
609                loop {
610                    let opt_path = { rx.lock().await.recv().await };
611                    let Some(md_path) = opt_path else { break };
612                    if let Ok(chunks) = idx.process_markdown_file(&md_path, &url_clone).await {
613                        if let Ok(count) =
614                            store_chunks_with_model_config(&config_clone, &chunks, &embedding_model)
615                                .await
616                        {
617                            stored += count;
618                            chunks_counter.fetch_add(count, Ordering::Relaxed);
619                            // count this page as processed after storing its chunks
620                            processed_pages_counter.fetch_add(1, Ordering::Relaxed);
621                        }
622                    }
623                }
624                stored
625            });
626            joins.push(join);
627        }
628
629        // Wait for crawl to complete first
630        let crawl_result = crawl_handle.await;
631        let crawled_pages = if let Ok(Ok(stats)) = &crawl_result {
632            eprintln!("\nCrawl completed: {} pages crawled", stats.pages);
633            stats.pages
634        } else if let Ok(Err(e)) = &crawl_result {
635            eprintln!("\nCrawl completed with error: {}", e);
636            0
637        } else {
638            eprintln!("\nCrawl status unknown");
639            0
640        };
641
642        // Monitor file discovery with a proper progress spinner
643        eprintln!("\nScanning for markdown files...");
644
645        let pb = ProgressBar::new_spinner();
646        pb.set_style(
647            ProgressStyle::default_spinner()
648                .template("{spinner:.green} {msg}")
649                .unwrap(),
650        );
651        pb.enable_steady_tick(std::time::Duration::from_millis(200));
652
653        // Create a monitoring task that updates the progress bar
654        let pages_counter_clone = pages_counter.clone();
655        let crawl_done_clone = crawl_done.clone();
656        let pb_clone = pb.clone();
657        let start_time = std::time::Instant::now();
658        let monitor_handle = tokio::spawn(async move {
659            let mut last_count = 0usize;
660            let mut stable_cycles = 0;
661
662            loop {
663                tokio::time::sleep(Duration::from_millis(500)).await;
664                let current_count = pages_counter_clone.load(Ordering::Relaxed);
665                let is_crawl_done = crawl_done_clone.load(Ordering::Relaxed);
666                let elapsed = start_time.elapsed().as_secs_f32().max(0.1);
667                let rate = (current_count as f32) / elapsed;
668
669                let message = if current_count != last_count {
670                    last_count = current_count;
671                    stable_cycles = 0;
672                    format!("Found {} files | {:.1}/s", current_count, rate)
673                } else {
674                    stable_cycles += 1;
675                    if stable_cycles < 4 {
676                        format!(
677                            "Found {} files (scanning...) | {:.1}/s",
678                            current_count, rate
679                        )
680                    } else {
681                        format!(
682                            "Found {} files (finalizing...) | {:.1}/s",
683                            current_count, rate
684                        )
685                    }
686                };
687
688                pb_clone.set_message(message);
689
690                // Stop monitoring once crawl is done and we've been stable for a bit
691                if is_crawl_done && stable_cycles > 6 {
692                    break;
693                }
694                // Safety exit after 30 seconds
695                if stable_cycles > 60 {
696                    break;
697                }
698            }
699        });
700
701        // Wait for scanner to finish
702        let scanner_result = scanner_handle.await;
703        let _scanner_files = scanner_result.unwrap_or(0);
704
705        // Stop the monitor and finish the progress bar
706        monitor_handle.abort();
707        let final_count = pages_counter.load(Ordering::Relaxed);
708        pb.finish_with_message(format!("Found {} markdown files", final_count));
709
710        drop(tx);
711
712        // If scanner already printed the count, we don't need to print it again
713        // but we'll use the value for logic below
714
715        // Get final count for processing phase
716        let total_pages_found = pages_counter.load(Ordering::Relaxed);
717
718        // Show status for chunking phase
719        let mut processed_so_far = processed_pages_counter.load(Ordering::Relaxed);
720
721        // If we have pages to process, show chunking progress with a progress bar
722        if total_pages_found > 0 {
723            eprintln!("\nProcessing {} markdown files...", total_pages_found);
724
725            // Create a progress bar for chunking
726            let pb = ProgressBar::new(total_pages_found as u64);
727            pb.set_style(
728                ProgressStyle::default_bar()
729                    .template("{spinner:.green} [{elapsed_precise}] [{bar:40.cyan/blue}] {pos}/{len} files ({percent}%) | {msg}")
730                    .unwrap()
731                    .progress_chars("█▉▊▋▌▍▎▏  ")
732            );
733            pb.set_message("Chunking... 0 chunks created".to_string());
734
735            // Monitor chunking progress
736            let mut last_processed = processed_so_far;
737            let mut stall_counter = 0;
738            while processed_so_far < total_pages_found && stall_counter < 60 {
739                tokio::time::sleep(Duration::from_millis(200)).await;
740                processed_so_far = processed_pages_counter.load(Ordering::Relaxed);
741                let chunks_so_far = chunks_counter.load(Ordering::Relaxed);
742
743                if processed_so_far != last_processed {
744                    pb.set_position(processed_so_far as u64);
745                    pb.set_message(format!("{} chunks created", chunks_so_far));
746                    last_processed = processed_so_far;
747                    stall_counter = 0;
748                } else {
749                    stall_counter += 1;
750                    // Update spinner even when stalled to show activity
751                    pb.tick();
752                }
753            }
754
755            // Ensure we show completion
756            pb.set_position(processed_so_far as u64);
757
758            if processed_so_far == total_pages_found {
759                pb.finish_with_message("All files processed");
760            } else {
761                pb.abandon_with_message("Processing incomplete - some files may have failed");
762            }
763        } else {
764            eprintln!("\nNo markdown files found to process");
765            eprintln!(
766                "   The crawler processed {} pages but docrawl generated no markdown files.",
767                crawled_pages
768            );
769            eprintln!("   This can happen when:");
770            eprintln!("   • The site uses JavaScript rendering that docrawl can't parse");
771            eprintln!("   • The pages contain mostly non-text content (images, PDFs, etc.)");
772            eprintln!("   • The site structure isn't compatible with the crawler");
773            // Extra newline intentionally removed to satisfy clippy
774            eprintln!("   Try:");
775            eprintln!("   • Using a different URL that points to documentation pages");
776            eprintln!("   • Indexing local files instead if you have them downloaded");
777        }
778
779        // Wait for all workers to complete
780        let mut total_stored = 0usize;
781
782        if total_pages_found > 0 {
783            // Only show spinner if we had files to process
784            eprintln!("\nWaiting for workers to finish...");
785            let pb_final = ProgressBar::new_spinner();
786            pb_final.set_style(
787                ProgressStyle::default_spinner()
788                    .template("{spinner:.green} {msg}")
789                    .unwrap(),
790            );
791            pb_final.set_message("Finalizing embeddings and storing to database...");
792            pb_final.enable_steady_tick(std::time::Duration::from_millis(100));
793
794            for j in joins {
795                pb_final.tick();
796                if let Ok(count) = j.await {
797                    total_stored += count;
798                }
799            }
800
801            pb_final.finish_with_message("Index finalized");
802        } else {
803            // Just wait for workers without showing spinner
804            for j in joins {
805                if let Ok(count) = j.await {
806                    total_stored += count;
807                }
808            }
809        }
810
811        // Clean up temp directory (silently)
812        let _ = std::fs::remove_dir_all(&temp_dir);
813
814        // Final summary (always show, even if no files found)
815        let total_pages = pages_counter.load(Ordering::Relaxed);
816        let final_processed = processed_pages_counter.load(Ordering::Relaxed);
817        let final_chunks = chunks_counter.load(Ordering::Relaxed);
818        let indexer = Indexer::new(&self.config)?;
819        let index_path = indexer.get_index_path();
820
821        eprintln!();
822        eprintln!("==== Manx Index Summary ====");
823        eprintln!("Markdown files found: {}", total_pages);
824        eprintln!("Files processed: {}", final_processed);
825        eprintln!("Chunks created: {}", final_chunks);
826        eprintln!("Chunks stored: {}", total_stored);
827        eprintln!("Index path: {}", index_path.display());
828
829        if total_pages == 0 {
830            eprintln!();
831            eprintln!("No markdown files were found. Docrawl may not have generated any content.");
832            eprintln!("   This could mean the site structure is not compatible with crawling.");
833        } else if total_stored == 0 {
834            eprintln!();
835            eprintln!("No chunks were stored. The markdown files may have been empty.");
836        }
837
838        Ok(total_stored)
839    }
840
841    pub async fn search(
842        &self,
843        query: &str,
844        max_results: Option<usize>,
845    ) -> Result<Vec<RagSearchResult>> {
846        if !self.config.enabled {
847            return Err(anyhow::anyhow!("RAG system is disabled"));
848        }
849
850        log::info!("Starting intelligent search for: '{}'", query);
851
852        // Create smart search engine
853        let search_engine =
854            SmartSearchEngine::new(self.config.clone(), self.llm_client.clone()).await?;
855
856        // Perform intelligent search
857        let verified_results = search_engine.search(query, max_results).await?;
858
859        // Convert VerifiedResult back to RagSearchResult for compatibility
860        let results: Vec<RagSearchResult> = verified_results
861            .into_iter()
862            .map(|verified| RagSearchResult {
863                id: verified.result.id,
864                content: verified.result.content,
865                source_path: verified.result.source_path,
866                source_type: verified.result.source_type,
867                title: verified.result.title,
868                section: verified.result.section,
869                score: verified.confidence_score, // Use the verified confidence score
870                chunk_index: verified.result.chunk_index,
871                metadata: verified.result.metadata,
872            })
873            .collect();
874
875        log::info!(
876            "Intelligent search completed with {} results",
877            results.len()
878        );
879        Ok(results)
880    }
881
882    pub async fn get_stats(&self) -> Result<RagStats> {
883        if !self.config.enabled {
884            return Err(anyhow::anyhow!("RAG system is disabled"));
885        }
886
887        let indexer = Indexer::new(&self.config)?;
888        let index_path = indexer.get_index_path();
889        let embedding_dir = index_path.join("embeddings");
890
891        if !embedding_dir.exists() {
892            return Ok(RagStats {
893                total_documents: 0,
894                total_chunks: 0,
895                index_size_mb: 0.0,
896                last_updated: chrono::Utc::now(),
897                sources: vec![],
898            });
899        }
900
901        // Count chunks and calculate size
902        let mut total_chunks = 0;
903        let mut total_size = 0u64;
904        let mut sources = std::collections::HashSet::new();
905        let mut last_modified = std::time::UNIX_EPOCH;
906
907        let entries = std::fs::read_dir(&embedding_dir)?;
908        for entry in entries.flatten() {
909            if let Some(file_name) = entry.file_name().to_str() {
910                if file_name.ends_with(".json") {
911                    total_chunks += 1;
912
913                    if let Ok(metadata) = entry.metadata() {
914                        total_size += metadata.len();
915
916                        // Track most recent modification
917                        if let Ok(modified) = metadata.modified() {
918                            if modified > last_modified {
919                                last_modified = modified;
920                            }
921                        }
922                    }
923
924                    // Try to extract source info from chunk data
925                    if let Ok(content) = std::fs::read_to_string(entry.path()) {
926                        if let Ok(chunk_data) = serde_json::from_str::<StoredChunk>(&content) {
927                            if let Some(source_str) = chunk_data.source_path.to_str() {
928                                sources.insert(source_str.to_string());
929                            }
930                        }
931                    }
932                }
933            }
934        }
935
936        // Convert sources to unique document count estimate
937        let total_documents = sources.len();
938        let index_size_mb = total_size as f64 / (1024.0 * 1024.0);
939
940        let last_updated = chrono::DateTime::<chrono::Utc>::from(last_modified);
941
942        let sources_vec: Vec<String> = sources.into_iter().collect();
943
944        Ok(RagStats {
945            total_documents,
946            total_chunks,
947            index_size_mb,
948            last_updated,
949            sources: sources_vec,
950        })
951    }
952
953    pub async fn clear_index(&self) -> Result<()> {
954        if !self.config.enabled {
955            return Err(anyhow::anyhow!("RAG system is disabled"));
956        }
957
958        log::info!("Clearing local vector storage");
959
960        // Get index path and embeddings directory
961        let indexer = Indexer::new(&self.config)?;
962        let index_path = indexer.get_index_path();
963        let embedding_dir = index_path.join("embeddings");
964
965        if embedding_dir.exists() {
966            // Remove all embedding files
967            let entries = std::fs::read_dir(&embedding_dir)?;
968            let mut cleared_count = 0;
969
970            for entry in entries.flatten() {
971                if let Some(file_name) = entry.file_name().to_str() {
972                    if file_name.ends_with(".json") {
973                        if let Err(e) = std::fs::remove_file(entry.path()) {
974                            log::warn!("Failed to remove embedding file {:?}: {}", entry.path(), e);
975                        } else {
976                            cleared_count += 1;
977                        }
978                    }
979                }
980            }
981
982            log::info!(
983                "Successfully cleared {} embedding files from local vector storage",
984                cleared_count
985            );
986        } else {
987            log::info!("Local vector storage directory does not exist, nothing to clear");
988        }
989
990        Ok(())
991    }
992
993    pub async fn health_check(&self) -> Result<()> {
994        if !self.config.enabled {
995            return Err(anyhow::anyhow!("RAG system is disabled"));
996        }
997
998        log::info!("Running RAG system health check...");
999
1000        // Check if embedding model can be loaded
1001        let _embedding_model = EmbeddingModel::new_with_config(self.config.embedding.clone())
1002            .await
1003            .map_err(|e| anyhow::anyhow!("Embedding model unavailable: {}", e))?;
1004        log::info!("Embedding model loaded successfully");
1005
1006        // Check if index directory exists and is accessible
1007        let indexer = Indexer::new(&self.config)?;
1008        let index_path = indexer.get_index_path();
1009
1010        if index_path.exists() {
1011            log::info!("Local index directory exists: {:?}", index_path);
1012
1013            // Check embeddings directory
1014            let embedding_dir = index_path.join("embeddings");
1015            if embedding_dir.exists() {
1016                // Count existing embeddings
1017                match std::fs::read_dir(&embedding_dir) {
1018                    Ok(entries) => {
1019                        let count = entries.filter_map(|e| e.ok()).count();
1020                        log::info!(
1021                            "Local vector storage accessible with {} embedding files",
1022                            count
1023                        );
1024                    }
1025                    Err(e) => {
1026                        log::warn!(
1027                            "Local vector storage directory exists but cannot read contents: {}",
1028                            e
1029                        );
1030                    }
1031                }
1032            } else {
1033                log::info!("Local vector storage will be created when needed");
1034            }
1035        } else {
1036            log::info!("Local index directory will be created: {:?}", index_path);
1037        }
1038
1039        // Test file system write access
1040        let test_file = index_path.join(".health_check");
1041        match std::fs::create_dir_all(index_path) {
1042            Ok(_) => {
1043                match std::fs::write(&test_file, "health_check") {
1044                    Ok(_) => {
1045                        log::info!("File system write access confirmed");
1046                        let _ = std::fs::remove_file(&test_file); // Clean up test file
1047                    }
1048                    Err(e) => {
1049                        return Err(anyhow::anyhow!("File system write access failed: {}", e));
1050                    }
1051                }
1052            }
1053            Err(e) => {
1054                return Err(anyhow::anyhow!("Cannot create index directory: {}", e));
1055            }
1056        }
1057
1058        log::info!("RAG system health check: All systems operational");
1059        Ok(())
1060    }
1061
1062    /// Store document chunks in local file-based vector storage
1063    async fn store_chunks_locally(&self, chunks: &[DocumentChunk]) -> Result<()> {
1064        use uuid::Uuid;
1065
1066        if chunks.is_empty() {
1067            log::info!("No chunks to store locally");
1068            return Ok(());
1069        }
1070
1071        log::info!("Storing {} chunks in local vector storage", chunks.len());
1072
1073        // Initialize embedding model
1074        let embedding_model =
1075            EmbeddingModel::new_with_config(self.config.embedding.clone()).await?;
1076
1077        // Get index path and create embeddings directory
1078        let indexer = Indexer::new(&self.config)?;
1079        let index_path = indexer.get_index_path();
1080        let embedding_dir = index_path.join("embeddings");
1081
1082        // Create directories if they don't exist
1083        std::fs::create_dir_all(&embedding_dir)?;
1084
1085        // Process chunks and store with embeddings
1086        let mut stored_count = 0;
1087
1088        for (i, chunk) in chunks.iter().enumerate() {
1089            // Generate embedding for chunk content
1090            let embedding = match embedding_model.embed_text(&chunk.content).await {
1091                Ok(embedding) => embedding,
1092                Err(e) => {
1093                    log::warn!("Failed to generate embedding for chunk {}: {}", chunk.id, e);
1094                    continue;
1095                }
1096            };
1097
1098            // Create stored chunk with embedding
1099            let stored_chunk = StoredChunk {
1100                id: chunk.id.clone(),
1101                content: chunk.content.clone(),
1102                source_path: chunk.source_path.clone(),
1103                source_type: chunk.source_type.clone(),
1104                title: chunk.title.clone(),
1105                section: chunk.section.clone(),
1106                chunk_index: chunk.chunk_index,
1107                metadata: chunk.metadata.clone(),
1108                embedding,
1109            };
1110
1111            // Save to JSON file
1112            let file_id = Uuid::new_v4().to_string();
1113            let file_path = embedding_dir.join(format!("{}.json", file_id));
1114
1115            let json_content = serde_json::to_string_pretty(&stored_chunk)?;
1116            std::fs::write(&file_path, json_content)?;
1117
1118            stored_count += 1;
1119            log::debug!("Stored chunk {} to {:?}", chunk.id, file_path);
1120            if (i + 1) % 100 == 0 || i + 1 == chunks.len() {
1121                println!("Stored {}/{} chunks...", i + 1, chunks.len());
1122            }
1123        }
1124
1125        log::info!(
1126            "Successfully stored {} chunks in local vector storage",
1127            stored_count
1128        );
1129        Ok(())
1130    }
1131}
1132
1133/// Store chunks using a shared embedding model (config-based helper)
1134pub async fn store_chunks_with_model_config(
1135    config: &RagConfig,
1136    chunks: &[DocumentChunk],
1137    embedding_model: &EmbeddingModel,
1138) -> Result<usize> {
1139    use uuid::Uuid;
1140    if chunks.is_empty() {
1141        return Ok(0);
1142    }
1143
1144    let indexer = Indexer::new(config)?;
1145    let index_path = indexer.get_index_path();
1146    let embedding_dir = index_path.join("embeddings");
1147    std::fs::create_dir_all(&embedding_dir)?;
1148
1149    let mut stored_count = 0usize;
1150    for chunk in chunks {
1151        let embedding = match embedding_model.embed_text(&chunk.content).await {
1152            Ok(embedding) => embedding,
1153            Err(_) => continue,
1154        };
1155
1156        let stored_chunk = StoredChunk {
1157            id: chunk.id.clone(),
1158            content: chunk.content.clone(),
1159            source_path: chunk.source_path.clone(),
1160            source_type: chunk.source_type.clone(),
1161            title: chunk.title.clone(),
1162            section: chunk.section.clone(),
1163            chunk_index: chunk.chunk_index,
1164            metadata: chunk.metadata.clone(),
1165            embedding,
1166        };
1167
1168        let file_id = Uuid::new_v4().to_string();
1169        let file_path = embedding_dir.join(format!("{}.json", file_id));
1170        let json_content = serde_json::to_string_pretty(&stored_chunk)?;
1171        std::fs::write(&file_path, json_content)?;
1172        stored_count += 1;
1173    }
1174
1175    Ok(stored_count)
1176}