1use crate::rag::embeddings::EmbeddingModel;
7use crate::rag::indexer::Indexer;
8use crate::rag::llm::LlmClient;
9use crate::rag::search_engine::SmartSearchEngine;
10use anyhow::Result;
11use docrawl::{crawl, Config as DocrawlConfig, CrawlConfig};
12#[cfg(unix)]
14use serde::{Deserialize, Serialize};
19use std::path::PathBuf;
21use indicatif::{ProgressBar, ProgressStyle};
22use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering};
23use walkdir::WalkDir;
24
25pub mod benchmarks;
26pub mod embeddings;
27pub mod indexer;
28pub mod llm;
29pub mod model_metadata;
30pub mod providers;
31pub mod query_enhancer;
32pub mod result_verifier;
33pub mod search_engine;
34
35#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Default)]
37pub enum EmbeddingProvider {
38 #[default]
39 Hash, Onnx(String), Ollama(String), OpenAI(String), HuggingFace(String), Custom(String), }
46
47#[derive(Debug, Clone, Serialize, Deserialize)]
49pub struct EmbeddingConfig {
50 pub provider: EmbeddingProvider,
51 pub dimension: usize,
52 pub model_path: Option<PathBuf>, pub api_key: Option<String>, pub endpoint: Option<String>, pub timeout_seconds: u64,
56 pub batch_size: usize,
57}
58
59impl Default for EmbeddingConfig {
60 fn default() -> Self {
61 Self {
62 provider: EmbeddingProvider::Hash,
63 dimension: 384, model_path: None,
65 api_key: None,
66 endpoint: None,
67 timeout_seconds: 30,
68 batch_size: 32,
69 }
70 }
71}
72
73impl EmbeddingConfig {
74 pub async fn detect_and_update_dimension(&mut self) -> Result<()> {
76 use crate::rag::embeddings::EmbeddingModel;
77
78 let model = EmbeddingModel::new_with_config(self.clone()).await?;
79 let detected_dimension = model.get_dimension().await?;
80
81 if self.dimension != detected_dimension {
82 log::info!(
83 "Updating dimension from {} to {} for provider {:?}",
84 self.dimension,
85 detected_dimension,
86 self.provider
87 );
88 self.dimension = detected_dimension;
89 }
90
91 Ok(())
92 }
93}
94
/// Screening level applied to source-code files before indexing.
/// Exact enforcement lives in the indexer, not in this module.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
pub enum CodeSecurityLevel {
    /// Most restrictive screening.
    Strict,
    /// Balanced screening (the default).
    Moderate,
    /// Least restrictive screening.
    Permissive,
}
105
106impl Default for CodeSecurityLevel {
107 fn default() -> Self {
108 Self::Moderate
109 }
110}
111
112#[derive(Debug, Clone, Serialize, Deserialize)]
114pub struct SmartSearchConfig {
115 pub prefer_semantic: bool, pub enable_query_enhancement: bool, pub enable_result_verification: bool, pub min_confidence_score: f32, pub max_query_variations: usize, pub enable_multi_stage: bool, pub adaptive_chunking: bool, }
123
124impl Default for SmartSearchConfig {
125 fn default() -> Self {
126 Self {
127 prefer_semantic: true,
128 enable_query_enhancement: true,
129 enable_result_verification: true,
130 min_confidence_score: 0.7,
131 max_query_variations: 3,
132 enable_multi_stage: true,
133 adaptive_chunking: true,
134 }
135 }
136}
137
/// Top-level configuration for the RAG subsystem.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct RagConfig {
    /// Master switch; most `RagSystem` operations error out when false.
    pub enabled: bool,
    /// Root directory of the on-disk index.
    pub index_path: PathBuf,
    /// Maximum number of search results to return.
    pub max_results: usize,
    /// Minimum similarity for a match (0.0–1.0).
    pub similarity_threshold: f32,
    /// Whether PDF files may be parsed and indexed.
    pub allow_pdf_processing: bool,
    /// Whether source-code files may be indexed.
    pub allow_code_processing: bool,
    /// Screening level applied to code files — enforced by the indexer.
    pub code_security_level: CodeSecurityLevel,
    /// Redact detected secrets before storing content — enforced by the indexer.
    pub mask_secrets: bool,
    /// File-size cap in MB — presumably enforced by the indexer; confirm there.
    pub max_file_size_mb: u64,
    /// Embedding backend configuration.
    pub embedding: EmbeddingConfig,
    /// Smart-search pipeline configuration.
    pub smart_search: SmartSearchConfig,
}
153
impl Default for RagConfig {
    /// Conservative defaults: RAG enabled, cache-local index path,
    /// PDF parsing off, code indexing on with moderate screening.
    fn default() -> Self {
        Self {
            // NOTE(review): "~" is not expanded by std::fs — presumably the
            // indexer resolves/expands the index path; TODO confirm.
            enabled: true, index_path: PathBuf::from("~/.cache/manx/rag_index"),
            max_results: 10,
            similarity_threshold: 0.6,
            allow_pdf_processing: false, allow_code_processing: true, code_security_level: CodeSecurityLevel::Moderate,
            mask_secrets: true, max_file_size_mb: 100, embedding: EmbeddingConfig::default(),
            smart_search: SmartSearchConfig::default(),
        }
    }
}
171
/// One chunk of a source document, before embedding.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DocumentChunk {
    /// Stable identifier for this chunk.
    pub id: String,
    /// The chunk's text content.
    pub content: String,
    /// Path (or URL-derived path) of the originating document.
    pub source_path: PathBuf,
    /// Where the document came from (local file, web, etc.).
    pub source_type: SourceType,
    /// Document title, when known.
    pub title: Option<String>,
    /// Section heading this chunk belongs to, when known.
    pub section: Option<String>,
    /// Position of this chunk within its document.
    pub chunk_index: usize,
    /// File-level metadata captured at indexing time.
    pub metadata: DocumentMetadata,
}
184
/// Origin category of an indexed document.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum SourceType {
    /// File on the local filesystem.
    Local,
    /// Remote resource.
    Remote,
    /// Content from a curated collection.
    Curated,
    /// Crawled web page.
    Web,
}
193
/// Metadata recorded alongside each chunk at indexing time.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DocumentMetadata {
    /// File type label (format set by the indexer — confirm there).
    pub file_type: String,
    /// Source size in bytes.
    pub size: u64,
    /// Last-modified timestamp (UTC).
    pub modified: chrono::DateTime<chrono::Utc>,
    /// Free-form tags attached at indexing time.
    pub tags: Vec<String>,
    /// Detected language, when available.
    pub language: Option<String>,
}
203
/// A single search hit returned by `RagSystem::search`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct RagSearchResult {
    /// Identifier of the matched chunk.
    pub id: String,
    /// Matched chunk text.
    pub content: String,
    /// Originating document path.
    pub source_path: PathBuf,
    /// Origin category of the document.
    pub source_type: SourceType,
    /// Document title, when known.
    pub title: Option<String>,
    /// Section heading, when known.
    pub section: Option<String>,
    /// Result score; `search` populates this from the verifier's
    /// confidence score.
    pub score: f32,
    /// Position of the chunk within its document.
    pub chunk_index: usize,
    /// File-level metadata captured at indexing time.
    pub metadata: DocumentMetadata,
}
217
/// Aggregate statistics about the local index, computed by `get_stats`.
#[derive(Debug, Serialize, Deserialize)]
pub struct RagStats {
    /// Number of distinct source documents (distinct source paths).
    pub total_documents: usize,
    /// Number of stored chunks (one JSON file each).
    pub total_chunks: usize,
    /// Total on-disk size of stored chunks, in MB.
    pub index_size_mb: f64,
    /// Newest modification time across embedding files.
    pub last_updated: chrono::DateTime<chrono::Utc>,
    /// Distinct source paths present in the index.
    pub sources: Vec<String>,
}
227
/// On-disk form of a chunk: the `DocumentChunk` fields plus its embedding
/// vector. Serialized as one pretty-printed JSON file per chunk.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct StoredChunk {
    /// Chunk identifier (copied from `DocumentChunk::id`).
    pub id: String,
    /// Raw chunk text.
    pub content: String,
    /// Originating document path.
    pub source_path: PathBuf,
    /// Origin category of the document.
    pub source_type: SourceType,
    /// Document title, when known.
    pub title: Option<String>,
    /// Section heading, when known.
    pub section: Option<String>,
    /// Position of the chunk within its document.
    pub chunk_index: usize,
    /// File-level metadata captured at indexing time.
    pub metadata: DocumentMetadata,
    /// Embedding vector produced by the configured model.
    pub embedding: Vec<f32>,
}
241
/// Facade over indexing, local vector storage, and search.
pub struct RagSystem {
    // Full subsystem configuration; also used to locate the index on disk.
    config: RagConfig,
    // Optional LLM client handed to the smart-search pipeline.
    llm_client: Option<LlmClient>,
}
247
248impl RagSystem {
    /// Creates a RAG system without an LLM client; delegates to
    /// `new_with_llm` with `None`.
    pub async fn new(config: RagConfig) -> Result<Self> {
        Self::new_with_llm(config, None).await
    }
252
253 pub async fn new_with_llm(config: RagConfig, llm_client: Option<LlmClient>) -> Result<Self> {
254 if !config.enabled {
255 return Err(anyhow::anyhow!("RAG system is disabled"));
256 }
257
258 let indexer = Indexer::new(&config)?;
260 let index_path = indexer.get_index_path();
261
262 std::fs::create_dir_all(index_path)?;
264
265 log::info!(
266 "RAG system initialized with local vector storage at {:?}",
267 index_path
268 );
269 Ok(Self { config, llm_client })
270 }
271
272 pub async fn index_document(&mut self, path: PathBuf) -> Result<usize> {
273 if !self.config.enabled {
274 return Err(anyhow::anyhow!("RAG system is disabled"));
275 }
276
277 let indexer = Indexer::new(&self.config)?;
278 let chunks = indexer.index_document(path)?;
279 let chunk_count = chunks.len();
280
281 self.store_chunks_locally(&chunks).await?;
283
284 log::info!("Successfully indexed and stored {} chunks", chunk_count);
285 Ok(chunk_count)
286 }
287
288 pub async fn index_directory(&mut self, path: PathBuf) -> Result<usize> {
289 if !self.config.enabled {
290 return Err(anyhow::anyhow!("RAG system is disabled"));
291 }
292
293 let indexer = Indexer::new(&self.config)?;
294 let chunks = indexer.index_directory(path)?;
295 let chunk_count = chunks.len();
296
297 self.store_chunks_locally(&chunks).await?;
299
300 log::info!(
301 "Successfully indexed and stored {} chunks from directory",
302 chunk_count
303 );
304 Ok(chunk_count)
305 }
306
307 #[allow(dead_code)]
308 pub async fn index_url(&mut self, url: &str) -> Result<usize> {
309 if !self.config.enabled {
310 return Err(anyhow::anyhow!("RAG system is disabled"));
311 }
312
313 log::info!("Indexing URL: {}", url);
314
315 let indexer = Indexer::new(&self.config)?;
316 let chunks = indexer.index_url(url.to_string()).await?;
317 let chunk_count = chunks.len();
318
319 self.store_chunks_locally(&chunks).await?;
321
322 log::info!(
323 "Successfully indexed and stored {} chunks from URL",
324 chunk_count
325 );
326 Ok(chunk_count)
327 }
328
    /// Deep-indexes a URL by crawling linked pages through the indexer.
    ///
    /// NOTE(review): `max_pages` is only inspected for `None` (meaning
    /// "crawl everything"); its numeric value is never forwarded to the
    /// indexer — presumably the page cap is applied elsewhere; TODO confirm.
    #[allow(dead_code)]
    pub async fn index_url_deep(
        &mut self,
        url: &str,
        max_depth: Option<u32>,
        max_pages: Option<u32>,
    ) -> Result<usize> {
        if !self.config.enabled {
            return Err(anyhow::anyhow!("RAG system is disabled"));
        }

        log::info!(
            "Deep indexing URL: {} (depth: {:?}, pages: {:?})",
            url,
            max_depth,
            max_pages
        );

        let indexer = Indexer::new(&self.config)?;
        // `None` for max_pages is treated as an unbounded crawl.
        let crawl_all = max_pages.is_none(); let chunks = indexer
            .index_url_deep(url.to_string(), max_depth, crawl_all)
            .await?;
        let chunk_count = chunks.len();

        self.store_chunks_locally(&chunks).await?;

        log::info!(
            "Successfully deep indexed and stored {} chunks from URL",
            chunk_count
        );
        Ok(chunk_count)
    }
364
    /// Streams a deep crawl of `url`: docrawl writes markdown into a temp
    /// directory while a scanner task discovers new files and feeds them
    /// over a channel to concurrent embedding workers. Returns the total
    /// number of chunks stored.
    ///
    /// Fast paths: `max_depth == Some(0)` indexes just the page and
    /// `max_depth == Some(1)` does a shallow crawl — both skip the
    /// streaming pipeline unless `crawl_all` is set.
    pub async fn index_url_deep_stream(
        &self,
        url: &str,
        max_depth: Option<u32>,
        crawl_all: bool,
        embed_concurrency: Option<usize>,
        crawl_max_pages: Option<usize>,
    ) -> Result<usize> {
        use std::collections::HashSet;
        use std::sync::Arc;
        use tokio::sync::mpsc;
        use tokio::time::{interval, Duration};

        // Fast path: depth 0 means "just this page" — no crawler needed.
        if matches!(max_depth, Some(0)) && !crawl_all {
            eprintln!("\n🧭 Indexing single page (no crawl): {}", url);
            let embedding_model = std::sync::Arc::new(
                EmbeddingModel::new_with_config(self.config.embedding.clone()).await?,
            );
            let indexer = Indexer::new(&self.config)?;
            let chunks = indexer.index_single_url_no_crawl(url).await?;
            let total_stored =
                store_chunks_with_model_config(&self.config, &chunks, &embedding_model).await?;

            let index_path = indexer.get_index_path();
            eprintln!("\n==== Manx Index Summary ====");
            eprintln!("Mode: Single page (no crawl)");
            eprintln!("Chunks created: {}", chunks.len());
            eprintln!("Chunks stored: {}", total_stored);
            eprintln!("Index path: {}", index_path.display());
            return Ok(total_stored);
        }

        // Fast path: depth 1 — the page plus directly linked pages.
        if matches!(max_depth, Some(1)) && !crawl_all {
            eprintln!("\n🧭 Shallow crawl (depth 1) for: {}", url);
            let embedding_model = std::sync::Arc::new(
                EmbeddingModel::new_with_config(self.config.embedding.clone()).await?,
            );
            let indexer = Indexer::new(&self.config)?;
            let chunks = indexer.index_shallow_url(url, crawl_max_pages).await?;
            let total_stored =
                store_chunks_with_model_config(&self.config, &chunks, &embedding_model).await?;

            let index_path = indexer.get_index_path();
            eprintln!("\n==== Manx Index Summary ====");
            eprintln!("Mode: Shallow crawl (depth 1)");
            eprintln!("Chunks created: {}", chunks.len());
            eprintln!("Chunks stored: {}", total_stored);
            eprintln!("Index path: {}", index_path.display());
            return Ok(total_stored);
        }

        // Streaming pipeline: crawl into a unique scratch directory.
        let temp_dir = std::env::temp_dir().join(format!("manx_crawl_{}", uuid::Uuid::new_v4()));
        std::fs::create_dir_all(&temp_dir)?;

        eprintln!("\n🌐 Starting document crawl for: {}", url);
        eprintln!(" This will: 1) Crawl pages → 2) Chunk content → 3) Create embeddings");
        log::debug!("Temp directory: {}", temp_dir.display());
        eprintln!();

        // Resolve redirects so docrawl sees the final base URL; fall back
        // to parsing the raw string if the probe request fails.
        let base_url = if let Ok(resp) = reqwest::Client::new().get(url).send().await {
            resp.url().clone()
        } else {
            url::Url::parse(url)?
        };
        let crawl_config = CrawlConfig {
            base_url,
            output_dir: temp_dir.clone(),
            user_agent: "Manx/0.5.0 (Documentation Crawler)".to_string(),
            // Depth: explicit value wins; `crawl_all` means unbounded;
            // otherwise default to 3 levels.
            max_depth: if let Some(d) = max_depth {
                Some(d as usize)
            } else if crawl_all {
                None
            } else {
                Some(3)
            },
            silence: true, rate_limit_per_sec: 20, follow_sitemaps: true,
            concurrency: std::cmp::max(8, num_cpus::get()), timeout: Some(std::time::Duration::from_secs(30)),
            resume: false,
            config: DocrawlConfig::default(),
        };

        // One embedding model shared by all workers.
        let embedding_model = Arc::new(EmbeddingModel::new_with_config(self.config.embedding.clone()).await?);

        // Scanner → workers channel; workers share the receiver via a mutex.
        let (tx, rx) = mpsc::channel::<PathBuf>(200);
        let rx = std::sync::Arc::new(tokio::sync::Mutex::new(rx));
        let crawl_max_pages = crawl_max_pages.unwrap_or(usize::MAX);

        // Shared progress counters read by the monitor/progress bars.
        let pages_counter = Arc::new(AtomicUsize::new(0));
        let processed_pages_counter = Arc::new(AtomicUsize::new(0));
        let chunks_counter = Arc::new(AtomicUsize::new(0));

        let crawl_done = Arc::new(AtomicBool::new(false));
        let crawl_done_clone = crawl_done.clone();

        // Task 1: the crawler; flags completion so other tasks can wind down.
        let crawl_handle = tokio::spawn(async move {
            let result = crawl(crawl_config).await;
            crawl_done_clone.store(true, Ordering::Relaxed);
            result.map_err(|e| e.to_string())
        });

        // Task 2: scanner — polls the scratch directory for new .md files and
        // forwards them to the workers, adapting its polling interval.
        let temp_dir_clone = temp_dir.clone();
        let scanner_tx = tx.clone();
        let pc = pages_counter.clone();
        let crawl_done_scanner = crawl_done.clone();
        let scanner_handle = tokio::spawn(async move {
            // Give the crawler a head start before the first scan.
            tokio::time::sleep(Duration::from_secs(3)).await;

            let mut scan_interval_ms = 1000; let mut ticker = interval(Duration::from_millis(scan_interval_ms));
            let mut seen: HashSet<PathBuf> = HashSet::new();
            let mut idle_ticks = 0u32;
            let mut total_files_scanned;
            log::debug!("Scanner: Starting to monitor directory: {}", temp_dir_clone.display());

            loop {
                ticker.tick().await;
                let mut new_found = 0usize;
                let mut current_scan_count = 0usize;

                for entry in WalkDir::new(&temp_dir_clone)
                    .into_iter()
                    .filter_map(|e| e.ok())
                {
                    let path = entry.path();
                    current_scan_count += 1;

                    if path.is_file() {
                        if let Some(ext) = path.extension().and_then(|s| s.to_str()) {
                            if ext == "md" {
                                let pb = path.to_path_buf();
                                // Only forward files not sent on a previous scan.
                                if !seen.contains(&pb) {
                                    log::debug!("Scanner: Found new markdown file: {}", pb.file_name().unwrap_or_default().to_string_lossy());
                                    seen.insert(pb.clone());
                                    if scanner_tx.send(pb).await.is_err() {
                                        break;
                                    }
                                    new_found += 1;
                                    pc.fetch_add(1, Ordering::Relaxed);
                                    if seen.len() >= crawl_max_pages {
                                        break;
                                    }
                                }
                            }
                        }
                    }
                }

                total_files_scanned = current_scan_count;

                if new_found > 0 {
                    log::debug!("Scanner: Found {} new markdown files this scan (total: {})", new_found, seen.len());
                    idle_ticks = 0;
                    // Files are arriving: scan faster.
                    if scan_interval_ms > 300 {
                        scan_interval_ms = 300;
                        ticker = interval(Duration::from_millis(scan_interval_ms));
                    }
                } else {
                    idle_ticks += 1;

                    // Crawl finished: tighten the interval to drain what's left.
                    if crawl_done_scanner.load(Ordering::Relaxed) && scan_interval_ms > 200 {
                        scan_interval_ms = 200;
                        ticker = interval(Duration::from_millis(scan_interval_ms));
                    }

                    if idle_ticks % 10 == 0 {
                        log::debug!("Scanner: No new files found for {} ticks (scanned {} total files in directory)", idle_ticks, total_files_scanned);
                    }
                }

                if seen.len() >= crawl_max_pages {
                    eprintln!("Scanner: Reached max pages limit ({})", crawl_max_pages);
                    break;
                }

                // Exit once the crawl is done and the directory has gone quiet.
                if idle_ticks > 20 && crawl_done_scanner.load(Ordering::Relaxed) {
                    log::debug!("Scanner: Crawl is done and no new files for {} ticks, exiting", idle_ticks);
                    break;
                }

                // Safety valve: never spin forever, even if the done flag is stuck.
                if idle_ticks > 100 {
                    log::debug!("Scanner: Safety exit after {} ticks of no activity", idle_ticks);
                    break;
                }
            }
            let files_found = seen.len();
            // Drop our sender; the channel closes once the main task drops `tx` too.
            drop(scanner_tx);
            files_found
        });

        // Task group 3: embedding workers draining the channel.
        let workers = embed_concurrency.unwrap_or_else(|| std::cmp::max(4, num_cpus::get()));
        let mut joins = Vec::new();
        let config_clone = self.config.clone();
        let url_for_worker = url.to_string();
        for _ in 0..workers {
            let rx = rx.clone();
            let embedding_model = embedding_model.clone();
            let config_clone = config_clone.clone();
            let url_clone = url_for_worker.clone();
            let chunks_counter = chunks_counter.clone();
            let processed_pages_counter = processed_pages_counter.clone();
            let join = tokio::spawn(async move {
                let mut stored = 0usize;
                let idx = match Indexer::new(&config_clone) {
                    Ok(i) => i,
                    Err(_) => return 0usize,
                };
                loop {
                    // Hold the receiver lock only long enough to take one path.
                    let opt_path = { rx.lock().await.recv().await };
                    let Some(md_path) = opt_path else { break };
                    if let Ok(chunks) = idx.process_markdown_file(&md_path, &url_clone).await {
                        if let Ok(count) =
                            store_chunks_with_model_config(&config_clone, &chunks, &embedding_model)
                                .await
                        {
                            stored += count;
                            chunks_counter.fetch_add(count, Ordering::Relaxed);
                            processed_pages_counter.fetch_add(1, Ordering::Relaxed);
                        }
                    }
                }
                stored
            });
            joins.push(join);
        }

        // Wait for the crawl itself (workers keep draining meanwhile).
        let crawl_result = crawl_handle.await;
        let crawled_pages = if let Ok(Ok(stats)) = &crawl_result {
            eprintln!("\n✓ Crawl completed: {} pages crawled", stats.pages);
            stats.pages
        } else if let Ok(Err(e)) = &crawl_result {
            eprintln!("\n⚠️ Crawl completed with error: {}", e);
            0
        } else {
            eprintln!("\n⚠️ Crawl status unknown");
            0
        };

        eprintln!("\n📂 Scanning for markdown files...");

        // Spinner reporting the scanner's discovery progress.
        let pb = ProgressBar::new_spinner();
        pb.set_style(
            ProgressStyle::default_spinner()
                .template("{spinner:.green} {msg}")
                .unwrap()
        );
        pb.enable_steady_tick(std::time::Duration::from_millis(200));

        let pages_counter_clone = pages_counter.clone();
        let crawl_done_clone = crawl_done.clone();
        let pb_clone = pb.clone();
        let start_time = std::time::Instant::now();
        let monitor_handle = tokio::spawn(async move {
            let mut last_count = 0usize;
            let mut stable_cycles = 0;

            loop {
                tokio::time::sleep(Duration::from_millis(500)).await;
                let current_count = pages_counter_clone.load(Ordering::Relaxed);
                let is_crawl_done = crawl_done_clone.load(Ordering::Relaxed);
                let elapsed = start_time.elapsed().as_secs_f32().max(0.1);
                let rate = (current_count as f32) / elapsed;

                let message = if current_count != last_count {
                    last_count = current_count;
                    stable_cycles = 0;
                    format!("Found {} files | {:.1}/s", current_count, rate)
                } else {
                    stable_cycles += 1;
                    if stable_cycles < 4 {
                        format!("Found {} files (scanning...) | {:.1}/s", current_count, rate)
                    } else {
                        format!("Found {} files (finalizing...) | {:.1}/s", current_count, rate)
                    }
                };

                pb_clone.set_message(message);

                // Stop once the crawl is done and the count has been stable a while.
                if is_crawl_done && stable_cycles > 6 {
                    break;
                }
                // Hard stop after prolonged inactivity (60 × 500 ms).
                if stable_cycles > 60 {
                    break;
                }
            }
        });

        // The scanner returns how many markdown files it discovered.
        let scanner_result = scanner_handle.await;
        let _scanner_files = scanner_result.unwrap_or(0);

        monitor_handle.abort();
        let final_count = pages_counter.load(Ordering::Relaxed);
        pb.finish_with_message(format!("✓ Found {} markdown files", final_count));

        // Close the last sender so workers drain the queue and exit.
        drop(tx);

        let total_pages_found = pages_counter.load(Ordering::Relaxed);

        let mut processed_so_far = processed_pages_counter.load(Ordering::Relaxed);

        if total_pages_found > 0 {
            eprintln!("\n📄 Processing {} markdown files...", total_pages_found);

            // Progress bar tracking worker throughput (files + chunks).
            let pb = ProgressBar::new(total_pages_found as u64);
            pb.set_style(
                ProgressStyle::default_bar()
                    .template("{spinner:.green} [{elapsed_precise}] [{bar:40.cyan/blue}] {pos}/{len} files ({percent}%) | {msg}")
                    .unwrap()
                    .progress_chars("█▉▊▋▌▍▎▏ ")
            );
            pb.set_message("Chunking... 0 chunks created".to_string());

            // Poll counters until all files are processed or progress stalls
            // for ~12 s (60 × 200 ms).
            let mut last_processed = processed_so_far;
            let mut stall_counter = 0;
            while processed_so_far < total_pages_found && stall_counter < 60 {
                tokio::time::sleep(Duration::from_millis(200)).await;
                processed_so_far = processed_pages_counter.load(Ordering::Relaxed);
                let chunks_so_far = chunks_counter.load(Ordering::Relaxed);

                if processed_so_far != last_processed {
                    pb.set_position(processed_so_far as u64);
                    pb.set_message(format!("{} chunks created", chunks_so_far));
                    last_processed = processed_so_far;
                    stall_counter = 0;
                } else {
                    stall_counter += 1;
                    pb.tick();
                }
            }

            pb.set_position(processed_so_far as u64);

            if processed_so_far == total_pages_found {
                pb.finish_with_message("✓ All files processed");
            } else {
                pb.abandon_with_message("Processing incomplete - some files may have failed");
            }
        } else {
            eprintln!("\n⚠️ No markdown files found to process");
            eprintln!(" The crawler processed {} pages but docrawl generated no markdown files.", crawled_pages);
            eprintln!(" This can happen when:");
            eprintln!(" • The site uses JavaScript rendering that docrawl can't parse");
            eprintln!(" • The pages contain mostly non-text content (images, PDFs, etc.)");
            eprintln!(" • The site structure isn't compatible with the crawler");
            eprintln!(" Try:");
            eprintln!(" • Using a different URL that points to documentation pages");
            eprintln!(" • Indexing local files instead if you have them downloaded");
        }

        let mut total_stored = 0usize;

        if total_pages_found > 0 {
            eprintln!("\n⏳ Waiting for workers to finish...");
            let pb_final = ProgressBar::new_spinner();
            pb_final.set_style(
                ProgressStyle::default_spinner()
                    .template("{spinner:.green} {msg}")
                    .unwrap()
            );
            pb_final.set_message("Finalizing embeddings and storing to database...");
            pb_final.enable_steady_tick(std::time::Duration::from_millis(100));

            // Sum each worker's stored-chunk count as it completes.
            for j in joins {
                pb_final.tick();
                if let Ok(count) = j.await {
                    total_stored += count;
                }
            }

            pb_final.finish_with_message("✓ Index finalized");
        } else {
            for j in joins {
                if let Ok(count) = j.await {
                    total_stored += count;
                }
            }
        }

        // Best-effort cleanup of the crawl scratch directory.
        let _ = std::fs::remove_dir_all(&temp_dir);

        let total_pages = pages_counter.load(Ordering::Relaxed);
        let final_processed = processed_pages_counter.load(Ordering::Relaxed);
        let final_chunks = chunks_counter.load(Ordering::Relaxed);
        let indexer = Indexer::new(&self.config)?;
        let index_path = indexer.get_index_path();

        eprintln!();
        eprintln!("==== Manx Index Summary ====");
        eprintln!("Markdown files found: {}", total_pages);
        eprintln!("Files processed: {}", final_processed);
        eprintln!("Chunks created: {}", final_chunks);
        eprintln!("Chunks stored: {}", total_stored);
        eprintln!("Index path: {}", index_path.display());

        if total_pages == 0 {
            eprintln!();
            eprintln!("⚠️ No markdown files were found. Docrawl may not have generated any content.");
            eprintln!(" This could mean the site structure is not compatible with crawling.");
        } else if total_stored == 0 {
            eprintln!();
            eprintln!("⚠️ No chunks were stored. The markdown files may have been empty.");
        }

        Ok(total_stored)
    }
819
820 pub async fn search(
821 &self,
822 query: &str,
823 max_results: Option<usize>,
824 ) -> Result<Vec<RagSearchResult>> {
825 if !self.config.enabled {
826 return Err(anyhow::anyhow!("RAG system is disabled"));
827 }
828
829 log::info!("Starting intelligent search for: '{}'", query);
830
831 let search_engine =
833 SmartSearchEngine::new(self.config.clone(), self.llm_client.clone()).await?;
834
835 let verified_results = search_engine.search(query, max_results).await?;
837
838 let results: Vec<RagSearchResult> = verified_results
840 .into_iter()
841 .map(|verified| RagSearchResult {
842 id: verified.result.id,
843 content: verified.result.content,
844 source_path: verified.result.source_path,
845 source_type: verified.result.source_type,
846 title: verified.result.title,
847 section: verified.result.section,
848 score: verified.confidence_score, chunk_index: verified.result.chunk_index,
850 metadata: verified.result.metadata,
851 })
852 .collect();
853
854 log::info!(
855 "Intelligent search completed with {} results",
856 results.len()
857 );
858 Ok(results)
859 }
860
    /// Computes index statistics by scanning the embeddings directory.
    ///
    /// Counts one chunk per `.json` file, sums file sizes, tracks the
    /// newest mtime, and derives the document count from the distinct
    /// `source_path`s found by parsing each stored chunk.
    pub async fn get_stats(&self) -> Result<RagStats> {
        if !self.config.enabled {
            return Err(anyhow::anyhow!("RAG system is disabled"));
        }

        let indexer = Indexer::new(&self.config)?;
        let index_path = indexer.get_index_path();
        let embedding_dir = index_path.join("embeddings");

        // No embeddings directory yet: report an empty index.
        if !embedding_dir.exists() {
            return Ok(RagStats {
                total_documents: 0,
                total_chunks: 0,
                index_size_mb: 0.0,
                last_updated: chrono::Utc::now(),
                sources: vec![],
            });
        }

        let mut total_chunks = 0;
        let mut total_size = 0u64;
        let mut sources = std::collections::HashSet::new();
        let mut last_modified = std::time::UNIX_EPOCH;

        let entries = std::fs::read_dir(&embedding_dir)?;
        for entry in entries.flatten() {
            if let Some(file_name) = entry.file_name().to_str() {
                if file_name.ends_with(".json") {
                    total_chunks += 1;

                    if let Ok(metadata) = entry.metadata() {
                        total_size += metadata.len();

                        // Track the newest modification time across files.
                        if let Ok(modified) = metadata.modified() {
                            if modified > last_modified {
                                last_modified = modified;
                            }
                        }
                    }

                    // Parse each chunk to collect its source path; unreadable
                    // or malformed files are silently skipped.
                    if let Ok(content) = std::fs::read_to_string(entry.path()) {
                        if let Ok(chunk_data) = serde_json::from_str::<StoredChunk>(&content) {
                            if let Some(source_str) = chunk_data.source_path.to_str() {
                                sources.insert(source_str.to_string());
                            }
                        }
                    }
                }
            }
        }

        // One "document" per distinct source path.
        let total_documents = sources.len();
        let index_size_mb = total_size as f64 / (1024.0 * 1024.0);

        // If no file exposed a readable mtime this stays at the Unix epoch.
        let last_updated = chrono::DateTime::<chrono::Utc>::from(last_modified);

        let sources_vec: Vec<String> = sources.into_iter().collect();

        Ok(RagStats {
            total_documents,
            total_chunks,
            index_size_mb,
            last_updated,
            sources: sources_vec,
        })
    }
931
932 pub async fn clear_index(&self) -> Result<()> {
933 if !self.config.enabled {
934 return Err(anyhow::anyhow!("RAG system is disabled"));
935 }
936
937 log::info!("Clearing local vector storage");
938
939 let indexer = Indexer::new(&self.config)?;
941 let index_path = indexer.get_index_path();
942 let embedding_dir = index_path.join("embeddings");
943
944 if embedding_dir.exists() {
945 let entries = std::fs::read_dir(&embedding_dir)?;
947 let mut cleared_count = 0;
948
949 for entry in entries.flatten() {
950 if let Some(file_name) = entry.file_name().to_str() {
951 if file_name.ends_with(".json") {
952 if let Err(e) = std::fs::remove_file(entry.path()) {
953 log::warn!("Failed to remove embedding file {:?}: {}", entry.path(), e);
954 } else {
955 cleared_count += 1;
956 }
957 }
958 }
959 }
960
961 log::info!(
962 "Successfully cleared {} embedding files from local vector storage",
963 cleared_count
964 );
965 } else {
966 log::info!("Local vector storage directory does not exist, nothing to clear");
967 }
968
969 Ok(())
970 }
971
    /// Verifies the RAG system can operate: the embedding model loads,
    /// the index directory is (or can be made) accessible, and the
    /// filesystem is writable. Returns the first failed check as an error.
    pub async fn health_check(&self) -> Result<()> {
        if !self.config.enabled {
            return Err(anyhow::anyhow!("RAG system is disabled"));
        }

        log::info!("Running RAG system health check...");

        // Check 1: the embedding backend can be constructed.
        let _embedding_model = EmbeddingModel::new_with_config(self.config.embedding.clone())
            .await
            .map_err(|e| anyhow::anyhow!("Embedding model unavailable: {}", e))?;
        log::info!("✓ Embedding model loaded successfully");

        // Check 2: index/storage directories are present or creatable.
        let indexer = Indexer::new(&self.config)?;
        let index_path = indexer.get_index_path();

        if index_path.exists() {
            log::info!("✓ Local index directory exists: {:?}", index_path);

            let embedding_dir = index_path.join("embeddings");
            if embedding_dir.exists() {
                match std::fs::read_dir(&embedding_dir) {
                    Ok(entries) => {
                        let count = entries.filter_map(|e| e.ok()).count();
                        log::info!(
                            "✓ Local vector storage accessible with {} embedding files",
                            count
                        );
                    }
                    Err(e) => {
                        // Unreadable storage dir is only a warning here; the
                        // write probe below decides overall pass/fail.
                        log::warn!(
                            "⚠ Local vector storage directory exists but cannot read contents: {}",
                            e
                        );
                    }
                }
            } else {
                log::info!("✓ Local vector storage will be created when needed");
            }
        } else {
            log::info!("✓ Local index directory will be created: {:?}", index_path);
        }

        // Check 3: the index directory is actually writable.
        let test_file = index_path.join(".health_check");
        match std::fs::create_dir_all(index_path) {
            Ok(_) => {
                match std::fs::write(&test_file, "health_check") {
                    Ok(_) => {
                        log::info!("✓ File system write access confirmed");
                        // Best-effort cleanup of the probe file.
                        let _ = std::fs::remove_file(&test_file); }
                    Err(e) => {
                        return Err(anyhow::anyhow!("File system write access failed: {}", e));
                    }
                }
            }
            Err(e) => {
                return Err(anyhow::anyhow!("Cannot create index directory: {}", e));
            }
        }

        log::info!("RAG system health check: All systems operational");
        Ok(())
    }
1040
1041 async fn store_chunks_locally(&self, chunks: &[DocumentChunk]) -> Result<()> {
1043 use uuid::Uuid;
1044
1045 if chunks.is_empty() {
1046 log::info!("No chunks to store locally");
1047 return Ok(());
1048 }
1049
1050 log::info!("Storing {} chunks in local vector storage", chunks.len());
1051
1052 let embedding_model =
1054 EmbeddingModel::new_with_config(self.config.embedding.clone()).await?;
1055
1056 let indexer = Indexer::new(&self.config)?;
1058 let index_path = indexer.get_index_path();
1059 let embedding_dir = index_path.join("embeddings");
1060
1061 std::fs::create_dir_all(&embedding_dir)?;
1063
1064 let mut stored_count = 0;
1066
1067 for (i, chunk) in chunks.iter().enumerate() {
1068 let embedding = match embedding_model.embed_text(&chunk.content).await {
1070 Ok(embedding) => embedding,
1071 Err(e) => {
1072 log::warn!("Failed to generate embedding for chunk {}: {}", chunk.id, e);
1073 continue;
1074 }
1075 };
1076
1077 let stored_chunk = StoredChunk {
1079 id: chunk.id.clone(),
1080 content: chunk.content.clone(),
1081 source_path: chunk.source_path.clone(),
1082 source_type: chunk.source_type.clone(),
1083 title: chunk.title.clone(),
1084 section: chunk.section.clone(),
1085 chunk_index: chunk.chunk_index,
1086 metadata: chunk.metadata.clone(),
1087 embedding,
1088 };
1089
1090 let file_id = Uuid::new_v4().to_string();
1092 let file_path = embedding_dir.join(format!("{}.json", file_id));
1093
1094 let json_content = serde_json::to_string_pretty(&stored_chunk)?;
1095 std::fs::write(&file_path, json_content)?;
1096
1097 stored_count += 1;
1098 log::debug!("Stored chunk {} to {:?}", chunk.id, file_path);
1099 if (i + 1) % 100 == 0 || i + 1 == chunks.len() {
1100 println!("Stored {}/{} chunks...", i + 1, chunks.len());
1101 }
1102 }
1103
1104 log::info!(
1105 "Successfully stored {} chunks in local vector storage",
1106 stored_count
1107 );
1108 Ok(())
1109 }
1110}
1111
1112pub async fn store_chunks_with_model_config(
1114 config: &RagConfig,
1115 chunks: &[DocumentChunk],
1116 embedding_model: &EmbeddingModel,
1117) -> Result<usize> {
1118 use uuid::Uuid;
1119 if chunks.is_empty() {
1120 return Ok(0);
1121 }
1122
1123 let indexer = Indexer::new(config)?;
1124 let index_path = indexer.get_index_path();
1125 let embedding_dir = index_path.join("embeddings");
1126 std::fs::create_dir_all(&embedding_dir)?;
1127
1128 let mut stored_count = 0usize;
1129 for chunk in chunks {
1130 let embedding = match embedding_model.embed_text(&chunk.content).await {
1131 Ok(embedding) => embedding,
1132 Err(_) => continue,
1133 };
1134
1135 let stored_chunk = StoredChunk {
1136 id: chunk.id.clone(),
1137 content: chunk.content.clone(),
1138 source_path: chunk.source_path.clone(),
1139 source_type: chunk.source_type.clone(),
1140 title: chunk.title.clone(),
1141 section: chunk.section.clone(),
1142 chunk_index: chunk.chunk_index,
1143 metadata: chunk.metadata.clone(),
1144 embedding,
1145 };
1146
1147 let file_id = Uuid::new_v4().to_string();
1148 let file_path = embedding_dir.join(format!("{}.json", file_id));
1149 let json_content = serde_json::to_string_pretty(&stored_chunk)?;
1150 std::fs::write(&file_path, json_content)?;
1151 stored_count += 1;
1152 }
1153
1154 Ok(stored_count)
1155}