1use crate::rag::embeddings::EmbeddingModel;
7use crate::rag::indexer::Indexer;
8use crate::rag::llm::LlmClient;
9use crate::rag::search_engine::SmartSearchEngine;
10use anyhow::Result;
11use docrawl::{crawl, Config as DocrawlConfig, CrawlConfig};
12#[cfg(unix)]
14use serde::{Deserialize, Serialize};
19use indicatif::{ProgressBar, ProgressStyle};
21use std::path::PathBuf;
22use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering};
23use walkdir::WalkDir;
24
25pub mod benchmarks;
26pub mod embeddings;
27pub mod indexer;
28pub mod llm;
29pub mod model_metadata;
30pub mod providers;
31pub mod query_enhancer;
32pub mod result_verifier;
33pub mod search_engine;
34
35#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Default)]
37pub enum EmbeddingProvider {
38 #[default]
39 Hash, Onnx(String), Ollama(String), OpenAI(String), HuggingFace(String), Custom(String), }
46
47#[derive(Debug, Clone, Serialize, Deserialize)]
49pub struct EmbeddingConfig {
50 pub provider: EmbeddingProvider,
51 pub dimension: usize,
52 pub model_path: Option<PathBuf>, pub api_key: Option<String>, pub endpoint: Option<String>, pub timeout_seconds: u64,
56 pub batch_size: usize,
57}
58
59impl Default for EmbeddingConfig {
60 fn default() -> Self {
61 Self {
62 provider: EmbeddingProvider::Hash,
63 dimension: 384, model_path: None,
65 api_key: None,
66 endpoint: None,
67 timeout_seconds: 30,
68 batch_size: 32,
69 }
70 }
71}
72
73impl EmbeddingConfig {
74 pub async fn detect_and_update_dimension(&mut self) -> Result<()> {
76 use crate::rag::embeddings::EmbeddingModel;
77
78 let model = EmbeddingModel::new_with_config(self.clone()).await?;
79 let detected_dimension = model.get_dimension().await?;
80
81 if self.dimension != detected_dimension {
82 log::info!(
83 "Updating dimension from {} to {} for provider {:?}",
84 self.dimension,
85 detected_dimension,
86 self.provider
87 );
88 self.dimension = detected_dimension;
89 }
90
91 Ok(())
92 }
93}
94
/// How aggressively code files are screened before indexing.
///
/// NOTE(review): the concrete screening rules live in the indexer;
/// the exact semantics of each level are not visible here — confirm there.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Default)]
pub enum CodeSecurityLevel {
    /// Most restrictive.
    Strict,
    /// Balanced default.
    #[default]
    Moderate,
    /// Least restrictive.
    Permissive,
}
106
107#[derive(Debug, Clone, Serialize, Deserialize)]
109pub struct SmartSearchConfig {
110 pub prefer_semantic: bool, pub enable_query_enhancement: bool, pub enable_result_verification: bool, pub min_confidence_score: f32, pub max_query_variations: usize, pub enable_multi_stage: bool, pub adaptive_chunking: bool, }
118
119impl Default for SmartSearchConfig {
120 fn default() -> Self {
121 Self {
122 prefer_semantic: true,
123 enable_query_enhancement: true,
124 enable_result_verification: true,
125 min_confidence_score: 0.7,
126 max_query_variations: 3,
127 enable_multi_stage: true,
128 adaptive_chunking: true,
129 }
130 }
131}
132
/// Top-level configuration for the RAG subsystem.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct RagConfig {
    /// Master switch; every `RagSystem` operation errors when false.
    pub enabled: bool,
    /// Root directory of the on-disk index.
    pub index_path: PathBuf,
    /// Default cap on returned search results.
    pub max_results: usize,
    /// Minimum similarity for a match — enforced by the search layer (confirm).
    pub similarity_threshold: f32,
    /// Whether PDF files may be parsed and indexed.
    pub allow_pdf_processing: bool,
    /// Whether source-code files may be indexed.
    pub allow_code_processing: bool,
    /// Screening level applied to code files.
    pub code_security_level: CodeSecurityLevel,
    /// Redact detected secrets before storing content — enforced by the
    /// indexer (confirm).
    pub mask_secrets: bool,
    /// Upper bound on indexable file size, in megabytes.
    pub max_file_size_mb: u64,
    /// Embedding backend settings.
    pub embedding: EmbeddingConfig,
    /// Smart-search pipeline settings.
    pub smart_search: SmartSearchConfig,
}
148
149impl Default for RagConfig {
150 fn default() -> Self {
151 Self {
152 enabled: true, index_path: PathBuf::from("~/.cache/manx/rag_index"),
154 max_results: 10,
155 similarity_threshold: 0.6,
156 allow_pdf_processing: false, allow_code_processing: true, code_security_level: CodeSecurityLevel::Moderate,
159 mask_secrets: true, max_file_size_mb: 100, embedding: EmbeddingConfig::default(),
162 smart_search: SmartSearchConfig::default(),
163 }
164 }
165}
166
/// One indexed piece of a source document, prior to embedding.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DocumentChunk {
    /// Chunk identifier, assigned by the indexer.
    pub id: String,
    /// The chunk's text content.
    pub content: String,
    /// Origin of the chunk (local path, or a path derived from a URL).
    pub source_path: PathBuf,
    /// Broad origin category.
    pub source_type: SourceType,
    /// Document title, when one was detected.
    pub title: Option<String>,
    /// Section heading the chunk falls under, when detected.
    pub section: Option<String>,
    /// Position of this chunk within its source document.
    pub chunk_index: usize,
    /// File-level metadata captured at index time.
    pub metadata: DocumentMetadata,
}
179
/// Broad origin category for indexed content.
///
/// NOTE(review): variant semantics inferred from names; the assigning
/// code lives in the indexer — confirm there.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum SourceType {
    /// Local filesystem content.
    Local,
    /// Content fetched from a remote location.
    Remote,
    /// Bundled/curated documentation.
    Curated,
    /// Pages obtained via web crawling.
    Web,
}
188
/// Metadata recorded alongside each indexed chunk.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DocumentMetadata {
    /// File type label (exact format set by the indexer — confirm).
    pub file_type: String,
    /// Source size — presumably bytes; set by the indexer.
    pub size: u64,
    /// Last-modified timestamp (UTC).
    pub modified: chrono::DateTime<chrono::Utc>,
    /// Free-form tags attached at index time.
    pub tags: Vec<String>,
    /// Detected language, when known.
    pub language: Option<String>,
}
198
/// A single search hit returned to callers of `RagSystem::search`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct RagSearchResult {
    /// Identifier of the matching chunk.
    pub id: String,
    /// Matching chunk text.
    pub content: String,
    /// Origin of the chunk.
    pub source_path: PathBuf,
    /// Broad origin category.
    pub source_type: SourceType,
    /// Document title, when known.
    pub title: Option<String>,
    /// Section heading, when known.
    pub section: Option<String>,
    /// Relevance score; `RagSystem::search` fills this with the result
    /// verifier's confidence score.
    pub score: f32,
    /// Position of the chunk within its source document.
    pub chunk_index: usize,
    /// File-level metadata captured at index time.
    pub metadata: DocumentMetadata,
}
212
/// Aggregate statistics about the on-disk index, as computed by
/// `RagSystem::get_stats`.
#[derive(Debug, Serialize, Deserialize)]
pub struct RagStats {
    /// Number of distinct source paths seen across stored chunks.
    pub total_documents: usize,
    /// Number of stored chunk files.
    pub total_chunks: usize,
    /// Total size of stored chunk files, in MiB (1024 * 1024 bytes).
    pub index_size_mb: f64,
    /// Most recent modification time among stored chunk files.
    pub last_updated: chrono::DateTime<chrono::Utc>,
    /// Distinct source identifiers present in the index.
    pub sources: Vec<String>,
}
222
/// On-disk representation of a chunk: the `DocumentChunk` fields plus
/// its embedding vector. Persisted as one pretty-printed JSON file per
/// chunk under `<index>/embeddings/`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct StoredChunk {
    /// Chunk identifier (not the storage file name — that is a UUID).
    pub id: String,
    /// Chunk text content.
    pub content: String,
    /// Origin of the chunk.
    pub source_path: PathBuf,
    /// Broad origin category.
    pub source_type: SourceType,
    /// Document title, when known.
    pub title: Option<String>,
    /// Section heading, when known.
    pub section: Option<String>,
    /// Position of the chunk within its source document.
    pub chunk_index: usize,
    /// File-level metadata captured at index time.
    pub metadata: DocumentMetadata,
    /// Embedding vector produced by the configured model.
    pub embedding: Vec<f32>,
}
236
/// Facade over indexing, local vector storage, and search.
pub struct RagSystem {
    /// Active configuration; `enabled` is checked before every operation.
    config: RagConfig,
    /// Optional LLM client handed to the smart search engine.
    llm_client: Option<LlmClient>,
}
242
243impl RagSystem {
    /// Convenience constructor: builds a `RagSystem` without an LLM client.
    pub async fn new(config: RagConfig) -> Result<Self> {
        Self::new_with_llm(config, None).await
    }
247
248 pub async fn new_with_llm(config: RagConfig, llm_client: Option<LlmClient>) -> Result<Self> {
249 if !config.enabled {
250 return Err(anyhow::anyhow!("RAG system is disabled"));
251 }
252
253 let indexer = Indexer::new(&config)?;
255 let index_path = indexer.get_index_path();
256
257 std::fs::create_dir_all(index_path)?;
259
260 log::info!(
261 "RAG system initialized with local vector storage at {:?}",
262 index_path
263 );
264 Ok(Self { config, llm_client })
265 }
266
267 pub async fn index_document(&mut self, path: PathBuf) -> Result<usize> {
268 if !self.config.enabled {
269 return Err(anyhow::anyhow!("RAG system is disabled"));
270 }
271
272 let indexer = Indexer::new(&self.config)?;
273 let chunks = indexer.index_document(path)?;
274 let chunk_count = chunks.len();
275
276 self.store_chunks_locally(&chunks).await?;
278
279 log::info!("Successfully indexed and stored {} chunks", chunk_count);
280 Ok(chunk_count)
281 }
282
283 pub async fn index_directory(&mut self, path: PathBuf) -> Result<usize> {
284 if !self.config.enabled {
285 return Err(anyhow::anyhow!("RAG system is disabled"));
286 }
287
288 let indexer = Indexer::new(&self.config)?;
289 let chunks = indexer.index_directory(path)?;
290 let chunk_count = chunks.len();
291
292 self.store_chunks_locally(&chunks).await?;
294
295 log::info!(
296 "Successfully indexed and stored {} chunks from directory",
297 chunk_count
298 );
299 Ok(chunk_count)
300 }
301
302 #[allow(dead_code)]
303 pub async fn index_url(&mut self, url: &str) -> Result<usize> {
304 if !self.config.enabled {
305 return Err(anyhow::anyhow!("RAG system is disabled"));
306 }
307
308 log::info!("Indexing URL: {}", url);
309
310 let indexer = Indexer::new(&self.config)?;
311 let chunks = indexer.index_url(url.to_string()).await?;
312 let chunk_count = chunks.len();
313
314 self.store_chunks_locally(&chunks).await?;
316
317 log::info!(
318 "Successfully indexed and stored {} chunks from URL",
319 chunk_count
320 );
321 Ok(chunk_count)
322 }
323
324 #[allow(dead_code)]
325 pub async fn index_url_deep(
326 &mut self,
327 url: &str,
328 max_depth: Option<u32>,
329 max_pages: Option<u32>,
330 ) -> Result<usize> {
331 if !self.config.enabled {
332 return Err(anyhow::anyhow!("RAG system is disabled"));
333 }
334
335 log::info!(
336 "Deep indexing URL: {} (depth: {:?}, pages: {:?})",
337 url,
338 max_depth,
339 max_pages
340 );
341
342 let indexer = Indexer::new(&self.config)?;
343 let crawl_all = max_pages.is_none(); let chunks = indexer
346 .index_url_deep(url.to_string(), max_depth, crawl_all)
347 .await?;
348 let chunk_count = chunks.len();
349
350 self.store_chunks_locally(&chunks).await?;
352
353 log::info!(
354 "Successfully deep indexed and stored {} chunks from URL",
355 chunk_count
356 );
357 Ok(chunk_count)
358 }
359
    /// Streaming deep-index pipeline: crawls `url` with docrawl into a
    /// temp directory while, concurrently, a scanner task discovers the
    /// generated markdown files and a pool of worker tasks chunks,
    /// embeds, and stores them.
    ///
    /// Fast paths: `max_depth == Some(0)` indexes just the one page and
    /// `max_depth == Some(1)` performs a shallow crawl — both only when
    /// `crawl_all` is false.
    ///
    /// * `embed_concurrency` — number of embed/store workers
    ///   (default: max(4, CPU count)).
    /// * `crawl_max_pages` — cap on markdown files processed
    ///   (default: unlimited).
    ///
    /// Returns the total number of chunks stored.
    pub async fn index_url_deep_stream(
        &self,
        url: &str,
        max_depth: Option<u32>,
        crawl_all: bool,
        embed_concurrency: Option<usize>,
        crawl_max_pages: Option<usize>,
    ) -> Result<usize> {
        use std::collections::HashSet;
        use std::sync::Arc;
        use tokio::sync::mpsc;
        use tokio::time::{interval, Duration};

        // Fast path 1: depth 0 means "just this page", no crawl at all.
        if matches!(max_depth, Some(0)) && !crawl_all {
            eprintln!("\nIndexing single page (no crawl): {}", url);
            let embedding_model = std::sync::Arc::new(
                EmbeddingModel::new_with_config(self.config.embedding.clone()).await?,
            );
            let indexer = Indexer::new(&self.config)?;
            let chunks = indexer.index_single_url_no_crawl(url).await?;
            let total_stored =
                store_chunks_with_model_config(&self.config, &chunks, &embedding_model).await?;

            let index_path = indexer.get_index_path();
            eprintln!("\n==== Manx Index Summary ====");
            eprintln!("Mode: Single page (no crawl)");
            eprintln!("Chunks created: {}", chunks.len());
            eprintln!("Chunks stored: {}", total_stored);
            eprintln!("Index path: {}", index_path.display());
            return Ok(total_stored);
        }

        // Fast path 2: depth 1 is a shallow crawl handled by the indexer.
        if matches!(max_depth, Some(1)) && !crawl_all {
            eprintln!("\nShallow crawl (depth 1) for: {}", url);
            let embedding_model = std::sync::Arc::new(
                EmbeddingModel::new_with_config(self.config.embedding.clone()).await?,
            );
            let indexer = Indexer::new(&self.config)?;
            let chunks = indexer.index_shallow_url(url, crawl_max_pages).await?;
            let total_stored =
                store_chunks_with_model_config(&self.config, &chunks, &embedding_model).await?;

            let index_path = indexer.get_index_path();
            eprintln!("\n==== Manx Index Summary ====");
            eprintln!("Mode: Shallow crawl (depth 1)");
            eprintln!("Chunks created: {}", chunks.len());
            eprintln!("Chunks stored: {}", total_stored);
            eprintln!("Index path: {}", index_path.display());
            return Ok(total_stored);
        }

        // docrawl writes markdown files here; removed at the end.
        let temp_dir = std::env::temp_dir().join(format!("manx_crawl_{}", uuid::Uuid::new_v4()));
        std::fs::create_dir_all(&temp_dir)?;

        eprintln!("\nStarting document crawl for: {}", url);
        eprintln!("   This will: 1) Crawl pages -> 2) Chunk content -> 3) Create embeddings");
        log::debug!("Temp directory: {}", temp_dir.display());
        eprintln!();

        // Follow redirects to get the effective base URL; fall back to
        // parsing the string as-is if the probe request fails.
        let base_url = if let Ok(resp) = reqwest::Client::new().get(url).send().await {
            resp.url().clone()
        } else {
            url::Url::parse(url)?
        };
        let crawl_config = CrawlConfig {
            base_url,
            output_dir: temp_dir.clone(),
            user_agent: "Manx/0.5.0 (Documentation Crawler)".to_string(),
            // Explicit depth wins; otherwise unbounded when crawl_all,
            // else a default depth of 3.
            max_depth: if let Some(d) = max_depth {
                Some(d as usize)
            } else if crawl_all {
                None
            } else {
                Some(3)
            },
            silence: true,
            rate_limit_per_sec: 20,
            follow_sitemaps: true,
            concurrency: std::cmp::max(8, num_cpus::get()),
            timeout: Some(std::time::Duration::from_secs(30)),
            resume: false,
            config: DocrawlConfig::default(),
        };

        // Shared embedding model for all workers.
        let embedding_model =
            Arc::new(EmbeddingModel::new_with_config(self.config.embedding.clone()).await?);

        // Scanner -> workers pipeline channel. The receiver is wrapped
        // in a Mutex so multiple workers can pull from one queue.
        let (tx, rx) = mpsc::channel::<PathBuf>(200);
        let rx = std::sync::Arc::new(tokio::sync::Mutex::new(rx));
        let crawl_max_pages = crawl_max_pages.unwrap_or(usize::MAX);

        // Progress counters shared across tasks.
        let pages_counter = Arc::new(AtomicUsize::new(0));
        let processed_pages_counter = Arc::new(AtomicUsize::new(0));
        let chunks_counter = Arc::new(AtomicUsize::new(0));

        // Flag flipped when the crawl task finishes (success or failure).
        let crawl_done = Arc::new(AtomicBool::new(false));
        let crawl_done_clone = crawl_done.clone();

        // Task 1: the crawl itself. Errors are stringified because the
        // docrawl error type is not needed beyond display.
        let crawl_handle = tokio::spawn(async move {
            let result = crawl(crawl_config).await;
            crawl_done_clone.store(true, Ordering::Relaxed);
            result.map_err(|e| e.to_string())
        });

        // Task 2: scanner — polls the temp dir for new .md files and
        // feeds them to the workers as soon as they appear.
        let temp_dir_clone = temp_dir.clone();
        let scanner_tx = tx.clone();
        let pc = pages_counter.clone();
        let crawl_done_scanner = crawl_done.clone();
        let scanner_handle = tokio::spawn(async move {
            // Give the crawler a head start before the first scan.
            tokio::time::sleep(Duration::from_secs(3)).await;

            // Scan interval adapts: faster once files start appearing,
            // and again once the crawl has finished.
            let mut scan_interval_ms = 1000;
            let mut ticker = interval(Duration::from_millis(scan_interval_ms));
            let mut seen: HashSet<PathBuf> = HashSet::new();
            let mut idle_ticks = 0u32;
            let mut total_files_scanned;
            log::debug!(
                "Scanner: Starting to monitor directory: {}",
                temp_dir_clone.display()
            );

            loop {
                ticker.tick().await;
                let mut new_found = 0usize;
                let mut current_scan_count = 0usize;

                // Full rescan each tick; `seen` filters out duplicates.
                for entry in WalkDir::new(&temp_dir_clone)
                    .into_iter()
                    .filter_map(|e| e.ok())
                {
                    let path = entry.path();
                    current_scan_count += 1;

                    if path.is_file() {
                        if let Some(ext) = path.extension().and_then(|s| s.to_str()) {
                            if ext == "md" {
                                let pb = path.to_path_buf();
                                if !seen.contains(&pb) {
                                    log::debug!(
                                        "Scanner: Found new markdown file: {}",
                                        pb.file_name().unwrap_or_default().to_string_lossy()
                                    );
                                    seen.insert(pb.clone());
                                    // Send fails only when all workers are
                                    // gone; stop scanning in that case.
                                    if scanner_tx.send(pb).await.is_err() {
                                        break;
                                    }
                                    new_found += 1;
                                    pc.fetch_add(1, Ordering::Relaxed);
                                    if seen.len() >= crawl_max_pages {
                                        break;
                                    }
                                }
                            }
                        }
                    }
                }

                total_files_scanned = current_scan_count;

                if new_found > 0 {
                    log::debug!(
                        "Scanner: Found {} new markdown files this scan (total: {})",
                        new_found,
                        seen.len()
                    );
                    idle_ticks = 0;
                    // Files are flowing — tighten the scan interval.
                    if scan_interval_ms > 300 {
                        scan_interval_ms = 300;
                        ticker = interval(Duration::from_millis(scan_interval_ms));
                    }
                } else {
                    idle_ticks += 1;

                    // Crawl finished: scan faster to drain stragglers.
                    if crawl_done_scanner.load(Ordering::Relaxed) && scan_interval_ms > 200 {
                        scan_interval_ms = 200;
                        ticker = interval(Duration::from_millis(scan_interval_ms));
                    }

                    if idle_ticks.is_multiple_of(10) {
                        log::debug!("Scanner: No new files found for {} ticks (scanned {} total files in directory)", idle_ticks, total_files_scanned);
                    }
                }

                if seen.len() >= crawl_max_pages {
                    eprintln!("Scanner: Reached max pages limit ({})", crawl_max_pages);
                    break;
                }

                // Normal exit: crawl done and quiet for a while.
                if idle_ticks > 20 && crawl_done_scanner.load(Ordering::Relaxed) {
                    log::debug!(
                        "Scanner: Crawl is done and no new files for {} ticks, exiting",
                        idle_ticks
                    );
                    break;
                }

                // Safety exit even if the crawl never reports done.
                if idle_ticks > 100 {
                    log::debug!(
                        "Scanner: Safety exit after {} ticks of no activity",
                        idle_ticks
                    );
                    break;
                }
            }
            let files_found = seen.len();
            // Dropping this sender clone signals workers once the main
            // `tx` is dropped too.
            drop(scanner_tx);
            files_found
        });

        // Task group 3: workers — pull markdown paths off the channel,
        // chunk + embed + store each file. Each returns its stored count.
        let workers = embed_concurrency.unwrap_or_else(|| std::cmp::max(4, num_cpus::get()));
        let mut joins = Vec::new();
        let config_clone = self.config.clone();
        let url_for_worker = url.to_string();
        for _ in 0..workers {
            let rx = rx.clone();
            let embedding_model = embedding_model.clone();
            let config_clone = config_clone.clone();
            let url_clone = url_for_worker.clone();
            let chunks_counter = chunks_counter.clone();
            let processed_pages_counter = processed_pages_counter.clone();
            let join = tokio::spawn(async move {
                let mut stored = 0usize;
                let idx = match Indexer::new(&config_clone) {
                    Ok(i) => i,
                    Err(_) => return 0usize,
                };
                loop {
                    // Lock only for the recv; the guard drops before the
                    // heavy per-file work below.
                    let opt_path = { rx.lock().await.recv().await };
                    let Some(md_path) = opt_path else { break };
                    if let Ok(chunks) = idx.process_markdown_file(&md_path, &url_clone).await {
                        if let Ok(count) =
                            store_chunks_with_model_config(&config_clone, &chunks, &embedding_model)
                                .await
                        {
                            stored += count;
                            chunks_counter.fetch_add(count, Ordering::Relaxed);
                            processed_pages_counter.fetch_add(1, Ordering::Relaxed);
                        }
                    }
                }
                stored
            });
            joins.push(join);
        }

        // Wait for the crawl; scanner/workers keep running meanwhile.
        let crawl_result = crawl_handle.await;
        let crawled_pages = if let Ok(Ok(stats)) = &crawl_result {
            eprintln!("\nCrawl completed: {} pages crawled", stats.pages);
            stats.pages
        } else if let Ok(Err(e)) = &crawl_result {
            eprintln!("\nCrawl completed with error: {}", e);
            0
        } else {
            // Join error (panic/cancel) — treat as zero pages.
            eprintln!("\nCrawl status unknown");
            0
        };

        eprintln!("\nScanning for markdown files...");

        // Spinner showing file-discovery progress while the scanner drains.
        let pb = ProgressBar::new_spinner();
        pb.set_style(
            ProgressStyle::default_spinner()
                .template("{spinner:.green} {msg}")
                .unwrap(),
        );
        pb.enable_steady_tick(std::time::Duration::from_millis(200));

        let pages_counter_clone = pages_counter.clone();
        let crawl_done_clone = crawl_done.clone();
        let pb_clone = pb.clone();
        let start_time = std::time::Instant::now();
        let monitor_handle = tokio::spawn(async move {
            let mut last_count = 0usize;
            let mut stable_cycles = 0;

            loop {
                tokio::time::sleep(Duration::from_millis(500)).await;
                let current_count = pages_counter_clone.load(Ordering::Relaxed);
                let is_crawl_done = crawl_done_clone.load(Ordering::Relaxed);
                let elapsed = start_time.elapsed().as_secs_f32().max(0.1);
                let rate = (current_count as f32) / elapsed;

                let message = if current_count != last_count {
                    last_count = current_count;
                    stable_cycles = 0;
                    format!("Found {} files | {:.1}/s", current_count, rate)
                } else {
                    stable_cycles += 1;
                    if stable_cycles < 4 {
                        format!(
                            "Found {} files (scanning...) | {:.1}/s",
                            current_count, rate
                        )
                    } else {
                        format!(
                            "Found {} files (finalizing...) | {:.1}/s",
                            current_count, rate
                        )
                    }
                };

                pb_clone.set_message(message);

                // Exit once the crawl is done and the count has settled,
                // or after a hard cap of stable cycles.
                if is_crawl_done && stable_cycles > 6 {
                    break;
                }
                if stable_cycles > 60 {
                    break;
                }
            }
        });

        // Scanner finishing means no more files will be queued.
        let scanner_result = scanner_handle.await;
        let _scanner_files = scanner_result.unwrap_or(0);

        monitor_handle.abort();
        let final_count = pages_counter.load(Ordering::Relaxed);
        pb.finish_with_message(format!("Found {} markdown files", final_count));

        // Drop the last sender so workers see channel closure and exit.
        drop(tx);

        let total_pages_found = pages_counter.load(Ordering::Relaxed);

        let mut processed_so_far = processed_pages_counter.load(Ordering::Relaxed);

        if total_pages_found > 0 {
            eprintln!("\nProcessing {} markdown files...", total_pages_found);

            // Bar tracking worker progress over the discovered files.
            let pb = ProgressBar::new(total_pages_found as u64);
            pb.set_style(
                ProgressStyle::default_bar()
                    .template("{spinner:.green} [{elapsed_precise}] [{bar:40.cyan/blue}] {pos}/{len} files ({percent}%) | {msg}")
                    .unwrap()
                    .progress_chars("█▉▊▋▌▍▎▏ ")
            );
            pb.set_message("Chunking... 0 chunks created".to_string());

            // Poll counters until all files are processed or progress
            // stalls for 60 * 200 ms = 12 s.
            let mut last_processed = processed_so_far;
            let mut stall_counter = 0;
            while processed_so_far < total_pages_found && stall_counter < 60 {
                tokio::time::sleep(Duration::from_millis(200)).await;
                processed_so_far = processed_pages_counter.load(Ordering::Relaxed);
                let chunks_so_far = chunks_counter.load(Ordering::Relaxed);

                if processed_so_far != last_processed {
                    pb.set_position(processed_so_far as u64);
                    pb.set_message(format!("{} chunks created", chunks_so_far));
                    last_processed = processed_so_far;
                    stall_counter = 0;
                } else {
                    stall_counter += 1;
                    pb.tick();
                }
            }

            pb.set_position(processed_so_far as u64);

            if processed_so_far == total_pages_found {
                pb.finish_with_message("All files processed");
            } else {
                pb.abandon_with_message("Processing incomplete - some files may have failed");
            }
        } else {
            // Crawl produced no markdown output — explain likely causes.
            eprintln!("\nNo markdown files found to process");
            eprintln!(
                "   The crawler processed {} pages but docrawl generated no markdown files.",
                crawled_pages
            );
            eprintln!("   This can happen when:");
            eprintln!("   • The site uses JavaScript rendering that docrawl can't parse");
            eprintln!("   • The pages contain mostly non-text content (images, PDFs, etc.)");
            eprintln!("   • The site structure isn't compatible with the crawler");
            eprintln!("   Try:");
            eprintln!("   • Using a different URL that points to documentation pages");
            eprintln!("   • Indexing local files instead if you have them downloaded");
        }

        // Collect per-worker stored counts (channel is closed, so the
        // workers drain and return).
        let mut total_stored = 0usize;

        if total_pages_found > 0 {
            eprintln!("\nWaiting for workers to finish...");
            let pb_final = ProgressBar::new_spinner();
            pb_final.set_style(
                ProgressStyle::default_spinner()
                    .template("{spinner:.green} {msg}")
                    .unwrap(),
            );
            pb_final.set_message("Finalizing embeddings and storing to database...");
            pb_final.enable_steady_tick(std::time::Duration::from_millis(100));

            for j in joins {
                pb_final.tick();
                if let Ok(count) = j.await {
                    total_stored += count;
                }
            }

            pb_final.finish_with_message("Index finalized");
        } else {
            for j in joins {
                if let Ok(count) = j.await {
                    total_stored += count;
                }
            }
        }

        // Best-effort cleanup of the crawl output.
        let _ = std::fs::remove_dir_all(&temp_dir);

        let total_pages = pages_counter.load(Ordering::Relaxed);
        let final_processed = processed_pages_counter.load(Ordering::Relaxed);
        let final_chunks = chunks_counter.load(Ordering::Relaxed);
        let indexer = Indexer::new(&self.config)?;
        let index_path = indexer.get_index_path();

        eprintln!();
        eprintln!("==== Manx Index Summary ====");
        eprintln!("Markdown files found: {}", total_pages);
        eprintln!("Files processed: {}", final_processed);
        eprintln!("Chunks created: {}", final_chunks);
        eprintln!("Chunks stored: {}", total_stored);
        eprintln!("Index path: {}", index_path.display());

        if total_pages == 0 {
            eprintln!();
            eprintln!("No markdown files were found. Docrawl may not have generated any content.");
            eprintln!("   This could mean the site structure is not compatible with crawling.");
        } else if total_stored == 0 {
            eprintln!();
            eprintln!("No chunks were stored. The markdown files may have been empty.");
        }

        Ok(total_stored)
    }
840
841 pub async fn search(
842 &self,
843 query: &str,
844 max_results: Option<usize>,
845 ) -> Result<Vec<RagSearchResult>> {
846 if !self.config.enabled {
847 return Err(anyhow::anyhow!("RAG system is disabled"));
848 }
849
850 log::info!("Starting intelligent search for: '{}'", query);
851
852 let search_engine =
854 SmartSearchEngine::new(self.config.clone(), self.llm_client.clone()).await?;
855
856 let verified_results = search_engine.search(query, max_results).await?;
858
859 let results: Vec<RagSearchResult> = verified_results
861 .into_iter()
862 .map(|verified| RagSearchResult {
863 id: verified.result.id,
864 content: verified.result.content,
865 source_path: verified.result.source_path,
866 source_type: verified.result.source_type,
867 title: verified.result.title,
868 section: verified.result.section,
869 score: verified.confidence_score, chunk_index: verified.result.chunk_index,
871 metadata: verified.result.metadata,
872 })
873 .collect();
874
875 log::info!(
876 "Intelligent search completed with {} results",
877 results.len()
878 );
879 Ok(results)
880 }
881
    /// Computes index statistics by scanning the `embeddings/` directory.
    ///
    /// Counts one chunk per `.json` file, sums file sizes, tracks the
    /// newest mtime, and derives the document count from the set of
    /// distinct `source_path`s found inside the chunk files.
    ///
    /// # Errors
    /// Fails when RAG is disabled, the indexer cannot be constructed,
    /// or the storage directory cannot be listed.
    pub async fn get_stats(&self) -> Result<RagStats> {
        if !self.config.enabled {
            return Err(anyhow::anyhow!("RAG system is disabled"));
        }

        let indexer = Indexer::new(&self.config)?;
        let index_path = indexer.get_index_path();
        let embedding_dir = index_path.join("embeddings");

        // No storage directory yet -> empty stats rather than an error.
        if !embedding_dir.exists() {
            return Ok(RagStats {
                total_documents: 0,
                total_chunks: 0,
                index_size_mb: 0.0,
                last_updated: chrono::Utc::now(),
                sources: vec![],
            });
        }

        let mut total_chunks = 0;
        let mut total_size = 0u64;
        let mut sources = std::collections::HashSet::new();
        let mut last_modified = std::time::UNIX_EPOCH;

        let entries = std::fs::read_dir(&embedding_dir)?;
        for entry in entries.flatten() {
            if let Some(file_name) = entry.file_name().to_str() {
                if file_name.ends_with(".json") {
                    total_chunks += 1;

                    if let Ok(metadata) = entry.metadata() {
                        total_size += metadata.len();

                        // Track the newest modification time seen.
                        if let Ok(modified) = metadata.modified() {
                            if modified > last_modified {
                                last_modified = modified;
                            }
                        }
                    }

                    // Distinct source paths across chunks approximate
                    // the number of indexed documents. Unparseable files
                    // are silently skipped.
                    if let Ok(content) = std::fs::read_to_string(entry.path()) {
                        if let Ok(chunk_data) = serde_json::from_str::<StoredChunk>(&content) {
                            if let Some(source_str) = chunk_data.source_path.to_str() {
                                sources.insert(source_str.to_string());
                            }
                        }
                    }
                }
            }
        }

        let total_documents = sources.len();
        // Reported in MiB (1024 * 1024 bytes) despite the field name.
        let index_size_mb = total_size as f64 / (1024.0 * 1024.0);

        let last_updated = chrono::DateTime::<chrono::Utc>::from(last_modified);

        let sources_vec: Vec<String> = sources.into_iter().collect();

        Ok(RagStats {
            total_documents,
            total_chunks,
            index_size_mb,
            last_updated,
            sources: sources_vec,
        })
    }
952
953 pub async fn clear_index(&self) -> Result<()> {
954 if !self.config.enabled {
955 return Err(anyhow::anyhow!("RAG system is disabled"));
956 }
957
958 log::info!("Clearing local vector storage");
959
960 let indexer = Indexer::new(&self.config)?;
962 let index_path = indexer.get_index_path();
963 let embedding_dir = index_path.join("embeddings");
964
965 if embedding_dir.exists() {
966 let entries = std::fs::read_dir(&embedding_dir)?;
968 let mut cleared_count = 0;
969
970 for entry in entries.flatten() {
971 if let Some(file_name) = entry.file_name().to_str() {
972 if file_name.ends_with(".json") {
973 if let Err(e) = std::fs::remove_file(entry.path()) {
974 log::warn!("Failed to remove embedding file {:?}: {}", entry.path(), e);
975 } else {
976 cleared_count += 1;
977 }
978 }
979 }
980 }
981
982 log::info!(
983 "Successfully cleared {} embedding files from local vector storage",
984 cleared_count
985 );
986 } else {
987 log::info!("Local vector storage directory does not exist, nothing to clear");
988 }
989
990 Ok(())
991 }
992
    /// Verifies the RAG stack end to end: embedding model construction,
    /// index directory presence/readability, and filesystem write access.
    ///
    /// # Errors
    /// Fails when RAG is disabled, the embedding model cannot be built,
    /// or the index directory cannot be created or written to.
    pub async fn health_check(&self) -> Result<()> {
        if !self.config.enabled {
            return Err(anyhow::anyhow!("RAG system is disabled"));
        }

        log::info!("Running RAG system health check...");

        // Successfully constructing the model IS the check; the instance
        // itself is unused.
        let _embedding_model = EmbeddingModel::new_with_config(self.config.embedding.clone())
            .await
            .map_err(|e| anyhow::anyhow!("Embedding model unavailable: {}", e))?;
        log::info!("Embedding model loaded successfully");

        let indexer = Indexer::new(&self.config)?;
        let index_path = indexer.get_index_path();

        if index_path.exists() {
            log::info!("Local index directory exists: {:?}", index_path);

            let embedding_dir = index_path.join("embeddings");
            if embedding_dir.exists() {
                // Readability check: count entries; warn (don't fail) on error.
                match std::fs::read_dir(&embedding_dir) {
                    Ok(entries) => {
                        let count = entries.filter_map(|e| e.ok()).count();
                        log::info!(
                            "Local vector storage accessible with {} embedding files",
                            count
                        );
                    }
                    Err(e) => {
                        log::warn!(
                            "Local vector storage directory exists but cannot read contents: {}",
                            e
                        );
                    }
                }
            } else {
                log::info!("Local vector storage will be created when needed");
            }
        } else {
            log::info!("Local index directory will be created: {:?}", index_path);
        }

        // Write-access probe: ensure the dir exists, write a marker file,
        // then delete it (best-effort; a leftover marker is harmless).
        let test_file = index_path.join(".health_check");
        match std::fs::create_dir_all(index_path) {
            Ok(_) => {
                match std::fs::write(&test_file, "health_check") {
                    Ok(_) => {
                        log::info!("File system write access confirmed");
                        let _ = std::fs::remove_file(&test_file);
                    }
                    Err(e) => {
                        return Err(anyhow::anyhow!("File system write access failed: {}", e));
                    }
                }
            }
            Err(e) => {
                return Err(anyhow::anyhow!("Cannot create index directory: {}", e));
            }
        }

        log::info!("RAG system health check: All systems operational");
        Ok(())
    }
1061
1062 async fn store_chunks_locally(&self, chunks: &[DocumentChunk]) -> Result<()> {
1064 use uuid::Uuid;
1065
1066 if chunks.is_empty() {
1067 log::info!("No chunks to store locally");
1068 return Ok(());
1069 }
1070
1071 log::info!("Storing {} chunks in local vector storage", chunks.len());
1072
1073 let embedding_model =
1075 EmbeddingModel::new_with_config(self.config.embedding.clone()).await?;
1076
1077 let indexer = Indexer::new(&self.config)?;
1079 let index_path = indexer.get_index_path();
1080 let embedding_dir = index_path.join("embeddings");
1081
1082 std::fs::create_dir_all(&embedding_dir)?;
1084
1085 let mut stored_count = 0;
1087
1088 for (i, chunk) in chunks.iter().enumerate() {
1089 let embedding = match embedding_model.embed_text(&chunk.content).await {
1091 Ok(embedding) => embedding,
1092 Err(e) => {
1093 log::warn!("Failed to generate embedding for chunk {}: {}", chunk.id, e);
1094 continue;
1095 }
1096 };
1097
1098 let stored_chunk = StoredChunk {
1100 id: chunk.id.clone(),
1101 content: chunk.content.clone(),
1102 source_path: chunk.source_path.clone(),
1103 source_type: chunk.source_type.clone(),
1104 title: chunk.title.clone(),
1105 section: chunk.section.clone(),
1106 chunk_index: chunk.chunk_index,
1107 metadata: chunk.metadata.clone(),
1108 embedding,
1109 };
1110
1111 let file_id = Uuid::new_v4().to_string();
1113 let file_path = embedding_dir.join(format!("{}.json", file_id));
1114
1115 let json_content = serde_json::to_string_pretty(&stored_chunk)?;
1116 std::fs::write(&file_path, json_content)?;
1117
1118 stored_count += 1;
1119 log::debug!("Stored chunk {} to {:?}", chunk.id, file_path);
1120 if (i + 1) % 100 == 0 || i + 1 == chunks.len() {
1121 println!("Stored {}/{} chunks...", i + 1, chunks.len());
1122 }
1123 }
1124
1125 log::info!(
1126 "Successfully stored {} chunks in local vector storage",
1127 stored_count
1128 );
1129 Ok(())
1130 }
1131}
1132
1133pub async fn store_chunks_with_model_config(
1135 config: &RagConfig,
1136 chunks: &[DocumentChunk],
1137 embedding_model: &EmbeddingModel,
1138) -> Result<usize> {
1139 use uuid::Uuid;
1140 if chunks.is_empty() {
1141 return Ok(0);
1142 }
1143
1144 let indexer = Indexer::new(config)?;
1145 let index_path = indexer.get_index_path();
1146 let embedding_dir = index_path.join("embeddings");
1147 std::fs::create_dir_all(&embedding_dir)?;
1148
1149 let mut stored_count = 0usize;
1150 for chunk in chunks {
1151 let embedding = match embedding_model.embed_text(&chunk.content).await {
1152 Ok(embedding) => embedding,
1153 Err(_) => continue,
1154 };
1155
1156 let stored_chunk = StoredChunk {
1157 id: chunk.id.clone(),
1158 content: chunk.content.clone(),
1159 source_path: chunk.source_path.clone(),
1160 source_type: chunk.source_type.clone(),
1161 title: chunk.title.clone(),
1162 section: chunk.section.clone(),
1163 chunk_index: chunk.chunk_index,
1164 metadata: chunk.metadata.clone(),
1165 embedding,
1166 };
1167
1168 let file_id = Uuid::new_v4().to_string();
1169 let file_path = embedding_dir.join(format!("{}.json", file_id));
1170 let json_content = serde_json::to_string_pretty(&stored_chunk)?;
1171 std::fs::write(&file_path, json_content)?;
1172 stored_count += 1;
1173 }
1174
1175 Ok(stored_count)
1176}