use crate::rag::embeddings::EmbeddingModel;
use crate::rag::indexer::Indexer;
use crate::rag::llm::LlmClient;
use crate::rag::search_engine::SmartSearchEngine;
use anyhow::Result;
use serde::{Deserialize, Serialize};
use std::path::PathBuf;

pub mod benchmarks;
pub mod embeddings;
pub mod indexer;
pub mod llm;
pub mod model_metadata;
pub mod providers;
pub mod query_enhancer;
pub mod result_verifier;
pub mod search_engine;

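/// Backend used to generate text embeddings. Every variant except `Hash`
/// carries a provider-specific identifier string (for example a model name).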
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Default)]
pub enum EmbeddingProvider {
    #[default]
    Hash,
    Onnx(String),
    Ollama(String),
    OpenAI(String),
    HuggingFace(String),
    Custom(String),
}

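/// Settings for the embedding backend: provider, vector dimension, and connection details.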
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct EmbeddingConfig {
    pub provider: EmbeddingProvider,
    pub dimension: usize,
    pub model_path: Option<PathBuf>,
    pub api_key: Option<String>,
    pub endpoint: Option<String>,
    pub timeout_seconds: u64,
    pub batch_size: usize,
}

impl Default for EmbeddingConfig {
    fn default() -> Self {
        Self {
            provider: EmbeddingProvider::Hash,
            dimension: 384,
            model_path: None,
            api_key: None,
            endpoint: None,
            timeout_seconds: 30,
            batch_size: 32,
        }
    }
}

impl EmbeddingConfig {
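    /// Probe the configured backend for its actual output dimension and
    /// update `self.dimension` if the configured value differs.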
    pub async fn detect_and_update_dimension(&mut self) -> Result<()> {
        use crate::rag::embeddings::EmbeddingModel;

        let model = EmbeddingModel::new_with_config(self.clone()).await?;
        let detected_dimension = model.get_dimension().await?;

        if self.dimension != detected_dimension {
            log::info!(
                "Updating dimension from {} to {} for provider {:?}",
                self.dimension,
                detected_dimension,
                self.provider
            );
            self.dimension = detected_dimension;
        }

        Ok(())
    }
}

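/// Security level applied when processing code files for indexing.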
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
pub enum CodeSecurityLevel {
    Strict,
    Moderate,
    Permissive,
}

impl Default for CodeSecurityLevel {
    fn default() -> Self {
        Self::Moderate
    }
}

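/// Tuning knobs for the smart search pipeline: query enhancement,
/// result verification, multi-stage retrieval, and adaptive chunking.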
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SmartSearchConfig {
    pub prefer_semantic: bool,
    pub enable_query_enhancement: bool,
    pub enable_result_verification: bool,
    pub min_confidence_score: f32,
    pub max_query_variations: usize,
    pub enable_multi_stage: bool,
    pub adaptive_chunking: bool,
}

impl Default for SmartSearchConfig {
    fn default() -> Self {
        Self {
            prefer_semantic: true,
            enable_query_enhancement: true,
            enable_result_verification: true,
            min_confidence_score: 0.7,
            max_query_variations: 3,
            enable_multi_stage: true,
            adaptive_chunking: true,
        }
    }
}

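/// Top-level configuration for the RAG system: index location, search
/// thresholds, file-type policies, and nested embedding/search settings.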
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct RagConfig {
    pub enabled: bool,
    pub index_path: PathBuf,
    pub max_results: usize,
    pub similarity_threshold: f32,
    pub allow_pdf_processing: bool,
    pub allow_code_processing: bool,
    pub code_security_level: CodeSecurityLevel,
    pub mask_secrets: bool,
    pub max_file_size_mb: u64,
    pub embedding: EmbeddingConfig,
    pub smart_search: SmartSearchConfig,
}

impl Default for RagConfig {
    fn default() -> Self {
        Self {
            enabled: true,
            index_path: PathBuf::from("~/.cache/manx/rag_index"),
            max_results: 10,
            similarity_threshold: 0.6,
            allow_pdf_processing: false,
            allow_code_processing: true,
            code_security_level: CodeSecurityLevel::Moderate,
            mask_secrets: true,
            max_file_size_mb: 100,
            embedding: EmbeddingConfig::default(),
            smart_search: SmartSearchConfig::default(),
        }
    }
}

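/// A chunk of text produced by the indexer from a source document, prior to embedding.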
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DocumentChunk {
    pub id: String,
    pub content: String,
    pub source_path: PathBuf,
    pub source_type: SourceType,
    pub title: Option<String>,
    pub section: Option<String>,
    pub chunk_index: usize,
    pub metadata: DocumentMetadata,
}

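/// Where an indexed document originated.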
#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum SourceType {
    Local,
    Remote,
    Curated,
    Web,
}

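/// File-level metadata carried alongside each chunk.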
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DocumentMetadata {
    pub file_type: String,
    pub size: u64,
    pub modified: chrono::DateTime<chrono::Utc>,
    pub tags: Vec<String>,
    pub language: Option<String>,
}

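/// A single search hit returned to callers, including its confidence score.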
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct RagSearchResult {
    pub id: String,
    pub content: String,
    pub source_path: PathBuf,
    pub source_type: SourceType,
    pub title: Option<String>,
    pub section: Option<String>,
    pub score: f32,
    pub chunk_index: usize,
    pub metadata: DocumentMetadata,
}

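/// Summary statistics for the local index.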
#[derive(Debug, Serialize, Deserialize)]
pub struct RagStats {
    pub total_documents: usize,
    pub total_chunks: usize,
    pub index_size_mb: f64,
    pub last_updated: chrono::DateTime<chrono::Utc>,
    pub sources: Vec<String>,
}

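/// A chunk persisted to local vector storage together with its embedding vector.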
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct StoredChunk {
    pub id: String,
    pub content: String,
    pub source_path: PathBuf,
    pub source_type: SourceType,
    pub title: Option<String>,
    pub section: Option<String>,
    pub chunk_index: usize,
    pub metadata: DocumentMetadata,
    pub embedding: Vec<f32>,
}

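/// Entry point for indexing documents and searching the local vector store.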
pub struct RagSystem {
    config: RagConfig,
    llm_client: Option<LlmClient>,
}

impl RagSystem {
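    /// Create a RAG system without an LLM client.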
    pub async fn new(config: RagConfig) -> Result<Self> {
        Self::new_with_llm(config, None).await
    }

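    /// Create a RAG system, optionally with an LLM client for the smart
    /// search engine. Fails if RAG is disabled in the config.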
    pub async fn new_with_llm(config: RagConfig, llm_client: Option<LlmClient>) -> Result<Self> {
        if !config.enabled {
            return Err(anyhow::anyhow!("RAG system is disabled"));
        }

        let indexer = Indexer::new(&config)?;
        let index_path = indexer.get_index_path();

        std::fs::create_dir_all(index_path)?;

        log::info!(
            "RAG system initialized with local vector storage at {:?}",
            index_path
        );
        Ok(Self { config, llm_client })
    }

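    /// Index a single document and store its embedded chunks locally.
    /// Returns the number of chunks produced.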
    pub async fn index_document(&mut self, path: PathBuf) -> Result<usize> {
        if !self.config.enabled {
            return Err(anyhow::anyhow!("RAG system is disabled"));
        }

        let indexer = Indexer::new(&self.config)?;
        let chunks = indexer.index_document(path)?;
        let chunk_count = chunks.len();

        self.store_chunks_locally(&chunks).await?;

        log::info!("Successfully indexed and stored {} chunks", chunk_count);
        Ok(chunk_count)
    }

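    /// Index a directory of documents and store the resulting chunks locally.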
    pub async fn index_directory(&mut self, path: PathBuf) -> Result<usize> {
        if !self.config.enabled {
            return Err(anyhow::anyhow!("RAG system is disabled"));
        }

        let indexer = Indexer::new(&self.config)?;
        let chunks = indexer.index_directory(path)?;
        let chunk_count = chunks.len();

        self.store_chunks_locally(&chunks).await?;

        log::info!(
            "Successfully indexed and stored {} chunks from directory",
            chunk_count
        );
        Ok(chunk_count)
    }

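    /// Fetch and index a single URL, storing the resulting chunks locally.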
    pub async fn index_url(&mut self, url: &str) -> Result<usize> {
        if !self.config.enabled {
            return Err(anyhow::anyhow!("RAG system is disabled"));
        }

        log::info!("Indexing URL: {}", url);

        let indexer = Indexer::new(&self.config)?;
        let chunks = indexer.index_url(url.to_string()).await?;
        let chunk_count = chunks.len();

        self.store_chunks_locally(&chunks).await?;

        log::info!(
            "Successfully indexed and stored {} chunks from URL",
            chunk_count
        );
        Ok(chunk_count)
    }

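    /// Crawl and index a URL up to an optional depth and page limit.
    /// When `max_pages` is `None`, the indexer is asked to crawl all pages
    /// reachable within the depth limit.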
    pub async fn index_url_deep(
        &mut self,
        url: &str,
        max_depth: Option<u32>,
        max_pages: Option<u32>,
    ) -> Result<usize> {
        if !self.config.enabled {
            return Err(anyhow::anyhow!("RAG system is disabled"));
        }

        log::info!(
            "Deep indexing URL: {} (depth: {:?}, pages: {:?})",
            url,
            max_depth,
            max_pages
        );

        let indexer = Indexer::new(&self.config)?;
        let crawl_all = max_pages.is_none();
        let chunks = indexer
            .index_url_deep(url.to_string(), max_depth, crawl_all)
            .await?;
        let chunk_count = chunks.len();

        self.store_chunks_locally(&chunks).await?;

        log::info!(
            "Successfully deep indexed and stored {} chunks from URL",
            chunk_count
        );
        Ok(chunk_count)
    }

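    /// Run the smart search pipeline for `query`, mapping verified results
    /// into `RagSearchResult`s scored by confidence.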
    pub async fn search(
        &self,
        query: &str,
        max_results: Option<usize>,
    ) -> Result<Vec<RagSearchResult>> {
        if !self.config.enabled {
            return Err(anyhow::anyhow!("RAG system is disabled"));
        }

        log::info!("Starting intelligent search for: '{}'", query);

        let search_engine =
            SmartSearchEngine::new(self.config.clone(), self.llm_client.clone()).await?;

        let verified_results = search_engine.search(query, max_results).await?;

        let results: Vec<RagSearchResult> = verified_results
            .into_iter()
            .map(|verified| RagSearchResult {
                id: verified.result.id,
                content: verified.result.content,
                source_path: verified.result.source_path,
                source_type: verified.result.source_type,
                title: verified.result.title,
                section: verified.result.section,
                score: verified.confidence_score,
                chunk_index: verified.result.chunk_index,
                metadata: verified.result.metadata,
            })
            .collect();

        log::info!(
            "Intelligent search completed with {} results",
            results.len()
        );
        Ok(results)
    }

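    /// Scan the local embedding store and report document, chunk, and size statistics.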
    pub async fn get_stats(&self) -> Result<RagStats> {
        if !self.config.enabled {
            return Err(anyhow::anyhow!("RAG system is disabled"));
        }

        let indexer = Indexer::new(&self.config)?;
        let index_path = indexer.get_index_path();
        let embedding_dir = index_path.join("embeddings");

        if !embedding_dir.exists() {
            return Ok(RagStats {
                total_documents: 0,
                total_chunks: 0,
                index_size_mb: 0.0,
                last_updated: chrono::Utc::now(),
                sources: vec![],
            });
        }

        let mut total_chunks = 0;
        let mut total_size = 0u64;
        let mut sources = std::collections::HashSet::new();
        let mut last_modified = std::time::UNIX_EPOCH;

        let entries = std::fs::read_dir(&embedding_dir)?;
        for entry in entries.flatten() {
            if let Some(file_name) = entry.file_name().to_str() {
                if file_name.ends_with(".json") {
                    total_chunks += 1;

                    if let Ok(metadata) = entry.metadata() {
                        total_size += metadata.len();

                        if let Ok(modified) = metadata.modified() {
                            if modified > last_modified {
                                last_modified = modified;
                            }
                        }
                    }

                    if let Ok(content) = std::fs::read_to_string(entry.path()) {
                        if let Ok(chunk_data) = serde_json::from_str::<StoredChunk>(&content) {
                            if let Some(source_str) = chunk_data.source_path.to_str() {
                                sources.insert(source_str.to_string());
                            }
                        }
                    }
                }
            }
        }

        let total_documents = sources.len();
        let index_size_mb = total_size as f64 / (1024.0 * 1024.0);

        let last_updated = chrono::DateTime::<chrono::Utc>::from(last_modified);

        let sources_vec: Vec<String> = sources.into_iter().collect();

        Ok(RagStats {
            total_documents,
            total_chunks,
            index_size_mb,
            last_updated,
            sources: sources_vec,
        })
    }

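    /// Delete all embedding files from local vector storage.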
    pub async fn clear_index(&self) -> Result<()> {
        if !self.config.enabled {
            return Err(anyhow::anyhow!("RAG system is disabled"));
        }

        log::info!("Clearing local vector storage");

        let indexer = Indexer::new(&self.config)?;
        let index_path = indexer.get_index_path();
        let embedding_dir = index_path.join("embeddings");

        if embedding_dir.exists() {
            let entries = std::fs::read_dir(&embedding_dir)?;
            let mut cleared_count = 0;

            for entry in entries.flatten() {
                if let Some(file_name) = entry.file_name().to_str() {
                    if file_name.ends_with(".json") {
                        if let Err(e) = std::fs::remove_file(entry.path()) {
                            log::warn!("Failed to remove embedding file {:?}: {}", entry.path(), e);
                        } else {
                            cleared_count += 1;
                        }
                    }
                }
            }

            log::info!(
                "Successfully cleared {} embedding files from local vector storage",
                cleared_count
            );
        } else {
            log::info!("Local vector storage directory does not exist, nothing to clear");
        }

        Ok(())
    }

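    /// Verify that the embedding model loads, the index directory is readable,
    /// and the file system is writable.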
    pub async fn health_check(&self) -> Result<()> {
        if !self.config.enabled {
            return Err(anyhow::anyhow!("RAG system is disabled"));
        }

        log::info!("Running RAG system health check...");

        let _embedding_model = EmbeddingModel::new_with_config(self.config.embedding.clone())
            .await
            .map_err(|e| anyhow::anyhow!("Embedding model unavailable: {}", e))?;
        log::info!("✓ Embedding model loaded successfully");

        let indexer = Indexer::new(&self.config)?;
        let index_path = indexer.get_index_path();

        if index_path.exists() {
            log::info!("✓ Local index directory exists: {:?}", index_path);

            let embedding_dir = index_path.join("embeddings");
            if embedding_dir.exists() {
                match std::fs::read_dir(&embedding_dir) {
                    Ok(entries) => {
                        let count = entries.filter_map(|e| e.ok()).count();
                        log::info!(
                            "✓ Local vector storage accessible with {} embedding files",
                            count
                        );
                    }
                    Err(e) => {
                        log::warn!(
                            "⚠ Local vector storage directory exists but cannot read contents: {}",
                            e
                        );
                    }
                }
            } else {
                log::info!("✓ Local vector storage will be created when needed");
            }
        } else {
            log::info!("✓ Local index directory will be created: {:?}", index_path);
        }

        let test_file = index_path.join(".health_check");
        match std::fs::create_dir_all(index_path) {
            Ok(_) => {
                match std::fs::write(&test_file, "health_check") {
                    Ok(_) => {
                        log::info!("✓ File system write access confirmed");
                        let _ = std::fs::remove_file(&test_file);
                    }
                    Err(e) => {
                        return Err(anyhow::anyhow!("File system write access failed: {}", e));
                    }
                }
            }
            Err(e) => {
                return Err(anyhow::anyhow!("Cannot create index directory: {}", e));
            }
        }

        log::info!("RAG system health check: All systems operational");
        Ok(())
    }

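    /// Generate an embedding for each chunk and persist it as a JSON file in the
    /// `embeddings/` directory under the index path. Chunks whose embedding fails
    /// are skipped with a warning.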
    async fn store_chunks_locally(&self, chunks: &[DocumentChunk]) -> Result<()> {
        use uuid::Uuid;

        if chunks.is_empty() {
            log::info!("No chunks to store locally");
            return Ok(());
        }

        log::info!("Storing {} chunks in local vector storage", chunks.len());

        let embedding_model =
            EmbeddingModel::new_with_config(self.config.embedding.clone()).await?;

        let indexer = Indexer::new(&self.config)?;
        let index_path = indexer.get_index_path();
        let embedding_dir = index_path.join("embeddings");

        std::fs::create_dir_all(&embedding_dir)?;

        let mut stored_count = 0;

        for chunk in chunks {
            let embedding = match embedding_model.embed_text(&chunk.content).await {
                Ok(embedding) => embedding,
                Err(e) => {
                    log::warn!("Failed to generate embedding for chunk {}: {}", chunk.id, e);
                    continue;
                }
            };

            let stored_chunk = StoredChunk {
                id: chunk.id.clone(),
                content: chunk.content.clone(),
                source_path: chunk.source_path.clone(),
                source_type: chunk.source_type.clone(),
                title: chunk.title.clone(),
                section: chunk.section.clone(),
                chunk_index: chunk.chunk_index,
                metadata: chunk.metadata.clone(),
                embedding,
            };

            let file_id = Uuid::new_v4().to_string();
            let file_path = embedding_dir.join(format!("{}.json", file_id));

            let json_content = serde_json::to_string_pretty(&stored_chunk)?;
            std::fs::write(&file_path, json_content)?;

            stored_count += 1;
            log::debug!("Stored chunk {} to {:?}", chunk.id, file_path);
        }

        log::info!(
            "Successfully stored {} chunks in local vector storage",
            stored_count
        );
        Ok(())
    }
}