1use crate::rag::embeddings::EmbeddingModel;
7use crate::rag::indexer::Indexer;
8use anyhow::Result;
9use serde::{Deserialize, Serialize};
10use std::path::PathBuf;
11
12pub mod benchmarks;
13pub mod embeddings;
14pub mod indexer;
15pub mod llm;
16pub mod model_metadata;
17pub mod providers;
18
19#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Default)]
21pub enum EmbeddingProvider {
22 #[default]
23 Hash, Onnx(String), Ollama(String), OpenAI(String), HuggingFace(String), Custom(String), }
30
31#[derive(Debug, Clone, Serialize, Deserialize)]
33pub struct EmbeddingConfig {
34 pub provider: EmbeddingProvider,
35 pub dimension: usize,
36 pub model_path: Option<PathBuf>, pub api_key: Option<String>, pub endpoint: Option<String>, pub timeout_seconds: u64,
40 pub batch_size: usize,
41}
42
43impl Default for EmbeddingConfig {
44 fn default() -> Self {
45 Self {
46 provider: EmbeddingProvider::Hash,
47 dimension: 384, model_path: None,
49 api_key: None,
50 endpoint: None,
51 timeout_seconds: 30,
52 batch_size: 32,
53 }
54 }
55}
56
57impl EmbeddingConfig {
58 pub async fn detect_and_update_dimension(&mut self) -> Result<()> {
60 use crate::rag::embeddings::EmbeddingModel;
61
62 let model = EmbeddingModel::new_with_config(self.clone()).await?;
63 let detected_dimension = model.get_dimension().await?;
64
65 if self.dimension != detected_dimension {
66 log::info!(
67 "Updating dimension from {} to {} for provider {:?}",
68 self.dimension,
69 detected_dimension,
70 self.provider
71 );
72 self.dimension = detected_dimension;
73 }
74
75 Ok(())
76 }
77}
78
79#[derive(Debug, Clone, Serialize, Deserialize)]
81pub struct RagConfig {
82 pub enabled: bool,
83 pub index_path: PathBuf,
84 pub max_results: usize,
85 pub similarity_threshold: f32,
86 pub allow_pdf_processing: bool,
87 pub embedding: EmbeddingConfig,
88}
89
90impl Default for RagConfig {
91 fn default() -> Self {
92 Self {
93 enabled: true, index_path: PathBuf::from("~/.cache/manx/rag_index"),
95 max_results: 10,
96 similarity_threshold: 0.6,
97 allow_pdf_processing: false, embedding: EmbeddingConfig::default(),
99 }
100 }
101}
102
103#[derive(Debug, Clone, Serialize, Deserialize)]
105pub struct DocumentChunk {
106 pub id: String,
107 pub content: String,
108 pub source_path: PathBuf,
109 pub source_type: SourceType,
110 pub title: Option<String>,
111 pub section: Option<String>,
112 pub chunk_index: usize,
113 pub metadata: DocumentMetadata,
114}
115
116#[derive(Debug, Clone, Serialize, Deserialize)]
118pub enum SourceType {
119 Local,
120 Remote,
121 Curated,
122 Web,
123}
124
125#[derive(Debug, Clone, Serialize, Deserialize)]
127pub struct DocumentMetadata {
128 pub file_type: String,
129 pub size: u64,
130 pub modified: chrono::DateTime<chrono::Utc>,
131 pub tags: Vec<String>,
132 pub language: Option<String>,
133}
134
135#[derive(Debug, Clone, Serialize, Deserialize)]
137pub struct RagSearchResult {
138 pub id: String,
139 pub content: String,
140 pub source_path: PathBuf,
141 pub source_type: SourceType,
142 pub title: Option<String>,
143 pub section: Option<String>,
144 pub score: f32,
145 pub metadata: DocumentMetadata,
146}
147
148#[derive(Debug, Serialize, Deserialize)]
150pub struct RagStats {
151 pub total_documents: usize,
152 pub total_chunks: usize,
153 pub index_size_mb: f64,
154 pub last_updated: chrono::DateTime<chrono::Utc>,
155 pub sources: Vec<String>,
156}
157
158#[derive(Debug, Clone, Serialize, Deserialize)]
160struct StoredChunk {
161 pub id: String,
162 pub content: String,
163 pub source_path: PathBuf,
164 pub source_type: SourceType,
165 pub title: Option<String>,
166 pub section: Option<String>,
167 pub chunk_index: usize,
168 pub metadata: DocumentMetadata,
169 pub embedding: Vec<f32>,
170}
171
172pub struct RagSystem {
174 #[allow(dead_code)]
175 config: RagConfig,
176}
177
178impl RagSystem {
179 pub async fn new(config: RagConfig) -> Result<Self> {
180 if !config.enabled {
181 return Err(anyhow::anyhow!("RAG system is disabled"));
182 }
183
184 let indexer = Indexer::new(&config)?;
186 let index_path = indexer.get_index_path();
187
188 std::fs::create_dir_all(index_path)?;
190
191 log::info!(
192 "RAG system initialized with local vector storage at {:?}",
193 index_path
194 );
195 Ok(Self { config })
196 }
197
198 pub async fn index_document(&mut self, path: PathBuf) -> Result<usize> {
199 if !self.config.enabled {
200 return Err(anyhow::anyhow!("RAG system is disabled"));
201 }
202
203 let indexer = Indexer::new(&self.config)?;
204 let chunks = indexer.index_document(path)?;
205 let chunk_count = chunks.len();
206
207 self.store_chunks_locally(&chunks).await?;
209
210 log::info!("Successfully indexed and stored {} chunks", chunk_count);
211 Ok(chunk_count)
212 }
213
214 pub async fn index_directory(&mut self, path: PathBuf) -> Result<usize> {
215 if !self.config.enabled {
216 return Err(anyhow::anyhow!("RAG system is disabled"));
217 }
218
219 let indexer = Indexer::new(&self.config)?;
220 let chunks = indexer.index_directory(path)?;
221 let chunk_count = chunks.len();
222
223 self.store_chunks_locally(&chunks).await?;
225
226 log::info!(
227 "Successfully indexed and stored {} chunks from directory",
228 chunk_count
229 );
230 Ok(chunk_count)
231 }
232
233 pub async fn index_url(&mut self, url: &str) -> Result<usize> {
234 if !self.config.enabled {
235 return Err(anyhow::anyhow!("RAG system is disabled"));
236 }
237
238 log::info!("Indexing URL: {}", url);
239
240 let indexer = Indexer::new(&self.config)?;
241 let chunks = indexer.index_url(url.to_string()).await?;
242 let chunk_count = chunks.len();
243
244 self.store_chunks_locally(&chunks).await?;
246
247 log::info!(
248 "Successfully indexed and stored {} chunks from URL",
249 chunk_count
250 );
251 Ok(chunk_count)
252 }
253
254 pub async fn index_url_deep(
255 &mut self,
256 url: &str,
257 max_depth: Option<u32>,
258 max_pages: Option<u32>,
259 ) -> Result<usize> {
260 if !self.config.enabled {
261 return Err(anyhow::anyhow!("RAG system is disabled"));
262 }
263
264 log::info!(
265 "Deep indexing URL: {} (depth: {:?}, pages: {:?})",
266 url,
267 max_depth,
268 max_pages
269 );
270
271 let indexer = Indexer::new(&self.config)?;
272 let chunks = indexer
273 .index_url_deep(url.to_string(), max_depth, max_pages)
274 .await?;
275 let chunk_count = chunks.len();
276
277 self.store_chunks_locally(&chunks).await?;
279
280 log::info!(
281 "Successfully deep indexed and stored {} chunks from URL",
282 chunk_count
283 );
284 Ok(chunk_count)
285 }
286
287 pub async fn search(
288 &self,
289 query: &str,
290 max_results: Option<usize>,
291 ) -> Result<Vec<RagSearchResult>> {
292 if !self.config.enabled {
293 return Err(anyhow::anyhow!("RAG system is disabled"));
294 }
295
296 let limit = max_results.unwrap_or(10);
297
298 log::info!("Searching local vector storage for: '{}'", query);
299
300 let embedding_model =
302 EmbeddingModel::new_with_config(self.config.embedding.clone()).await?;
303 let query_embedding = embedding_model.embed_text(query).await?;
304
305 let indexer = Indexer::new(&self.config)?;
307 let index_path = indexer.get_index_path();
308
309 if !index_path.exists() {
310 log::info!("No indexed documents found yet");
311 return Ok(vec![]);
312 }
313
314 let mut results = Vec::new();
315 let embedding_dir = index_path.join("embeddings");
316
317 if !embedding_dir.exists() {
318 log::info!("No embeddings directory found yet");
319 return Ok(vec![]);
320 }
321
322 let entries = std::fs::read_dir(embedding_dir)?;
324
325 for entry in entries.flatten() {
326 if let Some(file_name) = entry.file_name().to_str() {
327 if file_name.ends_with(".json") {
328 match self
329 .load_and_score_embedding(&entry.path(), &query_embedding, &embedding_model)
330 .await
331 {
332 Ok(Some(result)) => {
333 if result.score >= self.config.similarity_threshold {
334 results.push(result);
335 }
336 }
337 Ok(None) => continue,
338 Err(e) => {
339 log::warn!(
340 "Failed to process embedding file {:?}: {}",
341 entry.path(),
342 e
343 );
344 continue;
345 }
346 }
347 }
348 }
349 }
350
351 results.sort_by(|a, b| {
353 b.score
354 .partial_cmp(&a.score)
355 .unwrap_or(std::cmp::Ordering::Equal)
356 });
357 results.truncate(limit);
358
359 log::info!("Found {} results from local vector storage", results.len());
360 Ok(results)
361 }
362
363 async fn load_and_score_embedding(
365 &self,
366 embedding_path: &std::path::Path,
367 query_embedding: &[f32],
368 _embedding_model: &EmbeddingModel,
369 ) -> Result<Option<RagSearchResult>> {
370 let content = std::fs::read_to_string(embedding_path)?;
372 let chunk_data: StoredChunk = serde_json::from_str(&content)
373 .map_err(|e| anyhow::anyhow!("Failed to parse chunk data: {}", e))?;
374
375 let score = EmbeddingModel::cosine_similarity(query_embedding, &chunk_data.embedding);
377
378 Ok(Some(RagSearchResult {
380 id: chunk_data.id,
381 content: chunk_data.content,
382 source_path: chunk_data.source_path,
383 source_type: chunk_data.source_type,
384 title: chunk_data.title,
385 section: chunk_data.section,
386 score,
387 metadata: chunk_data.metadata,
388 }))
389 }
390
391 pub async fn get_stats(&self) -> Result<RagStats> {
392 if !self.config.enabled {
393 return Err(anyhow::anyhow!("RAG system is disabled"));
394 }
395
396 let indexer = Indexer::new(&self.config)?;
397 let index_path = indexer.get_index_path();
398 let embedding_dir = index_path.join("embeddings");
399
400 if !embedding_dir.exists() {
401 return Ok(RagStats {
402 total_documents: 0,
403 total_chunks: 0,
404 index_size_mb: 0.0,
405 last_updated: chrono::Utc::now(),
406 sources: vec![],
407 });
408 }
409
410 let mut total_chunks = 0;
412 let mut total_size = 0u64;
413 let mut sources = std::collections::HashSet::new();
414 let mut last_modified = std::time::UNIX_EPOCH;
415
416 let entries = std::fs::read_dir(&embedding_dir)?;
417 for entry in entries.flatten() {
418 if let Some(file_name) = entry.file_name().to_str() {
419 if file_name.ends_with(".json") {
420 total_chunks += 1;
421
422 if let Ok(metadata) = entry.metadata() {
423 total_size += metadata.len();
424
425 if let Ok(modified) = metadata.modified() {
427 if modified > last_modified {
428 last_modified = modified;
429 }
430 }
431 }
432
433 if let Ok(content) = std::fs::read_to_string(entry.path()) {
435 if let Ok(chunk_data) = serde_json::from_str::<StoredChunk>(&content) {
436 if let Some(source_str) = chunk_data.source_path.to_str() {
437 sources.insert(source_str.to_string());
438 }
439 }
440 }
441 }
442 }
443 }
444
445 let total_documents = sources.len();
447 let index_size_mb = total_size as f64 / (1024.0 * 1024.0);
448
449 let last_updated = chrono::DateTime::<chrono::Utc>::from(last_modified);
450
451 let sources_vec: Vec<String> = sources.into_iter().collect();
452
453 Ok(RagStats {
454 total_documents,
455 total_chunks,
456 index_size_mb,
457 last_updated,
458 sources: sources_vec,
459 })
460 }
461
462 pub async fn clear_index(&self) -> Result<()> {
463 if !self.config.enabled {
464 return Err(anyhow::anyhow!("RAG system is disabled"));
465 }
466
467 log::info!("Clearing local vector storage");
468
469 let indexer = Indexer::new(&self.config)?;
471 let index_path = indexer.get_index_path();
472 let embedding_dir = index_path.join("embeddings");
473
474 if embedding_dir.exists() {
475 let entries = std::fs::read_dir(&embedding_dir)?;
477 let mut cleared_count = 0;
478
479 for entry in entries.flatten() {
480 if let Some(file_name) = entry.file_name().to_str() {
481 if file_name.ends_with(".json") {
482 if let Err(e) = std::fs::remove_file(entry.path()) {
483 log::warn!("Failed to remove embedding file {:?}: {}", entry.path(), e);
484 } else {
485 cleared_count += 1;
486 }
487 }
488 }
489 }
490
491 log::info!(
492 "Successfully cleared {} embedding files from local vector storage",
493 cleared_count
494 );
495 } else {
496 log::info!("Local vector storage directory does not exist, nothing to clear");
497 }
498
499 Ok(())
500 }
501
502 pub async fn health_check(&self) -> Result<()> {
503 if !self.config.enabled {
504 return Err(anyhow::anyhow!("RAG system is disabled"));
505 }
506
507 log::info!("Running RAG system health check...");
508
509 let _embedding_model = EmbeddingModel::new_with_config(self.config.embedding.clone())
511 .await
512 .map_err(|e| anyhow::anyhow!("Embedding model unavailable: {}", e))?;
513 log::info!("✓ Embedding model loaded successfully");
514
515 let indexer = Indexer::new(&self.config)?;
517 let index_path = indexer.get_index_path();
518
519 if index_path.exists() {
520 log::info!("✓ Local index directory exists: {:?}", index_path);
521
522 let embedding_dir = index_path.join("embeddings");
524 if embedding_dir.exists() {
525 match std::fs::read_dir(&embedding_dir) {
527 Ok(entries) => {
528 let count = entries.filter_map(|e| e.ok()).count();
529 log::info!(
530 "✓ Local vector storage accessible with {} embedding files",
531 count
532 );
533 }
534 Err(e) => {
535 log::warn!(
536 "⚠ Local vector storage directory exists but cannot read contents: {}",
537 e
538 );
539 }
540 }
541 } else {
542 log::info!("✓ Local vector storage will be created when needed");
543 }
544 } else {
545 log::info!("✓ Local index directory will be created: {:?}", index_path);
546 }
547
548 let test_file = index_path.join(".health_check");
550 match std::fs::create_dir_all(index_path) {
551 Ok(_) => {
552 match std::fs::write(&test_file, "health_check") {
553 Ok(_) => {
554 log::info!("✓ File system write access confirmed");
555 let _ = std::fs::remove_file(&test_file); }
557 Err(e) => {
558 return Err(anyhow::anyhow!("File system write access failed: {}", e));
559 }
560 }
561 }
562 Err(e) => {
563 return Err(anyhow::anyhow!("Cannot create index directory: {}", e));
564 }
565 }
566
567 log::info!("RAG system health check: All systems operational");
568 Ok(())
569 }
570
571 async fn store_chunks_locally(&self, chunks: &[DocumentChunk]) -> Result<()> {
573 use uuid::Uuid;
574
575 if chunks.is_empty() {
576 log::info!("No chunks to store locally");
577 return Ok(());
578 }
579
580 log::info!("Storing {} chunks in local vector storage", chunks.len());
581
582 let embedding_model =
584 EmbeddingModel::new_with_config(self.config.embedding.clone()).await?;
585
586 let indexer = Indexer::new(&self.config)?;
588 let index_path = indexer.get_index_path();
589 let embedding_dir = index_path.join("embeddings");
590
591 std::fs::create_dir_all(&embedding_dir)?;
593
594 let mut stored_count = 0;
596
597 for chunk in chunks {
598 let embedding = match embedding_model.embed_text(&chunk.content).await {
600 Ok(embedding) => embedding,
601 Err(e) => {
602 log::warn!("Failed to generate embedding for chunk {}: {}", chunk.id, e);
603 continue;
604 }
605 };
606
607 let stored_chunk = StoredChunk {
609 id: chunk.id.clone(),
610 content: chunk.content.clone(),
611 source_path: chunk.source_path.clone(),
612 source_type: chunk.source_type.clone(),
613 title: chunk.title.clone(),
614 section: chunk.section.clone(),
615 chunk_index: chunk.chunk_index,
616 metadata: chunk.metadata.clone(),
617 embedding,
618 };
619
620 let file_id = Uuid::new_v4().to_string();
622 let file_path = embedding_dir.join(format!("{}.json", file_id));
623
624 let json_content = serde_json::to_string_pretty(&stored_chunk)?;
625 std::fs::write(&file_path, json_content)?;
626
627 stored_count += 1;
628 log::debug!("Stored chunk {} to {:?}", chunk.id, file_path);
629 }
630
631 log::info!(
632 "Successfully stored {} chunks in local vector storage",
633 stored_count
634 );
635 Ok(())
636 }
637}