rag-module 0.6.7

Enterprise RAG module with chat context storage, vector search, session management, and model downloading. Rust implementation with Node.js compatibility.
//! Indexing service for document chunking and embedding generation

use anyhow::Result;
use serde::{Deserialize, Serialize};
use std::sync::Arc;
use crate::services::{EmbeddingService, EncryptionService};

/// One piece of a chunked document, optionally paired with its embedding vector.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DocumentChunk {
    /// Text of this chunk.
    pub content: String,
    /// Zero-based position of the chunk in the sequence produced by chunking.
    pub index: usize,
    /// Embedding computed for `content`; `None` until one has been generated.
    pub embedding: Option<Vec<f32>>,
}

/// Settings that control how a document is split into chunks.
///
/// Both sizes are compared against `str::len`, i.e. they are measured in bytes of
/// UTF-8 text rather than in characters.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ChunkingOptions {
    /// Upper bound on the length of a single chunk.
    pub chunk_size: usize,
    /// How far each chunk reaches back into the end of the chunk before it.
    pub overlap_size: usize,
}

impl Default for ChunkingOptions {
    /// Returns the stock configuration: a chunk size of 1024 with an overlap of 200.
    fn default() -> Self {
        let chunk_size = 1024;
        let overlap_size = 200;

        Self { chunk_size, overlap_size }
    }
}

/// Service that splits documents into chunks and produces embeddings for them.
pub struct IndexingService {
    /// Shared embedding backend that every embedding request is delegated to.
    embedding_service: Arc<EmbeddingService>,
    /// Shared encryption service; stored at construction, but not referenced by any
    /// method visible in this section of the file.
    encryption_service: Arc<EncryptionService>,
    /// Chunking settings applied when a caller passes no explicit options.
    default_chunking_options: ChunkingOptions,
}

impl IndexingService {
    /// Builds an indexing service around the given shared services.
    ///
    /// The default chunking settings come from `ChunkingOptions::default()`. The body
    /// performs no fallible or asynchronous work, so the result is always `Ok`.
    pub async fn new(
        embedding_service: Arc<EmbeddingService>,
        encryption_service: Arc<EncryptionService>,
    ) -> Result<Self> {
        let default_chunking_options = ChunkingOptions::default();
        let service = Self {
            embedding_service,
            encryption_service,
            default_chunking_options,
        };

        Ok(service)
    }
    
    /// Lifecycle hook for start-up; currently a no-op that always returns `Ok(())`.
    pub async fn initialize(&self) -> Result<()> {
        Ok(())
    }
    
    /// Lifecycle hook for tear-down; currently a no-op that always returns `Ok(())`.
    pub async fn shutdown(&self) -> Result<()> {
        Ok(())
    }

    /// Produces an embedding vector for `text` by delegating to the shared embedding service.
    ///
    /// # Errors
    /// Returns whatever error the embedding service reports.
    pub async fn generate_embedding(&self, text: &str) -> Result<Vec<f32>> {
        let vector = self.embedding_service.generate_embedding(text).await?;
        Ok(vector)
    }

    pub fn chunk_document(&self, content: &str, options: Option<ChunkingOptions>) -> Vec<DocumentChunk> {
        let opts = options.unwrap_or_else(|| self.default_chunking_options.clone());
        
        if content.len() <= opts.chunk_size {
            return vec![DocumentChunk {
                content: content.to_string(),
                index: 0,
                embedding: None,
            }];
        }

        let mut chunks = Vec::new();
        let mut start = 0;
        let mut chunk_index = 0;

        while start < content.len() {
            let end = std::cmp::min(start + opts.chunk_size, content.len());
            let chunk_content = content[start..end].to_string();
            
            chunks.push(DocumentChunk {
                content: chunk_content,
                index: chunk_index,
                embedding: None,
            });
            
            chunk_index += 1;
            
            // Move start position with overlap
            start = end - opts.overlap_size;
            
            // Prevent infinite loops for edge cases
            if start >= content.len() - opts.overlap_size {
                break;
            }
            
            // Ensure we make progress
            if end == content.len() {
                break;
            }
        }

        chunks
    }

    /// Embeds the chunks one after another, preserving their input order.
    ///
    /// Each returned chunk keeps its `content` and `index`; any embedding it already
    /// carried is replaced by the freshly generated one.
    ///
    /// # Errors
    /// Stops at the first chunk whose embedding fails and returns that error.
    pub async fn generate_chunk_embeddings(&self, chunks: Vec<DocumentChunk>) -> Result<Vec<DocumentChunk>> {
        let mut embedded = Vec::new();

        for DocumentChunk { content, index, .. } in chunks {
            let vector = self.generate_embedding(&content).await?;
            embedded.push(DocumentChunk {
                content,
                index,
                embedding: Some(vector),
            });
        }

        Ok(embedded)
    }

    /// Embeds the chunks concurrently by spawning one Tokio task per chunk.
    ///
    /// Tasks finish in arbitrary order, so the results are sorted by `index` before being
    /// returned. Each returned chunk keeps its `content` and `index`; any embedding it
    /// already carried is replaced by the freshly generated one.
    ///
    /// # Errors
    /// Returns the first failure observed while collecting results: either an embedding
    /// error from a task, or the join error of a task that did not run to completion.
    pub async fn generate_chunk_embeddings_parallel(&self, chunks: Vec<DocumentChunk>) -> Result<Vec<DocumentChunk>> {
        use tokio::task::JoinSet;

        let expected = chunks.len();
        let mut join_set = JoinSet::new();

        for chunk in chunks {
            let embedding_service = Arc::clone(&self.embedding_service);

            // The task takes ownership of the chunk, so its text is moved, not cloned.
            join_set.spawn(async move {
                let embedding = embedding_service.generate_embedding(&chunk.content).await?;
                Ok::<DocumentChunk, anyhow::Error>(DocumentChunk {
                    embedding: Some(embedding),
                    ..chunk
                })
            });
        }

        let mut embedded_chunks = Vec::with_capacity(expected);

        while let Some(joined) = join_set.join_next().await {
            // Outer `?` surfaces a join failure; inner `?` surfaces an embedding failure.
            embedded_chunks.push(joined??);
        }

        // Sort by index to maintain order
        embedded_chunks.sort_by_key(|chunk| chunk.index);

        Ok(embedded_chunks)
    }

    /// Chunks `content` and attaches an embedding to every resulting chunk.
    ///
    /// `options` is forwarded to `chunk_document`. The parallel embedding path is taken
    /// only when `use_parallel` is set and chunking produced more than one chunk; in
    /// every other case the chunks are embedded sequentially.
    ///
    /// # Errors
    /// Propagates any error from the embedding step.
    pub async fn index_document_with_chunking(
        &self,
        content: &str,
        options: Option<ChunkingOptions>,
        use_parallel: bool,
    ) -> Result<Vec<DocumentChunk>> {
        let chunks = self.chunk_document(content, options);

        if !use_parallel || chunks.len() <= 1 {
            return self.generate_chunk_embeddings(chunks).await;
        }

        self.generate_chunk_embeddings_parallel(chunks).await
    }

    /// Returns the chunking options that are applied when a caller supplies none.
    pub fn get_default_chunking_options(&self) -> &ChunkingOptions {
        &self.default_chunking_options
    }

    /// Replaces the service's default chunking options with `options`.
    pub fn update_default_chunking_options(&mut self, options: ChunkingOptions) {
        self.default_chunking_options = options;
    }

    /// Computes aggregate statistics over `chunks`.
    ///
    /// * `total_characters` is the sum of `content.len()` over all chunks, i.e. a byte
    ///   count of the UTF-8 text.
    /// * `average_chunk_size` is the integer quotient of that total by the chunk count,
    ///   or 0 for an empty slice.
    /// * `embedded_chunks` counts the chunks that carry an embedding.
    /// * `embedding_dimensions` is the length of the first embedding found in any chunk,
    ///   or 0 when no chunk is embedded. Looking past the first chunk keeps this field
    ///   consistent with `embedded_chunks` when only later chunks are embedded.
    pub fn get_chunk_summary(&self, chunks: &[DocumentChunk]) -> ChunkingSummary {
        let total_chunks = chunks.len();
        let total_characters: usize = chunks.iter().map(|c| c.content.len()).sum();
        let embedded_chunks = chunks.iter().filter(|c| c.embedding.is_some()).count();
        let embedding_dimensions = chunks
            .iter()
            .find_map(|c| c.embedding.as_ref())
            .map_or(0, Vec::len);

        ChunkingSummary {
            total_chunks,
            total_characters,
            // `checked_div` yields `None` for an empty slice instead of dividing by zero.
            average_chunk_size: total_characters.checked_div(total_chunks).unwrap_or(0),
            embedded_chunks,
            embedding_dimensions,
        }
    }

    /// Chunks `content` with an explicit chunk size and overlap.
    ///
    /// Convenience wrapper that packs `kb_chunk_size` and `kb_overlap_size` into a
    /// `ChunkingOptions` value and forwards to `chunk_document`.
    pub fn create_knowledge_base_chunks(
        &self,
        content: &str,
        kb_chunk_size: usize,
        kb_overlap_size: usize,
    ) -> Vec<DocumentChunk> {
        self.chunk_document(
            content,
            Some(ChunkingOptions {
                chunk_size: kb_chunk_size,
                overlap_size: kb_overlap_size,
            }),
        )
    }

    /// Embeds a truncated preview of `content`.
    ///
    /// Content longer than `max_summary_length` bytes is cut to at most that many bytes
    /// and suffixed with `"..."`; shorter content is embedded unchanged. The cut is moved
    /// back to the nearest UTF-8 character boundary, so multi-byte text cannot cause a
    /// slicing panic.
    ///
    /// # Errors
    /// Propagates any error from embedding generation.
    pub async fn create_document_summary_embedding(&self, content: &str, max_summary_length: usize) -> Result<Vec<f32>> {
        let summary = if content.len() > max_summary_length {
            // Walk back to a character boundary; offset 0 always qualifies, so this ends.
            let mut cut = max_summary_length;
            while !content.is_char_boundary(cut) {
                cut -= 1;
            }
            format!("{}...", &content[..cut])
        } else {
            content.to_string()
        };

        self.generate_embedding(&summary).await
    }

    /// Reports whether a chunk is usable: its text must contain something other than
    /// whitespace and must not exceed 10000 bytes.
    pub fn validate_chunk_content(&self, chunk: &DocumentChunk) -> bool {
        // Reasonable max chunk size
        const MAX_CHUNK_BYTES: usize = 10000;

        let text = chunk.content.as_str();
        if text.trim().is_empty() {
            return false;
        }

        text.len() <= MAX_CHUNK_BYTES
    }

    /// Collects the text shared between each pair of neighbouring chunks.
    ///
    /// For every adjacent pair, the last `overlap_size` bytes of the earlier chunk are
    /// compared with the start of the later chunk; when they match, that shared text is
    /// added to the result. Pairs are skipped when the earlier chunk is shorter than
    /// `overlap_size`, when the texts do not match, or when the offset falls inside a
    /// multi-byte character (this previously caused a slicing panic).
    pub fn get_overlapping_content(&self, chunks: &[DocumentChunk], overlap_size: usize) -> Vec<String> {
        chunks
            .windows(2)
            .filter_map(|pair| {
                let current = pair[0].content.as_str();
                let next = pair[1].content.as_str();

                // `None` when the earlier chunk is shorter than the requested overlap.
                let tail_start = current.len().checked_sub(overlap_size)?;
                // `get` returns `None` for an offset inside a multi-byte character; such
                // a tail could never be a prefix of `next`, so skipping loses nothing.
                let tail = current.get(tail_start..)?;

                next.starts_with(tail).then(|| tail.to_string())
            })
            .collect()
    }
}

/// Aggregate statistics describing a set of document chunks.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ChunkingSummary {
    /// Number of chunks summarised.
    pub total_chunks: usize,
    /// Combined length of all chunk contents.
    pub total_characters: usize,
    /// Average chunk length.
    pub average_chunk_size: usize,
    /// Number of chunks that carry an embedding.
    pub embedded_chunks: usize,
    /// Length of an embedding vector, or 0 when none is available.
    pub embedding_dimensions: usize,
}