use anyhow::Result;
use serde::{Deserialize, Serialize};
use std::sync::Arc;
use crate::services::{EmbeddingService, EncryptionService};
/// One piece of a document produced by chunking, optionally carrying the
/// embedding vector generated for its text.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DocumentChunk {
/// Text of this chunk.
pub content: String,
/// Zero-based position of the chunk in the sequence emitted by the chunker.
pub index: usize,
/// Embedding vector for `content`; `None` until an embedding has been generated.
pub embedding: Option<Vec<f32>>,
}
/// Parameters controlling how a document is split into chunks.
///
/// Both sizes are compared against `str::len()`, so they are expressed in
/// UTF-8 bytes rather than in characters.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ChunkingOptions {
/// Target maximum length of a single chunk, in bytes.
pub chunk_size: usize,
/// Requested number of bytes from the end of one chunk that are repeated at
/// the start of the next chunk.
pub overlap_size: usize,
}
impl Default for ChunkingOptions {
fn default() -> Self {
Self {
chunk_size: 1024,
overlap_size: 200,
}
}
}
/// Service that splits documents into chunks and obtains embeddings for them.
pub struct IndexingService {
/// Shared handle to the service that turns text into embedding vectors.
embedding_service: Arc<EmbeddingService>,
/// Shared handle to the encryption service.
/// NOTE(review): no method visible in this file reads this field — confirm it is still needed.
encryption_service: Arc<EncryptionService>,
/// Options used by `chunk_document` when the caller does not supply any.
default_chunking_options: ChunkingOptions,
}
impl IndexingService {
/// Creates an indexing service from shared service handles.
///
/// The default chunking options are initialised from
/// `ChunkingOptions::default()`. The function is `async` and returns a
/// `Result` by signature, but the body performs no awaits and always
/// produces `Ok`.
pub async fn new(
    embedding_service: Arc<EmbeddingService>,
    encryption_service: Arc<EncryptionService>,
) -> Result<Self> {
    let default_chunking_options = ChunkingOptions::default();
    let service = Self {
        embedding_service,
        encryption_service,
        default_chunking_options,
    };
    Ok(service)
}
/// Startup hook. Currently performs no work and always returns `Ok(())`.
pub async fn initialize(&self) -> Result<()> {
Ok(())
}
/// Shutdown hook. Currently performs no work and always returns `Ok(())`.
pub async fn shutdown(&self) -> Result<()> {
Ok(())
}
/// Produces the embedding vector for `text`.
///
/// This is a thin wrapper around the embedding service held by this
/// instance.
///
/// # Errors
///
/// Returns whatever error the underlying embedding service reports.
pub async fn generate_embedding(&self, text: &str) -> Result<Vec<f32>> {
    let service = &self.embedding_service;
    let vector = service.generate_embedding(text).await?;
    Ok(vector)
}
pub fn chunk_document(&self, content: &str, options: Option<ChunkingOptions>) -> Vec<DocumentChunk> {
let opts = options.unwrap_or_else(|| self.default_chunking_options.clone());
if content.len() <= opts.chunk_size {
return vec![DocumentChunk {
content: content.to_string(),
index: 0,
embedding: None,
}];
}
let mut chunks = Vec::new();
let mut start = 0;
let mut chunk_index = 0;
while start < content.len() {
let end = std::cmp::min(start + opts.chunk_size, content.len());
let chunk_content = content[start..end].to_string();
chunks.push(DocumentChunk {
content: chunk_content,
index: chunk_index,
embedding: None,
});
chunk_index += 1;
start = end - opts.overlap_size;
if start >= content.len() - opts.overlap_size {
break;
}
if end == content.len() {
break;
}
}
chunks
}
/// Generates embeddings for `chunks` one after another, in input order.
///
/// Each returned chunk keeps its `content` and `index`; its `embedding` is
/// replaced with the freshly generated vector.
///
/// # Errors
///
/// Stops at the first chunk whose embedding fails and returns that error;
/// chunks processed so far are discarded.
pub async fn generate_chunk_embeddings(&self, chunks: Vec<DocumentChunk>) -> Result<Vec<DocumentChunk>> {
    let mut embedded = Vec::with_capacity(chunks.len());
    for mut chunk in chunks {
        let vector = self.generate_embedding(&chunk.content).await?;
        chunk.embedding = Some(vector);
        embedded.push(chunk);
    }
    Ok(embedded)
}
pub async fn generate_chunk_embeddings_parallel(&self, chunks: Vec<DocumentChunk>) -> Result<Vec<DocumentChunk>> {
use tokio::task::JoinSet;
let mut join_set = JoinSet::new();
for chunk in chunks {
let embedding_service = Arc::clone(&self.embedding_service);
let chunk_content = chunk.content.clone();
let chunk_index = chunk.index;
join_set.spawn(async move {
let embedding = embedding_service.generate_embedding(&chunk_content).await?;
Ok::<DocumentChunk, anyhow::Error>(DocumentChunk {
content: chunk_content,
index: chunk_index,
embedding: Some(embedding),
})
});
}
let mut embedded_chunks = Vec::new();
while let Some(result) = join_set.join_next().await {
let chunk = result??;
embedded_chunks.push(chunk);
}
embedded_chunks.sort_by_key(|chunk| chunk.index);
Ok(embedded_chunks)
}
/// Chunks `content` and generates an embedding for every chunk.
///
/// `options` is forwarded to `chunk_document`. When `use_parallel` is set and
/// the document produced more than one chunk, embeddings are generated
/// concurrently; otherwise they are generated sequentially.
///
/// # Errors
///
/// Propagates any error from the selected embedding routine.
pub async fn index_document_with_chunking(
    &self,
    content: &str,
    options: Option<ChunkingOptions>,
    use_parallel: bool,
) -> Result<Vec<DocumentChunk>> {
    let chunks = self.chunk_document(content, options);

    // A single chunk gains nothing from spawning tasks.
    let run_concurrently = use_parallel && chunks.len() > 1;
    if !run_concurrently {
        return self.generate_chunk_embeddings(chunks).await;
    }
    self.generate_chunk_embeddings_parallel(chunks).await
}
/// Returns the options `chunk_document` uses when the caller passes `None`.
pub fn get_default_chunking_options(&self) -> &ChunkingOptions {
&self.default_chunking_options
}
/// Replaces the default chunking options with `options`.
///
/// The new values are stored as given; no validation is performed here.
pub fn update_default_chunking_options(&mut self, options: ChunkingOptions) {
self.default_chunking_options = options;
}
/// Computes aggregate statistics for `chunks`.
///
/// * `total_characters` and `average_chunk_size` are based on `String::len`,
///   i.e. they count UTF-8 bytes; the average uses integer division and is 0
///   for an empty slice.
/// * `embedded_chunks` counts the chunks that carry an embedding.
/// * `embedding_dimensions` is the vector length of the first chunk that has
///   an embedding, or 0 when no chunk has one. Looking past the first chunk
///   keeps this field consistent with `embedded_chunks` for partially
///   embedded input.
pub fn get_chunk_summary(&self, chunks: &[DocumentChunk]) -> ChunkingSummary {
    let total_chunks = chunks.len();
    let total_characters: usize = chunks.iter().map(|chunk| chunk.content.len()).sum();
    // `checked_div` is `None` only for an empty slice.
    let average_chunk_size = total_characters.checked_div(total_chunks).unwrap_or(0);
    let embedded_chunks = chunks.iter().filter(|chunk| chunk.embedding.is_some()).count();
    let embedding_dimensions = chunks
        .iter()
        .find_map(|chunk| chunk.embedding.as_ref())
        .map_or(0, |embedding| embedding.len());

    ChunkingSummary {
        total_chunks,
        total_characters,
        average_chunk_size,
        embedded_chunks,
        embedding_dimensions,
    }
}
/// Chunks `content` with explicitly supplied sizes instead of the service
/// defaults.
///
/// Equivalent to calling `chunk_document` with options built from
/// `kb_chunk_size` and `kb_overlap_size`.
pub fn create_knowledge_base_chunks(
    &self,
    content: &str,
    kb_chunk_size: usize,
    kb_overlap_size: usize,
) -> Vec<DocumentChunk> {
    self.chunk_document(
        content,
        Some(ChunkingOptions {
            chunk_size: kb_chunk_size,
            overlap_size: kb_overlap_size,
        }),
    )
}
/// Generates an embedding for a truncated prefix of `content`.
///
/// Content no longer than `max_summary_length` bytes is embedded as is.
/// Longer content is cut to at most `max_summary_length` bytes and `"..."` is
/// appended before embedding. The cut is moved back to the nearest character
/// boundary, so text containing multi-byte characters never causes a slicing
/// panic.
///
/// # Errors
///
/// Propagates any error from `generate_embedding`.
pub async fn create_document_summary_embedding(&self, content: &str, max_summary_length: usize) -> Result<Vec<f32>> {
    if content.len() <= max_summary_length {
        // Short enough: embed the borrowed text directly, no copy needed.
        return self.generate_embedding(content).await;
    }

    // `cut < content.len()` here; index 0 is always a boundary, so this terminates.
    let mut cut = max_summary_length;
    while !content.is_char_boundary(cut) {
        cut -= 1;
    }
    let summary = format!("{}...", &content[..cut]);
    self.generate_embedding(&summary).await
}
/// Reports whether a chunk's text is acceptable: it must contain something
/// other than whitespace and be at most 10000 bytes long.
pub fn validate_chunk_content(&self, chunk: &DocumentChunk) -> bool {
    const MAX_CONTENT_BYTES: usize = 10000;

    let text = chunk.content.as_str();
    if text.len() > MAX_CONTENT_BYTES {
        return false;
    }
    !text.trim().is_empty()
}
pub fn get_overlapping_content(&self, chunks: &[DocumentChunk], overlap_size: usize) -> Vec<String> {
let mut overlaps = Vec::new();
for i in 0..chunks.len().saturating_sub(1) {
let current = &chunks[i].content;
let next = &chunks[i + 1].content;
if current.len() >= overlap_size {
let overlap_start = current.len() - overlap_size;
let current_overlap = ¤t[overlap_start..];
if next.len() >= overlap_size && next.starts_with(current_overlap) {
overlaps.push(current_overlap.to_string());
}
}
}
overlaps
}
}
/// Aggregate statistics describing a set of document chunks.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ChunkingSummary {
/// Number of chunks that were summarized.
pub total_chunks: usize,
/// Sum of the chunks' content lengths, measured with `String::len` (UTF-8 bytes).
pub total_characters: usize,
/// `total_characters` divided by `total_chunks` using integer division; 0 when there are no chunks.
pub average_chunk_size: usize,
/// Number of chunks that carry an embedding.
pub embedded_chunks: usize,
/// Length of an embedding vector found in the summarized chunks, or 0 when none is available.
pub embedding_dimensions: usize,
}