use processors_rs::pdf::pdf_processor::PdfBackend;
use crate::embeddings::embed::Embedder;
use std::sync::Arc;
/// Settings used when embedding text: chunking, batching, OCR and PDF handling.
///
/// Most knobs are `Option`s so that "not set" can be told apart from an explicit
/// value. The `Default` implementation in this file fills the numeric knobs and
/// leaves the OCR / late-chunking switches unset.
#[derive(Clone)]
pub struct TextEmbedConfig {
    /// Requested chunk size (`Some(1000)` by default).
    /// NOTE(review): the unit (characters, tokens, ...) is decided by the
    /// consumer of this config and is not visible here.
    pub chunk_size: Option<usize>,
    /// Overlap between neighbouring chunks, expressed as a ratio
    /// (`Some(0.0)` by default). Presumably a fraction of `chunk_size` — confirm
    /// against the splitter that reads it.
    pub overlap_ratio: Option<f32>,
    /// Batch size (`Some(32)` by default). Presumably the number of chunks
    /// embedded per model call — confirm against the caller.
    pub batch_size: Option<usize>,
    /// Buffer size (`Some(100)` by default). Presumably the capacity of an
    /// intermediate queue of pending items — confirm against the caller.
    pub buffer_size: Option<usize>,
    /// How input text is split into chunks.
    pub splitting_strategy: SplittingStrategy,
    /// Whether OCR should be used; `None` means the caller did not say.
    pub use_ocr: Option<bool>,
    /// Optional Tesseract location supplied together with `use_ocr`.
    pub tesseract_path: Option<String>,
    /// Whether late chunking is requested; `None` means the caller did not say.
    pub late_chunking: Option<bool>,
    /// Backend used for PDF processing.
    pub pdf_backend: PdfBackend,
}
impl Default for TextEmbedConfig {
fn default() -> Self {
Self {
chunk_size: Some(1000),
overlap_ratio: Some(0.0),
batch_size: Some(32),
buffer_size: Some(100),
splitting_strategy: SplittingStrategy::Sentence,
late_chunking: None,
use_ocr: None,
tesseract_path: None,
pdf_backend: PdfBackend::LoPdf,
}
}
}
impl TextEmbedConfig {
    /// Creates a configuration from optional settings, substituting fixed
    /// fallbacks for anything left as `None`.
    ///
    /// Fallbacks: `chunk_size` 1000, `batch_size` 32, `buffer_size` 100,
    /// `overlap_ratio` 0.0, `late_chunking` false, `use_ocr` false. Note that the
    /// two boolean switches therefore end up as `Some(false)`, whereas
    /// `Default::default()` leaves them as `None`. `tesseract_path` is stored as
    /// given and the PDF backend is always `PdfBackend::LoPdf`.
    // The constructor mirrors the struct's settings one-to-one, hence the long
    // parameter list.
    #[allow(clippy::too_many_arguments)]
    pub fn new(
        chunk_size: Option<usize>,
        batch_size: Option<usize>,
        buffer_size: Option<usize>,
        overlap_ratio: Option<f32>,
        splitting_strategy: SplittingStrategy,
        late_chunking: Option<bool>,
        use_ocr: Option<bool>,
        tesseract_path: Option<String>,
    ) -> Self {
        Self {
            chunk_size: chunk_size.or(Some(1000)),
            overlap_ratio: overlap_ratio.or(Some(0.0)),
            batch_size: batch_size.or(Some(32)),
            buffer_size: buffer_size.or(Some(100)),
            splitting_strategy,
            use_ocr: use_ocr.or(Some(false)),
            tesseract_path,
            late_chunking: late_chunking.or(Some(false)),
            pdf_backend: PdfBackend::LoPdf,
        }
    }

    /// Sets the chunk size together with the overlap ratio.
    ///
    /// Passing `None` for `overlap_ratio` stores an explicit `Some(0.0)`; it does
    /// not keep whatever ratio was configured before.
    pub fn with_chunk_size(mut self, size: usize, overlap_ratio: Option<f32>) -> Self {
        let ratio = overlap_ratio.unwrap_or(0.0);
        self.overlap_ratio = Some(ratio);
        self.chunk_size = Some(size);
        self
    }

    /// Sets the batch size.
    pub fn with_batch_size(mut self, size: usize) -> Self {
        self.batch_size = Some(size);
        self
    }

    /// Sets the buffer size.
    pub fn with_buffer_size(mut self, size: usize) -> Self {
        self.buffer_size = Some(size);
        self
    }

    /// Records whether late chunking is requested.
    pub fn with_late_chunking(mut self, late_chunking: bool) -> Self {
        self.late_chunking = Some(late_chunking);
        self
    }

    /// Replaces the splitting strategy.
    pub fn with_splitting_strategy(mut self, strategy: SplittingStrategy) -> Self {
        self.splitting_strategy = strategy;
        self
    }

    /// Records the OCR switch and the Tesseract path.
    ///
    /// The path is overwritten on every call, so passing `None` clears a
    /// previously stored path.
    pub fn with_ocr(mut self, use_ocr: bool, tesseract_path: Option<&str>) -> Self {
        self.tesseract_path = tesseract_path.map(str::to_owned);
        self.use_ocr = Some(use_ocr);
        self
    }

    /// Selects the PDF backend by name.
    ///
    /// Both `"lopdf"` and every other string resolve to `PdfBackend::LoPdf`, so
    /// the argument currently has no influence on the result.
    pub fn with_pdf_backend(mut self, backend: &str) -> Self {
        // The name is accepted for API compatibility but not inspected, because
        // every input maps to the same variant.
        let _ = backend;
        self.pdf_backend = PdfBackend::LoPdf;
        self
    }

    /// Finishes a builder chain; returns the configuration unchanged.
    pub fn build(self) -> TextEmbedConfig {
        self
    }
}
/// Selects how text is divided into chunks before embedding.
#[derive(Clone)]
pub enum SplittingStrategy {
    /// Sentence-based splitting; carries no extra data.
    /// NOTE(review): the exact boundary rules live in the splitter, not here.
    Sentence,
    /// Splitting driven by an embedding model.
    Semantic {
        /// Shared handle to the embedder used for the split. Cloning the
        /// strategy clones the `Arc`, not the embedder itself.
        semantic_encoder: Arc<Embedder>,
    },
}
/// Settings used when embedding images.
#[derive(Clone)]
pub struct ImageEmbedConfig {
    /// Buffer size (`Some(100)` in the `Default` implementation). Presumably the
    /// capacity of an intermediate queue — confirm against the caller.
    pub buffer_size: Option<usize>,
    /// Batch size (`Some(32)` in the `Default` implementation). Presumably the
    /// number of images embedded per model call — confirm against the caller.
    pub batch_size: Option<usize>,
}
impl Default for ImageEmbedConfig {
fn default() -> Self {
Self {
buffer_size: Some(100),
batch_size: Some(32),
}
}
}
impl ImageEmbedConfig {
pub fn new(buffer_size: Option<usize>, batch_size: Option<usize>) -> Self {
Self {
buffer_size,
batch_size,
}
}
}