use super::summary::SummaryStrategy;
use crate::config::IndexerConfig;
use crate::document::{DocumentTree, ReasoningIndexConfig};
use crate::llm::throttle::ConcurrencyConfig;
use crate::utils::fingerprint::{Fingerprint, Fingerprinter};
use std::path::PathBuf;
/// Input-format mode carried by `PipelineOptions::mode`.
///
/// The derived `Debug` text of the selected variant is one of the inputs to
/// `PipelineOptions::logic_fingerprint`, so renaming a variant changes the
/// fingerprint produced for an otherwise identical configuration.
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq)]
pub enum IndexMode {
    /// The default mode.
    ///
    /// NOTE(review): presumably the consumer detects the format from the
    /// input itself — confirm against the code that matches on this enum.
    #[default]
    Auto,
    /// Markdown mode. NOTE(review): presumably forces Markdown parsing — confirm.
    Markdown,
    /// PDF mode. NOTE(review): presumably forces PDF parsing — confirm.
    Pdf,
}
/// Settings for the optimization step, carried by `PipelineOptions::optimization`.
///
/// Only `enabled` takes part in `PipelineOptions::logic_fingerprint`; the
/// remaining fields are not hashed there.
#[derive(Debug, Clone)]
pub struct OptimizationConfig {
    /// Master switch for the optimization step. Defaults to `true`.
    pub enabled: bool,
    /// Optional depth limit; `None` (the default) means no limit is configured.
    pub max_depth: Option<usize>,
    /// Optional per-node child limit; `None` (the default) means no limit is configured.
    pub max_children: Option<usize>,
    /// Leaf-merge threshold. Defaults to `0`.
    ///
    /// NOTE(review): the unit and whether `0` disables merging are not visible
    /// in this file — confirm against the stage that reads it.
    pub merge_leaf_threshold: usize,
}

impl Default for OptimizationConfig {
    /// Optimization enabled, no depth/children limits, merge threshold `0`.
    fn default() -> Self {
        Self {
            enabled: true,
            max_depth: None,
            max_children: None,
            merge_leaf_threshold: 0,
        }
    }
}

impl OptimizationConfig {
    /// Creates the default configuration (same as [`Default::default`]).
    pub fn new() -> Self {
        Self::default()
    }

    /// Creates a configuration with `enabled` set to `false` and every other
    /// field at its default.
    pub fn disabled() -> Self {
        Self {
            enabled: false,
            ..Self::default()
        }
    }

    /// Returns the configuration with `max_depth` set to `Some(depth)`.
    #[must_use]
    pub fn with_max_depth(mut self, depth: usize) -> Self {
        self.max_depth = Some(depth);
        self
    }

    /// Returns the configuration with `max_children` set to `Some(max)`.
    #[must_use]
    pub fn with_max_children(mut self, max: usize) -> Self {
        self.max_children = Some(max);
        self
    }

    /// Returns the configuration with `merge_leaf_threshold` set to `threshold`.
    ///
    /// Provided so that every tunable of this type can be set through the
    /// chained builder style used by the other `with_*` methods.
    #[must_use]
    pub fn with_merge_leaf_threshold(mut self, threshold: usize) -> Self {
        self.merge_leaf_threshold = threshold;
        self
    }
}
/// Settings for the thinning step, carried by `PipelineOptions::thinning`.
#[derive(Debug, Clone)]
pub struct ThinningConfig {
    /// Master switch for thinning. Defaults to `false`.
    pub enabled: bool,
    /// Thinning threshold. Defaults to `500`.
    ///
    /// NOTE(review): the unit (presumably tokens) is not visible in this
    /// file — confirm against the stage that reads it.
    pub threshold: usize,
    /// Content-merge flag. Defaults to `true`.
    ///
    /// NOTE(review): presumably controls whether content of thinned nodes is
    /// merged rather than dropped — confirm.
    pub merge_content: bool,
}

impl Default for ThinningConfig {
    /// Thinning off, threshold `500`, content merging on.
    fn default() -> Self {
        Self {
            merge_content: true,
            threshold: 500,
            enabled: false,
        }
    }
}

impl ThinningConfig {
    /// Creates a configuration with thinning switched off and the default
    /// threshold and merge behaviour.
    pub fn disabled() -> Self {
        Self {
            enabled: false,
            ..Self::default()
        }
    }

    /// Creates a configuration with thinning switched on at the given
    /// `threshold`; `merge_content` keeps its default of `true`.
    pub fn enabled(threshold: usize) -> Self {
        Self {
            enabled: true,
            threshold,
            ..Self::default()
        }
    }

    /// Returns the configuration with `threshold` replaced.
    pub fn with_threshold(self, threshold: usize) -> Self {
        Self { threshold, ..self }
    }

    /// Returns the configuration with `merge_content` replaced by `merge`.
    pub fn with_merge_content(self, merge: bool) -> Self {
        Self {
            merge_content: merge,
            ..self
        }
    }
}
/// Settings for the splitting step, carried by `PipelineOptions::split`.
#[derive(Debug, Clone)]
pub struct SplitConfig {
    /// Master switch for splitting. Defaults to `true`.
    pub enabled: bool,
    /// Per-node token limit. Defaults to `4000`.
    ///
    /// NOTE(review): how tokens are counted is not visible in this file —
    /// confirm against the stage that reads it.
    pub max_tokens_per_node: usize,
    /// Pattern-split flag. Defaults to `true`.
    ///
    /// NOTE(review): presumably enables splitting on structural patterns —
    /// confirm.
    pub pattern_split: bool,
}

impl Default for SplitConfig {
    /// Splitting on, `4000` tokens per node, pattern splitting on.
    fn default() -> Self {
        Self {
            pattern_split: true,
            max_tokens_per_node: 4000,
            enabled: true,
        }
    }
}

impl SplitConfig {
    /// Creates a configuration with splitting switched off and every other
    /// field at its default.
    pub fn disabled() -> Self {
        let base = Self::default();
        Self {
            enabled: false,
            ..base
        }
    }

    /// Returns the configuration with `max_tokens_per_node` set to `max`.
    pub fn with_max_tokens(self, max: usize) -> Self {
        Self {
            max_tokens_per_node: max,
            ..self
        }
    }

    /// Returns the configuration with `pattern_split` set to `pattern`.
    pub fn with_pattern_split(self, pattern: bool) -> Self {
        Self {
            pattern_split: pattern,
            ..self
        }
    }
}
/// Aggregate configuration handed to the indexing pipeline.
///
/// Build one with [`PipelineOptions::new`] and refine it with the chained
/// `with_*` methods; every field is also public for direct access.
#[derive(Debug, Clone)]
pub struct PipelineOptions {
    /// Input-format mode. Defaults to `IndexMode::Auto`.
    pub mode: IndexMode,
    /// ID-generation flag. Defaults to `true`.
    ///
    /// NOTE(review): presumably refers to node IDs — confirm.
    pub generate_ids: bool,
    /// Summary strategy. Defaults to `SummaryStrategy::full()`.
    pub summary_strategy: SummaryStrategy,
    /// Thinning settings. Defaults to `ThinningConfig::default()`.
    pub thinning: ThinningConfig,
    /// Optimization settings. Defaults to `OptimizationConfig::default()`.
    pub optimization: OptimizationConfig,
    /// Split settings. Defaults to `SplitConfig::default()`.
    pub split: SplitConfig,
    /// Description-generation flag. Defaults to `true`.
    pub generate_description: bool,
    /// Concurrency settings. Defaults to `ConcurrencyConfig::default()`.
    pub concurrency: ConcurrencyConfig,
    /// Indexer settings. Defaults to `IndexerConfig::default()`.
    pub indexer: IndexerConfig,
    /// Reasoning-index settings. Defaults to `ReasoningIndexConfig::default()`.
    pub reasoning_index: ReasoningIndexConfig,
    /// Optional previously built tree. Defaults to `None`.
    ///
    /// NOTE(review): presumably used for incremental re-indexing — confirm
    /// against the pipeline code that reads it.
    pub existing_tree: Option<DocumentTree>,
    /// Processing version number. Defaults to `1`.
    ///
    /// NOTE(review): presumably bumped when processing logic changes; it is
    /// not part of `logic_fingerprint` — confirm how it is consumed.
    pub processing_version: u32,
    /// Optional checkpoint directory. Defaults to `None`.
    ///
    /// NOTE(review): presumably where intermediate state is persisted —
    /// confirm.
    pub checkpoint_dir: Option<PathBuf>,
}

impl Default for PipelineOptions {
    /// Returns the baseline options; see each field's documentation for its
    /// default value.
    fn default() -> Self {
        Self {
            mode: IndexMode::Auto,
            generate_ids: true,
            summary_strategy: SummaryStrategy::full(),
            thinning: ThinningConfig::default(),
            optimization: OptimizationConfig::default(),
            split: SplitConfig::default(),
            generate_description: true,
            concurrency: ConcurrencyConfig::default(),
            indexer: IndexerConfig::default(),
            reasoning_index: ReasoningIndexConfig::default(),
            existing_tree: None,
            processing_version: 1,
            checkpoint_dir: None,
        }
    }
}

impl PipelineOptions {
    /// Creates the default options (same as [`Default::default`]).
    pub fn new() -> Self {
        Self::default()
    }

    /// Returns the options with `mode` replaced.
    #[must_use]
    pub fn with_mode(mut self, mode: IndexMode) -> Self {
        self.mode = mode;
        self
    }

    /// Returns the options with `generate_ids` replaced by `generate`.
    #[must_use]
    pub fn with_generate_ids(mut self, generate: bool) -> Self {
        self.generate_ids = generate;
        self
    }

    /// Returns the options with `summary_strategy` replaced by `strategy`.
    #[must_use]
    pub fn with_summary_strategy(mut self, strategy: SummaryStrategy) -> Self {
        self.summary_strategy = strategy;
        self
    }

    /// Returns the options with `thinning` replaced.
    #[must_use]
    pub fn with_thinning(mut self, thinning: ThinningConfig) -> Self {
        self.thinning = thinning;
        self
    }

    /// Returns the options with `optimization` replaced.
    #[must_use]
    pub fn with_optimization(mut self, optimization: OptimizationConfig) -> Self {
        self.optimization = optimization;
        self
    }

    /// Returns the options with `split` replaced.
    #[must_use]
    pub fn with_split(mut self, split: SplitConfig) -> Self {
        self.split = split;
        self
    }

    /// Returns the options with `generate_description` replaced by `generate`.
    #[must_use]
    pub fn with_generate_description(mut self, generate: bool) -> Self {
        self.generate_description = generate;
        self
    }

    /// Returns the options with `concurrency` replaced.
    #[must_use]
    pub fn with_concurrency(mut self, concurrency: ConcurrencyConfig) -> Self {
        self.concurrency = concurrency;
        self
    }

    /// Returns the options with `indexer` replaced.
    #[must_use]
    pub fn with_indexer(mut self, indexer: IndexerConfig) -> Self {
        self.indexer = indexer;
        self
    }

    /// Returns the options with `reasoning_index` replaced by `config`.
    #[must_use]
    pub fn with_reasoning_index(mut self, config: ReasoningIndexConfig) -> Self {
        self.reasoning_index = config;
        self
    }

    /// Returns the options with `existing_tree` set to `Some(tree)`.
    #[must_use]
    pub fn with_existing_tree(mut self, tree: DocumentTree) -> Self {
        self.existing_tree = Some(tree);
        self
    }

    /// Returns the options with `processing_version` replaced by `version`.
    #[must_use]
    pub fn with_processing_version(mut self, version: u32) -> Self {
        self.processing_version = version;
        self
    }

    /// Returns the options with `checkpoint_dir` set to `Some(dir.into())`.
    #[must_use]
    pub fn with_checkpoint_dir(mut self, dir: impl Into<PathBuf>) -> Self {
        self.checkpoint_dir = Some(dir.into());
        self
    }

    /// Builds a [`Fingerprint`] from a subset of the options.
    ///
    /// The inputs, in order, are: the `Debug` text of `mode`, `generate_ids`,
    /// the `Debug` text of `summary_strategy`, `generate_description`,
    /// `optimization.enabled`, and the `Debug` text of `reasoning_index`.
    /// Because `Debug` output is hashed, changing the `Debug` representation
    /// of those types changes the resulting fingerprint.
    ///
    /// NOTE(review): `thinning`, `split`, `processing_version` and the
    /// remaining `optimization` fields are not hashed here. Confirm that this
    /// is intentional before relying on the fingerprint to detect changes to
    /// those settings; the input set and order are left untouched so existing
    /// fingerprints stay valid.
    pub fn logic_fingerprint(&self) -> Fingerprint {
        Fingerprinter::new()
            .with_str(&format!("{:?}", self.mode))
            .with_bool(self.generate_ids)
            .with_str(&format!("{:?}", self.summary_strategy))
            .with_bool(self.generate_description)
            .with_bool(self.optimization.enabled)
            .with_str(&format!("{:?}", self.reasoning_index))
            .into_fingerprint()
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// `IndexMode::default()` yields `Auto`.
    #[test]
    fn test_index_mode_default() {
        assert_eq!(IndexMode::default(), IndexMode::Auto);
    }

    /// Chained builders set the limits and leave optimization enabled.
    #[test]
    fn test_optimization_config() {
        let OptimizationConfig {
            enabled,
            max_depth,
            max_children,
            ..
        } = OptimizationConfig::new()
            .with_max_depth(5)
            .with_max_children(10);

        assert!(enabled);
        assert_eq!(max_depth, Some(5));
        assert_eq!(max_children, Some(10));
    }

    /// `enabled(..)` switches thinning on at the given threshold;
    /// `disabled()` leaves it off.
    #[test]
    fn test_thinning_config() {
        let active = ThinningConfig::enabled(300);
        assert!(active.enabled);
        assert_eq!(active.threshold, 300);

        let inactive = ThinningConfig::disabled();
        assert!(!inactive.enabled);
    }

    /// Builder calls on `PipelineOptions` overwrite the targeted fields.
    #[test]
    fn test_pipeline_options_builder() {
        let built = PipelineOptions::new()
            .with_mode(IndexMode::Markdown)
            .with_generate_ids(false);

        assert_eq!(built.mode, IndexMode::Markdown);
        assert!(!built.generate_ids);
    }
}