use std::collections::HashMap;
use std::path::PathBuf;
use crate::document::{DocumentTree, NodeId, ReasoningIndex};
use crate::index::parse::{DocumentFormat, RawNode};
use crate::llm::LlmClient;
use super::super::{PipelineOptions, SummaryStrategy};
use super::metrics::IndexMetrics;
/// Source material handed to the indexing pipeline.
///
/// A document is either referenced by path, or carried in memory as text or as
/// raw bytes. The in-memory variants also carry a name and a declared format.
#[derive(Debug, Clone)]
pub enum IndexInput {
    /// A document referenced by its path.
    File(PathBuf),
    /// A document supplied as text.
    Content {
        /// The document text.
        content: String,
        /// Name attached to the document; empty when built via `content`.
        name: String,
        /// Declared format of `content`.
        format: DocumentFormat,
    },
    /// A document supplied as raw bytes.
    Bytes {
        /// The document bytes.
        data: Vec<u8>,
        /// Name attached to the document; empty when built via `bytes`.
        name: String,
        /// Declared format of `data`.
        format: DocumentFormat,
    },
}

impl IndexInput {
    /// Creates a path-based input from anything convertible into a `PathBuf`.
    pub fn file(path: impl Into<PathBuf>) -> Self {
        IndexInput::File(path.into())
    }

    /// Creates a text input with an empty name, declared as Markdown.
    pub fn content(content: impl Into<String>) -> Self {
        Self::content_with(content, String::new(), DocumentFormat::Markdown)
    }

    /// Creates a text input with an explicit name and format.
    pub fn content_with(
        content: impl Into<String>,
        name: impl Into<String>,
        format: DocumentFormat,
    ) -> Self {
        let content = content.into();
        let name = name.into();
        IndexInput::Content {
            content,
            name,
            format,
        }
    }

    /// Creates a byte input with an empty name, declared as PDF.
    pub fn bytes(data: impl Into<Vec<u8>>) -> Self {
        Self::bytes_with(data, String::new(), DocumentFormat::Pdf)
    }

    /// Creates a byte input with an explicit name and format.
    pub fn bytes_with(
        data: impl Into<Vec<u8>>,
        name: impl Into<String>,
        format: DocumentFormat,
    ) -> Self {
        let data = data.into();
        let name = name.into();
        IndexInput::Bytes { data, name, format }
    }

    /// Returns `true` for the [`IndexInput::File`] variant.
    pub fn is_file(&self) -> bool {
        matches!(self, IndexInput::File(..))
    }

    /// Returns `true` for the [`IndexInput::Content`] variant.
    pub fn is_content(&self) -> bool {
        matches!(self, IndexInput::Content { .. })
    }

    /// Returns `true` for the [`IndexInput::Bytes`] variant.
    pub fn is_bytes(&self) -> bool {
        matches!(self, IndexInput::Bytes { .. })
    }

    /// Returns the declared format of an in-memory input.
    ///
    /// Path-based inputs carry no format of their own, so this yields `None`
    /// for [`IndexInput::File`].
    pub fn format(&self) -> Option<DocumentFormat> {
        match self {
            Self::Content { format, .. } | Self::Bytes { format, .. } => Some(*format),
            Self::File(_) => None,
        }
    }
}
/// Outcome of a single pipeline stage.
#[derive(Debug, Clone)]
pub struct StageResult {
    /// `true` when built via [`StageResult::success`], `false` via
    /// [`StageResult::failure`].
    pub success: bool,
    /// Stage duration in milliseconds; `0` until set with
    /// [`StageResult::with_duration`].
    pub duration_ms: u64,
    /// Free-form key/value metadata. [`StageResult::failure`] stores the error
    /// message under the `"error"` key.
    pub metadata: HashMap<String, serde_json::Value>,
}

impl StageResult {
    /// Creates a successful result with zero duration and no metadata.
    ///
    /// A one-line status message naming the stage is written to stderr.
    pub fn success(name: &str) -> Self {
        // Diagnostics go to stderr so stdout stays clean for program output
        // (e.g. when a CLI built on this pipeline is piped into another tool).
        eprintln!("Stage '{}' completed successfully", name);
        Self {
            success: true,
            duration_ms: 0,
            metadata: HashMap::new(),
        }
    }

    /// Creates a failed result whose metadata holds `error` under `"error"`.
    ///
    /// A one-line status message naming the stage and the error is written to
    /// stderr.
    pub fn failure(name: &str, error: &str) -> Self {
        // See `success`: status lines are diagnostics, not program output.
        eprintln!("Stage '{}' failed: {}", name, error);
        let mut metadata = HashMap::new();
        metadata.insert(
            "error".to_string(),
            serde_json::Value::String(error.to_string()),
        );
        Self {
            success: false,
            duration_ms: 0,
            metadata,
        }
    }

    /// Sets the duration (in milliseconds) and returns the updated result.
    pub fn with_duration(mut self, ms: u64) -> Self {
        self.duration_ms = ms;
        self
    }

    /// Inserts a metadata entry, replacing any existing value under `key`,
    /// and returns the updated result.
    pub fn with_metadata(mut self, key: &str, value: serde_json::Value) -> Self {
        self.metadata.insert(key.to_string(), value);
        self
    }
}
/// In-memory store of node summaries keyed by [`NodeId`].
///
/// The `Default` value is an empty cache with the persist flag cleared.
#[derive(Debug, Clone, Default)]
pub struct SummaryCache {
    /// Summary text per node.
    summaries: HashMap<NodeId, String>,
    /// Flag reported by `should_persist`; this type only stores it.
    persist: bool,
}

impl SummaryCache {
    /// Creates an empty cache carrying the given persist flag.
    pub fn new(persist: bool) -> Self {
        SummaryCache {
            persist,
            summaries: HashMap::new(),
        }
    }

    /// Looks up the summary stored for `node_id`, if any.
    pub fn get(&self, node_id: NodeId) -> Option<&str> {
        self.summaries.get(&node_id).map(String::as_str)
    }

    /// Stores `summary` for `node_id`, replacing any previous entry.
    pub fn put(&mut self, node_id: NodeId, summary: String) {
        // The previously stored summary (if any) is intentionally discarded.
        let _previous = self.summaries.insert(node_id, summary);
    }

    /// Returns the persist flag this cache was created with.
    pub fn should_persist(&self) -> bool {
        self.persist
    }

    /// Borrows the full node-to-summary map.
    pub fn all(&self) -> &HashMap<NodeId, String> {
        &self.summaries
    }
}
/// Mutable state carried through one indexing pipeline run.
///
/// Built with [`IndexContext::new`], optionally customised through the
/// `with_*` builders, and finally converted into a [`PipelineResult`] with
/// [`IndexContext::finalize`].
#[derive(Debug)]
pub struct IndexContext {
    /// Document identifier; a freshly generated UUID v4 unless overridden via
    /// `with_doc_id`.
    pub doc_id: String,
    /// The input this run was created for.
    pub input: IndexInput,
    /// Document format; starts as Markdown until set via `with_format`.
    pub format: DocumentFormat,
    /// Document name; starts empty until set via `with_name`.
    pub name: String,
    /// Source path, when one was supplied via `with_source_path`.
    pub source_path: Option<PathBuf>,
    /// Raw nodes; empty on construction.
    pub raw_nodes: Vec<RawNode>,
    /// The document tree; `None` on construction (see `tree` / `tree_mut`).
    pub tree: Option<DocumentTree>,
    /// Options this run was created with.
    pub options: PipelineOptions,
    /// LLM client, when one was supplied via `with_llm_client`.
    pub llm_client: Option<LlmClient>,
    /// Summary cache; starts as the default cache and may be replaced by
    /// `init_summary_cache`.
    pub summary_cache: SummaryCache,
    /// Reasoning index; `None` on construction.
    pub reasoning_index: Option<ReasoningIndex>,
    /// A tree supplied up front via `with_existing_tree`.
    pub existing_tree: Option<DocumentTree>,
    /// Results recorded per stage name via `record_stage`. Not carried over
    /// into the `PipelineResult` by `finalize`.
    pub stage_results: HashMap<String, StageResult>,
    /// Metrics for this run; default-initialised on construction.
    pub metrics: IndexMetrics,
    /// Optional document description; `None` on construction.
    pub description: Option<String>,
    /// Optional page count; `None` on construction.
    pub page_count: Option<usize>,
    /// Optional line count; `None` on construction.
    pub line_count: Option<usize>,
}

impl IndexContext {
    /// Creates a context for `input` with the given options.
    ///
    /// The document id is a new random UUID v4, the format defaults to
    /// Markdown, and every other field starts empty, `None`, or at its
    /// `Default` value.
    pub fn new(input: IndexInput, options: PipelineOptions) -> Self {
        let doc_id = uuid::Uuid::new_v4().to_string();
        IndexContext {
            doc_id,
            input,
            options,
            format: DocumentFormat::Markdown,
            name: String::new(),
            source_path: None,
            description: None,
            page_count: None,
            line_count: None,
            raw_nodes: Vec::new(),
            tree: None,
            existing_tree: None,
            reasoning_index: None,
            llm_client: None,
            summary_cache: SummaryCache::default(),
            stage_results: HashMap::new(),
            metrics: IndexMetrics::default(),
        }
    }

    /// Replaces the generated document id.
    pub fn with_doc_id(self, doc_id: impl Into<String>) -> Self {
        Self {
            doc_id: doc_id.into(),
            ..self
        }
    }

    /// Attaches an LLM client.
    pub fn with_llm_client(self, client: LlmClient) -> Self {
        Self {
            llm_client: Some(client),
            ..self
        }
    }

    /// Sets the document format.
    pub fn with_format(self, format: DocumentFormat) -> Self {
        Self { format, ..self }
    }

    /// Sets the document name.
    pub fn with_name(self, name: impl Into<String>) -> Self {
        Self {
            name: name.into(),
            ..self
        }
    }

    /// Sets the source path.
    pub fn with_source_path(self, path: impl Into<PathBuf>) -> Self {
        Self {
            source_path: Some(path.into()),
            ..self
        }
    }

    /// Supplies an already existing tree.
    pub fn with_existing_tree(self, tree: DocumentTree) -> Self {
        Self {
            existing_tree: Some(tree),
            ..self
        }
    }

    /// Replaces the summary cache with a fresh one when the configured
    /// summary strategy is `Lazy`, taking the persist flag from the strategy.
    ///
    /// For any other strategy the current cache is left untouched.
    pub fn init_summary_cache(&mut self) {
        let SummaryStrategy::Lazy { persist, .. } = self.options.summary_strategy else {
            return;
        };
        self.summary_cache = SummaryCache::new(persist);
    }

    /// Records `result` under the stage `name`, replacing any earlier entry
    /// with the same name.
    pub fn record_stage(&mut self, name: &str, result: StageResult) {
        let key = name.to_string();
        self.stage_results.insert(key, result);
    }

    /// Borrows the tree.
    ///
    /// # Errors
    ///
    /// Returns `"Tree not built"` while `tree` is `None`.
    pub fn tree(&self) -> Result<&DocumentTree, &'static str> {
        match &self.tree {
            Some(tree) => Ok(tree),
            None => Err("Tree not built"),
        }
    }

    /// Mutably borrows the tree.
    ///
    /// # Errors
    ///
    /// Returns `"Tree not built"` while `tree` is `None`.
    pub fn tree_mut(&mut self) -> Result<&mut DocumentTree, &'static str> {
        match &mut self.tree {
            Some(tree) => Ok(tree),
            None => Err("Tree not built"),
        }
    }

    /// Consumes the context and packages its outputs as a [`PipelineResult`].
    ///
    /// Fields that are not part of the result (input, raw nodes, options, LLM
    /// client, existing tree, stage results) are dropped.
    pub fn finalize(self) -> PipelineResult {
        let Self {
            doc_id,
            name,
            format,
            source_path,
            tree,
            description,
            page_count,
            line_count,
            metrics,
            summary_cache,
            reasoning_index,
            ..
        } = self;

        PipelineResult {
            doc_id,
            name,
            format,
            source_path,
            tree,
            description,
            page_count,
            line_count,
            metrics,
            summary_cache,
            reasoning_index,
        }
    }
}
/// Final output of an indexing pipeline run.
#[derive(Debug)]
pub struct PipelineResult {
    /// Identifier of the indexed document.
    pub doc_id: String,
    /// Document name.
    pub name: String,
    /// Document format.
    pub format: DocumentFormat,
    /// Source path, if one was recorded.
    pub source_path: Option<PathBuf>,
    /// The document tree, if one is present.
    pub tree: Option<DocumentTree>,
    /// Optional document description.
    pub description: Option<String>,
    /// Optional page count.
    pub page_count: Option<usize>,
    /// Optional line count.
    pub line_count: Option<usize>,
    /// Metrics collected for the run.
    pub metrics: IndexMetrics,
    /// Summary cache carried over from the run.
    pub summary_cache: SummaryCache,
    /// Reasoning index, if one is present.
    pub reasoning_index: Option<ReasoningIndex>,
}

impl PipelineResult {
    /// Returns `true` when the result carries a tree.
    pub fn has_tree(&self) -> bool {
        self.tree().is_some()
    }

    /// Borrows the tree, if present.
    pub fn tree(&self) -> Option<&DocumentTree> {
        self.tree.as_ref()
    }

    /// Sums the per-stage timings recorded in `metrics`, in milliseconds.
    ///
    /// The total covers the parse, build, validate, split, enhance, enrich,
    /// reasoning-index and optimize timings.
    pub fn total_time_ms(&self) -> u64 {
        let m = &self.metrics;
        [
            m.parse_time_ms,
            m.build_time_ms,
            m.validate_time_ms,
            m.split_time_ms,
            m.enhance_time_ms,
            m.enrich_time_ms,
            m.reasoning_index_time_ms,
            m.optimize_time_ms,
        ]
        .iter()
        .sum()
    }
}