Skip to main content

vectorless/index/pipeline/
context.rs

// Copyright (c) 2026 vectorless developers
// SPDX-License-Identifier: Apache-2.0

//! Index context for passing data between stages.
5
6use std::collections::HashMap;
7use std::path::PathBuf;
8
9use crate::document::{DocumentTree, NodeId};
10use crate::llm::LlmClient;
11use crate::parser::{DocumentFormat, RawNode};
12
13use super::super::{PipelineOptions, SummaryStrategy};
14use super::metrics::IndexMetrics;
15
16/// Input for the index pipeline.
17#[derive(Debug, Clone)]
18pub enum IndexInput {
19    /// Index from file path.
20    File(PathBuf),
21
22    /// Index from raw content.
23    Content {
24        /// Content string.
25        content: String,
26        /// Document name.
27        name: String,
28        /// Document format.
29        format: DocumentFormat,
30    },
31}
32
33impl IndexInput {
34    /// Create input from file path.
35    pub fn file(path: impl Into<PathBuf>) -> Self {
36        Self::File(path.into())
37    }
38
39    /// Create input from content.
40    pub fn content(
41        content: impl Into<String>,
42        name: impl Into<String>,
43        format: DocumentFormat,
44    ) -> Self {
45        Self::Content {
46            content: content.into(),
47            name: name.into(),
48            format,
49        }
50    }
51}
52
53/// Result from a single stage execution.
54#[derive(Debug, Clone)]
55pub struct StageResult {
56    /// Whether the stage succeeded.
57    pub success: bool,
58
59    /// Duration in milliseconds.
60    pub duration_ms: u64,
61
62    /// Additional metadata.
63    pub metadata: HashMap<String, serde_json::Value>,
64}
65
66impl StageResult {
67    /// Create a successful result.
68    pub fn success(name: &str) -> Self {
69        println!("Stage '{}' completed successfully", name);
70
71        Self {
72            success: true,
73            duration_ms: 0,
74            metadata: HashMap::new(),
75        }
76    }
77
78    /// Create a failed result.
79    pub fn failure(name: &str, error: &str) -> Self {
80        println!("Stage '{}' failed: {}", name, error);
81
82        let mut metadata = HashMap::new();
83        metadata.insert(
84            "error".to_string(),
85            serde_json::Value::String(error.to_string()),
86        );
87        Self {
88            success: false,
89            duration_ms: 0,
90            metadata,
91        }
92    }
93
94    /// Set duration.
95    pub fn with_duration(mut self, ms: u64) -> Self {
96        self.duration_ms = ms;
97        self
98    }
99
100    /// Add metadata.
101    pub fn with_metadata(mut self, key: &str, value: serde_json::Value) -> Self {
102        self.metadata.insert(key.to_string(), value);
103        self
104    }
105}
106
107/// Summary cache for lazy generation.
108#[derive(Debug, Clone, Default)]
109pub struct SummaryCache {
110    /// Cached summaries: node_id -> summary.
111    summaries: HashMap<NodeId, String>,
112
113    /// Whether to persist to disk.
114    persist: bool,
115}
116
117impl SummaryCache {
118    /// Create a new cache.
119    pub fn new(persist: bool) -> Self {
120        Self {
121            summaries: HashMap::new(),
122            persist,
123        }
124    }
125
126    /// Get a cached summary.
127    pub fn get(&self, node_id: NodeId) -> Option<&str> {
128        self.summaries.get(&node_id).map(|s| s.as_str())
129    }
130
131    /// Store a summary.
132    pub fn put(&mut self, node_id: NodeId, summary: String) {
133        self.summaries.insert(node_id, summary);
134    }
135
136    /// Whether persistence is enabled.
137    pub fn should_persist(&self) -> bool {
138        self.persist
139    }
140
141    /// Get all cached summaries.
142    pub fn all(&self) -> &HashMap<NodeId, String> {
143        &self.summaries
144    }
145}
146
147/// Index context passed between stages.
148#[derive(Debug)]
149pub struct IndexContext {
150    /// Document ID.
151    pub doc_id: String,
152
153    /// Source input.
154    pub input: IndexInput,
155
156    /// Document format.
157    pub format: DocumentFormat,
158
159    /// Document name.
160    pub name: String,
161
162    /// Source file path (if from file).
163    pub source_path: Option<PathBuf>,
164
165    /// Parsed raw nodes.
166    pub raw_nodes: Vec<RawNode>,
167
168    /// Built document tree.
169    pub tree: Option<DocumentTree>,
170
171    /// Index options.
172    pub options: PipelineOptions,
173
174    /// LLM client for enhancement.
175    pub llm_client: Option<LlmClient>,
176
177    /// Summary cache for lazy generation.
178    pub summary_cache: SummaryCache,
179
180    /// Stage execution results.
181    pub stage_results: HashMap<String, StageResult>,
182
183    /// Performance metrics.
184    pub metrics: IndexMetrics,
185
186    /// Document description.
187    pub description: Option<String>,
188
189    /// Page count (for PDFs).
190    pub page_count: Option<usize>,
191
192    /// Line count.
193    pub line_count: Option<usize>,
194}
195
196impl IndexContext {
197    /// Create a new context from input.
198    pub fn new(input: IndexInput, options: PipelineOptions) -> Self {
199        Self {
200            doc_id: uuid::Uuid::new_v4().to_string(),
201            input,
202            format: DocumentFormat::Markdown,
203            name: String::new(),
204            source_path: None,
205            raw_nodes: Vec::new(),
206            tree: None,
207            options,
208            llm_client: None,
209            summary_cache: SummaryCache::default(),
210            stage_results: HashMap::new(),
211            metrics: IndexMetrics::default(),
212            description: None,
213            page_count: None,
214            line_count: None,
215        }
216    }
217
218    /// Set the document ID.
219    pub fn with_doc_id(mut self, doc_id: impl Into<String>) -> Self {
220        self.doc_id = doc_id.into();
221        self
222    }
223
224    /// Set the LLM client.
225    pub fn with_llm_client(mut self, client: LlmClient) -> Self {
226        self.llm_client = Some(client);
227        self
228    }
229
230    /// Set the document format.
231    pub fn with_format(mut self, format: DocumentFormat) -> Self {
232        self.format = format;
233        self
234    }
235
236    /// Set the document name.
237    pub fn with_name(mut self, name: impl Into<String>) -> Self {
238        self.name = name.into();
239        self
240    }
241
242    /// Set the source path.
243    pub fn with_source_path(mut self, path: impl Into<PathBuf>) -> Self {
244        self.source_path = Some(path.into());
245        self
246    }
247
248    /// Initialize summary cache based on strategy.
249    pub fn init_summary_cache(&mut self) {
250        if let SummaryStrategy::Lazy { persist, .. } = self.options.summary_strategy {
251            self.summary_cache = SummaryCache::new(persist);
252        }
253    }
254
255    /// Record a stage result.
256    pub fn record_stage(&mut self, name: &str, result: StageResult) {
257        self.stage_results.insert(name.to_string(), result);
258    }
259
260    /// Get the tree, returning an error if not built.
261    pub fn tree(&self) -> Result<&DocumentTree, &'static str> {
262        self.tree.as_ref().ok_or("Tree not built")
263    }
264
265    /// Get mutable tree, returning an error if not built.
266    pub fn tree_mut(&mut self) -> Result<&mut DocumentTree, &'static str> {
267        self.tree.as_mut().ok_or("Tree not built")
268    }
269
270    /// Finalize and build the result.
271    pub fn finalize(self) -> IndexResult {
272        IndexResult {
273            doc_id: self.doc_id,
274            name: self.name,
275            format: self.format,
276            source_path: self.source_path,
277            tree: self.tree,
278            description: self.description,
279            page_count: self.page_count,
280            line_count: self.line_count,
281            metrics: self.metrics,
282            summary_cache: self.summary_cache,
283        }
284    }
285}
286
287/// Final result from the index pipeline.
288#[derive(Debug)]
289pub struct IndexResult {
290    /// Document ID.
291    pub doc_id: String,
292
293    /// Document name.
294    pub name: String,
295
296    /// Document format.
297    pub format: DocumentFormat,
298
299    /// Source file path.
300    pub source_path: Option<PathBuf>,
301
302    /// Built document tree.
303    pub tree: Option<DocumentTree>,
304
305    /// Document description.
306    pub description: Option<String>,
307
308    /// Page count (for PDFs).
309    pub page_count: Option<usize>,
310
311    /// Line count.
312    pub line_count: Option<usize>,
313
314    /// Performance metrics.
315    pub metrics: IndexMetrics,
316
317    /// Summary cache.
318    pub summary_cache: SummaryCache,
319}
320
321impl IndexResult {
322    /// Check if the result has a tree.
323    pub fn has_tree(&self) -> bool {
324        self.tree.is_some()
325    }
326
327    /// Get the tree.
328    pub fn tree(&self) -> Option<&DocumentTree> {
329        self.tree.as_ref()
330    }
331
332    /// Get total indexing time in milliseconds.
333    pub fn total_time_ms(&self) -> u64 {
334        self.metrics.parse_time_ms
335            + self.metrics.build_time_ms
336            + self.metrics.enhance_time_ms
337            + self.metrics.enrich_time_ms
338            + self.metrics.optimize_time_ms
339            + self.metrics.persist_time_ms
340    }
341}