// vectorless/index/pipeline/context.rs

// Copyright (c) 2026 vectorless developers
// SPDX-License-Identifier: Apache-2.0

//! Index context for passing data between stages.
5
6use std::collections::HashMap;
7use std::path::PathBuf;
8
9use crate::document::{DocumentTree, NodeId};
10use crate::llm::LlmClient;
11use crate::parser::{DocumentFormat, RawNode};
12
13use super::super::{PipelineOptions, SummaryStrategy};
14use super::metrics::IndexMetrics;
15
16/// Input for the index pipeline.
17#[derive(Debug, Clone)]
18pub enum IndexInput {
19    /// Index from file path.
20    File(PathBuf),
21
22    /// Index from raw content string.
23    Content {
24        /// Content string.
25        content: String,
26        /// Document name.
27        name: String,
28        /// Document format.
29        format: DocumentFormat,
30    },
31
32    /// Index from binary data.
33    Bytes {
34        /// Binary data.
35        data: Vec<u8>,
36        /// Document name.
37        name: String,
38        /// Document format.
39        format: DocumentFormat,
40    },
41}
42
43impl IndexInput {
44    /// Create input from file path.
45    pub fn file(path: impl Into<PathBuf>) -> Self {
46        Self::File(path.into())
47    }
48
49    /// Create input from content string.
50    pub fn content(content: impl Into<String>) -> Self {
51        Self::Content {
52            content: content.into(),
53            name: String::new(),
54            format: DocumentFormat::Markdown,
55        }
56    }
57
58    /// Create input from content with name and format.
59    pub fn content_with(
60        content: impl Into<String>,
61        name: impl Into<String>,
62        format: DocumentFormat,
63    ) -> Self {
64        Self::Content {
65            content: content.into(),
66            name: name.into(),
67            format,
68        }
69    }
70
71    /// Create input from binary data.
72    pub fn bytes(data: impl Into<Vec<u8>>) -> Self {
73        Self::Bytes {
74            data: data.into(),
75            name: String::new(),
76            format: DocumentFormat::Pdf,
77        }
78    }
79
80    /// Create input from binary data with name and format.
81    pub fn bytes_with(
82        data: impl Into<Vec<u8>>,
83        name: impl Into<String>,
84        format: DocumentFormat,
85    ) -> Self {
86        Self::Bytes {
87            data: data.into(),
88            name: name.into(),
89            format,
90        }
91    }
92
93    /// Check if this is a file input.
94    pub fn is_file(&self) -> bool {
95        matches!(self, Self::File(_))
96    }
97
98    /// Check if this is a content input.
99    pub fn is_content(&self) -> bool {
100        matches!(self, Self::Content { .. })
101    }
102
103    /// Check if this is a bytes input.
104    pub fn is_bytes(&self) -> bool {
105        matches!(self, Self::Bytes { .. })
106    }
107
108    /// Get the format if available.
109    pub fn format(&self) -> Option<DocumentFormat> {
110        match self {
111            Self::File(_) => None,
112            Self::Content { format, .. } => Some(*format),
113            Self::Bytes { format, .. } => Some(*format),
114        }
115    }
116}
117
118/// Result from a single stage execution.
119#[derive(Debug, Clone)]
120pub struct StageResult {
121    /// Whether the stage succeeded.
122    pub success: bool,
123
124    /// Duration in milliseconds.
125    pub duration_ms: u64,
126
127    /// Additional metadata.
128    pub metadata: HashMap<String, serde_json::Value>,
129}
130
131impl StageResult {
132    /// Create a successful result.
133    pub fn success(name: &str) -> Self {
134        println!("Stage '{}' completed successfully", name);
135
136        Self {
137            success: true,
138            duration_ms: 0,
139            metadata: HashMap::new(),
140        }
141    }
142
143    /// Create a failed result.
144    pub fn failure(name: &str, error: &str) -> Self {
145        println!("Stage '{}' failed: {}", name, error);
146
147        let mut metadata = HashMap::new();
148        metadata.insert(
149            "error".to_string(),
150            serde_json::Value::String(error.to_string()),
151        );
152        Self {
153            success: false,
154            duration_ms: 0,
155            metadata,
156        }
157    }
158
159    /// Set duration.
160    pub fn with_duration(mut self, ms: u64) -> Self {
161        self.duration_ms = ms;
162        self
163    }
164
165    /// Add metadata.
166    pub fn with_metadata(mut self, key: &str, value: serde_json::Value) -> Self {
167        self.metadata.insert(key.to_string(), value);
168        self
169    }
170}
171
172/// Summary cache for lazy generation.
173#[derive(Debug, Clone, Default)]
174pub struct SummaryCache {
175    /// Cached summaries: node_id -> summary.
176    summaries: HashMap<NodeId, String>,
177
178    /// Whether to persist to disk.
179    persist: bool,
180}
181
182impl SummaryCache {
183    /// Create a new cache.
184    pub fn new(persist: bool) -> Self {
185        Self {
186            summaries: HashMap::new(),
187            persist,
188        }
189    }
190
191    /// Get a cached summary.
192    pub fn get(&self, node_id: NodeId) -> Option<&str> {
193        self.summaries.get(&node_id).map(|s| s.as_str())
194    }
195
196    /// Store a summary.
197    pub fn put(&mut self, node_id: NodeId, summary: String) {
198        self.summaries.insert(node_id, summary);
199    }
200
201    /// Whether persistence is enabled.
202    pub fn should_persist(&self) -> bool {
203        self.persist
204    }
205
206    /// Get all cached summaries.
207    pub fn all(&self) -> &HashMap<NodeId, String> {
208        &self.summaries
209    }
210}
211
212/// Index context passed between stages.
213#[derive(Debug)]
214pub struct IndexContext {
215    /// Document ID.
216    pub doc_id: String,
217
218    /// Source input.
219    pub input: IndexInput,
220
221    /// Document format.
222    pub format: DocumentFormat,
223
224    /// Document name.
225    pub name: String,
226
227    /// Source file path (if from file).
228    pub source_path: Option<PathBuf>,
229
230    /// Parsed raw nodes.
231    pub raw_nodes: Vec<RawNode>,
232
233    /// Built document tree.
234    pub tree: Option<DocumentTree>,
235
236    /// Index options.
237    pub options: PipelineOptions,
238
239    /// LLM client for enhancement.
240    pub llm_client: Option<LlmClient>,
241
242    /// Summary cache for lazy generation.
243    pub summary_cache: SummaryCache,
244
245    /// Stage execution results.
246    pub stage_results: HashMap<String, StageResult>,
247
248    /// Performance metrics.
249    pub metrics: IndexMetrics,
250
251    /// Document description.
252    pub description: Option<String>,
253
254    /// Page count (for PDFs).
255    pub page_count: Option<usize>,
256
257    /// Line count.
258    pub line_count: Option<usize>,
259}
260
261impl IndexContext {
262    /// Create a new context from input.
263    pub fn new(input: IndexInput, options: PipelineOptions) -> Self {
264        Self {
265            doc_id: uuid::Uuid::new_v4().to_string(),
266            input,
267            format: DocumentFormat::Markdown,
268            name: String::new(),
269            source_path: None,
270            raw_nodes: Vec::new(),
271            tree: None,
272            options,
273            llm_client: None,
274            summary_cache: SummaryCache::default(),
275            stage_results: HashMap::new(),
276            metrics: IndexMetrics::default(),
277            description: None,
278            page_count: None,
279            line_count: None,
280        }
281    }
282
283    /// Set the document ID.
284    pub fn with_doc_id(mut self, doc_id: impl Into<String>) -> Self {
285        self.doc_id = doc_id.into();
286        self
287    }
288
289    /// Set the LLM client.
290    pub fn with_llm_client(mut self, client: LlmClient) -> Self {
291        self.llm_client = Some(client);
292        self
293    }
294
295    /// Set the document format.
296    pub fn with_format(mut self, format: DocumentFormat) -> Self {
297        self.format = format;
298        self
299    }
300
301    /// Set the document name.
302    pub fn with_name(mut self, name: impl Into<String>) -> Self {
303        self.name = name.into();
304        self
305    }
306
307    /// Set the source path.
308    pub fn with_source_path(mut self, path: impl Into<PathBuf>) -> Self {
309        self.source_path = Some(path.into());
310        self
311    }
312
313    /// Initialize summary cache based on strategy.
314    pub fn init_summary_cache(&mut self) {
315        if let SummaryStrategy::Lazy { persist, .. } = self.options.summary_strategy {
316            self.summary_cache = SummaryCache::new(persist);
317        }
318    }
319
320    /// Record a stage result.
321    pub fn record_stage(&mut self, name: &str, result: StageResult) {
322        self.stage_results.insert(name.to_string(), result);
323    }
324
325    /// Get the tree, returning an error if not built.
326    pub fn tree(&self) -> Result<&DocumentTree, &'static str> {
327        self.tree.as_ref().ok_or("Tree not built")
328    }
329
330    /// Get mutable tree, returning an error if not built.
331    pub fn tree_mut(&mut self) -> Result<&mut DocumentTree, &'static str> {
332        self.tree.as_mut().ok_or("Tree not built")
333    }
334
335    /// Finalize and build the result.
336    pub fn finalize(self) -> IndexResult {
337        IndexResult {
338            doc_id: self.doc_id,
339            name: self.name,
340            format: self.format,
341            source_path: self.source_path,
342            tree: self.tree,
343            description: self.description,
344            page_count: self.page_count,
345            line_count: self.line_count,
346            metrics: self.metrics,
347            summary_cache: self.summary_cache,
348        }
349    }
350}
351
352/// Final result from the index pipeline.
353#[derive(Debug)]
354pub struct IndexResult {
355    /// Document ID.
356    pub doc_id: String,
357
358    /// Document name.
359    pub name: String,
360
361    /// Document format.
362    pub format: DocumentFormat,
363
364    /// Source file path.
365    pub source_path: Option<PathBuf>,
366
367    /// Built document tree.
368    pub tree: Option<DocumentTree>,
369
370    /// Document description.
371    pub description: Option<String>,
372
373    /// Page count (for PDFs).
374    pub page_count: Option<usize>,
375
376    /// Line count.
377    pub line_count: Option<usize>,
378
379    /// Performance metrics.
380    pub metrics: IndexMetrics,
381
382    /// Summary cache.
383    pub summary_cache: SummaryCache,
384}
385
386impl IndexResult {
387    /// Check if the result has a tree.
388    pub fn has_tree(&self) -> bool {
389        self.tree.is_some()
390    }
391
392    /// Get the tree.
393    pub fn tree(&self) -> Option<&DocumentTree> {
394        self.tree.as_ref()
395    }
396
397    /// Get total indexing time in milliseconds.
398    pub fn total_time_ms(&self) -> u64 {
399        self.metrics.parse_time_ms
400            + self.metrics.build_time_ms
401            + self.metrics.enhance_time_ms
402            + self.metrics.enrich_time_ms
403            + self.metrics.optimize_time_ms
404            + self.metrics.persist_time_ms
405    }
406}