vectorless/index/pipeline/
context.rs1use std::collections::HashMap;
7use std::path::PathBuf;
8
9use crate::document::{DocumentTree, NodeId};
10use crate::llm::LlmClient;
11use crate::parser::{DocumentFormat, RawNode};
12
13use super::super::{PipelineOptions, SummaryStrategy};
14use super::metrics::IndexMetrics;
15
16#[derive(Debug, Clone)]
18pub enum IndexInput {
19 File(PathBuf),
21
22 Content {
24 content: String,
26 name: String,
28 format: DocumentFormat,
30 },
31
32 Bytes {
34 data: Vec<u8>,
36 name: String,
38 format: DocumentFormat,
40 },
41}
42
43impl IndexInput {
44 pub fn file(path: impl Into<PathBuf>) -> Self {
46 Self::File(path.into())
47 }
48
49 pub fn content(content: impl Into<String>) -> Self {
51 Self::Content {
52 content: content.into(),
53 name: String::new(),
54 format: DocumentFormat::Markdown,
55 }
56 }
57
58 pub fn content_with(
60 content: impl Into<String>,
61 name: impl Into<String>,
62 format: DocumentFormat,
63 ) -> Self {
64 Self::Content {
65 content: content.into(),
66 name: name.into(),
67 format,
68 }
69 }
70
71 pub fn bytes(data: impl Into<Vec<u8>>) -> Self {
73 Self::Bytes {
74 data: data.into(),
75 name: String::new(),
76 format: DocumentFormat::Pdf,
77 }
78 }
79
80 pub fn bytes_with(
82 data: impl Into<Vec<u8>>,
83 name: impl Into<String>,
84 format: DocumentFormat,
85 ) -> Self {
86 Self::Bytes {
87 data: data.into(),
88 name: name.into(),
89 format,
90 }
91 }
92
93 pub fn is_file(&self) -> bool {
95 matches!(self, Self::File(_))
96 }
97
98 pub fn is_content(&self) -> bool {
100 matches!(self, Self::Content { .. })
101 }
102
103 pub fn is_bytes(&self) -> bool {
105 matches!(self, Self::Bytes { .. })
106 }
107
108 pub fn format(&self) -> Option<DocumentFormat> {
110 match self {
111 Self::File(_) => None,
112 Self::Content { format, .. } => Some(*format),
113 Self::Bytes { format, .. } => Some(*format),
114 }
115 }
116}
117
118#[derive(Debug, Clone)]
120pub struct StageResult {
121 pub success: bool,
123
124 pub duration_ms: u64,
126
127 pub metadata: HashMap<String, serde_json::Value>,
129}
130
131impl StageResult {
132 pub fn success(name: &str) -> Self {
134 println!("Stage '{}' completed successfully", name);
135
136 Self {
137 success: true,
138 duration_ms: 0,
139 metadata: HashMap::new(),
140 }
141 }
142
143 pub fn failure(name: &str, error: &str) -> Self {
145 println!("Stage '{}' failed: {}", name, error);
146
147 let mut metadata = HashMap::new();
148 metadata.insert(
149 "error".to_string(),
150 serde_json::Value::String(error.to_string()),
151 );
152 Self {
153 success: false,
154 duration_ms: 0,
155 metadata,
156 }
157 }
158
159 pub fn with_duration(mut self, ms: u64) -> Self {
161 self.duration_ms = ms;
162 self
163 }
164
165 pub fn with_metadata(mut self, key: &str, value: serde_json::Value) -> Self {
167 self.metadata.insert(key.to_string(), value);
168 self
169 }
170}
171
172#[derive(Debug, Clone, Default)]
174pub struct SummaryCache {
175 summaries: HashMap<NodeId, String>,
177
178 persist: bool,
180}
181
182impl SummaryCache {
183 pub fn new(persist: bool) -> Self {
185 Self {
186 summaries: HashMap::new(),
187 persist,
188 }
189 }
190
191 pub fn get(&self, node_id: NodeId) -> Option<&str> {
193 self.summaries.get(&node_id).map(|s| s.as_str())
194 }
195
196 pub fn put(&mut self, node_id: NodeId, summary: String) {
198 self.summaries.insert(node_id, summary);
199 }
200
201 pub fn should_persist(&self) -> bool {
203 self.persist
204 }
205
206 pub fn all(&self) -> &HashMap<NodeId, String> {
208 &self.summaries
209 }
210}
211
212#[derive(Debug)]
214pub struct IndexContext {
215 pub doc_id: String,
217
218 pub input: IndexInput,
220
221 pub format: DocumentFormat,
223
224 pub name: String,
226
227 pub source_path: Option<PathBuf>,
229
230 pub raw_nodes: Vec<RawNode>,
232
233 pub tree: Option<DocumentTree>,
235
236 pub options: PipelineOptions,
238
239 pub llm_client: Option<LlmClient>,
241
242 pub summary_cache: SummaryCache,
244
245 pub stage_results: HashMap<String, StageResult>,
247
248 pub metrics: IndexMetrics,
250
251 pub description: Option<String>,
253
254 pub page_count: Option<usize>,
256
257 pub line_count: Option<usize>,
259}
260
261impl IndexContext {
262 pub fn new(input: IndexInput, options: PipelineOptions) -> Self {
264 Self {
265 doc_id: uuid::Uuid::new_v4().to_string(),
266 input,
267 format: DocumentFormat::Markdown,
268 name: String::new(),
269 source_path: None,
270 raw_nodes: Vec::new(),
271 tree: None,
272 options,
273 llm_client: None,
274 summary_cache: SummaryCache::default(),
275 stage_results: HashMap::new(),
276 metrics: IndexMetrics::default(),
277 description: None,
278 page_count: None,
279 line_count: None,
280 }
281 }
282
283 pub fn with_doc_id(mut self, doc_id: impl Into<String>) -> Self {
285 self.doc_id = doc_id.into();
286 self
287 }
288
289 pub fn with_llm_client(mut self, client: LlmClient) -> Self {
291 self.llm_client = Some(client);
292 self
293 }
294
295 pub fn with_format(mut self, format: DocumentFormat) -> Self {
297 self.format = format;
298 self
299 }
300
301 pub fn with_name(mut self, name: impl Into<String>) -> Self {
303 self.name = name.into();
304 self
305 }
306
307 pub fn with_source_path(mut self, path: impl Into<PathBuf>) -> Self {
309 self.source_path = Some(path.into());
310 self
311 }
312
313 pub fn init_summary_cache(&mut self) {
315 if let SummaryStrategy::Lazy { persist, .. } = self.options.summary_strategy {
316 self.summary_cache = SummaryCache::new(persist);
317 }
318 }
319
320 pub fn record_stage(&mut self, name: &str, result: StageResult) {
322 self.stage_results.insert(name.to_string(), result);
323 }
324
325 pub fn tree(&self) -> Result<&DocumentTree, &'static str> {
327 self.tree.as_ref().ok_or("Tree not built")
328 }
329
330 pub fn tree_mut(&mut self) -> Result<&mut DocumentTree, &'static str> {
332 self.tree.as_mut().ok_or("Tree not built")
333 }
334
335 pub fn finalize(self) -> IndexResult {
337 IndexResult {
338 doc_id: self.doc_id,
339 name: self.name,
340 format: self.format,
341 source_path: self.source_path,
342 tree: self.tree,
343 description: self.description,
344 page_count: self.page_count,
345 line_count: self.line_count,
346 metrics: self.metrics,
347 summary_cache: self.summary_cache,
348 }
349 }
350}
351
352#[derive(Debug)]
354pub struct IndexResult {
355 pub doc_id: String,
357
358 pub name: String,
360
361 pub format: DocumentFormat,
363
364 pub source_path: Option<PathBuf>,
366
367 pub tree: Option<DocumentTree>,
369
370 pub description: Option<String>,
372
373 pub page_count: Option<usize>,
375
376 pub line_count: Option<usize>,
378
379 pub metrics: IndexMetrics,
381
382 pub summary_cache: SummaryCache,
384}
385
386impl IndexResult {
387 pub fn has_tree(&self) -> bool {
389 self.tree.is_some()
390 }
391
392 pub fn tree(&self) -> Option<&DocumentTree> {
394 self.tree.as_ref()
395 }
396
397 pub fn total_time_ms(&self) -> u64 {
399 self.metrics.parse_time_ms
400 + self.metrics.build_time_ms
401 + self.metrics.enhance_time_ms
402 + self.metrics.enrich_time_ms
403 + self.metrics.optimize_time_ms
404 + self.metrics.persist_time_ms
405 }
406}