vectorless 0.1.25

Hierarchical, reasoning-native document intelligence engine
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
// Copyright (c) 2026 vectorless developers
// SPDX-License-Identifier: Apache-2.0

//! TOC processor - integrates all TOC processing components.
//!
//! The processor orchestrates a multi-mode extraction pipeline with automatic
//! degradation: if one mode fails verification, it falls back to a lower-quality
//! but more reliable mode.

use futures::future::join_all;
use tracing::{debug, info, warn};

use crate::error::Result;
use crate::index::parse::pdf::PdfPage;
use crate::llm::LlmClient;

use super::assigner::{PageAssigner, PageAssignerConfig};
use super::detector::{TocDetector, TocDetectorConfig};
use super::parser::{TocParser, TocParserConfig};
use super::repairer::{IndexRepairer, RepairerConfig};
use super::structure_extractor::{StructureExtractor, StructureExtractorConfig};
use super::types::{ProcessingMode, TocEntry, VerificationReport};
use super::verifier::{IndexVerifier, VerifierConfig};

/// TOC processor configuration.
///
/// Bundles the configuration for every pipeline stage plus the thresholds
/// that drive mode degradation, repair retries, and entry refinement.
#[derive(Debug, Clone)]
pub struct TocProcessorConfig {
    /// TOC detector configuration.
    pub detector: TocDetectorConfig,

    /// TOC parser configuration.
    pub parser: TocParserConfig,

    /// Page assigner configuration.
    pub assigner: PageAssignerConfig,

    /// Verifier configuration.
    pub verifier: VerifierConfig,

    /// Repairer configuration.
    pub repairer: RepairerConfig,

    /// Accuracy threshold for acceptance (0.0 - 1.0).
    /// Verification accuracy below this value triggers degradation to the
    /// next (lower-quality) processing mode.
    pub accuracy_threshold: f32,

    /// Maximum repair attempts per verification cycle.
    /// Bounds the verify → repair loop in `verify_and_repair`.
    pub max_repair_attempts: usize,

    /// Maximum page span for a single entry before recursive refinement.
    pub max_pages_per_entry: usize,

    /// Maximum estimated tokens for a single entry before recursive refinement.
    pub max_tokens_per_entry: usize,
}

impl Default for TocProcessorConfig {
    /// Defaults: a 60% accuracy bar, up to 3 repair rounds, and refinement
    /// of entries exceeding 30 pages / ~20k tokens. Stage configurations
    /// all fall back to their own defaults.
    fn default() -> Self {
        Self {
            accuracy_threshold: 0.6,
            max_repair_attempts: 3,
            max_pages_per_entry: 30,
            max_tokens_per_entry: 20_000,
            detector: TocDetectorConfig::default(),
            parser: TocParserConfig::default(),
            assigner: PageAssignerConfig::default(),
            verifier: VerifierConfig::default(),
            repairer: RepairerConfig::default(),
        }
    }
}

/// TOC processor - orchestrates the complete TOC extraction pipeline.
///
/// # Processing Pipeline
///
/// 1. **Detect** - Find TOC in document (regex + LLM fallback)
/// 2. **Extract** - Get TOC text from detected pages
/// 3. **Parse** - Convert TOC text to structured entries (LLM)
/// 4. **Assign** - Map TOC pages to physical pages
/// 5. **Verify** - Sample verification of page assignments
/// 6. **Repair** - Fix incorrect assignments (if needed)
/// 7. **Refine** - Sub-divide oversized entries (if needed)
///
/// # Degradation Strategy
///
/// The pipeline tries three modes in order of quality:
///
/// 1. `TocWithPageNumbers` - TOC found with page numbers (offset calculation)
/// 2. `TocWithoutPageNumbers` - TOC found without page numbers (LLM positioning)
/// 3. `NoToc` - No TOC available (LLM structure extraction from content)
///
/// If a mode fails verification (accuracy < threshold), it automatically
/// degrades to the next mode.
///
/// # Example
///
/// ```rust,no_run
/// use vectorless::parser::toc::TocProcessor;
/// use vectorless::parser::pdf::PdfParser;
///
/// # #[tokio::main]
/// # async fn main() -> vectorless::Result<()> {
/// let pdf_parser = PdfParser::new();
/// let result = pdf_parser.parse_file("document.pdf".as_ref()).await?;
///
/// let processor = TocProcessor::new();
/// let entries = processor.process(&result.pages).await?;
///
/// for entry in &entries {
///     println!("{} - Page {:?}", entry.title, entry.physical_page);
/// }
/// # Ok(())
/// # }
/// ```
pub struct TocProcessor {
    /// Pipeline-wide configuration (thresholds and per-stage configs).
    config: TocProcessorConfig,
    /// Stage 1: locates the TOC pages.
    detector: TocDetector,
    /// Stage 3: converts TOC text into structured entries.
    parser: TocParser,
    /// Stage 4: maps TOC page numbers to physical pages.
    assigner: PageAssigner,
    /// Stage 5: sample-verifies page assignments.
    verifier: IndexVerifier,
    /// Stage 6: repairs incorrect assignments.
    repairer: IndexRepairer,
    /// Optional LLM client for StructureExtractor (no-TOC mode and refinement).
    llm_client: Option<LlmClient>,
}

impl TocProcessor {
    /// Create a new TOC processor with default configuration.
    ///
    /// Shorthand for [`Self::with_config`] with [`TocProcessorConfig::default`].
    pub fn new() -> Self {
        Self::with_config(Default::default())
    }

    /// Create a TOC processor with an externally provided LLM client.
    ///
    /// All sub-components (detector, parser, assigner, verifier, repairer)
    /// share this client instead of each creating its own from default config.
    pub fn with_llm_client(client: LlmClient) -> Self {
        info!("TocProcessor: created with external LLM client");
        let config = TocProcessorConfig::default();

        // Build each stage against the shared client; only the detector also
        // needs its stage configuration up front.
        let detector = TocDetector::with_client(config.detector.clone(), client.clone());
        let parser = TocParser::with_client(client.clone());
        let assigner = PageAssigner::with_client(client.clone());
        let verifier = IndexVerifier::with_client(client.clone());
        let repairer = IndexRepairer::with_client(client.clone());

        Self {
            config,
            detector,
            parser,
            assigner,
            verifier,
            repairer,
            // Keep a handle for StructureExtractor (no-TOC mode, refinement).
            llm_client: Some(client),
        }
    }

    /// Create a TOC processor with custom configuration.
    ///
    /// Each stage is constructed from its slice of `config`; no external
    /// LLM client is attached.
    pub fn with_config(config: TocProcessorConfig) -> Self {
        info!("TocProcessor: created with config (no external LLM client)");

        let detector = TocDetector::new(config.detector.clone());
        let parser = TocParser::new(config.parser.clone());
        let assigner = PageAssigner::new(config.assigner.clone());
        let verifier = IndexVerifier::new(config.verifier.clone());
        let repairer = IndexRepairer::new(config.repairer.clone());

        Self {
            config,
            detector,
            parser,
            assigner,
            verifier,
            repairer,
            llm_client: None,
        }
    }

    /// Process PDF pages and extract hierarchical structure.
    ///
    /// This is the main entry point. It detects TOC, selects the best
    /// processing mode, and automatically degrades if needed.
    pub async fn process(&self, pages: &[PdfPage]) -> Result<Vec<TocEntry>> {
        if pages.is_empty() {
            return Ok(Vec::new());
        }

        info!("Processing {} pages for TOC extraction", pages.len());

        // Step 1: locate the TOC (or establish that there is none).
        let detection = self.detector.detect(pages).await?;

        // Step 2: pick the highest-quality mode the detection supports.
        let initial_mode = match (detection.found, detection.has_page_numbers) {
            (false, _) => {
                info!("No TOC found in document");
                ProcessingMode::NoToc
            }
            (true, true) => {
                info!(
                    "TOC found on pages {:?}, has page numbers",
                    detection.pages
                );
                ProcessingMode::TocWithPageNumbers
            }
            (true, false) => {
                info!(
                    "TOC found on pages {:?}, no page numbers",
                    detection.pages
                );
                ProcessingMode::TocWithoutPageNumbers
            }
        };

        // Step 3: extract, degrading through modes as needed.
        let entries = self
            .process_with_degradation(initial_mode, &detection, pages)
            .await?;

        // Step 4: break down entries that span too much content.
        self.refine_large_entries(entries, pages).await
    }

    /// Process with automatic mode degradation.
    ///
    /// Tries the given mode, verifies the result, and degrades to a
    /// lower-quality mode if accuracy is below threshold.
    ///
    /// Termination: every failure path either returns or replaces `mode`
    /// via `mode.degrade()` (which eventually yields `None`), and `NoToc`
    /// returns unconditionally — so the loop is finite.
    async fn process_with_degradation(
        &self,
        initial_mode: ProcessingMode,
        detection: &super::types::TocDetection,
        pages: &[PdfPage],
    ) -> Result<Vec<TocEntry>> {
        let mut mode = initial_mode;

        loop {
            info!("Attempting extraction with mode {:?}", mode);

            let result = match mode {
                ProcessingMode::TocWithPageNumbers => {
                    self.process_toc_with_page_numbers(detection, pages).await
                }
                ProcessingMode::TocWithoutPageNumbers => {
                    self.process_toc_without_page_numbers(detection, pages).await
                }
                ProcessingMode::NoToc => {
                    // NoToc always succeeds (produces some structure)
                    return self.process_without_toc(pages).await;
                }
            };

            match result {
                Ok(entries) if !entries.is_empty() => {
                    // Verify the entries (repairing them in place as needed)
                    let mut mutable_entries = entries;
                    let report = self
                        .verify_and_repair(&mut mutable_entries, pages)
                        .await?;

                    if report.accuracy >= self.config.accuracy_threshold {
                        info!(
                            "Mode {:?} succeeded: {} entries, accuracy {:.1}%",
                            mode,
                            mutable_entries.len(),
                            report.accuracy * 100.0
                        );
                        return Ok(mutable_entries);
                    }

                    // Accuracy too low, try degrading
                    warn!(
                        "Mode {:?} accuracy {:.1}% below threshold {:.1}%",
                        mode,
                        report.accuracy * 100.0,
                        self.config.accuracy_threshold * 100.0
                    );

                    match mode.degrade() {
                        Some(next) => {
                            info!("Degrading from {:?} to {:?}", mode, next);
                            mode = next;
                            // Continue loop with degraded mode
                        }
                        None => {
                            // Exhausted all modes: the low-accuracy entries
                            // are still better than nothing.
                            warn!("No further degradation possible, returning best effort");
                            return Ok(mutable_entries);
                        }
                    }
                }
                Ok(_) => {
                    // Empty entries, degrade
                    warn!("Mode {:?} produced no entries", mode);
                    match mode.degrade() {
                        Some(next) => {
                            mode = next;
                        }
                        None => return Ok(Vec::new()),
                    }
                }
                Err(e) => {
                    // Hard failure: degrade if possible, otherwise surface
                    // the final mode's error to the caller.
                    warn!("Mode {:?} failed: {}", mode, e);
                    match mode.degrade() {
                        Some(next) => {
                            mode = next;
                        }
                        None => return Err(e),
                    }
                }
            }
        }
    }

    /// Mode 1: TOC with page numbers.
    ///
    /// Parse the TOC, calculate physical-page offset from anchor entries,
    /// and apply the offset to all entries.
    ///
    /// Returns an empty vector (a soft failure the caller may degrade on)
    /// when the TOC text is blank or parsing yields no entries.
    async fn process_toc_with_page_numbers(
        &self,
        detection: &super::types::TocDetection,
        pages: &[PdfPage],
    ) -> Result<Vec<TocEntry>> {
        let toc_text = self.extract_toc_text(pages, &detection.pages);
        if toc_text.trim().is_empty() {
            return Ok(Vec::new());
        }

        let mut entries = self.parser.parse(&toc_text).await?;
        if !entries.is_empty() {
            // Assign physical pages using offset calculation.
            self.assigner.assign(&mut entries, pages).await?;
        }
        Ok(entries)
    }

    /// Mode 2: TOC without page numbers.
    ///
    /// Parse the TOC, then use LLM to locate each entry in the document.
    ///
    /// Returns an empty vector (a soft failure the caller may degrade on)
    /// when the TOC text is blank or parsing yields no entries.
    async fn process_toc_without_page_numbers(
        &self,
        detection: &super::types::TocDetection,
        pages: &[PdfPage],
    ) -> Result<Vec<TocEntry>> {
        let toc_text = self.extract_toc_text(pages, &detection.pages);
        if toc_text.trim().is_empty() {
            return Ok(Vec::new());
        }

        let mut entries = self.parser.parse(&toc_text).await?;
        if !entries.is_empty() {
            // TOC page numbers are unreliable in this mode: wipe them so the
            // assigner falls back to LLM positioning.
            entries.iter_mut().for_each(|entry| entry.toc_page = None);
            self.assigner.assign(&mut entries, pages).await?;
        }
        Ok(entries)
    }

    /// Mode 3: No TOC available.
    ///
    /// Extract document structure directly from page content using LLM.
    async fn process_without_toc(&self, pages: &[PdfPage]) -> Result<Vec<TocEntry>> {
        info!("Extracting structure from page content (no TOC available)");

        // Reuse the injected LLM client when present; otherwise let the
        // extractor build its own from its default configuration.
        let cfg = StructureExtractorConfig::default();
        let extractor = if let Some(client) = &self.llm_client {
            StructureExtractor::with_client(cfg, client.clone())
        } else {
            StructureExtractor::new(cfg)
        };
        extractor.extract(pages).await
    }

    /// Extract TOC text from pages.
    ///
    /// `toc_pages` holds 1-based page numbers that are mapped to indices in
    /// `pages`. Out-of-range numbers — including 0 — are silently skipped.
    /// The surviving page texts are joined with blank lines.
    fn extract_toc_text(&self, pages: &[PdfPage], toc_pages: &[usize]) -> String {
        toc_pages
            .iter()
            // checked_sub guards against page number 0: a plain `page_num - 1`
            // panics in debug builds and wraps in release builds.
            .filter_map(|&page_num| page_num.checked_sub(1).and_then(|i| pages.get(i)))
            .map(|page| page.text.as_str())
            .collect::<Vec<_>>()
            .join("\n\n")
    }

    /// Verify entries and repair if needed.
    ///
    /// Runs up to `max_repair_attempts` verify → repair cycles, mutating
    /// `entries` in place. Returns early with the current report when
    /// accuracy passes the threshold, when there are no concrete errors
    /// to repair, or when the repairer makes no changes. If all attempts
    /// are used, a final re-verification produces the returned report.
    async fn verify_and_repair(
        &self,
        entries: &mut [TocEntry],
        pages: &[PdfPage],
    ) -> Result<VerificationReport> {
        let mut attempts = 0;

        while attempts < self.config.max_repair_attempts {
            let report = self.verifier.verify(entries, pages).await?;

            // Accuracy is acceptable — stop early.
            if report.accuracy >= self.config.accuracy_threshold {
                debug!(
                    "Verification passed: accuracy {:.1}%",
                    report.accuracy * 100.0
                );
                return Ok(report);
            }

            // Low accuracy but nothing concrete to repair — give up.
            if report.errors.is_empty() {
                return Ok(report);
            }

            let repaired = self.repairer.repair(entries, &report.errors, pages).await?;

            // Repairer changed nothing; further cycles would be identical.
            if repaired == 0 {
                debug!("No repairs possible");
                return Ok(report);
            }

            attempts += 1;
            debug!("Repair attempt {} complete", attempts);
        }

        // Attempts exhausted: report accuracy after the final round of repairs.
        self.verifier.verify(entries, pages).await
    }

    /// Refine oversized entries by extracting sub-structure.
    ///
    /// Entries that span too many pages or tokens are broken down using
    /// the same structure extraction approach used for no-TOC documents.
    /// Extractions for all oversized entries run concurrently; an entry
    /// whose extraction fails (or yields nothing) is kept unchanged.
    ///
    /// NOTE(review): the filter below requires *both* thresholds to be
    /// exceeded (`&&`), while this doc comment says "or" — confirm which
    /// is intended.
    async fn refine_large_entries(
        &self,
        entries: Vec<TocEntry>,
        pages: &[PdfPage],
    ) -> Result<Vec<TocEntry>> {
        if entries.is_empty() {
            return Ok(entries);
        }

        let page_count = pages.len();

        // Pre-compute next-entry page numbers and classify entries
        // (next_pages[i] = entry i+1's physical page, None for the last entry).
        let next_pages: Vec<Option<usize>> = entries
            .iter()
            .enumerate()
            .map(|(i, _)| entries.get(i + 1).and_then(|e| e.physical_page))
            .collect();

        // Identify oversized entries and launch extractions concurrently
        let llm_client = self.llm_client.clone();
        let oversized_futures: Vec<_> = entries
            .iter()
            .enumerate()
            .filter(|(i, entry)| {
                let span = entry_page_span(entry, next_pages[*i], page_count);
                let tokens = entry_token_count(entry, pages);
                span > self.config.max_pages_per_entry
                    && tokens > self.config.max_tokens_per_entry
            })
            .map(|(i, entry)| {
                // Slice out the entry's page range. `end` is the *next* entry's
                // start page, so the boundary page appears in both ranges —
                // presumably an acceptable overlap; verify if exact splits matter.
                let start = entry.physical_page.unwrap_or(1);
                let end = next_pages[i].unwrap_or(page_count);
                let sub_pages: Vec<PdfPage> = pages
                    .iter()
                    .filter(|p| p.number >= start && p.number <= end)
                    .cloned()
                    .collect();

                // Clone everything the async block needs so it owns its data.
                let entry_title = entry.title.clone();
                let entry_level = entry.level;
                let llm_client = llm_client.clone();

                async move {
                    if sub_pages.is_empty() {
                        return (i, Vec::new());
                    }
                    debug!(
                        "Refining oversized entry '{}' (pages {}-{})",
                        entry_title, start, end
                    );
                    let extractor = match &llm_client {
                        Some(client) => StructureExtractor::with_client(
                            StructureExtractorConfig::default(),
                            client.clone(),
                        ),
                        None => StructureExtractor::new(StructureExtractorConfig::default()),
                    };
                    match extractor.extract(&sub_pages).await {
                        Ok(sub_entries) => {
                            // Drop the first sub-entry when it merely repeats
                            // the parent entry's own title.
                            let skip = if sub_entries
                                .first()
                                .map(|e| e.title.trim() == entry_title.trim())
                                .unwrap_or(false)
                            {
                                1
                            } else {
                                0
                            };

                            // Nest sub-entries beneath the parent's level and
                            // discount their confidence slightly.
                            let refined: Vec<TocEntry> = sub_entries[skip..]
                                .iter()
                                .map(|sub| {
                                    TocEntry::new(&sub.title, sub.level + entry_level)
                                        .with_physical_page(sub.physical_page.unwrap_or(start))
                                        .with_confidence(sub.confidence * 0.9)
                                })
                                .collect();

                            info!(
                                "Refined '{}' into {} sub-entries",
                                entry_title,
                                refined.len()
                            );
                            (i, refined)
                        }
                        Err(e) => {
                            // Best-effort: log and keep the original entry.
                            warn!("Sub-extraction failed for '{}': {}", entry_title, e);
                            (i, Vec::new())
                        }
                    }
                }
            })
            .collect();

        let extraction_results = join_all(oversized_futures).await;

        // Build a lookup from index → refined sub-entries
        let mut refined_map = std::collections::HashMap::new();
        for (idx, sub_entries) in extraction_results {
            if !sub_entries.is_empty() {
                refined_map.insert(idx, sub_entries);
            }
        }

        // Assemble final output: each original entry is either replaced by
        // its refined sub-entries or kept as-is.
        let mut result = Vec::with_capacity(entries.len() * 2);
        for (i, entry) in entries.into_iter().enumerate() {
            if let Some(sub_entries) = refined_map.remove(&i) {
                result.extend(sub_entries);
            } else {
                result.push(entry);
            }
        }

        Ok(result)
    }
}

impl Default for TocProcessor {
    fn default() -> Self {
        Self::new()
    }
}

/// Calculate how many pages an entry spans.
///
/// From its physical_page to the next entry's physical_page (or document end).
/// An entry without a physical page is assumed to start on page 1; a start
/// past the end yields 0 rather than underflowing.
fn entry_page_span(entry: &TocEntry, next_physical_page: Option<usize>, total_pages: usize) -> usize {
    let first = entry.physical_page.unwrap_or(1);
    let last = next_physical_page.unwrap_or(total_pages);
    last.saturating_sub(first)
}

/// Estimate total tokens for the content covered by an entry.
///
/// Sums `token_count` over pages from the entry's start page onward,
/// capped at 30 pages.
///
/// NOTE(review): the cap mirrors `TocProcessorConfig::max_pages_per_entry`'s
/// *default* rather than the configured value — confirm whether it should be
/// plumbed through from the config.
/// NOTE(review): there is no end bound, so pages belonging to *following*
/// entries are counted up to the cap — presumably a deliberate over-estimate;
/// verify against the refinement thresholds.
fn entry_token_count(entry: &TocEntry, pages: &[PdfPage]) -> usize {
    let start = entry.physical_page.unwrap_or(1);
    pages
        .iter()
        .filter(|p| p.number >= start)
        .take(30) // cap at max_pages_per_entry default
        .map(|p| p.token_count)
        .sum()
}

#[cfg(test)]
mod tests {
    use super::*;

    /// The default config should carry the documented 0.6 accuracy threshold.
    #[test]
    fn test_processor_creation() {
        let processor = TocProcessor::new();
        assert_eq!(processor.config.accuracy_threshold, 0.6);
    }

    /// An empty page slice must short-circuit to an empty entry list.
    #[tokio::test]
    async fn test_empty_pages() {
        let entries = TocProcessor::new().process(&[]).await.unwrap();
        assert!(entries.is_empty());
    }
}