Skip to main content

rover/extractor/
options.rs

//! Per-fetch extraction options carried through the pipeline.
2
3use std::sync::Arc;
4
5use crate::extractor::output::OutputPaths;
6use crate::storage::Db;
7use crate::vlm::CaptionerRegistry;
8
9#[derive(Debug, Clone)]
10pub struct ExtractOptions {
11    pub tables: TablesMode,
12    pub images: ImagesMode,
13    pub metadata: MetadataMode,
14    pub output_paths: Arc<OutputPaths>,
15
16    /// M9: captioner registry (always present in default builds since cloud
17    /// captioners ship in every binary). `None` only during very early tests
18    /// or when no `[captioners.*]` are configured.
19    pub captioners: Option<Arc<CaptionerRegistry>>,
20    pub caption_filters: ImageCaptionFilters,
21    pub db: Option<Db>,
22}
23
/// Metadata handling mode for a fetch; `Include` is the default.
///
/// Derives `PartialEq`/`Eq` so callers and tests can compare modes with
/// `==` / `assert_eq!` instead of pattern matching.
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub enum MetadataMode {
    /// Default mode.
    #[default]
    Include,
    /// NOTE(review): presumably omits metadata from the output — confirm
    /// against the code that consumes this mode.
    Skip,
}
30
31#[derive(Debug, Clone, Default)]
32pub enum TablesMode {
33    #[default]
34    Embed,
35    Sample(SampleStrategy),
36    CsvFile,
37    Drop,
38    Summarize,
39}
40
/// Sampling strategy carried by the `Sample` variant of the tables mode.
///
/// Derives `PartialEq`/`Eq` so strategies can be compared directly with
/// `==` / `assert_eq!`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum SampleStrategy {
    /// Carries `head` and `tail` counts.
    /// NOTE(review): presumably the number of leading and trailing rows
    /// kept — confirm against the sampler.
    HeadTail { head: usize, tail: usize },
    /// Carries a `rows` count and an explicit `seed`.
    /// NOTE(review): presumably seeded random row selection — confirm
    /// against the sampler.
    RandomSeed { rows: usize, seed: u64 },
}
46
47impl Default for SampleStrategy {
48    fn default() -> Self {
49        SampleStrategy::HeadTail { head: 5, tail: 5 }
50    }
51}
52
/// Image handling mode for a fetch; `AltTextOnly` is the default.
///
/// Derives `PartialEq`/`Eq` so callers and tests can compare modes with
/// `==` / `assert_eq!` instead of pattern matching.
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub enum ImagesMode {
    Keep,
    /// Default mode.
    #[default]
    AltTextOnly,
    Download,
    Drop,
    /// Caption each `<img>` via a configured `[captioners.<name>]` (M9).
    /// If no captioner is configured at fetch time, the `apply()` call
    /// returns `ExtractorError::CaptionerNotConfigured`.
    Caption,
}
65
/// Per-fetch caption-mode budget knobs. Resolved from `[image_captions]`
/// at server startup, then cloned per fetch with any per-call overrides
/// applied.
///
/// Derives `PartialEq`/`Eq` so a resolved set of knobs can be compared
/// against an expected value with `==` / `assert_eq!`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ImageCaptionFilters {
    /// NOTE(review): presumably the cap on images captioned per page — confirm.
    pub max_per_page: usize,
    /// NOTE(review): presumably a minimum image width in pixels — confirm.
    pub min_width: u32,
    /// NOTE(review): presumably a minimum image height in pixels — confirm.
    pub min_height: u32,
    /// NOTE(review): presumably the largest image size, in bytes, that is
    /// still captioned — confirm.
    pub max_bytes: u64,
    /// NOTE(review): presumably a token budget per caption — confirm.
    pub max_tokens: usize,
    /// When `Some`, overrides the registry's default captioner for this fetch.
    pub captioner_override: Option<String>,
}
78
79impl Default for ImageCaptionFilters {
80    fn default() -> Self {
81        Self {
82            max_per_page: 10,
83            min_width: 200,
84            min_height: 200,
85            max_bytes: 10 * 1024 * 1024,
86            max_tokens: 50,
87            captioner_override: None,
88        }
89    }
90}