// rover/extractor/options.rs
use std::sync::Arc;
4
5use crate::extractor::output::OutputPaths;
6use crate::storage::Db;
7use crate::vlm::CaptionerRegistry;
8
9#[derive(Debug, Clone)]
10pub struct ExtractOptions {
11 pub tables: TablesMode,
12 pub images: ImagesMode,
13 pub metadata: MetadataMode,
14 pub output_paths: Arc<OutputPaths>,
15
16 pub captioners: Option<Arc<CaptionerRegistry>>,
20 pub caption_filters: ImageCaptionFilters,
21 pub db: Option<Db>,
22}
23
/// Selects whether metadata is included or skipped.
///
/// Defaults to [`MetadataMode::Include`]. Derives `PartialEq`/`Eq` so callers
/// can compare modes with `==` or `matches!` without a hand-written `match`.
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub enum MetadataMode {
    /// Include metadata (the default).
    #[default]
    Include,
    /// Skip metadata.
    Skip,
}
30
/// Selects how tables are handled.
///
/// Defaults to [`TablesMode::Embed`]. The exact effect of each variant is
/// defined by the code that consumes this enum, which is not part of this
/// file. Derives `PartialEq`/`Eq` so modes can be compared directly.
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub enum TablesMode {
    /// Embed the table (the default).
    #[default]
    Embed,
    /// Sample the table according to the given [`SampleStrategy`].
    Sample(SampleStrategy),
    // NOTE(review): presumably writes the table to a separate CSV file —
    // confirm against the consumer.
    CsvFile,
    /// Drop the table.
    Drop,
    // NOTE(review): presumably replaces the table with a summary — confirm
    // against the consumer.
    Summarize,
}

/// Strategy used by [`TablesMode::Sample`].
///
/// Defaults to `HeadTail { head: 5, tail: 5 }`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum SampleStrategy {
    /// Sample by `head` and `tail` counts.
    // NOTE(review): presumably the first `head` and last `tail` rows — confirm.
    HeadTail { head: usize, tail: usize },
    /// Sample `rows` using the given `seed`.
    // NOTE(review): presumably a seeded (reproducible) random pick — confirm.
    RandomSeed { rows: usize, seed: u64 },
}

impl Default for SampleStrategy {
    /// Returns `HeadTail` with `head` and `tail` both set to 5.
    fn default() -> Self {
        Self::HeadTail { head: 5, tail: 5 }
    }
}
52
/// Selects how images are handled.
///
/// Defaults to [`ImagesMode::AltTextOnly`]. The exact effect of each variant
/// is defined by the code that consumes this enum, which is not part of this
/// file. Derives `PartialEq`/`Eq` so modes can be compared directly.
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub enum ImagesMode {
    /// Keep the image.
    Keep,
    /// Keep only the alt text (the default).
    #[default]
    AltTextOnly,
    /// Download the image.
    Download,
    /// Drop the image.
    Drop,
    // NOTE(review): presumably generates a caption via the captioner registry
    // on `ExtractOptions` — confirm against the consumer.
    Caption,
}
65
/// Limits and filters applied to image captioning.
///
/// Use [`ImageCaptionFilters::default`] for the stock values and struct-update
/// syntax to override individual fields. Derives `PartialEq`/`Eq` so two
/// configurations can be compared directly.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ImageCaptionFilters {
    /// Per-page cap (default 10).
    // NOTE(review): presumably the maximum number of images captioned per
    // page — confirm against the consumer.
    pub max_per_page: usize,
    /// Minimum width (default 200).
    // NOTE(review): presumably pixels — confirm.
    pub min_width: u32,
    /// Minimum height (default 200).
    // NOTE(review): presumably pixels — confirm.
    pub min_height: u32,
    /// Maximum size in bytes (default 10 * 1024 * 1024).
    pub max_bytes: u64,
    /// Token cap (default 50).
    // NOTE(review): presumably the caption length budget — confirm.
    pub max_tokens: usize,
    /// Optional captioner override (default `None`).
    // NOTE(review): presumably a captioner name looked up in the registry —
    // confirm.
    pub captioner_override: Option<String>,
}

impl Default for ImageCaptionFilters {
    /// Returns the stock filter values: 10 per page, 200x200 minimum
    /// dimensions, 10 * 1024 * 1024 max bytes, 50 max tokens, no override.
    fn default() -> Self {
        Self {
            max_per_page: 10,
            min_width: 200,
            min_height: 200,
            max_bytes: 10 * 1024 * 1024,
            max_tokens: 50,
            captioner_override: None,
        }
    }
}