Skip to main content

datalab_cli/commands/
convert.rs

1use crate::cache::Cache;
2use crate::client::{add_form_field, build_form_with_file, DatalabClient};
3use crate::error::{DatalabError, Result};
4use crate::output::Progress;
5use clap::Args;
6use reqwest::multipart::Form;
7use serde_json::json;
8use std::fs;
9use std::path::PathBuf;
10
11#[derive(Args, Debug)]
12#[command(after_help = "EXAMPLES:\n  \
13                  # Basic conversion to markdown\n  \
14                  datalab convert document.pdf\n\n  \
15                  # High-quality conversion with charts\n  \
16                  datalab convert report.pdf --mode accurate --extras chart_understanding\n\n  \
17                  # Convert specific pages\n  \
18                  datalab convert book.pdf --page-range \"0-10,50-60\" --paginate")]
19pub struct ConvertArgs {
20    /// File path or URL to convert
21    #[arg(value_name = "FILE|URL")]
22    pub input: String,
23
24    /// Output format: markdown, html, json, chunks
25    #[arg(
26        long,
27        default_value = "markdown",
28        value_name = "FORMAT",
29        help_heading = "Output Options"
30    )]
31    pub output_format: String,
32
33    /// Processing mode: fast, balanced, accurate
34    #[arg(
35        long,
36        default_value = "fast",
37        value_name = "MODE",
38        help_heading = "Processing Options"
39    )]
40    pub mode: String,
41
42    /// Maximum pages to process
43    #[arg(long, value_name = "N", help_heading = "Processing Options")]
44    pub max_pages: Option<u32>,
45
46    /// Page range (e.g., "0-5,10")
47    #[arg(long, value_name = "RANGE", help_heading = "Processing Options")]
48    pub page_range: Option<String>,
49
50    /// Add page delimiters to output
51    #[arg(long, help_heading = "Output Options")]
52    pub paginate: bool,
53
54    /// Skip local cache lookup
55    #[arg(long, help_heading = "Cache Options")]
56    pub skip_cache: bool,
57
58    /// Force reprocessing (skip API cache)
59    #[arg(long, help_heading = "Cache Options")]
60    pub force: bool,
61
62    /// Save checkpoint for follow-up extraction/segmentation
63    #[arg(long, help_heading = "Advanced Options")]
64    pub save_checkpoint: bool,
65
66    /// Extra features: track_changes, chart_understanding, extract_links
67    #[arg(long, value_name = "FEATURES", help_heading = "Advanced Options")]
68    pub extras: Option<String>,
69
70    /// Add block IDs for citation tracking
71    #[arg(long, help_heading = "Advanced Options")]
72    pub add_block_ids: bool,
73
74    /// Use token-efficient markdown format
75    #[arg(long, help_heading = "Output Options")]
76    pub token_efficient_markdown: bool,
77
78    /// Disable image extraction from document
79    #[arg(long, help_heading = "Advanced Options")]
80    pub disable_image_extraction: bool,
81
82    /// Disable image caption generation
83    #[arg(long, help_heading = "Advanced Options")]
84    pub disable_image_captions: bool,
85
86    /// Write result to file instead of stdout
87    #[arg(long, short, value_name = "FILE", help_heading = "Output Options")]
88    pub output: Option<PathBuf>,
89
90    /// Request timeout in seconds
91    #[arg(
92        long,
93        default_value = "300",
94        value_name = "SECS",
95        help_heading = "Advanced Options"
96    )]
97    pub timeout: u64,
98}
99
100impl ConvertArgs {
101    fn to_cache_params(&self) -> serde_json::Value {
102        json!({
103            "output_format": self.output_format,
104            "mode": self.mode,
105            "max_pages": self.max_pages,
106            "page_range": self.page_range,
107            "paginate": self.paginate,
108            "save_checkpoint": self.save_checkpoint,
109            "extras": self.extras,
110            "add_block_ids": self.add_block_ids,
111            "token_efficient_markdown": self.token_efficient_markdown,
112            "disable_image_extraction": self.disable_image_extraction,
113            "disable_image_captions": self.disable_image_captions,
114        })
115    }
116
117    fn add_to_form(&self, mut form: Form) -> Form {
118        form = add_form_field(form, "output_format", &self.output_format);
119        form = add_form_field(form, "mode", &self.mode);
120
121        if let Some(max_pages) = self.max_pages {
122            form = add_form_field(form, "max_pages", &max_pages.to_string());
123        }
124        if let Some(ref page_range) = self.page_range {
125            form = add_form_field(form, "page_range", page_range);
126        }
127        if self.paginate {
128            form = add_form_field(form, "paginate", "true");
129        }
130        if self.force {
131            form = add_form_field(form, "skip_cache", "true");
132        }
133        if self.save_checkpoint {
134            form = add_form_field(form, "save_checkpoint", "true");
135        }
136        if let Some(ref extras) = self.extras {
137            form = add_form_field(form, "extras", extras);
138        }
139        if self.add_block_ids {
140            form = add_form_field(form, "add_block_ids", "true");
141        }
142        if self.token_efficient_markdown {
143            form = add_form_field(form, "token_efficient_markdown", "true");
144        }
145        if self.disable_image_extraction {
146            form = add_form_field(form, "disable_image_extraction", "true");
147        }
148        if self.disable_image_captions {
149            form = add_form_field(form, "disable_image_captions", "true");
150        }
151
152        form
153    }
154}
155
156pub async fn execute(args: ConvertArgs, progress: &Progress) -> Result<()> {
157    let client = DatalabClient::new(Some(args.timeout))?;
158    let cache = Cache::new()?;
159
160    let is_url = args.input.starts_with("http://") || args.input.starts_with("https://");
161    let file_path = if is_url {
162        None
163    } else {
164        Some(PathBuf::from(&args.input))
165    };
166
167    // Emit start progress
168    let file_str = file_path.as_ref().map(|p| p.to_string_lossy().to_string());
169    progress.start("convert", file_str.as_deref());
170
171    let file_hash = if let Some(ref path) = file_path {
172        if !path.exists() {
173            return Err(DatalabError::FileNotFound(path.clone()));
174        }
175        Some(Cache::hash_file(path)?)
176    } else {
177        None
178    };
179
180    let cache_params = args.to_cache_params();
181    let cache_key = Cache::generate_key(
182        file_hash.as_deref(),
183        if is_url { Some(&args.input) } else { None },
184        "convert",
185        &cache_params,
186    );
187
188    if !args.skip_cache {
189        if let Some(cached) = cache.get(&cache_key) {
190            progress.cache_hit(&cache_key);
191            output_result(&cached, args.output.as_ref())?;
192            return Ok(());
193        }
194    }
195
196    let form = if let Some(ref path) = file_path {
197        let (form, _) = build_form_with_file(path)?;
198        args.add_to_form(form)
199    } else {
200        let form = Form::new().text("file_url", args.input.clone());
201        args.add_to_form(form)
202    };
203
204    let result = client.submit_and_poll("convert", form, progress).await?;
205
206    let file_path_str = file_path.as_ref().map(|p| p.to_string_lossy().to_string());
207    cache.set(
208        &cache_key,
209        &result,
210        "convert",
211        file_hash.as_deref(),
212        file_path_str.as_deref(),
213    )?;
214
215    output_result(&result, args.output.as_ref())?;
216
217    Ok(())
218}
219
220fn output_result(result: &serde_json::Value, output_file: Option<&PathBuf>) -> Result<()> {
221    let json_output = serde_json::to_string_pretty(result)?;
222
223    if let Some(path) = output_file {
224        fs::write(path, &json_output)?;
225    } else {
226        println!("{}", json_output);
227    }
228
229    Ok(())
230}