1use crate::cache::Cache;
2use crate::client::{add_form_field, build_form_with_file, DatalabClient};
3use crate::error::{DatalabError, Result};
4use crate::output::Progress;
5use clap::Args;
6use reqwest::multipart::Form;
7use serde_json::json;
8use std::fs;
9use std::path::PathBuf;
10
11#[derive(Args, Debug)]
12#[command(after_help = "EXAMPLES:\n \
13 # Basic conversion to markdown\n \
14 datalab convert document.pdf\n\n \
15 # High-quality conversion with charts\n \
16 datalab convert report.pdf --mode accurate --extras chart_understanding\n\n \
17 # Convert specific pages\n \
18 datalab convert book.pdf --page-range \"0-10,50-60\" --paginate")]
19pub struct ConvertArgs {
20 #[arg(value_name = "FILE|URL")]
22 pub input: String,
23
24 #[arg(
26 long,
27 default_value = "markdown",
28 value_name = "FORMAT",
29 help_heading = "Output Options"
30 )]
31 pub output_format: String,
32
33 #[arg(
35 long,
36 default_value = "fast",
37 value_name = "MODE",
38 help_heading = "Processing Options"
39 )]
40 pub mode: String,
41
42 #[arg(long, value_name = "N", help_heading = "Processing Options")]
44 pub max_pages: Option<u32>,
45
46 #[arg(long, value_name = "RANGE", help_heading = "Processing Options")]
48 pub page_range: Option<String>,
49
50 #[arg(long, help_heading = "Output Options")]
52 pub paginate: bool,
53
54 #[arg(long, help_heading = "Cache Options")]
56 pub skip_cache: bool,
57
58 #[arg(long, help_heading = "Cache Options")]
60 pub force: bool,
61
62 #[arg(long, help_heading = "Advanced Options")]
64 pub save_checkpoint: bool,
65
66 #[arg(long, value_name = "FEATURES", help_heading = "Advanced Options")]
68 pub extras: Option<String>,
69
70 #[arg(long, help_heading = "Advanced Options")]
72 pub add_block_ids: bool,
73
74 #[arg(long, help_heading = "Output Options")]
76 pub token_efficient_markdown: bool,
77
78 #[arg(long, help_heading = "Advanced Options")]
80 pub disable_image_extraction: bool,
81
82 #[arg(long, help_heading = "Advanced Options")]
84 pub disable_image_captions: bool,
85
86 #[arg(long, short, value_name = "FILE", help_heading = "Output Options")]
88 pub output: Option<PathBuf>,
89
90 #[arg(
92 long,
93 default_value = "300",
94 value_name = "SECS",
95 help_heading = "Advanced Options"
96 )]
97 pub timeout: u64,
98}
99
100impl ConvertArgs {
101 fn to_cache_params(&self) -> serde_json::Value {
102 json!({
103 "output_format": self.output_format,
104 "mode": self.mode,
105 "max_pages": self.max_pages,
106 "page_range": self.page_range,
107 "paginate": self.paginate,
108 "save_checkpoint": self.save_checkpoint,
109 "extras": self.extras,
110 "add_block_ids": self.add_block_ids,
111 "token_efficient_markdown": self.token_efficient_markdown,
112 "disable_image_extraction": self.disable_image_extraction,
113 "disable_image_captions": self.disable_image_captions,
114 })
115 }
116
117 fn add_to_form(&self, mut form: Form) -> Form {
118 form = add_form_field(form, "output_format", &self.output_format);
119 form = add_form_field(form, "mode", &self.mode);
120
121 if let Some(max_pages) = self.max_pages {
122 form = add_form_field(form, "max_pages", &max_pages.to_string());
123 }
124 if let Some(ref page_range) = self.page_range {
125 form = add_form_field(form, "page_range", page_range);
126 }
127 if self.paginate {
128 form = add_form_field(form, "paginate", "true");
129 }
130 if self.force {
131 form = add_form_field(form, "skip_cache", "true");
132 }
133 if self.save_checkpoint {
134 form = add_form_field(form, "save_checkpoint", "true");
135 }
136 if let Some(ref extras) = self.extras {
137 form = add_form_field(form, "extras", extras);
138 }
139 if self.add_block_ids {
140 form = add_form_field(form, "add_block_ids", "true");
141 }
142 if self.token_efficient_markdown {
143 form = add_form_field(form, "token_efficient_markdown", "true");
144 }
145 if self.disable_image_extraction {
146 form = add_form_field(form, "disable_image_extraction", "true");
147 }
148 if self.disable_image_captions {
149 form = add_form_field(form, "disable_image_captions", "true");
150 }
151
152 form
153 }
154}
155
156pub async fn execute(args: ConvertArgs, progress: &Progress) -> Result<()> {
157 let client = DatalabClient::new(Some(args.timeout))?;
158 let cache = Cache::new()?;
159
160 let is_url = args.input.starts_with("http://") || args.input.starts_with("https://");
161 let file_path = if is_url {
162 None
163 } else {
164 Some(PathBuf::from(&args.input))
165 };
166
167 let file_str = file_path.as_ref().map(|p| p.to_string_lossy().to_string());
169 progress.start("convert", file_str.as_deref());
170
171 let file_hash = if let Some(ref path) = file_path {
172 if !path.exists() {
173 return Err(DatalabError::FileNotFound(path.clone()));
174 }
175 Some(Cache::hash_file(path)?)
176 } else {
177 None
178 };
179
180 let cache_params = args.to_cache_params();
181 let cache_key = Cache::generate_key(
182 file_hash.as_deref(),
183 if is_url { Some(&args.input) } else { None },
184 "convert",
185 &cache_params,
186 );
187
188 if !args.skip_cache {
189 if let Some(cached) = cache.get(&cache_key) {
190 progress.cache_hit(&cache_key);
191 output_result(&cached, args.output.as_ref())?;
192 return Ok(());
193 }
194 }
195
196 let form = if let Some(ref path) = file_path {
197 let (form, _) = build_form_with_file(path)?;
198 args.add_to_form(form)
199 } else {
200 let form = Form::new().text("file_url", args.input.clone());
201 args.add_to_form(form)
202 };
203
204 let result = client.submit_and_poll("convert", form, progress).await?;
205
206 let file_path_str = file_path.as_ref().map(|p| p.to_string_lossy().to_string());
207 cache.set(
208 &cache_key,
209 &result,
210 "convert",
211 file_hash.as_deref(),
212 file_path_str.as_deref(),
213 )?;
214
215 output_result(&result, args.output.as_ref())?;
216
217 Ok(())
218}
219
220fn output_result(result: &serde_json::Value, output_file: Option<&PathBuf>) -> Result<()> {
221 let json_output = serde_json::to_string_pretty(result)?;
222
223 if let Some(path) = output_file {
224 fs::write(path, &json_output)?;
225 } else {
226 println!("{}", json_output);
227 }
228
229 Ok(())
230}