Skip to main content

papers_datalab/
types.rs

1use std::collections::HashMap;
2
3// -- Enums --
4
5#[derive(Default, Clone, serde::Serialize)]
6#[serde(rename_all = "lowercase")]
7pub enum OutputFormat {
8    #[default]
9    Markdown,
10    Html,
11    Json,
12    Chunks,
13}
14
15#[derive(Default, Clone, serde::Serialize)]
16#[serde(rename_all = "lowercase")]
17pub enum ProcessingMode {
18    Fast,
19    #[default]
20    Balanced,
21    Accurate,
22}
23
24#[derive(Debug, Clone, PartialEq, serde::Deserialize)]
25#[serde(rename_all = "lowercase")]
26pub enum MarkerStatus {
27    Processing,
28    Complete,
29    Failed,
30}
31
32// -- Request --
33
34/// Request parameters for the DataLab Marker conversion API.
35///
36/// Exactly one of `file` or `file_url` must be set.
37pub struct MarkerRequest {
38    /// Raw file bytes to upload. Required when `file_url` is not set.
39    pub file: Option<Vec<u8>>,
40    /// Filename for the uploaded file (e.g. `"paper.pdf"`). Used when `file` is set.
41    pub filename: Option<String>,
42    /// Public URL to the file. Alternative to `file`.
43    pub file_url: Option<String>,
44    /// Output format(s). Defaults to `[Markdown]`.
45    pub output_format: Vec<OutputFormat>,
46    /// Processing mode. Defaults to `Balanced`.
47    pub mode: ProcessingMode,
48    /// Maximum number of pages to process.
49    pub max_pages: Option<u32>,
50    /// Page range (0-indexed). E.g. `"0-5"` or `"1,3,5"`.
51    pub page_range: Option<String>,
52    /// Insert page delimiters in output.
53    pub paginate: bool,
54    /// Force reprocessing even if cached.
55    pub skip_cache: bool,
56    /// Skip extracting images.
57    pub disable_image_extraction: bool,
58    /// Skip generating image captions.
59    pub disable_image_captions: bool,
60    /// Save intermediate checkpoint for downstream extraction steps.
61    pub save_checkpoint: bool,
62    /// HTML mode only: adds `data-block-id` attributes.
63    pub add_block_ids: bool,
64    /// Include markdown alongside chunks output.
65    pub include_markdown_in_chunks: bool,
66    /// Preserve spreadsheet table structure.
67    pub keep_spreadsheet_formatting: bool,
68    /// JSON schema for structured data extraction.
69    pub page_schema: Option<serde_json::Value>,
70    /// Schema for document segmentation.
71    pub segmentation_schema: Option<String>,
72    /// Extra Marker config (e.g. force_ocr, languages).
73    pub additional_config: Option<serde_json::Value>,
74    /// Comma-separated extras: `track_changes`, `chart_understanding`, etc.
75    pub extras: Option<String>,
76    /// Fence auto-generated captions.
77    pub fence_synthetic_captions: bool,
78    /// URL to POST results to when processing completes.
79    pub webhook_url: Option<String>,
80}
81
82impl Default for MarkerRequest {
83    fn default() -> Self {
84        Self {
85            file: None,
86            filename: None,
87            file_url: None,
88            output_format: vec![OutputFormat::Markdown],
89            mode: ProcessingMode::default(),
90            max_pages: None,
91            page_range: None,
92            paginate: false,
93            skip_cache: false,
94            disable_image_extraction: false,
95            disable_image_captions: false,
96            save_checkpoint: false,
97            add_block_ids: false,
98            include_markdown_in_chunks: false,
99            keep_spreadsheet_formatting: false,
100            page_schema: None,
101            segmentation_schema: None,
102            additional_config: None,
103            extras: None,
104            fence_synthetic_captions: false,
105            webhook_url: None,
106        }
107    }
108}
109
110// -- Submit response --
111
112/// Response from POST /api/v1/marker (submit).
113#[derive(serde::Deserialize)]
114pub struct MarkerSubmitResponse {
115    #[serde(default)]
116    pub success: Option<bool>,
117    pub request_id: String,
118    pub request_check_url: String,
119}
120
121// -- Poll response --
122
123/// Response from GET /api/v1/marker/{request_id} (poll).
124#[derive(serde::Deserialize)]
125pub struct MarkerPollResponse {
126    #[serde(default)]
127    pub success: Option<bool>,
128    pub status: MarkerStatus,
129    pub output_format: Option<String>,
130    pub markdown: Option<String>,
131    pub html: Option<String>,
132    pub json: Option<serde_json::Value>,
133    pub chunks: Option<serde_json::Value>,
134    pub extraction_schema_json: Option<String>,
135    pub segmentation_results: Option<serde_json::Value>,
136    pub images: Option<HashMap<String, String>>,
137    pub metadata: Option<serde_json::Value>,
138    pub error: Option<String>,
139    pub error_in: Option<String>,
140    pub page_count: Option<u32>,
141    pub checkpoint_id: Option<String>,
142    pub versions: Option<serde_json::Value>,
143    pub parse_quality_score: Option<f64>,
144    pub runtime: Option<f64>,
145    pub cost_breakdown: Option<serde_json::Value>,
146}
147
148// -- Step types response --
149
150/// A single workflow step type.
151#[derive(serde::Deserialize)]
152pub struct StepType {
153    pub id: u32,
154    #[serde(rename = "type")]
155    pub type_: String,
156    pub step_type: String,
157    pub name: String,
158    pub description: String,
159    pub settings_schema: serde_json::Value,
160    pub version: String,
161    pub is_public: bool,
162}
163
164/// Response from GET /api/v1/workflows/step-types.
165#[derive(serde::Deserialize)]
166pub struct StepTypesResponse {
167    pub step_types: Vec<StepType>,
168}