Skip to main content

papers_datalab/
types.rs

1use std::collections::HashMap;
2
// -- Enums --
4
5#[derive(Default, Clone, serde::Serialize)]
6#[serde(rename_all = "lowercase")]
7pub enum OutputFormat {
8    #[default]
9    Markdown,
10    Html,
11    Json,
12    Chunks,
13}
14
15#[derive(Default, Clone, serde::Serialize)]
16#[serde(rename_all = "lowercase")]
17pub enum ProcessingMode {
18    Fast,
19    #[default]
20    Balanced,
21    Accurate,
22}
23
24#[derive(Debug, Clone, PartialEq, serde::Deserialize)]
25#[serde(rename_all = "lowercase")]
26pub enum MarkerStatus {
27    Processing,
28    Complete,
29    Failed,
30}
31
// -- Request --
33
34/// Request parameters for the DataLab Marker conversion API.
35///
36/// Exactly one of `file` or `file_url` must be set.
37pub struct MarkerRequest {
38    /// Raw file bytes to upload. Required when `file_url` is not set.
39    pub file: Option<Vec<u8>>,
40    /// Filename for the uploaded file (e.g. `"paper.pdf"`). Used when `file` is set.
41    pub filename: Option<String>,
42    /// Public URL to the file. Alternative to `file`.
43    pub file_url: Option<String>,
44    /// Output format. Defaults to `Markdown`.
45    pub output_format: OutputFormat,
46    /// Processing mode. Defaults to `Balanced`.
47    pub mode: ProcessingMode,
48    /// Maximum number of pages to process.
49    pub max_pages: Option<u32>,
50    /// Page range (0-indexed). E.g. `"0-5"` or `"1,3,5"`.
51    pub page_range: Option<String>,
52    /// Insert page delimiters in output.
53    pub paginate: bool,
54    /// Force reprocessing even if cached.
55    pub skip_cache: bool,
56    /// Skip extracting images.
57    pub disable_image_extraction: bool,
58    /// Skip generating image captions.
59    pub disable_image_captions: bool,
60    /// Save intermediate checkpoint for downstream extraction steps.
61    pub save_checkpoint: bool,
62    /// HTML mode only: adds `data-block-id` attributes.
63    pub add_block_ids: bool,
64    /// Include markdown alongside chunks output.
65    pub include_markdown_in_chunks: bool,
66    /// Preserve spreadsheet table structure.
67    pub keep_spreadsheet_formatting: bool,
68    /// JSON schema for structured data extraction.
69    pub page_schema: Option<serde_json::Value>,
70    /// Schema for document segmentation.
71    pub segmentation_schema: Option<String>,
72    /// Extra Marker config (e.g. force_ocr, languages).
73    pub additional_config: Option<serde_json::Value>,
74    /// Comma-separated extras: `track_changes`, `chart_understanding`, etc.
75    pub extras: Option<String>,
76    /// Fence auto-generated captions.
77    pub fence_synthetic_captions: bool,
78    /// URL to POST results to when processing completes.
79    pub webhook_url: Option<String>,
80}
81
82impl Default for MarkerRequest {
83    fn default() -> Self {
84        Self {
85            file: None,
86            filename: None,
87            file_url: None,
88            output_format: OutputFormat::default(),
89            mode: ProcessingMode::default(),
90            max_pages: None,
91            page_range: None,
92            paginate: false,
93            skip_cache: false,
94            disable_image_extraction: false,
95            disable_image_captions: false,
96            save_checkpoint: false,
97            add_block_ids: false,
98            include_markdown_in_chunks: false,
99            keep_spreadsheet_formatting: false,
100            page_schema: None,
101            segmentation_schema: None,
102            additional_config: None,
103            extras: None,
104            fence_synthetic_captions: false,
105            webhook_url: None,
106        }
107    }
108}
109
// -- Submit response --
111
112/// Response from POST /api/v1/marker (submit).
113#[derive(serde::Deserialize)]
114pub struct MarkerSubmitResponse {
115    pub success: bool,
116    pub request_id: String,
117    pub request_check_url: String,
118}
119
// -- Poll response --
121
122/// Response from GET /api/v1/marker/{request_id} (poll).
123#[derive(serde::Deserialize)]
124pub struct MarkerPollResponse {
125    pub success: bool,
126    pub status: MarkerStatus,
127    pub output_format: Option<String>,
128    pub markdown: Option<String>,
129    pub html: Option<String>,
130    pub json: Option<serde_json::Value>,
131    pub chunks: Option<serde_json::Value>,
132    pub extraction_schema_json: Option<String>,
133    pub segmentation_results: Option<serde_json::Value>,
134    pub images: Option<HashMap<String, String>>,
135    pub metadata: Option<serde_json::Value>,
136    pub error: Option<String>,
137    pub error_in: Option<String>,
138    pub page_count: Option<u32>,
139    pub checkpoint_id: Option<String>,
140    pub versions: Option<serde_json::Value>,
141    pub parse_quality_score: Option<f64>,
142    pub runtime: Option<f64>,
143    pub cost_breakdown: Option<serde_json::Value>,
144}
145
// -- Step types response --
147
148/// A single workflow step type.
149#[derive(serde::Deserialize)]
150pub struct StepType {
151    pub id: u32,
152    #[serde(rename = "type")]
153    pub type_: String,
154    pub step_type: String,
155    pub name: String,
156    pub description: String,
157    pub settings_schema: serde_json::Value,
158    pub version: String,
159    pub is_public: bool,
160}
161
162/// Response from GET /api/v1/workflows/step-types.
163#[derive(serde::Deserialize)]
164pub struct StepTypesResponse {
165    pub step_types: Vec<StepType>,
166}