Skip to main content

quantum_sdk/
scraper.rs

1use serde::{Deserialize, Serialize};
2
3use crate::client::Client;
4use crate::error::Result;
5use crate::jobs::{JobCreateRequest, JobCreateResponse};
6
7// ---------------------------------------------------------------------------
8// Scrape
9// ---------------------------------------------------------------------------
10
11/// A single scrape target.
12#[derive(Debug, Clone, Serialize, Default)]
13pub struct ScrapeTarget {
14    /// Target name.
15    pub name: String,
16
17    /// Start URL to scrape.
18    pub url: String,
19
20    /// Target type: "scrape" (default) or "openapi".
21    #[serde(rename = "type", skip_serializing_if = "Option::is_none")]
22    pub target_type: Option<String>,
23
24    /// CSS selector for navigation links.
25    #[serde(skip_serializing_if = "Option::is_none")]
26    pub selector: Option<String>,
27
28    /// CSS selector for content area.
29    #[serde(skip_serializing_if = "Option::is_none")]
30    pub content: Option<String>,
31
32    /// Joplin notebook name.
33    #[serde(skip_serializing_if = "Option::is_none")]
34    pub notebook: Option<String>,
35
36    /// Enable recursive link discovery.
37    #[serde(skip_serializing_if = "Option::is_none")]
38    pub recursive: Option<bool>,
39
40    /// Maximum pages to scrape.
41    #[serde(skip_serializing_if = "Option::is_none")]
42    pub max_pages: Option<i32>,
43
44    /// Delay between pages in milliseconds.
45    #[serde(skip_serializing_if = "Option::is_none")]
46    pub delay_ms: Option<i32>,
47
48    /// RAG provider name for auto-ingest.
49    #[serde(skip_serializing_if = "Option::is_none")]
50    pub ingest: Option<String>,
51
52    /// OpenAPI spec URL (for type=openapi targets).
53    #[serde(skip_serializing_if = "Option::is_none")]
54    pub spec_url: Option<String>,
55}
56
57/// Request body for submitting a scrape job.
58#[derive(Debug, Clone, Serialize, Default)]
59pub struct ScrapeRequest {
60    /// Targets to scrape.
61    pub targets: Vec<ScrapeTarget>,
62}
63
64/// Response from submitting a scrape job.
65#[derive(Debug, Clone, Deserialize)]
66pub struct ScrapeResponse {
67    /// Job identifier for polling.
68    pub job_id: String,
69
70    /// Initial status.
71    #[serde(default)]
72    pub status: String,
73
74    /// Number of targets submitted.
75    #[serde(default)]
76    pub targets: i32,
77
78    /// Unique request identifier.
79    #[serde(default)]
80    pub request_id: String,
81}
82
83// ---------------------------------------------------------------------------
84// Screenshot
85// ---------------------------------------------------------------------------
86
87/// A single URL to screenshot.
88#[derive(Debug, Clone, Serialize, Default)]
89pub struct ScreenshotURL {
90    /// Page URL to capture.
91    pub url: String,
92
93    /// Viewport width (default 1280).
94    #[serde(skip_serializing_if = "Option::is_none")]
95    pub width: Option<i32>,
96
97    /// Viewport height (default 800).
98    #[serde(skip_serializing_if = "Option::is_none")]
99    pub height: Option<i32>,
100
101    /// Capture full scrollable page.
102    #[serde(skip_serializing_if = "Option::is_none")]
103    pub full_page: Option<bool>,
104
105    /// Wait before capture in milliseconds (default 1000).
106    #[serde(skip_serializing_if = "Option::is_none")]
107    pub delay_ms: Option<i32>,
108}
109
110/// Request body for taking screenshots.
111#[derive(Debug, Clone, Serialize, Default)]
112pub struct ScreenshotRequest {
113    /// URLs to screenshot.
114    pub urls: Vec<ScreenshotURL>,
115}
116
117/// A single screenshot result.
118#[derive(Debug, Clone, Deserialize)]
119pub struct ScreenshotResult {
120    /// Source URL.
121    pub url: String,
122
123    /// Base64-encoded image data.
124    #[serde(default)]
125    pub base64: String,
126
127    /// Image format (e.g. "png").
128    #[serde(default)]
129    pub format: String,
130
131    /// Viewport width used.
132    #[serde(default)]
133    pub width: i32,
134
135    /// Viewport height used.
136    #[serde(default)]
137    pub height: i32,
138
139    /// Error message if capture failed.
140    #[serde(default)]
141    pub error: Option<String>,
142}
143
144/// Response from the screenshot endpoint (synchronous batch).
145#[derive(Debug, Clone, Deserialize)]
146pub struct ScreenshotResponse {
147    /// Screenshot results.
148    #[serde(default)]
149    pub screenshots: Vec<ScreenshotResult>,
150
151    /// Number of screenshots.
152    #[serde(default)]
153    pub count: i32,
154}
155
156/// Response from async screenshot job submission.
157#[derive(Debug, Clone, Deserialize)]
158pub struct ScreenshotJobResponse {
159    /// Job identifier for polling.
160    pub job_id: String,
161
162    /// Initial status.
163    #[serde(default)]
164    pub status: String,
165
166    /// Number of URLs submitted.
167    #[serde(default)]
168    pub urls: i32,
169
170    /// Unique request identifier.
171    #[serde(default)]
172    pub request_id: String,
173}
174
175// ---------------------------------------------------------------------------
176// Client methods
177// ---------------------------------------------------------------------------
178
179impl Client {
180    /// Submits a doc-scraping job. Returns a job ID for polling.
181    pub async fn scrape(&self, req: &ScrapeRequest) -> Result<ScrapeResponse> {
182        let (resp, _meta) = self
183            .post_json::<ScrapeRequest, ScrapeResponse>("/qai/v1/scraper/scrape", req)
184            .await?;
185        Ok(resp)
186    }
187
188    /// Takes screenshots of URLs. For <=5 URLs, returns results inline.
189    /// For >5, returns a job ID for async processing.
190    pub async fn screenshot(&self, req: &ScreenshotRequest) -> Result<ScreenshotResponse> {
191        let (resp, _meta) = self
192            .post_json::<ScreenshotRequest, ScreenshotResponse>("/qai/v1/scraper/screenshot", req)
193            .await?;
194        Ok(resp)
195    }
196
197    /// Submits a large screenshot batch as an async job.
198    pub async fn screenshot_job(&self, req: &ScreenshotRequest) -> Result<JobCreateResponse> {
199        let params = serde_json::to_value(req)?;
200        self.create_job(&JobCreateRequest {
201            job_type: "screenshot".into(),
202            params,
203        })
204        .await
205    }
206}