1use std::collections::BTreeMap;
4
5use bytes::Bytes;
6use serde::Deserialize;
7
/// String values of the crawler `status` field.
///
/// The predicate methods on `CrawlerStatus` compare the deserialized status
/// string against these constants using exact, case-sensitive equality.
pub mod status {
    /// `"PENDING"`: counted as running by `CrawlerStatus::is_running`.
    pub const PENDING: &str = "PENDING";
    /// `"RUNNING"`: counted as running by `CrawlerStatus::is_running`.
    pub const RUNNING: &str = "RUNNING";
    /// `"DONE"`: combined with `is_success` by `CrawlerStatus::is_complete`
    /// and `CrawlerStatus::is_failed`.
    pub const DONE: &str = "DONE";
    /// `"CANCELLED"`: checked by `CrawlerStatus::is_cancelled`.
    pub const CANCELLED: &str = "CANCELLED";
}
19
20#[derive(Debug, Clone, Deserialize, Default)]
22pub struct CrawlerStartResponse {
23 #[serde(default)]
25 pub crawler_uuid: String,
26 #[serde(default)]
28 pub status: String,
29}
30
31#[derive(Debug, Clone, Deserialize, Default)]
33pub struct CrawlerState {
34 #[serde(default)]
36 pub urls_visited: u64,
37 #[serde(default)]
39 pub urls_extracted: u64,
40 #[serde(default)]
42 pub urls_failed: u64,
43 #[serde(default)]
45 pub urls_skipped: u64,
46 #[serde(default)]
48 pub urls_to_crawl: u64,
49 #[serde(default)]
51 pub api_credit_used: u64,
52 #[serde(default)]
54 pub duration: u64,
55 #[serde(default)]
57 pub start_time: Option<i64>,
58 #[serde(default)]
60 pub stop_time: Option<i64>,
61 #[serde(default)]
63 pub stop_reason: Option<String>,
64}
65
66#[derive(Debug, Clone, Deserialize, Default)]
68pub struct CrawlerStatus {
69 #[serde(default)]
71 pub crawler_uuid: String,
72 #[serde(default)]
74 pub status: String,
75 #[serde(default)]
77 pub is_finished: bool,
78 #[serde(default)]
80 pub is_success: Option<bool>,
81 #[serde(default)]
83 pub state: CrawlerState,
84}
85
86impl CrawlerStatus {
87 pub fn is_running(&self) -> bool {
89 self.status == status::PENDING || self.status == status::RUNNING
90 }
91 pub fn is_complete(&self) -> bool {
93 self.status == status::DONE && self.is_success == Some(true)
94 }
95 pub fn is_failed(&self) -> bool {
97 self.status == status::DONE && self.is_success == Some(false)
98 }
99 pub fn is_cancelled(&self) -> bool {
101 self.status == status::CANCELLED
102 }
103}
104
/// One URL line parsed by [`CrawlerUrls::from_text`].
#[derive(Debug, Clone)]
pub struct CrawlerUrlEntry {
    /// The URL portion of the line.
    pub url: String,
    /// The `status_hint` that was passed to [`CrawlerUrls::from_text`].
    pub status: String,
    /// Text following the first comma on the line, or empty when there is
    /// none (always empty for the `"visited"` and `"pending"` hints).
    pub reason: String,
}

/// A page of parsed URL entries together with the paging values supplied by
/// the caller.
#[derive(Debug, Clone, Default)]
pub struct CrawlerUrls {
    /// Entries in the order their lines appeared in the parsed text.
    pub urls: Vec<CrawlerUrlEntry>,
    /// The `page` value given to [`CrawlerUrls::from_text`], stored as-is.
    pub page: u32,
    /// The `per_page` value given to [`CrawlerUrls::from_text`], stored as-is.
    pub per_page: u32,
}

impl CrawlerUrls {
    /// Parses a newline-separated text body into URL entries.
    ///
    /// Each line is trimmed and blank lines are skipped. Every entry gets
    /// `status_hint` as its `status`.
    ///
    /// * When `status_hint` is exactly `"visited"` or `"pending"`, the whole
    ///   trimmed line is the URL and `reason` is empty.
    /// * For any other hint, the line is split at its first comma into
    ///   `url` and `reason`; neither part is trimmed further. A line without
    ///   a comma becomes a URL with an empty `reason`.
    ///
    /// `page` and `per_page` are copied into the result unchanged.
    pub fn from_text(body: &str, status_hint: &str, page: u32, per_page: u32) -> Self {
        // The hint is the same for every line, so decide the parsing mode once.
        let url_only = matches!(status_hint, "visited" | "pending");

        let urls = body
            .split('\n')
            .map(str::trim)
            .filter(|line| !line.is_empty())
            .map(|line| {
                let (url, reason) = if url_only {
                    (line, "")
                } else {
                    line.split_once(',').unwrap_or((line, ""))
                };
                CrawlerUrlEntry {
                    url: url.to_owned(),
                    status: status_hint.to_owned(),
                    reason: reason.to_owned(),
                }
            })
            .collect();

        Self {
            urls,
            page,
            per_page,
        }
    }
}
166
167#[derive(Debug, Clone, Deserialize, Default)]
169pub struct CrawlerContents {
170 #[serde(default, deserialize_with = "deserialize_contents_map")]
176 pub contents: BTreeMap<String, BTreeMap<String, String>>,
177 #[serde(default)]
179 pub links: CrawlerContentsLinks,
180}
181
182fn deserialize_contents_map<'de, D>(
185 deserializer: D,
186) -> Result<BTreeMap<String, BTreeMap<String, String>>, D::Error>
187where
188 D: serde::Deserializer<'de>,
189{
190 let raw: BTreeMap<String, BTreeMap<String, Option<String>>> =
191 BTreeMap::deserialize(deserializer)?;
192 Ok(raw
193 .into_iter()
194 .map(|(url, by_format)| {
195 (
196 url,
197 by_format
198 .into_iter()
199 .map(|(fmt, body)| (fmt, body.unwrap_or_default()))
200 .collect(),
201 )
202 })
203 .collect())
204}
205
206#[derive(Debug, Clone, Deserialize, Default)]
212pub struct CrawlerContentsLinks {
213 #[serde(default, deserialize_with = "null_as_empty_string")]
215 pub crawled_urls: String,
216 #[serde(default, deserialize_with = "null_as_empty_string")]
218 pub next: String,
219 #[serde(default, deserialize_with = "null_as_empty_string")]
221 pub prev: String,
222}
223
224fn null_as_empty_string<'de, D>(deserializer: D) -> Result<String, D::Error>
228where
229 D: serde::Deserializer<'de>,
230{
231 Ok(Option::<String>::deserialize(deserializer)?.unwrap_or_default())
232}
233
/// A single piece of crawled content with the URL and crawl it belongs to.
///
/// All fields are plain owned strings; `Default` yields three empty strings.
#[derive(Debug, Clone, Default)]
pub struct CrawlContent {
    /// URL associated with the content.
    pub url: String,
    /// The content body.
    pub content: String,
    /// Identifier of the crawl associated with the content.
    pub crawl_uuid: String,
}
244
/// Kind of crawler artifact.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum CrawlerArtifactType {
    /// Rendered as `"warc"` by [`CrawlerArtifactType::as_str`].
    Warc,
    /// Rendered as `"har"` by [`CrawlerArtifactType::as_str`].
    Har,
}

impl CrawlerArtifactType {
    /// Returns the lowercase string form of the variant: `"warc"` or `"har"`.
    pub fn as_str(&self) -> &'static str {
        match *self {
            CrawlerArtifactType::Warc => "warc",
            CrawlerArtifactType::Har => "har",
        }
    }
}
263
264#[derive(Debug, Clone)]
266pub struct CrawlerArtifact {
267 pub artifact_type: CrawlerArtifactType,
269 pub data: Bytes,
271}
272
273impl CrawlerArtifact {
274 pub fn save(&self, path: &std::path::Path) -> std::io::Result<()> {
276 std::fs::write(path, &self.data)
277 }
278 pub fn len(&self) -> usize {
280 self.data.len()
281 }
282 pub fn is_empty(&self) -> bool {
284 self.data.is_empty()
285 }
286}