//! Typed responses and helpers for the crawler endpoints: start/status
//! payloads, URL listings, page contents, and downloadable artifacts.

use std::collections::BTreeMap;

use bytes::Bytes;
use serde::Deserialize;
/// Status strings reported by the crawler API.
pub mod status {
    pub const PENDING: &str = "PENDING";
    pub const RUNNING: &str = "RUNNING";
    pub const DONE: &str = "DONE";
    pub const CANCELLED: &str = "CANCELLED";
}
/// Response returned when a new crawl is started.
#[derive(Debug, Clone, Deserialize, Default)]
pub struct CrawlerStartResponse {
    /// Identifier used to poll the crawl afterwards.
    #[serde(default)]
    pub crawler_uuid: String,
    /// Initial status, normally one of the [`status`] constants.
    #[serde(default)]
    pub status: String,
}
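// A minimal sketch of how these response types tolerate sparse payloads:
// every field is `#[serde(default)]`, so even a bare `{}` deserializes.
// Assumes `serde_json` is available as a dev-dependency (an assumption; it
// is not otherwise used in this file).
#[cfg(test)]
mod start_response_tests {
    use super::*;

    #[test]
    fn missing_fields_fall_back_to_defaults() {
        let resp: CrawlerStartResponse = serde_json::from_str("{}").unwrap();
        assert!(resp.crawler_uuid.is_empty());
        assert!(resp.status.is_empty());

        let resp: CrawlerStartResponse =
            serde_json::from_str(r#"{"crawler_uuid":"abc","status":"PENDING"}"#).unwrap();
        assert_eq!(resp.status, status::PENDING);
    }
}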
/// Progress counters and timing for a crawl.
#[derive(Debug, Clone, Deserialize, Default)]
pub struct CrawlerState {
    #[serde(default)]
    pub urls_visited: u64,
    #[serde(default)]
    pub urls_extracted: u64,
    #[serde(default)]
    pub urls_failed: u64,
    #[serde(default)]
    pub urls_skipped: u64,
    #[serde(default)]
    pub urls_to_crawl: u64,
    #[serde(default)]
    pub api_credit_used: u64,
    /// Elapsed duration as reported by the API.
    #[serde(default)]
    pub duration: u64,
    /// Start timestamp as reported by the API, if available.
    #[serde(default)]
    pub start_time: Option<i64>,
    /// Stop timestamp as reported by the API, if available.
    #[serde(default)]
    pub stop_time: Option<i64>,
    /// Why the crawl stopped, if it has.
    #[serde(default)]
    pub stop_reason: Option<String>,
}
/// Snapshot of a crawl returned by the status endpoint.
#[derive(Debug, Clone, Deserialize, Default)]
pub struct CrawlerStatus {
    #[serde(default)]
    pub crawler_uuid: String,
    /// One of the [`status`] constants.
    #[serde(default)]
    pub status: String,
    #[serde(default)]
    pub is_finished: bool,
    /// Whether the crawl succeeded; only meaningful once finished.
    #[serde(default)]
    pub is_success: Option<bool>,
    #[serde(default)]
    pub state: CrawlerState,
}
impl CrawlerStatus {
    /// True while the crawl is still in flight (pending or running).
    pub fn is_running(&self) -> bool {
        self.status == status::PENDING || self.status == status::RUNNING
    }

    /// True once the crawl finished and the API reported success.
    pub fn is_complete(&self) -> bool {
        self.status == status::DONE && self.is_success == Some(true)
    }

    /// True once the crawl finished and the API reported failure.
    pub fn is_failed(&self) -> bool {
        self.status == status::DONE && self.is_success == Some(false)
    }

    /// True if the crawl was cancelled.
    pub fn is_cancelled(&self) -> bool {
        self.status == status::CANCELLED
    }
}
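// A small sketch exercising the status predicates: DONE only counts as
// complete when the API also reports success, so DONE plus
// `is_success == Some(false)` is failed, not complete.
#[cfg(test)]
mod status_predicate_tests {
    use super::*;

    #[test]
    fn done_is_split_by_success_flag() {
        let mut s = CrawlerStatus::default();
        s.status = status::RUNNING.to_string();
        assert!(s.is_running());
        assert!(!s.is_complete());

        s.status = status::DONE.to_string();
        s.is_success = Some(true);
        assert!(s.is_complete() && !s.is_failed());

        s.is_success = Some(false);
        assert!(s.is_failed() && !s.is_complete());
    }
}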
/// A single URL from a crawler URL listing.
#[derive(Debug, Clone)]
pub struct CrawlerUrlEntry {
    pub url: String,
    /// The listing this entry came from, e.g. `"visited"` or `"pending"`.
    pub status: String,
    /// Failure or skip reason; empty when the listing carries none.
    pub reason: String,
}

/// One page of a crawler URL listing.
#[derive(Debug, Clone, Default)]
pub struct CrawlerUrls {
    pub urls: Vec<CrawlerUrlEntry>,
    pub page: u32,
    pub per_page: u32,
}
impl CrawlerUrls {
    /// Parses a plain-text URL listing returned by the API.
    ///
    /// For the `visited` and `pending` listings each non-empty line is a
    /// bare URL; for other listings each line is `url,reason`, with
    /// everything after the first comma treated as the reason.
    pub fn from_text(body: &str, status_hint: &str, page: u32, per_page: u32) -> Self {
        let bare_urls = status_hint == "visited" || status_hint == "pending";
        let urls = body
            .lines()
            .map(str::trim)
            .filter(|line| !line.is_empty())
            .map(|line| {
                let (url, reason) = if bare_urls {
                    (line, "")
                } else {
                    // Split on the first comma; lines without one carry no reason.
                    line.split_once(',').unwrap_or((line, ""))
                };
                CrawlerUrlEntry {
                    url: url.to_string(),
                    status: status_hint.to_string(),
                    reason: reason.to_string(),
                }
            })
            .collect();
        Self { urls, page, per_page }
    }
}
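// A sketch of the two listing formats `from_text` accepts: bare URLs for
// the `visited`/`pending` listings, and `url,reason` lines (split on the
// first comma) for everything else. The `"failed"` hint below is just an
// illustrative value, not a documented listing name.
#[cfg(test)]
mod url_listing_tests {
    use super::*;

    #[test]
    fn parses_bare_and_reasoned_listings() {
        let visited =
            CrawlerUrls::from_text("https://a.example\n\nhttps://b.example\n", "visited", 1, 100);
        assert_eq!(visited.urls.len(), 2);
        assert!(visited.urls[0].reason.is_empty());

        let failed = CrawlerUrls::from_text("https://c.example,timeout, retried\n", "failed", 1, 100);
        assert_eq!(failed.urls[0].url, "https://c.example");
        // Everything after the first comma is kept as the reason.
        assert_eq!(failed.urls[0].reason, "timeout, retried");
    }
}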
/// Extracted page contents keyed by URL, then by format.
#[derive(Debug, Clone, Deserialize, Default)]
pub struct CrawlerContents {
    /// `url -> format -> body`; `null` bodies are mapped to empty strings.
    #[serde(default, deserialize_with = "deserialize_contents_map")]
    pub contents: BTreeMap<String, BTreeMap<String, String>>,
    #[serde(default)]
    pub links: CrawlerContentsLinks,
}
/// Deserializes the nested contents map, replacing `null` bodies with
/// empty strings so callers never deal with `Option` values.
fn deserialize_contents_map<'de, D>(
    deserializer: D,
) -> Result<BTreeMap<String, BTreeMap<String, String>>, D::Error>
where
    D: serde::Deserializer<'de>,
{
    let raw: BTreeMap<String, BTreeMap<String, Option<String>>> =
        BTreeMap::deserialize(deserializer)?;
    Ok(raw
        .into_iter()
        .map(|(url, by_format)| {
            (
                url,
                by_format
                    .into_iter()
                    .map(|(fmt, body)| (fmt, body.unwrap_or_default()))
                    .collect(),
            )
        })
        .collect())
}
/// Pagination links attached to a contents response.
#[derive(Debug, Clone, Deserialize, Default)]
pub struct CrawlerContentsLinks {
    #[serde(default, deserialize_with = "null_as_empty_string")]
    pub crawled_urls: String,
    /// Link to the next page; empty when there is none.
    #[serde(default, deserialize_with = "null_as_empty_string")]
    pub next: String,
    /// Link to the previous page; empty when there is none.
    #[serde(default, deserialize_with = "null_as_empty_string")]
    pub prev: String,
}

/// Deserializes a nullable string field, mapping `null` to `""`.
fn null_as_empty_string<'de, D>(deserializer: D) -> Result<String, D::Error>
where
    D: serde::Deserializer<'de>,
{
    Ok(Option::<String>::deserialize(deserializer)?.unwrap_or_default())
}
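// A sketch of the null handling above: `null` content bodies collapse to
// empty strings via `deserialize_contents_map`, and null or missing
// pagination links collapse to empty strings via `null_as_empty_string`.
// Assumes `serde_json` as a dev-dependency; the URL and format names are
// illustrative.
#[cfg(test)]
mod contents_null_tests {
    use super::*;

    #[test]
    fn nulls_collapse_to_empty_strings() {
        let json = r#"{
            "contents": {"https://a.example": {"markdown": null, "text": "hello"}},
            "links": {"crawled_urls": "/urls", "next": null}
        }"#;
        let contents: CrawlerContents = serde_json::from_str(json).unwrap();
        let by_format = &contents.contents["https://a.example"];
        assert_eq!(by_format["markdown"], "");
        assert_eq!(by_format["text"], "hello");
        assert_eq!(contents.links.crawled_urls, "/urls");
        assert!(contents.links.next.is_empty());
        // `prev` was absent entirely; `#[serde(default)]` fills it in.
        assert!(contents.links.prev.is_empty());
    }
}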
/// Extracted content for a single crawled page.
#[derive(Debug, Clone, Default)]
pub struct CrawlContent {
    pub url: String,
    pub content: String,
    pub crawl_uuid: String,
}
/// Downloadable archive formats produced by a crawl.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum CrawlerArtifactType {
    Warc,
    Har,
}

impl CrawlerArtifactType {
    /// Lowercase string form of the artifact type.
    pub fn as_str(&self) -> &'static str {
        match self {
            Self::Warc => "warc",
            Self::Har => "har",
        }
    }
}
/// Raw bytes of a downloaded crawl artifact.
#[derive(Debug, Clone)]
pub struct CrawlerArtifact {
    pub artifact_type: CrawlerArtifactType,
    pub data: Bytes,
}

impl CrawlerArtifact {
    /// Writes the raw artifact bytes to `path`.
    pub fn save(&self, path: &std::path::Path) -> std::io::Result<()> {
        std::fs::write(path, &self.data)
    }

    /// Size of the artifact in bytes.
    pub fn len(&self) -> usize {
        self.data.len()
    }

    pub fn is_empty(&self) -> bool {
        self.data.is_empty()
    }
}
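// A sketch of artifact handling: round-trip a small payload through `save`
// using a throwaway path under the system temp directory. The file name and
// the `WARC/1.1` header bytes are illustrative, not real crawl output.
#[cfg(test)]
mod artifact_tests {
    use super::*;

    #[test]
    fn save_writes_the_raw_bytes() {
        let artifact = CrawlerArtifact {
            artifact_type: CrawlerArtifactType::Warc,
            data: Bytes::from_static(b"WARC/1.1\r\n"),
        };
        assert_eq!(artifact.artifact_type.as_str(), "warc");
        assert_eq!(artifact.len(), 10);
        assert!(!artifact.is_empty());

        let path = std::env::temp_dir().join("crawler_artifact_test.warc");
        artifact.save(&path).unwrap();
        assert_eq!(std::fs::read(&path).unwrap(), b"WARC/1.1\r\n".to_vec());
        let _ = std::fs::remove_file(&path);
    }
}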