use std::borrow::Cow;
use std::collections::HashMap;
use ahash::AHashSet;
use serde::{Deserialize, Serialize};
use super::{
CookieInfo, DownloadedAsset, ExtractionMeta, FeedInfo, ImageInfo, JsonLdEntry, LinkInfo, PageMetadata, ResponseMeta,
};
/// A downloaded document: its raw bytes together with descriptive metadata.
///
/// The raw `content` is intentionally left out of serialized output. It is
/// also optional on input, so a value serialized by this type can be
/// deserialized again (with empty `content`) instead of failing with a
/// "missing field" error.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct DownloadedDocument {
    /// Where the document came from (presumably the fetched URL — confirm with the producer).
    pub url: String,
    /// MIME type; a `Cow` so static literals can be stored without allocating.
    pub mime_type: Cow<'static, str>,
    /// Raw document bytes.
    ///
    /// Never written when serializing. `default` makes the key optional when
    /// deserializing (falling back to an empty `Vec`), mirroring how
    /// `CrawlResult::normalized_urls` is declared.
    #[serde(default, skip_serializing)]
    pub content: Vec<u8>,
    /// Stored independently of `content`; nothing in this type recomputes it.
    /// After a serialize/deserialize round-trip `content` is empty while
    /// `size` keeps whatever value was serialized.
    pub size: usize,
    /// Optional file name (NOTE(review): origin not visible in this file — confirm).
    pub filename: Option<Box<str>>,
    /// Hash of the content (NOTE(review): algorithm and encoding are not visible here — confirm).
    pub content_hash: Box<str>,
    /// Header name/value pairs. Optional on input, and omitted from output
    /// when the map is empty.
    #[serde(default, skip_serializing_if = "HashMap::is_empty")]
    pub headers: HashMap<Box<str>, Box<str>>,
}
/// Aggregated outcome of a set of page interactions.
///
/// NOTE(review): the field descriptions below that are marked "presumably"
/// are inferred from names only — confirm against the code that builds this.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
// NOTE(review): `dead_code` is silenced without a stated reason; confirm it is still needed.
#[allow(dead_code)]
pub struct InteractionResult {
/// Results of the individual actions (see [`ActionResult`]).
pub action_results: Vec<ActionResult>,
/// Presumably the page HTML after all actions ran.
pub final_html: String,
/// Presumably the page URL after all actions ran.
pub final_url: String,
/// Optional screenshot bytes. `serde(skip)`: never serialized, and always
/// `None` after deserialization.
#[serde(skip)]
pub screenshot: Option<Vec<u8>>,
}
/// Outcome of a single action.
///
/// `data` and `error` are left out of the serialized form entirely when they
/// are `None`, rather than being written as `null`.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
// NOTE(review): `dead_code` is silenced without a stated reason; confirm it is still needed.
#[allow(dead_code)]
pub struct ActionResult {
/// Index of the action (presumably its position in the submitted action list — confirm).
pub action_index: usize,
/// Name of the action kind; a `Cow` so static literals need no allocation.
pub action_type: Cow<'static, str>,
/// Whether the action succeeded.
pub success: bool,
/// Optional arbitrary JSON payload; omitted from output when `None`.
#[serde(skip_serializing_if = "Option::is_none")]
pub data: Option<serde_json::Value>,
/// Optional error text; omitted from output when `None`.
#[serde(skip_serializing_if = "Option::is_none")]
pub error: Option<String>,
}
/// Result record for a single scraped page.
///
/// `deny_unknown_fields` makes deserialization fail on any key that is not a
/// (non-skipped) field of this struct.
///
/// NOTE(review): most fields are undocumented in this file; the comments
/// below only state what the types and serde attributes establish, and mark
/// anything inferred from names as such.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
#[serde(deny_unknown_fields)]
pub struct ScrapeResult {
/// Presumably the HTTP response status code.
pub status_code: u16,
pub content_type: String,
pub html: String,
/// Stored as its own field; not derived from `html` by this type.
pub body_size: usize,
pub metadata: PageMetadata,
pub links: Vec<LinkInfo>,
pub images: Vec<ImageInfo>,
pub feeds: Vec<FeedInfo>,
pub json_ld: Vec<JsonLdEntry>,
/// NOTE(review): presumably the robots.txt verdict for this URL — confirm.
pub is_allowed: bool,
/// NOTE(review): units are not visible here (seconds vs. milliseconds) — confirm.
pub crawl_delay: Option<u64>,
pub noindex_detected: bool,
pub nofollow_detected: bool,
pub x_robots_tag: Option<String>,
pub is_pdf: bool,
pub was_skipped: bool,
pub detected_charset: Option<String>,
pub auth_header_sent: bool,
pub response_meta: Option<ResponseMeta>,
pub assets: Vec<DownloadedAsset>,
pub js_render_hint: bool,
pub browser_used: bool,
/// Markdown conversion output, when present (see [`MarkdownResult`]).
pub markdown: Option<MarkdownResult>,
/// Arbitrary JSON payload, when present.
pub extracted_data: Option<serde_json::Value>,
pub extraction_meta: Option<ExtractionMeta>,
/// Optional screenshot bytes. `serde(skip)`: never serialized, and always
/// `None` after deserialization.
#[serde(skip)]
pub screenshot: Option<Vec<u8>>,
/// Downloaded document, when present; the key is omitted from output when `None`.
#[serde(skip_serializing_if = "Option::is_none")]
pub downloaded_document: Option<DownloadedDocument>,
}
/// Result record for one page visited during a crawl.
///
/// `deny_unknown_fields` makes deserialization fail on any key that is not a
/// field of this struct.
///
/// NOTE(review): field semantics are not documented in this file; comments
/// marked "presumably" are inferred from names and should be confirmed.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
#[serde(deny_unknown_fields)]
pub struct CrawlPageResult {
pub url: String,
/// Normalized form of `url` (the normalization rules live outside this file).
pub normalized_url: String,
/// Presumably the HTTP response status code.
pub status_code: u16,
pub content_type: String,
pub html: String,
/// Stored as its own field; not derived from `html` by this type.
pub body_size: usize,
pub metadata: PageMetadata,
pub links: Vec<LinkInfo>,
pub images: Vec<ImageInfo>,
pub feeds: Vec<FeedInfo>,
pub json_ld: Vec<JsonLdEntry>,
/// Presumably the link distance from the crawl's starting page — confirm.
pub depth: usize,
pub stayed_on_domain: bool,
pub was_skipped: bool,
pub is_pdf: bool,
pub detected_charset: Option<String>,
/// Markdown conversion output, when present (see [`MarkdownResult`]).
pub markdown: Option<MarkdownResult>,
/// Arbitrary JSON payload, when present.
pub extracted_data: Option<serde_json::Value>,
pub extraction_meta: Option<ExtractionMeta>,
/// Downloaded document, when present; the key is omitted from output when `None`.
#[serde(skip_serializing_if = "Option::is_none")]
pub downloaded_document: Option<DownloadedDocument>,
}
/// Aggregate result of a crawl: the visited pages plus crawl-level details.
///
/// `deny_unknown_fields` makes deserialization fail on any key that is not a
/// field of this struct.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
#[serde(deny_unknown_fields)]
pub struct CrawlResult {
/// Per-page results (see [`CrawlPageResult`]).
pub pages: Vec<CrawlPageResult>,
/// Presumably the URL reached after following redirects — confirm.
pub final_url: String,
pub redirect_count: usize,
pub was_skipped: bool,
/// Optional error text.
pub error: Option<String>,
pub cookies: Vec<CookieInfo>,
/// Normalized URLs recorded for the crawl. Never serialized; on input the
/// key is optional and defaults to an empty list. Counted (deduplicated)
/// by `unique_normalized_urls`.
#[serde(default, skip_serializing)]
pub normalized_urls: Vec<String>,
}
impl CrawlResult {
pub(crate) fn new(
pages: Vec<CrawlPageResult>,
final_url: String,
redirect_count: usize,
was_skipped: bool,
error: Option<String>,
cookies: Vec<CookieInfo>,
normalized_urls: Vec<String>,
) -> Self {
Self {
pages,
final_url,
redirect_count,
was_skipped,
error,
cookies,
normalized_urls,
}
}
pub fn unique_normalized_urls(&self) -> usize {
let mut unique: AHashSet<&str> = AHashSet::new();
for n in &self.normalized_urls {
unique.insert(n.as_str());
}
unique.len()
}
}
/// A single URL entry with optional sitemap-style attributes.
///
/// All optional attributes are kept as raw strings; nothing in this type
/// parses or validates them. `deny_unknown_fields` makes deserialization
/// fail on any key that is not a field of this struct.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
#[serde(deny_unknown_fields)]
pub struct SitemapUrl {
pub url: String,
/// Presumably the sitemap `lastmod` value as written in the source — confirm.
pub lastmod: Option<String>,
/// Presumably the sitemap `changefreq` value as written in the source — confirm.
pub changefreq: Option<String>,
/// Presumably the sitemap `priority` value; held as text, not a number.
pub priority: Option<String>,
}
/// Result of a URL-mapping operation: a list of [`SitemapUrl`] entries.
///
/// `deny_unknown_fields` makes deserialization fail on any key other than `urls`.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
#[serde(deny_unknown_fields)]
pub struct MapResult {
/// The discovered URL entries.
pub urls: Vec<SitemapUrl>,
}
/// Markdown output for a page, plus optional structured extras.
///
/// `deny_unknown_fields` makes deserialization fail on any key that is not a
/// field of this struct.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
#[serde(deny_unknown_fields)]
pub struct MarkdownResult {
/// The Markdown text.
pub content: String,
/// Arbitrary JSON describing the document structure, when present
/// (schema is not visible in this file).
pub document_structure: Option<serde_json::Value>,
/// Tables as arbitrary JSON values (schema is not visible in this file).
pub tables: Vec<serde_json::Value>,
/// Warning messages.
pub warnings: Vec<String>,
/// Citation data, when present (see `crate::citations::CitationResult`).
pub citations: Option<crate::citations::CitationResult>,
/// NOTE(review): presumably a reduced/filtered variant of `content` — confirm.
pub fit_content: Option<String>,
}
/// A stored copy of a fetched page together with cache-related metadata.
///
/// Unlike the result types in this file, this struct does not set
/// `deny_unknown_fields`, so unrecognized keys are ignored on deserialization.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct CachedPage {
pub url: String,
/// Presumably the HTTP response status code.
pub status_code: u16,
pub content_type: String,
/// Page body held as text.
pub body: String,
/// Presumably the response `ETag` header value, if any — confirm.
pub etag: Option<String>,
/// Presumably the response `Last-Modified` header value, kept as raw text — confirm.
pub last_modified: Option<String>,
/// When the entry was cached.
/// NOTE(review): epoch and units (seconds vs. milliseconds) are not visible here — confirm.
pub cached_at: u64,
}