// stillo_core/document.rs

use std::path::PathBuf;

use chrono::{DateTime, Utc};
use url::Url;

5#[derive(Debug, Clone)]
6pub struct RawHtml {
7    pub bytes: Vec<u8>,
8    pub url: Url,
9    pub content_type: String,
10    pub status: u16,
11}
12
13#[derive(Debug, Clone)]
14pub struct ExtractedContent {
15    pub url: Url,
16    pub title: String,
17    pub byline: Option<String>,
18    pub body_text: String,
19    pub body_html: String,
20    pub links: Vec<ExtractedLink>,
21    pub metadata: PageMetadata,
22}
23
24#[derive(Debug, Clone)]
25pub struct ExtractedLink {
26    pub text: String,
27    pub href: Url,
28    pub rel: Option<String>,
29}
30
31#[derive(Debug, Clone)]
32pub struct PageMetadata {
33    pub description: Option<String>,
34    pub og_title: Option<String>,
35    pub og_image: Option<String>,
36    pub canonical: Option<Url>,
37    pub published_at: Option<DateTime<Utc>>,
38}
39
40#[derive(Debug, Clone)]
41pub struct MarkdownDocument {
42    pub content: String,
43    pub source_url: Url,
44    pub extracted_at: DateTime<Utc>,
45}
46
47#[derive(Debug, Clone, PartialEq)]
48pub enum SpaDetection {
49    Static,
50    SuspectedSpa { text_length: usize },
51    FrameworkDetected { framework: JsFramework },
52}
53
/// JavaScript framework identifier.
///
/// `Eq` is derived alongside `PartialEq`: every payload here is `Eq`
/// (`String` or none), so equality is total and the type can be used wherever
/// a full equivalence relation is required.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum JsFramework {
    /// React.
    React,
    /// Vue.
    Vue,
    /// Angular.
    Angular,
    /// Next (presumably Next.js — TODO confirm).
    Next,
    /// Nuxt.
    Nuxt,
    /// Any framework without a dedicated variant, carried as a free-form
    /// string (presumably its name or detected marker — TODO confirm).
    Unknown(String),
}

64#[derive(Debug, Clone, PartialEq)]
65pub enum DelegationTarget {
66    LocalCdp { port: u16 },
67    PlaywrightDaemon { socket_path: PathBuf },
68    JinaReader { api_key: Option<String> },
69    Firecrawl { base_url: Url, api_key: String },
70    Unavailable { reason: String },
71}
72
73#[derive(Debug)]
74pub enum FetchResult {
75    Static(RawHtml),
76    SpaDelegated {
77        detection: SpaDetection,
78        target: DelegationTarget,
79    },
80    DelegatedHtml(RawHtml),
81    Failed(FetchError),
82}
83
84#[derive(Debug, thiserror::Error)]
85pub enum FetchError {
86    #[error("HTTP error: {status} {url}")]
87    Http { status: u16, url: Url },
88    #[error("TLS error: {0}")]
89    Tls(String),
90    #[error("Timeout after {seconds}s")]
91    Timeout { seconds: u64 },
92    #[error("Delegation failed: {0}")]
93    DelegationFailed(String),
94    #[error("All delegation targets unavailable")]
95    NoDelegationAvailable,
96}