use std::fmt;
use std::path::PathBuf;

use chrono::{DateTime, Utc};
use url::Url;

use crate::ast::Document;
5
6#[derive(Debug, Clone)]
7pub struct RawHtml {
8 pub bytes: Vec<u8>,
9 pub url: Url,
10 pub content_type: String,
11 pub status: u16,
12}
13
14#[derive(Debug, Clone)]
15pub struct ExtractedContent {
16 pub url: Url,
17 pub title: String,
18 pub byline: Option<String>,
19 pub body_text: String,
20 pub body_html: String,
21 pub links: Vec<ExtractedLink>,
22 pub metadata: PageMetadata,
23}
24
25#[derive(Debug, Clone)]
26pub struct ExtractedLink {
27 pub text: String,
28 pub href: Url,
29 pub rel: Option<String>,
30}
31
32#[derive(Debug, Clone)]
33pub struct PageMetadata {
34 pub description: Option<String>,
35 pub og_title: Option<String>,
36 pub og_image: Option<String>,
37 pub canonical: Option<Url>,
38 pub published_at: Option<DateTime<Utc>>,
39}
40
41#[derive(Debug, Clone)]
44pub struct BrowsePage {
45 pub title: String,
46 pub url: Url,
47 pub doc: Document,
48 pub links: Vec<ExtractedLink>,
49 pub markdown: String,
51}
52
53#[derive(Debug, Clone)]
54pub struct MarkdownDocument {
55 pub content: String,
56 pub source_url: Url,
57 pub extracted_at: DateTime<Utc>,
58}
59
60#[derive(Debug, Clone, PartialEq)]
61pub enum SpaDetection {
62 Static,
63 SuspectedSpa { text_length: usize },
64 FrameworkDetected { framework: JsFramework },
65}
66
/// JavaScript framework identified on a page.
///
/// Implements `Eq` and `Hash` in addition to `PartialEq`, so values can be
/// used as set members or map keys.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub enum JsFramework {
    /// React.
    React,
    /// Vue.
    Vue,
    /// Angular.
    Angular,
    /// Next (presumably Next.js).
    Next,
    /// Nuxt.
    Nuxt,
    /// A framework not covered by the named variants; the string is
    /// presumably a detected name or marker — confirm against the detector.
    Unknown(String),
}
76
77#[derive(Debug, Clone, PartialEq)]
78pub enum DelegationTarget {
79 LocalCdp { port: u16 },
80 PlaywrightDaemon { socket_path: PathBuf },
81 JinaReader { api_key: Option<String> },
82 Firecrawl { base_url: Url, api_key: String },
83 Unavailable { reason: String },
84}
85
86#[derive(Debug)]
87pub enum FetchResult {
88 Static(RawHtml),
89 SpaDelegated {
90 detection: SpaDetection,
91 target: DelegationTarget,
92 },
93 DelegatedHtml(RawHtml),
94 Failed(FetchError),
95}
96
97#[derive(Debug, thiserror::Error)]
98pub enum FetchError {
99 #[error("HTTP error: {status} {url}")]
100 Http { status: u16, url: Url },
101 #[error("TLS error: {0}")]
102 Tls(String),
103 #[error("Timeout after {seconds}s")]
104 Timeout { seconds: u64 },
105 #[error("Delegation failed: {0}")]
106 DelegationFailed(String),
107 #[error("All delegation targets unavailable")]
108 NoDelegationAvailable,
109}