1use chrono::{DateTime, Utc};
2use serde::{Deserialize, Serialize};
3use std::collections::BTreeMap;
4use url::Url;
5
6#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
7pub struct Domain(pub String);
8
9impl Domain {
10 fn canonicalize(host: &str) -> String {
12 let lower = host.to_ascii_lowercase();
13 idna::domain_to_ascii(&lower).unwrap_or(lower)
14 }
15
16 pub fn from_url(url: &Url) -> Option<Self> {
17 url.domain().map(|d| Domain(Self::canonicalize(d)))
18 }
19
20 pub fn from_raw(host: &str) -> Self {
22 Domain(Self::canonicalize(host))
23 }
24}
25
26#[derive(Debug, Clone, Serialize, Deserialize)]
27pub struct HeaderSet(pub BTreeMap<String, String>);
28impl HeaderSet {
29 pub fn empty() -> Self {
30 Self(BTreeMap::new())
31 }
32 pub fn with(mut self, k: &str, v: &str) -> Self {
33 self.0.insert(k.to_string(), v.to_string());
34 self
35 }
36}
37
38#[derive(Debug, Clone, Serialize, Deserialize)]
39pub struct CrawlConfig {
40 pub user_agents: Vec<String>,
41 pub default_headers: HeaderSet,
42 pub respect_robots_txt: bool,
43 pub timeout_ms: u64,
44}
45
46#[derive(Debug, Clone, Serialize, Deserialize)]
47pub struct Sel(pub String);
48
49#[derive(Debug, Clone, Copy, Serialize, Deserialize, Default)]
50pub enum AreaRole {
51 Main,
52 Section,
53 Sidebar,
54 Header,
55 Footer,
56 #[default]
57 Unknown,
58}
59
60#[derive(Debug, Clone, Serialize, Deserialize, Default)]
61pub struct FieldSelectors {
62 pub title: Vec<Sel>,
63 pub headings: Vec<Sel>,
64 pub paragraphs: Vec<Sel>,
65 pub images: Vec<Sel>,
66 pub links: Vec<Sel>,
67 pub lists: Vec<Sel>,
68 pub tables: Vec<Sel>,
69}
70
71#[derive(Debug, Clone, Copy, Serialize, Deserialize)]
72pub enum FollowScope {
73 SameDomain,
74 AnyDomain,
75 AllowList,
76}
77
78#[derive(Debug, Clone, Serialize, Deserialize)]
79pub struct FollowLinks {
80 pub enabled: bool,
81 pub scope: FollowScope,
82 pub allow_domains: Vec<String>,
83 pub max: u32,
84 pub dedupe: bool,
85}
86impl Default for FollowLinks {
87 fn default() -> Self {
88 Self {
89 enabled: false,
90 scope: FollowScope::SameDomain,
91 allow_domains: vec![],
92 max: 10,
93 dedupe: true,
94 }
95 }
96}
97
98#[derive(Debug, Clone, Serialize, Deserialize)]
99pub struct AreaPolicy {
100 pub roots: Vec<Sel>,
101 pub exclude_within: Vec<Sel>,
102 pub role: AreaRole,
103 pub fields: FieldSelectors,
104 pub is_repeating: bool,
105 pub follow_links: FollowLinks,
106}
107
108#[derive(Debug, Clone, Serialize, Deserialize)]
109pub struct ScrapeConfig {
110 pub extract_json_ld: bool,
111 pub areas: Vec<AreaPolicy>,
112}
113
114#[derive(Debug, Clone, Serialize, Deserialize)]
116pub struct PolicyConfig {
117 pub crawl: CrawlConfig,
118 pub scrape: ScrapeConfig,
119}
120
121#[derive(Debug, Clone, Serialize, Deserialize)]
123pub struct Policy {
124 pub domain: Domain,
125 pub crawl: CrawlConfig,
126 pub scrape: ScrapeConfig,
127}
128
129#[derive(Debug, Clone, Serialize, Deserialize)]
130pub struct LinkOut {
131 pub href: String,
132 pub text: String,
133}
134
135#[derive(Debug, Clone, Serialize, Deserialize)]
136pub struct ImageOut {
137 pub src: String,
138 pub alt: Option<String>,
139}
140
141#[derive(Debug, Clone, Serialize, Deserialize, Default)]
142pub struct AreaContent {
143 pub role: AreaRole,
144 pub root_selector_matched: String,
145 pub title: Option<String>,
146 pub headings: Vec<String>,
147 pub paragraphs: Vec<String>,
148 pub images: Vec<ImageOut>,
149 pub links: Vec<LinkOut>,
150 pub lists: Vec<Vec<String>>,
151 pub tables: Vec<Vec<Vec<String>>>,
152}
153
154#[derive(Debug, Clone, Serialize, Deserialize)]
155pub struct PageExtraction {
156 pub url: String,
157 pub domain: String,
158 pub areas: Vec<AreaContent>,
159 pub json_ld: Vec<serde_json::Value>,
160 pub fetched_at: DateTime<Utc>,
161}
162
163#[derive(Debug, Clone, Serialize, Deserialize)]
164pub struct ExtractionBundle {
165 pub parent: PageExtraction,
166 pub children: Vec<PageExtraction>,
167}
168
169#[derive(Debug, Clone, Serialize, Deserialize)]
170pub struct ApiResponse<T> {
171 pub ok: bool,
172 pub data: Option<T>,
173 pub error: Option<String>,
174}
175impl<T> ApiResponse<T> {
176 pub fn ok(data: T) -> Self {
177 Self {
178 ok: true,
179 data: Some(data),
180 error: None,
181 }
182 }
183 pub fn err(msg: impl Into<String>) -> Self {
184 Self {
185 ok: false,
186 data: None,
187 error: Some(msg.into()),
188 }
189 }
190}