qrawl/
types.rs

1use chrono::{DateTime, Utc};
2use serde::{Deserialize, Serialize};
3use std::collections::BTreeMap;
4use url::Url;
5
6#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
7pub struct Domain(pub String);
8
9impl Domain {
10    /// Canonicalize host to a stable key: lowercase + IDNA/Punycode
11    fn canonicalize(host: &str) -> String {
12        let lower = host.to_ascii_lowercase();
13        idna::domain_to_ascii(&lower).unwrap_or(lower)
14    }
15
16    pub fn from_url(url: &Url) -> Option<Self> {
17        url.domain().map(|d| Domain(Self::canonicalize(d)))
18    }
19
20    /// Build a Domain from raw user text (CLI, API callers, etc.)
21    pub fn from_raw(host: &str) -> Self {
22        Domain(Self::canonicalize(host))
23    }
24}
25
26#[derive(Debug, Clone, Serialize, Deserialize)]
27pub struct HeaderSet(pub BTreeMap<String, String>);
28impl HeaderSet {
29    pub fn empty() -> Self {
30        Self(BTreeMap::new())
31    }
32    pub fn with(mut self, k: &str, v: &str) -> Self {
33        self.0.insert(k.to_string(), v.to_string());
34        self
35    }
36}
37
38#[derive(Debug, Clone, Serialize, Deserialize)]
39pub struct CrawlConfig {
40    pub user_agents: Vec<String>,
41    pub default_headers: HeaderSet,
42    pub respect_robots_txt: bool,
43    pub timeout_ms: u64,
44}
45
46#[derive(Debug, Clone, Serialize, Deserialize)]
47pub struct Sel(pub String);
48
49#[derive(Debug, Clone, Copy, Serialize, Deserialize, Default)]
50pub enum AreaRole {
51    Main,
52    Section,
53    Sidebar,
54    Header,
55    Footer,
56    #[default]
57    Unknown,
58}
59
60#[derive(Debug, Clone, Serialize, Deserialize, Default)]
61pub struct FieldSelectors {
62    pub title: Vec<Sel>,
63    pub headings: Vec<Sel>,
64    pub paragraphs: Vec<Sel>,
65    pub images: Vec<Sel>,
66    pub links: Vec<Sel>,
67    pub lists: Vec<Sel>,
68    pub tables: Vec<Sel>,
69}
70
71#[derive(Debug, Clone, Copy, Serialize, Deserialize)]
72pub enum FollowScope {
73    SameDomain,
74    AnyDomain,
75    AllowList,
76}
77
78#[derive(Debug, Clone, Serialize, Deserialize)]
79pub struct FollowLinks {
80    pub enabled: bool,
81    pub scope: FollowScope,
82    pub allow_domains: Vec<String>,
83    pub max: u32,
84    pub dedupe: bool,
85}
86impl Default for FollowLinks {
87    fn default() -> Self {
88        Self {
89            enabled: false,
90            scope: FollowScope::SameDomain,
91            allow_domains: vec![],
92            max: 10,
93            dedupe: true,
94        }
95    }
96}
97
98#[derive(Debug, Clone, Serialize, Deserialize)]
99pub struct AreaPolicy {
100    pub roots: Vec<Sel>,
101    pub exclude_within: Vec<Sel>,
102    pub role: AreaRole,
103    pub fields: FieldSelectors,
104    pub is_repeating: bool,
105    pub follow_links: FollowLinks,
106}
107
108#[derive(Debug, Clone, Serialize, Deserialize)]
109pub struct ScrapeConfig {
110    pub extract_json_ld: bool,
111    pub areas: Vec<AreaPolicy>,
112}
113
114/// Handy wrapper when you want to print or pass "config" as a single object
115#[derive(Debug, Clone, Serialize, Deserialize)]
116pub struct PolicyConfig {
117    pub crawl: CrawlConfig,
118    pub scrape: ScrapeConfig,
119}
120
121/// Canonical in-memory policy type (simple & derived)
122#[derive(Debug, Clone, Serialize, Deserialize)]
123pub struct Policy {
124    pub domain: Domain,
125    pub crawl: CrawlConfig,
126    pub scrape: ScrapeConfig,
127}
128
129#[derive(Debug, Clone, Serialize, Deserialize)]
130pub struct LinkOut {
131    pub href: String,
132    pub text: String,
133}
134
135#[derive(Debug, Clone, Serialize, Deserialize)]
136pub struct ImageOut {
137    pub src: String,
138    pub alt: Option<String>,
139}
140
141#[derive(Debug, Clone, Serialize, Deserialize, Default)]
142pub struct AreaContent {
143    pub role: AreaRole,
144    pub root_selector_matched: String,
145    pub title: Option<String>,
146    pub headings: Vec<String>,
147    pub paragraphs: Vec<String>,
148    pub images: Vec<ImageOut>,
149    pub links: Vec<LinkOut>,
150    pub lists: Vec<Vec<String>>,
151    pub tables: Vec<Vec<Vec<String>>>,
152}
153
154#[derive(Debug, Clone, Serialize, Deserialize)]
155pub struct PageExtraction {
156    pub url: String,
157    pub domain: String,
158    pub areas: Vec<AreaContent>,
159    pub json_ld: Vec<serde_json::Value>,
160    pub fetched_at: DateTime<Utc>,
161}
162
163#[derive(Debug, Clone, Serialize, Deserialize)]
164pub struct ExtractionBundle {
165    pub parent: PageExtraction,
166    pub children: Vec<PageExtraction>,
167}
168
169#[derive(Debug, Clone, Serialize, Deserialize)]
170pub struct ApiResponse<T> {
171    pub ok: bool,
172    pub data: Option<T>,
173    pub error: Option<String>,
174}
175impl<T> ApiResponse<T> {
176    pub fn ok(data: T) -> Self {
177        Self {
178            ok: true,
179            data: Some(data),
180            error: None,
181        }
182    }
183    pub fn err(msg: impl Into<String>) -> Self {
184        Self {
185            ok: false,
186            data: None,
187            error: Some(msg.into()),
188        }
189    }
190}