//! sws_crawler/scrapable.rs

1use std::path::PathBuf;
2use std::rc::Rc;
3use std::sync::atomic::{AtomicUsize, Ordering};
4use std::sync::Arc;
5
6use anyhow::{anyhow, bail};
7use sxd_document::dom;
8use texting_robots::Robot;
9use tokio::sync::mpsc;
10
/// Contract implemented by user-defined scrapers driven by the crawler.
pub trait Scrapable {
    /// Scraper-specific configuration; `Clone + Send + 'static` so the
    /// crawler can hand an owned copy to each worker.
    type Config: Clone + Send + 'static;

    /// Builds a scraper instance from its configuration.
    fn new(config: &Self::Config) -> anyhow::Result<Self>
    where
        Self: Sized;

    /// Returns the crawl's starting point (sitemaps, pages, or a robots.txt).
    fn seed(&self) -> Seed;

    /// Decides whether `url` should be crawled, given the crawling context.
    fn accept(&self, url: &str, crawling_ctx: CrawlingContext) -> bool;

    /// Processes one downloaded page body; an `Err` reports a scraping failure
    /// for that page to the caller.
    fn scrap(&mut self, page: String, scraping_ctx: ScrapingContext) -> anyhow::Result<()>;

    /// Hook invoked when scraping is done; the default is a no-op.
    fn finalizer(&mut self) {}
}
26
/// Where a crawl starts.
#[derive(Debug, Clone)]
pub enum Seed {
    /// Explicit list of sitemap URLs.
    Sitemaps(Vec<String>),
    /// Explicit list of page URLs.
    Pages(Vec<String>),
    /// URL of a robots.txt to use as the seed.
    RobotsTxt(String),
}
33
/// Context handed to [`Scrapable::accept`] when deciding whether to crawl a URL.
#[derive(Debug, Clone)]
pub struct CrawlingContext {
    // Kind of sitemap document the candidate URL came from.
    sitemap: Sitemap,
    // Parsed robots.txt, shared across threads when one was fetched.
    robot: Option<Arc<Robot>>,
}
39
40impl CrawlingContext {
41    pub(crate) fn new(sm: Sitemap, robot: Option<Arc<Robot>>) -> Self {
42        Self { sitemap: sm, robot }
43    }
44
45    pub fn sitemap(&self) -> Sitemap {
46        self.sitemap
47    }
48
49    pub fn robot(&self) -> Option<Arc<Robot>> {
50        self.robot.clone()
51    }
52}
53
/// The two root element kinds a sitemap XML document can have.
#[derive(Debug, Clone, Copy)]
pub enum Sitemap {
    /// A `<sitemapindex>` document, listing other sitemaps.
    Index,
    /// A `<urlset>` document, listing page URLs.
    Urlset,
}
59
60impl<'a> TryFrom<dom::Root<'a>> for Sitemap {
61    type Error = anyhow::Error;
62
63    fn try_from(root: dom::Root<'a>) -> Result<Self, Self::Error> {
64        let kind = root.children()[0]
65            .element()
66            .ok_or_else(|| anyhow!("First child of root is not an element"))?
67            .name()
68            .local_part();
69
70        let sm = match kind {
71            "sitemapindex" => Self::Index,
72            "urlset" => Self::Urlset,
73            _ => bail!("Unknown root node kind: {}", kind),
74        };
75
76        Ok(sm)
77    }
78}
79
/// Context handed to [`Scrapable::scrap`] along with a page's contents.
#[derive(Debug, Clone)]
pub struct ScrapingContext {
    // Where the page being scraped came from (remote URL or local path).
    location: Rc<PageLocation>,
    // Counted channel used to submit newly found URLs back to the crawler,
    // when crawling is active.
    tx_url: Option<CountedTx>,
    // Parsed robots.txt, shared across threads when one was fetched.
    robot: Option<Arc<Robot>>,
}
86
87impl ScrapingContext {
88    pub fn with_location(location: PageLocation) -> Self {
89        Self::new(Rc::new(location), None, None)
90    }
91
92    pub(crate) fn new(
93        location: Rc<PageLocation>,
94        tx_url: Option<CountedTx>,
95        robot: Option<Arc<Robot>>,
96    ) -> Self {
97        Self {
98            location,
99            tx_url,
100            robot,
101        }
102    }
103
104    pub fn location(&self) -> Rc<PageLocation> {
105        self.location.clone()
106    }
107
108    pub fn tx_url(&self) -> Option<CountedTx> {
109        self.tx_url.clone()
110    }
111
112    pub fn robot(&self) -> Option<Arc<Robot>> {
113        self.robot.clone()
114    }
115}
116
/// Origin of a scraped page.
#[derive(Debug, Clone)]
pub enum PageLocation {
    /// Page fetched from a remote URL.
    Url(String),
    /// Page read from a local file.
    Path(PathBuf),
}
122
/// An unbounded string sender that counts how many messages were
/// successfully sent.
#[derive(Debug, Clone)]
pub struct CountedTx {
    tx: mpsc::UnboundedSender<String>,
    // Incremented once per successful send; shared so other tasks can
    // observe the total.
    counter: Arc<AtomicUsize>,
}
128
129impl CountedTx {
130    pub fn new(tx: mpsc::UnboundedSender<String>, counter: Arc<AtomicUsize>) -> Self {
131        Self { tx, counter }
132    }
133
134    pub fn send(&self, s: String) {
135        match self.tx.send(s) {
136            Ok(()) => {
137                self.counter.fetch_add(1, Ordering::SeqCst);
138            }
139            Err(e) => {
140                log::error!("Couldn't send data: {e}");
141            }
142        }
143    }
144}