1use std::path::PathBuf;
2use std::rc::Rc;
3use std::sync::atomic::{AtomicUsize, Ordering};
4use std::sync::Arc;
5
6use anyhow::{anyhow, bail};
7use sxd_document::dom;
8use texting_robots::Robot;
9use tokio::sync::mpsc;
10
11pub trait Scrapable {
12 type Config: Clone + Send + 'static;
13
14 fn new(config: &Self::Config) -> anyhow::Result<Self>
15 where
16 Self: Sized;
17
18 fn seed(&self) -> Seed;
19
20 fn accept(&self, url: &str, crawling_ctx: CrawlingContext) -> bool;
21
22 fn scrap(&mut self, page: String, scraping_ctx: ScrapingContext) -> anyhow::Result<()>;
23
24 fn finalizer(&mut self) {}
25}
26
/// Starting point returned by `Scrapable::seed`.
///
/// Every variant carries plain strings; how they are interpreted is up to the
/// code consuming the seed.
#[derive(Debug, Clone)]
pub enum Seed {
    /// A list of sitemap locations (presumably URLs — confirm against the consumer).
    Sitemaps(Vec<String>),
    /// A list of page locations (presumably URLs — confirm against the consumer).
    Pages(Vec<String>),
    /// A single `robots.txt` reference (presumably its URL — confirm against the consumer).
    RobotsTxt(String),
}
33
34#[derive(Debug, Clone)]
35pub struct CrawlingContext {
36 sitemap: Sitemap,
37 robot: Option<Arc<Robot>>,
38}
39
40impl CrawlingContext {
41 pub(crate) fn new(sm: Sitemap, robot: Option<Arc<Robot>>) -> Self {
42 Self { sitemap: sm, robot }
43 }
44
45 pub fn sitemap(&self) -> Sitemap {
46 self.sitemap
47 }
48
49 pub fn robot(&self) -> Option<Arc<Robot>> {
50 self.robot.clone()
51 }
52}
53
/// Kind of sitemap document, distinguished by the name of its document element.
#[derive(Debug, Clone, Copy)]
pub enum Sitemap {
    /// Document whose element is named `sitemapindex`.
    Index,
    /// Document whose element is named `urlset`.
    Urlset,
}
59
60impl<'a> TryFrom<dom::Root<'a>> for Sitemap {
61 type Error = anyhow::Error;
62
63 fn try_from(root: dom::Root<'a>) -> Result<Self, Self::Error> {
64 let kind = root.children()[0]
65 .element()
66 .ok_or_else(|| anyhow!("First child of root is not an element"))?
67 .name()
68 .local_part();
69
70 let sm = match kind {
71 "sitemapindex" => Self::Index,
72 "urlset" => Self::Urlset,
73 _ => bail!("Unknown root node kind: {}", kind),
74 };
75
76 Ok(sm)
77 }
78}
79
80#[derive(Debug, Clone)]
81pub struct ScrapingContext {
82 location: Rc<PageLocation>,
83 tx_url: Option<CountedTx>,
84 robot: Option<Arc<Robot>>,
85}
86
87impl ScrapingContext {
88 pub fn with_location(location: PageLocation) -> Self {
89 Self::new(Rc::new(location), None, None)
90 }
91
92 pub(crate) fn new(
93 location: Rc<PageLocation>,
94 tx_url: Option<CountedTx>,
95 robot: Option<Arc<Robot>>,
96 ) -> Self {
97 Self {
98 location,
99 tx_url,
100 robot,
101 }
102 }
103
104 pub fn location(&self) -> Rc<PageLocation> {
105 self.location.clone()
106 }
107
108 pub fn tx_url(&self) -> Option<CountedTx> {
109 self.tx_url.clone()
110 }
111
112 pub fn robot(&self) -> Option<Arc<Robot>> {
113 self.robot.clone()
114 }
115}
116
/// Origin of a page: either a URL string or a filesystem path.
#[derive(Debug, Clone)]
pub enum PageLocation {
    /// The page is identified by a URL, stored as a plain string.
    Url(String),
    /// The page is identified by a filesystem path.
    Path(PathBuf),
}
122
123#[derive(Debug, Clone)]
124pub struct CountedTx {
125 tx: mpsc::UnboundedSender<String>,
126 counter: Arc<AtomicUsize>,
127}
128
129impl CountedTx {
130 pub fn new(tx: mpsc::UnboundedSender<String>, counter: Arc<AtomicUsize>) -> Self {
131 Self { tx, counter }
132 }
133
134 pub fn send(&self, s: String) {
135 match self.tx.send(s) {
136 Ok(()) => {
137 self.counter.fetch_add(1, Ordering::SeqCst);
138 }
139 Err(e) => {
140 log::error!("Couldn't send data: {e}");
141 }
142 }
143 }
144}