progscrape_scrapers/scrapers.rs

//! Public interface for the collection of scrapers.
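//!
//! A hedged sketch of the intended flow; the `fetch` helper and its error
//! shape are assumptions, since the HTTP client lives outside this crate:
//!
//! ```ignore
//! let scrapers = Scrapers::new(&config);
//! for source in ScrapeSource::all() {
//!     let subsources = scrapers.compute_scrape_subsources(*source);
//!     for url in scrapers.compute_scrape_url_demands(*source, subsources) {
//!         // The caller performs the fetch and wraps the outcome.
//!         let input = match fetch(&url) {
//!             Ok(body) => ScraperHttpResponseInput::Ok(body),
//!             Err((status, message)) => ScraperHttpResponseInput::HTTPError(status, message),
//!         };
//!         let result = scrapers.scrape_http_result(*source, input);
//!     }
//! }
//! ```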
use std::collections::HashMap;

use serde::Serialize;

use crate::{backends::scrape, ScrapeConfig, ScrapeSource, TypedScrape};

/// Accumulates the URLs required to scrape for all the services.
#[derive(Serialize)]
pub struct ScraperPossibilities {
    pub scrapes: HashMap<ScrapeSource, Vec<String>>,
}

/// The raw result of fetching a scrape URL: either the response body, or the
/// HTTP status code and error message.
#[derive(Serialize)]
pub enum ScraperHttpResponseInput {
    HTTPError(u16, String),
    Ok(String),
}

/// The outcome of scraping a fetched response: the original body plus the
/// scraped stories, or the failing input plus an error description.
#[derive(Serialize)]
pub enum ScraperHttpResult {
    Err(ScraperHttpResponseInput, String),
    Ok(String, Vec<TypedScrape>),
}

/// Interface to the collection of scrapers in this library.
pub struct Scrapers {
    config: ScrapeConfig,
}

impl Scrapers {
    pub fn new(config: &ScrapeConfig) -> Self {
        Self {
            config: config.clone(),
        }
    }

    /// Compute the list of all possible scrapes from all sources and subsources.
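    ///
    /// A minimal usage sketch, assuming a `ScrapeConfig` has already been
    /// loaded elsewhere (the `config` variable and the `Debug` output on
    /// `ScrapeSource` are assumptions):
    ///
    /// ```ignore
    /// let scrapers = Scrapers::new(&config);
    /// let possibilities = scrapers.compute_scrape_possibilities();
    /// for (source, subsources) in &possibilities.scrapes {
    ///     println!("{source:?}: {} subsources", subsources.len());
    /// }
    /// ```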
    pub fn compute_scrape_possibilities(&self) -> ScraperPossibilities {
        let mut scrapes = HashMap::new();
        for source in ScrapeSource::all() {
            if let Some(config) = self.config.get(*source) {
                let subsources = config.subsources();
                scrapes.insert(*source, subsources);
            }
        }
        ScraperPossibilities { scrapes }
    }

    /// Compute the list of subsources available for a single source.
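    ///
    /// A hedged sketch; `ScrapeSource::Reddit` is used for illustration and
    /// the result depends entirely on the loaded configuration:
    ///
    /// ```ignore
    /// let subreddits = scrapers.compute_scrape_subsources(ScrapeSource::Reddit);
    /// println!("{} subreddits configured", subreddits.len());
    /// ```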
    pub fn compute_scrape_subsources(&self, source: ScrapeSource) -> Vec<String> {
        self.config
            .get(source)
            .map(|config| config.subsources())
            .unwrap_or_default()
    }

    /// Given a source and subsources, compute the set of URLs to fetch.
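    ///
    /// A hedged sketch; the subsource names are illustrative rather than
    /// taken from a real configuration:
    ///
    /// ```ignore
    /// let urls = scrapers.compute_scrape_url_demands(
    ///     ScrapeSource::Reddit,
    ///     vec!["rust".to_string(), "programming".to_string()],
    /// );
    /// // Each returned URL is ready for the caller's HTTP client to fetch.
    /// for url in &urls {
    ///     println!("{url}");
    /// }
    /// ```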
    pub fn compute_scrape_url_demands(
        &self,
        source: ScrapeSource,
        subsources: Vec<String>,
    ) -> Vec<String> {
        self.config
            .get(source)
            .map(|config| config.provide_urls(subsources))
            .unwrap_or_default()
    }

    /// Given the result of fetching a URL, returns the scraped stories.
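    ///
    /// A hedged sketch of the round trip; the HTTP fetch itself is left to
    /// the caller, and `body` stands in for a fetched response body:
    ///
    /// ```ignore
    /// let input = ScraperHttpResponseInput::Ok(body);
    /// match scrapers.scrape_http_result(ScrapeSource::HackerNews, input) {
    ///     ScraperHttpResult::Ok(_raw, stories) => println!("{} stories", stories.len()),
    ///     ScraperHttpResult::Err(_input, message) => eprintln!("scrape failed: {message}"),
    /// }
    /// ```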
    pub fn scrape_http_result(
        &self,
        source: ScrapeSource,
        input: ScraperHttpResponseInput,
    ) -> ScraperHttpResult {
        match input {
            ScraperHttpResponseInput::Ok(s) => match scrape(&self.config, source, &s) {
                Ok((scrapes, _warnings)) => ScraperHttpResult::Ok(s, scrapes),
                Err(e) => {
                    ScraperHttpResult::Err(ScraperHttpResponseInput::Ok(s), format!("{e:?}"))
                }
            },
            error @ ScraperHttpResponseInput::HTTPError(..) => {
                ScraperHttpResult::Err(error, "HTTP Error".to_string())
            }
        }
    }
}