// acick_util/service/scrape.rs

1use std::str::FromStr;
2use std::time::Duration;
3
4use reqwest::blocking::Client;
5use reqwest::{StatusCode, Url};
6use scraper::{ElementRef, Html, Selector};
7
8use crate::abs_path::AbsPathBuf;
9use crate::service::session::WithRetry as _;
10use crate::{Console, Result};
11
/// Parses normal (hankaku) digits or zenkaku (fullwidth) digits.
///
/// `s` is first parsed as-is. Only when that fails and every character of `s`
/// is a zenkaku digit (`０`..=`９`, i.e. U+FF10..=U+FF19) are the digits
/// converted to their hankaku counterparts and parsed a second time.
/// A string that mixes hankaku and zenkaku digits is rejected.
///
/// # Errors
///
/// Returns the error of the first (as-is) parse when `s` contains any
/// character that is not a zenkaku digit. When `s` consists solely of zenkaku
/// digits, returns the error of parsing the converted string (e.g. overflow).
///
/// # Examples
///
/// ```
/// use acick_util::service::scrape::parse_zenkaku_digits;
///
/// // success
/// assert_eq!(parse_zenkaku_digits::<i32>("0123"), Ok(123));
/// // "０１２３" (all zenkaku), written with escapes
/// assert_eq!(parse_zenkaku_digits::<i32>("\u{FF10}\u{FF11}\u{FF12}\u{FF13}"), Ok(123));
///
/// // failure
/// assert!(parse_zenkaku_digits::<i32>("01x23").is_err());
/// assert!(parse_zenkaku_digits::<i32>("01あ23").is_err());
/// // "０1２3" (zenkaku and hankaku mixed), written with escapes
/// assert!(parse_zenkaku_digits::<i32>("\u{FF10}1\u{FF12}3").is_err());
/// ```
pub fn parse_zenkaku_digits<T: FromStr>(s: &str) -> std::result::Result<T, T::Err> {
    // Fullwidth digit zero `０`. Written as an escape so that Unicode
    // normalization of this file cannot silently turn it into ASCII `0`.
    const ZENKAKU_ZERO: char = '\u{FF10}';
    // Fullwidth digit nine `９`.
    const ZENKAKU_NINE: char = '\u{FF19}';

    s.parse().or_else(|err| {
        // Convert every char to its hankaku digit; `None` as soon as one char
        // is not a zenkaku digit, which makes the whole collection `None`.
        let hankaku: Option<String> = s
            .chars()
            .map(|c| match c {
                ZENKAKU_ZERO..=ZENKAKU_NINE => {
                    char::from_digit(u32::from(c) - u32::from(ZENKAKU_ZERO), 10)
                }
                _ => None,
            })
            .collect();

        match hankaku {
            Some(digits) => digits.parse(),
            // Not a pure zenkaku string: report the original parse failure.
            None => Err(err),
        }
    })
}
40
41pub trait GetHtml {
42    /// Returns a url from which we get html.
43    fn url(&self) -> Result<Url>;
44
45    /// Request html with http GET method.
46    fn get_html(
47        &self,
48        client: &Client,
49        cookies_path: &AbsPathBuf,
50        retry_limit: usize,
51        retry_interval: Duration,
52        cnsl: &mut Console,
53    ) -> Result<(StatusCode, Html)> {
54        let res = client
55            .get(self.url()?)
56            .with_retry(client, cookies_path, retry_limit, retry_interval)
57            .retry_send(cnsl)?;
58        let status = res.status();
59        let html = res.text().map(|text| Html::parse_document(&text))?;
60        Ok((status, html))
61    }
62}
63
64pub trait Scrape {
65    /// Gets the underlying element
66    fn elem(&self) -> ElementRef;
67
68    /// Finds first element that matches `selector`.
69    ///
70    /// Returns `None` if no matches are found.
71    fn find_first(&self, selector: &Selector) -> Option<ElementRef> {
72        self.elem().select(selector).next()
73    }
74
75    /// Gets texts inside the underlying element as `String`.
76    fn inner_text(&self) -> String {
77        self.elem().text().collect()
78    }
79}
80
81impl Scrape for ElementRef<'_> {
82    fn elem(&self) -> ElementRef {
83        *self
84    }
85}
86
#[cfg(test)]
mod tests {
    use reqwest::redirect::Policy;
    use scraper::Selector;
    use tempfile::tempdir;

    use crate::assert_matches;
    use crate::console::ConsoleConfig;

    use super::*;

    /// Builds a blocking client that does not follow redirects, so a test can
    /// inspect the redirect response itself.
    fn client() -> Client {
        Client::builder()
            .redirect(Policy::none()) // redirects manually
            .build()
            .unwrap()
    }

    #[test]
    fn test_parse_zenkaku_digits() -> anyhow::Result<()> {
        // Zenkaku digits are written as `\u{FF10}`..`\u{FF19}` escapes so that
        // Unicode normalization of this file cannot turn them into ASCII.

        // hankaku only
        assert_eq!(parse_zenkaku_digits::<i32>("0123"), Ok(123));
        // zenkaku only ("０１２３")
        assert_eq!(parse_zenkaku_digits::<i32>("\u{FF10}\u{FF11}\u{FF12}\u{FF13}"), Ok(123));
        // non-digit characters
        assert_matches!(parse_zenkaku_digits::<i32>("01x23") => Err(_));
        assert_matches!(parse_zenkaku_digits::<i32>("01あ23") => Err(_));
        // zenkaku and hankaku mixed ("０1２3")
        assert_matches!(parse_zenkaku_digits::<i32>("\u{FF10}1\u{FF12}3") => Err(_));
        Ok(())
    }

    // NOTE: this test sends a real HTTP request to google.com.
    #[test]
    fn test_get_html() -> anyhow::Result<()> {
        struct GoogleComPageBuilder {}
        impl GetHtml for GoogleComPageBuilder {
            fn url(&self) -> Result<Url> {
                Ok(Url::parse("http://google.com")?)
            }
        }

        let builder = GoogleComPageBuilder {};
        let test_dir = tempdir()?;
        let cookies_path = AbsPathBuf::try_new(&test_dir)?.join("cookies.json");
        let cnsl = &mut Console::sink(ConsoleConfig::default());
        let (actual_status, actual_html) =
            builder.get_html(&client(), &cookies_path, 4, Duration::from_secs(2), cnsl)?;

        // The client does not follow redirects, so the redirect page itself
        // is what gets parsed.
        let expected_status = StatusCode::from_u16(301).unwrap();
        let expected_html = Html::parse_document(
            r#"<HTML><HEAD><meta http-equiv="content-type" content="text/html;charset=utf-8">
<TITLE>301 Moved</TITLE></HEAD><BODY>
<H1>301 Moved</H1>
The document has moved
<A HREF="http://www.google.com/">here</A>.
</BODY></HTML>
"#,
        );

        assert_eq!(actual_status, expected_status);
        assert_eq!(actual_html, expected_html);
        Ok(())
    }

    #[test]
    fn test_find_first() -> anyhow::Result<()> {
        // (input fragment, html of the expected first `ul > li` match)
        let tests = &[
            (
                Html::parse_fragment("<ul><li>Foo</li><li>Bar</li><li>Baz</li></ul>"),
                Some(String::from("<li>Foo</li>")),
            ),
            (Html::parse_fragment("<ul></ul>"), None),
        ];

        for (left, right) in tests {
            let elem = left.root_element();
            let actual = &elem
                .find_first(&Selector::parse("ul > li").unwrap())
                .map(|elem| elem.html());
            let expected = right;
            assert_eq!(actual, expected);
        }
        Ok(())
    }

    #[test]
    fn test_inner_text() -> anyhow::Result<()> {
        // (input fragment, expected concatenated text)
        let tests = &[
            (
                Html::parse_fragment("<ul><li>Foo</li><li>Bar</li><li>Baz</li></ul>"),
                "FooBarBaz",
            ),
            (Html::parse_fragment("<div></div>"), ""),
        ];

        for (left, right) in tests {
            let actual = left.root_element().inner_text();
            let expected = *right;
            assert_eq!(actual, expected);
        }
        Ok(())
    }
}