acick_util/service/
scrape.rs1use std::str::FromStr;
2use std::time::Duration;
3
4use reqwest::blocking::Client;
5use reqwest::{StatusCode, Url};
6use scraper::{ElementRef, Html, Selector};
7
8use crate::abs_path::AbsPathBuf;
9use crate::service::session::WithRetry as _;
10use crate::{Console, Result};
11
/// Parses `s` as a `T`, accepting either a form `T` already understands or a
/// string made up solely of full-width ("zenkaku") digits, U+FF10 through
/// U+FF19.
///
/// `s` is first handed to [`FromStr`] unchanged. Only when that attempt fails
/// *and* every character of `s` is a full-width digit is the string
/// transliterated to ASCII digits and parsed a second time.
///
/// # Errors
///
/// * If `s` contains anything other than full-width digits (including a mix
///   of full-width and ASCII digits), the error from the first parse attempt
///   is returned.
/// * If `s` consists only of full-width digits but the transliterated string
///   still does not parse (for example on overflow, or when `s` is empty),
///   the error from the second attempt is returned.
pub fn parse_zenkaku_digits<T: FromStr>(s: &str) -> std::result::Result<T, T::Err> {
    // Written as escapes on purpose: the literal full-width characters are
    // easily folded into ASCII digits by Unicode normalization, which would
    // silently turn the fallback below into a no-op.
    const ZENKAKU_ZERO: char = '\u{FF10}';
    const ZENKAKU_NINE: char = '\u{FF19}';

    s.parse().or_else(|err| {
        // Collecting into `Option<String>` yields `None` as soon as a single
        // character is not a full-width digit.
        let hankaku: Option<String> = s
            .chars()
            .map(|c| match c {
                ZENKAKU_ZERO..=ZENKAKU_NINE => {
                    char::from_digit(u32::from(c) - u32::from(ZENKAKU_ZERO), 10)
                }
                _ => None,
            })
            .collect();

        match hankaku {
            Some(digits) => digits.parse(),
            None => Err(err),
        }
    })
}
40
41pub trait GetHtml {
42 fn url(&self) -> Result<Url>;
44
45 fn get_html(
47 &self,
48 client: &Client,
49 cookies_path: &AbsPathBuf,
50 retry_limit: usize,
51 retry_interval: Duration,
52 cnsl: &mut Console,
53 ) -> Result<(StatusCode, Html)> {
54 let res = client
55 .get(self.url()?)
56 .with_retry(client, cookies_path, retry_limit, retry_interval)
57 .retry_send(cnsl)?;
58 let status = res.status();
59 let html = res.text().map(|text| Html::parse_document(&text))?;
60 Ok((status, html))
61 }
62}
63
64pub trait Scrape {
65 fn elem(&self) -> ElementRef;
67
68 fn find_first(&self, selector: &Selector) -> Option<ElementRef> {
72 self.elem().select(selector).next()
73 }
74
75 fn inner_text(&self) -> String {
77 self.elem().text().collect()
78 }
79}
80
81impl Scrape for ElementRef<'_> {
82 fn elem(&self) -> ElementRef {
83 *self
84 }
85}
86
#[cfg(test)]
mod tests {
    use reqwest::redirect::Policy;
    use scraper::Selector;
    use tempfile::tempdir;

    use crate::assert_matches;
    use crate::console::ConsoleConfig;

    use super::*;

    /// Builds a blocking client that never follows redirects, so a test can
    /// observe the redirect response itself.
    fn client() -> Client {
        Client::builder()
            .redirect(Policy::none())
            .build()
            .unwrap()
    }

    /// Covers ASCII digits, full-width digits, and the rejected forms.
    ///
    /// Full-width characters are spelled as `\u{...}` escapes so that Unicode
    /// normalization of this file cannot fold them into ASCII digits and make
    /// the cases indistinguishable from one another.
    #[test]
    fn test_parse_zenkaku_digits() -> anyhow::Result<()> {
        // Plain ASCII digits.
        assert_eq!(parse_zenkaku_digits::<i32>("0123"), Ok(123));
        // Full-width digits only.
        assert_eq!(
            parse_zenkaku_digits::<i32>("\u{FF10}\u{FF11}\u{FF12}\u{FF13}"),
            Ok(123)
        );
        // ASCII digits with a stray letter.
        assert_matches!(parse_zenkaku_digits::<i32>("01x23") => Err(_));
        // Full-width digits with a stray hiragana character (U+3042).
        assert_matches!(
            parse_zenkaku_digits::<i32>("\u{FF10}\u{FF11}\u{3042}\u{FF12}\u{FF13}") => Err(_)
        );
        // A mix of full-width and ASCII digits is rejected.
        assert_matches!(parse_zenkaku_digits::<i32>("\u{FF10}\u{FF11}23") => Err(_));
        Ok(())
    }

    /// Fetches `http://google.com` without following redirects.
    ///
    /// NOTE: this test needs network access and depends on the exact body of
    /// the 301 response served by that host.
    #[test]
    fn test_get_html() -> anyhow::Result<()> {
        struct GoogleComPageBuilder {}
        impl GetHtml for GoogleComPageBuilder {
            fn url(&self) -> Result<Url> {
                Ok(Url::parse("http://google.com")?)
            }
        }

        let builder = GoogleComPageBuilder {};
        let test_dir = tempdir()?;
        let cookies_path = AbsPathBuf::try_new(&test_dir)?.join("cookies.json");
        let cnsl = &mut Console::sink(ConsoleConfig::default());
        let (actual_status, actual_html) =
            builder.get_html(&client(), &cookies_path, 4, Duration::from_secs(2), cnsl)?;

        let expected_status = StatusCode::from_u16(301).unwrap();
        let expected_html = Html::parse_document(
            r#"<HTML><HEAD><meta http-equiv="content-type" content="text/html;charset=utf-8">
<TITLE>301 Moved</TITLE></HEAD><BODY>
<H1>301 Moved</H1>
The document has moved
<A HREF="http://www.google.com/">here</A>.
</BODY></HTML>
"#,
        );

        assert_eq!(actual_status, expected_status);
        assert_eq!(actual_html, expected_html);
        Ok(())
    }

    /// `find_first` returns the first match, or `None` when nothing matches.
    #[test]
    fn test_find_first() -> anyhow::Result<()> {
        let tests = &[
            (
                Html::parse_fragment("<ul><li>Foo</li><li>Bar</li><li>Baz</li></ul>"),
                Some(String::from("<li>Foo</li>")),
            ),
            (Html::parse_fragment("<ul></ul>"), None),
        ];

        // The selector is the same for every case, so parse it only once.
        let selector = Selector::parse("ul > li").unwrap();

        for (left, right) in tests {
            let elem = left.root_element();
            let actual = &elem.find_first(&selector).map(|elem| elem.html());
            let expected = right;
            assert_eq!(actual, expected);
        }
        Ok(())
    }

    /// `inner_text` concatenates all text fragments without a separator.
    #[test]
    fn test_inner_text() -> anyhow::Result<()> {
        let tests = &[
            (
                Html::parse_fragment("<ul><li>Foo</li><li>Bar</li><li>Baz</li></ul>"),
                "FooBarBaz",
            ),
            (Html::parse_fragment("<div></div>"), ""),
        ];

        for (left, right) in tests {
            let actual = left.root_element().inner_text();
            let expected = *right;
            assert_eq!(actual, expected);
        }
        Ok(())
    }
}