reqwest_scraper/
lib.rs

1#![deny(missing_docs)]
2
3//! reqwest-scraper
4//#![doc = include_str!("../README.md")]
5
6#[cfg(feature = "css_selector")]
7pub mod css_selector;
8pub mod error;
9#[cfg(feature = "jsonpath")]
10pub mod jsonpath;
11#[cfg(feature = "xpath")]
12pub mod xpath;
13
14#[cfg(feature = "jsonpath")]
15use std::future::Future;
16
17#[cfg(feature = "css_selector")]
18use crate::css_selector::Html;
19use crate::error::Result;
20#[cfg(feature = "jsonpath")]
21use crate::jsonpath::Json;
22#[cfg(feature = "xpath")]
23use crate::xpath::XHtml;
24use encoding_rs::{Encoding, UTF_8};
25use error::ScraperError;
26use mime::Mime;
27use regex::Regex;
28pub use reqwest::Response;
29#[cfg(feature = "json")]
30use serde::de::DeserializeOwned;
31
32pub use reqwest_scraper_macros::{include_http, FromCssSelector, FromXPath};
33
/// Use XPath to extract the HTML response body into the derived struct.
///
/// Implementors choose what an extraction yields through
/// `XPathExtractResult`, so the trait itself does not dictate whether
/// extraction is fallible.
#[cfg(feature = "xpath")]
pub trait FromXPath {
    /// The value produced by an XPath extraction; chosen by the implementor.
    type XPathExtractResult;

    /// Builds the extraction result from an [`XHtml`] value, which is consumed.
    fn from_xhtml(html: XHtml) -> Self::XPathExtractResult;
}
43
/// Use CSS selectors to extract the HTML response body into the derived struct.
///
/// Implementors choose what an extraction yields through
/// `CssSelectorExtractResult`, so the trait itself does not dictate whether
/// extraction is fallible.
#[cfg(feature = "css_selector")]
pub trait FromCssSelector {
    /// The value produced by a CSS-selector extraction; chosen by the implementor.
    type CssSelectorExtractResult;

    /// Builds the extraction result from an [`Html`] value, which is consumed.
    fn from_html(html: Html) -> Self::CssSelectorExtractResult;
}
53
/// Extension trait that adds JSONPath, CSS-selector and XPath helpers to an
/// HTTP response.
///
/// Every method takes `self` by value, so the response is consumed by the
/// call. Which methods exist depends on the enabled cargo features
/// (`jsonpath`, `json`, `css_selector`, `xpath`); [`ScraperResponse::html`]
/// is always available.
pub trait ScraperResponse {
    /// Use jsonpath to select the response body.
    ///
    /// Resolves to a [`Json`] value built from the response body.
    #[cfg(feature = "jsonpath")]
    fn jsonpath(self) -> impl Future<Output = Result<Json>>;

    /// Deserializes the body into `T`. Works with any existing Serde
    /// `Deserializer` and exposes the chain of field names leading to the error.
    ///
    /// * <https://crates.io/crates/serde_path_to_error>
    #[cfg(feature = "json")]
    fn json_with_path_to_err<T: DeserializeOwned>(self) -> impl Future<Output = Result<T>>;

    /// Use CSS selector to select the response body.
    ///
    /// Resolves to an [`Html`] value built from the decoded response body.
    #[cfg(feature = "css_selector")]
    fn css_selector(self) -> impl Future<Output = Result<Html>>;

    /// Use XPath to select the response body.
    ///
    /// Resolves to an [`XHtml`] value built from the decoded response body.
    #[cfg(feature = "xpath")]
    fn xpath(self) -> impl Future<Output = Result<XHtml>>;

    /// Reads the response body as text.
    ///
    /// If the `Content-Type` response header does not declare a charset,
    /// try to read the meta information in the HTML to obtain the encoding,
    /// e.g. `<meta charset="gb2312">`.
    fn html(self) -> impl Future<Output = Result<String>>;
}
78
79impl ScraperResponse for Response {
80    #[cfg(feature = "jsonpath")]
81    async fn jsonpath(self) -> Result<Json> {
82        if self.status().is_success() {
83            let json = self.text().await?;
84            Ok(Json::new(json.as_str())?)
85        } else {
86            let url = self.url().to_string();
87            let status_code = self.status().as_u16();
88            let response = self.text().await?;
89            Err(ScraperError::HttpError(url, status_code, response))
90        }
91    }
92
93    #[cfg(feature = "json")]
94    async fn json_with_path_to_err<T: DeserializeOwned>(self) -> Result<T> {
95        let full = self.bytes().await?;
96        let mut deserializer = serde_json::Deserializer::from_slice(&full);
97        Ok(serde_path_to_error::deserialize(&mut deserializer)?)
98    }
99
100    #[cfg(feature = "css_selector")]
101    async fn css_selector(self) -> Result<Html> {
102        if self.status().is_success() {
103            let html_str = self.html().await?;
104            Ok(Html::new(html_str.as_str()))
105        } else {
106            let url = self.url().to_string();
107            let status_code = self.status().as_u16();
108            let response = self.text().await?;
109            Err(ScraperError::HttpError(url, status_code, response))
110        }
111    }
112
113    #[cfg(feature = "xpath")]
114    async fn xpath(self) -> Result<XHtml> {
115        if self.status().is_success() {
116            let html_str = self.html().await?;
117            Ok(XHtml::new(html_str)?)
118        } else {
119            let url = self.url().to_string();
120            let status_code = self.status().as_u16();
121            let response = self.text().await?;
122            Err(ScraperError::HttpError(url, status_code, response))
123        }
124    }
125
126    async fn html(self) -> Result<String> {
127        let content_type = self
128            .headers()
129            .get(reqwest::header::CONTENT_TYPE)
130            .and_then(|value| value.to_str().ok())
131            .and_then(|value| value.parse::<Mime>().ok());
132        let encoding_name = content_type
133            .as_ref()
134            .and_then(|mime| mime.get_param("charset").map(|charset| charset.as_str()));
135
136        let full = self.bytes().await?;
137        match encoding_name {
138            Some(encoding_name) => {
139                let encoding = Encoding::for_label(encoding_name.as_bytes()).unwrap_or(UTF_8);
140                let (text, _, _) = encoding.decode(&full);
141                Ok(text.into_owned())
142            }
143            None => {
144                let (text, _, _) = UTF_8.decode(&full);
145                let meta_charset = extract_charset(&text);
146                if let Some(meta_charset) = meta_charset {
147                    let encoding = Encoding::for_label(meta_charset.as_bytes()).unwrap_or(UTF_8);
148                    let (text, _, _) = encoding.decode(&full);
149                    Ok(text.into_owned())
150                } else {
151                    Ok(text.into_owned())
152                }
153            }
154        }
155    }
156}
157
158fn extract_charset(html: &str) -> Option<String> {
159    // 优先匹配 <meta charset="UTF-8"> 或 <meta charset='UTF-8'>
160    let re1 = Regex::new(r#"(?i)<meta\s+[^>]*charset=['"]?([a-zA-Z0-9\-_]+)"#).unwrap();
161    if let Some(cap) = re1.captures(html) {
162        return Some(cap[1].to_string());
163    }
164
165    // 兼容 HTML4/XHTML 写法
166    // 匹配 <meta http-equiv="Content-Type" content="text/html; charset=GB2312">
167    // 支持属性乱序、单双引号
168    let re2 = Regex::new(r#"(?i)<meta\s+[^>]*http-equiv=['"]?content-type['"]?[^>]*content=['"][^>]*charset=([a-zA-Z0-9\-_]+)"#).unwrap();
169    if let Some(cap) = re2.captures(html) {
170        return Some(cap[1].to_string());
171    }
172
173    None
174}
175
#[cfg(test)]
mod tests {
    use super::extract_charset;

    /// Wraps a `<head>` fragment in a minimal HTML document so every case
    /// is exercised inside realistic surrounding markup.
    fn page(head: &str) -> String {
        format!(
            "\n<!DOCTYPE html>\n<html>\n<head>\n    {head}\n    <title>Example</title>\n</head>\n<body><p>Hello, world!</p></body>\n</html>\n"
        )
    }

    /// Each supported `<meta>` declaration style yields the label as written.
    #[test]
    fn test_extract_charset() {
        let cases = [
            // HTML5 form: double-quoted, single-quoted and unquoted values.
            (r#"<meta charset="gb2312">"#, "gb2312"),
            (r#"<meta charset='gb2312'>"#, "gb2312"),
            (r#"<meta charset=gb2312>"#, "gb2312"),
            // HTML4/XHTML form; the label's case is preserved.
            (
                r#"<meta http-equiv="Content-Type" content="text/html; charset=GB2312">"#,
                "GB2312",
            ),
            // Same form with the attributes in the opposite order.
            (
                r#"<meta content='text/html; charset=gbk' http-equiv="Content-Type">"#,
                "gbk",
            ),
        ];

        for (meta, expected) in cases {
            let cs = extract_charset(&page(meta));
            assert_eq!(cs.as_deref(), Some(expected), "meta tag: {meta}");
        }
    }

    /// Documents without any charset declaration produce `None`.
    #[test]
    fn test_extract_charset_absent() {
        assert_eq!(extract_charset(&page("")), None);
        assert_eq!(extract_charset(""), None);
    }
}