1#![deny(missing_docs)]
2
3#[cfg(feature = "css_selector")]
7pub mod css_selector;
8pub mod error;
9#[cfg(feature = "jsonpath")]
10pub mod jsonpath;
11#[cfg(feature = "xpath")]
12pub mod xpath;
13
14#[cfg(feature = "jsonpath")]
15use std::future::Future;
16
17#[cfg(feature = "css_selector")]
18use crate::css_selector::Html;
19use crate::error::Result;
20#[cfg(feature = "jsonpath")]
21use crate::jsonpath::Json;
22#[cfg(feature = "xpath")]
23use crate::xpath::XHtml;
24use encoding_rs::{Encoding, UTF_8};
25use error::ScraperError;
26use mime::Mime;
27use regex::Regex;
28pub use reqwest::Response;
29#[cfg(feature = "json")]
30use serde::de::DeserializeOwned;
31
32pub use reqwest_scraper_macros::{include_http, FromCssSelector, FromXPath};
33
/// Conversion from a parsed [`XHtml`] document into an extraction result.
///
/// This trait shares its name with the `FromXPath` derive macro re-exported
/// from `reqwest_scraper_macros`. NOTE(review): presumably the derive
/// generates implementations of this trait; the macro source is not part of
/// this file, so confirm there.
#[cfg(feature = "xpath")]
pub trait FromXPath {
    /// Value returned by [`FromXPath::from_xhtml`].
    ///
    /// The concrete type is chosen by each implementation.
    type XPathExtractResult;

    /// Builds the extraction result from `html`, taking ownership of the
    /// document.
    fn from_xhtml(html: XHtml) -> Self::XPathExtractResult;
}
43
/// Conversion from a parsed [`Html`] document into an extraction result.
///
/// This trait shares its name with the `FromCssSelector` derive macro
/// re-exported from `reqwest_scraper_macros`. NOTE(review): presumably the
/// derive generates implementations of this trait; the macro source is not
/// part of this file, so confirm there.
#[cfg(feature = "css_selector")]
pub trait FromCssSelector {
    /// Value returned by [`FromCssSelector::from_html`].
    ///
    /// The concrete type is chosen by each implementation.
    type CssSelectorExtractResult;

    /// Builds the extraction result from `html`, taking ownership of the
    /// document.
    fn from_html(html: Html) -> Self::CssSelectorExtractResult;
}
53
54pub trait ScraperResponse {
56 #[cfg(feature = "jsonpath")]
58 fn jsonpath(self) -> impl Future<Output = Result<Json>>;
59
60 #[cfg(feature = "json")]
63 fn json_with_path_to_err<T: DeserializeOwned>(self) -> impl Future<Output = Result<T>>;
64
65 #[cfg(feature = "css_selector")]
67 fn css_selector(self) -> impl Future<Output = Result<Html>>;
68
69 #[cfg(feature = "xpath")]
71 fn xpath(self) -> impl Future<Output = Result<XHtml>>;
72
73 fn html(self) -> impl Future<Output = Result<String>>;
77}
78
79impl ScraperResponse for Response {
80 #[cfg(feature = "jsonpath")]
81 async fn jsonpath(self) -> Result<Json> {
82 if self.status().is_success() {
83 let json = self.text().await?;
84 Ok(Json::new(json.as_str())?)
85 } else {
86 let url = self.url().to_string();
87 let status_code = self.status().as_u16();
88 let response = self.text().await?;
89 Err(ScraperError::HttpError(url, status_code, response))
90 }
91 }
92
93 #[cfg(feature = "json")]
94 async fn json_with_path_to_err<T: DeserializeOwned>(self) -> Result<T> {
95 let full = self.bytes().await?;
96 let mut deserializer = serde_json::Deserializer::from_slice(&full);
97 Ok(serde_path_to_error::deserialize(&mut deserializer)?)
98 }
99
100 #[cfg(feature = "css_selector")]
101 async fn css_selector(self) -> Result<Html> {
102 if self.status().is_success() {
103 let html_str = self.html().await?;
104 Ok(Html::new(html_str.as_str()))
105 } else {
106 let url = self.url().to_string();
107 let status_code = self.status().as_u16();
108 let response = self.text().await?;
109 Err(ScraperError::HttpError(url, status_code, response))
110 }
111 }
112
113 #[cfg(feature = "xpath")]
114 async fn xpath(self) -> Result<XHtml> {
115 if self.status().is_success() {
116 let html_str = self.html().await?;
117 Ok(XHtml::new(html_str)?)
118 } else {
119 let url = self.url().to_string();
120 let status_code = self.status().as_u16();
121 let response = self.text().await?;
122 Err(ScraperError::HttpError(url, status_code, response))
123 }
124 }
125
126 async fn html(self) -> Result<String> {
127 let content_type = self
128 .headers()
129 .get(reqwest::header::CONTENT_TYPE)
130 .and_then(|value| value.to_str().ok())
131 .and_then(|value| value.parse::<Mime>().ok());
132 let encoding_name = content_type
133 .as_ref()
134 .and_then(|mime| mime.get_param("charset").map(|charset| charset.as_str()));
135
136 let full = self.bytes().await?;
137 match encoding_name {
138 Some(encoding_name) => {
139 let encoding = Encoding::for_label(encoding_name.as_bytes()).unwrap_or(UTF_8);
140 let (text, _, _) = encoding.decode(&full);
141 Ok(text.into_owned())
142 }
143 None => {
144 let (text, _, _) = UTF_8.decode(&full);
145 let meta_charset = extract_charset(&text);
146 if let Some(meta_charset) = meta_charset {
147 let encoding = Encoding::for_label(meta_charset.as_bytes()).unwrap_or(UTF_8);
148 let (text, _, _) = encoding.decode(&full);
149 Ok(text.into_owned())
150 } else {
151 Ok(text.into_owned())
152 }
153 }
154 }
155 }
156}
157
158fn extract_charset(html: &str) -> Option<String> {
159 let re1 = Regex::new(r#"(?i)<meta\s+[^>]*charset=['"]?([a-zA-Z0-9\-_]+)"#).unwrap();
161 if let Some(cap) = re1.captures(html) {
162 return Some(cap[1].to_string());
163 }
164
165 let re2 = Regex::new(r#"(?i)<meta\s+[^>]*http-equiv=['"]?content-type['"]?[^>]*content=['"][^>]*charset=([a-zA-Z0-9\-_]+)"#).unwrap();
169 if let Some(cap) = re2.captures(html) {
170 return Some(cap[1].to_string());
171 }
172
173 None
174}
175
#[cfg(test)]
mod tests {
    use super::extract_charset;

    /// Checks that the charset label is found in each supported `<meta>`
    /// form: double-quoted `charset`, single-quoted `charset`, and
    /// `http-equiv` declarations with the attributes in either order.
    #[test]
    fn test_extract_charset() {
        let cases = [
            (
                r#"
        <!DOCTYPE html>
        <html>
        <head>
            <meta charset="gb2312">
            <title>Example</title>
        </head>
        <body><p>Hello, world!</p></body>
        </html>
        "#,
                "gb2312",
            ),
            (
                r#"
        <!DOCTYPE html>
        <html>
        <head>
            <meta charset='gb2312'>
            <title>Example</title>
        </head>
        <body><p>Hello, world!</p></body>
        </html>
        "#,
                "gb2312",
            ),
            (
                r#"
        <!DOCTYPE html>
        <html>
        <head>
            <meta http-equiv="Content-Type" content="text/html; charset=GB2312">
            <title>Example</title>
        </head>
        <body><p>Hello, world!</p></body>
        </html>
        "#,
                "GB2312",
            ),
            (
                r#"
        <!DOCTYPE html>
        <html>
        <head>
            <meta content='text/html; charset=gbk' http-equiv="Content-Type">
            <title>Example</title>
        </head>
        <body><p>Hello, world!</p></body>
        </html>
        "#,
                "gbk",
            ),
        ];

        for (html, expected) in cases {
            assert_eq!(
                extract_charset(html).as_deref(),
                Some(expected),
                "unexpected charset for document: {html}"
            );
        }
    }
}