//! Helpers for turning [`reqwest::Response`] bodies into text or queryable documents.
//!
//! The central item is the [`ScraperResponse`] extension trait, which this file
//! implements for [`reqwest::Response`]. Which document types are available depends
//! on the enabled cargo features: `css_selector`, `jsonpath` and `xpath`.
#![deny(missing_docs)]
/// CSS-selector support; defines the `Html` document type used by this file.
#[cfg(feature = "css_selector")]
pub mod css_selector;
/// Error handling; defines the `Result` and `ScraperError` types used by this file.
pub mod error;
/// JSONPath support; defines the `Json` document type used by this file.
#[cfg(feature = "jsonpath")]
pub mod jsonpath;
/// XPath support; defines the `XHtml` document type used by this file.
#[cfg(feature = "xpath")]
pub mod xpath;
#[cfg(feature = "css_selector")]
use crate::css_selector::Html;
use crate::error::Result;
#[cfg(feature = "jsonpath")]
use crate::jsonpath::Json;
#[cfg(feature = "xpath")]
use crate::xpath::XHtml;
use async_trait::async_trait;
use encoding_rs::{Encoding, UTF_8};
use error::ScraperError;
use mime::Mime;
// Re-exported so callers can name the response type without depending on `reqwest` directly.
pub use reqwest::Response;
// Items re-exported from the companion `reqwest_scraper_macros` crate.
pub use reqwest_scraper_macros::{FromCssSelector, FromXPath, include_http};
/// Extraction of data from an `XHtml` document.
///
/// NOTE(review): `reqwest_scraper_macros` re-exports an item with the same name,
/// presumably the derive macro that generates implementations of this trait — confirm.
#[cfg(feature = "xpath")]
pub trait FromXPath {
/// Type produced by [`FromXPath::from_xhtml`].
type XPathExtractResult;
/// Consumes `html` and returns the extracted [`FromXPath::XPathExtractResult`].
fn from_xhtml(html: XHtml) -> Self::XPathExtractResult;
}
/// Extraction of data from an `Html` document.
///
/// NOTE(review): `reqwest_scraper_macros` re-exports an item with the same name,
/// presumably the derive macro that generates implementations of this trait — confirm.
#[cfg(feature = "css_selector")]
pub trait FromCssSelector {
/// Type produced by [`FromCssSelector::from_html`].
type CssSelectorExtractResult;
/// Consumes `html` and returns the extracted [`FromCssSelector::CssSelectorExtractResult`].
fn from_html(html: Html) -> Self::CssSelectorExtractResult;
}
/// Extension methods that consume an HTTP response and turn its body into text
/// or into one of the feature-gated document types.
///
/// This file implements the trait for [`Response`]; that implementation reports
/// non-success HTTP statuses as `ScraperError::HttpError` from every method
/// except [`ScraperResponse::html`], which does not inspect the status.
#[async_trait]
pub trait ScraperResponse {
/// Consumes the response and returns its body as a `Json` document.
#[cfg(feature = "jsonpath")]
async fn jsonpath(self) -> Result<Json>;
/// Consumes the response and returns its body as an `Html` document.
#[cfg(feature = "css_selector")]
async fn css_selector(self) -> Result<Html>;
/// Consumes the response and returns its body as an `XHtml` document.
#[cfg(feature = "xpath")]
async fn xpath(self) -> Result<XHtml>;
/// Consumes the response and returns its body decoded to a `String`.
async fn html(self) -> Result<String>;
}
/// [`ScraperResponse`] for `reqwest` responses.
///
/// The feature-gated methods (`jsonpath`, `css_selector`, `xpath`) only build a
/// document from responses whose status is a success; any other status is
/// returned as [`ScraperError::HttpError`] carrying the URL, the numeric status
/// code and the response body text.
#[async_trait]
impl ScraperResponse for Response {
    /// Reads the body with `Response::text` and builds a `Json` document from it.
    ///
    /// # Errors
    /// Returns `ScraperError::HttpError` for a non-success status, and propagates
    /// failures from reading the body or from `Json::new`.
    #[cfg(feature = "jsonpath")]
    async fn jsonpath(self) -> Result<Json> {
        if !self.status().is_success() {
            return Err(http_error(self).await);
        }
        let json = self.text().await?;
        Ok(Json::new(json.as_str())?)
    }

    /// Decodes the body with [`ScraperResponse::html`] and wraps it in an `Html` document.
    ///
    /// # Errors
    /// Returns `ScraperError::HttpError` for a non-success status, and propagates
    /// failures from reading the body.
    #[cfg(feature = "css_selector")]
    async fn css_selector(self) -> Result<Html> {
        if !self.status().is_success() {
            return Err(http_error(self).await);
        }
        let html_str = self.html().await?;
        Ok(Html::new(html_str.as_str()))
    }

    /// Decodes the body with [`ScraperResponse::html`] and builds an `XHtml` document from it.
    ///
    /// # Errors
    /// Returns `ScraperError::HttpError` for a non-success status, and propagates
    /// failures from reading the body or from `XHtml::new`.
    #[cfg(feature = "xpath")]
    async fn xpath(self) -> Result<XHtml> {
        if !self.status().is_success() {
            return Err(http_error(self).await);
        }
        let html_str = self.html().await?;
        Ok(XHtml::new(html_str)?)
    }

    /// Reads the whole body and decodes it to a `String`.
    ///
    /// The encoding is chosen in this order:
    /// 1. the `charset` parameter of the `Content-Type` header, when it names an
    ///    encoding known to `encoding_rs`;
    /// 2. the label found by `extract_charset` in the UTF-8 decoded body, when it
    ///    names a known encoding;
    /// 3. UTF-8.
    ///
    /// The HTTP status is not inspected here.
    ///
    /// # Errors
    /// Propagates failures from reading the body. Decoding itself never fails;
    /// the "had malformed sequences" flag returned by `decode` is ignored.
    async fn html(self) -> Result<String> {
        // Resolve the header charset up front: `bytes()` consumes the response, and
        // an unrecognised label must not prevent the `<meta charset>` sniff below.
        let header_encoding = self
            .headers()
            .get(reqwest::header::CONTENT_TYPE)
            .and_then(|value| value.to_str().ok())
            .and_then(|value| value.parse::<Mime>().ok())
            .and_then(|mime| {
                mime.get_param("charset")
                    .and_then(|charset| Encoding::for_label(charset.as_str().as_bytes()))
            });

        let body = self.bytes().await?;

        if let Some(encoding) = header_encoding {
            let (text, _, _) = encoding.decode(&body);
            return Ok(text.into_owned());
        }

        // No usable header charset: decode as UTF-8 so the markup can be searched
        // for a declared charset.
        let (utf8_text, _, _) = UTF_8.decode(&body);
        let declared_encoding =
            extract_charset(&utf8_text).and_then(|label| Encoding::for_label(label.as_bytes()));

        match declared_encoding {
            Some(encoding) => {
                let (text, _, _) = encoding.decode(&body);
                Ok(text.into_owned())
            }
            // Nothing declared (or an unknown label): the UTF-8 decoding is the answer.
            None => Ok(utf8_text.into_owned()),
        }
    }
}

/// Builds the error reported for a response with a non-success status.
///
/// Captures the URL and status code before consuming the response for its body
/// text. If the body cannot be read, that read error is returned instead.
#[cfg(any(feature = "jsonpath", feature = "css_selector", feature = "xpath"))]
async fn http_error(response: Response) -> ScraperError {
    let url = response.url().to_string();
    let status_code = response.status().as_u16();
    match response.text().await {
        Ok(body) => ScraperError::HttpError(url, status_code, body),
        Err(err) => ScraperError::from(err),
    }
}
/// Returns the encoding label declared by the first `<meta charset=...>` tag in `html`.
///
/// The tag and attribute names are matched ASCII case-insensitively. The value may
/// be wrapped in double or single quotes, or be unquoted, in which case it ends at
/// the first ASCII whitespace, `>` or `/`. Surrounding whitespace is trimmed from
/// the label.
///
/// Returns `None` when no such tag exists, when a quoted value is never closed, or
/// when the label is empty. Only the first occurrence is examined.
fn extract_charset(html: &str) -> Option<String> {
    const META_PREFIX: &[u8] = b"<meta charset=";

    // Byte-wise search keeps this allocation-free. The prefix is pure ASCII, so a
    // match begins and ends on UTF-8 character boundaries and slicing is safe.
    let value_start = html
        .as_bytes()
        .windows(META_PREFIX.len())
        .position(|window| window.eq_ignore_ascii_case(META_PREFIX))?
        + META_PREFIX.len();
    let rest = &html[value_start..];

    let label = match rest.chars().next()? {
        quote @ ('"' | '\'') => {
            let quoted = &rest[1..];
            &quoted[..quoted.find(quote)?]
        }
        _ => {
            // Unquoted attribute value, e.g. `<meta charset=utf-8>`.
            let end = rest
                .find(|c: char| c.is_ascii_whitespace() || c == '>' || c == '/')
                .unwrap_or(rest.len());
            &rest[..end]
        }
    };

    let label = label.trim();
    if label.is_empty() {
        None
    } else {
        Some(label.to_string())
    }
}
// Unit tests for the charset sniffing helper; compiled only for test builds.
#[cfg(test)]
mod tests {
    use super::extract_charset;

    /// Wraps `meta_tag` in a small but complete HTML document.
    fn document_with(meta_tag: &str) -> String {
        format!(
            "\n<!DOCTYPE html>\n<html>\n<head>\n{}\n<title>Example</title>\n</head>\n<body><p>Hello, world!</p></body>\n</html>\n",
            meta_tag
        )
    }

    #[test]
    fn test_extract_charset() {
        // Double-quoted attribute value.
        let html = document_with(r#"<meta charset="gb2312">"#);
        assert_eq!(extract_charset(&html).as_deref(), Some("gb2312"));

        // Single-quoted attribute value.
        let html = document_with("<meta charset='gb2312'>");
        assert_eq!(extract_charset(&html).as_deref(), Some("gb2312"));
    }

    #[test]
    fn test_extract_charset_without_meta_tag() {
        let html = document_with("");
        assert_eq!(extract_charset(&html), None);
    }
}