#[cfg(not(feature = "decentralized"))]
use crate::packages::scraper::Html;
use crate::utils::log;
use crate::utils::PageResponse;
use crate::CaseInsensitiveString;
use bytes::Bytes;
use compact_str::CompactString;
use hashbrown::HashSet;
use reqwest::{Client, StatusCode};
use smallvec::SmallVec;
#[cfg(all(feature = "time", not(feature = "decentralized")))]
use std::time::Duration;
#[cfg(all(feature = "time", not(feature = "decentralized")))]
use tokio::time::Instant;
#[cfg(not(feature = "decentralized"))]
use tokio_stream::StreamExt;
use url::Url;
lazy_static! {
/// Wildcard entry (`*`) that the default `links_stream` looks up in
/// `external_domains_caseless` in addition to the link's own host.
static ref CASELESS_WILD_CARD: CaseInsensitiveString = CaseInsensitiveString::new("*");
}
/// A fetched page plus the metadata recorded when it was built (see `build`).
#[derive(Debug, Clone)]
#[cfg(not(feature = "decentralized"))]
pub struct Page {
/// Response body; `None` when the response carried no content.
html: Option<Bytes>,
/// The page URL parsed into a `Url`; used as the base when resolving hrefs.
base: Url,
/// The page URL exactly as it was passed to `build`.
url: String,
/// Status code copied from the response.
pub status_code: StatusCode,
/// Stringified error from the response's `error_for_status`, if it was an error.
pub error_status: Option<String>,
/// Extra hosts consulted by `links_stream` when a link's host does not match
/// the parent host. Replaced through `set_external`.
pub external_domains_caseless: Box<HashSet<CaseInsensitiveString>>,
/// Final URL reported by the response (`PageResponse::final_url`), if any.
pub final_redirect_destination: Option<String>,
/// Instant captured when the page was built; read by `get_duration_elasped`.
#[cfg(feature = "time")]
duration: Instant,
}
/// Page representation used when the `decentralized` feature is enabled.
#[cfg(feature = "decentralized")]
#[derive(Debug, Clone)]
pub struct Page {
/// Response body; `None` when the response carried no content.
html: Option<Bytes>,
/// Status code copied from the response.
pub status_code: StatusCode,
/// Stringified error from the response's `error_for_status`, if it was an error.
pub error_status: Option<String>,
/// Links deserialized from the fetched payload in `Page::new`; empty when
/// the page is produced by `build`.
pub links: HashSet<CaseInsensitiveString>,
/// Set of external domains; replaced through `set_external`.
pub external_domains_caseless: Box<HashSet<CaseInsensitiveString>>,
/// Final URL reported by the response (`PageResponse::final_url`), if any.
pub final_redirect_destination: Option<String>,
}
lazy_static! {
    /// File extensions that `links_stream` treats as crawlable pages.
    ///
    /// A link whose path contains a `.` is only kept when the text after that
    /// dot is in this set. Each extension is listed both bare and with a
    /// leading dot.
    static ref ONLY_RESOURCES: HashSet<CaseInsensitiveString> = {
        let mut m: HashSet<CaseInsensitiveString> = HashSet::with_capacity(18);
        m.extend([
            // `jps`/`jpsx` are kept for backward compatibility; `jsp`/`jspx`
            // are the actual Java Server Pages extensions.
            "html", "htm", "asp", "aspx", "php", "jps", "jpsx", "jsp", "jspx",
            ".html", ".htm", ".asp", ".aspx", ".php", ".jps", ".jpsx", ".jsp", ".jspx",
        ].map(|s| s.into()));
        m
    };
}
/// Return the domain label of `domain`'s host, without sub-domains or TLD.
///
/// The host is split on `.` and the second-to-last label is returned
/// (`"blog.example.com"` -> `"example"`). A single-label host such as
/// `"localhost"` is returned as is, and a URL without a host yields `""`.
///
/// This is a purely positional heuristic: multi-part public suffixes are not
/// recognised, so `"example.co.uk"` yields `"co"`.
pub fn domain_name(domain: &Url) -> &str {
    match domain.host_str() {
        Some(host) => {
            // Walk the labels from the right: the first is the TLD, the
            // second (when present) is the domain label we want.
            let mut labels = host.rsplit('.');
            let last = labels.next().unwrap_or("");
            labels.next().unwrap_or(last)
        }
        None => "",
    }
}
#[inline]
pub fn convert_abs_path(base: &Url, href: &str) -> Url {
match base.join(href) {
Ok(mut joined) => {
joined.set_fragment(None);
joined
}
Err(e) => {
log("URL Parse Error: ", e.to_string());
base.clone()
}
}
}
pub fn get_page_selectors(
url: &str,
subdomains: bool,
tld: bool,
) -> Option<(CompactString, SmallVec<[CompactString; 2]>)> {
match Url::parse(&url) {
Ok(host) => {
let host_name = CompactString::from(
match convert_abs_path(&host, Default::default()).host_str() {
Some(host) => host.to_ascii_lowercase(),
_ => Default::default(),
},
);
let scheme = host.scheme();
Some(if tld || subdomains {
let dname = domain_name(&host);
let scheme = host.scheme();
(
dname.into(),
smallvec::SmallVec::from([host_name, CompactString::from(scheme)]),
)
} else {
(
CompactString::default(),
smallvec::SmallVec::from([host_name, CompactString::from(scheme)]),
)
})
}
_ => None,
}
}
/// Assemble a `Page` for `url` from an already fetched response.
///
/// `error_status` is populated only when the response's `error_for_status`
/// holds an error. The external-domain set starts out empty.
///
/// # Panics
///
/// Panics with `"Invalid page URL"` when `url` cannot be parsed.
#[cfg(not(feature = "decentralized"))]
pub fn build(url: &str, res: PageResponse) -> Page {
    let error_status = match res.error_for_status {
        Some(Err(er)) => Some(er.to_string()),
        _ => None,
    };

    Page {
        html: res.content,
        base: Url::parse(url).expect("Invalid page URL"),
        url: url.into(),
        #[cfg(feature = "time")]
        duration: Instant::now(),
        external_domains_caseless: Default::default(),
        final_redirect_destination: res.final_url,
        status_code: res.status_code,
        error_status,
    }
}
/// Assemble a `Page` from an already fetched response (decentralized build).
///
/// The URL argument is ignored. `links` and the external-domain set start
/// out empty, and `error_status` is populated only when the response's
/// `error_for_status` holds an error.
#[cfg(feature = "decentralized")]
pub fn build(_: &str, res: PageResponse) -> Page {
    let error_status = match res.error_for_status {
        Some(Err(er)) => Some(er.to_string()),
        _ => None,
    };

    Page {
        html: res.content,
        links: Default::default(),
        external_domains_caseless: Default::default(),
        final_redirect_destination: res.final_url,
        status_code: res.status_code,
        error_status,
    }
}
impl Page {
/// Fetch `url` through `crate::utils::fetch_page_html`, handing it the
/// chromium `page`, and wrap the response with `build`.
#[cfg(all(not(feature = "decentralized"), feature = "chrome"))]
pub async fn new(url: &str, client: &Client, page: &chromiumoxide::Page) -> Self {
    build(
        url,
        crate::utils::fetch_page_html(&url, &client, &page).await,
    )
}
/// Fetch `url` through `crate::utils::fetch_page_html_raw` and wrap the
/// response with `build`.
#[cfg(not(feature = "decentralized"))]
pub async fn new_page(url: &str, client: &Client) -> Self {
    build(
        url,
        crate::utils::fetch_page_html_raw(&url, &client).await,
    )
}
/// Fetch `url` through `crate::utils::fetch_page_html` and wrap the response
/// with `build`.
#[cfg(all(not(feature = "decentralized"), not(feature = "chrome")))]
pub async fn new(url: &str, client: &Client) -> Self {
    build(url, crate::utils::fetch_page_html(&url, &client).await)
}
/// Fetch `url` and decode the payload as a flexbuffers-encoded set of links.
///
/// Any failure along the way (nothing fetched, invalid flexbuffers root,
/// failed deserialization) results in an empty link set. Every other field
/// of the returned page is left at its default.
#[cfg(feature = "decentralized")]
pub async fn new(url: &str, client: &Client) -> Self {
    use crate::serde::Deserialize;
    use bytes::Buf;

    let links = match crate::utils::fetch_page(&url, &client).await {
        Some(payload) => flexbuffers::Reader::get_root(payload.chunk())
            .ok()
            .and_then(|root| HashSet::<CaseInsensitiveString>::deserialize(root).ok())
            .unwrap_or_default(),
        None => Default::default(),
    };

    Page {
        html: None,
        links,
        external_domains_caseless: Default::default(),
        final_redirect_destination: Default::default(),
        status_code: Default::default(),
        error_status: Default::default(),
    }
}
/// Whether the page holds no body at all.
pub fn is_empty(&self) -> bool {
    matches!(self.html, None)
}
/// The URL this page was built with.
#[cfg(not(feature = "decentralized"))]
pub fn get_url(&self) -> &str {
    self.url.as_str()
}
#[cfg(not(feature = "decentralized"))]
pub fn get_url_final(&self) -> &str {
match self.final_redirect_destination.as_ref() {
Some(u) => &u,
_ => &self.url,
}
}
/// Replace the page's set of external domains with `external_domains_caseless`.
pub fn set_external(&mut self, external_domains_caseless: Box<HashSet<CaseInsensitiveString>>) {
self.external_domains_caseless = external_domains_caseless;
}
/// The page URL as the parsed `Url` stored when the page was built.
#[cfg(not(feature = "decentralized"))]
pub fn get_url_parsed(&self) -> &Url {
&self.base
}
/// Always returns an empty string: the decentralized `Page` does not store its URL.
#[cfg(feature = "decentralized")]
pub fn get_url(&self) -> &str {
""
}
/// Borrow the raw body bytes, if the page has a body.
pub fn get_bytes(&self) -> Option<&Bytes> {
    self.html.as_ref()
}
pub fn get_html(&self) -> String {
match self.html.as_ref() {
Some(html) => String::from_utf8_lossy(&html).to_string(),
_ => Default::default(),
}
}
/// The body as a byte slice; an empty slice when the page has no body.
pub fn get_html_bytes_u8(&self) -> &[u8] {
    self.html.as_deref().unwrap_or_default()
}
/// Time elapsed since the instant recorded when this page was built.
#[cfg(all(feature = "time", not(feature = "decentralized")))]
pub fn get_duration_elasped(&self) -> Duration {
self.duration.elapsed()
}
/// Collect the crawlable `<a href>` targets found in the page body.
///
/// `selectors` is `(base_domain, [parent_host, parent_scheme])` as produced
/// by `get_page_selectors`. A link is kept when all of the following hold:
///
/// 1. its host equals the parent host (empty `base_domain`) or the parent
///    host ends with it (non-empty `base_domain`), or its host / the `*`
///    wildcard is present in `external_domains_caseless`;
/// 2. its path has no `.`, or the text after the last `.` is listed in
///    `ONLY_RESOURCES`;
/// 3. `base_domain` is empty or equals `domain_name` of the link.
///
/// Kept links have their scheme rewritten to the parent scheme.
#[inline(always)]
#[cfg(all(
    not(feature = "decentralized"),
    not(feature = "full_resources"),
    not(feature = "js")
))]
pub async fn links_stream<A: PartialEq + Eq + std::hash::Hash + From<String>>(
    &self,
    selectors: &(&CompactString, &SmallVec<[CompactString; 2]>),
) -> HashSet<A> {
    let document = Box::new(Html::parse_fragment(&self.get_html()));
    tokio::task::yield_now().await;
    let mut nodes = tokio_stream::iter(document.tree);
    let mut found = HashSet::new();
    let base_domain = selectors.0;
    let parent_host = &selectors.1[0];
    let parent_host_scheme = &selectors.1[1];

    while let Some(node) = nodes.next().await {
        // Only anchor elements carrying an href are of interest.
        let anchor = match node.as_element() {
            Some(element) if element.name() == "a" => element,
            _ => continue,
        };
        let href = match anchor.attr("href") {
            Some(href) => href,
            None => continue,
        };

        let mut abs = self.abs_path(href);
        let host_name = abs.host_str();

        let same_site = match host_name {
            Some(host) if base_domain.is_empty() => parent_host.eq(&host),
            Some(host) => parent_host.ends_with(host),
            None => false,
        };
        // Off-site hosts are still accepted when explicitly allowed.
        let allowed = same_site
            || (host_name.is_some()
                && !self.external_domains_caseless.is_empty()
                && (self
                    .external_domains_caseless
                    .contains::<CaseInsensitiveString>(&host_name.unwrap_or_default().into())
                    || self
                        .external_domains_caseless
                        .contains::<CaseInsensitiveString>(&CASELESS_WILD_CARD)));
        if !allowed {
            continue;
        }

        if abs.scheme() != parent_host_scheme.as_str() {
            let _ = abs.set_scheme(parent_host_scheme.as_str());
        }

        let path = abs.path();
        let page_like = match path.rfind('.') {
            Some(dot) => {
                let extension = &path[dot + 1..];
                ONLY_RESOURCES.contains::<CaseInsensitiveString>(&extension.into())
            }
            None => true,
        };

        if page_like && (base_domain.is_empty() || base_domain.as_str() == domain_name(&abs)) {
            found.insert(A::from(abs.as_str().to_string()));
        }
    }

    found
}
/// Collect crawlable links from the page body (`js` feature).
///
/// `selectors` is `(base_domain, [parent_host, parent_scheme])` as produced
/// by `get_page_selectors`.
///
/// * When `base_domain` is non-empty and the body does not start with `<`,
///   the links are obtained from `jsdom::extract::extract_links` and
///   filtered by host (including `external_domains_caseless`), extension
///   (`ONLY_RESOURCES`) and base domain.
/// * Otherwise the body is parsed as an HTML document. Root-relative
///   `<script src>` values are collected unless they match one of the
///   skipped prefixes/ids or name a file in `IGNORE_ASSETS`, and `<a href>`
///   values are filtered by host, extension and base domain.
///
/// Kept anchor links have their scheme rewritten to the parent scheme.
#[inline(always)]
#[cfg(all(
    not(feature = "decentralized"),
    not(feature = "full_resources"),
    feature = "js"
))]
pub async fn links_stream<
    A: PartialEq + std::fmt::Debug + Eq + std::hash::Hash + From<String>,
>(
    &self,
    selectors: &(&CompactString, &SmallVec<[CompactString; 2]>),
) -> HashSet<A> {
    use jsdom::extract::extract_links;
    lazy_static! {
        /// Script file names that are never collected from `<script src>`.
        static ref IGNORE_ASSETS: HashSet<&'static str> = {
            let mut m: HashSet<&'static str> = HashSet::with_capacity(23);
            m.extend::<[&'static str; 23]>([
                "jquery.min.js", "jquery.qtip.min.js", "jquery.js", "angular.js", "jquery.slim.js", "react.development.js", "react-dom.development.js", "react.production.min.js", "react-dom.production.min.js",
                "vue.global.js", "vue.esm-browser.js", "vue.js", "bootstrap.min.js", "bootstrap.bundle.min.js", "bootstrap.esm.min.js", "d3.min.js", "d3.js", "material-components-web.min.js",
                "otSDKStub.js", "clipboard.min.js", "moment.js", "moment.min.js", "dexie.js",
            ].map(|s| s.into()));
            m
        };
    }
    let base_domain = &selectors.0;
    let parent_frags = &selectors.1;
    let parent_host = &parent_frags[0];
    let parent_host_scheme = &parent_frags[1];
    let mut map = HashSet::new();
    let html = Box::new(self.get_html());
    if !base_domain.is_empty() && !html.starts_with("<") {
        // Body is not markup: let jsdom pull the links out of it.
        let links: HashSet<CaseInsensitiveString> = extract_links(&html).await;
        let mut stream = tokio_stream::iter(&links);
        while let Some(href) = stream.next().await {
            let mut abs = self.abs_path(href.inner());
            let host_name = abs.host_str();
            let mut can_process = match host_name {
                Some(host) => {
                    if base_domain.is_empty() {
                        parent_host.eq(&host)
                    } else {
                        parent_host.ends_with(host)
                    }
                }
                _ => false,
            };
            // Off-site hosts are still accepted when explicitly allowed.
            if !can_process && host_name.is_some() && !self.external_domains_caseless.is_empty()
            {
                can_process = self
                    .external_domains_caseless
                    .contains::<CaseInsensitiveString>(&host_name.unwrap_or_default().into())
            }
            if can_process {
                if abs.scheme() != parent_host_scheme.as_str() {
                    let _ = abs.set_scheme(parent_host_scheme.as_str());
                }
                let hchars = abs.path();
                if let Some(position) = hchars.rfind('.') {
                    let resource_ext = &hchars[position + 1..hchars.len()];
                    if !ONLY_RESOURCES.contains::<CaseInsensitiveString>(&resource_ext.into()) {
                        can_process = false;
                    }
                }
                if can_process
                    && (base_domain.is_empty() || base_domain.as_str() == domain_name(&abs))
                {
                    map.insert(abs.as_str().to_string().into());
                }
            }
        }
    } else {
        let html = Box::new(Html::parse_document(&html));
        tokio::task::yield_now().await;
        let mut stream = tokio_stream::iter(html.tree);
        while let Some(node) = stream.next().await {
            if let Some(element) = node.as_element() {
                let element_name = element.name();
                if element_name == "script" {
                    match element.attr("src") {
                        Some(src) => {
                            if src.starts_with("/")
                                && element.attr("id") != Some("gatsby-chunk-mapping")
                            {
                                if !src.starts_with("/_next/static/chunks/pages/")
                                    && !src.starts_with("/webpack-runtime-")
                                {
                                    let abs = self.abs_path(src);
                                    let mut insertable = true;
                                    // Skip scripts whose file name is a known library asset.
                                    match abs.path_segments().ok_or_else(|| "cannot be base") {
                                        Ok(mut paths) => {
                                            while let Some(p) = paths.next() {
                                                if p.ends_with(".js")
                                                    && IGNORE_ASSETS.contains(&p)
                                                {
                                                    insertable = false;
                                                }
                                            }
                                        }
                                        _ => (),
                                    };
                                    if insertable {
                                        map.insert(abs.as_str().to_string().into());
                                    }
                                }
                            }
                        }
                        _ => (),
                    }
                }
                if element_name == "a" {
                    match element.attr("href") {
                        Some(href) => {
                            let mut abs = self.abs_path(href);
                            let mut can_process = match abs.host_str() {
                                Some(host) => {
                                    if base_domain.is_empty() {
                                        parent_host.eq(&host)
                                    } else {
                                        parent_host.ends_with(host)
                                    }
                                }
                                _ => false,
                            };
                            if can_process {
                                if abs.scheme() != parent_host_scheme.as_str() {
                                    let _ = abs.set_scheme(parent_host_scheme.as_str());
                                }
                                let hchars = abs.path();
                                // The extension starts after the LAST dot. Searching
                                // from the left would treat an earlier dot in the path
                                // (e.g. `/v1.2/index.html`) as the extension separator
                                // and wrongly reject the link.
                                if let Some(position) = hchars.rfind('.') {
                                    let resource_ext = &hchars[position + 1..hchars.len()];
                                    if !ONLY_RESOURCES
                                        .contains::<CaseInsensitiveString>(&resource_ext.into())
                                    {
                                        can_process = false;
                                    }
                                }
                                if can_process
                                    && (base_domain.is_empty()
                                        || base_domain.as_str() == domain_name(&abs))
                                {
                                    map.insert(abs.as_str().to_string().into());
                                }
                            }
                        }
                        _ => (),
                    };
                }
            }
        }
    }
    map
}
/// Collect every same-site resource link in the page (`full_resources`).
///
/// `selectors` is `(base_domain, [parent_host, parent_scheme])` as produced
/// by `get_page_selectors`. For `<script>` elements the `src` attribute is
/// read; for every other element the `href` attribute is read. A link is
/// kept when its host equals the parent host (empty `base_domain`) or the
/// parent host ends with it (non-empty `base_domain`), and `base_domain` is
/// empty or equals `domain_name` of the link. No extension filtering is
/// applied. Kept links have their scheme rewritten to the parent scheme.
#[inline(always)]
#[cfg(all(not(feature = "decentralized"), feature = "full_resources"))]
pub async fn links_stream<A: PartialEq + Eq + std::hash::Hash + From<String>>(
    &self,
    selectors: &(&CompactString, &SmallVec<[CompactString; 2]>),
) -> HashSet<A> {
    let document = Box::new(Html::parse_document(&self.get_html()));
    tokio::task::yield_now().await;
    let mut nodes = tokio_stream::iter(document.tree);
    let mut found = HashSet::new();
    let base_domain = selectors.0;
    let parent_host = &selectors.1[0];
    let parent_host_scheme = &selectors.1[1];

    while let Some(node) = nodes.next().await {
        let element = match node.as_element() {
            Some(element) => element,
            None => continue,
        };
        // Scripts reference their target through `src`, everything else through `href`.
        let link_attribute = match element.name() {
            "script" => "src",
            _ => "href",
        };
        let target = match element.attr(link_attribute) {
            Some(target) => target,
            None => continue,
        };

        let mut abs = self.abs_path(target);
        let same_site = match abs.host_str() {
            Some(host) if base_domain.is_empty() => parent_host.eq(&host),
            Some(host) => parent_host.ends_with(host),
            None => false,
        };
        if !same_site {
            continue;
        }

        if abs.scheme() != parent_host_scheme.as_str() {
            let _ = abs.set_scheme(parent_host_scheme.as_str());
        }

        if base_domain.is_empty() || base_domain.as_str() == domain_name(&abs) {
            found.insert(A::from(abs.as_str().to_string()));
        }
    }

    found
}
/// Decentralized builds do not parse the body here: the selectors are
/// ignored and an empty set is returned.
#[inline(always)]
#[cfg(feature = "decentralized")]
pub async fn links_stream<A: PartialEq + Eq + std::hash::Hash + From<String>>(
    &self,
    _: &(&CompactString, &SmallVec<[CompactString; 2]>),
) -> HashSet<A> {
    HashSet::new()
}
/// Find the links on this page.
///
/// Returns an empty set when the page has no body; otherwise delegates to
/// `links_stream` with borrowed views of `selectors`.
#[cfg(not(feature = "decentralized"))]
#[inline(never)]
pub async fn links(
    &self,
    selectors: &(CompactString, SmallVec<[CompactString; 2]>),
) -> HashSet<CaseInsensitiveString> {
    if self.html.is_none() {
        return Default::default();
    }

    let borrowed = (&selectors.0, &selectors.1);
    self.links_stream::<CaseInsensitiveString>(&borrowed).await
}
/// Return a copy of the links stored on the page; the selectors are ignored.
#[cfg(feature = "decentralized")]
#[inline(never)]
pub async fn links(
    &self,
    _: &(CompactString, smallvec::SmallVec<[CompactString; 2]>),
) -> HashSet<CaseInsensitiveString> {
    self.links.clone()
}
/// Resolve `href` against this page's base URL via `convert_abs_path`.
#[inline]
#[cfg(not(feature = "decentralized"))]
fn abs_path(&self, href: &str) -> Url {
convert_abs_path(&self.base, href)
}
}
/// Fetches the live site and checks that the `/about/` link is discovered.
#[cfg(all(not(feature = "decentralized"), not(feature = "chrome")))]
#[tokio::test]
async fn parse_links() {
    let http = Client::builder()
        .user_agent("spider/1.1.2")
        .build()
        .unwrap();
    let target = "https://choosealicense.com/";

    let page: Page = Page::new(&target, &http).await;
    let selectors = get_page_selectors(&target, false, false).unwrap();
    let links = page.links(&selectors).await;

    let expected: CaseInsensitiveString = "https://choosealicense.com/about/".into();
    assert!(
        links.contains::<CaseInsensitiveString>(&expected),
        "Could not find {}. Theses URLs was found {:?}",
        page.get_url(),
        &links
    );
}
/// Fetches a missing page on the live site and expects a 404 status.
#[cfg(all(not(feature = "decentralized"), not(feature = "chrome")))]
#[tokio::test]
async fn test_status_code() {
    let http = Client::builder()
        .user_agent("spider/1.1.2")
        .build()
        .unwrap();
    let missing = "https://choosealicense.com/does-not-exist";

    let page: Page = Page::new(&missing, &http).await;
    let status = page.status_code.as_u16();

    assert_eq!(status, 404);
}
/// Checks href resolution: fragments are dropped, queries are kept, and an
/// unparsable href falls back to the base URL.
#[cfg(all(not(feature = "decentralized"), not(feature = "chrome")))]
#[tokio::test]
async fn test_abs_path() {
    let http = Client::builder()
        .user_agent("spider/1.1.2")
        .build()
        .unwrap();
    let target = "https://choosealicense.com/";
    let page: Page = Page::new(&target, &http).await;

    // (href, expected absolute URL)
    let cases = [
        ("/page", "https://choosealicense.com/page"),
        (
            "/page?query=keyword",
            "https://choosealicense.com/page?query=keyword",
        ),
        ("/page#hash", "https://choosealicense.com/page"),
        (
            "/page?query=keyword#hash",
            "https://choosealicense.com/page?query=keyword",
        ),
        ("#hash", "https://choosealicense.com/"),
        ("tel://+212 3456", "https://choosealicense.com/"),
    ];

    for (href, expected) in cases.iter() {
        assert_eq!(page.abs_path(href), Url::parse(expected).unwrap());
    }
}
/// Fetches the live site and checks that the elapsed time recorded on the
/// page stays under six seconds.
///
/// Gated on `not(feature = "chrome")` like the other tests here: the
/// two-argument `Page::new` used below only exists without that feature.
#[cfg(all(
    feature = "time",
    not(feature = "decentralized"),
    not(feature = "chrome")
))]
#[tokio::test]
async fn test_duration() {
    let client = Client::builder()
        .user_agent("spider/1.1.2")
        .build()
        .unwrap();
    let link_result = "https://choosealicense.com/";
    let page: Page = Page::new(&link_result, &client).await;
    let duration_elasped = page.get_duration_elasped().as_millis();
    assert!(
        duration_elasped < 6000,
        "Duration took longer than expected {}.",
        duration_elasped,
    );
}