#[cfg(not(feature = "decentralized"))]
use crate::packages::scraper::Html;
use crate::CaseInsensitiveString;
use compact_str::CompactString;
use hashbrown::HashSet;
use reqwest::Client;
use smallvec::SmallVec;
#[cfg(not(feature = "decentralized"))]
use tokio_stream::StreamExt;
use url::Url;
/// A web page: its HTML body (when there is one) and the URL it was built from.
#[derive(Debug, Clone)]
#[cfg(not(feature = "decentralized"))]
pub struct Page {
/// The page HTML; `None` when no content is available for the page.
html: Option<String>,
/// The parsed page URL. Links found on the page are resolved against it.
base: Url,
}
/// A web page for the `decentralized` build: it carries a ready-made set of links
/// instead of a base URL.
#[cfg(feature = "decentralized")]
#[derive(Debug, Clone)]
pub struct Page {
/// The page HTML; `None` when no content is available for the page.
html: Option<String>,
/// The links associated with this page.
pub links: HashSet<CaseInsensitiveString>,
}
lazy_static! {
    /// File extensions that identify crawlable page documents.
    ///
    /// A link whose file extension is not in this set is skipped by the link filters.
    /// Every extension is listed both bare and with a leading dot so that lookups
    /// which captured the dot still match.
    static ref ONLY_RESOURCES: HashSet<CaseInsensitiveString> = {
        let mut m: HashSet<CaseInsensitiveString> = HashSet::with_capacity(18);

        m.extend([
            // `jsp`/`jspx` are the JavaServer Pages extensions; the transposed
            // `jps`/`jpsx` spellings are kept for backward compatibility.
            "html", "htm", "asp", "aspx", "php", "jsp", "jspx", "jps", "jpsx",
            ".html", ".htm", ".asp", ".aspx", ".php", ".jsp", ".jspx", ".jps", ".jpsx",
        ].map(|s| s.into()));

        m
    };
}
/// Get the domain label of a URL's host, without sub-domain or TLD.
///
/// The host is split on `.`:
/// * three or more labels (`www.example.com`) yield the second label (`example`),
/// * two labels (`example.com`) yield the first label (`example`),
/// * a single label (`localhost`) yields that label.
///
/// Returns an empty string when the URL has no host.
pub fn domain_name(domain: &Url) -> &str {
    let host = match domain.host_str() {
        Some(host) => host,
        None => return "",
    };

    let mut labels = host.split('.');
    // `split` always yields at least one item, even for an empty host.
    let first = labels.next().unwrap_or_default();

    // Peek at the second and third labels to tell the three shapes apart without
    // collecting into a Vec. A single-label host used to index out of bounds here.
    match (labels.next(), labels.next()) {
        (Some(second), Some(_)) => second,
        _ => first,
    }
}
/// Resolve `href` against `base` and drop any `#fragment` from the result.
///
/// When `href` cannot be joined onto `base`, a copy of `base` is returned instead.
#[inline]
pub fn convert_abs_path(base: &Url, href: &str) -> Url {
    if let Ok(mut resolved) = base.join(href) {
        // Fragments never change which document is fetched.
        resolved.set_fragment(None);
        return resolved;
    }

    base.clone()
}
pub fn get_page_selectors(
url: &str,
subdomains: bool,
tld: bool,
) -> Option<(CompactString, SmallVec<[CompactString; 2]>)> {
match Url::parse(&url) {
Ok(host) => {
let host_name = CompactString::from(
match convert_abs_path(&host, Default::default()).host_str() {
Some(host) => host.to_ascii_lowercase(),
_ => Default::default(),
},
);
let scheme = host.scheme();
Some(if tld || subdomains {
let base = Url::parse(&url).expect("Invalid page URL");
let dname = domain_name(&base);
let scheme = base.scheme();
(
dname.into(),
smallvec::SmallVec::from([host_name, CompactString::from(scheme)]),
)
} else {
(
CompactString::default(),
smallvec::SmallVec::from([host_name, CompactString::from(scheme)]),
)
})
}
_ => None,
}
}
/// Create a [`Page`] from its URL and optional, already fetched HTML.
///
/// # Panics
///
/// Panics with `"Invalid page URL"` when `url` cannot be parsed.
#[cfg(not(feature = "decentralized"))]
pub fn build(url: &str, html: Option<String>) -> Page {
    Page {
        html,
        base: Url::parse(url).expect("Invalid page URL"),
    }
}
/// Create a [`Page`] from optional, already fetched HTML.
///
/// The URL argument is ignored in the `decentralized` build and the link set
/// starts out empty.
#[cfg(feature = "decentralized")]
pub fn build(_: &str, html: Option<String>) -> Page {
    Page {
        html,
        links: Default::default(),
    }
}
impl Page {
/// Fetch `url` with `client` and build a page from whatever HTML came back.
#[cfg(not(feature = "decentralized"))]
pub async fn new(url: &str, client: &Client) -> Self {
    let fetched = crate::utils::fetch_page_html(&url, &client).await;

    build(url, fetched)
}
/// Fetch `url` with `client` and decode the response as a flexbuffers-encoded
/// set of links.
///
/// The link set is left empty when the fetch yields nothing or when the payload
/// cannot be read or deserialized. The page HTML is never stored.
#[cfg(feature = "decentralized")]
pub async fn new(url: &str, client: &Client) -> Self {
    use crate::serde::Deserialize;
    use bytes::Buf;

    let mut links: HashSet<CaseInsensitiveString> = Default::default();

    if let Some(body) = crate::utils::fetch_page(&url, &client).await {
        if let Ok(root) = flexbuffers::Reader::get_root(body.chunk()) {
            if let Ok(decoded) = HashSet::<CaseInsensitiveString>::deserialize(root) {
                links = decoded;
            }
        }
    }

    Page { html: None, links }
}
/// Return the URL this page was built from, as a string slice.
#[cfg(not(feature = "decentralized"))]
pub fn get_url(&self) -> &str {
self.base.as_str()
}
/// Return the page URL. Pages in the `decentralized` build do not store their
/// URL, so this is always the empty string.
#[cfg(feature = "decentralized")]
pub fn get_url(&self) -> &str {
""
}
/// Borrow the page HTML, or an empty string when the page has none.
pub fn get_html(&self) -> &str {
    self.html.as_deref().unwrap_or("")
}
#[inline(always)]
#[cfg(all(
not(feature = "decentralized"),
not(feature = "full_resources"),
not(feature = "js")
))]
pub async fn links_stream<A: PartialEq + Eq + std::hash::Hash + From<String>>(
&self,
selectors: &(&CompactString, &SmallVec<[CompactString; 2]>),
) -> HashSet<A> {
let html = Box::new(Html::parse_document(self.get_html()));
tokio::task::yield_now().await;
let mut stream = tokio_stream::iter(html.tree);
let mut map = HashSet::new();
let base_domain = &selectors.0;
let parent_frags = &selectors.1; let parent_host = &parent_frags[0];
let parent_host_scheme = &parent_frags[1];
while let Some(node) = stream.next().await {
if let Some(element) = node.as_element() {
match element.attr("href") {
Some(href) => {
let mut abs = self.abs_path(href);
let mut can_process = match abs.host_str() {
Some(host) => parent_host.ends_with(host),
_ => false,
};
if can_process {
if abs.scheme() != parent_host_scheme.as_str() {
let _ = abs.set_scheme(parent_host_scheme.as_str());
}
let resource_url = abs.clone();
abs.set_query(None);
let clean_resource = abs.as_str();
let hlen = clean_resource.len();
let hchars = &clean_resource[hlen - 5..hlen];
if let Some(position) = hchars.find('.') {
let resource_ext = &hchars[position + 1..hchars.len()];
if !ONLY_RESOURCES
.contains::<CaseInsensitiveString>(&resource_ext.into())
{
can_process = false;
}
}
if can_process && base_domain.is_empty()
|| can_process && base_domain.as_str() == domain_name(&abs)
{
map.insert(resource_url.as_str().to_string().into());
}
}
}
_ => (),
};
}
}
map
}
/// Collect the crawlable links of the page, including same-site script sources.
///
/// `selectors.0` is the base domain name (empty when no domain-name matching is
/// wanted) and `selectors.1` holds `[parent host, parent scheme]`.
///
/// When a base domain is set and the content does not start with `<`, the content
/// is handed to `jsdom`'s `extract_links` and each extracted link is filtered.
/// Otherwise the content is parsed as HTML and, for every element:
/// * a `<script src="/...">` is stored unless it is a known framework chunk or one
///   of its path segments is listed in `IGNORE_ASSETS`;
/// * an `href` is stored when it passes the link filter.
///
/// The link filter keeps a link when its host is the parent host or one of its
/// parent domains, its last path segment has no file extension or one listed in
/// `ONLY_RESOURCES`, and the base domain is empty or equals the link's
/// [`domain_name`]. Kept links are rewritten to the parent scheme.
#[inline(always)]
#[cfg(all(
    not(feature = "decentralized"),
    not(feature = "full_resources"),
    feature = "js"
))]
pub async fn links_stream<
    A: PartialEq + std::fmt::Debug + Eq + std::hash::Hash + From<String>,
>(
    &self,
    selectors: &(&CompactString, &SmallVec<[CompactString; 2]>),
) -> HashSet<A> {
    use jsdom::extract::extract_links;

    lazy_static! {
        /// Script file names of common third-party libraries that are never stored.
        static ref IGNORE_ASSETS: HashSet<&'static str> = {
            let mut m: HashSet<&'static str> = HashSet::with_capacity(23);

            m.extend::<[&'static str; 23]>([
                "jquery.min.js", "jquery.qtip.min.js", "jquery.js", "angular.js", "jquery.slim.js", "react.development.js", "react-dom.development.js", "react.production.min.js", "react-dom.production.min.js",
                "vue.global.js", "vue.esm-browser.js", "vue.js", "bootstrap.min.js", "bootstrap.bundle.min.js", "bootstrap.esm.min.js", "d3.min.js", "d3.js", "material-components-web.min.js",
                "otSDKStub.js", "clipboard.min.js", "moment.js", "moment.min.js", "dexie.js",
            ].map(|s| s.into()));

            m
        };
    }

    /// `true` when `host` equals `parent_host` or is a parent domain of it.
    /// The match must end on a label boundary, so `ample.com` does not match
    /// `example.com`.
    fn host_matches(parent_host: &str, host: &str) -> bool {
        parent_host == host
            || parent_host
                .strip_suffix(host)
                .map_or(false, |prefix| prefix.ends_with('.'))
    }

    /// `true` when the last path segment of `url` looks like a page document.
    /// Only the path is inspected, so the dot of the host's TLD or of a directory
    /// name is never mistaken for a file extension.
    fn is_page_resource(url: &Url) -> bool {
        let file_name = url.path().rsplit('/').next().unwrap_or_default();

        match file_name.rsplit_once('.') {
            // Suffixes longer than four characters are not treated as extensions.
            Some((_, ext)) if ext.len() <= 4 => {
                ONLY_RESOURCES.contains::<CaseInsensitiveString>(&ext.into())
            }
            _ => true,
        }
    }

    /// Resolve `href` against `page` and return the normalized URL when the link
    /// passes the host, extension and base-domain filters.
    fn crawlable_link(
        page: &Page,
        href: &str,
        base_domain: &str,
        parent_host: &str,
        parent_host_scheme: &str,
    ) -> Option<String> {
        let mut abs = page.abs_path(href);

        let same_host = abs
            .host_str()
            .map_or(false, |host| host_matches(parent_host, host));

        if !same_host || !is_page_resource(&abs) {
            return None;
        }

        // Normalize to the parent scheme; a refused scheme change is not fatal.
        if abs.scheme() != parent_host_scheme {
            let _ = abs.set_scheme(parent_host_scheme);
        }

        if base_domain.is_empty() || base_domain == domain_name(&abs) {
            Some(abs.as_str().to_string())
        } else {
            None
        }
    }

    let base_domain = selectors.0.as_str();
    let parent_host = selectors.1[0].as_str();
    let parent_host_scheme = selectors.1[1].as_str();

    let mut map = HashSet::new();
    let html = self.get_html();

    if !base_domain.is_empty() && !html.starts_with('<') {
        // Not markup: let jsdom pull the links out of the script source.
        let links: HashSet<CaseInsensitiveString> = extract_links(html).await;
        let mut stream = tokio_stream::iter(&links);

        while let Some(href) = stream.next().await {
            if let Some(link) = crawlable_link(
                self,
                href.inner(),
                base_domain,
                parent_host,
                parent_host_scheme,
            ) {
                map.insert(link.into());
            }
        }
    } else {
        let html = Box::new(Html::parse_document(html));
        tokio::task::yield_now().await;

        let mut stream = tokio_stream::iter(html.tree);

        while let Some(node) = stream.next().await {
            let element = match node.as_element() {
                Some(element) => element,
                None => continue,
            };

            if element.name() == "script" {
                if let Some(src) = element.attr("src") {
                    // Only site-relative scripts that are not framework chunks.
                    let wanted = src.starts_with('/')
                        && element.attr("id") != Some("gatsby-chunk-mapping")
                        && !src.starts_with("/_next/static/chunks/pages/")
                        && !src.starts_with("/webpack-runtime-");

                    if wanted {
                        let abs = self.abs_path(src);
                        let ignored = abs.path_segments().map_or(false, |mut segments| {
                            segments.any(|p| p.ends_with(".js") && IGNORE_ASSETS.contains(&p))
                        });

                        if !ignored {
                            map.insert(abs.as_str().to_string().into());
                        }
                    }
                }
            }

            let link = element.attr("href").and_then(|href| {
                crawlable_link(self, href, base_domain, parent_host, parent_host_scheme)
            });

            if let Some(link) = link {
                map.insert(link.into());
            }
        }
    }

    map
}
/// Collect every same-site link referenced by an `href` attribute in the page
/// HTML, whatever its file type.
///
/// `selectors.0` is the base domain name (empty when no domain-name matching is
/// wanted) and `selectors.1` holds `[parent host, parent scheme]`.
///
/// A link is kept when its host is the parent host or one of the parent host's
/// parent domains, and the base domain is empty or equals the link's
/// [`domain_name`]. Kept links are rewritten to the parent scheme.
#[inline(always)]
#[cfg(all(not(feature = "decentralized"), feature = "full_resources"))]
pub async fn links_stream<A: PartialEq + Eq + std::hash::Hash + From<String>>(
    &self,
    selectors: &(&CompactString, &SmallVec<[CompactString; 2]>),
) -> HashSet<A> {
    /// `true` when `host` equals `parent_host` or is a parent domain of it.
    /// The match must end on a label boundary, so `ample.com` does not match
    /// `example.com`.
    fn host_matches(parent_host: &str, host: &str) -> bool {
        parent_host == host
            || parent_host
                .strip_suffix(host)
                .map_or(false, |prefix| prefix.ends_with('.'))
    }

    let html = Box::new(Html::parse_document(self.get_html()));
    tokio::task::yield_now().await;

    let mut stream = tokio_stream::iter(html.tree);
    let mut map = HashSet::new();

    let base_domain = selectors.0.as_str();
    let parent_host = selectors.1[0].as_str();
    let parent_host_scheme = selectors.1[1].as_str();

    while let Some(node) = stream.next().await {
        let href = match node.as_element().and_then(|element| element.attr("href")) {
            Some(href) => href,
            None => continue,
        };
        let mut abs = self.abs_path(href);

        let same_host = abs
            .host_str()
            .map_or(false, |host| host_matches(parent_host, host));

        if !same_host {
            continue;
        }

        // Normalize to the parent scheme; a refused scheme change is not fatal.
        if abs.scheme() != parent_host_scheme {
            let _ = abs.set_scheme(parent_host_scheme);
        }

        if base_domain.is_empty() || base_domain == domain_name(&abs) {
            map.insert(abs.as_str().to_string().into());
        }
    }

    map
}
/// Link extraction stub for the `decentralized` build: the selectors are ignored
/// and an empty set is always returned.
#[inline(always)]
#[cfg(feature = "decentralized")]
pub async fn links_stream<A: PartialEq + Eq + std::hash::Hash + From<String>>(
&self,
_: &(&CompactString, &SmallVec<[CompactString; 2]>),
) -> HashSet<A> {
Default::default()
}
/// Find the links of the page that match `selectors`.
///
/// Returns an empty set without parsing anything when the page has no HTML.
#[cfg(not(feature = "decentralized"))]
#[inline(never)]
pub async fn links(
    &self,
    selectors: &(CompactString, SmallVec<[CompactString; 2]>),
) -> HashSet<CaseInsensitiveString> {
    if self.html.is_none() {
        return Default::default();
    }

    let (base_domain, parent_frags) = selectors;

    self.links_stream::<CaseInsensitiveString>(&(base_domain, parent_frags))
        .await
}
/// Return a copy of the link set stored on the page. The selectors are ignored in
/// the `decentralized` build.
#[cfg(feature = "decentralized")]
#[inline(never)]
pub async fn links(
&self,
_: &(CompactString, smallvec::SmallVec<[CompactString; 2]>),
) -> HashSet<CaseInsensitiveString> {
self.links.to_owned()
}
/// Resolve `href` against this page's URL by delegating to `convert_abs_path`.
#[inline]
#[cfg(not(feature = "decentralized"))]
fn abs_path(&self, href: &str) -> Url {
convert_abs_path(&self.base, href)
}
}
/// Crawl a live page and check that a known internal link is discovered.
#[cfg(not(feature = "decentralized"))]
#[tokio::test]
async fn parse_links() {
    let client = Client::builder()
        .user_agent("spider/1.1.2")
        .build()
        .unwrap();

    let link_result = "https://choosealicense.com/";
    let expected = "https://choosealicense.com/about/";

    let page: Page = Page::new(link_result, &client).await;
    let selector =
        get_page_selectors(link_result, false, false).expect("the page URL should parse");
    let links = page.links(&selector).await;

    // Report the link that is missing, not the page that was crawled.
    assert!(
        links.contains::<CaseInsensitiveString>(&expected.into()),
        "Could not find {} on {}. These URLs were found {:?}",
        expected,
        page.get_url(),
        &links
    );
}
/// Check how `abs_path` resolves relative links, queries, fragments and
/// unjoinable hrefs against the page URL.
#[cfg(not(feature = "decentralized"))]
#[test]
fn test_abs_path() {
    // `abs_path` only reads the page URL, so the page is built locally instead
    // of being fetched over the network.
    let page: Page = build("https://choosealicense.com/", None);

    // (href, expected absolute URL)
    let cases = [
        ("/page", "https://choosealicense.com/page"),
        (
            "/page?query=keyword",
            "https://choosealicense.com/page?query=keyword",
        ),
        ("/page#hash", "https://choosealicense.com/page"),
        (
            "/page?query=keyword#hash",
            "https://choosealicense.com/page?query=keyword",
        ),
        ("#hash", "https://choosealicense.com/"),
        // An href that cannot be joined falls back to the page URL.
        ("tel://+212 3456", "https://choosealicense.com/"),
    ];

    for (href, expected) in cases.iter() {
        assert_eq!(
            page.abs_path(href),
            Url::parse(expected).unwrap(),
            "unexpected resolution for href {:?}",
            href
        );
    }
}