use tl::VDom;
use url::Url;
use crate::types::{FeedInfo, ImageInfo, JsonLdEntry, LinkInfo, PageMetadata};
use super::feeds::{extract_favicons, extract_feeds, extract_headings, extract_hreflangs};
use super::images::extract_images;
use super::json_ld::extract_json_ld;
use super::links::extract_links;
use super::metadata::extract_metadata;
/// Bundle of everything pulled out of a single page by [`extract_page_data`].
///
/// Every field is left at its empty/default value when the page is not HTML.
pub(crate) struct HtmlExtraction {
    /// Page-level metadata; optionally enriched with hreflangs, favicons,
    /// headings and a word count when extended extraction is requested.
    pub(crate) metadata: PageMetadata,
    /// Links found in the document.
    pub(crate) links: Vec<LinkInfo>,
    /// Images found in the document.
    pub(crate) images: Vec<ImageInfo>,
    /// Feeds found in the document.
    pub(crate) feeds: Vec<FeedInfo>,
    /// JSON-LD entries found in the document.
    pub(crate) json_ld: Vec<JsonLdEntry>,
}
/// Runs every HTML extractor over a parsed document and gathers the results.
///
/// * `dom` - the parsed document handed to each extractor.
/// * `body` - the raw page text, forwarded to the metadata extractor.
/// * `base_url` - forwarded to the link and image extractors.
/// * `is_html` - when `false`, no extractor runs and an empty
///   [`HtmlExtraction`] (default metadata, empty vectors) is returned.
/// * `include_extended` - when `true`, the metadata is additionally filled
///   with hreflangs, favicons and headings (each only if non-empty) and a
///   word count.
pub(crate) fn extract_page_data(
    dom: &VDom<'_>,
    body: &str,
    base_url: &Url,
    is_html: bool,
    include_extended: bool,
) -> HtmlExtraction {
    // Guard: nothing is extracted from non-HTML content.
    if !is_html {
        return HtmlExtraction {
            metadata: PageMetadata::default(),
            links: Vec::new(),
            images: Vec::new(),
            feeds: Vec::new(),
            json_ld: Vec::new(),
        };
    }

    let mut metadata = extract_metadata(dom, body);

    if include_extended {
        // Each optional field is only overwritten when its extractor found
        // something; an empty result leaves the existing value untouched.
        if let Some(found) = Some(extract_hreflangs(dom)).filter(|list| !list.is_empty()) {
            metadata.hreflangs = Some(found);
        }
        if let Some(found) = Some(extract_favicons(dom)).filter(|list| !list.is_empty()) {
            metadata.favicons = Some(found);
        }
        if let Some(found) = Some(extract_headings(dom)).filter(|list| !list.is_empty()) {
            metadata.headings = Some(found);
        }
        // The word count is always recorded in extended mode.
        metadata.word_count = Some(super::content::compute_word_count(dom));
    }

    // Field initialisers are evaluated in the order written here.
    HtmlExtraction {
        links: extract_links(dom, base_url),
        images: extract_images(dom, base_url),
        feeds: extract_feeds(dom),
        json_ld: extract_json_ld(dom),
        metadata,
    }
}