use std::collections::BTreeSet;
use async_trait::async_trait;
use scraper::{ElementRef, Html, Selector};
use crate::{
enums::content::Content,
extractors::regex::RegexExtractor,
interfaces::extractor::SubdomainExtractorInterface,
types::core::{Result, Subdomain},
};
/// Extracts subdomain addresses from HTML content.
///
/// Selects elements from the parsed document with a CSS selector,
/// strips configured substrings from each element's inner HTML, and
/// delegates the final subdomain matching to a [`RegexExtractor`].
#[derive(Default)]
pub struct HTMLExtractor {
    /// CSS selector used to pick candidate elements from the document.
    selector: String,
    /// Substrings removed from each selected element's inner HTML
    /// before regex extraction (e.g. unwanted markup fragments).
    removes: Vec<String>,
    /// Regex-based extractor applied to the cleaned element text.
    regextractor: RegexExtractor,
}
impl HTMLExtractor {
pub fn new(selector: String, removes: Vec<String>) -> Self {
Self {
selector,
removes,
regextractor: RegexExtractor::default(),
}
}
}
#[async_trait]
impl SubdomainExtractorInterface for HTMLExtractor {
    /// Parses `content` as an HTML document, selects elements with the
    /// configured CSS selector, strips every configured substring from
    /// each element's inner HTML, and runs the regex extractor against
    /// the cleaned text for the given `domain`.
    ///
    /// Returns the ordered, deduplicated set of discovered subdomains,
    /// or an error if the stored selector string fails to parse.
    async fn extract(&self, content: Content, domain: &str) -> Result<BTreeSet<Subdomain>> {
        let document = Html::parse_document(&content.as_string());
        let selector = Selector::parse(&self.selector)?;

        // Strip every configured substring from an element's inner HTML.
        let cleaned = |element: ElementRef| {
            self.removes
                .iter()
                .fold(element.inner_html(), |text, junk| text.replace(junk, ""))
        };

        let subdomains = document
            .select(&selector)
            .map(cleaned)
            .filter_map(|text| self.regextractor.extract_one(text, domain))
            .collect();

        Ok(subdomains)
    }
}