use crate::url_to_text::fetchers::{ChromeFetcher, RequestFetcher};
use crate::url_to_text::html::extractors::{
Extractor, HtmlClassExtractor, JsonLdExtractor, MicroDataExtractor, ParsingContext,
};
use crate::url_to_text::text::TextExtractor;
use scraper::Html;
use std::error::Error;
use std::time::Duration;
/// Turn a URL into recipe text.
///
/// Strategy: fetch the page with a plain HTTP request, try the structured-data
/// extractors (JSON-LD, microdata, HTML class heuristics) in priority order,
/// and fall back to plain-text extraction — via headless Chrome when one is
/// available, otherwise from the already-fetched HTML.
///
/// # Errors
/// Propagates failures from the HTTP fetch, the Chrome fetch, or the final
/// text-extraction step.
pub async fn process(url: &str) -> Result<String, Box<dyn Error + Send + Sync>> {
    // Plain HTTP fetch with a 30-second timeout.
    let html_content = RequestFetcher::new(Some(Duration::from_secs(30)))
        .fetch(url)
        .await?;

    let context = ParsingContext {
        url: url.to_string(),
        document: Html::parse_document(&html_content),
        texts: None,
    };

    // Structured-data extractors, highest-confidence first; the first one
    // that parses successfully wins and we return immediately.
    let extractors: Vec<Box<dyn Extractor>> = vec![
        Box::new(JsonLdExtractor),
        Box::new(MicroDataExtractor),
        Box::new(HtmlClassExtractor),
    ];
    if let Some(recipe) = extractors.into_iter().find_map(|e| e.parse(&context).ok()) {
        return Ok(recipe.to_text_with_metadata());
    }

    // No structured data found: fall back to plain text. Prefer a headless
    // Chrome render (handles JS-heavy pages); otherwise strip text out of
    // the HTML we already fetched.
    let plain_text = if ChromeFetcher::is_available() {
        ChromeFetcher::new()
            .ok_or("ChromeFetcher is available but failed to initialize")?
            .fetch(url)
            .await?
    } else {
        extract_text_from_html(&html_content)
    };

    Ok(TextExtractor::extract(&plain_text, url).await?)
}
/// Extract the visible text of an HTML document's `<body>`, joined with
/// single spaces.
///
/// Previously every text node was joined verbatim, so the indentation and
/// newlines between tags (whitespace-only text nodes) bloated the output
/// with raw whitespace runs. Each node is now trimmed and empty nodes are
/// dropped, yielding cleanly space-separated text for `TextExtractor`.
/// Returns an empty string when no `<body>` element is present.
fn extract_text_from_html(html: &str) -> String {
    let document = Html::parse_document(html);
    // "body" is a statically-known valid CSS selector; parsing cannot fail.
    let selector = scraper::Selector::parse("body").expect("`body` is a valid selector");
    document
        .select(&selector)
        .next()
        .map(|el| {
            el.text()
                .map(str::trim)
                .filter(|t| !t.is_empty())
                .collect::<Vec<_>>()
                .join(" ")
        })
        .unwrap_or_default()
}
#[cfg(test)]
mod tests {
    use super::*;

    /// The plain-text fallback should surface every piece of visible body
    /// text from a simple document.
    #[test]
    fn test_extract_text_from_html() {
        let html = r#"
        <html>
            <body>
                <h1>Test Recipe</h1>
                <p>Some ingredients</p>
                <p>Some instructions</p>
            </body>
        </html>
        "#;
        let extracted = extract_text_from_html(html);
        for needle in ["Test Recipe", "Some ingredients", "Some instructions"] {
            assert!(extracted.contains(needle));
        }
    }
}