use super::RecipeComponents;
use crate::url_to_text::fetchers::{ChromeFetcher, RequestFetcher};
use crate::url_to_text::html::extractors::{
Extractor, HtmlClassExtractor, JsonLdExtractor, MicroDataExtractor, ParsingContext,
};
use crate::url_to_text::text::TextExtractor;
use scraper::Html;
use std::error::Error;
use std::time::Duration;
pub async fn process(url: &str) -> Result<RecipeComponents, Box<dyn Error + Send + Sync>> {
let fetcher = RequestFetcher::new(Some(Duration::from_secs(30)));
let html_content = fetcher.fetch(url).await?;
let document = Html::parse_document(&html_content);
let context = ParsingContext {
url: url.to_string(),
document,
texts: None,
};
let extractors: Vec<Box<dyn Extractor>> = vec![
Box::new(JsonLdExtractor),
Box::new(MicroDataExtractor),
Box::new(HtmlClassExtractor),
];
for extractor in extractors {
if let Ok(recipe) = extractor.parse(&context) {
return Ok(recipe_to_components(&recipe));
}
}
let plain_text = if ChromeFetcher::is_available() {
let chrome =
ChromeFetcher::new().ok_or("ChromeFetcher is available but failed to initialize")?;
chrome.fetch(url).await?
} else {
extract_text_from_html(&html_content)
};
let text_with_metadata = TextExtractor::extract(&plain_text, url).await?;
Ok(parse_text_to_components(&text_with_metadata))
}
/// Flattens a structured [`crate::model::Recipe`] into [`RecipeComponents`].
///
/// * `text` — one ingredient per line, followed by a blank separator line (only when
///   there is at least one ingredient), followed by the instructions.
/// * `metadata` — newline-joined `key: value` lines: the description (when present), the
///   images joined with `", "` (when there are any), then every entry of
///   `recipe.metadata` in that collection's iteration order.
/// * `name` — a clone of `recipe.name`.
fn recipe_to_components(recipe: &crate::model::Recipe) -> RecipeComponents {
    let mut body = String::new();
    if !recipe.ingredients.is_empty() {
        for item in &recipe.ingredients {
            body.push_str(item);
            body.push('\n');
        }
        // Blank line between the ingredient list and the instructions.
        body.push('\n');
    }
    body.push_str(&recipe.instructions);

    let mut lines: Vec<String> = Vec::new();
    lines.extend(
        recipe
            .description
            .iter()
            .map(|desc| format!("description: {}", desc)),
    );
    if !recipe.image.is_empty() {
        lines.push(format!("image: {}", recipe.image.join(", ")));
    }
    lines.extend(
        recipe
            .metadata
            .iter()
            .map(|(key, value)| format!("{}: {}", key, value)),
    );

    RecipeComponents {
        text: body,
        metadata: lines.join("\n"),
        name: recipe.name.clone(),
    }
}
/// Builds [`RecipeComponents`] from text in the recipe text format.
///
/// The text is split by `crate::model::Recipe::parse_text_format` into a metadata map
/// and a body. The `title` entry (empty string when absent) becomes `name`; every other
/// entry is rendered as a `key: value` line, in the map's iteration order, and the lines
/// are joined with newlines into `metadata`. The body becomes `text` unchanged.
fn parse_text_to_components(text: &str) -> RecipeComponents {
    let (fields, body) = crate::model::Recipe::parse_text_format(text);

    // The title is carried in `name`, so it is left out of the metadata lines.
    let lines: Vec<String> = fields
        .iter()
        .filter_map(|(key, value)| {
            if key != "title" {
                Some(format!("{}: {}", key, value))
            } else {
                None
            }
        })
        .collect();
    let title = fields.get("title").cloned().unwrap_or_default();

    RecipeComponents {
        text: body,
        metadata: lines.join("\n"),
        name: title,
    }
}
/// Returns the text nodes under the first `<body>` element of `html`, joined with single
/// spaces.
///
/// Yields an empty string when the parsed document has no element matching `body`.
/// Whitespace inside the individual text nodes is kept as-is.
fn extract_text_from_html(html: &str) -> String {
    // "body" is a constant, valid selector, so parsing it is not expected to fail.
    let body_selector = scraper::Selector::parse("body").unwrap();
    let parsed = Html::parse_document(html);

    let body = match parsed.select(&body_selector).next() {
        Some(element) => element,
        None => return String::new(),
    };

    let fragments: Vec<&str> = body.text().collect();
    fragments.join(" ")
}
#[cfg(test)]
mod tests {
    use super::*;

    /// The heading and both paragraphs inside `<body>` must all appear in the extracted
    /// text.
    #[test]
    fn test_extract_text_from_html() {
        let html = r#"
<html>
<body>
<h1>Test Recipe</h1>
<p>Some ingredients</p>
<p>Some instructions</p>
</body>
</html>
"#;
        let extracted = extract_text_from_html(html);

        for expected in ["Test Recipe", "Some ingredients", "Some instructions"] {
            assert!(
                extracted.contains(expected),
                "extracted text is missing {:?}",
                expected
            );
        }
    }
}