use super::RecipeComponents;
use crate::url_to_text::fetchers::{ChromeFetcher, RequestFetcher};
use crate::url_to_text::html::extractors::{
Extractor, HtmlClassExtractor, JsonLdExtractor, MicroDataExtractor, ParsingContext,
};
use crate::url_to_text::text::TextExtractor;
use scraper::Html;
use std::error::Error;
use std::time::Duration;
/// Fetches `url` and converts the page into [`RecipeComponents`].
///
/// The page is downloaded once through a `RequestFetcher` (constructed with a
/// 30-second `Duration`) and parsed into a DOM. The structured-data
/// extractors are then tried in a fixed order: JSON-LD, microdata, HTML
/// class. The first one whose `parse` returns `Ok` decides the result; errors
/// from individual extractors are discarded.
///
/// If none of them succeeds, the function falls back to `TextExtractor`
/// (described as LLM extraction by the error message below). Its input text
/// comes from `ChromeFetcher` when that reports itself available, otherwise
/// from the `<body>` text of the HTML that was already downloaded.
///
/// # Errors
///
/// Returns an error when:
/// - the initial fetch fails,
/// - no structured extractor succeeds and `TextExtractor::is_available()` is
///   false,
/// - `ChromeFetcher::is_available()` is true but `ChromeFetcher::new()`
///   yields `None`, or the Chrome fetch itself fails,
/// - `TextExtractor::extract` fails.
pub async fn process(url: &str) -> Result<RecipeComponents, Box<dyn Error + Send + Sync>> {
    let page_fetcher = RequestFetcher::new(Some(Duration::from_secs(30)));
    let html_content = page_fetcher.fetch(url).await?;

    let context = ParsingContext {
        url: url.to_string(),
        document: Html::parse_document(&html_content),
        texts: None,
    };

    // Order matters: the first extractor that succeeds wins.
    let structured_extractors: Vec<Box<dyn Extractor>> = vec![
        Box::new(JsonLdExtractor),
        Box::new(MicroDataExtractor),
        Box::new(HtmlClassExtractor),
    ];
    let structured_hit = structured_extractors
        .into_iter()
        .find_map(|extractor| extractor.parse(&context).ok());
    if let Some(recipe) = structured_hit {
        return Ok(recipe_to_components(&recipe));
    }

    // Structured data was not usable; the text-based extractor is the only
    // remaining option.
    let text_extraction_ready = TextExtractor::is_available();
    if !text_extraction_ready {
        return Err("No recipe found on page. Structured data extractors failed and LLM extraction is not configured.".into());
    }

    let page_text = if !ChromeFetcher::is_available() {
        // No browser: reuse the HTML fetched at the top of the function.
        extract_text_from_html(&html_content)
    } else {
        let browser =
            ChromeFetcher::new().ok_or("ChromeFetcher is available but failed to initialize")?;
        browser.fetch(url).await?
    };

    TextExtractor::extract(&page_text, url).await
}
/// Flattens a parsed recipe into the `text` / `metadata` / `name` triple of
/// [`RecipeComponents`].
///
/// * `text` holds every ingredient, trimmed, on its own line. A blank line
///   follows when both ingredients and instructions are non-empty. The
///   instructions come last with leading whitespace removed.
/// * `metadata` is a newline-joined list of `key: value` lines: the
///   description (if any), the first image (if any), then every entry of
///   `recipe.metadata` in its iteration order.
/// * `name` is a clone of `recipe.name`.
fn recipe_to_components(recipe: &crate::model::Recipe) -> RecipeComponents {
    let mut body = String::new();
    recipe.ingredients.iter().for_each(|item| {
        body.push_str(item.trim());
        body.push('\n');
    });

    // Separate the two sections only when both are actually present.
    let either_section_missing = recipe.ingredients.is_empty() || recipe.instructions.is_empty();
    if !either_section_missing {
        body.push('\n');
    }
    body.push_str(recipe.instructions.trim_start());

    let mut header: Vec<String> = Vec::new();
    header.extend(
        recipe
            .description
            .iter()
            .map(|summary| format!("description: {}", summary)),
    );
    header.extend(
        recipe
            .image
            .first()
            .map(|picture| format!("image: {}", picture)),
    );
    header.extend(
        recipe
            .metadata
            .iter()
            .map(|(field, content)| format!("{}: {}", field, content)),
    );

    RecipeComponents {
        text: body,
        metadata: header.join("\n"),
        name: recipe.name.clone(),
    }
}
/// Returns the text nodes found under the first `<body>` element of `html`,
/// joined with single spaces.
///
/// Yields an empty string when the parsed document has no element matching
/// the `body` selector.
fn extract_text_from_html(html: &str) -> String {
    let body_selector = scraper::Selector::parse("body").unwrap();
    let parsed = Html::parse_document(html);

    match parsed.select(&body_selector).next() {
        Some(body) => {
            let fragments: Vec<_> = body.text().collect();
            fragments.join(" ")
        }
        None => String::new(),
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// The heading and both paragraphs of a minimal page must all appear in
    /// the extracted body text.
    #[test]
    fn test_extract_text_from_html() {
        let html = r#"
<html>
<body>
<h1>Test Recipe</h1>
<p>Some ingredients</p>
<p>Some instructions</p>
</body>
</html>
"#;
        let extracted = extract_text_from_html(html);

        let expected_fragments = ["Test Recipe", "Some ingredients", "Some instructions"];
        for fragment in expected_fragments.iter() {
            assert!(
                extracted.contains(*fragment),
                "extracted text is missing {:?}: {:?}",
                fragment,
                extracted
            );
        }
    }
}