use super::RecipeComponents;
use crate::config::load_config;
use crate::url_to_text::fetchers::{PageScriberFetcher, RequestFetcher};
use crate::url_to_text::html::extractors::{
Extractor, HtmlClassExtractor, JsonLdExtractor, MicroDataExtractor, ParsingContext,
};
use crate::url_to_text::text::TextExtractor;
use scraper::Html;
use std::error::Error;
use std::time::Duration;
/// Fetch `url` and extract its recipe into [`RecipeComponents`].
///
/// Strategy, in order:
/// 1. Fetch the HTML — via PageScriber when the URL's host is in the
///    configured domain list (falling back to a plain HTTP request when the
///    PageScriber fetcher cannot be constructed), otherwise via a plain
///    HTTP request with a 30-second timeout.
/// 2. Run the structured extractors (JSON-LD, microdata, HTML classes).
/// 3. If the plain fetch failed and PageScriber was not yet tried, retry the
///    fetch through PageScriber and re-run structured then LLM extraction.
/// 4. Otherwise fall back to LLM text extraction on the fetched HTML, or
///    return an error when the LLM extractor is not configured.
///
/// # Errors
/// Propagates the fetch error when no fallback produced HTML, and returns a
/// descriptive error when extraction fails and the LLM path is unavailable.
pub async fn process(url: &str) -> Result<RecipeComponents, Box<dyn Error + Send + Sync>> {
    // A config load failure is treated the same as "no PageScriber
    // configured": fall back to the default (presumably empty) config.
    let page_scriber_config = load_config()
        .ok()
        .map(|c| c.page_scriber)
        .unwrap_or_default();
    let use_page_scriber_first = domain_in_list(url, &page_scriber_config.domains);
    // Track which fetcher produced `html_result` so the PageScriber retry
    // below is only attempted when PageScriber has not already been used.
    let (html_result, used_page_scriber) = if use_page_scriber_first {
        match PageScriberFetcher::new(page_scriber_config.url.clone()) {
            Some(fetcher) => (fetcher.fetch(url).await, true),
            // PageScriber could not be constructed (presumably a missing or
            // invalid service URL — confirm against PageScriberFetcher::new);
            // degrade to a plain HTTP request.
            None => {
                let fetcher = RequestFetcher::new(Some(Duration::from_secs(30)));
                (fetcher.fetch(url).await, false)
            }
        }
    } else {
        let fetcher = RequestFetcher::new(Some(Duration::from_secs(30)));
        (fetcher.fetch(url).await, false)
    };
    // Structured data first: cheap and precise when the page provides it.
    if let Ok(html_content) = &html_result {
        if let Some(components) = try_structured_extractors(html_content, url) {
            return Ok(components);
        }
    }
    // Plain fetch failed: one retry through PageScriber, running the full
    // structured-then-LLM pipeline on whatever HTML it returns. Errors on
    // this retry path are swallowed so the original fetch error (below)
    // is the one reported.
    if !used_page_scriber && html_result.is_err() {
        if let Some(fetcher) = PageScriberFetcher::new(page_scriber_config.url.clone()) {
            if let Ok(html_content) = fetcher.fetch(url).await {
                if let Some(components) = try_structured_extractors(&html_content, url) {
                    return Ok(components);
                }
                if TextExtractor::is_available() {
                    let plain_text = extract_text_from_html(&html_content);
                    return TextExtractor::extract(&plain_text, url).await;
                }
            }
        }
    }
    // Propagate the original fetch error when nothing above succeeded.
    let html_content = html_result?;
    if !TextExtractor::is_available() {
        return Err("No recipe found on page. Structured data extractors failed and LLM extraction is not configured.".into());
    }
    // Last resort: strip the HTML to plain text and let the LLM extract.
    let plain_text = extract_text_from_html(&html_content);
    TextExtractor::extract(&plain_text, url).await
}
/// Run the structured-data extractors against `html_content`, in priority
/// order (JSON-LD, then microdata, then HTML class heuristics), returning
/// components from the first extractor that succeeds, or `None` when all
/// of them fail.
fn try_structured_extractors(html_content: &str, url: &str) -> Option<RecipeComponents> {
    let context = ParsingContext {
        url: url.to_string(),
        document: Html::parse_document(html_content),
        texts: None,
    };
    let extractors: Vec<Box<dyn Extractor>> = vec![
        Box::new(JsonLdExtractor),
        Box::new(MicroDataExtractor),
        Box::new(HtmlClassExtractor),
    ];
    extractors
        .into_iter()
        .find_map(|extractor| extractor.parse(&context).ok())
        .map(|recipe| recipe_to_components(&recipe))
}
/// Flatten a parsed [`crate::model::Recipe`] into [`RecipeComponents`]:
/// `text` is trimmed ingredients (one per line), a blank separator line when
/// both sections are present, then the instructions; `metadata` is a
/// newline-joined list of `key: value` lines (description, first image,
/// then any extra metadata pairs).
fn recipe_to_components(recipe: &crate::model::Recipe) -> RecipeComponents {
    // One trimmed ingredient per line, each newline-terminated.
    let mut text: String = recipe
        .ingredients
        .iter()
        .map(|ingredient| format!("{}\n", ingredient.trim()))
        .collect();
    // Blank line between the two sections only when both are non-empty.
    if !recipe.ingredients.is_empty() && !recipe.instructions.is_empty() {
        text.push('\n');
    }
    text.push_str(recipe.instructions.trim_start());

    // Metadata order: description, first image, then the remaining pairs.
    let metadata_lines: Vec<String> = recipe
        .description
        .iter()
        .map(|desc| format!("description: {}", desc))
        .chain(
            recipe
                .image
                .first()
                .map(|first_image| format!("image: {}", first_image)),
        )
        .chain(
            recipe
                .metadata
                .iter()
                .map(|(key, value)| format!("{}: {}", key, value)),
        )
        .collect();

    RecipeComponents {
        text,
        metadata: metadata_lines.join("\n"),
        name: recipe.name.clone(),
    }
}
/// Collapse an HTML document to the space-joined text nodes of its `<body>`.
/// Returns an empty string when the document has no `<body>` element.
fn extract_text_from_html(html: &str) -> String {
    let body_selector =
        scraper::Selector::parse("body").expect("\"body\" is a valid CSS selector");
    let document = Html::parse_document(html);
    match document.select(&body_selector).next() {
        Some(body) => body.text().collect::<Vec<_>>().join(" "),
        None => String::new(),
    }
}
/// Return `true` when `url`'s host is one of `domains` or a subdomain of one.
///
/// The host is the text after `//` up to the first `/`, `?`, or `#`, with any
/// userinfo (`user@`) and `:port` suffix removed. Matching is
/// case-insensitive, since DNS names are (RFC 4343). A bare string with no
/// `//` yields an empty host, which matches nothing.
fn domain_in_list(url: &str, domains: &[String]) -> bool {
    let host = url
        .split("//")
        .nth(1)
        // Cut at the path/query/fragment, whichever comes first.
        .and_then(|rest| rest.split(|c| c == '/' || c == '?' || c == '#').next())
        // Drop any userinfo prefix ("user:pass@host").
        .map(|h| h.rsplit('@').next().unwrap_or(h))
        // Drop the ":port" suffix (bracketed IPv6 hosts keep their colons).
        .map(|h| {
            if let Some(v6) = h.strip_prefix('[') {
                v6.split(']').next().unwrap_or(v6)
            } else {
                h.split(':').next().unwrap_or(h)
            }
        })
        .unwrap_or("")
        .to_ascii_lowercase();
    domains.iter().any(|domain| {
        let domain = domain.to_ascii_lowercase();
        // Exact match, or a strict subdomain: "www.example.com" matches
        // "example.com", but "notexample.com" must not.
        host == domain
            || (host.len() > domain.len()
                && host.ends_with(&domain)
                && host.as_bytes()[host.len() - domain.len() - 1] == b'.')
    })
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Fixture: a domain allow-list containing only `seriouseats.com`.
    fn serious_eats() -> Vec<String> {
        vec!["seriouseats.com".to_string()]
    }

    #[test]
    fn test_extract_text_from_html() {
        let html = r#"
<html>
<body>
<h1>Test Recipe</h1>
<p>Some ingredients</p>
<p>Some instructions</p>
</body>
</html>
"#;
        let text = extract_text_from_html(html);
        for expected in ["Test Recipe", "Some ingredients", "Some instructions"] {
            assert!(text.contains(expected), "missing {:?} in body text", expected);
        }
    }

    #[test]
    fn test_domain_matches_exact() {
        assert!(domain_in_list(
            "https://seriouseats.com/recipe",
            &serious_eats()
        ));
    }

    #[test]
    fn test_domain_matches_subdomain() {
        assert!(domain_in_list(
            "https://www.seriouseats.com/recipe",
            &serious_eats()
        ));
    }

    #[test]
    fn test_domain_no_match() {
        assert!(!domain_in_list("https://example.com/recipe", &serious_eats()));
    }

    #[test]
    fn test_domain_empty_list() {
        assert!(!domain_in_list("https://seriouseats.com/recipe", &[]));
    }

    #[test]
    fn test_domain_invalid_url() {
        assert!(!domain_in_list("not-a-url", &serious_eats()));
    }
}