// cooklang-import 0.9.2
//
// A tool for importing recipes into Cooklang format.
// Documentation
use super::RecipeComponents;
use crate::url_to_text::fetchers::{ChromeFetcher, RequestFetcher};
use crate::url_to_text::html::extractors::{
    Extractor, HtmlClassExtractor, JsonLdExtractor, MicroDataExtractor, ParsingContext,
};
use crate::url_to_text::text::TextExtractor;
use scraper::Html;
use std::error::Error;
use std::time::Duration;

/// Extract recipe content from the page at `url`.
///
/// The stages run in this order; the first one that produces a result ends the call:
/// 1. Download the HTML with `RequestFetcher`.
/// 2. If the download succeeded, run the structured extractors (JSON-LD, microdata,
///    HTML class) in that order and return the first recipe one of them parses.
/// 3. If `ChromeFetcher` is available, fetch the page again through it and pass the
///    result to the LLM-based `TextExtractor` (configured via OPENAI_API_KEY).
/// 4. Otherwise, pass the text of the downloaded page's `<body>` to `TextExtractor`.
///
/// # Arguments
/// * `url` - The URL to fetch and process
///
/// # Returns
/// * `Ok(RecipeComponents)` - The extracted recipe components
/// * `Err(...)` - If no stage produced a recipe. This covers: `TextExtractor` not
///   being configured, `ChromeFetcher` failing to initialize or fetch, the initial
///   download failing when `ChromeFetcher` is not available, or `TextExtractor`
///   itself returning an error.
pub async fn process(url: &str) -> Result<RecipeComponents, Box<dyn Error + Send + Sync>> {
    // Both fallback paths report a missing LLM configuration with the same message.
    const LLM_NOT_CONFIGURED: &str = "No recipe found on page. Structured data extractors failed and LLM extraction is not configured.";

    // Stage 1: plain HTTP download. A failure is kept rather than returned right away,
    // because the ChromeFetcher path below does not need this download to have worked.
    let http_fetcher = RequestFetcher::new(Some(Duration::from_secs(30)));
    let download = http_fetcher.fetch(url).await;

    // Stage 2: structured extractors, only when there is HTML to parse.
    // The parsed document stays inside this scope so it is dropped before the
    // fallback stages start awaiting.
    if let Ok(html) = download.as_ref() {
        let context = ParsingContext {
            url: url.to_owned(),
            document: Html::parse_document(html),
            texts: None,
        };

        let structured: Vec<Box<dyn Extractor>> = vec![
            Box::new(JsonLdExtractor),
            Box::new(MicroDataExtractor),
            Box::new(HtmlClassExtractor),
        ];

        // Extractor errors are ignored on purpose: the next extractor gets its turn.
        let first_hit = structured
            .into_iter()
            .find_map(|extractor| extractor.parse(&context).ok());
        if let Some(recipe) = first_hit {
            return Ok(recipe_to_components(&recipe));
        }
    }

    // Stage 4 (no ChromeFetcher): LLM extraction over the text of the downloaded HTML.
    // A failed download is reported here, before the LLM configuration is checked.
    if !ChromeFetcher::is_available() {
        let html = download?;

        if !TextExtractor::is_available() {
            return Err(LLM_NOT_CONFIGURED.into());
        }

        let body_text = extract_text_from_html(&html);
        return TextExtractor::extract(&body_text, url).await;
    }

    // Stage 3 (ChromeFetcher available; intended for Cloudflare and JS-rendered pages):
    // the outcome of the plain download no longer matters on this path.
    if !TextExtractor::is_available() {
        return Err(LLM_NOT_CONFIGURED.into());
    }

    let browser =
        ChromeFetcher::new().ok_or("ChromeFetcher is available but failed to initialize")?;
    let rendered_text = browser.fetch(url).await?;
    TextExtractor::extract(&rendered_text, url).await
}

/// Flatten a parsed `Recipe` into `RecipeComponents`.
///
/// * `text` - every ingredient, trimmed and on its own line, followed by the
///   instructions with leading whitespace removed. One blank line separates the two
///   sections, but only when both are non-empty.
/// * `metadata` - newline-joined `key: value` lines (without `---` delimiters): the
///   description if present, then the first image if any, then every entry of
///   `recipe.metadata`. Values are written verbatim; nothing is quoted or escaped.
/// * `name` - a clone of the recipe name.
fn recipe_to_components(recipe: &crate::model::Recipe) -> RecipeComponents {
    // Ingredient section: one trimmed ingredient per line.
    let mut text = String::new();
    for line in recipe.ingredients.iter().map(|item| item.trim()) {
        text.push_str(line);
        text.push('\n');
    }

    // Blank separator line only when there is something on both sides of it.
    let both_sections_present =
        !(recipe.ingredients.is_empty() || recipe.instructions.is_empty());
    if both_sections_present {
        text.push('\n');
    }
    text.push_str(recipe.instructions.trim_start());

    // Metadata section: optional description, optional first image, then the rest.
    let mut entries: Vec<String> = Vec::new();
    entries.extend(
        recipe
            .description
            .as_ref()
            .map(|desc| format!("description: {}", desc)),
    );
    // Additional images beyond the first are dropped.
    entries.extend(
        recipe
            .image
            .first()
            .map(|first_image| format!("image: {}", first_image)),
    );
    for (key, value) in &recipe.metadata {
        entries.push(format!("{}: {}", key, value));
    }

    RecipeComponents {
        name: recipe.name.clone(),
        metadata: entries.join("\n"),
        text,
    }
}

/// Collect the text content of an HTML page's `<body>`.
///
/// Parses `html` as a full document, takes the first element matching the `body`
/// selector, and joins its text nodes with single spaces. Returns an empty string
/// when no such element is found. This is the basic fallback used when the
/// structured extractors fail.
fn extract_text_from_html(html: &str) -> String {
    // "body" is a fixed selector; the unwrap only guards against it failing to parse.
    let body_selector = scraper::Selector::parse("body").unwrap();
    let parsed = Html::parse_document(html);

    match parsed.select(&body_selector).next() {
        Some(body) => {
            let fragments: Vec<_> = body.text().collect();
            fragments.join(" ")
        }
        None => String::new(),
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Text inside heading and paragraph elements of `<body>` must all appear
    /// in the extracted output.
    #[test]
    fn test_extract_text_from_html() {
        let page = r#"
            <html>
            <body>
                <h1>Test Recipe</h1>
                <p>Some ingredients</p>
                <p>Some instructions</p>
            </body>
            </html>
        "#;

        let extracted = extract_text_from_html(page);

        let expected_fragments = ["Test Recipe", "Some ingredients", "Some instructions"];
        for fragment in expected_fragments.iter() {
            assert!(extracted.contains(fragment));
        }
    }
}