microformats-cli 0.11.0

A command-line tool for parsing HTML as Microformats.
//! Command-line interface for parsing HTML as Microformats.
//!
//! Supports parsing HTML from files or STDIN and outputting structured
//! microformat data as JSON or JF2 format.

use clap::Parser;
use clap_stdin::FileOrStdin;
use microformats::jf2::IntoJf2;

#[cfg(feature = "debug_flow")]
use std::fmt::Display;

#[cfg(feature = "debug_flow")]
use microformats::types::DebugDocument;

#[cfg(feature = "debug_flow")]
use tabled::{Table, Tabled};

// Command-line arguments for the tool, parsed via clap's derive API.
//
// NOTE: the `///` comments on the fields below are consumed by clap as the
// per-argument help text, so editing them changes what `--help` prints.
// Maintainer-only notes therefore use plain `//` comments.
#[derive(Parser, Debug)]
#[clap(
    version,
    about = "A command-line tool for parsing HTML as Microformats.\n\nMicroformats are a web standard for semantic markup, allowing machines to understand the meaning behind HTML content. This parser supports h-card, h-entry, h-event, h-feed, h-product, h-recipe, h-resume, h-review, h-geo, h-adr, and more.\n\nFeatures:\n  • Parse HTML input from files or STDIN\n  • Extract structured microformat data\n  • Output as JSON or JF2 format\n  • Support for base URL specification\n  • Debug mode with element source tracking (when --debug is used)"
)]
struct Arguments {
    /// The URL to use as a base URL, if none can be found from the HTML.
    // Normal mode forwards this as-is (possibly `None`) to the parser; debug
    // mode substitutes "https://example.com" when it is absent.
    #[arg(short, long)]
    base_url: Option<url::Url>,

    /// The HTML to be parsed (can be piped via STDIN or be a path to a file).
    // Positional argument; `main` reads it fully into memory with `contents()`.
    html: FileOrStdin,

    /// Present the result as JF2 instead.
    // Only consulted in normal (non-debug) mode.
    #[arg(short, long, default_value = "false")]
    jf2: bool,

    // Debug mode flags. Everything below only takes effect together with
    // `--debug`, and debug mode itself fails at runtime unless the binary was
    // built with the `debug_flow` feature.
    /// Enable debug mode with detailed source tracking information.
    /// When enabled, shows which HTML elements contributed to each parsed value.
    #[arg(long, default_value = "false")]
    debug: bool,

    /// Output HTML with debug markers (data-mf2-id attributes) instead of parsing.
    /// Useful for visually inspecting which elements were identified during parsing.
    // `requires = "debug"`: clap rejects this flag unless `--debug` is also given.
    #[arg(long, requires = "debug", default_value = "false")]
    debug_html: bool,

    /// Filter output to show only specific microformat types (e.g., "h-entry", "h-card").
    // Applied as a substring match against the item's comma-joined type names
    // (see `print_debug_tables`). `requires = "debug"` as above.
    #[arg(long, requires = "debug")]
    filter: Option<String>,
}

/// One row of the "Items" summary table printed in debug mode.
///
/// The field names are used by the `Tabled` derive, so they are part of the
/// rendered table and should not be renamed casually.
#[cfg(feature = "debug_flow")]
#[derive(Tabled)]
struct ItemTableRow {
    /// Zero-based position of the item in the (possibly filtered) item list.
    idx: usize,
    /// All type names of the item, joined with ", ".
    r#type: String,
    /// The element's `mf2_id`, or "N/A" when the element has none.
    element_id: String,
    /// Tag name of the element the item was parsed from.
    tag: String,
    /// Line of the element in the source, taken from `element.position`.
    line: usize,
    /// Column of the element in the source, taken from `element.position`.
    column: usize,
}

#[cfg(feature = "debug_flow")]
impl Display for ItemTableRow {
    /// Renders the row as one pipe-separated line:
    /// `<n> | <types> | <element id> | <<tag>> | <line>:<column>`,
    /// where `<n>` is the one-based item number.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let Self {
            idx,
            r#type: kinds,
            element_id,
            tag,
            line,
            column,
        } = self;

        // `idx` is stored zero-based; humans get a one-based ordinal.
        let ordinal = idx + 1;

        write!(
            f,
            "{} | {} | {} | <{}> | {}:{}",
            ordinal, kinds, element_id, tag, line, column
        )
    }
}

/// One row of a per-property table printed in debug mode; each row describes
/// a single value of a property and the element it came from.
///
/// The field names are used by the `Tabled` derive, so they are part of the
/// rendered table and should not be renamed casually.
#[cfg(feature = "debug_flow")]
#[derive(Tabled)]
struct PropertyTableRow {
    /// Name of the property this value belongs to.
    property: String,
    /// Zero-based position of this value among the property's values.
    value_idx: usize,
    /// The source element's `mf2_id`, or "N/A" when the element has none.
    element_id: String,
    /// Tag name of the source element.
    tag: String,
    /// Short textual rendering of the value, produced by `format_value_preview`.
    value_preview: String,
    /// Classes of the source element, joined with ", ".
    classes: String,
}

#[cfg(feature = "debug_flow")]
impl Display for PropertyTableRow {
    /// Renders the row as one pipe-separated line:
    /// `<property> | [<value idx>] | <element id> | <<tag>> | <preview> | <classes>`.
    ///
    /// Previews longer than 30 bytes are cut to at most 27 bytes followed by
    /// `...`. The cut is moved back to the nearest UTF-8 character boundary so
    /// that non-ASCII values (very common in real-world content) cannot cause
    /// a slicing panic.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        const MAX_PREVIEW_LEN: usize = 30;
        const TRUNCATED_LEN: usize = 27;

        let preview = if self.value_preview.len() > MAX_PREVIEW_LEN {
            // Slicing a `str` at a byte offset inside a multi-byte character
            // panics, so back up until we sit on a boundary. Offset 0 is
            // always a boundary, which guarantees termination.
            let mut end = TRUNCATED_LEN;
            while !self.value_preview.is_char_boundary(end) {
                end -= 1;
            }
            format!("{}...", &self.value_preview[..end])
        } else {
            self.value_preview.clone()
        };

        write!(
            f,
            "{} | [{}] | {} | <{}> | {} | {}",
            self.property, self.value_idx, self.element_id, self.tag, preview, self.classes
        )
    }
}

fn main() -> eyre::Result<()> {
    let args = Arguments::parse();

    // Extract needed fields before consuming html
    let debug = args.debug;
    let base_url = args.base_url;
    let jf2 = args.jf2;
    let debug_html = args.debug_html;
    let filter = args.filter.clone();

    let html = args.html.contents()?;

    if debug {
        handle_debug_mode_inner(html, base_url, debug_html, filter)?;
    } else {
        // Normal parsing (existing behavior)
        let mut parser = microformats::parse::Parser::from_html(html)?;
        let result = parser.into_document(base_url)?;

        let output = if jf2 {
            serde_json::to_string(&result.into_jf2()?)?
        } else {
            serde_json::to_string(&result)?
        };

        println!("{}", output);
    }

    Ok(())
}

/// Runs debug mode (only compiled with the `debug_flow` feature).
///
/// With `debug_html` set, prints the result of `Parser::to_html()` with id
/// generation enabled and stops. Otherwise parses the document with id
/// generation enabled and prints the debug tables, optionally restricted by
/// `filter`.
///
/// When no `base_url` is supplied, "https://example.com" is used.
///
/// # Errors
///
/// Propagates any error from constructing the parser, rendering the HTML or
/// building the document. If the parsed document carries no debug
/// information, an error is printed to STDERR and the process exits with
/// status 1.
#[cfg(feature = "debug_flow")]
fn handle_debug_mode_inner(
    html: String,
    base_url: Option<url::Url>,
    debug_html: bool,
    filter: Option<String>,
) -> eyre::Result<()> {
    let base_url = match base_url {
        Some(url) => url,
        // Constant, known-good URL: parsing it cannot fail.
        None => "https://example.com".parse::<url::Url>().unwrap(),
    };

    if debug_html {
        // Only emit the annotated HTML; no document is built.
        let annotated = microformats::parse::Parser::from_html(html)?
            .with_id_generation(true)
            .to_html()?;
        println!("{}", annotated);
        return Ok(());
    }

    // Full parse with source tracking enabled.
    let mut parser = microformats::parse::Parser::from_html(html)?.with_id_generation(true);
    let document = parser.into_document(Some(base_url))?;

    if let Some(debug_doc) = document.into_debug_document() {
        print_debug_tables(&debug_doc, filter.as_deref());
    } else {
        eprintln!(
            "Error: Document was parsed without debug tracking. Make sure to use Parser::with_id_generation(true)."
        );
        std::process::exit(1);
    }

    Ok(())
}

/// Fallback used when the binary is built without the `debug_flow` feature:
/// debug mode is unavailable, so every call fails with an explanatory error.
/// All arguments are ignored.
#[cfg(not(feature = "debug_flow"))]
fn handle_debug_mode_inner(
    _html: String,
    _base_url: Option<url::Url>,
    _debug_html: bool,
    _filter: Option<String>,
) -> eyre::Result<()> {
    Err(eyre::eyre!(
        "Debug mode requires the 'debug_flow' feature to be enabled at compile time."
    ))
}

/// Prints the debug report for `debug_doc` to STDOUT.
///
/// The report consists of a summary table of items, a per-item breakdown with
/// one table per property, a child count for items that have children, and a
/// numbered list of relations.
///
/// When `filter` is given, only items whose comma-joined type names contain
/// that string are shown. If nothing is left, "No items found." is printed
/// and the rest of the report (including relations) is skipped.
#[cfg(feature = "debug_flow")]
fn print_debug_tables(debug_doc: &DebugDocument, filter: Option<&str>) {
    use tabled::settings::Style;

    /// Returns the element id, falling back to "N/A" when there is none.
    fn id_or_na(id: Option<&str>) -> String {
        id.unwrap_or("N/A").to_string()
    }

    println!("\n=== Microformats Debug Information ===\n");

    // Pair every item with its ", "-joined type label once; the label drives
    // the filter, the summary table and the per-item heading.
    let items: Vec<_> = debug_doc
        .value_sources
        .items
        .iter()
        .map(|item| {
            let label = item
                .r#type
                .iter()
                .map(|t| t.to_string())
                .collect::<Vec<_>>()
                .join(", ");
            (item, label)
        })
        .filter(|(_, label)| filter.map_or(true, |needle| label.contains(needle)))
        .collect();

    if items.is_empty() {
        println!("No items found.");
        return;
    }

    // Summary table: one row per item.
    println!("\n📋 Items:");
    let mut item_rows = Vec::with_capacity(items.len());
    for (idx, (item, label)) in items.iter().enumerate() {
        item_rows.push(ItemTableRow {
            idx,
            r#type: label.clone(),
            element_id: id_or_na(item.element.mf2_id.as_deref()),
            tag: item.element.tag.clone(),
            line: item.element.position.line,
            column: item.element.position.column,
        });
    }
    println!("{}", Table::new(&item_rows).with(Style::sharp()).to_string());

    // Per-item breakdown.
    for (idx, (item, label)) in items.iter().enumerate() {
        println!("\n📦 Item #{} Details ({})", idx + 1, label);

        if item.properties.is_empty() {
            println!("  No properties found.");
            continue;
        }

        println!("\n  Properties:");
        for (prop_name, sources) in &item.properties {
            println!("\n  📌 {}:", prop_name);

            let mut prop_rows = Vec::new();
            for (value_idx, source) in sources.iter().enumerate() {
                prop_rows.push(PropertyTableRow {
                    property: prop_name.to_string(),
                    value_idx,
                    element_id: id_or_na(source.element.mf2_id.as_deref()),
                    tag: source.element.tag.clone(),
                    value_preview: format_value_preview(&source.value),
                    classes: source.element.classes.join(", "),
                });
            }

            // Indent every line of the rendered table by two spaces.
            let rendered = Table::new(&prop_rows).with(Style::rounded()).to_string();
            println!("  {}", rendered.replace('\n', "\n  "));
        }

        if !item.children.is_empty() {
            println!("\n  📂 Children: {}", item.children.len());
        }
    }

    // Relations: numbered from 1, each with the number of URLs it carries.
    let relations = &debug_doc.value_sources.relations;
    if !relations.is_empty() {
        println!("\n🔗 Relations:");
        for (idx, (url, rel)) in relations.iter().enumerate() {
            println!("  [{}] {} ({})", idx + 1, url, rel.urls.len());
        }
    }

    println!("\n=== End of Debug Information ===\n");
}

/// Produces a short, single-string rendering of a property value for the
/// debug tables.
///
/// Plain, URL and temporal values are rendered via `to_string()`. HTML
/// fragments are prefixed with `[HTML]` and, when longer than 40 bytes, cut to
/// at most 37 bytes plus `...`; images are prefixed with `[IMG]`; nested items
/// are prefixed with `[ITEM]` followed by the debug form of their types.
///
/// The HTML cut is moved back to the nearest UTF-8 character boundary so that
/// markup containing non-ASCII text cannot cause a slicing panic.
#[cfg(feature = "debug_flow")]
fn format_value_preview(value: &microformats::types::PropertyValue) -> String {
    use microformats::types::PropertyValue;

    const MAX_HTML_LEN: usize = 40;
    const TRUNCATED_HTML_LEN: usize = 37;

    match value {
        PropertyValue::Plain(s) => s.to_string(),
        PropertyValue::Url(u) => u.to_string(),
        PropertyValue::Temporal(t) => t.to_string(),
        PropertyValue::Fragment(f) => {
            let html = if f.html.len() > MAX_HTML_LEN {
                // Slicing a `str` inside a multi-byte character panics, so
                // back up to a boundary first. Offset 0 is always a boundary,
                // which guarantees termination.
                let mut end = TRUNCATED_HTML_LEN;
                while !f.html.is_char_boundary(end) {
                    end -= 1;
                }
                format!("{}...", &f.html[..end])
            } else {
                f.html.clone()
            };
            format!("[HTML] {}", html)
        }
        PropertyValue::Image(img) => format!("[IMG] {}", img.value),
        PropertyValue::Item(item) => format!("[ITEM] {:?}", item.r#type),
    }
}