//! Command-line interface for parsing HTML as Microformats.
//!
//! Reads HTML from a file or from STDIN and prints the extracted microformat
//! data either as JSON or in JF2 form.
use clap::Parser;
use clap_stdin::FileOrStdin;
use microformats::jf2::IntoJf2;
#[cfg(feature = "debug_flow")]
use std::fmt::Display;
#[cfg(feature = "debug_flow")]
use microformats::types::DebugDocument;
#[cfg(feature = "debug_flow")]
use tabled::{Table, Tabled};
// Command-line arguments, parsed with clap's derive API (`Arguments::parse()` in `main`).
// The `///` comments on the fields below are used by clap as the per-option help
// text, so treat them as user-facing strings rather than internal notes.
#[derive(Parser, Debug)]
#[clap(
version,
about = "A command-line tool for parsing HTML as Microformats.\n\nMicroformats are a web standard for semantic markup, allowing machines to understand the meaning behind HTML content. This parser supports h-card, h-entry, h-event, h-feed, h-product, h-recipe, h-resume, h-review, h-geo, h-adr, and more.\n\nFeatures:\n • Parse HTML input from files or STDIN\n • Extract structured microformat data\n • Output as JSON or JF2 format\n • Support for base URL specification\n • Debug mode with element source tracking (when --debug is used)"
)]
struct Arguments {
/// The URL to use as a base URL, if none can be found from the HTML.
#[arg(short, long)]
base_url: Option<url::Url>,
/// The HTML to be parsed (can be piped via STDIN or be a path to a file).
// Positional argument (no `#[arg]` attribute); `main` reads it with `contents()`.
html: FileOrStdin,
/// Present the result as JF2 instead.
#[arg(short, long, default_value = "false")]
jf2: bool,
// Debug mode flags
// `main` routes to `handle_debug_mode_inner` when `debug` is set; that function
// only does real work when the crate is built with the `debug_flow` feature.
/// Enable debug mode with detailed source tracking information.
/// When enabled, shows which HTML elements contributed to each parsed value.
#[arg(long, default_value = "false")]
debug: bool,
/// Output HTML with debug markers (data-mf2-id attributes) instead of parsing.
/// Useful for visually inspecting which elements were identified during parsing.
// `requires = "debug"`: only accepted together with `--debug`.
#[arg(long, requires = "debug", default_value = "false")]
debug_html: bool,
/// Filter output to show only specific microformat types (e.g., "h-entry", "h-card").
// `requires = "debug"`: only accepted together with `--debug`. The value is
// matched as a substring of the comma-joined type list in `print_debug_tables`.
#[arg(long, requires = "debug")]
filter: Option<String>,
}
/// One row of the items summary table built in `print_debug_tables`.
#[cfg(feature = "debug_flow")]
#[derive(Tabled)]
struct ItemTableRow {
/// Zero-based position of the item in the (possibly filtered) item list.
// NOTE(review): the `Display` impl and the per-item headers print `idx + 1`;
// the table presumably shows this raw zero-based value — confirm that is intended.
idx: usize,
/// The item's types, joined with ", ".
r#type: String,
/// The element's `mf2_id`, or "N/A" when it has none.
element_id: String,
/// Tag name of the element the item was found on.
tag: String,
/// Line taken from the element's recorded position.
line: usize,
/// Column taken from the element's recorded position.
column: usize,
}
#[cfg(feature = "debug_flow")]
impl Display for ItemTableRow {
    /// Renders the row on a single line as
    /// `<ordinal> | <types> | <element id> | <<tag>> | <line>:<column>`,
    /// where the ordinal is the stored zero-based index plus one.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let Self {
            idx,
            r#type: kinds,
            element_id,
            tag,
            line,
            column,
        } = self;
        // The index is stored zero-based; show it one-based.
        let ordinal = idx + 1;
        write!(
            f,
            "{} | {} | {} | <{}> | {}:{}",
            ordinal, kinds, element_id, tag, line, column
        )
    }
}
/// One row of a per-property table built in `print_debug_tables`; each row
/// describes a single value of a property and the element it came from.
#[cfg(feature = "debug_flow")]
#[derive(Tabled)]
struct PropertyTableRow {
/// Name of the property this value belongs to.
property: String,
/// Zero-based position of the value within the property's value list.
value_idx: usize,
/// The source element's `mf2_id`, or "N/A" when it has none.
element_id: String,
/// Tag name of the source element.
tag: String,
/// Short textual rendering of the value, produced by `format_value_preview`.
value_preview: String,
/// The source element's classes, joined with ", ".
classes: String,
}
#[cfg(feature = "debug_flow")]
impl Display for PropertyTableRow {
    /// Renders the row on a single line as
    /// `<property> | [<value idx>] | <element id> | <<tag>> | <preview> | <classes>`.
    ///
    /// A preview longer than 30 characters is cut to its first 27 characters
    /// followed by `...`.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        // Measure and cut in characters, not bytes: slicing a `str` at a byte
        // offset that falls inside a multi-byte UTF-8 character panics, and
        // previews routinely contain non-ASCII text taken from the page.
        let is_long = self.value_preview.chars().nth(30).is_some();
        let preview = if is_long {
            let head: String = self.value_preview.chars().take(27).collect();
            format!("{}...", head)
        } else {
            self.value_preview.clone()
        };
        write!(
            f,
            "{} | [{}] | {} | <{}> | {} | {}",
            self.property, self.value_idx, self.element_id, self.tag, preview, self.classes
        )
    }
}
fn main() -> eyre::Result<()> {
let args = Arguments::parse();
// Extract needed fields before consuming html
let debug = args.debug;
let base_url = args.base_url;
let jf2 = args.jf2;
let debug_html = args.debug_html;
let filter = args.filter.clone();
let html = args.html.contents()?;
if debug {
handle_debug_mode_inner(html, base_url, debug_html, filter)?;
} else {
// Normal parsing (existing behavior)
let mut parser = microformats::parse::Parser::from_html(html)?;
let result = parser.into_document(base_url)?;
let output = if jf2 {
serde_json::to_string(&result.into_jf2()?)?
} else {
serde_json::to_string(&result)?
};
println!("{}", output);
}
Ok(())
}
/// Runs debug mode (only available with the `debug_flow` feature).
///
/// * `html` - the HTML source to parse.
/// * `base_url` - base URL handed to the parser; falls back to
///   `https://example.com` when absent. Not used when `debug_html` is set.
/// * `debug_html` - when `true`, print the HTML produced by `to_html()` with ID
///   generation enabled and stop, without building a document.
/// * `filter` - optional type filter forwarded to `print_debug_tables`.
///
/// # Errors
///
/// Returns an error when the HTML cannot be parsed, the annotated HTML or the
/// document cannot be produced, or the parsed document carries no debug
/// tracking information.
#[cfg(feature = "debug_flow")]
fn handle_debug_mode_inner(
    html: String,
    base_url: Option<url::Url>,
    debug_html: bool,
    filter: Option<String>,
) -> eyre::Result<()> {
    if debug_html {
        // Just output annotated HTML.
        let parser = microformats::parse::Parser::from_html(html)?;
        let annotated = parser.with_id_generation(true).to_html()?;
        println!("{}", annotated);
        return Ok(());
    }

    // The fallback is only needed on the parsing path, so build it here.
    let base_url = base_url.unwrap_or_else(|| {
        "https://example.com"
            .parse()
            .expect("hard-coded fallback base URL is a valid URL")
    });

    // Parse with debug info.
    let mut parser = microformats::parse::Parser::from_html(html)?.with_id_generation(true);
    let doc = parser.into_document(Some(base_url))?;

    // Report the failure through the normal error path (as the non-feature
    // variant of this function does) instead of terminating the process from
    // inside a helper; `main` still ends with a non-zero status and prints the
    // message prefixed with "Error: ".
    let debug_doc = doc.into_debug_document().ok_or_else(|| {
        eyre::eyre!(
            "Document was parsed without debug tracking. Make sure to use Parser::with_id_generation(true)."
        )
    })?;
    print_debug_tables(&debug_doc, filter.as_deref());
    Ok(())
}
/// Stand-in for debug mode when the crate is built without the `debug_flow`
/// feature: ignores every argument and always fails with an explanatory error.
#[cfg(not(feature = "debug_flow"))]
fn handle_debug_mode_inner(
    _html: String,
    _base_url: Option<url::Url>,
    _debug_html: bool,
    _filter: Option<String>,
) -> eyre::Result<()> {
    Err(eyre::eyre!(
        "Debug mode requires the 'debug_flow' feature to be enabled at compile time."
    ))
}
/// Prints a human-readable debug report for `debug_doc` to STDOUT.
///
/// The report consists of a summary table of items, a per-item section with
/// one table per property, and a list of relations.
///
/// * `debug_doc` - the document whose `value_sources` are reported.
/// * `filter` - when present, only items whose comma-joined type list contains
///   this substring are shown.
///
/// When no item remains after filtering, "No items found." is printed and the
/// function returns before the relations and the closing banner are printed.
#[cfg(feature = "debug_flow")]
fn print_debug_tables(debug_doc: &DebugDocument, filter: Option<&str>) {
    use tabled::settings::Style;

    println!("\n=== Microformats Debug Information ===\n");

    // Pair each item with its ", "-joined type label. The label is needed for
    // filtering, for the summary table and for the detail headers, so build it
    // once here instead of re-joining it in three places.
    let items: Vec<_> = debug_doc
        .value_sources
        .items
        .iter()
        .map(|item| {
            let label = item
                .r#type
                .iter()
                .map(|t| t.to_string())
                .collect::<Vec<_>>()
                .join(", ");
            (item, label)
        })
        .filter(|(_, label)| filter.map_or(true, |wanted| label.contains(wanted)))
        .collect();

    if items.is_empty() {
        println!("No items found.");
        return;
    }

    // Items summary table.
    println!("\n📋 Items:");
    let item_rows: Vec<ItemTableRow> = items
        .iter()
        .enumerate()
        .map(|(idx, (item, label))| ItemTableRow {
            idx,
            r#type: label.clone(),
            element_id: item
                .element
                .mf2_id
                .clone()
                .unwrap_or_else(|| "N/A".to_string()),
            tag: item.element.tag.clone(),
            line: item.element.position.line,
            column: item.element.position.column,
        })
        .collect();
    let item_table = Table::new(&item_rows).with(Style::sharp()).to_string();
    println!("{}", item_table);

    // Detailed property information for each item.
    for (idx, (item, label)) in items.iter().enumerate() {
        println!("\n📦 Item #{} Details ({})", idx + 1, label);

        if item.properties.is_empty() {
            println!(" No properties found.");
            continue;
        }

        println!("\n Properties:");
        for (prop_name, sources) in &item.properties {
            println!("\n 📌 {}:", prop_name);
            let prop_rows: Vec<PropertyTableRow> = sources
                .iter()
                .enumerate()
                .map(|(val_idx, prop)| PropertyTableRow {
                    property: prop_name.to_string(),
                    value_idx: val_idx,
                    element_id: prop
                        .element
                        .mf2_id
                        .clone()
                        .unwrap_or_else(|| "N/A".to_string()),
                    tag: prop.element.tag.clone(),
                    value_preview: format_value_preview(&prop.value),
                    classes: prop.element.classes.join(", "),
                })
                .collect();
            let prop_table = Table::new(&prop_rows).with(Style::rounded()).to_string();
            // Re-prefix every line of the rendered table so it stays nested
            // under the property heading.
            println!(" {}", prop_table.replace('\n', "\n "));
        }

        if !item.children.is_empty() {
            println!("\n 📂 Children: {}", item.children.len());
        }
    }

    // Relations summary, numbered from 1; printed straight from the iterator,
    // no intermediate collection needed.
    if !debug_doc.value_sources.relations.is_empty() {
        println!("\n🔗 Relations:");
        for (idx, (url, rel)) in debug_doc.value_sources.relations.iter().enumerate() {
            let url = url.to_string();
            println!(" [{}] {} ({})", idx + 1, url, rel.urls.len());
        }
    }

    println!("\n=== End of Debug Information ===\n");
}
/// Builds a short, single-string preview of a property value for the debug
/// tables.
///
/// Plain, URL and temporal values are rendered with `to_string()`. HTML
/// fragments are prefixed with `[HTML]` and cut to their first 37 characters
/// plus `...` when longer than 40 characters. Images are shown as
/// `[IMG] <value>` and nested items as `[ITEM] <types>` (debug-formatted).
#[cfg(feature = "debug_flow")]
fn format_value_preview(value: &microformats::types::PropertyValue) -> String {
    use microformats::types::PropertyValue;
    match value {
        PropertyValue::Plain(s) => s.to_string(),
        PropertyValue::Url(u) => u.to_string(),
        PropertyValue::Temporal(t) => t.to_string(),
        PropertyValue::Fragment(f) => {
            // Measure and cut in characters, not bytes: slicing a `str` at a
            // byte offset inside a multi-byte UTF-8 character panics, and
            // fragment HTML commonly contains non-ASCII text.
            if f.html.chars().nth(40).is_some() {
                let head: String = f.html.chars().take(37).collect();
                format!("[HTML] {}...", head)
            } else {
                format!("[HTML] {}", f.html)
            }
        }
        PropertyValue::Image(img) => format!("[IMG] {}", img.value),
        PropertyValue::Item(item) => format!("[ITEM] {:?}", item.r#type),
    }
}