use clap::Parser;
use std::fs;
use super::super::output::{color, print_signals};
use super::super::parser::ModelBackend;
use super::super::utils::{get_input_text, link_tracks_to_kb, resolve_coreference};
#[cfg(feature = "eval")]
use crate::cli::ingest::{CompositeResolver, UrlResolver};
use anno::core::grounded::{render_document_html, GroundedDocument, Location, Signal, SignalId};
#[cfg(feature = "graph")]
use anno::graph::{GraphDocument, GraphExportFormat};
use anno::ingest::DocumentPreprocessor;
// CLI arguments for the `debug` subcommand.
//
// NOTE: plain `//` comments are used deliberately — `///` doc comments on a
// clap derive struct become `--help` text, which would change CLI output.
#[derive(Parser, Debug)]
pub struct DebugArgs {
    // Inline input text (alternative to --file / --url / positional).
    #[arg(short, long)]
    pub text: Option<String>,
    // Read input text from this file path.
    #[arg(short, long, value_name = "PATH")]
    pub file: Option<String>,
    // Positional input text words (joined by the input resolver).
    #[arg(value_name = "TEXT")]
    pub positional: Vec<String>,
    // Fetch input text from a URL (requires the 'eval' cargo feature).
    #[arg(long, value_name = "URL")]
    pub url: Option<String>,
    // Preprocessing toggles; any of these enables the DocumentPreprocessor.
    #[arg(long)]
    pub clean: bool,
    #[arg(long)]
    pub normalize: bool,
    #[arg(long)]
    pub detect_lang: bool,
    // Graph export format: neo4j|cypher, networkx|nx, jsonld|json-ld
    // (requires the 'graph' cargo feature).
    #[arg(long, value_name = "FORMAT")]
    pub export_graph: Option<String>,
    // Extraction model backend; defaults to the stacked model.
    #[arg(short, long, default_value = "stacked")]
    pub model: ModelBackend,
    // Render the document as HTML (also implied by an --output path
    // ending in ".html").
    #[arg(long)]
    pub html: bool,
    // Write a JSON export of the document to this path.
    #[arg(long, value_name = "PATH")]
    pub export: Option<String>,
    // JSON export shape: "full", "signals", or "minimal".
    #[arg(long, default_value = "full", value_name = "FORMAT")]
    pub export_format: String,
    // Output path for rendered HTML; stdout when omitted.
    #[arg(short, long, value_name = "PATH")]
    pub output: Option<String>,
    // Run coreference resolution over extracted signals.
    #[arg(long)]
    pub coref: bool,
    // Link resolved tracks to the knowledge base.
    #[arg(long)]
    pub link_kb: bool,
    // Suppress status/diagnostic messages on stderr.
    #[arg(short, long)]
    pub quiet: bool,
    // Verbosity level; repeatable (-v, -vv, ...).
    #[arg(short, long, action = clap::ArgAction::Count)]
    pub verbose: u8,
}
/// Runs the `debug` subcommand: resolves input text, extracts entities into a
/// `GroundedDocument`, optionally resolves coreference and links to the KB,
/// then exports (JSON / graph) and/or renders (HTML / terminal) the result.
///
/// # Errors
/// Returns a human-readable `String` for input-resolution, extraction,
/// serialization, or filesystem failures, and when a required cargo feature
/// (`eval` for URL fetching, `graph` for graph export) is not compiled in.
pub fn run(args: DebugArgs) -> Result<(), String> {
    // Document id: prefer the URL, then the file path, then a fixed fallback.
    let doc_id = if let Some(url) = &args.url {
        url.clone()
    } else if let Some(path) = &args.file {
        path.clone()
    } else {
        "debug".to_string()
    };
    // Resolve the raw input text. URL fetching needs the 'eval' feature;
    // exactly one of the two cfg blocks below is compiled into this branch.
    let mut raw_text = if let Some(url) = &args.url {
        #[cfg(feature = "eval")]
        {
            let resolver = CompositeResolver::new();
            let resolved = resolver
                .resolve(url)
                .map_err(|e| format!("Failed to fetch URL {}: {}", url, e))?;
            resolved.text
        }
        #[cfg(not(feature = "eval"))]
        {
            #[allow(unused_variables)]
            let _url = url;
            return Err(
                "URL resolution requires 'eval' feature. Enable with: cargo build -p anno-cli --features eval"
                    .to_string(),
            );
        }
    } else {
        get_input_text(&args.text, args.file.as_deref(), &args.positional)?
    };
    // Optional preprocessing pass; only constructed when at least one
    // toggle is set so the default path leaves the text untouched.
    if args.clean || args.normalize || args.detect_lang {
        let preprocessor = DocumentPreprocessor {
            clean_whitespace: args.clean,
            normalize_unicode: args.normalize,
            detect_language: args.detect_lang,
            chunk_size: None,
        };
        let prepared = preprocessor.prepare(&raw_text);
        raw_text = prepared.text;
        if args.verbose >= 1 && !prepared.metadata.is_empty() {
            eprintln!("Preprocessing metadata: {:?}", prepared.metadata);
        }
    }
    let text = raw_text;
    // Extract entities with the selected model backend and ground them
    // as signals in a new document.
    let model = args.model.create_model()?;
    let entities = model
        .extract_entities(&text, None)
        .map_err(|e| format!("Extraction failed: {}", e))?;
    let mut doc = GroundedDocument::new(doc_id, &text);
    let mut signal_ids: Vec<SignalId> = Vec::new();
    for e in &entities {
        let id = doc.add_signal(Signal::from(e));
        signal_ids.push(id);
    }
    if args.coref {
        resolve_coreference(&mut doc, &text, &signal_ids);
    }
    if args.link_kb {
        link_tracks_to_kb(&mut doc);
    }
    // Optional JSON export in one of three shapes: full document, signals
    // with text, or a minimal offsets-only view.
    if let Some(export_path) = args.export {
        let export_data = match args.export_format.as_str() {
            "full" => serde_json::to_value(&doc)
                .map_err(|e| format!("Failed to serialize GroundedDocument: {}", e))?,
            "signals" => {
                let signals: Vec<_> = doc.signals().to_vec();
                serde_json::json!({
                    "id": doc.id(),
                    "text": doc.text(),
                    "signals": signals
                })
            }
            "minimal" => {
                let signals: Vec<_> = doc
                    .signals()
                    .iter()
                    .map(|s| {
                        // Missing offsets degrade to (0, 0) rather than failing
                        // the whole export.
                        let (start, end) = s.text_offsets().unwrap_or((0, 0));
                        serde_json::json!({
                            "surface": s.surface(),
                            "label": s.label(),
                            "start": start,
                            "end": end,
                            "confidence": s.confidence
                        })
                    })
                    .collect();
                serde_json::json!({
                    "id": doc.id(),
                    "text": doc.text(),
                    "signals": signals
                })
            }
            _ => {
                return Err(format!(
                    "Invalid export format '{}'. Use: full, signals, or minimal",
                    args.export_format
                ));
            }
        };
        let json = serde_json::to_string_pretty(&export_data)
            .map_err(|e| format!("Failed to serialize export data: {}", e))?;
        // Create the parent directory on demand so exports to fresh paths work.
        if let Some(parent) = std::path::Path::new(&export_path).parent() {
            if !parent.exists() {
                fs::create_dir_all(parent).map_err(|e| {
                    format!(
                        "Failed to create directory for export file '{}': {}",
                        export_path, e
                    )
                })?;
            }
        }
        fs::write(&export_path, json)
            .map_err(|e| format!("Failed to write export file '{}': {}", export_path, e))?;
        if !args.quiet {
            eprintln!(
                "{} Exported {} format to {}",
                color("32", "✓"),
                args.export_format,
                export_path
            );
        }
    }
    // Build the text index (side effect on doc; returned index unused here)
    // and surface validation problems as non-fatal warnings.
    let _index = doc.build_text_index();
    let errors = doc.validate();
    if !errors.is_empty() && !args.quiet {
        eprintln!(
            "{} {} validation errors:",
            color("33", "warning:"),
            errors.len()
        );
        for e in &errors {
            eprintln!(" - {}", e);
        }
    }
    // Render: HTML when requested explicitly or implied by an .html output
    // path; otherwise print signals to the terminal.
    if args.html
        || args
            .output
            .as_ref()
            .map(|p| p.ends_with(".html"))
            .unwrap_or(false)
    {
        let html = render_document_html(&doc);
        if let Some(path) = &args.output {
            fs::write(path, &html).map_err(|e| format!("Failed to write {}: {}", path, e))?;
            if !args.quiet {
                println!("{} HTML written to: {}", color("32", "ok:"), path);
            }
        } else {
            println!("{}", html);
        }
    } else if doc.signals().is_empty() {
        println!("(no entities)");
    } else {
        // Coref/KB results are only visible at verbosity >= 2, so bump the
        // effective level when either was requested.
        let effective_verbose = if args.coref || args.link_kb {
            args.verbose.max(2)
        } else {
            args.verbose
        };
        print_signals(&doc, &text, effective_verbose);
    }
    // Optional graph export; requires the 'graph' feature. Exactly one of
    // the two cfg blocks below is compiled in.
    if let Some(graph_format_str) = args.export_graph.as_deref() {
        #[cfg(not(feature = "graph"))]
        {
            let _ = graph_format_str;
            return Err("Graph export requires the 'graph' feature to be enabled.".to_string());
        }
        #[cfg(feature = "graph")]
        {
            let graph_format = match graph_format_str.to_lowercase().as_str() {
                "neo4j" | "cypher" => GraphExportFormat::Cypher,
                "networkx" | "nx" => GraphExportFormat::NetworkXJson,
                "jsonld" | "json-ld" => GraphExportFormat::JsonLd,
                _ => {
                    return Err(format!(
                        "Invalid graph format '{}'. Use: neo4j, networkx, or jsonld",
                        graph_format_str
                    ));
                }
            };
            let graph = anno::graph::grounded_to_graph_document(&doc);
            let graph_output = graph.export(graph_format);
            if !args.quiet {
                eprintln!(
                    "{} Exported graph ({} nodes, {} edges) in {} format",
                    color("32", "✓"),
                    graph.node_count(),
                    graph.edge_count(),
                    graph_format_str
                );
            }
            // Graph payload goes to stdout so it can be piped.
            println!("{}", graph_output);
        }
    }
    Ok(())
}