use clap::{Parser, ValueEnum};
use std::fs;
use std::path::{Path, PathBuf};
use super::super::output::color;
use super::super::parser::ModelBackend;
use super::super::utils::parse_grounded_document;
/// Export entities (and relations, when the model provides them) from text or JSON documents
#[derive(Parser, Debug)]
pub struct ExportArgs {
    // NOTE: the `///` comments below double as clap help text, so each one is
    // kept to a single plain-text paragraph without markdown.
    /// Input file, or a directory to scan for .txt, .json and .jsonl files
    #[arg(short, long, value_name = "PATH")]
    pub input: PathBuf,

    /// Directory to write exported files into (created if it does not exist)
    #[arg(short, long, value_name = "DIR")]
    pub output: PathBuf,

    /// Output format
    #[arg(short, long, default_value = "brat")]
    pub format: ExportFormat,

    /// Model backend used for extraction from plain-text inputs
    #[arg(short, long, default_value = "stacked")]
    pub model: ModelBackend,

    /// Overwrite output files that already exist instead of failing
    #[arg(long)]
    pub overwrite: bool,

    /// Ask the exporter to include confidence values (used by brat, jsonl, jsonld and graph-csv)
    #[arg(long)]
    pub include_confidence: bool,

    /// Base URI passed to the ntriples, jsonld and graph-ntriples exporters
    #[arg(long, default_value = "urn:anno:")]
    pub base_uri: String,

    /// Suppress the progress and per-file status messages printed to stderr
    #[arg(short, long)]
    pub quiet: bool,
}
/// Output formats accepted by the `--format` option.
///
/// Variant doc comments are also used by clap as the per-value help text, so
/// they are kept to one plain-text line each.
#[derive(Debug, Clone, Copy, Default, ValueEnum)]
pub enum ExportFormat {
    /// Annotation file STEM.ann plus a STEM.txt copy of the document text
    #[default]
    Brat,

    /// CoNLL output written to STEM.conll
    Conll,

    /// JSON Lines output written to STEM.jsonl
    Jsonl,

    /// N-Triples output written to STEM.nt
    // Explicit name: clap's default kebab-case rename would yield `n-triples`.
    #[value(name = "ntriples")]
    NTriples,

    /// JSON-LD output written to STEM.jsonld
    // Explicit name: clap's default kebab-case rename would yield `json-ld`.
    #[value(name = "jsonld")]
    JsonLd,

    /// N-Triples built from a knowledge graph, written to STEM.nt
    // Only compiled in when the crate is built with the `graph` feature.
    #[cfg(feature = "graph")]
    #[value(name = "graph-ntriples")]
    GraphNTriples,

    /// Graph CSV pair written to STEM-nodes.csv and STEM-edges.csv
    #[value(name = "graph-csv", alias = "kuzu")]
    GraphCsv,
}
/// Entities and relations gathered for a single input document.
struct Extracted {
/// Entities extracted from, or loaded for, the document.
entities: Vec<anno::Entity>,
/// Relations for the document; left empty by `Extracted::entities_only`.
relations: Vec<anno::Relation>,
}
impl Extracted {
    /// Builds an `Extracted` that carries `entities` and an empty relation list.
    fn entities_only(entities: Vec<anno::Entity>) -> Self {
        let relations = Vec::new();
        Self {
            entities,
            relations,
        }
    }
}
pub fn run(args: ExportArgs) -> Result<(), String> {
if !args.input.exists() {
return Err(format!("Input not found: {:?}", args.input));
}
if !args.output.exists() {
fs::create_dir_all(&args.output)
.map_err(|e| format!("Failed to create output directory: {}", e))?;
}
let relation_model = args.model.try_create_relation_model().transpose()?;
let plain_model = if relation_model.is_none() {
Some(args.model.create_model()?)
} else {
None
};
let files: Vec<PathBuf> = if args.input.is_file() {
vec![args.input.clone()]
} else {
fs::read_dir(&args.input)
.map_err(|e| format!("Failed to read directory: {}", e))?
.filter_map(|e| e.ok())
.map(|e| e.path())
.filter(|p| {
p.is_file()
&& p.extension()
.is_some_and(|e| e == "txt" || e == "json" || e == "jsonl")
})
.collect()
};
if files.is_empty() {
return Err("No .txt/.json/.jsonl files found in input".into());
}
if !args.quiet {
let mode = if relation_model.is_some() {
" (relation-capable)"
} else {
""
};
eprintln!(
"{} Exporting {} files to {:?} format{}",
color("32", "[export]"),
files.len(),
args.format,
mode,
);
}
let mut success_count = 0;
let mut error_count = 0;
for file in &files {
let is_json = file
.extension()
.is_some_and(|e| e == "json" || e == "jsonl");
let extracted = if is_json {
let json_content =
fs::read_to_string(file).map_err(|e| format!("Failed to read file: {}", e))?;
let doc = parse_grounded_document(&json_content)?;
let entities: Vec<anno::Entity> = doc
.signals()
.iter()
.filter_map(|s| {
let (start, end) = s.text_offsets()?;
Some(anno::Entity::new(
s.surface(),
s.label.to_entity_type(),
start,
end,
s.confidence.value(),
))
})
.collect();
Ok((doc.text().to_owned(), Extracted::entities_only(entities)))
} else if let Some(ref rm) = relation_model {
let content =
fs::read_to_string(file).map_err(|e| format!("Failed to read file: {}", e))?;
let (entities, relations) = rm
.extract_relations_default(&content)
.map_err(|e| format!("Extraction failed: {}", e))?;
Ok((
content,
Extracted {
entities,
relations,
},
))
} else if let Some(ref m) = plain_model {
let content =
fs::read_to_string(file).map_err(|e| format!("Failed to read file: {}", e))?;
let entities = m
.extract_entities(&content, None)
.map_err(|e| format!("Extraction failed: {}", e))?;
Ok((content, Extracted::entities_only(entities)))
} else {
Err("No model available".to_string())
};
match extracted {
Err(e) => {
error_count += 1;
if !args.quiet {
eprintln!(
" {} {:?}: {}",
color("31", "✗"),
file.file_name().unwrap_or_default(),
e
);
}
}
Ok((content, ext)) => {
let entity_count = ext.entities.len();
let rel_count = ext.relations.len();
match export_file(ExportFileOpts {
input: file,
output_dir: &args.output,
content: &content,
ext,
format: args.format,
include_confidence: args.include_confidence,
overwrite: args.overwrite,
base_uri: &args.base_uri,
}) {
Ok(()) => {
success_count += 1;
if !args.quiet {
let rel_suffix = if rel_count > 0 {
format!(", {} relations", rel_count)
} else {
String::new()
};
eprintln!(
" {} {:?} ({} entities{})",
color("32", "✓"),
file.file_name().unwrap_or_default(),
entity_count,
rel_suffix,
);
}
}
Err(e) => {
error_count += 1;
if !args.quiet {
eprintln!(
" {} {:?}: {}",
color("31", "✗"),
file.file_name().unwrap_or_default(),
e
);
}
}
}
}
}
}
if !args.quiet {
eprintln!();
eprintln!(
"{} Exported {} files ({} failed)",
color("32", "[done]"),
success_count,
error_count
);
}
if error_count > 0 && success_count == 0 {
Err("All exports failed".into())
} else {
Ok(())
}
}
/// Parameters for exporting one document with `export_file`.
struct ExportFileOpts<'a> {
/// Path of the source document; its file stem names the output file(s) and
/// its lossy string form is handed to exporters as the source label.
input: &'a Path,
/// Directory the output file(s) are written into.
output_dir: &'a Path,
/// Document text; passed to the brat and CoNLL exporters and written as the
/// `.txt` file that accompanies brat output.
content: &'a str,
/// Entities and relations to export.
ext: Extracted,
/// Output format to produce.
format: ExportFormat,
/// Forwarded to the exporters that take a confidence flag.
include_confidence: bool,
/// When `false`, an already existing output file is reported as an error.
overwrite: bool,
/// Base URI forwarded to the N-Triples, JSON-LD and graph N-Triples exporters.
base_uri: &'a str,
}
fn export_file(opts: ExportFileOpts<'_>) -> Result<(), String> {
let ExportFileOpts {
input,
output_dir,
content,
ext,
format,
include_confidence,
overwrite,
base_uri,
} = opts;
let stem = input.file_stem().unwrap_or_default().to_string_lossy();
if matches!(format, ExportFormat::GraphCsv) {
let nodes_path = output_dir.join(format!("{}-nodes.csv", stem));
let edges_path = output_dir.join(format!("{}-edges.csv", stem));
for path in [&nodes_path, &edges_path] {
if path.exists() && !overwrite {
return Err(format!(
"Output file already exists: {:?} (use --overwrite)",
path
));
}
}
let (nodes_csv, edges_csv) = anno::export::to_graph_csv(
&ext.entities,
&ext.relations,
&input.to_string_lossy(),
include_confidence,
);
fs::write(&nodes_path, nodes_csv)
.map_err(|e| format!("Failed to write nodes CSV: {}", e))?;
fs::write(&edges_path, edges_csv)
.map_err(|e| format!("Failed to write edges CSV: {}", e))?;
return Ok(());
}
let output_path = match format {
ExportFormat::Brat => output_dir.join(format!("{}.ann", stem)),
ExportFormat::Conll => output_dir.join(format!("{}.conll", stem)),
ExportFormat::Jsonl => output_dir.join(format!("{}.jsonl", stem)),
ExportFormat::NTriples => output_dir.join(format!("{}.nt", stem)),
ExportFormat::JsonLd => output_dir.join(format!("{}.jsonld", stem)),
#[cfg(feature = "graph")]
ExportFormat::GraphNTriples => output_dir.join(format!("{}.nt", stem)),
_ => {
return Err(format!(
"Unsupported single-file export format: {:?}",
format
))
}
};
if output_path.exists() && !overwrite {
return Err(format!(
"Output file already exists: {:?} (use --overwrite)",
output_path
));
}
let source_label = input.to_string_lossy();
let output_content = match format {
ExportFormat::Brat => anno::export::to_brat(content, &ext.entities, include_confidence),
ExportFormat::Conll => anno::export::to_conll(content, &ext.entities),
ExportFormat::Jsonl => {
anno::export::to_jsonl(&ext.entities, &source_label, include_confidence)
}
ExportFormat::NTriples => {
anno::export::to_ntriples(&ext.entities, &ext.relations, &source_label, base_uri)
}
ExportFormat::JsonLd => anno::export::to_jsonld(
&ext.entities,
&ext.relations,
&source_label,
include_confidence,
base_uri,
),
#[cfg(feature = "graph")]
ExportFormat::GraphNTriples => {
export_graph_ntriples(&ext.entities, &ext.relations, input, base_uri)
}
_ => {
return Err(format!(
"Unsupported single-file export format: {:?}",
format
))
}
};
fs::write(&output_path, output_content)
.map_err(|e| format!("Failed to write output: {}", e))?;
if matches!(format, ExportFormat::Brat) {
let txt_path = output_dir.join(format!("{}.txt", stem));
if !txt_path.exists() || overwrite {
fs::write(&txt_path, content)
.map_err(|e| format!("Failed to write text file: {}", e))?;
}
}
Ok(())
}
/// Renders entities and relations as newline-separated N-Triples lines via a
/// knowledge graph.
///
/// The document IRI is `<base>/doc/<stem>`, where `<base>` is `base_uri`
/// without trailing slashes and `<stem>` is the source file stem passed
/// through `anno::graph::uri_safe`. The graph itself is built with the
/// untrimmed `base_uri`. The result has no trailing newline.
#[cfg(feature = "graph")]
fn export_graph_ntriples(
    entities: &[anno::Entity],
    relations: &[anno::Relation],
    source: &Path,
    base_uri: &str,
) -> String {
    let raw_stem = source.file_stem().unwrap_or_default().to_string_lossy();
    let doc_iri = format!(
        "{}/doc/{}",
        base_uri.trim_end_matches('/'),
        anno::graph::uri_safe(&raw_stem)
    );
    let graph = anno::graph::entities_to_knowledge_graph(entities, relations, &doc_iri, base_uri);
    let lines: Vec<_> = graph.triples().map(|triple| triple.to_ntriples()).collect();
    lines.join("\n")
}
#[cfg(test)]
mod tests {
    use super::*;

    /// A period directly after an entity becomes its own `O` token.
    #[test]
    fn conll_splits_trailing_period_from_entity() {
        let apple = anno::Entity::new("Apple", anno::EntityType::Organization, 0, 5, 0.9);
        let tim_cook = anno::Entity::new("Tim Cook", anno::EntityType::Person, 10, 18, 0.9);
        let rendered = anno::export::to_conll("Apple CEO Tim Cook.", &[apple, tim_cook]);
        let rows: Vec<&str> = rendered.lines().collect();
        assert_eq!(
            rows,
            [
                "Apple\tB-ORG",
                "CEO\tO",
                "Tim\tB-PER",
                "Cook\tI-PER",
                ".\tO",
            ]
        );
    }

    /// Text without trailing punctuation yields one row per word.
    #[test]
    fn conll_no_trailing_punct_unchanged() {
        let tim_cook = anno::Entity::new("Tim Cook", anno::EntityType::Person, 0, 8, 0.9);
        let rendered = anno::export::to_conll("Tim Cook spoke", &[tim_cook]);
        let rows: Vec<&str> = rendered.lines().collect();
        assert_eq!(rows, ["Tim\tB-PER", "Cook\tI-PER", "spoke\tO"]);
    }

    /// A run of punctuation characters stays together as a single token.
    #[test]
    fn conll_multiple_punct_chars() {
        let no_entities: Vec<anno::Entity> = Vec::new();
        let rendered = anno::export::to_conll("Really?!", &no_entities);
        let rows: Vec<&str> = rendered.lines().collect();
        // Only the first two rows are pinned; any further rows are not checked.
        assert_eq!(&rows[..2], &["Really\tO", "?!\tO"]);
    }
}