use clap::Parser;
use std::collections::HashMap;
use std::fs;
use std::path::PathBuf;
use super::super::output::color;
use super::super::parser::ModelBackend;
/// Options for the singleton analysis report.
#[derive(Parser, Debug)]
pub struct SingletonArgs {
/// File whose contents are analyzed; takes precedence over `--text` when both are given.
#[arg(short, long)]
pub input: Option<PathBuf>,
/// Inline text to analyze; used only when `--input` is absent.
#[arg(short, long)]
pub text: Option<String>,
/// Model backend used to extract entities.
#[arg(short, long, default_value = "stacked")]
pub model: ModelBackend,
/// Output format: `json` or `tsv`; any other value selects the human-readable report.
#[arg(long, default_value = "human")]
pub format: String,
/// In the human-readable report, also list likely-genuine singletons and every classified singleton.
#[arg(short, long)]
pub verbose: bool,
/// In the human-readable report, print only one tab-separated line: singletons, clustered, ratio.
#[arg(short, long)]
pub quiet: bool,
}
/// Aggregated result of a singleton analysis, consumed by the report printers.
#[derive(Debug, Clone)]
pub struct SingletonReport {
/// Number of entities that were analyzed.
pub total_entities: usize,
/// Number of entities counted as singletons.
pub singleton_count: usize,
/// Number of entities counted as belonging to a cluster.
pub clustered_count: usize,
/// `singleton_count / total_entities`, or `0.0` when there are no entities.
pub singleton_ratio: f32,
/// Every singleton, grouped by its entity type label.
pub singletons_by_type: HashMap<String, Vec<SingletonEntity>>,
/// Singletons whose reason is `SingletonReason::LikelyMissed`.
pub likely_missed: Vec<SingletonEntity>,
/// Singletons whose reason is `UniqueProperNoun` or `FirstMentionOnly`.
pub likely_genuine: Vec<SingletonEntity>,
}
/// One entity mention together with the reason it was classified as a singleton.
#[derive(Debug, Clone)]
pub struct SingletonEntity {
/// Surface text of the mention.
pub text: String,
/// Entity type label (for example `PER`).
pub entity_type: String,
/// Start offset of the mention, copied from the extracted entity.
// NOTE(review): the classifier indexes the input with `chars()`, so this looks like a
// character offset rather than a byte offset — confirm against the `anno` crate.
pub start: usize,
/// End offset of the mention, copied from the extracted entity.
pub end: usize,
/// Extraction confidence converted to `f32`.
pub confidence: f32,
/// Heuristic explanation for why the mention has no coreference partner.
pub reason: SingletonReason,
}
/// Heuristic explanation of why a mention ended up as a singleton.
#[derive(Debug, Clone)]
pub enum SingletonReason {
/// Fallback used when none of the other heuristics apply.
FirstMentionOnly,
/// Capitalized mention that is not preceded by sentence-ending punctuation (`.`, `!`, `?`).
UniqueProperNoun,
/// Mention whose first word is `a`, `an`, `the`, `some` or `any`.
GenericReference,
/// Another extracted entity looks like the same referent, so a coreference link was probably missed.
LikelyMissed {
/// Text of the other entity this mention resembles.
similar_to: String,
/// Heuristic similarity score; `1.0` means the lowercased texts are identical.
similarity: f32,
},
/// Mention is shortly preceded by ` of `, ` for `, ` at ` or `'s `.
PartOfCompound,
}
/// Entry point of the singleton analysis command.
///
/// Loads the text (from `--input` if present, otherwise from `--text`), extracts
/// entities with the selected model backend, classifies them and prints the report
/// in the requested format.
///
/// # Errors
///
/// Returns a message when no input was supplied, when the input file cannot be
/// read, when the model cannot be created, or when entity extraction fails.
pub fn run(args: SingletonArgs) -> Result<(), String> {
    // A file path wins over inline text when both are supplied.
    let text = match (&args.input, &args.text) {
        (Some(path), _) => {
            fs::read_to_string(path).map_err(|e| format!("Failed to read file: {}", e))?
        }
        (None, Some(inline)) => inline.clone(),
        (None, None) => return Err("No input provided. Use --input or --text".into()),
    };

    let backend = args.model.create_model()?;
    let entities = backend
        .extract_entities(&text, None)
        .map_err(|e| format!("Extraction failed: {}", e))?;

    let report = analyze_singletons(&entities, &text);

    // Anything other than "json" or "tsv" falls back to the human-readable report.
    let format = args.format.as_str();
    if format == "json" {
        print_json_report(&report);
    } else if format == "tsv" {
        print_tsv_report(&report);
    } else {
        print_human_report(&report, &text, args.verbose, args.quiet);
    }

    Ok(())
}
/// Classifies every entity and assembles the [`SingletonReport`].
///
/// No cluster information is passed in, so every entity is counted as a
/// singleton and `clustered_count` is always `0`. Each entity is recorded under
/// its type label and, depending on its [`SingletonReason`], additionally in
/// `likely_missed` or `likely_genuine`.
fn analyze_singletons(entities: &[anno::Entity], text: &str) -> SingletonReport {
    let total_entities = entities.len();
    let singleton_count = total_entities;
    let clustered_count = 0;

    let mut by_type: HashMap<String, Vec<SingletonEntity>> = HashMap::new();
    let mut missed = Vec::new();
    let mut genuine = Vec::new();

    for entity in entities {
        let record = SingletonEntity {
            text: entity.text.clone(),
            entity_type: entity.entity_type.as_label().to_string(),
            start: entity.start(),
            end: entity.end(),
            confidence: f32::from(entity.confidence),
            reason: classify_singleton(entity, entities, text),
        };

        // Generic references and compound parts are only listed under their type.
        match &record.reason {
            SingletonReason::LikelyMissed { .. } => missed.push(record.clone()),
            SingletonReason::UniqueProperNoun | SingletonReason::FirstMentionOnly => {
                genuine.push(record.clone())
            }
            SingletonReason::GenericReference | SingletonReason::PartOfCompound => {}
        }

        by_type
            .entry(record.entity_type.clone())
            .or_default()
            .push(record);
    }

    let singleton_ratio = if total_entities > 0 {
        singleton_count as f32 / total_entities as f32
    } else {
        0.0
    };

    SingletonReport {
        total_entities,
        singleton_count,
        clustered_count,
        singleton_ratio,
        singletons_by_type: by_type,
        likely_missed: missed,
        likely_genuine: genuine,
    }
}
/// Heuristically explains why `entity` has no coreference partner.
///
/// The checks run in this order and the first match wins:
///
/// 1. [`SingletonReason::GenericReference`] — the first word is `a`, `an`, `the`,
///    `some` or `any` (case-insensitive).
/// 2. [`SingletonReason::LikelyMissed`] — some other entity in `all_entities`
///    * has the same lowercased text at a different start offset (similarity `1.0`),
///    * contains or is contained in this text with a byte-length ratio above `0.5`, or
///    * is, like this one, a multi-word `PER` entity ending in the same word
///      (case-insensitive, similarity `0.7`).
/// 3. [`SingletonReason::PartOfCompound`] — the up to ten characters before the
///    mention contain ` of `, ` for `, ` at ` or `'s `.
/// 4. [`SingletonReason::UniqueProperNoun`] — the mention starts with an uppercase
///    letter and the nearest non-whitespace character before it is not `.`, `!` or `?`.
/// 5. [`SingletonReason::FirstMentionOnly`] — otherwise.
///
/// `entity.start()` is used as a character (not byte) index into `text`.
fn classify_singleton(
    entity: &anno::Entity,
    all_entities: &[anno::Entity],
    text: &str,
) -> SingletonReason {
    const GENERIC_DETERMINERS: [&str; 5] = ["a", "an", "the", "some", "any"];
    const CONTEXT_CHARS: usize = 10;

    let entity_text = entity.text.to_lowercase();
    // Words are taken from the lowercased text so every comparison below is
    // case-insensitive, including the surname match.
    let entity_words: Vec<&str> = entity_text.split_whitespace().collect();

    if entity_words
        .first()
        .is_some_and(|w| GENERIC_DETERMINERS.contains(w))
    {
        return SingletonReason::GenericReference;
    }

    let is_person = entity.entity_type.as_label() == "PER";
    let start = entity.start();

    for other in all_entities {
        if std::ptr::eq(entity, other) {
            continue;
        }
        let other_text = other.text.to_lowercase();

        if entity_text == other_text && start != other.start() {
            return SingletonReason::LikelyMissed {
                similar_to: other.text.clone(),
                similarity: 1.0,
            };
        }

        if entity_text.contains(&other_text) || other_text.contains(&entity_text) {
            let shorter = entity_text.len().min(other_text.len());
            let longer = entity_text.len().max(other_text.len());
            // Two empty texts give 0/0 = NaN, which fails the comparison below.
            let similarity = shorter as f32 / longer as f32;
            if similarity > 0.5 {
                return SingletonReason::LikelyMissed {
                    similar_to: other.text.clone(),
                    similarity,
                };
            }
        }

        if is_person && entity_words.len() > 1 && other.entity_type.as_label() == "PER" {
            let other_words: Vec<&str> = other_text.split_whitespace().collect();
            if other_words.len() > 1 && entity_words.last() == other_words.last() {
                return SingletonReason::LikelyMissed {
                    similar_to: other.text.clone(),
                    similarity: 0.7,
                };
            }
        }
    }

    // Up to CONTEXT_CHARS characters immediately preceding the mention.
    let window_start = start.saturating_sub(CONTEXT_CHARS);
    let before_context: String = text
        .chars()
        .skip(window_start)
        .take(start - window_start)
        .collect();
    if [" of ", " for ", " at ", "'s "]
        .iter()
        .any(|marker| before_context.contains(marker))
    {
        return SingletonReason::PartOfCompound;
    }

    if entity.text.chars().next().is_some_and(char::is_uppercase) {
        // Look past the whitespace that separates sentences: the character
        // directly before a mention is nearly always a space, so checking only
        // that one would never detect a sentence-initial capital.
        let prev_visible = text
            .chars()
            .take(start)
            .filter(|c| !c.is_whitespace())
            .last();
        if prev_visible.is_some_and(|c| !matches!(c, '.' | '!' | '?')) {
            return SingletonReason::UniqueProperNoun;
        }
    }

    SingletonReason::FirstMentionOnly
}
fn print_human_report(report: &SingletonReport, _text: &str, verbose: bool, quiet: bool) {
if quiet {
println!(
"{}\t{}\t{:.1}%",
report.singleton_count,
report.clustered_count,
report.singleton_ratio * 100.0
);
return;
}
println!("{}", color("1;36", "Singleton Analysis Report"));
println!();
println!("{}:", color("1;33", "Summary"));
println!(" Total entities: {}", report.total_entities);
println!(" Singletons: {}", report.singleton_count);
println!(" Clustered: {}", report.clustered_count);
println!(" Singleton ratio: {:.1}%", report.singleton_ratio * 100.0);
println!();
println!("{}:", color("1;33", "By Entity Type"));
for (entity_type, singletons) in &report.singletons_by_type {
println!(" {}: {} singletons", entity_type, singletons.len());
}
println!();
if !report.likely_missed.is_empty() {
println!("{}:", color("1;31", "Likely Missed Coreferences"));
for s in &report.likely_missed {
if let SingletonReason::LikelyMissed {
similar_to,
similarity,
} = &s.reason
{
println!(
" \"{}\" ↔ \"{}\" ({:.0}% similar)",
s.text,
similar_to,
similarity * 100.0
);
}
}
println!();
}
if verbose && !report.likely_genuine.is_empty() {
println!("{}:", color("1;32", "Likely Genuine Singletons"));
for s in &report.likely_genuine {
let reason_str = match &s.reason {
SingletonReason::UniqueProperNoun => "unique proper noun",
SingletonReason::FirstMentionOnly => "first mention only",
_ => "other",
};
println!(" \"{}\" [{}] - {}", s.text, s.entity_type, reason_str);
}
println!();
}
if verbose {
println!("{}:", color("1;33", "All Singletons"));
for (entity_type, singletons) in &report.singletons_by_type {
for s in singletons {
let reason_str = match &s.reason {
SingletonReason::FirstMentionOnly => "first_mention",
SingletonReason::UniqueProperNoun => "unique_proper",
SingletonReason::GenericReference => "generic_ref",
SingletonReason::LikelyMissed { .. } => "likely_missed",
SingletonReason::PartOfCompound => "compound_part",
};
println!(
" {} \"{}\" @{}:{} [{}]",
entity_type, s.text, s.start, s.end, reason_str
);
}
}
}
}
fn print_json_report(report: &SingletonReport) {
let json = serde_json::json!({
"total_entities": report.total_entities,
"singleton_count": report.singleton_count,
"clustered_count": report.clustered_count,
"singleton_ratio": report.singleton_ratio,
"by_type": report.singletons_by_type.iter().map(|(t, s)| {
(t.clone(), serde_json::json!({
"count": s.len(),
"entities": s.iter().map(|e| serde_json::json!({
"text": e.text,
"start": e.start,
"end": e.end,
"confidence": e.confidence,
"reason": format!("{:?}", e.reason),
})).collect::<Vec<_>>()
}))
}).collect::<HashMap<_, _>>(),
"likely_missed": report.likely_missed.iter().map(|e| {
serde_json::json!({
"text": e.text,
"entity_type": e.entity_type,
"reason": format!("{:?}", e.reason),
})
}).collect::<Vec<_>>(),
"likely_genuine": report.likely_genuine.len(),
});
println!(
"{}",
serde_json::to_string_pretty(&json).unwrap_or_default()
);
}
fn print_tsv_report(report: &SingletonReport) {
println!("text\ttype\tstart\tend\tconfidence\treason");
for singletons in report.singletons_by_type.values() {
for s in singletons {
let reason_str = match &s.reason {
SingletonReason::FirstMentionOnly => "first_mention",
SingletonReason::UniqueProperNoun => "unique_proper",
SingletonReason::GenericReference => "generic_ref",
SingletonReason::LikelyMissed { similar_to, .. } => {
&format!("missed:{}", similar_to)
}
SingletonReason::PartOfCompound => "compound_part",
};
println!(
"{}\t{}\t{}\t{}\t{:.2}\t{}",
s.text, s.entity_type, s.start, s.end, s.confidence, reason_str
);
}
}
}