use std::collections::HashMap;
use std::hash::BuildHasher;
use panproto_gat::Theory;
use panproto_schema::{EdgeRule, Protocol, Schema, SchemaBuilder};
use crate::emit::{children_by_edge, constraint_value};
use crate::error::ProtocolError;
use crate::theories;
/// Protocol descriptor for the CoNLL-U format: theory names, edge
/// rules, the vertex kinds a document may contain, and the constraint
/// sorts used for per-vertex metadata.
#[must_use]
pub fn protocol() -> Protocol {
    // Vertex kinds a CoNLL-U schema may contain.
    let obj_kinds = [
        "sentence", "word", "multiword", "empty", "upos-tag", "xpos-tag", "deprel", "feature",
        "lemma",
    ];
    // Constraint sorts used for plain per-vertex metadata.
    let constraint_sorts = [
        "form", "id-range", "head", "misc", "sent-id", "text", "newpar", "newdoc",
    ];
    Protocol {
        name: "conllu".into(),
        schema_theory: "ThConlluSchema".into(),
        instance_theory: "ThConlluInstance".into(),
        edge_rules: edge_rules(),
        obj_kinds: obj_kinds.into_iter().map(String::from).collect(),
        constraint_sorts: constraint_sorts.into_iter().map(String::from).collect(),
        has_order: true,
        ..Protocol::default()
    }
}
/// Register the paired CoNLL-U schema/instance theories in `registry`
/// under the names declared by [`protocol`] ("ThConlluSchema" and
/// "ThConlluInstance").  Generic over the map's hasher so callers may
/// use a non-default `BuildHasher`.
pub fn register_theories<S: BuildHasher>(registry: &mut HashMap<String, Theory, S>) {
    theories::register_hypergraph_functor(registry, "ThConlluSchema", "ThConlluInstance");
}
/// Parse CoNLL-U text into a [`Schema`] under the `conllu` protocol.
///
/// Each sentence becomes a `sentence` vertex carrying its comment
/// metadata (`sent_id`, `text`, `newdoc`, `newpar`) as constraints.
/// Every token line becomes a `word`, `multiword`, or `empty` vertex
/// (classified by its id column) linked from the sentence via a
/// `contains` edge.  For plain words the LEMMA, UPOS, XPOS, and FEATS
/// columns spawn satellite vertices; basic and enhanced dependency
/// heads are recorded first and resolved into `dep` / `enhanced-dep`
/// edges once all tokens of the sentence exist.  Each non-empty
/// sentence is finally wrapped in one hyper-edge.
///
/// # Errors
///
/// Returns [`ProtocolError::Parse`] when the input contains no
/// sentences, a token line does not have exactly 10 tab-separated
/// columns, or the schema builder rejects a vertex, edge, or
/// hyper-edge.
#[allow(clippy::too_many_lines)]
pub fn parse_conllu(input: &str) -> Result<Schema, ProtocolError> {
    let proto = protocol();
    let mut builder = SchemaBuilder::new(&proto);
    let mut he_counter: usize = 0;
    let sentences = split_sentences(input);
    if sentences.is_empty() {
        return Err(ProtocolError::Parse("no sentences found".into()));
    }
    for (sent_counter, (comments, token_lines)) in sentences.iter().enumerate() {
        let sent_id = format!("sent_{sent_counter}");
        builder = builder
            .vertex(&sent_id, "sentence", None)
            .map_err(|e| ProtocolError::Parse(e.to_string()))?;
        for comment in comments {
            let trimmed = comment.trim_start_matches('#').trim();
            // Keys are matched up to a word boundary (see `comment_value`)
            // so e.g. "# text_en = ..." is not mistaken for "# text = ...".
            // The bare "newdoc"/"newpar" forms are tested before their
            // "newdoc id = ..." variants.
            if let Some(val) = comment_value(trimmed, "sent_id") {
                builder = builder.constraint(&sent_id, "sent-id", val);
            } else if let Some(val) = comment_value(trimmed, "text") {
                builder = builder.constraint(&sent_id, "text", val);
            } else if trimmed == "newdoc" {
                builder = builder.constraint(&sent_id, "newdoc", "true");
            } else if let Some(val) = comment_value(trimmed, "newdoc id") {
                builder = builder.constraint(&sent_id, "newdoc", val);
            } else if trimmed == "newpar" {
                builder = builder.constraint(&sent_id, "newpar", "true");
            } else if let Some(val) = comment_value(trimmed, "newpar id") {
                builder = builder.constraint(&sent_id, "newpar", val);
            }
        }
        // CoNLL-U id column ("1", "2-3", "1.1", ...) -> vertex id; used
        // to resolve dependency heads after all tokens are created.
        let mut token_ids: HashMap<String, String> = HashMap::new();
        // (dependent vertex, HEAD column, DEPREL column) — basic deps.
        let mut deferred_deps: Vec<(String, String, String)> = Vec::new();
        // (dependent vertex, raw DEPS column) — enhanced deps.
        let mut deferred_enhanced: Vec<(String, String)> = Vec::new();
        for line in token_lines {
            let cols: Vec<&str> = line.split('\t').collect();
            if cols.len() != 10 {
                return Err(ProtocolError::Parse(format!(
                    "expected 10 columns, got {}: {line}",
                    cols.len()
                )));
            }
            let id_col = cols[0];
            let form = cols[1];
            let lemma_str = cols[2];
            let upos = cols[3];
            let xpos = cols[4];
            let feats = cols[5];
            let head = cols[6];
            let deprel_str = cols[7];
            let deps_col = cols[8];
            let misc = cols[9];
            let token_kind = classify_id(id_col);
            let token_vertex_id = format!("{sent_id}.tok_{id_col}");
            builder = builder
                .vertex(&token_vertex_id, &token_kind, None)
                .map_err(|e| ProtocolError::Parse(e.to_string()))?;
            // The edge name records the original id column so the
            // emitter can restore token order.
            builder = builder
                .edge(&sent_id, &token_vertex_id, "contains", Some(id_col))
                .map_err(|e| ProtocolError::Parse(e.to_string()))?;
            builder = builder.constraint(&token_vertex_id, "form", form);
            builder = builder.constraint(&token_vertex_id, "id-range", id_col);
            // Only plain words get lemma/UPOS/XPOS/FEATS satellites;
            // multiword ranges and empty nodes leave those columns as "_".
            let is_word = token_kind == "word";
            if is_word {
                if lemma_str != "_" {
                    let lemma_id = format!("{token_vertex_id}.lemma");
                    builder = builder
                        .vertex(&lemma_id, "lemma", None)
                        .map_err(|e| ProtocolError::Parse(e.to_string()))?;
                    builder = builder
                        .edge(&token_vertex_id, &lemma_id, "lemma-of", Some(lemma_str))
                        .map_err(|e| ProtocolError::Parse(e.to_string()))?;
                }
                if upos != "_" {
                    let upos_id = format!("{token_vertex_id}.upos");
                    builder = builder
                        .vertex(&upos_id, "upos-tag", None)
                        .map_err(|e| ProtocolError::Parse(e.to_string()))?;
                    builder = builder
                        .edge(&token_vertex_id, &upos_id, "upos", Some(upos))
                        .map_err(|e| ProtocolError::Parse(e.to_string()))?;
                }
                if xpos != "_" {
                    let xpos_id = format!("{token_vertex_id}.xpos");
                    builder = builder
                        .vertex(&xpos_id, "xpos-tag", None)
                        .map_err(|e| ProtocolError::Parse(e.to_string()))?;
                    builder = builder
                        .edge(&token_vertex_id, &xpos_id, "xpos", Some(xpos))
                        .map_err(|e| ProtocolError::Parse(e.to_string()))?;
                }
                if feats != "_" {
                    // One `feature` vertex per "Key=Value" entry; the
                    // pair itself is stored as the edge name.
                    for (fi, feat_pair) in feats.split('|').enumerate() {
                        let feat_id = format!("{token_vertex_id}.feat_{fi}");
                        builder = builder
                            .vertex(&feat_id, "feature", None)
                            .map_err(|e| ProtocolError::Parse(e.to_string()))?;
                        builder = builder
                            .edge(&token_vertex_id, &feat_id, "feat", Some(feat_pair))
                            .map_err(|e| ProtocolError::Parse(e.to_string()))?;
                    }
                }
            }
            if misc != "_" {
                builder = builder.constraint(&token_vertex_id, "misc", misc);
            }
            if head != "_" {
                builder = builder.constraint(&token_vertex_id, "head", head);
                deferred_deps.push((
                    token_vertex_id.clone(),
                    head.to_string(),
                    deprel_str.to_string(),
                ));
            }
            if deps_col != "_" {
                deferred_enhanced.push((token_vertex_id.clone(), deps_col.to_string()));
            }
            token_ids.insert(id_col.to_string(), token_vertex_id);
        }
        // Resolve basic dependencies now that every token vertex exists.
        for (dep_vertex, head_col, deprel_col) in &deferred_deps {
            if head_col == "0" {
                // Root: attach the relation via an outgoing `dep` edge
                // to a dedicated `deprel` vertex.
                let deprel_id = format!("{dep_vertex}.deprel");
                builder = builder
                    .vertex(&deprel_id, "deprel", None)
                    .map_err(|e| ProtocolError::Parse(e.to_string()))?;
                builder = builder
                    .edge(dep_vertex, &deprel_id, "dep", Some(deprel_col))
                    .map_err(|e| ProtocolError::Parse(e.to_string()))?;
            } else if let Some(head_vertex) = token_ids.get(head_col) {
                // Non-root: head token -> dependent, named by DEPREL.
                builder = builder
                    .edge(head_vertex, dep_vertex, "dep", Some(deprel_col))
                    .map_err(|e| ProtocolError::Parse(e.to_string()))?;
            }
        }
        // Resolve enhanced dependencies ("head:rel|head:rel|...").
        for (dep_vertex, deps_col) in &deferred_enhanced {
            for (ei, pair) in deps_col.split('|').enumerate() {
                if let Some(colon_pos) = pair.find(':') {
                    let head_id = &pair[..colon_pos];
                    let relation = &pair[colon_pos + 1..];
                    let label = pair.to_string();
                    if head_id == "0" {
                        let edep_id = format!("{dep_vertex}.edep_{ei}");
                        builder = builder
                            .vertex(&edep_id, "deprel", None)
                            .map_err(|e| ProtocolError::Parse(e.to_string()))?;
                        builder = builder
                            .edge(dep_vertex, &edep_id, "enhanced-dep", Some(relation))
                            .map_err(|e| ProtocolError::Parse(e.to_string()))?;
                    } else if let Some(head_vertex) = token_ids.get(head_id) {
                        // The full "head:rel" label is kept so the
                        // emitter can round-trip it verbatim.
                        builder = builder
                            .edge(head_vertex, dep_vertex, "enhanced-dep", Some(&label))
                            .map_err(|e| ProtocolError::Parse(e.to_string()))?;
                    }
                }
            }
        }
        // Wrap the whole sentence in one hyper-edge over its tokens.
        if !token_ids.is_empty() {
            let he_id = format!("he_{he_counter}");
            he_counter += 1;
            let sig: HashMap<String, String> = token_ids
                .iter()
                .map(|(label, vid)| (label.clone(), vid.clone()))
                .collect();
            builder = builder
                .hyper_edge(&he_id, "sentence", sig, &sent_id)
                .map_err(|e| ProtocolError::Parse(e.to_string()))?;
        }
    }
    let schema = builder.build()?;
    Ok(schema)
}

/// Extract the value of a `key = value` CoNLL-U comment (already
/// stripped of its leading `#`).  Returns `None` unless `trimmed`
/// starts with `key` followed by end-of-line, whitespace, or `=`, so
/// that e.g. `text_en` does not match the key `text`.
fn comment_value<'a>(trimmed: &'a str, key: &str) -> Option<&'a str> {
    let rest = trimmed.strip_prefix(key)?;
    if !rest.is_empty() && !rest.starts_with(|c: char| c.is_whitespace() || c == '=') {
        return None;
    }
    Some(rest.trim().trim_start_matches('=').trim())
}
/// Serialize a [`Schema`] produced by [`parse_conllu`] back into
/// CoNLL-U text.
///
/// Sentences are emitted in lexicographic order of their vertex ids.
/// Each sentence prints its `newdoc` / `newpar` / `sent_id` / `text`
/// comments, then one 10-column line per contained token (ordered by
/// CoNLL-U id: multiword ranges first, then words, then empty nodes),
/// then a blank separator line.
///
/// # Errors
///
/// Currently never fails; the `Result` return type keeps the signature
/// uniform with the parser side.
#[allow(clippy::too_many_lines)]
pub fn emit_conllu(schema: &Schema) -> Result<String, ProtocolError> {
    let mut output = String::new();
    let mut sentences: Vec<_> = schema
        .vertices
        .values()
        .filter(|v| v.kind == "sentence")
        .collect();
    sentences.sort_by(|a, b| a.id.cmp(&b.id));
    for sentence in &sentences {
        // A stored value of "true" means the bare comment form was
        // parsed; anything else round-trips as "# newdoc id = ...".
        if let Some(newdoc) = constraint_value(schema, &sentence.id, "newdoc") {
            if newdoc == "true" {
                output.push_str("# newdoc\n");
            } else {
                output.push_str("# newdoc id = ");
                output.push_str(newdoc);
                output.push('\n');
            }
        }
        if let Some(newpar) = constraint_value(schema, &sentence.id, "newpar") {
            if newpar == "true" {
                output.push_str("# newpar\n");
            } else {
                output.push_str("# newpar id = ");
                output.push_str(newpar);
                output.push('\n');
            }
        }
        if let Some(sid) = constraint_value(schema, &sentence.id, "sent-id") {
            output.push_str("# sent_id = ");
            output.push_str(sid);
            output.push('\n');
        }
        if let Some(text) = constraint_value(schema, &sentence.id, "text") {
            output.push_str("# text = ");
            output.push_str(text);
            output.push('\n');
        }
        let tokens = children_by_edge(schema, &sentence.id, "contains");
        // Rebuild each token's DEPS column from its enhanced-dep edges,
        // keyed by the token vertex id.
        let mut enhanced_map: HashMap<String, Vec<String>> = HashMap::new();
        for (_edge, token_vertex) in &tokens {
            let incoming = schema.incoming_edges(&token_vertex.id);
            // Incoming enhanced-dep edges come from real head tokens.
            // Edges whose label already contains ':' (parsed from a full
            // "head:rel") are kept verbatim; bare relations are
            // re-prefixed with the head's CoNLL-U id.
            let mut edeps: Vec<String> = incoming
                .iter()
                .filter(|e| e.kind == "enhanced-dep")
                .filter_map(|e| {
                    let head_id = constraint_value(schema, &e.src, "id-range").unwrap_or("0");
                    e.name.as_deref().map(|label| {
                        if label.contains(':') {
                            label.to_string()
                        } else {
                            format!("{head_id}:{label}")
                        }
                    })
                })
                .collect();
            // Root enhanced deps (head 0) live on outgoing edges to a
            // dedicated `deprel` vertex.
            let outgoing = schema.outgoing_edges(&token_vertex.id);
            for e in outgoing.iter().filter(|e| e.kind == "enhanced-dep") {
                if let Some(tgt_v) = schema.vertices.get(&e.tgt) {
                    if tgt_v.kind == "deprel" {
                        if let Some(rel) = e.name.as_deref() {
                            edeps.push(format!("0:{rel}"));
                        }
                    }
                }
            }
            if !edeps.is_empty() {
                // DEPS entries must be ordered by numeric head id; parse
                // as f64 so empty-node heads like "3.1" sort correctly.
                edeps.sort_by(|a, b| {
                    let a_head = a.split(':').next().unwrap_or("0");
                    let b_head = b.split(':').next().unwrap_or("0");
                    let a_n = a_head.parse::<f64>().unwrap_or(0.0);
                    let b_n = b_head.parse::<f64>().unwrap_or(0.0);
                    a_n.partial_cmp(&b_n).unwrap_or(std::cmp::Ordering::Equal)
                });
                enhanced_map.insert(token_vertex.id.to_string(), edeps);
            }
        }
        // (id column, rendered line) pairs; sorted by CoNLL-U id below.
        let mut token_lines: Vec<(String, String)> = Vec::new();
        for (_edge, token_vertex) in &tokens {
            let id_col = constraint_value(schema, &token_vertex.id, "id-range").unwrap_or("_");
            let form = constraint_value(schema, &token_vertex.id, "form").unwrap_or("_");
            // Only plain words carry lemma/UPOS/XPOS/FEATS satellites;
            // multiword ranges and empty nodes emit "_".
            let is_word = token_vertex.kind == "word";
            let lemma = if is_word {
                let lemma_edges = children_by_edge(schema, &token_vertex.id, "lemma-of");
                lemma_edges
                    .first()
                    .and_then(|(e, _)| e.name.as_deref())
                    .unwrap_or("_")
            } else {
                "_"
            };
            let upos = if is_word {
                let upos_edges = children_by_edge(schema, &token_vertex.id, "upos");
                upos_edges
                    .first()
                    .and_then(|(e, _)| e.name.as_deref())
                    .unwrap_or("_")
            } else {
                "_"
            };
            let xpos = if is_word {
                let xpos_edges = children_by_edge(schema, &token_vertex.id, "xpos");
                xpos_edges
                    .first()
                    .and_then(|(e, _)| e.name.as_deref())
                    .unwrap_or("_")
            } else {
                "_"
            };
            let feats = if is_word {
                let feat_edges = children_by_edge(schema, &token_vertex.id, "feat");
                if feat_edges.is_empty() {
                    "_".to_string()
                } else {
                    feat_edges
                        .iter()
                        .filter_map(|(e, _)| e.name.as_deref())
                        .collect::<Vec<_>>()
                        .join("|")
                }
            } else {
                "_".to_string()
            };
            let head = constraint_value(schema, &token_vertex.id, "head").unwrap_or("_");
            let deprel = {
                // Non-root tokens carry their relation on the incoming
                // `dep` edge from the head.
                let incoming_dep = schema
                    .incoming_edges(&token_vertex.id)
                    .iter()
                    .find(|e| e.kind == "dep")
                    .and_then(|e| e.name.as_deref());
                // Root tokens (head = 0) carry theirs on an outgoing
                // `dep` edge to a dedicated `deprel` vertex.  Filter on
                // the target kind: a head token also has outgoing `dep`
                // edges to its dependents, and picking one of those
                // would emit the dependent's relation instead of the
                // root's.
                let root_dep = schema
                    .outgoing_edges(&token_vertex.id)
                    .iter()
                    .find(|e| {
                        e.kind == "dep"
                            && schema
                                .vertices
                                .get(&e.tgt)
                                .is_some_and(|v| v.kind == "deprel")
                    })
                    .and_then(|e| e.name.as_deref());
                incoming_dep.or(root_dep).unwrap_or("_")
            };
            let deps: String = enhanced_map
                .get(token_vertex.id.as_str())
                .map_or_else(|| "_".to_string(), |v| v.join("|"));
            let misc = constraint_value(schema, &token_vertex.id, "misc").unwrap_or("_");
            let line = format!(
                "{id_col}\t{form}\t{lemma}\t{upos}\t{xpos}\t{feats}\t{head}\t{deprel}\t{deps}\t{misc}"
            );
            token_lines.push((id_col.to_string(), line));
        }
        token_lines.sort_by(|a, b| cmp_conllu_id(&a.0, &b.0));
        for (_, line) in &token_lines {
            output.push_str(line);
            output.push('\n');
        }
        // Blank line terminates each sentence.
        output.push('\n');
    }
    Ok(output)
}
/// Classify a CoNLL-U id column: a range like "2-3" is a multiword
/// token, a decimal like "1.1" is an empty node, anything else is a
/// plain word.
fn classify_id(id: &str) -> String {
    match (id.contains('-'), id.contains('.')) {
        (true, _) => "multiword",
        (false, true) => "empty",
        (false, false) => "word",
    }
    .into()
}
/// Split raw CoNLL-U text into sentences, each a pair of
/// (comment lines, token lines).  A blank line ends the current
/// sentence; a comment appearing after token lines also ends it and
/// opens the next one (comments belong to the sentence that follows).
fn split_sentences(input: &str) -> Vec<(Vec<String>, Vec<String>)> {
    let mut out: Vec<(Vec<String>, Vec<String>)> = Vec::new();
    let mut pending_comments: Vec<String> = Vec::new();
    let mut pending_tokens: Vec<String> = Vec::new();
    for raw in input.lines() {
        let line = raw.trim();
        if line.is_empty() || line.starts_with('#') {
            // Both a blank line and a comment close off any token run
            // currently in progress.
            if !pending_tokens.is_empty() {
                out.push((
                    std::mem::take(&mut pending_comments),
                    std::mem::take(&mut pending_tokens),
                ));
            }
            if !line.is_empty() {
                pending_comments.push(line.to_string());
            }
        } else {
            pending_tokens.push(line.to_string());
        }
    }
    // Flush a trailing sentence not terminated by a blank line.
    if !pending_tokens.is_empty() {
        out.push((pending_comments, pending_tokens));
    }
    out
}
/// Total order on CoNLL-U id columns, delegating to
/// [`conllu_id_sort_key`].
fn cmp_conllu_id(a: &str, b: &str) -> std::cmp::Ordering {
    conllu_id_sort_key(a).cmp(&conllu_id_sort_key(b))
}
/// Sort key (major, minor, rank) for a CoNLL-U id column.  The rank
/// places a multiword range (0) before the word it starts at (1), and
/// empty nodes (2) after their anchor word; unparseable parts fall
/// back to 0.
fn conllu_id_sort_key(id: &str) -> (u32, u32, u8) {
    let num = |s: &str| s.parse::<u32>().unwrap_or(0);
    if let Some((lo, _hi)) = id.split_once('-') {
        (num(lo), 0, 0)
    } else if let Some((whole, frac)) = id.split_once('.') {
        (num(whole), num(frac), 2)
    } else {
        (num(id), 0, 1)
    }
}
/// Edge rules of the CoNLL-U protocol: which vertex kinds each edge
/// kind may connect.
fn edge_rules() -> Vec<EdgeRule> {
    // Small local constructor to keep the table below readable.
    let rule = |kind: &str, srcs: &[&str], tgts: &[&str]| EdgeRule {
        edge_kind: kind.into(),
        src_kinds: srcs.iter().map(|s| (*s).into()).collect(),
        tgt_kinds: tgts.iter().map(|s| (*s).into()).collect(),
    };
    vec![
        rule("contains", &["sentence"], &["word", "multiword", "empty"]),
        rule("dep", &["word", "empty"], &["word", "empty", "deprel"]),
        rule("enhanced-dep", &["word", "empty"], &["word", "empty", "deprel"]),
        rule("feat", &["word"], &["feature"]),
        rule("upos", &["word"], &["upos-tag"]),
        rule("xpos", &["word"], &["xpos-tag"]),
        rule("lemma-of", &["word"], &["lemma"]),
    ]
}
#[cfg(test)]
#[allow(clippy::expect_used, clippy::unwrap_used)]
mod tests {
    use super::*;

    // The protocol declaration exposes the expected theory names, edge
    // rules, object kinds, and constraint sorts.
    #[test]
    fn protocol_def() {
        let p = protocol();
        assert_eq!(p.name, "conllu");
        assert_eq!(p.schema_theory, "ThConlluSchema");
        assert_eq!(p.instance_theory, "ThConlluInstance");
        assert!(p.find_edge_rule("contains").is_some());
        assert!(p.find_edge_rule("dep").is_some());
        assert!(p.find_edge_rule("enhanced-dep").is_some());
        assert!(p.find_edge_rule("feat").is_some());
        assert!(p.find_edge_rule("upos").is_some());
        assert!(p.find_edge_rule("xpos").is_some());
        assert!(p.find_edge_rule("lemma-of").is_some());
        // Tokens are modelled as word/multiword/empty, never "token";
        // deprel is a vertex kind, not a constraint sort.
        assert!(!p.obj_kinds.contains(&"token".to_string()));
        assert!(!p.constraint_sorts.contains(&"deprel".to_string()));
    }

    // Registration publishes both theories under their declared names.
    #[test]
    fn register_theories_works() {
        let mut registry = HashMap::new();
        register_theories(&mut registry);
        assert!(registry.contains_key("ThConlluSchema"));
        assert!(registry.contains_key("ThConlluInstance"));
    }

    // Full cycle: parse a small sentence, check the resulting vertices
    // and constraints, then emit and re-parse without losing vertices.
    // NOTE: the "\" at the end of the first literal line is Rust's
    // line-continuation escape; content lines must stay at column 0.
    #[test]
    fn parse_and_emit_roundtrip() {
        let conllu_text = "\
# sent_id = test-01
# text = The cat sat.
1\tThe\tthe\tDET\tDT\tDefinite=Def|PronType=Art\t2\tdet\t_\t_
2\tcat\tcat\tNOUN\tNN\tNumber=Sing\t3\tnsubj\t_\t_
3\tsat\tsit\tVERB\tVBD\tMood=Ind|Tense=Past|VerbForm=Fin\t0\troot\t_\tSpaceAfter=No
";
        let schema = parse_conllu(conllu_text).expect("should parse");
        assert!(schema.has_vertex("sent_0"));
        assert_eq!(schema.vertices.get("sent_0").unwrap().kind, "sentence");
        assert_eq!(
            constraint_value(&schema, "sent_0", "sent-id"),
            Some("test-01")
        );
        assert_eq!(
            constraint_value(&schema, "sent_0", "text"),
            Some("The cat sat.")
        );
        assert!(schema.has_vertex("sent_0.tok_1"));
        assert!(schema.has_vertex("sent_0.tok_2"));
        assert!(schema.has_vertex("sent_0.tok_3"));
        assert_eq!(schema.vertices.get("sent_0.tok_1").unwrap().kind, "word");
        assert_eq!(
            constraint_value(&schema, "sent_0.tok_1", "form"),
            Some("The")
        );
        assert_eq!(
            constraint_value(&schema, "sent_0.tok_3", "misc"),
            Some("SpaceAfter=No")
        );
        // Satellite vertices for UPOS and lemma.
        assert!(schema.has_vertex("sent_0.tok_1.upos"));
        assert_eq!(
            schema.vertices.get("sent_0.tok_1.upos").unwrap().kind,
            "upos-tag"
        );
        assert!(schema.has_vertex("sent_0.tok_1.lemma"));
        assert_eq!(
            schema.vertices.get("sent_0.tok_1.lemma").unwrap().kind,
            "lemma"
        );
        let emitted = emit_conllu(&schema).expect("should emit");
        let schema2 = parse_conllu(&emitted).expect("should re-parse");
        assert_eq!(schema.vertex_count(), schema2.vertex_count());
    }

    // Empty input is a parse error ("no sentences found").
    #[test]
    fn parse_empty_input_fails() {
        let result = parse_conllu("");
        assert!(result.is_err());
    }

    // A range id like "2-3" is classified as a multiword token vertex.
    #[test]
    fn parse_multiword_token() {
        let conllu_text = "\
1\tI\tI\tPRON\tPRP\t_\t2\tnsubj\t_\t_
2-3\tdon't\t_\t_\t_\t_\t_\t_\t_\t_
2\tdo\tdo\tAUX\tVBP\t_\t0\troot\t_\t_
3\tnot\tnot\tPART\tRB\t_\t2\tadvmod\t_\t_
";
        let schema = parse_conllu(conllu_text).expect("should parse multiword");
        assert!(schema.has_vertex("sent_0.tok_2-3"));
        assert_eq!(
            schema.vertices.get("sent_0.tok_2-3").unwrap().kind,
            "multiword"
        );
    }

    // Empty nodes ("1.1") never get lemma/UPOS/XPOS satellites even
    // when those columns are filled in.
    #[test]
    fn empty_nodes_no_upos_xpos_lemma() {
        let conllu_text = "\
1\tThey\tthey\tPRON\tPRP\t_\t2\tnsubj\t_\t_
1.1\tare\tbe\tAUX\tVBZ\t_\t_\t_\t2:cop\t_
2\thappy\thappy\tADJ\tJJ\t_\t0\troot\t_\t_
";
        let schema = parse_conllu(conllu_text).expect("should parse empty node");
        assert!(schema.has_vertex("sent_0.tok_1.1"));
        assert_eq!(schema.vertices.get("sent_0.tok_1.1").unwrap().kind, "empty");
        assert!(!schema.has_vertex("sent_0.tok_1.1.upos"));
        assert!(!schema.has_vertex("sent_0.tok_1.1.xpos"));
        assert!(!schema.has_vertex("sent_0.tok_1.1.lemma"));
    }

    // The DEPS column "2:cop" becomes an enhanced-dep edge from the
    // head token (tok_2) to the dependent (tok_1.1).
    #[test]
    fn enhanced_deps_parsed() {
        let conllu_text = "\
1\tThey\tthey\tPRON\tPRP\t_\t2\tnsubj\t_\t_
1.1\tare\tbe\tAUX\tVBZ\t_\t_\t_\t2:cop\t_
2\thappy\thappy\tADJ\tJJ\t_\t0\troot\t_\t_
";
        let schema = parse_conllu(conllu_text).expect("should parse enhanced deps");
        let enhanced = schema.edges_between("sent_0.tok_2", "sent_0.tok_1.1");
        assert!(
            enhanced.iter().any(|e| e.kind == "enhanced-dep"),
            "expected enhanced-dep edge from tok_2 to tok_1.1"
        );
    }

    // Bare "# newdoc" stores "true"; "# newpar id = x" stores the id.
    #[test]
    fn newpar_newdoc_comments() {
        let conllu_text = "\
# newdoc
# newpar id = par-1
# sent_id = s1
1\tHello\thello\tINTJ\tUH\t_\t0\troot\t_\t_
";
        let schema = parse_conllu(conllu_text).expect("should parse");
        assert_eq!(constraint_value(&schema, "sent_0", "newdoc"), Some("true"));
        assert_eq!(constraint_value(&schema, "sent_0", "newpar"), Some("par-1"));
        assert_eq!(constraint_value(&schema, "sent_0", "sent-id"), Some("s1"));
    }

    // deprel is an object kind only — relations are edge names / deprel
    // vertices, never constraints.
    #[test]
    fn deprel_not_in_constraint_sorts() {
        let p = protocol();
        assert!(p.obj_kinds.contains(&"deprel".to_string()));
        assert!(!p.constraint_sorts.contains(&"deprel".to_string()));
    }

    // Morphology/lemma edges may only originate from plain words.
    #[test]
    fn upos_xpos_feat_lemma_rules_word_only() {
        let p = protocol();
        for kind in &["upos", "xpos", "feat", "lemma-of"] {
            let rule = p
                .find_edge_rule(kind)
                .unwrap_or_else(|| panic!("no rule for {kind}"));
            assert_eq!(
                rule.src_kinds,
                vec!["word".to_string()],
                "edge rule '{kind}' should only allow 'word' sources"
            );
        }
    }
}