1use std::collections::{HashMap, HashSet};
11use std::hash::BuildHasher;
12
13use panproto_gat::Theory;
14use panproto_schema::{EdgeRule, Protocol, Schema, SchemaBuilder};
15
16use crate::emit::{children_by_edge, constraint_value, vertex_constraints};
17use crate::error::ProtocolError;
18use crate::theories;
19
/// Property name in the predicate-level `factuality` subspace.
const FACTUAL: &str = "factual";

/// Property names in the predicate-level `genericity` subspace.
const PRED_PARTICULAR: &str = "pred-particular";
const PRED_DYNAMIC: &str = "pred-dynamic";
const PRED_HYPOTHETICAL: &str = "pred-hypothetical";

/// Property names in the argument-level `genericity` subspace.
const ARG_PARTICULAR: &str = "arg-particular";
const ARG_KIND: &str = "arg-kind";
const ARG_ABSTRACT: &str = "arg-abstract";

/// Proto-role properties recognized in the `protoroles` object attached
/// to predicate->argument edges (see `parse_semantics`); unknown keys in
/// the input are ignored.
const PROTOROLE_PROPERTIES: &[&str] = &[
    "awareness",
    "change_of_location",
    "change_of_possession",
    "change_of_state",
    "existed_before",
    "existed_after",
    "existed_during",
    "instigation",
    "location",
    "manner",
    "partitive",
    "purpose",
    "sentient",
    "time",
    "volition",
    "was_for_benefit",
    "was_used",
    "change_of_state_continuous",
];

/// Properties of the predicate-level `event_structure` subspace.
const EVENT_STRUCTURE_PROPERTIES: &[&str] = &[
    "distributive",
    "dynamic",
    "natural_parts",
    "part_similarity",
    "telic",
];

/// Properties of the predicate-level `time` subspace: duration buckets
/// plus the `instant` and `forever` endpoints.
const TIME_GRANULARITIES: &[&str] = &[
    "dur-seconds",
    "dur-minutes",
    "dur-hours",
    "dur-days",
    "dur-weeks",
    "dur-months",
    "dur-years",
    "dur-decades",
    "dur-centuries",
    "instant",
    "forever",
];

/// Properties of the argument-level `wordsense` subspace (all entries
/// are noun supersense labels).
const WORDSENSE_PROPERTIES: &[&str] = &[
    "supersense-noun.act",
    "supersense-noun.animal",
    "supersense-noun.artifact",
    "supersense-noun.attribute",
    "supersense-noun.body",
    "supersense-noun.cognition",
    "supersense-noun.communication",
    "supersense-noun.event",
    "supersense-noun.feeling",
    "supersense-noun.food",
    "supersense-noun.group",
    "supersense-noun.location",
    "supersense-noun.motive",
    "supersense-noun.object",
    "supersense-noun.person",
    "supersense-noun.phenomenon",
    "supersense-noun.plant",
    "supersense-noun.possession",
    "supersense-noun.process",
    "supersense-noun.quantity",
    "supersense-noun.relation",
    "supersense-noun.shape",
    "supersense-noun.state",
    "supersense-noun.substance",
    "supersense-noun.time",
    "supersense-noun.tops",
];
107
108#[must_use]
112pub fn protocol() -> Protocol {
113 Protocol {
114 name: "decomp".into(),
115 schema_theory: "ThDecompSchema".into(),
116 instance_theory: "ThDecompInstance".into(),
117 edge_rules: edge_rules(),
118 obj_kinds: vec![
119 "corpus".into(),
121 "document".into(),
122 "sentence".into(),
123 "token".into(),
125 "predicate".into(),
127 "argument".into(),
128 "string".into(),
130 "integer".into(),
131 "float".into(),
132 "boolean".into(),
133 ],
134 constraint_sorts: vec![
135 "domain".into(),
137 "type".into(),
138 "position".into(),
139 "form".into(),
140 "lemma".into(),
141 "upos".into(),
142 "xpos".into(),
143 "deprel".into(),
144 "frompredpatt".into(),
146 "value".into(),
148 "confidence".into(),
149 "subspace".into(),
151 "property".into(),
152 ],
153 has_order: true,
154 ..Protocol::default()
155 }
156}
157
/// Registers the GAT theories backing this protocol under the names
/// "ThDecompSchema" and "ThDecompInstance" (delegates to the shared
/// constrained-multigraph-with-type registration helper).
pub fn register_theories<S: BuildHasher>(registry: &mut HashMap<String, Theory, S>) {
    theories::register_constrained_multigraph_wtype(registry, "ThDecompSchema", "ThDecompInstance");
}
162
163#[allow(clippy::too_many_lines)]
218pub fn parse_decomp(json: &serde_json::Value) -> Result<Schema, ProtocolError> {
219 let proto = protocol();
220 let mut builder = SchemaBuilder::new(&proto);
221 let mut known: HashSet<String> = HashSet::new();
223
224 let corpus_id = json
226 .get("corpus_id")
227 .and_then(serde_json::Value::as_str)
228 .unwrap_or("corpus")
229 .to_string();
230
231 builder = builder
232 .vertex(&corpus_id, "corpus", None)
233 .map_err(|e| ProtocolError::Parse(e.to_string()))?;
234 known.insert(corpus_id.clone());
235 builder = builder.constraint(&corpus_id, "domain", "root");
236 builder = builder.constraint(&corpus_id, "type", "corpus");
237
238 let documents = json
240 .get("documents")
241 .and_then(serde_json::Value::as_object)
242 .ok_or_else(|| ProtocolError::MissingField("documents".into()))?;
243
244 for (doc_key, doc_val) in documents {
245 let doc_vid = format!("{corpus_id}.{doc_key}");
246 builder = builder
247 .vertex(&doc_vid, "document", None)
248 .map_err(|e| ProtocolError::Parse(e.to_string()))?;
249 known.insert(doc_vid.clone());
250 builder = builder.constraint(&doc_vid, "domain", "document");
251 builder = builder.constraint(&doc_vid, "type", "document");
252 builder = builder
253 .edge(&corpus_id, &doc_vid, "contains", Some(doc_key))
254 .map_err(|e| ProtocolError::Parse(e.to_string()))?;
255
256 let sentences = doc_val
258 .get("sentences")
259 .and_then(serde_json::Value::as_object)
260 .ok_or_else(|| ProtocolError::MissingField(format!("{doc_key}.sentences")))?;
261
262 for (sent_key, sent_val) in sentences {
263 let sent_vid = format!("{doc_vid}.{sent_key}");
264 builder = builder
265 .vertex(&sent_vid, "sentence", None)
266 .map_err(|e| ProtocolError::Parse(e.to_string()))?;
267 known.insert(sent_vid.clone());
268 builder = builder.constraint(&sent_vid, "domain", "syntax");
269 builder = builder.constraint(&sent_vid, "type", "sentence");
270 builder = builder
271 .edge(&doc_vid, &sent_vid, "contains", Some(sent_key))
272 .map_err(|e| ProtocolError::Parse(e.to_string()))?;
273
274 builder = parse_syntax_tokens(builder, sent_val, &sent_vid, &mut known)
275 .map_err(|e| ProtocolError::Parse(e.to_string()))?;
276
277 builder = parse_semantics(builder, sent_val, &sent_vid, &known)
278 .map_err(|e| ProtocolError::Parse(e.to_string()))?;
279 }
280 }
281
282 let schema = builder.build()?;
283 Ok(schema)
284}
285
286fn parse_syntax_tokens(
288 mut builder: SchemaBuilder,
289 sent_val: &serde_json::Value,
290 sent_vid: &str,
291 known: &mut HashSet<String>,
292) -> Result<SchemaBuilder, panproto_schema::SchemaError> {
293 let Some(tokens) = sent_val
294 .pointer("/syntax/tokens")
295 .and_then(serde_json::Value::as_object)
296 else {
297 return Ok(builder);
298 };
299
300 for (pos_str, tok_val) in tokens {
301 let tok_vid = format!("{sent_vid}.tok_{pos_str}");
302 builder = builder.vertex(&tok_vid, "token", None)?;
303 known.insert(tok_vid.clone());
304 builder = builder.constraint(&tok_vid, "domain", "syntax");
305 builder = builder.constraint(&tok_vid, "type", "token");
306 builder = builder.constraint(&tok_vid, "position", pos_str);
307 builder = builder.edge(sent_vid, &tok_vid, "syntax-dep", Some(pos_str))?;
308
309 for field in &["form", "lemma", "upos", "xpos", "deprel"] {
310 if let Some(v) = tok_val.get(field).and_then(serde_json::Value::as_str) {
311 builder = builder.constraint(&tok_vid, field, v);
312 }
313 }
314 }
315
316 Ok(builder)
317}
318
/// Adds one sentence's semantic layer: `predicate`/`argument` vertices,
/// their head/span links into the syntax tokens, per-node annotation
/// subspaces, and annotated predicate->argument `sem-dep` edges.
///
/// `known` holds the syntax-token vertex ids produced by
/// `parse_syntax_tokens`; head/span references to tokens not in it are
/// silently dropped. Sentences without a `semantics` object are a no-op.
#[allow(clippy::too_many_lines)]
fn parse_semantics(
    mut builder: SchemaBuilder,
    sent_val: &serde_json::Value,
    sent_vid: &str,
    known: &HashSet<String>,
) -> Result<SchemaBuilder, panproto_schema::SchemaError> {
    let Some(sem) = sent_val
        .get("semantics")
        .and_then(serde_json::Value::as_object)
    else {
        return Ok(builder);
    };

    // Predicate/argument vertex ids created in this sentence; entries in
    // the "edges" object may only connect nodes recorded here.
    let mut sem_known: HashSet<String> = HashSet::new();

    if let Some(preds) = sem.get("predicates").and_then(serde_json::Value::as_object) {
        for (pred_key, pred_val) in preds {
            let pred_vid = format!("{sent_vid}.{pred_key}");
            builder = builder.vertex(&pred_vid, "predicate", None)?;
            sem_known.insert(pred_vid.clone());
            builder = builder.constraint(&pred_vid, "domain", "semantics");
            builder = builder.constraint(&pred_vid, "type", "predicate");
            builder = builder.edge(sent_vid, &pred_vid, "contains", Some(pred_key))?;

            // Normalize `frompredpatt` to the strings "true"/"false";
            // non-boolean JSON values are treated as false.
            if let Some(fp) = pred_val.get("frompredpatt") {
                let fp_str = if fp.as_bool().unwrap_or(false) {
                    "true"
                } else {
                    "false"
                };
                builder = builder.constraint(&pred_vid, "frompredpatt", fp_str);
            }

            // Head token: stored as a `head` edge named after the string
            // token position. NOTE(review): a numeric `head_token` would
            // fail `as_str` and be dropped — confirm the input always
            // uses strings.
            if let Some(head_pos) = pred_val
                .get("head_token")
                .and_then(serde_json::Value::as_str)
            {
                let tok_vid = format!("{sent_vid}.tok_{head_pos}");
                if known.contains(&tok_vid) {
                    builder = builder.edge(&pred_vid, &tok_vid, "head", Some(head_pos))?;
                }
            }

            // Span tokens: `added` de-duplicates repeated positions so at
            // most one `nonhead` edge is emitted per token.
            if let Some(span_arr) = pred_val
                .get("span_tokens")
                .and_then(serde_json::Value::as_array)
            {
                let mut added: HashSet<String> = HashSet::new();
                for tok_pos in span_arr.iter().filter_map(serde_json::Value::as_str) {
                    let tok_vid = format!("{sent_vid}.tok_{tok_pos}");
                    if known.contains(&tok_vid) && added.insert(tok_vid.clone()) {
                        builder = builder.edge(&pred_vid, &tok_vid, "nonhead", Some(tok_pos))?;
                    }
                }
            }

            // Predicate-level annotation subspaces.
            builder = parse_subspace(builder, pred_val, "factuality", &[FACTUAL], &pred_vid)?;
            builder = parse_subspace(
                builder,
                pred_val,
                "genericity",
                &[PRED_PARTICULAR, PRED_DYNAMIC, PRED_HYPOTHETICAL],
                &pred_vid,
            )?;
            builder = parse_subspace(builder, pred_val, "time", TIME_GRANULARITIES, &pred_vid)?;
            builder = parse_subspace(
                builder,
                pred_val,
                "event_structure",
                EVENT_STRUCTURE_PROPERTIES,
                &pred_vid,
            )?;
        }
    }

    if let Some(args) = sem.get("arguments").and_then(serde_json::Value::as_object) {
        for (arg_key, arg_val) in args {
            let arg_vid = format!("{sent_vid}.{arg_key}");
            builder = builder.vertex(&arg_vid, "argument", None)?;
            sem_known.insert(arg_vid.clone());
            builder = builder.constraint(&arg_vid, "domain", "semantics");
            builder = builder.constraint(&arg_vid, "type", "argument");
            builder = builder.edge(sent_vid, &arg_vid, "contains", Some(arg_key))?;

            // Same head-token handling as predicates above.
            if let Some(head_pos) = arg_val
                .get("head_token")
                .and_then(serde_json::Value::as_str)
            {
                let tok_vid = format!("{sent_vid}.tok_{head_pos}");
                if known.contains(&tok_vid) {
                    builder = builder.edge(&arg_vid, &tok_vid, "head", Some(head_pos))?;
                }
            }

            // Same de-duplicated span handling as predicates above.
            if let Some(span_arr) = arg_val
                .get("span_tokens")
                .and_then(serde_json::Value::as_array)
            {
                let mut added: HashSet<String> = HashSet::new();
                for tok_pos in span_arr.iter().filter_map(serde_json::Value::as_str) {
                    let tok_vid = format!("{sent_vid}.tok_{tok_pos}");
                    if known.contains(&tok_vid) && added.insert(tok_vid.clone()) {
                        builder = builder.edge(&arg_vid, &tok_vid, "nonhead", Some(tok_pos))?;
                    }
                }
            }

            // Argument-level annotation subspaces.
            builder = parse_subspace(
                builder,
                arg_val,
                "genericity",
                &[ARG_PARTICULAR, ARG_KIND, ARG_ABSTRACT],
                &arg_vid,
            )?;
            builder = parse_subspace(
                builder,
                arg_val,
                "wordsense",
                WORDSENSE_PROPERTIES,
                &arg_vid,
            )?;
        }
    }

    if let Some(edges) = sem.get("edges").and_then(serde_json::Value::as_object) {
        for (edge_key, edge_val) in edges {
            // Edge keys are "{pred}$${arg}"; malformed keys are skipped,
            // as are edges referencing nodes not created above.
            let Some((pred_key, arg_key)) = edge_key.split_once("$$") else {
                continue;
            };
            let pred_vid = format!("{sent_vid}.{pred_key}");
            let arg_vid = format!("{sent_vid}.{arg_key}");
            if !sem_known.contains(&pred_vid) || !sem_known.contains(&arg_vid) {
                continue;
            }
            builder = builder.edge(&pred_vid, &arg_vid, "sem-dep", Some(edge_key))?;

            // Edge-level protorole annotations hang off the predicate as
            // `prop` children with ids `{pred}.pr.{arg_key}.{prop}`; only
            // properties in PROTOROLE_PROPERTIES are accepted.
            if let Some(protoroles) = edge_val
                .get("protoroles")
                .and_then(serde_json::Value::as_object)
            {
                for prop in PROTOROLE_PROPERTIES {
                    if let Some(ann) = protoroles.get(*prop) {
                        let prop_vid = format!("{pred_vid}.pr.{arg_key}.{prop}");
                        builder = builder.vertex(&prop_vid, "float", None)?;
                        builder = builder.constraint(&prop_vid, "subspace", "protoroles");
                        builder = builder.constraint(&prop_vid, "property", prop);
                        if let Some(v) = ann.get("value").and_then(serde_json::Value::as_f64) {
                            builder = builder.constraint(&prop_vid, "value", &v.to_string());
                        }
                        if let Some(c) = ann.get("confidence").and_then(serde_json::Value::as_f64) {
                            builder = builder.constraint(&prop_vid, "confidence", &c.to_string());
                        }
                        builder = builder.edge(&pred_vid, &prop_vid, "prop", Some(prop))?;
                    }
                }
            }

            // Edge-level event_structure annotations, id'd
            // `{pred}.es.{arg_key}.{key}`. Unlike protoroles, any key is
            // accepted, and the vertex kind is `boolean` (the value is
            // still parsed as f64).
            if let Some(event_struct) = edge_val
                .get("event_structure")
                .and_then(serde_json::Value::as_object)
            {
                for (mero_key, ann) in event_struct {
                    let mero_vid = format!("{pred_vid}.es.{arg_key}.{mero_key}");
                    builder = builder.vertex(&mero_vid, "boolean", None)?;
                    builder = builder.constraint(&mero_vid, "subspace", "event_structure");
                    builder = builder.constraint(&mero_vid, "property", mero_key);
                    if let Some(v) = ann.get("value").and_then(serde_json::Value::as_f64) {
                        builder = builder.constraint(&mero_vid, "value", &v.to_string());
                    }
                    if let Some(c) = ann.get("confidence").and_then(serde_json::Value::as_f64) {
                        builder = builder.constraint(&mero_vid, "confidence", &c.to_string());
                    }
                    builder =
                        builder.edge(&pred_vid, &mero_vid, "prop", Some(mero_key.as_str()))?;
                }
            }
        }
    }

    Ok(builder)
}
510
511fn parse_subspace(
513 mut builder: SchemaBuilder,
514 node_val: &serde_json::Value,
515 subspace: &str,
516 known_props: &[&str],
517 parent_vid: &str,
518) -> Result<SchemaBuilder, panproto_schema::SchemaError> {
519 let Some(subspace_obj) = node_val
520 .get(subspace)
521 .and_then(serde_json::Value::as_object)
522 else {
523 return Ok(builder);
524 };
525
526 for prop in known_props {
527 if let Some(ann) = subspace_obj.get(*prop) {
528 let prop_vid = format!("{parent_vid}.{subspace}.{prop}");
529 builder = builder.vertex(&prop_vid, "float", None)?;
530 builder = builder.constraint(&prop_vid, "subspace", subspace);
531 builder = builder.constraint(&prop_vid, "property", prop);
532 if let Some(v) = ann.get("value").and_then(serde_json::Value::as_f64) {
533 builder = builder.constraint(&prop_vid, "value", &v.to_string());
534 }
535 if let Some(c) = ann.get("confidence").and_then(serde_json::Value::as_f64) {
536 builder = builder.constraint(&prop_vid, "confidence", &c.to_string());
537 }
538 builder = builder.edge(parent_vid, &prop_vid, "prop", Some(prop))?;
539 }
540 }
541
542 Ok(builder)
543}
544
545#[allow(clippy::too_many_lines)]
553pub fn emit_decomp(schema: &Schema) -> Result<serde_json::Value, ProtocolError> {
554 let corpus = schema
555 .vertices
556 .values()
557 .find(|v| v.kind == "corpus")
558 .ok_or_else(|| ProtocolError::Emit("no corpus vertex found".into()))?;
559
560 let corpus_id = corpus.id.to_string();
561 let mut documents_map = serde_json::Map::new();
562
563 for (_doc_edge, doc_vertex) in children_by_edge(schema, &corpus_id, "contains") {
564 let mut sentences_map = serde_json::Map::new();
565
566 for (_sent_edge, sent_vertex) in children_by_edge(schema, &doc_vertex.id, "contains") {
567 let sent_json = emit_sentence(schema, &sent_vertex.id);
568 let sent_key = sent_vertex.id.rsplit('.').next().unwrap_or(&sent_vertex.id);
569 sentences_map.insert(sent_key.to_string(), sent_json);
570 }
571
572 let doc_key = doc_vertex.id.rsplit('.').next().unwrap_or(&doc_vertex.id);
573 documents_map.insert(
574 doc_key.to_string(),
575 serde_json::json!({ "sentences": sentences_map }),
576 );
577 }
578
579 Ok(serde_json::json!({
580 "corpus_id": corpus_id,
581 "documents": documents_map,
582 }))
583}
584
585fn emit_sentence(schema: &Schema, sent_vid: &str) -> serde_json::Value {
587 let mut tokens_map = serde_json::Map::new();
589 for (_edge, tok_vertex) in children_by_edge(schema, sent_vid, "syntax-dep") {
590 let mut tok_obj = serde_json::Map::new();
591 for sort in &["form", "lemma", "upos", "xpos", "deprel"] {
592 if let Some(v) = constraint_value(schema, &tok_vertex.id, sort) {
593 tok_obj.insert((*sort).to_string(), serde_json::json!(v));
594 }
595 }
596 let pos = constraint_value(schema, &tok_vertex.id, "position").unwrap_or(&tok_vertex.id);
597 tokens_map.insert(pos.to_string(), serde_json::Value::Object(tok_obj));
598 }
599
600 let mut preds_map = serde_json::Map::new();
602 let mut args_map = serde_json::Map::new();
603 let mut edges_map = serde_json::Map::new();
604
605 for (_edge, child) in children_by_edge(schema, sent_vid, "contains") {
606 match child.kind.as_str() {
607 "predicate" => {
608 let pred_key = child.id.rsplit('.').next().unwrap_or(&child.id);
609 preds_map.insert(
610 pred_key.to_string(),
611 emit_sem_node(schema, &child.id, "predicate"),
612 );
613
614 for dep_edge in schema
616 .outgoing_edges(&child.id)
617 .iter()
618 .filter(|e| e.kind == "sem-dep")
619 {
620 let arg_vid = &dep_edge.tgt;
621 let arg_key = arg_vid.rsplit('.').next().unwrap_or(arg_vid.as_str());
622 let edge_key = dep_edge
623 .name
624 .as_ref()
625 .map_or_else(|| format!("{pred_key}$${arg_key}"), ToString::to_string);
626
627 let mut protoroles_map = serde_json::Map::new();
629 for (_prop_edge, prop_vertex) in children_by_edge(schema, &child.id, "prop") {
630 if constraint_value(schema, &prop_vertex.id, "subspace")
631 != Some("protoroles")
632 {
633 continue;
634 }
635 if !prop_vertex.id.contains(arg_key) {
636 continue;
637 }
638 if let Some(pname) = constraint_value(schema, &prop_vertex.id, "property") {
639 protoroles_map.insert(
640 pname.to_string(),
641 emit_annotation(schema, &prop_vertex.id),
642 );
643 }
644 }
645
646 let mut edge_obj = serde_json::Map::new();
647 if !protoroles_map.is_empty() {
648 edge_obj.insert(
649 "protoroles".into(),
650 serde_json::Value::Object(protoroles_map),
651 );
652 }
653 edges_map.insert(edge_key, serde_json::Value::Object(edge_obj));
654 }
655 }
656 "argument" => {
657 let arg_key = child.id.rsplit('.').next().unwrap_or(&child.id);
658 args_map.insert(
659 arg_key.to_string(),
660 emit_sem_node(schema, &child.id, "argument"),
661 );
662 }
663 _ => {}
664 }
665 }
666
667 let mut sem_obj = serde_json::Map::new();
668 if !preds_map.is_empty() {
669 sem_obj.insert("predicates".into(), serde_json::Value::Object(preds_map));
670 }
671 if !args_map.is_empty() {
672 sem_obj.insert("arguments".into(), serde_json::Value::Object(args_map));
673 }
674 if !edges_map.is_empty() {
675 sem_obj.insert("edges".into(), serde_json::Value::Object(edges_map));
676 }
677
678 serde_json::json!({
679 "syntax": { "tokens": tokens_map },
680 "semantics": sem_obj,
681 })
682}
683
684fn emit_sem_node(schema: &Schema, node_vid: &str, sem_type: &str) -> serde_json::Value {
686 let mut obj = serde_json::Map::new();
687 obj.insert("domain".into(), serde_json::json!("semantics"));
688 obj.insert("type".into(), serde_json::json!(sem_type));
689
690 if let Some(fp) = constraint_value(schema, node_vid, "frompredpatt") {
691 obj.insert("frompredpatt".into(), serde_json::json!(fp == "true"));
692 }
693
694 if let Some(head_edge) = schema
696 .outgoing_edges(node_vid)
697 .iter()
698 .find(|e| e.kind == "head")
699 {
700 if let Some(pos) = &head_edge.name {
701 obj.insert("head_token".into(), serde_json::json!(pos));
702 }
703 }
704
705 let nonhead: Vec<_> = schema
707 .outgoing_edges(node_vid)
708 .iter()
709 .filter(|e| e.kind == "nonhead")
710 .collect();
711 if !nonhead.is_empty() {
712 let span: Vec<serde_json::Value> = nonhead
713 .iter()
714 .filter_map(|e| e.name.as_deref().map(|n| serde_json::json!(n)))
715 .collect();
716 obj.insert("span_tokens".into(), serde_json::Value::Array(span));
717 }
718
719 let mut subspaces: HashMap<String, serde_json::Map<String, serde_json::Value>> = HashMap::new();
721 for (_prop_edge, prop_vertex) in children_by_edge(schema, node_vid, "prop") {
722 let sub = constraint_value(schema, &prop_vertex.id, "subspace");
723 let prop_name = constraint_value(schema, &prop_vertex.id, "property");
724 if sub == Some("protoroles") {
725 continue;
726 }
727 if let (Some(sub_str), Some(prop_str)) = (sub, prop_name) {
728 let ann = emit_annotation(schema, &prop_vertex.id);
729 subspaces
730 .entry(sub_str.to_string())
731 .or_default()
732 .insert(prop_str.to_string(), ann);
733 }
734 }
735 for (sub, props) in subspaces {
736 obj.insert(sub, serde_json::Value::Object(props));
737 }
738
739 serde_json::Value::Object(obj)
740}
741
742fn emit_annotation(schema: &Schema, vertex_id: &str) -> serde_json::Value {
744 let mut ann = serde_json::Map::new();
745 for c in vertex_constraints(schema, vertex_id) {
746 if c.sort == "value" || c.sort == "confidence" {
747 if let Ok(f) = c.value.parse::<f64>() {
748 ann.insert(c.sort.to_string(), serde_json::json!(f));
749 }
750 }
751 }
752 serde_json::Value::Object(ann)
753}
754
755fn edge_rules() -> Vec<EdgeRule> {
758 let sem_kinds = || vec!["predicate".to_string(), "argument".to_string()];
759 let scalar_kinds = || {
760 vec![
761 "string".to_string(),
762 "integer".to_string(),
763 "float".to_string(),
764 "boolean".to_string(),
765 ]
766 };
767
768 vec![
769 EdgeRule {
771 edge_kind: "contains".into(),
772 src_kinds: vec!["corpus".into(), "document".into(), "sentence".into()],
773 tgt_kinds: vec![
774 "document".into(),
775 "sentence".into(),
776 "predicate".into(),
777 "argument".into(),
778 ],
779 },
780 EdgeRule {
782 edge_kind: "syntax-dep".into(),
783 src_kinds: vec!["sentence".into(), "token".into()],
784 tgt_kinds: vec!["token".into()],
785 },
786 EdgeRule {
788 edge_kind: "head".into(),
789 src_kinds: sem_kinds(),
790 tgt_kinds: vec!["token".into()],
791 },
792 EdgeRule {
794 edge_kind: "nonhead".into(),
795 src_kinds: sem_kinds(),
796 tgt_kinds: vec!["token".into()],
797 },
798 EdgeRule {
800 edge_kind: "sem-dep".into(),
801 src_kinds: vec!["predicate".into()],
802 tgt_kinds: vec!["argument".into()],
803 },
804 EdgeRule {
806 edge_kind: "sem-head".into(),
807 src_kinds: vec!["argument".into()],
808 tgt_kinds: vec!["predicate".into()],
809 },
810 EdgeRule {
812 edge_kind: "sub-argument".into(),
813 src_kinds: vec!["argument".into()],
814 tgt_kinds: vec!["argument".into()],
815 },
816 EdgeRule {
818 edge_kind: "sub-predicate".into(),
819 src_kinds: vec!["predicate".into()],
820 tgt_kinds: vec!["predicate".into()],
821 },
822 EdgeRule {
824 edge_kind: "doc-relation".into(),
825 src_kinds: sem_kinds(),
826 tgt_kinds: sem_kinds(),
827 },
828 EdgeRule {
830 edge_kind: "prop".into(),
831 src_kinds: sem_kinds(),
832 tgt_kinds: scalar_kinds(),
833 },
834 EdgeRule {
836 edge_kind: "items".into(),
837 src_kinds: [sem_kinds(), vec!["sentence".into()]].concat(),
838 tgt_kinds: [
839 sem_kinds(),
840 scalar_kinds(),
841 vec!["token".into(), "sentence".into()],
842 ]
843 .concat(),
844 },
845 ]
846}
847
#[cfg(test)]
#[allow(clippy::expect_used, clippy::unwrap_used)]
mod tests {
    use super::*;

    // The protocol definition must expose every edge rule, vertex kind,
    // and constraint sort the parser/emitter rely on.
    #[test]
    fn protocol_def() {
        let p = protocol();
        assert_eq!(p.name, "decomp");
        assert_eq!(p.schema_theory, "ThDecompSchema");
        assert_eq!(p.instance_theory, "ThDecompInstance");

        for kind in &[
            "contains",
            "syntax-dep",
            "head",
            "nonhead",
            "sem-dep",
            "sem-head",
            "sub-argument",
            "sub-predicate",
            "doc-relation",
            "prop",
            "items",
        ] {
            assert!(
                p.find_edge_rule(kind).is_some(),
                "missing edge rule for '{kind}'"
            );
        }

        for kind in &[
            "corpus",
            "document",
            "sentence",
            "token",
            "predicate",
            "argument",
            "string",
            "integer",
            "float",
            "boolean",
        ] {
            assert!(p.is_known_vertex_kind(kind), "unknown vertex kind '{kind}'");
        }

        for sort in &[
            "domain",
            "type",
            "position",
            "form",
            "lemma",
            "upos",
            "xpos",
            "deprel",
            "frompredpatt",
            "value",
            "confidence",
            "subspace",
            "property",
        ] {
            assert!(
                p.constraint_sorts.iter().any(|s| s == sort),
                "missing constraint sort '{sort}'"
            );
        }
    }

    // Registration must install both decomp theories plus the base
    // theories the helper pulls in.
    #[test]
    fn register_theories_works() {
        let mut registry = HashMap::new();
        register_theories(&mut registry);
        assert!(registry.contains_key("ThDecompSchema"));
        assert!(registry.contains_key("ThDecompInstance"));
        assert!(registry.contains_key("ThGraph"));
        assert!(registry.contains_key("ThConstraint"));
        assert!(registry.contains_key("ThMulti"));
    }

    // A two-token, one-predicate, one-argument corpus exercising every
    // annotation subspace the parser handles.
    fn minimal_json() -> serde_json::Value {
        serde_json::json!({
            "corpus_id": "test-corpus",
            "documents": {
                "doc-1": {
                    "sentences": {
                        "sent-1": {
                            "syntax": {
                                "tokens": {
                                    "1": {
                                        "form": "The",
                                        "lemma": "the",
                                        "upos": "DET",
                                        "xpos": "DT",
                                        "deprel": "det"
                                    },
                                    "2": {
                                        "form": "cat",
                                        "lemma": "cat",
                                        "upos": "NOUN",
                                        "xpos": "NN",
                                        "deprel": "nsubj"
                                    }
                                }
                            },
                            "semantics": {
                                "predicates": {
                                    "pred-1-1": {
                                        "domain": "semantics",
                                        "type": "predicate",
                                        "frompredpatt": true,
                                        "head_token": "2",
                                        "span_tokens": ["2"],
                                        "factuality": {
                                            "factual": {"value": 0.9, "confidence": 1.0}
                                        },
                                        "genericity": {
                                            "pred-particular": {"value": 0.8, "confidence": 1.0}
                                        },
                                        "time": {
                                            "dur-seconds": {"value": 0.1, "confidence": 0.5}
                                        },
                                        "event_structure": {
                                            "telic": {"value": 0.7, "confidence": 1.0}
                                        }
                                    }
                                },
                                "arguments": {
                                    "arg-1-1": {
                                        "domain": "semantics",
                                        "type": "argument",
                                        "head_token": "1",
                                        "span_tokens": ["1"],
                                        "genericity": {
                                            "arg-particular": {"value": 0.9, "confidence": 1.0}
                                        },
                                        "wordsense": {
                                            "supersense-noun.person": {
                                                "value": 0.8,
                                                "confidence": 1.0
                                            }
                                        }
                                    }
                                },
                                "edges": {
                                    "pred-1-1$$arg-1-1": {
                                        "protoroles": {
                                            "awareness": {"value": 0.85, "confidence": 1.0},
                                            "instigation": {"value": 0.6, "confidence": 0.8}
                                        }
                                    }
                                }
                            }
                        }
                    }
                }
            }
        })
    }

    // End-to-end: parse the fixture, check the vertices/constraints it
    // should produce, then verify emit -> re-parse preserves the graph.
    #[test]
    #[allow(clippy::too_many_lines)]
    fn parse_and_emit() {
        let json = minimal_json();
        let schema = parse_decomp(&json).expect("should parse");

        assert!(schema.has_vertex("test-corpus"), "missing corpus");
        assert_eq!(schema.vertices["test-corpus"].kind, "corpus");

        assert!(schema.has_vertex("test-corpus.doc-1"), "missing document");
        assert_eq!(schema.vertices["test-corpus.doc-1"].kind, "document");

        let sent_vid = "test-corpus.doc-1.sent-1";
        assert!(schema.has_vertex(sent_vid), "missing sentence");
        assert_eq!(schema.vertices[sent_vid].kind, "sentence");

        let tok1 = format!("{sent_vid}.tok_1");
        let tok2 = format!("{sent_vid}.tok_2");
        assert!(schema.has_vertex(&tok1), "missing tok_1");
        assert!(schema.has_vertex(&tok2), "missing tok_2");
        assert_eq!(schema.vertices[tok1.as_str()].kind, "token");
        assert_eq!(
            constraint_value(&schema, &tok1, "form"),
            Some("The"),
            "tok_1 form"
        );
        assert_eq!(
            constraint_value(&schema, &tok2, "upos"),
            Some("NOUN"),
            "tok_2 upos"
        );

        let pred_vid = format!("{sent_vid}.pred-1-1");
        assert!(schema.has_vertex(&pred_vid), "missing predicate");
        assert_eq!(schema.vertices[pred_vid.as_str()].kind, "predicate");
        assert_eq!(
            constraint_value(&schema, &pred_vid, "frompredpatt"),
            Some("true")
        );

        let arg_vid = format!("{sent_vid}.arg-1-1");
        assert!(schema.has_vertex(&arg_vid), "missing argument");
        assert_eq!(schema.vertices[arg_vid.as_str()].kind, "argument");

        let dep_count = schema
            .outgoing_edges(&pred_vid)
            .iter()
            .filter(|e| e.kind == "sem-dep")
            .count();
        assert_eq!(dep_count, 1, "expected 1 sem-dep edge");

        let factual_vid = format!("{pred_vid}.factuality.factual");
        assert!(
            schema.has_vertex(&factual_vid),
            "missing factuality.factual"
        );
        assert_eq!(schema.vertices[factual_vid.as_str()].kind, "float");
        assert_eq!(
            constraint_value(&schema, &factual_vid, "value"),
            Some("0.9")
        );
        // f64::to_string renders 1.0 as "1".
        assert_eq!(
            constraint_value(&schema, &factual_vid, "confidence"),
            Some("1")
        );

        let telic_vid = format!("{pred_vid}.event_structure.telic");
        assert!(schema.has_vertex(&telic_vid), "missing telic");

        let arg_gen_vid = format!("{arg_vid}.genericity.arg-particular");
        assert!(schema.has_vertex(&arg_gen_vid), "missing arg genericity");

        let pr_aware_vid = format!("{pred_vid}.pr.arg-1-1.awareness");
        assert!(
            schema.has_vertex(&pr_aware_vid),
            "missing protorole awareness"
        );
        assert_eq!(
            constraint_value(&schema, &pr_aware_vid, "subspace"),
            Some("protoroles")
        );
        assert_eq!(
            constraint_value(&schema, &pr_aware_vid, "property"),
            Some("awareness")
        );
        assert_eq!(
            constraint_value(&schema, &pr_aware_vid, "value"),
            Some("0.85")
        );

        let pred_head_count = schema
            .outgoing_edges(&pred_vid)
            .iter()
            .filter(|e| e.kind == "head")
            .count();
        assert_eq!(pred_head_count, 1, "predicate should have 1 head edge");

        let arg_head_count = schema
            .outgoing_edges(&arg_vid)
            .iter()
            .filter(|e| e.kind == "head")
            .count();
        assert_eq!(arg_head_count, 1, "argument should have 1 head edge");

        // Round-trip: emitted JSON must re-parse to the same graph size.
        let emitted = emit_decomp(&schema).expect("should emit");
        let schema2 = parse_decomp(&emitted).expect("should re-parse");
        assert_eq!(
            schema.vertex_count(),
            schema2.vertex_count(),
            "vertex count mismatch on roundtrip"
        );
    }
}