Skip to main content

panproto_protocols/annotation/
decomp.rs

1//! Universal Decompositional Semantics (UDS / Decomp) protocol definition.
2//!
3//! UDS graphs are directed acyclic semantic graphs with real-valued node and
4//! edge attributes, built on top of Universal Dependencies syntax trees.
5//! Every graph is a unified multi-domain DiGraph whose nodes and edges each
6//! carry a `domain` and `type` label plus annotation-subspace attributes.
7//!
8//! Uses Group A theory: `register_constrained_multigraph_wtype`.
9
10use std::collections::{HashMap, HashSet};
11use std::hash::BuildHasher;
12
13use panproto_gat::Theory;
14use panproto_schema::{EdgeRule, Protocol, Schema, SchemaBuilder};
15
16use crate::emit::{children_by_edge, constraint_value, vertex_constraints};
17use crate::error::ProtocolError;
18use crate::theories;
19
20// ── Annotation subspace constants ────────────────────────────────────────────
21
/// Only property looked up in the `factuality` subspace of a predicate.
const FACTUAL: &str = "factual";

// Properties looked up in the `genericity` subspace of a predicate.
const PRED_PARTICULAR: &str = "pred-particular";
const PRED_DYNAMIC: &str = "pred-dynamic";
const PRED_HYPOTHETICAL: &str = "pred-hypothetical";

// Properties looked up in the `genericity` subspace of an argument.
const ARG_PARTICULAR: &str = "arg-particular";
const ARG_KIND: &str = "arg-kind";
const ARG_ABSTRACT: &str = "arg-abstract";
31
/// Property names recognised inside the `protoroles` object of a semantics
/// dependency edge (predicate → argument).
///
/// Properties are looked up in exactly this order, so the order of the list
/// determines the order in which annotation vertices are created.
#[rustfmt::skip]
const PROTOROLE_PROPERTIES: &[&str] = &[
    "awareness",
    "change_of_location", "change_of_possession", "change_of_state",
    "existed_before", "existed_after", "existed_during",
    "instigation", "location", "manner", "partitive", "purpose",
    "sentient", "time", "volition",
    "was_for_benefit", "was_used",
    "change_of_state_continuous",
];
53
/// Property names recognised inside the `event_structure` subspace of a
/// predicate node, in lookup order.
#[rustfmt::skip]
const EVENT_STRUCTURE_PROPERTIES: &[&str] = &[
    "distributive", "dynamic",
    "natural_parts", "part_similarity",
    "telic",
];
62
/// Duration buckets recognised inside the `time` subspace of a predicate,
/// listed from shortest to longest and followed by the two non-interval
/// values.
#[rustfmt::skip]
const TIME_GRANULARITIES: &[&str] = &[
    "dur-seconds", "dur-minutes", "dur-hours",
    "dur-days", "dur-weeks", "dur-months",
    "dur-years", "dur-decades", "dur-centuries",
    "instant", "forever",
];
77
/// Noun supersense property names recognised inside the `wordsense` subspace
/// of an argument node (26 items, in lookup order).
#[rustfmt::skip]
const WORDSENSE_PROPERTIES: &[&str] = &[
    "supersense-noun.act",           "supersense-noun.animal",
    "supersense-noun.artifact",      "supersense-noun.attribute",
    "supersense-noun.body",          "supersense-noun.cognition",
    "supersense-noun.communication", "supersense-noun.event",
    "supersense-noun.feeling",       "supersense-noun.food",
    "supersense-noun.group",         "supersense-noun.location",
    "supersense-noun.motive",        "supersense-noun.object",
    "supersense-noun.person",        "supersense-noun.phenomenon",
    "supersense-noun.plant",         "supersense-noun.possession",
    "supersense-noun.process",       "supersense-noun.quantity",
    "supersense-noun.relation",      "supersense-noun.shape",
    "supersense-noun.state",         "supersense-noun.substance",
    "supersense-noun.time",          "supersense-noun.tops",
];
107
108// ─────────────────────────────────────────────────────────────────────────────
109
110/// Returns the Decomp/UDS protocol definition.
111#[must_use]
112pub fn protocol() -> Protocol {
113    Protocol {
114        name: "decomp".into(),
115        schema_theory: "ThDecompSchema".into(),
116        instance_theory: "ThDecompInstance".into(),
117        edge_rules: edge_rules(),
118        obj_kinds: vec![
119            // Containment hierarchy
120            "corpus".into(),
121            "document".into(),
122            "sentence".into(),
123            // Syntax layer
124            "token".into(),
125            // Semantics layer
126            "predicate".into(),
127            "argument".into(),
128            // Scalar leaf types for annotation values
129            "string".into(),
130            "integer".into(),
131            "float".into(),
132            "boolean".into(),
133        ],
134        constraint_sorts: vec![
135            // Node identity / syntax
136            "domain".into(),
137            "type".into(),
138            "position".into(),
139            "form".into(),
140            "lemma".into(),
141            "upos".into(),
142            "xpos".into(),
143            "deprel".into(),
144            // UDS provenance
145            "frompredpatt".into(),
146            // Annotation value pair
147            "value".into(),
148            "confidence".into(),
149            // Annotation subspace and property keys
150            "subspace".into(),
151            "property".into(),
152        ],
153        has_order: true,
154        ..Protocol::default()
155    }
156}
157
158/// Register the component GATs for Decomp/UDS with a theory registry.
159pub fn register_theories<S: BuildHasher>(registry: &mut HashMap<String, Theory, S>) {
160    theories::register_constrained_multigraph_wtype(registry, "ThDecompSchema", "ThDecompInstance");
161}
162
163// ── Parse ─────────────────────────────────────────────────────────────────────
164
165/// Parse a JSON-serialised UDS graph into a [`Schema`].
166///
167/// The expected JSON layout mirrors the Decomp toolkit's serialisation:
168///
169/// ```json
170/// {
171///   "corpus_id": "ewt",
172///   "documents": {
173///     "doc-1": {
174///       "sentences": {
175///         "sent-1": {
176///           "syntax": {
177///             "tokens": {
178///               "1": {"form":"The","lemma":"the","upos":"DET","xpos":"DT","deprel":"det"}
179///             }
180///           },
181///           "semantics": {
182///             "predicates": {
183///               "pred-1-1": {
184///                 "domain": "semantics", "type": "predicate",
185///                 "frompredpatt": true,
186///                 "head_token": "1", "span_tokens": ["1"],
187///                 "factuality":      {"factual":         {"value": 0.9, "confidence": 1.0}},
188///                 "genericity":      {"pred-particular": {"value": 0.8, "confidence": 1.0}},
189///                 "time":            {"dur-seconds":     {"value": 0.1, "confidence": 0.5}},
190///                 "event_structure": {"telic":           {"value": 0.7, "confidence": 1.0}}
191///               }
192///             },
193///             "arguments": {
194///               "arg-1-1": {
195///                 "domain": "semantics", "type": "argument",
196///                 "head_token": "2", "span_tokens": ["2"],
197///                 "genericity": {"arg-particular": {"value": 0.9, "confidence": 1.0}},
198///                 "wordsense":  {"supersense-noun.person": {"value": 0.8, "confidence": 1.0}}
199///               }
200///             },
201///             "edges": {
202///               "pred-1-1$$arg-1-1": {
203///                 "protoroles": {"awareness": {"value": 0.9, "confidence": 1.0}}
204///               }
205///             }
206///           }
207///         }
208///       }
209///     }
210///   }
211/// }
212/// ```
213///
214/// # Errors
215///
216/// Returns [`ProtocolError`] if the JSON structure cannot be parsed.
217#[allow(clippy::too_many_lines)]
218pub fn parse_decomp(json: &serde_json::Value) -> Result<Schema, ProtocolError> {
219    let proto = protocol();
220    let mut builder = SchemaBuilder::new(&proto);
221    // Track vertex IDs so helper functions can check token existence before linking.
222    let mut known: HashSet<String> = HashSet::new();
223
224    // ── Corpus root vertex ────────────────────────────────────────────────
225    let corpus_id = json
226        .get("corpus_id")
227        .and_then(serde_json::Value::as_str)
228        .unwrap_or("corpus")
229        .to_string();
230
231    builder = builder
232        .vertex(&corpus_id, "corpus", None)
233        .map_err(|e| ProtocolError::Parse(e.to_string()))?;
234    known.insert(corpus_id.clone());
235    builder = builder.constraint(&corpus_id, "domain", "root");
236    builder = builder.constraint(&corpus_id, "type", "corpus");
237
238    // ── Documents ─────────────────────────────────────────────────────────
239    let documents = json
240        .get("documents")
241        .and_then(serde_json::Value::as_object)
242        .ok_or_else(|| ProtocolError::MissingField("documents".into()))?;
243
244    for (doc_key, doc_val) in documents {
245        let doc_vid = format!("{corpus_id}.{doc_key}");
246        builder = builder
247            .vertex(&doc_vid, "document", None)
248            .map_err(|e| ProtocolError::Parse(e.to_string()))?;
249        known.insert(doc_vid.clone());
250        builder = builder.constraint(&doc_vid, "domain", "document");
251        builder = builder.constraint(&doc_vid, "type", "document");
252        builder = builder
253            .edge(&corpus_id, &doc_vid, "contains", Some(doc_key))
254            .map_err(|e| ProtocolError::Parse(e.to_string()))?;
255
256        // ── Sentences ─────────────────────────────────────────────────────
257        let sentences = doc_val
258            .get("sentences")
259            .and_then(serde_json::Value::as_object)
260            .ok_or_else(|| ProtocolError::MissingField(format!("{doc_key}.sentences")))?;
261
262        for (sent_key, sent_val) in sentences {
263            let sent_vid = format!("{doc_vid}.{sent_key}");
264            builder = builder
265                .vertex(&sent_vid, "sentence", None)
266                .map_err(|e| ProtocolError::Parse(e.to_string()))?;
267            known.insert(sent_vid.clone());
268            builder = builder.constraint(&sent_vid, "domain", "syntax");
269            builder = builder.constraint(&sent_vid, "type", "sentence");
270            builder = builder
271                .edge(&doc_vid, &sent_vid, "contains", Some(sent_key))
272                .map_err(|e| ProtocolError::Parse(e.to_string()))?;
273
274            builder = parse_syntax_tokens(builder, sent_val, &sent_vid, &mut known)
275                .map_err(|e| ProtocolError::Parse(e.to_string()))?;
276
277            builder = parse_semantics(builder, sent_val, &sent_vid, &known)
278                .map_err(|e| ProtocolError::Parse(e.to_string()))?;
279        }
280    }
281
282    let schema = builder.build()?;
283    Ok(schema)
284}
285
286/// Parse the `syntax.tokens` sub-object of a sentence.
287fn parse_syntax_tokens(
288    mut builder: SchemaBuilder,
289    sent_val: &serde_json::Value,
290    sent_vid: &str,
291    known: &mut HashSet<String>,
292) -> Result<SchemaBuilder, panproto_schema::SchemaError> {
293    let Some(tokens) = sent_val
294        .pointer("/syntax/tokens")
295        .and_then(serde_json::Value::as_object)
296    else {
297        return Ok(builder);
298    };
299
300    for (pos_str, tok_val) in tokens {
301        let tok_vid = format!("{sent_vid}.tok_{pos_str}");
302        builder = builder.vertex(&tok_vid, "token", None)?;
303        known.insert(tok_vid.clone());
304        builder = builder.constraint(&tok_vid, "domain", "syntax");
305        builder = builder.constraint(&tok_vid, "type", "token");
306        builder = builder.constraint(&tok_vid, "position", pos_str);
307        builder = builder.edge(sent_vid, &tok_vid, "syntax-dep", Some(pos_str))?;
308
309        for field in &["form", "lemma", "upos", "xpos", "deprel"] {
310            if let Some(v) = tok_val.get(field).and_then(serde_json::Value::as_str) {
311                builder = builder.constraint(&tok_vid, field, v);
312            }
313        }
314    }
315
316    Ok(builder)
317}
318
319/// Parse the `semantics` sub-object of a sentence (predicates, arguments, edges).
320#[allow(clippy::too_many_lines)]
321fn parse_semantics(
322    mut builder: SchemaBuilder,
323    sent_val: &serde_json::Value,
324    sent_vid: &str,
325    known: &HashSet<String>,
326) -> Result<SchemaBuilder, panproto_schema::SchemaError> {
327    let Some(sem) = sent_val
328        .get("semantics")
329        .and_then(serde_json::Value::as_object)
330    else {
331        return Ok(builder);
332    };
333
334    // Track semantics-layer vertex IDs to resolve edges between them.
335    let mut sem_known: HashSet<String> = HashSet::new();
336
337    // ── Predicates ────────────────────────────────────────────────────────
338    if let Some(preds) = sem.get("predicates").and_then(serde_json::Value::as_object) {
339        for (pred_key, pred_val) in preds {
340            let pred_vid = format!("{sent_vid}.{pred_key}");
341            builder = builder.vertex(&pred_vid, "predicate", None)?;
342            sem_known.insert(pred_vid.clone());
343            builder = builder.constraint(&pred_vid, "domain", "semantics");
344            builder = builder.constraint(&pred_vid, "type", "predicate");
345            builder = builder.edge(sent_vid, &pred_vid, "contains", Some(pred_key))?;
346
347            if let Some(fp) = pred_val.get("frompredpatt") {
348                let fp_str = if fp.as_bool().unwrap_or(false) {
349                    "true"
350                } else {
351                    "false"
352                };
353                builder = builder.constraint(&pred_vid, "frompredpatt", fp_str);
354            }
355
356            // Interface head edge
357            if let Some(head_pos) = pred_val
358                .get("head_token")
359                .and_then(serde_json::Value::as_str)
360            {
361                let tok_vid = format!("{sent_vid}.tok_{head_pos}");
362                if known.contains(&tok_vid) {
363                    builder = builder.edge(&pred_vid, &tok_vid, "head", Some(head_pos))?;
364                }
365            }
366
367            // Interface nonhead edges (deduplicated)
368            if let Some(span_arr) = pred_val
369                .get("span_tokens")
370                .and_then(serde_json::Value::as_array)
371            {
372                let mut added: HashSet<String> = HashSet::new();
373                for tok_pos in span_arr.iter().filter_map(serde_json::Value::as_str) {
374                    let tok_vid = format!("{sent_vid}.tok_{tok_pos}");
375                    if known.contains(&tok_vid) && added.insert(tok_vid.clone()) {
376                        builder = builder.edge(&pred_vid, &tok_vid, "nonhead", Some(tok_pos))?;
377                    }
378                }
379            }
380
381            builder = parse_subspace(builder, pred_val, "factuality", &[FACTUAL], &pred_vid)?;
382            builder = parse_subspace(
383                builder,
384                pred_val,
385                "genericity",
386                &[PRED_PARTICULAR, PRED_DYNAMIC, PRED_HYPOTHETICAL],
387                &pred_vid,
388            )?;
389            builder = parse_subspace(builder, pred_val, "time", TIME_GRANULARITIES, &pred_vid)?;
390            builder = parse_subspace(
391                builder,
392                pred_val,
393                "event_structure",
394                EVENT_STRUCTURE_PROPERTIES,
395                &pred_vid,
396            )?;
397        }
398    }
399
400    // ── Arguments ─────────────────────────────────────────────────────────
401    if let Some(args) = sem.get("arguments").and_then(serde_json::Value::as_object) {
402        for (arg_key, arg_val) in args {
403            let arg_vid = format!("{sent_vid}.{arg_key}");
404            builder = builder.vertex(&arg_vid, "argument", None)?;
405            sem_known.insert(arg_vid.clone());
406            builder = builder.constraint(&arg_vid, "domain", "semantics");
407            builder = builder.constraint(&arg_vid, "type", "argument");
408            builder = builder.edge(sent_vid, &arg_vid, "contains", Some(arg_key))?;
409
410            if let Some(head_pos) = arg_val
411                .get("head_token")
412                .and_then(serde_json::Value::as_str)
413            {
414                let tok_vid = format!("{sent_vid}.tok_{head_pos}");
415                if known.contains(&tok_vid) {
416                    builder = builder.edge(&arg_vid, &tok_vid, "head", Some(head_pos))?;
417                }
418            }
419
420            if let Some(span_arr) = arg_val
421                .get("span_tokens")
422                .and_then(serde_json::Value::as_array)
423            {
424                let mut added: HashSet<String> = HashSet::new();
425                for tok_pos in span_arr.iter().filter_map(serde_json::Value::as_str) {
426                    let tok_vid = format!("{sent_vid}.tok_{tok_pos}");
427                    if known.contains(&tok_vid) && added.insert(tok_vid.clone()) {
428                        builder = builder.edge(&arg_vid, &tok_vid, "nonhead", Some(tok_pos))?;
429                    }
430                }
431            }
432
433            builder = parse_subspace(
434                builder,
435                arg_val,
436                "genericity",
437                &[ARG_PARTICULAR, ARG_KIND, ARG_ABSTRACT],
438                &arg_vid,
439            )?;
440            builder = parse_subspace(
441                builder,
442                arg_val,
443                "wordsense",
444                WORDSENSE_PROPERTIES,
445                &arg_vid,
446            )?;
447        }
448    }
449
450    // ── Semantics dependency edges (pred → arg) with protoroles ───────────
451    if let Some(edges) = sem.get("edges").and_then(serde_json::Value::as_object) {
452        for (edge_key, edge_val) in edges {
453            let Some((pred_key, arg_key)) = edge_key.split_once("$$") else {
454                continue;
455            };
456            let pred_vid = format!("{sent_vid}.{pred_key}");
457            let arg_vid = format!("{sent_vid}.{arg_key}");
458            if !sem_known.contains(&pred_vid) || !sem_known.contains(&arg_vid) {
459                continue;
460            }
461            builder = builder.edge(&pred_vid, &arg_vid, "sem-dep", Some(edge_key))?;
462
463            // Protorole annotations as float prop vertices on the predicate.
464            if let Some(protoroles) = edge_val
465                .get("protoroles")
466                .and_then(serde_json::Value::as_object)
467            {
468                for prop in PROTOROLE_PROPERTIES {
469                    if let Some(ann) = protoroles.get(*prop) {
470                        let prop_vid = format!("{pred_vid}.pr.{arg_key}.{prop}");
471                        builder = builder.vertex(&prop_vid, "float", None)?;
472                        builder = builder.constraint(&prop_vid, "subspace", "protoroles");
473                        builder = builder.constraint(&prop_vid, "property", prop);
474                        if let Some(v) = ann.get("value").and_then(serde_json::Value::as_f64) {
475                            builder = builder.constraint(&prop_vid, "value", &v.to_string());
476                        }
477                        if let Some(c) = ann.get("confidence").and_then(serde_json::Value::as_f64) {
478                            builder = builder.constraint(&prop_vid, "confidence", &c.to_string());
479                        }
480                        builder = builder.edge(&pred_vid, &prop_vid, "prop", Some(prop))?;
481                    }
482                }
483            }
484
485            // Event-structure mereology on edges (e.g. pred1_contains_pred2).
486            if let Some(event_struct) = edge_val
487                .get("event_structure")
488                .and_then(serde_json::Value::as_object)
489            {
490                for (mero_key, ann) in event_struct {
491                    let mero_vid = format!("{pred_vid}.es.{arg_key}.{mero_key}");
492                    builder = builder.vertex(&mero_vid, "boolean", None)?;
493                    builder = builder.constraint(&mero_vid, "subspace", "event_structure");
494                    builder = builder.constraint(&mero_vid, "property", mero_key);
495                    if let Some(v) = ann.get("value").and_then(serde_json::Value::as_f64) {
496                        builder = builder.constraint(&mero_vid, "value", &v.to_string());
497                    }
498                    if let Some(c) = ann.get("confidence").and_then(serde_json::Value::as_f64) {
499                        builder = builder.constraint(&mero_vid, "confidence", &c.to_string());
500                    }
501                    builder =
502                        builder.edge(&pred_vid, &mero_vid, "prop", Some(mero_key.as_str()))?;
503                }
504            }
505        }
506    }
507
508    Ok(builder)
509}
510
511/// Parse one annotation subspace object and attach `float` prop vertices.
512fn parse_subspace(
513    mut builder: SchemaBuilder,
514    node_val: &serde_json::Value,
515    subspace: &str,
516    known_props: &[&str],
517    parent_vid: &str,
518) -> Result<SchemaBuilder, panproto_schema::SchemaError> {
519    let Some(subspace_obj) = node_val
520        .get(subspace)
521        .and_then(serde_json::Value::as_object)
522    else {
523        return Ok(builder);
524    };
525
526    for prop in known_props {
527        if let Some(ann) = subspace_obj.get(*prop) {
528            let prop_vid = format!("{parent_vid}.{subspace}.{prop}");
529            builder = builder.vertex(&prop_vid, "float", None)?;
530            builder = builder.constraint(&prop_vid, "subspace", subspace);
531            builder = builder.constraint(&prop_vid, "property", prop);
532            if let Some(v) = ann.get("value").and_then(serde_json::Value::as_f64) {
533                builder = builder.constraint(&prop_vid, "value", &v.to_string());
534            }
535            if let Some(c) = ann.get("confidence").and_then(serde_json::Value::as_f64) {
536                builder = builder.constraint(&prop_vid, "confidence", &c.to_string());
537            }
538            builder = builder.edge(parent_vid, &prop_vid, "prop", Some(prop))?;
539        }
540    }
541
542    Ok(builder)
543}
544
545// ── Emit ──────────────────────────────────────────────────────────────────────
546
547/// Emit a [`Schema`] back to its JSON UDS representation.
548///
549/// # Errors
550///
551/// Returns [`ProtocolError::Emit`] if the schema cannot be serialised.
552#[allow(clippy::too_many_lines)]
553pub fn emit_decomp(schema: &Schema) -> Result<serde_json::Value, ProtocolError> {
554    let corpus = schema
555        .vertices
556        .values()
557        .find(|v| v.kind == "corpus")
558        .ok_or_else(|| ProtocolError::Emit("no corpus vertex found".into()))?;
559
560    let corpus_id = corpus.id.to_string();
561    let mut documents_map = serde_json::Map::new();
562
563    for (_doc_edge, doc_vertex) in children_by_edge(schema, &corpus_id, "contains") {
564        let mut sentences_map = serde_json::Map::new();
565
566        for (_sent_edge, sent_vertex) in children_by_edge(schema, &doc_vertex.id, "contains") {
567            let sent_json = emit_sentence(schema, &sent_vertex.id);
568            let sent_key = sent_vertex.id.rsplit('.').next().unwrap_or(&sent_vertex.id);
569            sentences_map.insert(sent_key.to_string(), sent_json);
570        }
571
572        let doc_key = doc_vertex.id.rsplit('.').next().unwrap_or(&doc_vertex.id);
573        documents_map.insert(
574            doc_key.to_string(),
575            serde_json::json!({ "sentences": sentences_map }),
576        );
577    }
578
579    Ok(serde_json::json!({
580        "corpus_id": corpus_id,
581        "documents": documents_map,
582    }))
583}
584
585/// Emit a single sentence vertex as a JSON object.
586fn emit_sentence(schema: &Schema, sent_vid: &str) -> serde_json::Value {
587    // ── Syntax tokens ─────────────────────────────────────────────────────
588    let mut tokens_map = serde_json::Map::new();
589    for (_edge, tok_vertex) in children_by_edge(schema, sent_vid, "syntax-dep") {
590        let mut tok_obj = serde_json::Map::new();
591        for sort in &["form", "lemma", "upos", "xpos", "deprel"] {
592            if let Some(v) = constraint_value(schema, &tok_vertex.id, sort) {
593                tok_obj.insert((*sort).to_string(), serde_json::json!(v));
594            }
595        }
596        let pos = constraint_value(schema, &tok_vertex.id, "position").unwrap_or(&tok_vertex.id);
597        tokens_map.insert(pos.to_string(), serde_json::Value::Object(tok_obj));
598    }
599
600    // ── Semantics ─────────────────────────────────────────────────────────
601    let mut preds_map = serde_json::Map::new();
602    let mut args_map = serde_json::Map::new();
603    let mut edges_map = serde_json::Map::new();
604
605    for (_edge, child) in children_by_edge(schema, sent_vid, "contains") {
606        match child.kind.as_str() {
607            "predicate" => {
608                let pred_key = child.id.rsplit('.').next().unwrap_or(&child.id);
609                preds_map.insert(
610                    pred_key.to_string(),
611                    emit_sem_node(schema, &child.id, "predicate"),
612                );
613
614                // Collect sem-dep edges originating from this predicate.
615                for dep_edge in schema
616                    .outgoing_edges(&child.id)
617                    .iter()
618                    .filter(|e| e.kind == "sem-dep")
619                {
620                    let arg_vid = &dep_edge.tgt;
621                    let arg_key = arg_vid.rsplit('.').next().unwrap_or(arg_vid.as_str());
622                    let edge_key = dep_edge
623                        .name
624                        .as_ref()
625                        .map_or_else(|| format!("{pred_key}$${arg_key}"), ToString::to_string);
626
627                    // Protoroles: prop children of pred scoped to this arg.
628                    let mut protoroles_map = serde_json::Map::new();
629                    for (_prop_edge, prop_vertex) in children_by_edge(schema, &child.id, "prop") {
630                        if constraint_value(schema, &prop_vertex.id, "subspace")
631                            != Some("protoroles")
632                        {
633                            continue;
634                        }
635                        if !prop_vertex.id.contains(arg_key) {
636                            continue;
637                        }
638                        if let Some(pname) = constraint_value(schema, &prop_vertex.id, "property") {
639                            protoroles_map.insert(
640                                pname.to_string(),
641                                emit_annotation(schema, &prop_vertex.id),
642                            );
643                        }
644                    }
645
646                    let mut edge_obj = serde_json::Map::new();
647                    if !protoroles_map.is_empty() {
648                        edge_obj.insert(
649                            "protoroles".into(),
650                            serde_json::Value::Object(protoroles_map),
651                        );
652                    }
653                    edges_map.insert(edge_key, serde_json::Value::Object(edge_obj));
654                }
655            }
656            "argument" => {
657                let arg_key = child.id.rsplit('.').next().unwrap_or(&child.id);
658                args_map.insert(
659                    arg_key.to_string(),
660                    emit_sem_node(schema, &child.id, "argument"),
661                );
662            }
663            _ => {}
664        }
665    }
666
667    let mut sem_obj = serde_json::Map::new();
668    if !preds_map.is_empty() {
669        sem_obj.insert("predicates".into(), serde_json::Value::Object(preds_map));
670    }
671    if !args_map.is_empty() {
672        sem_obj.insert("arguments".into(), serde_json::Value::Object(args_map));
673    }
674    if !edges_map.is_empty() {
675        sem_obj.insert("edges".into(), serde_json::Value::Object(edges_map));
676    }
677
678    serde_json::json!({
679        "syntax": { "tokens": tokens_map },
680        "semantics": sem_obj,
681    })
682}
683
684/// Emit a semantics predicate or argument node as a JSON object.
685fn emit_sem_node(schema: &Schema, node_vid: &str, sem_type: &str) -> serde_json::Value {
686    let mut obj = serde_json::Map::new();
687    obj.insert("domain".into(), serde_json::json!("semantics"));
688    obj.insert("type".into(), serde_json::json!(sem_type));
689
690    if let Some(fp) = constraint_value(schema, node_vid, "frompredpatt") {
691        obj.insert("frompredpatt".into(), serde_json::json!(fp == "true"));
692    }
693
694    // head_token
695    if let Some(head_edge) = schema
696        .outgoing_edges(node_vid)
697        .iter()
698        .find(|e| e.kind == "head")
699    {
700        if let Some(pos) = &head_edge.name {
701            obj.insert("head_token".into(), serde_json::json!(pos));
702        }
703    }
704
705    // span_tokens
706    let nonhead: Vec<_> = schema
707        .outgoing_edges(node_vid)
708        .iter()
709        .filter(|e| e.kind == "nonhead")
710        .collect();
711    if !nonhead.is_empty() {
712        let span: Vec<serde_json::Value> = nonhead
713            .iter()
714            .filter_map(|e| e.name.as_deref().map(|n| serde_json::json!(n)))
715            .collect();
716        obj.insert("span_tokens".into(), serde_json::Value::Array(span));
717    }
718
719    // Annotation subspaces from prop children (excluding protoroles).
720    let mut subspaces: HashMap<String, serde_json::Map<String, serde_json::Value>> = HashMap::new();
721    for (_prop_edge, prop_vertex) in children_by_edge(schema, node_vid, "prop") {
722        let sub = constraint_value(schema, &prop_vertex.id, "subspace");
723        let prop_name = constraint_value(schema, &prop_vertex.id, "property");
724        if sub == Some("protoroles") {
725            continue;
726        }
727        if let (Some(sub_str), Some(prop_str)) = (sub, prop_name) {
728            let ann = emit_annotation(schema, &prop_vertex.id);
729            subspaces
730                .entry(sub_str.to_string())
731                .or_default()
732                .insert(prop_str.to_string(), ann);
733        }
734    }
735    for (sub, props) in subspaces {
736        obj.insert(sub, serde_json::Value::Object(props));
737    }
738
739    serde_json::Value::Object(obj)
740}
741
742/// Build `{"value": f64, "confidence": f64}` from a vertex's constraints.
743fn emit_annotation(schema: &Schema, vertex_id: &str) -> serde_json::Value {
744    let mut ann = serde_json::Map::new();
745    for c in vertex_constraints(schema, vertex_id) {
746        if c.sort == "value" || c.sort == "confidence" {
747            if let Ok(f) = c.value.parse::<f64>() {
748                ann.insert(c.sort.to_string(), serde_json::json!(f));
749            }
750        }
751    }
752    serde_json::Value::Object(ann)
753}
754
755// ── Edge rules ────────────────────────────────────────────────────────────────
756
757fn edge_rules() -> Vec<EdgeRule> {
758    let sem_kinds = || vec!["predicate".to_string(), "argument".to_string()];
759    let scalar_kinds = || {
760        vec![
761            "string".to_string(),
762            "integer".to_string(),
763            "float".to_string(),
764            "boolean".to_string(),
765        ]
766    };
767
768    vec![
769        // Containment: corpus → document → sentence; sentence → pred / arg
770        EdgeRule {
771            edge_kind: "contains".into(),
772            src_kinds: vec!["corpus".into(), "document".into(), "sentence".into()],
773            tgt_kinds: vec![
774                "document".into(),
775                "sentence".into(),
776                "predicate".into(),
777                "argument".into(),
778            ],
779        },
780        // Syntax dependency: sentence → token (root) or token → token
781        EdgeRule {
782            edge_kind: "syntax-dep".into(),
783            src_kinds: vec!["sentence".into(), "token".into()],
784            tgt_kinds: vec!["token".into()],
785        },
786        // Interface head: semantics node → head syntax token
787        EdgeRule {
788            edge_kind: "head".into(),
789            src_kinds: sem_kinds(),
790            tgt_kinds: vec!["token".into()],
791        },
792        // Interface nonhead: semantics node → span syntax token
793        EdgeRule {
794            edge_kind: "nonhead".into(),
795            src_kinds: sem_kinds(),
796            tgt_kinds: vec!["token".into()],
797        },
798        // Semantics dependency: predicate → argument with protorole annotations
799        EdgeRule {
800            edge_kind: "sem-dep".into(),
801            src_kinds: vec!["predicate".into()],
802            tgt_kinds: vec!["argument".into()],
803        },
804        // Semantics head: argument → predicate (realization)
805        EdgeRule {
806            edge_kind: "sem-head".into(),
807            src_kinds: vec!["argument".into()],
808            tgt_kinds: vec!["predicate".into()],
809        },
810        // Sub-argument structural edges
811        EdgeRule {
812            edge_kind: "sub-argument".into(),
813            src_kinds: vec!["argument".into()],
814            tgt_kinds: vec!["argument".into()],
815        },
816        // Sub-predicate structural edges
817        EdgeRule {
818            edge_kind: "sub-predicate".into(),
819            src_kinds: vec!["predicate".into()],
820            tgt_kinds: vec!["predicate".into()],
821        },
822        // Document relation: cross-sentence edges
823        EdgeRule {
824            edge_kind: "doc-relation".into(),
825            src_kinds: sem_kinds(),
826            tgt_kinds: sem_kinds(),
827        },
828        // Annotation property leaf edges
829        EdgeRule {
830            edge_kind: "prop".into(),
831            src_kinds: sem_kinds(),
832            tgt_kinds: scalar_kinds(),
833        },
834        // Items: ordered membership edges
835        EdgeRule {
836            edge_kind: "items".into(),
837            src_kinds: [sem_kinds(), vec!["sentence".into()]].concat(),
838            tgt_kinds: [
839                sem_kinds(),
840                scalar_kinds(),
841                vec!["token".into(), "sentence".into()],
842            ]
843            .concat(),
844        },
845    ]
846}
847
#[cfg(test)]
#[allow(clippy::expect_used, clippy::unwrap_used)]
mod tests {
    use super::*;

    /// The protocol descriptor carries the expected names and declares every
    /// edge rule, vertex kind and constraint sort listed here.
    #[test]
    fn protocol_def() {
        let edge_kinds = [
            "contains",
            "syntax-dep",
            "head",
            "nonhead",
            "sem-dep",
            "sem-head",
            "sub-argument",
            "sub-predicate",
            "doc-relation",
            "prop",
            "items",
        ];
        let vertex_kinds = [
            "corpus",
            "document",
            "sentence",
            "token",
            "predicate",
            "argument",
            "string",
            "integer",
            "float",
            "boolean",
        ];
        let sorts = [
            "domain",
            "type",
            "position",
            "form",
            "lemma",
            "upos",
            "xpos",
            "deprel",
            "frompredpatt",
            "value",
            "confidence",
            "subspace",
            "property",
        ];

        let proto = protocol();
        assert_eq!(proto.name, "decomp");
        assert_eq!(proto.schema_theory, "ThDecompSchema");
        assert_eq!(proto.instance_theory, "ThDecompInstance");

        for kind in &edge_kinds {
            let found = proto.find_edge_rule(kind);
            assert!(found.is_some(), "missing edge rule for '{kind}'");
        }

        for kind in &vertex_kinds {
            let known = proto.is_known_vertex_kind(kind);
            assert!(known, "unknown vertex kind '{kind}'");
        }

        for sort in &sorts {
            let declared = proto.constraint_sorts.iter().any(|s| s == sort);
            assert!(declared, "missing constraint sort '{sort}'");
        }
    }

    /// Registering into an empty map yields the two Decomp theories and the
    /// `ThGraph`, `ThConstraint` and `ThMulti` theories.
    #[test]
    fn register_theories_works() {
        let mut registry = HashMap::new();
        register_theories(&mut registry);
        assert!(registry.contains_key("ThDecompSchema"));
        assert!(registry.contains_key("ThDecompInstance"));
        assert!(registry.contains_key("ThGraph"));
        assert!(registry.contains_key("ThConstraint"));
        assert!(registry.contains_key("ThMulti"));
    }

    /// Fixture: one corpus with one document and one sentence holding two
    /// tokens, one predicate, one argument, and one `pred-1-1$$arg-1-1` edge
    /// with protorole annotations.
    fn minimal_json() -> serde_json::Value {
        use serde_json::json;

        let tokens = json!({
            "1": {
                "form": "The",
                "lemma": "the",
                "upos": "DET",
                "xpos": "DT",
                "deprel": "det"
            },
            "2": {
                "form": "cat",
                "lemma": "cat",
                "upos": "NOUN",
                "xpos": "NN",
                "deprel": "nsubj"
            }
        });

        let predicates = json!({
            "pred-1-1": {
                "domain": "semantics",
                "type": "predicate",
                "frompredpatt": true,
                "head_token": "2",
                "span_tokens": ["2"],
                "factuality": {
                    "factual": {"value": 0.9, "confidence": 1.0}
                },
                "genericity": {
                    "pred-particular": {"value": 0.8, "confidence": 1.0}
                },
                "time": {
                    "dur-seconds": {"value": 0.1, "confidence": 0.5}
                },
                "event_structure": {
                    "telic": {"value": 0.7, "confidence": 1.0}
                }
            }
        });

        let arguments = json!({
            "arg-1-1": {
                "domain": "semantics",
                "type": "argument",
                "head_token": "1",
                "span_tokens": ["1"],
                "genericity": {
                    "arg-particular": {"value": 0.9, "confidence": 1.0}
                },
                "wordsense": {
                    "supersense-noun.person": {
                        "value": 0.8,
                        "confidence": 1.0
                    }
                }
            }
        });

        let edges = json!({
            "pred-1-1$$arg-1-1": {
                "protoroles": {
                    "awareness": {"value": 0.85, "confidence": 1.0},
                    "instigation": {"value": 0.6, "confidence": 0.8}
                }
            }
        });

        let sentence = json!({
            "syntax": {
                "tokens": tokens
            },
            "semantics": {
                "predicates": predicates,
                "arguments": arguments,
                "edges": edges
            }
        });

        json!({
            "corpus_id": "test-corpus",
            "documents": {
                "doc-1": {
                    "sentences": {
                        "sent-1": sentence
                    }
                }
            }
        })
    }

    /// Parses the fixture, checks the vertices, constraints and edges it
    /// produces, then emits and re-parses it and compares vertex counts.
    #[test]
    #[allow(clippy::too_many_lines)]
    fn parse_and_emit() {
        // The vertex ids asserted below are a parent id plus a dotted suffix.
        let child = |parent: &str, suffix: &str| format!("{parent}.{suffix}");

        let fixture = minimal_json();
        let parsed = parse_decomp(&fixture).expect("should parse");

        // ── Structural vertices ──────────────────────────────────────────
        let corpus_vid = "test-corpus";
        assert!(parsed.has_vertex(corpus_vid), "missing corpus");
        assert_eq!(parsed.vertices[corpus_vid].kind, "corpus");

        let doc_vid = "test-corpus.doc-1";
        assert!(parsed.has_vertex(doc_vid), "missing document");
        assert_eq!(parsed.vertices[doc_vid].kind, "document");

        let sent_vid = "test-corpus.doc-1.sent-1";
        assert!(parsed.has_vertex(sent_vid), "missing sentence");
        assert_eq!(parsed.vertices[sent_vid].kind, "sentence");

        // ── Tokens ──────────────────────────────────────────────────────
        let tok1 = child(sent_vid, "tok_1");
        let tok2 = child(sent_vid, "tok_2");
        assert!(parsed.has_vertex(&tok1), "missing tok_1");
        assert!(parsed.has_vertex(&tok2), "missing tok_2");
        assert_eq!(parsed.vertices[tok1.as_str()].kind, "token");
        let tok1_form = constraint_value(&parsed, &tok1, "form");
        assert_eq!(tok1_form, Some("The"), "tok_1 form");
        let tok2_upos = constraint_value(&parsed, &tok2, "upos");
        assert_eq!(tok2_upos, Some("NOUN"), "tok_2 upos");

        // ── Predicate ───────────────────────────────────────────────────
        let pred_vid = child(sent_vid, "pred-1-1");
        assert!(parsed.has_vertex(&pred_vid), "missing predicate");
        assert_eq!(parsed.vertices[pred_vid.as_str()].kind, "predicate");
        let from_predpatt = constraint_value(&parsed, &pred_vid, "frompredpatt");
        assert_eq!(from_predpatt, Some("true"));

        // ── Argument ────────────────────────────────────────────────────
        let arg_vid = child(sent_vid, "arg-1-1");
        assert!(parsed.has_vertex(&arg_vid), "missing argument");
        assert_eq!(parsed.vertices[arg_vid.as_str()].kind, "argument");

        // ── sem-dep edge ────────────────────────────────────────────────
        let pred_out = parsed.outgoing_edges(&pred_vid);
        let dep_count = pred_out
            .iter()
            .filter(|edge| edge.kind == "sem-dep")
            .count();
        assert_eq!(dep_count, 1, "expected 1 sem-dep edge");

        // ── Annotation subspace prop vertices ────────────────────────────
        let factual_vid = child(&pred_vid, "factuality.factual");
        assert!(
            parsed.has_vertex(&factual_vid),
            "missing factuality.factual"
        );
        assert_eq!(parsed.vertices[factual_vid.as_str()].kind, "float");
        let factual_value = constraint_value(&parsed, &factual_vid, "value");
        assert_eq!(factual_value, Some("0.9"));
        let factual_confidence = constraint_value(&parsed, &factual_vid, "confidence");
        assert_eq!(factual_confidence, Some("1"));

        let telic_vid = child(&pred_vid, "event_structure.telic");
        assert!(parsed.has_vertex(&telic_vid), "missing telic");

        let arg_gen_vid = child(&arg_vid, "genericity.arg-particular");
        assert!(parsed.has_vertex(&arg_gen_vid), "missing arg genericity");

        // ── Protorole prop vertices ──────────────────────────────────────
        let pr_aware_vid = child(&pred_vid, "pr.arg-1-1.awareness");
        assert!(
            parsed.has_vertex(&pr_aware_vid),
            "missing protorole awareness"
        );
        let aware_subspace = constraint_value(&parsed, &pr_aware_vid, "subspace");
        assert_eq!(aware_subspace, Some("protoroles"));
        let aware_property = constraint_value(&parsed, &pr_aware_vid, "property");
        assert_eq!(aware_property, Some("awareness"));
        let aware_value = constraint_value(&parsed, &pr_aware_vid, "value");
        assert_eq!(aware_value, Some("0.85"));

        // ── Interface edges ──────────────────────────────────────────────
        let pred_edges = parsed.outgoing_edges(&pred_vid);
        let pred_head_count = pred_edges
            .iter()
            .filter(|edge| edge.kind == "head")
            .count();
        assert_eq!(pred_head_count, 1, "predicate should have 1 head edge");

        let arg_edges = parsed.outgoing_edges(&arg_vid);
        let arg_head_count = arg_edges
            .iter()
            .filter(|edge| edge.kind == "head")
            .count();
        assert_eq!(arg_head_count, 1, "argument should have 1 head edge");

        // ── Roundtrip ────────────────────────────────────────────────────
        let emitted = emit_decomp(&parsed).expect("should emit");
        let reparsed = parse_decomp(&emitted).expect("should re-parse");
        assert_eq!(
            parsed.vertex_count(),
            reparsed.vertex_count(),
            "vertex count mismatch on roundtrip"
        );
    }
}