Skip to main content

anno_graph/
lib.rs

1//! Adapters between `anno-core` and `lattix`.
2//!
3//! This crate exists to avoid introducing a dependency edge from `anno-core` → `lattix` while
4//! still letting downstream tooling treat `lattix` as the graph/KG substrate.
5//!
6//! ## Functions
7//!
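//! - [`entities_to_knowledge_graph`] — the preferred export path: takes raw extraction output
//!   (`Entity` + `Relation` slices) and emits a fully-annotated `KnowledgeGraph` including
//!   character-offset, confidence, and provenance triples. Used by `anno export --format
//!   graph-ntriples`.
//! - [`entities_to_graph_document`] — converts `Entity` + `Relation` slices into a
//!   `GraphDocument` of de-duplicated nodes and edges.
//! - [`grounded_to_graph_document`] — convenience wrapper that converts the entities of a
//!   `GroundedDocument` into a `GraphDocument` (entities only, no relations).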
12
13use lattix::{GraphDocument, GraphEdge, GraphNode, KnowledgeGraph, Triple};
14
15/// Convert a `GroundedDocument` into a `lattix::exchange::GraphDocument`.
16///
17/// **Note**: Relations are not currently stored in `GroundedDocument`, so this
18/// conversion only produces entity nodes and track-based edges. To include
19/// extraction-time relations, use [`entities_to_graph_document`] directly with
20/// the `Relation` slice from the extraction backend.
21#[must_use]
22pub fn grounded_to_graph_document(doc: &anno_core::GroundedDocument) -> GraphDocument {
23    let entities = doc.to_entities();
24    entities_to_graph_document(&entities, &[])
25}
26
27/// Convert entities and relations into a `lattix::exchange::GraphDocument`.
28#[must_use]
29pub fn entities_to_graph_document(
30    entities: &[anno_core::Entity],
31    relations: &[anno_core::Relation],
32) -> GraphDocument {
33    let mut doc = GraphDocument::new();
34    let mut seen_nodes: std::collections::HashMap<String, usize> = std::collections::HashMap::new();
35    let mut entity_to_node: std::collections::HashMap<usize, String> =
36        std::collections::HashMap::new();
37
38    let get_node_id = |e: &anno_core::Entity| -> String {
39        if let Some(ref kb_id) = e.kb_id {
40            return kb_id.clone();
41        }
42        if let Some(canonical_id) = e.canonical_id {
43            return format!("coref_{}", canonical_id);
44        }
45        format!(
46            "{}:{}",
47            e.entity_type.as_label().to_lowercase(),
48            uri_safe(&e.text)
49        )
50    };
51
52    for (idx, entity) in entities.iter().enumerate() {
53        let node_id = get_node_id(entity);
54
55        if let Some(&existing_idx) = seen_nodes.get(&node_id) {
56            if let Some(count) = doc.nodes[existing_idx].properties.get_mut("mentions_count") {
57                if let Some(n) = count.as_u64() {
58                    *count = serde_json::Value::from(n + 1);
59                }
60            }
61            entity_to_node.insert(idx, node_id);
62            continue;
63        }
64
65        let node = GraphNode::new(&node_id, entity.entity_type.as_label(), &entity.text)
66            .with_mentions_count(1)
67            .with_first_seen(entity.start());
68
69        seen_nodes.insert(node_id.clone(), doc.nodes.len());
70        entity_to_node.insert(idx, node_id);
71        doc.nodes.push(node);
72    }
73
74    let mut seen_edges: std::collections::HashMap<(String, String, String), usize> =
75        std::collections::HashMap::new();
76    for relation in relations {
77        let source_node_id = get_node_id(&relation.head);
78        let target_node_id = get_node_id(&relation.tail);
79
80        if seen_nodes.contains_key(&source_node_id) && seen_nodes.contains_key(&target_node_id) {
81            let key = (
82                source_node_id.clone(),
83                target_node_id.clone(),
84                relation.relation_type.clone(),
85            );
86            if let Some(&idx) = seen_edges.get(&key) {
87                if let Some(existing) = doc.edges.get_mut(idx) {
88                    existing.confidence = existing.confidence.max(relation.confidence.value());
89                }
90            } else {
91                let edge =
92                    GraphEdge::new(&source_node_id, &target_node_id, &relation.relation_type)
93                        .with_confidence(relation.confidence.value());
94                doc.edges.push(edge);
95                seen_edges.insert(key, doc.edges.len().saturating_sub(1));
96            }
97        }
98    }
99    doc
100}
101
102// ---------------------------------------------------------------------------
103// URI helpers (shared across all export paths)
104// ---------------------------------------------------------------------------
105
/// Make a string safe for use inside a URI path segment.
///
/// Alphanumeric characters (as defined by `char::is_alphanumeric`, so non-ASCII
/// letters and digits are kept), `_` and `-` pass through unchanged; every other
/// character is replaced by a single `_`.
pub fn uri_safe(s: &str) -> String {
    // The output never needs more bytes than the input: each replaced character
    // becomes a one-byte underscore.
    let mut out = String::with_capacity(s.len());
    for ch in s.chars() {
        let keep = ch.is_alphanumeric() || matches!(ch, '_' | '-');
        out.push(if keep { ch } else { '_' });
    }
    out
}
118
/// Escape `s` so it can be placed between double quotes in a serialised literal.
///
/// Backslash, double quote, newline, carriage return and tab are rewritten as the
/// two-character sequences `\\`, `\"`, `\n`, `\r` and `\t`; all other characters
/// are copied unchanged.
fn escape_literal(s: &str) -> String {
    let mut escaped = String::with_capacity(s.len());
    for ch in s.chars() {
        match ch {
            '\\' => escaped.push_str("\\\\"),
            '"' => escaped.push_str("\\\""),
            '\n' => escaped.push_str("\\n"),
            '\r' => escaped.push_str("\\r"),
            '\t' => escaped.push_str("\\t"),
            other => escaped.push(other),
        }
    }
    escaped
}
126
127// ---------------------------------------------------------------------------
128// Raw entity/relation extraction output → KnowledgeGraph
129// ---------------------------------------------------------------------------
130
131/// Build a fully-annotated `KnowledgeGraph` from raw NER + relation extraction output.
132///
133/// Each entity becomes a subject with:
134/// - `rdf:type` assertion
135/// - `rdfs:label` (surface text)
136/// - character offset and confidence typed literals
137/// - `prov:hadPrimarySource` provenance link to the document IRI
138///
139/// When `relations` is non-empty (i.e. the model supports relation extraction), each triple becomes
140/// a predicate arc: `<head_entity> <{base}/rel/{type}> <tail_entity>`.
141///
142/// # Arguments
143///
144/// - `entities` — extracted entity spans
145/// - `relations` — semantic triples; empty for entity-only backends
146/// - `doc_iri` — IRI identifying this document (e.g. `https://www.gutenberg.org/ebooks/doc/pg1342`)
147/// - `base_uri` — namespace prefix (e.g. `https://www.gutenberg.org/ebooks/`)
148///
149/// # Returns
150///
151/// A `KnowledgeGraph` whose triples can be serialised to N-Triples via
152/// `kg.triples().map(|t| t.to_ntriples()).collect::<Vec<_>>().join("\n")`.
153#[must_use]
154pub fn entities_to_knowledge_graph(
155    entities: &[anno_core::Entity],
156    relations: &[anno_core::Relation],
157    doc_iri: &str,
158    base_uri: &str,
159) -> KnowledgeGraph {
160    let mut kg =
161        KnowledgeGraph::with_capacity(entities.len().max(1), entities.len() * 7 + relations.len());
162
163    let base = base_uri.trim_end_matches('/');
164    let anno_ns = format!("{}/vocab#", base);
165    let entity_ns = format!("{}/entity/", base);
166
167    const RDF_TYPE: &str = "http://www.w3.org/1999/02/22-rdf-syntax-ns#type";
168    const RDFS_LABEL: &str = "http://www.w3.org/2000/01/rdf-schema#label";
169    const PROV_SOURCE: &str = "http://www.w3.org/ns/prov#hadPrimarySource";
170    const XSD_INT: &str = "http://www.w3.org/2001/XMLSchema#integer";
171    const XSD_FLOAT: &str = "http://www.w3.org/2001/XMLSchema#float";
172
173    // Stable per-doc entity IRIs used for both entity triples and relation arcs.
174    let entity_iris: Vec<String> = entities
175        .iter()
176        .enumerate()
177        .map(|(i, e)| {
178            format!(
179                "{}{}/{}_{}_{}/",
180                entity_ns,
181                e.entity_type.as_label().to_lowercase(),
182                i,
183                uri_safe(&e.text),
184                e.start(),
185            )
186        })
187        .collect();
188
189    for (idx, entity) in entities.iter().enumerate() {
190        let iri = &entity_iris[idx];
191        let type_iri = format!("{}{}Type", anno_ns, entity.entity_type.as_label());
192
193        kg.add_triple(Triple::new(iri.as_str(), RDF_TYPE, type_iri.as_str()));
194        kg.add_triple(Triple::new(
195            iri.as_str(),
196            RDFS_LABEL,
197            format!("\"{}\"", escape_literal(&entity.text)),
198        ));
199        kg.add_triple(Triple::new(
200            iri.as_str(),
201            format!("{}startOffset", anno_ns),
202            format!("\"{}\"^^<{}>", entity.start(), XSD_INT),
203        ));
204        kg.add_triple(Triple::new(
205            iri.as_str(),
206            format!("{}endOffset", anno_ns),
207            format!("\"{}\"^^<{}>", entity.end(), XSD_INT),
208        ));
209        kg.add_triple(Triple::new(
210            iri.as_str(),
211            format!("{}confidence", anno_ns),
212            format!("\"{}\"^^<{}>", entity.confidence, XSD_FLOAT),
213        ));
214        kg.add_triple(Triple::new(iri.as_str(), PROV_SOURCE, doc_iri));
215        kg.add_triple(Triple::new(
216            doc_iri,
217            format!("{}mentions", anno_ns),
218            iri.as_str(),
219        ));
220    }
221
222    // Build entity lookup by (text, start, end) for reliable relation matching.
223    let entity_lookup: std::collections::HashMap<(&str, usize, usize), usize> = entities
224        .iter()
225        .enumerate()
226        .map(|(i, e)| ((e.text.as_str(), e.start(), e.end()), i))
227        .collect();
228
229    // Semantic relation triples from RelationExtractor backends.
230    for rel in relations {
231        let head_iri = entity_lookup
232            .get(&(rel.head.text.as_str(), rel.head.start(), rel.head.end()))
233            .map(|&i| entity_iris[i].as_str());
234        let tail_iri = entity_lookup
235            .get(&(rel.tail.text.as_str(), rel.tail.start(), rel.tail.end()))
236            .map(|&i| entity_iris[i].as_str());
237        if let (Some(h), Some(t)) = (head_iri, tail_iri) {
238            let pred = format!("{}/rel/{}", base, uri_safe(&rel.relation_type));
239            let mut triple = Triple::new(h, pred.as_str(), t);
240            if rel.confidence.value().is_finite() {
241                triple = triple.with_confidence(f32::from(rel.confidence));
242            }
243            kg.add_triple(triple);
244        }
245    }
246
247    kg
248}
249
#[cfg(test)]
mod tests {
    use super::*;
    use anno_core::{Entity, EntityType, Relation};

    /// Fixture: an entity of type `ty` spanning `start..end`, created with confidence 0.9.
    fn ent(text: &str, start: usize, end: usize, ty: EntityType) -> Entity {
        Entity::new(text, ty, start, end, 0.9)
    }

    /// A single entity must yield at least six triples, among them the type,
    /// label and provenance assertions.
    #[test]
    fn kg_produces_type_label_provenance_triples() {
        let person = ent("Lynn Conway", 0, 11, EntityType::Person);
        let graph = entities_to_knowledge_graph(&[person], &[], "urn:test:doc/d1", "urn:test:");
        let lines: Vec<String> = graph.triples().map(|t| t.to_ntriples()).collect();

        assert!(
            lines.len() >= 6,
            "expected ≥6 triples, got {}",
            lines.len()
        );

        let has_type = lines
            .iter()
            .any(|l| l.contains("rdf-syntax-ns#type") && l.contains("PERType"));
        let has_label = lines.iter().any(|l| l.contains("rdf-schema#label"));
        let has_provenance = lines
            .iter()
            .any(|l| l.contains("prov#hadPrimarySource") || l.contains("prov/ns#"));

        assert!(has_type);
        assert!(has_label);
        assert!(has_provenance);
    }

    /// A relation whose head and tail are both in the entity slice must show up
    /// as a `rel/{type}` arc.
    #[test]
    fn kg_includes_relation_arc() {
        let founder = ent("Steve Jobs", 0, 10, EntityType::Person);
        let company = ent("Apple", 19, 24, EntityType::Organization);
        let founded = Relation::new(founder.clone(), company.clone(), "founded", 0.85);

        let graph = entities_to_knowledge_graph(
            &[founder, company],
            &[founded],
            "urn:test:doc/d2",
            "urn:test:",
        );
        let lines: Vec<String> = graph.triples().map(|t| t.to_ntriples()).collect();
        let found_arc = lines.iter().any(|l| l.contains("rel/founded"));
        assert!(
            found_arc,
            "missing relation triple; triples:\n{}",
            lines.join("\n")
        );
    }

    /// No entities and no relations must produce a graph without triples.
    #[test]
    fn empty_entities_empty_kg() {
        let graph = entities_to_knowledge_graph(&[], &[], "urn:test:doc/empty", "urn:test:");
        assert_eq!(graph.triples().count(), 0);
    }

    /// Spaces are replaced by underscores; plain alphanumerics are untouched.
    #[test]
    fn uri_safe_replaces_specials() {
        let cases = [
            ("Lynn Conway", "Lynn_Conway"),
            ("IBM", "IBM"),
            ("New York", "New_York"),
        ];
        for &(input, expected) in cases.iter() {
            assert_eq!(uri_safe(input), expected);
        }
    }
}