1use lattix::{GraphDocument, GraphEdge, GraphNode, KnowledgeGraph, Triple};
14
15#[must_use]
22pub fn grounded_to_graph_document(doc: &anno_core::GroundedDocument) -> GraphDocument {
23 let entities = doc.to_entities();
24 entities_to_graph_document(&entities, &[])
25}
26
27#[must_use]
29pub fn entities_to_graph_document(
30 entities: &[anno_core::Entity],
31 relations: &[anno_core::Relation],
32) -> GraphDocument {
33 let mut doc = GraphDocument::new();
34 let mut seen_nodes: std::collections::HashMap<String, usize> = std::collections::HashMap::new();
35 let mut entity_to_node: std::collections::HashMap<usize, String> =
36 std::collections::HashMap::new();
37
38 let get_node_id = |e: &anno_core::Entity| -> String {
39 if let Some(ref kb_id) = e.kb_id {
40 return kb_id.clone();
41 }
42 if let Some(canonical_id) = e.canonical_id {
43 return format!("coref_{}", canonical_id);
44 }
45 format!(
46 "{}:{}",
47 e.entity_type.as_label().to_lowercase(),
48 uri_safe(&e.text)
49 )
50 };
51
52 for (idx, entity) in entities.iter().enumerate() {
53 let node_id = get_node_id(entity);
54
55 if let Some(&existing_idx) = seen_nodes.get(&node_id) {
56 if let Some(count) = doc.nodes[existing_idx].properties.get_mut("mentions_count") {
57 if let Some(n) = count.as_u64() {
58 *count = serde_json::Value::from(n + 1);
59 }
60 }
61 entity_to_node.insert(idx, node_id);
62 continue;
63 }
64
65 let node = GraphNode::new(&node_id, entity.entity_type.as_label(), &entity.text)
66 .with_mentions_count(1)
67 .with_first_seen(entity.start());
68
69 seen_nodes.insert(node_id.clone(), doc.nodes.len());
70 entity_to_node.insert(idx, node_id);
71 doc.nodes.push(node);
72 }
73
74 let mut seen_edges: std::collections::HashMap<(String, String, String), usize> =
75 std::collections::HashMap::new();
76 for relation in relations {
77 let source_node_id = get_node_id(&relation.head);
78 let target_node_id = get_node_id(&relation.tail);
79
80 if seen_nodes.contains_key(&source_node_id) && seen_nodes.contains_key(&target_node_id) {
81 let key = (
82 source_node_id.clone(),
83 target_node_id.clone(),
84 relation.relation_type.clone(),
85 );
86 if let Some(&idx) = seen_edges.get(&key) {
87 if let Some(existing) = doc.edges.get_mut(idx) {
88 existing.confidence = existing.confidence.max(relation.confidence.value());
89 }
90 } else {
91 let edge =
92 GraphEdge::new(&source_node_id, &target_node_id, &relation.relation_type)
93 .with_confidence(relation.confidence.value());
94 doc.edges.push(edge);
95 seen_edges.insert(key, doc.edges.len().saturating_sub(1));
96 }
97 }
98 }
99 doc
100}
101
/// Produces an identifier-friendly copy of `s`.
///
/// Alphanumeric characters (as defined by [`char::is_alphanumeric`]), `_` and
/// `-` are kept as they are; every other character is replaced by a single
/// `_`. The result therefore has the same number of characters as the input.
pub fn uri_safe(s: &str) -> String {
    let mut out = String::with_capacity(s.len());
    for ch in s.chars() {
        let keep = matches!(ch, '_' | '-') || ch.is_alphanumeric();
        out.push(if keep { ch } else { '_' });
    }
    out
}
118
/// Escapes `s` for use between the double quotes of a string literal.
///
/// Backslash, double quote, line feed, carriage return and tab are rewritten
/// as the two-character sequences `\\`, `\"`, `\n`, `\r` and `\t`; all other
/// characters are copied through unchanged.
fn escape_literal(s: &str) -> String {
    let mut escaped = String::with_capacity(s.len());
    for ch in s.chars() {
        match ch {
            '\\' => escaped.push_str("\\\\"),
            '"' => escaped.push_str("\\\""),
            '\n' => escaped.push_str("\\n"),
            '\r' => escaped.push_str("\\r"),
            '\t' => escaped.push_str("\\t"),
            other => escaped.push(other),
        }
    }
    escaped
}
126
127#[must_use]
154pub fn entities_to_knowledge_graph(
155 entities: &[anno_core::Entity],
156 relations: &[anno_core::Relation],
157 doc_iri: &str,
158 base_uri: &str,
159) -> KnowledgeGraph {
160 let mut kg =
161 KnowledgeGraph::with_capacity(entities.len().max(1), entities.len() * 7 + relations.len());
162
163 let base = base_uri.trim_end_matches('/');
164 let anno_ns = format!("{}/vocab#", base);
165 let entity_ns = format!("{}/entity/", base);
166
167 const RDF_TYPE: &str = "http://www.w3.org/1999/02/22-rdf-syntax-ns#type";
168 const RDFS_LABEL: &str = "http://www.w3.org/2000/01/rdf-schema#label";
169 const PROV_SOURCE: &str = "http://www.w3.org/ns/prov#hadPrimarySource";
170 const XSD_INT: &str = "http://www.w3.org/2001/XMLSchema#integer";
171 const XSD_FLOAT: &str = "http://www.w3.org/2001/XMLSchema#float";
172
173 let entity_iris: Vec<String> = entities
175 .iter()
176 .enumerate()
177 .map(|(i, e)| {
178 format!(
179 "{}{}/{}_{}_{}/",
180 entity_ns,
181 e.entity_type.as_label().to_lowercase(),
182 i,
183 uri_safe(&e.text),
184 e.start(),
185 )
186 })
187 .collect();
188
189 for (idx, entity) in entities.iter().enumerate() {
190 let iri = &entity_iris[idx];
191 let type_iri = format!("{}{}Type", anno_ns, entity.entity_type.as_label());
192
193 kg.add_triple(Triple::new(iri.as_str(), RDF_TYPE, type_iri.as_str()));
194 kg.add_triple(Triple::new(
195 iri.as_str(),
196 RDFS_LABEL,
197 format!("\"{}\"", escape_literal(&entity.text)),
198 ));
199 kg.add_triple(Triple::new(
200 iri.as_str(),
201 format!("{}startOffset", anno_ns),
202 format!("\"{}\"^^<{}>", entity.start(), XSD_INT),
203 ));
204 kg.add_triple(Triple::new(
205 iri.as_str(),
206 format!("{}endOffset", anno_ns),
207 format!("\"{}\"^^<{}>", entity.end(), XSD_INT),
208 ));
209 kg.add_triple(Triple::new(
210 iri.as_str(),
211 format!("{}confidence", anno_ns),
212 format!("\"{}\"^^<{}>", entity.confidence, XSD_FLOAT),
213 ));
214 kg.add_triple(Triple::new(iri.as_str(), PROV_SOURCE, doc_iri));
215 kg.add_triple(Triple::new(
216 doc_iri,
217 format!("{}mentions", anno_ns),
218 iri.as_str(),
219 ));
220 }
221
222 let entity_lookup: std::collections::HashMap<(&str, usize, usize), usize> = entities
224 .iter()
225 .enumerate()
226 .map(|(i, e)| ((e.text.as_str(), e.start(), e.end()), i))
227 .collect();
228
229 for rel in relations {
231 let head_iri = entity_lookup
232 .get(&(rel.head.text.as_str(), rel.head.start(), rel.head.end()))
233 .map(|&i| entity_iris[i].as_str());
234 let tail_iri = entity_lookup
235 .get(&(rel.tail.text.as_str(), rel.tail.start(), rel.tail.end()))
236 .map(|&i| entity_iris[i].as_str());
237 if let (Some(h), Some(t)) = (head_iri, tail_iri) {
238 let pred = format!("{}/rel/{}", base, uri_safe(&rel.relation_type));
239 let mut triple = Triple::new(h, pred.as_str(), t);
240 if rel.confidence.value().is_finite() {
241 triple = triple.with_confidence(f32::from(rel.confidence));
242 }
243 kg.add_triple(triple);
244 }
245 }
246
247 kg
248}
249
#[cfg(test)]
mod tests {
    use super::*;
    use anno_core::{Entity, EntityType, Relation};

    /// Builds an entity fixture with a fixed confidence of 0.9.
    fn ent(text: &str, start: usize, end: usize, ty: EntityType) -> Entity {
        Entity::new(text, ty, start, end, 0.9)
    }

    /// Serialises every triple of `kg` with `to_ntriples`.
    fn ntriples(kg: &KnowledgeGraph) -> Vec<String> {
        kg.triples().map(|t| t.to_ntriples()).collect()
    }

    /// A single entity must yield at least type, label and provenance triples.
    #[test]
    fn kg_produces_type_label_provenance_triples() {
        let person = ent("Lynn Conway", 0, 11, EntityType::Person);
        let kg = entities_to_knowledge_graph(&[person], &[], "urn:test:doc/d1", "urn:test:");
        let lines = ntriples(&kg);

        let total = lines.len();
        assert!(total >= 6, "expected ≥6 triples, got {}", total);

        let has_type = lines
            .iter()
            .any(|l| l.contains("rdf-syntax-ns#type") && l.contains("PERType"));
        let has_label = lines.iter().any(|l| l.contains("rdf-schema#label"));
        let has_provenance = lines
            .iter()
            .any(|l| l.contains("prov#hadPrimarySource") || l.contains("prov/ns#"));

        assert!(has_type);
        assert!(has_label);
        assert!(has_provenance);
    }

    /// A relation between two known entities must show up as a `rel/...` triple.
    #[test]
    fn kg_includes_relation_arc() {
        let head = ent("Steve Jobs", 0, 10, EntityType::Person);
        let tail = ent("Apple", 19, 24, EntityType::Organization);
        let rel = Relation::new(head.clone(), tail.clone(), "founded", 0.85);

        let kg = entities_to_knowledge_graph(&[head, tail], &[rel], "urn:test:doc/d2", "urn:test:");
        let lines = ntriples(&kg);

        let has_relation = lines.iter().any(|l| l.contains("rel/founded"));
        assert!(
            has_relation,
            "missing relation triple; triples:\n{}",
            lines.join("\n")
        );
    }

    /// No entities and no relations must produce an empty graph.
    #[test]
    fn empty_entities_empty_kg() {
        let kg = entities_to_knowledge_graph(&[], &[], "urn:test:doc/empty", "urn:test:");
        assert_eq!(kg.triples().count(), 0);
    }

    /// Spaces are replaced by underscores; plain alphanumerics pass through.
    #[test]
    fn uri_safe_replaces_specials() {
        let cases = [
            ("Lynn Conway", "Lynn_Conway"),
            ("IBM", "IBM"),
            ("New York", "New_York"),
        ];
        for (input, expected) in cases.iter() {
            assert_eq!(uri_safe(input), *expected);
        }
    }
}