use std::collections::{HashMap, HashSet};
use uuid::Uuid;
use cognee_core::{HasDataPoint, ProvenanceContext, stamp_tree};
use cognee_models::{Entity, EntityType};
use cognee_ontology::traits::OntologyEdge;
use cognee_ontology::{AttachedOntologyNode, NodeCategory, OntologyResolver};
use tracing::warn;
use crate::fact_extraction::{KnowledgeGraph, Node};
use crate::graph_integration::types::{GraphEdgePair, GraphNodePair};
pub(crate) fn pre_stamp_extraction(
target: &mut dyn HasDataPoint,
user_label: Option<&str>,
visited: &mut HashSet<Uuid>,
) {
let ctx = ProvenanceContext {
pipeline_name: "cognify",
task_name: "extract_graph_from_data",
user_label,
node_set: None,
content_hash: None,
};
stamp_tree(target, &ctx, visited);
}
pub async fn expand_with_nodes_and_edges(
graphs: Vec<(Uuid, KnowledgeGraph)>,
dataset_id: Uuid,
existing_edges_set: &HashSet<String>,
ontology_resolver: &dyn OntologyResolver,
user_label: Option<&str>,
) -> (Vec<GraphNodePair>, Vec<GraphEdgePair>) {
let mut local_visited: HashSet<Uuid> = HashSet::new();
let mut node_map = HashMap::new();
let mut edge_map = HashMap::new();
let mut type_map = HashMap::new();
let mut node_id_to_entity_id: HashMap<String, Uuid> = HashMap::new();
let mut key_mapping: HashMap<String, String> = HashMap::new();
let mut ontology_types_map: HashMap<String, EntityType> = HashMap::new();
let mut ontology_entities_map: HashMap<String, GraphNodePair> = HashMap::new();
let mut ontology_edge_keys: HashSet<String> = HashSet::new();
let mut ontology_edges_out: Vec<GraphEdgePair> = Vec::new();
for (chunk_id, graph) in graphs {
for node in graph.nodes {
let type_key = format!("{}_type", node.node_type);
let effective_key = key_mapping
.get(&type_key)
.cloned()
.unwrap_or_else(|| type_key.clone());
if !type_map.contains_key(&effective_key) {
let mut et = EntityType::from_node_type(&node.node_type, Some(dataset_id));
pre_stamp_extraction(&mut et, user_label, &mut local_visited);
if ontology_resolver.is_loaded() {
match ontology_resolver.get_subgraph(&node.node_type, "classes", true) {
Ok((onto_nodes, onto_edges, Some(root_node))) => {
let canonical_name = root_node.name.clone();
et.mark_ontology_valid(Some(canonical_name.clone()));
et.base.id = ontology_name_to_uuid(&canonical_name);
let new_type_key = format!("{canonical_name}_type");
if new_type_key != type_key {
key_mapping.insert(type_key.clone(), new_type_key.clone());
}
process_ontology_nodes(
&onto_nodes,
dataset_id,
&node_map,
&type_map,
&mut ontology_types_map,
&mut ontology_entities_map,
user_label,
&mut local_visited,
);
process_ontology_edges(
&onto_edges,
existing_edges_set,
&mut ontology_edge_keys,
&mut ontology_edges_out,
);
type_map.insert(
if new_type_key != type_key {
new_type_key
} else {
effective_key.clone()
},
et,
);
}
Ok((_, _, None)) => {
type_map.insert(effective_key.clone(), et);
}
Err(e) => {
warn!(
"Ontology subgraph extraction failed for '{}': {}",
node.node_type, e
);
type_map.insert(effective_key.clone(), et);
}
}
} else {
type_map.insert(effective_key.clone(), et);
}
}
let resolved_key = key_mapping
.get(&type_key)
.cloned()
.unwrap_or_else(|| type_key.clone());
#[allow(clippy::expect_used, reason = "invariant is upheld by construction")]
let entity_type = type_map
.get(&resolved_key)
.expect("entity type was just inserted or already existed");
let entity_key = format!("{}_entity", node.id);
let mut deferred_individual_data = None;
if let std::collections::hash_map::Entry::Vacant(e) = node_map.entry(entity_key) {
let mut entity_pair = create_entity_node(
&node,
entity_type.clone(), dataset_id,
chunk_id,
);
pre_stamp_extraction(&mut entity_pair.entity, user_label, &mut local_visited);
if ontology_resolver.is_loaded() {
match ontology_resolver.get_subgraph(&node.name, "individuals", true) {
Ok((ont_nodes, ont_edges, Some(root_individual))) => {
let canonical_name = root_individual.name.clone();
entity_pair.entity.base.set_metadata(
"original_name",
serde_json::json!(entity_pair.entity.name.clone()),
);
entity_pair.entity.name = canonical_name.clone();
entity_pair.entity.base.id = ontology_name_to_uuid(&canonical_name);
entity_pair.entity.base.set_ontology_valid(true);
deferred_individual_data = Some((ont_nodes, ont_edges));
}
Ok((_, _, None)) => {}
Err(err) => {
warn!(
"Ontology individual lookup failed for '{}': {}",
node.name, err
);
}
}
}
node_id_to_entity_id.insert(node.id.clone(), entity_pair.entity.base.id);
e.insert(entity_pair);
}
if let Some((ont_nodes, ont_edges)) = deferred_individual_data {
process_ontology_nodes(
&ont_nodes,
dataset_id,
&node_map,
&type_map,
&mut ontology_types_map,
&mut ontology_entities_map,
user_label,
&mut local_visited,
);
process_ontology_edges(
&ont_edges,
existing_edges_set,
&mut ontology_edge_keys,
&mut ontology_edges_out,
);
}
}
for edge in graph.edges {
let Some(source_entity_id) = node_id_to_entity_id.get(&edge.source_node_id) else {
warn!(
"Skipping edge: source node '{}' not found in extracted nodes",
edge.source_node_id
);
continue;
};
let Some(target_entity_id) = node_id_to_entity_id.get(&edge.target_node_id) else {
warn!(
"Skipping edge: target node '{}' not found in extracted nodes",
edge.target_node_id
);
continue;
};
let edge_db_key = format!(
"{}_{}_{}",
source_entity_id, target_entity_id, edge.relationship_name
);
if existing_edges_set.contains(&edge_db_key) {
continue;
}
let edge_key = (
*source_entity_id,
*target_entity_id,
edge.relationship_name.clone(),
);
if let std::collections::hash_map::Entry::Vacant(e) = edge_map.entry(edge_key) {
let edge_text = edge
.description
.as_deref()
.map(str::trim)
.filter(|s| !s.is_empty())
.unwrap_or("")
.to_string();
let mut edge_pair = GraphEdgePair::new(
*source_entity_id,
*target_entity_id,
edge.relationship_name.clone(),
);
edge_pair.add_property("relationship_name", edge.relationship_name);
edge_pair.add_property("source_node_id", source_entity_id.to_string());
edge_pair.add_property("target_node_id", target_entity_id.to_string());
edge_pair.add_property("ontology_valid", "false");
edge_pair.add_property("edge_text", edge_text);
e.insert(edge_pair);
}
}
}
let mut graph_nodes: Vec<GraphNodePair> = node_map.into_values().collect();
for et in ontology_types_map.into_values() {
let entity = Entity::from_node(
&et.name,
&et.name,
format!("Ontology-derived type: {}", et.name),
et.base.id,
Some(dataset_id),
);
graph_nodes.push(GraphNodePair {
entity,
entity_type: et,
});
}
graph_nodes.extend(ontology_entities_map.into_values());
let mut graph_edges: Vec<GraphEdgePair> = edge_map.into_values().collect();
graph_edges.extend(ontology_edges_out);
(graph_nodes, graph_edges)
}
fn create_entity_node(
node: &Node,
entity_type: EntityType,
dataset_id: Uuid,
chunk_id: Uuid,
) -> GraphNodePair {
let entity = Entity::from_node(
&node.id,
&node.name,
&node.description,
entity_type.base.id,
Some(dataset_id),
);
let mut entity_with_chunk = entity;
entity_with_chunk
.base
.set_metadata("chunk_id", serde_json::json!(chunk_id.to_string()));
GraphNodePair {
entity: entity_with_chunk,
entity_type,
}
}
fn ontology_name_to_uuid(name: &str) -> Uuid {
let normalized = name.to_lowercase().replace(' ', "_").replace('\'', "");
Uuid::new_v5(&Uuid::NAMESPACE_OID, normalized.as_bytes())
}
fn normalize_edge_name(name: &str) -> String {
name.to_lowercase().replace(' ', "_").replace('\'', "")
}
#[allow(clippy::too_many_arguments)]
fn process_ontology_nodes(
ontology_nodes: &[AttachedOntologyNode],
dataset_id: Uuid,
node_map: &HashMap<String, GraphNodePair>,
type_map: &HashMap<String, EntityType>,
ontology_types_map: &mut HashMap<String, EntityType>,
ontology_entities_map: &mut HashMap<String, GraphNodePair>,
user_label: Option<&str>,
visited: &mut HashSet<Uuid>,
) {
for node in ontology_nodes {
let node_id = ontology_name_to_uuid(&node.name);
match node.category {
NodeCategory::Classes => {
let dedup_key = format!("{node_id}_type");
let llm_type_key = format!("{}_type", node.name);
if type_map.contains_key(&llm_type_key)
|| ontology_types_map.contains_key(&dedup_key)
{
continue;
}
let node_entity_key = format!("{node_id}_entity");
if node_map.contains_key(&node_entity_key) {
continue;
}
let mut et = EntityType::new(&node.name, &node.name, Some(dataset_id));
et.base.id = node_id;
et.base.set_ontology_valid(true);
pre_stamp_extraction(&mut et, user_label, visited);
ontology_types_map.insert(dedup_key, et);
}
NodeCategory::Individuals => {
let dedup_key = format!("{node_id}_entity");
if node_map.contains_key(&dedup_key)
|| ontology_entities_map.contains_key(&dedup_key)
{
continue;
}
let mut entity = Entity::new(&node.name, None, &node.name, Some(dataset_id));
entity.base.id = node_id;
entity.base.set_ontology_valid(true);
pre_stamp_extraction(&mut entity, user_label, visited);
let mut placeholder_et =
EntityType::new("OntologyIndividual", "", Some(dataset_id));
placeholder_et.base.id = ontology_name_to_uuid("ontologyindividual");
pre_stamp_extraction(&mut placeholder_et, user_label, visited);
let pair = GraphNodePair {
entity,
entity_type: placeholder_et,
};
ontology_entities_map.insert(dedup_key, pair);
}
}
}
}
fn process_ontology_edges(
ontology_edges: &[OntologyEdge],
existing_edge_keys: &HashSet<String>,
ontology_edge_keys: &mut HashSet<String>,
ontology_edges_out: &mut Vec<GraphEdgePair>,
) {
for (source, relation, target) in ontology_edges {
let source_id = ontology_name_to_uuid(source);
let target_id = ontology_name_to_uuid(target);
let rel_name = normalize_edge_name(relation);
let edge_key = format!("{source_id}_{target_id}_{rel_name}");
if existing_edge_keys.contains(&edge_key) || ontology_edge_keys.contains(&edge_key) {
continue;
}
let mut edge = GraphEdgePair::new(source_id, target_id, &rel_name);
edge.add_property("ontology_valid", "true");
edge.add_property("relationship_name", &rel_name);
edge.add_property("source_node_id", source_id.to_string());
edge.add_property("target_node_id", target_id.to_string());
ontology_edge_keys.insert(edge_key);
ontology_edges_out.push(edge);
}
}
#[cfg(test)]
#[allow(
clippy::unwrap_used,
clippy::expect_used,
reason = "test code — panics are acceptable failures"
)]
mod tests {
use super::*;
use crate::fact_extraction::Edge;
use cognee_ontology::{NoOpOntologyResolver, OntologyResult, traits::OntologySubgraph};
fn noop() -> NoOpOntologyResolver {
NoOpOntologyResolver::new()
}
fn create_test_graph() -> KnowledgeGraph {
KnowledgeGraph {
nodes: vec![
Node {
id: "techcorp_1".to_string(),
name: "TechCorp".to_string(),
node_type: "Organization".to_string(),
description: "A technology company".to_string(),
},
Node {
id: "alice_1".to_string(),
name: "Alice".to_string(),
node_type: "Person".to_string(),
description: "A software engineer".to_string(),
},
],
edges: vec![Edge {
source_node_id: "alice_1".to_string(),
target_node_id: "techcorp_1".to_string(),
relationship_name: "works_at".to_string(),
description: None,
}],
}
}
#[tokio::test]
async fn test_expand_single_graph() {
let graph = create_test_graph();
let chunk_id = Uuid::new_v4();
let dataset_id = Uuid::new_v4();
let (nodes, edges) = expand_with_nodes_and_edges(
vec![(chunk_id, graph)],
dataset_id,
&HashSet::new(),
&noop(),
None,
)
.await;
assert_eq!(nodes.len(), 2);
assert_eq!(edges.len(), 1);
assert_eq!(edges[0].relationship_name, "works_at");
let names: Vec<String> = nodes.iter().map(|n| n.entity.name.clone()).collect();
assert!(names.contains(&"TechCorp".to_string()));
assert!(names.contains(&"Alice".to_string()));
}
#[tokio::test]
async fn test_expand_deduplicates_nodes() {
let graph1 = create_test_graph();
let graph2 = create_test_graph();
let chunk_id = Uuid::new_v4();
let dataset_id = Uuid::new_v4();
let (nodes, edges) = expand_with_nodes_and_edges(
vec![(chunk_id, graph1), (chunk_id, graph2)],
dataset_id,
&HashSet::new(),
&noop(),
None,
)
.await;
assert_eq!(nodes.len(), 2);
assert_eq!(edges.len(), 1);
}
#[tokio::test]
async fn test_expand_creates_entity_types() {
let graph = create_test_graph();
let chunk_id = Uuid::new_v4();
let dataset_id = Uuid::new_v4();
let (nodes, _) = expand_with_nodes_and_edges(
vec![(chunk_id, graph)],
dataset_id,
&HashSet::new(),
&noop(),
None,
)
.await;
for node_pair in &nodes {
assert!(!node_pair.entity_type.name.is_empty());
assert_eq!(node_pair.entity_type.base.data_type, "EntityType");
}
let types: Vec<String> = nodes.iter().map(|n| n.entity_type.name.clone()).collect();
assert!(types.contains(&"Organization".to_string()));
assert!(types.contains(&"Person".to_string()));
}
#[tokio::test]
async fn test_expand_links_entities_to_types() {
let graph = create_test_graph();
let chunk_id = Uuid::new_v4();
let dataset_id = Uuid::new_v4();
let (nodes, _) = expand_with_nodes_and_edges(
vec![(chunk_id, graph)],
dataset_id,
&HashSet::new(),
&noop(),
None,
)
.await;
for node_pair in &nodes {
assert_eq!(node_pair.entity.is_a, Some(node_pair.entity_type.base.id));
}
}
#[tokio::test]
async fn test_expand_stores_chunk_reference() {
let graph = create_test_graph();
let chunk_id = Uuid::new_v4();
let dataset_id = Uuid::new_v4();
let (nodes, _) = expand_with_nodes_and_edges(
vec![(chunk_id, graph)],
dataset_id,
&HashSet::new(),
&noop(),
None,
)
.await;
for node_pair in &nodes {
let chunk_ref = node_pair.entity.base.get_metadata("chunk_id");
assert!(chunk_ref.is_some());
}
}
#[tokio::test]
async fn test_expand_missing_target_node_is_skipped() {
let graph = KnowledgeGraph {
nodes: vec![Node {
id: "alice_1".to_string(),
name: "Alice".to_string(),
node_type: "Person".to_string(),
description: "A person".to_string(),
}],
edges: vec![Edge {
source_node_id: "alice_1".to_string(),
target_node_id: "missing_node".to_string(), relationship_name: "knows".to_string(),
description: None,
}],
};
let chunk_id = Uuid::new_v4();
let dataset_id = Uuid::new_v4();
let (nodes, edges) = expand_with_nodes_and_edges(
vec![(chunk_id, graph)],
dataset_id,
&HashSet::new(),
&noop(),
None,
)
.await;
assert_eq!(nodes.len(), 1);
assert_eq!(edges.len(), 0);
}
#[tokio::test]
async fn test_expand_empty_graphs() {
let dataset_id = Uuid::new_v4();
let (nodes, edges) =
expand_with_nodes_and_edges(vec![], dataset_id, &HashSet::new(), &noop(), None).await;
assert_eq!(nodes.len(), 0);
assert_eq!(edges.len(), 0);
}
#[tokio::test]
async fn test_expand_multiple_edges_same_entities() {
let graph = KnowledgeGraph {
nodes: vec![
Node {
id: "alice_1".to_string(),
name: "Alice".to_string(),
node_type: "Person".to_string(),
description: "A person".to_string(),
},
Node {
id: "techcorp_1".to_string(),
name: "TechCorp".to_string(),
node_type: "Organization".to_string(),
description: "A company".to_string(),
},
],
edges: vec![
Edge {
source_node_id: "alice_1".to_string(),
target_node_id: "techcorp_1".to_string(),
relationship_name: "works_at".to_string(),
description: None,
},
Edge {
source_node_id: "alice_1".to_string(),
target_node_id: "techcorp_1".to_string(),
relationship_name: "founded".to_string(),
description: None,
},
],
};
let chunk_id = Uuid::new_v4();
let dataset_id = Uuid::new_v4();
let (nodes, edges) = expand_with_nodes_and_edges(
vec![(chunk_id, graph)],
dataset_id,
&HashSet::new(),
&noop(),
None,
)
.await;
assert_eq!(nodes.len(), 2);
assert_eq!(edges.len(), 2);
let relationships: Vec<String> =
edges.iter().map(|e| e.relationship_name.clone()).collect();
assert!(relationships.contains(&"works_at".to_string()));
assert!(relationships.contains(&"founded".to_string()));
}
#[tokio::test]
async fn test_expand_multiple_chunks_different_ids() {
let chunk_id_a = Uuid::new_v4();
let chunk_id_b = Uuid::new_v4();
let dataset_id = Uuid::new_v4();
let graph_a = KnowledgeGraph {
nodes: vec![Node {
id: "alice_1".to_string(),
name: "Alice".to_string(),
node_type: "Person".to_string(),
description: "A software engineer".to_string(),
}],
edges: vec![],
};
let graph_b = KnowledgeGraph {
nodes: vec![Node {
id: "bob_1".to_string(),
name: "Bob".to_string(),
node_type: "Person".to_string(),
description: "A data scientist".to_string(),
}],
edges: vec![],
};
let (nodes, _edges) = expand_with_nodes_and_edges(
vec![(chunk_id_a, graph_a), (chunk_id_b, graph_b)],
dataset_id,
&HashSet::new(),
&noop(),
None,
)
.await;
assert_eq!(nodes.len(), 2);
for node_pair in &nodes {
let chunk_ref = node_pair
.entity
.base
.get_metadata("chunk_id")
.expect("chunk_id metadata should be present");
if node_pair.entity.name == "Alice" {
assert_eq!(
chunk_ref.as_str().unwrap(),
chunk_id_a.to_string(),
"Alice should be tagged with chunk_id_a"
);
} else if node_pair.entity.name == "Bob" {
assert_eq!(
chunk_ref.as_str().unwrap(),
chunk_id_b.to_string(),
"Bob should be tagged with chunk_id_b"
);
} else {
panic!("Unexpected entity name: {}", node_pair.entity.name);
}
}
}
struct MockOntologyResolver;
impl OntologyResolver for MockOntologyResolver {
fn find_closest_match(&self, name: &str, category: &str) -> OntologyResult<Option<String>> {
match (name, category) {
("Alice", "individuals") => Ok(Some("Alice_Canonical".to_string())),
_ => Ok(None),
}
}
fn get_subgraph(
&self,
node_name: &str,
node_type: &str,
_directed: bool,
) -> OntologyResult<OntologySubgraph> {
match (node_name, node_type) {
("Organization", "classes") => {
let root = AttachedOntologyNode {
uri: "http://test.org#Organisation".to_string(),
name: "organisation".to_string(),
category: NodeCategory::Classes,
};
let ancestor = AttachedOntologyNode {
uri: "http://test.org#LegalEntity".to_string(),
name: "legalentity".to_string(),
category: NodeCategory::Classes,
};
Ok((
vec![ancestor],
vec![(
"organisation".to_string(),
"is_a".to_string(),
"legalentity".to_string(),
)],
Some(root),
))
}
("Person", "classes") => {
let root = AttachedOntologyNode {
uri: "http://test.org#Person".to_string(),
name: "person".to_string(),
category: NodeCategory::Classes,
};
Ok((vec![], vec![], Some(root)))
}
("Alice", "individuals") => {
let root = AttachedOntologyNode {
uri: "http://test.org#alice_canonical".to_string(),
name: "alice_canonical".to_string(),
category: NodeCategory::Individuals,
};
Ok((vec![], vec![], Some(root)))
}
_ => Ok((vec![], vec![], None)),
}
}
fn is_loaded(&self) -> bool {
true
}
}
#[tokio::test]
async fn test_expand_with_ontology_validates_entity_types() {
let graph = create_test_graph();
let chunk_id = Uuid::new_v4();
let dataset_id = Uuid::new_v4();
let resolver = MockOntologyResolver;
let (nodes, _edges) = expand_with_nodes_and_edges(
vec![(chunk_id, graph)],
dataset_id,
&HashSet::new(),
&resolver,
None,
)
.await;
assert!(
nodes.len() >= 2,
"Expected at least 2 nodes, got {}",
nodes.len()
);
let llm_nodes: Vec<_> = nodes
.iter()
.filter(|n| {
n.entity.name == "TechCorp"
|| n.entity.name == "Alice"
|| n.entity.name == "alice_canonical"
})
.collect();
assert_eq!(llm_nodes.len(), 2);
for node_pair in &llm_nodes {
assert!(
node_pair.entity_type.is_ontology_valid(),
"EntityType '{}' should be ontology-valid",
node_pair.entity_type.name
);
if node_pair.entity.name == "TechCorp" {
assert_eq!(node_pair.entity_type.name, "organisation");
} else if node_pair.entity.name == "alice_canonical" {
assert_eq!(node_pair.entity_type.name, "person");
assert!(
node_pair.entity.base.ontology_valid,
"Entity 'alice_canonical' should be ontology-valid"
);
assert_eq!(
node_pair.entity.base.get_metadata("original_name"),
Some(&serde_json::json!("Alice")),
);
}
}
}
#[tokio::test]
async fn test_expand_noop_resolver_leaves_entities_unvalidated() {
let graph = create_test_graph();
let chunk_id = Uuid::new_v4();
let dataset_id = Uuid::new_v4();
let (nodes, _edges) = expand_with_nodes_and_edges(
vec![(chunk_id, graph)],
dataset_id,
&HashSet::new(),
&noop(),
None,
)
.await;
for node_pair in &nodes {
assert!(
!node_pair.entity_type.is_ontology_valid(),
"EntityType '{}' should NOT be ontology-valid with NoOp resolver",
node_pair.entity_type.name
);
assert!(
!node_pair.entity.base.ontology_valid,
"Entity '{}' should NOT be ontology-valid with NoOp resolver",
node_pair.entity.name
);
}
}
#[test]
fn test_ontology_name_to_uuid_deterministic() {
let uuid_upper = ontology_name_to_uuid("Car");
let uuid_lower = ontology_name_to_uuid("car");
assert_eq!(uuid_upper, uuid_lower);
let expected = Uuid::new_v5(&Uuid::NAMESPACE_OID, b"car");
assert_eq!(uuid_upper, expected);
}
#[test]
fn test_normalize_edge_name() {
assert_eq!(normalize_edge_name("is a"), "is_a");
assert_eq!(normalize_edge_name("Is A"), "is_a");
assert_eq!(normalize_edge_name("don't know"), "dont_know");
}
#[test]
fn test_process_ontology_nodes_creates_entity_types_for_classes() {
let dataset_id = Uuid::new_v4();
let nodes = vec![
AttachedOntologyNode {
uri: "http://example.org#Vehicle".to_string(),
name: "Vehicle".to_string(),
category: NodeCategory::Classes,
},
AttachedOntologyNode {
uri: "http://example.org#Car".to_string(),
name: "Car".to_string(),
category: NodeCategory::Classes,
},
];
let node_map = HashMap::new();
let type_map = HashMap::new();
let mut ontology_types_map = HashMap::new();
let mut ontology_entities_map = HashMap::new();
process_ontology_nodes(
&nodes,
dataset_id,
&node_map,
&type_map,
&mut ontology_types_map,
&mut ontology_entities_map,
None,
&mut HashSet::new(),
);
assert_eq!(ontology_types_map.len(), 2);
assert!(ontology_entities_map.is_empty());
for et in ontology_types_map.values() {
assert!(et.base.ontology_valid);
}
let vehicle_key = format!("{}_type", ontology_name_to_uuid("Vehicle"));
let car_key = format!("{}_type", ontology_name_to_uuid("Car"));
assert!(ontology_types_map.contains_key(&vehicle_key));
assert!(ontology_types_map.contains_key(&car_key));
let vehicle_et = &ontology_types_map[&vehicle_key];
assert_eq!(vehicle_et.base.id, ontology_name_to_uuid("Vehicle"));
assert_eq!(vehicle_et.name, "Vehicle");
}
#[test]
fn test_process_ontology_nodes_skips_duplicates() {
let dataset_id = Uuid::new_v4();
let nodes = vec![AttachedOntologyNode {
uri: "http://example.org#Organization".to_string(),
name: "Organization".to_string(),
category: NodeCategory::Classes,
}];
let node_map = HashMap::new();
let mut type_map = HashMap::new();
type_map.insert(
"Organization_type".to_string(),
EntityType::new("Organization", "A type", Some(dataset_id)),
);
let mut ontology_types_map = HashMap::new();
let mut ontology_entities_map = HashMap::new();
process_ontology_nodes(
&nodes,
dataset_id,
&node_map,
&type_map,
&mut ontology_types_map,
&mut ontology_entities_map,
None,
&mut HashSet::new(),
);
assert!(ontology_types_map.is_empty());
}
#[test]
fn test_process_ontology_nodes_creates_entities_for_individuals() {
let dataset_id = Uuid::new_v4();
let nodes = vec![AttachedOntologyNode {
uri: "http://example.org#MyCar".to_string(),
name: "MyCar".to_string(),
category: NodeCategory::Individuals,
}];
let node_map = HashMap::new();
let type_map = HashMap::new();
let mut ontology_types_map = HashMap::new();
let mut ontology_entities_map = HashMap::new();
process_ontology_nodes(
&nodes,
dataset_id,
&node_map,
&type_map,
&mut ontology_types_map,
&mut ontology_entities_map,
None,
&mut HashSet::new(),
);
assert_eq!(ontology_entities_map.len(), 1);
assert!(ontology_types_map.is_empty());
let dedup_key = format!("{}_entity", ontology_name_to_uuid("MyCar"));
let pair = &ontology_entities_map[&dedup_key];
assert!(pair.entity.base.ontology_valid);
assert_eq!(pair.entity.base.id, ontology_name_to_uuid("MyCar"));
assert_eq!(pair.entity.name, "MyCar");
assert_eq!(pair.entity_type.name, "OntologyIndividual");
assert_eq!(
pair.entity_type.base.id,
ontology_name_to_uuid("ontologyindividual")
);
}
#[test]
fn test_process_ontology_edges_creates_edges() {
let edges: Vec<OntologyEdge> = vec![
("Car".to_string(), "is a".to_string(), "Vehicle".to_string()),
(
"Vehicle".to_string(),
"has part".to_string(),
"Engine".to_string(),
),
];
let existing_edge_keys = HashSet::new();
let mut ontology_edge_keys = HashSet::new();
let mut ontology_edges_out = Vec::new();
process_ontology_edges(
&edges,
&existing_edge_keys,
&mut ontology_edge_keys,
&mut ontology_edges_out,
);
assert_eq!(ontology_edges_out.len(), 2);
assert_eq!(ontology_edge_keys.len(), 2);
let car_id = ontology_name_to_uuid("Car");
let vehicle_id = ontology_name_to_uuid("Vehicle");
let edge0 = &ontology_edges_out[0];
assert_eq!(edge0.source_entity_id, car_id);
assert_eq!(edge0.target_entity_id, vehicle_id);
assert_eq!(edge0.relationship_name, "is_a");
assert_eq!(
edge0.properties.get("ontology_valid"),
Some(&"true".to_string())
);
assert_eq!(
edge0.properties.get("source_node_id"),
Some(&car_id.to_string())
);
assert_eq!(
edge0.properties.get("target_node_id"),
Some(&vehicle_id.to_string())
);
let engine_id = ontology_name_to_uuid("Engine");
let edge1 = &ontology_edges_out[1];
assert_eq!(edge1.source_entity_id, vehicle_id);
assert_eq!(edge1.target_entity_id, engine_id);
assert_eq!(edge1.relationship_name, "has_part");
}
#[test]
fn test_process_ontology_edges_skips_existing() {
let car_id = ontology_name_to_uuid("Car");
let vehicle_id = ontology_name_to_uuid("Vehicle");
let existing_key = format!("{}_{}_{}", car_id, vehicle_id, "is_a");
let mut existing_edge_keys = HashSet::new();
existing_edge_keys.insert(existing_key);
let edges: Vec<OntologyEdge> = vec![
("Car".to_string(), "is a".to_string(), "Vehicle".to_string()),
(
"Vehicle".to_string(),
"has part".to_string(),
"Engine".to_string(),
),
];
let mut ontology_edge_keys = HashSet::new();
let mut ontology_edges_out = Vec::new();
process_ontology_edges(
&edges,
&existing_edge_keys,
&mut ontology_edge_keys,
&mut ontology_edges_out,
);
assert_eq!(ontology_edges_out.len(), 1);
assert_eq!(ontology_edges_out[0].relationship_name, "has_part");
}
#[tokio::test]
async fn test_expand_ontology_adds_ancestor_type_nodes() {
let graph = KnowledgeGraph {
nodes: vec![Node {
id: "techcorp_1".to_string(),
name: "TechCorp".to_string(),
node_type: "Organization".to_string(),
description: "A technology company".to_string(),
}],
edges: vec![],
};
let chunk_id = Uuid::new_v4();
let dataset_id = Uuid::new_v4();
let resolver = MockOntologyResolver;
let (nodes, _edges) = expand_with_nodes_and_edges(
vec![(chunk_id, graph)],
dataset_id,
&HashSet::new(),
&resolver,
None,
)
.await;
assert!(
nodes.len() >= 2,
"Expected at least 2 nodes (LLM + ontology ancestor), got {}",
nodes.len()
);
let legalentity_node = nodes
.iter()
.find(|n| n.entity.name == "legalentity" || n.entity_type.name == "legalentity");
assert!(
legalentity_node.is_some(),
"Expected ontology-derived 'legalentity' node in output"
);
if let Some(le) = legalentity_node {
assert!(le.entity_type.base.ontology_valid || le.entity.base.ontology_valid);
}
}
#[tokio::test]
async fn test_expand_ontology_adds_is_a_edges() {
let graph = KnowledgeGraph {
nodes: vec![Node {
id: "techcorp_1".to_string(),
name: "TechCorp".to_string(),
node_type: "Organization".to_string(),
description: "A technology company".to_string(),
}],
edges: vec![],
};
let chunk_id = Uuid::new_v4();
let dataset_id = Uuid::new_v4();
let resolver = MockOntologyResolver;
let (_nodes, edges) = expand_with_nodes_and_edges(
vec![(chunk_id, graph)],
dataset_id,
&HashSet::new(),
&resolver,
None,
)
.await;
let is_a_edges: Vec<_> = edges
.iter()
.filter(|e| e.relationship_name == "is_a")
.collect();
assert_eq!(
is_a_edges.len(),
1,
"Expected exactly 1 is_a edge from ontology"
);
let is_a = &is_a_edges[0];
assert_eq!(is_a.source_entity_id, ontology_name_to_uuid("organisation"));
assert_eq!(is_a.target_entity_id, ontology_name_to_uuid("legalentity"));
assert_eq!(
is_a.properties.get("ontology_valid"),
Some(&"true".to_string())
);
}
#[tokio::test]
async fn test_expand_edges_connect_to_canonicalized_entities() {
let graph = create_test_graph(); let chunk_id = Uuid::new_v4();
let dataset_id = Uuid::new_v4();
let resolver = MockOntologyResolver;
let (nodes, edges) = expand_with_nodes_and_edges(
vec![(chunk_id, graph)],
dataset_id,
&HashSet::new(),
&resolver,
None,
)
.await;
let works_at: Vec<_> = edges
.iter()
.filter(|e| e.relationship_name == "works_at")
.collect();
assert_eq!(works_at.len(), 1, "Expected exactly 1 works_at edge");
let alice = nodes
.iter()
.find(|n| n.entity.name == "alice_canonical")
.expect("Alice should be canonicalized to 'alice_canonical'");
let techcorp = nodes
.iter()
.find(|n| n.entity.name == "TechCorp")
.expect("TechCorp entity should exist");
assert_eq!(
works_at[0].source_entity_id, alice.entity.base.id,
"Edge source should point to canonicalized Alice's UUID"
);
assert_eq!(
works_at[0].target_entity_id, techcorp.entity.base.id,
"Edge target should point to TechCorp's UUID"
);
}
#[tokio::test]
async fn test_expand_ontology_no_duplicate_derived_nodes() {
let graph = KnowledgeGraph {
nodes: vec![
Node {
id: "techcorp_1".to_string(),
name: "TechCorp".to_string(),
node_type: "Organization".to_string(),
description: "A tech company".to_string(),
},
Node {
id: "acmecorp_1".to_string(),
name: "AcmeCorp".to_string(),
node_type: "Organization".to_string(),
description: "Another company".to_string(),
},
],
edges: vec![],
};
let chunk_id = Uuid::new_v4();
let dataset_id = Uuid::new_v4();
let resolver = MockOntologyResolver;
let (nodes, edges) = expand_with_nodes_and_edges(
vec![(chunk_id, graph)],
dataset_id,
&HashSet::new(),
&resolver,
None,
)
.await;
assert!(nodes.iter().any(|n| n.entity.name == "TechCorp"));
assert!(nodes.iter().any(|n| n.entity.name == "AcmeCorp"));
let tc = nodes.iter().find(|n| n.entity.name == "TechCorp").unwrap();
let ac = nodes.iter().find(|n| n.entity.name == "AcmeCorp").unwrap();
assert_eq!(tc.entity_type.base.id, ac.entity_type.base.id);
let legalentity_count = nodes
.iter()
.filter(|n| n.entity.name == "legalentity" || n.entity_type.name == "legalentity")
.count();
assert_eq!(
legalentity_count, 1,
"legalentity ancestor should appear exactly once"
);
let is_a_edges: Vec<_> = edges
.iter()
.filter(|e| e.relationship_name == "is_a")
.collect();
assert_eq!(is_a_edges.len(), 1, "Expected exactly 1 is_a edge");
}
#[tokio::test]
async fn test_expand_ontology_mixed_validated_and_unvalidated() {
let graph = KnowledgeGraph {
nodes: vec![
Node {
id: "techcorp_1".to_string(),
name: "TechCorp".to_string(),
node_type: "Organization".to_string(),
description: "A tech company".to_string(),
},
Node {
id: "quantum_1".to_string(),
name: "QuantumTheory".to_string(),
node_type: "Concept".to_string(),
description: "A scientific concept".to_string(),
},
],
edges: vec![],
};
let chunk_id = Uuid::new_v4();
let dataset_id = Uuid::new_v4();
let resolver = MockOntologyResolver;
let (nodes, _edges) = expand_with_nodes_and_edges(
vec![(chunk_id, graph)],
dataset_id,
&HashSet::new(),
&resolver,
None,
)
.await;
let tc = nodes.iter().find(|n| n.entity.name == "TechCorp").unwrap();
let qt = nodes
.iter()
.find(|n| n.entity.name == "QuantumTheory")
.unwrap();
assert!(tc.entity_type.is_ontology_valid());
assert_eq!(tc.entity_type.name, "organisation");
assert!(!qt.entity_type.is_ontology_valid());
assert_eq!(qt.entity_type.name, "Concept");
}
#[tokio::test]
async fn pre_stamp_sets_pipeline_and_task_on_entity_types() {
let dataset_id = Uuid::new_v4();
let chunk_id = Uuid::new_v4();
let graph = create_test_graph();
let (nodes, _edges) = expand_with_nodes_and_edges(
vec![(chunk_id, graph)],
dataset_id,
&HashSet::new(),
&noop(),
Some("alice@example.com"),
)
.await;
assert!(!nodes.is_empty(), "expected at least one node");
for pair in &nodes {
assert_eq!(
pair.entity_type.base.source_pipeline.as_deref(),
Some("cognify"),
"EntityType '{}' should be pre-stamped with cognify",
pair.entity_type.name
);
assert_eq!(
pair.entity_type.base.source_task.as_deref(),
Some("extract_graph_from_data"),
"EntityType '{}' should be pre-stamped with extract_graph_from_data",
pair.entity_type.name
);
assert_eq!(
pair.entity_type.base.source_user.as_deref(),
Some("alice@example.com"),
"EntityType '{}' should carry the supplied user_label",
pair.entity_type.name
);
assert_eq!(
pair.entity.base.source_pipeline.as_deref(),
Some("cognify"),
"Entity '{}' should be pre-stamped with cognify",
pair.entity.name
);
assert_eq!(
pair.entity.base.source_task.as_deref(),
Some("extract_graph_from_data"),
"Entity '{}' should be pre-stamped with extract_graph_from_data",
pair.entity.name
);
assert_eq!(
pair.entity.base.source_user.as_deref(),
Some("alice@example.com"),
"Entity '{}' should carry the supplied user_label",
pair.entity.name
);
}
}
}