Skip to main content

oxirs_graphrag/
summarizer.rs

1//! # Knowledge Graph Subgraph Summarizer
2//!
3//! Cluster-based abstraction for summarising KG subgraphs.
4//!
5//! The [`SubgraphSummarizer`] groups nodes by their `node_type` into clusters,
6//! selects the most-connected node of each cluster as its representative, and
7//! produces both structured [`SummaryCluster`] data and a human-readable text
8//! summary paragraph.
9//!
10//! ## Example
11//!
12//! ```rust
13//! use oxirs_graphrag::summarizer::{
14//!     KgEdge, KgNode, KgSubgraph, SubgraphSummarizer,
15//! };
16//! use std::collections::HashMap;
17//!
18//! let mut graph = KgSubgraph::new();
19//! graph.add_node(KgNode {
20//!     id: "e1".to_string(),
21//!     label: "Alice".to_string(),
22//!     node_type: "Person".to_string(),
23//!     properties: HashMap::new(),
24//! });
25//! graph.add_node(KgNode {
26//!     id: "e2".to_string(),
27//!     label: "Bob".to_string(),
28//!     node_type: "Person".to_string(),
29//!     properties: HashMap::new(),
30//! });
31//! graph.add_edge(KgEdge {
32//!     source: "e1".to_string(),
33//!     target: "e2".to_string(),
34//!     relation: "knows".to_string(),
35//!     weight: 1.0,
36//! });
37//!
38//! let summarizer = SubgraphSummarizer::new();
39//! let clusters = summarizer.summarize(&graph, 10);
40//! assert_eq!(clusters.len(), 1); // one "Person" cluster
41//! ```
42
43use std::collections::HashMap;
44
45// ─── GraphSummarizer (v0.4.0) ────────────────────────────────────────────────
46
47mod graph_summarizer_impl {
48    use crate::graph::community::{CommunityConfig, CommunityDetector};
49    use crate::Triple;
50    use rayon::prelude::*;
51    use std::collections::HashMap;
52
53    /// A compact summary of a large KG subgraph, suitable for LLM context windows.
54    #[derive(Debug, Clone)]
55    pub struct GraphSummary {
56        /// Representative entity IRIs (one per community, by in-degree centrality).
57        pub entities: Vec<String>,
58        /// Selected triples involving representative nodes and top predicates.
59        pub relations: Vec<(String, String, String)>,
60        /// Human-readable label for each detected community.
61        pub community_labels: Vec<String>,
62    }
63
64    impl GraphSummary {
65        /// Serialise to a natural-language paragraph for LLM context.
66        pub fn to_text(&self) -> String {
67            if self.entities.is_empty() {
68                return "The graph is empty.".to_string();
69            }
70            let n_comm = self.community_labels.len();
71            let n_ent = self.entities.len();
72            let entity_list = self
73                .entities
74                .iter()
75                .take(5)
76                .cloned()
77                .collect::<Vec<_>>()
78                .join(", ");
79            // Collect unique predicates from selected relations, ordered by first appearance.
80            let mut seen_preds: std::collections::HashSet<String> =
81                std::collections::HashSet::new();
82            let mut pred_list: Vec<String> = Vec::new();
83            for (_, p, _) in &self.relations {
84                if seen_preds.insert(p.clone()) {
85                    pred_list.push(p.clone());
86                    if pred_list.len() >= 5 {
87                        break;
88                    }
89                }
90            }
91            format!(
92                "The graph contains {} {} across {} {}. \
93                 Key entities include: {}. \
94                 Primary relationships: {}.",
95                n_ent,
96                if n_ent == 1 { "entity" } else { "entities" },
97                n_comm,
98                if n_comm == 1 {
99                    "community"
100                } else {
101                    "communities"
102                },
103                entity_list,
104                if pred_list.is_empty() {
105                    "none".to_string()
106                } else {
107                    pred_list.join(", ")
108                },
109            )
110        }
111    }
112
113    /// Compresses a large KG subgraph (as raw triples) into a [`GraphSummary`]
114    /// capped at `max_nodes` entities and `max_triples` relations.
115    ///
116    /// Pipeline:
117    /// 1. Community detection via Leiden (reusing `src/graph/community.rs`).
118    /// 2. Per-community representative selection by in-degree centrality.
119    /// 3. Predicate frequency ranking.
120    /// 4. Triple selection and truncation.
121    pub struct GraphSummarizer {
122        pub max_nodes: usize,
123        pub max_triples: usize,
124    }
125
126    impl GraphSummarizer {
127        pub fn new(max_nodes: usize, max_triples: usize) -> Self {
128            Self {
129                max_nodes,
130                max_triples,
131            }
132        }
133
134        /// Summarise the given triples into a [`GraphSummary`].
135        pub fn summarize(&self, triples: &[(String, String, String)]) -> GraphSummary {
136            if triples.is_empty() {
137                return GraphSummary {
138                    entities: Vec::new(),
139                    relations: Vec::new(),
140                    community_labels: Vec::new(),
141                };
142            }
143
144            // ── Convert to crate Triple for community detector ────────────────
145            let core_triples: Vec<Triple> = triples
146                .iter()
147                .map(|(s, p, o)| Triple::new(s.clone(), p.clone(), o.clone()))
148                .collect();
149
150            // ── Build in-degree map: how many triples point TO each node ─────
151            let mut in_degree: HashMap<String, usize> = HashMap::new();
152            for (s, _, o) in triples {
153                // Initialise subject with 0 if not present (to ensure all nodes tracked).
154                in_degree.entry(s.clone()).or_insert(0);
155                *in_degree.entry(o.clone()).or_insert(0) += 1;
156            }
157
158            // ── Community detection via Leiden ────────────────────────────────
159            let config = CommunityConfig {
160                min_community_size: 1,
161                ..CommunityConfig::default()
162            };
163            let detector = CommunityDetector::new(config);
164            let communities = detector.detect(&core_triples).unwrap_or_default();
165
166            // ── Per-community: pick representative by max in-degree ───────────
167            // Use rayon for parallel centroid selection when there are many communities.
168            let representatives: Vec<(String, String)> = communities
169                .par_iter()
170                .enumerate()
171                .filter_map(|(idx, comm)| {
172                    let rep = comm
173                        .entities
174                        .iter()
175                        .max_by_key(|e| in_degree.get(e.as_str()).copied().unwrap_or(0))
176                        .cloned()?;
177                    let label = format!("Community {} ({} entities)", idx, comm.entities.len());
178                    Some((rep, label))
179                })
180                .collect();
181
182            // ── Predicate frequency ranking ───────────────────────────────────
183            let mut pred_freq: HashMap<&str, usize> = HashMap::new();
184            for (_, p, _) in triples {
185                *pred_freq.entry(p.as_str()).or_insert(0) += 1;
186            }
187            let mut pred_ranked: Vec<(&str, usize)> = pred_freq.into_iter().collect();
188            pred_ranked.sort_by(|a, b| b.1.cmp(&a.1).then(a.0.cmp(b.0)));
189            let top_predicates: std::collections::HashSet<&str> =
190                pred_ranked.iter().take(10).map(|(p, _)| *p).collect();
191
192            // ── Build entities list (capped at max_nodes) ────────────────────
193            let entities: Vec<String> = representatives
194                .iter()
195                .map(|(rep, _)| rep.clone())
196                .take(self.max_nodes)
197                .collect();
198
199            let rep_set: std::collections::HashSet<&str> =
200                entities.iter().map(|s| s.as_str()).collect();
201
202            // ── Select relations: triples involving representatives + top preds
203            let mut selected: Vec<(String, String, String)> = triples
204                .iter()
205                .filter(|(s, p, _)| {
206                    rep_set.contains(s.as_str()) && top_predicates.contains(p.as_str())
207                })
208                .map(|(s, p, o)| (s.clone(), p.clone(), o.clone()))
209                .take(self.max_triples)
210                .collect();
211
212            // If still below max_triples, fill from any triple with a representative subject.
213            if selected.len() < self.max_triples {
214                for (s, p, o) in triples {
215                    if selected.len() >= self.max_triples {
216                        break;
217                    }
218                    if rep_set.contains(s.as_str()) {
219                        let candidate = (s.clone(), p.clone(), o.clone());
220                        if !selected.contains(&candidate) {
221                            selected.push(candidate);
222                        }
223                    }
224                }
225            }
226
227            let community_labels: Vec<String> = representatives
228                .iter()
229                .take(self.max_nodes)
230                .map(|(_, label)| label.clone())
231                .collect();
232
233            GraphSummary {
234                entities,
235                relations: selected,
236                community_labels,
237            }
238        }
239    }
240} // mod graph_summarizer_impl
241
242pub use graph_summarizer_impl::{GraphSummarizer, GraphSummary};
243
244// ─── Graph primitives ─────────────────────────────────────────────────────────
245
246/// A node in a knowledge-graph subgraph.
247#[derive(Debug, Clone)]
248pub struct KgNode {
249    /// Unique node identifier (URI or local name).
250    pub id: String,
251    /// Human-readable label.
252    pub label: String,
253    /// Semantic type / RDF class.
254    pub node_type: String,
255    /// Arbitrary key-value properties.
256    pub properties: HashMap<String, String>,
257}
258
259impl KgNode {
260    /// Create a node with no properties.
261    pub fn simple(
262        id: impl Into<String>,
263        label: impl Into<String>,
264        node_type: impl Into<String>,
265    ) -> Self {
266        Self {
267            id: id.into(),
268            label: label.into(),
269            node_type: node_type.into(),
270            properties: HashMap::new(),
271        }
272    }
273}
274
275/// A directed, weighted edge between two KG nodes.
276#[derive(Debug, Clone)]
277pub struct KgEdge {
278    /// Source node identifier.
279    pub source: String,
280    /// Target node identifier.
281    pub target: String,
282    /// Relation type / predicate label.
283    pub relation: String,
284    /// Edge weight (e.g. confidence score).
285    pub weight: f64,
286}
287
288impl KgEdge {
289    /// Create an unweighted (weight = 1.0) edge.
290    pub fn unweighted(
291        source: impl Into<String>,
292        target: impl Into<String>,
293        relation: impl Into<String>,
294    ) -> Self {
295        Self {
296            source: source.into(),
297            target: target.into(),
298            relation: relation.into(),
299            weight: 1.0,
300        }
301    }
302}
303
304// ─── KgSubgraph ──────────────────────────────────────────────────────────────
305
306/// A subgraph of a knowledge graph, consisting of nodes and directed edges.
307#[derive(Debug, Clone, Default)]
308pub struct KgSubgraph {
309    /// All nodes in the subgraph.
310    pub nodes: Vec<KgNode>,
311    /// All edges in the subgraph.
312    pub edges: Vec<KgEdge>,
313}
314
315impl KgSubgraph {
316    /// Create an empty subgraph.
317    pub fn new() -> Self {
318        Self::default()
319    }
320
321    /// Add a node.
322    pub fn add_node(&mut self, node: KgNode) {
323        self.nodes.push(node);
324    }
325
326    /// Add an edge.
327    pub fn add_edge(&mut self, edge: KgEdge) {
328        self.edges.push(edge);
329    }
330
331    /// Number of nodes.
332    pub fn node_count(&self) -> usize {
333        self.nodes.len()
334    }
335
336    /// Number of edges.
337    pub fn edge_count(&self) -> usize {
338        self.edges.len()
339    }
340
341    /// Return the node with the given `id`, or `None`.
342    pub fn node(&self, id: &str) -> Option<&KgNode> {
343        self.nodes.iter().find(|n| n.id == id)
344    }
345}
346
347// ─── Summary output ──────────────────────────────────────────────────────────
348
349/// A cluster of KG nodes of the same type, with a representative and summary.
350#[derive(Debug, Clone)]
351pub struct SummaryCluster {
352    /// Sequential cluster identifier.
353    pub id: usize,
354    /// Node ID chosen as the representative (most-connected node in cluster).
355    pub representative_node: String,
356    /// All node IDs belonging to this cluster.
357    pub member_nodes: Vec<String>,
358    /// Number of edges whose both endpoints are within this cluster.
359    pub internal_edges: usize,
360    /// Brief human-readable label derived from the cluster's node type.
361    pub summary_label: String,
362}
363
364impl SummaryCluster {
365    /// Size of the cluster (number of member nodes).
366    pub fn size(&self) -> usize {
367        self.member_nodes.len()
368    }
369}
370
371// ─── SubgraphSummarizer ───────────────────────────────────────────────────────
372
373/// Produces cluster-based summaries of KG subgraphs.
374pub struct SubgraphSummarizer;
375
376impl SubgraphSummarizer {
377    /// Create a new summarizer (stateless).
378    pub fn new() -> Self {
379        Self
380    }
381
382    /// Summarise `graph` into at most `max_clusters` [`SummaryCluster`]s.
383    ///
384    /// ## Algorithm
385    ///
386    /// 1. Group nodes by `node_type`.
387    /// 2. If there are more types than `max_clusters`, merge the smallest groups
388    ///    into an `"Other"` cluster so that only `max_clusters` remain.
389    /// 3. For each cluster, count internal edges and pick the most-connected
390    ///    node (highest undirected degree within the whole graph) as
391    ///    the representative.
392    pub fn summarize(&self, graph: &KgSubgraph, max_clusters: usize) -> Vec<SummaryCluster> {
393        if graph.nodes.is_empty() || max_clusters == 0 {
394            return Vec::new();
395        }
396
397        // Group node IDs by node_type.
398        let mut type_groups: HashMap<String, Vec<String>> = HashMap::new();
399        for node in &graph.nodes {
400            type_groups
401                .entry(node.node_type.clone())
402                .or_default()
403                .push(node.id.clone());
404        }
405
406        // Sort groups deterministically by type name.
407        let mut groups: Vec<(String, Vec<String>)> = type_groups.into_iter().collect();
408        groups.sort_by(|a, b| a.0.cmp(&b.0));
409
410        // Merge overflow groups into "Other" if too many types.
411        let groups = if groups.len() > max_clusters {
412            let (keep, overflow) = groups.split_at(max_clusters - 1);
413            let mut merged = keep.to_vec();
414            let other_members: Vec<String> = overflow
415                .iter()
416                .flat_map(|(_, ids)| ids.iter().cloned())
417                .collect();
418            if !other_members.is_empty() {
419                merged.push(("Other".to_string(), other_members));
420            }
421            merged
422        } else {
423            groups
424        };
425
426        // Build undirected degree map over the entire graph.
427        let degree_map = build_degree_map(graph);
428
429        // Build the clusters.
430        groups
431            .into_iter()
432            .enumerate()
433            .map(|(cluster_id, (node_type, members))| {
434                // Pick representative: member with highest degree.
435                let representative = members
436                    .iter()
437                    .max_by_key(|id| degree_map.get(*id).copied().unwrap_or(0))
438                    .cloned()
439                    .unwrap_or_default();
440
441                // Count internal edges (both endpoints in this cluster).
442                let member_set: std::collections::HashSet<&str> =
443                    members.iter().map(|s| s.as_str()).collect();
444                let internal_edges = graph
445                    .edges
446                    .iter()
447                    .filter(|e| {
448                        member_set.contains(e.source.as_str())
449                            && member_set.contains(e.target.as_str())
450                    })
451                    .count();
452
453                SummaryCluster {
454                    id: cluster_id,
455                    representative_node: representative,
456                    member_nodes: members,
457                    internal_edges,
458                    summary_label: format!("{node_type} cluster"),
459                }
460            })
461            .collect()
462    }
463
464    /// Return the top-`top_n` relation types by frequency of occurrence.
465    ///
466    /// Ties are broken by relation name (alphabetical ascending).
467    pub fn extract_key_relations(&self, graph: &KgSubgraph, top_n: usize) -> Vec<(String, usize)> {
468        if top_n == 0 {
469            return Vec::new();
470        }
471        let mut counts: HashMap<String, usize> = HashMap::new();
472        for edge in &graph.edges {
473            *counts.entry(edge.relation.clone()).or_insert(0) += 1;
474        }
475        let mut sorted: Vec<(String, usize)> = counts.into_iter().collect();
476        sorted.sort_by(|a, b| b.1.cmp(&a.1).then_with(|| a.0.cmp(&b.0)));
477        sorted.truncate(top_n);
478        sorted
479    }
480
481    /// Compute the undirected degree of `node_id` in `graph`.
482    ///
483    /// Both outgoing and incoming edges are counted (a self-loop counts twice).
484    pub fn node_degree(&self, graph: &KgSubgraph, node_id: &str) -> usize {
485        graph
486            .edges
487            .iter()
488            .filter(|e| e.source == node_id || e.target == node_id)
489            .count()
490    }
491
492    /// Generate a human-readable summary paragraph from a set of clusters.
493    ///
494    /// The paragraph lists each cluster's representative node, member count, and
495    /// internal edge count, ending with the total number of clusters.
496    pub fn generate_text_summary(&self, clusters: &[SummaryCluster]) -> String {
497        if clusters.is_empty() {
498            return "The subgraph contains no identifiable clusters.".to_string();
499        }
500
501        let mut parts: Vec<String> = Vec::new();
502        for cluster in clusters {
503            parts.push(format!(
504                "The {} (representative: {}, {} members, {} internal edges)",
505                cluster.summary_label,
506                cluster.representative_node,
507                cluster.member_nodes.len(),
508                cluster.internal_edges,
509            ));
510        }
511
512        format!(
513            "{}. The subgraph contains {} cluster{}.",
514            parts.join(". "),
515            clusters.len(),
516            if clusters.len() == 1 { "" } else { "s" }
517        )
518    }
519}
520
521impl Default for SubgraphSummarizer {
522    fn default() -> Self {
523        Self::new()
524    }
525}
526
527// ─── Internal helpers ─────────────────────────────────────────────────────────
528
529/// Build an undirected-degree map over all edges in `graph`.
530fn build_degree_map(graph: &KgSubgraph) -> HashMap<String, usize> {
531    let mut map: HashMap<String, usize> = HashMap::new();
532    for edge in &graph.edges {
533        *map.entry(edge.source.clone()).or_insert(0) += 1;
534        if edge.source != edge.target {
535            *map.entry(edge.target.clone()).or_insert(0) += 1;
536        }
537    }
538    map
539}
540
541// ─── Tests ────────────────────────────────────────────────────────────────────
542
543#[cfg(test)]
544mod tests {
545    use super::*;
546
547    // ── Helpers ──────────────────────────────────────────────────────────────
548
549    fn node(id: &str, node_type: &str) -> KgNode {
550        KgNode::simple(id, id, node_type)
551    }
552
553    fn edge(src: &str, tgt: &str, rel: &str) -> KgEdge {
554        KgEdge::unweighted(src, tgt, rel)
555    }
556
557    fn make_graph_with_types(specs: &[(&str, &str)], edges: &[(&str, &str, &str)]) -> KgSubgraph {
558        let mut g = KgSubgraph::new();
559        for (id, typ) in specs {
560            g.add_node(node(id, typ));
561        }
562        for (s, t, r) in edges {
563            g.add_edge(edge(s, t, r));
564        }
565        g
566    }
567
568    // ── KgSubgraph basics ────────────────────────────────────────────────────
569
570    #[test]
571    fn test_new_subgraph_empty() {
572        let g = KgSubgraph::new();
573        assert_eq!(g.node_count(), 0);
574        assert_eq!(g.edge_count(), 0);
575    }
576
577    #[test]
578    fn test_add_node_increments_count() {
579        let mut g = KgSubgraph::new();
580        g.add_node(node("n1", "Person"));
581        assert_eq!(g.node_count(), 1);
582    }
583
584    #[test]
585    fn test_add_edge_increments_count() {
586        let mut g = KgSubgraph::new();
587        g.add_edge(edge("a", "b", "knows"));
588        assert_eq!(g.edge_count(), 1);
589    }
590
591    #[test]
592    fn test_node_lookup_found() {
593        let mut g = KgSubgraph::new();
594        g.add_node(node("alice", "Person"));
595        assert!(g.node("alice").is_some());
596        assert_eq!(g.node("alice").expect("node").node_type, "Person");
597    }
598
599    #[test]
600    fn test_node_lookup_missing() {
601        let g = KgSubgraph::new();
602        assert!(g.node("nope").is_none());
603    }
604
605    // ── KgNode simple constructor ─────────────────────────────────────────────
606
607    #[test]
608    fn test_kg_node_simple() {
609        let n = KgNode::simple("id1", "Label", "Type");
610        assert_eq!(n.id, "id1");
611        assert_eq!(n.label, "Label");
612        assert_eq!(n.node_type, "Type");
613        assert!(n.properties.is_empty());
614    }
615
616    // ── KgEdge unweighted ─────────────────────────────────────────────────────
617
618    #[test]
619    fn test_kg_edge_unweighted_weight_one() {
620        let e = KgEdge::unweighted("a", "b", "rel");
621        assert_eq!(e.weight, 1.0);
622    }
623
624    // ── summarize: empty graph ────────────────────────────────────────────────
625
626    #[test]
627    fn test_summarize_empty_graph() {
628        let g = KgSubgraph::new();
629        let s = SubgraphSummarizer::new();
630        assert!(s.summarize(&g, 10).is_empty());
631    }
632
633    #[test]
634    fn test_summarize_max_clusters_zero() {
635        let g = make_graph_with_types(&[("n1", "Person")], &[]);
636        let s = SubgraphSummarizer::new();
637        assert!(s.summarize(&g, 0).is_empty());
638    }
639
640    // ── summarize: single type ────────────────────────────────────────────────
641
642    #[test]
643    fn test_summarize_single_type_one_cluster() {
644        let g = make_graph_with_types(&[("a", "Person"), ("b", "Person"), ("c", "Person")], &[]);
645        let s = SubgraphSummarizer::new();
646        let clusters = s.summarize(&g, 5);
647        assert_eq!(clusters.len(), 1);
648        assert_eq!(clusters[0].member_nodes.len(), 3);
649    }
650
651    #[test]
652    fn test_summarize_cluster_label_contains_type() {
653        let g = make_graph_with_types(&[("a", "Organization")], &[]);
654        let s = SubgraphSummarizer::new();
655        let clusters = s.summarize(&g, 5);
656        assert!(
657            clusters[0].summary_label.contains("Organization"),
658            "label should mention the type: {}",
659            clusters[0].summary_label
660        );
661    }
662
663    // ── summarize: multiple types ─────────────────────────────────────────────
664
665    #[test]
666    fn test_summarize_two_types_two_clusters() {
667        let g = make_graph_with_types(&[("a", "Person"), ("b", "Person"), ("c", "Company")], &[]);
668        let s = SubgraphSummarizer::new();
669        let clusters = s.summarize(&g, 10);
670        assert_eq!(clusters.len(), 2);
671    }
672
673    #[test]
674    fn test_summarize_respects_max_clusters() {
675        let g = make_graph_with_types(&[("a", "T1"), ("b", "T2"), ("c", "T3"), ("d", "T4")], &[]);
676        let s = SubgraphSummarizer::new();
677        let clusters = s.summarize(&g, 2);
678        assert!(
679            clusters.len() <= 2,
680            "should have at most 2 clusters, got {}",
681            clusters.len()
682        );
683    }
684
685    #[test]
686    fn test_summarize_overflow_goes_to_other() {
687        let g = make_graph_with_types(
688            &[
689                ("a", "T1"),
690                ("b", "T2"),
691                ("c", "T3"),
692                ("d", "T4"),
693                ("e", "T5"),
694            ],
695            &[],
696        );
697        let s = SubgraphSummarizer::new();
698        let clusters = s.summarize(&g, 3);
699        // Should have 3 clusters: T1, T2, Other (T3+T4+T5)
700        assert_eq!(clusters.len(), 3);
701        let has_other = clusters.iter().any(|c| c.summary_label.contains("Other"));
702        assert!(has_other, "overflow should be merged into Other cluster");
703    }
704
705    // ── summarize: representative selection ──────────────────────────────────
706
707    #[test]
708    fn test_representative_is_most_connected() {
709        // "b" has degree 2, "a" and "c" have degree 1 — b should be representative
710        let g = make_graph_with_types(
711            &[("a", "Person"), ("b", "Person"), ("c", "Person")],
712            &[("a", "b", "knows"), ("b", "c", "knows")],
713        );
714        let s = SubgraphSummarizer::new();
715        let clusters = s.summarize(&g, 5);
716        assert_eq!(clusters.len(), 1);
717        assert_eq!(clusters[0].representative_node, "b");
718    }
719
720    #[test]
721    fn test_representative_exists_for_single_node_cluster() {
722        let g = make_graph_with_types(&[("solo", "Category")], &[]);
723        let s = SubgraphSummarizer::new();
724        let clusters = s.summarize(&g, 5);
725        assert_eq!(clusters[0].representative_node, "solo");
726    }
727
728    // ── summarize: internal edges ─────────────────────────────────────────────
729
730    #[test]
731    fn test_internal_edges_all_within_cluster() {
732        let g = make_graph_with_types(
733            &[("a", "T"), ("b", "T"), ("c", "T")],
734            &[("a", "b", "r"), ("b", "c", "r")],
735        );
736        let s = SubgraphSummarizer::new();
737        let clusters = s.summarize(&g, 5);
738        assert_eq!(clusters[0].internal_edges, 2);
739    }
740
741    #[test]
742    fn test_internal_edges_none_cross_cluster() {
743        // Edges between different types — should not appear as internal.
744        let g = make_graph_with_types(
745            &[("a", "T1"), ("b", "T1"), ("c", "T2")],
746            &[("a", "c", "cross"), ("b", "c", "cross")],
747        );
748        let s = SubgraphSummarizer::new();
749        let clusters = s.summarize(&g, 5);
750        for cluster in &clusters {
751            if cluster.summary_label.contains("T1") {
752                assert_eq!(cluster.internal_edges, 0);
753            }
754        }
755    }
756
757    // ── summarize: cluster IDs ────────────────────────────────────────────────
758
759    #[test]
760    fn test_cluster_ids_are_sequential() {
761        let g = make_graph_with_types(&[("a", "T1"), ("b", "T2"), ("c", "T3")], &[]);
762        let s = SubgraphSummarizer::new();
763        let clusters = s.summarize(&g, 10);
764        for (i, c) in clusters.iter().enumerate() {
765            assert_eq!(c.id, i);
766        }
767    }
768
769    // ── extract_key_relations ─────────────────────────────────────────────────
770
771    #[test]
772    fn test_key_relations_empty_graph() {
773        let g = KgSubgraph::new();
774        let s = SubgraphSummarizer::new();
775        assert!(s.extract_key_relations(&g, 5).is_empty());
776    }
777
778    #[test]
779    fn test_key_relations_top_n_zero() {
780        let mut g = KgSubgraph::new();
781        g.add_edge(edge("a", "b", "knows"));
782        let s = SubgraphSummarizer::new();
783        assert!(s.extract_key_relations(&g, 0).is_empty());
784    }
785
786    #[test]
787    fn test_key_relations_sorted_by_frequency() {
788        let g = make_graph_with_types(
789            &[("a", "T"), ("b", "T"), ("c", "T")],
790            &[
791                ("a", "b", "knows"),
792                ("b", "c", "knows"),
793                ("a", "c", "likes"),
794            ],
795        );
796        let s = SubgraphSummarizer::new();
797        let relations = s.extract_key_relations(&g, 5);
798        assert!(!relations.is_empty());
799        assert_eq!(relations[0].0, "knows");
800        assert_eq!(relations[0].1, 2);
801    }
802
803    #[test]
804    fn test_key_relations_truncated_to_top_n() {
805        let g = make_graph_with_types(
806            &[("a", "T"), ("b", "T"), ("c", "T")],
807            &[
808                ("a", "b", "r1"),
809                ("a", "b", "r2"),
810                ("a", "b", "r3"),
811                ("a", "b", "r4"),
812                ("a", "b", "r5"),
813            ],
814        );
815        let s = SubgraphSummarizer::new();
816        let relations = s.extract_key_relations(&g, 3);
817        assert!(relations.len() <= 3);
818    }
819
820    #[test]
821    fn test_key_relations_single_relation() {
822        let mut g = KgSubgraph::new();
823        for i in 0..5 {
824            g.add_edge(edge(&format!("n{i}"), &format!("n{}", i + 1), "knows"));
825        }
826        let s = SubgraphSummarizer::new();
827        let rels = s.extract_key_relations(&g, 1);
828        assert_eq!(rels.len(), 1);
829        assert_eq!(rels[0].0, "knows");
830        assert_eq!(rels[0].1, 5);
831    }
832
833    // ── node_degree ───────────────────────────────────────────────────────────
834
835    #[test]
836    fn test_node_degree_no_edges() {
837        let g = make_graph_with_types(&[("a", "T")], &[]);
838        let s = SubgraphSummarizer::new();
839        assert_eq!(s.node_degree(&g, "a"), 0);
840    }
841
842    #[test]
843    fn test_node_degree_outgoing_only() {
844        let g = make_graph_with_types(&[("a", "T"), ("b", "T")], &[("a", "b", "r")]);
845        let s = SubgraphSummarizer::new();
846        assert_eq!(s.node_degree(&g, "a"), 1);
847        assert_eq!(s.node_degree(&g, "b"), 1);
848    }
849
850    #[test]
851    fn test_node_degree_multiple_edges() {
852        let g = make_graph_with_types(
853            &[("a", "T"), ("b", "T"), ("c", "T")],
854            &[("a", "b", "r"), ("a", "c", "r"), ("b", "a", "r")],
855        );
856        let s = SubgraphSummarizer::new();
857        // a appears in edges: (a→b), (a→c), (b→a) = 3
858        assert_eq!(s.node_degree(&g, "a"), 3);
859    }
860
861    #[test]
862    fn test_node_degree_missing_node() {
863        let g = KgSubgraph::new();
864        let s = SubgraphSummarizer::new();
865        assert_eq!(s.node_degree(&g, "ghost"), 0);
866    }
867
868    // ── generate_text_summary ─────────────────────────────────────────────────
869
870    #[test]
871    fn test_text_summary_empty_clusters() {
872        let s = SubgraphSummarizer::new();
873        let text = s.generate_text_summary(&[]);
874        assert!(text.contains("no identifiable clusters"), "text: {text}");
875    }
876
877    #[test]
878    fn test_text_summary_single_cluster() {
879        let clusters = vec![SummaryCluster {
880            id: 0,
881            representative_node: "alice".to_string(),
882            member_nodes: vec!["alice".to_string(), "bob".to_string()],
883            internal_edges: 1,
884            summary_label: "Person cluster".to_string(),
885        }];
886        let s = SubgraphSummarizer::new();
887        let text = s.generate_text_summary(&clusters);
888        assert!(text.contains("alice"), "text: {text}");
889        assert!(text.contains("Person cluster"), "text: {text}");
890        assert!(text.contains("1 cluster"), "text: {text}");
891    }
892
893    #[test]
894    fn test_text_summary_multiple_clusters() {
895        let clusters = vec![
896            SummaryCluster {
897                id: 0,
898                representative_node: "a".to_string(),
899                member_nodes: vec!["a".to_string()],
900                internal_edges: 0,
901                summary_label: "Person cluster".to_string(),
902            },
903            SummaryCluster {
904                id: 1,
905                representative_node: "c".to_string(),
906                member_nodes: vec!["c".to_string(), "d".to_string()],
907                internal_edges: 1,
908                summary_label: "Company cluster".to_string(),
909            },
910        ];
911        let s = SubgraphSummarizer::new();
912        let text = s.generate_text_summary(&clusters);
913        assert!(text.contains("2 clusters"), "text: {text}");
914        assert!(text.contains("Person cluster"), "text: {text}");
915        assert!(text.contains("Company cluster"), "text: {text}");
916    }
917
918    #[test]
919    fn test_text_summary_contains_member_count() {
920        let clusters = vec![SummaryCluster {
921            id: 0,
922            representative_node: "x".to_string(),
923            member_nodes: vec!["x".to_string(), "y".to_string(), "z".to_string()],
924            internal_edges: 2,
925            summary_label: "T cluster".to_string(),
926        }];
927        let s = SubgraphSummarizer::new();
928        let text = s.generate_text_summary(&clusters);
929        assert!(text.contains("3 members"), "text: {text}");
930    }
931
932    #[test]
933    fn test_text_summary_contains_internal_edge_count() {
934        let clusters = vec![SummaryCluster {
935            id: 0,
936            representative_node: "x".to_string(),
937            member_nodes: vec!["x".to_string()],
938            internal_edges: 7,
939            summary_label: "T cluster".to_string(),
940        }];
941        let s = SubgraphSummarizer::new();
942        let text = s.generate_text_summary(&clusters);
943        assert!(text.contains("7 internal edges"), "text: {text}");
944    }
945
946    // ── SummaryCluster helpers ────────────────────────────────────────────────
947
948    #[test]
949    fn test_cluster_size() {
950        let c = SummaryCluster {
951            id: 0,
952            representative_node: "r".to_string(),
953            member_nodes: vec!["a".to_string(), "b".to_string(), "c".to_string()],
954            internal_edges: 0,
955            summary_label: "X cluster".to_string(),
956        };
957        assert_eq!(c.size(), 3);
958    }
959
960    // ── default impl ──────────────────────────────────────────────────────────
961
962    #[test]
963    fn test_default_summarizer() {
964        let s = SubgraphSummarizer;
965        let g = KgSubgraph::new();
966        assert!(s.summarize(&g, 5).is_empty());
967    }
968
969    #[test]
970    fn test_default_subgraph() {
971        let g = KgSubgraph::default();
972        assert_eq!(g.node_count(), 0);
973    }
974}
975
976// ─── GraphSummarizer tests ────────────────────────────────────────────────────
977
978#[cfg(test)]
979mod graph_summarizer_tests {
980    use super::GraphSummarizer;
981
982    /// Build a synthetic set of triples for testing.
983    fn sample_triples() -> Vec<(String, String, String)> {
984        vec![
985            ("Alice".into(), "knows".into(), "Bob".into()),
986            ("Bob".into(), "knows".into(), "Carol".into()),
987            ("Carol".into(), "knows".into(), "Alice".into()),
988            ("Alice".into(), "worksAt".into(), "ACME".into()),
989            ("Bob".into(), "worksAt".into(), "ACME".into()),
990            ("ACME".into(), "locatedIn".into(), "Berlin".into()),
991            ("Dave".into(), "knows".into(), "Eve".into()),
992            ("Eve".into(), "knows".into(), "Frank".into()),
993            ("Dave".into(), "worksAt".into(), "WidgetCo".into()),
994            ("WidgetCo".into(), "locatedIn".into(), "Paris".into()),
995        ]
996    }
997
998    #[test]
999    fn test_summary_respects_max_nodes() {
1000        let summarizer = GraphSummarizer::new(3, 20);
1001        let triples = sample_triples();
1002        let summary = summarizer.summarize(&triples);
1003        assert!(
1004            summary.entities.len() <= 3,
1005            "expected ≤3 entities, got {}",
1006            summary.entities.len()
1007        );
1008    }
1009
1010    #[test]
1011    fn test_summary_respects_max_triples() {
1012        let summarizer = GraphSummarizer::new(10, 4);
1013        let triples = sample_triples();
1014        let summary = summarizer.summarize(&triples);
1015        assert!(
1016            summary.relations.len() <= 4,
1017            "expected ≤4 relations, got {}",
1018            summary.relations.len()
1019        );
1020    }
1021
1022    #[test]
1023    fn test_to_text_non_empty_on_non_empty_graph() {
1024        let summarizer = GraphSummarizer::new(10, 20);
1025        let triples = sample_triples();
1026        let summary = summarizer.summarize(&triples);
1027        let text = summary.to_text();
1028        assert!(!text.is_empty(), "to_text() should not be empty");
1029    }
1030
1031    #[test]
1032    fn test_empty_graph_returns_empty_summary() {
1033        let summarizer = GraphSummarizer::new(10, 20);
1034        let summary = summarizer.summarize(&[]);
1035        assert!(
1036            summary.entities.is_empty(),
1037            "empty graph: entities should be empty"
1038        );
1039        assert!(
1040            summary.relations.is_empty(),
1041            "empty graph: relations should be empty"
1042        );
1043    }
1044
1045    #[test]
1046    fn test_community_labels_present_when_graph_has_nodes() {
1047        let summarizer = GraphSummarizer::new(10, 20);
1048        let triples = sample_triples();
1049        let summary = summarizer.summarize(&triples);
1050        assert!(
1051            !summary.community_labels.is_empty(),
1052            "community_labels should be non-empty when graph has nodes"
1053        );
1054    }
1055
1056    #[test]
1057    fn test_predicate_frequency_ordering() {
1058        // "knows" appears 5×, "worksAt" appears 3×, "locatedIn" appears 2×.
1059        // The most-used predicate must appear first among the relations.
1060        let summarizer = GraphSummarizer::new(10, 20);
1061        let triples = sample_triples();
1062        let summary = summarizer.summarize(&triples);
1063        // Collect the first predicate that appears in relations.
1064        if let Some((_, p, _)) = summary.relations.first() {
1065            assert_eq!(
1066                p.as_str(),
1067                "knows",
1068                "first predicate in relations should be 'knows' (most frequent), got '{p}'"
1069            );
1070        }
1071        // If relations are empty, the max_triples was 0 — skip the ordering check.
1072    }
1073}