Skip to main content

aeo_graph_explorer/
graph.rs

1//! The in-memory typed graph.
2
3use std::collections::HashMap;
4
5use petgraph::graph::{DiGraph, NodeIndex};
6use serde::{Deserialize, Serialize};
7
8use crate::error::GraphError;
9use crate::model::AeoNode;
10
11/// Relationship kinds the explorer cares about.
12#[derive(Clone, Copy, Debug, Deserialize, Serialize, PartialEq, Eq)]
13#[serde(rename_all = "snake_case")]
14pub enum EdgeKind {
15    /// `from.peers[]` declared `to` as a peer entity.
16    DeclaresPeer,
17    /// `from.authority.primary_sources` chained through `to`.
18    CitesAuthority,
19}
20
21/// Container for a single loaded crawl.
22#[derive(Debug, Default)]
23pub struct AeoGraph {
24    graph: DiGraph<AeoNode, EdgeKind>,
25    index: HashMap<String, NodeIndex>,
26}
27
28impl AeoGraph {
29    /// Build a graph from JSONL — one AEO document per line, in the same shape
30    /// `aeo-crawler` emits. Edges are inferred from `peers` and
31    /// `authority.primary_sources` arrays.
32    pub fn from_jsonl(raw: &str) -> Result<Self, GraphError> {
33        let mut graph = Self::default();
34        for (line_idx, line) in raw.lines().enumerate() {
35            let line = line.trim();
36            if line.is_empty() {
37                continue;
38            }
39            let node: AeoNode = serde_json::from_str(line).map_err(|err| GraphError::JsonLine {
40                line: line_idx + 1,
41                source: err,
42            })?;
43            graph.upsert(node);
44        }
45        graph.wire_edges();
46        Ok(graph)
47    }
48
49    /// Insert or replace a node. Edge inference is deferred to
50    /// [`Self::wire_edges`] so bulk loads only pay for it once.
51    pub fn upsert(&mut self, node: AeoNode) -> NodeIndex {
52        if let Some(&idx) = self.index.get(&node.id) {
53            self.graph[idx] = node;
54            return idx;
55        }
56        let id = node.id.clone();
57        let idx = self.graph.add_node(node);
58        self.index.insert(id, idx);
59        idx
60    }
61
62    /// After all nodes are loaded, walk the bodies and wire up edges.
63    pub fn wire_edges(&mut self) {
64        // Snapshot ids -> indices so the mutable borrow doesn't fight us.
65        let snapshot: Vec<(NodeIndex, AeoNode)> = self
66            .graph
67            .node_indices()
68            .map(|i| (i, self.graph[i].clone()))
69            .collect();
70
71        for (from_idx, node) in &snapshot {
72            // Peers: `body.peers: [{ "id": "...", ... }, ...]`
73            if let Some(peers) = node.body.get("peers").and_then(|v| v.as_array()) {
74                for peer in peers {
75                    if let Some(peer_id) = peer.get("id").and_then(|v| v.as_str()) {
76                        if let Some(&peer_idx) = self.index.get(peer_id) {
77                            self.graph
78                                .add_edge(*from_idx, peer_idx, EdgeKind::DeclaresPeer);
79                        }
80                    }
81                }
82            }
83            // Authority: `body.authority.primary_sources: [url, ...]`
84            // Crawler-side convention is that primary_sources can be
85            // arbitrary URLs; we wire an edge only if a node with that id
86            // exists in the loaded graph.
87            if let Some(sources) = node
88                .body
89                .get("authority")
90                .and_then(|v| v.get("primary_sources"))
91                .and_then(|v| v.as_array())
92            {
93                for src in sources {
94                    if let Some(url) = src.as_str() {
95                        if let Some(&src_idx) = self.index.get(url) {
96                            if src_idx != *from_idx {
97                                self.graph
98                                    .add_edge(*from_idx, src_idx, EdgeKind::CitesAuthority);
99                            }
100                        }
101                    }
102                }
103            }
104        }
105    }
106
107    /// Look up a node by id.
108    pub fn node(&self, id: &str) -> Option<&AeoNode> {
109        self.index.get(id).map(|&i| &self.graph[i])
110    }
111
112    /// All loaded nodes.
113    pub fn nodes(&self) -> impl Iterator<Item = &AeoNode> {
114        self.graph.node_indices().map(|i| &self.graph[i])
115    }
116
117    /// Number of nodes in the graph.
118    pub fn node_count(&self) -> usize {
119        self.graph.node_count()
120    }
121
122    /// Number of edges in the graph.
123    pub fn edge_count(&self) -> usize {
124        self.graph.edge_count()
125    }
126
127    pub(crate) fn idx(&self, id: &str) -> Option<NodeIndex> {
128        self.index.get(id).copied()
129    }
130
131    pub(crate) fn raw(&self) -> &DiGraph<AeoNode, EdgeKind> {
132        &self.graph
133    }
134}