Skip to main content

dsfb_debug/
graph_inference.rs

1//! DSFB-Debug: service-call graph auto-inference (no_std).
2//!
3//! # Why auto-inference matters
4//!
5//! `causality::attribute_root_causes` (paper §10) requires a service
6//! dependency graph as input. Earlier versions made the graph an
7//! operator-supplied prerequisite: a deployment couldn't run
8//! root-cause attribution until a human curated the
9//! parent→child service map. This module removes that prerequisite
10//! by recovering the graph directly from observed span parent-child
11//! links in the trace stream.
12//!
13//! # Algorithm — Tarjan's SCC
14//!
15//! Tarjan's strongly-connected-components algorithm
16//! (Tarjan 1972, "Depth-first search and linear graph algorithms",
17//! SIAM J. Comput. 1(2):146-160) handles cyclic dependency
18//! structures (e.g.\ mutual-recursion service pairs). Each SCC
19//! contracts to a single supernode; the resulting DAG is the
20//! service dependency graph that
21//! `causality::attribute_root_causes` consumes.
22//!
23//! # Determinism (Theorem 9 preserved)
24//!
25//! The entire graph-walk is a pure function of the input edges.
26//! Tarjan's algorithm is deterministic given a fixed input
27//! ordering; the caller's responsibility is to pass `ServiceEdge`
28//! slices in canonical order (lexicographic sort on
29//! `(parent_service_idx, child_service_idx)` is sufficient).
30//!
31//! # Input shape
32//!
33//! `ServiceEdge { parent_service_idx, child_service_idx }` —
34//! indices into a caller-managed `signal_names` table. The minimal
35//! Jaeger / OTLP span shape suffices: `service_name`, `span_id`,
36//! `parent_span_id` map to ServiceEdges via a one-pass scan over
37//! the span list.
38
39#![allow(clippy::too_many_arguments)]
40
41/// One observed parent->child invocation between services.
42/// `parent_service_idx` and `child_service_idx` are indices into a
43/// caller-managed `signal_names` table.
44#[derive(Copy, Clone, Debug, PartialEq, Eq)]
45pub struct ServiceEdge {
46    pub parent: u16,
47    pub child: u16,
48}
49
50/// Maximum SCC graph size handled. Larger graphs return an empty
51/// result; in practice 256 services covers most microservice meshes.
52const MAX_NODES: usize = 256;
53
54/// Infer a parent->child service edge list from raw observation pairs.
55///
56/// Deduplicates input edges and collapses self-loops. Returns the edges
57/// in canonical (lowest-parent, then lowest-child) order so downstream
58/// causality attribution is deterministic.
59///
60/// `observed` may contain duplicates, self-loops, and out-of-range
61/// indices; this function tolerates all three. Edges with either
62/// endpoint >= num_services are dropped.
63pub fn infer_service_graph_from_observed(
64    observed: &[ServiceEdge],
65    num_services: usize,
66    out_edges: &mut [(u16, u16)],
67) -> usize {
68    if num_services == 0 || num_services > MAX_NODES {
69        return 0;
70    }
71    // Adjacency bitset (out_edges) and (in_edges) for dedup.
72    let mut seen = [[false; MAX_NODES]; MAX_NODES];
73
74    let mut count: usize = 0;
75    let mut i = 0;
76    while i < observed.len() {
77        let e = observed[i];
78        let p = e.parent as usize;
79        let c = e.child as usize;
80        if p < num_services && c < num_services && p != c && !seen[p][c] {
81            seen[p][c] = true;
82            // Emit in row-major (parent-major, child-minor) order — but
83            // since we're scanning input in caller order, we'll re-sort
84            // at the end for canonical output.
85            if count < out_edges.len() {
86                out_edges[count] = (e.parent, e.child);
87                count += 1;
88            }
89        }
90        i += 1;
91    }
92
93    // Canonical sort: lexicographic (parent, child).
94    canonical_sort(&mut out_edges[..count]);
95    count
96}
97
98fn canonical_sort(edges: &mut [(u16, u16)]) {
99    // Insertion sort — adequate for small N; deterministic.
100    let n = edges.len();
101    let mut i = 1;
102    while i < n {
103        let cur = edges[i];
104        let mut j = i;
105        while j > 0 && (edges[j - 1].0 > cur.0
106                        || (edges[j - 1].0 == cur.0 && edges[j - 1].1 > cur.1))
107        {
108            edges[j] = edges[j - 1];
109            j -= 1;
110        }
111        edges[j] = cur;
112        i += 1;
113    }
114}
115
116/// Strongly-connected-component count via Tarjan's algorithm.
117/// Returns the SCC ID per node into `scc_id_out` (length must be >=
118/// num_services), and the total SCC count.
119///
120/// Used for cycle detection: if SCC count < num_services, the graph
121/// has at least one cycle and the operator should consider whether the
122/// causality attribution algorithm's "first slew → upstream-most"
123/// heuristic is valid for their topology (it is for DAGs; ambiguous
124/// for graphs with cycles).
125pub fn tarjan_scc(
126    edges: &[(u16, u16)],
127    num_services: usize,
128    scc_id_out: &mut [u16],
129) -> usize {
130    if num_services == 0 || num_services > MAX_NODES || scc_id_out.len() < num_services {
131        return 0;
132    }
133
134    // Adjacency list constructed from edges. Use fixed-size arrays.
135    let mut adj_start = [0_u16; MAX_NODES + 1];
136    let mut adj_targets = [0_u16; MAX_NODES * MAX_NODES];
137    // Two-pass: count outgoing per node, then fill.
138    let mut out_count = [0_u16; MAX_NODES];
139    let mut e = 0;
140    while e < edges.len() {
141        let (p, _c) = edges[e];
142        if (p as usize) < num_services {
143            out_count[p as usize] += 1;
144        }
145        e += 1;
146    }
147    let mut acc = 0_u16;
148    let mut k = 0;
149    while k <= num_services {
150        adj_start[k] = acc;
151        if k < num_services {
152            acc = acc.saturating_add(out_count[k]);
153        }
154        k += 1;
155    }
156    let mut filled = [0_u16; MAX_NODES];
157    e = 0;
158    while e < edges.len() {
159        let (p, c) = edges[e];
160        if (p as usize) < num_services && (c as usize) < num_services {
161            let pos = adj_start[p as usize] + filled[p as usize];
162            if (pos as usize) < adj_targets.len() {
163                adj_targets[pos as usize] = c;
164                filled[p as usize] += 1;
165            }
166        }
167        e += 1;
168    }
169
170    // Tarjan state.
171    let mut index = 0_u32;
172    let mut stack = [0_u16; MAX_NODES];
173    let mut on_stack = [false; MAX_NODES];
174    let mut top = 0_usize;
175    let mut node_index = [u32::MAX; MAX_NODES];
176    let mut node_lowlink = [0_u32; MAX_NODES];
177    let mut scc_count = 0_usize;
178
179    // Iterative DFS — Rust no_std friendly (no recursion limit).
180    let mut dfs_stack = [(0_u16, 0_u16); MAX_NODES];
181    let mut dfs_top = 0_usize;
182
183    for v0 in 0..num_services {
184        if node_index[v0] != u32::MAX {
185            continue;
186        }
187        // Push v0 with iterator pos 0
188        dfs_stack[dfs_top] = (v0 as u16, 0);
189        dfs_top += 1;
190        node_index[v0] = index;
191        node_lowlink[v0] = index;
192        index += 1;
193        stack[top] = v0 as u16;
194        on_stack[v0] = true;
195        top += 1;
196
197        while dfs_top > 0 {
198            let (v, mut iter_pos) = dfs_stack[dfs_top - 1];
199            let start = adj_start[v as usize];
200            let end = adj_start[v as usize + 1];
201            let mut advanced = false;
202            while (start + iter_pos) < end {
203                let w = adj_targets[(start + iter_pos) as usize];
204                iter_pos += 1;
205                if node_index[w as usize] == u32::MAX {
206                    // Recurse on w
207                    dfs_stack[dfs_top - 1].1 = iter_pos;
208                    dfs_stack[dfs_top] = (w, 0);
209                    dfs_top += 1;
210                    node_index[w as usize] = index;
211                    node_lowlink[w as usize] = index;
212                    index += 1;
213                    if top < MAX_NODES {
214                        stack[top] = w;
215                        on_stack[w as usize] = true;
216                        top += 1;
217                    }
218                    advanced = true;
219                    break;
220                } else if on_stack[w as usize] {
221                    let lw = node_lowlink[v as usize];
222                    let iw = node_index[w as usize];
223                    if iw < lw {
224                        node_lowlink[v as usize] = iw;
225                    }
226                }
227            }
228            if advanced {
229                continue;
230            }
231            // Finished iterating v's neighbours.
232            // Pop one frame; propagate lowlink to parent.
233            dfs_top -= 1;
234            if dfs_top > 0 {
235                let parent = dfs_stack[dfs_top - 1].0;
236                let lp = node_lowlink[parent as usize];
237                let lv = node_lowlink[v as usize];
238                if lv < lp {
239                    node_lowlink[parent as usize] = lv;
240                }
241            }
242            // If v is an SCC root, pop the SCC.
243            if node_lowlink[v as usize] == node_index[v as usize] {
244                loop {
245                    if top == 0 {
246                        break;
247                    }
248                    top -= 1;
249                    let w = stack[top];
250                    on_stack[w as usize] = false;
251                    scc_id_out[w as usize] = scc_count as u16;
252                    if w == v {
253                        break;
254                    }
255                }
256                scc_count += 1;
257            }
258        }
259    }
260    scc_count
261}
262
263#[cfg(test)]
264mod tests {
265    use super::*;
266
267    #[test]
268    fn deduplicates_and_drops_self_loops() {
269        let observed = [
270            ServiceEdge { parent: 0, child: 1 },
271            ServiceEdge { parent: 0, child: 1 }, // duplicate
272            ServiceEdge { parent: 1, child: 1 }, // self-loop
273            ServiceEdge { parent: 2, child: 0 },
274        ];
275        let mut out = [(0_u16, 0_u16); 16];
276        let n = infer_service_graph_from_observed(&observed, 3, &mut out);
277        assert_eq!(n, 2, "self-loop dropped, duplicate dedup");
278        // Canonical order: (0, 1), (2, 0)
279        assert_eq!(out[0], (0, 1));
280        assert_eq!(out[1], (2, 0));
281    }
282
283    #[test]
284    fn out_of_range_indices_dropped() {
285        let observed = [
286            ServiceEdge { parent: 0, child: 5 },   // child out of range (num_services=3)
287            ServiceEdge { parent: 99, child: 0 },  // parent out of range
288            ServiceEdge { parent: 1, child: 2 },
289        ];
290        let mut out = [(0_u16, 0_u16); 16];
291        let n = infer_service_graph_from_observed(&observed, 3, &mut out);
292        assert_eq!(n, 1);
293        assert_eq!(out[0], (1, 2));
294    }
295
296    #[test]
297    fn scc_linear_chain_yields_n_components() {
298        // Linear chain s0 → s1 → s2 → s3 has 4 SCCs (one per node).
299        let edges = [(0_u16, 1_u16), (1, 2), (2, 3)];
300        let mut scc_id = [0_u16; 4];
301        let n = tarjan_scc(&edges, 4, &mut scc_id);
302        assert_eq!(n, 4);
303    }
304
305    #[test]
306    fn scc_cycle_collapses_to_one() {
307        // 3-cycle: s0 → s1 → s2 → s0 has 1 SCC.
308        let edges = [(0_u16, 1_u16), (1, 2), (2, 0)];
309        let mut scc_id = [0_u16; 3];
310        let n = tarjan_scc(&edges, 3, &mut scc_id);
311        assert_eq!(n, 1);
312        // All three nodes belong to the same SCC.
313        assert_eq!(scc_id[0], scc_id[1]);
314        assert_eq!(scc_id[1], scc_id[2]);
315    }
316
317    #[test]
318    fn scc_mixed_topology() {
319        // 4 nodes: s0 → s1, s1 → s2 → s1 (cycle), s3 standalone
320        // SCCs: {s0}, {s1, s2}, {s3} = 3 SCCs.
321        let edges = [(0_u16, 1_u16), (1, 2), (2, 1)];
322        let mut scc_id = [0_u16; 4];
323        let n = tarjan_scc(&edges, 4, &mut scc_id);
324        assert_eq!(n, 3);
325    }
326}