dsfb_debug/graph_inference.rs
1//! DSFB-Debug: service-call graph auto-inference (no_std).
2//!
3//! # Why auto-inference matters
4//!
5//! `causality::attribute_root_causes` (paper §10) requires a service
6//! dependency graph as input. Earlier versions made the graph an
7//! operator-supplied prerequisite: a deployment couldn't run
8//! root-cause attribution until a human curated the
9//! parent→child service map. This module removes that prerequisite
10//! by recovering the graph directly from observed span parent-child
11//! links in the trace stream.
12//!
13//! # Algorithm — Tarjan's SCC
14//!
15//! Tarjan's strongly-connected-components algorithm
16//! (Tarjan 1972, "Depth-first search and linear graph algorithms",
17//! SIAM J. Comput. 1(2):146-160) handles cyclic dependency
18//! structures (e.g.\ mutual-recursion service pairs). Each SCC
19//! contracts to a single supernode; the resulting DAG is the
20//! service dependency graph that
21//! `causality::attribute_root_causes` consumes.
22//!
23//! # Determinism (Theorem 9 preserved)
24//!
25//! The entire graph-walk is a pure function of the input edges.
26//! Tarjan's algorithm is deterministic given a fixed input
27//! ordering; the caller's responsibility is to pass `ServiceEdge`
28//! slices in canonical order (lexicographic sort on
29//! `(parent_service_idx, child_service_idx)` is sufficient).
30//!
31//! # Input shape
32//!
33//! `ServiceEdge { parent_service_idx, child_service_idx }` —
34//! indices into a caller-managed `signal_names` table. The minimal
35//! Jaeger / OTLP span shape suffices: `service_name`, `span_id`,
36//! `parent_span_id` map to ServiceEdges via a one-pass scan over
37//! the span list.
38
39#![allow(clippy::too_many_arguments)]
40
41/// One observed parent->child invocation between services.
42/// `parent_service_idx` and `child_service_idx` are indices into a
43/// caller-managed `signal_names` table.
44#[derive(Copy, Clone, Debug, PartialEq, Eq)]
45pub struct ServiceEdge {
46 pub parent: u16,
47 pub child: u16,
48}
49
50/// Maximum SCC graph size handled. Larger graphs return an empty
51/// result; in practice 256 services covers most microservice meshes.
52const MAX_NODES: usize = 256;
53
54/// Infer a parent->child service edge list from raw observation pairs.
55///
56/// Deduplicates input edges and collapses self-loops. Returns the edges
57/// in canonical (lowest-parent, then lowest-child) order so downstream
58/// causality attribution is deterministic.
59///
60/// `observed` may contain duplicates, self-loops, and out-of-range
61/// indices; this function tolerates all three. Edges with either
62/// endpoint >= num_services are dropped.
63pub fn infer_service_graph_from_observed(
64 observed: &[ServiceEdge],
65 num_services: usize,
66 out_edges: &mut [(u16, u16)],
67) -> usize {
68 if num_services == 0 || num_services > MAX_NODES {
69 return 0;
70 }
71 // Adjacency bitset (out_edges) and (in_edges) for dedup.
72 let mut seen = [[false; MAX_NODES]; MAX_NODES];
73
74 let mut count: usize = 0;
75 let mut i = 0;
76 while i < observed.len() {
77 let e = observed[i];
78 let p = e.parent as usize;
79 let c = e.child as usize;
80 if p < num_services && c < num_services && p != c && !seen[p][c] {
81 seen[p][c] = true;
82 // Emit in row-major (parent-major, child-minor) order — but
83 // since we're scanning input in caller order, we'll re-sort
84 // at the end for canonical output.
85 if count < out_edges.len() {
86 out_edges[count] = (e.parent, e.child);
87 count += 1;
88 }
89 }
90 i += 1;
91 }
92
93 // Canonical sort: lexicographic (parent, child).
94 canonical_sort(&mut out_edges[..count]);
95 count
96}
97
98fn canonical_sort(edges: &mut [(u16, u16)]) {
99 // Insertion sort — adequate for small N; deterministic.
100 let n = edges.len();
101 let mut i = 1;
102 while i < n {
103 let cur = edges[i];
104 let mut j = i;
105 while j > 0 && (edges[j - 1].0 > cur.0
106 || (edges[j - 1].0 == cur.0 && edges[j - 1].1 > cur.1))
107 {
108 edges[j] = edges[j - 1];
109 j -= 1;
110 }
111 edges[j] = cur;
112 i += 1;
113 }
114}
115
116/// Strongly-connected-component count via Tarjan's algorithm.
117/// Returns the SCC ID per node into `scc_id_out` (length must be >=
118/// num_services), and the total SCC count.
119///
120/// Used for cycle detection: if SCC count < num_services, the graph
121/// has at least one cycle and the operator should consider whether the
122/// causality attribution algorithm's "first slew → upstream-most"
123/// heuristic is valid for their topology (it is for DAGs; ambiguous
124/// for graphs with cycles).
125pub fn tarjan_scc(
126 edges: &[(u16, u16)],
127 num_services: usize,
128 scc_id_out: &mut [u16],
129) -> usize {
130 if num_services == 0 || num_services > MAX_NODES || scc_id_out.len() < num_services {
131 return 0;
132 }
133
134 // Adjacency list constructed from edges. Use fixed-size arrays.
135 let mut adj_start = [0_u16; MAX_NODES + 1];
136 let mut adj_targets = [0_u16; MAX_NODES * MAX_NODES];
137 // Two-pass: count outgoing per node, then fill.
138 let mut out_count = [0_u16; MAX_NODES];
139 let mut e = 0;
140 while e < edges.len() {
141 let (p, _c) = edges[e];
142 if (p as usize) < num_services {
143 out_count[p as usize] += 1;
144 }
145 e += 1;
146 }
147 let mut acc = 0_u16;
148 let mut k = 0;
149 while k <= num_services {
150 adj_start[k] = acc;
151 if k < num_services {
152 acc = acc.saturating_add(out_count[k]);
153 }
154 k += 1;
155 }
156 let mut filled = [0_u16; MAX_NODES];
157 e = 0;
158 while e < edges.len() {
159 let (p, c) = edges[e];
160 if (p as usize) < num_services && (c as usize) < num_services {
161 let pos = adj_start[p as usize] + filled[p as usize];
162 if (pos as usize) < adj_targets.len() {
163 adj_targets[pos as usize] = c;
164 filled[p as usize] += 1;
165 }
166 }
167 e += 1;
168 }
169
170 // Tarjan state.
171 let mut index = 0_u32;
172 let mut stack = [0_u16; MAX_NODES];
173 let mut on_stack = [false; MAX_NODES];
174 let mut top = 0_usize;
175 let mut node_index = [u32::MAX; MAX_NODES];
176 let mut node_lowlink = [0_u32; MAX_NODES];
177 let mut scc_count = 0_usize;
178
179 // Iterative DFS — Rust no_std friendly (no recursion limit).
180 let mut dfs_stack = [(0_u16, 0_u16); MAX_NODES];
181 let mut dfs_top = 0_usize;
182
183 for v0 in 0..num_services {
184 if node_index[v0] != u32::MAX {
185 continue;
186 }
187 // Push v0 with iterator pos 0
188 dfs_stack[dfs_top] = (v0 as u16, 0);
189 dfs_top += 1;
190 node_index[v0] = index;
191 node_lowlink[v0] = index;
192 index += 1;
193 stack[top] = v0 as u16;
194 on_stack[v0] = true;
195 top += 1;
196
197 while dfs_top > 0 {
198 let (v, mut iter_pos) = dfs_stack[dfs_top - 1];
199 let start = adj_start[v as usize];
200 let end = adj_start[v as usize + 1];
201 let mut advanced = false;
202 while (start + iter_pos) < end {
203 let w = adj_targets[(start + iter_pos) as usize];
204 iter_pos += 1;
205 if node_index[w as usize] == u32::MAX {
206 // Recurse on w
207 dfs_stack[dfs_top - 1].1 = iter_pos;
208 dfs_stack[dfs_top] = (w, 0);
209 dfs_top += 1;
210 node_index[w as usize] = index;
211 node_lowlink[w as usize] = index;
212 index += 1;
213 if top < MAX_NODES {
214 stack[top] = w;
215 on_stack[w as usize] = true;
216 top += 1;
217 }
218 advanced = true;
219 break;
220 } else if on_stack[w as usize] {
221 let lw = node_lowlink[v as usize];
222 let iw = node_index[w as usize];
223 if iw < lw {
224 node_lowlink[v as usize] = iw;
225 }
226 }
227 }
228 if advanced {
229 continue;
230 }
231 // Finished iterating v's neighbours.
232 // Pop one frame; propagate lowlink to parent.
233 dfs_top -= 1;
234 if dfs_top > 0 {
235 let parent = dfs_stack[dfs_top - 1].0;
236 let lp = node_lowlink[parent as usize];
237 let lv = node_lowlink[v as usize];
238 if lv < lp {
239 node_lowlink[parent as usize] = lv;
240 }
241 }
242 // If v is an SCC root, pop the SCC.
243 if node_lowlink[v as usize] == node_index[v as usize] {
244 loop {
245 if top == 0 {
246 break;
247 }
248 top -= 1;
249 let w = stack[top];
250 on_stack[w as usize] = false;
251 scc_id_out[w as usize] = scc_count as u16;
252 if w == v {
253 break;
254 }
255 }
256 scc_count += 1;
257 }
258 }
259 }
260 scc_count
261}
262
263#[cfg(test)]
264mod tests {
265 use super::*;
266
267 #[test]
268 fn deduplicates_and_drops_self_loops() {
269 let observed = [
270 ServiceEdge { parent: 0, child: 1 },
271 ServiceEdge { parent: 0, child: 1 }, // duplicate
272 ServiceEdge { parent: 1, child: 1 }, // self-loop
273 ServiceEdge { parent: 2, child: 0 },
274 ];
275 let mut out = [(0_u16, 0_u16); 16];
276 let n = infer_service_graph_from_observed(&observed, 3, &mut out);
277 assert_eq!(n, 2, "self-loop dropped, duplicate dedup");
278 // Canonical order: (0, 1), (2, 0)
279 assert_eq!(out[0], (0, 1));
280 assert_eq!(out[1], (2, 0));
281 }
282
283 #[test]
284 fn out_of_range_indices_dropped() {
285 let observed = [
286 ServiceEdge { parent: 0, child: 5 }, // child out of range (num_services=3)
287 ServiceEdge { parent: 99, child: 0 }, // parent out of range
288 ServiceEdge { parent: 1, child: 2 },
289 ];
290 let mut out = [(0_u16, 0_u16); 16];
291 let n = infer_service_graph_from_observed(&observed, 3, &mut out);
292 assert_eq!(n, 1);
293 assert_eq!(out[0], (1, 2));
294 }
295
296 #[test]
297 fn scc_linear_chain_yields_n_components() {
298 // Linear chain s0 → s1 → s2 → s3 has 4 SCCs (one per node).
299 let edges = [(0_u16, 1_u16), (1, 2), (2, 3)];
300 let mut scc_id = [0_u16; 4];
301 let n = tarjan_scc(&edges, 4, &mut scc_id);
302 assert_eq!(n, 4);
303 }
304
305 #[test]
306 fn scc_cycle_collapses_to_one() {
307 // 3-cycle: s0 → s1 → s2 → s0 has 1 SCC.
308 let edges = [(0_u16, 1_u16), (1, 2), (2, 0)];
309 let mut scc_id = [0_u16; 3];
310 let n = tarjan_scc(&edges, 3, &mut scc_id);
311 assert_eq!(n, 1);
312 // All three nodes belong to the same SCC.
313 assert_eq!(scc_id[0], scc_id[1]);
314 assert_eq!(scc_id[1], scc_id[2]);
315 }
316
317 #[test]
318 fn scc_mixed_topology() {
319 // 4 nodes: s0 → s1, s1 → s2 → s1 (cycle), s3 standalone
320 // SCCs: {s0}, {s1, s2}, {s3} = 3 SCCs.
321 let edges = [(0_u16, 1_u16), (1, 2), (2, 1)];
322 let mut scc_id = [0_u16; 4];
323 let n = tarjan_scc(&edges, 4, &mut scc_id);
324 assert_eq!(n, 3);
325 }
326}