agentic_eval/
web.rs

1//! Evaluating **web stacks** for agentic AI use.
2//!
//! Agents do not browse the web the way a human does; they talk to other services over
4//! whatever wire format an LLM-native call graph rewards. That workload has its
5//! own five axes — different from the `vms` axes (which score *where* code
6//! runs) and different from the language/framework axes (which score *what
7//! agents build*). This module scores the **wire protocols and service
8//! contracts** an agent actually has to speak with:
9//!
10//! - **streaming** — does the protocol carry LLM-shaped output (token streams,
11//!   latents, mid-stream tool calls) as first-class frames, or is streaming a
12//!   bolt-on on top of a document-oriented base?
13//! - **tool-discoverability** — can an agent introspect the available
14//!   capabilities (tool list, schemas, types) from the protocol itself, or
15//!   must it read prose?
16//! - **encoding-efficiency** — wire compactness for the LLM/tool-call workload
17//!   (binary framing + content-typed payloads vs. JSON-over-HTTP/1.1 baseline).
18//! - **interop** — does the agent ecosystem actually speak this? Network
19//!   effect: the protocol every SDK already knows is worth more than the
20//!   "objectively cleaner" one no one targets.
21//! - **security-primitives** — does the protocol carry auth, distributed
22//!   tracing, content integrity, and per-message identity natively, or are
23//!   they someone-else's-problem?
24//!
25//! Profiles are curated 0.0–1.0 static judgments with `evidence`, like the
26//! [`languages`](crate::languages) / [`frameworks`](crate::frameworks) /
27//! [`vms`](crate::vms) profiles — deterministic, serializable, comparable.
28//! Scores reflect each stack's design center for *agent-to-service* traffic;
29//! a great document-delivery protocol (HTTP+JSON, GraphQL) can rank low for
30//! LLM-token streaming and high on interop, and that is the point.
31//!
32//! ```
33//! use agentic_eval::web::{profile, rank_web_stacks, WebStack};
34//! let spine = profile(WebStack::Spine);
35//! assert!(spine.evidence.len() >= 3);
36//! let ranked = rank_web_stacks();
37//! assert!(ranked[0].fitness() >= ranked[ranked.len() - 1].fitness());
38//! ```
39
/// Web stacks / wire protocols with curated agentic profiles.
///
/// The type is a plain `Copy` tag: it carries no data of its own and is
/// cheap to pass by value, compare, and use as a hash-map key.
#[cfg_attr(feature = "serde", derive(serde::Serialize))]
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum WebStack {
    /// SPINE (nervosys/SPINE) — the agentic-first web stack. Native
    /// `StreamStart/Token/End` frames with text / bytes / encoded-latent /
    /// tool-call data variants; `CapabilityQuery/CapabilityAdvertisement`
    /// with JSON Schema and optional semantic embeddings; `TraceContext`
    /// (W3C traceparent) attached inline; OpenAI-compatible SSE bridge in
    /// the gateway; bearer auth secure by default as of v1.3.0; optional
    /// FIPS 140-3 build via `aws-lc-rs`.
    Spine,
    /// OpenAI API — HTTP + JSON with SSE chat.completion.chunk streaming;
    /// `tools` parameter for function calling; bearer auth + TLS.
    OpenAiApi,
    /// Anthropic API — HTTP + JSON with SSE `message_start/delta/stop`
    /// streaming; `tools` parameter for tool use; bearer auth + TLS.
    AnthropicApi,
    /// Model Context Protocol — JSON-RPC over stdio or SSE; `tools/list`,
    /// `resources/list`, `prompts/list` introspection RPCs are the surface.
    Mcp,
    /// gRPC — protobuf over HTTP/2 with first-class server / client / bidi
    /// streaming, service reflection, mTLS, interceptors for auth/tracing.
    Grpc,
    /// Plain HTTP + JSON (REST-shaped). The generic baseline an agent has
    /// to fall back to when nothing more specific exists.
    HttpJson,
    /// GraphQL — query language with introspection built into the protocol;
    /// subscriptions for streaming.
    GraphQl,
}
72
73impl WebStack {
74    /// All profiled web stacks, in fixed (deterministic) order.
75    pub fn all() -> [WebStack; 7] {
76        [
77            WebStack::Spine,
78            WebStack::OpenAiApi,
79            WebStack::AnthropicApi,
80            WebStack::Mcp,
81            WebStack::Grpc,
82            WebStack::HttpJson,
83            WebStack::GraphQl,
84        ]
85    }
86
87    /// Canonical lowercase name.
88    pub fn name(self) -> &'static str {
89        match self {
90            WebStack::Spine => "spine",
91            WebStack::OpenAiApi => "openai-api",
92            WebStack::AnthropicApi => "anthropic-api",
93            WebStack::Mcp => "mcp",
94            WebStack::Grpc => "grpc",
95            WebStack::HttpJson => "http-json",
96            WebStack::GraphQl => "graphql",
97        }
98    }
99
100    /// Parse a (case-insensitive) name; accepts common aliases
101    /// (`openai`, `claude`, `model-context-protocol`, `rest`, `graphql-spec`, …).
102    pub fn from_name(name: &str) -> Option<WebStack> {
103        match name.to_ascii_lowercase().as_str() {
104            "spine" | "nervosys-spine" => Some(WebStack::Spine),
105            "openai" | "openai-api" | "gpt-api" => Some(WebStack::OpenAiApi),
106            "anthropic" | "anthropic-api" | "claude-api" => Some(WebStack::AnthropicApi),
107            "mcp" | "model-context-protocol" => Some(WebStack::Mcp),
108            "grpc" | "g-rpc" => Some(WebStack::Grpc),
109            "http-json" | "rest" | "http+json" | "json-over-http" => Some(WebStack::HttpJson),
110            "graphql" | "graphql-spec" | "gql" => Some(WebStack::GraphQl),
111            _ => None,
112        }
113    }
114}
115
/// A curated agentic profile of a web stack / wire protocol across the five
/// agent-native axes, with evidence.
///
/// Each axis is an `f64` score; the module docs describe them as curated
/// 0.0–1.0 judgments. The struct itself does not enforce that range.
#[cfg_attr(feature = "serde", derive(serde::Serialize))]
#[derive(Debug, Clone)]
pub struct WebStackProfile {
    /// Which stack this profiles.
    pub stack: WebStack,
    /// LLM-shaped streaming as a first-class frame family
    /// (1.0 = native token / latent / mid-stream tool-call frames).
    pub streaming: f64,
    /// Tool / capability introspection at the protocol layer
    /// (1.0 = the protocol itself exposes a tools/list contract).
    pub tool_discoverability: f64,
    /// Wire compactness for the LLM/tool-call workload
    /// (1.0 = binary framing + content-typed payloads).
    pub encoding_efficiency: f64,
    /// Existing agent-ecosystem adoption
    /// (1.0 = every major SDK already speaks it).
    pub interop: f64,
    /// Auth / tracing / integrity primitives carried by the protocol itself
    /// (1.0 = bearer/mTLS + W3C tracing + content integrity + identity inline).
    pub security_primitives: f64,
    /// Why: one evidence string per notable factor. These are `'static`
    /// literals, so cloning a profile copies only the pointers.
    pub evidence: Vec<&'static str>,
}
141
142impl WebStackProfile {
143    /// Composite agentic fitness: unweighted mean of all five axes.
144    pub fn fitness(&self) -> f64 {
145        (self.streaming
146            + self.tool_discoverability
147            + self.encoding_efficiency
148            + self.interop
149            + self.security_primitives)
150            / 5.0
151    }
152}
153
154impl std::fmt::Display for WebStackProfile {
155    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
156        write!(
157            f,
158            "{}: fitness {:.2} (streaming {:.2}, tools {:.2}, encoding {:.2}, interop {:.2}, security {:.2})",
159            self.stack.name(),
160            self.fitness(),
161            self.streaming,
162            self.tool_discoverability,
163            self.encoding_efficiency,
164            self.interop,
165            self.security_primitives
166        )
167    }
168}
169
/// The curated profile for `stack` (static, documented judgments — see module docs).
///
/// Every call builds a fresh [`WebStackProfile`]: the five axis scores are
/// literals and the `evidence` vector is allocated per call from `'static`
/// strings. The `match` is exhaustive over [`WebStack`], so adding a variant
/// without a profile is a compile error.
///
/// SPINE's arm carries seven evidence strings; every other arm carries five,
/// listed in axis order (streaming, tool discoverability, encoding
/// efficiency, interop, security primitives).
pub fn profile(stack: WebStack) -> WebStackProfile {
    match stack {
        WebStack::Spine => WebStackProfile {
            stack,
            streaming: 0.98,
            tool_discoverability: 0.95,
            encoding_efficiency: 0.95,
            interop: 0.67,
            security_primitives: 0.95,
            // Order: streaming, discovery, wire encoding, transport
            // benchmarks, neural-codec benchmarks, interop, security.
            evidence: vec![
                "StreamStart / StreamToken { seq, data, usage? } / StreamEnd are first-class Message variants; StreamData carries Text | Bytes | ToolCall | Encoded(EncodedFrame), so latents and mid-stream function calls fall out of the same frame. v1.5.0 adds Message::StreamCancel (cancel one stream by id — SPINE multiplexes many streams per connection, so closing the socket like SSE is too blunt) and optional StreamToken.usage (mid-stream cumulative token budget, the multiplexed analogue of OpenAI stream_options.include_usage). As of v1.9.0 the gRPC StreamChat is backed by a pluggable real model (OpenAiChatModel streams any OpenAI-compatible endpoint) mapped LAZILY, so cancelling the stream actually stops upstream generation — verified by test",
                "two discovery surfaces: native CapabilityQuery (Exact | Prefix | Semantic { embedding, top_k } | All) → CapabilityAdvertisement with input/output JSON Schema per Capability plus optional embedding for similarity-matched lookup; and, as of v1.5.0, the spine_protocol::mcp bridge that re-exposes the same capabilities over the MCP tools/list / tools/call contract — so SPINE matches the introspection gold standard AND adds semantic capability search MCP lacks",
                "the wire body is a self-describing binary codec (8-byte SpineWireHeader + CBOR/RFC 8949); EncodedFrame.data and StreamData::Bytes are CBOR byte strings (serde_bytes), giving tensor payloads protobuf-class density. v1.7.0 made the default wire::encode plain CBOR — fast AND dense, no hot-path compression (a benchmark caught the prior zstd-per-frame design costing ~250 µs/frame; plain CBOR encodes a 1 KiB embedding in ~590 ns, ~10× faster than the JSON it replaced). Measured (spine-protocol examples/wire_sizes.rs, header included): the embedding frame is 3975→1263 B (68% smaller) on the fast default and 3975→446 B (89%) via the opt-in wire::encode_compressed; a tool call 323→284 B; every frame beats JSON, and EncodedFrame moves raw f32/f16/bf16/q8/q4 tensor bytes zero-token — a path gRPC has no native equivalent for. At parity with protobuf for the agentic data plane, and now fast as well as small",
                "measured transport benchmarks back the encoding/streaming scores (nervosys/SPINE src/spine-transport/benches: spine_vs_http2, agentic_ai_workload, llm_tok_per_sec; re-run 2026-06-08 vs the real h2 HTTP/2 crate on one persistent TCP connection). Single-stream: SPINE wins latency 1.6–2.4× and throughput 1.8–2.3×; N=64 pipelined multiplexing ~32× (≈1.3M req/s on one connection). Dominant agentic pattern (batches of 1536-dim embeddings between agents — RAG / fleet broadcast): SPINE beats HTTP/2+JSON ~6–25×. LLM token streaming: SPINE sustains hundreds of millions of tokens/sec (9–15× over HTTP/2+binary at ≥16K-token batches) where OpenAI-style JSON-SSE caps near ~10M tok/s and collapses on large batches. These are TCP-loopback medians — direction and order of magnitude reproduce run-to-run, but absolute peaks are bandwidth/scheduler-bound and machine-dependent; full methodology in BENCHMARK_REPORT.md",
                "the neural encoder-decoder protocol is itself benchmarked for agentic use (nervosys/SPINE spine-protocol benches/neural_codec_bench.rs, 2026-06-08): the real TitansLatentCodec projects text into a fixed-width Titans latent and frames it as a self-describing EncodedFrame. The resulting on-wire frame is 66–71% smaller than its JSON form (dim 256: 1241 B binary vs 3942 B JSON; dim 1024: 4314 B vs 14803 B) because the latent rides as a CBOR byte string, not a JSON float array — this latent data plane is what the 0.95 encoding score rests on, and gRPC/JSON have no native equivalent. Honest cost: it is a genuine Titans forward pass, not a memcpy, so encode is superlinear in width (~94 µs at dim 128, ~847 µs at 256, ~3.1 ms at 512, ~26 ms at 1024) — the one-time sender-side price of a semantic projection, separate from the wire-size win the score reflects",
                "still young (nervosys/SPINE), but reachable from three dominant ecosystems via deployable, standards-compliant server bridges: (1) the spine_protocol::mcp runnable MCP server (v1.6.0 — mcp::serve_stdio speaks the stdio JSON-RPC transport, so a Claude Desktop / Code mcpServers entry drives a SPINE agent today); (2) the OpenAI-compatible /v1/chat/completions + /v1/embeddings + /v1/agentic/{capabilities,codecs} gateway; and (3) the spine-grpc crate — a tonic AgentService (ListCapabilities / CallTool / streaming StreamChat, verified end-to-end over real HTTP/2) which as of v1.9.0 is production-grade: gRPC server reflection (grpcurl and any tooling introspect it with zero stubs), a runnable serve example, and a real pluggable model backend (StreamChat streams from any OpenAI-compatible endpoint — including SPINE's own gateway, so the bridges compose). A gRPC or MCP client uses 100% standard stubs — these are real, deployable bridges, not SPINE-specific shims — but each maps the agentic surface (not SPINE's native binary latent frames), and SPINE's own protocol still has ~zero native install base, so interop stays its lowest axis despite the breadth",
                "message-level security, not just channel: v1.5.0 spine_agentic::signed_frame wraps any frame in an Ed25519 detached signature over the exact wire bytes (integrity + authenticity + non-repudiation, verified before decode) — a guarantee mTLS does not give once a message leaves the socket. Plus W3C TraceContext inline on tool calls/results/stream starts; bearer auth SECURE BY DEFAULT since v1.3.0; zeroize-on-drop on every key-bearing struct; optional FIPS 140-3 build via aws-lc-rs; Chameleon moving-target protocol + Certificate Transparency policy in the box",
            ],
        },
        WebStack::OpenAiApi => WebStackProfile {
            stack,
            streaming: 0.85,
            tool_discoverability: 0.70,
            encoding_efficiency: 0.35,
            interop: 1.00,
            security_primitives: 0.55,
            evidence: vec![
                // `\\n` is an escaped backslash: the evidence text contains a
                // literal backslash followed by `n`, not a newline.
                "SSE chat.completion.chunk is the de facto wire format for LLM tokens; clients consume `data: {...}\\ndata: [DONE]` natively; delta.content + delta.tool_calls handle text and function-call deltas in the same chunk shape",
                "`tools` parameter on the request declares available functions with JSON Schema args — the agent can branch on returned tool_calls — but there is no `tools/list` introspection RPC; discovery is request-time, not protocol-time",
                "JSON over HTTP/1.1 or HTTP/2 — the verbose baseline; no first-class binary or latent path",
                "every major SDK speaks it; LangChain, LlamaIndex, Vercel AI SDK, all OSS agent frameworks default to the OpenAI shape, and most competing providers (Azure, Together, Groq, Fireworks, OpenRouter, …) expose an OpenAI-compatible endpoint as their first interface — the dominant network effect",
                "bearer token + TLS; per-message identity / tracing / integrity are someone-else's problem (use HTTP headers and your own observability stack)",
            ],
        },
        WebStack::AnthropicApi => WebStackProfile {
            stack,
            streaming: 0.85,
            tool_discoverability: 0.70,
            encoding_efficiency: 0.35,
            interop: 0.85,
            security_primitives: 0.55,
            evidence: vec![
                "SSE message_start / content_block_start / content_block_delta / message_delta / message_stop is the streaming protocol; carries text deltas and tool_use blocks; clients consume it the same way they consume OpenAI SSE",
                "`tools` parameter on the request declares tool surface with JSON Schema; tool_use / tool_result blocks complete the loop; no protocol-level tools/list introspection",
                "JSON over HTTPS — same shape as the OpenAI baseline",
                "wide SDK + framework coverage; second-largest closed-LLM ecosystem; some clients reach it through the OpenAI-compatible adapter layer rather than natively",
                "bearer token + TLS; computer use / agent skills add capability surface but auth/tracing/integrity primitives remain transport-level",
            ],
        },
        WebStack::Mcp => WebStackProfile {
            stack,
            streaming: 0.40,
            tool_discoverability: 0.95,
            encoding_efficiency: 0.40,
            interop: 0.65,
            security_primitives: 0.40,
            evidence: vec![
                "JSON-RPC notifications carry tool progress / log entries / sampling — not LLM-token-native; streaming is generic notification flow rather than chat.completion.chunk-shaped",
                "`tools/list`, `resources/list`, `prompts/list`, `tools/call`, `resources/read` are the protocol — discoverability IS the design center; this is the highest-scoring axis for any stack here",
                "JSON-RPC text envelopes over stdio or SSE — verbose like JSON-over-HTTP but with the JSON-RPC frame overhead on top",
                "Anthropic-published in late 2024, adopted by Claude Desktop, Claude Code, several IDE integrations, and growing through 2025-2026; the de facto tool-server contract for agent runtimes",
                "transport-level (stdio process boundary or HTTPS for SSE); no in-protocol auth/tracing/integrity — relies on the host process or HTTP layer",
            ],
        },
        WebStack::Grpc => WebStackProfile {
            stack,
            streaming: 0.70,
            tool_discoverability: 0.85,
            encoding_efficiency: 0.95,
            interop: 0.85,
            security_primitives: 0.80,
            evidence: vec![
                "first-class server / client / bidirectional streaming over HTTP/2 — strong general streaming, but no LLM-token shape out of the box; an agent service has to define its own chunk schema",
                "Server Reflection (proto-reflect) exposes service / method / message descriptors; introspection works but the agent must translate proto types — less direct than tools/list",
                "protobuf binary on HTTP/2 framing — the most compact mainstream wire format; zero JSON envelope overhead",
                "huge enterprise install base; standard for high-throughput internal services and machine-to-machine traffic; well represented in agent backends even if not at the LLM edge",
                "mTLS first-class, per-channel interceptors for auth and OpenTelemetry tracing, deadlines propagate on the wire — among the strongest protocol-level security surfaces in the set",
            ],
        },
        WebStack::HttpJson => WebStackProfile {
            stack,
            streaming: 0.55,
            tool_discoverability: 0.40,
            encoding_efficiency: 0.30,
            interop: 1.00,
            security_primitives: 0.45,
            evidence: vec![
                "chunked Transfer-Encoding + SSE handle streaming as a bolt-on; HTTP/2 server push and HTTP/3 datagrams help, but there is no LLM-token frame standard",
                "OpenAPI/Swagger gives schema discoverability when the service ships one — but the protocol itself does not require it; many real services have no schema endpoint",
                "verbose JSON over HTTP/1.1 (HTTP/2 helps with framing but not body size); the cost baseline every more-efficient stack measures against",
                "the universal lingua franca of services; every language, every framework, every agent stack can call a JSON HTTP endpoint",
                "TLS at transport; auth/tracing/integrity are conventions (bearer headers, W3C traceparent, content hashing) layered on, not in the protocol",
            ],
        },
        WebStack::GraphQl => WebStackProfile {
            stack,
            streaming: 0.50,
            tool_discoverability: 0.95,
            encoding_efficiency: 0.35,
            interop: 0.75,
            security_primitives: 0.45,
            evidence: vec![
                "Subscriptions handle streaming as a separate operation type over WebSocket / SSE; not LLM-token native; defer / stream directives help for partial results",
                "introspection (__schema, __type) is built into the protocol — a client can discover the entire surface without docs; on this axis GraphQL ties MCP for the highest score",
                "JSON request/response with selection sets that reduce over-fetch; binary efficiency is still text-JSON-shaped",
                "wide adoption especially on the front-end and federated-service edges; persisted-query patterns are common in agent backends",
                "TLS at transport; persisted queries help control surface area; per-message identity / tracing / integrity remain conventions",
            ],
        },
    }
}
282
283/// Profiles for all stacks, in [`WebStack::all`] order (deterministic).
284pub fn profiles() -> Vec<WebStackProfile> {
285    WebStack::all().iter().map(|&s| profile(s)).collect()
286}
287
288/// All profiles ranked best-first by [`WebStackProfile::fitness`]
289/// (stable order on ties).
290pub fn rank_web_stacks() -> Vec<WebStackProfile> {
291    let mut v = profiles();
292    v.sort_by(|a, b| {
293        b.fitness()
294            .partial_cmp(&a.fitness())
295            .unwrap_or(std::cmp::Ordering::Equal)
296    });
297    v
298}
299
/// The result of comparing two stacks: positive deltas mean `a` fits
/// agentic use better than `b`.
///
/// Holds both full profiles alongside the deltas, so a consumer can show the
/// underlying scores and evidence without looking the profiles up again.
#[cfg_attr(feature = "serde", derive(serde::Serialize))]
#[derive(Debug, Clone)]
pub struct WebStackComparison {
    /// First stack (the subject).
    pub a: WebStackProfile,
    /// Second stack (the baseline).
    pub b: WebStackProfile,
    /// `a.fitness() - b.fitness()`.
    pub fitness_delta: f64,
    /// Axis name → delta (a − b), in fixed axis order.
    pub axis_deltas: Vec<(&'static str, f64)>,
}
313
314/// Compare stack `a` against baseline `b` across all five axes.
315pub fn compare_web_stacks(a: WebStack, b: WebStack) -> WebStackComparison {
316    let pa = profile(a);
317    let pb = profile(b);
318    let axis_deltas = vec![
319        ("streaming", pa.streaming - pb.streaming),
320        (
321            "tool-discoverability",
322            pa.tool_discoverability - pb.tool_discoverability,
323        ),
324        (
325            "encoding-efficiency",
326            pa.encoding_efficiency - pb.encoding_efficiency,
327        ),
328        ("interop", pa.interop - pb.interop),
329        (
330            "security-primitives",
331            pa.security_primitives - pb.security_primitives,
332        ),
333    ];
334    WebStackComparison {
335        fitness_delta: pa.fitness() - pb.fitness(),
336        a: pa,
337        b: pb,
338        axis_deltas,
339    }
340}
341
342impl std::fmt::Display for WebStackComparison {
343    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
344        writeln!(
345            f,
346            "{} vs {}: fitness delta {:+.2}",
347            self.a.stack.name(),
348            self.b.stack.name(),
349            self.fitness_delta
350        )?;
351        for (axis, d) in &self.axis_deltas {
352            writeln!(f, "  {axis}: {d:+.2}")?;
353        }
354        Ok(())
355    }
356}
357
#[cfg(test)]
mod tests {
    //! Unit tests for the curated web-stack profiles: structural invariants
    //! (score range, evidence count, name round-trips, sorted ranking) plus
    //! directional checks that pin the intended relative judgments.

    use super::*;

    /// Every profile carries at least three evidence lines and every axis
    /// score lies in `0.0..=1.0`.
    #[test]
    fn every_stack_profiles_with_evidence() {
        for stack in WebStack::all() {
            let p = profile(stack);
            assert!(
                p.evidence.len() >= 3,
                "{} needs ≥3 evidence lines",
                stack.name()
            );
            for s in [
                p.streaming,
                p.tool_discoverability,
                p.encoding_efficiency,
                p.interop,
                p.security_primitives,
            ] {
                assert!(
                    (0.0..=1.0).contains(&s),
                    "{} score out of range",
                    stack.name()
                );
            }
        }
    }

    /// Canonical names round-trip through `from_name`; aliases resolve
    /// case-insensitively; unknown names yield `None`.
    #[test]
    fn from_name_roundtrip_and_aliases() {
        for stack in WebStack::all() {
            assert_eq!(WebStack::from_name(stack.name()), Some(stack));
        }
        assert_eq!(WebStack::from_name("OpenAI"), Some(WebStack::OpenAiApi));
        assert_eq!(WebStack::from_name("claude-api"), Some(WebStack::AnthropicApi));
        assert_eq!(WebStack::from_name("REST"), Some(WebStack::HttpJson));
        assert_eq!(
            WebStack::from_name("model-context-protocol"),
            Some(WebStack::Mcp)
        );
        assert_eq!(WebStack::from_name("gql"), Some(WebStack::GraphQl));
        assert_eq!(WebStack::from_name("nervosys-spine"), Some(WebStack::Spine));
        assert_eq!(WebStack::from_name("not-a-stack"), None);
    }

    /// Ranking is repeatable, covers every stack, and is sorted best-first.
    #[test]
    fn ranking_is_deterministic_and_sorted() {
        let r1 = rank_web_stacks();
        let r2 = rank_web_stacks();
        assert_eq!(
            r1.len(),
            WebStack::all().len(),
            "ranking must include every profiled stack"
        );
        let n1: Vec<_> = r1.iter().map(|p| p.stack.name()).collect();
        let n2: Vec<_> = r2.iter().map(|p| p.stack.name()).collect();
        assert_eq!(n1, n2);
        for w in r1.windows(2) {
            assert!(w[0].fitness() >= w[1].fitness());
        }
    }

    /// Pins the relative (not absolute) judgments between stacks, so a score
    /// edit that flips an intended ordering fails loudly.
    #[test]
    fn axis_judgments_hold_directionally() {
        let spine = profile(WebStack::Spine);
        let openai = profile(WebStack::OpenAiApi);
        let anthropic = profile(WebStack::AnthropicApi);
        let mcp = profile(WebStack::Mcp);
        let grpc = profile(WebStack::Grpc);
        let http = profile(WebStack::HttpJson);
        let graphql = profile(WebStack::GraphQl);

        // SPINE leads on the axes it was designed for.
        assert!(
            spine.streaming > anthropic.streaming,
            "native StreamStart/Token/End frames beat SSE-on-HTTP for LLM streaming"
        );
        assert!(
            spine.security_primitives > openai.security_primitives,
            "inline W3C tracing + zeroize + secure-by-default auth + Chameleon protocol beat bearer-on-TLS"
        );

        // SPINE pays for being new.
        assert!(
            openai.interop > spine.interop,
            "the OpenAI API has the dominant ecosystem network effect; SPINE is brand new"
        );
        assert!(
            http.interop >= openai.interop,
            "plain HTTP+JSON is the universal lingua franca"
        );

        // gRPC's general strengths.
        assert!(
            grpc.encoding_efficiency > openai.encoding_efficiency,
            "protobuf binary beats verbose JSON over HTTP"
        );

        // SPINE's CBOR wire format (v1.4.0) plus byte-string tensor payloads
        // (v1.5.0) moved encoding from a weakness to protobuf-class density.
        // Per the profile evidence, v1.7.0 made plain CBOR the default and
        // zstd compression opt-in, so the message below no longer credits
        // zstd for the default path.
        assert!(
            spine.encoding_efficiency > openai.encoding_efficiency,
            "binary CBOR (zstd is opt-in, not the default) crushes JSON-over-HTTP"
        );
        assert!(
            spine.encoding_efficiency > mcp.encoding_efficiency,
            "binary CBOR beats JSON-RPC text envelopes"
        );
        assert!(
            spine.encoding_efficiency >= grpc.encoding_efficiency,
            "byte-string tensor payloads bring SPINE to parity with protobuf for the agentic data plane"
        );
        assert!(
            grpc.security_primitives > http.security_primitives,
            "mTLS + interceptors beat bring-your-own bearer-headers"
        );

        // v1.5.0 lifted every axis with real capabilities (MCP bridge for
        // interop + tools, byte-string payloads for encoding, per-message
        // Ed25519 signatures for security, StreamCancel/usage for streaming),
        // so SPINE now edges gRPC on the composite — while STILL trailing badly
        // on interop, the one axis that only rewards real ecosystem adoption.
        assert!(
            spine.fitness() > grpc.fitness(),
            "v1.5.0 capability work puts SPINE first on composite agentic fitness"
        );
        assert!(
            grpc.interop > spine.interop,
            "SPINE reaches the ecosystem through MCP/OpenAI bridges, not native adoption; gRPC's install base is broader"
        );
        assert!(
            spine.security_primitives > grpc.security_primitives,
            "per-message Ed25519 signatures (non-repudiation) exceed channel-only mTLS"
        );

        // MCP / GraphQL win on introspection because the protocol IS the schema.
        assert!(
            mcp.tool_discoverability > openai.tool_discoverability,
            "tools/list is more discoverable than a request-time tools parameter"
        );
        assert!(
            graphql.tool_discoverability > http.tool_discoverability,
            "__schema introspection beats hoping the service ships OpenAPI"
        );

        // MCP is tool-shaped, not LLM-token-shaped.
        assert!(
            openai.streaming > mcp.streaming,
            "JSON-RPC notifications are not LLM-token streaming"
        );
    }

    /// The composite delta equals the mean of the per-axis deltas, and the
    /// rendered comparison names both stacks.
    #[test]
    fn comparison_deltas_are_consistent() {
        let cmp = compare_web_stacks(WebStack::Spine, WebStack::OpenAiApi);
        assert_eq!(cmp.axis_deltas.len(), 5, "one delta per axis");
        let sum: f64 = cmp.axis_deltas.iter().map(|(_, d)| d).sum();
        // Divide by the number of reported axes rather than a bare literal.
        let axis_count = cmp.axis_deltas.len() as f64;
        assert!((sum / axis_count - cmp.fitness_delta).abs() < 1e-9);
        assert!(format!("{cmp}").contains("spine vs openai-api"));
    }
}
agentic_eval/web.rs

agentic_eval/
web.rs