rpcnet 0.1.0 - Docs.rs

<!DOCTYPE HTML>
<html lang="en" class="light sidebar-visible" dir="ltr">
    <head>
        <!-- Book generated using mdBook -->
        <meta charset="UTF-8">
        <title>RpcNet Guide</title>
        <meta name="robots" content="noindex">


        <!-- Custom HTML head -->

        <meta name="description" content="">
        <meta name="viewport" content="width=device-width, initial-scale=1">
        <meta name="theme-color" content="#ffffff">

        <link rel="icon" href="favicon.svg">
        <link rel="shortcut icon" href="favicon.png">
        <link rel="stylesheet" href="css/variables.css">
        <link rel="stylesheet" href="css/general.css">
        <link rel="stylesheet" href="css/chrome.css">
        <link rel="stylesheet" href="css/print.css" media="print">

        <!-- Fonts -->
        <link rel="stylesheet" href="FontAwesome/css/font-awesome.css">
        <link rel="stylesheet" href="fonts/fonts.css">

        <!-- Highlight.js Stylesheets -->
        <link rel="stylesheet" id="highlight-css" href="highlight.css">
        <link rel="stylesheet" id="tomorrow-night-css" href="tomorrow-night.css">
        <link rel="stylesheet" id="ayu-highlight-css" href="ayu-highlight.css">

        <!-- Custom theme stylesheets -->


        <!-- Provide site root and default themes to javascript -->
        <script>
            // Site root as seen from this page (empty string here); presumably prefixed
            // to asset URLs by later scripts — consumers are not visible in this file.
            const path_to_root = "";
            // Theme names used when no theme is stored: the inline theme script below picks
            // the dark one if `prefers-color-scheme: dark` matches, otherwise the light one.
            const default_light_theme = "light";
            const default_dark_theme = "navy";
            // Location of the search index script, exposed on `window` for other scripts
            // (presumably the search code) to load.
            window.path_to_searchindex_js = "searchindex.js";
        </script>
        <!-- Start loading toc.js asap -->
        <script src="toc.js"></script>
    </head>
    <body>
    <div id="mdbook-help-container">
        <div id="mdbook-help-popup">
            <h2 class="mdbook-help-title">Keyboard shortcuts</h2>
            <div>
                <p>Press <kbd>←</kbd> or <kbd>→</kbd> to navigate between chapters</p>
                <p>Press <kbd>S</kbd> or <kbd>/</kbd> to search in the book</p>
                <p>Press <kbd>?</kbd> to show this help</p>
                <p>Press <kbd>Esc</kbd> to hide this help</p>
            </div>
        </div>
    </div>
    <div id="body-container">
        <!-- Work around some values being stored in localStorage wrapped in quotes -->
        <script>
            // Some values were stored in localStorage wrapped in double quotes;
            // rewrite them without the quotes so later scripts read the bare value.
            //
            // Each key is handled on its own: `getItem` returns null for a key that
            // was never set, and calling `.startsWith` on null throws. Previously
            // both keys shared one try block, so a missing 'mdbook-theme' silently
            // skipped the 'mdbook-sidebar' fix as well.
            for (const key of ['mdbook-theme', 'mdbook-sidebar']) {
                try {
                    const stored = localStorage.getItem(key);
                    if (stored !== null && stored.startsWith('"') && stored.endsWith('"')) {
                        localStorage.setItem(key, stored.slice(1, stored.length - 1));
                    }
                } catch (e) {
                    // Best effort: storage may be unavailable (e.g. blocked by the
                    // browser); the page still works with default settings.
                }
            }
        </script>

        <!-- Set the theme before any content is loaded, prevents flash -->
        <script>
            // Fallback theme follows the OS colour-scheme preference.
            const default_theme = window.matchMedia("(prefers-color-scheme: dark)").matches
                ? default_dark_theme
                : default_light_theme;

            // A stored theme wins; storage access may throw, in which case
            // `theme` stays undefined and the fallback is used.
            let theme;
            try {
                theme = localStorage.getItem('mdbook-theme');
            } catch (e) { }
            if (theme == null) {
                theme = default_theme;
            }

            // Swap the static 'light' class for the chosen theme and flag that
            // JavaScript is running.
            const html = document.documentElement;
            html.classList.remove('light');
            html.classList.add(theme, "js");
        </script>

        <input type="checkbox" id="sidebar-toggle-anchor" class="hidden">

        <!-- Hide / unhide sidebar before it is displayed -->
        <script>
            // Work out whether the sidebar starts 'visible' or 'hidden' and sync the
            // toggle checkbox and the root element's class to that state.
            const sidebar_toggle = document.getElementById("sidebar-toggle-anchor");
            let sidebar;
            if (document.body.clientWidth < 1080) {
                // Narrow viewports always start collapsed, whatever was stored.
                sidebar = 'hidden';
                sidebar_toggle.checked = false;
            } else {
                let stored = null;
                try {
                    stored = localStorage.getItem('mdbook-sidebar');
                } catch (e) { }
                // No stored value (or storage unavailable) means the sidebar is shown.
                sidebar = stored || 'visible';
            }

            if (sidebar !== 'visible') {
                html.classList.remove('sidebar-visible');
            } else {
                sidebar_toggle.checked = true;
            }
        </script>

        <nav id="sidebar" class="sidebar" aria-label="Table of contents">
            <!-- populated by js -->
            <mdbook-sidebar-scrollbox class="sidebar-scrollbox"></mdbook-sidebar-scrollbox>
            <noscript>
                <iframe class="sidebar-iframe-outer" src="toc.html" title="Table of contents"></iframe>
            </noscript>
            <div id="sidebar-resize-handle" class="sidebar-resize-handle">
                <div class="sidebar-resize-indicator"></div>
            </div>
        </nav>

        <div id="page-wrapper" class="page-wrapper">

            <div class="page">
                <div id="menu-bar-hover-placeholder"></div>
                <div id="menu-bar" class="menu-bar sticky">
                    <div class="left-buttons">
                        <label id="sidebar-toggle" class="icon-button" for="sidebar-toggle-anchor" title="Toggle Table of Contents" aria-label="Toggle Table of Contents" aria-controls="sidebar">
                            <i class="fa fa-bars"></i>
                        </label>
                        <button id="theme-toggle" class="icon-button" type="button" title="Change theme" aria-label="Change theme" aria-haspopup="true" aria-expanded="false" aria-controls="theme-list">
                            <i class="fa fa-paint-brush"></i>
                        </button>
                        <ul id="theme-list" class="theme-popup" aria-label="Themes" role="menu">
                            <li role="none"><button role="menuitem" class="theme" id="default_theme">Auto</button></li>
                            <li role="none"><button role="menuitem" class="theme" id="light">Light</button></li>
                            <li role="none"><button role="menuitem" class="theme" id="rust">Rust</button></li>
                            <li role="none"><button role="menuitem" class="theme" id="coal">Coal</button></li>
                            <li role="none"><button role="menuitem" class="theme" id="navy">Navy</button></li>
                            <li role="none"><button role="menuitem" class="theme" id="ayu">Ayu</button></li>
                        </ul>
                        <button id="search-toggle" class="icon-button" type="button" title="Search (`/`)" aria-label="Toggle Searchbar" aria-expanded="false" aria-keyshortcuts="/ s" aria-controls="searchbar">
                            <i class="fa fa-search"></i>
                        </button>
                    </div>

                    <h1 class="menu-title">RpcNet Guide</h1>

                    <div class="right-buttons">
                        <a href="print.html" title="Print this book" aria-label="Print this book">
                            <i id="print-button" class="fa fa-print"></i>
                        </a>

                    </div>
                </div>

                <div id="search-wrapper" class="hidden">
                    <form id="searchbar-outer" class="searchbar-outer">
                        <div class="search-wrapper">
                            <input type="search" id="searchbar" name="searchbar" placeholder="Search this book ..." aria-label="Search this book" aria-controls="searchresults-outer" aria-describedby="searchresults-header">
                            <div class="spinner-wrapper">
                                <i class="fa fa-spinner fa-spin"></i>
                            </div>
                        </div>
                    </form>
                    <div id="searchresults-outer" class="searchresults-outer hidden">
                        <div id="searchresults-header" class="searchresults-header"></div>
                        <ul id="searchresults">
                        </ul>
                    </div>
                </div>

                <!-- Apply ARIA attributes after the sidebar and the sidebar toggle button are added to the DOM -->
                <script>
                    {
                        // Mirror the sidebar state (decided by the earlier inline script)
                        // into ARIA attributes, and set the tab order of any sidebar links
                        // currently in the DOM.
                        const expanded = sidebar === 'visible';
                        const toggleButton = document.getElementById('sidebar-toggle');
                        const sidebarNav = document.getElementById('sidebar');

                        toggleButton.setAttribute('aria-expanded', expanded);
                        sidebarNav.setAttribute('aria-hidden', !expanded);

                        const linkTabIndex = expanded ? 0 : -1;
                        for (const link of document.querySelectorAll('#sidebar a')) {
                            link.setAttribute('tabIndex', linkTabIndex);
                        }
                    }
                </script>

                <div id="content" class="content">
                    <main>
                        <h1 id="introduction"><a class="header" href="#introduction">Introduction</a></h1>
<blockquote>
<p><strong>Version</strong>: 0.1.0 | <strong>Features</strong>: Cluster Management, Streaming, Code Generation</p>
</blockquote>
<p>RpcNet is a high-performance QUIC-based RPC library built on <code>s2n-quic</code>. The library provides
high-level server and client primitives, TLS configuration helpers, rich support for
unary and streaming request flows, and complete distributed cluster management. This book
centralizes the user-facing materials so you can learn RpcNet in one place.</p>
<h2 id="key-capabilities"><a class="header" href="#key-capabilities">Key Capabilities</a></h2>
<h3 id="core-rpc"><a class="header" href="#core-rpc">Core RPC</a></h3>
<ul>
<li>TLS-first configuration for both client and server components</li>
<li>Simple registration of request handlers with async closures</li>
<li>Bidirectional, client-streaming, and server-streaming support</li>
<li>Structured error reporting through <code>RpcError</code></li>
<li>Test-friendly abstractions that allow mocking QUIC streams</li>
</ul>
<h3 id="distributed-systems-v010"><a class="header" href="#distributed-systems-v010">Distributed Systems (v0.1.0+)</a></h3>
<ul>
<li><strong>Cluster Management</strong>: Built-in gossip protocol (SWIM) for node discovery</li>
<li><strong>Load Balancing</strong>: Multiple strategies (Round Robin, Random, Least Connections)</li>
<li><strong>Health Checking</strong>: Phi Accrual failure detection</li>
<li><strong>Tag-Based Routing</strong>: Route requests by worker capabilities</li>
<li><strong>Auto-Failover</strong>: Zero-downtime worker replacement</li>
</ul>
<h2 id="how-to-read-this-book"><a class="header" href="#how-to-read-this-book">How To Read This Book</a></h2>
<ol>
<li><strong>Getting Started</strong> walks through installing RpcNet and creating your first service.</li>
<li><strong>Core Concepts</strong> introduces the configuration model, error types, and runtime fundamentals.</li>
<li><strong>Cluster Example</strong> demonstrates building distributed systems with automatic discovery and load balancing.</li>
<li><strong>Streaming Patterns</strong> covers bidirectional and one-way streaming.</li>
<li><strong>rpcnet-gen CLI</strong> explains the code generation tool and workflows.</li>
</ol>
<p>Throughout the chapters you will find executable snippets based on the working examples
in the repository.</p>
<div style="break-before: page; page-break-before: always;"></div><h1 id="getting-started"><a class="header" href="#getting-started">Getting Started</a></h1>
<p>This tutorial mirrors the <code>examples/basic_greeting</code> sample and shows, step by
step, how to install RpcNet, run the <code>rpcnet-gen</code> CLI, and integrate the
generated code into your own project.</p>
<h2 id="step-0-prerequisites"><a class="header" href="#step-0-prerequisites">Step 0: Prerequisites</a></h2>
<ul>
<li>Rust 1.75+ (<code>rustup show</code> to confirm)</li>
<li><code>cargo</code> on your <code>PATH</code></li>
<li>macOS or Linux (QUIC/TLS support is bundled through <code>s2n-quic</code>)</li>
</ul>
<h2 id="step-1-create-a-new-crate"><a class="header" href="#step-1-create-a-new-crate">Step 1: Create a new crate</a></h2>
<pre><code class="language-bash">cargo new hello-rpc
cd hello-rpc
</code></pre>
<h2 id="step-2-add-the-rpcnet-runtime-crate"><a class="header" href="#step-2-add-the-rpcnet-runtime-crate">Step 2: Add the RpcNet runtime crate</a></h2>
<pre><code class="language-bash">cargo add rpcnet
</code></pre>
<p>RpcNet enables the high-performance <code>perf</code> feature by default. If you need to
opt out (e.g. another allocator is already selected), edit <code>Cargo.toml</code>:</p>
<pre><code class="language-toml">[dependencies]
rpcnet = { version = "0.1", default-features = false }
</code></pre>
<p>You will also want <code>serde</code> for request/response types, just like the example:</p>
<pre><code class="language-toml">serde = { version = "1", features = ["derive"] }
</code></pre>
<h2 id="step-3-install-the-rpcnet-gen-cli"><a class="header" href="#step-3-install-the-rpcnet-gen-cli">Step 3: Install the rpcnet-gen CLI</a></h2>
<p>Starting with v0.1.0, the CLI is included by default when you install rpcnet:</p>
<pre><code class="language-bash">cargo install rpcnet  # CLI automatically included!
</code></pre>
<p>Verify the install:</p>
<pre><code class="language-bash">rpcnet-gen --help
</code></pre>
<p>You should see the full usage banner:</p>
<pre><code>Generate RPC client and server code from service definitions

Usage: rpcnet-gen [OPTIONS] --input &lt;INPUT&gt;

Options:
  -i, --input &lt;INPUT&gt;    Input .rpc file (Rust source with service trait)
  -o, --output &lt;OUTPUT&gt;  Output directory for generated code [default: src/generated]
      --server-only      Generate only server code
      --client-only      Generate only client code
      --types-only       Generate only type definitions
  -h, --help             Print help
  -V, --version          Print version
</code></pre>
<h2 id="step-4-author-a-service-definition"><a class="header" href="#step-4-author-a-service-definition">Step 4: Author a service definition</a></h2>
<p>Create <code>src/greeting.rpc.rs</code> describing your protocol. The syntax is ordinary
Rust with a <code>#[rpcnet::service]</code> attribute, so you can leverage the compiler and
IDE tooling while you design the API:</p>
<pre><pre class="playground"><code class="language-rust"><span class="boring">#![allow(unused)]
</span><span class="boring">fn main() {
</span>// src/greeting.rpc.rs
use serde::{Deserialize, Serialize};

#[derive(Serialize, Deserialize, Debug, Clone)]
pub struct GreetRequest {
    pub name: String,
}

#[derive(Serialize, Deserialize, Debug, Clone)]
pub struct GreetResponse {
    pub message: String,
}

#[derive(Serialize, Deserialize, Debug, Clone)]
pub enum GreetingError {
    EmptyName,
    InvalidInput(String),
}

#[rpcnet::service]
pub trait Greeting {
    async fn greet(&amp;self, request: GreetRequest) -&gt; Result&lt;GreetResponse, GreetingError&gt;;
}
<span class="boring">}</span></code></pre></pre>
<h2 id="step-5-generate-client-and-server-code"><a class="header" href="#step-5-generate-client-and-server-code">Step 5: Generate client and server code</a></h2>
<p>Point the CLI at the <code>.rpc.rs</code> file from Step 4 and choose an output directory. Here we mirror
<code>examples/basic_greeting</code> by writing into <code>src/generated</code>:</p>
<pre><code class="language-bash">rpcnet-gen --input src/greeting.rpc.rs --output src/generated
</code></pre>
<p>The CLI confirms what it created:</p>
<pre><code>📦 Generating code for service: Greeting
  ✅ Generated server: src/generated/greeting/server.rs
  ✅ Generated client: src/generated/greeting/client.rs
  ✅ Generated types: src/generated/greeting/types.rs

✨ Code generation complete!

📝 Add the following to your code to use the generated service:
    #[path = "generated/greeting/mod.rs"]
    mod greeting;
    use greeting::*;
</code></pre>
<p>Inspect the directory to see the modules that were created—this matches the
layout under <code>examples/basic_greeting/generated/</code>:</p>
<pre><code>src/generated/
└── greeting/
    ├── client.rs   # async client wrapper for calling the service
    ├── mod.rs      # re-exports so `use greeting::*` pulls everything in
    ├── server.rs   # server harness plus `GreetingHandler` trait
    └── types.rs    # request/response/error structs cloned from the .rpc file
</code></pre>
<p><code>client.rs</code> exposes <code>GreetingClient</code>, <code>server.rs</code> wires your implementation into
the transport via <code>GreetingServer</code>, and <code>types.rs</code> contains the shared data
structures.</p>
<h2 id="step-6-wire-the-generated-code-into-your-project"><a class="header" href="#step-6-wire-the-generated-code-into-your-project">Step 6: Wire the generated code into your project</a></h2>
<p>Reference the generated module and bring the types into scope. For example,
in <code>src/main.rs</code>:</p>
<pre><pre class="playground"><code class="language-rust"><span class="boring">#![allow(unused)]
</span><span class="boring">fn main() {
</span>#[path = "generated/greeting/mod.rs"]
mod greeting;

use greeting::client::GreetingClient;
use greeting::server::{GreetingHandler, GreetingServer};
use greeting::{GreetRequest, GreetResponse, GreetingError};
use rpcnet::RpcConfig;
<span class="boring">}</span></code></pre></pre>
<p>From here there are two pieces to wire up:</p>
<ol>
<li>
<p><strong>Server</strong> – implement the generated <code>GreetingHandler</code> trait and launch the
harness. This mirrors <code>examples/basic_greeting/server.rs</code>:</p>
<pre><pre class="playground"><code class="language-rust">struct MyGreetingService;

#[async_trait::async_trait]
impl GreetingHandler for MyGreetingService {
    async fn greet(&amp;self, request: GreetRequest) -&gt; Result&lt;GreetResponse, GreetingError&gt; {
        Ok(GreetResponse { message: format!("Hello, {}!", request.name) })
    }
}

#[tokio::main]
async fn main() -&gt; anyhow::Result&lt;()&gt; {
    let config = RpcConfig::new("certs/test_cert.pem", "127.0.0.1:8080")
        .with_key_path("certs/test_key.pem")
        .with_server_name("localhost");

    GreetingServer::new(MyGreetingService, config).serve().await?;
    Ok(())
}</code></pre></pre>
<p><code>GreetingServer::serve</code> handles QUIC I/O, wiring your implementation to the
generated protocol handlers.</p>
<p><strong>Tuning worker threads (optional).</strong> By default Tokio uses the number of
available CPU cores. To override this for RpcNet services, set
<code>RPCNET_SERVER_THREADS</code> and build your runtime manually:</p>
<pre><pre class="playground"><code class="language-rust">fn main() -&gt; anyhow::Result&lt;()&gt; {
    let worker_threads = rpcnet::runtime::server_worker_threads();

    let runtime = tokio::runtime::Builder::new_multi_thread()
        .worker_threads(worker_threads)
        .enable_all()
        .build()?;

    runtime.block_on(async {
        // existing async server logic goes here
        Ok::&lt;_, anyhow::Error&gt;(())
    })?;

    Ok(())
}</code></pre></pre>
<p>Run the binary with a custom thread count:</p>
<pre><code class="language-bash">RPCNET_SERVER_THREADS=8 cargo run
</code></pre>
<p>Adjust the command if your server lives in a different binary target (for
example <code>cargo run --bin my-server</code>).</p>
<p>If you keep using the <code>#[tokio::main]</code> macro, Tokio will also honour the
upstream <code>TOKIO_WORKER_THREADS</code> environment variable.</p>
</li>
<li>
<p><strong>Client</strong> – construct <code>GreetingClient</code> to invoke the RPC. Compare with
<code>examples/basic_greeting/client.rs</code>:</p>
<pre><pre class="playground"><code class="language-rust">#[tokio::main]
async fn main() -&gt; anyhow::Result&lt;()&gt; {
    let config = RpcConfig::new("certs/test_cert.pem", "127.0.0.1:0")
        .with_server_name("localhost");

    let server_addr = "127.0.0.1:8080".parse()?;
    let client = GreetingClient::connect(server_addr, config).await?;

    let response = client.greet(GreetRequest { name: "World".into() }).await?;
    println!("Server replied: {}", response.message);
    Ok(())
}</code></pre></pre>
</li>
</ol>
<p>The generated client takes care of serialization, TLS, and backpressure while
presenting an async function per RPC method.</p>
<h2 id="step-7-build-and-run"><a class="header" href="#step-7-build-and-run">Step 7: Build and run</a></h2>
<p>Compile and execute as usual:</p>
<pre><code class="language-bash">cargo build
cargo run
</code></pre>
<p>While you experiment, keep the reference example nearby:</p>
<pre><code class="language-bash">ls examples/basic_greeting
# client.rs  generated/  greeting.rpc.rs  server.rs
</code></pre>
<p>Comparing your project with the example is a quick way to confirm the wiring
matches what the CLI expects.</p>
<h2 id="where-to-go-next"><a class="header" href="#where-to-go-next">Where to go next</a></h2>
<ul>
<li>Read the <a href="rpcnet-gen.html">rpcnet-gen CLI guide</a> for advanced flags such as
<code>--server-only</code>, <code>--client-only</code>, and custom output paths.</li>
<li>Explore the <a href="concepts.html">Concepts</a> chapter for runtime fundamentals,
server/client wiring, and streaming patterns.</li>
</ul>
<div style="break-before: page; page-break-before: always;"></div><h1 id="concepts"><a class="header" href="#concepts">Concepts</a></h1>
<p>This chapter collects the fundamental ideas behind RpcNet: the runtime building
blocks, how servers and clients are constructed, and the streaming patterns that
sit on top of QUIC.</p>
<h2 id="runtime-building-blocks"><a class="header" href="#runtime-building-blocks">Runtime Building Blocks</a></h2>
<h3 id="configuration-rpcconfig"><a class="header" href="#configuration-rpcconfig">Configuration (<code>RpcConfig</code>)</a></h3>
<p><code>RpcConfig</code> encapsulates the TLS artifacts, socket bindings, and optional
keep-alive settings shared by clients and servers.</p>
<pre><pre class="playground"><code class="language-rust"><span class="boring">#![allow(unused)]
</span><span class="boring">fn main() {
</span>use rpcnet::RpcConfig;

let config = RpcConfig::new("certs/server.pem", "127.0.0.1:0")
    .with_key_path("certs/server-key.pem")
    .with_server_name("localhost")
    .with_keep_alive_interval(std::time::Duration::from_secs(30));
<span class="boring">}</span></code></pre></pre>
<p>Keep-alive is optional; when enabled the interval is mirrored on both ends of
the connection so heartbeats stay in sync.</p>
<h3 id="error-handling-rpcerror"><a class="header" href="#error-handling-rpcerror">Error Handling (<code>RpcError</code>)</a></h3>
<p><code>RpcError</code> differentiates between connection, stream, TLS, configuration, IO,
serialization, and timeout failures so callers can branch on the exact condition instead
of parsing strings:</p>
<pre><pre class="playground"><code class="language-rust"><span class="boring">#![allow(unused)]
</span><span class="boring">fn main() {
</span>match client.call("ping", vec![]).await {
    Ok(bytes) =&gt; println!("pong: {}", String::from_utf8_lossy(&amp;bytes)),
    Err(rpcnet::RpcError::Timeout) =&gt; eprintln!("server took too long"),
    Err(other) =&gt; eprintln!("unhandled rpc error: {other}")
}
<span class="boring">}</span></code></pre></pre>
<h3 id="serialization-strategy"><a class="header" href="#serialization-strategy">Serialization Strategy</a></h3>
<p>Requests and responses travel as <code>Vec&lt;u8&gt;</code>. Examples use <code>bincode</code> for compact
frames, but any serialization format can be layered on top.</p>
<h3 id="concurrency-model"><a class="header" href="#concurrency-model">Concurrency Model</a></h3>
<p>Each accepted QUIC connection runs inside its own Tokio task. Within that
connection, every RPC request is processed on another task so long-running
handlers never block unrelated work. Clients open a fresh bidirectional stream
per call while sharing a single connection behind an <code>Arc</code> + <code>RwLock</code>.</p>
<h2 id="server-essentials"><a class="header" href="#server-essentials">Server Essentials</a></h2>
<h3 id="creating-the-server"><a class="header" href="#creating-the-server">Creating the Server</a></h3>
<pre><pre class="playground"><code class="language-rust"><span class="boring">#![allow(unused)]
</span><span class="boring">fn main() {
</span>use rpcnet::{RpcServer, RpcConfig};

let config = RpcConfig::new("certs/server.pem", "127.0.0.1:8080")
    .with_key_path("certs/server-key.pem")
    .with_server_name("localhost");
let mut server = RpcServer::new(config);
<span class="boring">}</span></code></pre></pre>
<p>The example above binds to a fixed port. Binding to port <code>0</code> instead (for example
<code>127.0.0.1:0</code>) lets the OS allocate a free port. Once <code>bind()</code> succeeds, the
chosen address is stored on <code>server.socket_addr</code>.</p>
<h3 id="registering-unary-handlers"><a class="header" href="#registering-unary-handlers">Registering Unary Handlers</a></h3>
<p>Handlers receive raw <code>Vec&lt;u8&gt;</code> payloads and return serialized responses. The
closure executes inside a Tokio task, so async IO is allowed.</p>
<pre><pre class="playground"><code class="language-rust"><span class="boring">#![allow(unused)]
</span><span class="boring">fn main() {
</span>use rpcnet::{RpcError, RpcServer};

server.register("add", |params| async move {
    let (a, b): (i32, i32) = bincode::deserialize(&amp;params)
        .map_err(RpcError::SerializationError)?;
    let sum = a + b;
    Ok(bincode::serialize(&amp;sum)? )
}).await;
<span class="boring">}</span></code></pre></pre>
<p>Registering a method again overwrites the previous handler.</p>
<h3 id="registering-streaming-handlers"><a class="header" href="#registering-streaming-handlers">Registering Streaming Handlers</a></h3>
<p>Streaming handlers consume a stream of request payloads and produce a stream of
<code>Result&lt;Vec&lt;u8&gt;, RpcError&gt;</code> responses. Use <code>async_stream::stream!</code> or
<code>tokio_stream</code> helpers to build the return value.</p>
<pre><pre class="playground"><code class="language-rust"><span class="boring">#![allow(unused)]
</span><span class="boring">fn main() {
</span>use async_stream::stream;
use futures::StreamExt;

server.register_streaming("echo_stream", |mut reqs| async move {
    stream! {
        while let Some(payload) = reqs.next().await {
            yield Ok(payload); // echo back exactly what we received
        }
    }
}).await;
<span class="boring">}</span></code></pre></pre>
<h3 id="binding-and-starting"><a class="header" href="#binding-and-starting">Binding and Starting</a></h3>
<p>Binding consumes the TLS material supplied in <code>RpcConfig</code> and returns an
<code>s2n_quic::Server</code> that feeds into <code>start</code>:</p>
<pre><pre class="playground"><code class="language-rust"><span class="boring">#![allow(unused)]
</span><span class="boring">fn main() {
</span>let quic_server = server.bind()?;
println!("listening on {}", server.socket_addr.unwrap());
server.start(quic_server).await?;
<span class="boring">}</span></code></pre></pre>
<p><code>start</code> runs until the QUIC provider stops delivering connections (typically
when your process shuts down). Every accepted connection and stream is served
concurrently.</p>
<h3 id="graceful-shutdown"><a class="header" href="#graceful-shutdown">Graceful Shutdown</a></h3>
<p>Wrap the <code>start</code> future inside a <code>tokio::select!</code> with your shutdown signal.
When <code>accept()</code> yields <code>None</code> the loop exits and the server terminates cleanly.</p>
<h2 id="client-essentials"><a class="header" href="#client-essentials">Client Essentials</a></h2>
<h3 id="connecting"><a class="header" href="#connecting">Connecting</a></h3>
<pre><pre class="playground"><code class="language-rust"><span class="boring">#![allow(unused)]
</span><span class="boring">fn main() {
</span>use rpcnet::{RpcClient, RpcConfig};
use std::net::SocketAddr;

let config = RpcConfig::new("certs/ca.pem", "127.0.0.1:0")
    .with_server_name("localhost");
let server_addr: SocketAddr = "127.0.0.1:8080".parse().unwrap();
let client = RpcClient::connect(server_addr, config).await?;
<span class="boring">}</span></code></pre></pre>
<p>Client configuration mirrors the server TLS settings, including optional
keep-alive.</p>
<h3 id="unary-calls"><a class="header" href="#unary-calls">Unary Calls</a></h3>
<pre><pre class="playground"><code class="language-rust"><span class="boring">#![allow(unused)]
</span><span class="boring">fn main() {
</span>let payload = bincode::serialize(&amp;(21, 21))?;
let response = client.call("add", payload).await?;
let result: i32 = bincode::deserialize(&amp;response)?;
assert_eq!(result, 42);
<span class="boring">}</span></code></pre></pre>
<p>Errors surface as <code>RpcError</code> values. Timeouts honour the <code>DEFAULT_TIMEOUT</code>
constant (30 seconds normally, 2 seconds under <code>cfg(test)</code>).</p>
<h3 id="concurrent-calls"><a class="header" href="#concurrent-calls">Concurrent Calls</a></h3>
<p>Clone the client (internally <code>Arc</code>) and issue calls in parallel. Each call opens
a new bidirectional stream on the shared connection.</p>
<pre><pre class="playground"><code class="language-rust"><span class="boring">#![allow(unused)]
</span><span class="boring">fn main() {
</span>use std::sync::Arc;
use tokio::join;

let client = Arc::new(client);
let (a, b) = join!(
    client.clone().call("first", vec![]),
    client.clone().call("second", vec![])
);
<span class="boring">}</span></code></pre></pre>
<h3 id="inspecting-request-ids"><a class="header" href="#inspecting-request-ids">Inspecting Request IDs</a></h3>
<p><code>RpcClient</code> maintains an atomic <code>next_id</code>. Incrementing it per call keeps
request/response pairs aligned. You rarely need to touch this directly, but it
aids traffic debugging.</p>
<h2 id="streaming-patterns"><a class="header" href="#streaming-patterns">Streaming Patterns</a></h2>
<p>RpcNet exposes three streaming helpers built on top of QUIC bidirectional
streams. Each frame consists of a length prefix followed by the payload bytes.</p>
<h3 id="bidirectional-call_streaming"><a class="header" href="#bidirectional-call_streaming">Bidirectional (<code>call_streaming</code>)</a></h3>
<pre><pre class="playground"><code class="language-rust"><span class="boring">#![allow(unused)]
</span><span class="boring">fn main() {
</span>use futures::stream;
use futures::StreamExt;

let requests = stream::iter(vec![
    b"hello".to_vec(),
    b"world".to_vec(),
]);

let responses = client.call_streaming("chat", requests).await?;
let mut responses = Box::pin(responses);
while let Some(frame) = responses.next().await {
    println!("response: {:?}", frame?);
}
<span class="boring">}</span></code></pre></pre>
<p>The client sends the method name first, then each payload, finishing with a
zero-length frame to signal completion. Sending continues even as responses arrive;
upload and download directions are independent.</p>
<h3 id="server-streaming-call_server_streaming"><a class="header" href="#server-streaming-call_server_streaming">Server Streaming (<code>call_server_streaming</code>)</a></h3>
<p>Server streaming wraps <code>call_streaming</code> and sends a single request frame before
yielding the response stream:</p>
<pre><pre class="playground"><code class="language-rust"><span class="boring">#![allow(unused)]
</span><span class="boring">fn main() {
</span>use futures::StreamExt;

let stream = client.call_server_streaming("list_items", Vec::new()).await?;
let mut stream = Box::pin(stream);
while let Some(frame) = stream.next().await {
    println!("item: {:?}", frame?);
}
<span class="boring">}</span></code></pre></pre>
<h3 id="client-streaming-call_client_streaming"><a class="header" href="#client-streaming-call_client_streaming">Client Streaming (<code>call_client_streaming</code>)</a></h3>
<p>Client streaming uploads many payloads and waits for an aggregated result.</p>
<pre><pre class="playground"><code class="language-rust"><span class="boring">#![allow(unused)]
</span><span class="boring">fn main() {
</span>use futures::stream;

let uploads = stream::iter(vec![b"chunk-a".to_vec(), b"chunk-b".to_vec()]);
let digest = client.call_client_streaming("upload", uploads).await?;
println!("digest bytes: {digest:?}");
<span class="boring">}</span></code></pre></pre>
<h3 id="implementing-streaming-handlers"><a class="header" href="#implementing-streaming-handlers">Implementing Streaming Handlers</a></h3>
<p>On the server, build a response stream with <code>async_stream::stream!</code> or
<code>tokio_stream</code> helpers. Yielding <code>Err</code> from the response stream maps to a
generic error frame; encode richer error payloads yourself when necessary.</p>
<pre><pre class="playground"><code class="language-rust"><span class="boring">#![allow(unused)]
</span><span class="boring">fn main() {
</span>use async_stream::stream;
use futures::StreamExt;

server.register_streaming("uppercase", |mut reqs| async move {
    stream! {
        while let Some(bytes) = reqs.next().await {
            let mut owned = bytes.clone();
            owned.make_ascii_uppercase();
            yield Ok(owned);
        }
    }
}).await;
<span class="boring">}</span></code></pre></pre>
<h2 id="cluster-management-v010"><a class="header" href="#cluster-management-v010">Cluster Management (v0.1.0+)</a></h2>
<p>RpcNet provides built-in distributed systems support for building scalable clusters with automatic discovery and failover.</p>
<h3 id="architecture-components"><a class="header" href="#architecture-components">Architecture Components</a></h3>
<h4 id="noderegistry"><a class="header" href="#noderegistry">NodeRegistry</a></h4>
<p>Tracks all nodes in the cluster with their metadata (address, tags, status). Filters nodes by tags for heterogeneous worker pools (e.g., GPU workers, CPU workers).</p>
<pre><pre class="playground"><code class="language-rust"><span class="boring">#![allow(unused)]
</span><span class="boring">fn main() {
</span>use rpcnet::cluster::NodeRegistry;

let registry = NodeRegistry::new(cluster);
let gpu_workers = registry.nodes_with_tag("gpu").await;
<span class="boring">}</span></code></pre></pre>
<h4 id="workerregistry"><a class="header" href="#workerregistry">WorkerRegistry</a></h4>
<p>Automatically discovers workers via gossip and provides load-balanced worker selection.</p>
<pre><pre class="playground"><code class="language-rust"><span class="boring">#![allow(unused)]
</span><span class="boring">fn main() {
</span>use rpcnet::cluster::{WorkerRegistry, LoadBalancingStrategy};

let registry = WorkerRegistry::new(
    cluster,
    LoadBalancingStrategy::LeastConnections
);
registry.start().await;

let worker = registry.select_worker(Some("role=worker")).await?;
<span class="boring">}</span></code></pre></pre>
<h4 id="load-balancing-strategies"><a class="header" href="#load-balancing-strategies">Load Balancing Strategies</a></h4>
<ul>
<li><strong>Round Robin</strong>: Even distribution across workers</li>
<li><strong>Random</strong>: Random selection for stateless workloads</li>
<li><strong>Least Connections</strong>: Routes to least-loaded worker (recommended)</li>
</ul>
<h4 id="health-checking"><a class="header" href="#health-checking">Health Checking</a></h4>
<p>The Phi Accrual failure detector provides accurate, adaptive health monitoring:</p>
<pre><pre class="playground"><code class="language-rust"><span class="boring">#![allow(unused)]
</span><span class="boring">fn main() {
</span>use rpcnet::cluster::HealthChecker;

let health = HealthChecker::new(cluster, config);
health.start().await;

// Automatically marks nodes as failed/recovered
<span class="boring">}</span></code></pre></pre>
<h3 id="gossip-protocol"><a class="header" href="#gossip-protocol">Gossip Protocol</a></h3>
<p>RpcNet uses SWIM (Scalable Weakly-consistent Infection-style Process Group Membership Protocol) for:</p>
<ul>
<li>Automatic node discovery</li>
<li>Failure detection propagation</li>
<li>Cluster state synchronization</li>
<li>Network partition detection</li>
</ul>
<h3 id="clusterclient"><a class="header" href="#clusterclient">ClusterClient</a></h3>
<p>High-level client that combines worker discovery and load balancing:</p>
<pre><pre class="playground"><code class="language-rust"><span class="boring">#![allow(unused)]
</span><span class="boring">fn main() {
</span>use rpcnet::cluster::{ClusterClient, WorkerRegistry, LoadBalancingStrategy};

let registry = Arc::new(WorkerRegistry::new(
    cluster,
    LoadBalancingStrategy::LeastConnections
));
registry.start().await;

let client = Arc::new(ClusterClient::new(registry, config));

// Call any worker in the pool
let result = client.call_worker("compute", data, Some("role=worker")).await?;
<span class="boring">}</span></code></pre></pre>
<h3 id="complete-example"><a class="header" href="#complete-example">Complete Example</a></h3>
<p>See the <a href="cluster-example.html">Cluster Example</a> chapter for a complete walkthrough of building a distributed worker pool with automatic discovery, load balancing, and failover.</p>
<div style="break-before: page; page-break-before: always;"></div><h1 id="rpcnet-gen-cli"><a class="header" href="#rpcnet-gen-cli">rpcnet-gen CLI</a></h1>
<p>The <code>rpcnet-gen</code> binary turns a Rust service definition (<code>*.rpc.rs</code>) into the
client, server, and type modules consumed by your application. This chapter
covers installation, day-to-day usage, and automation patterns.</p>
<h2 id="installing"><a class="header" href="#installing">Installing</a></h2>
<p>Starting with v0.1.0, the CLI is included by default with rpcnet. Install it once and reuse it across workspaces:</p>
<pre><code class="language-bash">cargo install rpcnet
</code></pre>
<p>The CLI is always available - no feature flags needed!</p>
<p>Add <code>--locked</code> in CI to guarantee reproducible dependency resolution.</p>
<h2 id="input-files-at-a-glance"><a class="header" href="#input-files-at-a-glance">Input Files at a Glance</a></h2>
<p>Service definitions are ordinary Rust modules annotated with <code>#[rpcnet::service]</code>.
For example:</p>
<pre><pre class="playground"><code class="language-rust"><span class="boring">#![allow(unused)]
</span><span class="boring">fn main() {
</span>// src/greeting.rpc.rs
use serde::{Deserialize, Serialize};

#[derive(Serialize, Deserialize, Debug, Clone)]
pub struct GreetRequest {
    pub name: String,
}

#[derive(Serialize, Deserialize, Debug, Clone)]
pub struct GreetResponse {
    pub message: String,
}

#[rpcnet::service]
pub trait Greeting {
    async fn greet(&amp;self, request: GreetRequest) -&gt; Result&lt;GreetResponse, GreetingError&gt;;
}
<span class="boring">}</span></code></pre></pre>
<p>Every request/response/error type must be <code>Serialize</code>/<code>Deserialize</code>, and all
trait methods must be <code>async fn</code> returning <code>Result&lt;T, E&gt;</code>.</p>
<h2 id="basic-invocation"><a class="header" href="#basic-invocation">Basic Invocation</a></h2>
<p>Run the generator whenever you change a service trait:</p>
<pre><code class="language-bash">rpcnet-gen --input src/greeting.rpc.rs --output src/generated
</code></pre>
<p>A successful run prints the generated paths and writes the following structure:</p>
<pre><code>src/generated/
└── greeting/
    ├── client.rs   # GreetingClient with typed async methods
    ├── mod.rs      # Module exports and re-exports
    ├── server.rs   # GreetingServer + GreetingHandler trait
    └── types.rs    # Request/response/error definitions
</code></pre>
<p>Import the module once and re-export whatever you need:</p>
<pre><pre class="playground"><code class="language-rust"><span class="boring">#![allow(unused)]
</span><span class="boring">fn main() {
</span>#[path = "generated/greeting/mod.rs"]
mod greeting;

use greeting::{client::GreetingClient, server::{GreetingHandler, GreetingServer}};
<span class="boring">}</span></code></pre></pre>
<h2 id="command-line-options"><a class="header" href="#command-line-options">Command-Line Options</a></h2>
<p><code>rpcnet-gen --help</code> surfaces all switches:</p>
<pre><code>Generate RPC client and server code from service definitions

Usage: rpcnet-gen [OPTIONS] --input &lt;INPUT&gt;

Options:
  -i, --input &lt;INPUT&gt;    Input .rpc file (Rust source with service trait)
  -o, --output &lt;OUTPUT&gt;  Output directory for generated code [default: src/generated]
      --server-only      Generate only server code
      --client-only      Generate only client code
      --types-only       Generate only type definitions
  -h, --help             Print help
  -V, --version          Print version
</code></pre>
<p>Key behaviours:</p>
<ul>
<li>Omit <code>--output</code> to use <code>src/generated</code>. The generator creates a lowercase
subdirectory named after the service (<code>Greeting</code> → <code>greeting/</code>).</li>
<li>Combine <code>--server-only</code>, <code>--client-only</code>, and <code>--types-only</code> to tailor the
outputs. The implicit <code>mod.rs</code> only re-exports files that were produced.</li>
<li>Passing more than one of these flags (e.g. <code>--server-only --client-only</code>) produces
only the files you asked for; <code>types.rs</code> is skipped when either flag is
present.</li>
</ul>
<h2 id="regenerating-automatically"><a class="header" href="#regenerating-automatically">Regenerating Automatically</a></h2>
<h3 id="manual-rebuilds"><a class="header" href="#manual-rebuilds">Manual rebuilds</a></h3>
<p>Run the command by hand after touching a <code>.rpc.rs</code> file. Consider wiring a
<code>cargo alias</code> or a shell script so teammates can regenerate with a single
command.</p>
<h3 id="with-cargo-watch"><a class="header" href="#with-cargo-watch">With <code>cargo watch</code></a></h3>
<p>Install <code>cargo-watch</code> and keep generated code up to date during development:</p>
<pre><code class="language-bash">cargo install cargo-watch
cargo watch -w src/greeting.rpc.rs -x "run --bin rpcnet-gen -- --input src/greeting.rpc.rs --output src/generated"
</code></pre>
<h3 id="through-buildrs"><a class="header" href="#through-buildrs">Through <code>build.rs</code></a></h3>
<p>For projects that must guarantee generated code exists before compilation,
invoke the builder API from a build script (requires the <code>codegen</code> feature in
<code>[build-dependencies]</code>):</p>
<pre><pre class="playground"><code class="language-rust">// build.rs
fn main() {
    println!("cargo:rerun-if-changed=src/greeting.rpc.rs");

    rpcnet::codegen::Builder::new()
        .input("src/greeting.rpc.rs")
        .output("src/generated")
        .build()
        .expect("Failed to generate RPC code");
}</code></pre></pre>
<p>Cargo reruns the script when the <code>.rpc.rs</code> file changes, keeping the generated
modules in sync.</p>
<h2 id="working-with-multiple-services"><a class="header" href="#working-with-multiple-services">Working With Multiple Services</a></h2>
<p>Generate several services in one go by running the CLI multiple times or by
stacking inputs in the builder:</p>
<pre><pre class="playground"><code class="language-rust">// build.rs
fn main() {
    for service in ["rpc/user.rpc.rs", "rpc/billing.rpc.rs", "rpc/audit.rpc.rs"] {
        println!("cargo:rerun-if-changed={service}");
    }

    rpcnet::codegen::Builder::new()
        .input("rpc/user.rpc.rs")
        .input("rpc/billing.rpc.rs")
        .input("rpc/audit.rpc.rs")
        .output("src/generated")
        .build()
        .expect("Failed to generate RPC code");
}</code></pre></pre>
<p>Each input produces a sibling directory under <code>src/generated/</code> (<code>user/</code>,
<code>billing/</code>, <code>audit/</code>).</p>
<h2 id="version-control-strategy"><a class="header" href="#version-control-strategy">Version-Control Strategy</a></h2>
<p>Generated code is ordinary Rust and can be committed. Most teams either:</p>
<ol>
<li>Commit the <code>src/generated/**</code> tree so downstream crates build without the
generator, or</li>
<li>Ignore the directory and require the CLI (or <code>build.rs</code>) to run during CI.</li>
</ol>
<p>Pick a single approach and document it for contributors.</p>
<h2 id="troubleshooting"><a class="header" href="#troubleshooting">Troubleshooting</a></h2>
<ul>
<li><strong>Missing input file</strong> – the CLI exits with <code>Error: Input file '…' does not exist</code>. Double-check the path and ensure the file is tracked in git so
collaborators receive it.</li>
<li><strong>Invalid trait</strong> – methods must be <code>async fn</code> and return <code>Result</code>. The parser
reports an error pointing at the offending signature.</li>
<li><strong>Serialization failures at runtime</strong> – make sure your request/response/error
types derive <code>Serialize</code> and <code>Deserialize</code> and keep both client and server on
the same crate version so layouts match.</li>
</ul>
<p>With these workflows in place you can treat <code>rpcnet-gen</code> like any other build
step: edit the <code>.rpc.rs</code> trait, regenerate, and keep building.</p>
<div style="break-before: page; page-break-before: always;"></div><h1 id="cluster-example"><a class="header" href="#cluster-example">Cluster Example</a></h1>
<p>This chapter demonstrates building a distributed RPC cluster with automatic worker discovery, load balancing, and failure detection using RpcNet's built-in cluster features.</p>
<h2 id="architecture-overview"><a class="header" href="#architecture-overview">Architecture Overview</a></h2>
<p>The cluster example showcases three main components working together:</p>
<pre><code>                    ┌──────────────────────────┐
                    │      Director            │
                    │  (Coordinator Node)      │
                    │                          │
                    │  - WorkerRegistry        │
                    │  - ClusterClient         │
                    │  - Load Balancing        │
                    └────────┬─────────────────┘
                             │
                    Gossip Protocol (SWIM)
                             │
            ┌────────────────┼────────────────┐
            │                                 │
    ┌───────▼─────────┐              ┌────────▼────────┐
    │   Worker A      │              │   Worker B      │
    │                 │              │                 │
    │  - Auto-join    │              │  - Auto-join    │
    │  - Tag: worker  │              │  - Tag: worker  │
    │  - Process tasks│              │  - Process tasks│
    └─────────────────┘              └─────────────────┘
</code></pre>
<h3 id="components"><a class="header" href="#components">Components</a></h3>
<p><strong>1. Director</strong> - Coordinator node that:</p>
<ul>
<li>Uses <code>WorkerRegistry</code> for automatic worker discovery</li>
<li>Uses <code>ClusterClient</code> for load-balanced request routing</li>
<li>Employs <code>LeastConnections</code> strategy by default</li>
<li>Monitors worker pool status</li>
<li>Routes client requests to healthy workers</li>
</ul>
<p><strong>2. Workers</strong> - Processing nodes that:</p>
<ul>
<li>Join cluster automatically via gossip protocol</li>
<li>Tag themselves with <code>role=worker</code> for discovery</li>
<li>Process compute tasks from clients</li>
<li>Monitor cluster events (node joined/left/failed)</li>
<li>Support simulated failures for testing</li>
</ul>
<p><strong>3. Client</strong> - Application that:</p>
<ul>
<li>Connects to director</li>
<li>Gets worker assignment</li>
<li>Establishes direct connection to worker</li>
<li>Handles failover automatically</li>
</ul>
<h2 id="why-use-built-in-cluster-features"><a class="header" href="#why-use-built-in-cluster-features">Why Use Built-in Cluster Features?</a></h2>
<p>Compared to manual worker management patterns:</p>
<p><strong>Manual Approach</strong> ❌:</p>
<ul>
<li>Custom <code>HashMap&lt;Uuid, WorkerInfo&gt;</code> for tracking</li>
<li>Manual round-robin selection logic</li>
<li>Explicit RPC calls for worker registration</li>
<li>Custom ping-based health checks</li>
<li>~200 lines of boilerplate code</li>
</ul>
<p><strong>Built-in Cluster</strong> ✅:</p>
<ul>
<li>Built-in <code>WorkerRegistry</code> + <code>ClusterClient</code></li>
<li>Multiple load balancing strategies (Round Robin, Random, Least Connections)</li>
<li>Automatic discovery via SWIM gossip protocol</li>
<li>Phi Accrual failure detection (accurate, adaptive)</li>
<li>~50 lines to set up</li>
<li><strong>75% code reduction!</strong></li>
</ul>
<h2 id="running-the-example"><a class="header" href="#running-the-example">Running the Example</a></h2>
<h3 id="prerequisites"><a class="header" href="#prerequisites">Prerequisites</a></h3>
<p>Ensure test certificates exist:</p>
<pre><code class="language-bash">ls certs/test_cert.pem certs/test_key.pem
</code></pre>
<p>All commands should be run from the <strong>project root directory</strong>.</p>
<h3 id="basic-setup"><a class="header" href="#basic-setup">Basic Setup</a></h3>
<p>Open four terminals and run each component:</p>
<p><strong>Terminal 1 - Director:</strong></p>
<pre><code class="language-bash">DIRECTOR_ADDR=127.0.0.1:61000 \
  RUST_LOG=info \
  cargo run --manifest-path examples/cluster/Cargo.toml --bin director
</code></pre>
<p><strong>Terminal 2 - Worker A:</strong></p>
<pre><code class="language-bash">WORKER_LABEL=worker-a \
  WORKER_ADDR=127.0.0.1:62001 \
  DIRECTOR_ADDR=127.0.0.1:61000 \
  RUST_LOG=info \
  cargo run --manifest-path examples/cluster/Cargo.toml --bin worker
</code></pre>
<p><strong>Terminal 3 - Worker B:</strong></p>
<pre><code class="language-bash">WORKER_LABEL=worker-b \
  WORKER_ADDR=127.0.0.1:62002 \
  DIRECTOR_ADDR=127.0.0.1:61000 \
  RUST_LOG=info \
  cargo run --manifest-path examples/cluster/Cargo.toml --bin worker
</code></pre>
<p><strong>Terminal 4 - Client:</strong></p>
<pre><code class="language-bash">DIRECTOR_ADDR=127.0.0.1:61000 \
  RUST_LOG=info \
  cargo run --manifest-path examples/cluster/Cargo.toml --bin client
</code></pre>
<h3 id="what-youll-see"><a class="header" href="#what-youll-see">What You'll See</a></h3>
<p><strong>Director Output:</strong></p>
<pre><code>🎯 Starting Director at 127.0.0.1:61000
📁 Loading certificates from "../../certs/test_cert.pem"
✅ Director registered itself in cluster
✅ Cluster enabled - Director is now discoverable
🔄 Load balancing strategy: LeastConnections
📊 Worker pool status: 2 workers available
   - worker-a at 127.0.0.1:62001 (0 connections)
   - worker-b at 127.0.0.1:62002 (0 connections)
🚀 Director ready - listening on 127.0.0.1:61000
</code></pre>
<p><strong>Worker Output:</strong></p>
<pre><code>👷 Starting Worker 'worker-a' at 127.0.0.1:62001
🔌 Binding server to 127.0.0.1:62001...
✅ Server bound successfully
🌐 Enabling cluster, connecting to director at 127.0.0.1:61000...
✅ Cluster enabled, connected to director
🏷️  Tagging worker with role=worker and label=worker-a...
✅ Worker 'worker-a' joined cluster with role=worker
🚀 Worker 'worker-a' is running and ready to handle requests
</code></pre>
<p><strong>Client Output:</strong></p>
<pre><code>📡 Starting Client - connecting to director at 127.0.0.1:61000
✅ connected to director
🔀 director assigned worker - establishing direct connection
✅ direct connection established to worker
📤 creating request stream
🌊 stream opened successfully, starting to consume responses
📦 received token (sequence=1, text="token-1", total=1)
📦 received token (sequence=2, text="token-2", total=2)
...
</code></pre>
<h2 id="testing-failure-scenarios"><a class="header" href="#testing-failure-scenarios">Testing Failure Scenarios</a></h2>
<h3 id="simulated-worker-failures"><a class="header" href="#simulated-worker-failures">Simulated Worker Failures</a></h3>
<p>Enable periodic failures to test automatic failover:</p>
<p><strong>Worker with Failures:</strong></p>
<pre><code class="language-bash"># WORKER_FAILURE_ENABLED=true enables failure simulation
WORKER_LABEL=worker-a \
  WORKER_ADDR=127.0.0.1:62001 \
  DIRECTOR_ADDR=127.0.0.1:61000 \
  WORKER_FAILURE_ENABLED=true \
  RUST_LOG=info \
  cargo run --manifest-path examples/cluster/Cargo.toml --bin worker
</code></pre>
<p><strong>Failure Cycle</strong> (~18 seconds):</p>
<ol>
<li><strong>Run</strong>: 10 seconds of normal operation</li>
<li><strong>Warning</strong>: "⚠️  Simulating worker failure in 3 seconds..."</li>
<li><strong>Failed</strong>: 5 seconds in failed state - "💥 Worker failed!"</li>
<li><strong>Recovery</strong>: "🔄 Worker recovering..."</li>
<li><strong>Ready</strong>: "✅ Worker recovered and ready to serve!"</li>
<li>Repeat</li>
</ol>
<p><strong>Client Behavior:</strong></p>
<ul>
<li>Detects failure via error response</li>
<li>Returns to director for new worker assignment</li>
<li>Switches to healthy worker seamlessly</li>
<li>Streaming continues with minimal interruption</li>
</ul>
<h3 id="hard-kill-test"><a class="header" href="#hard-kill-test">Hard Kill Test</a></h3>
<p>Test network-level failure detection:</p>
<pre><code class="language-bash"># In a worker terminal, press Ctrl+C
</code></pre>
<p><strong>Observe:</strong></p>
<ul>
<li>Director detects failure via gossip protocol</li>
<li><code>WorkerRegistry</code> removes worker from pool</li>
<li>Client requests automatically route to remaining workers</li>
<li>Zero downtime for ongoing operations</li>
</ul>
<h3 id="worker-restart-test"><a class="header" href="#worker-restart-test">Worker Restart Test</a></h3>
<p>After killing a worker, restart it to see re-discovery:</p>
<pre><code class="language-bash">WORKER_LABEL=worker-a \
  WORKER_ADDR=127.0.0.1:62001 \
  DIRECTOR_ADDR=127.0.0.1:61000 \
  RUST_LOG=info \
  cargo run --manifest-path examples/cluster/Cargo.toml --bin worker
</code></pre>
<p><strong>Observe:</strong></p>
<ul>
<li>Worker automatically rejoins cluster</li>
<li>Gossip spreads worker availability</li>
<li>Director adds worker back to registry</li>
<li>Client requests resume to all available workers</li>
</ul>
<h2 id="how-it-works"><a class="header" href="#how-it-works">How It Works</a></h2>
<h3 id="1-automatic-discovery"><a class="header" href="#1-automatic-discovery">1. Automatic Discovery</a></h3>
<p>Workers don't manually register - they just join the cluster:</p>
<pre><pre class="playground"><code class="language-rust"><span class="boring">#![allow(unused)]
</span><span class="boring">fn main() {
</span>// Worker code (simplified)
let cluster = ClusterMembership::new(config).await?;
cluster.join(vec![director_addr]).await?;

// Tag for discovery
cluster.set_tag("role", "worker");
cluster.set_tag("label", worker_label);

// That's it! Director discovers automatically via gossip
<span class="boring">}</span></code></pre></pre>
<h3 id="2-load-balancing"><a class="header" href="#2-load-balancing">2. Load Balancing</a></h3>
<p>Director uses <code>WorkerRegistry</code> for automatic load balancing:</p>
<pre><pre class="playground"><code class="language-rust"><span class="boring">#![allow(unused)]
</span><span class="boring">fn main() {
</span>// Director code
let registry = Arc::new(WorkerRegistry::new(
    cluster,
    LoadBalancingStrategy::LeastConnections
));
registry.start().await;

// Automatically tracks workers and balances load
<span class="boring">}</span></code></pre></pre>
<h3 id="3-failure-detection"><a class="header" href="#3-failure-detection">3. Failure Detection</a></h3>
<p>The Phi Accrual algorithm provides accurate health monitoring:</p>
<ul>
<li>Adapts to network conditions</li>
<li>Distinguishes slow nodes from failed nodes</li>
<li>Fewer false positives from temporary delays</li>
<li>Automatic recovery when nodes return</li>
</ul>
<h3 id="4-tag-based-routing"><a class="header" href="#4-tag-based-routing">4. Tag-Based Routing</a></h3>
<p>Filter workers by capabilities:</p>
<pre><pre class="playground"><code class="language-rust"><span class="boring">#![allow(unused)]
</span><span class="boring">fn main() {
</span>// Get only GPU workers
let gpu_worker = registry.select_worker(Some("gpu=true")).await?;

// Get any node tagged role=worker
let any_worker = registry.select_worker(Some("role=worker")).await?;
<span class="boring">}</span></code></pre></pre>
<h2 id="key-cluster-features-demonstrated"><a class="header" href="#key-cluster-features-demonstrated">Key Cluster Features Demonstrated</a></h2>
<h3 id="-automatic-discovery"><a class="header" href="#-automatic-discovery">✅ Automatic Discovery</a></h3>
<p>No manual registration needed - gossip protocol handles everything</p>
<h3 id="-load-balancing"><a class="header" href="#-load-balancing">✅ Load Balancing</a></h3>
<p>Choose from:</p>
<ul>
<li><strong>Round Robin</strong>: Even distribution</li>
<li><strong>Random</strong>: Stateless workload distribution</li>
<li><strong>Least Connections</strong>: Balance based on current load (recommended)</li>
</ul>
<h3 id="-failure-detection"><a class="header" href="#-failure-detection">✅ Failure Detection</a></h3>
<p>The Phi Accrual algorithm provides accurate, adaptive health monitoring</p>
<h3 id="-tag-based-routing"><a class="header" href="#-tag-based-routing">✅ Tag-Based Routing</a></h3>
<p>Route by worker capabilities (GPU, CPU, zone, etc.)</p>
<h3 id="-event-monitoring"><a class="header" href="#-event-monitoring">✅ Event Monitoring</a></h3>
<p>Subscribe to cluster events:</p>
<ul>
<li><code>NodeJoined</code> - New worker available</li>
<li><code>NodeLeft</code> - Worker gracefully departed</li>
<li><code>NodeFailed</code> - Worker detected as failed</li>
</ul>
<h2 id="configuration-options"><a class="header" href="#configuration-options">Configuration Options</a></h2>
<h3 id="environment-variables"><a class="header" href="#environment-variables">Environment Variables</a></h3>
<p><strong>Director:</strong></p>
<ul>
<li><code>DIRECTOR_ADDR</code> - Bind address (default: <code>127.0.0.1:61000</code>)</li>
<li><code>RUST_LOG</code> - Log level (e.g., <code>info</code>, <code>debug</code>)</li>
</ul>
<p><strong>Worker:</strong></p>
<ul>
<li><code>WORKER_LABEL</code> - Worker identifier (default: <code>worker-1</code>)</li>
<li><code>WORKER_ADDR</code> - Bind address (default: <code>127.0.0.1:62001</code>)</li>
<li><code>DIRECTOR_ADDR</code> - Director address (default: <code>127.0.0.1:61000</code>)</li>
<li><code>WORKER_FAILURE_ENABLED</code> - Enable failure simulation (default: <code>false</code>)</li>
<li><code>RUST_LOG</code> - Log level</li>
</ul>
<p><strong>Client:</strong></p>
<ul>
<li><code>DIRECTOR_ADDR</code> - Director address (default: <code>127.0.0.1:61000</code>)</li>
<li><code>RUST_LOG</code> - Log level</li>
</ul>
<h3 id="load-balancing-strategies-1"><a class="header" href="#load-balancing-strategies-1">Load Balancing Strategies</a></h3>
<pre><pre class="playground"><code class="language-rust"><span class="boring">#![allow(unused)]
</span><span class="boring">fn main() {
</span>use rpcnet::cluster::LoadBalancingStrategy;

// Options:
LoadBalancingStrategy::RoundRobin       // Even distribution
LoadBalancingStrategy::Random           // Random selection
LoadBalancingStrategy::LeastConnections // Pick least loaded (recommended)
<span class="boring">}</span></code></pre></pre>
<h3 id="cluster-configuration"><a class="header" href="#cluster-configuration">Cluster Configuration</a></h3>
<pre><pre class="playground"><code class="language-rust"><span class="boring">#![allow(unused)]
</span><span class="boring">fn main() {
</span>use rpcnet::cluster::ClusterConfig;

let config = ClusterConfig::default()
    .with_gossip_interval(Duration::from_secs(1))
    .with_health_check_interval(Duration::from_secs(2));
<span class="boring">}</span></code></pre></pre>
<h2 id="troubleshooting-1"><a class="header" href="#troubleshooting-1">Troubleshooting</a></h2>
<p><strong>Workers not discovered:</strong></p>
<ul>
<li>Ensure director starts first (it's the seed node)</li>
<li>Check firewall allows UDP for gossip</li>
<li>Verify workers connect to correct director address</li>
</ul>
<p><strong>Requests failing:</strong></p>
<ul>
<li>Check worker has <code>role=worker</code> tag</li>
<li>Verify compute handler is registered</li>
<li>Check logs for connection errors</li>
</ul>
<p><strong>Slow failover:</strong></p>
<ul>
<li>Adjust health check interval in config</li>
<li>Tune Phi Accrual threshold</li>
<li>Check network latency</li>
</ul>
<h2 id="production-considerations"><a class="header" href="#production-considerations">Production Considerations</a></h2>
<p>For production deployments:</p>
<ol>
<li><strong>TLS Certificates</strong>: Use proper certificates, not test certs</li>
<li><strong>Monitoring</strong>: Integrate cluster events with your monitoring system</li>
<li><strong>Scaling</strong>: Add more workers dynamically as needed</li>
<li><strong>Persistence</strong>: Consider persisting cluster state if needed</li>
<li><strong>Security</strong>: Add authentication and authorization</li>
<li><strong>Network</strong>: Plan for network partitions and split-brain scenarios</li>
</ol>
<h2 id="next-steps"><a class="header" href="#next-steps">Next Steps</a></h2>
<ul>
<li>Try different load balancing strategies</li>
<li>Add more workers dynamically</li>
<li>Test network partition scenarios</li>
<li>Add custom tags for routing (zone, GPU, etc.)</li>
<li>Integrate with your application logic</li>
</ul>
<p>For full source code, see <code>examples/cluster/</code> in the repository.</p>
<div style="break-before: page; page-break-before: always;"></div><h1 id="cluster-overview"><a class="header" href="#cluster-overview">Cluster Overview</a></h1>
<p>RpcNet provides built-in support for building distributed RPC clusters with automatic service discovery, intelligent load balancing, and robust failure detection. This chapter introduces the core concepts and components of RpcNet's cluster architecture.</p>
<h2 id="what-is-a-cluster"><a class="header" href="#what-is-a-cluster">What is a Cluster?</a></h2>
<p>A <strong>cluster</strong> in RpcNet is a group of interconnected nodes that work together to provide distributed RPC services. Nodes automatically discover each other, share information about their state, and coordinate to handle client requests efficiently.</p>
<h3 id="key-benefits"><a class="header" href="#key-benefits">Key Benefits</a></h3>
<p><strong>Automatic Discovery</strong> 🔍</p>
<ul>
<li>No manual node registration required</li>
<li>Nodes join and leave seamlessly</li>
<li>Gossip protocol spreads information automatically</li>
</ul>
<p><strong>Intelligent Load Balancing</strong> ⚖️</p>
<ul>
<li>Multiple strategies (Round Robin, Random, Least Connections)</li>
<li>Tracks active connections per node</li>
<li>Prevents overload on individual nodes</li>
</ul>
<p><strong>Robust Failure Detection</strong> 💓</p>
<ul>
<li>Phi Accrual failure detection algorithm</li>
<li>Adapts to network conditions</li>
<li>Distinguishes between slow and failed nodes</li>
</ul>
<p><strong>Tag-Based Routing</strong> 🏷️</p>
<ul>
<li>Route requests by node capabilities</li>
<li>Filter by zone, hardware type, role, etc.</li>
<li>Enables heterogeneous worker pools</li>
</ul>
<h2 id="architecture-components-1"><a class="header" href="#architecture-components-1">Architecture Components</a></h2>
<p>RpcNet's cluster architecture consists of several key components that work together:</p>
<pre><code>┌─────────────────────────────────────────────────────────────┐
│                     Application Layer                       │
│  (Your RPC handlers, business logic)                        │
└────────────────────────┬────────────────────────────────────┘
                         │
┌────────────────────────▼────────────────────────────────────┐
│                    ClusterClient                            │
│  - High-level API for cluster operations                    │
│  - Load-balanced request routing                            │
│  - Efficient request routing                                │
└───────┬─────────────────────────────────────────────────────┘
        │
┌───────▼─────────┐
│ WorkerRegistry  │
│  - Tracks nodes │
│  - Load balance │
│  - Filter tags  │
└───────┬─────────┘
        │
┌───────▼─────────┐
│  NodeRegistry   │
│  - All nodes    │
│  - Health state │
│  - Metadata     │
└───────┬─────────┘
        │
┌───────▼─────────────────────────────────────────────────────┐
│              ClusterMembership (SWIM)                       │
│  - Gossip protocol for node discovery                       │
│  - Phi Accrual failure detection                            │
│  - Event notifications (NodeJoined/Left/Failed)             │
└─────────────────────────────────────────────────────────────┘
</code></pre>
<h3 id="1-clustermembership-swim"><a class="header" href="#1-clustermembership-swim">1. ClusterMembership (SWIM)</a></h3>
<p>The foundation of RpcNet's cluster is the <strong>SWIM (Scalable Weakly-consistent Infection-style Process Group Membership)</strong> protocol. This provides:</p>
<ul>
<li><strong>Gossip-based communication</strong>: Nodes periodically exchange information</li>
<li><strong>Failure detection</strong>: Phi Accrual algorithm detects node failures accurately</li>
<li><strong>Partition detection</strong>: Identifies network splits and handles them gracefully</li>
<li><strong>Event system</strong>: Notifies about node state changes</li>
</ul>
<p><strong>Key characteristics</strong>:</p>
<ul>
<li>Eventually consistent membership information</li>
<li>Scales to thousands of nodes</li>
<li>Low network overhead (UDP-based gossip)</li>
<li>Handles network partitions and node churn</li>
</ul>
<h3 id="2-noderegistry"><a class="header" href="#2-noderegistry">2. NodeRegistry</a></h3>
<p>The <strong>NodeRegistry</strong> maintains a comprehensive view of all nodes in the cluster:</p>
<pre><pre class="playground"><code class="language-rust"><span class="boring">#![allow(unused)]
</span><span class="boring">fn main() {
</span>use rpcnet::cluster::{NodeRegistry, ClusterMembership};

let registry = Arc::new(NodeRegistry::new(cluster));
registry.start().await;

// Get all nodes
let nodes = registry.nodes().await;

// Subscribe to cluster events
let mut events = registry.subscribe();
while let Some(event) = events.recv().await {
    match event {
        ClusterEvent::NodeJoined(node) =&gt; println!("Node joined: {}", node.id),
        ClusterEvent::NodeLeft(node) =&gt; println!("Node left: {}", node.id),
        ClusterEvent::NodeFailed(node) =&gt; println!("Node failed: {}", node.id),
    }
}
<span class="boring">}</span></code></pre></pre>
<p><strong>Features</strong>:</p>
<ul>
<li>Real-time node tracking</li>
<li>Metadata storage per node</li>
<li>Event subscription for state changes</li>
<li>Thread-safe access via <code>Arc</code></li>
</ul>
<h3 id="3-workerregistry"><a class="header" href="#3-workerregistry">3. WorkerRegistry</a></h3>
<p>The <strong>WorkerRegistry</strong> extends NodeRegistry to track worker nodes specifically:</p>
<pre><pre class="playground"><code class="language-rust"><span class="boring">#![allow(unused)]
</span><span class="boring">fn main() {
</span>use rpcnet::cluster::{WorkerRegistry, LoadBalancingStrategy};

let registry = Arc::new(WorkerRegistry::new(
    cluster,
    LoadBalancingStrategy::LeastConnections
));
registry.start().await;

// Select a worker (with optional tag filter)
let worker = registry.select_worker(Some("role=worker")).await?;
println!("Selected worker: {} at {}", worker.label, worker.addr);
<span class="boring">}</span></code></pre></pre>
<p><strong>Features</strong>:</p>
<ul>
<li>Filters nodes by tags (e.g., <code>role=worker</code>)</li>
<li>Applies load balancing strategy</li>
<li>Tracks active connections per worker</li>
<li>Automatic removal of failed workers</li>
</ul>
<h3 id="4-clusterclient"><a class="header" href="#4-clusterclient">4. ClusterClient</a></h3>
<p>The <strong>ClusterClient</strong> provides a high-level API that combines all components:</p>
<pre><pre class="playground"><code class="language-rust"><span class="boring">#![allow(unused)]
</span><span class="boring">fn main() {
</span>use rpcnet::cluster::{ClusterClient, ClusterClientConfig};

let client = Arc::new(ClusterClient::new(registry, config));

// Call any worker matching the filter
let result = client.call_worker("compute", request, Some("role=worker")).await?;
<span class="boring">}</span></code></pre></pre>
<p><strong>Features</strong>:</p>
<ul>
<li>Automatic worker selection</li>
<li>Load-balanced request routing</li>
<li>Efficient connection management</li>
<li>Retry logic for failed requests</li>
</ul>
<h2 id="when-to-use-clusters"><a class="header" href="#when-to-use-clusters">When to Use Clusters</a></h2>
<p>RpcNet clusters are ideal for scenarios where you need:</p>
<h3 id="-good-use-cases"><a class="header" href="#-good-use-cases">✅ Good Use Cases</a></h3>
<p><strong>Distributed Workload Processing</strong></p>
<ul>
<li>Multiple workers processing tasks in parallel</li>
<li>Automatic load distribution across workers</li>
<li>Example: Video transcoding farm, data processing pipeline</li>
</ul>
<p><strong>High Availability Services</strong></p>
<ul>
<li>Services that must tolerate node failures</li>
<li>Automatic failover to healthy nodes</li>
<li>Example: API gateway, microservices mesh</li>
</ul>
<p><strong>Dynamic Scaling</strong></p>
<ul>
<li>Add/remove nodes based on load</li>
<li>Automatic discovery of new capacity</li>
<li>Example: Auto-scaling worker pools, elastic compute clusters</li>
</ul>
<p><strong>Heterogeneous Worker Pools</strong></p>
<ul>
<li>Different node types (GPU vs CPU, different zones)</li>
<li>Tag-based routing to appropriate nodes</li>
<li>Example: ML inference with GPU/CPU workers, multi-region deployments</li>
</ul>
<h3 id="-when-not-to-use-clusters"><a class="header" href="#-when-not-to-use-clusters">❌ When NOT to Use Clusters</a></h3>
<p><strong>Single Node Deployments</strong></p>
<ul>
<li>If you only have one server, use direct RPC instead</li>
<li>Cluster overhead isn't justified</li>
</ul>
<p><strong>Strict Consistency Requirements</strong></p>
<ul>
<li>SWIM provides eventual consistency</li>
<li>Not suitable for strong consistency needs (use consensus protocols like Raft)</li>
</ul>
<p><strong>Low-Latency Single-Hop</strong></p>
<ul>
<li>Direct RPC is faster for single client-server communication</li>
<li>Cluster adds minimal overhead, but every bit counts for ultra-low latency</li>
</ul>
<h2 id="cluster-modes"><a class="header" href="#cluster-modes">Cluster Modes</a></h2>
<p>RpcNet supports different cluster deployment patterns:</p>
<h3 id="1-coordinator-worker-pattern"><a class="header" href="#1-coordinator-worker-pattern">1. Coordinator-Worker Pattern</a></h3>
<p>One or more coordinator nodes route requests to worker nodes:</p>
<pre><code>         ┌──────────────┐
         │  Coordinator │
         │  (Director)  │
         └──────┬───────┘
                │
    ┌───────────┼───────────┐
    │           │           │
┌───▼───┐   ┌──▼────┐   ┌──▼────┐
│Worker │   │Worker │   │Worker │
└───────┘   └───────┘   └───────┘
</code></pre>
<p><strong>Use when</strong>:</p>
<ul>
<li>Clients don't need to track the worker pool</li>
<li>Centralized routing and monitoring</li>
<li>Example: Load balancer + worker pool</li>
</ul>
<h3 id="2-peer-to-peer-pattern"><a class="header" href="#2-peer-to-peer-pattern">2. Peer-to-Peer Pattern</a></h3>
<p>All nodes are equal and can route to each other:</p>
<pre><code>┌──────┐     ┌──────┐
│ Node ├─────┤ Node │
└───┬──┘     └──┬───┘
    │           │
    └─────┬─────┘
      ┌───▼───┐
      │ Node  │
      └───────┘
</code></pre>
<p><strong>Use when</strong>:</p>
<ul>
<li>No single point of coordination needed</li>
<li>Nodes serve both as clients and servers</li>
<li>Example: Distributed cache, gossip-based database</li>
</ul>
<h3 id="3-hierarchical-pattern"><a class="header" href="#3-hierarchical-pattern">3. Hierarchical Pattern</a></h3>
<p>Multiple layers with different roles:</p>
<pre><code>       ┌────────┐
       │ Master │
       └───┬────┘
           │
    ┌──────┼──────┐
┌───▼───┐     ┌───▼───┐
│Region │     │Region │
│Leader │     │Leader │
└───┬───┘     └───┬───┘
    │             │
┌───▼───┐     ┌───▼───┐
│Worker │     │Worker │
└───────┘     └───────┘
</code></pre>
<p><strong>Use when</strong>:</p>
<ul>
<li>Multi-region deployments</li>
<li>Different node tiers (leaders, workers, storage)</li>
<li>Example: Global CDN, multi-tenant systems</li>
</ul>
<h2 id="performance-characteristics"><a class="header" href="#performance-characteristics">Performance Characteristics</a></h2>
<p>RpcNet clusters maintain high performance while providing distributed coordination:</p>
<h3 id="throughput"><a class="header" href="#throughput">Throughput</a></h3>
<ul>
<li><strong>172K+ requests/second</strong> in benchmarks</li>
<li>Minimal overhead compared to direct RPC</li>
<li>Scales linearly with number of workers</li>
</ul>
<h3 id="latency"><a class="header" href="#latency">Latency</a></h3>
<ul>
<li><strong>&lt; 0.1ms</strong> additional latency for load balancing</li>
<li>Efficient connection handling reduces overhead</li>
<li>QUIC's 0-RTT mode for warm connections</li>
</ul>
<h3 id="scalability"><a class="header" href="#scalability">Scalability</a></h3>
<ul>
<li>Tested with <strong>1000+ nodes</strong> in gossip cluster</li>
<li>Constant gossip load per node; membership updates reach the whole cluster in O(log N) gossip rounds</li>
<li>Configurable gossip intervals for tuning</li>
</ul>
<h3 id="resource-usage"><a class="header" href="#resource-usage">Resource Usage</a></h3>
<ul>
<li><strong>Low memory</strong>: ~10KB per tracked node</li>
<li><strong>Low CPU</strong>: &lt; 1% for gossip maintenance</li>
<li><strong>Low network</strong>: ~1KB/s per node for gossip</li>
</ul>
<h2 id="next-steps-1"><a class="header" href="#next-steps-1">Next Steps</a></h2>
<p>Now that you understand the cluster architecture, you can:</p>
<ol>
<li><strong><a href="cluster/tutorial.html">Follow the Tutorial</a></strong> - Build your first cluster step-by-step</li>
<li><strong><a href="cluster/discovery.html">Learn About Discovery</a></strong> - Deep dive into SWIM gossip protocol</li>
<li><strong><a href="cluster/load-balancing.html">Explore Load Balancing</a></strong> - Choose the right strategy</li>
<li><strong><a href="cluster/health.html">Understand Health Checking</a></strong> - How Phi Accrual works</li>
<li><strong><a href="cluster/failures.html">Handle Failures</a></strong> - Partition detection and recovery</li>
</ol>
<p>Or jump directly to the <strong><a href="cluster/../cluster-example.html">Cluster Example</a></strong> to see a complete working system.</p>
<div style="break-before: page; page-break-before: always;"></div><h1 id="cluster-tutorial"><a class="header" href="#cluster-tutorial">Cluster Tutorial</a></h1>
<p>This hands-on tutorial guides you through building a complete distributed RPC cluster from scratch. You'll create a coordinator (director) that manages a pool of worker nodes, with automatic discovery, load balancing, and failure handling.</p>
<h2 id="what-youll-build"><a class="header" href="#what-youll-build">What You'll Build</a></h2>
<p>By the end of this tutorial, you'll have:</p>
<ul>
<li><strong>Director</strong>: Coordinator node that manages worker discovery and routes client requests</li>
<li><strong>Workers</strong>: Processing nodes that join automatically and handle compute tasks</li>
<li><strong>Client</strong>: Application that connects through the director and handles failover</li>
<li><strong>Failure Testing</strong>: Simulate worker failures and observe automatic recovery</li>
</ul>
<p><strong>Time</strong>: ~30 minutes<br />
<strong>Difficulty</strong>: Intermediate</p>
<h2 id="prerequisites-1"><a class="header" href="#prerequisites-1">Prerequisites</a></h2>
<h3 id="1-install-rpcnet"><a class="header" href="#1-install-rpcnet">1. Install RpcNet</a></h3>
<pre><code class="language-bash">cargo install rpcnet
</code></pre>
<p>This installs the <code>rpcnet-gen</code> CLI tool. The library itself is added to your project as a Cargo dependency in “Create Project Structure” below.</p>
<h3 id="2-create-test-certificates"><a class="header" href="#2-create-test-certificates">2. Create Test Certificates</a></h3>
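<p>RpcNet requires TLS certificates. For development, generate a self-signed pair as shown below. The tutorial binaries read them from <code>certs/</code> relative to the working directory, so after creating the project in the next step, move the <code>certs</code> directory into the project root (or run these commands from there).</p>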
<pre><code class="language-bash">mkdir certs
cd certs

# Generate self-signed certificate
openssl req -x509 -newkey rsa:4096 -nodes \
  -keyout test_key.pem -out test_cert.pem \
  -days 365 -subj "/CN=localhost"

cd ..
</code></pre>
<h3 id="3-create-project-structure"><a class="header" href="#3-create-project-structure">3. Create Project Structure</a></h3>
<pre><code class="language-bash">cargo new --bin cluster_tutorial
cd cluster_tutorial

# Add RpcNet dependency
cargo add rpcnet --features cluster
cargo add tokio --features full
cargo add anyhow
cargo add env_logger
</code></pre>
<p>Your <code>Cargo.toml</code> should include:</p>
<pre><code class="language-toml">[dependencies]
rpcnet = { version = "0.2", features = ["cluster"] }
tokio = { version = "1", features = ["full"] }
anyhow = "1"
env_logger = "0.11"
</code></pre>
<h2 id="step-1-define-the-rpc-interface"><a class="header" href="#step-1-define-the-rpc-interface">Step 1: Define the RPC Interface</a></h2>
<p>Create <code>compute.rpc.rs</code> to define the worker interface:</p>
<pre><pre class="playground"><code class="language-rust"><span class="boring">#![allow(unused)]
</span><span class="boring">fn main() {
</span>use rpcnet::prelude::*;

#[rpc_trait]
pub trait ComputeService {
    async fn process_task(&amp;self, task_id: String, data: Vec&lt;u8&gt;) -&gt; Result&lt;ComputeResult&gt;;
}

#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ComputeResult {
    pub task_id: String,
    pub result: Vec&lt;u8&gt;,
    pub worker_label: String,
}
<span class="boring">}</span></code></pre></pre>
<p><strong>Generate code</strong>:</p>
<pre><code class="language-bash">rpcnet-gen --input compute.rpc.rs --output src/generated
</code></pre>
<p>This creates <code>src/generated/compute_service.rs</code> with client and server stubs.</p>
<h2 id="step-2-implement-the-worker"><a class="header" href="#step-2-implement-the-worker">Step 2: Implement the Worker</a></h2>
<p>Create <code>src/bin/worker.rs</code>:</p>
<pre><pre class="playground"><code class="language-rust">use anyhow::Result;
use rpcnet::prelude::*;
use rpcnet::cluster::{ClusterMembership, ClusterConfig};
use std::sync::Arc;
use std::env;

mod generated;
use generated::compute_service::*;

struct WorkerHandler {
    label: String,
}

#[rpc_impl]
impl ComputeService for WorkerHandler {
    async fn process_task(&amp;self, task_id: String, data: Vec&lt;u8&gt;) -&gt; Result&lt;ComputeResult&gt; {
        println!("📋 [{}] Processing task: {}", self.label, task_id);
        
        // Simulate work
        tokio::time::sleep(tokio::time::Duration::from_millis(100)).await;
        
        // Return result with worker identity
        Ok(ComputeResult {
            task_id,
            result: data, // Echo data for demo
            worker_label: self.label.clone(),
        })
    }
}

#[tokio::main]
async fn main() -&gt; Result&lt;()&gt; {
    env_logger::init();
    
    // Get configuration from environment
    let worker_label = env::var("WORKER_LABEL").unwrap_or_else(|_| "worker-1".to_string());
    let worker_addr = env::var("WORKER_ADDR").unwrap_or_else(|_| "127.0.0.1:62001".to_string());
    let director_addr = env::var("DIRECTOR_ADDR").unwrap_or_else(|_| "127.0.0.1:61000".to_string());
    
    println!("👷 Starting Worker '{}' at {}", worker_label, worker_addr);
    
    // Load certificates
    let cert = std::fs::read("certs/test_cert.pem")?;
    let key = std::fs::read("certs/test_key.pem")?;
    
    // Create RPC server
    let config = ServerConfig::builder()
        .with_cert_and_key(cert, key)?
        .build();
    
    let mut server = Server::new(config);
    
    // Register compute handler
    let handler = Arc::new(WorkerHandler {
        label: worker_label.clone(),
    });
    server.register_service(handler);
    
    // Bind server
    println!("🔌 Binding server to {}...", worker_addr);
    server.bind(&amp;worker_addr).await?;
    println!("✅ Server bound successfully");
    
    // Enable cluster and join
    println!("🌐 Enabling cluster, connecting to director at {}...", director_addr);
    let cluster_config = ClusterConfig::default()
        .with_bind_addr(worker_addr.parse()?);
    
    let cluster = server.enable_cluster(cluster_config).await?;
    cluster.join(vec![director_addr.parse()?]).await?;
    println!("✅ Cluster enabled, connected to director");
    
    // Tag worker for discovery
    println!("🏷️  Tagging worker with role=worker and label={}...", worker_label);
    cluster.set_tag("role", "worker");
    cluster.set_tag("label", &amp;worker_label);
    println!("✅ Worker '{}' joined cluster with role=worker", worker_label);
    
    println!("🚀 Worker '{}' is running and ready to handle requests", worker_label);
    
    // Run server
    server.run().await?;
    
    Ok(())
}</code></pre></pre>
<h2 id="step-3-implement-the-director"><a class="header" href="#step-3-implement-the-director">Step 3: Implement the Director</a></h2>
<p>Create <code>src/bin/director.rs</code>:</p>
<pre><pre class="playground"><code class="language-rust">use anyhow::Result;
use rpcnet::prelude::*;
use rpcnet::cluster::{
    ClusterMembership, ClusterConfig, WorkerRegistry, 
    LoadBalancingStrategy, ClusterClient, ClusterClientConfig
};
use std::sync::Arc;
use std::env;

mod generated;
use generated::compute_service::*;

#[rpc_trait]
pub trait DirectorService {
    async fn get_worker(&amp;self) -&gt; Result&lt;String&gt;;
}

struct DirectorHandler {
    registry: Arc&lt;WorkerRegistry&gt;,
}

#[rpc_impl]
impl DirectorService for DirectorHandler {
    async fn get_worker(&amp;self) -&gt; Result&lt;String&gt; {
        println!("📨 Client requesting worker assignment");
        
        // Select worker using registry
        let worker = self.registry
            .select_worker(Some("role=worker"))
            .await
            .map_err(|e| anyhow::anyhow!("No workers available: {}", e))?;
        
        println!("✅ Assigned worker: {} at {}", worker.label, worker.addr);
        Ok(worker.addr.to_string())
    }
}

#[tokio::main]
async fn main() -&gt; Result&lt;()&gt; {
    env_logger::init();
    
    let director_addr = env::var("DIRECTOR_ADDR")
        .unwrap_or_else(|_| "127.0.0.1:61000".to_string());
    
    println!("🎯 Starting Director at {}", director_addr);
    
    // Load certificates
    println!("📁 Loading certificates from certs/");
    let cert = std::fs::read("certs/test_cert.pem")?;
    let key = std::fs::read("certs/test_key.pem")?;
    
    // Create server
    let config = ServerConfig::builder()
        .with_cert_and_key(cert, key)?
        .build();
    
    let mut server = Server::new(config);
    
    // Enable cluster first
    let cluster_config = ClusterConfig::default()
        .with_bind_addr(director_addr.parse()?);
    
    let cluster = server.enable_cluster(cluster_config).await?;
    println!("✅ Director registered itself in cluster");
    println!("✅ Cluster enabled - Director is now discoverable");
    
    // Create worker registry with load balancing
    let registry = Arc::new(WorkerRegistry::new(
        cluster,
        LoadBalancingStrategy::LeastConnections
    ));
    registry.start().await;
    
    println!("🔄 Load balancing strategy: LeastConnections");
    
    // Register director service
    let handler = Arc::new(DirectorHandler {
        registry: registry.clone(),
    });
    server.register_service(handler);
    
    // Bind and run
    server.bind(&amp;director_addr).await?;
    
    // Monitor worker pool
    tokio::spawn({
        let registry = registry.clone();
        async move {
            loop {
                tokio::time::sleep(tokio::time::Duration::from_secs(10)).await;
                let workers = registry.workers().await;
                println!("📊 Worker pool status: {} workers available", workers.len());
                for worker in workers {
                    println!("   - {} at {} ({} connections)", 
                        worker.label, worker.addr, worker.active_connections);
                }
            }
        }
    });
    
    println!("🚀 Director ready - listening on {}", director_addr);
    
    server.run().await?;
    
    Ok(())
}</code></pre></pre>
<h2 id="step-4-implement-the-client"><a class="header" href="#step-4-implement-the-client">Step 4: Implement the Client</a></h2>
<p>Create <code>src/bin/client.rs</code>:</p>
<pre><pre class="playground"><code class="language-rust">use anyhow::Result;
use rpcnet::prelude::*;
use std::env;

mod generated;
use generated::compute_service::*;
use generated::director_service::*;

#[tokio::main]
async fn main() -&gt; Result&lt;()&gt; {
    env_logger::init();
    
    let director_addr = env::var("DIRECTOR_ADDR")
        .unwrap_or_else(|_| "127.0.0.1:61000".to_string());
    
    println!("📡 Starting Client - connecting to director at {}", director_addr);
    
    // Load certificate for TLS
    let cert = std::fs::read("certs/test_cert.pem")?;
    
    let config = ClientConfig::builder()
        .with_server_cert(cert)?
        .build();
    
    // Connect to director
    let director_client = DirectorClient::connect(&amp;director_addr, config.clone()).await?;
    println!("✅ Connected to director");
    
    // Main loop: get worker, process tasks, handle failures
    let mut task_counter = 0;
    loop {
        // Get worker assignment from director
        println!("🔍 Asking director for worker assignment");
        let worker_addr = match director_client.get_worker().await {
            Ok(addr) =&gt; {
                println!("🔀 Director assigned worker at {}", addr);
                addr
            }
            Err(e) =&gt; {
                println!("❌ Failed to get worker: {}", e);
                tokio::time::sleep(tokio::time::Duration::from_secs(2)).await;
                continue;
            }
        };
        
        // Connect to worker directly
        println!("✅ Establishing direct connection to worker");
        let worker_client = match ComputeClient::connect(&amp;worker_addr, config.clone()).await {
            Ok(client) =&gt; {
                println!("✅ Direct connection established");
                client
            }
            Err(e) =&gt; {
                println!("❌ Failed to connect to worker: {}", e);
                continue;
            }
        };
        
        // Process tasks until worker fails
        loop {
            task_counter += 1;
            let task_id = format!("task-{}", task_counter);
            let data = format!("data-{}", task_counter).into_bytes();
            
            println!("📤 Sending task: {}", task_id);
            
            match worker_client.process_task(task_id.clone(), data).await {
                Ok(result) =&gt; {
                    println!("✅ Task {} completed by worker: {}", 
                        result.task_id, result.worker_label);
                    
                    // Wait before next task
                    tokio::time::sleep(tokio::time::Duration::from_secs(1)).await;
                }
                Err(e) =&gt; {
                    println!("⚠️  Worker failed: {} - returning to director", e);
                    break; // Get new worker from director
                }
            }
        }
    }
}</code></pre></pre>
<h2 id="step-5-update-cargotoml"><a class="header" href="#step-5-update-cargotoml">Step 5: Update Cargo.toml</a></h2>
<p>Add the binary definitions to <code>Cargo.toml</code>:</p>
<pre><code class="language-toml">[[bin]]
name = "director"
path = "src/bin/director.rs"

[[bin]]
name = "worker"
path = "src/bin/worker.rs"

[[bin]]
name = "client"
path = "src/bin/client.rs"
</code></pre>
<p>Also create <code>src/lib.rs</code> (<code>cargo new --bin</code> does not generate one) and declare the generated module in it:</p>
<pre><pre class="playground"><code class="language-rust"><span class="boring">#![allow(unused)]
</span><span class="boring">fn main() {
</span>pub mod generated;
<span class="boring">}</span></code></pre></pre>
<h2 id="step-6-run-the-cluster"><a class="header" href="#step-6-run-the-cluster">Step 6: Run the Cluster</a></h2>
<p>Open <strong>four terminals</strong> and run each component:</p>
<h3 id="terminal-1-start-director"><a class="header" href="#terminal-1-start-director">Terminal 1: Start Director</a></h3>
<pre><code class="language-bash">DIRECTOR_ADDR=127.0.0.1:61000 \
  RUST_LOG=info \
  cargo run --bin director
</code></pre>
<p>Wait for: <code>🚀 Director ready - listening on 127.0.0.1:61000</code></p>
<h3 id="terminal-2-start-worker-a"><a class="header" href="#terminal-2-start-worker-a">Terminal 2: Start Worker A</a></h3>
<pre><code class="language-bash">WORKER_LABEL=worker-a \
  WORKER_ADDR=127.0.0.1:62001 \
  DIRECTOR_ADDR=127.0.0.1:61000 \
  RUST_LOG=info \
  cargo run --bin worker
</code></pre>
<p>Wait for: <code>🚀 Worker 'worker-a' is running and ready to handle requests</code></p>
<h3 id="terminal-3-start-worker-b"><a class="header" href="#terminal-3-start-worker-b">Terminal 3: Start Worker B</a></h3>
<pre><code class="language-bash">WORKER_LABEL=worker-b \
  WORKER_ADDR=127.0.0.1:62002 \
  DIRECTOR_ADDR=127.0.0.1:61000 \
  RUST_LOG=info \
  cargo run --bin worker
</code></pre>
<p>Wait for: <code>🚀 Worker 'worker-b' is running and ready to handle requests</code></p>
<h3 id="terminal-4-run-client"><a class="header" href="#terminal-4-run-client">Terminal 4: Run Client</a></h3>
<pre><code class="language-bash">DIRECTOR_ADDR=127.0.0.1:61000 \
  RUST_LOG=info \
  cargo run --bin client
</code></pre>
<h2 id="step-7-observe-the-system"><a class="header" href="#step-7-observe-the-system">Step 7: Observe the System</a></h2>
<h3 id="director-output"><a class="header" href="#director-output">Director Output</a></h3>
<pre><code>🎯 Starting Director at 127.0.0.1:61000
📁 Loading certificates from certs/
✅ Director registered itself in cluster
✅ Cluster enabled - Director is now discoverable
🔄 Load balancing strategy: LeastConnections
🚀 Director ready - listening on 127.0.0.1:61000
📊 Worker pool status: 2 workers available
   - worker-a at 127.0.0.1:62001 (0 connections)
   - worker-b at 127.0.0.1:62002 (0 connections)
📨 Client requesting worker assignment
✅ Assigned worker: worker-a at 127.0.0.1:62001
</code></pre>
<h3 id="worker-output"><a class="header" href="#worker-output">Worker Output</a></h3>
<pre><code>👷 Starting Worker 'worker-a' at 127.0.0.1:62001
🔌 Binding server to 127.0.0.1:62001...
✅ Server bound successfully
🌐 Enabling cluster, connecting to director at 127.0.0.1:61000...
✅ Cluster enabled, connected to director
🏷️  Tagging worker with role=worker and label=worker-a...
✅ Worker 'worker-a' joined cluster with role=worker
🚀 Worker 'worker-a' is running and ready to handle requests
📋 [worker-a] Processing task: task-1
📋 [worker-a] Processing task: task-2
</code></pre>
<h3 id="client-output"><a class="header" href="#client-output">Client Output</a></h3>
<pre><code>📡 Starting Client - connecting to director at 127.0.0.1:61000
✅ Connected to director
🔍 Asking director for worker assignment
🔀 Director assigned worker at 127.0.0.1:62001
✅ Establishing direct connection to worker
✅ Direct connection established
📤 Sending task: task-1
✅ Task task-1 completed by worker: worker-a
📤 Sending task: task-2
✅ Task task-2 completed by worker: worker-a
</code></pre>
<h2 id="step-8-test-failure-handling"><a class="header" href="#step-8-test-failure-handling">Step 8: Test Failure Handling</a></h2>
<h3 id="scenario-1-kill-a-worker"><a class="header" href="#scenario-1-kill-a-worker">Scenario 1: Kill a Worker</a></h3>
<p>In Worker A terminal, press <strong>Ctrl+C</strong> to kill it.</p>
<p><strong>Observe</strong>:</p>
<ul>
<li>Director detects failure via gossip: <code>Node worker-a failed</code></li>
<li>Director updates worker pool: <code>📊 Worker pool status: 1 workers available</code></li>
<li>Client detects error: <code>⚠️  Worker failed: &lt;error&gt; - returning to director</code></li>
<li>Client gets new worker: <code>🔀 Director assigned worker at 127.0.0.1:62002</code></li>
<li>Tasks continue on Worker B. Note that this simple client does not retry the request that failed; it moves on to the next task ID</li>
</ul>
<h3 id="scenario-2-restart-worker"><a class="header" href="#scenario-2-restart-worker">Scenario 2: Restart Worker</a></h3>
<p>Restart Worker A:</p>
<pre><code class="language-bash">WORKER_LABEL=worker-a \
  WORKER_ADDR=127.0.0.1:62001 \
  DIRECTOR_ADDR=127.0.0.1:61000 \
  RUST_LOG=info \
  cargo run --bin worker
</code></pre>
<p><strong>Observe</strong>:</p>
<ul>
<li>Worker rejoins automatically</li>
<li>Gossip spreads availability</li>
<li>Director adds it back to the pool: <code>📊 Worker pool status: 2 workers available</code></li>
<li>Future client requests can use either worker</li>
</ul>
<h2 id="what-you-learned"><a class="header" href="#what-you-learned">What You Learned</a></h2>
<p>Congratulations! You've built a complete distributed RPC cluster. You now understand:</p>
<p>✅ <strong>Automatic Discovery</strong>: Workers join via gossip, no manual registration<br />
✅ <strong>Load Balancing</strong>: Director uses LeastConnections strategy automatically<br />
✅ <strong>Failure Detection</strong>: Gossip protocol detects and handles node failures<br />
✅ <strong>Client Failover</strong>: Clients handle worker failures gracefully<br />
✅ <strong>Tag-Based Routing</strong>: Filter workers by role (<code>role=worker</code>)</p>
<h2 id="next-steps-2"><a class="header" href="#next-steps-2">Next Steps</a></h2>
<h3 id="add-more-workers"><a class="header" href="#add-more-workers">Add More Workers</a></h3>
<p>Scale up by adding more workers with different labels:</p>
<pre><code class="language-bash">WORKER_LABEL=worker-c \
  WORKER_ADDR=127.0.0.1:62003 \
  DIRECTOR_ADDR=127.0.0.1:61000 \
  cargo run --bin worker
</code></pre>
<h3 id="try-different-load-balancing"><a class="header" href="#try-different-load-balancing">Try Different Load Balancing</a></h3>
<p>Change the strategy in <code>director.rs</code>:</p>
<pre><pre class="playground"><code class="language-rust"><span class="boring">#![allow(unused)]
</span><span class="boring">fn main() {
</span>LoadBalancingStrategy::RoundRobin       // Even distribution
LoadBalancingStrategy::Random           // Random selection
LoadBalancingStrategy::LeastConnections // Pick least loaded (default)
<span class="boring">}</span></code></pre></pre>
<h3 id="add-custom-tags"><a class="header" href="#add-custom-tags">Add Custom Tags</a></h3>
<p>Tag workers by capability:</p>
<pre><pre class="playground"><code class="language-rust"><span class="boring">#![allow(unused)]
</span><span class="boring">fn main() {
</span>cluster.set_tag("gpu", "true");
cluster.set_tag("zone", "us-west");
<span class="boring">}</span></code></pre></pre>
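<p>Then filter on those tags when selecting a worker (in this tutorial the director owns the registry, so this goes in <code>director.rs</code>):</p>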
<pre><pre class="playground"><code class="language-rust"><span class="boring">#![allow(unused)]
</span><span class="boring">fn main() {
</span>registry.select_worker(Some("gpu=true")).await?;
<span class="boring">}</span></code></pre></pre>
<h3 id="monitor-cluster-events"><a class="header" href="#monitor-cluster-events">Monitor Cluster Events</a></h3>
<p>Subscribe to events in director or workers:</p>
<pre><pre class="playground"><code class="language-rust"><span class="boring">#![allow(unused)]
</span><span class="boring">fn main() {
</span>let mut events = cluster.subscribe();
while let Some(event) = events.recv().await {
    match event {
        ClusterEvent::NodeJoined(node) =&gt; println!("Node joined: {:?}", node),
        ClusterEvent::NodeLeft(node) =&gt; println!("Node left: {:?}", node),
        ClusterEvent::NodeFailed(node) =&gt; println!("Node failed: {:?}", node),
    }
}
<span class="boring">}</span></code></pre></pre>
<h2 id="further-reading"><a class="header" href="#further-reading">Further Reading</a></h2>
<ul>
<li><strong><a href="cluster/discovery.html">Discovery</a></strong> - Learn how SWIM gossip protocol works</li>
<li><strong><a href="cluster/load-balancing.html">Load Balancing</a></strong> - Deep dive into strategies</li>
<li><strong><a href="cluster/health.html">Health Checking</a></strong> - Understand Phi Accrual algorithm</li>
<li><strong><a href="cluster/failures.html">Failure Handling</a></strong> - Advanced partition detection</li>
</ul>
<p>Or explore the <strong><a href="cluster/../cluster-example.html">Complete Cluster Example</a></strong> with streaming and advanced features.</p>
<div style="break-before: page; page-break-before: always;"></div><h1 id="automatic-discovery"><a class="header" href="#automatic-discovery">Automatic Discovery</a></h1>
<p>RpcNet uses the <strong>SWIM (Scalable Weakly-consistent Infection-style Process Group Membership)</strong> protocol for automatic node discovery. This chapter explains how nodes find each other without central coordination or manual registration.</p>
<h2 id="how-discovery-works"><a class="header" href="#how-discovery-works">How Discovery Works</a></h2>
<h3 id="the-problem"><a class="header" href="#the-problem">The Problem</a></h3>
<p>In distributed systems, you need to know:</p>
<ul>
<li>Which nodes are currently alive?</li>
<li>Which nodes just joined?</li>
<li>Which nodes have failed or left?</li>
</ul>
<p>Traditional solutions have limitations:</p>
<ul>
<li><strong>Centralized registry</strong>: Single point of failure</li>
<li><strong>Broadcast</strong>: Doesn't scale (O(N²) messages)</li>
<li><strong>Heartbeats</strong>: Network overhead grows with cluster size</li>
</ul>
<h3 id="the-swim-solution"><a class="header" href="#the-swim-solution">The SWIM Solution</a></h3>
<p>SWIM provides <strong>scalable membership</strong> with constant overhead per node:</p>
<pre><code>┌─────────────────────────────────────────────────────┐
│  Node A discovers new nodes through gossip          │
│  without contacting every node in the cluster       │
└─────────────────────────────────────────────────────┘

     Node A                    Node B                    Node C
       │                         │                         │
       │   1. Ping (health)      │                         │
       ├────────────────────────►│                         │
       │                         │                         │
       │   2. Ack + Gossip       │                         │
       │◄────────────────────────┤                         │
       │   (includes info        │                         │
       │    about Node C)        │                         │
       │                         │                         │
       │   3. Now A knows C      │                         │
       │   exists without        │                         │
       │   direct contact!       │                         │
       │                         │                         │
       └─────────────┬───────────┴─────────────────────────┘
                     │
              Information spreads
              exponentially fast
</code></pre>
<h2 id="swim-protocol-basics"><a class="header" href="#swim-protocol-basics">SWIM Protocol Basics</a></h2>
<h3 id="1-gossip-based-communication"><a class="header" href="#1-gossip-based-communication">1. Gossip-Based Communication</a></h3>
<p>Nodes periodically exchange information with random peers:</p>
<pre><pre class="playground"><code class="language-rust"><span class="boring">#![allow(unused)]
</span><span class="boring">fn main() {
</span>// Simplified gossip cycle (every 1 second by default)
loop {
    // Pick random node
    let peer = select_random_node();
    
    // Send health check + gossip payload
    let gossip = GossipMessage {
        sender: my_node_id,
        members: my_known_members.clone(),
        incarnation: my_incarnation,
    };
    peer.ping(gossip).await?;
    
    // Receive ack + peer's gossip
    let ack = receive_ack().await?;
    merge_member_information(ack.members);
    
    tokio::time::sleep(Duration::from_secs(1)).await;
}
<span class="boring">}</span></code></pre></pre>
<p><strong>Key properties</strong>:</p>
<ul>
<li>Constant overhead per node: O(1) messages per cycle</li>
<li>Information spreads exponentially: O(log N) time</li>
<li>No single point of failure</li>
<li>Works with network partitions</li>
</ul>
<h3 id="2-three-node-states"><a class="header" href="#2-three-node-states">2. Three Node States</a></h3>
<p>SWIM tracks nodes in three states:</p>
<pre><pre class="playground"><code class="language-rust"><span class="boring">#![allow(unused)]
</span><span class="boring">fn main() {
</span>pub enum NodeState {
    Alive,      // Node is healthy and responding
    Suspect,    // Node might be failed (under investigation)
    Failed,     // Node confirmed failed
}
<span class="boring">}</span></code></pre></pre>
<p><strong>State transitions</strong>:</p>
<pre><code>         ┌──────────────────────────────────────┐
         │                                      │
         │  Join cluster                        │  Gossip confirms alive
         │                                      │
    ┌────▼─────┐  No response after 3 pings  ┌─▼──────┐
    │  Alive   ├───────────────────────────►  │Suspect │
    └────┬─────┘                              └───┬────┘
         │                                        │
         │  Voluntary leave                       │  Confirmed by multiple nodes
         │                                        │  or timeout
         │                                    ┌───▼────┐
         └───────────────────────────────────►│ Failed │
                                              └────────┘
</code></pre>
<h3 id="3-failure-detection-protocol"><a class="header" href="#3-failure-detection-protocol">3. Failure Detection Protocol</a></h3>
<p>SWIM uses <strong>indirect probing</strong> to avoid false positives:</p>
<p><strong>Direct Probe</strong> (normal case):</p>
<pre><code>Node A                  Node B
  │                       │
  │  1. Ping              │
  ├──────────────────────►│
  │                       │
  │  2. Ack               │
  │◄──────────────────────┤
  │                       │
  │  B is alive ✓         │
</code></pre>
<p><strong>Indirect Probe</strong> (when direct fails):</p>
<pre><code>Node A                  Node C                  Node B
  │                       │                       │
  │  1. Ping (timeout)    │                       │
  ├───────────────────────┼─────────────────────X │
  │                       │                       │
  │  2. Ask C to probe B  │                       │
  ├──────────────────────►│                       │
  │                       │  3. Ping              │
  │                       ├──────────────────────►│
  │                       │                       │
  │                       │  4. Ack               │
  │                       │◄──────────────────────┤
  │  5. B is alive via C  │                       │
  │◄──────────────────────┤                       │
  │                       │                       │
  │  B is alive ✓         │                       │
</code></pre>
<p>This prevents false positives from temporary network issues.</p>
<h2 id="rpcnet-implementation"><a class="header" href="#rpcnet-implementation">RpcNet Implementation</a></h2>
<h3 id="joining-a-cluster"><a class="header" href="#joining-a-cluster">Joining a Cluster</a></h3>
<p>When a node starts, it joins by contacting one or more <strong>seed nodes</strong>:</p>
<pre><pre class="playground"><code class="language-rust"><span class="boring">#![allow(unused)]
</span><span class="boring">fn main() {
</span>use rpcnet::cluster::{ClusterMembership, ClusterConfig};

// Create cluster membership
let cluster_config = ClusterConfig::default()
    .with_bind_addr("0.0.0.0:7946".parse()?);

let cluster = ClusterMembership::new(cluster_config).await?;

// Join via seed nodes (directors, known workers, etc.)
let seeds = vec![
    "director.example.com:7946".parse()?,
    "worker-1.example.com:7946".parse()?,
];

cluster.join(seeds).await?;
<span class="boring">}</span></code></pre></pre>
<p><strong>What happens during join</strong>:</p>
<ol>
<li><strong>Contact seed nodes</strong>: Node sends join request to all seeds</li>
<li><strong>Receive member list</strong>: Seed responds with known cluster members</li>
<li><strong>Merge member info</strong>: Node learns about entire cluster</li>
<li><strong>Start gossip</strong>: Node begins exchanging info with randomly selected members</li>
<li><strong>Spread join event</strong>: Other nodes learn about new member via gossip</li>
</ol>
<p><strong>Time to full discovery</strong>: ~O(log N) gossip cycles (typically 2-5 seconds)</p>
<h3 id="tagging-nodes"><a class="header" href="#tagging-nodes">Tagging Nodes</a></h3>
<p>Nodes can advertise capabilities via <strong>tags</strong>:</p>
<pre><pre class="playground"><code class="language-rust"><span class="boring">#![allow(unused)]
</span><span class="boring">fn main() {
</span>// Tag worker with role and capabilities
cluster.set_tag("role", "worker");
cluster.set_tag("label", "worker-gpu-1");
cluster.set_tag("gpu", "true");
cluster.set_tag("zone", "us-west-2a");
cluster.set_tag("memory", "64GB");
<span class="boring">}</span></code></pre></pre>
<p><strong>Tags are gossiped</strong> to all nodes, enabling:</p>
<ul>
<li>Service discovery (find all nodes with <code>role=worker</code>)</li>
<li>Capability-based routing (find nodes with <code>gpu=true</code>)</li>
<li>Zone-aware load balancing (prefer nodes in <code>zone=us-west-2a</code>)</li>
</ul>
<h3 id="subscribing-to-events"><a class="header" href="#subscribing-to-events">Subscribing to Events</a></h3>
<p>Monitor cluster changes in real-time:</p>
<pre><pre class="playground"><code class="language-rust"><span class="boring">#![allow(unused)]
</span><span class="boring">fn main() {
</span>use rpcnet::cluster::ClusterEvent;

let mut events = cluster.subscribe();

while let Some(event) = events.recv().await {
    match event {
        ClusterEvent::NodeJoined(node) =&gt; {
            println!("New node: {} at {}", node.id, node.addr);
            println!("Tags: {:?}", node.tags);
        }
        ClusterEvent::NodeLeft(node) =&gt; {
            println!("Node left gracefully: {}", node.id);
        }
        ClusterEvent::NodeFailed(node) =&gt; {
            println!("Node failed: {}", node.id);
            // Take action: remove from pool, alert monitoring, etc.
        }
    }
}
<span class="boring">}</span></code></pre></pre>
<h2 id="gossip-internals"><a class="header" href="#gossip-internals">Gossip Internals</a></h2>
<h3 id="gossip-message-structure"><a class="header" href="#gossip-message-structure">Gossip Message Structure</a></h3>
<p>Each gossip message contains:</p>
<pre><pre class="playground"><code class="language-rust"><span class="boring">#![allow(unused)]
</span><span class="boring">fn main() {
</span>struct GossipMessage {
    // Sender identification
    sender_id: Uuid,
    sender_addr: SocketAddr,
    incarnation: u64,  // Version counter used to refute stale suspicion
    
    // Member information
    members: Vec&lt;MemberInfo&gt;,
    
    // Piggyback information
    events: Vec&lt;ClusterEvent&gt;,
}

struct MemberInfo {
    id: Uuid,
    addr: SocketAddr,
    state: NodeState,
    incarnation: u64,
    tags: HashMap&lt;String, String&gt;,
    last_seen: SystemTime,
}
<span class="boring">}</span></code></pre></pre>
<h3 id="gossip-cycle"><a class="header" href="#gossip-cycle">Gossip Cycle</a></h3>
<p><strong>Every gossip interval</strong> (default: 1 second):</p>
<ol>
<li><strong>Select target</strong>: Pick random node from member list</li>
<li><strong>Prepare message</strong>: Collect recent events and member updates</li>
<li><strong>Send ping</strong>: UDP datagram with gossip payload</li>
<li><strong>Wait for ack</strong>: Timeout after 500ms (configurable)</li>
<li><strong>Merge information</strong>: Update local member list with received data</li>
<li><strong>Detect failures</strong>: Check for nodes that haven't responded</li>
</ol>
<h3 id="information-spread-speed"><a class="header" href="#information-spread-speed">Information Spread Speed</a></h3>
<p>With <strong>N nodes</strong> and <strong>gossip interval T</strong>:</p>
<ul>
<li><strong>1 node</strong> knows: 0 seconds (initial)</li>
<li><strong>2 nodes</strong> know: T seconds (1st gossip)</li>
<li><strong>4 nodes</strong> know: 2T seconds (2nd gossip)</li>
<li><strong>8 nodes</strong> know: 3T seconds (3rd gossip)</li>
<li><strong>N nodes</strong> know: (log₂ N) × T seconds</li>
</ul>
<p><strong>Example</strong>: 1000-node cluster, 1-second interval:</p>
<ul>
<li>Full propagation: ~10 seconds (log₂ 1000 ≈ 10)</li>
</ul>
<h2 id="advanced-features"><a class="header" href="#advanced-features">Advanced Features</a></h2>
<h3 id="incarnation-numbers"><a class="header" href="#incarnation-numbers">Incarnation Numbers</a></h3>
<p>Each node maintains an <strong>incarnation counter</strong> to refute false failure suspicions:</p>
<p><strong>Problem</strong>: Node A suspects Node B is failed, but B is actually alive.</p>
<p><strong>Solution</strong>: B increments its incarnation number and gossips "I'm alive with incarnation N+1". This overrides stale failure suspicion.</p>
<pre><pre class="playground"><code class="language-rust"><span class="boring">#![allow(unused)]
</span><span class="boring">fn main() {
</span>// Node B refutes failure suspicion
if cluster.is_suspected() {
    cluster.increment_incarnation();
    cluster.broadcast_alive();
}
<span class="boring">}</span></code></pre></pre>
<h3 id="anti-entropy"><a class="header" href="#anti-entropy">Anti-Entropy</a></h3>
<p>Periodically, nodes perform <strong>full state synchronization</strong> to:</p>
<ul>
<li>Fix inconsistencies from packet loss</li>
<li>Recover from network partitions</li>
<li>Ensure eventual consistency</li>
</ul>
<pre><pre class="playground"><code class="language-rust"><span class="boring">#![allow(unused)]
</span><span class="boring">fn main() {
</span>// Every 10 gossip cycles, do full sync with random node
if cycle_count % 10 == 0 {
    let peer = select_random_node();
    let full_state = get_all_members();
    peer.sync(full_state).await?;
}
<span class="boring">}</span></code></pre></pre>
<h3 id="partition-detection"><a class="header" href="#partition-detection">Partition Detection</a></h3>
<p>SWIM can detect <strong>network partitions</strong>:</p>
<pre><code>Before partition:            After partition:
     Cluster                     Cluster A  |  Cluster B
        │                            │      |      │
  ┌─────┼─────┐                ┌─────┼─────┐|┌─────┼─────┐
  A     B     C                A     B      ||     C     D
  │     │     │                │     │      ||     │     │
  └─────┼─────┘                └─────┘      |└─────┘     
        D                                   |
                                         SPLIT!
</code></pre>
<p><strong>Detection</strong>: Nodes in partition A can't reach nodes in partition B after multiple indirect probes.</p>
<p><strong>Handling</strong>:</p>
<ul>
<li>Each partition continues operating independently</li>
<li>When partition heals, gossip merges the views</li>
<li>Application must handle split-brain scenarios</li>
</ul>
<h2 id="configuration"><a class="header" href="#configuration">Configuration</a></h2>
<h3 id="tuning-gossip-parameters"><a class="header" href="#tuning-gossip-parameters">Tuning Gossip Parameters</a></h3>
<pre><pre class="playground"><code class="language-rust"><span class="boring">#![allow(unused)]
</span><span class="boring">fn main() {
</span>use rpcnet::cluster::ClusterConfig;
use std::time::Duration;

let config = ClusterConfig::default()
    .with_bind_addr("0.0.0.0:7946".parse()?)
    .with_gossip_interval(Duration::from_secs(1))      // How often to gossip
    .with_probe_timeout(Duration::from_millis(500))    // Ping timeout
    .with_indirect_probes(3)                           // How many indirect probes
    .with_suspicion_timeout(Duration::from_secs(5))    // Suspect → Failed timeout
    .with_gossip_fanout(3);                            // How many nodes to gossip to

let cluster = ClusterMembership::new(config).await?;
<span class="boring">}</span></code></pre></pre>
<h3 id="tuning-guidelines"><a class="header" href="#tuning-guidelines">Tuning Guidelines</a></h3>
<p><strong>Small clusters</strong> (&lt; 10 nodes):</p>
<ul>
<li>Longer intervals (2-3 seconds)</li>
<li>Faster timeouts (200ms)</li>
<li>Lower fanout (1-2 nodes)</li>
</ul>
<p><strong>Medium clusters</strong> (10-100 nodes):</p>
<ul>
<li>Default settings (1-second gossip interval, 500ms probe timeout, fanout of 3)</li>
</ul>
<p><strong>Large clusters</strong> (100-1000 nodes):</p>
<ul>
<li>Shorter intervals (500ms)</li>
<li>More indirect probes (5+)</li>
<li>Higher fanout (5-7 nodes)</li>
</ul>
<p><strong>Very large clusters</strong> (1000+ nodes):</p>
<ul>
<li>Consider hierarchical clustering</li>
<li>Adjust suspicion timeout upward</li>
<li>Use regional seed nodes</li>
</ul>
<h2 id="failure-scenarios"><a class="header" href="#failure-scenarios">Failure Scenarios</a></h2>
<h3 id="temporary-network-glitch"><a class="header" href="#temporary-network-glitch">Temporary Network Glitch</a></h3>
<pre><code>Node A pings B → timeout (network glitch)
Node A → Suspect B
Node A asks C to probe B
Node C → B responds ✓
Node A → B is Alive (false alarm avoided)
</code></pre>
<p><strong>Result</strong>: No false positive due to indirect probing.</p>
<h3 id="actual-node-failure"><a class="header" href="#actual-node-failure">Actual Node Failure</a></h3>
<pre><code>Node A pings B → timeout
Node A → Suspect B
Node A asks C, D, E to probe B → all timeout
Suspicion timeout expires (5 seconds)
Node A → B is Failed
Gossip spreads: B failed
All nodes remove B from active pool
</code></pre>
<p><strong>Result</strong>: B marked failed within ~6 seconds (1s ping + 5s suspicion).</p>
<h3 id="network-partition"><a class="header" href="#network-partition">Network Partition</a></h3>
<pre><code>Partition occurs: {A, B} | {C, D}

In partition {A, B}:
- A and B communicate normally
- C and D marked as Failed

In partition {C, D}:
- C and D communicate normally
- A and B marked as Failed

Partition heals:
- Gossip exchanges full state
- All nodes marked Alive again
- Incarnation numbers resolve conflicts
</code></pre>
<p><strong>Result</strong>: Both partitions continue operating; merge when healed.</p>
<h2 id="best-practices"><a class="header" href="#best-practices">Best Practices</a></h2>
<h3 id="1-use-multiple-seed-nodes"><a class="header" href="#1-use-multiple-seed-nodes">1. Use Multiple Seed Nodes</a></h3>
<pre><pre class="playground"><code class="language-rust"><span class="boring">#![allow(unused)]
</span><span class="boring">fn main() {
</span>// ✅ Good: Multiple seeds for reliability
let seeds = vec![
    "seed-1.cluster.local:7946".parse()?,
    "seed-2.cluster.local:7946".parse()?,
    "seed-3.cluster.local:7946".parse()?,
];

// ❌ Bad: Single seed (single point of failure)
let seeds = vec!["seed-1.cluster.local:7946".parse()?];
<span class="boring">}</span></code></pre></pre>
<h3 id="2-monitor-cluster-events"><a class="header" href="#2-monitor-cluster-events">2. Monitor Cluster Events</a></h3>
<pre><pre class="playground"><code class="language-rust"><span class="boring">#![allow(unused)]
</span><span class="boring">fn main() {
</span>// Log all cluster changes for debugging
tokio::spawn(async move {
    let mut events = cluster.subscribe();
    while let Some(event) = events.recv().await {
        log::info!("Cluster event: {:?}", event);
        metrics.record_cluster_event(&amp;event);
    }
});
<span class="boring">}</span></code></pre></pre>
<h3 id="3-tag-nodes-with-rich-metadata"><a class="header" href="#3-tag-nodes-with-rich-metadata">3. Tag Nodes with Rich Metadata</a></h3>
<pre><pre class="playground"><code class="language-rust"><span class="boring">#![allow(unused)]
</span><span class="boring">fn main() {
</span>// Provide detailed tags for routing decisions
cluster.set_tag("role", "worker");
cluster.set_tag("version", env!("CARGO_PKG_VERSION"));
cluster.set_tag("zone", get_availability_zone());
cluster.set_tag("instance_type", "m5.xlarge");
cluster.set_tag("capabilities", "gpu,video-encode");
<span class="boring">}</span></code></pre></pre>
<h3 id="4-handle-partition-detection"><a class="header" href="#4-handle-partition-detection">4. Handle Partition Detection</a></h3>
<pre><pre class="playground"><code class="language-rust"><span class="boring">#![allow(unused)]
</span><span class="boring">fn main() {
</span>// Detect partitions and alert
let mut events = cluster.subscribe();
while let Some(event) = events.recv().await {
    if let ClusterEvent::PartitionDetected = event {
        alert_ops_team("Network partition detected!");
        enable_read_only_mode(); // Prevent split-brain writes
    }
}
<span class="boring">}</span></code></pre></pre>
<h3 id="5-graceful-shutdown"><a class="header" href="#5-graceful-shutdown">5. Graceful Shutdown</a></h3>
<pre><pre class="playground"><code class="language-rust"><span class="boring">#![allow(unused)]
</span><span class="boring">fn main() {
</span>// Leave cluster gracefully when shutting down
cluster.leave().await?;

// This tells other nodes "I'm leaving intentionally"
// rather than waiting for failure detection timeout
<span class="boring">}</span></code></pre></pre>
<h2 id="comparison-to-other-protocols"><a class="header" href="#comparison-to-other-protocols">Comparison to Other Protocols</a></h2>
<div class="table-wrapper"><table><thead><tr><th>Feature</th><th>SWIM (RpcNet)</th><th>Raft</th><th>Consul</th><th>Kubernetes</th></tr></thead><tbody>
<tr><td><strong>Consistency</strong></td><td>Eventual</td><td>Strong</td><td>Strong</td><td>Eventual</td></tr>
<tr><td><strong>Failure Detection</strong></td><td>Phi Accrual</td><td>Leader heartbeat</td><td>Gossip</td><td>kubelet heartbeat</td></tr>
<tr><td><strong>Scalability</strong></td><td>1000+ nodes</td><td>~10 nodes</td><td>100s of nodes</td><td>1000s of nodes</td></tr>
<tr><td><strong>Partition Handling</strong></td><td>Both sides live</td><td>Majority only</td><td>Both sides live</td><td>Both sides live</td></tr>
<tr><td><strong>Network Overhead</strong></td><td>O(1) per node</td><td>O(N) from leader</td><td>O(1) per node</td><td>O(1) per node</td></tr>
<tr><td><strong>Setup Complexity</strong></td><td>Low</td><td>Medium</td><td>Medium</td><td>High</td></tr>
</tbody></table>
</div>
<p><strong>When to use SWIM</strong>:</p>
<ul>
<li>Large clusters (100+ nodes)</li>
<li>Partition tolerance required</li>
<li>Eventual consistency acceptable</li>
<li>Decentralized architecture preferred</li>
</ul>
<p><strong>When NOT to use SWIM</strong>:</p>
<ul>
<li>Strong consistency required → Use Raft</li>
<li>Small clusters (&lt; 5 nodes) → Direct RPC simpler</li>
<li>Centralized control desired → Use coordinator pattern</li>
</ul>
<h2 id="troubleshooting-2"><a class="header" href="#troubleshooting-2">Troubleshooting</a></h2>
<h3 id="nodes-not-discovering"><a class="header" href="#nodes-not-discovering">Nodes Not Discovering</a></h3>
<p><strong>Symptom</strong>: Workers join but director doesn't see them.</p>
<p><strong>Debug</strong>:</p>
<pre><pre class="playground"><code class="language-rust"><span class="boring">#![allow(unused)]
</span><span class="boring">fn main() {
</span>// Enable debug logging by running from your shell:
//   RUST_LOG=rpcnet::cluster=debug cargo run

// Check what nodes are known
let members = cluster.members().await;
println!("Known members: {:?}", members);
<span class="boring">}</span></code></pre></pre>
<p><strong>Common causes</strong>:</p>
<ul>
<li>Firewall blocking UDP gossip port</li>
<li>Wrong seed node address</li>
<li>Network partition</li>
</ul>
<h3 id="slow-propagation"><a class="header" href="#slow-propagation">Slow Propagation</a></h3>
<p><strong>Symptom</strong>: Takes 30+ seconds for nodes to discover each other.</p>
<p><strong>Debug</strong>:</p>
<pre><pre class="playground"><code class="language-rust"><span class="boring">#![allow(unused)]
</span><span class="boring">fn main() {
</span>// Check gossip interval
let config = ClusterConfig::default()
    .with_gossip_interval(Duration::from_millis(500)); // Faster
<span class="boring">}</span></code></pre></pre>
<p><strong>Common causes</strong>:</p>
<ul>
<li>Gossip interval too long</li>
<li>High packet loss</li>
<li>Too few gossip fanout targets</li>
</ul>
<h3 id="false-failure-detection"><a class="header" href="#false-failure-detection">False Failure Detection</a></h3>
<p><strong>Symptom</strong>: Nodes marked failed but they're actually alive.</p>
<p><strong>Debug</strong>:</p>
<pre><pre class="playground"><code class="language-rust"><span class="boring">#![allow(unused)]
</span><span class="boring">fn main() {
</span>// Increase timeouts
let config = ClusterConfig::default()
    .with_probe_timeout(Duration::from_secs(1))    // More lenient
    .with_suspicion_timeout(Duration::from_secs(10));
<span class="boring">}</span></code></pre></pre>
<p><strong>Common causes</strong>:</p>
<ul>
<li>Network latency spikes</li>
<li>Node overloaded (GC pauses)</li>
<li>Timeout too aggressive</li>
</ul>
<h2 id="next-steps-3"><a class="header" href="#next-steps-3">Next Steps</a></h2>
<ul>
<li><strong><a href="cluster/load-balancing.html">Load Balancing</a></strong> - Use discovered nodes for routing</li>
<li><strong><a href="cluster/health.html">Health Checking</a></strong> - Understand Phi Accrual algorithm</li>
<li><strong><a href="cluster/failures.html">Failures</a></strong> - Handle partitions and split-brain scenarios</li>
</ul>
<h2 id="references"><a class="header" href="#references">References</a></h2>
<ul>
<li><a href="https://www.cs.cornell.edu/projects/Quicksilver/public_pdfs/SWIM.pdf">SWIM Paper (Cornell)</a> - Original SWIM protocol</li>
<li><a href="https://citeseerx.ist.psu.edu/document?repid=rep1&amp;type=pdf&amp;doi=babf246cf6753ad12ce97ae47e64c9d4ff85c6f7">Phi Accrual Paper</a> - Advanced failure detection</li>
<li><a href="https://en.wikipedia.org/wiki/Gossip_protocol">Gossip Protocols Overview</a> - General gossip concepts</li>
</ul>
<div style="break-before: page; page-break-before: always;"></div><h1 id="load-balancing"><a class="header" href="#load-balancing">Load Balancing</a></h1>
<p>Load balancing distributes requests across worker nodes to optimize resource utilization, minimize response time, and prevent overload. RpcNet provides multiple strategies to suit different workload patterns.</p>
<h2 id="available-strategies"><a class="header" href="#available-strategies">Available Strategies</a></h2>
<p>RpcNet includes three built-in load balancing strategies:</p>
<pre><pre class="playground"><code class="language-rust"><span class="boring">#![allow(unused)]
</span><span class="boring">fn main() {
</span>use rpcnet::cluster::LoadBalancingStrategy;

// Available strategies
LoadBalancingStrategy::RoundRobin       // Even distribution
LoadBalancingStrategy::Random           // Random selection
LoadBalancingStrategy::LeastConnections // Pick least loaded (recommended)
<span class="boring">}</span></code></pre></pre>
<h3 id="1-round-robin"><a class="header" href="#1-round-robin">1. Round Robin</a></h3>
<p>Distributes requests evenly across all available workers in sequence.</p>
<pre><code>Request Flow:
  Request 1 → Worker A
  Request 2 → Worker B
  Request 3 → Worker C
  Request 4 → Worker A  (cycle repeats)
  Request 5 → Worker B
  ...
</code></pre>
<p><strong>Algorithm</strong>:</p>
<pre><pre class="playground"><code class="language-rust"><span class="boring">#![allow(unused)]
</span><span class="boring">fn main() {
</span>fn select_worker(&amp;mut self, workers: &amp;[Worker]) -&gt; &amp;Worker {
    let worker = &amp;workers[self.index % workers.len()];
    self.index += 1;
    worker
}
<span class="boring">}</span></code></pre></pre>
<p><strong>When to use</strong>:</p>
<ul>
<li>✅ Workers have identical capabilities</li>
<li>✅ Requests have similar processing time</li>
<li>✅ Simple, predictable distribution needed</li>
<li>❌ Workers have different performance characteristics</li>
<li>❌ Requests vary significantly in complexity</li>
</ul>
<p><strong>Pros</strong>:</p>
<ul>
<li>Simple and deterministic</li>
<li>Perfect load distribution over time</li>
<li>No per-worker load tracking required (only a single index counter)</li>
</ul>
<p><strong>Cons</strong>:</p>
<ul>
<li>Doesn't account for current load</li>
<li>Doesn't handle heterogeneous workers well</li>
<li>Can send requests to overloaded nodes</li>
</ul>
<h3 id="2-random"><a class="header" href="#2-random">2. Random</a></h3>
<p>Selects a random worker for each request.</p>
<pre><code>Request Flow:
  Request 1 → Worker B  (random)
  Request 2 → Worker A  (random)
  Request 3 → Worker B  (random)
  Request 4 → Worker C  (random)
  ...
</code></pre>
<p><strong>Algorithm</strong>:</p>
<pre><pre class="playground"><code class="language-rust"><span class="boring">#![allow(unused)]
</span><span class="boring">fn main() {
</span>fn select_worker(&amp;self, workers: &amp;[Worker]) -&gt; &amp;Worker {
    let idx = rand::thread_rng().gen_range(0..workers.len());
    &amp;workers[idx]
}
<span class="boring">}</span></code></pre></pre>
<p><strong>When to use</strong>:</p>
<ul>
<li>✅ Stateless workloads</li>
<li>✅ Workers have identical capabilities</li>
<li>✅ No session affinity required</li>
<li>✅ Want to avoid coordinating state across requestors</li>
<li>❌ Need predictable distribution</li>
</ul>
<p><strong>Pros</strong>:</p>
<ul>
<li>No coordination required (fully stateless)</li>
<li>Good distribution with large request counts</li>
<li>Simple implementation</li>
</ul>
<p><strong>Cons</strong>:</p>
<ul>
<li>Uneven short-term distribution</li>
<li>Doesn't account for current load</li>
<li>Probabilistic rather than deterministic</li>
</ul>
<h3 id="3-least-connections-recommended"><a class="header" href="#3-least-connections-recommended">3. Least Connections (Recommended)</a></h3>
<p>Selects the worker with the fewest active connections.</p>
<pre><code>Worker Status:
  Worker A: 5 active connections
  Worker B: 2 active connections  ← SELECTED
  Worker C: 8 active connections

Next request → Worker B (has least connections)
</code></pre>
<p><strong>Algorithm</strong>:</p>
<pre><pre class="playground"><code class="language-rust"><span class="boring">#![allow(unused)]
</span><span class="boring">fn main() {
</span>fn select_worker(&amp;self, workers: &amp;[Worker]) -&gt; &amp;Worker {
    workers
        .iter()
        .min_by_key(|w| w.active_connections.load(Ordering::Relaxed))
        .unwrap()
}
<span class="boring">}</span></code></pre></pre>
<p><strong>When to use</strong>:</p>
<ul>
<li>✅ Long-lived connections (streaming, websockets)</li>
<li>✅ Variable request processing time</li>
<li>✅ Workers have different capacities</li>
<li>✅ <strong>Recommended default for most use cases</strong></li>
<li>❌ Very short requests (overhead not worth it)</li>
</ul>
<p><strong>Pros</strong>:</p>
<ul>
<li>Adapts to actual load in real-time</li>
<li>Handles heterogeneous workers well</li>
<li>Prevents overload automatically</li>
</ul>
<p><strong>Cons</strong>:</p>
<ul>
<li>Slight overhead tracking connection counts</li>
<li>Requires connection counting infrastructure</li>
</ul>
<h2 id="using-load-balancing"><a class="header" href="#using-load-balancing">Using Load Balancing</a></h2>
<h3 id="with-workerregistry"><a class="header" href="#with-workerregistry">With WorkerRegistry</a></h3>
<pre><pre class="playground"><code class="language-rust"><span class="boring">#![allow(unused)]
</span><span class="boring">fn main() {
</span>use rpcnet::cluster::{WorkerRegistry, LoadBalancingStrategy};

// Create registry with desired strategy
let registry = Arc::new(WorkerRegistry::new(
    cluster,
    LoadBalancingStrategy::LeastConnections // Change strategy here
));

registry.start().await;

// Select worker automatically using configured strategy
let worker = registry.select_worker(Some("role=worker")).await?;
println!("Selected worker: {} at {}", worker.label, worker.addr);
<span class="boring">}</span></code></pre></pre>
<h3 id="with-clusterclient"><a class="header" href="#with-clusterclient">With ClusterClient</a></h3>
<pre><pre class="playground"><code class="language-rust"><span class="boring">#![allow(unused)]
</span><span class="boring">fn main() {
</span>use rpcnet::cluster::{ClusterClient, ClusterClientConfig};

// ClusterClient uses the registry's configured strategy
let config = ClusterClientConfig::default();
let client = Arc::new(ClusterClient::new(registry, config));

// Automatic load-balanced routing
let result = client.call_worker("compute", request, Some("role=worker")).await?;
<span class="boring">}</span></code></pre></pre>
<h2 id="strategy-comparison"><a class="header" href="#strategy-comparison">Strategy Comparison</a></h2>
<h3 id="performance-characteristics-1"><a class="header" href="#performance-characteristics-1">Performance Characteristics</a></h3>
<div class="table-wrapper"><table><thead><tr><th>Strategy</th><th>Selection Time</th><th>Memory</th><th>Accuracy</th><th>Best For</th></tr></thead><tbody>
<tr><td><strong>Round Robin</strong></td><td>O(1)</td><td>O(1)</td><td>Low</td><td>Uniform loads</td></tr>
<tr><td><strong>Random</strong></td><td>O(1)</td><td>O(1)</td><td>Medium</td><td>Stateless</td></tr>
<tr><td><strong>Least Connections</strong></td><td>O(N)</td><td>O(N)</td><td>High</td><td>Variable loads</td></tr>
</tbody></table>
</div>
<h3 id="distribution-quality"><a class="header" href="#distribution-quality">Distribution Quality</a></h3>
<p><strong>Test scenario</strong>: 1000 requests to 3 workers with varying processing times</p>
<div class="table-wrapper"><table><thead><tr><th>Strategy</th><th>Worker A</th><th>Worker B</th><th>Worker C</th><th>Std Dev</th></tr></thead><tbody>
<tr><td><strong>Round Robin</strong></td><td>333</td><td>333</td><td>334</td><td>0.58</td></tr>
<tr><td><strong>Random</strong></td><td>328</td><td>345</td><td>327</td><td>10.12</td></tr>
<tr><td><strong>Least Connections</strong></td><td>280</td><td>390</td><td>330</td><td>55.08</td></tr>
</tbody></table>
</div>
<p><strong>Note</strong>: Round Robin appears most even, but this ignores actual load (processing time per request). Least Connections adapts to real load.</p>
<h3 id="real-world-scenarios"><a class="header" href="#real-world-scenarios">Real-World Scenarios</a></h3>
<h4 id="scenario-1-identical-workers-uniform-requests"><a class="header" href="#scenario-1-identical-workers-uniform-requests">Scenario 1: Identical Workers, Uniform Requests</a></h4>
<pre><code>Workers: 3x m5.large (identical)
Requests: 1KB data, 50ms processing
</code></pre>
<p><strong>Best strategy</strong>: Round Robin or Random</p>
<ul>
<li>All strategies perform similarly</li>
<li>Round Robin slightly more predictable</li>
</ul>
<h4 id="scenario-2-heterogeneous-workers"><a class="header" href="#scenario-2-heterogeneous-workers">Scenario 2: Heterogeneous Workers</a></h4>
<pre><code>Workers:
  - 2x m5.large (2 CPU, 8GB RAM)
  - 1x m5.xlarge (4 CPU, 16GB RAM)
Requests: CPU-intensive (100-500ms)
</code></pre>
<p><strong>Best strategy</strong>: Least Connections</p>
<ul>
<li>Larger worker naturally gets more requests</li>
<li>Prevents overload on smaller workers</li>
</ul>
<h4 id="scenario-3-variable-request-complexity"><a class="header" href="#scenario-3-variable-request-complexity">Scenario 3: Variable Request Complexity</a></h4>
<pre><code>Workers: 3x m5.large (identical)
Requests:
  - 70% simple (10ms)
  - 20% medium (100ms)
  - 10% complex (1000ms)
</code></pre>
<p><strong>Best strategy</strong>: Least Connections</p>
<ul>
<li>Workers with complex requests get fewer new ones</li>
<li>Prevents queue buildup</li>
</ul>
<h4 id="scenario-4-streaming-workloads"><a class="header" href="#scenario-4-streaming-workloads">Scenario 4: Streaming Workloads</a></h4>
<pre><code>Workers: 3x GPU instances
Requests: Long-lived video transcoding streams
</code></pre>
<p><strong>Best strategy</strong>: Least Connections</p>
<ul>
<li>Critical to balance active streams</li>
<li>Round Robin ignores how many streams each worker is already running, so long-lived streams can pile up unevenly</li>
</ul>
<h2 id="advanced-techniques"><a class="header" href="#advanced-techniques">Advanced Techniques</a></h2>
<h3 id="weighted-load-balancing"><a class="header" href="#weighted-load-balancing">Weighted Load Balancing</a></h3>
<p>Weight workers by capacity:</p>
<pre><pre class="playground"><code class="language-rust"><span class="boring">#![allow(unused)]
</span><span class="boring">fn main() {
</span>// Tag workers with capacity
cluster.set_tag("capacity", "100");  // Large worker
cluster.set_tag("capacity", "50");   // Small worker

// Custom selection logic
/// Picks a worker at random, with probability proportional to its `capacity` tag.
///
/// # Panics
///
/// Panics if a worker has no `capacity` tag, if the tag is not a valid `u32`,
/// or if the total capacity is zero (which includes an empty `workers` slice).
fn select_weighted_worker(workers: &amp;[Worker]) -&gt; &amp;Worker {
    // Parse every capacity tag exactly once; the total and the selection walk
    // below both reuse this list.
    let capacities: Vec&lt;u32&gt; = workers
        .iter()
        .map(|w| {
            w.tags
                .get("capacity")
                .expect("worker is missing the `capacity` tag")
                .parse::&lt;u32&gt;()
                .expect("`capacity` tag is not a valid u32")
        })
        .collect();

    let total_capacity: u32 = capacities.iter().sum();
    assert!(
        total_capacity &gt; 0,
        "cannot select a worker: total capacity is zero"
    );

    // Draw a point in [0, total_capacity). `gen_range` is a method of the
    // `rand::Rng` trait; calling it by path avoids needing the trait in scope.
    let mut rng = rand::thread_rng();
    let mut rand_val = rand::Rng::gen_range(&amp;mut rng, 0..total_capacity);

    // Walk the workers, consuming each one's share until the point is covered.
    for (worker, &amp;capacity) in workers.iter().zip(&amp;capacities) {
        if rand_val &lt; capacity {
            return worker;
        }
        rand_val -= capacity;
    }

    unreachable!("rand_val is always below the total capacity")
}
<span class="boring">}</span></code></pre></pre>
<h3 id="locality-aware-load-balancing"><a class="header" href="#locality-aware-load-balancing">Locality-Aware Load Balancing</a></h3>
<p>Prefer workers in the same zone/region:</p>
<pre><pre class="playground"><code class="language-rust"><span class="boring">#![allow(unused)]
</span><span class="boring">fn main() {
</span>/// Selects a worker in `client_zone` when one is available, otherwise any worker.
async fn select_local_worker(
    registry: &amp;WorkerRegistry,
    client_zone: &amp;str,
) -&gt; Result&lt;Worker&gt; {
    // First attempt: restrict the selection to workers tagged with the caller's zone.
    let zone_filter = format!("role=worker,zone={}", client_zone);
    let local_attempt = registry.select_worker(Some(&amp;zone_filter)).await;

    match local_attempt {
        Ok(local) =&gt; Ok(local),
        // Any error from the zone-restricted lookup falls back to an unrestricted one.
        Err(_) =&gt; registry.select_worker(Some("role=worker")).await,
    }
}
<span class="boring">}</span></code></pre></pre>
<h3 id="affinity-based-load-balancing"><a class="header" href="#affinity-based-load-balancing">Affinity-Based Load Balancing</a></h3>
<p>Route requests from the same client to the same worker:</p>
<pre><pre class="playground"><code class="language-rust"><span class="boring">#![allow(unused)]
</span><span class="boring">fn main() {
</span>use std::collections::hash_map::DefaultHasher;
use std::hash::{Hash, Hasher};

fn select_with_affinity(client_id: &amp;str, workers: &amp;[Worker]) -&gt; &amp;Worker {
    let mut hasher = DefaultHasher::new();
    client_id.hash(&amp;mut hasher);
    let hash = hasher.finish() as usize;
    
    &amp;workers[hash % workers.len()]
}
<span class="boring">}</span></code></pre></pre>
<p><strong>Use cases</strong>:</p>
<ul>
<li>Session-based workloads</li>
<li>Client-specific caching</li>
<li>Stateful processing</li>
</ul>
<h3 id="load-shedding"><a class="header" href="#load-shedding">Load Shedding</a></h3>
<p>Reject requests when all workers are overloaded:</p>
<pre><pre class="playground"><code class="language-rust"><span class="boring">#![allow(unused)]
</span><span class="boring">fn main() {
</span>/// Selects a worker, refusing the request when the selected worker already has
/// `max_connections` or more active connections.
async fn select_with_shedding(
    registry: &amp;WorkerRegistry,
    max_connections: usize,
) -&gt; Result&lt;Worker&gt; {
    let candidate = registry.select_worker(Some("role=worker")).await?;

    // Only the one selected worker is inspected here.
    // NOTE(review): the "all workers" wording presumably relies on a
    // least-connections strategy picking the least-loaded worker — confirm.
    if candidate.active_connections &lt; max_connections {
        Ok(candidate)
    } else {
        Err(anyhow::anyhow!("All workers at capacity"))
    }
}
<span class="boring">}</span></code></pre></pre>
<h2 id="monitoring-and-metrics"><a class="header" href="#monitoring-and-metrics">Monitoring and Metrics</a></h2>
<h3 id="track-load-distribution"><a class="header" href="#track-load-distribution">Track Load Distribution</a></h3>
<pre><pre class="playground"><code class="language-rust"><span class="boring">#![allow(unused)]
</span><span class="boring">fn main() {
</span>use std::sync::Arc;
use std::sync::atomic::{AtomicUsize, Ordering};
use std::collections::HashMap;

/// Per-worker request counters for observing how load is being distributed.
struct LoadBalancerMetrics {
    // Tokio's async mutex: the methods below acquire it with `.lock().await`,
    // which `std::sync::Mutex` does not support.
    requests_per_worker: Arc&lt;tokio::sync::Mutex&lt;HashMap&lt;Uuid, AtomicUsize&gt;&gt;&gt;,
}

impl LoadBalancerMetrics {
    /// Increments the request count for `worker_id`, creating the counter on first use.
    async fn record_request(&amp;self, worker_id: Uuid) {
        let mut map = self.requests_per_worker.lock().await;
        // `AtomicUsize::default()` is a zeroed counter.
        map.entry(worker_id)
            .or_default()
            .fetch_add(1, Ordering::Relaxed);
    }

    /// Returns a snapshot of the number of requests recorded for each worker.
    async fn get_distribution(&amp;self) -&gt; HashMap&lt;Uuid, usize&gt; {
        let map = self.requests_per_worker.lock().await;
        map.iter()
            .map(|(id, count)| (*id, count.load(Ordering::Relaxed)))
            .collect()
    }
}
<span class="boring">}</span></code></pre></pre>
<h3 id="monitor-worker-health"><a class="header" href="#monitor-worker-health">Monitor Worker Health</a></h3>
<pre><pre class="playground"><code class="language-rust"><span class="boring">#![allow(unused)]
</span><span class="boring">fn main() {
</span>/// Runs forever: every 10 seconds, computes each worker's load percentage,
/// logs a warning when it exceeds 80%, and reports load and connection gauges.
async fn monitor_worker_load(registry: Arc&lt;WorkerRegistry&gt;) {
    loop {
        tokio::time::sleep(Duration::from_secs(10)).await;
        
        let workers = registry.workers().await;
        for worker in workers {
            // Floating-point division: a zero capacity yields inf/NaN rather than a panic.
            // NOTE(review): assumes `capacity` is a numeric connection limit — confirm.
            let load_pct = (worker.active_connections as f64 / worker.capacity as f64) * 100.0;
            
            if load_pct &gt; 80.0 {
                log::warn!(
                    "Worker {} at {}% capacity ({} connections)",
                    worker.label,
                    load_pct,
                    worker.active_connections
                );
            }
            
            // Report to metrics system
            metrics::gauge!("worker.load_pct", load_pct, "worker" =&gt; worker.label.clone());
            metrics::gauge!("worker.connections", worker.active_connections as f64, "worker" =&gt; worker.label.clone());
        }
    }
}
<span class="boring">}</span></code></pre></pre>
<h2 id="best-practices-1"><a class="header" href="#best-practices-1">Best Practices</a></h2>
<h3 id="1-choose-the-right-strategy"><a class="header" href="#1-choose-the-right-strategy">1. Choose the Right Strategy</a></h3>
<pre><pre class="playground"><code class="language-rust"><span class="boring">#![allow(unused)]
</span><span class="boring">fn main() {
</span>// Default recommendation
LoadBalancingStrategy::LeastConnections  // Handles most cases well

// Use Round Robin if:
// - All workers identical
// - All requests uniform
// - Need deterministic distribution

// Use Random if:
// - Completely stateless
// - Multiple load balancers
// - Want to avoid coordination overhead
<span class="boring">}</span></code></pre></pre>
<h3 id="2-tag-workers-appropriately"><a class="header" href="#2-tag-workers-appropriately">2. Tag Workers Appropriately</a></h3>
<pre><pre class="playground"><code class="language-rust"><span class="boring">#![allow(unused)]
</span><span class="boring">fn main() {
</span>// Provide rich metadata for routing decisions
cluster.set_tag("role", "worker");
cluster.set_tag("capacity", "100");
cluster.set_tag("zone", "us-west-2a");
cluster.set_tag("instance_type", "m5.xlarge");
cluster.set_tag("gpu", "true");
<span class="boring">}</span></code></pre></pre>
<h3 id="3-monitor-load-distribution"><a class="header" href="#3-monitor-load-distribution">3. Monitor Load Distribution</a></h3>
<pre><pre class="playground"><code class="language-rust"><span class="boring">#![allow(unused)]
</span><span class="boring">fn main() {
</span>// Log worker selection for debugging
let worker = registry.select_worker(Some("role=worker")).await?;
log::debug!(
    "Selected worker {} (connections: {})",
    worker.label,
    worker.active_connections
);
<span class="boring">}</span></code></pre></pre>
<h3 id="4-handle-no-workers-available"><a class="header" href="#4-handle-no-workers-available">4. Handle No Workers Available</a></h3>
<pre><pre class="playground"><code class="language-rust"><span class="boring">#![allow(unused)]
</span><span class="boring">fn main() {
</span>// Gracefully handle empty worker pool
match registry.select_worker(Some("role=worker")).await {
    Ok(worker) =&gt; {
        // Process with worker
    }
    Err(e) =&gt; {
        log::error!("No workers available: {}", e);
        // Return error to client or queue request
    }
}
<span class="boring">}</span></code></pre></pre>
<h3 id="5-test-under-load"><a class="header" href="#5-test-under-load">5. Test Under Load</a></h3>
<pre><pre class="playground"><code class="language-rust"><span class="boring">#![allow(unused)]
</span><span class="boring">fn main() {
</span>// Benchmark different strategies
#[tokio::test]
async fn bench_load_balancing() {
    let strategies = vec![
        LoadBalancingStrategy::RoundRobin,
        LoadBalancingStrategy::Random,
        LoadBalancingStrategy::LeastConnections,
    ];
    
    for strategy in strategies {
        let registry = WorkerRegistry::new(cluster.clone(), strategy);
        registry.start().await;
        
        let start = Instant::now();
        for _ in 0..10_000 {
            registry.select_worker(Some("role=worker")).await?;
        }
        let duration = start.elapsed();
        
        println!("{:?}: {:?}", strategy, duration);
    }
}
<span class="boring">}</span></code></pre></pre>
<h2 id="troubleshooting-3"><a class="header" href="#troubleshooting-3">Troubleshooting</a></h2>
<h3 id="uneven-load-distribution"><a class="header" href="#uneven-load-distribution">Uneven Load Distribution</a></h3>
<p><strong>Symptom</strong>: One worker consistently gets more requests than others.</p>
<p><strong>Debug</strong>:</p>
<pre><pre class="playground"><code class="language-rust"><span class="boring">#![allow(unused)]
</span><span class="boring">fn main() {
</span>// Check active connections
let workers = registry.workers().await;
for worker in workers {
    println!("{}: {} connections", worker.label, worker.active_connections);
}
<span class="boring">}</span></code></pre></pre>
<p><strong>Common causes</strong>:</p>
<ul>
<li>Using Least Connections with short-lived requests (connections finish before next selection)</li>
<li>Worker capacity differences not accounted for</li>
<li>Some workers slower to release connections</li>
</ul>
<p><strong>Solution</strong>:</p>
<ul>
<li>Try Round Robin for uniform short requests</li>
<li>Use weighted load balancing for heterogeneous workers</li>
<li>Ensure connections are properly closed</li>
</ul>
<h3 id="worker-overload"><a class="header" href="#worker-overload">Worker Overload</a></h3>
<p><strong>Symptom</strong>: Workers running out of resources despite load balancing.</p>
<p><strong>Debug</strong>:</p>
<pre><pre class="playground"><code class="language-rust"><span class="boring">#![allow(unused)]
</span><span class="boring">fn main() {
</span>// Monitor worker metrics
for worker in registry.workers().await {
    println!(
        "{}: {} connections (capacity: {})",
        worker.label,
        worker.active_connections,
        worker.capacity
    );
}
<span class="boring">}</span></code></pre></pre>
<p><strong>Common causes</strong>:</p>
<ul>
<li>Too few workers for load</li>
<li>Worker capacity set too high</li>
<li>Requests taking longer than expected</li>
</ul>
<p><strong>Solution</strong>:</p>
<ul>
<li>Add more workers</li>
<li>Implement load shedding</li>
<li>Scale worker resources</li>
</ul>
<h3 id="strategy-not-applied"><a class="header" href="#strategy-not-applied">Strategy Not Applied</a></h3>
<p><strong>Symptom</strong>: Load balancing seems random despite configuring strategy.</p>
<p><strong>Debug</strong>:</p>
<pre><pre class="playground"><code class="language-rust"><span class="boring">#![allow(unused)]
</span><span class="boring">fn main() {
</span>// Verify registry configuration
println!("Strategy: {:?}", registry.strategy());
<span class="boring">}</span></code></pre></pre>
<p><strong>Common causes</strong>:</p>
<ul>
<li>Wrong registry instance used</li>
<li>Strategy changed after initialization</li>
<li>Multiple registries with different configs</li>
</ul>
<p><strong>Solution</strong>:</p>
<ul>
<li>Use single registry instance</li>
<li>Configure strategy at creation time</li>
<li>Pass registry via Arc for sharing</li>
</ul>
<h2 id="performance-impact"><a class="header" href="#performance-impact">Performance Impact</a></h2>
<h3 id="overhead-by-strategy"><a class="header" href="#overhead-by-strategy">Overhead by Strategy</a></h3>
<p>Measured on 3-node cluster, 100K requests:</p>
<div class="table-wrapper"><table><thead><tr><th>Strategy</th><th>Avg Selection Time</th><th>Memory per Request</th><th>Total Overhead</th></tr></thead><tbody>
<tr><td><strong>Round Robin</strong></td><td>15ns</td><td>0 bytes</td><td>1.5ms</td></tr>
<tr><td><strong>Random</strong></td><td>42ns</td><td>0 bytes</td><td>4.2ms</td></tr>
<tr><td><strong>Least Connections</strong></td><td>180ns</td><td>8 bytes</td><td>18ms</td></tr>
</tbody></table>
</div>
<p><strong>Conclusion</strong>: All strategies add negligible per-request overhead (&lt; 0.0002ms) compared to network latency (~0.1-1ms).</p>
<h3 id="throughput-impact"><a class="header" href="#throughput-impact">Throughput Impact</a></h3>
<p>Load balancing has only a marginal effect on throughput:</p>
<pre><code>Direct RPC (no load balancing):    172K RPS
With Round Robin:                  171K RPS (-0.6%)
With Random:                       170K RPS (-1.2%)
With Least Connections:            168K RPS (-2.3%)
</code></pre>
<p><strong>Conclusion</strong>: Load balancing overhead is minimal, well worth the improved distribution.</p>
<h2 id="next-steps-4"><a class="header" href="#next-steps-4">Next Steps</a></h2>
<ul>
<li><strong><a href="cluster/health.html">Health Checking</a></strong> - Ensure selected workers are healthy</li>
<li><strong><a href="cluster/failures.html">Failures</a></strong> - Handle worker failures gracefully</li>
</ul>
<h2 id="references-1"><a class="header" href="#references-1">References</a></h2>
<ul>
<li><a href="https://en.wikipedia.org/wiki/Load_balancing_(computing)">Load Balancing Algorithms</a> - Overview of strategies</li>
<li><a href="https://www.nginx.com/resources/glossary/load-balancing/">Least Connections Algorithm</a> - Industry standard</li>
<li><a href="https://en.wikipedia.org/wiki/Consistent_hashing">Consistent Hashing</a> - Advanced affinity technique</li>
</ul>
<div style="break-before: page; page-break-before: always;"></div><h1 id="health-checking-1"><a class="header" href="#health-checking-1">Health Checking</a></h1>
<p>RpcNet uses the <strong>Phi Accrual Failure Detector</strong> algorithm for accurate and adaptive health checking. This chapter explains how RpcNet determines which nodes are healthy and when to mark them as failed.</p>
<h2 id="the-problem-with-binary-health-checks"><a class="header" href="#the-problem-with-binary-health-checks">The Problem with Binary Health Checks</a></h2>
<p>Traditional health checks use binary logic:</p>
<pre><code>if (ping_timeout):
    node_is_failed = True
else:
    node_is_healthy = True
</code></pre>
<p><strong>Problems</strong>:</p>
<ol>
<li><strong>Fixed threshold</strong>: 500ms timeout doesn't adapt to network conditions</li>
<li><strong>False positives</strong>: Temporary slowdown triggers failure</li>
<li><strong>False negatives</strong>: Slow node stays "healthy" until timeout</li>
<li><strong>No confidence</strong>: Can't express "probably failed" vs "definitely failed"</li>
</ol>
<h2 id="phi-accrual-solution"><a class="header" href="#phi-accrual-solution">Phi Accrual Solution</a></h2>
<p>The Phi Accrual algorithm provides a <strong>continuous suspicion level</strong> instead of binary alive/dead:</p>
<pre><code>Phi Value (Φ) = Suspicion Level

Φ = 0     → Node is responding normally
Φ = 5     → Moderate suspicion (50% chance failed)
Φ = 8     → High suspicion (97.7% chance failed) ← Typical threshold
Φ = 10    → Very high suspicion (99.99% chance failed)
Φ = 15+   → Almost certainly failed
</code></pre>
<h3 id="how-it-works-1"><a class="header" href="#how-it-works-1">How It Works</a></h3>
<p><strong>1. Track Heartbeat History</strong></p>
<pre><pre class="playground"><code class="language-rust"><span class="boring">#![allow(unused)]
</span><span class="boring">fn main() {
</span>/// Heartbeat timing data kept for one monitored node.
struct HeartbeatHistory {
    intervals: Vec&lt;Duration&gt;,  // Last N intervals between heartbeats
    last_heartbeat: Instant,   // When we last heard from node
}
<span class="boring">}</span></code></pre></pre>
<p><strong>2. Calculate Expected Interval</strong></p>
<pre><pre class="playground"><code class="language-rust"><span class="boring">#![allow(unused)]
</span><span class="boring">fn main() {
</span>/// Average gap between consecutive heartbeats in the recorded history.
///
/// Returns `Duration::ZERO` when no intervals have been recorded yet.
fn mean_interval(&amp;self) -&gt; Duration {
    let count = self.intervals.len();
    if count == 0 {
        // Dividing a `Duration` by zero panics, so an empty history needs its own answer.
        return Duration::ZERO;
    }

    // `Duration` implements `Div&lt;u32&gt;`, not `Div&lt;usize&gt;`, hence the cast.
    self.intervals.iter().sum::&lt;Duration&gt;() / count as u32
}

/// Population standard deviation of the recorded heartbeat intervals.
///
/// Returns `Duration::ZERO` when no intervals have been recorded yet.
fn std_deviation(&amp;self) -&gt; Duration {
    if self.intervals.is_empty() {
        // 0.0 / 0.0 is NaN, and `Duration::from_secs_f64(NaN)` panics.
        return Duration::ZERO;
    }

    // The mean is the same for every interval, so compute it once.
    let mean_secs = self.mean_interval().as_secs_f64();
    let variance = self.intervals
        .iter()
        .map(|interval| (interval.as_secs_f64() - mean_secs).powi(2))
        .sum::&lt;f64&gt;() / self.intervals.len() as f64;

    Duration::from_secs_f64(variance.sqrt())
}
<span class="boring">}</span></code></pre></pre>
<p><strong>3. Compute Phi</strong></p>
<pre><pre class="playground"><code class="language-rust"><span class="boring">#![allow(unused)]
</span><span class="boring">fn main() {
</span>/// Suspicion level for the node: `-log10` of the probability that a heartbeat
/// would still arrive after the current silence, modelling heartbeat intervals
/// as normally distributed around `mean_interval()` with spread `std_deviation()`.
///
/// The value grows as the silence gets longer and is about 0.3 when the silence
/// equals the mean interval.
fn phi(&amp;self) -&gt; f64 {
    // Floor for the spread so a perfectly regular history cannot divide by zero.
    const MIN_STD_DEV_SECS: f64 = 0.05;

    let elapsed = self.last_heartbeat.elapsed().as_secs_f64();
    let mean = self.mean_interval().as_secs_f64();
    let std_dev = self.std_deviation().as_secs_f64().max(MIN_STD_DEV_SECS);

    // How many standard deviations away is the current delay?
    let z_score = (elapsed - mean) / std_dev;

    // Logistic approximation of the normal distribution's tail:
    // p_later ≈ P(the next heartbeat arrives later than now).
    let e = (-z_score * (1.5976 + 0.070566 * z_score * z_score)).exp();
    let p_later = if elapsed &gt; mean {
        e / (1.0 + e)
    } else {
        1.0 - 1.0 / (1.0 + e)
    };

    // Convert to phi (negative log10 probability)
    -p_later.log10()
}
<span class="boring">}</span></code></pre></pre>
<p><strong>4. Determine Failure</strong></p>
<pre><pre class="playground"><code class="language-rust"><span class="boring">#![allow(unused)]
</span><span class="boring">fn main() {
</span>const PHI_THRESHOLD: f64 = 8.0;  // Configurable

if phi() &gt; PHI_THRESHOLD {
    mark_node_as_failed();
}
<span class="boring">}</span></code></pre></pre>
<h2 id="visualization"><a class="header" href="#visualization">Visualization</a></h2>
<h3 id="example-1-healthy-node"><a class="header" href="#example-1-healthy-node">Example 1: Healthy Node</a></h3>
<pre><code>Heartbeats arrive regularly every ~1 second:

Time (s):    0    1    2    3    4    5    6    7    8
Heartbeat:   ✓    ✓    ✓    ✓    ✓    ✓    ✓    ✓    ✓
Phi:         0    0    0    0    0    0    0    0    0

Status: Healthy (Φ = 0)
</code></pre>
<h3 id="example-2-temporary-network-glitch"><a class="header" href="#example-2-temporary-network-glitch">Example 2: Temporary Network Glitch</a></h3>
<pre><code>Heartbeats delayed but node recovers:

Time (s):    0    1    2    3    4    5    6    7    8
Heartbeat:   ✓    ✓    ✓    .    .    ✓    ✓    ✓    ✓
Phi:         0    0    0    2    5    2    0    0    0
                              ▲
                              Elevated but below threshold

Status: Suspect briefly, but recovers (no failure declared)
</code></pre>
<h3 id="example-3-actual-failure"><a class="header" href="#example-3-actual-failure">Example 3: Actual Failure</a></h3>
<pre><code>Heartbeats stop after node crashes:

Time (s):    0    1    2    3    4    5    6    7    8
Heartbeat:   ✓    ✓    ✓    X    .    .    .    .    .
Phi:         0    0    0    2    5    8    11   14   17
                                   ▲
                                   Exceeds threshold → FAILED

Status: Failed (Φ = 8+)
</code></pre>
<h2 id="adaptive-behavior"><a class="header" href="#adaptive-behavior">Adaptive Behavior</a></h2>
<p>Phi Accrual adapts to network conditions automatically:</p>
<h3 id="stable-network"><a class="header" href="#stable-network">Stable Network</a></h3>
<pre><code>History: [1.0s, 1.0s, 1.0s, 1.0s, 1.0s]
Mean: 1.0s
Std Dev: 0.0s (very predictable)

Current delay: 1.5s
Phi: 8.0 → FAILURE (unusual for this stable network)
</code></pre>
<h3 id="variable-network"><a class="header" href="#variable-network">Variable Network</a></h3>
<pre><code>History: [0.8s, 1.2s, 0.9s, 1.4s, 1.0s]
Mean: 1.06s
Std Dev: 0.24s (more variable)

Current delay: 1.5s
Phi: 3.2 → HEALTHY (normal variation)
</code></pre>
<p><strong>Key insight</strong>: Same 1.5s delay is interpreted differently based on historical patterns.</p>
<h2 id="rpcnet-implementation-1"><a class="header" href="#rpcnet-implementation-1">RpcNet Implementation</a></h2>
<h3 id="configuration-1"><a class="header" href="#configuration-1">Configuration</a></h3>
<pre><pre class="playground"><code class="language-rust"><span class="boring">#![allow(unused)]
</span><span class="boring">fn main() {
</span>use rpcnet::cluster::{ClusterConfig, HealthCheckConfig};
use std::time::Duration;

let health_config = HealthCheckConfig::default()
    .with_interval(Duration::from_secs(1))        // Check every 1 second
    .with_phi_threshold(8.0)                       // Suspicion threshold
    .with_history_size(100)                        // Track last 100 intervals
    .with_min_std_deviation(Duration::from_millis(50)); // Min variation

let cluster_config = ClusterConfig::default()
    .with_health_check(health_config);

let cluster = ClusterMembership::new(cluster_config).await?;
<span class="boring">}</span></code></pre></pre>
<h3 id="monitoring-health"><a class="header" href="#monitoring-health">Monitoring Health</a></h3>
<pre><pre class="playground"><code class="language-rust"><span class="boring">#![allow(unused)]
</span><span class="boring">fn main() {
</span>// Subscribe to health events
let mut events = cluster.subscribe();

while let Some(event) = events.recv().await {
    match event {
        ClusterEvent::NodeSuspect(node, phi) =&gt; {
            println!("Node {} suspect (Φ = {:.2})", node.id, phi);
        }
        ClusterEvent::NodeFailed(node) =&gt; {
            println!("Node {} failed (Φ exceeded threshold)", node.id);
        }
        ClusterEvent::NodeRecovered(node) =&gt; {
            println!("Node {} recovered (Φ back to normal)", node.id);
        }
        _ =&gt; {}
    }
}
<span class="boring">}</span></code></pre></pre>
<h3 id="custom-phi-threshold"><a class="header" href="#custom-phi-threshold">Custom Phi Threshold</a></h3>
<p>Different thresholds for different applications:</p>
<pre><pre class="playground"><code class="language-rust"><span class="boring">#![allow(unused)]
</span><span class="boring">fn main() {
</span>// Conservative (fewer false positives, slower detection)
.with_phi_threshold(10.0)  // 99.99% confidence

// Aggressive (faster detection, more false positives)
.with_phi_threshold(5.0)   // 50% confidence

// Recommended default
.with_phi_threshold(8.0)   // 97.7% confidence
<span class="boring">}</span></code></pre></pre>
<h2 id="choosing-phi-threshold"><a class="header" href="#choosing-phi-threshold">Choosing Phi Threshold</a></h2>
<div class="table-wrapper"><table><thead><tr><th>Threshold</th><th>Confidence</th><th>False Positive Rate</th><th>Detection Time</th><th>Use Case</th></tr></thead><tbody>
<tr><td><strong>3.0</strong></td><td>12.5%</td><td>Very High</td><td>Very Fast</td><td>Testing only</td></tr>
<tr><td><strong>5.0</strong></td><td>50%</td><td>High</td><td>Fast</td><td>Aggressive failover</td></tr>
<tr><td><strong>8.0</strong></td><td>97.7%</td><td>Low</td><td>Moderate</td><td><strong>Recommended</strong></td></tr>
<tr><td><strong>10.0</strong></td><td>99.99%</td><td>Very Low</td><td>Slower</td><td>Critical systems</td></tr>
<tr><td><strong>12.0</strong></td><td>99.9999%</td><td>Extremely Low</td><td>Slow</td><td>High-latency networks</td></tr>
</tbody></table>
</div>
<h3 id="threshold-selection-guide"><a class="header" href="#threshold-selection-guide">Threshold Selection Guide</a></h3>
<p><strong>Low threshold (3-5)</strong> if:</p>
<ul>
<li>Fast failover is critical</li>
<li>False positives are acceptable</li>
<li>Network is very stable</li>
</ul>
<p><strong>Medium threshold (6-9)</strong> if:</p>
<ul>
<li>Balance between speed and accuracy</li>
<li>Typical production environments</li>
<li><strong>Recommended for most use cases</strong></li>
</ul>
<p><strong>High threshold (10+)</strong> if:</p>
<ul>
<li>False positives are very costly</li>
<li>Network has high variance</li>
<li>Graceful degradation preferred over fast failover</li>
</ul>
<h2 id="integration-with-swim"><a class="header" href="#integration-with-swim">Integration with SWIM</a></h2>
<p>Phi Accrual works alongside SWIM's failure detection:</p>
<pre><code>┌─────────────────────────────────────────────────────┐
│                   SWIM Protocol                      │
│                                                      │
│  1. Gossip → Heartbeats to Phi Accrual              │
│  2. Phi Accrual → Computes suspicion level          │
│  3. Φ &gt; threshold → Mark node as Suspect            │
│  4. Indirect probes → Verify with other nodes       │
│  5. Multiple confirmations → Mark node as Failed    │
│  6. Gossip spreads failure → All nodes updated      │
└─────────────────────────────────────────────────────┘
</code></pre>
<p><strong>Process</strong>:</p>
<ol>
<li><strong>Regular operation</strong>: Nodes exchange gossip messages (heartbeats)</li>
<li><strong>Phi calculation</strong>: Each heartbeat updates Phi Accrual history</li>
<li><strong>Suspicion</strong>: When Φ exceeds threshold, node marked Suspect</li>
<li><strong>Verification</strong>: SWIM performs indirect probes to confirm</li>
<li><strong>Failure declaration</strong>: Multiple nodes agree → Node marked Failed</li>
<li><strong>Recovery</strong>: If heartbeats resume, Φ drops and node marked Alive again</li>
</ol>
<h2 id="performance-characteristics-2"><a class="header" href="#performance-characteristics-2">Performance Characteristics</a></h2>
<h3 id="computational-overhead"><a class="header" href="#computational-overhead">Computational Overhead</a></h3>
<pre><pre class="playground"><code class="language-rust"><span class="boring">#![allow(unused)]
</span><span class="boring">fn main() {
</span>// Phi calculation per node per check:
// - Mean: O(1) with running average
// - Std dev: O(1) with running variance
// - Phi: O(1) math operations

// Total overhead: ~500ns per node per health check
<span class="boring">}</span></code></pre></pre>
<p><strong>For 100 nodes checked every 1 second</strong>: ~0.05ms of CPU time per second (negligible)</p>
<h3 id="memory-overhead"><a class="header" href="#memory-overhead">Memory Overhead</a></h3>
<pre><pre class="playground"><code class="language-rust"><span class="boring">#![allow(unused)]
</span><span class="boring">fn main() {
</span>/// Per-node health-tracking state; the size notes on each field are estimates.
struct NodeHealth {
    intervals: VecDeque&lt;Duration&gt;,  // 100 entries × 16 bytes = 1.6 KB
    last_heartbeat: Instant,        // 16 bytes
    running_mean: Duration,         // 16 bytes
    running_variance: f64,          // 8 bytes
}

// Total per node: ~1.7 KB
<span class="boring">}</span></code></pre></pre>
<p><strong>For 100 nodes</strong>: ~170 KB memory (negligible)</p>
<h3 id="detection-time"><a class="header" href="#detection-time">Detection Time</a></h3>
<p>Measured time from actual failure to detection:</p>
<div class="table-wrapper"><table><thead><tr><th>Network Stability</th><th>Heartbeat Interval</th><th>Phi Threshold</th><th>Detection Time</th></tr></thead><tbody>
<tr><td>Stable (σ=10ms)</td><td>1s</td><td>8.0</td><td>2-3s</td></tr>
<tr><td>Variable (σ=200ms)</td><td>1s</td><td>8.0</td><td>4-6s</td></tr>
<tr><td>Unstable (σ=500ms)</td><td>1s</td><td>8.0</td><td>8-12s</td></tr>
</tbody></table>
</div>
<p><strong>Tuning for faster detection</strong>: Reduce heartbeat interval (e.g., 500ms)</p>
<h2 id="comparison-to-alternatives"><a class="header" href="#comparison-to-alternatives">Comparison to Alternatives</a></h2>
<h3 id="vs-fixed-timeout"><a class="header" href="#vs-fixed-timeout">vs Fixed Timeout</a></h3>
<pre><code>Fixed Timeout:
  ✗ Doesn't adapt to network conditions
  ✗ Binary alive/dead (no confidence)
  ✓ Simple implementation

Phi Accrual:
  ✓ Adapts automatically
  ✓ Continuous suspicion level
  ✓ Fewer false positives
  ✗ More complex
</code></pre>
<h3 id="vs-heartbeat-count"><a class="header" href="#vs-heartbeat-count">vs Heartbeat Count</a></h3>
<pre><code>Heartbeat Count (miss N in a row):
  ✗ Slow detection (N × interval)
  ✗ Doesn't account for network variance
  ✓ Simple logic

Phi Accrual:
  ✓ Faster detection
  ✓ Accounts for network patterns
  ✓ Adaptive threshold
</code></pre>
<h3 id="vs-gossip-only"><a class="header" href="#vs-gossip-only">vs Gossip Only</a></h3>
<pre><code>Gossip Only (no Phi):
  ✗ Hard threshold (suspect → failed)
  ✗ Doesn't adapt to network
  ✓ Simpler protocol

Gossip + Phi:
  ✓ Smooth suspicion curve
  ✓ Adapts to network conditions
  ✓ More accurate detection
</code></pre>
<h2 id="best-practices-2"><a class="header" href="#best-practices-2">Best Practices</a></h2>
<h3 id="1-tune-for-your-network"><a class="header" href="#1-tune-for-your-network">1. Tune for Your Network</a></h3>
<pre><pre class="playground"><code class="language-rust"><span class="boring">#![allow(unused)]
</span><span class="boring">fn main() {
</span>// Measure your network characteristics first
/// Samples the round-trip latency to a peer and summarizes it.
///
/// Sends 100 pings and returns `(mean, std_dev)` of the observed latencies,
/// where `std_dev` is the population standard deviation. Panics if a ping
/// fails, because every `ping_peer()` result is unwrapped.
async fn measure_network_latency() -&gt; (Duration, Duration) {
    const SAMPLES: u32 = 100;
    let mut latencies = Vec::with_capacity(SAMPLES as usize);

    for _ in 0..SAMPLES {
        let start = Instant::now();
        ping_peer().await.unwrap();
        latencies.push(start.elapsed());
    }

    // `Duration` can be divided by a u32 but not by a usize, so divide by the
    // u32 sample count instead of `latencies.len()`.
    let mean = latencies.iter().sum::&lt;Duration&gt;() / SAMPLES;
    let mean_secs = mean.as_secs_f64();
    let variance = latencies.iter()
        .map(|d| (d.as_secs_f64() - mean_secs).powi(2))
        .sum::&lt;f64&gt;() / f64::from(SAMPLES);
    let std_dev = Duration::from_secs_f64(variance.sqrt());

    println!("Network latency: {:.2?} ± {:.2?}", mean, std_dev);
    (mean, std_dev)
}

// Then configure accordingly
let (mean, std_dev) = measure_network_latency().await;
let health_config = HealthCheckConfig::default()
    .with_interval(mean * 2)          // Check at 2× mean latency
    .with_phi_threshold(8.0)
    .with_min_std_deviation(std_dev);
<span class="boring">}</span></code></pre></pre>
<h3 id="2-monitor-phi-values"><a class="header" href="#2-monitor-phi-values">2. Monitor Phi Values</a></h3>
<pre><pre class="playground"><code class="language-rust"><span class="boring">#![allow(unused)]
</span><span class="boring">fn main() {
</span>// Log phi values to understand patterns
/// Every 10 seconds, samples the phi value of each cluster node, warns when
/// it exceeds 5.0, and exports it as the `cluster.node.phi` gauge.
///
/// Nodes without a phi value are reported as 0.0. Never returns.
async fn monitor_phi_values(cluster: Arc&lt;ClusterMembership&gt;) {
    let poll_every = Duration::from_secs(10);

    loop {
        tokio::time::sleep(poll_every).await;

        for member in cluster.nodes().await {
            let suspicion = cluster.phi(member.id).await.unwrap_or(0.0);

            // Warn well below the failure threshold so trends are visible early.
            if suspicion &gt; 5.0 {
                log::warn!("Node {} phi elevated: {:.2}", member.id, suspicion);
            }

            metrics::gauge!("cluster.node.phi", suspicion, "node" =&gt; member.id.to_string());
        }
    }
}
<span class="boring">}</span></code></pre></pre>
<h3 id="3-handle-suspicion-state"><a class="header" href="#3-handle-suspicion-state">3. Handle Suspicion State</a></h3>
<pre><pre class="playground"><code class="language-rust"><span class="boring">#![allow(unused)]
</span><span class="boring">fn main() {
</span>// Don't immediately fail on suspicion - investigate first
let mut events = cluster.subscribe();

while let Some(event) = events.recv().await {
    match event {
        ClusterEvent::NodeSuspect(node, phi) =&gt; {
            log::warn!("Node {} suspect (Φ = {:.2}), investigating...", node.id, phi);
            
            // Trigger additional checks
            tokio::spawn(async move {
                if let Err(e) = verify_node_health(&amp;node).await {
                    log::error!("Node {} verification failed: {}", node.id, e);
                }
            });
        }
        ClusterEvent::NodeFailed(node) =&gt; {
            log::error!("Node {} failed, removing from pool", node.id);
            remove_from_worker_pool(node.id).await;
        }
        _ =&gt; {}
    }
}
<span class="boring">}</span></code></pre></pre>
<h3 id="4-adjust-history-size"><a class="header" href="#4-adjust-history-size">4. Adjust History Size</a></h3>
<pre><pre class="playground"><code class="language-rust"><span class="boring">#![allow(unused)]
</span><span class="boring">fn main() {
</span>// Larger history = more stable, slower adaptation
.with_history_size(200)  // For very stable networks

// Smaller history = faster adaptation to changes
.with_history_size(50)   // For dynamic networks

// Default (recommended)
.with_history_size(100)
<span class="boring">}</span></code></pre></pre>
<h3 id="5-set-minimum-standard-deviation"><a class="header" href="#5-set-minimum-standard-deviation">5. Set Minimum Standard Deviation</a></h3>
<pre><pre class="playground"><code class="language-rust"><span class="boring">#![allow(unused)]
</span><span class="boring">fn main() {
</span>// Prevent division by zero and overly sensitive detection
.with_min_std_deviation(Duration::from_millis(50))

// Higher min = less sensitive to small variations
.with_min_std_deviation(Duration::from_millis(100))
<span class="boring">}</span></code></pre></pre>
<h2 id="troubleshooting-4"><a class="header" href="#troubleshooting-4">Troubleshooting</a></h2>
<h3 id="false-positives-node-marked-failed-but-is-alive"><a class="header" href="#false-positives-node-marked-failed-but-is-alive">False Positives (Node marked failed but is alive)</a></h3>
<p><strong>Symptoms</strong>:</p>
<ul>
<li>Nodes frequently marked failed and recovered</li>
<li>Phi threshold exceeded during normal operation</li>
</ul>
<p><strong>Debug</strong>:</p>
<pre><pre class="playground"><code class="language-rust"><span class="boring">#![allow(unused)]
</span><span class="boring">fn main() {
</span>// Log phi values and intervals
for node in cluster.nodes().await {
    let phi = cluster.phi(node.id).await.unwrap_or(0.0);
    let history = cluster.heartbeat_history(node.id).await;
    println!("Node {}: Φ = {:.2}, intervals = {:?}", node.id, phi, history);
}
<span class="boring">}</span></code></pre></pre>
<p><strong>Solutions</strong>:</p>
<ul>
<li>Increase phi threshold (8.0 → 10.0)</li>
<li>Increase heartbeat interval to match network latency</li>
<li>Increase min_std_deviation for variable networks</li>
</ul>
<h3 id="slow-detection-failures-take-too-long-to-detect"><a class="header" href="#slow-detection-failures-take-too-long-to-detect">Slow Detection (Failures take too long to detect)</a></h3>
<p><strong>Symptoms</strong>:</p>
<ul>
<li>Nodes crash but stay marked alive for minutes</li>
<li>Requests keep routing to failed nodes</li>
</ul>
<p><strong>Debug</strong>:</p>
<pre><pre class="playground"><code class="language-rust"><span class="boring">#![allow(unused)]
</span><span class="boring">fn main() {
</span>// Measure actual detection time
let failure_time = Instant::now();
// ... node fails ...
let detection_time = cluster.wait_for_failure(node_id).await;
println!("Detection took: {:?}", detection_time.duration_since(failure_time));
<span class="boring">}</span></code></pre></pre>
<p><strong>Solutions</strong>:</p>
<ul>
<li>Decrease phi threshold (8.0 → 6.0)</li>
<li>Decrease heartbeat interval (1s → 500ms)</li>
<li>Decrease suspicion timeout</li>
</ul>
<h3 id="memory-growth"><a class="header" href="#memory-growth">Memory Growth</a></h3>
<p><strong>Symptoms</strong>:</p>
<ul>
<li>Memory usage grows over time</li>
<li>History buffers not bounded</li>
</ul>
<p><strong>Debug</strong>:</p>
<pre><pre class="playground"><code class="language-rust"><span class="boring">#![allow(unused)]
</span><span class="boring">fn main() {
</span>// Check history sizes
for node in cluster.nodes().await {
    let history = cluster.heartbeat_history(node.id).await;
    println!("Node {}: {} intervals tracked", node.id, history.len());
}
<span class="boring">}</span></code></pre></pre>
<p><strong>Solutions</strong>:</p>
<ul>
<li>Ensure history_size is set (default: 100)</li>
<li>Verify old entries are removed</li>
<li>Check for node ID leaks</li>
</ul>
<h2 id="advanced-topics"><a class="header" href="#advanced-topics">Advanced Topics</a></h2>
<h3 id="combining-multiple-detectors"><a class="header" href="#combining-multiple-detectors">Combining Multiple Detectors</a></h3>
<p>Use Phi Accrual for heartbeats AND application-level health:</p>
<pre><pre class="playground"><code class="language-rust"><span class="boring">#![allow(unused)]
</span><span class="boring">fn main() {
</span>struct CompositeHealthCheck {
    phi_detector: PhiAccrualDetector,
    app_health: Arc&lt;Mutex&lt;HashMap&lt;Uuid, bool&gt;&gt;&gt;,
}

impl CompositeHealthCheck {
    /// Reports a node as healthy only when its phi value is below
    /// `PHI_THRESHOLD` and its application-level health flag is `true`.
    ///
    /// A node with no recorded application health is treated as unhealthy.
    async fn is_healthy(&amp;self, node_id: Uuid) -&gt; bool {
        let suspicion = self.phi_detector.phi(node_id);

        // Hold the lock only long enough to copy the flag out.
        let reported_ok = {
            let statuses = self.app_health.lock().await;
            statuses.get(&amp;node_id).copied().unwrap_or(false)
        };

        suspicion &lt; PHI_THRESHOLD &amp;&amp; reported_ok
    }
}
<span class="boring">}</span></code></pre></pre>
<h3 id="weighted-phi-thresholds"><a class="header" href="#weighted-phi-thresholds">Weighted Phi Thresholds</a></h3>
<p>Different thresholds for different node types:</p>
<pre><pre class="playground"><code class="language-rust"><span class="boring">#![allow(unused)]
</span><span class="boring">fn main() {
</span>/// Chooses the phi threshold for a node from its `criticality` tag.
///
/// Returns 10.0 for `high`, 6.0 for `low`, and 8.0 for any other value or a
/// missing tag.
fn get_phi_threshold(node: &amp;Node) -&gt; f64 {
    // Tag values are owned strings (assumes `tags` maps String to String, as
    // the other examples that compare tags against a `String` suggest), so
    // borrow the value as a str for the literal patterns below to type-check.
    match node.tags.get("criticality").map(String::as_str) {
        Some("high") =&gt; 10.0,    // Very conservative for critical nodes
        Some("low") =&gt; 6.0,      // Aggressive for non-critical
        _ =&gt; 8.0,                // Default
    }
}
<span class="boring">}</span></code></pre></pre>
<h2 id="next-steps-5"><a class="header" href="#next-steps-5">Next Steps</a></h2>
<ul>
<li><strong><a href="cluster/failures.html">Failures</a></strong> - Handle node failures and partitions</li>
<li><strong><a href="cluster/discovery.html">Discovery</a></strong> - How nodes discover each other via gossip</li>
</ul>
<h2 id="references-2"><a class="header" href="#references-2">References</a></h2>
<ul>
<li><a href="https://citeseerx.ist.psu.edu/document?repid=rep1&amp;type=pdf&amp;doi=babf246cf6753ad12ce97ae47e64c9d4ff85c6f7">Phi Accrual Paper</a> - Original algorithm</li>
<li><a href="https://cassandra.apache.org/doc/latest/cassandra/architecture/failure_detection.html">Cassandra Failure Detection</a> - Production implementation</li>
<li><a href="https://doc.akka.io/docs/akka/current/typed/failure-detector.html">Akka Cluster Phi</a> - Akka's usage</li>
</ul>
<div style="break-before: page; page-break-before: always;"></div><h1 id="failure-handling"><a class="header" href="#failure-handling">Failure Handling</a></h1>
<p>Distributed systems must gracefully handle node failures, network partitions, and other failure scenarios. This chapter explains how RpcNet detects and recovers from failures in cluster deployments.</p>
<h2 id="types-of-failures"><a class="header" href="#types-of-failures">Types of Failures</a></h2>
<h3 id="1-node-crashes"><a class="header" href="#1-node-crashes">1. Node Crashes</a></h3>
<p><strong>Scenario</strong>: Worker process terminates unexpectedly</p>
<pre><code>Before:                  After:
  [Director]               [Director]
      |                        |
  ┌───┴───┐               ┌────┴────┐
  A   B   C               A       C
          X ← Crashed
</code></pre>
<p><strong>Detection</strong>:</p>
<ul>
<li>Gossip protocol detects missing heartbeats</li>
<li>Phi Accrual marks node as failed (typically 4-8 seconds)</li>
<li>Failure event propagated to all nodes</li>
</ul>
<p><strong>Recovery</strong>:</p>
<pre><pre class="playground"><code class="language-rust"><span class="boring">#![allow(unused)]
</span><span class="boring">fn main() {
</span>// Automatic handling via WorkerRegistry
let mut events = registry.subscribe();

while let Some(event) = events.recv().await {
    match event {
        ClusterEvent::NodeFailed(node) =&gt; {
            log::error!("Worker {} failed", node.id);
            // WorkerRegistry automatically removes from pool
            // Future requests route to remaining workers
        }
        _ =&gt; {}
    }
}
<span class="boring">}</span></code></pre></pre>
<h3 id="2-network-partitions"><a class="header" href="#2-network-partitions">2. Network Partitions</a></h3>
<p><strong>Scenario</strong>: Network split divides cluster</p>
<pre><code>Before partition:         After partition:
     Director                Director  |  
      /    \                   /       |     
     A      B                 A        |  B
     
Cluster view splits into two independent groups
</code></pre>
<p><strong>Detection</strong>:</p>
<ul>
<li>Nodes on each side detect "failures" of nodes on other side</li>
<li>Partition detector identifies split-brain scenario</li>
<li>Both sides continue operating independently</li>
</ul>
<p><strong>Handling</strong>:</p>
<pre><pre class="playground"><code class="language-rust"><span class="boring">#![allow(unused)]
</span><span class="boring">fn main() {
</span>// Monitor for partitions
let mut events = cluster.subscribe();

while let Some(event) = events.recv().await {
    if let ClusterEvent::PartitionDetected(minority, majority) = event {
        log::error!("Network partition detected!");
        
        if minority.contains(&amp;my_node_id) {
            // I'm in minority partition
            log::warn!("In minority partition, entering degraded mode");
            enter_read_only_mode().await;
        } else {
            // I'm in majority partition
            log::info!("In majority partition, continuing normal operation");
        }
    }
}
<span class="boring">}</span></code></pre></pre>
<h3 id="3-slow-nodes-degraded-performance"><a class="header" href="#3-slow-nodes-degraded-performance">3. Slow Nodes (Degraded Performance)</a></h3>
<p><strong>Scenario</strong>: Node responding but very slowly</p>
<pre><code>Normal response:    100ms
Degraded response:  5000ms (50x slower)
</code></pre>
<p><strong>Detection</strong>:</p>
<ul>
<li>Phi Accrual increases suspicion level but may not mark as failed</li>
<li>Request timeouts at application level</li>
<li>Load balancer (Least Connections) naturally avoids slow nodes</li>
</ul>
<p><strong>Handling</strong>:</p>
<pre><pre class="playground"><code class="language-rust"><span class="boring">#![allow(unused)]
</span><span class="boring">fn main() {
</span>// Set request timeout
let timeout = Duration::from_secs(5);

match tokio::time::timeout(timeout, worker.call("compute", data)).await {
    Ok(Ok(result)) =&gt; {
        // Success
    }
    Ok(Err(e)) =&gt; {
        log::error!("Worker returned error: {}", e);
        retry_with_different_worker(data).await?;
    }
    Err(_) =&gt; {
        log::warn!("Worker timeout, trying another");
        retry_with_different_worker(data).await?;
    }
}
<span class="boring">}</span></code></pre></pre>
<h3 id="4-cascading-failures"><a class="header" href="#4-cascading-failures">4. Cascading Failures</a></h3>
<p><strong>Scenario</strong>: Failure of one node causes others to fail</p>
<pre><code>Worker A crashes
  → Remaining workers overloaded
    → Worker B crashes from overload
      → Worker C also crashes
        → Complete system failure
</code></pre>
<p><strong>Prevention</strong>:</p>
<pre><pre class="playground"><code class="language-rust"><span class="boring">#![allow(unused)]
</span><span class="boring">fn main() {
</span>// Load shedding to prevent cascading failures
/// Selects a worker, rejecting the request when that worker is overloaded.
///
/// `max_load` is the highest acceptable ratio of active connections to
/// capacity (for example 0.8). Returns an error when the selected worker is
/// above that ratio, or when its load cannot be computed.
async fn select_worker_with_shedding(
    registry: &amp;WorkerRegistry,
    max_load: f64,
) -&gt; Result&lt;Worker&gt; {
    let worker = registry.select_worker(Some("role=worker")).await?;

    let load = worker.active_connections as f64 / worker.capacity as f64;

    // A worker with zero capacity and zero connections yields NaN, and every
    // comparison with NaN is false, so check for it explicitly instead of
    // letting such a worker slip through as "not overloaded".
    if load.is_nan() || load &gt; max_load {
        // Reject request to prevent overload
        return Err(anyhow::anyhow!("All workers at capacity, shedding load"));
    }

    Ok(worker)
}
<span class="boring">}</span></code></pre></pre>
<h2 id="failure-detection-timeline"><a class="header" href="#failure-detection-timeline">Failure Detection Timeline</a></h2>
<h3 id="node-crash-detection"><a class="header" href="#node-crash-detection">Node Crash Detection</a></h3>
<pre><code>Time:    0s      1s      2s      3s      4s      5s      6s      7s      8s
         |       |       |       |       |       |       |       |       |
Gossip:  ✓       ✓       ✓       X       .       .       .       .       .
         
Phi:     0       0       0       2       4       6       8       10      12
                                                         ^
                                                    Threshold (8.0)
                                                    Node marked FAILED
                                            
Events:  -       -       -       -       -       -    NodeFailed propagated
         
Registry:-       -       -       -       -       -    Worker removed from pool
         
Clients: -       -       -       -       -       -    Requests route elsewhere
</code></pre>
<p><strong>Total time to full recovery</strong>: ~6-8 seconds with default settings</p>
<h3 id="partition-detection-timeline"><a class="header" href="#partition-detection-timeline">Partition Detection Timeline</a></h3>
<pre><code>Time:    0s          5s          10s         15s         20s
         |           |           |           |           |
         Partition occurs
         |
         Side A can't reach Side B
         Side B can't reach Side A
         |
         Both sides mark other as "suspect"
                     |
                     Multiple nodes confirm partition
                                 |
                                 PartitionDetected event
                                             |
                                             Both sides operate independently
                                                         |
                                                         Partition heals
                                                         Gossip merges views
</code></pre>
<p><strong>Detection time</strong>: 10-15 seconds<br />
<strong>Recovery time</strong>: 5-10 seconds after partition heals</p>
<h2 id="retry-strategies"><a class="header" href="#retry-strategies">Retry Strategies</a></h2>
<h3 id="automatic-retry"><a class="header" href="#automatic-retry">Automatic Retry</a></h3>
<pre><pre class="playground"><code class="language-rust"><span class="boring">#![allow(unused)]
</span><span class="boring">fn main() {
</span>use tokio::time::{sleep, Duration};

/// Runs `f` until it succeeds or `max_retries` retries have been used.
///
/// Waits `100ms * 2^retry` between attempts (200ms, 400ms, 800ms, ...) and
/// returns the last error once the retry budget is exhausted.
async fn call_with_retry&lt;T&gt;(
    f: impl Fn() -&gt; Pin&lt;Box&lt;dyn Future&lt;Output = Result&lt;T&gt;&gt;&gt;&gt;,
    max_retries: usize,
) -&gt; Result&lt;T&gt; {
    let mut retries = 0;

    loop {
        let failure = match f().await {
            Ok(value) =&gt; return Ok(value),
            Err(err) =&gt; err,
        };

        // Out of budget: surface the most recent error unchanged.
        if retries &gt;= max_retries {
            return Err(failure);
        }

        retries += 1;
        log::warn!("Retry {}/{} after error: {}", retries, max_retries, failure);

        // Exponential backoff
        let backoff_ms = 100 * 2_u64.pow(retries as u32);
        sleep(Duration::from_millis(backoff_ms)).await;
    }
}

// Usage
let result = call_with_retry(
    || Box::pin(worker.call("compute", data.clone())),
    3
).await?;
<span class="boring">}</span></code></pre></pre>
<h3 id="failover-to-different-worker"><a class="header" href="#failover-to-different-worker">Failover to Different Worker</a></h3>
<pre><pre class="playground"><code class="language-rust"><span class="boring">#![allow(unused)]
</span><span class="boring">fn main() {
</span>/// Calls `method` on up to `max_attempts` distinct workers, returning the
/// first successful response.
///
/// Workers that were already tried are remembered by id and skipped. Returns
/// an error when worker selection fails, when every known worker has been
/// tried, or when `max_attempts` calls have all failed.
async fn call_with_failover(
    registry: Arc&lt;WorkerRegistry&gt;,
    method: &amp;str,
    data: Vec&lt;u8&gt;,
    max_attempts: usize,
) -&gt; Result&lt;Response&gt; {
    let mut attempted_workers = HashSet::new();
    
    for attempt in 0..max_attempts {
        // Select worker we haven't tried yet
        // NOTE(review): this inner loop re-selects with no delay or bound. If
        // the registry keeps returning an already-tried worker while untried
        // ones remain, it spins until the selection changes — confirm the
        // selection strategy rotates between workers.
        let worker = loop {
            let w = registry.select_worker(Some("role=worker")).await?;
            if !attempted_workers.contains(&amp;w.id) {
                break w;
            }
            
            // Every known worker has been tried; give up instead of looping.
            if attempted_workers.len() &gt;= registry.worker_count().await {
                return Err(anyhow::anyhow!("All workers failed"));
            }
        };
        
        attempted_workers.insert(worker.id);
        
        log::info!("Attempt {}: trying worker {}", attempt + 1, worker.label);
        
        match worker.call(method, data.clone()).await {
            Ok(response) =&gt; return Ok(response),
            Err(e) =&gt; {
                // Log and fall through to the next attempt with another worker.
                log::warn!("Worker {} failed: {}", worker.label, e);
                continue;
            }
        }
    }
    
    Err(anyhow::anyhow!("Failed after {} attempts", max_attempts))
}
<span class="boring">}</span></code></pre></pre>
<h3 id="circuit-breaker"><a class="header" href="#circuit-breaker">Circuit Breaker</a></h3>
<p>Prevent cascading failures by temporarily stopping requests to failed nodes:</p>
<pre><pre class="playground"><code class="language-rust"><span class="boring">#![allow(unused)]
</span><span class="boring">fn main() {
</span>use std::sync::Arc;
use tokio::sync::RwLock;
use std::collections::HashMap;

/// Per-worker circuit state.
#[derive(Clone)]
enum CircuitState {
    Closed,       // Normal operation
    Open,         // Failing, reject requests
    HalfOpen,     // Testing recovery
}

/// Tracks one circuit per worker id and short-circuits calls to workers whose
/// circuit is open.
struct CircuitBreaker {
    // Shared so the spawned recovery timer can update it after `call` returns.
    states: Arc&lt;RwLock&lt;HashMap&lt;Uuid, CircuitState&gt;&gt;&gt;,
    // NOTE(review): not consulted by `call` below, which opens the circuit on
    // the first failure — wire in a failure counter if a threshold is wanted.
    failure_threshold: usize,
    // How long a circuit stays open before moving to half-open.
    timeout: Duration,
}

impl CircuitBreaker {
    /// Runs `f` on behalf of `worker_id` unless that worker's circuit is open.
    ///
    /// Workers with no recorded state are treated as `Closed`. A success
    /// records `Closed`; a failure records `Open` and spawns a task that
    /// records `HalfOpen` after `timeout`. Returns an error immediately,
    /// without polling `f`, while the circuit is open.
    async fn call&lt;T&gt;(
        &amp;self,
        worker_id: Uuid,
        f: impl Future&lt;Output = Result&lt;T&gt;&gt;,
    ) -&gt; Result&lt;T&gt; {
        // The read guard is a temporary, so it is released before `f` runs.
        let state = self.states.read().await
            .get(&amp;worker_id)
            .cloned()
            .unwrap_or(CircuitState::Closed);
        
        match state {
            CircuitState::Open =&gt; {
                // Circuit open, reject immediately
                Err(anyhow::anyhow!("Circuit breaker open for worker {}", worker_id))
            }
            // Half-open is handled exactly like closed here: every caller is
            // let through, not just a single probe request.
            CircuitState::HalfOpen | CircuitState::Closed =&gt; {
                match f.await {
                    Ok(result) =&gt; {
                        // Success, close circuit
                        self.states.write().await.insert(worker_id, CircuitState::Closed);
                        Ok(result)
                    }
                    Err(e) =&gt; {
                        // Failure, open circuit
                        self.states.write().await.insert(worker_id, CircuitState::Open);
                        
                        // Schedule transition to half-open
                        let states = self.states.clone();
                        let timeout = self.timeout;
                        tokio::spawn(async move {
                            sleep(timeout).await;
                            states.write().await.insert(worker_id, CircuitState::HalfOpen);
                        });
                        
                        Err(e)
                    }
                }
            }
        }
    }
}
<span class="boring">}</span></code></pre></pre>
<h2 id="partition-handling"><a class="header" href="#partition-handling">Partition Handling</a></h2>
<h3 id="split-brain-prevention"><a class="header" href="#split-brain-prevention">Split-Brain Prevention</a></h3>
<p><strong>Problem</strong>: During a partition, both sides may accept writes, leading to conflicts.</p>
<p><strong>Solution 1</strong>: Majority quorum</p>
<pre><pre class="playground"><code class="language-rust"><span class="boring">#![allow(unused)]
</span><span class="boring">fn main() {
</span>/// Switches to read-only mode while this node cannot see a majority of the
/// cluster, and back to read-write once a majority is visible again.
///
/// `total_nodes` is the expected cluster size; a majority is
/// `total_nodes / 2 + 1`. Returns immediately when a majority is already
/// visible, otherwise polls every 5 seconds until quorum is regained.
async fn handle_partition_with_quorum(
    cluster: Arc&lt;ClusterMembership&gt;,
    total_nodes: usize,
) -&gt; Result&lt;()&gt; {
    let reachable = cluster.visible_nodes().await.len();
    let quorum = total_nodes / 2 + 1;

    // Nothing to do while we still hold quorum.
    if reachable &gt;= quorum {
        return Ok(());
    }

    log::error!("Lost majority quorum ({}/{}), entering read-only mode",
        reachable, total_nodes);

    // Enter read-only mode
    set_read_only(true).await;

    // Wait for partition to heal
    loop {
        sleep(Duration::from_secs(5)).await;

        if cluster.visible_nodes().await.len() &gt;= quorum {
            break;
        }
    }

    log::info!("Regained quorum, resuming writes");
    set_read_only(false).await;

    Ok(())
}
<span class="boring">}</span></code></pre></pre>
<p><strong>Solution 2</strong>: Designated leader</p>
<pre><pre class="playground"><code class="language-rust"><span class="boring">#![allow(unused)]
</span><span class="boring">fn main() {
</span>// Only one node (leader) accepts writes
/// Puts this node into read-only mode when the designated-leader invariant
/// is broken.
///
/// The leader goes read-only when it cannot reach a majority; any other node
/// goes read-only when it cannot reach the leader. Otherwise nothing changes.
async fn handle_partition_with_leader(
    cluster: Arc&lt;ClusterMembership&gt;,
    leader_id: Uuid,
) -&gt; Result&lt;()&gt; {
    if cluster.local_node_id() == leader_id {
        // I'm the leader, check if I can reach majority
        if can_reach_majority(&amp;cluster).await {
            return Ok(());
        }
        log::error!("Leader lost majority, stepping down");
    } else {
        // I'm not the leader, check if I can reach leader
        if can_reach_node(&amp;cluster, leader_id).await {
            return Ok(());
        }
        log::error!("Lost connection to leader, entering read-only mode");
    }

    // Either check failed: stop accepting writes.
    set_read_only(true).await;

    Ok(())
}
<span class="boring">}</span></code></pre></pre>
<h3 id="partition-recovery"><a class="header" href="#partition-recovery">Partition Recovery</a></h3>
<p>When a partition heals, nodes must reconcile state:</p>
<pre><pre class="playground"><code class="language-rust"><span class="boring">#![allow(unused)]
</span><span class="boring">fn main() {
</span>/// Listens for `PartitionHealed` events and, for each one, re-syncs cluster
/// state, reconciles application state, and leaves read-only mode.
///
/// Returns `Ok(())` when the event stream ends, or the first error from
/// re-syncing or reconciliation.
async fn handle_partition_recovery(
    cluster: Arc&lt;ClusterMembership&gt;,
) -&gt; Result&lt;()&gt; {
    let mut events = cluster.subscribe();

    while let Some(event) = events.recv().await {
        // Only healed partitions are of interest here.
        if !matches!(event, ClusterEvent::PartitionHealed) {
            continue;
        }

        log::info!("Partition healed, reconciling state");

        // Re-sync cluster state
        cluster.resync().await?;

        // Reconcile application state
        reconcile_application_state().await?;

        // Resume normal operation
        set_read_only(false).await;

        log::info!("Partition recovery complete");
    }

    Ok(())
}

async fn reconcile_application_state() -&gt; Result&lt;()&gt; {
    // Application-specific reconciliation logic
    // Examples:
    // - Compare vector clocks
    // - Merge CRDTs
    // - Apply conflict resolution rules
    // - Manual operator intervention
    
    Ok(())
}
<span class="boring">}</span></code></pre></pre>
<h2 id="client-side-handling"><a class="header" href="#client-side-handling">Client-Side Handling</a></h2>
<h3 id="transparent-failover"><a class="header" href="#transparent-failover">Transparent Failover</a></h3>
<p>Clients should automatically fail over to healthy workers:</p>
<pre><pre class="playground"><code class="language-rust"><span class="boring">#![allow(unused)]
</span><span class="boring">fn main() {
</span>// Client implementation with automatic failover
/// RPC client that retries a call on another worker when one fails.
struct ResilientClient {
    registry: Arc&lt;WorkerRegistry&gt;,
    client: Arc&lt;ClusterClient&gt;,
    // Pool used by `call` to obtain a connection for the selected worker.
    // NOTE(review): the type name is assumed — substitute the pool type your
    // application uses; it must provide `get_or_connect(addr)`.
    connection_pool: Arc&lt;ConnectionPool&gt;,
}

impl ResilientClient {
    /// Calls `method` with `data`, trying up to three workers.
    ///
    /// Each attempt selects a worker, obtains a pooled connection, and sends
    /// the request. A failed connection or a failed request is reported to
    /// the registry and followed by a linearly growing pause before the next
    /// attempt. Returns the first successful response, the selection error if
    /// no worker is available on the last attempt, or a generic error once
    /// all attempts are used.
    async fn call(&amp;self, method: &amp;str, data: Vec&lt;u8&gt;) -&gt; Result&lt;Response&gt; {
        const MAX_ATTEMPTS: usize = 3;

        for attempt in 1..=MAX_ATTEMPTS {
            // Get healthy worker
            let worker = match self.registry.select_worker(Some("role=worker")).await {
                Ok(w) =&gt; w,
                Err(_) if attempt &lt; MAX_ATTEMPTS =&gt; {
                    log::warn!("No workers available, retrying...");
                    sleep(Duration::from_millis(100)).await;
                    continue;
                }
                Err(e) =&gt; return Err(e),
            };

            // Get pooled connection and make the request. A connection error
            // counts as a failed attempt rather than aborting the whole call,
            // otherwise one unreachable worker would defeat failover.
            let failure = match self.connection_pool.get_or_connect(worker.addr).await {
                Ok(conn) =&gt; match conn.call(method, data.clone()).await {
                    Ok(response) =&gt; return Ok(response),
                    Err(e) =&gt; e.to_string(),
                },
                Err(e) =&gt; e.to_string(),
            };

            log::warn!("Worker {} failed (attempt {}): {}",
                worker.label, attempt, failure);

            // Mark worker as potentially failed
            self.registry.report_failure(worker.id).await;

            // Back off a little longer after each failed attempt.
            if attempt &lt; MAX_ATTEMPTS {
                sleep(Duration::from_millis(100 * attempt as u64)).await;
            }
        }

        Err(anyhow::anyhow!("All attempts failed"))
    }
}
<span class="boring">}</span></code></pre></pre>
<h3 id="request-hedging"><a class="header" href="#request-hedging">Request Hedging</a></h3>
<p>If the first request is slow, send a duplicate request to a second worker and use whichever response arrives first:</p>
<pre><pre class="playground"><code class="language-rust"><span class="boring">#![allow(unused)]
</span><span class="boring">fn main() {
</span>/// Sends a request and, if it has not completed within `hedge_after`,
/// sends a duplicate to a second worker and returns whichever finishes first.
///
/// Returns the first request's result when it completes before the hedge
/// delay, otherwise the result of whichever request completes first. Fails
/// early when a worker cannot be selected.
/// NOTE(review): nothing here prevents the second selection from returning
/// the same worker as the first — confirm the registry's selection strategy.
async fn hedged_call(
    registry: Arc&lt;WorkerRegistry&gt;,
    method: &amp;str,
    data: Vec&lt;u8&gt;,
    hedge_after: Duration,
) -&gt; Result&lt;Response&gt; {
    let worker1 = registry.select_worker(Some("role=worker")).await?;

    // Start first request. Pin it so it can be polled by reference: passing
    // the future by value to the first `select!` would move it, and it must
    // stay alive to race against the hedge request below.
    let req1 = worker1.call(method, data.clone());
    tokio::pin!(req1);

    tokio::select! {
        result = &amp;mut req1 =&gt; result,
        _ = sleep(hedge_after) =&gt; {
            // First request taking too long, send hedge request
            log::info!("Hedging request to second worker");

            let worker2 = registry.select_worker(Some("role=worker")).await?;
            let req2 = worker2.call(method, data.clone());

            // Return whichever completes first
            tokio::select! {
                result = &amp;mut req1 =&gt; result,
                result = req2 =&gt; result,
            }
        }
    }
}
<span class="boring">}</span></code></pre></pre>
<h2 id="monitoring-failures"><a class="header" href="#monitoring-failures">Monitoring Failures</a></h2>
<h3 id="track-failure-metrics"><a class="header" href="#track-failure-metrics">Track Failure Metrics</a></h3>
<pre><pre class="playground"><code class="language-rust"><span class="boring">#![allow(unused)]
</span><span class="boring">fn main() {
</span>struct FailureMetrics {
    node_failures: Counter,
    partition_count: Counter,
    retry_count: Counter,
    circuit_breaks: Counter,
}

/// Consumes cluster events, counting node failures and partitions and
/// alerting the ops team about partitions and failed critical nodes.
///
/// Runs until the event stream ends.
async fn monitor_failures(cluster: Arc&lt;ClusterMembership&gt;) {
    let mut events = cluster.subscribe();

    while let Some(event) = events.recv().await {
        match event {
            ClusterEvent::NodeFailed(node) =&gt; {
                metrics::increment_counter!("cluster.node_failures");
                log::error!("Node {} failed", node.id);

                // Alert if critical worker. Comparing as a str avoids
                // allocating a String just for the equality check.
                if node.tags.get("critical").map(String::as_str) == Some("true") {
                    alert_ops_team(&amp;format!("Critical node {} failed", node.id));
                }
            }
            // `..` ignores all payload fields. The partition-handling example
            // destructures this variant as `(minority, majority)`, so a
            // single `_` would not match its arity.
            ClusterEvent::PartitionDetected(..) =&gt; {
                metrics::increment_counter!("cluster.partitions");
                alert_ops_team("Network partition detected");
            }
            _ =&gt; {}
        }
    }
}
<span class="boring">}</span></code></pre></pre>
<h3 id="health-dashboard"><a class="header" href="#health-dashboard">Health Dashboard</a></h3>
<pre><pre class="playground"><code class="language-rust"><span class="boring">#![allow(unused)]
</span><span class="boring">fn main() {
</span>/// Renders a plain-text summary of worker health.
///
/// Reports the total number of workers and how many are healthy, degraded,
/// or failed, each with a whole-number percentage. Workers that are neither
/// healthy nor degraded are counted as failed. An empty registry reports 0%
/// everywhere.
async fn health_dashboard(registry: Arc&lt;WorkerRegistry&gt;) -&gt; String {
    let workers = registry.workers().await;
    let total = workers.len();
    let healthy = workers.iter().filter(|w| w.is_healthy()).count();
    let degraded = workers.iter().filter(|w| w.is_degraded()).count();
    // Saturate so a worker that reports both healthy and degraded cannot
    // underflow the subtraction.
    let failed = total.saturating_sub(healthy + degraded);

    // Integer division by zero panics, so an empty registry needs a guard.
    let percent = |count: usize| if total == 0 { 0 } else { count * 100 / total };

    format!(
        "Cluster Health:\n\
         Total Workers: {}\n\
         Healthy: {} ({}%)\n\
         Degraded: {} ({}%)\n\
         Failed: {} ({}%)\n",
        total,
        healthy, percent(healthy),
        degraded, percent(degraded),
        failed, percent(failed)
    )
}
<span class="boring">}</span></code></pre></pre>
<h2 id="best-practices-3"><a class="header" href="#best-practices-3">Best Practices</a></h2>
<h3 id="1-design-for-failure"><a class="header" href="#1-design-for-failure">1. Design for Failure</a></h3>
<pre><pre class="playground"><code class="language-rust"><span class="boring">#![allow(unused)]
</span><span class="boring">fn main() {
</span>// Assume failures will happen
// ✅ Good: Handle failures gracefully
// The worker receives a clone of `data`, so the original bytes are still
// available for `fallback_processing` when the call fails.
async fn process(data: Vec&lt;u8&gt;) -&gt; Result&lt;Response&gt; {
    match call_worker(data.clone()).await {
        Ok(response) =&gt; Ok(response),
        Err(e) =&gt; {
            // Log the failure, then fall back instead of returning the error.
            log::error!("Worker call failed: {}", e);
            fallback_processing(data).await
        }
    }
}

// ❌ Bad: No failure handling
async fn process(data: Vec&lt;u8&gt;) -&gt; Result&lt;Response&gt; {
    call_worker(data).await  // Worker error is returned as-is: no logging, no fallback
}
<span class="boring">}</span></code></pre></pre>
<h3 id="2-set-appropriate-timeouts"><a class="header" href="#2-set-appropriate-timeouts">2. Set Appropriate Timeouts</a></h3>
<pre><pre class="playground"><code class="language-rust"><span class="boring">#![allow(unused)]
</span><span class="boring">fn main() {
</span>// ✅ Good: Timeout prevents hanging
let result = tokio::time::timeout(
    Duration::from_secs(5),
    worker.call("compute", data)
).await??;

// ❌ Bad: No timeout, could hang forever
let result = worker.call("compute", data).await?;
<span class="boring">}</span></code></pre></pre>
<h3 id="3-implement-idempotency"><a class="header" href="#3-implement-idempotency">3. Implement Idempotency</a></h3>
<pre><pre class="playground"><code class="language-rust"><span class="boring">#![allow(unused)]
</span><span class="boring">fn main() {
</span>// ✅ Good: Idempotent operations safe to retry
#[rpc_trait]
pub trait ComputeService {
    async fn process(&amp;self, request_id: Uuid, data: Vec&lt;u8&gt;) -&gt; Result&lt;Response&gt;;
    //                      ^^^^^^^^^^^^ request ID makes it idempotent
}

// Check if already processed
if let Some(cached) = self.check_cache(request_id).await {
    return Ok(cached);
}
<span class="boring">}</span></code></pre></pre>
<h3 id="4-monitor-everything"><a class="header" href="#4-monitor-everything">4. Monitor Everything</a></h3>
<pre><pre class="playground"><code class="language-rust"><span class="boring">#![allow(unused)]
</span><span class="boring">fn main() {
</span>// Track all failure types
metrics::increment_counter!("failures.node_crash");
metrics::increment_counter!("failures.timeout");
metrics::increment_counter!("failures.partition");
metrics::gauge!("cluster.healthy_nodes", healthy_count as f64);
<span class="boring">}</span></code></pre></pre>
<h3 id="5-test-failure-scenarios"><a class="header" href="#5-test-failure-scenarios">5. Test Failure Scenarios</a></h3>
<pre><pre class="playground"><code class="language-rust"><span class="boring">#![allow(unused)]
</span><span class="boring">fn main() {
</span>/// A request must still succeed after one worker in the cluster is shut down.
#[tokio::test]
async fn test_worker_failure() {
    // Bring up the director together with its workers.
    let (director, cluster_workers) = setup_cluster().await;

    // Take the first worker offline to simulate a failure.
    cluster_workers[0].shutdown().await;

    // The call is expected to succeed despite the missing worker.
    let resilient = ResilientClient::new(director.registry());
    let payload = vec![1, 2, 3];
    let outcome = resilient.call("compute", payload).await;
    assert!(outcome.is_ok());
}
<span class="boring">}</span></code></pre></pre>
<h2 id="next-steps-6"><a class="header" href="#next-steps-6">Next Steps</a></h2>
<ul>
<li><strong><a href="cluster/discovery.html">Discovery</a></strong> - Understand how nodes discover failures</li>
<li><strong><a href="cluster/health.html">Health Checking</a></strong> - Learn about Phi Accrual detection</li>
<li><strong><a href="cluster/../advanced/production.html">Production Guide</a></strong> - Deploy resilient clusters</li>
</ul>
<h2 id="references-3"><a class="header" href="#references-3">References</a></h2>
<ul>
<li><a href="https://en.wikipedia.org/wiki/Fallacies_of_distributed_computing">Fallacies of Distributed Computing</a> - Common mistakes</li>
<li><a href="https://en.wikipedia.org/wiki/CAP_theorem">CAP Theorem</a> - Consistency vs Availability trade-offs</li>
<li><a href="https://martinfowler.com/bliki/CircuitBreaker.html">Circuit Breaker Pattern</a> - Martin Fowler's article</li>
</ul>
<div style="break-before: page; page-break-before: always;"></div><h1 id="streaming-overview"><a class="header" href="#streaming-overview">Streaming Overview</a></h1>
<p>RpcNet builds streaming on top of QUIC bidirectional streams, letting clients
and servers exchange sequences of frames concurrently. This chapter explains the
core terminology, how the helpers map to underlying QUIC behaviour, and which
features to reach for when designing real-time APIs.</p>
<h2 id="what-streaming-means-in-rpcnet"><a class="header" href="#what-streaming-means-in-rpcnet">What “streaming” means in RpcNet</a></h2>
<p>Each streaming RPC opens a fresh QUIC bidirectional stream:</p>
<ul>
<li>Frames are transported as length-prefixed <code>Vec&lt;u8&gt;</code> payloads.</li>
<li>Upload and download directions operate independently; the client can keep
sending while the server responds, and vice versa.</li>
<li>Either side sends a zero-length frame to signal end-of-stream.</li>
</ul>
<p>RpcNet exposes three convenience helpers that mirror gRPC-style semantics:</p>
<div class="table-wrapper"><table><thead><tr><th>Pattern</th><th>Helper on <code>RpcClient</code></th><th>Typical use case</th></tr></thead><tbody>
<tr><td>Bidirectional streaming</td><td><code>call_streaming</code></td><td>Chat, collaborative editing, turn-taking</td></tr>
<tr><td>Server streaming</td><td><code>call_server_streaming</code></td><td>Live dashboards, subscriptions, long poll</td></tr>
<tr><td>Client streaming</td><td><code>call_client_streaming</code></td><td>Batched uploads, telemetry aggregation</td></tr>
</tbody></table>
</div>
<p>The server registers a single handler API (<code>register_streaming</code>) for all three
patterns; the difference lies in how the client constructs the request stream
and how many responses it expects.</p>
<h2 id="frame-format"><a class="header" href="#frame-format">Frame format</a></h2>
<p>RpcNet’s streaming frames follow this layout:</p>
<pre><code>&lt;u32 payload_length in little endian&gt;&lt;payload bytes&gt;
</code></pre>
<ul>
<li><code>payload_length == 0</code> means “no more frames”.</li>
<li>Payloads contain arbitrary user-defined bytes; most examples serialize using
<code>bincode</code> or <code>serde_json</code>.</li>
<li>The library allocates buffers lazily and only keeps a single frame in memory
per direction.</li>
</ul>
<h2 id="bidirectional-streaming-in-detail"><a class="header" href="#bidirectional-streaming-in-detail">Bidirectional streaming in detail</a></h2>
<p>Use <code>RpcClient::call_streaming</code> when both sides continuously trade messages:</p>
<pre><pre class="playground"><code class="language-rust"><span class="boring">#![allow(unused)]
</span><span class="boring">fn main() {
</span>let responses = client.call_streaming("chat", outbound_frames).await?;
<span class="boring">}</span></code></pre></pre>
<p>The client passes an async <code>Stream&lt;Item = Vec&lt;u8&gt;&gt;</code> and receives another stream
for responses. RpcNet multiplexes both directions on a single QUIC stream. The
server handler receives an async stream of request frames and must return an
async stream of <code>Result&lt;Vec&lt;u8&gt;, RpcError&gt;</code> responses.</p>
<p>Choose this mode when:</p>
<ul>
<li>Each request needs a corresponding response (command/reply flow).</li>
<li>Both parties produce data over time (whiteboard sessions, multiplayer games).</li>
<li>You want to push updates without closing the upload direction.</li>
</ul>
<h2 id="server-streaming"><a class="header" href="#server-streaming">Server streaming</a></h2>
<p><code>RpcClient::call_server_streaming</code> wraps <code>call_streaming</code> for the common case
where the client sends <strong>one</strong> request and the server streams many responses:</p>
<pre><pre class="playground"><code class="language-rust"><span class="boring">#![allow(unused)]
</span><span class="boring">fn main() {
</span>let stream = client.call_server_streaming("subscribe", request_bytes).await?;
<span class="boring">}</span></code></pre></pre>
<p>On the server, the handler still observes a request stream; most implementations
read the first frame as the subscription and ignore additional frames. Use this
pattern when the server drives the timeline (market data, notifications,
progress updates).</p>
<h2 id="client-streaming"><a class="header" href="#client-streaming">Client streaming</a></h2>
<p><code>RpcClient::call_client_streaming</code> handles the inverse: the client uploads many
frames and waits for a single aggregated response.</p>
<pre><pre class="playground"><code class="language-rust"><span class="boring">#![allow(unused)]
</span><span class="boring">fn main() {
</span>let response = client.call_client_streaming("upload", outbound_frames).await?;
<span class="boring">}</span></code></pre></pre>
<p>The server consumes every inbound frame before yielding exactly one response
frame. This pattern pairs well with compression or summarisation (log shipping,
bulk metrics, video chunk ingestion).</p>
<h2 id="keep-alive-and-flow-control"><a class="header" href="#keep-alive-and-flow-control">Keep-alive and flow control</a></h2>
<ul>
<li><code>RpcConfig::with_keep_alive_interval</code> controls heartbeat frames at the QUIC
layer, keeping otherwise idle streams alive.</li>
<li>Flow control is managed by s2n-quic; RpcNet reads and writes asynchronously,
so slow consumers only backpressure their own stream, not the entire
connection.</li>
<li>Because each RPC lives on a separate QUIC stream, you can run many streaming
calls in parallel without head-of-line blocking.</li>
</ul>
<h2 id="error-handling-semantics"><a class="header" href="#error-handling-semantics">Error handling semantics</a></h2>
<ul>
<li>
<p>Returning <code>Err(RpcError)</code> from a server response stream sends a generic error
frame to the client and terminates the stream. Encode domain-specific errors
inside your payloads when you need richer context.</p>
</li>
<li>
<p>If the client drops its output stream early, the server handler eventually
sees <code>None</code> from the inbound stream and can clean up resources.</p>
</li>
<li>
<p>Timeouts follow the same <code>DEFAULT_TIMEOUT</code> as unary calls, so linger only as
long as your app requires.</p>
</li>
</ul>
<h2 id="choosing-between-streaming-helpers"><a class="header" href="#choosing-between-streaming-helpers">Choosing between streaming helpers</a></h2>
<p>Ask yourself:</p>
<ol>
<li>Does the client expect multiple responses? → Use server streaming.</li>
<li>Does the server expect multiple requests? → Use client streaming.</li>
<li>Do both sides talk repeatedly? → Use bidirectional streaming.</li>
</ol>
<p>When none of the above apply, stick with unary RPCs—they offer simpler error
handling and deterministic retry behaviour.</p>
<h2 id="whats-next"><a class="header" href="#whats-next">What’s next</a></h2>
<ul>
<li>Jump to the <a href="streaming-example.html">Streaming Walkthrough</a> for a complete
telemetry example that covers every helper.</li>
<li>Revisit <a href="concepts.html#streaming-patterns">Concepts</a> if you need low-level API
reminders or code snippets.</li>
</ul>
<p>Armed with the terminology and behaviour described here, you can design
streaming endpoints with confidence and implement them using the detailed guide
in the next chapter.</p>
<div style="break-before: page; page-break-before: always;"></div><h1 id="streaming-walkthrough"><a class="header" href="#streaming-walkthrough">Streaming Walkthrough</a></h1>
<p>This end-to-end example builds a telemetry service that exercises every
streaming mode RpcNet offers: bidirectional chat, server streaming updates, and
client streaming uploads. Follow along to scaffold the project, implement the
handlers, and drive the flows from a client binary.</p>
<h2 id="step-0-prerequisites-1"><a class="header" href="#step-0-prerequisites-1">Step 0: Prerequisites</a></h2>
<ul>
<li>Rust 1.75+ (<code>rustup show</code> to confirm)</li>
<li><code>cargo</code> on your <code>PATH</code></li>
<li>macOS or Linux (TLS support is bundled via <code>s2n-quic</code>)</li>
</ul>
<h2 id="step-1-create-the-project-layout"><a class="header" href="#step-1-create-the-project-layout">Step 1: Create the project layout</a></h2>
<pre><code class="language-bash">cargo new telemetry-streams --bin
cd telemetry-streams
mkdir -p certs src/bin
rm src/main.rs  # we'll rely on explicit binaries instead of the default main
</code></pre>
<p>The example uses two binaries: <code>src/bin/server.rs</code> and <code>src/bin/client.rs</code>.</p>
<h2 id="step-2-declare-dependencies"><a class="header" href="#step-2-declare-dependencies">Step 2: Declare dependencies</a></h2>
<p>Edit <code>Cargo.toml</code> to pull in RpcNet and helper crates:</p>
<pre><code class="language-toml">[package]
name = "telemetry-streams"
version = "0.1.0"
edition = "2021"

[dependencies]
rpcnet = "0.2"
serde = { version = "1", features = ["derive"] }
bincode = "1.3"
async-stream = "0.3"
futures = "0.3"
tokio = { version = "1", features = ["rt-multi-thread", "macros", "time"] }
</code></pre>
<ul>
<li><code>rpcnet</code> provides the client/server runtime.</li>
<li><code>async-stream</code> and <code>futures</code> help produce response streams on the server.</li>
<li><code>serde</code>/<code>bincode</code> handle payload serialization.</li>
<li>Tokio is required because RpcNet is async-first.</li>
</ul>
<h2 id="step-3-generate-development-certificates"><a class="header" href="#step-3-generate-development-certificates">Step 3: Generate development certificates</a></h2>
<p>RpcNet requires TLS material for QUIC. Create a self-signed pair for local
experiments:</p>
<pre><code class="language-bash">openssl req -x509 -newkey rsa:4096 \
  -keyout certs/server-key.pem \
  -out certs/server-cert.pem \
  -days 365 -nodes \
  -subj "/CN=localhost"
</code></pre>
<p>The client reuses the public certificate file to trust the server.</p>
<h2 id="step-4-define-shared-data-types"><a class="header" href="#step-4-define-shared-data-types">Step 4: Define shared data types</a></h2>
<p>Expose a library module that both binaries can import. Create <code>src/lib.rs</code>:</p>
<pre><pre class="playground"><code class="language-rust"><span class="boring">#![allow(unused)]
</span><span class="boring">fn main() {
</span>// src/lib.rs
pub mod telemetry;
<span class="boring">}</span></code></pre></pre>
<p>Now add the telemetry definitions in <code>src/telemetry.rs</code>:</p>
<pre><pre class="playground"><code class="language-rust"><span class="boring">#![allow(unused)]
</span><span class="boring">fn main() {
</span>// src/telemetry.rs
// Message types shared by the server and client binaries, plus the bincode
// helpers that turn them into frame payloads and back.
use rpcnet::RpcError;
use serde::{Deserialize, Serialize};

/// A single sensor sample. Sent as the `subscribe_metrics` request and as each
/// uploaded frame of `upload_batch`.
#[derive(Serialize, Deserialize, Debug, Clone)]
pub struct MetricReading {
    pub sensor: String,
    pub value: f64,
}

/// One update yielded by the `subscribe_metrics` handler for a sensor.
#[derive(Serialize, Deserialize, Debug, Clone)]
pub struct LiveUpdate {
    pub sensor: String,
    pub rolling_avg: f64,
}

/// A chat line exchanged on the `chat` stream; `from` names the sender.
#[derive(Serialize, Deserialize, Debug, Clone)]
pub struct ChatMessage {
    pub from: String,
    pub body: String,
}

/// Reply to `upload_batch`; `accepted` is the number of readings the server
/// decoded from the upload.
#[derive(Serialize, Deserialize, Debug, Clone)]
pub struct Ack {
    pub accepted: usize,
}

/// Serializes `value` with bincode into a frame payload.
///
/// A bincode failure is converted into `RpcError` by the `?` operator.
pub fn encode&lt;T: Serialize&gt;(value: &amp;T) -&gt; Result&lt;Vec&lt;u8&gt;, RpcError&gt; {
    Ok(bincode::serialize(value)?)
}

/// Deserializes a frame payload with bincode into an owned `T`.
///
/// A bincode failure is converted into `RpcError` by the `?` operator.
pub fn decode&lt;T: for&lt;'de&gt; Deserialize&lt;'de&gt;&gt;(bytes: &amp;[u8]) -&gt; Result&lt;T, RpcError&gt; {
    Ok(bincode::deserialize(bytes)?)
}
<span class="boring">}</span></code></pre></pre>
<p>These helpers convert structures to and from the <code>Vec&lt;u8&gt;</code> payloads that
RpcNet transports.</p>
<h2 id="step-5-implement-the-streaming-server"><a class="header" href="#step-5-implement-the-streaming-server">Step 5: Implement the streaming server</a></h2>
<p>Create <code>src/bin/server.rs</code> with three handlers—one per streaming pattern:</p>
<pre><pre class="playground"><code class="language-rust">// src/bin/server.rs
use async_stream::stream;
use futures::StreamExt;
use rpcnet::{RpcConfig, RpcServer};
use telemetry_streams::telemetry::{self, Ack, ChatMessage, LiveUpdate, MetricReading};
use tokio::time::{sleep, Duration};

// Registers one streaming handler per pattern, then binds the QUIC endpoint
// and serves requests.
#[tokio::main]
async fn main() -&gt; Result&lt;(), Box&lt;dyn std::error::Error&gt;&gt; {
    let config = RpcConfig::new("certs/server-cert.pem", "127.0.0.1:9000")
        .with_key_path("certs/server-key.pem")
        .with_server_name("localhost");

    let mut server = RpcServer::new(config);

    // Bidirectional chat: echo each message with a server tag.
    server
        .register_streaming("chat", |mut inbound| async move {
            stream! {
                // One reply per inbound frame; the loop ends when the inbound
                // stream does.
                while let Some(frame) = inbound.next().await {
                    let msg: ChatMessage = telemetry::decode(&amp;frame)?;
                    let reply = ChatMessage {
                        from: "server".into(),
                        body: format!("ack: {}", msg.body),
                    };
                    yield telemetry::encode(&amp;reply);
                }
            }
        })
        .await;

    // Server streaming: emit rolling averages for a requested sensor.
    server
        .register_streaming("subscribe_metrics", |mut inbound| async move {
            stream! {
                // Only the first frame is read; it carries the sensor name and
                // the starting value.
                if let Some(frame) = inbound.next().await {
                    let req: MetricReading = telemetry::decode(&amp;frame)?;
                    // The window starts with the requested value, so the first
                    // update already averages two samples.
                    let mut window = vec![req.value];
                    // Five updates, 500 ms apart.
                    for step in 1..=5 {
                        sleep(Duration::from_millis(500)).await;
                        window.push(req.value + step as f64);
                        let avg = window.iter().copied().sum::&lt;f64&gt;() / window.len() as f64;
                        let update = LiveUpdate { sensor: req.sensor.clone(), rolling_avg: avg };
                        yield telemetry::encode(&amp;update);
                    }
                }
            }
        })
        .await;

    // Client streaming: collect readings and acknowledge how many we processed.
    server
        .register_streaming("upload_batch", |mut inbound| async move {
            stream! {
                let mut readings: Vec&lt;MetricReading&gt; = Vec::new();
                // Drain the whole upload before replying.
                while let Some(frame) = inbound.next().await {
                    let reading: MetricReading = telemetry::decode(&amp;frame)?;
                    readings.push(reading);
                }
                // Exactly one response frame: the count of decoded readings.
                let ack = Ack { accepted: readings.len() };
                yield telemetry::encode(&amp;ack);
            }
        })
        .await;

    // `bind` produces the QUIC server that `start` then drives.
    let quic_server = server.bind()?;
    println!("Telemetry server listening on 127.0.0.1:9000");
    server.start(quic_server).await?;
    Ok(())
}</code></pre></pre>
<p>Key points:</p>
<ul>
<li><code>register_streaming</code> receives a stream of request frames (<code>Vec&lt;u8&gt;</code>) and must
return a stream of <code>Result&lt;Vec&lt;u8&gt;, RpcError&gt;</code> responses.</li>
<li>The bidirectional handler echoes every inbound payload.</li>
<li>The server-streaming handler reads a single subscription request and then
pushes periodic updates without further client input.</li>
<li>The client-streaming handler drains all incoming frames before returning one
acknowledgement.</li>
</ul>
<h2 id="step-6-implement-the-client"><a class="header" href="#step-6-implement-the-client">Step 6: Implement the client</a></h2>
<p>Create <code>src/bin/client.rs</code> to exercise each streaming helper:</p>
<pre><pre class="playground"><code class="language-rust">// src/bin/client.rs
use futures::{stream, StreamExt};
use rpcnet::{RpcClient, RpcConfig, RpcError};
use telemetry_streams::telemetry::{self, Ack, ChatMessage, LiveUpdate, MetricReading};

// Connects to the telemetry server and runs the three streaming demos in order.
#[tokio::main]
async fn main() -&gt; Result&lt;(), Box&lt;dyn std::error::Error&gt;&gt; {
    // The client is configured with the server's public certificate.
    let client_config = RpcConfig::new("certs/server-cert.pem", "127.0.0.1:0");
    let client_config = client_config.with_server_name("localhost");

    let server_addr = "127.0.0.1:9000".parse()?;
    let rpc = RpcClient::connect(server_addr, client_config).await?;

    // Each demo finishes before the next one starts.
    chat_demo(&amp;rpc).await?;
    server_stream_demo(&amp;rpc).await?;
    client_stream_demo(&amp;rpc).await?;

    Ok(())
}

// Sends two chat messages over a bidirectional stream and prints each reply.
// Serialization, transport and decoding failures are all returned as `RpcError`.
async fn chat_demo(client: &amp;RpcClient) -&gt; Result&lt;(), RpcError&gt; {
    println!("\n--- Bidirectional chat ---");
    let messages = vec![
        ChatMessage { from: "operator".into(), body: "ping".into() },
        ChatMessage { from: "operator".into(), body: "status?".into() },
    ];
    // Encode up front and propagate a serialization failure with `?` rather
    // than panicking; `collect` stops at the first `Err`.
    let outbound_frames = messages
        .iter()
        .map(|msg| telemetry::encode(msg))
        .collect::&lt;Result&lt;Vec&lt;Vec&lt;u8&gt;&gt;, RpcError&gt;&gt;()?;
    let outbound = stream::iter(outbound_frames);
    let mut inbound = client.call_streaming("chat", outbound).await?;
    // Each response frame is itself a `Result`; a failed frame ends the demo.
    while let Some(frame) = inbound.next().await {
        let bytes = frame?;
        let reply: ChatMessage = telemetry::decode(&amp;bytes)?;
        println!("reply: {}", reply.body);
    }
    Ok(())
}

// Subscribes to `subscribe_metrics` for the `temp` sensor and prints every
// rolling average until the response stream ends.
async fn server_stream_demo(client: &amp;RpcClient) -&gt; Result&lt;(), RpcError&gt; {
    println!("\n--- Server streaming ---");
    let subscription = MetricReading { sensor: "temp".into(), value: 21.0 };
    let request = telemetry::encode(&amp;subscription)?;
    let mut updates = client
        .call_server_streaming("subscribe_metrics", request)
        .await?;
    // `transpose` turns `Option&lt;Result&lt;..&gt;&gt;` into `Result&lt;Option&lt;..&gt;&gt;`, so a
    // failed frame exits through `?` and `None` ends the loop.
    while let Some(bytes) = updates.next().await.transpose()? {
        let update: LiveUpdate = telemetry::decode(&amp;bytes)?;
        println!("rolling avg: {:.2}", update.rolling_avg);
    }
    Ok(())
}

// Uploads three readings over a client stream and prints the server's
// acknowledgement. Serialization, transport and decoding failures are all
// returned as `RpcError`.
async fn client_stream_demo(client: &amp;RpcClient) -&gt; Result&lt;(), RpcError&gt; {
    println!("\n--- Client streaming ---");
    let samples = vec![
        MetricReading { sensor: "temp".into(), value: 21.0 },
        MetricReading { sensor: "temp".into(), value: 21.5 },
        MetricReading { sensor: "temp".into(), value: 22.0 },
    ];
    // Propagate a serialization failure with `?` rather than panicking;
    // `collect` stops at the first `Err`.
    let readings = samples
        .iter()
        .map(|reading| telemetry::encode(reading))
        .collect::&lt;Result&lt;Vec&lt;Vec&lt;u8&gt;&gt;, RpcError&gt;&gt;()?;
    let outbound = stream::iter(readings);
    // The whole upload resolves to a single response frame.
    let ack_frame = client
        .call_client_streaming("upload_batch", outbound)
        .await?;
    let ack: Ack = telemetry::decode(&amp;ack_frame)?;
    println!("server accepted {} readings", ack.accepted);
    Ok(())
}</code></pre></pre>
<p>The client demonstrates:</p>
<ul>
<li><code>call_streaming</code> for true bidirectional messaging.</li>
<li><code>call_server_streaming</code> when only the server produces a stream of frames.</li>
<li><code>call_client_streaming</code> to upload many frames and receive one response.</li>
</ul>
<h2 id="step-7-run-the-scenario"><a class="header" href="#step-7-run-the-scenario">Step 7: Run the scenario</a></h2>
<p>Terminal 1 – start the server:</p>
<pre><code class="language-bash">cargo run --bin server
</code></pre>
<p>Terminal 2 – launch the client:</p>
<pre><code class="language-bash">cargo run --bin client
</code></pre>
<p>Expected output (trimmed for brevity):</p>
<pre><code>--- Bidirectional chat ---
reply: ack: ping
reply: ack: status?

--- Server streaming ---
rolling avg: 21.50
rolling avg: 22.00
...

--- Client streaming ---
server accepted 3 readings
</code></pre>
<h2 id="where-to-go-next-1"><a class="header" href="#where-to-go-next-1">Where to go next</a></h2>
<ul>
<li>Revisit the <a href="concepts.html#streaming-patterns">Concepts</a> chapter for API
reference material.</li>
<li>Combine streaming RPCs with code-generated unary services from the
<a href="getting-started.html">Getting Started</a> tutorial.</li>
<li>Layer authentication, backpressure, or persistence around these handlers to
match your production needs.</li>
</ul>
<div style="break-before: page; page-break-before: always;"></div><h1 id="performance-tuning"><a class="header" href="#performance-tuning">Performance Tuning</a></h1>
<p>RpcNet achieves <strong>172,000+ requests/second</strong> with proper configuration. This chapter provides concrete tips and techniques to maximize performance in production deployments.</p>
<h2 id="baseline-performance"><a class="header" href="#baseline-performance">Baseline Performance</a></h2>
<p>Out-of-the-box performance with default settings:</p>
<div class="table-wrapper"><table><thead><tr><th>Metric</th><th>Value</th><th>Notes</th></tr></thead><tbody>
<tr><td><strong>Throughput</strong></td><td>130K-150K RPS</td><td>Single director + 3 workers</td></tr>
<tr><td><strong>Latency (P50)</strong></td><td>0.5-0.8ms</td><td>With efficient connection handling</td></tr>
<tr><td><strong>Latency (P99)</strong></td><td>2-5ms</td><td>Under moderate load</td></tr>
<tr><td><strong>CPU (per node)</strong></td><td>40-60%</td><td>At peak throughput</td></tr>
<tr><td><strong>Memory</strong></td><td>50-100MB</td><td>Per worker node</td></tr>
</tbody></table>
</div>
<p><strong>Target after tuning</strong>: 172K+ RPS, &lt; 0.5ms P50 latency, &lt; 35% CPU</p>
<h2 id="quick-wins"><a class="header" href="#quick-wins">Quick Wins</a></h2>
<h3 id="1-optimize-connection-management"><a class="header" href="#1-optimize-connection-management">1. Optimize Connection Management</a></h3>
<p><strong>Impact</strong>: Significant throughput increase, reduced latency</p>
<pre><pre class="playground"><code class="language-rust"><span class="boring">#![allow(unused)]
</span><span class="boring">fn main() {
</span>use rpcnet::cluster::ClusterClientConfig;

// Use built-in connection optimization
let config = ClusterClientConfig::default();
<span class="boring">}</span></code></pre></pre>
<p><strong>Why it works</strong>:</p>
<ul>
<li>Efficient connection reuse</li>
<li>Reduces handshake overhead</li>
<li>Minimizes connection setup time</li>
</ul>
<h3 id="2-use-least-connections-load-balancing"><a class="header" href="#2-use-least-connections-load-balancing">2. Use Least Connections Load Balancing</a></h3>
<p><strong>Impact</strong>: 15-20% throughput increase under variable load</p>
<pre><pre class="playground"><code class="language-rust"><span class="boring">#![allow(unused)]
</span><span class="boring">fn main() {
</span>use rpcnet::cluster::{WorkerRegistry, LoadBalancingStrategy};

// Before (Round Robin): uneven load distribution
let registry = WorkerRegistry::new(cluster, LoadBalancingStrategy::RoundRobin);

// After (Least Connections): optimal distribution
let registry = WorkerRegistry::new(cluster, LoadBalancingStrategy::LeastConnections);
<span class="boring">}</span></code></pre></pre>
<p><strong>Why it works</strong>:</p>
<ul>
<li>Prevents overloading individual workers</li>
<li>Adapts to actual load in real-time</li>
<li>Handles heterogeneous workers better</li>
</ul>
<h3 id="3-tune-gossip-interval"><a class="header" href="#3-tune-gossip-interval">3. Tune Gossip Interval</a></h3>
<p><strong>Impact</strong>: 10-15% CPU reduction, minimal latency impact</p>
<pre><pre class="playground"><code class="language-rust"><span class="boring">#![allow(unused)]
</span><span class="boring">fn main() {
</span>use rpcnet::cluster::ClusterConfig;

// Before (default 1s): higher CPU
let config = ClusterConfig::default()
    .with_gossip_interval(Duration::from_secs(1));

// After (2s for stable networks): lower CPU
let config = ClusterConfig::default()
    .with_gossip_interval(Duration::from_secs(2));
<span class="boring">}</span></code></pre></pre>
<p><strong>Why it works</strong>:</p>
<ul>
<li>Gossip overhead scales with frequency</li>
<li>Stable networks don't need aggressive gossip</li>
<li>Failure detection still fast enough (4-8s)</li>
</ul>
<h3 id="4-increase-worker-pool-size"><a class="header" href="#4-increase-worker-pool-size">4. Increase Worker Pool Size</a></h3>
<p><strong>Impact</strong>: Linear throughput scaling</p>
<pre><pre class="playground"><code class="language-rust"><span class="boring">#![allow(unused)]
</span><span class="boring">fn main() {
</span>// Before: 3 workers → 150K RPS
// After: 5 workers → 250K+ RPS

// Each worker adds ~50K RPS capacity
<span class="boring">}</span></code></pre></pre>
<p><strong>Guidelines</strong>:</p>
<ul>
<li>Add workers until you hit network/director bottleneck</li>
<li>Monitor director CPU - scale director if &gt; 80%</li>
<li>Ensure network bandwidth sufficient</li>
</ul>
<h2 id="detailed-tuning"><a class="header" href="#detailed-tuning">Detailed Tuning</a></h2>
<h3 id="connection-management-optimization"><a class="header" href="#connection-management-optimization">Connection Management Optimization</a></h3>
<p>RpcNet handles connection management automatically, but you can optimize for your specific use case:</p>
<pre><pre class="playground"><code class="language-rust"><span class="boring">#![allow(unused)]
</span><span class="boring">fn main() {
</span>use rpcnet::cluster::ClusterClientConfig;

// Default configuration is optimized for most use cases
let config = ClusterClientConfig::default();
<span class="boring">}</span></code></pre></pre>
<h3 id="quic-tuning"><a class="header" href="#quic-tuning">QUIC Tuning</a></h3>
<h4 id="stream-limits"><a class="header" href="#stream-limits">Stream Limits</a></h4>
<pre><pre class="playground"><code class="language-rust"><span class="boring">#![allow(unused)]
</span><span class="boring">fn main() {
</span>use rpcnet::ServerConfig;

let config = ServerConfig::builder()
    .with_max_concurrent_streams(100)  // More streams = higher throughput
    .with_max_stream_bandwidth(10 * 1024 * 1024)  // 10 MB/s per stream
    .build();
<span class="boring">}</span></code></pre></pre>
<p><strong>Guidelines</strong>:</p>
<ul>
<li><strong>max_concurrent_streams</strong>: Set to expected concurrent requests + 20%</li>
<li><strong>max_stream_bandwidth</strong>: Set based on your largest message size</li>
</ul>
<h4 id="congestion-control"><a class="header" href="#congestion-control">Congestion Control</a></h4>
<pre><pre class="playground"><code class="language-rust"><span class="boring">#![allow(unused)]
</span><span class="boring">fn main() {
</span>// Aggressive (high-bandwidth networks)
.with_congestion_control(CongestionControl::Cubic)

// Conservative (variable networks)
.with_congestion_control(CongestionControl::NewReno)

// Recommended default
.with_congestion_control(CongestionControl::Bbr)  // Best overall
<span class="boring">}</span></code></pre></pre>
<h3 id="tls-optimization"><a class="header" href="#tls-optimization">TLS Optimization</a></h3>
<h4 id="session-resumption"><a class="header" href="#session-resumption">Session Resumption</a></h4>
<pre><pre class="playground"><code class="language-rust"><span class="boring">#![allow(unused)]
</span><span class="boring">fn main() {
</span>// Enable TLS session tickets for 0-RTT
let config = ServerConfig::builder()
    .with_cert_and_key(cert, key)?
    .with_session_tickets_enabled(true)  // ← Enables 0-RTT
    .build();
<span class="boring">}</span></code></pre></pre>
<p><strong>Impact</strong>: On reconnect, the first request can be sent as 0-RTT data instead of waiting for the 1-RTT QUIC handshake to complete</p>
<h4 id="cipher-suite-selection"><a class="header" href="#cipher-suite-selection">Cipher Suite Selection</a></h4>
<pre><pre class="playground"><code class="language-rust"><span class="boring">#![allow(unused)]
</span><span class="boring">fn main() {
</span>// Prefer fast ciphers (AES-GCM with hardware acceleration)
.with_cipher_suites(&amp;[
    CipherSuite::TLS13_AES_128_GCM_SHA256,  // Fast with AES-NI
    CipherSuite::TLS13_CHACHA20_POLY1305_SHA256,  // Good for ARM
])
<span class="boring">}</span></code></pre></pre>
<h3 id="message-serialization"><a class="header" href="#message-serialization">Message Serialization</a></h3>
<h4 id="use-efficient-formats"><a class="header" href="#use-efficient-formats">Use Efficient Formats</a></h4>
<pre><pre class="playground"><code class="language-rust"><span class="boring">#![allow(unused)]
</span><span class="boring">fn main() {
</span>// Fastest: bincode (binary)
use bincode;
let bytes = bincode::serialize(&amp;data)?;

// Fast: rmp-serde (MessagePack)
use rmp_serde;
let bytes = rmp_serde::to_vec(&amp;data)?;

// Slower: serde_json (human-readable, but slower)
let bytes = serde_json::to_vec(&amp;data)?;
<span class="boring">}</span></code></pre></pre>
<p><strong>Benchmark</strong> (10KB struct):</p>
<div class="table-wrapper"><table><thead><tr><th>Format</th><th>Serialize</th><th>Deserialize</th><th>Size</th></tr></thead><tbody>
<tr><td><strong>bincode</strong></td><td>12 μs</td><td>18 μs</td><td>10240 bytes</td></tr>
<tr><td><strong>MessagePack</strong></td><td>28 μs</td><td>35 μs</td><td>9800 bytes</td></tr>
<tr><td><strong>JSON</strong></td><td>85 μs</td><td>120 μs</td><td>15300 bytes</td></tr>
</tbody></table>
</div>
<h4 id="minimize-allocations"><a class="header" href="#minimize-allocations">Minimize Allocations</a></h4>
<pre><pre class="playground"><code class="language-rust"><span class="boring">#![allow(unused)]
</span><span class="boring">fn main() {
</span>// ❌ Bad: Multiple allocations
fn build_request(id: u64, data: Vec&lt;u8&gt;) -&gt; Request {
    Request {
        id: id.to_string(),  // Allocation
        timestamp: SystemTime::now(),
        payload: format!("data-{}", String::from_utf8_lossy(&amp;data)),  // Multiple allocations
    }
}

// ✅ Good: Reuse buffers
fn build_request(id: u64, data: &amp;[u8], buffer: &amp;mut Vec&lt;u8&gt;) -&gt; Request {
    buffer.clear();
    buffer.extend_from_slice(b"data-");
    buffer.extend_from_slice(data);
    
    Request {
        id,
        timestamp: SystemTime::now(),
        payload: buffer.clone(),  // Single allocation
    }
}
<span class="boring">}</span></code></pre></pre>
<h2 id="platform-specific-optimizations"><a class="header" href="#platform-specific-optimizations">Platform-Specific Optimizations</a></h2>
<h3 id="linux"><a class="header" href="#linux">Linux</a></h3>
<h4 id="udpquic-tuning"><a class="header" href="#udpquic-tuning">UDP/QUIC Tuning</a></h4>
<pre><code class="language-bash"># Increase network buffer sizes
sudo sysctl -w net.core.rmem_max=536870912
sudo sysctl -w net.core.wmem_max=536870912
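# TCP buffer limits (affect TCP traffic only; QUIC's UDP sockets are capped by rmem_max/wmem_max above)
sudo sysctl -w net.ipv4.tcp_rmem='4096 87380 536870912'
sudo sysctl -w net.ipv4.tcp_wmem='4096 87380 536870912'

# Increase the backlog of received packets waiting for kernel processing (absorbs UDP/QUIC bursts)
sudo sysctl -w net.core.netdev_max_backlog=5000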

# Increase connection tracking
sudo sysctl -w net.netfilter.nf_conntrack_max=1000000

# Make permanent: add to /etc/sysctl.conf
</code></pre>
<h4 id="cpu-affinity"><a class="header" href="#cpu-affinity">CPU Affinity</a></h4>
<pre><pre class="playground"><code class="language-rust"><span class="boring">#![allow(unused)]
</span><span class="boring">fn main() {
</span>use core_affinity;

// Pin worker threads to specific CPUs
fn pin_to_core(core_id: usize) {
    let core_ids = core_affinity::get_core_ids().unwrap();
    core_affinity::set_for_current(core_ids[core_id]);
}

// Usage in worker startup
tokio::task::spawn_blocking(|| {
    pin_to_core(0);  // Pin to CPU 0
    // Worker processing logic
});
<span class="boring">}</span></code></pre></pre>
<h3 id="macos"><a class="header" href="#macos">macOS</a></h3>
<h4 id="increase-file-descriptors"><a class="header" href="#increase-file-descriptors">Increase File Descriptors</a></h4>
<pre><code class="language-bash"># Check current limits
ulimit -n

# Increase (temporary)
ulimit -n 65536

# Make permanent: add to ~/.zshrc or ~/.bash_profile
echo "ulimit -n 65536" &gt;&gt; ~/.zshrc
</code></pre>
<h3 id="profiling-and-monitoring"><a class="header" href="#profiling-and-monitoring">Profiling and Monitoring</a></h3>
<h4 id="cpu-profiling"><a class="header" href="#cpu-profiling">CPU Profiling</a></h4>
<pre><code class="language-bash"># Install perf (Linux)
sudo apt install linux-tools-common linux-tools-generic

# Profile RpcNet application
sudo perf record -F 99 -a -g -- cargo run --release --bin worker
sudo perf report

# Identify hot paths and optimize
</code></pre>
<h4 id="memory-profiling"><a class="header" href="#memory-profiling">Memory Profiling</a></h4>
<pre><code class="language-bash"># Use valgrind for memory analysis
cargo build --release
valgrind --tool=massif --massif-out-file=massif.out ./target/release/worker

# Print a text report with ms_print (or open massif.out in massif-visualizer for a GUI view)
ms_print massif.out
</code></pre>
<h4 id="tokio-console"><a class="header" href="#tokio-console">Tokio Console</a></h4>
<pre><code class="language-toml"># Add to Cargo.toml
[dependencies]
console-subscriber = "0.2"
</code></pre>
<pre><pre class="playground"><code class="language-rust"><span class="boring">#![allow(unused)]
</span><span class="boring">fn main() {
</span>// In main.rs
console_subscriber::init();

// Run application and connect with tokio-console
// cargo install tokio-console
// tokio-console
<span class="boring">}</span></code></pre></pre>
<h2 id="benchmarking"><a class="header" href="#benchmarking">Benchmarking</a></h2>
<h3 id="throughput-test"><a class="header" href="#throughput-test">Throughput Test</a></h3>
<pre><pre class="playground"><code class="language-rust"><span class="boring">#![allow(unused)]
</span><span class="boring">fn main() {
</span>use std::time::Instant;

async fn benchmark_throughput(client: Arc&lt;ClusterClient&gt;, duration_secs: u64) {
    let start = Instant::now();
    let mut count = 0;
    
    while start.elapsed().as_secs() &lt; duration_secs {
        match client.call_worker("compute", vec![], Some("role=worker")).await {
            Ok(_) =&gt; count += 1,
            Err(e) =&gt; eprintln!("Request failed: {}", e),
        }
    }
    
    let elapsed = start.elapsed().as_secs_f64();
    let rps = count as f64 / elapsed;
    
    println!("Throughput: {:.0} requests/second", rps);
    println!("Total requests: {}", count);
    println!("Duration: {:.2}s", elapsed);
}
<span class="boring">}</span></code></pre></pre>
<h3 id="latency-test"><a class="header" href="#latency-test">Latency Test</a></h3>
<pre><pre class="playground"><code class="language-rust"><span class="boring">#![allow(unused)]
</span><span class="boring">fn main() {
</span>use hdrhistogram::Histogram;

async fn benchmark_latency(client: Arc&lt;ClusterClient&gt;, num_requests: usize) {
    let mut histogram = Histogram::&lt;u64&gt;::new(3).unwrap();
    
    for _ in 0..num_requests {
        let start = Instant::now();
        let _ = client.call_worker("compute", vec![], Some("role=worker")).await;
        let latency_us = start.elapsed().as_micros() as u64;
        histogram.record(latency_us).unwrap();
    }
    
    println!("Latency percentiles (μs):");
    println!("  P50:  {}", histogram.value_at_quantile(0.50));
    println!("  P90:  {}", histogram.value_at_quantile(0.90));
    println!("  P99:  {}", histogram.value_at_quantile(0.99));
    println!("  P99.9: {}", histogram.value_at_quantile(0.999));
    println!("  Max:  {}", histogram.max());
}
<span class="boring">}</span></code></pre></pre>
<h3 id="load-test-script"><a class="header" href="#load-test-script">Load Test Script</a></h3>
<pre><pre class="playground"><code class="language-rust"><span class="boring">#![allow(unused)]
</span><span class="boring">fn main() {
</span>// Concurrent load test
async fn load_test(
    client: Arc&lt;ClusterClient&gt;,
    num_concurrent: usize,
    requests_per_task: usize,
) {
    let start = Instant::now();
    
    let tasks: Vec&lt;_&gt; = (0..num_concurrent)
        .map(|_| {
            let client = client.clone();
            tokio::spawn(async move {
                for _ in 0..requests_per_task {
                    let _ = client.call_worker("compute", vec![], Some("role=worker")).await;
                }
            })
        })
        .collect();
    
    for task in tasks {
        task.await.unwrap();
    }
    
    let elapsed = start.elapsed().as_secs_f64();
    let total_requests = num_concurrent * requests_per_task;
    let rps = total_requests as f64 / elapsed;
    
    println!("Load test results:");
    println!("  Concurrency: {}", num_concurrent);
    println!("  Total requests: {}", total_requests);
    println!("  Duration: {:.2}s", elapsed);
    println!("  Throughput: {:.0} RPS", rps);
}
<span class="boring">}</span></code></pre></pre>
<h2 id="performance-checklist"><a class="header" href="#performance-checklist">Performance Checklist</a></h2>
<h3 id="before-production"><a class="header" href="#before-production">Before Production</a></h3>
<ul>
<li><input disabled="" type="checkbox"/>
Use default connection management (already optimized)</li>
<li><input disabled="" type="checkbox"/>
Use Least Connections load balancing</li>
<li><input disabled="" type="checkbox"/>
Tune gossip interval for your network</li>
<li><input disabled="" type="checkbox"/>
Configure QUIC stream limits</li>
<li><input disabled="" type="checkbox"/>
Enable TLS session resumption</li>
<li><input disabled="" type="checkbox"/>
Profile with release build (<code>--release</code>)</li>
<li><input disabled="" type="checkbox"/>
Test under expected peak load</li>
<li><input disabled="" type="checkbox"/>
Monitor CPU, memory, network utilization</li>
<li><input disabled="" type="checkbox"/>
Set up latency tracking (P50, P99, P99.9)</li>
<li><input disabled="" type="checkbox"/>
Configure OS-level network tuning</li>
</ul>
<h3 id="monitoring-in-production"><a class="header" href="#monitoring-in-production">Monitoring in Production</a></h3>
<pre><pre class="playground"><code class="language-rust"><span class="boring">#![allow(unused)]
</span><span class="boring">fn main() {
</span>// Essential metrics to track
metrics::gauge!("rpc.throughput_rps", current_rps);
metrics::gauge!("rpc.latency_p50_us", latency_p50);
metrics::gauge!("rpc.latency_p99_us", latency_p99);
metrics::gauge!("rpc.cpu_usage_pct", cpu_usage);
metrics::gauge!("rpc.memory_mb", memory_mb);
metrics::gauge!("pool.hit_rate", pool_hit_rate);
metrics::gauge!("cluster.healthy_workers", healthy_count);
<span class="boring">}</span></code></pre></pre>
<h2 id="troubleshooting-performance-issues"><a class="header" href="#troubleshooting-performance-issues">Troubleshooting Performance Issues</a></h2>
<h3 id="high-latency"><a class="header" href="#high-latency">High Latency</a></h3>
<p><strong>Symptoms</strong>: P99 latency &gt; 10ms</p>
<p><strong>Debug</strong>:</p>
<pre><pre class="playground"><code class="language-rust"><span class="boring">#![allow(unused)]
</span><span class="boring">fn main() {
</span>// Add timing to identify bottleneck
let start = Instant::now();

let select_time = Instant::now();
let worker = registry.select_worker(Some("role=worker")).await?;
println!("Worker selection: {:?}", select_time.elapsed());

let connect_time = Instant::now();
let conn = pool.get_or_connect(worker.addr).await?;
println!("Connection: {:?}", connect_time.elapsed());

let call_time = Instant::now();
let result = conn.call("compute", data).await?;
println!("RPC call: {:?}", call_time.elapsed());

println!("Total: {:?}", start.elapsed());
<span class="boring">}</span></code></pre></pre>
<p><strong>Common causes</strong>:</p>
<ul>
<li>Connection management issues (check network configuration)</li>
<li>Slow workers (check worker CPU/memory)</li>
<li>Network latency (move closer or add local workers)</li>
</ul>
<h3 id="low-throughput"><a class="header" href="#low-throughput">Low Throughput</a></h3>
<p><strong>Symptoms</strong>: &lt; 100K RPS with multiple workers</p>
<p><strong>Debug</strong>:</p>
<pre><pre class="playground"><code class="language-rust"><span class="boring">#![allow(unused)]
</span><span class="boring">fn main() {
</span>// Check bottlenecks
println!("Pool metrics: {:?}", pool.metrics());
println!("Worker count: {}", registry.worker_count().await);
println!("Active connections: {}", pool.active_connections());
<span class="boring">}</span></code></pre></pre>
<p><strong>Common causes</strong>:</p>
<ul>
<li>Too few workers (add more)</li>
<li>Network connectivity issues (check network configuration)</li>
<li>Director CPU saturated (scale director)</li>
<li>Network bandwidth limit (upgrade network)</li>
</ul>
<h3 id="high-cpu-usage"><a class="header" href="#high-cpu-usage">High CPU Usage</a></h3>
<p><strong>Symptoms</strong>: &gt; 80% CPU at low load</p>
<p><strong>Debug</strong>:</p>
<pre><code class="language-bash"># Profile with perf
sudo perf record -F 99 -a -g -- cargo run --release
sudo perf report

# Look for hot functions
</code></pre>
<p><strong>Common causes</strong>:</p>
<ul>
<li>Too frequent gossip (increase interval)</li>
<li>Excessive serialization (optimize message format)</li>
<li>Inefficient connection handling (use latest RpcNet version)</li>
<li>Debug build instead of release</li>
</ul>
<h2 id="real-world-results"><a class="header" href="#real-world-results">Real-World Results</a></h2>
<h3 id="case-study-video-transcoding-cluster"><a class="header" href="#case-study-video-transcoding-cluster">Case Study: Video Transcoding Cluster</a></h3>
<p><strong>Setup</strong>:</p>
<ul>
<li>1 director</li>
<li>10 GPU workers</li>
<li>1000 concurrent clients</li>
</ul>
<p><strong>Before tuning</strong>: 45K RPS, 15ms P99 latency<br />
<strong>After tuning</strong>: 180K RPS, 2ms P99 latency</p>
<p><strong>Changes</strong>:</p>
<ol>
<li>Used optimized connection management</li>
<li>Tuned gossip interval (1s → 2s)</li>
<li>Used Least Connections strategy</li>
<li>Optimized message serialization (JSON → bincode)</li>
</ol>
<h2 id="next-steps-7"><a class="header" href="#next-steps-7">Next Steps</a></h2>
<ul>
<li><strong><a href="advanced/production.html">Production Guide</a></strong> - Deploy optimized clusters</li>
<li><strong><a href="advanced/../cluster/load-balancing.html">Load Balancing</a></strong> - Strategy selection</li>
</ul>
<h2 id="references-4"><a class="header" href="#references-4">References</a></h2>
<ul>
<li><a href="https://datatracker.ietf.org/doc/html/rfc9000">QUIC: A UDP-Based Multiplexed and Secure Transport (RFC 9000)</a> - Protocol specification</li>
<li><a href="https://wwwx.cs.unc.edu/~sparkst/howto/network_tuning.php">Linux Network Tuning</a> - OS-level tuning</li>
<li><a href="https://tokio.rs/tokio/topics/performance">Tokio Performance</a> - Async runtime tips</li>
</ul>
<div style="break-before: page; page-break-before: always;"></div><h1 id="production-deployment"><a class="header" href="#production-deployment">Production Deployment</a></h1>
<p>This guide covers best practices for deploying RpcNet clusters in production environments, including security, monitoring, high availability, and operational procedures.</p>
<h2 id="architecture-patterns"><a class="header" href="#architecture-patterns">Architecture Patterns</a></h2>
<h3 id="1-basic-production-setup"><a class="header" href="#1-basic-production-setup">1. Basic Production Setup</a></h3>
<p>Minimum viable production deployment:</p>
<pre><code>                    Load Balancer (L4)
                           |
              ┌────────────┼────────────┐
              │            │            │
         ┌────▼───┐   ┌────▼───┐   ┌────▼───┐
         │Director│   │Director│   │Director│  (3+ for HA)
         │  (HA)  │   │  (HA)  │   │  (HA)  │
         └────┬───┘   └────┬───┘   └────┬───┘
              │            │            │
      ┌───────┴────────────┴────────────┴───────┐
      │                                          │
  ┌───▼────┐  ┌────────┐  ┌────────┐  ┌────────▼┐
  │Worker 1│  │Worker 2│  │Worker 3│  │Worker N │
  └────────┘  └────────┘  └────────┘  └─────────┘
</code></pre>
<p><strong>Components</strong>:</p>
<ul>
<li><strong>Load Balancer</strong>: Routes clients to healthy directors</li>
<li><strong>Directors (3+)</strong>: Coordinator nodes in HA configuration</li>
<li><strong>Workers (N)</strong>: Processing nodes, scale horizontally</li>
</ul>
<h3 id="2-multi-region-setup"><a class="header" href="#2-multi-region-setup">2. Multi-Region Setup</a></h3>
<p>For global deployments:</p>
<pre><code>        Region US-EAST              Region EU-WEST
┌──────────────────────────┐  ┌──────────────────────────┐
│   Director Cluster (3)   │  │   Director Cluster (3)   │
│   Worker Pool (10+)      │  │   Worker Pool (10+)      │
└──────────┬───────────────┘  └───────────┬──────────────┘
           │                               │
           └───────────┬───────────────────┘
                       │
                 Cross-region
                 Gossip Protocol
                 (optional coordination)
</code></pre>
<p><strong>Benefits</strong>:</p>
<ul>
<li>Lower latency for regional clients</li>
<li>Fault isolation (region failure doesn't affect others)</li>
<li>Regulatory compliance (data locality)</li>
</ul>
<h3 id="3-hybrid-edge-deployment"><a class="header" href="#3-hybrid-edge-deployment">3. Hybrid Edge Deployment</a></h3>
<p>For edge computing scenarios:</p>
<pre><code>              Cloud (Central)
         ┌─────────────────────┐
         │  Director Cluster   │
         │  Worker Pool        │
         └──────────┬──────────┘
                    │
         ┌──────────┼──────────┐
         │          │          │
    ┌────▼───┐ ┌───▼────┐ ┌───▼────┐
    │ Edge 1 │ │ Edge 2 │ │ Edge 3 │
    │Workers │ │Workers │ │Workers │
    └────────┘ └────────┘ └────────┘
</code></pre>
<p><strong>Use cases</strong>:</p>
<ul>
<li>IoT workloads</li>
<li>Low-latency requirements</li>
<li>Bandwidth optimization</li>
</ul>
<h2 id="security"><a class="header" href="#security">Security</a></h2>
<h3 id="tls-configuration"><a class="header" href="#tls-configuration">TLS Configuration</a></h3>
<h4 id="production-certificates"><a class="header" href="#production-certificates">Production Certificates</a></h4>
<pre><pre class="playground"><code class="language-rust"><span class="boring">#![allow(unused)]
</span><span class="boring">fn main() {
</span>// ❌ Bad: Self-signed certificates
let cert = std::fs::read("self_signed.pem")?;

// ✅ Good: Proper CA-signed certificates
let cert = std::fs::read("/etc/rpcnet/certs/server.crt")?;
let key = std::fs::read("/etc/rpcnet/certs/server.key")?;
let ca = std::fs::read("/etc/rpcnet/certs/ca.crt")?;

let config = ServerConfig::builder()
    .with_cert_and_key(cert, key)?
    .with_ca_cert(ca)?  // Verify clients
    .build();
<span class="boring">}</span></code></pre></pre>
<h4 id="certificate-rotation"><a class="header" href="#certificate-rotation">Certificate Rotation</a></h4>
<pre><pre class="playground"><code class="language-rust"><span class="boring">#![allow(unused)]
</span><span class="boring">fn main() {
</span>use tokio::time::{interval, Duration};

async fn rotate_certificates(server: Arc&lt;Server&gt;) -&gt; Result&lt;()&gt; {
    let mut check_interval = interval(Duration::from_secs(3600)); // Check hourly
    
    loop {
        check_interval.tick().await;
        
        // Check certificate expiry
        if certificate_expires_soon("/etc/rpcnet/certs/server.crt", 30).await? {
            log::warn!("Certificate expiring soon, rotating...");
            
            // Load new certificate
            let new_cert = std::fs::read("/etc/rpcnet/certs/server.crt.new")?;
            let new_key = std::fs::read("/etc/rpcnet/certs/server.key.new")?;
            
            // Hot-reload without downtime
            server.reload_certificate(new_cert, new_key).await?;
            
            log::info!("Certificate rotated successfully");
        }
    }
}
<span class="boring">}</span></code></pre></pre>
<h3 id="authentication--authorization"><a class="header" href="#authentication--authorization">Authentication &amp; Authorization</a></h3>
<pre><pre class="playground"><code class="language-rust"><span class="boring">#![allow(unused)]
</span><span class="boring">fn main() {
</span>#[rpc_trait]
pub trait SecureService {
    async fn process(&amp;self, auth_token: String, data: Vec&lt;u8&gt;) -&gt; Result&lt;Response&gt;;
}

#[rpc_impl]
impl SecureService for Handler {
    async fn process(&amp;self, auth_token: String, data: Vec&lt;u8&gt;) -&gt; Result&lt;Response&gt; {
        // Verify token
        let claims = verify_jwt(&amp;auth_token)?;
        
        // Check permissions
        if !claims.has_permission("compute:execute") {
            return Err(anyhow::anyhow!("Insufficient permissions"));
        }
        
        // Process request
        Ok(self.do_process(data).await?)
    }
}
<span class="boring">}</span></code></pre></pre>
<h3 id="network-segmentation"><a class="header" href="#network-segmentation">Network Segmentation</a></h3>
<pre><code>┌─────────────────────────────────────────────────────┐
│                 Public Network                       │
│  (Clients, Load Balancer)                           │
└────────────────────┬────────────────────────────────┘
                     │ Firewall
┌────────────────────▼────────────────────────────────┐
│             Management Network                       │
│  (Directors, Monitoring, Logging)                   │
└────────────────────┬────────────────────────────────┘
                     │ Firewall
┌────────────────────▼────────────────────────────────┐
│              Worker Network                          │
│  (Workers, Internal Communication)                  │
└─────────────────────────────────────────────────────┘
</code></pre>
<p><strong>Firewall Rules</strong>:</p>
<pre><code class="language-bash"># Public → Management: Only load balancer ports
iptables -A FORWARD -i public -o management -p tcp --dport 8080 -j ACCEPT

# Management → Workers: Full access
iptables -A FORWARD -i management -o workers -j ACCEPT

# Workers → Workers: Gossip protocol
iptables -A FORWARD -i workers -o workers -p udp --dport 7946 -j ACCEPT
</code></pre>
<h2 id="monitoring"><a class="header" href="#monitoring">Monitoring</a></h2>
<h3 id="essential-metrics"><a class="header" href="#essential-metrics">Essential Metrics</a></h3>
<pre><pre class="playground"><code class="language-rust"><span class="boring">#![allow(unused)]
</span><span class="boring">fn main() {
</span>use prometheus::{register_gauge, register_counter, register_histogram};

// Throughput
let request_counter = register_counter!("rpc_requests_total", "Total RPC requests");
request_counter.inc();

// Latency
let latency_histogram = register_histogram!(
    "rpc_latency_seconds",
    "RPC latency distribution",
    vec![0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1.0]
);
latency_histogram.observe(duration.as_secs_f64());

// Health
let healthy_workers = register_gauge!("cluster_healthy_workers", "Number of healthy workers");
healthy_workers.set(registry.healthy_count().await as f64);

// Errors
let error_counter = prometheus::register_counter_vec!("rpc_errors_total", "Total RPC errors", &amp;["type"]);
error_counter.with_label_values(&amp;["timeout"]).inc();
<span class="boring">}</span></code></pre></pre>
<h3 id="prometheus-integration"><a class="header" href="#prometheus-integration">Prometheus Integration</a></h3>
<pre><pre class="playground"><code class="language-rust"><span class="boring">#![allow(unused)]
</span><span class="boring">fn main() {
</span>use prometheus::{Encoder, TextEncoder};
use warp::Filter;

async fn start_metrics_server() {
    let metrics_route = warp::path!("metrics").map(|| {
        let encoder = TextEncoder::new();
        let metric_families = prometheus::gather();
        let mut buffer = vec![];
        encoder.encode(&amp;metric_families, &amp;mut buffer).unwrap();
        
        warp::reply::with_header(
            buffer,
            "Content-Type",
            "text/plain; charset=utf-8",
        )
    });
    
    warp::serve(metrics_route)
        .run(([0, 0, 0, 0], 9090))
        .await;
}
<span class="boring">}</span></code></pre></pre>
<p><strong>Prometheus config</strong> (<code>prometheus.yml</code>):</p>
<pre><code class="language-yaml">scrape_configs:
  - job_name: 'rpcnet_directors'
    static_configs:
      - targets: ['director-1:9090', 'director-2:9090', 'director-3:9090']
  
  - job_name: 'rpcnet_workers'
    static_configs:
      - targets: ['worker-1:9090', 'worker-2:9090', 'worker-3:9090']
</code></pre>
<h3 id="grafana-dashboards"><a class="header" href="#grafana-dashboards">Grafana Dashboards</a></h3>
<p><strong>Key panels</strong>:</p>
<ol>
<li><strong>Throughput</strong>: <code>rate(rpc_requests_total[1m])</code></li>
<li><strong>Latency P99</strong>: <code>histogram_quantile(0.99, sum by (le) (rate(rpc_latency_seconds_bucket[5m])))</code></li>
<li><strong>Error Rate</strong>: <code>rate(rpc_errors_total[1m])</code></li>
<li><strong>Worker Health</strong>: <code>cluster_healthy_workers</code></li>
</ol>
<h3 id="alerting"><a class="header" href="#alerting">Alerting</a></h3>
<pre><code class="language-yaml"># alerts.yml
groups:
  - name: rpcnet
    interval: 30s
    rules:
      - alert: HighErrorRate
        expr: rate(rpc_errors_total[5m]) &gt; 0.05
        for: 2m
        annotations:
          summary: "High RPC error rate detected"
      
      - alert: LowWorkerCount
        expr: cluster_healthy_workers &lt; 3
        for: 1m
        annotations:
          summary: "Fewer than 3 healthy workers available"
      
      - alert: HighLatency
        expr: histogram_quantile(0.99, sum by (le) (rate(rpc_latency_seconds_bucket[5m]))) &gt; 0.1
        for: 5m
        annotations:
          summary: "P99 latency above 100ms"
</code></pre>
<h2 id="logging"><a class="header" href="#logging">Logging</a></h2>
<h3 id="structured-logging"><a class="header" href="#structured-logging">Structured Logging</a></h3>
<pre><pre class="playground"><code class="language-rust"><span class="boring">#![allow(unused)]
</span><span class="boring">fn main() {
</span>use tracing::{info, warn, error, instrument};

#[instrument(skip(data))]
async fn process_request(request_id: Uuid, worker_id: Uuid, data: Vec&lt;u8&gt;) -&gt; Result&lt;Response&gt; {
    info!(
        request_id = %request_id,
        worker_id = %worker_id,
        data_size = data.len(),
        "Processing request"
    );
    
    match worker.call("compute", data).await {
        Ok(response) =&gt; {
            info!(
                request_id = %request_id,
                worker_id = %worker_id,
                response_size = response.len(),
                "Request completed"
            );
            Ok(response)
        }
        Err(e) =&gt; {
            error!(
                request_id = %request_id,
                worker_id = %worker_id,
                error = %e,
                "Request failed"
            );
            Err(e)
        }
    }
}
<span class="boring">}</span></code></pre></pre>
<h3 id="log-aggregation"><a class="header" href="#log-aggregation">Log Aggregation</a></h3>
<p><strong>Fluentd config</strong> (<code>fluent.conf</code>):</p>
<pre><code>&lt;source&gt;
  @type forward
  port 24224
&lt;/source&gt;

&lt;match rpcnet.**&gt;
  @type elasticsearch
  host elasticsearch.example.com
  port 9200
  index_name rpcnet
  type_name logs
&lt;/match&gt;
</code></pre>
<h2 id="high-availability"><a class="header" href="#high-availability">High Availability</a></h2>
<h3 id="director-ha-setup"><a class="header" href="#director-ha-setup">Director HA Setup</a></h3>
<pre><pre class="playground"><code class="language-rust"><span class="boring">#![allow(unused)]
</span><span class="boring">fn main() {
</span>// Each director is identical, configured via environment
let director_id = Uuid::new_v4();
let cluster_config = ClusterConfig::default()
    .with_bind_addr(env::var("BIND_ADDR")?.parse()?)
    .with_seeds(parse_seeds(&amp;env::var("SEED_NODES")?)?);

let cluster = server.enable_cluster(cluster_config).await?;

// Tag as director
cluster.set_tag("role", "director");
cluster.set_tag("id", &amp;director_id.to_string());

// All directors operate identically, clients can use any one
<span class="boring">}</span></code></pre></pre>
<h3 id="graceful-shutdown-1"><a class="header" href="#graceful-shutdown-1">Graceful Shutdown</a></h3>
<pre><pre class="playground"><code class="language-rust"><span class="boring">#![allow(unused)]
</span><span class="boring">fn main() {
</span>use tokio::signal;

async fn run_server(mut server: Server) -&gt; Result&lt;()&gt; {
    // Spawn server task
    let server_handle = tokio::spawn(async move {
        server.run().await
    });
    
    // Wait for shutdown signal
    signal::ctrl_c().await?;
    
    log::info!("Shutdown signal received, gracefully shutting down...");
    
    // 1. Stop accepting new connections
    server.stop_accepting().await;
    
    // 2. Wait for in-flight requests (with timeout)
    tokio::time::timeout(
        Duration::from_secs(30),
        server.wait_for_in_flight()
    ).await?;
    
    // 3. Leave cluster gracefully
    cluster.leave().await?;
    
    // 4. Close connections
    server.shutdown().await?;
    
    log::info!("Shutdown complete");
    Ok(())
}
<span class="boring">}</span></code></pre></pre>
<h3 id="health-checks"><a class="header" href="#health-checks">Health Checks</a></h3>
<pre><pre class="playground"><code class="language-rust"><span class="boring">#![allow(unused)]
</span><span class="boring">fn main() {
</span>#[rpc_trait]
pub trait HealthService {
    async fn health(&amp;self) -&gt; Result&lt;HealthStatus&gt;;
    async fn ready(&amp;self) -&gt; Result&lt;ReadyStatus&gt;;
}

#[derive(Serialize, Deserialize)]
pub struct HealthStatus {
    pub healthy: bool,
    pub version: String,
    pub uptime_secs: u64,
}

#[derive(Serialize, Deserialize)]
pub struct ReadyStatus {
    pub ready: bool,
    pub workers_available: usize,
    pub cluster_size: usize,
}

#[rpc_impl]
impl HealthService for Handler {
    async fn health(&amp;self) -&gt; Result&lt;HealthStatus&gt; {
        Ok(HealthStatus {
            healthy: true,
            version: env!("CARGO_PKG_VERSION").to_string(),
            uptime_secs: self.start_time.elapsed().as_secs(),
        })
    }
    
    async fn ready(&amp;self) -&gt; Result&lt;ReadyStatus&gt; {
        let workers = self.registry.worker_count().await;
        let cluster_size = self.cluster.node_count().await;
        
        Ok(ReadyStatus {
            ready: workers &gt; 0,
            workers_available: workers,
            cluster_size,
        })
    }
}
<span class="boring">}</span></code></pre></pre>
<p><strong>Kubernetes probes</strong>:</p>
<pre><code class="language-yaml">livenessProbe:
  exec:
    command:
    - /usr/local/bin/health-check
    - --endpoint=health
  initialDelaySeconds: 10
  periodSeconds: 10

readinessProbe:
  exec:
    command:
    - /usr/local/bin/health-check
    - --endpoint=ready
  initialDelaySeconds: 5
  periodSeconds: 5
</code></pre>
<h2 id="deployment"><a class="header" href="#deployment">Deployment</a></h2>
<h3 id="docker"><a class="header" href="#docker">Docker</a></h3>
<p><strong>Dockerfile</strong>:</p>
<pre><code class="language-dockerfile">FROM rust:1.75 as builder

WORKDIR /app
COPY Cargo.toml Cargo.lock ./
COPY src ./src

RUN cargo build --release

FROM debian:bookworm-slim

RUN apt-get update &amp;&amp; apt-get install -y \
    ca-certificates \
    &amp;&amp; rm -rf /var/lib/apt/lists/*

COPY --from=builder /app/target/release/director /usr/local/bin/
COPY --from=builder /app/target/release/worker /usr/local/bin/

# Expose ports
EXPOSE 8080 7946/udp

CMD ["director"]
</code></pre>
<p><strong>Docker Compose</strong> (<code>docker-compose.yml</code>):</p>
<pre><code class="language-yaml">version: '3.8'

services:
  director-1:
    image: rpcnet:latest
    command: director
    environment:
      - DIRECTOR_ADDR=0.0.0.0:8080
      - RUST_LOG=info
    ports:
      - "8080:8080"
      - "7946:7946/udp"
  
  worker-1:
    image: rpcnet:latest
    command: worker
    environment:
      - WORKER_LABEL=worker-1
      - WORKER_ADDR=0.0.0.0:8081
      - DIRECTOR_ADDR=director-1:8080
      - RUST_LOG=info
    depends_on:
      - director-1
</code></pre>
<h3 id="kubernetes"><a class="header" href="#kubernetes">Kubernetes</a></h3>
<p><strong>Deployment</strong> (<code>director-deployment.yaml</code>):</p>
<pre><code class="language-yaml">apiVersion: apps/v1
kind: Deployment
metadata:
  name: rpcnet-director
spec:
  replicas: 3
  selector:
    matchLabels:
      app: rpcnet-director
  template:
    metadata:
      labels:
        app: rpcnet-director
    spec:
      containers:
      - name: director
        image: rpcnet:latest
        command: ["director"]
        env:
        - name: DIRECTOR_ADDR
          value: "0.0.0.0:8080"
        - name: RUST_LOG
          value: "info"
        ports:
        - containerPort: 8080
          name: rpc
        - containerPort: 7946
          name: gossip
          protocol: UDP
        resources:
          requests:
            memory: "256Mi"
            cpu: "500m"
          limits:
            memory: "512Mi"
            cpu: "1000m"
</code></pre>
<p><strong>Service</strong> (<code>director-service.yaml</code>):</p>
<pre><code class="language-yaml">apiVersion: v1
kind: Service
metadata:
  name: rpcnet-director
spec:
  type: LoadBalancer
  selector:
    app: rpcnet-director
  ports:
  - name: rpc
    port: 8080
    targetPort: 8080
  - name: gossip
    port: 7946
    targetPort: 7946
    protocol: UDP
</code></pre>
<p><strong>HorizontalPodAutoscaler</strong>:</p>
<pre><code class="language-yaml">apiVersion: autoscaling/v2
kind: HorizontalPodAutoscaler
metadata:
  name: rpcnet-worker-hpa
spec:
  scaleTargetRef:
    apiVersion: apps/v1
    kind: Deployment
    name: rpcnet-worker
  minReplicas: 3
  maxReplicas: 20
  metrics:
  - type: Resource
    resource:
      name: cpu
      target:
        type: Utilization
        averageUtilization: 70
</code></pre>
<h2 id="configuration-management"><a class="header" href="#configuration-management">Configuration Management</a></h2>
<h3 id="environment-based-config"><a class="header" href="#environment-based-config">Environment-Based Config</a></h3>
<pre><pre class="playground"><code class="language-rust"><span class="boring">#![allow(unused)]
</span><span class="boring">fn main() {
</span>use config::{Config, Environment, File};

#[derive(Debug, Deserialize)]
struct Settings {
    server: ServerSettings,
    cluster: ClusterSettings,
    monitoring: MonitoringSettings,
}

#[derive(Debug, Deserialize)]
struct ServerSettings {
    bind_addr: String,
    cert_path: String,
    key_path: String,
}

/// Builds the layered application settings.
///
/// Sources are applied in order, later ones overriding earlier ones:
/// `config/default`, then the optional `config/&lt;ENV&gt;` file, then
/// `RPCNET`-prefixed environment variables.
fn load_config() -&gt; Result&lt;Settings&gt; {
    // Read ENV when the process starts. `env!` is expanded at compile time, so
    // it would bake in the build machine's value and fail the build if unset.
    let env_name = std::env::var("ENV").unwrap_or_else(|_| "development".to_string());

    let settings = Config::builder()
        // Default config
        .add_source(File::with_name("config/default"))
        // Environment-specific config (optional)
        .add_source(File::with_name(&amp;format!("config/{}", env_name)).required(false))
        // Environment variables (override)
        .add_source(Environment::with_prefix("RPCNET"))
        .build()?;

    // `?` converts the deserialization error into this function's error type.
    Ok(settings.try_deserialize()?)
}
<span class="boring">}</span></code></pre></pre>
<h3 id="secret-management"><a class="header" href="#secret-management">Secret Management</a></h3>
<pre><pre class="playground"><code class="language-rust"><span class="boring">#![allow(unused)]
</span><span class="boring">fn main() {
</span>use aws_sdk_secretsmanager::Client as SecretsClient;

async fn load_tls_certs_from_secrets() -&gt; Result&lt;(Vec&lt;u8&gt;, Vec&lt;u8&gt;)&gt; {
    let config = aws_config::load_from_env().await;
    let client = SecretsClient::new(&amp;config);
    
    // Load certificate
    let cert_secret = client
        .get_secret_value()
        .secret_id("rpcnet/production/tls_cert")
        .send()
        .await?;
    let cert = cert_secret.secret_binary().unwrap().as_ref().to_vec();
    
    // Load key
    let key_secret = client
        .get_secret_value()
        .secret_id("rpcnet/production/tls_key")
        .send()
        .await?;
    let key = key_secret.secret_binary().unwrap().as_ref().to_vec();
    
    Ok((cert, key))
}
<span class="boring">}</span></code></pre></pre>
<h2 id="operational-procedures"><a class="header" href="#operational-procedures">Operational Procedures</a></h2>
<h3 id="rolling-updates"><a class="header" href="#rolling-updates">Rolling Updates</a></h3>
<pre><code class="language-bash">#!/bin/bash
# Rolling update script for workers

WORKERS=("worker-1" "worker-2" "worker-3" "worker-4")

for worker in "${WORKERS[@]}"; do
    echo "Updating $worker..."
    
    # Gracefully shutdown worker
    kubectl exec $worker -- kill -SIGTERM 1
    
    # Wait for worker to leave cluster
    sleep 10
    
    # Update image
    kubectl set image deployment/rpcnet-worker worker=rpcnet:new-version
    
    # Wait for new pod to be ready
    kubectl wait --for=condition=ready pod -l app=$worker --timeout=60s
    
    # Verify worker joined cluster
    kubectl exec director-1 -- check-worker-registered $worker
    
    echo "$worker updated successfully"
done
</code></pre>
<h3 id="backup-and-restore"><a class="header" href="#backup-and-restore">Backup and Restore</a></h3>
<pre><pre class="playground"><code class="language-rust"><span class="boring">#![allow(unused)]
</span><span class="boring">fn main() {
</span>// Backup cluster state (metadata only, not data)
/// Serializes the current node list plus a timestamp as JSON and writes it to
/// `/backup/cluster_state.json`.
async fn backup_cluster_state(cluster: Arc&lt;ClusterMembership&gt;) -&gt; Result&lt;()&gt; {
    let state = ClusterState {
        nodes: cluster.nodes().await,
        timestamp: SystemTime::now(),
    };

    let backup = serde_json::to_vec(&amp;state)?;
    // Use Tokio's async file API: `std::fs::write` would block the executor
    // thread for the duration of the disk write.
    tokio::fs::write("/backup/cluster_state.json", backup).await?;

    Ok(())
}

// Restore from backup (for disaster recovery)
/// Reads the JSON backup at `path` and deserializes it into a `ClusterState`.
async fn restore_cluster_state(path: &amp;str) -&gt; Result&lt;ClusterState&gt; {
    // Async read so the executor thread is not blocked on disk I/O.
    let backup = tokio::fs::read(path).await?;
    let state: ClusterState = serde_json::from_slice(&amp;backup)?;
    Ok(state)
}
<span class="boring">}</span></code></pre></pre>
<h3 id="runbooks"><a class="header" href="#runbooks">Runbooks</a></h3>
<p><strong>Worker Node Failure</strong>:</p>
<ol>
<li>Verify failure: <code>kubectl get pods | grep worker</code></li>
<li>Check logs: <code>kubectl logs &lt;worker-pod&gt;</code></li>
<li>If recoverable: <code>kubectl delete pod &lt;worker-pod&gt;</code> (auto-restarts)</li>
<li>If not: Investigate root cause, fix, redeploy</li>
<li>Verify cluster health: <code>kubectl exec director-1 -- cluster-health</code></li>
</ol>
<p><strong>High Latency</strong>:</p>
<ol>
<li>Check Grafana: Identify which nodes have high latency</li>
<li>SSH to affected nodes: <code>ssh worker-5</code></li>
<li>Check CPU/memory: <code>top</code>, <code>free -h</code></li>
<li>Check network: <code>netstat -s</code>, <code>iftop</code></li>
<li>Review logs: <code>journalctl -u rpcnet-worker -n 1000</code></li>
<li>If needed: Scale up workers or restart affected nodes</li>
</ol>
<h2 id="cost-optimization"><a class="header" href="#cost-optimization">Cost Optimization</a></h2>
<h3 id="resource-sizing"><a class="header" href="#resource-sizing">Resource Sizing</a></h3>
<pre><pre class="playground"><code class="language-rust"><span class="boring">#![allow(unused)]
</span><span class="boring">fn main() {
</span>// Right-size based on actual usage
/// Suggests a worker sizing change from the average and p99 CPU figures
/// reported by `metrics`.
async fn recommend_sizing(metrics: &amp;Metrics) -&gt; Recommendation {
    let mean_cpu = metrics.avg_cpu_usage();
    // Memory is sampled but does not take part in the decision below.
    let _mean_memory = metrics.avg_memory_usage();
    let peak_cpu = metrics.p99_cpu_usage();

    // Both the average and the tail are low: workers can be downsized.
    if mean_cpu &lt; 30.0 &amp;&amp; peak_cpu &lt; 60.0 {
        return Recommendation::DownsizeWorkers;
    }

    // The tail is high: workers should be upsized.
    if peak_cpu &gt; 80.0 {
        return Recommendation::UpsizeWorkers;
    }

    Recommendation::CurrentSizingOptimal
}
<span class="boring">}</span></code></pre></pre>
<h3 id="auto-scaling"><a class="header" href="#auto-scaling">Auto-Scaling</a></h3>
<pre><code class="language-yaml"># Scale workers based on request rate
apiVersion: autoscaling/v2
kind: HorizontalPodAutoscaler
metadata:
  # NOTE(review): same name as the CPU-based autoscaler in the Kubernetes
  # deployment section, which uses minReplicas: 3 - confirm whether this
  # manifest is meant to replace that one.
  name: rpcnet-worker-hpa
spec:
  # Workload whose replica count this autoscaler manages
  scaleTargetRef:
    apiVersion: apps/v1
    kind: Deployment
    name: rpcnet-worker
  # Replica count is kept between 2 and 20
  minReplicas: 2
  maxReplicas: 20
  metrics:
  # Per-pod metric, averaged across the worker pods
  - type: Pods
    pods:
      metric:
        name: rpc_requests_per_second
      target:
        type: AverageValue
        averageValue: "5000"  # Scale when &gt; 5K RPS per worker
</code></pre>
<h2 id="checklist"><a class="header" href="#checklist">Checklist</a></h2>
<h3 id="pre-deployment"><a class="header" href="#pre-deployment">Pre-Deployment</a></h3>
<ul>
<li><input disabled="" type="checkbox"/>
TLS certificates from trusted CA</li>
<li><input disabled="" type="checkbox"/>
Secrets stored in secret manager (not env vars)</li>
<li><input disabled="" type="checkbox"/>
Monitoring and alerting configured</li>
<li><input disabled="" type="checkbox"/>
Log aggregation set up</li>
<li><input disabled="" type="checkbox"/>
Health checks implemented</li>
<li><input disabled="" type="checkbox"/>
Graceful shutdown handling</li>
<li><input disabled="" type="checkbox"/>
Resource limits configured</li>
<li><input disabled="" type="checkbox"/>
Auto-scaling rules defined</li>
<li><input disabled="" type="checkbox"/>
Backup procedures tested</li>
<li><input disabled="" type="checkbox"/>
Runbooks documented</li>
</ul>
<h3 id="post-deployment"><a class="header" href="#post-deployment">Post-Deployment</a></h3>
<ul>
<li><input disabled="" type="checkbox"/>
Verify all nodes healthy</li>
<li><input disabled="" type="checkbox"/>
Check metrics dashboards</li>
<li><input disabled="" type="checkbox"/>
Test failover scenarios</li>
<li><input disabled="" type="checkbox"/>
Validate performance (latency, throughput)</li>
<li><input disabled="" type="checkbox"/>
Review logs for errors</li>
<li><input disabled="" type="checkbox"/>
Test rolling updates</li>
<li><input disabled="" type="checkbox"/>
Verify backups working</li>
<li><input disabled="" type="checkbox"/>
Update documentation</li>
</ul>
<h2 id="next-steps-8"><a class="header" href="#next-steps-8">Next Steps</a></h2>
<ul>
<li><strong><a href="advanced/performance.html">Performance Tuning</a></strong> - Optimize for production load</li>
<li><strong><a href="advanced/../cluster/failures.html">Failure Handling</a></strong> - Handle production incidents</li>
<li><strong><a href="advanced/migration.html">Migration Guide</a></strong> - Migrate existing systems</li>
</ul>
<h2 id="references-5"><a class="header" href="#references-5">References</a></h2>
<ul>
<li><a href="https://kubernetes.io/docs/concepts/configuration/overview/">Kubernetes Best Practices</a> - K8s configuration</li>
<li><a href="https://prometheus.io/docs/practices/naming/">Prometheus Monitoring</a> - Metrics best practices</li>
<li><a href="https://aws.amazon.com/architecture/well-architected/">AWS Well-Architected</a> - Cloud architecture patterns</li>
</ul>
<div style="break-before: page; page-break-before: always;"></div><h1 id="migration-guide"><a class="header" href="#migration-guide">Migration Guide</a></h1>
<p>This guide helps you migrate from manual worker management patterns to RpcNet's built-in cluster features, reducing code complexity and improving reliability.</p>
<h2 id="why-migrate"><a class="header" href="#why-migrate">Why Migrate?</a></h2>
<h3 id="before-manual-worker-management"><a class="header" href="#before-manual-worker-management">Before: Manual Worker Management</a></h3>
<p><strong>Typical manual pattern</strong> requires ~200 lines of boilerplate:</p>
<pre><pre class="playground"><code class="language-rust"><span class="boring">#![allow(unused)]
</span><span class="boring">fn main() {
</span>// Custom worker tracking
struct WorkerPool {
    // Registered workers, keyed by the UUID generated in register_worker
    workers: Arc&lt;Mutex&lt;HashMap&lt;Uuid, WorkerInfo&gt;&gt;&gt;,
    // Counter that get_next_worker reads and increments to pick a worker
    next_idx: Arc&lt;Mutex&lt;usize&gt;&gt;,
}

// Per-worker record stored in WorkerPool::workers
struct WorkerInfo {
    id: Uuid,
    addr: SocketAddr,
    label: String,
    // Checked against a 30-second limit in WorkerPool::check_health
    last_ping: Instant,
}

impl WorkerPool {
    // Manual registration
    // Stores `info` under a freshly generated UUID and returns that UUID.
    // The `id` field already present in `info` is not consulted.
    async fn register_worker(&amp;self, info: WorkerInfo) -&gt; Uuid {
        let id = Uuid::new_v4();
        self.workers.lock().await.insert(id, info);
        id
    }
    
    // Manual round-robin selection
    // Returns None when no workers are registered; otherwise returns the
    // entry at `next_idx` modulo the worker count and advances the counter.
    async fn get_next_worker(&amp;self) -&gt; Option&lt;WorkerInfo&gt; {
        let workers = self.workers.lock().await;
        if workers.is_empty() {
            return None;
        }
        let mut idx = self.next_idx.lock().await;
        // Snapshot of the map's values; HashMap iteration order is unspecified
        let worker_list: Vec&lt;_&gt; = workers.values().collect();
        let worker = worker_list[*idx % worker_list.len()].clone();
        *idx += 1;
        Some(worker)
    }
    
    // Manual health checking
    // Keeps only workers whose last ping is less than 30 seconds old.
    async fn check_health(&amp;self) {
        let mut workers = self.workers.lock().await;
        workers.retain(|_, worker| {
            worker.last_ping.elapsed() &lt; Duration::from_secs(30)
        });
    }
}
<span class="boring">}</span></code></pre></pre>
<p><strong>Problems</strong>:</p>
<ul>
<li>❌ No automatic discovery</li>
<li>❌ Basic round-robin only</li>
<li>❌ Simple timeout-based health checks</li>
<li>❌ Manual connection management</li>
<li>❌ No partition detection</li>
<li>❌ ~200+ lines of error-prone code</li>
</ul>
<h3 id="after-built-in-cluster-features"><a class="header" href="#after-built-in-cluster-features">After: Built-in Cluster Features</a></h3>
<p><strong>With RpcNet's cluster</strong> - only ~50 lines:</p>
<pre><pre class="playground"><code class="language-rust"><span class="boring">#![allow(unused)]
</span><span class="boring">fn main() {
</span>use rpcnet::cluster::{WorkerRegistry, LoadBalancingStrategy, ClusterClient};

// Automatic discovery + load balancing + health checking
let registry = Arc::new(WorkerRegistry::new(
    cluster,
    LoadBalancingStrategy::LeastConnections
));
registry.start().await;

let client = Arc::new(ClusterClient::new(registry, config));

// That's it! Everything else is automatic:
let result = client.call_worker("compute", data, Some("role=worker")).await?;
<span class="boring">}</span></code></pre></pre>
<p><strong>Benefits</strong>:</p>
<ul>
<li>✅ Automatic discovery via gossip</li>
<li>✅ Multiple load balancing strategies</li>
<li>✅ Phi Accrual failure detection</li>
<li>✅ Efficient connection management</li>
<li>✅ Partition detection</li>
<li>✅ <strong>75% code reduction</strong></li>
</ul>
<h2 id="migration-steps"><a class="header" href="#migration-steps">Migration Steps</a></h2>
<h3 id="step-1-add-cluster-feature"><a class="header" href="#step-1-add-cluster-feature">Step 1: Add Cluster Feature</a></h3>
<p>Update <code>Cargo.toml</code>:</p>
<pre><code class="language-toml">[dependencies]
# Before
rpcnet = "0.2"

# After
rpcnet = { version = "0.2", features = ["cluster"] }
</code></pre>
<h3 id="step-2-enable-cluster-on-server"><a class="header" href="#step-2-enable-cluster-on-server">Step 2: Enable Cluster on Server</a></h3>
<p>Replace manual worker registration with cluster:</p>
<pre><pre class="playground"><code class="language-rust"><span class="boring">#![allow(unused)]
</span><span class="boring">fn main() {
</span>// Before: Manual RPC endpoint for registration
#[rpc_trait]
pub trait DirectorService {
    async fn register_worker(&amp;self, info: WorkerInfo) -&gt; Result&lt;Uuid&gt;;
}

// After: Enable cluster on server
let cluster_config = ClusterConfig::default()
    .with_bind_addr(bind_addr.parse()?);

let cluster = server.enable_cluster(cluster_config).await?;

// Tag for discovery
cluster.set_tag("role", "director");
<span class="boring">}</span></code></pre></pre>
<h3 id="step-3-replace-workerpool-with-workerregistry"><a class="header" href="#step-3-replace-workerpool-with-workerregistry">Step 3: Replace WorkerPool with WorkerRegistry</a></h3>
<pre><pre class="playground"><code class="language-rust"><span class="boring">#![allow(unused)]
</span><span class="boring">fn main() {
</span>// Before: Custom WorkerPool
let worker_pool = Arc::new(WorkerPool::new());

// Spawn health checker
tokio::spawn({
    let pool = worker_pool.clone();
    async move {
        loop {
            pool.check_health().await;
            tokio::time::sleep(Duration::from_secs(10)).await;
        }
    }
});

// After: Built-in WorkerRegistry
let registry = Arc::new(WorkerRegistry::new(
    cluster,
    LoadBalancingStrategy::LeastConnections
));
registry.start().await;  // Automatic health checking included!
<span class="boring">}</span></code></pre></pre>
<h3 id="step-4-update-worker-startup"><a class="header" href="#step-4-update-worker-startup">Step 4: Update Worker Startup</a></h3>
<pre><pre class="playground"><code class="language-rust"><span class="boring">#![allow(unused)]
</span><span class="boring">fn main() {
</span>// Before: Worker calls register RPC
let director_client = DirectorClient::connect(&amp;director_addr, config).await?;
let worker_id = director_client.register_worker(WorkerInfo {
    label: worker_label,
    addr: worker_addr,
}).await?;

// After: Worker joins cluster
let cluster_config = ClusterConfig::default()
    .with_bind_addr(worker_addr.parse()?);

let cluster = server.enable_cluster(cluster_config).await?;
cluster.join(vec![director_addr.parse()?]).await?;

// Tag for discovery
cluster.set_tag("role", "worker");
cluster.set_tag("label", &amp;worker_label);
<span class="boring">}</span></code></pre></pre>
<h3 id="step-5-replace-manual-selection-with-clusterclient"><a class="header" href="#step-5-replace-manual-selection-with-clusterclient">Step 5: Replace Manual Selection with ClusterClient</a></h3>
<pre><pre class="playground"><code class="language-rust"><span class="boring">#![allow(unused)]
</span><span class="boring">fn main() {
</span>// Before: Manual worker selection + connection
let worker = worker_pool.get_next_worker().await
    .ok_or_else(|| anyhow::anyhow!("No workers available"))?;

let conn = Connection::connect(&amp;worker.addr, client_config).await?;
let result = conn.call("compute", data).await?;

// After: Automatic selection + pooled connection
let result = cluster_client.call_worker("compute", data, Some("role=worker")).await?;
<span class="boring">}</span></code></pre></pre>
<h3 id="step-6-remove-manual-health-checks"><a class="header" href="#step-6-remove-manual-health-checks">Step 6: Remove Manual Health Checks</a></h3>
<pre><pre class="playground"><code class="language-rust"><span class="boring">#![allow(unused)]
</span><span class="boring">fn main() {
</span>// Before: Periodic ping to check health
tokio::spawn(async move {
    loop {
        for worker in workers.iter() {
            match ping_worker(&amp;worker.addr).await {
                Ok(_) =&gt; worker.last_ping = Instant::now(),
                Err(_) =&gt; remove_worker(worker.id).await,
            }
        }
        tokio::time::sleep(Duration::from_secs(10)).await;
    }
});

// After: Nothing! Phi Accrual + gossip handles it automatically
// Just subscribe to events if you want notifications:
let mut events = cluster.subscribe();
tokio::spawn(async move {
    while let Some(event) = events.recv().await {
        match event {
            ClusterEvent::NodeFailed(node) =&gt; {
                log::error!("Worker {} failed", node.id);
            }
            _ =&gt; {}
        }
    }
});
<span class="boring">}</span></code></pre></pre>
<h2 id="migration-examples"><a class="header" href="#migration-examples">Migration Examples</a></h2>
<h3 id="example-1-simple-director-worker"><a class="header" href="#example-1-simple-director-worker">Example 1: Simple Director-Worker</a></h3>
<h4 id="before-manual"><a class="header" href="#before-manual">Before (Manual)</a></h4>
<pre><pre class="playground"><code class="language-rust">// director.rs - ~150 lines
struct Director {
    workers: Arc&lt;Mutex&lt;HashMap&lt;Uuid, WorkerInfo&gt;&gt;&gt;,
    next_idx: Arc&lt;Mutex&lt;usize&gt;&gt;,
}

#[rpc_impl]
impl DirectorService for Director {
    async fn register_worker(&amp;self, info: WorkerInfo) -&gt; Result&lt;Uuid&gt; {
        let id = Uuid::new_v4();
        self.workers.lock().await.insert(id, info);
        Ok(id)
    }
    
    async fn get_worker(&amp;self) -&gt; Result&lt;WorkerInfo&gt; {
        let workers = self.workers.lock().await;
        if workers.is_empty() {
            return Err(anyhow::anyhow!("No workers"));
        }
        let mut idx = self.next_idx.lock().await;
        let worker_list: Vec&lt;_&gt; = workers.values().collect();
        let worker = worker_list[*idx % worker_list.len()].clone();
        *idx += 1;
        Ok(worker)
    }
}

// worker.rs - ~50 lines
async fn main() -&gt; Result&lt;()&gt; {
    let mut server = Server::new(config);
    server.register_service(Arc::new(WorkerHandler));
    server.bind(&amp;worker_addr).await?;
    
    // Register with director
    let director_client = DirectorClient::connect(&amp;director_addr, config).await?;
    director_client.register_worker(WorkerInfo {
        label: worker_label,
        addr: worker_addr,
    }).await?;
    
    server.run().await?;
    Ok(())
}</code></pre></pre>
<p><strong>Total</strong>: ~200 lines</p>
<h4 id="after-cluster"><a class="header" href="#after-cluster">After (Cluster)</a></h4>
<pre><pre class="playground"><code class="language-rust">// director.rs - ~50 lines
async fn main() -&gt; Result&lt;()&gt; {
    let mut server = Server::new(config);
    
    // Enable cluster
    let cluster = server.enable_cluster(cluster_config).await?;
    cluster.set_tag("role", "director");
    
    // Create registry
    let registry = Arc::new(WorkerRegistry::new(
        cluster,
        LoadBalancingStrategy::LeastConnections
    ));
    registry.start().await;
    
    server.bind(&amp;director_addr).await?;
    server.run().await?;
    Ok(())
}

// worker.rs - ~30 lines
async fn main() -&gt; Result&lt;()&gt; {
    let mut server = Server::new(config);
    server.register_service(Arc::new(WorkerHandler));
    server.bind(&amp;worker_addr).await?;
    
    // Join cluster
    let cluster = server.enable_cluster(cluster_config).await?;
    cluster.join(vec![director_addr.parse()?]).await?;
    cluster.set_tag("role", "worker");
    cluster.set_tag("label", &amp;worker_label);
    
    server.run().await?;
    Ok(())
}</code></pre></pre>
<p><strong>Total</strong>: ~80 lines (60% reduction)</p>
<h3 id="example-2-connection-swap-pattern"><a class="header" href="#example-2-connection-swap-pattern">Example 2: Connection Swap Pattern</a></h3>
<p>The old <code>connection_swap</code> example has been replaced by the <code>cluster</code> example, which uses the built-in cluster features.</p>
<h4 id="migration-path"><a class="header" href="#migration-path">Migration Path</a></h4>
<ol>
<li><strong>Remove custom WorkerPool</strong> → Use <code>WorkerRegistry</code></li>
<li><strong>Remove manual registration RPC</strong> → Use gossip discovery</li>
<li><strong>Remove health check pings</strong> → Use Phi Accrual</li>
<li><strong>Keep application logic unchanged</strong> → RPC interfaces stay the same</li>
</ol>
<p><strong>See</strong>: <code>examples/cluster/</code> for a complete working example.</p>
<h2 id="feature-comparison"><a class="header" href="#feature-comparison">Feature Comparison</a></h2>
<div class="table-wrapper"><table><thead><tr><th>Feature</th><th>Manual Pattern</th><th>Built-in Cluster</th></tr></thead><tbody>
<tr><td><strong>Discovery</strong></td><td>Manual RPC registration</td><td>Automatic via gossip</td></tr>
<tr><td><strong>Load Balancing</strong></td><td>Basic round-robin</td><td>Round Robin, Random, Least Connections</td></tr>
<tr><td><strong>Health Checking</strong></td><td>Timeout-based ping</td><td>Phi Accrual algorithm</td></tr>
<tr><td><strong>Failure Detection</strong></td><td>Simple timeout</td><td>Indirect probes + Phi</td></tr>
<tr><td><strong>Connection Management</strong></td><td>Manual implementation</td><td>Built-in optimization</td></tr>
<tr><td><strong>Partition Detection</strong></td><td>Not available</td><td>Automatic</td></tr>
<tr><td><strong>Code Complexity</strong></td><td>~200 lines</td><td>~50 lines</td></tr>
<tr><td><strong>Maintenance</strong></td><td>High (custom code)</td><td>Low (battle-tested)</td></tr>
</tbody></table>
</div>
<h2 id="common-migration-issues"><a class="header" href="#common-migration-issues">Common Migration Issues</a></h2>
<h3 id="issue-1-port-conflicts"><a class="header" href="#issue-1-port-conflicts">Issue 1: Port Conflicts</a></h3>
<p><strong>Problem</strong>: The gossip protocol uses UDP, and its port might conflict with existing services.</p>
<p><strong>Solution</strong>: Configure gossip port explicitly</p>
<pre><pre class="playground"><code class="language-rust"><span class="boring">#![allow(unused)]
</span><span class="boring">fn main() {
</span>let cluster_config = ClusterConfig::default()
    .with_bind_addr("0.0.0.0:7946".parse()?)  // Gossip on different port
    .with_gossip_port(7947);  // Custom gossip port
<span class="boring">}</span></code></pre></pre>
<h3 id="issue-2-firewall-rules"><a class="header" href="#issue-2-firewall-rules">Issue 2: Firewall Rules</a></h3>
<p><strong>Problem</strong>: Gossip UDP traffic blocked by firewall.</p>
<p><strong>Solution</strong>: Allow UDP traffic between cluster nodes</p>
<pre><code class="language-bash"># Allow gossip protocol
iptables -A INPUT -p udp --dport 7946 -j ACCEPT
iptables -A OUTPUT -p udp --sport 7946 -j ACCEPT
</code></pre>
<h3 id="issue-3-existing-health-check-logic"><a class="header" href="#issue-3-existing-health-check-logic">Issue 3: Existing Health Check Logic</a></h3>
<p><strong>Problem</strong>: You have custom health check logic that needs to be preserved.</p>
<p><strong>Solution</strong>: Combine with cluster events</p>
<pre><pre class="playground"><code class="language-rust"><span class="boring">#![allow(unused)]
</span><span class="boring">fn main() {
</span>// Keep custom health checks
/// Application-specific health predicate: passes when CPU usage is below 80
/// and available memory is above 1,000,000.
async fn custom_health_check(worker: &amp;Worker) -&gt; bool {
    // Your custom logic
    let cpu_ok = worker.cpu_usage &lt; 80.0;
    let memory_ok = worker.memory_available &gt; 1_000_000;
    cpu_ok &amp;&amp; memory_ok
}

// Use alongside cluster events. Run the listener in its own task: the
// `while let` loop only ends when the event stream closes, so awaiting it
// inline would keep the periodic checks below from ever being spawned.
let mut events = cluster.subscribe();
tokio::spawn(async move {
    while let Some(event) = events.recv().await {
        if let ClusterEvent::NodeFailed(node) = event {
            // Cluster detected failure
            handle_failure(node).await;
        }
    }
});

// Periodic custom checks
tokio::spawn(async move {
    loop {
        for worker in registry.workers().await {
            if !custom_health_check(&amp;worker).await {
                log::warn!("Custom health check failed for {}", worker.label);
            }
        }
        tokio::time::sleep(Duration::from_secs(30)).await;
    }
});
<span class="boring">}</span></code></pre></pre>
<h3 id="issue-4-different-node-roles"><a class="header" href="#issue-4-different-node-roles">Issue 4: Different Node Roles</a></h3>
<p><strong>Problem</strong>: You have multiple types of nodes (coordinator, worker, storage, etc.).</p>
<p><strong>Solution</strong>: Use tags to differentiate</p>
<pre><pre class="playground"><code class="language-rust"><span class="boring">#![allow(unused)]
</span><span class="boring">fn main() {
</span>// Coordinator
cluster.set_tag("role", "coordinator");

// GPU worker
cluster.set_tag("role", "worker");
cluster.set_tag("gpu", "true");

// CPU worker
cluster.set_tag("role", "worker");
cluster.set_tag("cpu_only", "true");

// Select by role
let gpu_worker = registry.select_worker(Some("gpu=true")).await?;
let any_worker = registry.select_worker(Some("role=worker")).await?;
<span class="boring">}</span></code></pre></pre>
<h2 id="testing-after-migration"><a class="header" href="#testing-after-migration">Testing After Migration</a></h2>
<h3 id="unit-tests"><a class="header" href="#unit-tests">Unit Tests</a></h3>
<pre><pre class="playground"><code class="language-rust"><span class="boring">#![allow(unused)]
</span><span class="boring">fn main() {
</span>#[tokio::test]
// Checks that a worker which joins the director's cluster shows up in the
// director's registry with its role tag.
async fn test_worker_discovery() {
    // Start director
    let director = start_test_director().await;
    
    // Start worker (the helper takes a label, as in test_load_balancing)
    let worker = start_test_worker("worker-1").await;
    worker.join(vec![director.addr()]).await.unwrap();
    
    // Wait for discovery
    tokio::time::sleep(Duration::from_secs(2)).await;
    
    // Verify worker discovered
    let workers = director.registry().workers().await;
    assert_eq!(workers.len(), 1);
    assert_eq!(workers[0].tags.get("role"), Some(&amp;"worker".to_string()));
}

#[tokio::test]
async fn test_load_balancing() {
    let director = start_test_director().await;
    
    // Start 3 workers
    let worker1 = start_test_worker("worker-1").await;
    let worker2 = start_test_worker("worker-2").await;
    let worker3 = start_test_worker("worker-3").await;
    
    // Make 100 requests
    let mut worker_counts = HashMap::new();
    for _ in 0..100 {
        let result = director.call_worker("compute", vec![]).await.unwrap();
        *worker_counts.entry(result.worker_label).or_insert(0) += 1;
    }
    
    // Verify distribution (should be roughly equal)
    assert!(worker_counts.get("worker-1").unwrap() &gt; &amp;20);
    assert!(worker_counts.get("worker-2").unwrap() &gt; &amp;20);
    assert!(worker_counts.get("worker-3").unwrap() &gt; &amp;20);
}
<span class="boring">}</span></code></pre></pre>
<h3 id="integration-tests"><a class="header" href="#integration-tests">Integration Tests</a></h3>
<pre><code class="language-bash"># Test full cluster
cargo test --features cluster --test integration_tests

# Test failure scenarios
cargo test --features cluster --test failure_tests

# Test with actual network (examples)
cd examples/cluster
cargo run --bin director &amp;
cargo run --bin worker &amp;
cargo run --bin client
</code></pre>
<h2 id="rollback-plan"><a class="header" href="#rollback-plan">Rollback Plan</a></h2>
<p>If the migration causes issues, you can roll back:</p>
<h3 id="option-1-feature-flag"><a class="header" href="#option-1-feature-flag">Option 1: Feature Flag</a></h3>
<pre><pre class="playground"><code class="language-rust"><span class="boring">#![allow(unused)]
</span><span class="boring">fn main() {
</span>#[cfg(feature = "use-cluster")]
use rpcnet::cluster::{WorkerRegistry, ClusterClient};

#[cfg(not(feature = "use-cluster"))]
use crate::manual_pool::WorkerPool;

// Toggle between old and new with feature flag
<span class="boring">}</span></code></pre></pre>
<h3 id="option-2-gradual-migration"><a class="header" href="#option-2-gradual-migration">Option 2: Gradual Migration</a></h3>
<pre><pre class="playground"><code class="language-rust"><span class="boring">#![allow(unused)]
</span><span class="boring">fn main() {
</span>// Run both systems in parallel temporarily
let manual_pool = Arc::new(WorkerPool::new());  // Old system
let cluster_registry = Arc::new(WorkerRegistry::new(cluster, strategy));  // New system

// Route percentage of traffic to new system
if rand::random::&lt;f64&gt;() &lt; 0.10 {  // 10% to new system
    cluster_registry.select_worker(filter).await
} else {
    manual_pool.get_next_worker().await  // 90% to old system
}

// Gradually increase percentage over time
<span class="boring">}</span></code></pre></pre>
<h2 id="checklist-1"><a class="header" href="#checklist-1">Checklist</a></h2>
<h3 id="pre-migration"><a class="header" href="#pre-migration">Pre-Migration</a></h3>
<ul>
<li><input disabled="" type="checkbox"/>
Review current worker management code</li>
<li><input disabled="" type="checkbox"/>
Identify custom health check logic to preserve</li>
<li><input disabled="" type="checkbox"/>
Plan firewall rule changes for gossip</li>
<li><input disabled="" type="checkbox"/>
Write tests for current behavior</li>
<li><input disabled="" type="checkbox"/>
Create rollback plan</li>
</ul>
<h3 id="during-migration"><a class="header" href="#during-migration">During Migration</a></h3>
<ul>
<li><input disabled="" type="checkbox"/>
Add cluster feature to Cargo.toml</li>
<li><input disabled="" type="checkbox"/>
Enable cluster on servers</li>
<li><input disabled="" type="checkbox"/>
Replace WorkerPool with WorkerRegistry</li>
<li><input disabled="" type="checkbox"/>
Update worker startup (join instead of register)</li>
<li><input disabled="" type="checkbox"/>
Remove manual health checks</li>
<li><input disabled="" type="checkbox"/>
Test in staging environment</li>
</ul>
<h3 id="post-migration"><a class="header" href="#post-migration">Post-Migration</a></h3>
<ul>
<li><input disabled="" type="checkbox"/>
Verify worker discovery working</li>
<li><input disabled="" type="checkbox"/>
Check load balancing distribution</li>
<li><input disabled="" type="checkbox"/>
Monitor failure detection</li>
<li><input disabled="" type="checkbox"/>
Validate performance metrics</li>
<li><input disabled="" type="checkbox"/>
Remove old worker pool code</li>
<li><input disabled="" type="checkbox"/>
Update documentation</li>
</ul>
<h2 id="performance-impact-1"><a class="header" href="#performance-impact-1">Performance Impact</a></h2>
<p><strong>Before migration</strong>:</p>
<ul>
<li>Manual round-robin: ~100K RPS</li>
<li>Timeout-based health: 30s detection time</li>
<li>Manual connection handling: 20-50ms latency</li>
</ul>
<p><strong>After migration</strong>:</p>
<ul>
<li>Least Connections: 172K+ RPS (70% increase)</li>
<li>Phi Accrual: 6-8s detection time (better accuracy)</li>
<li>Built-in connection management: &lt;1ms latency (98% reduction)</li>
</ul>
<h2 id="next-steps-9"><a class="header" href="#next-steps-9">Next Steps</a></h2>
<ul>
<li><strong><a href="advanced/../cluster/tutorial.html">Cluster Tutorial</a></strong> - Build cluster from scratch</li>
<li><strong><a href="advanced/production.html">Production Guide</a></strong> - Deploy migrated cluster</li>
<li><strong><a href="advanced/performance.html">Performance Tuning</a></strong> - Optimize new setup</li>
</ul>
<h2 id="references-6"><a class="header" href="#references-6">References</a></h2>
<ul>
<li><strong><a href="https://github.com/yourusername/rpcnet/tree/main/examples/cluster">Cluster Example</a></strong> - Complete working example</li>
<li><strong><a href="https://www.cs.cornell.edu/projects/Quicksilver/public_pdfs/SWIM.pdf">SWIM Paper</a></strong> - Gossip protocol details</li>
<li><strong><a href="https://citeseerx.ist.psu.edu/document?repid=rep1&amp;type=pdf&amp;doi=babf246cf6753ad12ce97ae47e64c9d4ff85c6f7">Phi Accrual Paper</a></strong> - Failure detection algorithm</li>
</ul>
<div style="break-before: page; page-break-before: always;"></div><h1 id="api-reference"><a class="header" href="#api-reference">API Reference</a></h1>
<p>Quick reference for RpcNet's most commonly used APIs. For complete documentation, see the <a href="https://docs.rs/rpcnet">API docs</a>.</p>
<h2 id="core-types"><a class="header" href="#core-types">Core Types</a></h2>
<h3 id="server"><a class="header" href="#server">Server</a></h3>
<p>Creates and manages RPC servers.</p>
<pre><pre class="playground"><code class="language-rust"><span class="boring">#![allow(unused)]
</span><span class="boring">fn main() {
</span>use rpcnet::{Server, ServerConfig};

// Create server
let config = ServerConfig::builder()
    .with_cert_and_key(cert, key)?
    .build();
let mut server = Server::new(config);

// Register services
server.register_service(Arc::new(MyService));

// Bind and run
server.bind("0.0.0.0:8080").await?;
server.run().await?;
<span class="boring">}</span></code></pre></pre>
<p><strong>Key methods</strong>:</p>
<ul>
<li><code>new(config)</code> - Create server with configuration</li>
<li><code>register_service(service)</code> - Register RPC service handler</li>
<li><code>bind(addr)</code> - Bind to address</li>
<li><code>enable_cluster(config)</code> - Enable cluster features</li>
<li><code>run()</code> - Start server (blocks until shutdown)</li>
<li><code>shutdown()</code> - Gracefully shut down server</li>
</ul>
<h3 id="client"><a class="header" href="#client">Client</a></h3>
<p>Connects to RPC servers and makes requests.</p>
<pre><pre class="playground"><code class="language-rust"><span class="boring">#![allow(unused)]
</span><span class="boring">fn main() {
</span>use rpcnet::{Client, ClientConfig};

// Create client
let config = ClientConfig::builder()
    .with_server_cert(cert)?
    .build();

// Connect
let client = MyServiceClient::connect("server.example.com:8080", config).await?;

// Make request
let response = client.my_method(args).await?;
<span class="boring">}</span></code></pre></pre>
<p><strong>Key methods</strong>:</p>
<ul>
<li><code>connect(addr, config)</code> - Connect to server</li>
<li>Generated methods per RPC trait</li>
<li>Auto-reconnect on connection loss</li>
</ul>
<h2 id="cluster-apis"><a class="header" href="#cluster-apis">Cluster APIs</a></h2>
<h3 id="clustermembership"><a class="header" href="#clustermembership">ClusterMembership</a></h3>
<p>Manages node membership via SWIM gossip protocol.</p>
<pre><pre class="playground"><code class="language-rust"><span class="boring">#![allow(unused)]
</span><span class="boring">fn main() {
</span>use rpcnet::cluster::ClusterMembership;

// Create cluster
let config = ClusterConfig::default()
    .with_bind_addr("0.0.0.0:7946".parse()?);
let cluster = ClusterMembership::new(config).await?;

// Join via seed nodes
cluster.join(vec!["seed.example.com:7946".parse()?]).await?;

// Tag node
cluster.set_tag("role", "worker");

// Subscribe to events
let mut events = cluster.subscribe();
while let Some(event) = events.recv().await {
    // Handle cluster events
}
<span class="boring">}</span></code></pre></pre>
<p><strong>Key methods</strong>:</p>
<ul>
<li><code>new(config)</code> - Create cluster membership</li>
<li><code>join(seeds)</code> - Join cluster via seed nodes</li>
<li><code>leave()</code> - Gracefully leave cluster</li>
<li><code>set_tag(key, value)</code> - Set metadata tag</li>
<li><code>get_tag(key)</code> - Get metadata tag</li>
<li><code>nodes()</code> - Get all cluster nodes</li>
<li><code>subscribe()</code> - Subscribe to cluster events</li>
<li><code>local_node_id()</code> - Get local node ID</li>
</ul>
<h3 id="workerregistry-1"><a class="header" href="#workerregistry-1">WorkerRegistry</a></h3>
<p>Tracks worker nodes with load balancing.</p>
<pre><pre class="playground"><code class="language-rust"><span class="boring">#![allow(unused)]
</span><span class="boring">fn main() {
</span>use rpcnet::cluster::{WorkerRegistry, LoadBalancingStrategy};

// Create registry
let registry = Arc::new(WorkerRegistry::new(
    cluster,
    LoadBalancingStrategy::LeastConnections
));

// Start monitoring
registry.start().await;

// Select worker
let worker = registry.select_worker(Some("role=worker")).await?;
println!("Selected: {} at {}", worker.label, worker.addr);

// Get all workers
let workers = registry.workers().await;
<span class="boring">}</span></code></pre></pre>
<p><strong>Key methods</strong>:</p>
<ul>
<li><code>new(cluster, strategy)</code> - Create registry</li>
<li><code>start()</code> - Start monitoring cluster events</li>
<li><code>select_worker(filter)</code> - Select worker by tag filter</li>
<li><code>workers()</code> - Get all workers</li>
<li><code>worker_count()</code> - Get number of workers</li>
<li><code>subscribe()</code> - Subscribe to registry events</li>
</ul>
<h3 id="noderegistry-1"><a class="header" href="#noderegistry-1">NodeRegistry</a></h3>
<p>Tracks all cluster nodes.</p>
<pre><pre class="playground"><code class="language-rust"><span class="boring">#![allow(unused)]
</span><span class="boring">fn main() {
</span>use rpcnet::cluster::NodeRegistry;

// Create registry
let registry = Arc::new(NodeRegistry::new(cluster));
registry.start().await;

// Get all nodes
let nodes = registry.nodes().await;

// Filter by tag
let directors = nodes.iter()
    .filter(|n| n.tags.get("role") == Some(&amp;"director".to_string()))
    .collect::&lt;Vec&lt;_&gt;&gt;();
<span class="boring">}</span></code></pre></pre>
<p><strong>Key methods</strong>:</p>
<ul>
<li><code>new(cluster)</code> - Create node registry</li>
<li><code>start()</code> - Start monitoring cluster</li>
<li><code>nodes()</code> - Get all nodes</li>
<li><code>node_count()</code> - Count nodes</li>
<li><code>subscribe()</code> - Subscribe to events</li>
</ul>
<h3 id="clusterclient-1"><a class="header" href="#clusterclient-1">ClusterClient</a></h3>
<p>High-level API for calling workers.</p>
<pre><pre class="playground"><code class="language-rust"><span class="boring">#![allow(unused)]
</span><span class="boring">fn main() {
</span>use rpcnet::cluster::{ClusterClient, ClusterClientConfig};

// Create client
let config = ClusterClientConfig::default();
let client = Arc::new(ClusterClient::new(registry, config));

// Call any worker
let result = client.call_worker("compute", request, Some("role=worker")).await?;
<span class="boring">}</span></code></pre></pre>
<p><strong>Key methods</strong>:</p>
<ul>
<li><code>new(registry, config)</code> - Create cluster client</li>
<li><code>call_worker(method, data, filter)</code> - Call any worker matching filter</li>
</ul>
<h2 id="configuration-2"><a class="header" href="#configuration-2">Configuration</a></h2>
<h3 id="serverconfig"><a class="header" href="#serverconfig">ServerConfig</a></h3>
<pre><pre class="playground"><code class="language-rust"><span class="boring">#![allow(unused)]
</span><span class="boring">fn main() {
</span>use rpcnet::ServerConfig;

let config = ServerConfig::builder()
    .with_cert_and_key(cert, key)?           // TLS certificate and key
    .with_ca_cert(ca)?                        // CA certificate for client verification
    .with_max_concurrent_streams(100)?       // Max concurrent QUIC streams
    .with_max_idle_timeout(Duration::from_secs(30))? // Idle timeout
    .build();
<span class="boring">}</span></code></pre></pre>
<h3 id="clientconfig"><a class="header" href="#clientconfig">ClientConfig</a></h3>
<pre><pre class="playground"><code class="language-rust"><span class="boring">#![allow(unused)]
</span><span class="boring">fn main() {
</span>use rpcnet::ClientConfig;

let config = ClientConfig::builder()
    .with_server_cert(cert)?                 // Server certificate
    .with_ca_cert(ca)?                       // CA certificate
    .with_connect_timeout(Duration::from_secs(5))? // Connection timeout
    .build();
<span class="boring">}</span></code></pre></pre>
<h3 id="clusterconfig"><a class="header" href="#clusterconfig">ClusterConfig</a></h3>
<pre><pre class="playground"><code class="language-rust"><span class="boring">#![allow(unused)]
</span><span class="boring">fn main() {
</span>use rpcnet::cluster::ClusterConfig;

let config = ClusterConfig::default()
    .with_bind_addr("0.0.0.0:7946".parse()?)
    .with_gossip_interval(Duration::from_secs(1))
    .with_health_check_interval(Duration::from_secs(2))
    .with_phi_threshold(8.0);
<span class="boring">}</span></code></pre></pre>
<h2 id="code-generation"><a class="header" href="#code-generation">Code Generation</a></h2>
<h3 id="rpc-trait-definition"><a class="header" href="#rpc-trait-definition">RPC Trait Definition</a></h3>
<pre><pre class="playground"><code class="language-rust"><span class="boring">#![allow(unused)]
</span><span class="boring">fn main() {
</span>use rpcnet::prelude::*;

#[rpc_trait]
pub trait MyService {
    async fn my_method(&amp;self, arg1: String, arg2: i32) -&gt; Result&lt;Response&gt;;
    async fn streaming(&amp;self, request: Request) -&gt; impl Stream&lt;Item = Result&lt;Chunk&gt;&gt;;
}

#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Response {
    pub data: Vec&lt;u8&gt;,
}
<span class="boring">}</span></code></pre></pre>
<h3 id="generate-code"><a class="header" href="#generate-code">Generate Code</a></h3>
<pre><code class="language-bash">rpcnet-gen --input my_service.rpc.rs --output src/generated
</code></pre>
<h3 id="use-generated-code"><a class="header" href="#use-generated-code">Use Generated Code</a></h3>
<pre><pre class="playground"><code class="language-rust"><span class="boring">#![allow(unused)]
</span><span class="boring">fn main() {
</span>mod generated;
use generated::my_service::*;

// Server side
#[rpc_impl]
impl MyService for Handler {
    async fn my_method(&amp;self, arg1: String, arg2: i32) -&gt; Result&lt;Response&gt; {
        // Implementation
    }
}

// Client side
let client = MyServiceClient::connect(addr, config).await?;
let response = client.my_method("test".to_string(), 42).await?;
<span class="boring">}</span></code></pre></pre>
<h2 id="streaming"><a class="header" href="#streaming">Streaming</a></h2>
<h3 id="server-side-streaming"><a class="header" href="#server-side-streaming">Server-Side Streaming</a></h3>
<pre><pre class="playground"><code class="language-rust"><span class="boring">#![allow(unused)]
</span><span class="boring">fn main() {
</span>#[rpc_trait]
pub trait StreamService {
    async fn stream_data(&amp;self, count: usize) -&gt; impl Stream&lt;Item = Result&lt;Data&gt;&gt;;
}

#[rpc_impl]
impl StreamService for Handler {
    async fn stream_data(&amp;self, count: usize) -&gt; impl Stream&lt;Item = Result&lt;Data&gt;&gt; {
        futures::stream::iter(0..count).map(|i| {
            Ok(Data { value: i })
        })
    }
}
<span class="boring">}</span></code></pre></pre>
<h3 id="client-side-streaming"><a class="header" href="#client-side-streaming">Client-Side Streaming</a></h3>
<pre><pre class="playground"><code class="language-rust"><span class="boring">#![allow(unused)]
</span><span class="boring">fn main() {
</span>#[rpc_trait]
pub trait UploadService {
    async fn upload(&amp;self, stream: impl Stream&lt;Item = Chunk&gt;) -&gt; Result&lt;Summary&gt;;
}

// Client usage
let chunks = futures::stream::iter(vec![chunk1, chunk2, chunk3]);
let summary = client.upload(chunks).await?;
<span class="boring">}</span></code></pre></pre>
<h3 id="bidirectional-streaming"><a class="header" href="#bidirectional-streaming">Bidirectional Streaming</a></h3>
<pre><pre class="playground"><code class="language-rust"><span class="boring">#![allow(unused)]
</span><span class="boring">fn main() {
</span>#[rpc_trait]
pub trait ChatService {
    async fn chat(&amp;self, stream: impl Stream&lt;Item = Message&gt;) 
        -&gt; impl Stream&lt;Item = Result&lt;Message&gt;&gt;;
}
<span class="boring">}</span></code></pre></pre>
<h2 id="load-balancing-strategies-2"><a class="header" href="#load-balancing-strategies-2">Load Balancing Strategies</a></h2>
<pre><pre class="playground"><code class="language-rust"><span class="boring">#![allow(unused)]
</span><span class="boring">fn main() {
</span>use rpcnet::cluster::LoadBalancingStrategy;

// Round Robin - even distribution
LoadBalancingStrategy::RoundRobin

// Random - stateless selection
LoadBalancingStrategy::Random

// Least Connections - pick least loaded (recommended)
LoadBalancingStrategy::LeastConnections
<span class="boring">}</span></code></pre></pre>
<h2 id="cluster-events"><a class="header" href="#cluster-events">Cluster Events</a></h2>
<pre><pre class="playground"><code class="language-rust"><span class="boring">#![allow(unused)]
</span><span class="boring">fn main() {
</span>use rpcnet::cluster::ClusterEvent;

let mut events = cluster.subscribe();
while let Some(event) = events.recv().await {
    match event {
        ClusterEvent::NodeJoined(node) =&gt; {
            println!("Node {} joined at {}", node.id, node.addr);
        }
        ClusterEvent::NodeLeft(node) =&gt; {
            println!("Node {} left", node.id);
        }
        ClusterEvent::NodeFailed(node) =&gt; {
            println!("Node {} failed", node.id);
        }
        ClusterEvent::NodeUpdated(node) =&gt; {
            println!("Node {} updated", node.id);
        }
        ClusterEvent::PartitionDetected(minority, majority) =&gt; {
            println!("Partition detected!");
        }
    }
}
<span class="boring">}</span></code></pre></pre>
<h2 id="error-handling"><a class="header" href="#error-handling">Error Handling</a></h2>
<pre><pre class="playground"><code class="language-rust"><span class="boring">#![allow(unused)]
</span><span class="boring">fn main() {
</span>use rpcnet::{Error, ErrorKind};

match client.call("method", args).await {
    Ok(response) =&gt; {
        // Handle success
    }
    Err(e) =&gt; {
        match e.kind() {
            ErrorKind::ConnectionFailed =&gt; {
                // Connection issue, retry with different worker
            }
            ErrorKind::Timeout =&gt; {
                // Request timed out
            }
            ErrorKind::SerializationError =&gt; {
                // Data serialization failed
            }
            ErrorKind::ApplicationError =&gt; {
                // Application-level error from handler
            }
            _ =&gt; {
                // Other errors
            }
        }
    }
}
<span class="boring">}</span></code></pre></pre>
<h2 id="common-patterns"><a class="header" href="#common-patterns">Common Patterns</a></h2>
<h3 id="health-check-endpoint"><a class="header" href="#health-check-endpoint">Health Check Endpoint</a></h3>
<pre><pre class="playground"><code class="language-rust"><span class="boring">#![allow(unused)]
</span><span class="boring">fn main() {
</span>#[rpc_trait]
pub trait HealthService {
    async fn health(&amp;self) -&gt; Result&lt;HealthStatus&gt;;
}

#[derive(Serialize, Deserialize)]
pub struct HealthStatus {
    pub healthy: bool,
    pub version: String,
    pub uptime_secs: u64,
}
<span class="boring">}</span></code></pre></pre>
<h3 id="graceful-shutdown-2"><a class="header" href="#graceful-shutdown-2">Graceful Shutdown</a></h3>
<pre><pre class="playground"><code class="language-rust"><span class="boring">#![allow(unused)]
</span><span class="boring">fn main() {
</span>use tokio::signal;

async fn run(mut server: Server, cluster: Arc&lt;ClusterMembership&gt;) -&gt; Result&lt;()&gt; {
    let server_task = tokio::spawn(async move { server.run().await });
    
    signal::ctrl_c().await?;
    
    // Leave cluster gracefully
    cluster.leave().await?;
    
    // Wait for in-flight requests
    server.shutdown().await?;
    
    Ok(())
}
<span class="boring">}</span></code></pre></pre>
<h3 id="connection-retry"><a class="header" href="#connection-retry">Connection Retry</a></h3>
<pre><pre class="playground"><code class="language-rust"><span class="boring">#![allow(unused)]
</span><span class="boring">fn main() {
</span>async fn call_with_retry&lt;T&gt;(
    f: impl Fn() -&gt; Pin&lt;Box&lt;dyn Future&lt;Output = Result&lt;T&gt;&gt;&gt;&gt;,
    max_retries: usize,
) -&gt; Result&lt;T&gt; {
    for attempt in 0..max_retries {
        match f().await {
            Ok(result) =&gt; return Ok(result),
            Err(e) if attempt &lt; max_retries - 1 =&gt; {
                tokio::time::sleep(Duration::from_millis(100 * 2_u64.pow(attempt as u32))).await;
            }
            Err(e) =&gt; return Err(e),
        }
    }
    unreachable!()
}
<span class="boring">}</span></code></pre></pre>
<h2 id="environment-variables-1"><a class="header" href="#environment-variables-1">Environment Variables</a></h2>
<p>Common environment variables used in examples:</p>
<pre><code class="language-bash"># Director
DIRECTOR_ADDR=127.0.0.1:61000
RUST_LOG=info

# Worker
WORKER_LABEL=worker-1
WORKER_ADDR=127.0.0.1:62001
DIRECTOR_ADDR=127.0.0.1:61000

# Client
CLIENT_ID=client-1

# Logging
RUST_LOG=rpcnet=debug,my_app=info
</code></pre>
<h2 id="feature-flags"><a class="header" href="#feature-flags">Feature Flags</a></h2>
<pre><code class="language-toml">[dependencies]
rpcnet = { version = "0.2", features = ["cluster", "metrics"] }
</code></pre>
<p>Available features:</p>
<ul>
<li><code>cluster</code> - Enable cluster features (WorkerRegistry, ClusterClient, etc.)</li>
<li><code>metrics</code> - Enable Prometheus metrics</li>
<li><code>codegen</code> - Enable code generation support (always included in v0.2+)</li>
</ul>
<h2 id="quick-examples"><a class="header" href="#quick-examples">Quick Examples</a></h2>
<h3 id="simple-rpc-server"><a class="header" href="#simple-rpc-server">Simple RPC Server</a></h3>
<pre><pre class="playground"><code class="language-rust">use rpcnet::prelude::*;

#[rpc_trait]
pub trait Echo {
    async fn echo(&amp;self, msg: String) -&gt; Result&lt;String&gt;;
}

#[rpc_impl]
impl Echo for Handler {
    async fn echo(&amp;self, msg: String) -&gt; Result&lt;String&gt; {
        Ok(msg)
    }
}

#[tokio::main]
async fn main() -&gt; Result&lt;()&gt; {
    let config = ServerConfig::builder()
        .with_cert_and_key(cert, key)?
        .build();
    
    let mut server = Server::new(config);
    server.register_service(Arc::new(Handler));
    server.bind("0.0.0.0:8080").await?;
    server.run().await?;
    Ok(())
}</code></pre></pre>
<h3 id="simple-rpc-client"><a class="header" href="#simple-rpc-client">Simple RPC Client</a></h3>
<pre><pre class="playground"><code class="language-rust">#[tokio::main]
async fn main() -&gt; Result&lt;()&gt; {
    let config = ClientConfig::builder()
        .with_server_cert(cert)?
        .build();
    
    let client = EchoClient::connect("localhost:8080", config).await?;
    let response = client.echo("Hello!".to_string()).await?;
    println!("Response: {}", response);
    Ok(())
}</code></pre></pre>
<h2 id="next-steps-10"><a class="header" href="#next-steps-10">Next Steps</a></h2>
<ul>
<li><strong><a href="reference/examples.html">Examples</a></strong> - Complete example programs</li>
<li><strong><a href="reference/../cluster/tutorial.html">Cluster Tutorial</a></strong> - Build a cluster</li>
<li><strong><a href="https://docs.rs/rpcnet">API Documentation</a></strong> - Full API docs</li>
</ul>
<div style="break-before: page; page-break-before: always;"></div><h1 id="example-programs"><a class="header" href="#example-programs">Example Programs</a></h1>
<p>This page indexes all example programs included in the RpcNet repository. Each example demonstrates specific features and can be run locally.</p>
<h2 id="repository-structure"><a class="header" href="#repository-structure">Repository Structure</a></h2>
<p>All examples are located in the <code>examples/</code> directory:</p>
<pre><code>examples/
├── cluster/          - Distributed cluster with auto-discovery
└── (more to come)
</code></pre>
<h2 id="cluster-example-1"><a class="header" href="#cluster-example-1">Cluster Example</a></h2>
<p><strong>Location</strong>: <code>examples/cluster/</code><br />
<strong>Documentation</strong>: <a href="reference/../cluster-example.html">Cluster Example Chapter</a></p>
<p>Demonstrates RpcNet's distributed cluster features with automatic service discovery, load balancing, and failure handling.</p>
<h3 id="components-1"><a class="header" href="#components-1">Components</a></h3>
<p><strong>Director</strong> (<code>examples/cluster/src/bin/director.rs</code>)</p>
<ul>
<li>Coordinator node for the cluster</li>
<li>Uses <code>WorkerRegistry</code> for auto-discovery</li>
<li>Implements load-balanced request routing</li>
<li>Monitors worker pool health</li>
</ul>
<p><strong>Worker</strong> (<code>examples/cluster/src/bin/worker.rs</code>)</p>
<ul>
<li>Processing node that joins cluster automatically</li>
<li>Tags itself with <code>role=worker</code> for discovery</li>
<li>Handles compute tasks</li>
<li>Supports failure simulation for testing</li>
</ul>
<p><strong>Client</strong> (<code>examples/cluster/src/bin/client.rs</code>)</p>
<ul>
<li>Connects through director</li>
<li>Establishes direct connections to workers</li>
<li>Handles worker failover automatically</li>
<li>Demonstrates streaming requests</li>
</ul>
<h3 id="quick-start"><a class="header" href="#quick-start">Quick Start</a></h3>
<pre><code class="language-bash"># Terminal 1: Start Director
DIRECTOR_ADDR=127.0.0.1:61000 \
  RUST_LOG=info \
  cargo run --manifest-path examples/cluster/Cargo.toml --bin director

# Terminal 2: Start Worker A
WORKER_LABEL=worker-a \
  WORKER_ADDR=127.0.0.1:62001 \
  DIRECTOR_ADDR=127.0.0.1:61000 \
  RUST_LOG=info \
  cargo run --manifest-path examples/cluster/Cargo.toml --bin worker

# Terminal 3: Start Worker B
WORKER_LABEL=worker-b \
  WORKER_ADDR=127.0.0.1:62002 \
  DIRECTOR_ADDR=127.0.0.1:61000 \
  RUST_LOG=info \
  cargo run --manifest-path examples/cluster/Cargo.toml --bin worker

# Terminal 4: Run Client
DIRECTOR_ADDR=127.0.0.1:61000 \
  RUST_LOG=info \
  cargo run --manifest-path examples/cluster/Cargo.toml --bin client
</code></pre>
<h3 id="features-demonstrated"><a class="header" href="#features-demonstrated">Features Demonstrated</a></h3>
<ul>
<li>✅ <strong>Automatic Discovery</strong>: Workers join via SWIM gossip protocol</li>
<li>✅ <strong>Load Balancing</strong>: Uses <code>LeastConnections</code> strategy</li>
<li>✅ <strong>Health Checking</strong>: Phi Accrual failure detection</li>
<li>✅ <strong>Failover</strong>: Client handles worker failures gracefully</li>
<li>✅ <strong>Streaming</strong>: Server-side streaming responses</li>
<li>✅ <strong>Tag-Based Routing</strong>: Filter workers by role</li>
<li>✅ <strong>Cluster Events</strong>: Monitor node joined/left/failed</li>
</ul>
<h3 id="testing-scenarios"><a class="header" href="#testing-scenarios">Testing Scenarios</a></h3>
<p><strong>1. Normal Operation</strong>:</p>
<ul>
<li>Start director + 2 workers + client</li>
<li>Observe load distribution across workers</li>
<li>Watch streaming responses flow</li>
</ul>
<p><strong>2. Worker Failure</strong>:</p>
<pre><code class="language-bash"># Enable failure simulation
WORKER_FAILURE_ENABLED=true cargo run --bin worker
</code></pre>
<ul>
<li>Worker cycles through failures every ~18 seconds</li>
<li>Client detects failures and switches workers</li>
<li>Streaming continues with minimal interruption</li>
</ul>
<p><strong>3. Hard Kill</strong>:</p>
<ul>
<li>Press <code>Ctrl+C</code> on a worker</li>
<li>Director detects failure via gossip</li>
<li>Client fails over to remaining workers</li>
</ul>
<p><strong>4. Worker Restart</strong>:</p>
<ul>
<li>Restart killed worker</li>
<li>Automatic re-discovery and re-integration</li>
<li>Load distribution resumes</li>
</ul>
<h3 id="configuration-options-1"><a class="header" href="#configuration-options-1">Configuration Options</a></h3>
<p><strong>Director</strong>:</p>
<ul>
<li><code>DIRECTOR_ADDR</code> - Bind address (default: <code>127.0.0.1:61000</code>)</li>
<li><code>RUST_LOG</code> - Log level (e.g., <code>info</code>, <code>debug</code>)</li>
</ul>
<p><strong>Worker</strong>:</p>
<ul>
<li><code>WORKER_LABEL</code> - Worker identifier (default: <code>worker-1</code>)</li>
<li><code>WORKER_ADDR</code> - Bind address (default: <code>127.0.0.1:62001</code>)</li>
<li><code>DIRECTOR_ADDR</code> - Director address (default: <code>127.0.0.1:61000</code>)</li>
<li><code>WORKER_FAILURE_ENABLED</code> - Enable failure simulation (default: <code>false</code>)</li>
<li><code>RUST_LOG</code> - Log level</li>
</ul>
<p><strong>Client</strong>:</p>
<ul>
<li><code>DIRECTOR_ADDR</code> - Director address (default: <code>127.0.0.1:61000</code>)</li>
<li><code>RUST_LOG</code> - Log level</li>
</ul>
<h3 id="code-highlights"><a class="header" href="#code-highlights">Code Highlights</a></h3>
<p><strong>Worker Auto-Discovery</strong> (<code>worker.rs</code>):</p>
<pre><pre class="playground"><code class="language-rust"><span class="boring">#![allow(unused)]
</span><span class="boring">fn main() {
</span>// Join cluster
let cluster = server.enable_cluster(cluster_config).await?;
cluster.join(vec![director_addr.parse()?]).await?;

// Tag for discovery
cluster.set_tag("role", "worker");
cluster.set_tag("label", &amp;worker_label);
<span class="boring">}</span></code></pre></pre>
<p><strong>Load-Balanced Selection</strong> (<code>director.rs</code>):</p>
<pre><pre class="playground"><code class="language-rust"><span class="boring">#![allow(unused)]
</span><span class="boring">fn main() {
</span>// Create registry with load balancing
let registry = Arc::new(WorkerRegistry::new(
    cluster,
    LoadBalancingStrategy::LeastConnections
));

// Select worker automatically
let worker = registry.select_worker(Some("role=worker")).await?;
<span class="boring">}</span></code></pre></pre>
<p><strong>Client Failover</strong> (<code>client.rs</code>):</p>
<pre><pre class="playground"><code class="language-rust"><span class="boring">#![allow(unused)]
</span><span class="boring">fn main() {
</span>// Try worker
match worker_client.generate(request).await {
    Ok(stream) =&gt; {
        // Process stream
    }
    Err(e) =&gt; {
        // Worker failed - return to director for new assignment
        println!("Worker failed: {}", e);
        continue;
    }
}
<span class="boring">}</span></code></pre></pre>
<h2 id="running-examples-from-repository"><a class="header" href="#running-examples-from-repository">Running Examples from Repository</a></h2>
<h3 id="prerequisites-2"><a class="header" href="#prerequisites-2">Prerequisites</a></h3>
<ol>
<li><strong>Clone repository</strong>:</li>
</ol>
<pre><code class="language-bash">git clone https://github.com/yourusername/rpcnet.git
cd rpcnet
</code></pre>
<ol start="2">
<li><strong>Generate test certificates</strong>:</li>
</ol>
<pre><code class="language-bash">mkdir certs
cd certs
openssl req -x509 -newkey rsa:4096 -nodes \
  -keyout test_key.pem -out test_cert.pem \
  -days 365 -subj "/CN=localhost"
cd ..
</code></pre>
<ol start="3">
<li><strong>Install dependencies</strong>:</li>
</ol>
<pre><code class="language-bash">cargo build --examples
</code></pre>
<h3 id="run-specific-example"><a class="header" href="#run-specific-example">Run Specific Example</a></h3>
<pre><code class="language-bash"># Cluster example - run each command in its own terminal (see Quick Start)
cd examples/cluster
cargo run --bin director
cargo run --bin worker
cargo run --bin client
</code></pre>
<h2 id="creating-your-own-examples"><a class="header" href="#creating-your-own-examples">Creating Your Own Examples</a></h2>
<h3 id="basic-template"><a class="header" href="#basic-template">Basic Template</a></h3>
<pre><code class="language-toml"># examples/my_example/Cargo.toml
[package]
name = "my_example"
version = "0.1.0"
edition = "2021"

[dependencies]
rpcnet = { path = "../..", features = ["cluster"] }
tokio = { version = "1", features = ["full"] }
anyhow = "1"

[[bin]]
name = "server"
path = "src/bin/server.rs"

[[bin]]
name = "client"
path = "src/bin/client.rs"
</code></pre>
<h3 id="example-structure"><a class="header" href="#example-structure">Example Structure</a></h3>
<pre><code>examples/my_example/
├── Cargo.toml
├── README.md
├── my_service.rpc.rs          # RPC trait definition
├── src/
│   ├── lib.rs
│   ├── generated/             # Generated code
│   │   └── my_service.rs
│   └── bin/
│       ├── server.rs
│       └── client.rs
└── tests/
    └── integration_tests.rs
</code></pre>
<h3 id="generate-code-1"><a class="header" href="#generate-code-1">Generate Code</a></h3>
<pre><code class="language-bash">cd examples/my_example
rpcnet-gen --input my_service.rpc.rs --output src/generated
</code></pre>
<h3 id="document-your-example"><a class="header" href="#document-your-example">Document Your Example</a></h3>
<p>Create <code>examples/my_example/README.md</code>:</p>
<pre><code class="language-markdown"># My Example

Brief description of what this example demonstrates.

## Features

- Feature 1
- Feature 2

## Running

Terminal 1:
```bash
cargo run --bin server
```

Terminal 2:
```bash
cargo run --bin client
```

## Expected Output

...
</code></pre>
<h2 id="testing-examples"><a class="header" href="#testing-examples">Testing Examples</a></h2>
<h3 id="manual-testing"><a class="header" href="#manual-testing">Manual Testing</a></h3>
<pre><code class="language-bash"># Run example
cd examples/cluster
cargo run --bin director &amp;
cargo run --bin worker &amp;
cargo run --bin client

# Verify output
# Clean up
killall director worker
</code></pre>
<h3 id="integration-tests-1"><a class="header" href="#integration-tests-1">Integration Tests</a></h3>
<pre><code class="language-bash"># Run example's tests
cd examples/cluster
cargo test

# Run all example tests
cargo test --examples
</code></pre>
<h2 id="example-comparison"><a class="header" href="#example-comparison">Example Comparison</a></h2>
<div class="table-wrapper"><table><thead><tr><th>Example</th><th>Complexity</th><th>Features</th><th>Best For</th></tr></thead><tbody>
<tr><td><strong>cluster</strong></td><td>Intermediate</td><td>Discovery, Load Balancing, Failover, Streaming</td><td>Understanding distributed systems</td></tr>
</tbody></table>
</div>
<h2 id="common-issues"><a class="header" href="#common-issues">Common Issues</a></h2>
<h3 id="certificate-errors"><a class="header" href="#certificate-errors">Certificate Errors</a></h3>
<pre><code>Error: Certificate verification failed
</code></pre>
<p><strong>Solution</strong>: Ensure certificates exist in <code>certs/</code>:</p>
<pre><code class="language-bash">ls certs/test_cert.pem certs/test_key.pem
</code></pre>
<h3 id="port-already-in-use"><a class="header" href="#port-already-in-use">Port Already in Use</a></h3>
<pre><code>Error: Address already in use (os error 48)
</code></pre>
<p><strong>Solution</strong>: Kill existing processes or change port:</p>
<pre><code class="language-bash">lsof -ti:61000 | xargs kill
# or
DIRECTOR_ADDR=127.0.0.1:61001 cargo run --bin director
</code></pre>
<h3 id="workers-not-discovered"><a class="header" href="#workers-not-discovered">Workers Not Discovered</a></h3>
<pre><code>Error: No workers available
</code></pre>
<p><strong>Solution</strong>:</p>
<ol>
<li>Start director first (seed node)</li>
<li>Wait 2-3 seconds for gossip propagation</li>
<li>Check firewall allows UDP port 7946</li>
</ol>
<h2 id="contributing-examples"><a class="header" href="#contributing-examples">Contributing Examples</a></h2>
<p>Want to contribute an example? Great! Here's how:</p>
<ol>
<li><strong>Create example directory</strong>: <code>examples/your_example/</code></li>
<li><strong>Write code</strong>: Follow structure above</li>
<li><strong>Test thoroughly</strong>: Include integration tests</li>
<li><strong>Document well</strong>: Clear README with running instructions</li>
<li><strong>Submit PR</strong>: Include example in this index</li>
</ol>
<p><strong>Good example ideas</strong>:</p>
<ul>
<li>Basic client-server RPC</li>
<li>Bidirectional streaming</li>
<li>Multi-region deployment</li>
<li>Custom load balancing strategy</li>
<li>Monitoring and metrics integration</li>
</ul>
<h2 id="next-steps-11"><a class="header" href="#next-steps-11">Next Steps</a></h2>
<ul>
<li><strong><a href="reference/../cluster/tutorial.html">Cluster Tutorial</a></strong> - Build cluster from scratch</li>
<li><strong><a href="reference/api.html">API Reference</a></strong> - API documentation</li>
<li><strong><a href="https://github.com/yourusername/rpcnet">GitHub Repository</a></strong> - Browse all examples</li>
</ul>
<h2 id="video-walkthroughs"><a class="header" href="#video-walkthroughs">Video Walkthroughs</a></h2>
<p>Coming soon! Video walkthroughs demonstrating:</p>
<ul>
<li>Running the cluster example</li>
<li>Testing failure scenarios</li>
<li>Building your own example</li>
</ul>

                    </main>

                    <nav class="nav-wrapper" aria-label="Page navigation">
                        <!-- Mobile navigation buttons -->


                        <div style="clear: both"></div>
                    </nav>
                </div>
            </div>

            <nav class="nav-wide-wrapper" aria-label="Page navigation">

            </nav>

        </div>

        <!-- Livereload script (if served using the cli tool) -->
        <script>
            // Livereload client: opens a WebSocket back to the host that served
            // this page and reloads the page when a "reload" message arrives.
            // Use the secure WebSocket scheme when the page itself came over HTTPS.
            const wsProtocol = location.protocol === 'https:' ? 'wss:' : 'ws:';
            const wsAddress = `${wsProtocol}//${location.host}/__livereload`;
            const socket = new WebSocket(wsAddress);

            socket.onmessage = (event) => {
                // Ignore every message except the reload signal.
                if (event.data !== "reload") {
                    return;
                }
                socket.close();
                location.reload();
            };

            // Close the socket when the page is about to unload.
            window.onbeforeunload = () => {
                socket.close();
            };
        </script>



        <script>
            // Global flag set before the book scripts below are loaded.
            // NOTE(review): presumably read by book.js to enable copy buttons on
            // playground code blocks — confirm against book.js.
            window.playground_copyable = true;
        </script>


        <script src="elasticlunr.min.js"></script>
        <script src="mark.min.js"></script>
        <script src="searcher.js"></script>

        <script src="clipboard.min.js"></script>
        <script src="highlight.js"></script>
        <script src="book.js"></script>

        <!-- Custom JS scripts -->

        <script>
        // Once the page has finished loading, wait 100 ms and then call
        // window.print to open the print dialog.
        window.addEventListener('load', () => {
            window.setTimeout(window.print, 100);
        });
        </script>


    </div>
    </body>
</html>