1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at https://mozilla.org/MPL/2.0/. */
use core::{
    num::{NonZeroU8, NonZeroUsize},
    time::Duration,
};

/// A Config specifies the parameters Foca will use for the SWIM
/// protocol.
#[derive(Clone, Debug)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
pub struct Config {
    /// Specifies how often a random member will be probed for activity.
    ///
    /// At the end of this period, if the member didn't reply (directly
    /// or indirectly, see [`crate::Message`]) it's declared
    /// [`crate::State::Suspect`].
    ///
    /// Should be strictly larger than [`Self::probe_rtt`]. Preferably more
    /// than twice its value since we need to wait for the indirect ping cycle.
    /// If unsure, err on the safe side with `probe_rtt * 3` and tune
    /// later.
    ///
    /// Must not be zero.
    pub probe_period: Duration,

    /// How long to wait for a direct reply to a probe before starting
    /// the indirect probing cycle.
    ///
    /// It should be set to a value that describes well your transport
    /// round-trip time. A reasonable value would be a high quantile
    /// (p99, for example) of your cluster-wide `ICMP PING` RTT.
    ///
    /// Must be strictly smaller than [`Self::probe_period`].
    ///
    /// Must not be zero.
    pub probe_rtt: Duration,

    /// How many members will be asked to perform an indirect ping
    /// in case the probed member takes too long to reply.
    ///
    /// This doesn't need to be a large number: we're essentially
    /// fanning out to ensure a message actually reaches the original
    /// ping target in case of poor transmission quality or weird
    /// partitions.
    ///
    /// Setting this to 3-5 should be more than enough for a "modern"
    /// network.
    pub num_indirect_probes: NonZeroUsize,

    /// Specifies how many times a single update/broadcast will be sent
    /// along with a normal message.
    ///
    /// A high value trades off bandwidth for higher chances of fully
    /// disseminating broadcasts throughout the cluster.
    ///
    /// Reasonable values range from 5, for small clusters to 15 for
    /// *very* large clusters.
    pub max_transmissions: NonZeroU8,

    /// How long a Suspect member is considered active before being
    /// declared Down.
    ///
    /// Here you want to give time for the member to realize it has
    /// been declared Suspect and notify the cluster that its actually
    /// active.
    ///
    /// Higher values give more time for a member to recover from a
    /// false suspicion, but slows down detection of a failed state.
    ///
    /// Very application-dependent. Smaller clusters likely want
    /// this value to be a small multiplier of [`Self::probe_period`]
    /// whereas large clusters can easily tolerate several seconds of
    /// wait.
    ///
    /// Must not be zero.
    pub suspect_to_down_after: Duration,

    /// Governs how long Foca will remember an identity as being
    /// Down.
    ///
    /// It's recommended to have very high values here, in the order
    /// of hours or longer.
    ///
    /// Identities that opt-in on auto-rejoining don't need to worry
    /// about this value being high: this setting only prevents nodes
    /// using the exact same identity from joining the cluster.
    ///
    /// If you choose to use a small duration here (maybe you can't
    /// enable auto-rejoin, maybe you'd like to reduce the (small)
    /// increase in memory usage that a high value here may lead to),
    /// keep a close eye on [`crate::Foca::updates_backlog`]: large
    /// numbers higher than the number of nodes in the cluster are a
    /// strong sign that this configuration should be set to a higher
    /// value.
    ///
    /// See [`crate::Identity::renew`].
    pub remove_down_after: Duration,

    /// The maximum packet size Foca will produce AND consume.
    ///
    /// This is transport-dependent. The main goal is reducing
    /// fragmentation and congestion.
    ///
    /// If using UDP as a transport, use `rfc8085` guidelines and stick
    /// to a value smaller than your network's MTU.  1400 is a good
    /// value for a in a non-ancient network.
    pub max_packet_size: NonZeroUsize,

    /// Wether foca should try to let members that are down know about it
    ///
    /// Whenever a member is declared down by the cluster, their messages
    /// get ignored and there's no way for them to learn that this is happening.
    /// With this setting enabled, will try to notify the down member that
    /// their messages are being discarded.
    ///
    /// This feature is an extension to the SWIM protocol and should be left
    /// disabled if you're aiming at pure SWIM behavior.
    pub notify_down_members: bool,

    /// How often should foca ask its peers for more peers
    ///
    /// [`crate::Message::Announce`] is the mechanism foca uses to discover
    /// members. After joining a sizeable cluster, it may take a while until
    /// foca discovers every active member. Periodically announcing helps speed
    /// up this process.
    ///
    /// This setting is helpful for any cluster size, but smaller ones can
    /// get by without it if discovering the full active roster quickly is
    /// not a requirement.
    ///
    /// As a rule of thumb, use large values for `frequency` (say, every
    /// 30s, every minute, etc) and small values for `num_members`: just
    /// one might be good enough for many clusters.
    ///
    /// Whilst you can change the parameters at runtime, foca prevents you from
    /// changing it from `None` to `Some` to simplify reasoning. It's required
    /// to recreate your foca instance in these cases.
    ///
    /// This feature is an extension to the SWIM protocol and should be left
    /// disabled if you're aiming at pure SWIM behavior.
    pub periodic_announce: Option<PeriodicParams>,

    /// How often should foca send an announce message to members it currently
    /// considers [`crate::State::Down`]
    ///
    /// This setting instructs foca to try and talk to members that are down
    /// so that it can (eventually) recover from network partitions without
    /// additional hand-holding.
    ///
    /// It's particularly useful when used in tandem with identities that
    /// can auto-rejoin (`crate::Identity::renew`) and with
    /// `Self::notify_down_members` enabled.
    ///
    /// This feature is an extension to the SWIM protocol and should be left
    /// disabled if you're aiming at pure SWIM behavior.
    pub periodic_announce_to_down_members: Option<PeriodicParams>,

    /// How often should foca send cluster updates to peers
    ///
    /// By default, SWIM disseminates cluster updates during the direct and
    /// indirect probe cycle (See [`crate::Message`]). This setting instructs
    /// foca to also propagate updates periodically.
    ///
    /// Periodically gossiping influences the speed in which the cluster learns
    /// new information, but gossiping too much is often unnecessary since
    /// cluster changes are not (normally) high-rate events.
    ///
    /// A LAN cluster can afford high frequency gossiping (every 200ms, for example)
    /// without much worry; A WAN cluster might have better results gossiping less
    /// often (500ms) but to more members at once.
    ///
    /// Whilst you can change the parameters at runtime, foca prevents you from
    /// changing it from `None` to `Some` to simplify reasoning. It's required
    /// to recreate your foca instance in these cases.
    ///
    /// This feature is an extension to the SWIM protocol and should be left
    /// disabled if you're aiming at pure SWIM behavior.
    pub periodic_gossip: Option<PeriodicParams>,
}

/// Configuration for a task that should happen periodically
#[derive(Clone, Debug)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
pub struct PeriodicParams {
    /// How often should the task be performed
    pub frequency: Duration,
    /// How many random members should be chosen
    pub num_members: NonZeroUsize,
}

impl Config {
    /// A simple configuration that should work well in a LAN scenario.
    ///
    /// This is Foca in its simplest form and has no extensions enabled.
    /// Use this config if you are trying to get a grasp of how SWIM
    /// works, without any additional behavior.
    pub fn simple() -> Self {
        Self {
            probe_period: Duration::from_millis(1500),
            probe_rtt: Duration::from_millis(500),
            num_indirect_probes: NonZeroUsize::new(3).unwrap(),

            max_transmissions: NonZeroU8::new(10).unwrap(),

            suspect_to_down_after: Duration::from_secs(3),
            remove_down_after: DEFAULT_REMOVE_DOWN_AFTER,

            max_packet_size: NonZeroUsize::new(1400).unwrap(),

            notify_down_members: false,

            periodic_announce: None,
            periodic_announce_to_down_members: None,
            periodic_gossip: None,
        }
    }
}

#[cfg(feature = "std")]
use core::num::NonZeroU32;

#[cfg(feature = "std")]
impl Config {
    /// Generate a configuration for a LAN cluster given an expected
    /// total number of active members.
    ///
    /// The `cluster_size` parameter is used to define how many times updates
    /// are disseminated ([`Config::max_transmissions`]) and how long Foca
    /// will wait before declaring a suspected member as down
    /// ([`Config::suspect_to_down_after`]).
    ///
    /// Settings derived from [memberlist's DefaultLanConfig][dlc].
    ///
    /// [dlc]: https://pkg.go.dev/github.com/hashicorp/memberlist#DefaultLANConfig
    pub fn new_lan(cluster_size: NonZeroU32) -> Self {
        let period = Duration::from_secs(1);

        Self {
            probe_period: period,
            probe_rtt: Duration::from_millis(500),
            num_indirect_probes: NonZeroUsize::new(3).unwrap(),

            max_transmissions: Self::compute_max_tx(cluster_size),

            suspect_to_down_after: Self::suspicion_duration(cluster_size, period, 4.0),
            remove_down_after: DEFAULT_REMOVE_DOWN_AFTER,

            max_packet_size: NonZeroUsize::new(1400).unwrap(),

            notify_down_members: true,

            periodic_announce: Some(PeriodicParams {
                frequency: Duration::from_secs(30),
                num_members: NonZeroUsize::new(1).unwrap(),
            }),

            periodic_announce_to_down_members: Some(PeriodicParams {
                frequency: Duration::from_secs(65),
                num_members: NonZeroUsize::new(2).unwrap(),
            }),

            periodic_gossip: Some(PeriodicParams {
                frequency: Duration::from_millis(200),
                num_members: NonZeroUsize::new(3).unwrap(),
            }),
        }
    }

    /// Generate a configuration for a WAN cluster given an expected
    /// total number of active members.
    ///
    /// Settings derived from [memberlist's DefaultWanConfig][dwc].
    ///
    /// [dwc]: https://pkg.go.dev/github.com/hashicorp/memberlist#DefaultWANConfig
    ///
    /// See [`Config::new_lan`].
    pub fn new_wan(cluster_size: NonZeroU32) -> Self {
        let period = Duration::from_secs(5);

        Self {
            probe_period: period,
            probe_rtt: Duration::from_secs(3),
            num_indirect_probes: NonZeroUsize::new(3).unwrap(),

            max_transmissions: Self::compute_max_tx(cluster_size),

            suspect_to_down_after: Self::suspicion_duration(cluster_size, period, 6.0),
            remove_down_after: DEFAULT_REMOVE_DOWN_AFTER,

            max_packet_size: NonZeroUsize::new(1400).unwrap(),

            notify_down_members: true,

            periodic_announce: Some(PeriodicParams {
                frequency: Duration::from_secs(60),
                num_members: NonZeroUsize::new(2).unwrap(),
            }),

            periodic_announce_to_down_members: Some(PeriodicParams {
                frequency: Duration::from_secs(125),
                num_members: NonZeroUsize::new(3).unwrap(),
            }),

            periodic_gossip: Some(PeriodicParams {
                frequency: Duration::from_millis(500),
                num_members: NonZeroUsize::new(4).unwrap(),
            }),
        }
    }

    fn compute_max_tx(cluster_size: NonZeroU32) -> NonZeroU8 {
        let multiplier = 4.0f64;
        let max_tx = f64::from(cluster_size.get().saturating_add(1)).log10() * multiplier;
        // XXX over-zealous: `multiplier` is not exposed; 4.0 guarantees it doesn't end up here
        if max_tx <= 1.0 {
            NonZeroU8::new(1).unwrap()
        } else if max_tx >= 255.0 {
            NonZeroU8::new(core::u8::MAX).unwrap()
        } else {
            NonZeroU8::new(max_tx as u8).expect("f64 ]1,255[ as u8 is non-zero")
        }
    }

    fn suspicion_duration(
        cluster_size: NonZeroU32,
        probe_period: Duration,
        multiplier: f64,
    ) -> Duration {
        let secs = f64::max(1.0, f64::from(cluster_size.get()).log10())
            * multiplier
            * probe_period.as_secs_f64();

        // XXX `Duration::from_secs_f64` is panicky, but:
        //  - multiplier is either 4 or 6
        //  - probe_period is either 1 or 5 secs
        // So we know `secs` is finite, greater than zero and won't overflow
        Duration::from_secs_f64(secs)
    }
}

const DEFAULT_REMOVE_DOWN_AFTER: Duration = Duration::from_secs(60 * 60 * 24); // 24h

#[cfg(test)]
mod tests {

    #[test]
    #[cfg(feature = "std")]
    fn suspicion_scales_slowly() {
        use super::*;

        let probe_period = Duration::from_secs(1);
        let mult = 4.0;

        assert_eq!(
            Duration::from_secs(4),
            Config::suspicion_duration(NonZeroU32::new(5).unwrap(), probe_period, mult)
        );

        assert_eq!(
            Duration::from_secs(4),
            Config::suspicion_duration(NonZeroU32::new(10).unwrap(), probe_period, mult)
        );

        assert_eq!(
            Duration::from_secs(8),
            Config::suspicion_duration(NonZeroU32::new(100).unwrap(), probe_period, mult)
        );
    }
}