1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
// SPDX-License-Identifier: Apache-2.0

use crate::{
    counter::{Counter, Saturating},
    ensure, event,
    event::{builder::MtuUpdatedCause, IntoEvent},
    frame,
    inet::SocketAddress,
    packet::number::PacketNumber,
    path,
    path::{MaxMtu, IPV4_MIN_HEADER_LEN, IPV6_MIN_HEADER_LEN, MINIMUM_MTU, UDP_HEADER_LEN},
    recovery::{congestion_controller, CongestionController},
    time::{timer, Timer, Timestamp},
    transmission,
};
use core::time::Duration;
use s2n_codec::EncoderValue;

#[cfg(test)]
mod tests;

#[cfg(any(test, feature = "testing"))]
pub mod testing {
    use super::*;
    use crate::inet::{IpV4Address, SocketAddressV4};

    /// Creates a new mtu::Controller with an IPv4 address and the given `max_mtu`
    pub fn new_controller(max_mtu: u16) -> Controller {
        let ip = IpV4Address::new([127, 0, 0, 1]);
        let addr = SocketAddress::IpV4(SocketAddressV4::new(ip, 443));
        Controller::new(max_mtu.try_into().unwrap(), &addr)
    }

    /// Creates a new mtu::Controller with the given mtu and probed size
    pub fn test_controller(mtu: u16, probed_size: u16) -> Controller {
        let mut controller = new_controller(u16::max_value());
        controller.plpmtu = mtu;
        controller.probed_size = probed_size;
        controller
    }
}

#[derive(Clone, Debug, PartialEq, Eq)]
enum State {
    //= https://www.rfc-editor.org/rfc/rfc8899#section-5.2
    //# The DISABLED state is the initial state before probing has started.
    Disabled,
    /// SEARCH_REQUESTED is used to indicate a probe packet has been requested
    /// to be transmitted, but has not been transmitted yet.
    SearchRequested,
    //= https://www.rfc-editor.org/rfc/rfc8899#section-5.2
    //# The SEARCHING state is the main probing state.
    Searching(PacketNumber, Timestamp),
    //= https://www.rfc-editor.org/rfc/rfc8899#section-5.2
    //# The SEARCH_COMPLETE state indicates that a search has completed.
    SearchComplete,
}

//= https://www.rfc-editor.org/rfc/rfc9000#section-14.3
//# Endpoints SHOULD set the initial value of BASE_PLPMTU (Section 5.1 of
//# [DPLPMTUD]) to be consistent with QUIC's smallest allowed maximum
//# datagram size.

//= https://www.rfc-editor.org/rfc/rfc8899#section-5.1.2
//# When using IPv4, there is no currently equivalent size specified,
//# and a default BASE_PLPMTU of 1200 bytes is RECOMMENDED.
const BASE_PLPMTU: u16 = MINIMUM_MTU;

//= https://www.rfc-editor.org/rfc/rfc8899#section-5.1.2
//# The MAX_PROBES is the maximum value of the PROBE_COUNT
//# counter (see Section 5.1.3).  MAX_PROBES represents the limit for
//# the number of consecutive probe attempts of any size.  Search
//# algorithms benefit from a MAX_PROBES value greater than 1 because
//# this can provide robustness to isolated packet loss.  The default
//# value of MAX_PROBES is 3.
const MAX_PROBES: u8 = 3;

/// The minimum length of the data field of a packet sent over an
/// Ethernet is 1500 octets, thus the maximum length of an IP datagram
/// sent over an Ethernet is 1500 octets.
/// See https://www.rfc-editor.org/rfc/rfc894.txt
const ETHERNET_MTU: u16 = 1500;

/// If the next value to probe is within the PROBE_THRESHOLD bytes of
/// the current Path MTU, probing will be considered complete.
const PROBE_THRESHOLD: u16 = 20;

/// When the black_hole_counter exceeds this threshold, on_black_hole_detected will be
/// called to reduce the MTU to the BASE_PLPMTU. The black_hole_counter is incremented when
/// a burst of consecutive packets is lost that starts with a packet that is:
///      1) not an MTU probe
///      2) larger than the BASE_PLPMTU
///      3) sent after the largest MTU-sized acknowledged packet number
/// This is a possible indication that the path cannot support the MTU that was previously confirmed.
const BLACK_HOLE_THRESHOLD: u8 = 3;

/// After a black hole has been detected, the mtu::Controller will wait this duration
/// before probing for a larger MTU again.
const BLACK_HOLE_COOL_OFF_DURATION: Duration = Duration::from_secs(60);

//= https://www.rfc-editor.org/rfc/rfc8899#section-5.1.1
//# The PMTU_RAISE_TIMER is configured to the period a
//# sender will continue to use the current PLPMTU, after which it
//# reenters the Search Phase.  This timer has a period of 600
//# seconds, as recommended by PLPMTUD [RFC4821].
const PMTU_RAISE_TIMER_DURATION: Duration = Duration::from_secs(600);

#[derive(Clone, Debug)]
pub struct Controller {
    state: State,
    //= https://www.rfc-editor.org/rfc/rfc8899#section-2
    //# The Packetization Layer PMTU is an estimate of the largest size
    //# of PL datagram that can be sent by a path, controlled by PLPMTUD
    plpmtu: u16,
    /// The maximum size any packet can reach
    max_mtu: MaxMtu,
    /// The maximum size the UDP payload can reach for any probe packet.
    max_udp_payload: u16,
    //= https://www.rfc-editor.org/rfc/rfc8899#section-5.1.3
    //# The PROBED_SIZE is the size of the current probe packet
    //# as determined at the PL.  This is a tentative value for the
    //# PLPMTU, which is awaiting confirmation by an acknowledgment.
    probed_size: u16,
    /// The maximum size datagram to probe for. In contrast to the max_udp_payload,
    /// this value will decrease if probes are not acknowledged.
    max_probe_size: u16,
    //= https://www.rfc-editor.org/rfc/rfc8899#section-5.1.3
    //# The PROBE_COUNT is a count of the number of successive
    //# unsuccessful probe packets that have been sent.
    probe_count: u8,
    /// A count of the number of packets with a size > MINIMUM_MTU lost since
    /// the last time a packet with size equal to the current MTU was acknowledged.
    black_hole_counter: Counter<u8, Saturating>,
    /// The largest acknowledged packet with size >= the plpmtu. Used when tracking
    /// packets that have been lost for the purpose of detecting a black hole.
    largest_acked_mtu_sized_packet: Option<PacketNumber>,
    //= https://www.rfc-editor.org/rfc/rfc8899#section-5.1.1
    //# The PMTU_RAISE_TIMER is configured to the period a
    //# sender will continue to use the current PLPMTU, after which it
    //# reenters the Search Phase.
    pmtu_raise_timer: Timer,
}

impl Controller {
    /// Construct a new mtu::Controller with the given `max_mtu` and `peer_socket_address`
    ///
    /// The UDP header length and IP header length will be subtracted from `max_mtu` to
    /// determine the max_udp_payload used for limiting the payload length of probe packets.
    /// max_mtu is the maximum allowed mtu, e.g. for jumbo frames this value is expected to
    /// be over 9000.
    #[inline]
    pub fn new(max_mtu: MaxMtu, peer_socket_address: &SocketAddress) -> Self {
        let min_ip_header_len = match peer_socket_address {
            SocketAddress::IpV4(_) => IPV4_MIN_HEADER_LEN,
            SocketAddress::IpV6(_) => IPV6_MIN_HEADER_LEN,
        };
        let max_udp_payload =
            (u16::from(max_mtu) - UDP_HEADER_LEN - min_ip_header_len).max(BASE_PLPMTU);

        // The UDP payload size for the most likely MTU is based on standard Ethernet MTU minus
        // the minimum length IP headers (without IPv4 options or IPv6 extensions) and UPD header
        let initial_probed_size =
            (ETHERNET_MTU - UDP_HEADER_LEN - min_ip_header_len).min(max_udp_payload);

        Self {
            state: State::Disabled,
            plpmtu: BASE_PLPMTU,
            probed_size: initial_probed_size,
            max_mtu,
            max_udp_payload,
            max_probe_size: max_udp_payload,
            probe_count: 0,
            black_hole_counter: Default::default(),
            largest_acked_mtu_sized_packet: None,
            pmtu_raise_timer: Timer::default(),
        }
    }

    /// Enable path MTU probing
    #[inline]
    pub fn enable(&mut self) {
        // ensure we haven't already enabled the controller
        ensure!(self.state == State::Disabled);

        // TODO: Look up current MTU in a cache. If there is a cache hit
        //       move directly to SearchComplete and arm the PMTU raise timer.
        //       Otherwise, start searching for a larger PMTU immediately
        self.request_new_search(None);
    }

    /// Called when the connection timer expires
    #[inline]
    pub fn on_timeout(&mut self, now: Timestamp) {
        ensure!(self.pmtu_raise_timer.poll_expiration(now).is_ready());
        self.request_new_search(None);
    }

    //= https://www.rfc-editor.org/rfc/rfc8899#section-4.2
    //# When
    //# supported, this mechanism MAY also be used by DPLPMTUD to acknowledge
    //# reception of a probe packet.
    /// This method gets called when a packet delivery got acknowledged
    #[inline]
    pub fn on_packet_ack<CC: CongestionController, Pub: event::ConnectionPublisher>(
        &mut self,
        packet_number: PacketNumber,
        sent_bytes: u16,
        congestion_controller: &mut CC,
        path_id: path::Id,
        publisher: &mut Pub,
    ) {
        // no need to process anything in the disabled state
        ensure!(self.state != State::Disabled);

        // MTU probes are only sent in application data space
        ensure!(packet_number.space().is_application_data());

        if sent_bytes >= self.plpmtu
            && self
                .largest_acked_mtu_sized_packet
                .map_or(true, |pn| packet_number > pn)
        {
            // Reset the black hole counter since a packet the size of the current MTU or larger
            // has been acknowledged, indicating the path can still support the current MTU
            self.black_hole_counter = Default::default();
            self.largest_acked_mtu_sized_packet = Some(packet_number);
        }

        if let State::Searching(probe_packet_number, transmit_time) = self.state {
            if packet_number == probe_packet_number {
                self.plpmtu = self.probed_size;
                // A new MTU has been confirmed, notify the congestion controller
                congestion_controller.on_mtu_update(
                    self.plpmtu,
                    &mut congestion_controller::PathPublisher::new(publisher, path_id),
                );

                publisher.on_mtu_updated(event::builder::MtuUpdated {
                    path_id: path_id.into_event(),
                    mtu: self.plpmtu,
                    cause: MtuUpdatedCause::ProbeAcknowledged,
                });

                self.update_probed_size();

                //= https://www.rfc-editor.org/rfc/rfc8899#section-8
                //# To avoid excessive load, the interval between individual probe
                //# packets MUST be at least one RTT, and the interval between rounds of
                //# probing is determined by the PMTU_RAISE_TIMER.

                // Subsequent probe packets are sent based on the round trip transmission and
                // acknowledgement/loss of a packet, so the interval will be at least 1 RTT.
                self.request_new_search(Some(transmit_time));
            }
        }
    }

    //= https://www.rfc-editor.org/rfc/rfc8899#section-3
    //# The PL is REQUIRED to be
    //# robust in the case where probe packets are lost due to other
    //# reasons (including link transmission error, congestion).
    /// This method gets called when a packet loss is reported
    #[allow(clippy::too_many_arguments)]
    #[inline]
    pub fn on_packet_loss<CC: CongestionController, Pub: event::ConnectionPublisher>(
        &mut self,
        packet_number: PacketNumber,
        lost_bytes: u16,
        new_loss_burst: bool,
        now: Timestamp,
        congestion_controller: &mut CC,
        path_id: path::Id,
        publisher: &mut Pub,
    ) {
        // MTU probes are only sent in application data space
        ensure!(packet_number.space().is_application_data());

        match &self.state {
            State::Disabled => {}
            State::Searching(probe_pn, _) if *probe_pn == packet_number => {
                // The MTU probe was lost
                if self.probe_count == MAX_PROBES {
                    // We've sent MAX_PROBES without acknowledgement, so
                    // attempt a smaller probe size
                    self.max_probe_size = self.probed_size;
                    self.update_probed_size();
                    self.request_new_search(None);
                } else {
                    // Try the same probe size again
                    self.state = State::SearchRequested
                }
            }
            State::Searching(_, _) | State::SearchComplete | State::SearchRequested => {
                if (BASE_PLPMTU + 1..=self.plpmtu).contains(&lost_bytes)
                    && self
                        .largest_acked_mtu_sized_packet
                        .map_or(true, |pn| packet_number > pn)
                    && new_loss_burst
                {
                    // A non-probe packet larger than the BASE_PLPMTU that was sent after the last
                    // acknowledged MTU-sized packet has been lost
                    self.black_hole_counter += 1;
                }

                if self.black_hole_counter > BLACK_HOLE_THRESHOLD {
                    self.on_black_hole_detected(now, congestion_controller, path_id, publisher);
                }
            }
        }
    }

    /// Gets the currently validated maximum transmission unit, not including IP or UDP header len
    #[inline]
    pub fn mtu(&self) -> usize {
        self.plpmtu as usize
    }

    /// Returns the maximum size any packet can reach
    #[inline]
    pub fn max_mtu(&self) -> MaxMtu {
        self.max_mtu
    }

    /// Gets the MTU currently being probed for
    #[inline]
    pub fn probed_sized(&self) -> usize {
        self.probed_size as usize
    }

    /// Sets `probed_size` to the next MTU size to probe for based on a binary search
    #[inline]
    fn update_probed_size(&mut self) {
        //= https://www.rfc-editor.org/rfc/rfc8899#section-5.3.2
        //# Implementations SHOULD select the set of probe packet sizes to
        //# maximize the gain in PLPMTU from each search step.
        self.probed_size = self.plpmtu + ((self.max_probe_size - self.plpmtu) / 2);
    }

    /// Requests a new search to be initiated
    ///
    /// If `last_probe_time` is supplied, the PMTU Raise Timer will be armed as
    /// necessary if the probed_size is already within the PROBE_THRESHOLD
    /// of the current PLPMTU
    #[inline]
    fn request_new_search(&mut self, last_probe_time: Option<Timestamp>) {
        if self.probed_size - self.plpmtu >= PROBE_THRESHOLD {
            self.probe_count = 0;
            self.state = State::SearchRequested;
        } else {
            // The next probe size is within the threshold of the current MTU
            // so its not worth additional probing.
            self.state = State::SearchComplete;

            if let Some(last_probe_time) = last_probe_time {
                self.arm_pmtu_raise_timer(last_probe_time + PMTU_RAISE_TIMER_DURATION);
            }
        }
    }

    /// Called when an excessive number of packets larger than the BASE_PLPMTU have been lost
    #[inline]
    fn on_black_hole_detected<CC: CongestionController, Pub: event::ConnectionPublisher>(
        &mut self,
        now: Timestamp,
        congestion_controller: &mut CC,
        path_id: path::Id,
        publisher: &mut Pub,
    ) {
        self.black_hole_counter = Default::default();
        self.largest_acked_mtu_sized_packet = None;
        // Reset the plpmtu back to the BASE_PLPMTU and notify the congestion controller
        self.plpmtu = BASE_PLPMTU;
        congestion_controller.on_mtu_update(
            BASE_PLPMTU,
            &mut congestion_controller::PathPublisher::new(publisher, path_id),
        );
        // Cancel any current probes
        self.state = State::SearchComplete;
        // Arm the PMTU raise timer to try a larger MTU again after a cooling off period
        self.arm_pmtu_raise_timer(now + BLACK_HOLE_COOL_OFF_DURATION);

        publisher.on_mtu_updated(event::builder::MtuUpdated {
            path_id: path_id.into_event(),
            mtu: self.plpmtu,
            cause: MtuUpdatedCause::Blackhole,
        })
    }

    /// Arm the PMTU Raise Timer if there is still room to increase the
    /// MTU before hitting the max plpmtu
    #[inline]
    fn arm_pmtu_raise_timer(&mut self, timestamp: Timestamp) {
        // Reset the max_probe_size to the max_udp_payload to allow for larger probe sizes
        self.max_probe_size = self.max_udp_payload;
        self.update_probed_size();

        if self.probed_size - self.plpmtu >= PROBE_THRESHOLD {
            // There is still some room to try a larger MTU again,
            // so arm the pmtu raise timer
            self.pmtu_raise_timer.set(timestamp);
        }
    }
}

impl timer::Provider for Controller {
    #[inline]
    fn timers<Q: timer::Query>(&self, query: &mut Q) -> timer::Result {
        self.pmtu_raise_timer.timers(query)?;

        Ok(())
    }
}

impl transmission::Provider for Controller {
    /// Queries the component for any outgoing frames that need to get sent
    ///
    /// This method assumes that no other data (other than the packet header) has been written
    /// to the supplied `WriteContext`. This necessitates the caller ensuring the probe packet
    /// written by this method to be in its own connection transmission.
    #[inline]
    fn on_transmit<W: transmission::Writer>(&mut self, context: &mut W) {
        //= https://www.rfc-editor.org/rfc/rfc8899#section-5.2
        //# When used with an acknowledged PL (e.g., SCTP), DPLPMTUD SHOULD NOT continue to
        //# generate PLPMTU probes in this state.
        ensure!(self.state == State::SearchRequested);

        ensure!(context.transmission_mode().is_mtu_probing());

        // Each packet contains overhead in the form of a packet header and an authentication tag.
        // This overhead contributes to the overall size of the packet, so the payload we write
        // to the packet will account for this overhead to reach the target probed size.
        let probe_payload_size =
            self.probed_size as usize - context.header_len() - context.tag_len();

        if context.remaining_capacity() < probe_payload_size {
            // There isn't enough capacity in the buffer to write the datagram we
            // want to probe, so we've reached the maximum pmtu and the search is complete.
            self.state = State::SearchComplete;
            return;
        }

        //= https://www.rfc-editor.org/rfc/rfc9000#section-14.4
        //# Endpoints could limit the content of PMTU probes to PING and PADDING
        //# frames, since packets that are larger than the current maximum
        //# datagram size are more likely to be dropped by the network.

        //= https://www.rfc-editor.org/rfc/rfc8899#section-3
        //# Probe loss recovery: It is RECOMMENDED to use probe packets that
        //# do not carry any user data that would require retransmission if
        //# lost.

        //= https://www.rfc-editor.org/rfc/rfc8899#section-4.1
        //# DPLPMTUD MAY choose to use only one of these methods to simplify the
        //# implementation.

        context.write_frame(&frame::Ping);
        let padding_size = probe_payload_size - frame::Ping.encoding_size();
        if let Some(packet_number) = context.write_frame(&frame::Padding {
            length: padding_size,
        }) {
            self.probe_count += 1;
            self.state = State::Searching(packet_number, context.current_time());
        }
    }
}

impl transmission::interest::Provider for Controller {
    #[inline]
    fn transmission_interest<Q: transmission::interest::Query>(
        &self,
        query: &mut Q,
    ) -> transmission::interest::Result {
        match self.state {
            State::SearchRequested => query.on_new_data(),
            _ => Ok(()),
        }
    }
}