mountpoint-s3-fs 0.9.3

Mountpoint S3 main library
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
use std::ops::Range;
use std::sync::Arc;

use async_channel::{Receiver, RecvError, Sender, unbounded};
use humansize::make_format;
use tracing::trace;

use crate::mem_limiter::{BufferArea, MemoryLimiter};

use super::PrefetchReadError;

/// Feedback events sent by a stream consumer (via [BackpressureController::send_feedback])
/// to drive read window advancement and scaling on the producer side.
#[derive(Debug)]
pub enum BackpressureFeedbackEvent {
    /// An event where data with a certain length has been read out of the stream
    DataRead { offset: u64, length: usize },
    /// An event indicating part queue stall
    PartQueueStall,
}

/// Read window alignment config.
#[derive(Copy, Clone, Debug)]
pub enum ReadWindowAlignmentConfig {
    /// Align only offsets starting from `from_offset` (and relative to it) to multiple of part_size.
    AlignToPartSize { from_offset: u64, part_size: u64 },
    /// No alignment.
    Disable,
}

impl ReadWindowAlignmentConfig {
    /// Round `read_window_end` up to the next part boundary, measured relative to
    /// `AlignToPartSize::from_offset`. With alignment disabled, or when the offset does
    /// not lie past `from_offset`, the value is returned untouched.
    ///
    /// Aligning read window end offsets to part boundaries improves the accuracy of
    /// reserved-memory tracking.
    ///
    /// Note: window excludes the last byte, denoted by `read_window_end`.
    fn align(&self, read_window_end: u64) -> u64 {
        let Self::AlignToPartSize { from_offset, part_size } = *self else {
            // Alignment disabled: pass the offset through unchanged.
            return read_window_end;
        };
        if read_window_end <= from_offset {
            // The window has not reached the alignment origin yet; nothing to round.
            return read_window_end;
        }
        from_offset + (read_window_end - from_offset).next_multiple_of(part_size)
    }
}

/// Configuration for a [BackpressureController]/[BackpressureLimiter] pair created by
/// [new_backpressure_controller].
pub struct BackpressureConfig {
    /// Backpressure's initial read window size
    pub initial_read_window_size: usize,
    /// Minimum read window size that the backpressure controller is allowed to scale down to
    pub min_read_window_size: usize,
    /// Maximum read window size that the backpressure controller is allowed to scale up to
    pub max_read_window_size: usize,
    /// Factor to increase the read window size by when the part queue is stalled
    pub read_window_size_multiplier: usize,
    /// Request range to apply backpressure
    pub request_range: Range<u64>,
    /// Enable alignment of read window end to part boundary
    pub read_window_alignment_config: ReadWindowAlignmentConfig,
}

/// A [BackpressureController] should be given to consumers of a byte stream.
/// It is used to send feedback ([Self::send_feedback]) to its corresponding [BackpressureLimiter],
/// the counterpart which should be leveraged by the stream producer.
#[derive(Debug)]
pub struct BackpressureController {
    /// Sender for the [BackpressureLimiter] to receive size increments from the controller.
    read_window_updater: Sender<usize>,
    /// Amount by which the producer should be producing data ahead of [Self::next_read_offset].
    preferred_read_window_size: usize,
    /// Lower bound for [Self::preferred_read_window_size] when scaling down.
    min_read_window_size: usize,
    /// Upper bound for [Self::preferred_read_window_size] when scaling up.
    max_read_window_size: usize,
    /// Multiplier by which [Self::preferred_read_window_size] is scaled.
    read_window_size_multiplier: usize,
    /// Upper bound of the current read window, relative to the start of the S3 object.
    ///
    /// The request can return data up to this offset *exclusively*.
    /// This value must be advanced to continue fetching new data.
    read_window_end_offset: u64,
    /// Next offset of the data to be read, relative to the start of the S3 object.
    next_read_offset: u64,
    /// End offset within the S3 object for the request.
    ///
    /// The request can return data up to this offset *exclusively*.
    request_end_offset: u64,
    /// Memory limiter is used to guide decisions on how much data to prefetch.
    ///
    /// For example, when memory is low we should scale down [Self::preferred_read_window_size].
    mem_limiter: Arc<MemoryLimiter>,
    /// Enable alignment of read window end to part boundary
    read_window_alignment_config: ReadWindowAlignmentConfig,
}

/// The [BackpressureLimiter] is used on producer side of a stream, for example,
/// any [super::part_stream::ObjectPartStream] that supports backpressure.
///
/// The producer can call [Self::wait_for_read_window_increment] to wait for feedback from the consumer.
#[derive(Debug)]
pub struct BackpressureLimiter {
    /// Receiving end of the increment channel fed by the [BackpressureController].
    read_window_increment_queue: ReadWindowIncrementQueue,
    /// Upper bound of the current read window.
    /// Calling [BackpressureLimiter::wait_for_read_window_increment()] will block current
    /// thread until this value is advanced.
    read_window_end_offset: u64,
    /// End offset for the request we want to apply backpressure. The request can return
    /// data up to this offset *exclusively*.
    request_end_offset: u64,
}

/// Creates a [BackpressureController] and its related [BackpressureLimiter].
///
/// This pair allows a consumer to send feedback ([BackpressureFeedbackEvent]) when starved or bytes are consumed,
/// informing a producer (a holder of the [BackpressureLimiter]) when it should provide data more aggressively.
pub fn new_backpressure_controller(
    config: BackpressureConfig,
    mem_limiter: Arc<MemoryLimiter>,
) -> (BackpressureController, BackpressureLimiter) {
    // Scaling up and down is a no-op with a multiplier of 1, so clamp to at least 2.
    const MIN_WINDOW_SIZE_MULTIPLIER: usize = 2;

    // Account for the initial window up-front so the memory limiter's view of reserved
    // prefetch memory matches the window we are about to grant.
    mem_limiter.reserve(BufferArea::Prefetch, config.initial_read_window_size as u64);
    let initial_window_end = config.request_range.start + config.initial_read_window_size as u64;

    let (updater, increments) = unbounded();

    let limiter = BackpressureLimiter {
        read_window_increment_queue: ReadWindowIncrementQueue::new(increments),
        read_window_end_offset: initial_window_end,
        request_end_offset: config.request_range.end,
    };

    let controller = BackpressureController {
        read_window_updater: updater,
        preferred_read_window_size: config.initial_read_window_size,
        min_read_window_size: config.min_read_window_size,
        max_read_window_size: config.max_read_window_size,
        read_window_size_multiplier: config.read_window_size_multiplier.max(MIN_WINDOW_SIZE_MULTIPLIER),
        read_window_end_offset: initial_window_end,
        next_read_offset: config.request_range.start,
        request_end_offset: config.request_range.end,
        mem_limiter,
        read_window_alignment_config: config.read_window_alignment_config,
    };

    trace!(?controller, "initialising backpressure controller");

    (controller, limiter)
}

impl BackpressureController {
    /// Upper bound (exclusive) of the current read window, relative to the start of the S3 object.
    pub fn read_window_end_offset(&self) -> u64 {
        self.read_window_end_offset
    }

    /// Send a feedback to the backpressure controller when reading data out of the stream. The backpressure controller
    /// will ensure that the read window size is enough to read this offset and that it is always close to `preferred_read_window_size`.
    pub async fn send_feedback<E>(&mut self, event: BackpressureFeedbackEvent) -> Result<(), PrefetchReadError<E>> {
        match event {
            // Note, that this may come from a backwards seek, so offsets observed by this method are not necessarily ascending
            BackpressureFeedbackEvent::DataRead { offset, length } => {
                self.next_read_offset = offset + length as u64;
                // The consumer has taken `length` bytes out of the stream, so that much
                // prefetch memory is no longer held on its behalf.
                self.mem_limiter.release(BufferArea::Prefetch, length as u64);
                let remaining_window = self.read_window_end_offset.saturating_sub(self.next_read_offset) as usize;

                // Increment the read window only if the remaining window reaches some threshold i.e. half of it left.
                // When memory is low the `preferred_read_window_size` will be scaled down so we have to keep trying
                // until we have enough read window.
                //
                // NOTE: `remaining_window` is a snapshot taken above and does not change inside
                // this loop; only `preferred_read_window_size` changes (via `scale_down`), which
                // is what eventually makes the condition false and terminates the loop.
                while remaining_window < (self.preferred_read_window_size / 2)
                    && self.read_window_end_offset < self.request_end_offset
                {
                    // Ideal window end: a full preferred window ahead of the next read offset.
                    let new_read_window_end_offset_preferred = self
                        .next_read_offset
                        .saturating_add(self.preferred_read_window_size as u64);
                    // Optionally round up to a part boundary for more accurate memory accounting.
                    let new_read_window_end_offset_aligned = self
                        .read_window_alignment_config
                        .align(new_read_window_end_offset_preferred);
                    // NOTE: in the end of the object we still have up to "part_size" of unaccounted memory.
                    // For more accurate memory limiting we could reserve in "full parts", but that would make
                    // release more complicated. So accept this inaccuracy, and reserve only till `request_end_offset`.
                    let new_read_window_end_offset = new_read_window_end_offset_aligned.min(self.request_end_offset);
                    // We can skip if the new `read_window_end_offset` is less than or equal to the current one, this
                    // could happen after the read window is scaled down.
                    if new_read_window_end_offset <= self.read_window_end_offset {
                        break;
                    }
                    let to_increase = new_read_window_end_offset.saturating_sub(self.read_window_end_offset) as usize;

                    // Force incrementing read window regardless of available memory when we are already at minimum
                    // read window size.
                    if self.preferred_read_window_size <= self.min_read_window_size {
                        trace!(new_read_window_end_offset, "sending a read window increment");
                        self.mem_limiter.reserve(BufferArea::Prefetch, to_increase as u64);
                        self.increment_read_window(to_increase).await;
                        break;
                    }

                    // Try to reserve the memory for the length we want to increase before sending the request,
                    // scale down the read window if it fails.
                    if self.mem_limiter.try_reserve(BufferArea::Prefetch, to_increase as u64) {
                        trace!(new_read_window_end_offset, "sending a read window increment");
                        self.increment_read_window(to_increase).await;
                        break;
                    } else {
                        self.scale_down();
                    }
                }
            }
            BackpressureFeedbackEvent::PartQueueStall => self.scale_up(),
        }
        Ok(())
    }

    // Send an increment read window request to the stream producer, then advance our own
    // record of the window end. The local offset is advanced even if the limiter side has
    // been dropped, since it is still used for accounting (e.g. releasing memory on drop).
    async fn increment_read_window(&mut self, len: usize) {
        let prev_window_end_offset = self.read_window_end_offset;
        let next_window_end_offset = prev_window_end_offset + len as u64;
        trace!(
            next_read_offset = self.next_read_offset,
            prev_window_end_offset, next_window_end_offset, len, "incrementing read window",
        );

        // This should not block since the channel is unbounded
        let _ = self
            .read_window_updater
            .send(len)
            .await
            .inspect_err(|_| trace!("read window incrementing queue is already closed"));
        self.read_window_end_offset = next_window_end_offset;
    }

    /// Scale up preferred read window size with a multiplier configured at initialization.
    ///
    /// Fails silently if there is insufficient free memory to perform it according to [Self::mem_limiter].
    fn scale_up(&mut self) {
        if self.preferred_read_window_size < self.max_read_window_size {
            let new_read_window_size = (self.preferred_read_window_size * self.read_window_size_multiplier)
                .max(self.min_read_window_size)
                .min(self.max_read_window_size);
            // Only scale up when there is enough memory. We don't have to reserve the memory here
            // because only `preferred_read_window_size` is increased but the actual read window will
            // be updated later on `DataRead` event (where we do reserve memory).
            let to_increase = (new_read_window_size - self.preferred_read_window_size) as u64;
            let available_mem = self.mem_limiter.available_mem();
            if available_mem >= to_increase {
                let formatter = make_format(humansize::BINARY);
                trace!(
                    prev_size = formatter(self.preferred_read_window_size),
                    new_size = formatter(new_read_window_size),
                    "scaled up preferred read window"
                );
                self.preferred_read_window_size = new_read_window_size;
                metrics::histogram!("prefetch.window_after_increase_mib")
                    .record((self.preferred_read_window_size / 1024 / 1024) as f64);
            }
        }
    }

    /// Scale down [Self::preferred_read_window_size] by a multiplier configured at initialization.
    ///
    /// No memory is released here: only the *preferred* size shrinks; actual reservations are
    /// released as data is read (`DataRead`) or when the controller is dropped.
    fn scale_down(&mut self) {
        if self.preferred_read_window_size > self.min_read_window_size {
            // The constructor clamps the multiplier to >= 2, so division always shrinks the window.
            assert!(self.read_window_size_multiplier > 1);
            let new_read_window_size = (self.preferred_read_window_size / self.read_window_size_multiplier)
                .max(self.min_read_window_size)
                .min(self.max_read_window_size);
            let formatter = make_format(humansize::BINARY);
            trace!(
                current_size = formatter(self.preferred_read_window_size),
                new_size = formatter(new_read_window_size),
                "scaled down read window"
            );
            self.preferred_read_window_size = new_read_window_size;
            metrics::histogram!("prefetch.window_after_decrease_mib")
                .record((self.preferred_read_window_size / 1024 / 1024) as f64);
        }
    }
}

impl Drop for BackpressureController {
    fn drop(&mut self) {
        debug_assert!(
            self.next_read_offset <= self.request_end_offset,
            "invariant: the next read offset should never be larger than the request end offset",
        );
        // Any part of the granted read window that was never consumed is still accounted
        // for in the memory limiter; hand that reservation back now.
        let unconsumed = self.read_window_end_offset.saturating_sub(self.next_read_offset);
        self.mem_limiter.release(BufferArea::Prefetch, unconsumed);
    }
}

impl BackpressureLimiter {
    /// Upper bound (exclusive) of the current read window as seen by the producer.
    pub fn read_window_end_offset(&self) -> u64 {
        self.read_window_end_offset
    }

    /// Wait until the backpressure window moves ahead of the given offset.
    ///
    /// Returns the new read window offset if it has changed, otherwise [None].
    ///
    /// If `offset` is already inside the window this does not block: it opportunistically
    /// drains any queued increments and returns immediately. Otherwise it blocks until
    /// enough increments arrive, or errors if the controller side has been dropped.
    pub async fn wait_for_read_window_increment<E>(
        &mut self,
        offset: u64,
    ) -> Result<Option<u64>, PrefetchReadError<E>> {
        // There is already enough read window so no need to block
        if self.read_window_end_offset > offset {
            if let Some(increment_amount) = self.read_window_increment_queue.try_recv_drain() {
                self.read_window_end_offset += increment_amount as u64;
                return Ok(Some(self.read_window_end_offset));
            } else {
                // No increments were queued: the window is unchanged.
                return Ok(None);
            }
        }

        // Reaching here means there is not enough read window, so we block until it is large enough
        // (or until the window already covers the whole request, in which case no more increments
        // can be expected).
        while self.read_window_end_offset <= offset && self.read_window_end_offset < self.request_end_offset {
            trace!(
                desired_offset = offset,
                current_offset = self.read_window_end_offset,
                "blocking for read window increment",
            );
            match self.read_window_increment_queue.recv_drain().await {
                Ok(len) => self.read_window_end_offset += len as u64,
                // The channel is closed: the controller (consumer side) is gone.
                Err(RecvError) => return Err(PrefetchReadError::ReadWindowIncrement),
            }
        }
        Ok(Some(self.read_window_end_offset))
    }
}

/// Wraps a queue, ensuring that all accesses fully drain the queue of values (increments),
/// so multiple queued increments are always combined into a single total.
#[derive(Debug)]
struct ReadWindowIncrementQueue(Receiver<usize>);

impl ReadWindowIncrementQueue {
    pub fn new(receiver: Receiver<usize>) -> Self {
        Self(receiver)
    }

    /// Drain the increment queue, blocking if required for the first increment.
    ///
    /// Returns [Err] if the underlying [Receiver] was closed.
    pub async fn recv_drain(&self) -> Result<usize, RecvError> {
        let first = self.0.recv().await?;
        Ok(first + self.drain_pending())
    }

    /// Drain the increment queue of any available increments.
    ///
    /// Returns the total amount to increment if any, otherwise [None].
    pub fn try_recv_drain(&self) -> Option<usize> {
        match self.drain_pending() {
            0 => None,
            total => Some(total),
        }
    }

    /// Sum all increments currently sitting in the queue, without blocking.
    fn drain_pending(&self) -> usize {
        let mut total = 0;
        while let Ok(len) = self.0.try_recv() {
            total += len;
        }
        total
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    use std::sync::Arc;

    use futures::executor::block_on;
    use mountpoint_s3_client::mock_client::MockClientError;
    use test_case::test_case;

    use crate::mem_limiter::MemoryLimiter;
    use crate::memory::PagedPool;
    use crate::prefetch::INITIAL_REQUEST_SIZE;

    /// Repeated `scale_up` calls should converge on `max_read_window_size` while
    /// always staying within the configured [min, max] bounds.
    #[test_case(INITIAL_REQUEST_SIZE, 2)] // real config
    #[test_case(3 * 1024 * 1024, 4)]
    #[test_case(8 * 1024 * 1024, 8)]
    #[test_case(2 * 1024 * 1024 * 1024, 2)]
    fn test_read_window_scale_up(initial_read_window_size: usize, read_window_size_multiplier: usize) {
        let request_range = 0..(5 * 1024 * 1024 * 1024);
        let backpressure_config = BackpressureConfig {
            initial_read_window_size,
            min_read_window_size: 8 * 1024 * 1024,
            max_read_window_size: 2 * 1024 * 1024 * 1024,
            read_window_size_multiplier,
            request_range,
            read_window_alignment_config: ReadWindowAlignmentConfig::Disable,
        };

        let (mut backpressure_controller, _backpressure_limiter) =
            new_backpressure_controller_for_test(backpressure_config);
        while backpressure_controller.preferred_read_window_size < backpressure_controller.max_read_window_size {
            backpressure_controller.scale_up();
            assert!(backpressure_controller.preferred_read_window_size >= backpressure_controller.min_read_window_size);
            assert!(backpressure_controller.preferred_read_window_size <= backpressure_controller.max_read_window_size);
        }
        assert_eq!(
            backpressure_controller.preferred_read_window_size, backpressure_controller.max_read_window_size,
            "should have scaled up to max read window size"
        );
    }

    /// Repeated `scale_down` calls should converge on `min_read_window_size` while
    /// always staying within the configured [min, max] bounds.
    #[test_case(2 * 1024 * 1024 * 1024, 2)]
    #[test_case(15 * 1024 * 1024 * 1024, 2)]
    #[test_case(2 * 1024 * 1024 * 1024, 8)]
    #[test_case(8 * 1024 * 1024, 8)]
    fn test_read_window_scale_down(initial_read_window_size: usize, read_window_size_multiplier: usize) {
        let request_range = 0..(5 * 1024 * 1024 * 1024);
        let backpressure_config = BackpressureConfig {
            initial_read_window_size,
            min_read_window_size: 8 * 1024 * 1024,
            max_read_window_size: 2 * 1024 * 1024 * 1024,
            read_window_size_multiplier,
            request_range,
            read_window_alignment_config: ReadWindowAlignmentConfig::Disable,
        };

        let (mut backpressure_controller, _backpressure_limiter) =
            new_backpressure_controller_for_test(backpressure_config);
        while backpressure_controller.preferred_read_window_size > backpressure_controller.min_read_window_size {
            backpressure_controller.scale_down();
            assert!(backpressure_controller.preferred_read_window_size <= backpressure_controller.max_read_window_size);
            assert!(backpressure_controller.preferred_read_window_size >= backpressure_controller.min_read_window_size);
        }
        assert_eq!(
            backpressure_controller.preferred_read_window_size, backpressure_controller.min_read_window_size,
            "should have scaled down to min read window size"
        );
    }

    /// A single `wait_for_read_window_increment` call must drain *all* queued
    /// increments, not just the first one.
    #[test]
    fn wait_for_read_window_increment_drains_all_events() {
        const KIB: usize = 1024;
        const MIB: usize = 1024 * KIB;
        const GIB: usize = 1024 * MIB;

        // Regression test: queue several increments, then verify one wait call reports
        // the fully-advanced window offset.
        #[allow(clippy::identity_op)]
        let backpressure_config = BackpressureConfig {
            initial_read_window_size: 1 * MIB,
            min_read_window_size: 8 * MIB,
            max_read_window_size: 2 * GIB,
            read_window_size_multiplier: 2,
            request_range: 0..(5 * GIB as u64),
            read_window_alignment_config: ReadWindowAlignmentConfig::Disable,
        };

        let (mut backpressure_controller, mut backpressure_limiter) =
            new_backpressure_controller_for_test(backpressure_config);

        block_on(async {
            #[allow(clippy::identity_op)]
            let expected_offset = 1 * MIB as u64;
            assert_eq!(
                backpressure_limiter.read_window_end_offset(),
                expected_offset,
                "read window end offset should already be {expected_offset} due to initial read window size config",
            );

            // Send more than one increment.
            backpressure_controller.increment_read_window(7 * MIB).await;
            backpressure_controller.increment_read_window(8 * MIB).await;
            backpressure_controller.increment_read_window(8 * MIB).await;

            let curr_offset = backpressure_limiter
                .wait_for_read_window_increment::<MockClientError>(0)
                .await
                .expect("should return OK as we have new values to increment before channels are closed")
                .expect("value should change as we sent increments");
            assert_eq!(
                24 * MIB as u64,
                curr_offset,
                "expected offset did not match offset reported by limiter",
            );
        });
    }

    /// Table-driven checks for `ReadWindowAlignmentConfig::align` covering offsets
    /// before, at, and after the alignment origin, plus exact part boundaries.
    #[test_case(500, 1000, 100, 500; "offset before second request start")]
    #[test_case(1000, 1000, 512, 1000; "offset at second request start")]
    #[test_case(1500, 1000, 512, 1512; "offset after second request start, needs alignment")]
    #[test_case(2024, 1000, 512, 2024; "offset after second request start, already aligned")]
    #[test_case(1001, 1000, 512, 1512; "offset just after second request start, needs alignment")]
    #[test_case(1512, 1000, 512, 1512; "offset exactly at part boundary")]
    #[test_case(1513, 1000, 512, 2024; "offset just past part boundary")]
    fn test_read_window_alignment(offset: u64, from_offset: u64, part_size: u64, expected: u64) {
        let result = ReadWindowAlignmentConfig::AlignToPartSize { from_offset, part_size }.align(offset);
        assert_eq!(result, expected);
    }

    /// Build a controller/limiter pair backed by a [MemoryLimiter] sized to the
    /// configured maximum read window.
    fn new_backpressure_controller_for_test(
        backpressure_config: BackpressureConfig,
    ) -> (BackpressureController, BackpressureLimiter) {
        let pool = PagedPool::new_with_candidate_sizes([8 * 1024 * 1024]);
        let mem_limiter = Arc::new(MemoryLimiter::new(
            pool,
            backpressure_config.max_read_window_size as u64,
        ));
        new_backpressure_controller(backpressure_config, mem_limiter.clone())
    }
}