// s3-algo 0.7.4 - Docs.rs

use super::*;
use aws_sdk_s3::operation::list_objects_v2::ListObjectsV2Output;
use aws_sdk_s3::primitives::ByteStream;
use aws_sdk_s3::types::{Delete, Object, ObjectIdentifier, ObjectStorageClass};
use aws_smithy_types_convert::stream::PaginationStreamExt;
use futures::future::ok;
use futures::stream::Stream;
use std::future::Future;
use std::pin::Pin;
use std::task::{Context, Poll};
use tokio::io;

/// A stream that can list objects, and (using member functions) download or delete the listed
/// files.
///
/// Besides the stream of list pages, it carries the client, configuration and bucket name that
/// the member functions need to issue their follow-up requests.
pub struct ListObjects<S> {
    /// Client used for the follow-up requests (`GetObject`, `DeleteObjects`).
    s3: Client,
    /// Settings read by the member functions, e.g. `download_parallelization`.
    config: Config,
    /// Name of the bucket the objects were listed from.
    bucket: String,
    /// Common prefix (as requested) of the listed objects. Empty string if all objects were
    /// requested.
    prefix: String,
    /// Underlying stream of `ListObjectsV2Output` pages.
    stream: S,
}
impl<S> ListObjects<S>
where
    S: Stream<Item = Result<ListObjectsV2Output, Error>> + Sized + Send + 'static,
{
    /// Erases the concrete stream type by boxing it, so that the resulting `ListObjects` has a
    /// type that can be written down. All other fields are carried over unchanged.
    pub fn boxed(
        self,
    ) -> ListObjects<Pin<Box<dyn Stream<Item = Result<ListObjectsV2Output, Error>> + Send>>> {
        let ListObjects {
            s3,
            config,
            bucket,
            prefix,
            stream,
        } = self;
        ListObjects {
            s3,
            config,
            bucket,
            prefix,
            stream: stream.boxed(),
        }
    }

    /// Calls an async closure on all the individual objects of the list operation
    pub async fn process<P, F>(self, f: P) -> Result<(), Error>
    where
        P: Fn(Object) -> F + Clone,
        F: Future<Output = ()>,
    {
        let ListObjects {
            stream, prefix: _, ..
        } = self;
        stream
            .try_filter_map(|response| ok(response.contents))
            .map_ok(|x| stream::iter(x).map(Ok))
            .try_flatten()
            .try_for_each_concurrent(None, move |object| {
                let f = f.clone();
                async move {
                    f(object).await;
                    Ok(())
                }
            })
            .await
    }
    /// Download all listed objects - returns a stream of `(key, body, content length)` items.
    /// Used as a basis for other `download_all_*` functions.
    /// Note: This function filters out archived objects (GLACIER, DEEP_ARCHIVE) that cannot be
    /// downloaded directly via GetObject and must first be restored via RestoreObject.
    pub fn download_all_stream(
        self,
    ) -> impl Stream<Item = Result<(String, ByteStream, Option<i64>), Error>> {
        // No hook is installed outside of tests; the function pointer type only serves to give
        // the `None` a concrete type.
        type NoHook = fn() -> Pin<Box<dyn Future<Output = ()> + Send>>;
        let no_hook: Option<NoHook> = None;
        self.download_all_stream_internal(no_hook)
    }

    /// Internal implementation of `download_all_stream` with an optional hook for testing.
    ///
    /// `pre_download_hook`: if given, it is called and awaited once per object, right before the
    /// `GetObject` request for that object is sent.
    ///
    /// Objects in the GLACIER or DEEP_ARCHIVE storage class and objects without a key are
    /// skipped. Up to `config.download_parallelization` requests (but always at least one) are
    /// in flight at the same time, and the results are yielded in completion order, which may
    /// differ from the listing order. Each item is `(key, body, content length)`.
    fn download_all_stream_internal<F, Fut>(
        self,
        pre_download_hook: Option<F>,
    ) -> impl Stream<Item = Result<(String, ByteStream, Option<i64>), Error>>
    where
        F: Fn() -> Fut + Send + Clone + 'static,
        Fut: Future<Output = ()> + Send + 'static,
    {
        let ListObjects {
            s3,
            config,
            bucket,
            stream,
            prefix: _,
        } = self;
        // A buffer size of zero would never start any download and leave the returned stream
        // pending forever, so always allow at least one download at a time.
        let download_parallelization = config.download_parallelization.max(1);

        stream
            .try_filter_map(|response| ok(response.contents))
            .map_ok(|x| stream::iter(x).map(Ok))
            .try_flatten()
            .try_filter_map(|obj| {
                // Filter out archived objects that cannot be downloaded directly
                let is_archived = matches!(
                    obj.storage_class(),
                    Some(ObjectStorageClass::Glacier | ObjectStorageClass::DeepArchive)
                );
                // Objects without a key cannot be requested either
                let key = if is_archived { None } else { obj.key };
                ok(key)
            })
            .map_ok(move |key| {
                let (s3, bucket) = (s3.clone(), bucket.clone());
                let hook = pre_download_hook.clone();
                async move {
                    // Call hook before download (for testing)
                    if let Some(ref h) = hook {
                        h().await;
                    }

                    let output = s3
                        .get_object()
                        .bucket(bucket.clone())
                        .key(key.clone())
                        .send()
                        .await
                        .context(err::GetObject {
                            key: key.clone(),
                            bucket,
                        })?;
                    Ok((key, output.body, output.content_length))
                }
                .boxed()
            })
            .try_buffer_unordered(download_parallelization)
    }

    /// Download all listed objects and read each body completely into memory - returns a stream
    /// of `(key, contents)` items.
    ///
    /// Built on `download_all_stream`, so the same objects are skipped and the same ordering
    /// applies.
    pub fn download_all_to_vec(self) -> impl Stream<Item = Result<(String, Vec<u8>), Error>> {
        self.download_all_stream()
            .and_then(|(key, body, _content_length)| async move {
                let mut reader = body.into_async_read();
                let mut buffer = Vec::new();
                io::copy(&mut reader, &mut buffer)
                    .await
                    .context(err::TokioIo)?;
                Ok((key, buffer))
            })
    }

    /*
    /// Download all listed objects to file system.
    /// UNIMPLEMENTED.
    pub fn download_all(self) -> impl Future<Output = Result<(), Error>> {
        // TODO use download_all_stream
        ok(unimplemented!())
    }
    */

    /// Delete all listed objects.
    ///
    /// With the two arguments, you can implement a detailed real-time progress report of both how
    /// many files have been listed, and how many files have been deleted.
    ///
    /// `list_progress`: Closure that is given number of files listed as argument. Is called
    /// several times, once for each batch (page) of files listed.
    /// `delete_progress`: Closure that is given RequestReport of a delete request. The `size`
    /// field refers to the number of objects in that delete request.
    ///
    /// One `DeleteObjects` request is sent per listed page, and the pages are processed
    /// concurrently without limit. A page without any object (for example because the listing is
    /// empty or a filter removed everything) is still reported to `list_progress` with `0`, but
    /// no delete request is sent for it and `delete_progress` is not called.
    pub fn delete_all<P1, P2, F1, F2>(
        self,
        list_progress: P1,
        delete_progress: P2,
    ) -> impl Future<Output = Result<(), Error>>
    where
        P1: Fn(usize) -> F1 + Clone + Send + Sync + 'static,
        P2: Fn(RequestReport) -> F2 + Clone + Send + Sync + 'static,
        F1: Future<Output = ()> + Send + 'static,
        F2: Future<Output = ()> + Send + 'static,
    {
        // For each ListObjectsV2Output, send a request to delete all the listed objects
        let ListObjects {
            s3,
            config,
            bucket,
            stream,
            prefix: _,
        } = self;
        let timeout = Arc::new(Mutex::new(TimeoutState::new(
            config.algorithm.clone(),
            config.delete_requests.clone(),
        )));
        let n_retries = config.algorithm.n_retries;
        stream.try_for_each_concurrent(None, move |object| {
            let (s3, bucket, timeout, delete_progress2, list_progress2) = (
                s3.clone(),
                bucket.clone(),
                timeout.clone(),
                delete_progress.clone(),
                list_progress.clone(),
            );
            // Objects without a key cannot be addressed and are left out of the request.
            let objects = object
                .contents
                .unwrap_or_default() // unwrap or empty Vec
                .into_iter()
                .filter_map(|obj| {
                    obj.key.map(|key| {
                        ObjectIdentifier::builder()
                            .set_key(Some(key))
                            .set_version_id(None)
                            .build()
                            .unwrap() // unwrap: shouldn't fail building as the key comes directly
                                      // from S3
                    })
                })
                .collect::<Vec<_>>();
            let n_objects = objects.len();

            async move {
                list_progress2(n_objects).await;
                // An empty `DeleteObjects` request is rejected by S3, so only send a request
                // when there is something to delete.
                if n_objects > 0 {
                    let (report, _) = s3_request(
                        move || {
                            // Fresh clones for every attempt, as the request may be retried
                            let (s3, bucket, objects) =
                                (s3.clone(), bucket.clone(), objects.clone());
                            async move {
                                Ok((
                                    async move {
                                        s3.delete_objects()
                                            .set_bucket(Some(bucket))
                                            .set_delete(Some(
                                                Delete::builder()
                                                    .set_objects(Some(objects))
                                                    .build()
                                                    .unwrap(), // unwrap: shouldn't fail building
                                                               // because all the input comes directly from S3
                                            ))
                                            .send()
                                            .await
                                            .map_err(|e| e.into())
                                    },
                                    n_objects,
                                ))
                            }
                        },
                        |_, size| size,
                        n_retries,
                        timeout.clone(),
                    )
                    .await?;
                    timeout.lock().await.update(&report);
                    delete_progress2(report).await;
                }
                Ok(())
            }
        })
    }

    /// Flatten the stream of list pages into a stream of the individual `Object`s.
    ///
    /// Pages without contents contribute no items; errors of the list stream are passed on.
    pub fn flatten(self) -> impl Stream<Item = Result<Object, Error>> {
        let pages = self.stream;
        pages
            .map_ok(|page| stream::iter(page.contents.unwrap_or_default()).map(Ok))
            .try_flatten()
    }

    /// Filter objects by key name before performing operations.
    /// This is useful to exclude specific files (e.g., "archive.zip") before downloading.
    pub fn filter_keys<F>(self, predicate: F) -> ListObjects<impl Stream<Item = Result<ListObjectsV2Output, Error>> + Sized + Send>
    where
        F: Fn(&str) -> bool + Clone + Send + 'static,
    {
        let filtered_stream = self.stream.map_ok(move |mut response| {
            if let Some(contents) = response.contents.as_mut() {
                contents.retain(|obj| {
                    obj.key().map(|k| predicate(k)).unwrap_or(false)
                });
            }
            response
        });

        ListObjects {
            s3: self.s3,
            config: self.config,
            bucket: self.bucket,
            prefix: self.prefix,
            stream: filtered_stream,
        }
    }

    /*
    /// This function exists to provide a stream to copy all objects, for both `copy_all` and
    /// `move_all`. The `String` that is the stream's `Item` is the _source key_. An `Ok` value
    /// thus signals (relevant when used in `move_all`) that a certain key is ready for deletion.
    fn copy_all_stream<F, R>(
        self,
        dest_bucket: Option<String>,
        mapping: F,
        default_request: R,
    ) -> impl Stream<Item = Result<String, Error>>
    where
        F: Fn(&str) -> String + Clone + Send + Sync + Unpin + 'static,
        R: Fn() -> CopyObjectRequest + Clone + Unpin + Sync + Send + 'static,
    {
        let ListObjects {
            s3,
            config,
            bucket,
            stream,
            prefix: _,
        } = self;
        let timeout = Arc::new(Mutex::new(TimeoutState::new(
            config.algorithm.clone(),
            config.put_requests.clone(),
        )));
        let n_retries = config.algorithm.n_retries;
        let dest_bucket = dest_bucket.unwrap_or_else(|| bucket.clone());
        stream
            .try_filter_map(|response| ok(response.1.contents))
            .map_ok(|x| stream::iter(x).map(Ok))
            .try_flatten()
            .try_filter_map(|obj| {
                // Just filter out any object that does not have both of `key` and `size`
                let Object { key, size, .. } = obj;
                ok(key.and_then(|key| size.map(|size| (key, size))))
            })
            .and_then(move |(key, size)| {
                let (s3, timeout) = (s3.clone(), timeout.clone());
                let request = CopyObjectRequest {
                    copy_source: format!("{}/{}", bucket, key),
                    bucket: dest_bucket.clone(),
                    key: mapping(&key),
                    ..default_request()
                };
                // println!("COPY REQUEST\n{:#?}", request);
                s3_request(
                    move || {
                        let (s3, request) = (s3.clone(), request.clone());
                        async move {
                            let (s3, request) = (s3.clone(), request.clone());
                            Ok((async move{s3.copy_object(request).context(err::CopyObject).await}, size as usize))
                        }
                    },
                    |_, size| size,
                    n_retries,
                    timeout,
                )
                .map_ok(|_| key)
            })
    }

    /// Copy all listed objects, to a different S3 location as defined in `mapping` and
    /// `dest_bucket`.
    /// If `other_bucket` is not provided, copy to same bucket
    pub fn copy_all<F, R>(
        self,
        dest_bucket: Option<String>,
        mapping: F,
        default_request: R,
    ) -> impl Future<Output = Result<(), Error>>
    where
        F: Fn(&str) -> String + Clone + Send + Sync + Unpin + 'static,
        R: Fn() -> CopyObjectRequest + Clone + Unpin + Sync + Send + 'static,
    {
        self.copy_all_stream(dest_bucket, mapping, default_request)
            .try_for_each(|_| async { Ok(()) })
    }
    // TODO: Is it possible to change copy_all so that we can move_all by just chaining copy_all
    // and delete_all? Then copy_all would need to return a stream of old keys, but does that make
    // sense in general?
    // For now, this is code duplication.
    pub fn move_all<F, R>(
        self,
        dest_bucket: Option<String>,
        mapping: F,
        default_request: R,
    ) -> impl Future<Output = Result<(), Error>>
    where
        F: Fn(&str) -> String + Clone + Send + Sync + Unpin + 'static,
        R: Fn() -> CopyObjectRequest + Clone + Unpin + Sync + Send + 'static,
    {
        let src_bucket = self.bucket.clone();
        let timeout = Arc::new(Mutex::new(TimeoutState::new(
            self.config.algorithm.clone(),
            self.config.delete_requests.clone(),
        )));
        let n_retries = self.config.algorithm.n_retries;
        let s3 = self.s3.clone();
        self.copy_all_stream(dest_bucket, mapping, default_request)
            .and_then(move |src_key| {
                let delete_request = DeleteObjectRequest {
                    bucket: src_bucket.clone(),
                    key: src_key,
                    ..Default::default()
                };
                let (s3, timeout) = (s3.clone(), timeout.clone());
                s3_request(
                    move || {
                        let (s3, delete_request) = (s3.clone(), delete_request.clone());
                        async move {
                            let (s3, delete_request) = (s3.clone(), delete_request.clone());
                            Ok((
                                async move {
                                    s3.delete_object(delete_request)
                                        .context(err::DeleteObject)
                                        .await
                                },
                                1,
                            ))
                        }
                    },
                    |_, _| 1,
                    n_retries,
                    timeout,
                )
                .map_ok(drop)
                .boxed()
            })
            .try_for_each(|_| async { Ok(()) })
            .boxed()
    }
    /// Move all listed objects by substituting their common prefix with `new_prefix`.
    pub fn move_to_prefix<R>(
        self,
        dest_bucket: Option<String>,
        new_prefix: String,
        default_request: R,
    ) -> impl Future<Output = Result<(), Error>>
    where
        R: Fn() -> CopyObjectRequest + Clone + Unpin + Sync + Send + 'static,
    {
        let old_prefix = self.prefix.clone();
        let substitute_prefix =
            move |source: &str| format!("{}{}", new_prefix, source.trim_start_matches(&old_prefix));
        self.move_all(dest_bucket, substitute_prefix, default_request)
            .boxed()
    }
    */
}

/// `ListObjects` is itself a stream of list pages: polling is forwarded to the wrapped stream.
impl<S> Stream for ListObjects<S>
where
    S: Stream<Item = Result<ListObjectsV2Output, Error>> + Sized + Send + Unpin,
{
    type Item = Result<ListObjectsV2Output, Error>;

    fn poll_next(self: Pin<&mut Self>, cx: &mut Context) -> Poll<Option<Self::Item>> {
        // The inner stream is `Unpin` (see the bound above), so it can be re-pinned on the spot.
        let this = self.get_mut();
        Pin::new(&mut this.stream).poll_next(cx)
    }
}

impl S3Algo {
    /// List objects of a bucket.
    ///
    /// `bucket`: Name of the bucket to list.
    /// `prefix`: If given, only objects whose key starts with it are listed; `None` lists all
    /// objects of the bucket.
    ///
    /// The returned `ListObjects` is a stream with one `ListObjectsV2Output` per page of the
    /// paginated list request. It remembers the requested prefix (empty string for `None`).
    pub fn list_prefix(
        &self,
        bucket: String,
        prefix: Option<String>,
    ) -> ListObjects<impl Stream<Item = Result<ListObjectsV2Output, Error>> + Sized + Send> {
        // TODO: Reintroduce retry and timeout

        let stream = self
            .s3
            .list_objects_v2()
            .bucket(bucket.clone())
            .set_prefix(prefix.clone())
            .into_paginator()
            .send();
        let stream = PaginationStreamExt::into_stream_03x(stream)
            // Wrap errors of the list request in this crate's error type
            .map_err(|source| Error::ListObjectsV2 { source });

        ListObjects {
            s3: self.s3.clone(),
            config: self.config.clone(),
            stream,
            bucket,
            // Keep the prefix as requested; an empty string stands for "all objects"
            prefix: prefix.unwrap_or_default(),
        }
    }
}

#[cfg(test)]
mod test {
    use super::*;
    use crate::test::rand_string;
    use std::sync::atomic::{AtomicUsize, Ordering};
    /// Uploads `N_FILES` objects under a random directory, checks that listing returns exactly
    /// those keys, deletes them with `delete_all` and checks that both progress callbacks
    /// accounted for every object and that nothing is left afterwards.
    #[tokio::test]
    async fn test_s3_delete_files_progress() {
        // Minio does paging at 10'000 files, so we need more than that.
        // It means this test will take a minute or two.
        let algo = S3Algo::new(testing_sdk_client().await);
        let dir = rand_string(14);
        let dir2 = dir.clone();
        const N_FILES: usize = 11_000;
        let files = (0..N_FILES).map(move |i| ObjectSource::Data {
            data: vec![1, 2, 3],
            key: format!("{}/{}.file", dir2, i),
        });
        algo.upload_files(
            "test-bucket".into(),
            files,
            |result| async move {
                if result.seq % 100 == 0 {
                    println!("{} files uploaded", result.seq);
                }
            },
            |client| client.put_object(),
        )
        .await
        .unwrap();

        // Counters that are updated by the progress callbacks of `delete_all`
        let listed_files = Arc::new(AtomicUsize::new(0));
        let deleted_files = Arc::new(AtomicUsize::new(0));
        let listed_files2 = listed_files.clone();
        let deleted_files2 = deleted_files.clone();

        // Do one listing only to check the exact file names
        let present = Arc::new(Mutex::new(std::collections::HashSet::new()));
        algo.list_prefix("test-bucket".into(), Some(dir.clone()))
            .process(|object| async {
                let name = object.key.unwrap_or_else(|| "NONE".to_string());
                println!("OBJ {}", name);
                present.lock().await.insert(name);
            })
            .await
            .unwrap();
        let mut present = present.lock().await;

        // All files are present
        for i in 0..N_FILES {
            let file_name = &format!("{}/{}.file", dir, i);
            assert!(present.remove(file_name));
        }

        // No unexpected file names.
        // Because once, it listed 11_200 files instead of 11_000
        if !present.is_empty() {
            println!("Left-over object names: {:?}", present);
            panic!("Not empty ({} files)", present.len());
        }

        // Assert that number of files is N_FILES
        let count = algo
            .list_prefix("test-bucket".into(), Some(dir.clone()))
            .flatten()
            .try_fold(0usize, |acc, _| ok(acc + 1))
            .await
            .unwrap();
        assert_eq!(count, N_FILES);

        // Delete all
        algo.list_prefix("test-bucket".into(), Some(dir.clone()))
            .delete_all(
                move |n| {
                    println!("Listed {} items", n);
                    let listed_files = listed_files2.clone();
                    async move {
                        listed_files.fetch_add(n, Ordering::Relaxed);
                    }
                },
                move |del_rep| {
                    let n = del_rep.size as usize;
                    println!("Deleted {} items", n);
                    let deleted_files = deleted_files2.clone();
                    async move {
                        deleted_files.fetch_add(n, Ordering::Relaxed);
                    }
                },
            )
            .await
            .unwrap();

        // Assert number of objects listed and deleted
        assert_eq!(listed_files.load(Ordering::Relaxed), N_FILES);
        assert_eq!(deleted_files.load(Ordering::Relaxed), N_FILES);

        // Assert that number of files is 0
        let count = algo
            .list_prefix("test-bucket".into(), Some(dir))
            .flatten()
            .try_fold(0usize, |acc, _| ok(acc + 1))
            .await
            .unwrap();

        assert_eq!(count, 0);
    }

    /// Uploads `N_FILES` objects with identical contents, downloads them through
    /// `filter_keys` + `download_all_stream` and checks key, contents and reported size of each
    /// downloaded object.
    #[tokio::test]
    async fn test_s3_download_files() {
        let algo = S3Algo::new(testing_sdk_client().await);
        let dir = rand_string(14);
        let dir2 = dir.clone();

        // Test with a reasonable number of files to verify concurrency
        const N_FILES: usize = 50;
        const FILE_DATA: &[u8] = b"test data content for download";

        // Upload test files
        let files = (0..N_FILES).map(move |i| ObjectSource::Data {
            data: FILE_DATA.to_vec(),
            key: format!("{}/{}.file", dir2, i),
        });
        algo.upload_files(
            "test-bucket".into(),
            files,
            |_result| async move {},
            |client| client.put_object(),
        )
        .await
        .unwrap();

        // Download all files
        let downloaded_files = Arc::new(Mutex::new(std::collections::HashMap::new()));
        let downloaded_files2 = downloaded_files.clone();

        // None of the uploaded keys ends with "archive.zip", so the filter is expected to let
        // every file through.
        algo.list_prefix("test-bucket".into(), Some(dir.clone()))
            .filter_keys(|key| !key.ends_with("archive.zip"))
            .download_all_stream()
            .try_for_each(|(key, body, size)| {
                let downloaded_files = downloaded_files2.clone();
                async move {
                    let mut contents = vec![];
                    tokio::io::copy(&mut body.into_async_read(), &mut contents)
                        .await
                        .unwrap();
                    downloaded_files.lock().await.insert(key, (contents, size));
                    Ok(())
                }
            })
            .await
            .unwrap();

        let downloaded = downloaded_files.lock().await;

        // Verify all files were downloaded
        assert_eq!(downloaded.len(), N_FILES);

        // Verify each file's content and size
        for i in 0..N_FILES {
            let key = format!("{}/{}.file", dir, i);
            let (contents, size) = downloaded.get(&key)
                .unwrap_or_else(|| panic!("File {} not found in downloaded files", key));
            assert_eq!(contents.as_slice(), FILE_DATA);
            assert_eq!(*size, Some(FILE_DATA.len() as i64));
        }

        // Cleanup: delete test files
        algo.list_prefix("test-bucket".into(), Some(dir))
            .delete_all(|_| async {}, |_| async {})
            .await
            .unwrap();
    }

    /// Checks that `download_all_stream_internal` honours `download_parallelization`, using a
    /// pre-download hook that counts how many hooks are running at the same time and sleeps for
    /// `DELAY_PER_FILE` to simulate a slow request.
    #[tokio::test]
    async fn test_download_parallelism() {
        use std::sync::atomic::{AtomicUsize, Ordering};
        use std::time::Duration;

        let algo = S3Algo::new(testing_sdk_client().await);
        let dir = rand_string(14);
        let dir2 = dir.clone();

        // Setup: 50 files, parallelism of 20, 100ms simulated delay each
        // Expected: ceil(50/20) = 3 batches
        // With real time: should take roughly 300ms + overhead
        const N_FILES: usize = 50;
        const PARALLELISM: usize = 20;
        const DELAY_PER_FILE: Duration = Duration::from_millis(100);
        const FILE_DATA: &[u8] = b"test data";

        // Upload test files (with real time)
        let files = (0..N_FILES).map(move |i| ObjectSource::Data {
            data: FILE_DATA.to_vec(),
            key: format!("{}/{}.file", dir2, i),
        });
        algo.upload_files(
            "test-bucket".into(),
            files,
            |_result| async move {},
            |client| client.put_object(),
        )
        .await
        .unwrap();

        // Track concurrency
        let current_concurrent = Arc::new(AtomicUsize::new(0));
        let max_concurrent = Arc::new(AtomicUsize::new(0));
        let total_downloads = Arc::new(AtomicUsize::new(0));
        let sum_concurrent = Arc::new(AtomicUsize::new(0));

        let current_concurrent2 = current_concurrent.clone();
        let max_concurrent2 = max_concurrent.clone();
        let total_downloads2 = total_downloads.clone();
        let sum_concurrent2 = sum_concurrent.clone();

        let start = tokio::time::Instant::now();

        // Download with instrumentation
        let mut config = Config::default();
        config.download_parallelization = PARALLELISM;
        let algo_custom = S3Algo::with_config(testing_sdk_client().await, config);

        // Create hook that tracks concurrency and adds delay
        let hook = move || {
            let current = current_concurrent2.clone();
            let max = max_concurrent2.clone();
            let total = total_downloads2.clone();
            let sum = sum_concurrent2.clone();

            async move {
                // Increment current concurrent downloads
                let concurrent = current.fetch_add(1, Ordering::SeqCst) + 1;

                // Update max concurrent if needed
                max.fetch_max(concurrent, Ordering::SeqCst);

                // Track for mean calculation
                total.fetch_add(1, Ordering::SeqCst);
                sum.fetch_add(concurrent, Ordering::SeqCst);

                // Simulate network delay
                tokio::time::sleep(DELAY_PER_FILE).await;

                // The decrement happens as soon as the simulated delay is over, i.e. before the
                // actual download request is sent: the counters measure how many hooks overlap,
                // not how many transfers do.
                current.fetch_sub(1, Ordering::SeqCst);
            }
        };

        algo_custom
            .list_prefix("test-bucket".into(), Some(dir.clone()))
            .download_all_stream_internal(Some(hook))
            .try_for_each(|(_key, body, _size)| {
                async move {
                    // Read the body
                    let mut contents = vec![];
                    tokio::io::copy(&mut body.into_async_read(), &mut contents)
                        .await
                        .unwrap();
                    Ok(())
                }
            })
            .await
            .unwrap();

        let elapsed = start.elapsed();

        let max_concurrent_observed = max_concurrent.load(Ordering::SeqCst);
        let total = total_downloads.load(Ordering::SeqCst);
        let sum = sum_concurrent.load(Ordering::SeqCst);
        let mean_concurrent = sum as f64 / total as f64;

        // Cleanup (done before the assertions so that a failed assertion leaves no files behind)
        algo.list_prefix("test-bucket".into(), Some(dir))
            .delete_all(|_| async {}, |_| async {})
            .await
            .unwrap();

        // Assertions
        println!("Elapsed: {:?}", elapsed);
        println!("Max concurrent: {}", max_concurrent_observed);
        println!("Mean concurrent: {:.2}", mean_concurrent);
        println!("Total downloads: {}", total);

        // Verify we downloaded all files
        assert_eq!(total, N_FILES, "Should download all {} files", N_FILES);

        // Verify parallelism
        assert_eq!(max_concurrent_observed, PARALLELISM,
                   "Max concurrent downloads should equal parallelism setting of {}", PARALLELISM);

        // Mean concurrency should be reasonably high (at least 50% of max)
        // Lower bound accounts for startup/rampup effects
        assert!(mean_concurrent >= PARALLELISM as f64 * 0.5,
                "Mean concurrent ({:.2}) should be at least 50% of max parallelism ({})",
                mean_concurrent, PARALLELISM);

        // Time check: with 2-3 batches of DELAY_PER_FILE each, should be roughly that much
        // Allow generous bounds due to S3 operation overhead
        // The lower bound uses the rounded-down number of batches (integer division).
        let min_expected = DELAY_PER_FILE * ((N_FILES / PARALLELISM) as u32);
        let max_expected = min_expected * 10; // Allow 10x for S3 overhead + delays

        assert!(elapsed >= min_expected,
                "Elapsed time ({:?}) should be at least {} batches * {:?} = {:?}",
                elapsed, N_FILES / PARALLELISM, DELAY_PER_FILE, min_expected);
        assert!(elapsed <= max_expected,
                "Elapsed time ({:?}) should not exceed {:?} (10x theoretical minimum for test environment)",
                elapsed, max_expected);
    }
}