Skip to main content

oxicuda_memory/
bandwidth_profiler.rs

1//! Memory bandwidth profiling and throughput measurement.
2//!
3//! This module provides tools for measuring and analyzing transfer bandwidth
4//! between host and device memory. It supports profiling of host-to-device,
5//! device-to-host, device-to-device, and host-to-host transfers.
6//!
7//! # Overview
8//!
9//! The profiling workflow consists of:
10//!
11//! 1. **Recording** individual transfer measurements via [`BandwidthProfiler::record`].
12//! 2. **Summarizing** collected data with [`BandwidthProfiler::summary`] or
13//!    [`BandwidthProfiler::summary_by_direction`].
14//! 3. **Estimating** transfer times and utilization with the standalone functions
15//!    [`estimate_transfer_time`], [`theoretical_peak_bandwidth`], and
16//!    [`bandwidth_utilization`].
17//!
18//! # Example
19//!
20//! ```rust
21//! use oxicuda_memory::bandwidth_profiler::*;
22//!
23//! let mut profiler = BandwidthProfiler::new();
24//!
25//! // Record some measurements
26//! let m = BandwidthMeasurement::new(
27//!     TransferDirection::HostToDevice,
28//!     1_048_576, // 1 MB
29//!     0.5,       // 0.5 ms
30//! );
31//! profiler.record(m);
32//!
33//! let summary = profiler.summary();
34//! println!("{summary}");
35//! ```
36
37use std::fmt;
38use std::time::Instant;
39
40// ---------------------------------------------------------------------------
41// TransferDirection
42// ---------------------------------------------------------------------------
43
/// Identifies the source and destination memory spaces of a transfer.
///
/// The variant names read as `<Source>To<Destination>`, where "host" is CPU
/// memory and "device" is GPU memory.
#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq)]
pub enum TransferDirection {
    /// Copy from host (CPU) memory into device (GPU) memory.
    HostToDevice,
    /// Copy from device (GPU) memory back into host (CPU) memory.
    DeviceToHost,
    /// Copy between two device (GPU) memory regions.
    DeviceToDevice,
    /// Copy between two host (CPU) memory regions.
    HostToHost,
}
56
57impl fmt::Display for TransferDirection {
58    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
59        match self {
60            Self::HostToDevice => write!(f, "Host -> Device"),
61            Self::DeviceToHost => write!(f, "Device -> Host"),
62            Self::DeviceToDevice => write!(f, "Device -> Device"),
63            Self::HostToHost => write!(f, "Host -> Host"),
64        }
65    }
66}
67
68// ---------------------------------------------------------------------------
69// BandwidthMeasurement
70// ---------------------------------------------------------------------------
71
72/// A single transfer bandwidth measurement.
73///
74/// Each measurement captures the direction, size, elapsed time, computed
75/// throughput, and a wall-clock timestamp for when it was recorded.
76#[derive(Debug, Clone)]
77pub struct BandwidthMeasurement {
78    /// Direction of the transfer.
79    pub direction: TransferDirection,
80    /// Number of bytes transferred.
81    pub bytes: usize,
82    /// Elapsed time for the transfer in milliseconds.
83    pub elapsed_ms: f64,
84    /// Computed bandwidth in GB/s (10^9 bytes/s).
85    pub bandwidth_gbps: f64,
86    /// Wall-clock timestamp when this measurement was created.
87    pub timestamp: Instant,
88}
89
90impl BandwidthMeasurement {
91    /// Creates a new measurement from raw transfer parameters.
92    ///
93    /// The `bandwidth_gbps` field is automatically computed from `bytes` and
94    /// `elapsed_ms`. If `elapsed_ms` is zero or negative, bandwidth is set
95    /// to zero to avoid division-by-zero or negative values.
96    pub fn new(direction: TransferDirection, bytes: usize, elapsed_ms: f64) -> Self {
97        let bandwidth_gbps = if elapsed_ms > 0.0 {
98            // bytes / (ms * 1e-3) = bytes * 1000 / ms  => bytes/s
99            // then divide by 1e9 to get GB/s
100            (bytes as f64) / (elapsed_ms * 1e-3) / 1e9
101        } else {
102            0.0
103        };
104
105        Self {
106            direction,
107            bytes,
108            elapsed_ms,
109            bandwidth_gbps,
110            timestamp: Instant::now(),
111        }
112    }
113
114    /// Returns the bandwidth in MB/s (10^6 bytes/s).
115    #[inline]
116    pub fn bandwidth_mbps(&self) -> f64 {
117        self.bandwidth_gbps * 1000.0
118    }
119
120    /// Returns the transfer latency in microseconds.
121    #[inline]
122    pub fn latency_us(&self) -> f64 {
123        self.elapsed_ms * 1000.0
124    }
125}
126
127impl fmt::Display for BandwidthMeasurement {
128    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
129        write!(
130            f,
131            "{}: {} bytes in {:.3} ms ({:.2} GB/s)",
132            self.direction, self.bytes, self.elapsed_ms, self.bandwidth_gbps
133        )
134    }
135}
136
137// ---------------------------------------------------------------------------
138// DirectionSummary
139// ---------------------------------------------------------------------------
140
141/// Aggregated statistics for transfers in a single direction.
142#[derive(Debug, Clone)]
143pub struct DirectionSummary {
144    /// The transfer direction these statistics cover.
145    pub direction: TransferDirection,
146    /// Number of transfers recorded for this direction.
147    pub transfer_count: usize,
148    /// Total bytes transferred across all measurements.
149    pub total_bytes: usize,
150    /// Average bandwidth in GB/s across all measurements.
151    pub avg_bandwidth_gbps: f64,
152    /// Minimum observed bandwidth in GB/s.
153    pub min_bandwidth_gbps: f64,
154    /// Maximum observed bandwidth in GB/s.
155    pub max_bandwidth_gbps: f64,
156    /// Estimated fixed latency overhead in microseconds.
157    ///
158    /// Derived from the smallest transfer: this approximates the per-transfer
159    /// setup cost independent of data size.
160    pub latency_overhead_us: f64,
161}
162
163impl fmt::Display for DirectionSummary {
164    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
165        write!(
166            f,
167            "{}: {} transfers, {} bytes total, avg {:.2} GB/s (min {:.2}, max {:.2}), \
168             overhead ~{:.1} us",
169            self.direction,
170            self.transfer_count,
171            self.total_bytes,
172            self.avg_bandwidth_gbps,
173            self.min_bandwidth_gbps,
174            self.max_bandwidth_gbps,
175            self.latency_overhead_us
176        )
177    }
178}
179
180// ---------------------------------------------------------------------------
181// BandwidthSummary
182// ---------------------------------------------------------------------------
183
184/// Aggregated bandwidth statistics across all recorded measurements.
185#[derive(Debug, Clone)]
186pub struct BandwidthSummary {
187    /// Total number of transfers recorded.
188    pub total_transfers: usize,
189    /// Total bytes transferred across all measurements.
190    pub total_bytes: usize,
191    /// Total wall-clock time of all transfers in milliseconds.
192    pub total_time_ms: f64,
193    /// Average bandwidth in GB/s across all measurements.
194    pub avg_bandwidth_gbps: f64,
195    /// Peak (maximum) bandwidth observed in any single measurement.
196    pub peak_bandwidth_gbps: f64,
197    /// Per-direction breakdown of statistics.
198    pub per_direction: Vec<DirectionSummary>,
199}
200
201impl fmt::Display for BandwidthSummary {
202    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
203        writeln!(f, "=== Bandwidth Summary ===")?;
204        writeln!(
205            f,
206            "Total: {} transfers, {} bytes, {:.3} ms",
207            self.total_transfers, self.total_bytes, self.total_time_ms
208        )?;
209        writeln!(
210            f,
211            "Avg: {:.2} GB/s, Peak: {:.2} GB/s",
212            self.avg_bandwidth_gbps, self.peak_bandwidth_gbps
213        )?;
214        for ds in &self.per_direction {
215            writeln!(f, "  {ds}")?;
216        }
217        Ok(())
218    }
219}
220
221// ---------------------------------------------------------------------------
222// BandwidthProfiler
223// ---------------------------------------------------------------------------
224
225/// Accumulates bandwidth measurements and produces summary statistics.
226///
227/// The profiler stores individual [`BandwidthMeasurement`] records and can
228/// compute aggregate statistics across all measurements or filtered by
229/// transfer direction.
230#[derive(Debug, Clone)]
231pub struct BandwidthProfiler {
232    /// Collected measurements.
233    measurements: Vec<BandwidthMeasurement>,
234    /// Number of warmup iterations (hint for benchmark drivers).
235    pub warmup_iterations: u32,
236    /// Number of benchmark iterations (hint for benchmark drivers).
237    pub benchmark_iterations: u32,
238}
239
240impl Default for BandwidthProfiler {
241    fn default() -> Self {
242        Self::new()
243    }
244}
245
246impl BandwidthProfiler {
247    /// Creates a new profiler with default iteration counts
248    /// (3 warmup, 10 benchmark).
249    pub fn new() -> Self {
250        Self {
251            measurements: Vec::new(),
252            warmup_iterations: 3,
253            benchmark_iterations: 10,
254        }
255    }
256
257    /// Creates a new profiler with custom iteration counts.
258    pub fn with_iterations(warmup: u32, benchmark: u32) -> Self {
259        Self {
260            measurements: Vec::new(),
261            warmup_iterations: warmup,
262            benchmark_iterations: benchmark,
263        }
264    }
265
266    /// Records a single bandwidth measurement.
267    pub fn record(&mut self, measurement: BandwidthMeasurement) {
268        self.measurements.push(measurement);
269    }
270
271    /// Returns the number of recorded measurements.
272    #[inline]
273    pub fn measurement_count(&self) -> usize {
274        self.measurements.len()
275    }
276
277    /// Returns a reference to all recorded measurements.
278    #[inline]
279    pub fn measurements(&self) -> &[BandwidthMeasurement] {
280        &self.measurements
281    }
282
283    /// Clears all recorded measurements.
284    pub fn clear(&mut self) {
285        self.measurements.clear();
286    }
287
288    /// Computes a summary of all recorded measurements.
289    ///
290    /// If no measurements have been recorded, all summary fields will be zero
291    /// and `per_direction` will be empty.
292    pub fn summary(&self) -> BandwidthSummary {
293        if self.measurements.is_empty() {
294            return BandwidthSummary {
295                total_transfers: 0,
296                total_bytes: 0,
297                total_time_ms: 0.0,
298                avg_bandwidth_gbps: 0.0,
299                peak_bandwidth_gbps: 0.0,
300                per_direction: Vec::new(),
301            };
302        }
303
304        let total_transfers = self.measurements.len();
305        let total_bytes: usize = self.measurements.iter().map(|m| m.bytes).sum();
306        let total_time_ms: f64 = self.measurements.iter().map(|m| m.elapsed_ms).sum();
307
308        let bw_sum: f64 = self.measurements.iter().map(|m| m.bandwidth_gbps).sum();
309        let avg_bandwidth_gbps = bw_sum / total_transfers as f64;
310
311        let peak_bandwidth_gbps = self
312            .measurements
313            .iter()
314            .map(|m| m.bandwidth_gbps)
315            .fold(0.0_f64, f64::max);
316
317        // Build per-direction summaries for each direction that has data.
318        let directions = [
319            TransferDirection::HostToDevice,
320            TransferDirection::DeviceToHost,
321            TransferDirection::DeviceToDevice,
322            TransferDirection::HostToHost,
323        ];
324
325        let per_direction: Vec<DirectionSummary> = directions
326            .iter()
327            .filter_map(|&dir| self.compute_direction_summary(dir))
328            .collect();
329
330        BandwidthSummary {
331            total_transfers,
332            total_bytes,
333            total_time_ms,
334            avg_bandwidth_gbps,
335            peak_bandwidth_gbps,
336            per_direction,
337        }
338    }
339
340    /// Computes a summary for a single transfer direction.
341    ///
342    /// Returns `None` if no measurements exist for the given direction.
343    pub fn summary_by_direction(&self, dir: TransferDirection) -> Option<DirectionSummary> {
344        self.compute_direction_summary(dir)
345    }
346
347    /// Internal helper to compute a [`DirectionSummary`] for one direction.
348    fn compute_direction_summary(&self, dir: TransferDirection) -> Option<DirectionSummary> {
349        let filtered: Vec<&BandwidthMeasurement> = self
350            .measurements
351            .iter()
352            .filter(|m| m.direction == dir)
353            .collect();
354
355        if filtered.is_empty() {
356            return None;
357        }
358
359        let transfer_count = filtered.len();
360        let total_bytes: usize = filtered.iter().map(|m| m.bytes).sum();
361
362        let bw_sum: f64 = filtered.iter().map(|m| m.bandwidth_gbps).sum();
363        let avg_bandwidth_gbps = bw_sum / transfer_count as f64;
364
365        let min_bandwidth_gbps = filtered
366            .iter()
367            .map(|m| m.bandwidth_gbps)
368            .fold(f64::INFINITY, f64::min);
369
370        let max_bandwidth_gbps = filtered
371            .iter()
372            .map(|m| m.bandwidth_gbps)
373            .fold(0.0_f64, f64::max);
374
375        // Estimate latency overhead from the smallest transfer.
376        // The smallest transfer is most dominated by fixed overhead, so its
377        // latency serves as a reasonable approximation.
378        let latency_overhead_us = filtered
379            .iter()
380            .min_by_key(|m| m.bytes)
381            .map(|m| m.latency_us())
382            .unwrap_or(0.0);
383
384        Some(DirectionSummary {
385            direction: dir,
386            transfer_count,
387            total_bytes,
388            avg_bandwidth_gbps,
389            min_bandwidth_gbps,
390            max_bandwidth_gbps,
391            latency_overhead_us,
392        })
393    }
394}
395
396// ---------------------------------------------------------------------------
397// BandwidthBenchmarkConfig
398// ---------------------------------------------------------------------------
399
400/// Configuration for a bandwidth benchmark sweep.
401///
402/// Specifies which transfer sizes, directions, and iteration counts to use
403/// when running a benchmark. The [`Default`] implementation provides a
404/// standard set of sizes from 1 KB to 256 MB across all four directions.
405#[derive(Debug, Clone)]
406pub struct BandwidthBenchmarkConfig {
407    /// Transfer sizes to benchmark (in bytes).
408    pub sizes: Vec<usize>,
409    /// Transfer directions to benchmark.
410    pub directions: Vec<TransferDirection>,
411    /// Number of warmup iterations before timing begins.
412    pub warmup_iterations: u32,
413    /// Number of timed benchmark iterations per size/direction pair.
414    pub benchmark_iterations: u32,
415    /// Whether to use pinned (page-locked) host memory for transfers.
416    pub use_pinned_memory: bool,
417}
418
419impl Default for BandwidthBenchmarkConfig {
420    fn default() -> Self {
421        Self {
422            sizes: vec![
423                1 << 10,   // 1 KB
424                4 << 10,   // 4 KB
425                16 << 10,  // 16 KB
426                64 << 10,  // 64 KB
427                256 << 10, // 256 KB
428                1 << 20,   // 1 MB
429                4 << 20,   // 4 MB
430                16 << 20,  // 16 MB
431                64 << 20,  // 64 MB
432                256 << 20, // 256 MB
433            ],
434            directions: vec![
435                TransferDirection::HostToDevice,
436                TransferDirection::DeviceToHost,
437                TransferDirection::DeviceToDevice,
438                TransferDirection::HostToHost,
439            ],
440            warmup_iterations: 3,
441            benchmark_iterations: 10,
442            use_pinned_memory: true,
443        }
444    }
445}
446
447impl BandwidthBenchmarkConfig {
448    /// Creates a new config with custom sizes and default settings.
449    pub fn with_sizes(sizes: Vec<usize>) -> Self {
450        Self {
451            sizes,
452            ..Self::default()
453        }
454    }
455
456    /// Creates a new config for a single direction.
457    pub fn for_direction(direction: TransferDirection) -> Self {
458        Self {
459            directions: vec![direction],
460            ..Self::default()
461        }
462    }
463
464    /// Sets the number of warmup and benchmark iterations.
465    pub fn set_iterations(&mut self, warmup: u32, benchmark: u32) {
466        self.warmup_iterations = warmup;
467        self.benchmark_iterations = benchmark;
468    }
469
470    /// Total number of individual transfers this config would produce.
471    ///
472    /// Equal to `sizes.len() * directions.len() * benchmark_iterations`.
473    pub fn total_transfers(&self) -> usize {
474        self.sizes.len() * self.directions.len() * self.benchmark_iterations as usize
475    }
476}
477
478// ---------------------------------------------------------------------------
479// Standalone functions
480// ---------------------------------------------------------------------------
481
/// Predicts how long a transfer of `bytes` bytes takes, in milliseconds.
///
/// The model is linear: a fixed latency plus the time needed to move the
/// payload at a sustained rate, i.e. `time = latency + bytes / bandwidth`.
///
/// # Parameters
///
/// * `bytes` — payload size in bytes.
/// * `bandwidth_gbps` — sustained bandwidth in GB/s (10^9 bytes/s).
/// * `latency_us` — fixed per-transfer overhead in microseconds.
///
/// # Returns
///
/// The estimated duration in milliseconds, or `f64::INFINITY` when
/// `bandwidth_gbps` is zero or negative.
pub fn estimate_transfer_time(bytes: usize, bandwidth_gbps: f64, latency_us: f64) -> f64 {
    if bandwidth_gbps <= 0.0 {
        f64::INFINITY
    } else {
        // 1 GB/s = 10^9 bytes/s = 10^6 bytes/ms, so dividing the payload by
        // (GB/s * 10^6) gives milliseconds directly.
        let bytes_per_ms = bandwidth_gbps * 1e6;
        let payload_ms = bytes as f64 / bytes_per_ms;
        let overhead_ms = latency_us / 1000.0;
        overhead_ms + payload_ms
    }
}
508
/// Returns the theoretical peak unidirectional bandwidth for a PCIe
/// configuration in GB/s.
///
/// The result is the per-lane raw rate multiplied by the lane count and by
/// the line-encoding efficiency of the generation, converted from Gbit/s to
/// GB/s:
///
/// | Generation | Per-lane rate (GT/s) | Encoding          | Efficiency |
/// |------------|----------------------|-------------------|------------|
/// | PCIe 1.0   | 2.5                  | 8b/10b            | 80%        |
/// | PCIe 2.0   | 5.0                  | 8b/10b            | 80%        |
/// | PCIe 3.0   | 8.0                  | 128b/130b         | ~98.46%    |
/// | PCIe 4.0   | 16.0                 | 128b/130b         | ~98.46%    |
/// | PCIe 5.0   | 32.0                 | 128b/130b         | ~98.46%    |
/// | PCIe 6.0   | 64.0                 | 242B/256B FLIT    | ~94.53%    |
///
/// PCIe 6.0 does not use 128b/130b: it transmits fixed 256-byte FLITs of
/// which 242 bytes carry packet data (the rest is CRC and FEC), giving
/// 121 GB/s for an x16 link.
///
/// Protocol-level overhead (packet headers, flow control) is not modelled.
///
/// # Parameters
///
/// * `pcie_gen` — PCIe generation (1–6).
/// * `lanes` — number of PCIe lanes (typically 1, 4, 8, or 16).
///
/// # Returns
///
/// Theoretical peak bandwidth in GB/s, or `0.0` if `pcie_gen` is out of
/// range or `lanes` is zero.
pub fn theoretical_peak_bandwidth(pcie_gen: u32, lanes: u32) -> f64 {
    const EFF_8B_10B: f64 = 0.8;
    const EFF_128B_130B: f64 = 128.0 / 130.0;
    const EFF_FLIT_242_256: f64 = 242.0 / 256.0;

    if lanes == 0 {
        return 0.0;
    }

    // (per-lane rate in GT/s, fraction of raw bits that carry data)
    let (rate_gtps, encoding_efficiency): (f64, f64) = match pcie_gen {
        1 => (2.5, EFF_8B_10B),
        2 => (5.0, EFF_8B_10B),
        3 => (8.0, EFF_128B_130B),
        4 => (16.0, EFF_128B_130B),
        5 => (32.0, EFF_128B_130B),
        6 => (64.0, EFF_FLIT_242_256),
        _ => return 0.0,
    };

    // GT/s is treated as Gbit/s per lane; divide by 8 to get GB/s.
    rate_gtps * f64::from(lanes) * encoding_efficiency / 8.0
}
557
/// Returns the bandwidth utilization ratio (0.0–1.0).
///
/// # Parameters
///
/// * `measured_gbps` — measured bandwidth in GB/s.
/// * `peak_gbps` — theoretical peak bandwidth in GB/s.
///
/// # Returns
///
/// The ratio `measured / peak`, clamped to `[0.0, 1.0]`. Returns `0.0` if
/// `peak_gbps` is zero, negative, or NaN, and also when the ratio itself is
/// undefined (a NaN `measured_gbps`, or both values infinite), so the result
/// is always a finite number inside the documented range.
pub fn bandwidth_utilization(measured_gbps: f64, peak_gbps: f64) -> f64 {
    if peak_gbps.is_nan() || peak_gbps <= 0.0 {
        return 0.0;
    }

    let ratio = measured_gbps / peak_gbps;
    if ratio.is_nan() {
        // `f64::clamp` passes NaN through unchanged, which would break the
        // [0.0, 1.0] contract; report "no utilization" instead.
        return 0.0;
    }
    ratio.clamp(0.0, 1.0)
}
575
/// Renders a byte count as a short human-readable string such as `"1.00 MB"`.
///
/// Units are powers of 1024. The largest unit whose threshold the value
/// reaches is used, with two decimal places; values below 1024 are printed
/// as a whole number of bytes (e.g. `"500 B"`).
pub fn format_bytes(bytes: usize) -> String {
    // Largest unit first so the first match is the most appropriate one.
    const UNITS: [(usize, &str); 3] = [(1 << 30, "GB"), (1 << 20, "MB"), (1 << 10, "KB")];

    match UNITS.iter().find(|unit| bytes >= unit.0) {
        Some(&(threshold, suffix)) => {
            format!("{:.2} {suffix}", bytes as f64 / threshold as f64)
        }
        None => format!("{bytes} B"),
    }
}
592
/// Renders a bandwidth given in GB/s as text suitable for logs and reports.
///
/// Values of at least 1 GB/s are shown in GB/s; anything else is converted
/// to MB/s (factor 1000). Both forms use two decimal places.
pub fn describe_bandwidth(gbps: f64) -> String {
    match gbps {
        rate if rate >= 1.0 => format!("{rate:.2} GB/s"),
        rate => format!("{:.2} MB/s", rate * 1000.0),
    }
}
603
604// ---------------------------------------------------------------------------
605// Tests
606// ---------------------------------------------------------------------------
607
#[cfg(test)]
mod tests {
    use super::*;

    /// Asserts that `actual` lies strictly within `tol` of `expected`.
    fn assert_close(actual: f64, expected: f64, tol: f64) {
        assert!(
            (actual - expected).abs() < tol,
            "expected {expected} (tolerance {tol}), got {actual}"
        );
    }

    /// Shorthand for building a measurement.
    fn sample(dir: TransferDirection, bytes: usize, elapsed_ms: f64) -> BandwidthMeasurement {
        BandwidthMeasurement::new(dir, bytes, elapsed_ms)
    }

    // -- BandwidthMeasurement ------------------------------------------------

    #[test]
    fn measurement_new_computes_bandwidth() {
        // 10^9 bytes in 1000 ms is exactly 1 GB/s.
        let m = sample(TransferDirection::HostToDevice, 1_000_000_000, 1000.0);
        assert_close(m.bandwidth_gbps, 1.0, 1e-6);
    }

    #[test]
    fn measurement_zero_elapsed_gives_zero_bandwidth() {
        let m = sample(TransferDirection::HostToDevice, 1024, 0.0);
        assert_close(m.bandwidth_gbps, 0.0, f64::EPSILON);
    }

    #[test]
    fn measurement_negative_elapsed_gives_zero_bandwidth() {
        let m = sample(TransferDirection::DeviceToHost, 1024, -1.0);
        assert_close(m.bandwidth_gbps, 0.0, f64::EPSILON);
    }

    #[test]
    fn measurement_bandwidth_mbps() {
        let m = sample(TransferDirection::DeviceToDevice, 1_000_000_000, 1000.0);
        assert_close(m.bandwidth_mbps(), 1000.0, 1e-3);
    }

    #[test]
    fn measurement_latency_us() {
        let m = sample(TransferDirection::HostToHost, 1024, 2.5);
        assert_close(m.latency_us(), 2500.0, 1e-6);
    }

    #[test]
    fn measurement_display() {
        let rendered = sample(TransferDirection::HostToDevice, 1048576, 0.5).to_string();
        for fragment in ["Host -> Device", "1048576", "0.500 ms", "GB/s"] {
            assert!(rendered.contains(fragment));
        }
    }

    // -- BandwidthProfiler ---------------------------------------------------

    #[test]
    fn profiler_empty_summary() {
        let report = BandwidthProfiler::new().summary();
        assert_eq!(report.total_transfers, 0);
        assert_eq!(report.total_bytes, 0);
        assert_close(report.avg_bandwidth_gbps, 0.0, f64::EPSILON);
        assert!(report.per_direction.is_empty());
    }

    #[test]
    fn profiler_record_and_summary() {
        let mut profiler = BandwidthProfiler::new();

        // Two host-to-device transfers: 1 MB in 0.5 ms and 2 MB in 1.0 ms.
        profiler.record(sample(TransferDirection::HostToDevice, 1 << 20, 0.5));
        profiler.record(sample(TransferDirection::HostToDevice, 2 << 20, 1.0));
        // One device-to-host transfer: 512 KB in 0.25 ms.
        profiler.record(sample(TransferDirection::DeviceToHost, 512 << 10, 0.25));

        let report = profiler.summary();
        assert_eq!(report.total_transfers, 3);
        assert_eq!(report.total_bytes, (1 << 20) + (2 << 20) + (512 << 10));
        assert_close(report.total_time_ms, 1.75, 1e-9);
        assert!(report.peak_bandwidth_gbps > 0.0);
        // Only the two directions that have data are reported.
        assert_eq!(report.per_direction.len(), 2);
    }

    #[test]
    fn profiler_summary_by_direction() {
        let mut profiler = BandwidthProfiler::new();
        profiler.record(sample(TransferDirection::HostToDevice, 1 << 20, 0.5));
        profiler.record(sample(TransferDirection::DeviceToHost, 1 << 20, 0.6));

        let has = |dir| profiler.summary_by_direction(dir).is_some();
        assert!(has(TransferDirection::HostToDevice));
        assert!(has(TransferDirection::DeviceToHost));
        assert!(!has(TransferDirection::DeviceToDevice));
    }

    #[test]
    fn profiler_direction_summary_stats() {
        let mut profiler = BandwidthProfiler::new();

        // Same duration, different sizes, hence different bandwidths.
        let slower = sample(TransferDirection::HostToDevice, 1_000_000, 1.0);
        let faster = sample(TransferDirection::HostToDevice, 2_000_000, 1.0);
        let (bw_slow, bw_fast) = (slower.bandwidth_gbps, faster.bandwidth_gbps);
        profiler.record(slower);
        profiler.record(faster);

        let stats = profiler
            .summary_by_direction(TransferDirection::HostToDevice)
            .expect("should have HtoD summary");

        assert_eq!(stats.transfer_count, 2);
        assert_eq!(stats.total_bytes, 3_000_000);
        assert_close(stats.avg_bandwidth_gbps, (bw_slow + bw_fast) / 2.0, 1e-9);
        assert_close(stats.min_bandwidth_gbps, bw_slow, 1e-9);
        assert_close(stats.max_bandwidth_gbps, bw_fast, 1e-9);
    }

    #[test]
    fn profiler_with_iterations() {
        let profiler = BandwidthProfiler::with_iterations(5, 20);
        assert_eq!(profiler.warmup_iterations, 5);
        assert_eq!(profiler.benchmark_iterations, 20);
        assert_eq!(profiler.measurement_count(), 0);
    }

    #[test]
    fn profiler_clear() {
        let mut profiler = BandwidthProfiler::new();
        profiler.record(sample(TransferDirection::HostToDevice, 1024, 0.1));
        assert_eq!(profiler.measurement_count(), 1);
        profiler.clear();
        assert_eq!(profiler.measurement_count(), 0);
    }

    // -- Standalone functions ------------------------------------------------

    #[test]
    fn estimate_transfer_time_basic() {
        // 1 GB at 10 GB/s -> 100 ms of data time, plus 5 us = 0.005 ms latency.
        let estimate = estimate_transfer_time(1_000_000_000, 10.0, 5.0);
        assert_close(estimate, 100.005, 1e-6);
    }

    #[test]
    fn estimate_transfer_time_zero_bandwidth() {
        assert!(estimate_transfer_time(1024, 0.0, 5.0).is_infinite());
    }

    #[test]
    fn theoretical_peak_bandwidth_pcie3_x16() {
        // 8 GT/s * 16 * (128/130) / 8 ≈ 15.754 GB/s
        assert_close(theoretical_peak_bandwidth(3, 16), 15.754, 0.01);
    }

    #[test]
    fn theoretical_peak_bandwidth_pcie4_x16() {
        // 16 GT/s * 16 * (128/130) / 8 ≈ 31.508 GB/s
        assert_close(theoretical_peak_bandwidth(4, 16), 31.508, 0.01);
    }

    #[test]
    fn theoretical_peak_bandwidth_pcie5_x16() {
        // 32 GT/s * 16 * (128/130) / 8 ≈ 63.015 GB/s
        assert_close(theoretical_peak_bandwidth(5, 16), 63.015, 0.02);
    }

    #[test]
    fn theoretical_peak_bandwidth_invalid_gen() {
        for generation in [0, 7] {
            assert_close(theoretical_peak_bandwidth(generation, 16), 0.0, f64::EPSILON);
        }
    }

    #[test]
    fn theoretical_peak_bandwidth_zero_lanes() {
        assert_close(theoretical_peak_bandwidth(3, 0), 0.0, f64::EPSILON);
    }

    #[test]
    fn bandwidth_utilization_basic() {
        assert_close(bandwidth_utilization(12.0, 16.0), 0.75, 1e-9);
    }

    #[test]
    fn bandwidth_utilization_clamps_above_one() {
        assert_close(bandwidth_utilization(20.0, 16.0), 1.0, f64::EPSILON);
    }

    #[test]
    fn bandwidth_utilization_zero_peak() {
        assert_close(bandwidth_utilization(10.0, 0.0), 0.0, f64::EPSILON);
    }

    // -- BandwidthBenchmarkConfig --------------------------------------------

    #[test]
    fn benchmark_config_default_sizes() {
        let cfg = BandwidthBenchmarkConfig::default();
        assert_eq!(cfg.sizes.len(), 10);
        assert_eq!(cfg.sizes[0], 1 << 10); // 1 KB
        assert_eq!(cfg.sizes[9], 256 << 20); // 256 MB
        assert_eq!(cfg.directions.len(), 4);
        assert_eq!(cfg.warmup_iterations, 3);
        assert_eq!(cfg.benchmark_iterations, 10);
        assert!(cfg.use_pinned_memory);
    }

    #[test]
    fn benchmark_config_total_transfers() {
        // 10 sizes * 4 directions * 10 iterations
        assert_eq!(BandwidthBenchmarkConfig::default().total_transfers(), 400);
    }

    #[test]
    fn benchmark_config_with_sizes() {
        let cfg = BandwidthBenchmarkConfig::with_sizes(vec![1024, 2048]);
        assert_eq!(cfg.sizes.len(), 2);
        // Directions fall back to the default set.
        assert_eq!(cfg.directions.len(), 4);
    }

    #[test]
    fn benchmark_config_for_direction() {
        let cfg = BandwidthBenchmarkConfig::for_direction(TransferDirection::DeviceToDevice);
        assert_eq!(cfg.directions.len(), 1);
        assert_eq!(cfg.directions[0], TransferDirection::DeviceToDevice);
    }

    // -- Display / formatting ------------------------------------------------

    #[test]
    fn summary_display_format() {
        let mut profiler = BandwidthProfiler::new();
        profiler.record(sample(TransferDirection::HostToDevice, 1 << 20, 0.5));

        let rendered = profiler.summary().to_string();
        assert!(rendered.contains("Bandwidth Summary"));
        assert!(rendered.contains("GB/s"));
    }

    #[test]
    fn direction_display() {
        let expected = [
            (TransferDirection::HostToDevice, "Host -> Device"),
            (TransferDirection::DeviceToHost, "Device -> Host"),
            (TransferDirection::DeviceToDevice, "Device -> Device"),
            (TransferDirection::HostToHost, "Host -> Host"),
        ];
        for (dir, label) in expected {
            assert_eq!(dir.to_string(), label);
        }
    }

    #[test]
    fn format_bytes_ranges() {
        let expected = [
            (500, "500 B"),
            (1024, "1.00 KB"),
            (1 << 20, "1.00 MB"),
            (1 << 30, "1.00 GB"),
        ];
        for (bytes, text) in expected {
            assert_eq!(format_bytes(bytes), text);
        }
    }

    #[test]
    fn describe_bandwidth_formatting() {
        assert_eq!(describe_bandwidth(2.5), "2.50 GB/s");
        assert_eq!(describe_bandwidth(0.5), "500.00 MB/s");
    }

    // -- PCIe gen 1/2 encoding -----------------------------------------------

    #[test]
    fn theoretical_peak_bandwidth_pcie1_x16() {
        // 2.5 GT/s * 16 * 0.8 / 8 = 4.0 GB/s
        assert_close(theoretical_peak_bandwidth(1, 16), 4.0, 1e-6);
    }

    #[test]
    fn theoretical_peak_bandwidth_pcie2_x16() {
        // 5.0 GT/s * 16 * 0.8 / 8 = 8.0 GB/s
        assert_close(theoretical_peak_bandwidth(2, 16), 8.0, 1e-6);
    }
}
934        let bw = theoretical_peak_bandwidth(2, 16);
935        // PCIe 2.0 x16: 5.0 GT/s * 16 * 0.8 / 8 = 8.0 GB/s
936        assert!((bw - 8.0).abs() < 1e-6);
937    }
938}