use std::fmt;
use std::collections::HashMap;
/// Classification of how the lanes of one warp address memory.
///
/// [`analyze_warp_access`] reports one of these per analyzed warp.
#[derive(Clone, Copy, Debug, PartialEq)]
pub enum AccessPattern {
    /// Consecutive lanes are exactly one element apart. Also used for the
    /// report returned when there is nothing to analyze.
    FullyCoalesced,
    /// Consecutive lanes are separated by a constant `stride`, expressed in
    /// the same unit as the addresses (presumably bytes).
    Strided { stride: usize },
    /// No single constant stride describes the lane-to-lane address steps.
    Scattered,
    /// Every lane touches the same address.
    Broadcast,
    /// Block-cyclic layout with blocks of `block_size`.
    ///
    /// NOTE(review): nothing visible in this file constructs this variant.
    BlockCyclic { block_size: usize },
}
/// One memory operation issued by a single lane of a warp.
#[derive(Clone, Debug)]
pub struct MemoryAccess {
    /// Lane that issued the access; the analyzer orders accesses by this value.
    pub lane_id: u32,
    /// Address touched by the access (presumably a byte address).
    pub address: usize,
    /// `true` for a store, `false` for a load. Recorded only; the analysis in
    /// this file does not consult it.
    pub is_write: bool,
    /// Size of the accessed element, in the same unit as `address`.
    pub elem_size: usize,
}
/// Result of analyzing the accesses of a single warp.
#[derive(Clone, Debug)]
pub struct CoalescingReport {
    /// Pattern the warp's accesses were classified as.
    pub pattern: AccessPattern,
    /// Number of memory transactions attributed to the warp.
    pub transactions: u32,
    /// Ratio of requested to fetched data, stored as a fraction; the
    /// `Display` implementation renders it as a percentage.
    pub efficiency: f64,
    /// Number of distinct cache lines the warp touched.
    pub cache_lines_touched: u32,
    /// Human-readable optimization hint for the detected pattern.
    pub suggestion: String,
}
/// Renders the report as one line: pattern, transaction count, efficiency as
/// a percentage with one decimal, and the suggestion text.
impl fmt::Display for CoalescingReport {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let Self {
            pattern,
            transactions,
            efficiency,
            suggestion,
            ..
        } = self;
        // `efficiency` is stored as a fraction; show it as a percentage.
        let percent = *efficiency * 100.0;
        f.write_fmt(format_args!(
            "Coalescing: {:?}, {} transactions, {:.1}% efficiency — {}",
            pattern, transactions, percent, suggestion
        ))
    }
}
/// Granularity used to bucket addresses into cache lines
/// (same unit as the addresses — presumably bytes).
const CACHE_LINE_SIZE: usize = 128;

/// Granularity of a single memory transaction (segment),
/// in the same unit as `CACHE_LINE_SIZE`.
const TRANSACTION_SIZE: usize = 32;
/// Classifies the memory access pattern of one warp and estimates its cost.
///
/// * Empty input yields a `FullyCoalesced` report with zero transactions and
///   an efficiency of `1.0`.
/// * The element size is taken from the first access; mixed element sizes
///   within one warp are not modelled.
/// * Accesses are ordered by `lane_id` (stable for duplicate lane ids). If all
///   addresses are identical — which includes a single access — the warp is
///   reported as `Broadcast` with one transaction.
/// * Otherwise transactions and cache lines are counted as the number of
///   distinct `TRANSACTION_SIZE` / `CACHE_LINE_SIZE` chunks that contain the
///   *start* address of an access; elements straddling a chunk boundary are
///   not accounted for.
/// * `efficiency` is requested bytes over fetched bytes, capped at `1.0` so
///   duplicate addresses or oversized elements cannot report more than 100%.
/// * The pattern is `FullyCoalesced` when every lane-to-lane step equals the
///   element size, `Strided` when the steps share another constant
///   non-negative value, and `Scattered` otherwise. Any decreasing step makes
///   the warp `Scattered`.
pub fn analyze_warp_access(accesses: &[MemoryAccess]) -> CoalescingReport {
    let elem_size = match accesses.first() {
        Some(first) => first.elem_size,
        None => {
            return CoalescingReport {
                pattern: AccessPattern::FullyCoalesced,
                transactions: 0,
                efficiency: 1.0,
                cache_lines_touched: 0,
                suggestion: "No accesses to analyze".into(),
            };
        }
    };

    // Only (lane, address) is needed to reason about ordering; this avoids
    // cloning whole records. `sort_by_key` is stable, so duplicate lane ids
    // keep their input order.
    let mut by_lane: Vec<(u32, usize)> = accesses
        .iter()
        .map(|access| (access.lane_id, access.address))
        .collect();
    by_lane.sort_by_key(|&(lane, _)| lane);

    if by_lane.windows(2).all(|pair| pair[0].1 == pair[1].1) {
        return CoalescingReport {
            pattern: AccessPattern::Broadcast,
            transactions: 1,
            // One segment is fetched for one element; cap at 100% for
            // elements larger than a segment.
            efficiency: (elem_size as f64 / TRANSACTION_SIZE as f64).min(1.0),
            cache_lines_touched: 1,
            suggestion: "Broadcast access — consider using shared memory or constant cache".into(),
        };
    }

    let cache_lines_touched = count_distinct_chunks(accesses, CACHE_LINE_SIZE);
    let transactions = count_distinct_chunks(accesses, TRANSACTION_SIZE);

    // Computed in floating point so absurd element sizes cannot overflow.
    // `transactions` is at least 1 here because `accesses` is non-empty.
    let useful_bytes = accesses.len() as f64 * elem_size as f64;
    let fetched_bytes = transactions as f64 * TRANSACTION_SIZE as f64;
    let efficiency = (useful_bytes / fetched_bytes).min(1.0);

    // A uniform stride of zero cannot reach this point: it would mean all
    // addresses are equal, which is handled by the broadcast return above.
    let (pattern, suggestion) = match uniform_lane_stride(&by_lane) {
        Some(stride) if stride == elem_size => (
            AccessPattern::FullyCoalesced,
            String::from("Fully coalesced — optimal memory access pattern"),
        ),
        Some(stride) => {
            // Express the stride in elements; fall back to the raw stride
            // when the element size is zero instead of dividing by zero.
            let stride_ratio = stride.checked_div(elem_size).unwrap_or(stride);
            (
                AccessPattern::Strided { stride },
                format!(
                    "Stride-{} access — consider transposing data layout or using shared memory tiling",
                    stride_ratio
                ),
            )
        }
        None => (
            AccessPattern::Scattered,
            String::from("Scattered access — consider sorting indices or using texture cache"),
        ),
    };

    CoalescingReport {
        pattern,
        transactions,
        efficiency,
        cache_lines_touched,
        suggestion,
    }
}

/// Counts how many distinct `granularity`-sized chunks contain the start
/// address of at least one access.
fn count_distinct_chunks(accesses: &[MemoryAccess], granularity: usize) -> u32 {
    let mut chunk_ids: Vec<usize> = accesses
        .iter()
        .map(|access| access.address / granularity)
        .collect();
    chunk_ids.sort_unstable();
    chunk_ids.dedup();
    // At most one chunk per access, so this fits for any realistic warp.
    chunk_ids.len() as u32
}

/// Returns the common address step between consecutive lanes, or `None` when
/// the steps differ, any step is decreasing, or there are fewer than two
/// accesses.
fn uniform_lane_stride(by_lane: &[(u32, usize)]) -> Option<usize> {
    let mut steps = by_lane
        .windows(2)
        .map(|pair| pair[1].1.checked_sub(pair[0].1));
    // Outer `?`: no steps at all; inner `?`: the first step is decreasing.
    let first = steps.next()??;
    if steps.all(|step| step == Some(first)) {
        Some(first)
    } else {
        None
    }
}
/// Generates the read accesses of a warp walking memory linearly.
///
/// Lane `i` (for `i` in `0..warp_size`) reads element index
/// `i * stride + offset`, i.e. address `base + (i * stride + offset) * elem_size`.
/// `stride` and `offset` are therefore expressed in elements, not bytes.
/// Every generated access has `is_write == false`.
pub fn simulate_linear_access(
    base: usize,
    stride: usize,
    offset: usize,
    elem_size: usize,
    warp_size: u32,
) -> Vec<MemoryAccess> {
    let mut warp = Vec::with_capacity(warp_size as usize);
    for lane_id in 0..warp_size {
        let element_index = lane_id as usize * stride + offset;
        warp.push(MemoryAccess {
            lane_id,
            address: base + element_index * elem_size,
            is_write: false,
            elem_size,
        });
    }
    warp
}
/// Generates the read accesses of a warp walking down one column of a
/// row-major matrix with `num_cols` columns.
///
/// Lane `i` (for `i` in `0..warp_size`) reads element `(row i, column col)`,
/// i.e. address `base + (i * num_cols + col) * elem_size`. Every generated
/// access has `is_write == false`.
pub fn simulate_column_access(
    base: usize,
    num_cols: usize,
    col: usize,
    elem_size: usize,
    warp_size: u32,
) -> Vec<MemoryAccess> {
    // Address of the element the given lane (== row) reads.
    let address_of = |row: u32| base + (row as usize * num_cols + col) * elem_size;

    (0..warp_size)
        .map(|lane_id| MemoryAccess {
            lane_id,
            address: address_of(lane_id),
            is_write: false,
            elem_size,
        })
        .collect()
}
/// Collects individual lane accesses and groups them into warps for analysis.
///
/// Derives `Default` (an empty recorder, matching what `new()` builds) and
/// `Debug` so the type can be used with `..Default::default()`, in
/// `#[derive(Default)]` containers, and in diagnostics.
#[derive(Debug, Default)]
pub struct AccessRecorder {
    /// Completed warps, in the order they were flushed.
    accesses: Vec<Vec<MemoryAccess>>,
    /// Accesses recorded since the last flush.
    current_warp: Vec<MemoryAccess>,
}
impl AccessRecorder {
pub fn new() -> Self {
Self {
accesses: Vec::new(),
current_warp: Vec::new(),
}
}
pub fn record(&mut self, lane_id: u32, address: usize, elem_size: usize, is_write: bool) {
self.current_warp.push(MemoryAccess {
lane_id,
address,
is_write,
elem_size,
});
if self.current_warp.len() >= 32 {
self.flush_warp();
}
}
pub fn flush_warp(&mut self) {
if !self.current_warp.is_empty() {
self.accesses.push(std::mem::take(&mut self.current_warp));
}
}
pub fn analyze(&mut self) -> Vec<CoalescingReport> {
self.flush_warp();
self.accesses.iter().map(|warp| analyze_warp_access(warp)).collect()
}
pub fn summary(&mut self) -> AccessSummary {
let reports = self.analyze();
let mut pattern_counts: HashMap<String, usize> = HashMap::new();
let mut total_efficiency = 0.0;
let mut total_transactions = 0u32;
for report in &reports {
let key = format!("{:?}", report.pattern);
*pattern_counts.entry(key).or_insert(0) += 1;
total_efficiency += report.efficiency;
total_transactions += report.transactions;
}
let count = reports.len();
AccessSummary {
total_warps_analyzed: count,
avg_efficiency: if count > 0 { total_efficiency / count as f64 } else { 0.0 },
total_transactions,
pattern_distribution: pattern_counts,
}
}
}
/// Aggregate statistics over all warps analyzed by an `AccessRecorder`.
#[derive(Debug)]
pub struct AccessSummary {
    /// Number of warps that contributed to this summary.
    pub total_warps_analyzed: usize,
    /// Mean of the per-warp efficiencies, stored as a fraction; the
    /// `Display` implementation renders it as a percentage.
    pub avg_efficiency: f64,
    /// Sum of the per-warp transaction counts.
    pub total_transactions: u32,
    /// Number of warps per detected pattern, keyed by a textual rendering of
    /// the pattern.
    pub pattern_distribution: HashMap<String, usize>,
}
/// Renders the summary as one line: warp count, average efficiency as a
/// percentage with one decimal, and the total transaction count. The pattern
/// distribution is not part of the output.
impl fmt::Display for AccessSummary {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let warps = self.total_warps_analyzed;
        // Stored as a fraction; shown as a percentage.
        let avg_percent = self.avg_efficiency * 100.0;
        let transactions = self.total_transactions;
        f.write_fmt(format_args!(
            "Access Summary: {} warps, {:.1}% avg efficiency, {} transactions",
            warps, avg_percent, transactions
        ))
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a 4-byte read issued by `lane_id` at `address`.
    fn read4(lane_id: u32, address: usize) -> MemoryAccess {
        MemoryAccess {
            lane_id,
            address,
            is_write: false,
            elem_size: 4,
        }
    }

    /// Records one full warp whose lane `i` reads address `i * step`.
    fn record_warp(recorder: &mut AccessRecorder, step: usize) {
        for lane in 0..32 {
            recorder.record(lane, (lane as usize) * step, 4, false);
        }
    }

    /// Unit-stride 4-byte reads are reported as fully coalesced.
    #[test]
    fn test_coalesced_access() {
        let report = analyze_warp_access(&simulate_linear_access(0, 1, 0, 4, 32));
        assert_eq!(report.pattern, AccessPattern::FullyCoalesced);
        assert!(report.efficiency > 0.9);
    }

    /// Walking a column of a 512-wide matrix yields a 2048-byte stride.
    #[test]
    fn test_strided_access() {
        let report = analyze_warp_access(&simulate_column_access(0, 512, 0, 4, 32));
        if let AccessPattern::Strided { stride } = report.pattern {
            assert_eq!(stride, 512 * 4);
        } else {
            panic!("Expected strided pattern, got {:?}", report.pattern);
        }
        assert!(
            report.efficiency < 0.2,
            "Strided access should have low efficiency: {}",
            report.efficiency
        );
    }

    /// All lanes reading one address is a single-transaction broadcast.
    #[test]
    fn test_broadcast_access() {
        let accesses: Vec<MemoryAccess> = (0..32).map(|lane| read4(lane, 1000)).collect();
        let report = analyze_warp_access(&accesses);
        assert_eq!(report.pattern, AccessPattern::Broadcast);
        assert_eq!(report.transactions, 1);
    }

    /// Irregular addresses are reported as scattered.
    #[test]
    fn test_scattered_access() {
        let addresses = [
            100, 5000, 200, 9000, 50, 7000, 300, 2000, //
            400, 6000, 150, 8000, 250, 3000, 350, 1000, //
            450, 4000, 550, 10000, 650, 11000, 750, 12000, //
            850, 13000, 950, 14000, 1050, 15000, 1150, 16000,
        ];
        let accesses: Vec<MemoryAccess> = addresses
            .iter()
            .zip(0u32..)
            .map(|(&addr, lane)| read4(lane, addr))
            .collect();
        let report = analyze_warp_access(&accesses);
        assert_eq!(report.pattern, AccessPattern::Scattered);
        assert!(report.transactions > 1);
    }

    /// Thirty-two recorded accesses form exactly one analyzed warp.
    #[test]
    fn test_recorder() {
        let mut recorder = AccessRecorder::new();
        record_warp(&mut recorder, 4);
        let reports = recorder.analyze();
        assert_eq!(reports.len(), 1);
        assert_eq!(reports[0].pattern, AccessPattern::FullyCoalesced);
    }

    /// The summary covers every recorded warp.
    #[test]
    fn test_summary() {
        let mut recorder = AccessRecorder::new();
        record_warp(&mut recorder, 4);
        record_warp(&mut recorder, 2048);
        let summary = recorder.summary();
        assert_eq!(summary.total_warps_analyzed, 2);
        assert!(summary.avg_efficiency > 0.0);
    }

    /// `Display` shows the efficiency as a percentage.
    #[test]
    fn test_report_display() {
        let report = CoalescingReport {
            pattern: AccessPattern::FullyCoalesced,
            transactions: 4,
            efficiency: 1.0,
            cache_lines_touched: 1,
            suggestion: "Optimal".into(),
        };
        let rendered = format!("{}", report);
        assert!(rendered.contains("100.0%"));
    }

    /// An empty warp produces a zero-transaction report.
    #[test]
    fn test_empty_access() {
        let report = analyze_warp_access(&[]);
        assert_eq!(report.transactions, 0);
        assert_eq!(report.pattern, AccessPattern::FullyCoalesced);
    }
}