use std::cmp::Reverse;
use std::collections::{BinaryHeap, HashMap};
use crate::error::{CudaError, CudaResult};
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum NvLinkVersion {
V1,
V2,
V3,
V4,
NvSwitch,
}
impl NvLinkVersion {
#[inline]
pub fn per_link_bandwidth_gbps(self) -> f64 {
match self {
Self::V1 => 20.0,
Self::V2 => 25.0,
Self::V3 => 25.0,
Self::V4 => 25.0,
Self::NvSwitch => 25.0,
}
}
}
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum NvLinkStatus {
Active,
Inactive,
Unsupported,
}
#[derive(Debug, Clone)]
pub struct NvLinkInfo {
pub version: NvLinkVersion,
pub bandwidth_gbps: f64,
pub link_count: u32,
pub peer_device_id: i32,
}
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum LinkType {
NvLink,
PCIe,
NvSwitch,
}
#[derive(Debug, Clone)]
pub struct TopologyLink {
pub from_device: i32,
pub to_device: i32,
pub link_type: LinkType,
pub bandwidth_gbps: f64,
pub latency_ns: f64,
pub hop_count: u32,
}
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum TopologyType {
SingleGpu,
NvLinkPair,
NvLinkRing,
NvLinkMesh,
NvSwitchFabric,
PcieOnly,
}
#[derive(Debug, Clone)]
pub struct TopologyTree {
pub root: i32,
pub children: HashMap<i32, Vec<i32>>,
pub parent: HashMap<i32, i32>,
}
impl TopologyTree {
pub fn devices(&self) -> Vec<i32> {
let mut devs: Vec<i32> = self.children.keys().copied().collect();
devs.sort_unstable();
devs
}
pub fn depth(&self) -> usize {
fn walk_depth(node: i32, children: &HashMap<i32, Vec<i32>>) -> usize {
match children.get(&node) {
Some(kids) if !kids.is_empty() => {
1 + kids
.iter()
.map(|&c| walk_depth(c, children))
.max()
.unwrap_or(0)
}
_ => 1,
}
}
walk_depth(self.root, &self.children)
}
}
#[derive(Debug, Clone)]
pub struct Transfer {
pub src: i32,
pub dst: i32,
pub data_size: u64,
}
#[derive(Debug, Clone)]
pub struct CommunicationSchedule {
pub transfers: Vec<Transfer>,
pub estimated_time_us: f64,
}
#[derive(Debug, Clone)]
pub struct TaskCommunication {
pub task_a: usize,
pub task_b: usize,
pub volume_bytes: u64,
}
#[derive(Debug, Clone)]
pub struct TopologyAwarePlacement {
pub assignment: HashMap<usize, i32>,
pub total_cost: f64,
}
#[derive(Debug, Clone)]
pub struct GpuTopology {
pub devices: Vec<i32>,
pub links: Vec<TopologyLink>,
adj_bandwidth: Vec<Vec<f64>>,
adj_latency: Vec<Vec<f64>>,
}
impl GpuTopology {
pub fn discover() -> CudaResult<Self> {
#[cfg(target_os = "macos")]
{
Self::synthetic_mesh(4)
}
#[cfg(not(target_os = "macos"))]
{
Self::discover_real()
}
}
#[cfg(any(target_os = "macos", test))]
fn synthetic_mesh(n: usize) -> CudaResult<Self> {
if n == 0 {
return Err(CudaError::NoDevice);
}
let devices: Vec<i32> = (0..n as i32).collect();
let mut links = Vec::new();
let mut adj_bandwidth = vec![vec![0.0; n]; n];
let mut adj_latency = vec![vec![f64::MAX; n]; n];
for i in 0..n {
adj_bandwidth[i][i] = f64::INFINITY;
adj_latency[i][i] = 0.0;
}
for i in 0..n {
for j in 0..n {
if i == j {
continue;
}
let bw = 100.0; let lat = 500.0; links.push(TopologyLink {
from_device: i as i32,
to_device: j as i32,
link_type: LinkType::NvLink,
bandwidth_gbps: bw,
latency_ns: lat,
hop_count: 1,
});
adj_bandwidth[i][j] = bw;
adj_latency[i][j] = lat;
}
}
Ok(Self {
devices,
links,
adj_bandwidth,
adj_latency,
})
}
#[cfg(not(target_os = "macos"))]
fn discover_real() -> CudaResult<Self> {
use crate::device::Device;
crate::init()?;
let count = Device::count()?;
if count <= 0 {
return Err(CudaError::NoDevice);
}
let n = count as usize;
let devices: Vec<i32> = (0..count).collect();
let mut links = Vec::new();
let mut adj_bandwidth = vec![vec![0.0; n]; n];
let mut adj_latency = vec![vec![f64::MAX; n]; n];
for i in 0..n {
adj_bandwidth[i][i] = f64::INFINITY;
adj_latency[i][i] = 0.0;
}
for i in 0..n {
for j in 0..n {
if i == j {
continue;
}
let can_peer = crate::device::can_access_peer(
&Device::get(i as i32)?,
&Device::get(j as i32)?,
)?;
let (lt, bw, lat, hops) = if can_peer {
(LinkType::NvLink, 50.0, 800.0, 1)
} else {
(LinkType::PCIe, 16.0, 2000.0, 2)
};
links.push(TopologyLink {
from_device: i as i32,
to_device: j as i32,
link_type: lt,
bandwidth_gbps: bw,
latency_ns: lat,
hop_count: hops,
});
adj_bandwidth[i][j] = bw;
adj_latency[i][j] = lat;
}
}
Ok(Self {
devices,
links,
adj_bandwidth,
adj_latency,
})
}
pub fn topology_type(&self) -> TopologyType {
let n = self.devices.len();
if n == 0 {
return TopologyType::SingleGpu;
}
if n == 1 {
return TopologyType::SingleGpu;
}
let has_nvswitch = self.links.iter().any(|l| l.link_type == LinkType::NvSwitch);
if has_nvswitch {
return TopologyType::NvSwitchFabric;
}
let nvlink_links: Vec<&TopologyLink> = self
.links
.iter()
.filter(|l| l.link_type == LinkType::NvLink)
.collect();
if nvlink_links.is_empty() {
return TopologyType::PcieOnly;
}
let mut nvlink_neighbors: HashMap<i32, Vec<i32>> = HashMap::new();
for link in &nvlink_links {
nvlink_neighbors
.entry(link.from_device)
.or_default()
.push(link.to_device);
}
let all_have_nvlink = self
.devices
.iter()
.all(|d| nvlink_neighbors.contains_key(d));
if !all_have_nvlink {
if n == 2 {
return TopologyType::NvLinkPair;
}
return TopologyType::PcieOnly;
}
let is_full_mesh = self.devices.iter().all(|d| {
let neighbors = nvlink_neighbors.get(d).map(|v| v.len()).unwrap_or(0);
neighbors == n - 1
});
if is_full_mesh {
return TopologyType::NvLinkMesh;
}
if n == 2 {
return TopologyType::NvLinkPair;
}
let is_ring = self
.devices
.iter()
.all(|d| nvlink_neighbors.get(d).map(|v| v.len()).unwrap_or(0) == 2);
if is_ring && self.verify_ring(&nvlink_neighbors) {
return TopologyType::NvLinkRing;
}
TopologyType::NvLinkMesh
}
fn verify_ring(&self, neighbors: &HashMap<i32, Vec<i32>>) -> bool {
if self.devices.is_empty() {
return false;
}
let start = self.devices[0];
let mut visited = vec![false; self.devices.len()];
let mut current = start;
let mut prev = -1_i32;
for step in 0..self.devices.len() {
let idx = match self.device_index(current) {
Some(i) => i,
None => return false,
};
if visited[idx] {
return false;
}
visited[idx] = true;
let nbrs = match neighbors.get(¤t) {
Some(v) => v,
None => return false,
};
if step < self.devices.len() - 1 {
let next = nbrs.iter().find(|&&n| n != prev);
match next {
Some(&n) => {
prev = current;
current = n;
}
None => return false,
}
} else {
return nbrs.contains(&start);
}
}
false
}
pub fn best_path(&self, src: i32, dst: i32) -> CudaResult<Vec<i32>> {
let src_idx = self.device_index(src).ok_or(CudaError::InvalidDevice)?;
let dst_idx = self.device_index(dst).ok_or(CudaError::InvalidDevice)?;
if src_idx == dst_idx {
return Ok(vec![src]);
}
let n = self.devices.len();
let mut dist = vec![f64::INFINITY; n];
let mut prev: Vec<Option<usize>> = vec![None; n];
dist[src_idx] = 0.0;
let mut heap: BinaryHeap<Reverse<(OrderedF64, usize)>> = BinaryHeap::new();
heap.push(Reverse((OrderedF64(0.0), src_idx)));
while let Some(Reverse((OrderedF64(cost), u))) = heap.pop() {
if u == dst_idx {
break;
}
if cost > dist[u] {
continue;
}
for v in 0..n {
if u == v {
continue;
}
let bw = self.adj_bandwidth[u][v];
if bw <= 0.0 {
continue;
}
let edge_cost = 1.0 / bw;
let new_dist = dist[u] + edge_cost;
if new_dist < dist[v] {
dist[v] = new_dist;
prev[v] = Some(u);
heap.push(Reverse((OrderedF64(new_dist), v)));
}
}
}
if prev[dst_idx].is_none() {
return Err(CudaError::InvalidValue);
}
let mut path = Vec::new();
let mut cur = dst_idx;
while let Some(p) = prev[cur] {
path.push(self.devices[cur]);
cur = p;
}
path.push(self.devices[src_idx]);
path.reverse();
Ok(path)
}
pub fn bandwidth_between(&self, src: i32, dst: i32) -> f64 {
let src_idx = match self.device_index(src) {
Some(i) => i,
None => return 0.0,
};
let dst_idx = match self.device_index(dst) {
Some(i) => i,
None => return 0.0,
};
self.adj_bandwidth[src_idx][dst_idx]
}
pub fn latency_between(&self, src: i32, dst: i32) -> f64 {
let src_idx = match self.device_index(src) {
Some(i) => i,
None => return f64::MAX / 1000.0,
};
let dst_idx = match self.device_index(dst) {
Some(i) => i,
None => return f64::MAX / 1000.0,
};
self.adj_latency[src_idx][dst_idx] / 1000.0 }
pub fn optimal_ring_order(&self) -> CudaResult<Vec<i32>> {
let n = self.devices.len();
if n == 0 {
return Err(CudaError::NoDevice);
}
if n == 1 {
return Ok(vec![self.devices[0]]);
}
let mut visited = vec![false; n];
let mut ring = Vec::with_capacity(n);
let mut current = 0_usize;
visited[current] = true;
ring.push(self.devices[current]);
for _ in 1..n {
let mut best_bw = -1.0_f64;
let mut best_idx = None;
for (j, &is_visited) in visited.iter().enumerate() {
if is_visited {
continue;
}
let bw = self.adj_bandwidth[current][j];
if bw > best_bw {
best_bw = bw;
best_idx = Some(j);
}
}
match best_idx {
Some(idx) => {
visited[idx] = true;
ring.push(self.devices[idx]);
current = idx;
}
None => {
return Err(CudaError::InvalidValue);
}
}
}
Ok(ring)
}
pub fn optimal_tree(&self) -> CudaResult<TopologyTree> {
let n = self.devices.len();
if n == 0 {
return Err(CudaError::NoDevice);
}
let root_idx = (0..n)
.max_by(|&a, &b| {
let sum_a: f64 = (0..n)
.filter(|&j| j != a)
.map(|j| {
let bw = self.adj_bandwidth[a][j];
if bw.is_infinite() { 0.0 } else { bw }
})
.sum();
let sum_b: f64 = (0..n)
.filter(|&j| j != b)
.map(|j| {
let bw = self.adj_bandwidth[b][j];
if bw.is_infinite() { 0.0 } else { bw }
})
.sum();
sum_a
.partial_cmp(&sum_b)
.unwrap_or(std::cmp::Ordering::Equal)
})
.unwrap_or(0);
let root = self.devices[root_idx];
let mut children: HashMap<i32, Vec<i32>> = HashMap::new();
let mut parent: HashMap<i32, i32> = HashMap::new();
for &d in &self.devices {
children.insert(d, Vec::new());
}
if n == 1 {
return Ok(TopologyTree {
root,
children,
parent,
});
}
let mut in_tree = vec![false; n];
let mut best_edge: Vec<(f64, Option<usize>)> = vec![(0.0, None); n];
in_tree[root_idx] = true;
for (j, edge) in best_edge.iter_mut().enumerate() {
if j != root_idx {
let bw = self.adj_bandwidth[root_idx][j];
let bw_val = if bw.is_infinite() { 0.0 } else { bw };
*edge = (bw_val, Some(root_idx));
}
}
for _ in 1..n {
let mut best_bw = -1.0_f64;
let mut best_node = None;
for j in 0..n {
if !in_tree[j] && best_edge[j].0 > best_bw {
best_bw = best_edge[j].0;
best_node = Some(j);
}
}
let node = match best_node {
Some(j) => j,
None => break,
};
in_tree[node] = true;
let parent_idx = match best_edge[node].1 {
Some(p) => p,
None => continue,
};
let parent_dev = self.devices[parent_idx];
let child_dev = self.devices[node];
parent.insert(child_dev, parent_dev);
children.entry(parent_dev).or_default().push(child_dev);
for j in 0..n {
if !in_tree[j] {
let bw = self.adj_bandwidth[node][j];
let bw_val = if bw.is_infinite() { 0.0 } else { bw };
if bw_val > best_edge[j].0 {
best_edge[j] = (bw_val, Some(node));
}
}
}
}
Ok(TopologyTree {
root,
children,
parent,
})
}
pub fn optimal_placement(
&self,
num_tasks: usize,
communications: &[TaskCommunication],
) -> CudaResult<TopologyAwarePlacement> {
let n = self.devices.len();
if num_tasks == 0 || n == 0 {
return Err(CudaError::InvalidValue);
}
if num_tasks > n {
return Err(CudaError::InvalidValue);
}
if num_tasks <= 8 && n <= 8 {
self.placement_exhaustive(num_tasks, communications)
} else {
self.placement_greedy(num_tasks, communications)
}
}
fn placement_exhaustive(
&self,
num_tasks: usize,
communications: &[TaskCommunication],
) -> CudaResult<TopologyAwarePlacement> {
let n = self.devices.len();
let mut best_cost = f64::INFINITY;
let mut best_assignment: HashMap<usize, i32> = HashMap::new();
let mut perm: Vec<usize> = (0..n).collect();
let mut found = false;
Self::for_each_permutation(&mut perm, n, &mut |p| {
let assignment: HashMap<usize, i32> =
(0..num_tasks).map(|t| (t, self.devices[p[t]])).collect();
let cost = Self::compute_placement_cost(
&assignment,
communications,
&self.adj_bandwidth,
&self.devices,
);
if cost < best_cost {
best_cost = cost;
best_assignment = assignment;
found = true;
}
});
if !found {
best_assignment = (0..num_tasks).map(|t| (t, self.devices[t])).collect();
best_cost = Self::compute_placement_cost(
&best_assignment,
communications,
&self.adj_bandwidth,
&self.devices,
);
}
Ok(TopologyAwarePlacement {
assignment: best_assignment,
total_cost: best_cost,
})
}
fn for_each_permutation(arr: &mut [usize], k: usize, callback: &mut impl FnMut(&[usize])) {
if k == 1 {
callback(arr);
return;
}
Self::for_each_permutation(arr, k - 1, callback);
for i in 0..k - 1 {
if k % 2 == 0 {
arr.swap(i, k - 1);
} else {
arr.swap(0, k - 1);
}
Self::for_each_permutation(arr, k - 1, callback);
}
}
fn placement_greedy(
&self,
num_tasks: usize,
communications: &[TaskCommunication],
) -> CudaResult<TopologyAwarePlacement> {
let mut task_volume: Vec<(usize, u64)> = (0..num_tasks).map(|t| (t, 0u64)).collect();
for comm in communications {
if comm.task_a < num_tasks {
task_volume[comm.task_a].1 += comm.volume_bytes;
}
if comm.task_b < num_tasks {
task_volume[comm.task_b].1 += comm.volume_bytes;
}
}
task_volume.sort_by_key(|entry| std::cmp::Reverse(entry.1));
let mut assignment: HashMap<usize, i32> = HashMap::new();
let mut used_devices: Vec<bool> = vec![false; self.devices.len()];
for &(task, _) in &task_volume {
let mut best_cost = f64::INFINITY;
let mut best_dev_idx = None;
for (dev_idx, &used) in used_devices.iter().enumerate() {
if used {
continue;
}
let mut cost = 0.0_f64;
for comm in communications {
let (peer_task, is_relevant) = if comm.task_a == task {
(comm.task_b, true)
} else if comm.task_b == task {
(comm.task_a, true)
} else {
(0, false)
};
if !is_relevant {
continue;
}
if let Some(&peer_dev) = assignment.get(&peer_task) {
let peer_dev_idx = match self.device_index(peer_dev) {
Some(i) => i,
None => continue,
};
let bw = self.adj_bandwidth[dev_idx][peer_dev_idx];
let bw_val = if bw.is_infinite() || bw <= 0.0 {
1e-9
} else {
bw
};
cost += comm.volume_bytes as f64 / bw_val;
}
}
if cost < best_cost {
best_cost = cost;
best_dev_idx = Some(dev_idx);
}
}
let dev_idx =
best_dev_idx.unwrap_or_else(|| used_devices.iter().position(|&u| !u).unwrap_or(0));
used_devices[dev_idx] = true;
assignment.insert(task, self.devices[dev_idx]);
}
let total_cost = Self::compute_placement_cost(
&assignment,
communications,
&self.adj_bandwidth,
&self.devices,
);
Ok(TopologyAwarePlacement {
assignment,
total_cost,
})
}
fn compute_placement_cost(
assignment: &HashMap<usize, i32>,
communications: &[TaskCommunication],
adj_bandwidth: &[Vec<f64>],
devices: &[i32],
) -> f64 {
let dev_to_idx: HashMap<i32, usize> =
devices.iter().enumerate().map(|(i, &d)| (d, i)).collect();
let mut cost = 0.0_f64;
for comm in communications {
let dev_a = match assignment.get(&comm.task_a) {
Some(&d) => d,
None => continue,
};
let dev_b = match assignment.get(&comm.task_b) {
Some(&d) => d,
None => continue,
};
if dev_a == dev_b {
continue; }
let idx_a = match dev_to_idx.get(&dev_a) {
Some(&i) => i,
None => continue,
};
let idx_b = match dev_to_idx.get(&dev_b) {
Some(&i) => i,
None => continue,
};
let bw = adj_bandwidth[idx_a][idx_b];
let bw_val = if bw.is_infinite() || bw <= 0.0 {
1e-9
} else {
bw
};
cost += comm.volume_bytes as f64 / bw_val;
}
cost
}
pub fn build_schedule(&self, transfers: &[(i32, i32, u64)]) -> CommunicationSchedule {
let mut entries: Vec<(f64, Transfer)> = transfers
.iter()
.map(|&(src, dst, size)| {
let bw = self.bandwidth_between(src, dst);
let bw_val = if bw.is_infinite() || bw <= 0.0 {
1e-9
} else {
bw
};
(
bw_val,
Transfer {
src,
dst,
data_size: size,
},
)
})
.collect();
entries.sort_by(|a, b| b.0.partial_cmp(&a.0).unwrap_or(std::cmp::Ordering::Equal));
let mut total_time_us = 0.0_f64;
let transfers_out: Vec<Transfer> = entries
.into_iter()
.map(|(bw, t)| {
total_time_us += t.data_size as f64 / (bw * 1000.0);
t
})
.collect();
CommunicationSchedule {
transfers: transfers_out,
estimated_time_us: total_time_us,
}
}
pub fn nvlink_info(&self, device: i32, peer: i32) -> Option<NvLinkInfo> {
let link = self
.links
.iter()
.find(|l| l.from_device == device && l.to_device == peer)?;
if link.link_type == LinkType::PCIe {
return None;
}
let version = match link.link_type {
LinkType::NvSwitch => NvLinkVersion::NvSwitch,
_ => NvLinkVersion::V3, };
Some(NvLinkInfo {
version,
bandwidth_gbps: link.bandwidth_gbps,
link_count: if link.bandwidth_gbps > 0.0 {
(link.bandwidth_gbps / version.per_link_bandwidth_gbps()).ceil() as u32
} else {
0
},
peer_device_id: peer,
})
}
fn device_index(&self, device_id: i32) -> Option<usize> {
self.devices.iter().position(|&d| d == device_id)
}
pub fn adjacency_matrix(&self) -> &Vec<Vec<f64>> {
&self.adj_bandwidth
}
}
#[derive(Debug, Clone, Copy, PartialEq)]
struct OrderedF64(f64);
impl Eq for OrderedF64 {}
impl PartialOrd for OrderedF64 {
fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
Some(self.cmp(other))
}
}
impl Ord for OrderedF64 {
fn cmp(&self, other: &Self) -> std::cmp::Ordering {
self.0
.partial_cmp(&other.0)
.unwrap_or(std::cmp::Ordering::Equal)
}
}
#[cfg(test)]
mod tests {
use super::*;
fn make_synthetic_4gpu() -> GpuTopology {
GpuTopology::synthetic_mesh(4).expect("synthetic mesh should not fail")
}
fn make_synthetic_1gpu() -> GpuTopology {
GpuTopology::synthetic_mesh(1).expect("single GPU mesh should not fail")
}
#[test]
fn test_topology_discovery_macos_synthetic() {
let topo = GpuTopology::discover();
#[cfg(target_os = "macos")]
{
let topo = topo.expect("discover should succeed on macOS");
assert_eq!(topo.devices.len(), 4);
assert_eq!(topo.links.len(), 12);
}
#[cfg(not(target_os = "macos"))]
{
let _ = topo; }
}
#[test]
fn test_topology_type_single_gpu() {
let topo = make_synthetic_1gpu();
assert_eq!(topo.topology_type(), TopologyType::SingleGpu);
}
#[test]
fn test_topology_type_nvlink_mesh() {
let topo = make_synthetic_4gpu();
assert_eq!(topo.topology_type(), TopologyType::NvLinkMesh);
}
#[test]
fn test_topology_type_pcie_only() {
let mut topo = make_synthetic_4gpu();
for link in &mut topo.links {
link.link_type = LinkType::PCIe;
}
assert_eq!(topo.topology_type(), TopologyType::PcieOnly);
}
#[test]
fn test_topology_type_nvlink_pair() {
let topo = GpuTopology::synthetic_mesh(2).expect("pair mesh");
assert_eq!(topo.topology_type(), TopologyType::NvLinkMesh);
let mut partial = GpuTopology {
devices: vec![0, 1],
links: vec![TopologyLink {
from_device: 0,
to_device: 1,
link_type: LinkType::NvLink,
bandwidth_gbps: 50.0,
latency_ns: 500.0,
hop_count: 1,
}],
adj_bandwidth: vec![vec![f64::INFINITY, 50.0], vec![0.0, f64::INFINITY]],
adj_latency: vec![vec![0.0, 500.0], vec![f64::MAX, 0.0]],
};
assert_eq!(partial.topology_type(), TopologyType::NvLinkPair);
partial.links.push(TopologyLink {
from_device: 1,
to_device: 0,
link_type: LinkType::NvLink,
bandwidth_gbps: 50.0,
latency_ns: 500.0,
hop_count: 1,
});
partial.adj_bandwidth[1][0] = 50.0;
partial.adj_latency[1][0] = 500.0;
assert_eq!(partial.topology_type(), TopologyType::NvLinkMesh);
}
#[test]
fn test_best_path_direct() {
let topo = make_synthetic_4gpu();
let path = topo.best_path(0, 3).expect("path should exist");
assert_eq!(path.len(), 2);
assert_eq!(path[0], 0);
assert_eq!(path[1], 3);
}
#[test]
fn test_best_path_same_device() {
let topo = make_synthetic_4gpu();
let path = topo.best_path(1, 1).expect("same device path");
assert_eq!(path, vec![1]);
}
#[test]
fn test_best_path_invalid_device() {
let topo = make_synthetic_4gpu();
let result = topo.best_path(0, 99);
assert!(result.is_err());
}
#[test]
fn test_bandwidth_between() {
let topo = make_synthetic_4gpu();
let bw = topo.bandwidth_between(0, 1);
assert!((bw - 100.0).abs() < 1e-6);
let bw_self = topo.bandwidth_between(0, 0);
assert!(bw_self.is_infinite());
let bw_invalid = topo.bandwidth_between(0, 99);
assert!((bw_invalid - 0.0).abs() < 1e-6);
}
#[test]
fn test_latency_between() {
let topo = make_synthetic_4gpu();
let lat = topo.latency_between(0, 1);
assert!((lat - 0.5).abs() < 1e-6);
let lat_self = topo.latency_between(0, 0);
assert!((lat_self - 0.0).abs() < 1e-6);
}
#[test]
fn test_ring_order() {
let topo = make_synthetic_4gpu();
let ring = topo.optimal_ring_order().expect("ring order");
assert_eq!(ring.len(), 4);
let mut sorted = ring.clone();
sorted.sort_unstable();
assert_eq!(sorted, vec![0, 1, 2, 3]);
}
#[test]
fn test_ring_order_single() {
let topo = make_synthetic_1gpu();
let ring = topo.optimal_ring_order().expect("ring order");
assert_eq!(ring, vec![0]);
}
#[test]
fn test_tree_construction() {
let topo = make_synthetic_4gpu();
let tree = topo.optimal_tree().expect("tree");
assert!(topo.devices.contains(&tree.root));
assert_eq!(tree.parent.len(), 3);
let devs = tree.devices();
assert_eq!(devs.len(), 4);
assert!(tree.depth() >= 1);
}
#[test]
fn test_adjacency_matrix_correctness() {
let topo = make_synthetic_4gpu();
let adj = topo.adjacency_matrix();
assert_eq!(adj.len(), 4);
for (i, adj_row) in adj.iter().enumerate().take(4) {
assert!(adj_row[i].is_infinite()); for (j, adj_val) in adj_row.iter().enumerate().take(4) {
if i != j {
assert!((*adj_val - 100.0).abs() < 1e-6);
}
}
}
}
#[test]
fn test_path_symmetry() {
let topo = make_synthetic_4gpu();
for i in 0..4_i32 {
for j in 0..4_i32 {
if i == j {
continue;
}
let bw_ij = topo.bandwidth_between(i, j);
let bw_ji = topo.bandwidth_between(j, i);
assert!(
(bw_ij - bw_ji).abs() < 1e-6,
"bandwidth {i}->{j} ({bw_ij}) != {j}->{i} ({bw_ji})"
);
}
}
}
#[test]
fn test_communication_schedule() {
let topo = make_synthetic_4gpu();
let transfers = vec![(0, 1, 1_000_000u64), (2, 3, 2_000_000), (0, 3, 500_000)];
let schedule = topo.build_schedule(&transfers);
assert_eq!(schedule.transfers.len(), 3);
assert!(schedule.estimated_time_us > 0.0);
}
#[test]
fn test_placement_optimization() {
let topo = make_synthetic_4gpu();
let comms = vec![
TaskCommunication {
task_a: 0,
task_b: 1,
volume_bytes: 1_000_000,
},
TaskCommunication {
task_a: 1,
task_b: 2,
volume_bytes: 2_000_000,
},
];
let placement = topo
.optimal_placement(3, &comms)
.expect("placement should succeed");
assert_eq!(placement.assignment.len(), 3);
let mut devs: Vec<i32> = placement.assignment.values().copied().collect();
devs.sort_unstable();
devs.dedup();
assert_eq!(devs.len(), 3);
}
#[test]
fn test_nvswitch_fabric_detection() {
let mut topo = make_synthetic_4gpu();
for link in &mut topo.links {
link.link_type = LinkType::NvSwitch;
}
assert_eq!(topo.topology_type(), TopologyType::NvSwitchFabric);
}
#[test]
fn test_empty_topology_errors() {
let result = GpuTopology::synthetic_mesh(0);
assert!(result.is_err());
}
#[test]
fn test_nvlink_info_link_count_correct() {
let topo = make_synthetic_4gpu();
let info = topo
.nvlink_info(0, 1)
.expect("NVLink info should exist for NvLink topology");
assert_eq!(info.peer_device_id, 1);
assert!((info.bandwidth_gbps - 100.0).abs() < 1e-6);
let per_link = info.version.per_link_bandwidth_gbps();
let expected_count = (100.0_f64 / per_link).ceil() as u32;
assert_eq!(
info.link_count, expected_count,
"link count should match bandwidth / per-link rate"
);
}
#[test]
fn test_nvlink_info_returns_none_for_pcie_link() {
let mut topo = make_synthetic_4gpu();
for link in &mut topo.links {
link.link_type = LinkType::PCIe;
}
let info = topo.nvlink_info(0, 1);
assert!(info.is_none(), "should return None for PCIe links");
}
#[test]
fn test_nvlink_info_nonexistent_peer() {
let topo = make_synthetic_4gpu();
let info = topo.nvlink_info(0, 99);
assert!(info.is_none(), "should return None for nonexistent peer");
}
#[test]
fn test_best_path_finds_intermediate_hop() {
let bw_direct = 100.0_f64;
let lat = 500.0_f64;
let topo = GpuTopology {
devices: vec![0, 1, 2],
links: vec![
TopologyLink {
from_device: 0,
to_device: 1,
link_type: LinkType::NvLink,
bandwidth_gbps: bw_direct,
latency_ns: lat,
hop_count: 1,
},
TopologyLink {
from_device: 1,
to_device: 0,
link_type: LinkType::NvLink,
bandwidth_gbps: bw_direct,
latency_ns: lat,
hop_count: 1,
},
TopologyLink {
from_device: 1,
to_device: 2,
link_type: LinkType::NvLink,
bandwidth_gbps: bw_direct,
latency_ns: lat,
hop_count: 1,
},
TopologyLink {
from_device: 2,
to_device: 1,
link_type: LinkType::NvLink,
bandwidth_gbps: bw_direct,
latency_ns: lat,
hop_count: 1,
},
],
adj_bandwidth: vec![
vec![f64::INFINITY, bw_direct, 0.0],
vec![bw_direct, f64::INFINITY, bw_direct],
vec![0.0, bw_direct, f64::INFINITY],
],
adj_latency: vec![
vec![0.0, lat, f64::MAX],
vec![lat, 0.0, lat],
vec![f64::MAX, lat, 0.0],
],
};
let path = topo.best_path(0, 2).expect("path should exist via hop");
assert_eq!(path.len(), 3, "chain topology requires intermediate hop");
assert_eq!(path[0], 0);
assert_eq!(path[1], 1);
assert_eq!(path[2], 2);
}
#[test]
fn test_build_schedule_sorted_by_bandwidth_desc() {
let topo = make_synthetic_4gpu();
let transfers = vec![(0, 1, 1_000_000_u64), (1, 2, 500_000), (2, 3, 2_000_000)];
let schedule = topo.build_schedule(&transfers);
assert_eq!(
schedule.transfers.len(),
3,
"all transfers should appear in schedule"
);
assert!(
schedule.estimated_time_us > 0.0,
"total time should be positive"
);
let time_2mb = 2_000_000.0_f64 / (100.0 * 1000.0);
let time_1mb = 1_000_000.0_f64 / (100.0 * 1000.0);
let time_0_5mb = 500_000.0_f64 / (100.0 * 1000.0);
let expected_total = time_2mb + time_1mb + time_0_5mb;
assert!(
(schedule.estimated_time_us - expected_total).abs() < 1e-6,
"estimated time should match sum of individual transfer times"
);
}
#[test]
fn test_adjacency_matrix_chain_topology() {
let topo = GpuTopology {
devices: vec![0, 1, 2],
links: vec![],
adj_bandwidth: vec![
vec![f64::INFINITY, 50.0, 0.0],
vec![50.0, f64::INFINITY, 50.0],
vec![0.0, 50.0, f64::INFINITY],
],
adj_latency: vec![
vec![0.0, 500.0, f64::MAX],
vec![500.0, 0.0, 500.0],
vec![f64::MAX, 500.0, 0.0],
],
};
let adj = topo.adjacency_matrix();
assert!(
(adj[0][1] - 50.0).abs() < 1e-6,
"0->1 bandwidth should be 50 GB/s"
);
assert!((adj[0][2] - 0.0).abs() < 1e-6, "0->2 has no direct link");
assert!(adj[1][1].is_infinite(), "self-bandwidth is infinite");
}
#[test]
fn test_optimal_placement_all_same_device_no_cost() {
let topo = make_synthetic_4gpu();
let comms = vec![];
let placement = topo
.optimal_placement(1, &comms)
.expect("single-task placement should succeed");
assert_eq!(placement.assignment.len(), 1);
assert_eq!(placement.total_cost, 0.0, "no comms means no cost");
}
#[test]
fn test_topology_tree_single_device() {
let topo = make_synthetic_1gpu();
let tree = topo.optimal_tree().expect("single-device tree");
assert_eq!(tree.root, 0);
assert_eq!(tree.parent.len(), 0, "root has no parent");
assert_eq!(tree.depth(), 1, "single-node tree has depth 1");
}
}