use serde_json::Value;
/// Keys that must all be present in a report body for [`classify_schema`] to
/// proceed to its per-field type checks.
///
/// The order matters: when several keys are absent, the first missing one in
/// this list is the one reported in `NcclSchemaOutcome::MissingKey`.
pub const F15_REQUIRED_KEYS: &[&str] =
&["host", "rank", "nccl_version", "last_op", "code", "suggest"];
/// Minimum acceptable exit code; the tests in this file pass it as the
/// `threshold` argument of [`classify_exit_code`].
///
/// NOTE(review): 128 is presumably chosen to exclude generic failures such as
/// exit code 1 (shells conventionally report signal deaths as 128+N) — confirm
/// against the requirement this constant is derived from.
pub const F15_MIN_EXIT_CODE: i32 = 128;
/// Substrings that [`classify_doc_link`] looks for in the `suggest` text.
///
/// Finding any one of them is enough for the text to count as containing a
/// documentation link. Matching is a plain, case-sensitive substring search.
pub const F15_DOC_LINK_SUBSTRINGS: &[&str] = &[
"docs.nvidia.com",
"developer.nvidia.com",
"github.com/NVIDIA/nccl",
];
/// Result of validating the shape of a JSON report body with
/// [`classify_schema`].
///
/// Only the first problem found is reported; see [`classify_schema`] for the
/// order in which checks run.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum NcclSchemaOutcome {
/// Every required key is present and has the expected JSON type.
Ok,
/// The body is not a JSON object.
NotAnObject,
/// A key listed in [`F15_REQUIRED_KEYS`] is absent; `key` names it.
MissingKey { key: &'static str },
/// `host` is present but is not a JSON string.
HostNotString,
/// `rank` is not a non-negative integer. `got` is the rank itself when it
/// is a negative integer, or `-1` when the value could not be read as an
/// `i64` at all (for example a string or a fractional number).
RankNotNonNegativeInt { got: i64 },
/// `nccl_version` is present but is not a JSON string.
NcclVersionNotString,
/// `last_op` is present but is not a JSON string.
LastOpNotString,
/// `code` is present but cannot be read as an `i64`.
CodeNotInt,
/// `suggest` is present but is not a JSON string.
SuggestNotString,
}
/// Result of comparing an exit code against a threshold with
/// [`classify_exit_code`].
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum NcclExitOutcome {
/// The exit code is greater than or equal to the threshold; `code` is that
/// exit code.
Ok { code: i32 },
/// The exit code is strictly below the threshold; both values are carried
/// so the caller can report them.
BelowThreshold { got: i32, threshold: i32 },
}
/// Result of searching the `suggest` text for a documentation link with
/// [`classify_doc_link`].
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum NcclDocLinkOutcome {
/// The text contains at least one entry of [`F15_DOC_LINK_SUBSTRINGS`].
Ok,
/// None of the expected substrings were found. `suggest` is the text that
/// was searched; it is empty when the body had no string-valued `suggest`.
NoDocLink { suggest: String },
}
/// Validates that `body` is a JSON object with the required keys and types.
///
/// Checks run in a fixed order and the first failure is returned:
///
/// 1. `body` must be a JSON object, otherwise `NotAnObject`.
/// 2. Every key in [`F15_REQUIRED_KEYS`] must be present; the first absent
///    key (in that list's order) is reported as `MissingKey`. Presence is
///    verified for *all* keys before any type is inspected.
/// 3. Types are checked in the order `host` (string), `rank` (integer
///    `>= 0`), `nccl_version` (string), `last_op` (string), `code` (integer),
///    `suggest` (string).
///
/// "Integer" here means a JSON number readable as `i64`. A `rank` that is not
/// readable as `i64` is reported as `RankNotNonNegativeInt { got: -1 }`.
///
/// Keys beyond the required set are ignored.
pub fn classify_schema(body: &Value) -> NcclSchemaOutcome {
    let fields = match body.as_object() {
        Some(map) => map,
        None => return NcclSchemaOutcome::NotAnObject,
    };

    // Presence pass: report the first required key that is absent.
    if let Some(missing) = F15_REQUIRED_KEYS
        .iter()
        .copied()
        .find(|name| !fields.contains_key(*name))
    {
        return NcclSchemaOutcome::MissingKey { key: missing };
    }

    // Small accessors so the type pass below reads as a flat checklist.
    let is_string_at = |name: &str| fields.get(name).map_or(false, Value::is_string);
    let integer_at = |name: &str| fields.get(name).and_then(Value::as_i64);

    if !is_string_at("host") {
        return NcclSchemaOutcome::HostNotString;
    }
    match integer_at("rank") {
        Some(rank) if rank >= 0 => {}
        // Either a negative integer (reported as-is) or not an i64 at all,
        // in which case -1 stands in for the unreadable value.
        other => {
            return NcclSchemaOutcome::RankNotNonNegativeInt {
                got: other.unwrap_or(-1),
            }
        }
    }
    if !is_string_at("nccl_version") {
        return NcclSchemaOutcome::NcclVersionNotString;
    }
    if !is_string_at("last_op") {
        return NcclSchemaOutcome::LastOpNotString;
    }
    if integer_at("code").is_none() {
        return NcclSchemaOutcome::CodeNotInt;
    }
    if !is_string_at("suggest") {
        return NcclSchemaOutcome::SuggestNotString;
    }

    NcclSchemaOutcome::Ok
}
/// Classifies an exit code against a minimum threshold.
///
/// Returns `NcclExitOutcome::Ok { code: got }` when `got >= threshold` (the
/// threshold value itself is accepted). Otherwise returns
/// `NcclExitOutcome::BelowThreshold` carrying both `got` and `threshold`.
pub fn classify_exit_code(got: i32, threshold: i32) -> NcclExitOutcome {
    // Guard clause: anything strictly under the threshold is rejected.
    if got < threshold {
        return NcclExitOutcome::BelowThreshold { got, threshold };
    }
    NcclExitOutcome::Ok { code: got }
}
/// Checks whether the `suggest` text of `body` points at known documentation.
///
/// The text is taken from `body["suggest"]`; if that key is absent, is not a
/// string, or `body` is not an object, the text is treated as empty.
///
/// Returns `NcclDocLinkOutcome::Ok` when the text contains at least one entry
/// of [`F15_DOC_LINK_SUBSTRINGS`] (case-sensitive substring match), otherwise
/// `NcclDocLinkOutcome::NoDocLink` carrying an owned copy of the searched
/// text.
pub fn classify_doc_link(body: &Value) -> NcclDocLinkOutcome {
    // Borrow the text for the search; an owned copy is only needed when the
    // failure variant has to carry it out of this function.
    let suggest = body.get("suggest").and_then(Value::as_str).unwrap_or("");

    let has_doc_link = F15_DOC_LINK_SUBSTRINGS
        .iter()
        .any(|needle| suggest.contains(*needle));

    if has_doc_link {
        NcclDocLinkOutcome::Ok
    } else {
        NcclDocLinkOutcome::NoDocLink {
            suggest: suggest.to_owned(),
        }
    }
}
#[cfg(test)]
mod tests {
    //! Unit tests for the three classifiers in this file. Each schema test
    //! starts from `good_body()` and breaks exactly what it wants to probe.

    use super::*;
    use serde_json::json;

    /// A body that passes every check; it also carries keys outside the
    /// required set to show that extras are tolerated.
    fn good_body() -> Value {
        json!({
        "host": "node-0",
        "rank": 0,
        "peer_rank": 1,
        "nccl_version": "2.20.5",
        "cuda_devices": "0,1",
        "fabric": "ib",
        "last_op": "AllReduce",
        "code": 6,
        "suggest": "See https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/troubleshooting.html"
        })
    }

    #[test]
    fn schema_ok_on_good_body() {
        assert_eq!(classify_schema(&good_body()), NcclSchemaOutcome::Ok);
    }

    #[test]
    fn schema_rejects_not_an_object() {
        assert_eq!(
            classify_schema(&json!([1, 2])),
            NcclSchemaOutcome::NotAnObject
        );
    }

    #[test]
    fn schema_reports_missing_key() {
        let mut body = good_body();
        body.as_object_mut().expect("obj").remove("suggest");
        assert_eq!(
            classify_schema(&body),
            NcclSchemaOutcome::MissingKey { key: "suggest" }
        );
    }

    /// With several keys absent, the first one in `F15_REQUIRED_KEYS` order
    /// is the one reported.
    #[test]
    fn schema_reports_first_missing_key_in_declared_order() {
        let mut body = good_body();
        {
            let obj = body.as_object_mut().expect("obj");
            obj.remove("suggest");
            obj.remove("host");
        }
        assert_eq!(
            classify_schema(&body),
            NcclSchemaOutcome::MissingKey { key: "host" }
        );
    }

    /// Presence of every key is verified before any type check runs, so a
    /// missing late key wins over a wrongly typed early key.
    #[test]
    fn schema_missing_key_takes_precedence_over_type_errors() {
        let mut body = good_body();
        body["host"] = json!(123);
        body.as_object_mut().expect("obj").remove("code");
        assert_eq!(
            classify_schema(&body),
            NcclSchemaOutcome::MissingKey { key: "code" }
        );
    }

    #[test]
    fn schema_rejects_negative_rank() {
        let mut body = good_body();
        body["rank"] = json!(-1);
        assert!(matches!(
            classify_schema(&body),
            NcclSchemaOutcome::RankNotNonNegativeInt { got: -1 }
        ));
    }

    /// A negative integer rank is echoed back unchanged in `got`.
    #[test]
    fn schema_reports_actual_negative_rank() {
        let mut body = good_body();
        body["rank"] = json!(-7);
        assert_eq!(
            classify_schema(&body),
            NcclSchemaOutcome::RankNotNonNegativeInt { got: -7 }
        );
    }

    /// A rank that is not an integer at all is reported with the `-1`
    /// stand-in value.
    #[test]
    fn schema_rejects_non_integer_rank() {
        for bad in [json!("0"), json!(1.5), json!(null)] {
            let mut body = good_body();
            body["rank"] = bad;
            assert_eq!(
                classify_schema(&body),
                NcclSchemaOutcome::RankNotNonNegativeInt { got: -1 }
            );
        }
    }

    #[test]
    fn schema_rejects_non_string_host() {
        let mut body = good_body();
        body["host"] = json!(123);
        assert_eq!(classify_schema(&body), NcclSchemaOutcome::HostNotString);
    }

    #[test]
    fn schema_rejects_non_string_nccl_version() {
        let mut body = good_body();
        body["nccl_version"] = json!(2);
        assert_eq!(
            classify_schema(&body),
            NcclSchemaOutcome::NcclVersionNotString
        );
    }

    #[test]
    fn schema_rejects_non_string_last_op() {
        let mut body = good_body();
        body["last_op"] = json!(null);
        assert_eq!(classify_schema(&body), NcclSchemaOutcome::LastOpNotString);
    }

    #[test]
    fn schema_rejects_non_int_code() {
        let mut body = good_body();
        body["code"] = json!("six");
        assert_eq!(classify_schema(&body), NcclSchemaOutcome::CodeNotInt);
    }

    #[test]
    fn schema_rejects_fractional_code() {
        let mut body = good_body();
        body["code"] = json!(6.5);
        assert_eq!(classify_schema(&body), NcclSchemaOutcome::CodeNotInt);
    }

    #[test]
    fn schema_rejects_non_string_suggest() {
        let mut body = good_body();
        body["suggest"] = json!(42);
        assert_eq!(classify_schema(&body), NcclSchemaOutcome::SuggestNotString);
    }

    #[test]
    fn exit_code_ok_at_threshold() {
        assert_eq!(
            classify_exit_code(F15_MIN_EXIT_CODE, F15_MIN_EXIT_CODE),
            NcclExitOutcome::Ok { code: 128 }
        );
    }

    #[test]
    fn exit_code_ok_above_threshold() {
        assert_eq!(
            classify_exit_code(134, F15_MIN_EXIT_CODE),
            NcclExitOutcome::Ok { code: 134 }
        );
    }

    #[test]
    fn exit_code_rejects_generic_one() {
        assert_eq!(
            classify_exit_code(1, F15_MIN_EXIT_CODE),
            NcclExitOutcome::BelowThreshold {
                got: 1,
                threshold: 128
            }
        );
    }

    /// Boundary check: one below the threshold is already rejected.
    #[test]
    fn exit_code_rejects_just_below_threshold() {
        assert_eq!(
            classify_exit_code(F15_MIN_EXIT_CODE - 1, F15_MIN_EXIT_CODE),
            NcclExitOutcome::BelowThreshold {
                got: 127,
                threshold: 128
            }
        );
    }

    #[test]
    fn doc_link_ok_on_nvidia_url() {
        assert_eq!(classify_doc_link(&good_body()), NcclDocLinkOutcome::Ok);
    }

    #[test]
    fn doc_link_ok_on_developer_portal() {
        let mut body = good_body();
        body["suggest"] = json!("See developer.nvidia.com/nccl for details");
        assert_eq!(classify_doc_link(&body), NcclDocLinkOutcome::Ok);
    }

    #[test]
    fn doc_link_ok_on_nccl_github() {
        let mut body = good_body();
        body["suggest"] = json!("Check github.com/NVIDIA/nccl/issues/123");
        assert_eq!(classify_doc_link(&body), NcclDocLinkOutcome::Ok);
    }

    #[test]
    fn doc_link_rejects_free_text() {
        let mut body = good_body();
        body["suggest"] = json!("Try restarting your training run");
        match classify_doc_link(&body) {
            NcclDocLinkOutcome::NoDocLink { suggest } => {
                assert!(suggest.contains("restarting"));
            }
            other => panic!("expected NoDocLink, got {other:?}"),
        }
    }

    /// A body without a string-valued `suggest` is searched as empty text.
    #[test]
    fn doc_link_treats_absent_or_non_string_suggest_as_empty() {
        for body in [json!({}), json!({ "suggest": 7 }), json!(5)] {
            assert_eq!(
                classify_doc_link(&body),
                NcclDocLinkOutcome::NoDocLink {
                    suggest: String::new()
                }
            );
        }
    }
}