use std::sync::Arc;
use std::time::Duration;
use crate::http_client::{self, FetchError, HttpClient};
use crate::ingest::auth_header::{AuthHeader, ScraperAuthOutcome, parse_scraper_auth_header};
use crate::report::metrics::{MetricsState, RedfishScrapeReason};
use crate::score::ops_snapshot_diff::OpsSnapshotDiff;
use std::collections::HashMap;
use super::apply::{apply_chassis_scrape, build_chassis_services};
use super::config::{RedfishConfig, RedfishSchema};
use super::parser::{ParseOutcome, parse_redfish_power};
use super::state::{RedfishState, ServiceEnergy, monotonic_ms};
const UNSUPPORTED_PLATFORM_FAILURE_THRESHOLD: u32 = 3;
#[derive(Debug, thiserror::Error)]
pub(super) enum ScraperError {
#[error("Redfish fetch failed")]
Fetch(#[source] FetchError),
#[error("Redfish response was not valid UTF-8")]
Utf8(#[source] std::string::FromUtf8Error),
#[error("Redfish JSON parse failed")]
InvalidJson,
#[error("Redfish power path missing from response")]
PathMissing,
#[error("Redfish power value was non-finite, null, or non-positive")]
InvalidValue,
}
pub(super) fn scraper_error_reason(err: &ScraperError) -> RedfishScrapeReason {
match err {
ScraperError::Fetch(fe) => fetch_error_reason(fe),
ScraperError::Utf8(_) => RedfishScrapeReason::InvalidUtf8,
ScraperError::InvalidJson => RedfishScrapeReason::InvalidJson,
ScraperError::PathMissing => RedfishScrapeReason::PathMissing,
ScraperError::InvalidValue => RedfishScrapeReason::InvalidValue,
}
}
fn fetch_error_reason(err: &FetchError) -> RedfishScrapeReason {
match err {
FetchError::Transport(_) => RedfishScrapeReason::Unreachable,
FetchError::Timeout => RedfishScrapeReason::Timeout,
FetchError::HttpStatus(_) => RedfishScrapeReason::HttpError,
FetchError::BodyRead(_) => RedfishScrapeReason::BodyReadError,
FetchError::RequestBuild(_) => RedfishScrapeReason::RequestError,
}
}
#[must_use]
pub fn spawn_scraper(
cfg: RedfishConfig,
state: Arc<RedfishState>,
metrics: Arc<MetricsState>,
) -> tokio::task::JoinHandle<()> {
tokio::spawn(async move {
run_scraper_loop(cfg, state, metrics).await;
})
}
async fn fetch_chassis_once(
client: &HttpClient,
uri: &hyper::Uri,
auth: Option<&AuthHeader>,
) -> Result<String, ScraperError> {
let bytes = http_client::fetch_get(
client,
uri,
"perf-sentinel/redfish-scraper",
Duration::from_secs(5),
auth,
)
.await
.map_err(ScraperError::Fetch)?;
String::from_utf8(bytes.to_vec()).map_err(ScraperError::Utf8)
}
async fn scrape_chassis(
client: &HttpClient,
uri: &hyper::Uri,
auth: Option<&AuthHeader>,
schema: RedfishSchema,
) -> Result<f64, ScraperError> {
let body = fetch_chassis_once(client, uri, auth).await?;
match parse_redfish_power(&body, schema) {
ParseOutcome::Ok(w) => Ok(w),
ParseOutcome::InvalidJson => Err(ScraperError::InvalidJson),
ParseOutcome::PathMissing => Err(ScraperError::PathMissing),
ParseOutcome::InvalidValue => Err(ScraperError::InvalidValue),
}
}
fn parse_chassis_uris(cfg: &RedfishConfig) -> Vec<(String, hyper::Uri, RedfishSchema)> {
use std::str::FromStr;
let mut out = Vec::with_capacity(cfg.endpoints.len());
for (id, endpoint) in &cfg.endpoints {
match hyper::Uri::from_str(&endpoint.url) {
Ok(u) => out.push((id.clone(), u, endpoint.schema)),
Err(e) => {
tracing::error!(
chassis = %id,
error = %e,
"Redfish scraper dropping chassis: endpoint URI failed to parse"
);
}
}
}
out
}
fn record_chassis_failure(
chassis_id: &str,
uri: &hyper::Uri,
err: &ScraperError,
metrics: &MetricsState,
failure_streak_warned: &mut bool,
) {
let reason = scraper_error_reason(err);
metrics.redfish_scrape_failed.inc();
metrics
.redfish_scrape_failed_total
.with_label_values(&[reason.as_str()])
.inc();
if *failure_streak_warned {
tracing::debug!(
chassis = %chassis_id,
error = %err,
"Redfish scrape failed again"
);
} else {
tracing::warn!(
chassis = %chassis_id,
error = %err,
endpoint = %http_client::redact_endpoint(uri),
"Redfish scrape failed; subsequent failures will log at debug level"
);
*failure_streak_warned = true;
}
}
struct TickContext<'a> {
client: &'a HttpClient,
auth: Option<&'a AuthHeader>,
chassis_services: &'a HashMap<String, Vec<String>>,
deltas: &'a HashMap<String, u64>,
scrape_interval_secs: f64,
now_ms: u64,
}
struct TickOutcome {
any_success: bool,
any_change: bool,
}
async fn run_tick(
uris: &[(String, hyper::Uri, RedfishSchema)],
ctx: &TickContext<'_>,
next: &mut HashMap<String, ServiceEnergy>,
metrics: &MetricsState,
failure_streak_warned: &mut bool,
) -> TickOutcome {
let mut outcome = TickOutcome {
any_success: false,
any_change: false,
};
for (chassis_id, uri, schema) in uris {
match scrape_chassis(ctx.client, uri, ctx.auth, *schema).await {
Ok(watts) => {
outcome.any_success = true;
metrics.redfish_scrape_success.inc();
let services: &[String] = ctx
.chassis_services
.get(chassis_id)
.map_or(&[], Vec::as_slice);
if apply_chassis_scrape(
next,
services,
watts,
ctx.scrape_interval_secs,
ctx.deltas,
ctx.now_ms,
) {
outcome.any_change = true;
}
tracing::debug!(chassis = %chassis_id, watts = watts, "Redfish scrape succeeded");
}
Err(e) => {
record_chassis_failure(chassis_id, uri, &e, metrics, failure_streak_warned);
}
}
}
outcome
}
async fn run_scraper_loop(
cfg: RedfishConfig,
state: Arc<RedfishState>,
metrics: Arc<MetricsState>,
) {
if cfg.ca_bundle_path.is_some() {
tracing::error!(
"[green.redfish] ca_bundle_path is set but custom-CA TLS support \
is not yet implemented. The scraper will not start. Front the BMC \
with a reverse proxy presenting a publicly-signed cert, or remove \
ca_bundle_path to use the public TLS roots. See \
docs/LIMITATIONS.md#redfish-bmc-precision-bounds."
);
return;
}
let uris = parse_chassis_uris(&cfg);
if uris.is_empty() {
tracing::error!(
"[green.redfish] no valid chassis endpoints after URI parse, \
scraper will not start"
);
return;
}
let auth_context = format!("redfish ({} chassis)", uris.len());
let parsed_auth: Option<AuthHeader> = match parse_scraper_auth_header(
cfg.auth_header.as_deref(),
&auth_context,
&auth_context,
"redfish",
) {
ScraperAuthOutcome::Invalid => return,
ScraperAuthOutcome::None => None,
ScraperAuthOutcome::Some(h) => Some(h),
};
let chassis_services = build_chassis_services(&cfg.service_mappings);
let client = http_client::build_client();
let mut ticker = tokio::time::interval(cfg.scrape_interval);
ticker.set_missed_tick_behavior(tokio::time::MissedTickBehavior::Delay);
ticker.tick().await;
let mut snapshot_diff = OpsSnapshotDiff::default();
let mut failure_streak_warned = false;
let mut consecutive_failures: u32 = 0;
let mut unsupported_platform_warned = false;
let mut last_success_ms: u64 = monotonic_ms();
let scrape_interval_secs = cfg.scrape_interval.as_secs_f64();
tracing::info!(
chassis_count = uris.len(),
scrape_interval_secs = cfg.scrape_interval.as_secs(),
service_count = cfg.service_mappings.len(),
"Redfish scraper started"
);
loop {
ticker.tick().await;
let current_ops = metrics.snapshot_service_io_ops();
let deltas = snapshot_diff.delta_and_advance(current_ops);
let now = monotonic_ms();
let mut next = state.current_owned();
let ctx = TickContext {
client: &client,
auth: parsed_auth.as_ref(),
chassis_services: &chassis_services,
deltas: &deltas,
scrape_interval_secs,
now_ms: now,
};
let TickOutcome {
any_success,
any_change,
} = run_tick(&uris, &ctx, &mut next, &metrics, &mut failure_streak_warned).await;
if any_change {
state.publish(next);
}
if any_success {
failure_streak_warned = false;
consecutive_failures = 0;
last_success_ms = now;
metrics.redfish_last_scrape_age_seconds.set(0.0);
} else {
consecutive_failures = consecutive_failures.saturating_add(1);
let age_secs = now.saturating_sub(last_success_ms) as f64 / 1000.0;
metrics.redfish_last_scrape_age_seconds.set(age_secs);
if !unsupported_platform_warned
&& consecutive_failures >= UNSUPPORTED_PLATFORM_FAILURE_THRESHOLD
{
tracing::warn!(
consecutive_failures = consecutive_failures,
"Every Redfish chassis has been unreachable for \
{UNSUPPORTED_PLATFORM_FAILURE_THRESHOLD} consecutive scrapes. \
Check the BMC endpoints, credentials, and TLS. \
The daemon is falling back through the precedence chain. \
See docs/LIMITATIONS.md#redfish-bmc-precision-bounds."
);
unsupported_platform_warned = true;
}
}
}
}