groups:
# Per-node alerts. None of these expressions aggregate, so each matching
# series produces its own alert instance.
- name: ant-node-alerts
  rules:
  # Scrape target has reported up == 0 continuously for 2 minutes.
  # NOTE(review): the selector has no job/label matcher, so it covers every
  # scrape target this Prometheus knows about - confirm that is intended.
  # NOTE(review): the description reads a `region` label; it renders empty
  # if the `up` series does not carry one - verify against the scrape config.
  - alert: AntNodeDown
    expr: up == 0
    for: 2m
    labels:
      severity: critical
    annotations:
      summary: "Autonomi node unreachable"
      description: "Node {{ $labels.instance }} in {{ $labels.region }} has been down for more than 2 minutes"

  # p2p_health_status has been 0 for 5 minutes.
  - alert: AntNodeUnhealthy
    expr: p2p_health_status == 0
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Autonomi node unhealthy"
      description: "Node {{ $labels.instance }} is reporting unhealthy status for more than 5 minutes"

  # Peer count has stayed below 3 for 5 minutes.
  # This also matches a peer count of 0, so it can fire together with the
  # critical AntNoPeers alert for the same node.
  - alert: AntLowPeerCount
    expr: p2p_network_peer_count < 3
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Low peer count"
      description: "Node {{ $labels.instance }} has only {{ $value }} peers (minimum 3 expected)"

  # Peer count has been exactly 0 for 2 minutes.
  - alert: AntNoPeers
    expr: p2p_network_peer_count == 0
    for: 2m
    labels:
      severity: critical
    annotations:
      summary: "Node has no peers"
      description: "Node {{ $labels.instance }} has no peers - potentially isolated"

  # DHT routing table has held fewer than 20 entries for 10 minutes.
  - alert: AntLowDHTSize
    expr: p2p_dht_routing_table_size < 20
    for: 10m
    labels:
      severity: warning
    annotations:
      summary: "Low DHT routing table"
      description: "Node {{ $labels.instance }} has only {{ $value }} DHT entries (minimum 20 expected)"

  # Failed connections are accumulating faster than 0.1 per second (5m rate),
  # sustained for 5 minutes.
  # The alert name says "packet loss", but the metric that is actually
  # measured is failed connections.
  - alert: AntHighPacketLoss
    expr: rate(p2p_network_failed_connections_total[5m]) > 0.1
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "High connection failure rate"
      description: "Node {{ $labels.instance }} is experiencing high connection failures"
# Cluster-wide alerts: these aggregate p2p metrics across nodes. Only nodes
# that currently export the metric are counted in any of the expressions.
- name: ant-cluster-alerts
  rules:
  # Fraction of reporting nodes whose p2p_health_status equals 1 has been
  # below 95% for 5 minutes.
  # `== bool 1` turns every series into 0 or 1 instead of dropping the
  # non-matching ones. With a plain `== 1` filter the numerator is an empty
  # vector when no node is healthy, the division returns nothing, and the
  # alert cannot fire during a total outage.
  - alert: AntClusterUnhealthy
    expr: (sum(p2p_health_status == bool 1) / count(p2p_health_status)) < 0.95
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: "Cluster health below 95%"
      description: "Only {{ $value | humanizePercentage }} of nodes are healthy"

  # Same ratio as AntClusterUnhealthy, computed per `region` label with a
  # 90% threshold.
  # `== bool 1` keeps a region with zero healthy nodes in the numerator
  # (as 0), so a fully unhealthy region still produces a result and alerts.
  - alert: AntRegionDegraded
    expr: (sum(p2p_health_status == bool 1) by (region) / count(p2p_health_status) by (region)) < 0.9
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Region {{ $labels.region }} degraded"
      description: "Only {{ $value | humanizePercentage }} of nodes in {{ $labels.region }} are healthy"

  # Standard deviation of peer count across all reporting nodes has exceeded
  # 10 for 10 minutes.
  - alert: AntPotentialPartition
    expr: stddev(p2p_network_peer_count) > 10
    for: 10m
    labels:
      severity: warning
    annotations:
      summary: "Possible network partition"
      description: "High variance in peer counts suggests potential network partition"
# Payment-path alerts: verification failures and quote latency.
- name: ant-payment-alerts
  rules:
  # Payment verification failures are accumulating faster than 0.01 per
  # second (5m rate, roughly one every 100 seconds), sustained for 5 minutes.
  - alert: AntPaymentVerificationFailures
    expr: rate(payment_verification_failed_total[5m]) > 0.01
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Payment verification failures"
      description: "Node {{ $labels.instance }} has elevated payment verification failures"

  # 95th percentile of quote generation duration, estimated from the
  # histogram buckets over a 5m window, has been above 1 second for
  # 5 minutes.
  # The bucket rates are not aggregated before histogram_quantile, so the
  # percentile is evaluated separately for each label set (apart from `le`).
  - alert: AntQuoteGenerationSlow
    expr: histogram_quantile(0.95, rate(quote_generation_duration_seconds_bucket[5m])) > 1
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Slow quote generation"
      description: "95th percentile quote generation time is {{ $value }}s (expected < 1s)"
# Security and trust alerts: diversity enforcement, trust thresholds,
# close group failures and evictions. All rate() thresholds are per second,
# averaged over a 5m window.
- name: ant-security-alerts
  rules:
  # IP diversity rejections above 0.1/s for 5 minutes.
  - alert: AntHighIPDiversityRejections
    expr: rate(ant_ip_diversity_rejections_total[5m]) > 0.1
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "High IP diversity rejections"
      description: "Node {{ $labels.instance }} is rejecting {{ $value | humanize }}/s peers due to IP diversity enforcement"

  # Geographic diversity rejections above 0.1/s for 5 minutes.
  - alert: AntHighGeoDiversityRejections
    expr: rate(ant_geographic_diversity_rejections_total[5m]) > 0.1
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "High geographic diversity rejections"
      description: "Node {{ $labels.instance }} is rejecting {{ $value | humanize }}/s peers due to geographic diversity enforcement"

  # Trust threshold violations above 0.05/s for 5 minutes.
  - alert: AntTrustViolationsHigh
    expr: rate(ant_trust_threshold_violations_total[5m]) > 0.05
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Elevated trust violations"
      description: "Node {{ $labels.instance }} is seeing {{ $value | humanize }}/s trust threshold violations"

  # ant_low_trust_nodes_current has stayed above 10 for 10 minutes.
  - alert: AntHighLowTrustNodes
    expr: ant_low_trust_nodes_current > 10
    for: 10m
    labels:
      severity: warning
    annotations:
      summary: "Many low trust nodes detected"
      description: "Node {{ $labels.instance }} is tracking {{ $value }} low trust nodes in its routing table"

  # Informational: ant_enforcement_mode_strict has been 1 for 1 minute.
  - alert: AntStrictEnforcementActivated
    expr: ant_enforcement_mode_strict == 1
    for: 1m
    labels:
      severity: info
    annotations:
      summary: "Strict enforcement mode activated"
      description: "Node {{ $labels.instance }} has activated strict trust enforcement mode"

  # Close group failures above 0.01/s for 5 minutes, evaluated per series.
  # The description reads a `type` label from the series.
  # NOTE(review): rate() assumes this metric is a counter even though its
  # name has no `_total` suffix - confirm against the exporter.
  - alert: AntCloseGroupFailures
    expr: rate(ant_close_group_failure_by_type[5m]) > 0.01
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Close group failures detected"
      description: "Node {{ $labels.instance }} is experiencing close group failures of type {{ $labels.type }} at {{ $value | humanize }}/s"

  # Eviction rate summed per instance across all other labels, above 0.1/s
  # for 5 minutes.
  # NOTE(review): as with AntCloseGroupFailures, the metric is presumably a
  # counter despite the missing `_total` suffix - verify.
  - alert: AntHighEvictionRate
    expr: sum(rate(ant_eviction_by_reason[5m])) by (instance) > 0.1
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "High node eviction rate"
      description: "Node {{ $labels.instance }} is evicting nodes at {{ $value | humanize }}/s"