1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
//! Module process health monitoring
//!
//! Monitors module processes for crashes and health issues.
use tokio::sync::mpsc;
use tokio::time::Duration;
use tracing::{debug, error, info, warn};
use crate::module::process::spawner::ModuleProcess;
use crate::module::traits::ModuleError;
use std::sync::Arc;
/// Health monitor for module processes.
///
/// Periodically checks whether a module process is still running, probes it
/// with an IPC heartbeat request, and reports abnormal exits on `crash_tx`.
pub struct ModuleProcessMonitor {
/// Time between consecutive health checks. `new` sets it to 5 seconds and
/// `with_interval` overrides it.
interval: Duration,
/// Channel on which a `(module name, error)` pair is sent when a monitored
/// module exits with a non-success status.
crash_tx: mpsc::UnboundedSender<(String, ModuleError)>,
}
/// Health status of a module process, as reported by
/// `ModuleProcessMonitor::check_health`.
#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
pub enum ModuleHealth {
/// The process is running and either answered the heartbeat or could not be
/// probed at all (no IPC client, or no Tokio runtime available).
Healthy,
/// The process is running but the heartbeat request failed or timed out.
Unresponsive,
/// The process is no longer running; the string describes what happened.
Crashed(String),
}
impl ModuleProcessMonitor {
    /// Maximum time the monitoring loops wait for a heartbeat response.
    const MONITOR_HEARTBEAT_TIMEOUT: Duration = Duration::from_secs(2);

    /// Maximum time [`Self::check_health`] waits for a heartbeat response.
    const CHECK_HEALTH_TIMEOUT: Duration = Duration::from_secs(1);

    /// Creates a monitor that reports crashes on `crash_tx`.
    ///
    /// The check interval starts at 5 seconds; use [`Self::with_interval`] to
    /// change it.
    pub fn new(crash_tx: mpsc::UnboundedSender<(String, ModuleError)>) -> Self {
        Self {
            interval: Duration::from_secs(5),
            crash_tx,
        }
    }

    /// Sets the time between consecutive health checks (builder style).
    ///
    /// The value is handed to `tokio::time::interval` when monitoring starts,
    /// which panics on a zero period, so `interval` must be non-zero.
    pub fn with_interval(mut self, interval: Duration) -> Self {
        self.interval = interval;
        self
    }

    /// Monitors an exclusively owned module process until it exits.
    ///
    /// On every tick the process is checked for liveness. While it is running
    /// a heartbeat request is sent over its IPC client (if any); heartbeat
    /// failures and timeouts are only logged. Once the process is found to
    /// have exited, its status is logged, a crash notification is sent for a
    /// non-success status, and the function returns.
    ///
    /// # Errors
    /// Returns the error produced by `ModuleProcess::wait` if collecting the
    /// exit status fails.
    ///
    /// # Panics
    /// Panics if the configured interval is zero (see [`Self::with_interval`]).
    pub async fn monitor_module(
        &self,
        module_name: String,
        mut process: ModuleProcess,
    ) -> Result<(), ModuleError> {
        info!("Starting health monitoring for module: {}", module_name);
        let mut ticker = tokio::time::interval(self.interval);
        loop {
            ticker.tick().await;
            if !process.is_running() {
                return self.report_exit(&module_name, &mut process).await;
            }
            Self::probe_heartbeat(&module_name, &mut process).await;
        }
    }

    /// Monitors a module process that is shared behind an `Arc<Mutex<_>>`.
    ///
    /// Behaves like [`Self::monitor_module`], but locks `shared_process` once
    /// per tick. The liveness check and the follow-up (exit handling or
    /// heartbeat) run under the same guard, so no other task can change the
    /// process in between. The lock is released before waiting for the next
    /// tick.
    ///
    /// # Errors
    /// Returns the error produced by `ModuleProcess::wait` if collecting the
    /// exit status fails.
    ///
    /// # Panics
    /// Panics if the configured interval is zero (see [`Self::with_interval`]).
    pub async fn monitor_module_shared(
        &self,
        module_name: String,
        shared_process: Arc<tokio::sync::Mutex<ModuleProcess>>,
    ) -> Result<(), ModuleError> {
        info!("Starting health monitoring for module: {}", module_name);
        let mut ticker = tokio::time::interval(self.interval);
        loop {
            ticker.tick().await;
            // One guard per tick; it is dropped at the end of the iteration so
            // other users of the process are not blocked while we sleep.
            let mut guard = shared_process.lock().await;
            if !guard.is_running() {
                return self.report_exit(&module_name, &mut *guard).await;
            }
            Self::probe_heartbeat(&module_name, &mut *guard).await;
        }
    }

    /// Performs a one-shot, synchronous health check of `process`.
    ///
    /// Returns [`ModuleHealth::Crashed`] when the process is not running,
    /// [`ModuleHealth::Unresponsive`] when the heartbeat request fails or does
    /// not complete within one second, and [`ModuleHealth::Healthy`]
    /// otherwise. That includes the cases where no heartbeat can be attempted
    /// because the process has no IPC client or no Tokio runtime is available
    /// on the calling thread.
    ///
    /// # Panics
    /// On a multi-thread runtime the blocking wait is wrapped in
    /// `tokio::task::block_in_place`, so calling this from an async task is
    /// fine. On other runtime flavors, calling it from inside an async task
    /// still panics because Tokio cannot block a thread that is driving tasks.
    /// Use it from a blocking thread there.
    pub fn check_health(process: &mut ModuleProcess) -> ModuleHealth {
        use crate::module::ipc::protocol::{MessageType, RequestMessage, RequestPayload};
        use tokio::runtime::{Handle, RuntimeFlavor};

        if !process.is_running() {
            return ModuleHealth::Crashed("Process exited".to_string());
        }
        let client = match process.client_mut() {
            Some(client) => client,
            // Nothing to probe with: liveness is all we can report.
            None => return ModuleHealth::Healthy,
        };
        let handle = match Handle::try_current() {
            Ok(handle) => handle,
            // No runtime to drive the request: liveness is all we can report.
            Err(_) => return ModuleHealth::Healthy,
        };

        // GetChainTip doubles as a lightweight heartbeat; correlation id 0 is
        // reserved for these probes.
        let heartbeat_request = RequestMessage {
            correlation_id: 0,
            request_type: MessageType::GetChainTip,
            payload: RequestPayload::GetChainTip,
        };
        let probe = tokio::time::timeout(
            Self::CHECK_HEALTH_TIMEOUT,
            client.request(heartbeat_request),
        );

        // `Handle::block_on` panics when invoked from a thread that is driving
        // async tasks. On a multi-thread runtime `block_in_place` first moves
        // the worker's other tasks elsewhere, which makes blocking legal.
        let outcome = match handle.runtime_flavor() {
            RuntimeFlavor::MultiThread => tokio::task::block_in_place(|| handle.block_on(probe)),
            _ => handle.block_on(probe),
        };

        match outcome {
            Ok(Ok(_)) => ModuleHealth::Healthy,
            _ => ModuleHealth::Unresponsive,
        }
    }

    /// Collects the exit status of a process that is no longer running, logs
    /// it, and sends a crash notification for a non-success status.
    async fn report_exit(
        &self,
        module_name: &str,
        process: &mut ModuleProcess,
    ) -> Result<(), ModuleError> {
        match process.wait().await? {
            Some(status) => {
                if status.success() {
                    info!("Module {} exited normally", module_name);
                } else {
                    let error_msg = format!(
                        "Module {} exited with error: {:?}",
                        module_name,
                        status.code()
                    );
                    error!("{}", error_msg);
                    // Best effort: sending only fails when the receiver is
                    // gone, which must not fail the monitor itself.
                    let notification =
                        (module_name.to_owned(), ModuleError::ModuleCrashed(error_msg));
                    if self.crash_tx.send(notification).is_err() {
                        warn!(
                            "Module {} crash notification dropped: receiver closed",
                            module_name
                        );
                    }
                }
            }
            None => warn!("Module {} process status unknown", module_name),
        }
        Ok(())
    }

    /// Sends one heartbeat request over the process's IPC client (if any) and
    /// logs the outcome; failures and timeouts are logged but not escalated.
    async fn probe_heartbeat(module_name: &str, process: &mut ModuleProcess) {
        use crate::module::ipc::protocol::{MessageType, RequestMessage, RequestPayload};

        let client = match process.client_mut() {
            Some(client) => client,
            None => {
                debug!(
                    "Module {} has no IPC client for heartbeat check",
                    module_name
                );
                return;
            }
        };

        // GetChainTip doubles as a lightweight heartbeat; correlation id 0 is
        // reserved for these probes.
        let heartbeat_request = RequestMessage {
            correlation_id: 0,
            request_type: MessageType::GetChainTip,
            payload: RequestPayload::GetChainTip,
        };
        let heartbeat_result = tokio::time::timeout(
            Self::MONITOR_HEARTBEAT_TIMEOUT,
            client.request(heartbeat_request),
        )
        .await;

        match heartbeat_result {
            Ok(Ok(_)) => debug!("Module {} heartbeat OK", module_name),
            Ok(Err(e)) => warn!("Module {} heartbeat failed: {}", module_name, e),
            Err(_) => warn!("Module {} heartbeat timeout", module_name),
        }
    }
}