actr-hyper 0.3.1

Hyper — Actor platform infrastructure: sandbox, transport, scheduler, WASM engine, signing, AIS bootstrap, persistence & crypto primitives
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
//! Heartbeat management for ActrNode
//!
//! This module contains functions for sending periodic heartbeat messages
//! to the signaling server and handling responses.

use crate::ais_client::AisClient;
use crate::lifecycle::CredentialState;
use crate::transport::NetworkError;
use crate::wire::webrtc::SignalingClient;
use crate::wire::webrtc::{HookCallback, HookEvent};
use actr_protocol::{ActrId, RegisterRequest, ServiceAvailabilityState};
use actr_runtime_mailbox::Mailbox;
use std::sync::Arc;
use std::time::{Duration, SystemTime};
use tokio_util::sync::CancellationToken;

/// Convert a `prost_types::Timestamp` expiry to a wall-clock
/// `SystemTime`. Clamps negative seconds to the Unix epoch so downstream
/// `Duration` math stays safe.
fn expiry_to_system_time(expires_at: &prost_types::Timestamp) -> SystemTime {
    SystemTime::UNIX_EPOCH + Duration::from_secs(expires_at.seconds.max(0) as u64)
}

/// Invoke a [`HookCallback`], if present, awaiting its completion.
async fn fire_hook(cb: Option<&HookCallback>, event: HookEvent) {
    if let Some(cb) = cb {
        cb(event).await;
    }
}

/// Typical mailbox capacity for backlog ratio calculation
/// A typical_capacity of 1000 means 100 messages = 10% backlog
const TYPICAL_CAPACITY: f32 = 1000.0;

/// Log the first heartbeat failure immediately, then sample long failure runs.
/// With the typical 5s-30s heartbeat interval this keeps offline periods visible
/// without flooding client logs.
const HEARTBEAT_FAILURE_LOG_EVERY: u64 = 12;

fn should_log_heartbeat_failure(consecutive_failures: u64) -> bool {
    consecutive_failures == 1 || consecutive_failures.is_multiple_of(HEARTBEAT_FAILURE_LOG_EVERY)
}

/// Get power reserve, mailbox backlog and calculate service availability
///
/// This function fetches the power reserve from pwrzv and mailbox backlog,
/// then calculates the service availability state based on both metrics.
///
/// # Arguments
/// * `mailbox` - Mailbox instance to get backlog statistics
///
/// # Returns
/// A tuple of (power_reserve, mailbox_backlog, availability) where:
/// - `power_reserve`: Power reserve level from pwrzv (1.0 to 5.0, where higher = more available)
/// - `mailbox_backlog`: Mailbox backlog ratio (0.0 to 1.0, where higher = more backlog)
/// - `availability`: Calculated ServiceAvailabilityState
async fn get_power_reserve_and_availability(
    mailbox: &Arc<dyn Mailbox>,
) -> (f32, f32, ServiceAvailabilityState) {
    // TODO: Ensure the default value is correct
    // Get real power reserve from pwrzv (returns 1.0 to 5.0, where higher = more available)
    let power_reserve = pwrzv::get_power_reserve_level_direct().await.unwrap_or(1.0); // Default to minimum capacity on error

    // Get mailbox backlog from mailbox stats
    // Calculate backlog ratio: (queued + inflight) / typical_capacity
    let mailbox_backlog = match mailbox.status().await {
        Ok(stats) => {
            let total_messages = (stats.queued_messages + stats.inflight_messages) as f32;
            (total_messages / TYPICAL_CAPACITY).min(1.0)
        }
        Err(e) => {
            tracing::warn!("⚠️ Failed to get mailbox stats: {}", e);
            0.0
        }
    };

    // TODO: Improve availability calculation
    // Determine availability based on power reserve and mailbox backlog
    // Power reserve range: 1.0 (worst) to 5.0 (best)
    // Thresholds adjusted for 1.0-5.0 range: 4.2 (80%), 3.0 (50%), 1.8 (20%)
    let availability = if power_reserve > 4.2 && mailbox_backlog < 0.5 {
        ServiceAvailabilityState::Full
    } else if power_reserve > 3.0 && mailbox_backlog < 0.8 {
        ServiceAvailabilityState::Degraded
    } else if power_reserve > 1.8 && mailbox_backlog < 0.95 {
        ServiceAvailabilityState::Overloaded
    } else {
        ServiceAvailabilityState::Unavailable
    };

    (power_reserve, mailbox_backlog, availability)
}

/// Send a single heartbeat and handle the Pong response
///
/// This function sends a heartbeat message to the signaling server,
/// waits for the Pong response, and handles credential warnings if present.
/// If credential has expired (401 error), it triggers re-registration.
///
/// # Arguments
/// * `client` - Signaling client for sending heartbeats
/// * `actor_id` - Actor ID for heartbeat messages
/// * `credential_state` - Shared credential state
/// * `mailbox` - Mailbox instance for backlog statistics
/// * `heartbeat_interval` - Interval between heartbeats (used for timeout calculation)
/// * `register_request` - RegisterRequest for re-registration on credential expiry
///
/// Returns `Some(new_actor_id)` when re-registration assigns a new ActrId,
/// so the caller can update its loop variable for subsequent heartbeats.
#[allow(clippy::too_many_arguments)]
async fn send_heartbeat_and_handle_response(
    client: &Arc<dyn SignalingClient>,
    actor_id: &ActrId,
    credential_state: &CredentialState,
    mailbox: &Arc<dyn Mailbox>,
    heartbeat_interval: Duration,
    register_request: &RegisterRequest,
    consecutive_failures: &mut u64,
    ais_endpoint: &str,
    hook_callback: Option<&HookCallback>,
) -> Option<ActrId> {
    // Get current credential from shared state
    let current_credential = credential_state.credential().await;

    // Get power reserve, mailbox backlog and calculate availability
    let (power_reserve, mailbox_backlog, availability) =
        get_power_reserve_and_availability(mailbox).await;

    let ping_timeout_secs = (heartbeat_interval.as_secs() as f64 * 0.4) as u64;
    let pong_response = tokio::time::timeout(
        Duration::from_secs(ping_timeout_secs),
        client.send_heartbeat(
            actor_id.clone(),
            current_credential.clone(),
            availability,
            power_reserve,
            mailbox_backlog,
        ),
    )
    .await;

    let pong = match pong_response {
        Ok(Ok(pong)) => pong,
        Ok(Err(NetworkError::CredentialExpired(msg))) => {
            // Credential has expired, trigger re-registration
            tracing::warn!(
                "⚠️ Credential expired during heartbeat: {}. Attempting re-registration.",
                msg
            );

            // Fire `on_credential_expiring` with the last-known expiry
            // timestamp (best-effort — the credential might already be
            // past its advertised `expires_at`, but firing the event
            // gives the workload one final chance to observe the
            // transition before we attempt re-registration).
            if let Some(expires_at) = credential_state.expires_at().await {
                fire_hook(
                    hook_callback,
                    HookEvent::CredentialExpiring {
                        new_expiry: expiry_to_system_time(&expires_at),
                    },
                )
                .await;
            }

            let new_actor_id = re_register_task(
                client.clone(),
                actor_id.clone(),
                register_request.clone(),
                credential_state.clone(),
                ais_endpoint.to_string(),
                hook_callback.cloned(),
            )
            .await;

            // Return updated ActrId only if it actually changed
            if &new_actor_id != actor_id {
                return Some(new_actor_id);
            }
            return None;
        }
        Ok(Err(e)) => {
            *consecutive_failures += 1;
            if should_log_heartbeat_failure(*consecutive_failures) {
                tracing::warn!(
                    consecutive_failures = *consecutive_failures,
                    "⚠️ Failed to send heartbeat or receive Pong: {}",
                    e
                );
            } else {
                tracing::debug!(
                    consecutive_failures = *consecutive_failures,
                    "Suppressed repeated heartbeat failure: {}",
                    e
                );
            }
            return None;
        }
        Err(_) => {
            *consecutive_failures += 1;
            if should_log_heartbeat_failure(*consecutive_failures) {
                tracing::warn!(
                    consecutive_failures = *consecutive_failures,
                    "⚠️ Heartbeat timeout after {}s",
                    ping_timeout_secs
                );
            } else {
                tracing::debug!(
                    consecutive_failures = *consecutive_failures,
                    "Suppressed repeated heartbeat timeout after {}s",
                    ping_timeout_secs
                );
            }
            return None;
        }
    };

    if *consecutive_failures > 0 {
        tracing::info!(
            consecutive_failures = *consecutive_failures,
            "✅ Heartbeat recovered after consecutive failures"
        );
        *consecutive_failures = 0;
    }

    tracing::trace!(
        "💓 Heartbeat sent and Pong received for Actor {} (power_reserve={:.2}, mailbox_backlog={:.2}, availability={:?})",
        actor_id,
        power_reserve,
        mailbox_backlog,
        availability
    );
    // TODO: Handle suggest_interval_secs
    // Handle credential_warning
    if let Some(warning) = pong.credential_warning {
        tracing::warn!(
            "⚠️ Credential warning received: type={:?}, message={}",
            warning.r#type(),
            warning.message
        );

        // Fire `on_credential_expiring` hook once per warning so the
        // workload can preload its credential-renewed handlers (e.g. to
        // rotate derived secrets) before the refresh round-trip lands.
        if let Some(expires_at) = credential_state.expires_at().await {
            fire_hook(
                hook_callback,
                HookEvent::CredentialExpiring {
                    new_expiry: expiry_to_system_time(&expires_at),
                },
            )
            .await;
        }

        // Trigger immediate credential refresh in a spawned task
        tokio::spawn(credential_refresh_task(
            client.clone(),
            actor_id.clone(),
            credential_state.clone(),
            hook_callback.cloned(),
        ));
    }
    None
}

/// Heartbeat task that periodically sends Ping messages to signaling server
///
/// This task runs in a loop, sending heartbeat messages at the specified interval
/// and handling Pong responses, including credential warnings.
/// If credential has expired (401 error), it triggers re-registration.
///
/// # Arguments
/// * `shutdown` - Cancellation token for graceful shutdown
/// * `client` - Signaling client for sending heartbeats
/// * `actor_id` - Actor ID for heartbeat messages
/// * `credential_state` - Shared credential state
/// * `mailbox` - Mailbox instance for backlog statistics
/// * `heartbeat_interval` - Interval between heartbeats
/// * `register_request` - RegisterRequest for re-registration on credential expiry
/// * `ais_endpoint` - AIS HTTP endpoint for re-registration
#[allow(clippy::too_many_arguments)]
pub async fn heartbeat_task(
    shutdown: CancellationToken,
    client: Arc<dyn SignalingClient>,
    actor_id: ActrId,
    credential_state: CredentialState,
    mailbox: Arc<dyn Mailbox>,
    heartbeat_interval: Duration,
    register_request: RegisterRequest,
    ais_endpoint: String,
    hook_callback: Option<HookCallback>,
) {
    let mut interval = tokio::time::interval(heartbeat_interval);
    let mut actor_id = actor_id;
    let mut consecutive_failures = 0;

    loop {
        tokio::select! {
            _ = shutdown.cancelled() => {
                tracing::info!("💓 Heartbeat task received shutdown signal");
                break;
            }
            _ = interval.tick() => {
                if let Some(new_actor_id) = send_heartbeat_and_handle_response(
                    &client,
                    &actor_id,
                    &credential_state,
                    &mailbox,
                    heartbeat_interval,
                    &register_request,
                    &mut consecutive_failures,
                    &ais_endpoint,
                    hook_callback.as_ref(),
                )
                .await {
                    tracing::info!(
                        "🔄 Heartbeat actor_id updated: {} -> {}",
                        actor_id,
                        new_actor_id,
                    );
                    actor_id = new_actor_id;
                }
            }
        }
    }
    tracing::info!("✅ Heartbeat task terminated gracefully");
}

/// Refresh credential for an actor
///
/// This function sends a credential update request to the signaling server
/// and updates the shared credential state upon success.
///
/// # Arguments
/// * `client` - Signaling client for sending credential update request
/// * `actor_id` - Actor ID for the credential update
/// * `credential_state` - Shared credential state to update
async fn credential_refresh_task(
    client: Arc<dyn SignalingClient>,
    actor_id: ActrId,
    credential_state: CredentialState,
    hook_callback: Option<HookCallback>,
) {
    tracing::info!("🔑 Refreshing credential for Actor {}", actor_id);

    match client
        .send_credential_update_request(actor_id.clone(), credential_state.credential().await)
        .await
    {
        Ok(register_response) => {
            match register_response.result {
                Some(actr_protocol::register_response::Result::Success(register_ok)) => {
                    let new_credential = register_ok.credential;
                    let new_expires_at = register_ok.credential_expires_at;
                    // TurnCredential is a required proto field; wrap as Some directly
                    let new_turn_credential = Some(register_ok.turn_credential);

                    // Update shared credential state, synchronously updating TURN credential
                    credential_state
                        .update(new_credential.clone(), new_expires_at, new_turn_credential)
                        .await;

                    tracing::info!(
                        "✅ Credential refreshed successfully for Actor {}",
                        actor_id,
                    );

                    tracing::debug!("TurnCredential updated, TURN authentication ready");

                    if let Some(expires_at) = &new_expires_at {
                        tracing::debug!("⏰ New credential expires at: {}s", expires_at.seconds);
                        // Fire `on_credential_renewed` so the workload
                        // can rotate downstream state tied to the old
                        // credential (e.g. AIS-derived tokens).
                        fire_hook(
                            hook_callback.as_ref(),
                            HookEvent::CredentialRenewed {
                                new_expiry: expiry_to_system_time(expires_at),
                            },
                        )
                        .await;
                    }
                }
                Some(actr_protocol::register_response::Result::Error(err)) => {
                    tracing::error!(
                        "❌ Credential refresh failed: code={}, message={}",
                        err.code,
                        err.message
                    );
                }
                None => {
                    tracing::error!("❌ Credential refresh response missing result");
                }
            }
        }
        Err(e) => {
            tracing::warn!("⚠️ Failed to send credential update request: {}", e);
        }
    }
}

/// Re-register actor after credential expiry
///
/// When the credential has completely expired, re-register via AIS HTTP,
/// then disconnect/reconnect the signaling WebSocket with the new credential.
///
/// # Arguments
/// * `client` - Signaling client for reconnection
/// * `actor_id` - Current actor ID (used for logging)
/// * `register_request` - RegisterRequest containing actor type, realm, and service spec
/// * `credential_state` - Shared credential state to update
/// * `ais_endpoint` - AIS HTTP endpoint for registration
async fn re_register_task(
    client: Arc<dyn SignalingClient>,
    actor_id: ActrId,
    register_request: RegisterRequest,
    credential_state: CredentialState,
    ais_endpoint: String,
    hook_callback: Option<HookCallback>,
) -> ActrId {
    tracing::info!(
        "🔄 Re-registering actor {} after credential expiry via AIS HTTP (type: {}/{})",
        actor_id,
        register_request.actr_type.manufacturer,
        register_request.actr_type.name
    );

    // Step 1: Register via AIS HTTP to get new credential
    let ais = AisClient::new(&ais_endpoint);
    let resp = match ais.register_with_manifest(register_request.clone()).await {
        Ok(resp) => resp,
        Err(e) => {
            tracing::error!("❌ AIS re-registration HTTP request failed: {}", e);
            return actor_id;
        }
    };

    match resp.result {
        Some(actr_protocol::register_response::Result::Success(register_ok)) => {
            let new_actor_id = register_ok.actr_id.clone();
            let new_credential = register_ok.credential;
            let new_expires_at = register_ok.credential_expires_at;
            let new_turn_credential = Some(register_ok.turn_credential);

            // Update shared credential state
            credential_state
                .update(new_credential.clone(), new_expires_at, new_turn_credential)
                .await;

            // Step 2: Clear old identity from signaling client
            client.clear_identity().await;

            // Step 3: Disconnect old WebSocket session
            tracing::info!("🔌 Disconnecting signaling client to refresh session");
            if let Err(e) = client.disconnect().await {
                tracing::warn!("⚠️ Disconnect failed (non-fatal, continuing): {}", e);
            }

            // Step 4: Update signaling client identity with new credential
            client.set_actor_id(new_actor_id.clone()).await;
            client.set_credential_state(credential_state.clone()).await;

            // Step 5: Reconnect signaling WebSocket (URL will carry new credential)
            tracing::info!("🔗 Reconnecting signaling client with new credential");
            if let Err(e) = client.connect().await {
                tracing::error!("❌ Failed to reconnect after re-registration: {}", e);
                // Credential is updated even if reconnect fails — next heartbeat retry will reconnect
            }

            tracing::info!(
                "✅ Re-registration successful via AIS HTTP (ActrId: {})",
                new_actor_id,
            );

            tracing::debug!("TurnCredential updated, TURN authentication ready");

            if let Some(expires_at) = &new_expires_at {
                tracing::debug!("⏰ New credential expires at: {}s", expires_at.seconds);
                // Fire `on_credential_renewed` for the re-registration
                // path as well — same semantics as the refresh path.
                fire_hook(
                    hook_callback.as_ref(),
                    HookEvent::CredentialRenewed {
                        new_expiry: expiry_to_system_time(expires_at),
                    },
                )
                .await;
            }

            new_actor_id
        }
        Some(actr_protocol::register_response::Result::Error(err)) => {
            tracing::error!(
                "❌ AIS re-registration failed: code={}, message={}",
                err.code,
                err.message
            );
            actor_id
        }
        None => {
            tracing::error!("❌ AIS re-registration response missing result");
            actor_id
        }
    }
}