agave_validator/commands/wait_for_restart_window/
mod.rs

1use {
2    crate::{
3        admin_rpc_service,
4        commands::{FromClapArgMatches, Result},
5        new_spinner_progress_bar, println_name_value,
6    },
7    clap::{value_t_or_exit, App, Arg, ArgMatches, SubCommand},
8    console::style,
9    solana_clap_utils::{
10        input_parsers::pubkey_of,
11        input_validators::{is_parsable, is_pubkey_or_keypair, is_valid_percentage},
12    },
13    solana_clock::{Slot, DEFAULT_S_PER_SLOT},
14    solana_commitment_config::CommitmentConfig,
15    solana_pubkey::Pubkey,
16    solana_rpc_client::rpc_client::RpcClient,
17    solana_rpc_client_api::config::RpcLeaderScheduleConfig,
18    std::{
19        collections::VecDeque,
20        path::Path,
21        time::{Duration, SystemTime},
22    },
23};
24
/// Name under which this subcommand is registered with clap.
const COMMAND: &str = "wait-for-restart-window";

/// Default for `--min-idle-time`, in minutes. Kept as a string because it is handed to clap's
/// `default_value`.
const DEFAULT_MIN_IDLE_TIME: &str = "10";
/// Default for `--max-delinquent-stake`, in percent. Kept as a string because it is handed to
/// clap's `default_value`.
const DEFAULT_MAX_DELINQUENT_STAKE: &str = "5";
29
/// Parsed command-line arguments of the `wait-for-restart-window` subcommand.
#[derive(Debug, PartialEq)]
pub struct WaitForRestartWindowArgs {
    /// Value of `--min-idle-time`: minimum time, in minutes, that the validator should not be
    /// leader before restarting.
    pub min_idle_time: usize,
    /// Value of `--identity`: validator identity to monitor. When `None`, the identity reported
    /// by the local validator's RPC is used.
    pub identity: Option<Pubkey>,
    /// Value of `--max-delinquent-stake`: maximum delinquent stake, in percent, permitted for a
    /// restart.
    pub max_delinquent_stake: u8,
    /// Set by `--skip-new-snapshot-check`: do not wait for a new snapshot.
    pub skip_new_snapshot_check: bool,
    /// Set by `--skip-health-check`: treat the node as healthy without asking it.
    pub skip_health_check: bool,
}
38
39impl FromClapArgMatches for WaitForRestartWindowArgs {
40    fn from_clap_arg_match(matches: &ArgMatches) -> Result<Self> {
41        Ok(WaitForRestartWindowArgs {
42            min_idle_time: value_t_or_exit!(matches, "min_idle_time", usize),
43            identity: pubkey_of(matches, "identity"),
44            max_delinquent_stake: value_t_or_exit!(matches, "max_delinquent_stake", u8),
45            skip_new_snapshot_check: matches.is_present("skip_new_snapshot_check"),
46            skip_health_check: matches.is_present("skip_health_check"),
47        })
48    }
49}
50
51pub(crate) fn command<'a>() -> App<'a, 'a> {
52    SubCommand::with_name(COMMAND)
53        .about("Monitor the validator for a good time to restart")
54        .arg(
55            Arg::with_name("min_idle_time")
56                .long("min-idle-time")
57                .takes_value(true)
58                .validator(is_parsable::<usize>)
59                .value_name("MINUTES")
60                .default_value(DEFAULT_MIN_IDLE_TIME)
61                .help(
62                    "Minimum time that the validator should not be leader before restarting",
63                ),
64        )
65        .arg(
66            Arg::with_name("identity")
67                .long("identity")
68                .value_name("ADDRESS")
69                .takes_value(true)
70                .validator(is_pubkey_or_keypair)
71                .help("Validator identity to monitor [default: your validator]"),
72        )
73        .arg(
74            Arg::with_name("max_delinquent_stake")
75                .long("max-delinquent-stake")
76                .takes_value(true)
77                .validator(is_valid_percentage)
78                .value_name("PERCENT")
79                .default_value(DEFAULT_MAX_DELINQUENT_STAKE)
80                .help("The maximum delinquent stake % permitted for a restart"),
81        )
82        .arg(
83            Arg::with_name("skip_new_snapshot_check")
84                .long("skip-new-snapshot-check")
85                .help("Skip check for a new snapshot"),
86        )
87        .arg(
88            Arg::with_name("skip_health_check")
89                .long("skip-health-check")
90                .help("Skip health check"),
91        )
92        .after_help(
93            "Note: If this command exits with a non-zero status then this not a good time for a restart",
94        )
95}
96
97pub fn execute(matches: &ArgMatches, ledger_path: &Path) -> Result<()> {
98    let wait_for_restart_window_args = WaitForRestartWindowArgs::from_clap_arg_match(matches)?;
99
100    wait_for_restart_window(
101        ledger_path,
102        wait_for_restart_window_args.identity,
103        wait_for_restart_window_args.min_idle_time,
104        wait_for_restart_window_args.max_delinquent_stake,
105        wait_for_restart_window_args.skip_new_snapshot_check,
106        wait_for_restart_window_args.skip_health_check,
107    )?;
108
109    Ok(())
110}
111
112pub fn wait_for_restart_window(
113    ledger_path: &Path,
114    identity: Option<Pubkey>,
115    min_idle_time_in_minutes: usize,
116    max_delinquency_percentage: u8,
117    skip_new_snapshot_check: bool,
118    skip_health_check: bool,
119) -> std::result::Result<(), Box<dyn std::error::Error>> {
120    let sleep_interval = Duration::from_secs(5);
121
122    let min_idle_slots = (min_idle_time_in_minutes as f64 * 60. / DEFAULT_S_PER_SLOT) as Slot;
123
124    let admin_client = admin_rpc_service::connect(ledger_path);
125    let rpc_addr = admin_rpc_service::runtime()
126        .block_on(async move { admin_client.await?.rpc_addr().await })
127        .map_err(|err| format!("validator RPC address request failed: {err}"))?
128        .ok_or("validator RPC is unavailable".to_string())?;
129    let rpc_client = RpcClient::new_socket(rpc_addr);
130
131    let my_identity = rpc_client.get_identity()?;
132    let identity = identity.unwrap_or(my_identity);
133    let monitoring_another_validator = identity != my_identity;
134    println_name_value("Identity:", &identity.to_string());
135    println_name_value(
136        "Minimum Idle Time:",
137        &format!("{min_idle_slots} slots (~{min_idle_time_in_minutes} minutes)"),
138    );
139
140    println!("Maximum permitted delinquency: {max_delinquency_percentage}%");
141
142    let mut current_epoch = None;
143    let mut leader_schedule = VecDeque::new();
144    let mut restart_snapshot = None;
145    let mut upcoming_idle_windows = vec![]; // Vec<(starting slot, idle window length in slots)>
146
147    let progress_bar = new_spinner_progress_bar();
148    let monitor_start_time = SystemTime::now();
149
150    let mut seen_incremential_snapshot = false;
151    loop {
152        let snapshot_slot_info = rpc_client.get_highest_snapshot_slot().ok();
153        let snapshot_slot_info_has_incremential = snapshot_slot_info
154            .as_ref()
155            .map(|snapshot_slot_info| snapshot_slot_info.incremental.is_some())
156            .unwrap_or_default();
157        seen_incremential_snapshot |= snapshot_slot_info_has_incremential;
158
159        let epoch_info = rpc_client.get_epoch_info_with_commitment(CommitmentConfig::processed())?;
160        let healthy = skip_health_check || rpc_client.get_health().ok().is_some();
161        let delinquent_stake_percentage = {
162            let vote_accounts = rpc_client.get_vote_accounts()?;
163            let current_stake: u64 = vote_accounts
164                .current
165                .iter()
166                .map(|va| va.activated_stake)
167                .sum();
168            let delinquent_stake: u64 = vote_accounts
169                .delinquent
170                .iter()
171                .map(|va| va.activated_stake)
172                .sum();
173            let total_stake = current_stake + delinquent_stake;
174            delinquent_stake as f64 / total_stake as f64
175        };
176
177        if match current_epoch {
178            None => true,
179            Some(current_epoch) => current_epoch != epoch_info.epoch,
180        } {
181            progress_bar.set_message(format!(
182                "Fetching leader schedule for epoch {}...",
183                epoch_info.epoch
184            ));
185            let first_slot_in_epoch = epoch_info.absolute_slot - epoch_info.slot_index;
186            leader_schedule = rpc_client
187                .get_leader_schedule_with_config(
188                    Some(first_slot_in_epoch),
189                    RpcLeaderScheduleConfig {
190                        identity: Some(identity.to_string()),
191                        ..RpcLeaderScheduleConfig::default()
192                    },
193                )?
194                .ok_or_else(|| {
195                    format!("Unable to get leader schedule from slot {first_slot_in_epoch}")
196                })?
197                .get(&identity.to_string())
198                .cloned()
199                .unwrap_or_default()
200                .into_iter()
201                .map(|slot_index| first_slot_in_epoch.saturating_add(slot_index as u64))
202                .filter(|slot| *slot > epoch_info.absolute_slot)
203                .collect::<VecDeque<_>>();
204
205            upcoming_idle_windows.clear();
206            {
207                let mut leader_schedule = leader_schedule.clone();
208                let mut max_idle_window = 0;
209
210                let mut idle_window_start_slot = epoch_info.absolute_slot;
211                while let Some(next_leader_slot) = leader_schedule.pop_front() {
212                    let idle_window = next_leader_slot - idle_window_start_slot;
213                    max_idle_window = max_idle_window.max(idle_window);
214                    if idle_window > min_idle_slots {
215                        upcoming_idle_windows.push((idle_window_start_slot, idle_window));
216                    }
217                    idle_window_start_slot = next_leader_slot;
218                }
219                if !leader_schedule.is_empty() && upcoming_idle_windows.is_empty() {
220                    return Err(format!(
221                        "Validator has no idle window of at least {} slots. Largest idle window \
222                       for epoch {} is {} slots",
223                        min_idle_slots, epoch_info.epoch, max_idle_window
224                    )
225                    .into());
226                }
227            }
228
229            current_epoch = Some(epoch_info.epoch);
230        }
231
232        let status = {
233            if !healthy {
234                style("Node is unhealthy").red().to_string()
235            } else {
236                // Wait until a hole in the leader schedule before restarting the node
237                let in_leader_schedule_hole = if epoch_info.slot_index + min_idle_slots
238                    > epoch_info.slots_in_epoch
239                {
240                    Err("Current epoch is almost complete".to_string())
241                } else {
242                    while leader_schedule
243                        .front()
244                        .map(|slot| *slot < epoch_info.absolute_slot)
245                        .unwrap_or(false)
246                    {
247                        leader_schedule.pop_front();
248                    }
249                    while upcoming_idle_windows
250                        .first()
251                        .map(|(slot, _)| *slot < epoch_info.absolute_slot)
252                        .unwrap_or(false)
253                    {
254                        upcoming_idle_windows.pop();
255                    }
256
257                    match leader_schedule.front() {
258                        None => {
259                            Ok(()) // Validator has no leader slots
260                        }
261                        Some(next_leader_slot) => {
262                            let idle_slots =
263                                next_leader_slot.saturating_sub(epoch_info.absolute_slot);
264                            if idle_slots >= min_idle_slots {
265                                Ok(())
266                            } else {
267                                Err(match upcoming_idle_windows.first() {
268                                    Some((starting_slot, length_in_slots)) => {
269                                        format!(
270                                            "Next idle window in {} slots, for {} slots",
271                                            starting_slot.saturating_sub(epoch_info.absolute_slot),
272                                            length_in_slots
273                                        )
274                                    }
275                                    None => format!(
276                                        "Validator will be leader soon. Next leader slot is \
277                                       {next_leader_slot}"
278                                    ),
279                                })
280                            }
281                        }
282                    }
283                };
284
285                match in_leader_schedule_hole {
286                    Ok(_) => {
287                        if skip_new_snapshot_check {
288                            break; // Restart!
289                        }
290                        let snapshot_slot = snapshot_slot_info.map(|snapshot_slot_info| {
291                            snapshot_slot_info
292                                .incremental
293                                .unwrap_or(snapshot_slot_info.full)
294                        });
295                        if restart_snapshot.is_none() {
296                            restart_snapshot = snapshot_slot;
297                        }
298                        if restart_snapshot == snapshot_slot && !monitoring_another_validator {
299                            "Waiting for a new snapshot".to_string()
300                        } else if delinquent_stake_percentage
301                            >= (max_delinquency_percentage as f64 / 100.)
302                        {
303                            style("Delinquency too high").red().to_string()
304                        } else if seen_incremential_snapshot && !snapshot_slot_info_has_incremential
305                        {
306                            // Restarts using just a full snapshot will put the node significantly
307                            // further behind than if an incremental snapshot is also used, as full
308                            // snapshots are larger and take much longer to create.
309                            //
310                            // Therefore if the node just created a new full snapshot, wait a
311                            // little longer until it creates the first incremental snapshot for
312                            // the full snapshot.
313                            "Waiting for incremental snapshot".to_string()
314                        } else {
315                            break; // Restart!
316                        }
317                    }
318                    Err(why) => style(why).yellow().to_string(),
319                }
320            }
321        };
322
323        progress_bar.set_message(format!(
324            "{} | Processed Slot: {} {} | {:.2}% delinquent stake | {}",
325            {
326                let elapsed =
327                    chrono::Duration::from_std(monitor_start_time.elapsed().unwrap()).unwrap();
328
329                format!(
330                    "{:02}:{:02}:{:02}",
331                    elapsed.num_hours(),
332                    elapsed.num_minutes() % 60,
333                    elapsed.num_seconds() % 60
334                )
335            },
336            epoch_info.absolute_slot,
337            if monitoring_another_validator {
338                "".to_string()
339            } else {
340                format!(
341                    "| Full Snapshot Slot: {} | Incremental Snapshot Slot: {}",
342                    snapshot_slot_info
343                        .as_ref()
344                        .map(|snapshot_slot_info| snapshot_slot_info.full.to_string())
345                        .unwrap_or_else(|| '-'.to_string()),
346                    snapshot_slot_info
347                        .as_ref()
348                        .and_then(|snapshot_slot_info| snapshot_slot_info
349                            .incremental
350                            .map(|incremental| incremental.to_string()))
351                        .unwrap_or_else(|| '-'.to_string()),
352                )
353            },
354            delinquent_stake_percentage * 100.,
355            status
356        ));
357        std::thread::sleep(sleep_interval);
358    }
359    drop(progress_bar);
360    println!("{}", style("Ready to restart").green());
361    Ok(())
362}
363
#[cfg(test)]
mod tests {
    use {super::*, crate::commands::tests::verify_args_struct_by_command, std::str::FromStr};

    /// Test-only `Default`: the values the command yields when no flags are passed.
    impl Default for WaitForRestartWindowArgs {
        fn default() -> Self {
            let min_idle_time = DEFAULT_MIN_IDLE_TIME
                .parse::<usize>()
                .expect("invalid DEFAULT_MIN_IDLE_TIME");
            let max_delinquent_stake = DEFAULT_MAX_DELINQUENT_STAKE
                .parse::<u8>()
                .expect("invalid DEFAULT_MAX_DELINQUENT_STAKE");

            Self {
                min_idle_time,
                identity: None,
                max_delinquent_stake,
                skip_new_snapshot_check: false,
                skip_health_check: false,
            }
        }
    }

    // No flags: every field takes its default.
    #[test]
    fn verify_args_struct_by_command_wait_for_restart_window_default() {
        let expected = WaitForRestartWindowArgs::default();
        verify_args_struct_by_command(command(), vec![COMMAND], expected);
    }

    // `--skip-new-snapshot-check` flips only its own flag.
    #[test]
    fn verify_args_struct_by_command_wait_for_restart_window_skip_new_snapshot_check() {
        let expected = WaitForRestartWindowArgs {
            skip_new_snapshot_check: true,
            ..WaitForRestartWindowArgs::default()
        };
        verify_args_struct_by_command(
            command(),
            vec![COMMAND, "--skip-new-snapshot-check"],
            expected,
        );
    }

    // `--skip-health-check` flips only its own flag.
    #[test]
    fn verify_args_struct_by_command_wait_for_restart_window_skip_health_check() {
        let expected = WaitForRestartWindowArgs {
            skip_health_check: true,
            ..WaitForRestartWindowArgs::default()
        };
        verify_args_struct_by_command(command(), vec![COMMAND, "--skip-health-check"], expected);
    }

    // `--min-idle-time` overrides the default idle time.
    #[test]
    fn verify_args_struct_by_command_wait_for_restart_window_min_idle_time() {
        let expected = WaitForRestartWindowArgs {
            min_idle_time: 60,
            ..WaitForRestartWindowArgs::default()
        };
        verify_args_struct_by_command(
            command(),
            vec![COMMAND, "--min-idle-time", "60"],
            expected,
        );
    }

    // `--identity` accepts a base58 address and stores it as a `Pubkey`.
    #[test]
    fn verify_args_struct_by_command_wait_for_restart_window_identity() {
        let address = "ch1do11111111111111111111111111111111111111";
        let expected = WaitForRestartWindowArgs {
            identity: Some(Pubkey::from_str(address).unwrap()),
            ..WaitForRestartWindowArgs::default()
        };
        verify_args_struct_by_command(command(), vec![COMMAND, "--identity", address], expected);
    }

    // `--max-delinquent-stake` overrides the default percentage.
    #[test]
    fn verify_args_struct_by_command_wait_for_restart_window_max_delinquent_stake() {
        let expected = WaitForRestartWindowArgs {
            max_delinquent_stake: 10,
            ..WaitForRestartWindowArgs::default()
        };
        verify_args_struct_by_command(
            command(),
            vec![COMMAND, "--max-delinquent-stake", "10"],
            expected,
        );
    }
}