agave_validator/commands/wait_for_restart_window/
mod.rs1use {
2 crate::{
3 admin_rpc_service,
4 commands::{FromClapArgMatches, Result},
5 new_spinner_progress_bar, println_name_value,
6 },
7 clap::{value_t_or_exit, App, Arg, ArgMatches, SubCommand},
8 console::style,
9 solana_clap_utils::{
10 input_parsers::pubkey_of,
11 input_validators::{is_parsable, is_pubkey_or_keypair, is_valid_percentage},
12 },
13 solana_clock::{Slot, DEFAULT_S_PER_SLOT},
14 solana_commitment_config::CommitmentConfig,
15 solana_pubkey::Pubkey,
16 solana_rpc_client::rpc_client::RpcClient,
17 solana_rpc_client_api::config::RpcLeaderScheduleConfig,
18 std::{
19 collections::VecDeque,
20 path::Path,
21 time::{Duration, SystemTime},
22 },
23};
24
/// Name of the subcommand as typed on the command line.
const COMMAND: &str = "wait-for-restart-window";

/// Default for `--min-idle-time` (the argument's value name is MINUTES).
const DEFAULT_MIN_IDLE_TIME: &str = "10";
/// Default for `--max-delinquent-stake` (the argument's value name is PERCENT).
const DEFAULT_MAX_DELINQUENT_STAKE: &str = "5";
29
30#[derive(Debug, PartialEq)]
31pub struct WaitForRestartWindowArgs {
32 pub min_idle_time: usize,
33 pub identity: Option<Pubkey>,
34 pub max_delinquent_stake: u8,
35 pub skip_new_snapshot_check: bool,
36 pub skip_health_check: bool,
37}
38
39impl FromClapArgMatches for WaitForRestartWindowArgs {
40 fn from_clap_arg_match(matches: &ArgMatches) -> Result<Self> {
41 Ok(WaitForRestartWindowArgs {
42 min_idle_time: value_t_or_exit!(matches, "min_idle_time", usize),
43 identity: pubkey_of(matches, "identity"),
44 max_delinquent_stake: value_t_or_exit!(matches, "max_delinquent_stake", u8),
45 skip_new_snapshot_check: matches.is_present("skip_new_snapshot_check"),
46 skip_health_check: matches.is_present("skip_health_check"),
47 })
48 }
49}
50
51pub(crate) fn command<'a>() -> App<'a, 'a> {
52 SubCommand::with_name(COMMAND)
53 .about("Monitor the validator for a good time to restart")
54 .arg(
55 Arg::with_name("min_idle_time")
56 .long("min-idle-time")
57 .takes_value(true)
58 .validator(is_parsable::<usize>)
59 .value_name("MINUTES")
60 .default_value(DEFAULT_MIN_IDLE_TIME)
61 .help(
62 "Minimum time that the validator should not be leader before restarting",
63 ),
64 )
65 .arg(
66 Arg::with_name("identity")
67 .long("identity")
68 .value_name("ADDRESS")
69 .takes_value(true)
70 .validator(is_pubkey_or_keypair)
71 .help("Validator identity to monitor [default: your validator]"),
72 )
73 .arg(
74 Arg::with_name("max_delinquent_stake")
75 .long("max-delinquent-stake")
76 .takes_value(true)
77 .validator(is_valid_percentage)
78 .value_name("PERCENT")
79 .default_value(DEFAULT_MAX_DELINQUENT_STAKE)
80 .help("The maximum delinquent stake % permitted for a restart"),
81 )
82 .arg(
83 Arg::with_name("skip_new_snapshot_check")
84 .long("skip-new-snapshot-check")
85 .help("Skip check for a new snapshot"),
86 )
87 .arg(
88 Arg::with_name("skip_health_check")
89 .long("skip-health-check")
90 .help("Skip health check"),
91 )
92 .after_help(
93 "Note: If this command exits with a non-zero status then this not a good time for a restart",
94 )
95}
96
97pub fn execute(matches: &ArgMatches, ledger_path: &Path) -> Result<()> {
98 let wait_for_restart_window_args = WaitForRestartWindowArgs::from_clap_arg_match(matches)?;
99
100 wait_for_restart_window(
101 ledger_path,
102 wait_for_restart_window_args.identity,
103 wait_for_restart_window_args.min_idle_time,
104 wait_for_restart_window_args.max_delinquent_stake,
105 wait_for_restart_window_args.skip_new_snapshot_check,
106 wait_for_restart_window_args.skip_health_check,
107 )?;
108
109 Ok(())
110}
111
112pub fn wait_for_restart_window(
113 ledger_path: &Path,
114 identity: Option<Pubkey>,
115 min_idle_time_in_minutes: usize,
116 max_delinquency_percentage: u8,
117 skip_new_snapshot_check: bool,
118 skip_health_check: bool,
119) -> std::result::Result<(), Box<dyn std::error::Error>> {
120 let sleep_interval = Duration::from_secs(5);
121
122 let min_idle_slots = (min_idle_time_in_minutes as f64 * 60. / DEFAULT_S_PER_SLOT) as Slot;
123
124 let admin_client = admin_rpc_service::connect(ledger_path);
125 let rpc_addr = admin_rpc_service::runtime()
126 .block_on(async move { admin_client.await?.rpc_addr().await })
127 .map_err(|err| format!("validator RPC address request failed: {err}"))?
128 .ok_or("validator RPC is unavailable".to_string())?;
129 let rpc_client = RpcClient::new_socket(rpc_addr);
130
131 let my_identity = rpc_client.get_identity()?;
132 let identity = identity.unwrap_or(my_identity);
133 let monitoring_another_validator = identity != my_identity;
134 println_name_value("Identity:", &identity.to_string());
135 println_name_value(
136 "Minimum Idle Time:",
137 &format!("{min_idle_slots} slots (~{min_idle_time_in_minutes} minutes)"),
138 );
139
140 println!("Maximum permitted delinquency: {max_delinquency_percentage}%");
141
142 let mut current_epoch = None;
143 let mut leader_schedule = VecDeque::new();
144 let mut restart_snapshot = None;
145 let mut upcoming_idle_windows = vec![]; let progress_bar = new_spinner_progress_bar();
148 let monitor_start_time = SystemTime::now();
149
150 let mut seen_incremential_snapshot = false;
151 loop {
152 let snapshot_slot_info = rpc_client.get_highest_snapshot_slot().ok();
153 let snapshot_slot_info_has_incremential = snapshot_slot_info
154 .as_ref()
155 .map(|snapshot_slot_info| snapshot_slot_info.incremental.is_some())
156 .unwrap_or_default();
157 seen_incremential_snapshot |= snapshot_slot_info_has_incremential;
158
159 let epoch_info = rpc_client.get_epoch_info_with_commitment(CommitmentConfig::processed())?;
160 let healthy = skip_health_check || rpc_client.get_health().ok().is_some();
161 let delinquent_stake_percentage = {
162 let vote_accounts = rpc_client.get_vote_accounts()?;
163 let current_stake: u64 = vote_accounts
164 .current
165 .iter()
166 .map(|va| va.activated_stake)
167 .sum();
168 let delinquent_stake: u64 = vote_accounts
169 .delinquent
170 .iter()
171 .map(|va| va.activated_stake)
172 .sum();
173 let total_stake = current_stake + delinquent_stake;
174 delinquent_stake as f64 / total_stake as f64
175 };
176
177 if match current_epoch {
178 None => true,
179 Some(current_epoch) => current_epoch != epoch_info.epoch,
180 } {
181 progress_bar.set_message(format!(
182 "Fetching leader schedule for epoch {}...",
183 epoch_info.epoch
184 ));
185 let first_slot_in_epoch = epoch_info.absolute_slot - epoch_info.slot_index;
186 leader_schedule = rpc_client
187 .get_leader_schedule_with_config(
188 Some(first_slot_in_epoch),
189 RpcLeaderScheduleConfig {
190 identity: Some(identity.to_string()),
191 ..RpcLeaderScheduleConfig::default()
192 },
193 )?
194 .ok_or_else(|| {
195 format!("Unable to get leader schedule from slot {first_slot_in_epoch}")
196 })?
197 .get(&identity.to_string())
198 .cloned()
199 .unwrap_or_default()
200 .into_iter()
201 .map(|slot_index| first_slot_in_epoch.saturating_add(slot_index as u64))
202 .filter(|slot| *slot > epoch_info.absolute_slot)
203 .collect::<VecDeque<_>>();
204
205 upcoming_idle_windows.clear();
206 {
207 let mut leader_schedule = leader_schedule.clone();
208 let mut max_idle_window = 0;
209
210 let mut idle_window_start_slot = epoch_info.absolute_slot;
211 while let Some(next_leader_slot) = leader_schedule.pop_front() {
212 let idle_window = next_leader_slot - idle_window_start_slot;
213 max_idle_window = max_idle_window.max(idle_window);
214 if idle_window > min_idle_slots {
215 upcoming_idle_windows.push((idle_window_start_slot, idle_window));
216 }
217 idle_window_start_slot = next_leader_slot;
218 }
219 if !leader_schedule.is_empty() && upcoming_idle_windows.is_empty() {
220 return Err(format!(
221 "Validator has no idle window of at least {} slots. Largest idle window \
222 for epoch {} is {} slots",
223 min_idle_slots, epoch_info.epoch, max_idle_window
224 )
225 .into());
226 }
227 }
228
229 current_epoch = Some(epoch_info.epoch);
230 }
231
232 let status = {
233 if !healthy {
234 style("Node is unhealthy").red().to_string()
235 } else {
236 let in_leader_schedule_hole = if epoch_info.slot_index + min_idle_slots
238 > epoch_info.slots_in_epoch
239 {
240 Err("Current epoch is almost complete".to_string())
241 } else {
242 while leader_schedule
243 .front()
244 .map(|slot| *slot < epoch_info.absolute_slot)
245 .unwrap_or(false)
246 {
247 leader_schedule.pop_front();
248 }
249 while upcoming_idle_windows
250 .first()
251 .map(|(slot, _)| *slot < epoch_info.absolute_slot)
252 .unwrap_or(false)
253 {
254 upcoming_idle_windows.pop();
255 }
256
257 match leader_schedule.front() {
258 None => {
259 Ok(()) }
261 Some(next_leader_slot) => {
262 let idle_slots =
263 next_leader_slot.saturating_sub(epoch_info.absolute_slot);
264 if idle_slots >= min_idle_slots {
265 Ok(())
266 } else {
267 Err(match upcoming_idle_windows.first() {
268 Some((starting_slot, length_in_slots)) => {
269 format!(
270 "Next idle window in {} slots, for {} slots",
271 starting_slot.saturating_sub(epoch_info.absolute_slot),
272 length_in_slots
273 )
274 }
275 None => format!(
276 "Validator will be leader soon. Next leader slot is \
277 {next_leader_slot}"
278 ),
279 })
280 }
281 }
282 }
283 };
284
285 match in_leader_schedule_hole {
286 Ok(_) => {
287 if skip_new_snapshot_check {
288 break; }
290 let snapshot_slot = snapshot_slot_info.map(|snapshot_slot_info| {
291 snapshot_slot_info
292 .incremental
293 .unwrap_or(snapshot_slot_info.full)
294 });
295 if restart_snapshot.is_none() {
296 restart_snapshot = snapshot_slot;
297 }
298 if restart_snapshot == snapshot_slot && !monitoring_another_validator {
299 "Waiting for a new snapshot".to_string()
300 } else if delinquent_stake_percentage
301 >= (max_delinquency_percentage as f64 / 100.)
302 {
303 style("Delinquency too high").red().to_string()
304 } else if seen_incremential_snapshot && !snapshot_slot_info_has_incremential
305 {
306 "Waiting for incremental snapshot".to_string()
314 } else {
315 break; }
317 }
318 Err(why) => style(why).yellow().to_string(),
319 }
320 }
321 };
322
323 progress_bar.set_message(format!(
324 "{} | Processed Slot: {} {} | {:.2}% delinquent stake | {}",
325 {
326 let elapsed =
327 chrono::Duration::from_std(monitor_start_time.elapsed().unwrap()).unwrap();
328
329 format!(
330 "{:02}:{:02}:{:02}",
331 elapsed.num_hours(),
332 elapsed.num_minutes() % 60,
333 elapsed.num_seconds() % 60
334 )
335 },
336 epoch_info.absolute_slot,
337 if monitoring_another_validator {
338 "".to_string()
339 } else {
340 format!(
341 "| Full Snapshot Slot: {} | Incremental Snapshot Slot: {}",
342 snapshot_slot_info
343 .as_ref()
344 .map(|snapshot_slot_info| snapshot_slot_info.full.to_string())
345 .unwrap_or_else(|| '-'.to_string()),
346 snapshot_slot_info
347 .as_ref()
348 .and_then(|snapshot_slot_info| snapshot_slot_info
349 .incremental
350 .map(|incremental| incremental.to_string()))
351 .unwrap_or_else(|| '-'.to_string()),
352 )
353 },
354 delinquent_stake_percentage * 100.,
355 status
356 ));
357 std::thread::sleep(sleep_interval);
358 }
359 drop(progress_bar);
360 println!("{}", style("Ready to restart").green());
361 Ok(())
362}
363
#[cfg(test)]
mod tests {
    use {super::*, crate::commands::tests::verify_args_struct_by_command, std::str::FromStr};

    /// Test-only `Default`: mirrors what clap yields when no option is passed, i.e. the
    /// declared default values and every flag unset.
    impl Default for WaitForRestartWindowArgs {
        fn default() -> Self {
            let min_idle_time = DEFAULT_MIN_IDLE_TIME
                .parse()
                .expect("invalid DEFAULT_MIN_IDLE_TIME");
            let max_delinquent_stake = DEFAULT_MAX_DELINQUENT_STAKE
                .parse()
                .expect("invalid DEFAULT_MAX_DELINQUENT_STAKE");

            Self {
                min_idle_time,
                identity: None,
                max_delinquent_stake,
                skip_new_snapshot_check: false,
                skip_health_check: false,
            }
        }
    }

    /// A bare invocation parses to the defaults.
    #[test]
    fn verify_args_struct_by_command_wait_for_restart_window_default() {
        let expected = WaitForRestartWindowArgs::default();
        verify_args_struct_by_command(command(), vec![COMMAND], expected);
    }

    /// `--skip-new-snapshot-check` sets only its own flag.
    #[test]
    fn verify_args_struct_by_command_wait_for_restart_window_skip_new_snapshot_check() {
        let expected = WaitForRestartWindowArgs {
            skip_new_snapshot_check: true,
            ..Default::default()
        };
        verify_args_struct_by_command(
            command(),
            vec![COMMAND, "--skip-new-snapshot-check"],
            expected,
        );
    }

    /// `--skip-health-check` sets only its own flag.
    #[test]
    fn verify_args_struct_by_command_wait_for_restart_window_skip_health_check() {
        let expected = WaitForRestartWindowArgs {
            skip_health_check: true,
            ..Default::default()
        };
        verify_args_struct_by_command(command(), vec![COMMAND, "--skip-health-check"], expected);
    }

    /// `--min-idle-time` overrides the default idle time.
    #[test]
    fn verify_args_struct_by_command_wait_for_restart_window_min_idle_time() {
        let expected = WaitForRestartWindowArgs {
            min_idle_time: 60,
            ..Default::default()
        };
        verify_args_struct_by_command(
            command(),
            vec![COMMAND, "--min-idle-time", "60"],
            expected,
        );
    }

    /// `--identity` is parsed into a `Pubkey`.
    #[test]
    fn verify_args_struct_by_command_wait_for_restart_window_identity() {
        let address = "ch1do11111111111111111111111111111111111111";
        let expected = WaitForRestartWindowArgs {
            identity: Some(Pubkey::from_str(address).unwrap()),
            ..Default::default()
        };
        verify_args_struct_by_command(
            command(),
            vec![COMMAND, "--identity", address],
            expected,
        );
    }

    /// `--max-delinquent-stake` overrides the default percentage.
    #[test]
    fn verify_args_struct_by_command_wait_for_restart_window_max_delinquent_stake() {
        let expected = WaitForRestartWindowArgs {
            max_delinquent_stake: 10,
            ..Default::default()
        };
        verify_args_struct_by_command(
            command(),
            vec![COMMAND, "--max-delinquent-stake", "10"],
            expected,
        );
    }
}