agave_validator/commands/exit/
mod.rs

1#[cfg(target_os = "linux")]
2use std::{io, thread, time::Duration};
3use {
4    crate::{
5        admin_rpc_service,
6        commands::{monitor, wait_for_restart_window, Error, FromClapArgMatches, Result},
7    },
8    clap::{value_t_or_exit, App, Arg, ArgMatches, SubCommand},
9    solana_clap_utils::{
10        hidden_unless_forced,
11        input_validators::{is_parsable, is_valid_percentage},
12    },
13    std::path::Path,
14};
15
/// Name under which this subcommand is registered with clap.
const COMMAND: &str = "exit";

/// Default for `--min-idle-time` (value name `MINUTES`). Kept as a string
/// because it is handed to clap as the argument's default value.
const DEFAULT_MIN_IDLE_TIME: &str = "10";
/// Default for `--max-delinquent-stake` (value name `PERCENT`). Kept as a
/// string because it is handed to clap as the argument's default value.
const DEFAULT_MAX_DELINQUENT_STAKE: &str = "5";
20
/// What the `exit` command does once the exit request has been sent to the
/// validator.
#[derive(Clone, Debug, PartialEq)]
pub enum PostExitAction {
    /// Run the agave-validator monitor command indefinitely
    Monitor,
    /// Block until the exiting validator process has terminated
    Wait,
}
28
/// Parsed command-line arguments of the `exit` subcommand.
#[derive(Debug, PartialEq)]
pub struct ExitArgs {
    /// When set, the exit is requested immediately instead of first waiting
    /// for a restart window (`--force`).
    pub force: bool,
    /// Follow-up performed after the exit request was sent; `None` means the
    /// command returns right away.
    pub post_exit_action: Option<PostExitAction>,
    /// Value of `--min-idle-time` (minutes); forwarded to the restart-window
    /// wait.
    pub min_idle_time: usize,
    /// Value of `--max-delinquent-stake` (percent); forwarded to the
    /// restart-window wait.
    pub max_delinquent_stake: u8,
    /// Set by `--skip-new-snapshot-check`; forwarded to the restart-window
    /// wait.
    pub skip_new_snapshot_check: bool,
    /// Set by `--skip-health-check`; forwarded to the restart-window wait.
    pub skip_health_check: bool,
}
38
39impl FromClapArgMatches for ExitArgs {
40    fn from_clap_arg_match(matches: &ArgMatches) -> Result<Self> {
41        let post_exit_action = if matches.is_present("monitor") {
42            Some(PostExitAction::Monitor)
43        } else if matches.is_present("no_wait_for_exit") {
44            None
45        } else {
46            Some(PostExitAction::Wait)
47        };
48
49        if matches.is_present("wait_for_exit") {
50            eprintln!(
51                "WARN: The --wait-for-exit flag has been deprecated, waiting for exit is now the \
52                 default behavior"
53            );
54        }
55
56        Ok(ExitArgs {
57            force: matches.is_present("force"),
58            post_exit_action,
59            min_idle_time: value_t_or_exit!(matches, "min_idle_time", usize),
60            max_delinquent_stake: value_t_or_exit!(matches, "max_delinquent_stake", u8),
61            skip_new_snapshot_check: matches.is_present("skip_new_snapshot_check"),
62            skip_health_check: matches.is_present("skip_health_check"),
63        })
64    }
65}
66
67pub fn command<'a>() -> App<'a, 'a> {
68    SubCommand::with_name(COMMAND)
69        .about("Send an exit request to the validator")
70        .arg(
71            Arg::with_name("force")
72                .short("f")
73                .long("force")
74                .takes_value(false)
75                .help(
76                    "Request the validator exit immediately instead of waiting for a restart \
77                     window",
78                ),
79        )
80        .arg(
81            Arg::with_name("monitor")
82                .short("m")
83                .long("monitor")
84                .takes_value(false)
85                .requires("no_wait_for_exit")
86                .help("Monitor the validator after sending the exit request"),
87        )
88        .arg(
89            Arg::with_name("wait_for_exit")
90                .long("wait-for-exit")
91                .conflicts_with("monitor")
92                .hidden(hidden_unless_forced())
93                .help("Wait for the validator to terminate after sending the exit request"),
94        )
95        .arg(
96            Arg::with_name("no_wait_for_exit")
97                .long("no-wait-for-exit")
98                .takes_value(false)
99                .conflicts_with("wait_for_exit")
100                .help("Do not wait for the validator to terminate after sending the exit request"),
101        )
102        .arg(
103            Arg::with_name("min_idle_time")
104                .long("min-idle-time")
105                .takes_value(true)
106                .validator(is_parsable::<usize>)
107                .value_name("MINUTES")
108                .default_value(DEFAULT_MIN_IDLE_TIME)
109                .help("Minimum time that the validator should not be leader before restarting"),
110        )
111        .arg(
112            Arg::with_name("max_delinquent_stake")
113                .long("max-delinquent-stake")
114                .takes_value(true)
115                .validator(is_valid_percentage)
116                .default_value(DEFAULT_MAX_DELINQUENT_STAKE)
117                .value_name("PERCENT")
118                .help("The maximum delinquent stake % permitted for an exit"),
119        )
120        .arg(
121            Arg::with_name("skip_new_snapshot_check")
122                .long("skip-new-snapshot-check")
123                .help("Skip check for a new snapshot"),
124        )
125        .arg(
126            Arg::with_name("skip_health_check")
127                .long("skip-health-check")
128                .help("Skip health check"),
129        )
130}
131
132pub fn execute(matches: &ArgMatches, ledger_path: &Path) -> Result<()> {
133    let exit_args = ExitArgs::from_clap_arg_match(matches)?;
134
135    if !exit_args.force {
136        wait_for_restart_window::wait_for_restart_window(
137            ledger_path,
138            None,
139            exit_args.min_idle_time,
140            exit_args.max_delinquent_stake,
141            exit_args.skip_new_snapshot_check,
142            exit_args.skip_health_check,
143        )?;
144    }
145
146    // Grab the pid from the process before initiating exit as the running
147    // validator will be unable to respond after exit has returned.
148    //
149    // Additionally, only check the pid() RPC call result if it will be used.
150    // In an upgrade scenario, it is possible that a binary that calls pid()
151    // will be initating exit against a process that doesn't support pid().
152    const WAIT_FOR_EXIT_UNSUPPORTED_ERROR: &str = "remote process exit cannot be waited on. \
153                                                   `--wait-for-exit` is not supported by the \
154                                                   remote process";
155    let post_exit_action = exit_args.post_exit_action.clone();
156    let validator_pid = admin_rpc_service::runtime().block_on(async move {
157        let admin_client = admin_rpc_service::connect(ledger_path).await?;
158        let validator_pid = match post_exit_action {
159            Some(PostExitAction::Wait) => admin_client
160                .pid()
161                .await
162                .map_err(|_err| Error::Dynamic(WAIT_FOR_EXIT_UNSUPPORTED_ERROR.into()))?,
163            _ => 0,
164        };
165        admin_client.exit().await?;
166
167        Ok::<u32, Error>(validator_pid)
168    })?;
169
170    println!("Exit request sent");
171
172    match exit_args.post_exit_action {
173        None => Ok(()),
174        Some(PostExitAction::Monitor) => monitor::execute(matches, ledger_path),
175        Some(PostExitAction::Wait) => poll_until_pid_terminates(validator_pid),
176    }?;
177
178    Ok(())
179}
180
181#[cfg(target_os = "linux")]
182fn poll_until_pid_terminates(pid: u32) -> Result<()> {
183    let pid = i32::try_from(pid)?;
184
185    println!("Waiting for agave-validator process {pid} to terminate");
186    loop {
187        // From man kill(2)
188        //
189        // If sig is 0, then no signal is sent, but existence and permission
190        // checks are still performed; this can be used to check for the
191        // existence of a process ID or process group ID that the caller is
192        // permitted to signal.
193        let result = unsafe {
194            libc::kill(pid, /*sig:*/ 0)
195        };
196        if result >= 0 {
197            // Give the process some time to exit before checking again
198            thread::sleep(Duration::from_millis(500));
199        } else {
200            let errno = io::Error::last_os_error()
201                .raw_os_error()
202                .ok_or(Error::Dynamic("unable to read raw os error".into()))?;
203            match errno {
204                libc::ESRCH => {
205                    println!("Done, agave-validator process {pid} has terminated");
206                    break;
207                }
208                libc::EINVAL => {
209                    // An invalid signal was specified, we only pass sig=0 so
210                    // this should not be possible
211                    Err(Error::Dynamic(
212                        format!("unexpected invalid signal error for kill({pid}, 0)").into(),
213                    ))?;
214                }
215                libc::EPERM => {
216                    Err(io::Error::from(io::ErrorKind::PermissionDenied))?;
217                }
218                unknown => {
219                    Err(Error::Dynamic(
220                        format!("unexpected errno for kill({pid}, 0): {unknown}").into(),
221                    ))?;
222                }
223            }
224        }
225    }
226
227    Ok(())
228}
229
/// Stand-in used on every platform other than Linux: waiting for the
/// validator process is not implemented there, so this always returns an
/// error and ignores `_pid`.
#[cfg(not(target_os = "linux"))]
fn poll_until_pid_terminates(_pid: u32) -> Result<()> {
    let reason = "Unable to wait for agave-validator process termination on this platform";
    Err(Error::Dynamic(reason.into()))
}
236
#[cfg(test)]
mod tests {
    use {super::*, crate::commands::tests::verify_args_struct_by_command};

    /// Test-only baseline: the values expected when `exit` is invoked
    /// without any flags.
    impl Default for ExitArgs {
        fn default() -> Self {
            let min_idle_time = DEFAULT_MIN_IDLE_TIME
                .parse::<usize>()
                .expect("invalid DEFAULT_MIN_IDLE_TIME");
            let max_delinquent_stake = DEFAULT_MAX_DELINQUENT_STAKE
                .parse::<u8>()
                .expect("invalid DEFAULT_MAX_DELINQUENT_STAKE");

            Self {
                force: false,
                post_exit_action: Some(PostExitAction::Wait),
                min_idle_time,
                max_delinquent_stake,
                skip_new_snapshot_check: false,
                skip_health_check: false,
            }
        }
    }

    #[test]
    fn verify_args_struct_by_command_exit_default() {
        let expected = ExitArgs::default();
        verify_args_struct_by_command(command(), vec![COMMAND], expected);
    }

    #[test]
    fn verify_args_struct_by_command_exit_with_force() {
        let expected = ExitArgs {
            force: true,
            ..ExitArgs::default()
        };
        verify_args_struct_by_command(command(), vec![COMMAND, "--force"], expected);
    }

    #[test]
    fn verify_args_struct_by_command_exit_with_post_exit_action() {
        // Command line paired with the post-exit action it must select.
        let cases = [
            (
                vec![COMMAND, "--monitor", "--no-wait-for-exit"],
                Some(PostExitAction::Monitor),
            ),
            (vec![COMMAND, "--no-wait-for-exit"], None),
            (vec![COMMAND, "--wait-for-exit"], Some(PostExitAction::Wait)),
        ];

        for (args, post_exit_action) in cases {
            let expected = ExitArgs {
                post_exit_action,
                ..ExitArgs::default()
            };
            verify_args_struct_by_command(command(), args, expected);
        }
    }

    #[test]
    fn verify_args_struct_by_command_exit_with_min_idle_time() {
        let expected = ExitArgs {
            min_idle_time: 60,
            ..ExitArgs::default()
        };
        verify_args_struct_by_command(
            command(),
            vec![COMMAND, "--min-idle-time", "60"],
            expected,
        );
    }

    #[test]
    fn verify_args_struct_by_command_exit_with_max_delinquent_stake() {
        let expected = ExitArgs {
            max_delinquent_stake: 10,
            ..ExitArgs::default()
        };
        verify_args_struct_by_command(
            command(),
            vec![COMMAND, "--max-delinquent-stake", "10"],
            expected,
        );
    }

    #[test]
    fn verify_args_struct_by_command_exit_with_skip_new_snapshot_check() {
        let expected = ExitArgs {
            skip_new_snapshot_check: true,
            ..ExitArgs::default()
        };
        verify_args_struct_by_command(
            command(),
            vec![COMMAND, "--skip-new-snapshot-check"],
            expected,
        );
    }

    #[test]
    fn verify_args_struct_by_command_exit_with_skip_health_check() {
        let expected = ExitArgs {
            skip_health_check: true,
            ..ExitArgs::default()
        };
        verify_args_struct_by_command(
            command(),
            vec![COMMAND, "--skip-health-check"],
            expected,
        );
    }
}