agave_validator/commands/exit/
mod.rs

1#[cfg(target_os = "linux")]
2use std::{io, thread, time::Duration};
3use {
4    crate::{
5        admin_rpc_service,
6        commands::{monitor, wait_for_restart_window, Error, FromClapArgMatches, Result},
7    },
8    clap::{value_t_or_exit, App, Arg, ArgMatches, SubCommand},
9    solana_clap_utils::input_validators::{is_parsable, is_valid_percentage},
10    std::path::Path,
11};
12
/// Name under which this subcommand is registered with clap.
const COMMAND: &str = "exit";

/// Default value of `--min-idle-time` (the flag's value name is `MINUTES`).
const DEFAULT_MIN_IDLE_TIME: &str = "10";
/// Default value of `--max-delinquent-stake` (the flag's value name is `PERCENT`).
const DEFAULT_MAX_DELINQUENT_STAKE: &str = "5";
17
/// What the `exit` command does once the exit request has been sent.
#[derive(Clone, Debug, PartialEq)]
pub enum PostExitAction {
    /// Run the agave-validator monitor command indefinitely.
    Monitor,
    /// Block until the exiting validator process has terminated.
    Wait,
}
25
26#[derive(Debug, PartialEq)]
27pub struct ExitArgs {
28    pub force: bool,
29    pub post_exit_action: Option<PostExitAction>,
30    pub min_idle_time: usize,
31    pub max_delinquent_stake: u8,
32    pub skip_new_snapshot_check: bool,
33    pub skip_health_check: bool,
34}
35
36impl FromClapArgMatches for ExitArgs {
37    fn from_clap_arg_match(matches: &ArgMatches) -> Result<Self> {
38        let post_exit_action = if matches.is_present("monitor") {
39            Some(PostExitAction::Monitor)
40        } else if matches.is_present("wait_for_exit") {
41            Some(PostExitAction::Wait)
42        } else {
43            None
44        };
45
46        Ok(ExitArgs {
47            force: matches.is_present("force"),
48            post_exit_action,
49            min_idle_time: value_t_or_exit!(matches, "min_idle_time", usize),
50            max_delinquent_stake: value_t_or_exit!(matches, "max_delinquent_stake", u8),
51            skip_new_snapshot_check: matches.is_present("skip_new_snapshot_check"),
52            skip_health_check: matches.is_present("skip_health_check"),
53        })
54    }
55}
56
57pub fn command<'a>() -> App<'a, 'a> {
58    SubCommand::with_name(COMMAND)
59        .about("Send an exit request to the validator")
60        .arg(
61            Arg::with_name("force")
62                .short("f")
63                .long("force")
64                .takes_value(false)
65                .help(
66                    "Request the validator exit immediately instead of waiting for a restart window",
67                ),
68        )
69        .arg(
70            Arg::with_name("monitor")
71                .short("m")
72                .long("monitor")
73                .takes_value(false)
74                .help("Monitor the validator after sending the exit request"),
75        )
76        .arg(
77            Arg::with_name("wait_for_exit")
78                .long("wait-for-exit")
79                .conflicts_with("monitor")
80                .help("Wait for the validator to terminate after sending the exit request"),
81        )
82        .arg(
83            Arg::with_name("min_idle_time")
84                .long("min-idle-time")
85                .takes_value(true)
86                .validator(is_parsable::<usize>)
87                .value_name("MINUTES")
88                .default_value(DEFAULT_MIN_IDLE_TIME)
89                .help(
90                    "Minimum time that the validator should not be leader before restarting",
91                ),
92        )
93        .arg(
94            Arg::with_name("max_delinquent_stake")
95                .long("max-delinquent-stake")
96                .takes_value(true)
97                .validator(is_valid_percentage)
98                .default_value(DEFAULT_MAX_DELINQUENT_STAKE)
99                .value_name("PERCENT")
100                .help("The maximum delinquent stake % permitted for an exit"),
101        )
102        .arg(
103            Arg::with_name("skip_new_snapshot_check")
104                .long("skip-new-snapshot-check")
105                .help("Skip check for a new snapshot"),
106        )
107        .arg(
108            Arg::with_name("skip_health_check")
109                .long("skip-health-check")
110                .help("Skip health check"),
111        )
112}
113
114pub fn execute(matches: &ArgMatches, ledger_path: &Path) -> Result<()> {
115    let exit_args = ExitArgs::from_clap_arg_match(matches)?;
116
117    if !exit_args.force {
118        wait_for_restart_window::wait_for_restart_window(
119            ledger_path,
120            None,
121            exit_args.min_idle_time,
122            exit_args.max_delinquent_stake,
123            exit_args.skip_new_snapshot_check,
124            exit_args.skip_health_check,
125        )?;
126    }
127
128    // Grab the pid from the process before initiating exit as the running
129    // validator will be unable to respond after exit has returned.
130    //
131    // Additionally, only check the pid() RPC call result if it will be used.
132    // In an upgrade scenario, it is possible that a binary that calls pid()
133    // will be initating exit against a process that doesn't support pid().
134    // Since PostExitAction::Wait case is opt-in (via --wait-for-exit), the
135    // result is checked ONLY in that case to provide a friendlier upgrade
136    // path for users who are NOT using --wait-for-exit
137    const WAIT_FOR_EXIT_UNSUPPORTED_ERROR: &str =
138        "remote process exit cannot be waited on. `--wait-for-exit` is not supported by the remote process";
139    let post_exit_action = exit_args.post_exit_action.clone();
140    let validator_pid = admin_rpc_service::runtime().block_on(async move {
141        let admin_client = admin_rpc_service::connect(ledger_path).await?;
142        let validator_pid = match post_exit_action {
143            Some(PostExitAction::Wait) => admin_client
144                .pid()
145                .await
146                .map_err(|_err| Error::Dynamic(WAIT_FOR_EXIT_UNSUPPORTED_ERROR.into()))?,
147            _ => 0,
148        };
149        admin_client.exit().await?;
150
151        Ok::<u32, Error>(validator_pid)
152    })?;
153
154    println!("Exit request sent");
155
156    match exit_args.post_exit_action {
157        None => Ok(()),
158        Some(PostExitAction::Monitor) => monitor::execute(matches, ledger_path),
159        Some(PostExitAction::Wait) => poll_until_pid_terminates(validator_pid),
160    }?;
161
162    Ok(())
163}
164
165#[cfg(target_os = "linux")]
166fn poll_until_pid_terminates(pid: u32) -> Result<()> {
167    let pid = i32::try_from(pid)?;
168
169    println!("Waiting for agave-validator process {pid} to terminate");
170    loop {
171        // From man kill(2)
172        //
173        // If sig is 0, then no signal is sent, but existence and permission
174        // checks are still performed; this can be used to check for the
175        // existence of a process ID or process group ID that the caller is
176        // permitted to signal.
177        let result = unsafe {
178            libc::kill(pid, /*sig:*/ 0)
179        };
180        if result >= 0 {
181            // Give the process some time to exit before checking again
182            thread::sleep(Duration::from_millis(500));
183        } else {
184            let errno = io::Error::last_os_error()
185                .raw_os_error()
186                .ok_or(Error::Dynamic("unable to read raw os error".into()))?;
187            match errno {
188                libc::ESRCH => {
189                    println!("Done, agave-validator process {pid} has terminated");
190                    break;
191                }
192                libc::EINVAL => {
193                    // An invalid signal was specified, we only pass sig=0 so
194                    // this should not be possible
195                    Err(Error::Dynamic(
196                        format!("unexpected invalid signal error for kill({pid}, 0)").into(),
197                    ))?;
198                }
199                libc::EPERM => {
200                    Err(io::Error::from(io::ErrorKind::PermissionDenied))?;
201                }
202                unknown => {
203                    Err(Error::Dynamic(
204                        format!("unexpected errno for kill({pid}, 0): {unknown}").into(),
205                    ))?;
206                }
207            }
208        }
209    }
210
211    Ok(())
212}
213
/// Non-Linux fallback: waiting for process termination is not implemented on
/// these platforms, so this always returns an error.
#[cfg(not(target_os = "linux"))]
fn poll_until_pid_terminates(_pid: u32) -> Result<()> {
    let reason = "Unable to wait for agave-validator process termination on this platform";
    Err(Error::Dynamic(reason.into()))
}
220
#[cfg(test)]
mod tests {
    use {super::*, crate::commands::tests::verify_args_struct_by_command};

    // Test-only default mirroring the clap defaults declared in `command()`.
    impl Default for ExitArgs {
        fn default() -> Self {
            let min_idle_time = DEFAULT_MIN_IDLE_TIME
                .parse()
                .expect("invalid DEFAULT_MIN_IDLE_TIME");
            let max_delinquent_stake = DEFAULT_MAX_DELINQUENT_STAKE
                .parse()
                .expect("invalid DEFAULT_MAX_DELINQUENT_STAKE");

            Self {
                force: false,
                post_exit_action: None,
                min_idle_time,
                max_delinquent_stake,
                skip_new_snapshot_check: false,
                skip_health_check: false,
            }
        }
    }

    /// Parses `COMMAND` followed by `extra_args` and checks the resulting
    /// `ExitArgs` against `expected`.
    fn expect_parsed(extra_args: &[&'static str], expected: ExitArgs) {
        let mut argv = vec![COMMAND];
        argv.extend_from_slice(extra_args);
        verify_args_struct_by_command(command(), argv, expected);
    }

    #[test]
    fn verify_args_struct_by_command_exit_default() {
        expect_parsed(&[], ExitArgs::default());
    }

    #[test]
    fn verify_args_struct_by_command_exit_with_force() {
        let expected = ExitArgs {
            force: true,
            ..ExitArgs::default()
        };
        expect_parsed(&["--force"], expected);
    }

    #[test]
    fn verify_args_struct_by_command_exit_with_post_exit_action() {
        let with_monitor = ExitArgs {
            post_exit_action: Some(PostExitAction::Monitor),
            ..ExitArgs::default()
        };
        expect_parsed(&["--monitor"], with_monitor);

        let with_wait = ExitArgs {
            post_exit_action: Some(PostExitAction::Wait),
            ..ExitArgs::default()
        };
        expect_parsed(&["--wait-for-exit"], with_wait);
    }

    #[test]
    fn verify_args_struct_by_command_exit_with_min_idle_time() {
        let expected = ExitArgs {
            min_idle_time: 60,
            ..ExitArgs::default()
        };
        expect_parsed(&["--min-idle-time", "60"], expected);
    }

    #[test]
    fn verify_args_struct_by_command_exit_with_max_delinquent_stake() {
        let expected = ExitArgs {
            max_delinquent_stake: 10,
            ..ExitArgs::default()
        };
        expect_parsed(&["--max-delinquent-stake", "10"], expected);
    }

    #[test]
    fn verify_args_struct_by_command_exit_with_skip_new_snapshot_check() {
        let expected = ExitArgs {
            skip_new_snapshot_check: true,
            ..ExitArgs::default()
        };
        expect_parsed(&["--skip-new-snapshot-check"], expected);
    }

    #[test]
    fn verify_args_struct_by_command_exit_with_skip_health_check() {
        let expected = ExitArgs {
            skip_health_check: true,
            ..ExitArgs::default()
        };
        expect_parsed(&["--skip-health-check"], expected);
    }
}