1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
// [[file:../remote.note::3a532d42][3a532d42]]
use super::*;
use gut::cli::*;
use gut::fs::*;

pub use gut::prelude::*;
// 3a532d42 ends here

// [[file:../remote.note::bdfa3d68][bdfa3d68]]
/// Default name of the lock file that carries the scheduler address; used as
/// the default value of the `-w` option in both `ClientCli` and `BootstrapCli`.
const GOSH_SCHEDULER_FILE: &str = "gosh-remote-scheduler.lock";

/// Read the scheduler address stored in `scheduler_address_file`.
///
/// `LockFile::wait` is called on the file with `timeout` first; afterwards the
/// file content is returned with surrounding whitespace trimmed.
///
/// NOTE(review): `timeout` is presumably given in seconds -- confirm against
/// `LockFile::wait`.
///
/// # Errors
///
/// Fails when `LockFile::wait` fails or when the file cannot be read.
fn read_scheduler_address_from_lock_file(scheduler_address_file: &Path, timeout: f64) -> Result<String> {
    debug!("reading scheduler address from file: {scheduler_address_file:?}");
    LockFile::wait(scheduler_address_file, timeout)?;

    let content = gut::fs::read_file(scheduler_address_file)?;
    let address = content.trim().to_string();
    Ok(address)
}
// bdfa3d68 ends here

// [[file:../remote.note::512e88e7][512e88e7]]
// use crate::remote::{Client, Server};

/// The client side for running program concurrently distributed over multiple
/// remote nodes
#[derive(StructOpt)]
struct ClientCli {
    /// The remote execution service address, e.g. localhost:3031
    // The clap 4 derive uses the field name verbatim as the argument id, so the
    // conflicting argument has to be referenced in snake_case. The former
    // kebab-case id pointed to a non-existent argument.
    #[structopt(long = "address", conflicts_with = "scheduler_address_file")]
    scheduler_address: Option<String>,

    /// The scheduler address to be read from file `scheduler_address_file`
    // Only consulted when `scheduler_address` is not given on the command line.
    #[structopt(short = 'w', default_value = GOSH_SCHEDULER_FILE)]
    scheduler_address_file: PathBuf,

    // The request to send to the scheduler.
    #[clap(subcommand)]
    action: ClientAction,
}

// The requests a client can make; dispatched in `ClientCli::enter_main`.
#[derive(Subcommand)]
enum ClientAction {
    // Ask the server to run a command; the arguments are declared by `ClientRun`.
    Run(ClientRun),
    /// Request server to add a new node for remote computation.
    AddNode {
        /// The node to be added into node list for remote computation.
        node: String,
    },
}

// Arguments of the `ClientAction::Run` sub command.
#[derive(StructOpt)]
/// request server to run a cmd
struct ClientRun {
    /// The cmd to run in remote session
    cmd: String,

    /// The working dir to run the cmd
    // Canonicalized in `ClientCli::enter_main` before it is passed to
    // `Client::run_cmd`.
    #[structopt(long, default_value = ".")]
    wrk_dir: PathBuf,
}

impl ClientCli {
    async fn enter_main(self) -> Result<()> {
        use crate::client::Client;
        let scheduler_address = if let Some(a) = self.scheduler_address {
            a
        } else {
            read_scheduler_address_from_lock_file(&self.scheduler_address_file, 2.0)?
        };

        let client = Client::connect(&scheduler_address);
        match self.action {
            ClientAction::Run(run) => {
                let wrk_dir = run.wrk_dir.canonicalize()?;
                let o = client.run_cmd(&run.cmd, &wrk_dir)?;
                println!("{o}");
            }
            ClientAction::AddNode { node } => {
                client.add_node(&node)?;
            }
        }

        Ok(())
    }
}
// 512e88e7 ends here

// [[file:../remote.note::674c2404][674c2404]]
use base::LockFile;
use server::Server;

// The role of a server process; `ServerCli::enter_main` uses it to select
// between `Server::serve_as_scheduler` and `Server::serve_as_worker`.
#[derive(Debug, Clone, ValueEnum)]
enum ServerMode {
    // Serve as the scheduler.
    AsScheduler,
    // Serve as a worker.
    AsWorker,
}

/// The server side for running program concurrently distributed over multiple remote nodes
#[derive(Parser, Debug)]
struct ServerCli {
    /// Bind on the address for providing remote execution service
    // Handed as-is to `Server::serve_as_scheduler` / `Server::serve_as_worker`.
    #[arg(long)]
    address: String,

    /// The server mode to start.
    // Declared without `long`/`short`, i.e. a positional `ServerMode` value.
    #[arg(value_enum)]
    mode: ServerMode,
}

impl ServerCli {
    /// Start the service selected by `self.mode` on `self.address`.
    ///
    /// A start-up line is printed to stdout before the matching
    /// `Server::serve_as_*` future is awaited.
    ///
    /// # Errors
    ///
    /// Propagates the error returned by `Server::serve_as_worker`.
    async fn enter_main(self) -> Result<()> {
        let address = &self.address;
        match self.mode {
            ServerMode::AsScheduler => {
                println!("Start scheduler service at {address:?}");
                // NOTE(review): unlike the worker branch, the outcome of this
                // call is not propagated with `?` -- confirm that
                // `serve_as_scheduler` does not return a `Result`.
                Server::serve_as_scheduler(address).await;
            }
            ServerMode::AsWorker => {
                println!("Start worker service at {address:?}");
                Server::serve_as_worker(address).await?;
            }
        }

        Ok(())
    }

    /// Run a scheduler service bound on `address`.
    async fn run_as_scheduler(address: String) -> Result<()> {
        let server = ServerCli {
            address,
            mode: ServerMode::AsScheduler,
        };
        server.enter_main().await
    }

    /// Run a worker service bound on `address`.
    async fn run_as_worker(address: String) -> Result<()> {
        let server = ServerCli {
            address,
            mode: ServerMode::AsWorker,
        };
        server.enter_main().await
    }
}
// 674c2404 ends here

// [[file:../remote.note::001e63a1][001e63a1]]
/// Start scheduler and worker services automatically when run in MPI
/// environment (to be called with mpirun command)
#[derive(Parser)]
struct BootstrapCli {
    /// The scheduler address will be written into `address_file`
    // The scheduler creates this file; workers read the address back from it.
    #[arg(short = 'w', default_value = GOSH_SCHEDULER_FILE)]
    address_file: PathBuf,

    /// The timeout for a worker waiting for the scheduler address file
    // NOTE(review): the unit is presumably seconds -- confirm against
    // `LockFile::wait`, which finally receives this value.
    #[arg(long, default_value = "2.0")]
    timeout: f64,

    /// The server mode to start.
    #[arg(value_enum)]
    mode: ServerMode,
}

impl BootstrapCli {
    /// Start a scheduler or a worker on a free local TCP address.
    ///
    /// * Scheduler: creates the lock file `address_file` from the chosen
    ///   address, then serves as scheduler.
    /// * Worker: reads the scheduler address from `address_file` (waiting
    ///   with `timeout`), asks the scheduler to add this worker's address as
    ///   a node, then serves as worker.
    ///
    /// # Errors
    ///
    /// Fails when the lock file cannot be created or read, when the worker
    /// cannot be registered, or when the started service returns an error.
    async fn enter_main(&self) -> Result<()> {
        let node = hostname();
        let address = default_server_address();
        // Borrowing is sufficient here; the path is only read.
        let address_file = &self.address_file;
        match self.mode {
            ServerMode::AsScheduler => {
                info!("install scheduler on {node}");
                // The guard is kept alive for as long as the scheduler runs.
                // NOTE(review): presumably the lock file is removed when the
                // guard is dropped -- confirm against `LockFile`.
                let _lock = LockFile::new(address_file, &address)?;
                ServerCli::run_as_scheduler(address).await?;
            }
            ServerMode::AsWorker => {
                info!("install worker on {node}");
                let o = read_scheduler_address_from_lock_file(address_file, self.timeout)?;
                // tell the scheduler add this worker
                crate::client::Client::connect(o).add_node(&address)?;
                ServerCli::run_as_worker(address).await?;
            }
        }
        Ok(())
    }
}

/// Return a free TCP address obtained from `get_free_tcp_address`, formatted
/// as a string.
///
/// # Panics
///
/// Panics when no address could be obtained, or when the obtained address is
/// an IPv6 address.
fn default_server_address() -> String {
    use std::net::SocketAddr;

    let free_address = get_free_tcp_address().expect("tcp address");
    if let SocketAddr::V4(v4) = free_address {
        v4.to_string()
    } else {
        panic!("IPV6 is not supported")
    }
}
// 001e63a1 ends here

// [[file:../remote.note::5f9971ad][5f9971ad]]
/// A helper program for running program concurrently distributed over multiple
/// remote nodes
#[derive(Parser)]
#[clap(author, version, about)]
struct Cli {
    #[structopt(flatten)]
    verbose: gut::cli::Verbosity,

    #[clap(subcommand)]
    command: RemoteCommand,
}

// Top-level sub commands; each variant wraps the argument struct whose
// `enter_main` is invoked by `remote_enter_main`.
#[derive(Subcommand)]
enum RemoteCommand {
    // Send a request to a running scheduler.
    Client(ClientCli),
    // Start a scheduler or worker service on an explicit address.
    Server(ServerCli),
    // Start a scheduler or worker service on an automatically chosen address.
    Bootstrap(BootstrapCli),
}

/// Entry point of the remote execution command line interface.
///
/// Parses the command line into `Cli`, sets up the logger from the verbosity
/// options, and runs the selected sub command.
///
/// # Errors
///
/// Returns the error produced by the selected sub command.
pub async fn remote_enter_main() -> Result<()> {
    let args = Cli::parse();
    args.verbose.setup_logger();

    match args.command {
        RemoteCommand::Client(client) => client.enter_main().await,
        RemoteCommand::Server(server) => {
            debug!("Start server for remote execution ...");
            server.enter_main().await
        }
        RemoteCommand::Bootstrap(bootstrap) => bootstrap.enter_main().await,
    }
}
// 5f9971ad ends here