fleche 6.19.2

Remote job runner for Slurm clusters
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
//! Command-line interface definition.
//!
//! This module defines the CLI structure using clap. All subcommands and their
//! arguments are defined here, with argument parsing handled by clap's derive macros.

use clap::{Parser, Subcommand};
use clap_complete::Shell;

/// Returns the multi-line version banner in the GNU style: version and
/// release date first, followed by the copyright, license, and warranty
/// notices.
///
/// Release checklist: bump the hard-coded date inside `BANNER` whenever a new
/// version is published.
fn long_version() -> &'static str {
    // Assembled entirely at compile time; `CARGO_PKG_VERSION` is provided by
    // Cargo while building the crate.
    const BANNER: &str = concat!(
        env!("CARGO_PKG_VERSION"),
        " (2026-04-08)\n\n", // release date, keep in step with the version
        "Copyright (C) 2026 Italo Silva\n",
        "License GPLv3+: GNU GPL version 3 or later <https://gnu.org/licenses/gpl.html>\n",
        "This is free software: you are free to change and redistribute it.\n",
        "There is NO WARRANTY, to the extent permitted by law."
    );
    BANNER
}

/// The main CLI structure for fleche.
// Top-level parser: three global options plus the selected subcommand.
// `version` with no value takes the version from the crate metadata, while
// `long_version` supplies the longer banner built by `long_version()`.
// The `///` comments in this file double as clap help text, so rewording them
// changes the `--help` output.
#[derive(Parser)]
#[command(name = "fleche")]
#[command(about = "Remote job runner for Slurm clusters")]
#[command(version, long_version = long_version())]
pub struct Cli {
    /// Run as if fleche was started in this directory
    // `global = true` lets the option be given after the subcommand as well as
    // before it. `None` when the flag is omitted.
    #[arg(short = 'C', long = "directory", global = true, value_name = "PATH")]
    pub directory: Option<std::path::PathBuf>,

    /// Enable verbose SSH output for debugging connection issues
    // Global boolean switch (`--debug`); false unless the flag is present.
    #[arg(long, global = true)]
    pub debug: bool,

    /// Output results as JSON (for scripting and AI agents)
    // Global boolean switch (`--json`); false unless the flag is present.
    #[arg(long, global = true)]
    pub json: bool,

    /// The subcommand to run.
    // Not optional, so the parser requires a subcommand on every invocation.
    #[command(subcommand)]
    pub command: Commands,
}

/// All available subcommands.
// Each variant becomes one subcommand. Variants wrapping a `*Args` struct keep
// their arguments in that struct (defined further down in this file); the
// others declare their arguments inline or take none at all.
// The `///` comments on variants and fields are consumed by clap as help text,
// so editing them changes the `--help` output.
#[derive(Subcommand)]
// NOTE(review): the lint is silenced rather than boxing the larger variants
// (e.g. `Run(RunArgs)`); presumably acceptable because only one `Commands`
// value is parsed per invocation. Confirm before adding more large variants.
#[allow(clippy::large_enum_variant)]
pub enum Commands {
    /// Run a job on the remote cluster via Slurm
    ///
    /// Syncs your project, submits to Slurm, and streams output.
    /// Use --bg to run in background without streaming.
    Run(RunArgs),

    /// Execute a command directly via SSH (no Slurm)
    ///
    /// Syncs your project and runs the command directly over SSH.
    /// Useful for quick tests or interactive work.
    Exec {
        /// Command to run (in quotes)
        // Required positional: a single string holding the whole command line.
        command: String,

        /// Set environment variable (repeatable)
        // Each `--env KEY=VALUE` occurrence is parsed by `parse_key_value` and
        // appended to the vector.
        #[arg(long = "env", value_parser = parse_key_value)]
        env_vars: Vec<(String, String)>,

        /// Run on specific host ("local" for local execution)
        #[arg(long)]
        host: Option<String>,

        /// Skip syncing project code and inputs before execution
        #[arg(long)]
        no_sync: bool,
    },

    /// Show status of jobs
    ///
    /// Without arguments, lists recent jobs.
    /// With a job ID, shows detailed status.
    Status(StatusArgs),

    /// Fetch and display job logs
    ///
    /// Without a job ID, shows logs of the most recent job.
    Logs(LogsArgs),

    /// Download output files from remote to local
    ///
    /// Without a job ID, downloads outputs from the most recent job.
    Download(DownloadArgs),

    /// Cancel a running or pending job
    ///
    /// Without arguments, cancels the most recent running job.
    Cancel(CancelArgs),

    /// Archive or delete finished jobs
    ///
    /// By default, jobs are archived (hidden from listings but preserved).
    /// Use --delete to permanently remove jobs and their remote files.
    /// Use --workspace with --delete to also clear the shared workspace.
    Clean(CleanArgs),

    /// List available jobs from configuration
    ///
    /// Reads fleche.toml (and fleche/*.toml files) and prints all defined
    /// job names with their commands.
    // Unit variant: the subcommand takes no arguments of its own.
    Jobs,

    /// List all unique tags across jobs
    // Unit variant: the subcommand takes no arguments of its own.
    Tags,

    /// Re-run a previous job with the same settings
    Rerun {
        /// Job ID to re-run
        // Required positional, unlike the optional job IDs of most other subcommands.
        job_id: String,

        /// Run in background (don't stream output)
        #[arg(long)]
        bg: bool,

        /// Send push notifications via ntfy.sh on state changes
        #[arg(long, value_name = "TOPIC")]
        ntfy: Option<String>,

        /// Add tag for filtering/organization (repeatable)
        // Each `--tag KEY=VALUE` occurrence is parsed by `parse_key_value`.
        #[arg(long = "tag", value_parser = parse_key_value)]
        tags: Vec<(String, String)>,
    },

    /// Create a starter fleche.toml in current directory
    // Unit variant: the subcommand takes no arguments of its own.
    Init,

    /// Validate configuration without running anything
    ///
    /// By default, only validates the local configuration file.
    /// Use --remote to also check SSH connectivity, Slurm availability,
    /// partition validity, and disk space.
    Check {
        /// Also validate against the remote server
        #[arg(long)]
        remote: bool,
    },

    /// Print or install the fleche skill for AI coding agents
    ///
    /// Prints the fleche skill reference to stdout. Use --install to
    /// write it to .agents/skills/ (with a symlink from .claude/skills/).
    Skill {
        /// Install the skill to project or global scope
        // `None` when `--install` is omitted; otherwise one of the
        // `InstallScope` values.
        #[arg(long, value_name = "SCOPE")]
        install: Option<InstallScope>,
    },

    /// Comprehensive diagnostic for troubleshooting
    ///
    /// Checks local environment, SSH connectivity, Slurm status, and registry
    /// health. Provides suggestions for fixing common issues.
    // Unit variant: the subcommand takes no arguments of its own.
    Doctor,

    /// Check cluster health by pinging the Slurm controller
    ///
    /// Runs `scontrol ping` on the remote host to verify the Slurm
    /// scheduler is responsive. Useful for diagnosing timeout issues.
    // Unit variant: the subcommand takes no arguments of its own.
    Ping,

    /// Wait for a job to complete
    ///
    /// Polls job status until it reaches a terminal state (completed, failed, cancelled).
    /// Useful for scripting or waiting on background jobs.
    Wait {
        /// Job ID to wait for (default: most recent job)
        // Optional positional; the "most recent job" fallback is applied by
        // the consumer of this value, not by the parser.
        job_id: Option<String>,

        /// Send terminal notification when job completes
        #[arg(long)]
        notify: bool,

        /// Send push notifications via ntfy.sh on state changes
        #[arg(long, value_name = "TOPIC")]
        ntfy: Option<String>,

        /// Filter by tag when using default job (repeatable)
        // Each `--tag KEY=VALUE` occurrence is parsed by `parse_key_value`.
        #[arg(long = "tag", value_parser = parse_key_value)]
        tags: Vec<(String, String)>,
    },

    /// Generate shell completions
    ///
    /// Prints completion script for the specified shell to stdout.
    /// Add to your shell config, e.g.: `fleche completions bash >> ~/.bashrc`
    Completions {
        /// Shell to generate completions for
        // Required positional restricted to the shells known to
        // `clap_complete::Shell`.
        #[arg(value_enum)]
        shell: Shell,
    },

    /// Show resource usage statistics for jobs
    ///
    /// Queries Slurm's sacct to show elapsed time, CPU time, memory usage,
    /// and allocated resources for completed jobs.
    Stats {
        /// Job ID to show stats for (default: most recent job)
        job_id: Option<String>,

        /// Show stats for last N jobs
        // Unlike the other counts in this file, the default ("1") is supplied
        // by clap itself, so the field is a plain `usize`.
        #[arg(long, short = 'n', default_value = "1")]
        last: usize,

        /// Filter by tag (repeatable)
        // Each `--tag KEY=VALUE` occurrence is parsed by `parse_key_value`.
        #[arg(long = "tag", value_parser = parse_key_value)]
        tags: Vec<(String, String)>,
    },

    /// Add or view a note on a job
    ///
    /// Without a note, displays the existing note for the job.
    /// With a note, sets or updates the job's note.
    Note {
        /// Job ID to annotate
        // First positional (required).
        job_id: String,

        /// Note text to set (omit to view existing note)
        // Second positional (optional).
        note: Option<String>,
    },

    /// Compare two jobs side-by-side
    ///
    /// Shows differences in configuration, environment, Slurm settings,
    /// tags, and status between two jobs.
    Compare {
        /// First job ID
        job_a: String,

        /// Second job ID
        job_b: String,
    },

    /// Run a command through a SOCKS proxy tunnel to the remote host
    ///
    /// Opens an SSH dynamic port forward to the configured remote, sets
    /// proxy environment variables (`ALL_PROXY`, `HTTP_PROXY`, `HTTPS_PROXY`,
    /// etc.), and runs the given command. The tunnel is cached per-host
    /// so repeated invocations reuse the same connection.
    ///
    /// Example: fleche proxy -- curl <https://example.com>
    Proxy {
        /// Command and arguments to run through the proxy
        // `required = true` demands at least one token. `trailing_var_arg`
        // together with `allow_hyphen_values` hands the remaining tokens,
        // including ones starting with `-`, to `command` instead of treating
        // them as fleche options.
        #[arg(required = true, trailing_var_arg = true, allow_hyphen_values = true)]
        command: Vec<String>,

        /// SOCKS proxy port (default: random available port)
        // `None` when omitted; the random-port fallback is not implemented in
        // this file.
        #[arg(long)]
        port: Option<u16>,

        /// Override remote host (default: from fleche.toml)
        #[arg(long)]
        host: Option<String>,
    },
}

/// Where to install the fleche skill.
// Value type of the `--install <SCOPE>` option on `Commands::Skill`; the
// `ValueEnum` derive lets clap parse the scope directly into this enum.
#[derive(Clone, Copy, clap::ValueEnum)]
pub enum InstallScope {
    /// Install to the current project directory
    Project,
    /// Install to the user-level config directory
    Global,
}

// Arguments of `Commands::Run`: what to run, how to follow it, and
// per-invocation overrides of the Slurm settings. Every `Option` field is
// `None` when its flag is omitted; how a missing override is resolved is
// decided by the code consuming this struct, which is not part of this file.
#[derive(clap::Args)]
pub struct RunArgs {
    /// Job name from config, or command to run (in quotes)
    // The only positional argument; optional at the parser level.
    #[arg(value_name = "JOB_OR_COMMAND")]
    pub job_or_command: Option<String>,

    /// Override or provide command (if job name given)
    #[arg(long)]
    pub command: Option<String>,

    /// Run in background (don't stream output)
    #[arg(long)]
    pub bg: bool,

    /// Send terminal notification when job completes (useful with --bg)
    #[arg(long)]
    pub notify: bool,

    /// Send push notifications via ntfy.sh on state changes
    #[arg(long, value_name = "TOPIC")]
    pub ntfy: Option<String>,

    /// Set environment variable (repeatable)
    // Each `--env KEY=VALUE` occurrence is parsed by `parse_key_value` and
    // appended to the vector.
    #[arg(long = "env", value_parser = parse_key_value)]
    pub env_vars: Vec<(String, String)>,

    /// Add tag for filtering/organization (repeatable)
    // Each `--tag KEY=VALUE` occurrence is parsed by `parse_key_value`.
    #[arg(long = "tag", value_parser = parse_key_value)]
    pub tags: Vec<(String, String)>,

    /// Override Slurm partition
    #[arg(long)]
    pub partition: Option<String>,

    /// Override wall time
    // Kept as a free-form string; no format is enforced by the parser.
    #[arg(long)]
    pub time: Option<String>,

    /// Override GPU count
    #[arg(long)]
    pub gpus: Option<u32>,

    /// Override CPU count
    #[arg(long)]
    pub cpus: Option<u32>,

    /// Override memory
    // Kept as a free-form string; no format is enforced by the parser.
    #[arg(long)]
    pub memory: Option<String>,

    /// Override constraint
    #[arg(long)]
    pub constraint: Option<String>,

    /// Override nodes
    #[arg(long)]
    pub nodes: Option<u32>,

    /// Override exclude
    #[arg(long)]
    pub exclude: Option<String>,

    /// Run after another job completes successfully
    ///
    /// Takes a job ID (or suffix). The new job will only start after
    /// the dependency job completes with exit code 0.
    #[arg(long)]
    pub after: Option<String>,

    /// Print generated sbatch script without submitting
    #[arg(long)]
    pub dry_run: bool,

    /// Run on specific host ("local" for local execution)
    #[arg(long)]
    pub host: Option<String>,

    /// Run directly via SSH instead of submitting to Slurm
    #[arg(long)]
    pub exec: bool,

    /// Retry failed jobs with exponential backoff (e.g., --retry 3)
    // Only the count is captured here; the retry logic lives elsewhere.
    #[arg(long)]
    pub retry: Option<u32>,

    /// Add a note/annotation to the job
    #[arg(long)]
    pub note: Option<String>,
}

// Arguments of `Commands::Status`: an optional job ID plus filters that
// narrow the job listing.
#[derive(clap::Args)]
pub struct StatusArgs {
    /// Job ID to check (default: list recent jobs)
    // Optional positional argument.
    pub job_id: Option<String>,

    /// Filter by status (pending, running, completed, failed, cancelled) - repeatable
    // Collected as free-form strings; the status names listed above are not
    // validated by the parser.
    #[arg(long)]
    pub filter: Vec<String>,

    /// Filter by job name regex (e.g., "123" matches "train-123-xy", "^train" matches "train-foo")
    // Stored as the raw pattern text; it is not compiled or checked here.
    #[arg(long)]
    pub name: Option<String>,

    /// Filter by tag (repeatable)
    // Each `--tag KEY=VALUE` occurrence is parsed by `parse_key_value`.
    #[arg(long = "tag", value_parser = parse_key_value)]
    pub tags: Vec<(String, String)>,

    /// Number of jobs to show (default: 20)
    // No clap default is declared, so the documented default of 20 has to be
    // applied by the consumer when this is `None`.
    #[arg(short = 'n', long)]
    pub last: Option<usize>,

    /// Show only archived jobs
    #[arg(long)]
    pub archived: bool,

    /// Show all jobs including archived
    // Exposed as `--all-jobs`; the parser rejects it together with `--archived`.
    #[arg(long = "all-jobs", conflicts_with = "archived")]
    pub all_jobs: bool,

    /// Hide the subtitle line (job name, tags, note) below each row
    #[arg(long)]
    pub compact: bool,
}

// Arguments of `Commands::Logs`: which job's logs to show and how to
// present them.
#[derive(clap::Args)]
pub struct LogsArgs {
    /// Job ID (default: most recent job)
    // Optional positional argument.
    pub job_id: Option<String>,

    /// Stream logs in real-time (Ctrl+C to disconnect)
    // `short` without a value derives the letter from the field name, giving `-f`.
    #[arg(long, short)]
    pub follow: bool,

    /// Show only stdout (default shows both stdout and stderr)
    // No `conflicts_with` is declared against `--stderr`, so the parser
    // accepts both flags together.
    #[arg(long)]
    pub stdout: bool,

    /// Show only stderr (default shows both stdout and stderr)
    #[arg(long)]
    pub stderr: bool,

    /// Show only the last N lines
    #[arg(short = 'n', long)]
    pub tail: Option<usize>,

    /// Strip ANSI escape codes from output (auto-detected when piped)
    #[arg(long)]
    pub raw: bool,

    /// Filter by tag when using default job (repeatable)
    // Each `--tag KEY=VALUE` occurrence is parsed by `parse_key_value`.
    #[arg(long = "tag", value_parser = parse_key_value)]
    pub tags: Vec<(String, String)>,

    /// Filter by note content (regex pattern, case-insensitive)
    // Stored as the raw pattern text; it is not compiled or checked here.
    #[arg(long)]
    pub note: Option<String>,
}

// Arguments of `Commands::Download`: which job's outputs to fetch and which
// subset of them.
#[derive(clap::Args)]
pub struct DownloadArgs {
    /// Job ID (default: most recent job)
    // Optional positional argument.
    pub job_id: Option<String>,

    /// Download even if job is still running
    #[arg(long)]
    pub partial: bool,

    /// Specific path to download (default: all configured outputs)
    #[arg(long)]
    pub path: Option<String>,

    /// Filter outputs by glob pattern (repeatable). Prefix with ! to exclude.
    // Patterns are collected verbatim; the `!` prefix is interpreted by the
    // consumer, not by the parser.
    #[arg(long)]
    pub filter: Vec<String>,

    /// Filter by tag when using default job (repeatable)
    // Each `--tag KEY=VALUE` occurrence is parsed by `parse_key_value`.
    #[arg(long = "tag", value_parser = parse_key_value)]
    pub tags: Vec<(String, String)>,

    /// Show what would be downloaded without actually downloading
    #[arg(long)]
    pub dry_run: bool,
}

// Arguments of `Commands::Cancel`: which job(s) to cancel and whether to
// ask for confirmation first.
#[derive(clap::Args)]
pub struct CancelArgs {
    /// Job ID (default: most recent running job)
    // Optional positional argument.
    pub job_id: Option<String>,

    /// Cancel all running/pending jobs
    // No conflict with `job_id` is declared, so the parser accepts both.
    #[arg(long)]
    pub all: bool,

    /// Show what would be cancelled without actually cancelling
    #[arg(long)]
    pub dry_run: bool,

    /// Skip confirmation prompt
    // `short` without a value derives the letter from the field name, giving `-y`.
    #[arg(short, long)]
    pub yes: bool,

    /// Filter by tag (repeatable)
    // Each `--tag KEY=VALUE` occurrence is parsed by `parse_key_value`.
    #[arg(long = "tag", value_parser = parse_key_value)]
    pub tags: Vec<(String, String)>,
}

// Arguments of `Commands::Clean`: job selection plus the action to take.
// Flag relationships enforced by the parser:
//   - `--delete` and `--unarchive` exclude each other;
//   - `--workspace` is only accepted together with `--delete`;
//   - `--archived` cannot be combined with `--filter`.
#[derive(clap::Args)]
pub struct CleanArgs {
    /// Job ID (optional with --all or --older-than)
    // Optional at the parser level in every case; any requirement to pass
    // one of job ID / --all / --older-than is not enforced here.
    pub job_id: Option<String>,

    /// Clean all completed/failed jobs
    #[arg(long)]
    pub all: bool,

    /// Filter by status (completed, failed, cancelled) - repeatable
    // Collected as free-form strings; the status names are not validated by
    // the parser.
    #[arg(long)]
    pub filter: Vec<String>,

    /// Clean jobs older than duration (e.g., 7d, 24h)
    // Kept as the raw text; the duration is not parsed here.
    #[arg(long)]
    pub older_than: Option<String>,

    /// Permanently delete jobs instead of archiving
    #[arg(long, conflicts_with = "unarchive")]
    pub delete: bool,

    /// Also delete the shared workspace (requires --delete)
    #[arg(long, requires = "delete")]
    pub workspace: bool,

    /// Target archived jobs (for --delete or --unarchive)
    #[arg(long, conflicts_with_all = ["filter"])]
    pub archived: bool,

    /// Restore archived job to normal listings
    #[arg(long, conflicts_with = "delete")]
    pub unarchive: bool,

    /// Show what would be done without actually doing it
    #[arg(long)]
    pub dry_run: bool,

    /// Skip confirmation prompt
    // `short` without a value derives the letter from the field name, giving `-y`.
    #[arg(short, long)]
    pub yes: bool,

    /// Filter by tag (repeatable)
    // Each `--tag KEY=VALUE` occurrence is parsed by `parse_key_value`.
    #[arg(long = "tag", value_parser = parse_key_value)]
    pub tags: Vec<(String, String)>,
}

/// Parses a `KEY=VALUE` string into a `(key, value)` tuple.
///
/// The input is split at the first `=` only, so the value may itself contain
/// `=` characters and may be empty.
///
/// # Errors
///
/// Returns a message describing the expected format when `s` contains no `=`
/// or when the key in front of the first `=` is empty.
fn parse_key_value(s: &str) -> Result<(String, String), String> {
    match s.split_once('=') {
        // An empty key (as in `=value`) cannot name an environment variable or
        // a tag, so it is rejected like any other malformed input.
        Some((key, value)) if !key.is_empty() => Ok((key.to_string(), value.to_string())),
        _ => Err(format!("Invalid format '{s}'. Expected KEY=VALUE")),
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Builds the owned pair a successful parse is expected to produce.
    fn parsed(key: &str, value: &str) -> Result<(String, String), String> {
        Ok((key.to_owned(), value.to_owned()))
    }

    #[test]
    fn test_parse_key_value_simple() {
        assert_eq!(parse_key_value("FOO=bar"), parsed("FOO", "bar"));
    }

    #[test]
    fn test_parse_key_value_with_equals_in_value() {
        // Only the first `=` separates key from value; later ones stay in the value.
        assert_eq!(parse_key_value("CONFIG=a=b=c"), parsed("CONFIG", "a=b=c"));
    }

    #[test]
    fn test_parse_key_value_empty_value() {
        assert_eq!(parse_key_value("EMPTY="), parsed("EMPTY", ""));
    }

    #[test]
    fn test_parse_key_value_spaces_in_value() {
        assert_eq!(parse_key_value("MSG=hello world"), parsed("MSG", "hello world"));
    }

    #[test]
    fn test_parse_key_value_no_equals() {
        let outcome = parse_key_value("NOEQUALS");
        assert!(matches!(outcome, Err(_)));
    }

    #[test]
    fn test_parse_key_value_empty() {
        let outcome = parse_key_value("");
        assert!(matches!(outcome, Err(_)));
    }
}