Skip to main content

arrs/
cli.rs

1use std::path::PathBuf;
2
3use clap::{Args, Parser, Subcommand, ValueEnum};
4
5#[derive(Debug, Copy, Clone, PartialEq, Eq, ValueEnum)]
6pub enum Format {
7    Csv,
8    Jsonl,
9    /// Pretty-printed table for interactive use; nested cells are JSON-encoded.
10    /// Buffers all rows before printing, so prefer `jsonl`/`csv` for very large
11    /// inputs piped through `cat`/`head`/etc.
12    Table,
13}
14
15/// How to render Binary / LargeBinary / FixedSizeBinary / BinaryView values.
16#[derive(Debug, Copy, Clone, PartialEq, Eq, ValueEnum)]
17pub enum BinaryFormat {
18    /// Drop top-level binary columns from output; render nested binary as null.
19    None,
20    /// `\xHH` lowercase-hex-escaped strings.
21    Hex,
22    /// Standard-alphabet base64 strings.
23    Base64,
24}
25
26#[derive(Debug, Copy, Clone, PartialEq, Eq, ValueEnum)]
27pub enum SchemaType {
28    /// Logical arrow schema.
29    Arrow,
30    /// Physical (format-native) schema.
31    Physical,
32}
33
34/// Lance-specific selectors for which version of a dataset to read.
35///
36/// `--branch` is independent and can be combined with either `--version` or
37/// `--tag`. `--version` and `--tag` are mutually exclusive (a tag is just a
38/// named version). With no flags set, the latest version of `main` is used.
39#[derive(Debug, Clone, Args, Default)]
40pub struct LanceArgs {
41    /// Read from the named Lance branch (default: main).
42    #[arg(long)]
43    pub branch: Option<String>,
44
45    /// Read from a specific Lance version on the chosen branch.
46    #[arg(long, conflicts_with = "tag")]
47    pub version: Option<u64>,
48
49    /// Read from a specific Lance tag on the chosen branch.
50    #[arg(long, conflicts_with = "version")]
51    pub tag: Option<String>,
52}
53
54impl LanceArgs {
55    /// True when at least one Lance-specific selector was supplied.
56    pub fn is_any_set(&self) -> bool {
57        self.branch.is_some() || self.version.is_some() || self.tag.is_some()
58    }
59}
60
61#[derive(Debug, Parser)]
62#[command(name = "arrs", about = "Inspect Arrow-based datasets.", version)]
63pub struct Cli {
64    /// Output format for row-producing commands. When unset, metadata commands
65    /// (versions/branches/tags/indices) default to `table` (fully buffered to enable column
66    /// alignment); everything else to `jsonl` (streaming).
67    #[arg(long, global = true, value_enum)]
68    pub format: Option<Format>,
69
70    /// How to render binary columns in the output.
71    #[arg(long = "binary-format", global = true, value_enum, default_value_t = BinaryFormat::None)]
72    pub binary_format: BinaryFormat,
73
74    /// Comma-separated list of columns to include.
75    #[arg(long, global = true, value_delimiter = ',')]
76    pub columns: Option<Vec<String>>,
77
78    /// Comma-separated list of columns to exclude. Takes precedence over --columns.
79    #[arg(long = "exclude-columns", global = true, value_delimiter = ',')]
80    pub exclude_columns: Option<Vec<String>>,
81
82    #[command(subcommand)]
83    pub command: Command,
84}
85
86#[derive(Debug, Subcommand)]
87pub enum Command {
88    /// Concatenate one or more datasets and print every row.
89    Cat {
90        /// Dataset paths (at least one).
91        #[arg(required = true)]
92        inputs: Vec<PathBuf>,
93        #[command(flatten)]
94        lance: LanceArgs,
95    },
96
97    /// Print the first N rows.
98    Head {
99        input: PathBuf,
100        #[arg(short = 'n', long, default_value_t = 10)]
101        limit: u64,
102        #[command(flatten)]
103        lance: LanceArgs,
104    },
105
106    /// Print the last N rows.
107    Tail {
108        input: PathBuf,
109        #[arg(short = 'n', long, default_value_t = 10)]
110        limit: u64,
111        #[command(flatten)]
112        lance: LanceArgs,
113    },
114
115    /// Print rows at the given indices (comma-separated; supports `a:b`, `a:`, `:b`, negatives).
116    Take {
117        input: PathBuf,
118        #[arg(long, allow_hyphen_values = true)]
119        indices: String,
120        #[command(flatten)]
121        lance: LanceArgs,
122    },
123
124    /// Print the number of rows.
125    Rowcount {
126        input: PathBuf,
127        #[command(flatten)]
128        lance: LanceArgs,
129    },
130
131    /// Randomly sample N rows without replacement.
132    Sample {
133        input: PathBuf,
134        #[arg(short = 'n', long)]
135        limit: u64,
136        /// Optional u64 seed for reproducibility.
137        #[arg(long)]
138        seed: Option<u64>,
139        #[command(flatten)]
140        lance: LanceArgs,
141    },
142
143    /// Print the schema of the dataset.
144    Schema {
145        input: PathBuf,
146        /// Which schema flavor to print.
147        #[arg(long = "type", value_enum, default_value_t = SchemaType::Arrow)]
148        ty: SchemaType,
149        #[command(flatten)]
150        lance: LanceArgs,
151    },
152
153    /// (Lance only) Print versions of the dataset.
154    Versions {
155        input: PathBuf,
156        /// Scope to a specific branch (default: main).
157        #[arg(long)]
158        branch: Option<String>,
159        /// Hide versions that have no tag (default: show all versions).
160        #[arg(long = "tagged-only", default_value_t = false)]
161        tagged_only: bool,
162    },
163
164    /// (Lance only) Print branches available for the dataset.
165    Branches { input: PathBuf },
166
167    /// (Lance only) Print tags defined on the dataset, across all branches.
168    Tags { input: PathBuf },
169
170    /// (Lance only) Print indices defined on the dataset.
171    Indices {
172        input: PathBuf,
173        #[command(flatten)]
174        lance: LanceArgs,
175    },
176}