arrs/cli.rs
1use std::path::PathBuf;
2
3use clap::{Args, Parser, Subcommand, ValueEnum};
4
5#[derive(Debug, Copy, Clone, PartialEq, Eq, ValueEnum)]
6pub enum Format {
7 Csv,
8 Jsonl,
9 /// Pretty-printed table for interactive use; nested cells are JSON-encoded.
10 /// Buffers all rows before printing, so prefer `jsonl`/`csv` for very large
11 /// inputs piped through `cat`/`head`/etc.
12 Table,
13}
14
15/// How to render Binary / LargeBinary / FixedSizeBinary / BinaryView values.
16#[derive(Debug, Copy, Clone, PartialEq, Eq, ValueEnum)]
17pub enum BinaryFormat {
18 /// Drop top-level binary columns from output; render nested binary as null.
19 None,
20 /// `\xHH` lowercase-hex-escaped strings.
21 Hex,
22 /// Standard-alphabet base64 strings.
23 Base64,
24}
25
26#[derive(Debug, Copy, Clone, PartialEq, Eq, ValueEnum)]
27pub enum SchemaType {
28 /// Logical arrow schema.
29 Arrow,
30 /// Physical (format-native) schema.
31 Physical,
32}
33
34/// Lance-specific selectors for which version of a dataset to read.
35///
36/// `--branch` is independent and can be combined with either `--version` or
37/// `--tag`. `--version` and `--tag` are mutually exclusive (a tag is just a
38/// named version). With no flags set, the latest version of `main` is used.
39#[derive(Debug, Clone, Args, Default)]
40pub struct LanceArgs {
41 /// Read from the named Lance branch (default: main).
42 #[arg(long)]
43 pub branch: Option<String>,
44
45 /// Read from a specific Lance version on the chosen branch.
46 #[arg(long, conflicts_with = "tag")]
47 pub version: Option<u64>,
48
49 /// Read from a specific Lance tag on the chosen branch.
50 #[arg(long, conflicts_with = "version")]
51 pub tag: Option<String>,
52}
53
54impl LanceArgs {
55 /// True when at least one Lance-specific selector was supplied.
56 pub fn is_any_set(&self) -> bool {
57 self.branch.is_some() || self.version.is_some() || self.tag.is_some()
58 }
59}
60
61#[derive(Debug, Parser)]
62#[command(name = "arrs", about = "Inspect Arrow-based datasets.", version)]
63pub struct Cli {
64 /// Output format for row-producing commands. When unset, metadata commands
65 /// (versions/branches/tags/indices) default to `table` (fully buffered to enable column
66 /// alignment); everything else to `jsonl` (streaming).
67 #[arg(long, global = true, value_enum)]
68 pub format: Option<Format>,
69
70 /// How to render binary columns in the output.
71 #[arg(long = "binary-format", global = true, value_enum, default_value_t = BinaryFormat::None)]
72 pub binary_format: BinaryFormat,
73
74 /// Comma-separated list of columns to include.
75 #[arg(long, global = true, value_delimiter = ',')]
76 pub columns: Option<Vec<String>>,
77
78 /// Comma-separated list of columns to exclude. Takes precedence over --columns.
79 #[arg(long = "exclude-columns", global = true, value_delimiter = ',')]
80 pub exclude_columns: Option<Vec<String>>,
81
82 #[command(subcommand)]
83 pub command: Command,
84}
85
86#[derive(Debug, Subcommand)]
87pub enum Command {
88 /// Concatenate one or more datasets and print every row.
89 Cat {
90 /// Dataset paths (at least one).
91 #[arg(required = true)]
92 inputs: Vec<PathBuf>,
93 #[command(flatten)]
94 lance: LanceArgs,
95 },
96
97 /// Print the first N rows.
98 Head {
99 input: PathBuf,
100 #[arg(short = 'n', long, default_value_t = 10)]
101 limit: u64,
102 #[command(flatten)]
103 lance: LanceArgs,
104 },
105
106 /// Print the last N rows.
107 Tail {
108 input: PathBuf,
109 #[arg(short = 'n', long, default_value_t = 10)]
110 limit: u64,
111 #[command(flatten)]
112 lance: LanceArgs,
113 },
114
115 /// Print rows at the given indices (comma-separated; supports `a:b`, `a:`, `:b`, negatives).
116 Take {
117 input: PathBuf,
118 #[arg(long, allow_hyphen_values = true)]
119 indices: String,
120 #[command(flatten)]
121 lance: LanceArgs,
122 },
123
124 /// Print the number of rows.
125 Rowcount {
126 input: PathBuf,
127 #[command(flatten)]
128 lance: LanceArgs,
129 },
130
131 /// Randomly sample N rows without replacement.
132 Sample {
133 input: PathBuf,
134 #[arg(short = 'n', long)]
135 limit: u64,
136 /// Optional u64 seed for reproducibility.
137 #[arg(long)]
138 seed: Option<u64>,
139 #[command(flatten)]
140 lance: LanceArgs,
141 },
142
143 /// Print the schema of the dataset.
144 Schema {
145 input: PathBuf,
146 /// Which schema flavor to print.
147 #[arg(long = "type", value_enum, default_value_t = SchemaType::Arrow)]
148 ty: SchemaType,
149 #[command(flatten)]
150 lance: LanceArgs,
151 },
152
153 /// (Lance only) Print versions of the dataset.
154 Versions {
155 input: PathBuf,
156 /// Scope to a specific branch (default: main).
157 #[arg(long)]
158 branch: Option<String>,
159 /// Hide versions that have no tag (default: show all versions).
160 #[arg(long = "tagged-only", default_value_t = false)]
161 tagged_only: bool,
162 },
163
164 /// (Lance only) Print branches available for the dataset.
165 Branches { input: PathBuf },
166
167 /// (Lance only) Print tags defined on the dataset, across all branches.
168 Tags { input: PathBuf },
169
170 /// (Lance only) Print indices defined on the dataset.
171 Indices {
172 input: PathBuf,
173 #[command(flatten)]
174 lance: LanceArgs,
175 },
176}