rype 1.0.0-rc.1

High-performance genomic sequence classification using minimizer-based k-mer sketching in RY space
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
//! Command-line argument definitions for the rype CLI.

use clap::{Parser, Subcommand};
use std::path::PathBuf;

use super::helpers::{
    parse_bloom_fpp, parse_max_memory_arg, parse_shard_size_arg, validate_minimum_length,
    validate_trim_to,
};

// Root of the `rype` command-line interface.
//
// The short description, the long description and the trailing EXAMPLES
// section of the top-level help are all supplied through the explicit
// `#[command(...)]` attributes below. The string literals are handed to clap
// as written, so their internal line breaks and indentation are part of the
// help text and must not be reflowed.
#[derive(Parser)]
#[command(name = "rype")]
#[command(about = "High-performance Read Partitioning Engine (RY-Space, K=16/32/64)")]
#[command(
    long_about = "Rype: High-performance genomic sequence classification using minimizer-based k-mer sketching in RY (purine/pyrimidine) space.

WORKFLOW:
  1. Create an index:     rype index create -o index.ryxdi -r refs.fasta
  2. Classify reads:      rype classify run -i index.ryxdi -1 reads.fq

INPUT FORMATS:
  FASTA (.fa, .fasta, .fna) and FASTQ (.fq, .fastq) files are supported.
  Gzip-compressed files (.gz) are automatically detected and decompressed.
  Parquet (.parquet) with columns: read_id, sequence1, sequence2 (optional)

OUTPUT FORMAT (classify):
  Format auto-detected from extension:
  - .tsv or no extension: Plain TSV
  - .tsv.gz: Gzip-compressed TSV
  - .parquet: Apache Parquet with zstd compression
  - -: stdout (TSV)

  Tab-separated columns: read_id<TAB>bucket_name<TAB>score
  - read_id: Sequence header (first whitespace-delimited token)
  - bucket_name: Human-readable name from index
  - score: Fraction of query minimizers matching (0.0-1.0)"
)]
#[command(after_help = "EXAMPLES:
  # Create index from reference genomes
  rype index create -o bacteria.ryxdi -r genome1.fna -r genome2.fna -k 64 -w 50

  # Create index with one bucket per sequence
  rype index create -o genes.ryxdi -r genes.fasta --separate-buckets

  # Classify single-end reads
  rype classify run -i bacteria.ryxdi -1 reads.fq -t 0.1 -o results.tsv

  # Classify paired-end reads with negative filtering
  rype classify run -i bacteria.ryxdi -N host.ryxdi -1 R1.fq -2 R2.fq -t 0.1

  # Aggregate mode for higher sensitivity
  rype classify aggregate -i bacteria.ryxdi -1 R1.fq -2 R2.fq -t 0.05")]
pub struct Cli {
    // `global = true` makes `-v` / `--verbose` accepted after any subcommand
    // as well as directly after `rype`.
    /// Enable verbose progress output with timestamps
    #[arg(short, long, global = true)]
    pub verbose: bool,

    // The selected command group and action; see `Commands`.
    #[command(subcommand)]
    pub command: Commands,
}

// Top-level command groups.
//
// Every variant is a newtype around a nested `Subcommand` enum (marked with
// `#[command(subcommand)]`), so invocations take the form
// `rype <group> <action>`, e.g. `rype index create` or `rype classify run`.
// The `///` line on each variant is the description clap shows for that group.
#[derive(Subcommand)]
pub enum Commands {
    /// Index operations: create, modify, and inspect indices
    #[command(subcommand)]
    Index(IndexCommands),

    /// Classification operations: classify reads against an index
    #[command(subcommand)]
    Classify(ClassifyCommands),

    /// Inspect minimizer details and matches (debugging)
    #[command(subcommand)]
    Inspect(InspectCommands),
}

// Actions available under `rype index`.
//
// The fields of each variant are the parsed arguments of that action. Options
// declared with a `value_parser` (`max_shard_size`, `max_memory`, `bloom_fpp`)
// are converted by functions imported from `super::helpers`, so they arrive
// here already typed as `usize` / `f64`.
//
// The `///` comments and the `after_help` string literals are rendered by clap
// as user-facing help text; edit them as UI copy, not as internal notes.
#[derive(Subcommand)]
pub enum IndexCommands {
    /// Create a new index from reference sequences
    #[command(after_help = "EXAMPLES:
  # Basic index creation
  rype index create -o index.ryxdi -r genome.fasta

  # Multiple references, all in one bucket
  rype index create -o index.ryxdi -r chr1.fa -r chr2.fa

  # One bucket per sequence (e.g., for gene-level classification)
  rype index create -o genes.ryxdi -r genes.fasta --separate-buckets

  # Large index with sharding (for memory-constrained systems)
  rype index create -o large.ryxdi -r refs.fa --max-shard-size 1073741824")]
    Create {
        /// Output index path (.ryxdi directory will be created)
        #[arg(short, long)]
        output: PathBuf,

        // `required = true` is needed here: a `Vec` argument is optional by
        // default in clap derive, and at least one reference must be given.
        /// Reference sequence files (FASTA/FASTQ, optionally gzipped).
        /// Can specify multiple times: -r file1.fa -r file2.fa
        #[arg(short, long, required = true)]
        reference: Vec<PathBuf>,

        // NOTE(review): the 16/32/64 restriction is not enforced by this
        // definition (plain `usize`, no value_parser); presumably validated
        // by the code that consumes the parsed arguments — confirm.
        /// K-mer size for minimizer computation. Must be 16, 32, or 64.
        /// Larger k = more specific matches, fewer false positives.
        /// Smaller k = more sensitive, may find distant homologs.
        #[arg(short = 'k', long, default_value_t = 64)]
        kmer_size: usize,

        /// Minimizer window size. Larger values = smaller index, less sensitive.
        /// Recommended: 30-100 for genomes, 20-50 for shorter sequences.
        #[arg(short, long, default_value_t = 50)]
        window: usize,

        // NOTE(review): no custom value_parser is attached, so the command
        // line presumably accepts this value in decimal only, even though
        // the default is written in hex here — confirm.
        /// XOR salt for hash randomization. Must match for index compatibility.
        /// Default is fine for most uses; change to create incompatible indices.
        #[arg(short, long, default_value_t = 0x5555555555555555)]
        salt: u64,

        /// Create one bucket per input sequence instead of one per file.
        /// Use when each sequence represents a distinct classification target
        /// (e.g., individual genes, plasmids, or genomes in a multi-FASTA).
        #[arg(long)]
        separate_buckets: bool,

        // `None` when the option is not given; otherwise the value produced
        // by `parse_shard_size_arg`.
        /// Maximum shard size for large indices (e.g., "1G", "512M").
        /// Creates multiple shard files loaded on-demand during classification.
        #[arg(long, value_parser = parse_shard_size_arg)]
        max_shard_size: Option<usize>,

        /// Row group size (rows per group). Larger = better compression.
        #[arg(long, default_value_t = 100_000)]
        row_group_size: usize,

        /// Use Zstd compression instead of Snappy for Parquet files.
        /// Better compression ratio but slower.
        #[arg(long)]
        zstd: bool,

        /// Enable bloom filters for faster lookups.
        /// Increases file size slightly.
        #[arg(long)]
        bloom_filter: bool,

        // The default is given as a string so it goes through
        // `parse_bloom_fpp` like any user-supplied value.
        /// Bloom filter false positive probability (0.0-1.0).
        /// Lower = more accurate but larger files. Only used with --bloom-filter.
        #[arg(long, default_value = "0.05", value_parser = parse_bloom_fpp)]
        bloom_fpp: f64,

        /// Print timing diagnostics to stderr for performance analysis.
        #[arg(long)]
        timing: bool,
    },

    /// Show index statistics and bucket information
    Stats {
        /// Path to index directory (.ryxdi)
        #[arg(short, long)]
        index: PathBuf,
    },

    /// Show source file paths or sequence IDs for a bucket
    BucketSourceDetail {
        /// Path to index directory (.ryxdi)
        #[arg(short, long)]
        index: PathBuf,

        // Kept as a raw string: the numeric-ID-or-name interpretation
        // described below is not done by the argument parser.
        // `required = true` is redundant for a non-`Option` `String` (clap
        // already treats it as required) but harmless.
        /// Bucket identifier: numeric ID (e.g., '1') or exact bucket name (case-sensitive).
        /// Numeric IDs take precedence - if a bucket is named '42', use its numeric ID instead.
        #[arg(short, long, required = true)]
        bucket: String,

        // NOTE(review): `--paths` and `--ids` are not declared mutually
        // exclusive here; if they are meant to be, the consumer must handle
        // both being set — confirm.
        /// Show only unique file paths (one per line)
        #[arg(long)]
        paths: bool,

        /// Show only bucket IDs (for scripting)
        #[arg(long)]
        ids: bool,
    },

    /// Add a new reference file as a new bucket to an existing index (development pending)
    BucketAdd {
        /// Path to existing index directory
        #[arg(short, long)]
        index: PathBuf,

        /// Reference file to add (creates a new bucket)
        #[arg(short, long)]
        reference: PathBuf,
    },

    /// Build index from a TOML configuration file (see CONFIG FORMAT below)
    #[command(after_help = "CONFIG FORMAT (from-config):
  [index]
  k = 64                           # K-mer size (16, 32, or 64)
  window = 50                      # Minimizer window size
  salt = 0x5555555555555555        # Hash salt (hex)
  output = \"index.ryxdi\"           # Output path (directory will be created)
  orient_sequences = true          # Optional: orient sequences for better overlap

  [buckets.BucketName]             # Define a bucket
  files = [\"ref1.fa\", \"ref2.fa\"]   # Files for this bucket

  [buckets.AnotherBucket]
  files = [\"other.fasta\"]

CLI OPTIONS OVERRIDE CONFIG FILE:
  --max-memory controls memory budget (auto-detected if not specified)
  --orient overrides [index].orient_sequences

SUBTRACTION MODE (--subtract-from):
  Removes minimizers present in an existing index from all buckets during build.
  Useful for host depletion: build a non-host index in one step.
  The subtraction index must have matching k, w, and salt values.

  Example: rype index from-config -c config.toml --subtract-from host.ryxdi")]
    FromConfig {
        /// Path to TOML config file
        #[arg(short, long)]
        config: PathBuf,

        // The default string "auto" is passed through `parse_max_memory_arg`
        // like any user value, so this field always holds a concrete `usize`.
        /// Maximum memory to use (e.g., "8G", "512M", "auto").
        /// Controls chunk sizes for input processing.
        /// Default: auto-detect from system/cgroups/SLURM.
        #[arg(long, default_value = "auto", value_parser = parse_max_memory_arg)]
        max_memory: usize,

        /// Row group size (rows per group). Larger = better compression.
        #[arg(long, default_value_t = 100_000)]
        row_group_size: usize,

        /// Enable bloom filters for faster lookups.
        #[arg(long)]
        bloom_filter: bool,

        /// Bloom filter false positive probability (0.0-1.0).
        /// Only used with --bloom-filter.
        #[arg(long, default_value = "0.05", value_parser = parse_bloom_fpp)]
        bloom_fpp: f64,

        // A plain boolean flag: `false` here cannot distinguish "not given"
        // from "explicitly off".
        // NOTE(review): so the CLI can presumably only turn orientation on,
        // not override a `true` in the config to `false` — confirm.
        /// Orient sequences within buckets to maximize minimizer overlap.
        /// First sequence establishes baseline; subsequent sequences use
        /// forward or reverse-complement based on which has higher overlap.
        #[arg(long)]
        orient: bool,

        /// Print timing diagnostics to stderr for performance analysis.
        #[arg(long)]
        timing: bool,

        /// Subtract minimizers from an existing index before building.
        /// Removes any minimizer that exists in the subtraction index.
        /// Useful for host depletion: build a non-host index in one step.
        /// The subtraction index must have the same k, w, and salt values.
        #[arg(long)]
        subtract_from: Option<PathBuf>,
    },

    /// Add files to existing index using TOML config (development pending)
    #[command(after_help = "CONFIG FORMAT (bucket-add-config):
  [target]
  index = \"existing.ryxdi\"         # Index to modify

  [assignment]
  mode = \"new_bucket\"              # or \"existing_bucket\" or \"best_bin\"
  bucket_name = \"MyBucket\"         # For new_bucket/existing_bucket modes
  # For best_bin mode:
  # threshold = 0.3                 # Min score to match existing bucket
  # fallback = \"create_new\"        # or \"skip\" or \"error\"

  [files]
  paths = [\"new1.fa\", \"new2.fa\"]   # Files to add")]
    BucketAddConfig {
        /// Path to TOML config file
        #[arg(short, long)]
        config: PathBuf,
    },

    /// Show detailed minimizer statistics for compression analysis
    Summarize {
        /// Path to index directory (.ryxdi)
        #[arg(short, long)]
        index: PathBuf,
    },

    /// Merge two indices into one
    #[command(after_help = "MERGE OPERATION:
  Combines all buckets from both indices into a single output index.
  Bucket IDs are renumbered sequentially (1, 2, 3...) with primary buckets first.

REQUIREMENTS:
  - Both indices must have the same k, w, and salt values
  - Bucket names must be unique across both indices (no duplicates)

SUBTRACTION MODE (--subtract-from-primary):
  When enabled, minimizers present in the primary index are removed from
  the secondary index before merging. This is useful for creating indices
  where secondary buckets only contain sequences NOT found in primary.

  Use case: Create a \"non-host\" index by subtracting host minimizers.

EXAMPLES:
  # Simple merge of two indices
  rype index merge --index-primary bacteria.ryxdi --index-secondary phage.ryxdi -o combined.ryxdi

  # Merge with subtraction (create non-host index)
  rype index merge --index-primary host.ryxdi --index-secondary sample.ryxdi \\
      -o non_host.ryxdi --subtract-from-primary

  # Merge with compression options
  rype index merge --index-primary idx1.ryxdi --index-secondary idx2.ryxdi \\
      -o merged.ryxdi --zstd --bloom-filter")]
    Merge {
        // The two input indices are long-only options (no short flag).
        /// Path to primary index directory (.ryxdi)
        #[arg(long)]
        index_primary: PathBuf,

        /// Path to secondary index directory (.ryxdi)
        #[arg(long)]
        index_secondary: PathBuf,

        /// Output path for merged index (.ryxdi directory will be created)
        #[arg(short, long)]
        output: PathBuf,

        /// Remove minimizers from secondary that exist in primary.
        /// Useful for creating indices where secondary buckets contain
        /// only sequences NOT found in the primary index.
        #[arg(long)]
        subtract_from_primary: bool,

        /// Maximum memory to use for merge operations (e.g., "8G", "512M", or "auto").
        /// When "auto", detects available system memory.
        /// Memory-bounded merging processes secondary shards one at a time to
        /// avoid OOM on large indices with high overlap.
        #[arg(long, default_value = "auto", value_parser = parse_max_memory_arg)]
        max_memory: usize,

        /// Row group size (rows per group). Larger = better compression.
        #[arg(long, default_value_t = 100_000)]
        row_group_size: usize,

        /// Use Zstd compression instead of Snappy for Parquet files.
        #[arg(long)]
        zstd: bool,

        /// Enable bloom filters for faster lookups.
        #[arg(long)]
        bloom_filter: bool,

        /// Bloom filter false positive probability (0.0-1.0).
        #[arg(long, default_value = "0.05", value_parser = parse_bloom_fpp)]
        bloom_fpp: f64,

        /// Print timing diagnostics to stderr for performance analysis.
        #[arg(long)]
        timing: bool,
    },
}

// Actions available under `rype classify`.
//
// The fields of each variant are the parsed arguments of that action. The
// `///` comments and the `after_help` string literals are rendered by clap as
// user-facing help text; edit them as UI copy, not as internal notes.
//
// Several constraints described in the help text are not expressed as clap
// rules in this definition; those spots carry NOTE(review) comments below.
#[derive(Subcommand)]
pub enum ClassifyCommands {
    /// Classify reads against an index, one result line per read
    #[command(after_help = "OUTPUT FORMAT:
  Tab-separated values (TSV): read_id<TAB>bucket_name<TAB>score

  read_id     - First whitespace-delimited token from FASTA/FASTQ header
  bucket_name - Human-readable name from index (or filename if unnamed)
  score       - Fraction of query minimizers matching bucket (0.0-1.0)

  Only reads with score >= threshold for at least one bucket are output.
  A single read may produce multiple lines if it matches multiple buckets.

THRESHOLD GUIDANCE:
  0.05  - High sensitivity, useful for detecting distant homologs
  0.10  - Balanced (default), good for most metagenomic classification
  0.20  - High specificity, fewer false positives
  0.30+ - Very stringent, may miss true matches

WHEN TO USE 'run' vs 'aggregate':
  Use 'run' (this command) for:
  - Per-read classification results
  - Downstream analysis requiring read-level assignments
  - When you need to know which specific reads matched

  Use 'aggregate' for:
  - Sample-level composition estimates
  - Higher sensitivity (pools evidence across reads)
  - Abundance estimation")]
    Run {
        // `--positive-index` is accepted as an alias for `--index` and is
        // listed in the help output (visible alias).
        /// Path to target index directory (.ryxdi)
        #[arg(short, long, visible_alias = "positive-index")]
        index: PathBuf,

        /// Path to negative index for contamination filtering.
        /// Minimizers matching the negative index are excluded before scoring.
        /// Use for host depletion (e.g., human reads) or adapter removal.
        /// Must have same k, w, salt as positive index.
        #[arg(short = 'N', long)]
        negative_index: Option<PathBuf>,

        // Short flags `-1` / `-2` are set explicitly; the long forms are
        // `--r1` / `--r2`.
        /// Forward reads. Formats: FASTA/FASTQ (.fa/.fq, optionally .gz),
        /// Parquet (.parquet) with columns: read_id, sequence1, sequence2 (optional)
        #[arg(short = '1', long)]
        r1: PathBuf,

        /// Reverse reads for paired-end data (optional).
        /// Not supported with Parquet input - use sequence2 column instead.
        #[arg(short = '2', long)]
        r2: Option<PathBuf>,

        // NOTE(review): the 0.0-1.0 range is documented but not enforced at
        // parse time here (plain `f64`, no value_parser) — confirm it is
        // checked by the consumer.
        /// Minimum score threshold for reporting matches (0.0-1.0).
        /// Score = matching_minimizers / total_query_minimizers.
        /// Lower = more sensitive, higher = more specific.
        #[arg(short, long, default_value_t = 0.1)]
        threshold: f64,

        // The default string "auto" is passed through `parse_max_memory_arg`
        // like any user value, so this field always holds a concrete `usize`.
        /// Maximum memory to use (e.g., "4G", "512M", "auto").
        /// Default: auto-detect available memory.
        /// Batch size is calculated automatically based on this limit.
        #[arg(long, default_value = "auto", value_parser = parse_max_memory_arg)]
        max_memory: usize,

        /// Override automatic batch size calculation.
        /// If set, uses this fixed batch size instead of adaptive sizing.
        #[arg(short, long)]
        batch_size: Option<usize>,

        // `None` when `-o` is not given; the `-` (stdout) convention and the
        // extension-based format detection are not handled by the parser.
        /// Output file path. Format auto-detected from extension:
        /// - `.tsv` or no extension: Plain TSV
        /// - `.tsv.gz`: Gzip-compressed TSV
        /// - `.parquet`: Apache Parquet with zstd compression
        /// - `-`: stdout (TSV)
        #[arg(short, long)]
        output: Option<PathBuf>,

        /// Use parallel row group processing.
        /// Processes each row group independently in parallel, maximizing CPU utilization.
        /// Most effective when query minimizers span entire index range (row group
        /// filtering is ineffective).
        #[arg(long)]
        parallel_rg: bool,

        /// Use bloom filters for row group filtering.
        /// Reduces I/O by rejecting row groups that definitely don't contain query minimizers.
        /// Only effective if index was built with --bloom-filter.
        #[arg(long)]
        use_bloom_filter: bool,

        // NOTE(review): the declared default is 0 (disabled) and the option
        // always takes a value; the "Default: 4 when enabled" wording below
        // is not reflected in this attribute — confirm where 4 would apply.
        /// Enable parallel row group reading for Parquet input files.
        /// Processes N row groups in parallel for faster decompression.
        /// Default: 4 when enabled, 0 = disabled (sequential reading).
        /// Most effective with SSDs when decompression is CPU-bound.
        #[arg(long, default_value_t = 0)]
        parallel_input_rg: usize,

        /// Print timing diagnostics to stderr for performance analysis.
        #[arg(long)]
        timing: bool,

        /// Report only the single best hit per query.
        /// If multiple buckets tie for best score, one is chosen arbitrarily.
        #[arg(long)]
        best_hit: bool,

        // The value is checked by `validate_trim_to` at parse time; the
        // trimming and skipping behaviour described below is implemented by
        // the consumer, not here.
        /// Trim sequences to first N nucleotides before classification.
        /// Sequences shorter than N are skipped.
        ///
        /// For paired-end reads, R1 must be at least N bases (pairs with shorter R1 are skipped).
        /// R2 is trimmed to min(length, N) - a short R2 does not cause the pair to be skipped.
        ///
        /// Recommended: N >= k (k-mer size) to ensure minimizer extraction.
        /// Values smaller than k will produce no minimizers and yield no results.
        #[arg(long, value_parser = validate_trim_to)]
        trim_to: Option<usize>,

        /// Skip reads with R1 shorter than N bases.
        /// Applied before --trim-to: reads must be at least N bases to be processed.
        /// Unlike --trim-to, surviving reads are not modified (compatible with --output-sequences).
        #[arg(long, value_parser = validate_minimum_length)]
        minimum_length: Option<usize>,

        // NOTE(review): the incompatibility with --threshold is not declared
        // as a clap conflict here; presumably checked by the consumer —
        // confirm.
        /// Output wide-form matrix instead of long-form TSV.
        /// Columns: read_id, then one column per bucket (ordered by bucket_id).
        /// Each row contains scores for all buckets (0.0 if no hit).
        /// Incompatible with --threshold (all scores must be reported).
        #[arg(long)]
        wide: bool,
    },

    // `agg` is accepted as a shorthand for `aggregate`; it is declared with
    // plain `alias`, so it is not listed in the help output.
    /// Pool all reads for sample-level classification (higher sensitivity)
    #[command(alias = "agg")]
    #[command(after_help = "AGGREGATE vs RUN:
  'aggregate' pools minimizers from all reads before scoring, providing:
  - Higher sensitivity for low-abundance targets
  - Sample-level composition rather than per-read assignments
  - Reduced noise from individual read variation

  Use 'aggregate' when you want to know what's in a sample.
  Use 'run' when you need read-level assignments.

OUTPUT FORMAT:
  Tab-separated: query_name<TAB>bucket_name<TAB>score

  query_name is always 'global' since reads are aggregated.

THRESHOLD:
  Default 0.05 (lower than 'run') since aggregation reduces noise.
  Score represents fraction of total unique minimizers matching bucket.")]
    Aggregate {
        // Same `--positive-index` visible alias as `Run`.
        /// Path to target index directory (.ryxdi)
        #[arg(short, long, visible_alias = "positive-index")]
        index: PathBuf,

        /// Path to negative index for contamination filtering
        #[arg(short = 'N', long)]
        negative_index: Option<PathBuf>,

        /// Forward reads. Formats: FASTA/FASTQ (.fa/.fq, optionally .gz),
        /// Parquet (.parquet) with columns: read_id, sequence1, sequence2 (optional)
        #[arg(short = '1', long)]
        r1: PathBuf,

        /// Reverse reads for paired-end data (optional).
        /// Not supported with Parquet input - use sequence2 column instead.
        #[arg(short = '2', long)]
        r2: Option<PathBuf>,

        // Default differs from `Run` (0.05 here, 0.1 there).
        /// Minimum score threshold (default lower than 'run' since
        /// aggregation reduces noise)
        #[arg(short, long, default_value_t = 0.05)]
        threshold: f64,

        /// Maximum memory to use (e.g., "4G", "512M", "auto").
        #[arg(long, default_value = "auto", value_parser = parse_max_memory_arg)]
        max_memory: usize,

        /// Override automatic batch size calculation
        #[arg(short, long)]
        batch_size: Option<usize>,

        /// Output file path. Format auto-detected from extension:
        /// - `.tsv` or no extension: Plain TSV
        /// - `.tsv.gz`: Gzip-compressed TSV
        /// - `.parquet`: Apache Parquet with zstd compression
        /// - `-`: stdout (TSV)
        #[arg(short, long)]
        output: Option<PathBuf>,
    },

    /// Compute log10(numerator_score / denominator_score) using two single-bucket indices
    #[command(after_help = "LOG-RATIO MODE:
  Computes log10(numerator_score / denominator_score) for each read using
  two separate single-bucket indices.

  Reads are first classified against the numerator index. Reads with numerator
  score >= --numerator-skip-threshold (default 0.5) are assigned +inf without
  needing denominator classification. All other reads are classified against
  the denominator index to compute the exact log-ratio.

OUTPUT FORMAT:
  Tab-separated: read_id<TAB>log10([Num] / [Denom])<TAB>score<TAB>fast_path

  fast_path column: 'none' (exact), 'num_high' (+inf)

SCORE INTERPRETATION:
  positive    Read favors numerator
  negative    Read favors denominator
  +inf        Numerator signal with no denominator signal
  -inf        Denominator signal with no numerator signal
  NaN         No signal from either index (both scores zero)
  0.0         Equal scores — no directional evidence

SEQUENCE OUTPUT (--output-sequences):
  By default, sequences with NEGATIVE log-ratio are written (favoring denominator).
  With --passing-is-positive, sequences with POSITIVE log-ratio are written.
  In both modes:
  - Zero log-ratio (equal scores) is EXCLUDED — no directional evidence.
  - NaN reads are INCLUDED — no matches means the sequence is unresolved.

EXAMPLES:
  # Basic log-ratio with two indices
  rype classify log-ratio -n numerator.ryxdi -d denominator.ryxdi -1 reads.fq

  # Disable fast-path (classify all reads against both indices)
  rype classify log-ratio -n num.ryxdi -d denom.ryxdi -1 reads.fq --numerator-skip-threshold 1.0

  # Output sequences favoring denominator
  rype classify log-ratio -n num.ryxdi -d denom.ryxdi -1 reads.fq \\
      --output-sequences filtered.fastq.gz")]
    LogRatio {
        // The "exactly 1 bucket" requirement on both indices cannot be
        // checked by the argument parser; it is a property of the index
        // contents.
        /// Path to numerator index directory (.ryxdi). Must have exactly 1 bucket.
        #[arg(short = 'n', long)]
        numerator: PathBuf,

        /// Path to denominator index directory (.ryxdi). Must have exactly 1 bucket.
        #[arg(short = 'd', long)]
        denominator: PathBuf,

        /// Forward reads. Formats: FASTA/FASTQ (.fa/.fq, optionally .gz),
        /// Parquet (.parquet) with columns: read_id, sequence1, sequence2 (optional)
        #[arg(short = '1', long)]
        r1: PathBuf,

        /// Reverse reads for paired-end data (optional).
        /// Not supported with Parquet input - use sequence2 column instead.
        #[arg(short = '2', long)]
        r2: Option<PathBuf>,

        /// Maximum memory to use (e.g., "4G", "512M", "auto").
        #[arg(long, default_value = "auto", value_parser = parse_max_memory_arg)]
        max_memory: usize,

        /// Override automatic batch size calculation.
        #[arg(short, long)]
        batch_size: Option<usize>,

        /// Output file path. Format auto-detected from extension:
        /// - `.tsv` or no extension: Plain TSV
        /// - `.tsv.gz`: Gzip-compressed TSV
        /// - `.parquet`: Apache Parquet with zstd compression
        /// - `-`: stdout (TSV)
        #[arg(short, long)]
        output: Option<PathBuf>,

        /// Use parallel row group processing.
        #[arg(long)]
        parallel_rg: bool,

        /// Use bloom filters for row group filtering.
        #[arg(long)]
        use_bloom_filter: bool,

        /// Enable parallel row group reading for Parquet input files.
        #[arg(long, default_value_t = 0)]
        parallel_input_rg: usize,

        /// Print timing diagnostics to stderr.
        #[arg(long)]
        timing: bool,

        /// Trim sequences to first N nucleotides before classification.
        #[arg(long, value_parser = validate_trim_to)]
        trim_to: Option<usize>,

        /// Skip reads with R1 shorter than N bases.
        /// Applied before --trim-to: reads must be at least N bases to be processed.
        /// Unlike --trim-to, surviving reads are not modified (compatible with --output-sequences).
        #[arg(long, value_parser = validate_minimum_length)]
        minimum_length: Option<usize>,

        // NOTE(review): "Not supported with --trim-to" is not declared as a
        // clap conflict here; presumably checked by the consumer — confirm.
        /// Output passing sequences to gzipped FASTA/FASTQ.
        /// By default, writes sequences with NEGATIVE log-ratio (favoring denominator).
        /// Zero log-ratio (equal scores) is excluded — no directional evidence.
        /// NaN reads (no matches in either index) are always included.
        /// For paired-end: foo.fastq.gz creates foo.R1.fastq.gz and foo.R2.fastq.gz.
        /// Not supported with --trim-to.
        #[arg(long)]
        output_sequences: Option<PathBuf>,

        // `requires` refers to the field id `output_sequences`, so clap
        // rejects `--passing-is-positive` unless `--output-sequences` is
        // also given.
        /// Pass sequences with POSITIVE log-ratio (default: pass NEGATIVE).
        /// Zero log-ratio (equal scores) is excluded in both modes.
        /// Requires --output-sequences.
        #[arg(long, requires = "output_sequences")]
        passing_is_positive: bool,

        // NOTE(review): the (0.0, 1.0] range is documented but not enforced
        // at parse time here (plain `f64`, no value_parser) — confirm it is
        // checked by the consumer.
        /// Skip denominator classification for reads with numerator score >= this value.
        /// These reads are assigned +inf with fast_path=num_high. A majority of
        /// minimizers matching the numerator guarantees a positive log-ratio,
        /// so the denominator classification can be skipped.
        /// Must be between 0.0 (exclusive) and 1.0 (inclusive).
        /// Set to 1.0 to disable fast-path (classify all reads against both indices).
        #[arg(long, default_value_t = 0.5)]
        numerator_skip_threshold: f64,
    },
}

// Actions available under `rype inspect` (debugging aids).
//
// The `///` comments are rendered by clap as user-facing help text.
#[derive(Subcommand)]
pub enum InspectCommands {
    /// Show matching minimizers between queries and buckets with reference details
    Matches {
        // Help text aligned with the other commands in this file, which all
        // describe the index as a `.ryxdi` directory rather than a file.
        /// Path to index directory (.ryxdi)
        #[arg(short, long)]
        index: PathBuf,

        // Uses the same `-1` short flag as the read inputs of the classify
        // commands; the long form is `--queries`.
        /// Query sequences (FASTA/FASTQ)
        #[arg(short = '1', long)]
        queries: PathBuf,

        // Required (non-`Option`), long-only: `--ids <FILE>`.
        /// File with sequence IDs to inspect (one per line)
        #[arg(long)]
        ids: PathBuf,

        // `value_delimiter = ','` splits a single `-b 1,2,3` value into
        // separate `u32` entries. `required = true` is needed because a `Vec`
        // argument is optional by default in clap derive.
        /// Bucket IDs to check against (comma-separated)
        #[arg(short, long, value_delimiter = ',', required = true)]
        buckets: Vec<u32>,
    },
}