fxtools 0.2.39

A collection of commandline Fasta/Fastq utility tools
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
use clap::{
    builder::{
        styling::{AnsiColor, Effects},
        Styles,
    },
    Parser, Subcommand,
};

use crate::commands::csv::Delimiter;

// Clap v3-style help menu colors: bold green for section headers and the usage
// line, bold cyan for literals, and plain cyan for value placeholders.
const STYLES: Styles = {
    let bold_green = AnsiColor::Green.on_default().effects(Effects::BOLD);
    let cyan = AnsiColor::Cyan.on_default();

    Styles::styled()
        .header(bold_green)
        .usage(bold_green)
        .literal(cyan.effects(Effects::BOLD))
        .placeholder(cyan)
};

#[derive(Parser)]
#[command(styles = STYLES)]
#[clap(author, version, about, long_about = None)]
pub struct Cli {
    #[clap(subcommand)]
    pub command: Commands,

    /// Compression threads to use for output files if applicable
    #[clap(global = true, short = 'j', long)]
    pub compression_threads: Option<usize>,

    /// Compression level to use for output files if applicable
    #[clap(global = true, short = 'Z', long)]
    pub compression_level: Option<usize>,
}

#[derive(Subcommand)]
pub enum Commands {
    /// Concatenates multiple Fastx files together
    Cat {
        #[clap(short, long, value_parser, num_args=1.., required = true)]
        /// Input FASTX to concatenate
        inputs: Vec<String>,

        #[clap(short, long, value_parser)]
        /// Filepath to write output to [default: stdout]
        output: Option<String>,

        #[clap(short, long, value_parser, conflicts_with = "headers_only")]
        /// Selectively only write sequences of records
        sequence_only: bool,

        #[clap(short = 'H', long, value_parser, conflicts_with = "sequence_only")]
        /// Selectively only write headers of records
        headers_only: bool,

        /// Concatenate the sequences into a single line
        #[clap(
            short = 'S',
            long,
            value_parser,
            conflicts_with = "headers_only",
            requires = "sequence_only"
        )]
        single_line: bool,
    },

    /// Counts the number of records in a Fastx file
    Count {
        #[clap(short, long, value_parser)]
        /// Input FASTA/Q to Count
        input: Option<String>,
    },

    /// Clip nucleotide sequences between two indices
    Clip {
        #[clap(short, long, value_parser)]
        /// Input FASTA/Q to clip
        input: Option<String>,

        #[clap(short, long, value_parser)]
        /// Filepath to write output to [default: stdout]
        output: Option<String>,

        #[clap(short, long, value_parser)]
        /// Number of nucleotides from the start of the sequence to clip
        start: Option<usize>,

        #[clap(short, long, value_parser)]
        /// Number of nucleotides from the end of the sequence to clip
        end: Option<usize>,

        #[clap(short, long, value_parser, conflicts_with_all = &["start", "end"])]
        /// Range of nucleotides to accept (everything else is clipped)
        /// Format: [start]..[end]
        range: Option<String>,
    },

    /// Converts a CSV file to a FASTA file
    CsvToFasta {
        #[clap(short, long, value_parser)]
        /// Input CSV to Convert
        input: Option<String>,

        /// Filepath to write output to [default: stdout]
        /// If not provided, will write to stdout
        #[clap(short, long, value_parser)]
        output: Option<String>,

        /// Column to use as the header
        #[clap(short, long, value_parser, default_value = "sgrna")]
        header_col: String,

        /// Column to use as the sequence
        #[clap(short, long, value_parser, default_value = "sequence")]
        sequence_col: String,

        /// Delimiter used in the CSV file
        #[clap(short, long, value_parser, default_value = "comma")]
        delim: Delimiter,
    },

    /// Create all unambiguous one-off sequences for a collection of sequences
    Disambiseq {
        #[clap(short, long, value_parser)]
        /// Input FASTA/Q to disambiguate
        input: Option<String>,
        #[clap(short, long, value_parser)]
        /// Filepath to write output to [default: stdout]
        output: Option<String>,
        #[clap(short = 'p', long, value_parser, default_value = "false")]
        /// Include the original (parent) sequence in the output
        include_parents: bool,
    },

    /// Filters same length sequences to their variable region. Useful in CRISPRi/a libraries where
    /// the variable region is prefixed and suffixed by some constant region
    ExtractVariable {
        #[clap(short, long, value_parser)]
        /// Input FASTA/Q to to extract variable region
        input: String,

        #[clap(short, long, value_parser)]
        /// Filepath to write output to [default: stdout]
        output: Option<String>,

        #[clap(short, long, value_parser, default_value = "5000")]
        /// Number of samples to calculate positional entropy on
        num_samples: usize,

        #[clap(short, long, value_parser, default_value = "0.5")]
        /// Number of samples to calculate positional entropy on
        zscore_threshold: f64,
    },

    /// Filters a fastx file by searching for whether they follow a regex pattern on the sequence
    Filter {
        #[clap(short, long, value_parser)]
        /// Input FASTA/Q to fix
        input: Option<String>,

        #[clap(short, long, value_parser)]
        /// Filepath to write output to [default: stdout]
        output: Option<String>,

        #[clap(short, long, value_parser)]
        /// Regex pattern to search for
        pattern: String,

        #[clap(short = 'v', long, value_parser, default_value = "false")]
        /// Whether to invert the filter
        invert: bool,

        #[clap(short = 'H', long, value_parser, default_value = "false")]
        /// Whether to search for the pattern in the header
        header: bool,
    },

    /// Fix a fastx file by replacing invalid characters with N
    Fix {
        #[clap(short, long, value_parser)]
        /// Input FASTA/Q to fix
        input: Option<String>,

        #[clap(short, long, value_parser)]
        /// Filepath to write output to [default: stdout]
        output: Option<String>,
    },

    /// Multiplex a set of fastx files by prepending a barcode to the sequences
    Multiplex {
        #[clap(short, long, value_parser, num_args=1.., required = true)]
        /// Input FASTXs to multiplex
        input: Vec<String>,

        #[clap(short, long, value_parser)]
        /// Filepath to write output to [default: stdout]
        output: Option<String>,

        #[clap(short, long, value_parser)]
        /// Optional whitelist of barcodes to prepend generated barcodes with
        whitelist: Option<String>,

        #[clap(
            short = 'O',
            long,
            value_parser,
            default_value = "multiplex_whitelist.txt"
        )]
        /// Output whitelist of barcodes to file
        output_whitelist: String,

        #[clap(short, long, value_parser, default_value = "multiplex_log.json")]
        /// Filepath to write barcode stats to
        log: String,

        #[clap(short, long, value_parser)]
        /// The size of the barcode to prepend to the sequences (will be adjusted to minimum
        /// barcode size if too small)
        barcode_size: Option<usize>,

        #[clap(short, long, value_parser)]
        /// The random seed to use for the barcode generation
        seed: Option<u64>,

        #[clap(short, long, value_parser, default_value = "100000")]
        timeout: u64,
    },

    /// Creates the Reverse complement for a provided fastx
    Reverse {
        #[clap(short, long, value_parser)]
        /// Input FASTA/Q to Convert to Upper
        input: Option<String>,

        #[clap(short, long, value_parser)]
        /// Filepath to write output to [default: stdout]
        output: Option<String>,
    },

    /// Samples a fastx file by a frequency
    Sample {
        #[clap(short, long, value_parser)]
        /// Input FASTA/Q to sample
        input: Option<String>,

        #[clap(short, long, value_parser)]
        /// Filepath to write output to [default: stdout]
        output: Option<String>,

        #[clap(short, long, value_parser, default_value = "0.5")]
        /// Frequency to sample by
        frequency: f64,

        #[clap(short, long, value_parser)]
        /// Seed to use for sampling
        seed: Option<u64>,

        #[clap(short, long, value_parser, default_value = "false")]
        /// Don't write number of records sampled to stderr
        quiet: bool,
    },

    /// Creates a mapping of sgRNAs to their parent gene
    SgrnaTable {
        #[clap(short, long, value_parser)]
        /// Input FASTA/Q to Generate table
        input: Option<String>,

        #[clap(short, long, value_parser)]
        /// Filepath to write table to [default: stdout]
        output: Option<String>,

        #[clap(short = 's', long, action)]
        /// Whether to include the sequence in the output table [default: false]
        include_sequence: bool,

        #[clap(short, long)]
        /// Ignore TSS information in the header, default is to separate by TSS
        tss_ignore: bool,

        #[clap(short, long, value_parser)]
        /// Specify ordering of columns as 3 value string ([Hh]eader, [Ss]equence, [Gg]ene).
        /// [default: ghs]
        reorder: Option<String>,

        #[clap(short, long, value_parser)]
        /// Optional choice of output delimiter [default: '\t']
        delim: Option<char>,
    },

    /// Sorts a fastx file by sequence
    Sort {
        #[clap(short = 'i', long, value_parser)]
        /// Input FASTA/Q to sort
        r1: Option<String>,

        #[clap(short = 'I', long, value_parser)]
        /// Optional choice of R2 to sort by
        r2: Option<String>,

        #[clap(short, long, value_parser)]
        /// Prefix to write sorted files to
        /// if single-end [default: stdout]
        /// if paired-end [default: sorted]
        prefix: Option<String>,

        #[clap(short, long, value_parser, default_value = "true")]
        /// Whether to gzip the output files
        gzip: bool,

        #[clap(short, long, value_parser, default_value = "false")]
        /// Whether to sort by R1 or R2
        sort_by_r1: bool,
    },

    /// Extracts the transcript to gene mapping from an ensembl cdna fasta file
    T2g {
        #[clap(short, long, value_parser)]
        /// Input FASTA/Q to fix
        input: Option<String>,

        #[clap(short, long, value_parser)]
        /// Filepath to write output to [default: stdout]
        output: Option<String>,

        #[clap(short, long)]
        /// Whether to include the gene symbol in the output if available.
        /// Defaults to ensembl gene id
        symbol: bool,

        #[clap(short, long)]
        /// Whether to include the dot version of the transcript id
        /// Defaults to clipping the dot version
        dot_version: bool,
    },

    /// Takes exactly a number of records from an input fastx file
    Take {
        #[clap(short, long, value_parser)]
        /// Input FASTA/Q to take records from
        input: Option<String>,

        #[clap(short, long, value_parser)]
        /// Filepath to write taken records to [default: stdout]
        output: Option<String>,

        #[clap(short, long, value_parser)]
        /// Number of records to take
        num_records: usize,

        #[clap(short, long, value_parser, default_value = "0")]
        /// How many records to skip before taking the first n
        skip: usize,
    },

    /// Trims adapter sequences that are dynamically placed within the sequence.
    Trim {
        #[clap(short, long, value_parser)]
        /// Input FASTA/Q to trim sequences
        input: Option<String>,

        #[clap(short, long, value_parser)]
        /// Adapater sequence to trim
        adapter: String,

        #[clap(short, long, value_parser)]
        /// Filepath to write output to [default: stdout]
        output: Option<String>,

        #[clap(short, long, value_parser, default_value = "false")]
        /// Trim the adapter off the sequence
        trim_adapter: bool,
    },

    /// Filters the Fastx file for Unique Sequences
    Unique {
        #[clap(short, long, value_parser)]
        /// Input FASTA/Q to Filter on Unique / Duplicate Sequences
        input: Option<String>,

        #[clap(short, long, value_parser)]
        /// Filepath to write unique records to [default: stdout]
        output: Option<String>,

        #[clap(short, long, value_parser)]
        /// Filepath to write unique records to
        null: Option<String>,

        #[clap(short, long, value_parser)]
        /// Allow invalid nucleotides in output
        allow_invalid: bool,
    },

    /// Converts all lowercase nucleotides to uppercase
    /// and validates for unexpected nucleotides
    Upper {
        #[clap(short, long, value_parser)]
        /// Input FASTA/Q to Convert to Upper
        input: Option<String>,

        #[clap(short, long, value_parser)]
        /// Filepath to write output to [default: stdout]
        output: Option<String>,

        #[clap(short, long, value_parser)]
        /// Allow invalid nucleotides in output
        allow_invalid: bool,
    },
}