nanalogue_core/cli.rs
1//! # Command line interface (CLI) options, including input processing options
2//!
3//! This file provides some global options in the command line interface.
4use crate::{
5 Error, F32Bw0and1, GenomicRegion, ModChar, PathOrURLOrStdin, ReadStates,
6 RestrictModCalledStrand, ThresholdState,
7};
8use bedrs::prelude::Bed3;
9use clap::{Args, FromArgMatches};
10use derive_builder::Builder;
11use rust_htslib::bam;
12use serde::{Deserialize, Serialize};
13use std::collections::HashSet;
14use std::num::{NonZeroU32, NonZeroUsize};
15use std::str::FromStr;
16
17/// Options to parse the input bam file and the filters that should be applied to the bam file.
18///
19/// This struct is parsed to create command line arguments and then passed to many functions.
20/// We have copied and edited a similar struct from the fibertools-rs repository.
21/// You can build this through [`InputBamBuilder`].
22/// In CLI mode, `clap` populates this struct.
23/// This and the [`InputMods`] struct are used to set almost all input options
24/// to many of our functions that process BAM/modBAM files.
25///
26/// # Examples
27///
28/// We first begin with an example to build the struct.
29/// The next example shows how this struct and [`InputMods`] can be used
30/// to construct inputs to one of our BAM processing functions.
31///
32/// Sample way to build the struct. Some of the parameters are optional
33/// and can be left unset which would give them default values.
34/// We do not check if the specified bam path or URL exists as there are
35/// use cases where files are generated before the `InputBam` object is used.
36/// Not all options are listed here; for a full list, please see all the
37/// methods of the builder.
38///
39/// ```
40/// use nanalogue_core::{Error, F32Bw0and1, InputBamBuilder, PathOrURLOrStdin};
41///
42/// let bam = InputBamBuilder::default()
43/// .bam_path(PathOrURLOrStdin::Path("/some/path/to/bam.bam".into()))
44/// .min_seq_len(30000u64)
45/// .min_align_len(20000i64)
46/// .read_id("some-id")
47/// .read_filter("primary_forward,secondary_forward".into())
48/// .sample_fraction(F32Bw0and1::new(1.0).expect("no error"))
49/// .mapq_filter(20)
50/// .exclude_mapq_unavail(true)
51/// .region("chr4:1000-2000".into())
52/// .full_region(true)
53/// .build()?;
54/// # Ok::<(), Error>(())
55/// ```
56///
57/// This struct and the [`InputMods`] struct allow us to set input options
58/// for BAM/modBAM calculations. An example is shown below where the [`crate::read_info::run`]
59/// command is called to process data from a BAM file with some input options.
60///
61/// ```
62/// use nanalogue_core::{BamRcRecords, BamPreFilt, Error, InputBamBuilder, InputModsBuilder,
63/// OptionalTag, PathOrURLOrStdin, ThresholdState, nanalogue_bam_reader, read_info};
64///
65/// let mut bam = InputBamBuilder::default()
66/// .bam_path(PathOrURLOrStdin::Path("./examples/example_1.bam".into()))
67/// .region("dummyI".into())
68/// .build()?;
69/// let mut mods = InputModsBuilder::<OptionalTag>::default()
70/// .mod_prob_filter(ThresholdState::GtEq(0))
71/// .build()?;
72///
73/// let mut buffer = Vec::new();
74/// let mut reader = nanalogue_bam_reader(&bam.bam_path.to_string())?;
75/// let bam_rc_records = BamRcRecords::new(&mut reader, &mut bam, &mut mods)?;
76/// read_info::run(
77/// &mut buffer,
78/// bam_rc_records.rc_records
79/// .filter(|r| r.as_ref().map_or(true, |v| v.pre_filt(&bam))),
80/// mods,
81/// None,
82/// )?;
83/// assert!(str::from_utf8(buffer.as_slice())?
84/// .contains("5d10eb9a-aae1-4db8-8ec6-7ebb34d32575"));
85/// # Ok::<(), Error>(())
86/// ```
87///
88/// ## Examples resulting in errors
89///
90/// Full region without actually setting a region
91///
92/// ```should_panic
93/// use nanalogue_core::{Error, InputBamBuilder, PathOrURLOrStdin};
94///
95/// let bam = InputBamBuilder::default()
96/// .bam_path(PathOrURLOrStdin::Path("/some/path/to/bam.bam".into()))
97/// .read_id("some-id")
98/// .full_region(true)
99/// .build()?;
100/// # Ok::<(), Error>(())
101/// ```
102///
103/// Setting both `region` and `region_bed3`. `region` can be converted to
104/// `region_bed3` using [`GenomicRegion::try_to_bed3`] and a BAM header.
105///
106/// ```should_panic
107/// use bedrs::prelude::Bed3;
108/// use nanalogue_core::{Error, InputBamBuilder, PathOrURLOrStdin};
109///
110/// let bam = InputBamBuilder::default()
111/// .bam_path(PathOrURLOrStdin::Path("/some/path/to/bam.bam".into()))
112/// .read_id("some-id")
113/// .region("chr4:1000-2000".into())
114/// .region_bed3(Bed3::<i32,u64>::new(3, 1000, 2000))
115/// .build()?;
116/// # Ok::<(), Error>(())
117/// ```
118///
119/// Setting more than one of `read_id`, `read_id_list` and `read_id_set`.
120/// * `read_id` means filter to retain only this read.
121/// * `read_id_list` is a path to a file with a list of read ids.
122/// * `read_id_set` is a set of read ids supplied directly.
123///
124/// ```
125/// use bedrs::prelude::Bed3;
126/// use nanalogue_core::{Error, InputBamBuilder, PathOrURLOrStdin};
127/// use std::collections::HashSet;
128///
129/// let _ = InputBamBuilder::default()
130/// .bam_path(PathOrURLOrStdin::Path("/some/path/to/bam.bam".into()))
131/// .read_id("some-id")
132/// .read_id_list("/some/file.txt")
133/// .build().unwrap_err();
134///
135/// let mut read_id_set = HashSet::<String>::new();
136/// read_id_set.insert("some-read-a".to_owned());
137/// read_id_set.insert("some-read-b".to_owned());
138///
139/// let _ = InputBamBuilder::default()
140/// .bam_path(PathOrURLOrStdin::Path("/some/path/to/bam.bam".into()))
141/// .read_id_list("/some/file.txt")
142/// .read_id_set(read_id_set.clone())
143/// .build().unwrap_err();
144///
145/// let _ = InputBamBuilder::default()
146/// .bam_path(PathOrURLOrStdin::Path("/some/path/to/bam.bam".into()))
147/// .read_id("some-id")
148/// .read_id_set(read_id_set)
149/// .build().unwrap_err();
150/// ```
151#[derive(Builder, Debug, Args, Clone, Serialize, Deserialize)]
152#[serde(default)]
153#[builder(default, build_fn(error = "Error", validate = "Self::validate"))]
154#[non_exhaustive]
155pub struct InputBam {
156 /// Input BAM file. Set to a local file path, or set to - to read from stdin,
157 /// or set to a URL to read from a remote file. If using stdin and piping in
158 /// from `samtools view`, always include the header with the `-h` option.
159 pub bam_path: PathOrURLOrStdin,
160 /// Exclude reads whose sequence length in the BAM file is
161 /// below this value. Defaults to 0.
162 #[clap(long, default_value_t)]
163 #[builder(setter(into))]
164 pub min_seq_len: u64,
165 /// Exclude reads whose alignment length in the BAM file is
166 /// below this value. Defaults to unused.
167 #[clap(long)]
168 #[builder(setter(into, strip_option))]
169 pub min_align_len: Option<i64>,
170 /// Only include this read id, defaults to unused i.e. all reads are used.
171 /// NOTE: if there are multiple alignments corresponding
172 /// to this read id, all of them are used.
173 #[clap(long, conflicts_with = "read_id_list")]
174 #[builder(setter(into, strip_option))]
175 pub read_id: Option<String>,
176 /// Path to file containing list of read IDs (one per line).
177 /// Lines starting with '#' are treated as comments and ignored.
178 /// Cannot be used together with --read-id.
179 #[clap(long, conflicts_with = "read_id")]
180 #[builder(setter(into, strip_option))]
181 pub read_id_list: Option<String>,
182 /// Internal `HashSet` of read IDs loaded from `read_id_list` file.
183 /// This is populated automatically and not exposed to users.
184 #[clap(skip)]
185 #[builder(setter(into, strip_option))]
186 pub read_id_set: Option<HashSet<String>>,
187 /// Number of threads used during some aspects of program execution
188 #[clap(long, default_value_t = NonZeroU32::new(2).expect("no error"))]
189 #[builder(setter(into))]
190 pub threads: NonZeroU32,
191 /// Include "zero-length" sequences e.g. sequences with "*" in the sequence
192 /// field. By default, these sequences are excluded to avoid processing errors.
193 /// If this flag is set, these reads are included irrespective of any
194 /// minimum sequence or align length criteria the user may have set.
195 /// WARNINGS: (1) Some functions of the codebase may break or produce incorrect
196 /// results if you use this flag. (2) due to a technical reason, we need a DNA sequence
197 /// in the sequence field and cannot infer sequence length from other sources
198 /// e.g. CIGAR strings.
199 #[clap(long, default_value_t)]
200 pub include_zero_len: bool,
201 /// Only retain reads of this type. Allowed types are `primary_forward`,
202 /// `primary_reverse`, `secondary_forward`, `secondary_reverse`, `supplementary_forward`,
203 /// `supplementary_reverse` and unmapped. Specify more than one type if needed
204 /// separated by commas, in which case reads of any type in list are retained.
205 /// Defaults to retain reads of all types.
206 #[clap(long)]
207 #[builder(field(
208 ty = "String",
209 build = "(!self.read_filter.is_empty()).then(|| ReadStates::from_str(&self.read_filter)).transpose()?"
210 ))]
211 pub read_filter: Option<ReadStates>,
212 /// Subsample BAM to retain only this fraction of total number of reads,
213 /// defaults to 1.0. The sampling algorithm considers every read according
214 /// to the specified probability, so due to this, you may not always get
215 /// the same number of reads e.g. if you set `-s 0.05` in a file with 1000 reads,
216 /// you will get 50 +- sqrt(50) reads.
217 /// NOTE: a new subsample is drawn every time as the seed is not fixed.
218 /// If you want reproducibility, consider piping the output of `samtools view -s`
219 /// to our program.
220 #[clap(short, long, default_value_t = F32Bw0and1::one())]
221 pub sample_fraction: F32Bw0and1,
222 /// Exclude reads whose MAPQ (Mapping quality of position) is below this value.
223 /// Defaults to zero i.e. do not exclude any read.
224 #[clap(long, default_value_t)]
225 #[builder(setter(into))]
226 pub mapq_filter: u8,
227 /// Exclude sequences with MAPQ unavailable.
228 /// In the BAM format, a value of 255 in this column means MAPQ is unavailable.
229 /// These reads are allowed by default, set this flag to exclude.
230 #[clap(long, default_value_t)]
231 pub exclude_mapq_unavail: bool,
232 /// Only keep reads passing through this region. If a BAM index is available
233 /// with a name same as the BAM file but with the .bai suffix, the operation
234 /// of selecting such reads will be faster. If you are using standard input
235 /// as your input e.g. you are piping in the output from samtools, then
236 /// you cannot use an index as a BAM filename is not available.
237 #[clap(long)]
238 #[builder(field(
239 ty = "String",
240 build = "(!self.region.is_empty()).then(|| GenomicRegion::from_str(&self.region)).transpose()?"
241 ))]
242 pub region: Option<GenomicRegion>,
243 /// Only keep read data from this region.
244 /// This is an internal option not exposed to the user, we will set it
245 /// based on the other options that the user sets.
246 #[clap(skip)]
247 #[builder(setter(into, strip_option))]
248 pub region_bed3: Option<Bed3<i32, u64>>,
249 /// Only keep reads if they pass through the specified region in full.
250 /// Related to the input `--region`; has no effect if that is not set.
251 #[clap(long, default_value_t, requires = "region")]
252 pub full_region: bool,
253}
254
255impl InputBamBuilder {
256 /// Validate [`InputBam`] creation through the Builder method.
257 ///
258 /// [`InputBam`] can be created in many ways:
259 /// - directly populating the fields as they are public.
260 /// - through the CLI-machinery
261 /// - through the builder.
262 ///
263 /// While we cannot check a user's usage through the first method,
264 /// we can check constructions through methods 2 and 3 above.
265 /// The CLI construction checking is handled by `clap`, and the builder
266 /// checking is done through this function.
267 /// The construction checks are slightly different as what may be feasible
268 /// through a CLI and through code are different.
269 /// So we have this validate method to check construction through
270 /// method number 3 above.
271 #[expect(
272 clippy::arithmetic_side_effects,
273 reason = "1 + 1 + 1 is not gonna overflow"
274 )]
275 fn validate(&self) -> Result<(), Error> {
276 match (
277 !self.region.is_empty(),
278 self.region_bed3.is_some(),
279 self.full_region,
280 ) {
281 (false, false, Some(true)) => {
282 return Err(Error::BuilderValidation("InputBamBuilder: cannot set `full_region` without setting `region` or `region_bed3`".to_owned()));
283 }
284 (true, true, _) => {
285 return Err(Error::BuilderValidation(
286 "InputBamBuilder: cannot set both `region` and `region_bed3`".to_owned(),
287 ));
288 }
289 _ => {}
290 }
291
292 if u8::from(self.read_id.is_some())
293 + u8::from(self.read_id_list.is_some())
294 + u8::from(self.read_id_set.is_some())
295 > 1
296 {
297 Err(Error::BuilderValidation(
298 "InputBamBuilder: cannot set >1 of `read_id`, `read_id_list` and `read_id_set`"
299 .to_owned(),
300 ))
301 } else {
302 Ok(())
303 }
304 }
305}
306
307/// Implements a default class for `InputBAM`
308impl Default for InputBam {
309 fn default() -> Self {
310 InputBam {
311 bam_path: PathOrURLOrStdin::Stdin,
312 min_seq_len: 0,
313 min_align_len: None,
314 read_id: None,
315 read_id_list: None,
316 read_id_set: None,
317 threads: NonZeroU32::new(2).expect("no error"),
318 include_zero_len: false,
319 read_filter: None,
320 sample_fraction: F32Bw0and1::one(),
321 mapq_filter: 0,
322 exclude_mapq_unavail: false,
323 region: None,
324 region_bed3: None,
325 full_region: false,
326 }
327 }
328}
329
330/// This struct contains an optional modification tag
331#[derive(Debug, Default, Args, Clone, Copy, Serialize, Deserialize)]
332#[non_exhaustive]
333pub struct OptionalTag {
334 /// modified tag
335 #[clap(long)]
336 pub tag: Option<ModChar>,
337}
338
339impl FromStr for OptionalTag {
340 type Err = Error;
341
342 /// process the modification type from a string.
343 ///
344 /// ```
345 /// use nanalogue_core::{Error, ModChar, OptionalTag};
346 /// use std::str::FromStr;
347 ///
348 /// let tag = OptionalTag::from_str("m")?;
349 /// assert_eq!(tag.tag.unwrap().val(), 'm');
350 /// # Ok::<(), Error>(())
351 /// ```
352 fn from_str(mod_type: &str) -> Result<Self, Self::Err> {
353 Ok(OptionalTag {
354 tag: Some(ModChar::from_str(mod_type)?),
355 })
356 }
357}
358
359/// This struct contains a required modification tag
360#[derive(Debug, Default, Args, Clone, Copy, Serialize, Deserialize)]
361#[non_exhaustive]
362pub struct RequiredTag {
363 /// modified tag
364 #[clap(long)]
365 pub tag: ModChar,
366}
367
368impl FromStr for RequiredTag {
369 type Err = Error;
370
371 /// process the modification type from a string.
372 ///
373 /// ```
374 /// use nanalogue_core::{Error, ModChar, RequiredTag};
375 /// use std::str::FromStr;
376 ///
377 /// let tag = RequiredTag::from_str("m")?;
378 /// assert_eq!(tag.tag.val(), 'm');
379 /// # Ok::<(), Error>(())
380 /// ```
381 fn from_str(mod_type: &str) -> Result<Self, Self::Err> {
382 Ok(RequiredTag {
383 tag: ModChar::from_str(mod_type)?,
384 })
385 }
386}
387
388/// Trait that returns a modification tag
389pub trait TagState {
390 /// Returns the modification tag of the tag state in an option
391 fn tag(&self) -> Option<ModChar> {
392 unimplemented!();
393 }
394}
395
396impl TagState for OptionalTag {
397 fn tag(&self) -> Option<ModChar> {
398 self.tag
399 }
400}
401
402impl TagState for RequiredTag {
403 fn tag(&self) -> Option<ModChar> {
404 Some(self.tag)
405 }
406}
407
408/// This struct contains the options input to our
409/// modification-data functions with restrictions on data received.
410///
411/// If you want to build this, use [`InputModsBuilder`].
412/// In CLI mode, `clap` populates this struct.
413/// This and the [`InputBam`] struct are used to set almost all input options
414/// to many of our functions that process BAM/modBAM files.
415///
416/// # Examples
417///
418/// Please look at the documentation of [`InputBam`] to see a fully-worked
419/// example to set up inputs to perform a calculation.
420///
421/// Following example uses many fields and is for a [`RequiredTag`] variant.
422/// You can omit some of them depending on your use case. `mod_region` must
423/// can be converted to `region_bed3` using [`GenomicRegion::try_to_bed3`]
424/// with a suitable BAM header before this struct can be used with most of
425/// our functions. We do not force this in the builder route as we may not
426/// want to do this for some reason.
427///
428/// ```
429/// use nanalogue_core::{InputModsBuilder, Error, RequiredTag, ThresholdState};
430/// use std::str::FromStr;
431///
432/// let input_options = InputModsBuilder::default()
433/// .tag(RequiredTag::from_str("m")?)
434/// .mod_strand("bc".into())
435/// .mod_prob_filter(ThresholdState::GtEq(0))
436/// .trim_read_ends_mod(10)
437/// .base_qual_filter_mod(10)
438/// .mod_region("chr3:4000-5000".into())
439/// .build()?;
440///
441/// # Ok::<(), Error>(())
442/// ```
443///
444/// A [`RequiredTag`] variant that sets `region_bed3` instead of `mod_region`.
445/// If `mod_region` is set, then a downstream function must parse a BAM file,
446/// and then convert `mod_region` into `region_bed3` using the header.
447/// If the user is confident that they know the index number of a contig,
448/// then they can directly set `region_bed3`. In the example below, the user
449/// knows `chr3` corresponds to `4`, so they set the region directly.
450///
451/// ```
452/// use bedrs::prelude::Bed3;
453/// use nanalogue_core::{InputModsBuilder, Error, RequiredTag, ThresholdState};
454/// use std::str::FromStr;
455///
456/// let input_options = InputModsBuilder::<RequiredTag>::default()
457/// .tag(RequiredTag::from_str("m")?)
458/// .mod_strand("bc".into())
459/// .mod_prob_filter(ThresholdState::GtEq(0))
460/// .trim_read_ends_mod(10)
461/// .base_qual_filter_mod(10)
462/// .region_bed3(Bed3::<i32, u64>::new(4, 4000, 5000))
463/// .build()?;
464///
465/// # Ok::<(), Error>(())
466/// ```
467///
468/// Setting both `region_bed3` and `mod_region` should result in an error,
469/// even if they both refer to the same region. This is because setting both
470/// can result in an undefined state, so we do not allow this.
471///
472/// ```should_panic
473/// use bedrs::prelude::Bed3;
474/// use nanalogue_core::{InputModsBuilder, Error, RequiredTag, ThresholdState};
475/// use std::str::FromStr;
476///
477/// let input_options = InputModsBuilder::<RequiredTag>::default()
478/// .mod_prob_filter(ThresholdState::GtEq(0))
479/// .region_bed3(Bed3::<i32, u64>::new(4, 4000, 5000))
480/// .mod_region("chr3:4000-5000".into())
481/// .build()?;
482///
483/// # Ok::<(), Error>(())
484/// ```
485///
486/// Following example is for an [`OptionalTag`] variant with the `tag` omitted,
487/// and many other fields omitted as well, which take on a default value.
488///
489/// ```
490/// use nanalogue_core::{InputModsBuilder, Error, OptionalTag, ThresholdState};
491/// use std::str::FromStr;
492///
493/// let input_options = InputModsBuilder::<OptionalTag>::default()
494/// .mod_prob_filter(ThresholdState::GtEq(0))
495/// .build()?;
496///
497/// # Ok::<(), Error>(())
498/// ```
499#[derive(Builder, Debug, Args, Clone, Serialize, Deserialize)]
500#[builder(default, build_fn(error = "Error", validate = "Self::validate"))]
501#[serde(default)]
502#[non_exhaustive]
503pub struct InputMods<S: TagState + Args + FromArgMatches + Default> {
504 /// modified tag
505 #[clap(flatten)]
506 pub tag: S,
507 /// modified strand, set this to `bc` or `bc_comp`, meaning
508 /// on basecalled strand or its complement. Some technologies
509 /// like `PacBio` or `ONT` duplex can call mod data on both a strand
510 /// and its complementary DNA and store it in the record corresponding
511 /// to the strand, so you can use this filter to select only for
512 /// mod data on a strand or its complement. Please note that this
513 /// filter is different from selecting for forward or reverse
514 /// aligned reads using the BAM flags.
515 #[clap(long)]
516 #[builder(field(
517 ty = "String",
518 build = "(!self.mod_strand.is_empty()).then(|| RestrictModCalledStrand::from_str(&self.mod_strand)).transpose()?"
519 ))]
520 pub mod_strand: Option<RestrictModCalledStrand>,
521 /// Filter to reject mods before analysis. Specify as low,high where
522 /// both are fractions to reject modifications where the probabilities (p)
523 /// are in this range e.g. "0.4,0.6" rejects 0.4 <= p <= 0.6.
524 /// You can use this to reject 'weak' modification calls before analysis
525 /// i.e. those with probabilities close to 0.5.
526 /// NOTE: (1) Whether this filtration is applied or not, mods < 0.5
527 /// are considered unmodified and >= 0.5 are considered modified by our program.
528 /// (2) mod probabilities are stored as a number from 0-255 in the modBAM format,
529 /// so we internally convert 0.0-1.0 to 0-255. Default: reject nothing.
530 #[clap(long, value_parser=ThresholdState::from_str_ordpair_fraction, default_value = "")]
531 pub mod_prob_filter: ThresholdState,
532 /// Filter this many bp at the start and
533 /// end of a read before any mod operations.
534 /// Please note that the units here are bp and
535 /// not units of base being queried.
536 #[clap(long, default_value_t)]
537 pub trim_read_ends_mod: usize,
538 /// Exclude bases whose base quality is below
539 /// this threshold before any mod operation, defaults to 0 i.e. unused.
540 /// NOTE: (1) This step is only applied before modification operations, and
541 /// not before any other operations.
542 /// (2) No offsets such as +33 are needed here.
543 /// (3) Modifications on reads where base quality information
544 /// is not available are all rejected if this is non-zero.
545 #[clap(long, default_value_t)]
546 pub base_qual_filter_mod: u8,
547 /// Only keep modification data from this region
548 #[clap(long)]
549 #[builder(field(
550 ty = "String",
551 build = "(!self.mod_region.is_empty()).then(|| GenomicRegion::from_str(&self.mod_region)).transpose()?"
552 ))]
553 pub mod_region: Option<GenomicRegion>,
554 /// Only keep modification data from this region.
555 /// We do not expose this to the user, but infer it from
556 /// the other options set by the user.
557 /// We cannot populate this directly at the time of CLI parsing,
558 /// as we need to look at the BAM header to convert a contig name
559 /// into a numeric contig id.
560 #[clap(skip)]
561 #[builder(setter(strip_option))]
562 pub region_bed3: Option<Bed3<i32, u64>>,
563}
564
565impl<S: TagState + Args + FromArgMatches + Default> InputModsBuilder<S> {
566 /// Validate [`InputMods`] creation through the Builder method.
567 ///
568 /// [`InputMods`] can be created in many ways:
569 /// - directly populating the fields as they are public.
570 /// - through the CLI-machinery
571 /// - through the builder.
572 ///
573 /// While we cannot check a user's usage through the first method,
574 /// we can check constructions through methods 2 and 3 above.
575 /// The CLI construction checking is handled by `clap`, and the builder
576 /// checking is done through this function.
577 /// The construction checks are slightly different as what may be feasible
578 /// through a CLI and through code are different.
579 /// So we have this validate method to check construction through
580 /// method number 3 above.
581 fn validate(&self) -> Result<(), Error> {
582 // self.mod_region is a `String` for the Builder before conversion into an `Option<_>`
583 if (!self.mod_region.is_empty()) && self.region_bed3.is_some() {
584 Err(Error::BuilderValidation(
585 "cannot set `mod_region` and `region_bed3` simultaneously!".to_owned(),
586 ))
587 } else {
588 Ok(())
589 }
590 }
591}
592
593/// Implements defaults for `InputMods`
594impl<S: TagState + Args + FromArgMatches + Default> Default for InputMods<S> {
595 fn default() -> Self {
596 InputMods::<S> {
597 tag: S::default(),
598 mod_strand: None,
599 mod_prob_filter: ThresholdState::GtEq(0),
600 trim_read_ends_mod: 0,
601 base_qual_filter_mod: 0,
602 mod_region: None,
603 region_bed3: None,
604 }
605 }
606}
607
608/// Retrieves options for modification input
609pub trait InputModOptions {
610 /// retrieves tag
611 fn tag(&self) -> Option<ModChar> {
612 unimplemented!()
613 }
614 /// retrieves option to set basecalled strand or opposite in mod retrieval
615 fn mod_strand(&self) -> Option<RestrictModCalledStrand> {
616 unimplemented!()
617 }
618 /// returns probability filter
619 fn mod_prob_filter(&self) -> ThresholdState {
620 unimplemented!()
621 }
622 /// returns read end trimming
623 fn trim_read_ends_mod(&self) -> usize {
624 unimplemented!()
625 }
626 /// returns threshold for filtering base PHRED quality
627 fn base_qual_filter_mod(&self) -> u8 {
628 unimplemented!()
629 }
630}
631
632/// Retrieves options for region
633pub trait InputRegionOptions {
634 /// returns region requested
635 fn region_filter(&self) -> &Option<Bed3<i32, u64>> {
636 unimplemented!()
637 }
638 /// returns region requested but region in genomic string format
639 fn region_filter_genomic_string(&self) -> Option<GenomicRegion> {
640 unimplemented!()
641 }
642 /// sets region requested
643 fn set_region_filter(&mut self, _value: Option<Bed3<i32, u64>>) {
644 unimplemented!()
645 }
646 /// returns true if full overlap with region is requested as opposed to
647 /// only partial overlap. defaults to false.
648 fn is_full_overlap(&self) -> bool {
649 false
650 }
651 /// converts region from genomic string representation to bed3 representation
652 ///
653 /// # Errors
654 /// Returns error if conversion fails
655 fn convert_region_to_bed3(&mut self, header: bam::HeaderView) -> Result<(), Error> {
656 match (
657 self.region_filter_genomic_string().is_some(),
658 self.region_filter().is_some(),
659 ) {
660 (false, false) => self.set_region_filter(None),
661 (true, false) => {
662 let genomic_region = self
663 .region_filter_genomic_string()
664 .expect("checked above that this is Some");
665 self.set_region_filter(Some(genomic_region.try_to_bed3(&header)?));
666 }
667 (false, true) => {}
668 (true, true) => {
669 return Err(Error::InvalidState(
670 "cannot set a region as both a `GenomicRegion` and a `Bed3`".to_owned(),
671 ));
672 }
673 }
674 Ok(())
675 }
676}
677
678impl<S: TagState + Args + FromArgMatches + Default> InputModOptions for InputMods<S> {
679 fn tag(&self) -> Option<ModChar> {
680 self.tag.tag()
681 }
682 fn mod_strand(&self) -> Option<RestrictModCalledStrand> {
683 self.mod_strand
684 }
685 fn mod_prob_filter(&self) -> ThresholdState {
686 self.mod_prob_filter
687 }
688 fn trim_read_ends_mod(&self) -> usize {
689 self.trim_read_ends_mod
690 }
691 fn base_qual_filter_mod(&self) -> u8 {
692 self.base_qual_filter_mod
693 }
694}
695
696impl<S: TagState + Args + FromArgMatches + Default> InputRegionOptions for InputMods<S> {
697 fn region_filter_genomic_string(&self) -> Option<GenomicRegion> {
698 self.mod_region.clone()
699 }
700 fn region_filter(&self) -> &Option<Bed3<i32, u64>> {
701 &self.region_bed3
702 }
703 fn set_region_filter(&mut self, value: Option<Bed3<i32, u64>>) {
704 self.region_bed3 = value;
705 }
706}
707
708/// can return tag without encasing in an option in the `RequiredTag` variant
709impl InputMods<RequiredTag> {
710 /// retrieves tag
711 #[must_use]
712 pub fn required_tag(&self) -> ModChar {
713 self.tag.tag
714 }
715}
716
717impl InputRegionOptions for InputBam {
718 fn region_filter_genomic_string(&self) -> Option<GenomicRegion> {
719 self.region.clone()
720 }
721 fn region_filter(&self) -> &Option<Bed3<i32, u64>> {
722 &self.region_bed3
723 }
724 fn set_region_filter(&mut self, value: Option<Bed3<i32, u64>>) {
725 self.region_bed3 = value;
726 }
727 fn is_full_overlap(&self) -> bool {
728 self.full_region
729 }
730}
731
732/// This struct contains the options input to our
733/// modification-data-windowing functions
734///
735/// # Example
736/// ```
737/// use nanalogue_core::{Error, InputWindowingBuilder};
738///
739/// let window = InputWindowingBuilder::default()
740/// .win(20)
741/// .step(10).build()?;
742/// # Ok::<(), Error>(())
743/// ```
744#[derive(Builder, Debug, Args, Clone, Copy, Serialize, Deserialize)]
745#[builder(default, build_fn(error = "Error"))]
746#[serde(default)]
747#[non_exhaustive]
748pub struct InputWindowing {
749 /// size of window in units of base being queried i.e.
750 /// if you are looking for cytosine modifications, then
751 /// a window of a value 300 means create windows each with
752 /// 300 cytosines irrespective of their modification status.
753 #[clap(long)]
754 #[builder(field(ty = "usize", build = "NonZeroUsize::try_from(self.win)?"))]
755 pub win: NonZeroUsize,
756 /// step window by this size in units of base being queried.
757 #[clap(long)]
758 #[builder(field(ty = "usize", build = "NonZeroUsize::try_from(self.step)?"))]
759 pub step: NonZeroUsize,
760}
761
762/// Implements a default for `InputWindowing`.
763/// NOTE the defaults of 1 for each are just for ease of programming.
764/// We do not expose these defaults to the command-line user.
765impl Default for InputWindowing {
766 fn default() -> Self {
767 InputWindowing {
768 win: NonZeroUsize::new(1).expect("no error"),
769 step: NonZeroUsize::new(1).expect("no error"),
770 }
771 }
772}
773
774/// This enum contains inputs to display sequences with various options
775#[derive(Debug, Clone, Default, Copy, Serialize, Deserialize)]
776#[non_exhaustive]
777pub enum SeqDisplayOptions {
778 /// Do not display any sequence, default
779 #[default]
780 No,
781 /// Display the entire sequence without any subsetting
782 Full {
783 /// Option to display basecalling qualities
784 show_base_qual: bool,
785 },
786 /// Display the sequence within a region
787 Region {
788 /// Option to display basecalling qualities
789 show_base_qual: bool,
790 /// Option to show bases in lowercase if they are insertions
791 show_ins_lowercase: bool,
792 /// Option to show modified bases as Z (or z depending on other options)
793 show_mod_z: bool,
794 /// Region over which sequence must be shown
795 region: Bed3<i32, u64>,
796 },
797}
798
799#[cfg(test)]
800mod tag_struct_tests {
801 use super::*;
802
803 #[test]
804 fn optional_tag_default() {
805 let optional_tag = OptionalTag::default();
806 assert!(optional_tag.tag.is_none());
807 assert_eq!(optional_tag.tag(), None);
808 }
809
810 #[test]
811 fn optional_tag_with_value() {
812 let mod_char = ModChar::new('m');
813 let optional_tag = OptionalTag {
814 tag: Some(mod_char),
815 };
816 assert_eq!(optional_tag.tag(), Some(mod_char));
817 }
818
819 #[test]
820 fn required_tag_default() {
821 let required_tag = RequiredTag::default();
822 assert_eq!(required_tag.tag(), Some(ModChar::default()));
823 }
824
825 #[test]
826 fn required_tag_with_value() {
827 let mod_char = ModChar::new('C');
828 let required_tag = RequiredTag { tag: mod_char };
829 assert_eq!(required_tag.tag(), Some(mod_char));
830 }
831
832 #[test]
833 fn optional_tag_from_str() {
834 let tag = OptionalTag::from_str("m").unwrap();
835 assert_eq!(tag.tag.unwrap().val(), 'm');
836 }
837
838 #[test]
839 fn required_tag_from_str() {
840 let tag = RequiredTag::from_str("m").unwrap();
841 assert_eq!(tag.tag.val(), 'm');
842 }
843}
844
845#[cfg(test)]
846mod input_windowing_tests {
847 use super::*;
848
849 #[test]
850 fn input_windowing_default() {
851 let windowing = InputWindowing::default();
852 assert_eq!(windowing.win, NonZeroUsize::new(1).unwrap());
853 assert_eq!(windowing.step, NonZeroUsize::new(1).unwrap());
854 }
855
856 #[test]
857 fn input_windowing_custom_values() {
858 let windowing = InputWindowing {
859 win: NonZeroUsize::new(300).unwrap(),
860 step: NonZeroUsize::new(150).unwrap(),
861 };
862 assert_eq!(windowing.win, NonZeroUsize::new(300).unwrap());
863 assert_eq!(windowing.step, NonZeroUsize::new(150).unwrap());
864 }
865
866 #[test]
867 fn input_windowing_builder() {
868 let _: InputWindowing = InputWindowingBuilder::default()
869 .win(20)
870 .step(10)
871 .build()
872 .unwrap();
873 }
874}
875
876#[cfg(test)]
877mod input_mods_required_tag_tests {
878 use super::*;
879
880 #[test]
881 fn input_mods_required_tag_fn_tag() {
882 let mod_char = ModChar::new('C');
883 let input_mods = InputMods::<RequiredTag> {
884 tag: RequiredTag { tag: mod_char },
885 mod_strand: None,
886 mod_prob_filter: ThresholdState::GtEq(0),
887 trim_read_ends_mod: 0,
888 base_qual_filter_mod: 0,
889 mod_region: None,
890 region_bed3: None,
891 };
892
893 assert_eq!(input_mods.required_tag(), mod_char);
894 }
895
896 #[test]
897 fn input_mods_builder_required_tag_with_mod_region() {
898 let _: InputMods<RequiredTag> = InputModsBuilder::default()
899 .tag(RequiredTag::from_str("m").unwrap())
900 .mod_strand("bc".into())
901 .mod_prob_filter(ThresholdState::GtEq(0))
902 .trim_read_ends_mod(10)
903 .base_qual_filter_mod(10)
904 .mod_region("chr3:4000-5000".into())
905 .build()
906 .unwrap();
907 }
908
909 #[test]
910 fn input_mods_builder_required_tag_with_region_bed3() {
911 let _: InputMods<RequiredTag> = InputModsBuilder::<RequiredTag>::default()
912 .tag(RequiredTag::from_str("m").unwrap())
913 .mod_strand("bc".into())
914 .mod_prob_filter(ThresholdState::GtEq(0))
915 .trim_read_ends_mod(10)
916 .base_qual_filter_mod(10)
917 .region_bed3(Bed3::<i32, u64>::new(4, 4000, 5000))
918 .build()
919 .unwrap();
920 }
921
922 #[test]
923 fn input_mods_builder_optional_tag_minimal() {
924 let _: InputMods<OptionalTag> = InputModsBuilder::<OptionalTag>::default()
925 .mod_prob_filter(ThresholdState::GtEq(0))
926 .build()
927 .unwrap();
928 }
929}
930
931#[cfg(test)]
932mod input_bam_tests {
933 use super::*;
934 use bedrs::Coordinates as _;
935 use indoc::indoc;
936
937 #[test]
938 fn input_bam_is_full_overlap() {
939 // Test default (false)
940 let input_bam_default = InputBam::default();
941 assert!(!input_bam_default.is_full_overlap());
942
943 // Test explicit false
944 let input_bam_false = InputBam {
945 full_region: false,
946 ..Default::default()
947 };
948 assert!(!input_bam_false.is_full_overlap());
949
950 // Test true
951 let input_bam_true = InputBam {
952 full_region: true,
953 ..Default::default()
954 };
955 assert!(input_bam_true.is_full_overlap());
956 }
957
958 #[test]
959 fn input_bam_builder() {
960 let _: InputBam = InputBamBuilder::default()
961 .bam_path(PathOrURLOrStdin::Path("/some/path/to/bam.bam".into()))
962 .min_seq_len(30000u64)
963 .min_align_len(20000i64)
964 .read_id("some-id")
965 .read_filter("primary_forward,secondary_forward".into())
966 .sample_fraction(F32Bw0and1::one())
967 .mapq_filter(20)
968 .exclude_mapq_unavail(true)
969 .region("chr4:1000-2000".into())
970 .full_region(true)
971 .build()
972 .unwrap();
973 }
974
975 #[test]
976 fn input_bam_convert_region_to_bed3_none() {
977 let mut input_bam = InputBam::default();
978
979 let header_view = bam::HeaderView::from_bytes(indoc! {b"@HD\tVN:1.6\tSO:coordinate
980 @SQ\tSN:chr1\tLN:248956422\n"});
981
982 input_bam.convert_region_to_bed3(header_view).unwrap();
983 assert!(input_bam.region_bed3.is_none());
984 }
985
986 #[test]
987 fn input_bam_convert_region_to_bed3_with_region() {
988 let mut input_bam = InputBam {
989 region: Some(GenomicRegion::from_str("chr2:3400-3600").unwrap()),
990 ..Default::default()
991 };
992
993 let header_view = bam::HeaderView::from_bytes(indoc! {b"@HD\tVN:1.6\tSO:coordinate
994 @SQ\tSN:chr1\tLN:3000
995 @SQ\tSN:chr2\tLN:4000\n"});
996
997 input_bam.convert_region_to_bed3(header_view).unwrap();
998 assert!(input_bam.region_bed3.is_some());
999
1000 let bed3 = input_bam.region_bed3.unwrap();
1001 assert_eq!(bed3.chr(), &1);
1002 assert_eq!(bed3.start(), 3400);
1003 assert_eq!(bed3.end(), 3600);
1004 }
1005
1006 #[test]
1007 #[should_panic(expected = "InvalidRegion")]
1008 fn input_bam_convert_region_to_bed3_invalid_region() {
1009 let mut input_bam = InputBam {
1010 region: Some(GenomicRegion::from_str("chr2:4400-4600").expect("no error")),
1011 ..Default::default()
1012 };
1013 let header_view = bam::HeaderView::from_bytes(indoc! {b"@HD\tVN:1.6\tSO:coordinate
1014 @SQ\tSN:chr1\tLN:3000
1015 @SQ\tSN:chr2\tLN:4000\n"});
1016
1017 input_bam.convert_region_to_bed3(header_view).unwrap();
1018 }
1019
1020 #[test]
1021 #[should_panic(expected = "InvalidRegion")]
1022 fn input_bam_convert_region_to_bed3_invalid_open_ended_region() {
1023 let mut input_bam = InputBam {
1024 region: Some(GenomicRegion::from_str("chr2:4600-").expect("no error")),
1025 ..Default::default()
1026 };
1027 let header_view = bam::HeaderView::from_bytes(indoc! {b"@HD\tVN:1.6\tSO:coordinate
1028 @SQ\tSN:chr1\tLN:3000
1029 @SQ\tSN:chr2\tLN:4000\n"});
1030
1031 input_bam.convert_region_to_bed3(header_view).unwrap();
1032 }
1033
1034 #[test]
1035 #[should_panic(expected = "InvalidAlignCoords")]
1036 fn input_bam_convert_region_to_bed3_invalid_contig() {
1037 let mut input_bam = InputBam {
1038 region: Some(GenomicRegion::from_str("chr3:1000-2000").expect("no error")),
1039 ..Default::default()
1040 };
1041 let header_view = bam::HeaderView::from_bytes(indoc! {b"@HD\tVN:1.6\tSO:coordinate
1042 @SQ\tSN:chr1\tLN:3000
1043 @SQ\tSN:chr2\tLN:4000\n"});
1044
1045 input_bam.convert_region_to_bed3(header_view).unwrap();
1046 }
1047
1048 #[test]
1049 fn input_bam_convert_region_to_bed3_already_set_bed3() {
1050 // When region_bed3 is already set and region is None, should not convert
1051 let mut input_bam = InputBam {
1052 region: None,
1053 region_bed3: Some(Bed3::<i32, u64>::new(1, 1000, 2000)),
1054 ..Default::default()
1055 };
1056
1057 let header_view = bam::HeaderView::from_bytes(indoc! {b"@HD\tVN:1.6\tSO:coordinate
1058 @SQ\tSN:chr1\tLN:3000
1059 @SQ\tSN:chr2\tLN:4000\n"});
1060
1061 input_bam.convert_region_to_bed3(header_view).unwrap();
1062
1063 // Bed3 should remain unchanged
1064 assert!(input_bam.region_bed3.is_some());
1065 let bed3 = input_bam.region_bed3.unwrap();
1066 assert_eq!(bed3.chr(), &1);
1067 assert_eq!(bed3.start(), 1000);
1068 assert_eq!(bed3.end(), 2000);
1069 }
1070
1071 #[test]
1072 #[should_panic(expected = "InvalidState")]
1073 fn input_bam_convert_region_to_bed3_both_set() {
1074 // When both region and region_bed3 are set, should error
1075 let mut input_bam = InputBam {
1076 region: Some(GenomicRegion::from_str("chr2:1500-2500").unwrap()),
1077 region_bed3: Some(Bed3::<i32, u64>::new(1, 1000, 2000)),
1078 ..Default::default()
1079 };
1080
1081 let header_view = bam::HeaderView::from_bytes(indoc! {b"@HD\tVN:1.6\tSO:coordinate
1082 @SQ\tSN:chr1\tLN:3000
1083 @SQ\tSN:chr2\tLN:4000\n"});
1084
1085 input_bam.convert_region_to_bed3(header_view).unwrap();
1086 }
1087}
1088
1089#[cfg(test)]
1090mod validate_builder_functions {
1091 use super::*;
1092
1093 // Tests for InputBamBuilder::validate conflicts
1094
1095 #[test]
1096 #[should_panic(expected = "BuilderValidation")]
1097 fn input_bam_builder_full_region_without_region_fails() {
1098 let _: InputBam = InputBamBuilder::default()
1099 .bam_path(PathOrURLOrStdin::Path("/some/path.bam".into()))
1100 .full_region(true)
1101 .build()
1102 .unwrap();
1103 }
1104
1105 #[test]
1106 #[should_panic(expected = "BuilderValidation")]
1107 fn input_bam_builder_both_region_and_region_bed3_fails() {
1108 let _: InputBam = InputBamBuilder::default()
1109 .bam_path(PathOrURLOrStdin::Path("/some/path.bam".into()))
1110 .region("chr1:1000-2000".into())
1111 .region_bed3(Bed3::<i32, u64>::new(0, 1000, 2000))
1112 .build()
1113 .unwrap();
1114 }
1115
1116 #[test]
1117 #[should_panic(expected = "BuilderValidation")]
1118 fn input_bam_builder_read_id_and_read_id_list_fails() {
1119 let _: InputBam = InputBamBuilder::default()
1120 .bam_path(PathOrURLOrStdin::Path("/some/path.bam".into()))
1121 .read_id("some-id")
1122 .read_id_list("/some/file.txt")
1123 .build()
1124 .unwrap();
1125 }
1126
1127 #[test]
1128 #[should_panic(expected = "BuilderValidation")]
1129 fn input_bam_builder_read_id_and_read_id_set_fails() {
1130 let mut read_ids = HashSet::new();
1131 let _: bool = read_ids.insert("read1".to_owned());
1132 let _: InputBam = InputBamBuilder::default()
1133 .bam_path(PathOrURLOrStdin::Path("/some/path.bam".into()))
1134 .read_id("some-id")
1135 .read_id_set(read_ids)
1136 .build()
1137 .unwrap();
1138 }
1139
1140 #[test]
1141 #[should_panic(expected = "BuilderValidation")]
1142 fn input_bam_builder_read_id_list_and_read_id_set_fails() {
1143 let mut read_ids = HashSet::new();
1144 let _: bool = read_ids.insert("read1".to_owned());
1145 let _: InputBam = InputBamBuilder::default()
1146 .bam_path(PathOrURLOrStdin::Path("/some/path.bam".into()))
1147 .read_id_list("/some/file.txt")
1148 .read_id_set(read_ids)
1149 .build()
1150 .unwrap();
1151 }
1152
1153 #[test]
1154 #[should_panic(expected = "BuilderValidation")]
1155 fn input_bam_builder_all_three_read_id_options_fails() {
1156 let mut read_ids = HashSet::new();
1157 let _: bool = read_ids.insert("read1".to_owned());
1158 let _: InputBam = InputBamBuilder::default()
1159 .bam_path(PathOrURLOrStdin::Path("/some/path.bam".into()))
1160 .read_id("some-id")
1161 .read_id_list("/some/file.txt")
1162 .read_id_set(read_ids)
1163 .build()
1164 .unwrap();
1165 }
1166
1167 // Tests for InputModsBuilder::validate conflicts
1168
1169 #[test]
1170 #[should_panic(expected = "BuilderValidation")]
1171 fn input_mods_builder_both_mod_region_and_region_bed3_fails() {
1172 let _: InputMods<OptionalTag> = InputModsBuilder::<OptionalTag>::default()
1173 .mod_prob_filter(ThresholdState::GtEq(0))
1174 .mod_region("chr1:1000-2000".into())
1175 .region_bed3(Bed3::<i32, u64>::new(0, 1000, 2000))
1176 .build()
1177 .unwrap();
1178 }
1179
1180 #[test]
1181 #[should_panic(expected = "BuilderValidation")]
1182 fn input_mods_builder_required_tag_both_regions_fails() {
1183 let _: InputMods<RequiredTag> = InputModsBuilder::<RequiredTag>::default()
1184 .tag(RequiredTag::from_str("m").unwrap())
1185 .mod_prob_filter(ThresholdState::GtEq(0))
1186 .mod_region("chr1:1000-2000".into())
1187 .region_bed3(Bed3::<i32, u64>::new(0, 1000, 2000))
1188 .build()
1189 .unwrap();
1190 }
1191}