Skip to main content

nanalogue_core/
cli.rs

1//! # Command line interface (CLI) options, including input processing options
2//!
3//! This file provides some global options in the command line interface.
4use crate::{
5    Error, F32Bw0and1, GenomicRegion, ModChar, PathOrURLOrStdin, ReadStates,
6    RestrictModCalledStrand, ThresholdState,
7};
8use bedrs::prelude::Bed3;
9use clap::{Args, FromArgMatches};
10use derive_builder::Builder;
11use rust_htslib::bam;
12use serde::{Deserialize, Serialize};
13use std::collections::HashSet;
14use std::num::{NonZeroU32, NonZeroUsize};
15use std::str::FromStr;
16
17/// Options to parse the input bam file and the filters that should be applied to the bam file.
18///
19/// This struct is parsed to create command line arguments and then passed to many functions.
20/// We have copied and edited a similar struct from the fibertools-rs repository.
21/// You can build this through [`InputBamBuilder`].
22/// In CLI mode, `clap` populates this struct.
23/// This and the [`InputMods`] struct are used to set almost all input options
24/// to many of our functions that process BAM/modBAM files.
25///
26/// # Examples
27///
28/// We first begin with an example to build the struct.
29/// The next example shows how this struct and [`InputMods`] can be used
30/// to construct inputs to one of our BAM processing functions.
31///
32/// Sample way to build the struct. Some of the parameters are optional
33/// and can be left unset which would give them default values.
34/// We do not check if the specified bam path or URL exists as there are
35/// use cases where files are generated before the `InputBam` object is used.
36/// Not all options are listed here; for a full list, please see all the
37/// methods of the builder.
38///
39/// ```
40/// use nanalogue_core::{Error, F32Bw0and1, InputBamBuilder, PathOrURLOrStdin};
41///
42/// let bam = InputBamBuilder::default()
43///     .bam_path(PathOrURLOrStdin::Path("/some/path/to/bam.bam".into()))
44///     .min_seq_len(30000u64)
45///     .min_align_len(20000i64)
46///     .read_id("some-id")
47///     .read_filter("primary_forward,secondary_forward".into())
48///     .sample_fraction(F32Bw0and1::new(1.0).expect("no error"))
49///     .sample_seed(42u64)
50///     .mapq_filter(20)
51///     .exclude_mapq_unavail(true)
52///     .region("chr4:1000-2000".into())
53///     .full_region(true)
54///     .build()?;
55/// # Ok::<(), Error>(())
56/// ```
57///
58/// This struct and the [`InputMods`] struct allow us to set input options
59/// for BAM/modBAM calculations. An example is shown below where the [`crate::read_info::run`]
60/// command is called to process data from a BAM file with some input options.
61///
62/// ```
63/// use nanalogue_core::{BamRcRecords, BamPreFilt, Error, InputBamBuilder, InputModsBuilder,
64///     OptionalTag, PathOrURLOrStdin, ThresholdState, nanalogue_bam_reader, read_info};
65///
66/// let mut bam = InputBamBuilder::default()
67///     .bam_path(PathOrURLOrStdin::Path("./examples/example_1.bam".into()))
68///     .region("dummyI".into())
69///     .build()?;
70/// let mut mods = InputModsBuilder::<OptionalTag>::default()
71///     .mod_prob_filter(ThresholdState::GtEq(0))
72///     .build()?;
73///
74/// let mut buffer = Vec::new();
75/// let mut reader = nanalogue_bam_reader(&bam.bam_path.to_string())?;
76/// let bam_rc_records = BamRcRecords::new(&mut reader, &mut bam, &mut mods)?;
77/// read_info::run(
78///     &mut buffer,
79///     bam_rc_records.rc_records
80///         .filter(|r| r.as_ref().map_or(true, |v| v.pre_filt(&bam))),
81///     mods,
82///     None,
83/// )?;
84/// assert!(str::from_utf8(buffer.as_slice())?
85///     .contains("5d10eb9a-aae1-4db8-8ec6-7ebb34d32575"));
86/// # Ok::<(), Error>(())
87/// ```
88///
89/// ## Examples resulting in errors
90///
91/// Full region without actually setting a region
92///
93/// ```should_panic
94/// use nanalogue_core::{Error, InputBamBuilder, PathOrURLOrStdin};
95///
96/// let bam = InputBamBuilder::default()
97///     .bam_path(PathOrURLOrStdin::Path("/some/path/to/bam.bam".into()))
98///     .read_id("some-id")
99///     .full_region(true)
100///     .build()?;
101/// # Ok::<(), Error>(())
102/// ```
103///
104/// Setting both `region` and `region_bed3`. `region` can be converted to
105/// `region_bed3` using [`GenomicRegion::try_to_bed3`] and a BAM header.
106///
107/// ```should_panic
108/// use bedrs::prelude::Bed3;
109/// use nanalogue_core::{Error, InputBamBuilder, PathOrURLOrStdin};
110///
111/// let bam = InputBamBuilder::default()
112///     .bam_path(PathOrURLOrStdin::Path("/some/path/to/bam.bam".into()))
113///     .read_id("some-id")
114///     .region("chr4:1000-2000".into())
115///     .region_bed3(Bed3::<i32,u64>::new(3, 1000, 2000))
116///     .build()?;
117/// # Ok::<(), Error>(())
118/// ```
119///
120/// Setting more than one of `read_id`, `read_id_list` and `read_id_set`.
121/// * `read_id` means filter to retain only this read.
122/// * `read_id_list` is a path to a file with a list of read ids.
123/// * `read_id_set` is a set of read ids supplied directly.
124///
125/// ```
126/// use bedrs::prelude::Bed3;
127/// use nanalogue_core::{Error, InputBamBuilder, PathOrURLOrStdin};
128/// use std::collections::HashSet;
129///
130/// let _ = InputBamBuilder::default()
131///     .bam_path(PathOrURLOrStdin::Path("/some/path/to/bam.bam".into()))
132///     .read_id("some-id")
133///     .read_id_list("/some/file.txt")
134///     .build().unwrap_err();
135///
136/// let mut read_id_set = HashSet::<String>::new();
137/// read_id_set.insert("some-read-a".to_owned());
138/// read_id_set.insert("some-read-b".to_owned());
139///
140/// let _ = InputBamBuilder::default()
141///     .bam_path(PathOrURLOrStdin::Path("/some/path/to/bam.bam".into()))
142///     .read_id_list("/some/file.txt")
143///     .read_id_set(read_id_set.clone())
144///     .build().unwrap_err();
145///
146/// let _ = InputBamBuilder::default()
147///     .bam_path(PathOrURLOrStdin::Path("/some/path/to/bam.bam".into()))
148///     .read_id("some-id")
149///     .read_id_set(read_id_set)
150///     .build().unwrap_err();
151/// ```
152#[derive(Builder, Debug, Args, Clone, Serialize, Deserialize)]
153#[serde(default)]
154#[builder(default, build_fn(error = "Error", validate = "Self::validate"))]
155#[non_exhaustive]
156pub struct InputBam {
157    /// Input BAM file. Set to a local file path, or set to - to read from stdin,
158    /// or set to a URL to read from a remote file. If using stdin and piping in
159    /// from `samtools view`, always include the header with the `-h` option.
160    pub bam_path: PathOrURLOrStdin,
161    /// Exclude reads whose sequence length in the BAM file is
162    /// below this value. Defaults to 0.
163    #[clap(long, default_value_t)]
164    #[builder(setter(into))]
165    pub min_seq_len: u64,
166    /// Exclude reads whose alignment length in the BAM file is
167    /// below this value. Defaults to unused.
168    #[clap(long)]
169    #[builder(setter(into, strip_option))]
170    pub min_align_len: Option<i64>,
171    /// Only include this read id, defaults to unused i.e. all reads are used.
172    /// NOTE: if there are multiple alignments corresponding
173    /// to this read id, all of them are used.
174    #[clap(long, conflicts_with = "read_id_list")]
175    #[builder(setter(into, strip_option))]
176    pub read_id: Option<String>,
177    /// Path to file containing list of read IDs (one per line).
178    /// Lines starting with '#' are treated as comments and ignored.
179    /// Cannot be used together with --read-id.
180    #[clap(long, conflicts_with = "read_id")]
181    #[builder(setter(into, strip_option))]
182    pub read_id_list: Option<String>,
183    /// Internal `HashSet` of read IDs loaded from `read_id_list` file.
184    /// This is populated automatically and not exposed to users.
185    #[clap(skip)]
186    #[builder(setter(into, strip_option))]
187    pub read_id_set: Option<HashSet<String>>,
188    /// Number of threads used during some aspects of program execution
189    #[clap(long, default_value_t = NonZeroU32::new(2).expect("no error"))]
190    #[builder(setter(into))]
191    pub threads: NonZeroU32,
192    /// Include "zero-length" sequences e.g. sequences with "*" in the sequence
193    /// field. By default, these sequences are excluded to avoid processing errors.
194    /// If this flag is set, these reads are included irrespective of any
195    /// minimum sequence or align length criteria the user may have set.
196    /// WARNINGS: (1) Some functions of the codebase may break or produce incorrect
197    /// results if you use this flag. (2) due to a technical reason, we need a DNA sequence
198    /// in the sequence field and cannot infer sequence length from other sources
199    /// e.g. CIGAR strings.
200    #[clap(long, default_value_t)]
201    pub include_zero_len: bool,
202    /// Only retain reads of this type. Allowed types are `primary_forward`,
203    /// `primary_reverse`, `secondary_forward`, `secondary_reverse`, `supplementary_forward`,
204    /// `supplementary_reverse` and unmapped. Specify more than one type if needed
205    /// separated by commas, in which case reads of any type in list are retained.
206    /// Defaults to retain reads of all types.
207    #[clap(long)]
208    #[builder(field(
209        ty = "String",
210        build = "(!self.read_filter.is_empty()).then(|| ReadStates::from_str(&self.read_filter)).transpose()?"
211    ))]
212    pub read_filter: Option<ReadStates>,
213    /// Subsample BAM to retain only this fraction of total number of reads,
214    /// defaults to 1.0. The sampling algorithm considers every read according
215    /// to the specified probability, so due to this, you may not always get
216    /// the same number of reads e.g. if you set `-s 0.05` in a file with 1000 reads,
217    /// you will get 50 +- sqrt(50) reads.
218    /// By default, a new subsample is drawn every time as the seed is not fixed.
219    /// Set `--sample-seed` to get reproducible subsampling.
220    #[clap(short, long, default_value_t = F32Bw0and1::one())]
221    pub sample_fraction: F32Bw0and1,
222    /// Seed for reproducible subsampling. When set, the subsampling decision
223    /// for each read is deterministic based on the read name and the seed.
224    /// Different seeds produce different subsets. If not set, subsampling
225    /// is random and non-reproducible (the default behavior).
226    #[clap(long)]
227    #[builder(setter(into, strip_option))]
228    pub sample_seed: Option<u64>,
229    /// Exclude reads whose MAPQ (Mapping quality of position) is below this value.
230    /// Defaults to zero i.e. do not exclude any read.
231    #[clap(long, default_value_t)]
232    #[builder(setter(into))]
233    pub mapq_filter: u8,
234    /// Exclude sequences with MAPQ unavailable.
235    /// In the BAM format, a value of 255 in this column means MAPQ is unavailable.
236    /// These reads are allowed by default, set this flag to exclude.
237    #[clap(long, default_value_t)]
238    pub exclude_mapq_unavail: bool,
239    /// Only keep reads passing through this region. If a BAM index is available
240    /// with a name same as the BAM file but with the .bai suffix, the operation
241    /// of selecting such reads will be faster. If you are using standard input
242    /// as your input e.g. you are piping in the output from samtools, then
243    /// you cannot use an index as a BAM filename is not available.
244    #[clap(long)]
245    #[builder(field(
246        ty = "String",
247        build = "(!self.region.is_empty()).then(|| GenomicRegion::from_str(&self.region)).transpose()?"
248    ))]
249    pub region: Option<GenomicRegion>,
250    /// Only keep read data from this region.
251    /// This is an internal option not exposed to the user, we will set it
252    /// based on the other options that the user sets.
253    #[clap(skip)]
254    #[builder(setter(into, strip_option))]
255    pub region_bed3: Option<Bed3<i32, u64>>,
256    /// Only keep reads if they pass through the specified region in full.
257    /// Related to the input `--region`; has no effect if that is not set.
258    #[clap(long, default_value_t, requires = "region")]
259    pub full_region: bool,
260}
261
262impl InputBamBuilder {
263    /// Validate [`InputBam`] creation through the Builder method.
264    ///
265    /// [`InputBam`] can be created in many ways:
266    /// - directly populating the fields as they are public.
267    /// - through the CLI-machinery
268    /// - through the builder.
269    ///
270    /// While we cannot check a user's usage through the first method,
271    /// we can check constructions through methods 2 and 3 above.
272    /// The CLI construction checking is handled by `clap`, and the builder
273    /// checking is done through this function.
274    /// The construction checks are slightly different as what may be feasible
275    /// through a CLI and through code are different.
276    /// So we have this validate method to check construction through
277    /// method number 3 above.
278    #[expect(
279        clippy::arithmetic_side_effects,
280        reason = "1 + 1 + 1 is not gonna overflow"
281    )]
282    fn validate(&self) -> Result<(), Error> {
283        match (
284            !self.region.is_empty(),
285            self.region_bed3.is_some(),
286            self.full_region,
287        ) {
288            (false, false, Some(true)) => {
289                return Err(Error::BuilderValidation("InputBamBuilder: cannot set `full_region` without setting `region` or `region_bed3`".to_owned()));
290            }
291            (true, true, _) => {
292                return Err(Error::BuilderValidation(
293                    "InputBamBuilder: cannot set both `region` and `region_bed3`".to_owned(),
294                ));
295            }
296            _ => {}
297        }
298
299        if u8::from(self.read_id.is_some())
300            + u8::from(self.read_id_list.is_some())
301            + u8::from(self.read_id_set.is_some())
302            > 1
303        {
304            Err(Error::BuilderValidation(
305                "InputBamBuilder: cannot set >1 of `read_id`, `read_id_list` and `read_id_set`"
306                    .to_owned(),
307            ))
308        } else {
309            Ok(())
310        }
311    }
312}
313
314/// Implements a default class for `InputBAM`
315impl Default for InputBam {
316    fn default() -> Self {
317        InputBam {
318            bam_path: PathOrURLOrStdin::Stdin,
319            min_seq_len: 0,
320            min_align_len: None,
321            read_id: None,
322            read_id_list: None,
323            read_id_set: None,
324            threads: NonZeroU32::new(2).expect("no error"),
325            include_zero_len: false,
326            read_filter: None,
327            sample_fraction: F32Bw0and1::one(),
328            sample_seed: None,
329            mapq_filter: 0,
330            exclude_mapq_unavail: false,
331            region: None,
332            region_bed3: None,
333            full_region: false,
334        }
335    }
336}
337
338/// This struct contains an optional modification tag
339#[derive(Debug, Default, Args, Clone, Copy, Serialize, Deserialize)]
340#[non_exhaustive]
341pub struct OptionalTag {
342    /// modified tag
343    #[clap(long)]
344    pub tag: Option<ModChar>,
345}
346
347impl FromStr for OptionalTag {
348    type Err = Error;
349
350    /// process the modification type from a string.
351    ///
352    /// ```
353    /// use nanalogue_core::{Error, ModChar, OptionalTag};
354    /// use std::str::FromStr;
355    ///
356    /// let tag = OptionalTag::from_str("m")?;
357    /// assert_eq!(tag.tag.unwrap().val(), 'm');
358    /// # Ok::<(), Error>(())
359    /// ```
360    fn from_str(mod_type: &str) -> Result<Self, Self::Err> {
361        Ok(OptionalTag {
362            tag: Some(ModChar::from_str(mod_type)?),
363        })
364    }
365}
366
367/// This struct contains a required modification tag
368#[derive(Debug, Default, Args, Clone, Copy, Serialize, Deserialize)]
369#[non_exhaustive]
370pub struct RequiredTag {
371    /// modified tag
372    #[clap(long)]
373    pub tag: ModChar,
374}
375
376impl FromStr for RequiredTag {
377    type Err = Error;
378
379    /// process the modification type from a string.
380    ///
381    /// ```
382    /// use nanalogue_core::{Error, ModChar, RequiredTag};
383    /// use std::str::FromStr;
384    ///
385    /// let tag = RequiredTag::from_str("m")?;
386    /// assert_eq!(tag.tag.val(), 'm');
387    /// # Ok::<(), Error>(())
388    /// ```
389    fn from_str(mod_type: &str) -> Result<Self, Self::Err> {
390        Ok(RequiredTag {
391            tag: ModChar::from_str(mod_type)?,
392        })
393    }
394}
395
396/// Trait that returns a modification tag
397pub trait TagState {
398    /// Returns the modification tag of the tag state in an option
399    fn tag(&self) -> Option<ModChar> {
400        unimplemented!();
401    }
402}
403
404impl TagState for OptionalTag {
405    fn tag(&self) -> Option<ModChar> {
406        self.tag
407    }
408}
409
410impl TagState for RequiredTag {
411    fn tag(&self) -> Option<ModChar> {
412        Some(self.tag)
413    }
414}
415
416/// This struct contains the options input to our
417/// modification-data functions with restrictions on data received.
418///
419/// If you want to build this, use [`InputModsBuilder`].
420/// In CLI mode, `clap` populates this struct.
421/// This and the [`InputBam`] struct are used to set almost all input options
422/// to many of our functions that process BAM/modBAM files.
423///
424/// # Examples
425///
426/// Please look at the documentation of [`InputBam`] to see a fully-worked
427/// example to set up inputs to perform a calculation.
428///
429/// Following example uses many fields and is for a [`RequiredTag`] variant.
430/// You can omit some of them depending on your use case. `mod_region` must
431/// can be converted to `region_bed3` using [`GenomicRegion::try_to_bed3`]
432/// with a suitable BAM header before this struct can be used with most of
433/// our functions. We do not force this in the builder route as we may not
434/// want to do this for some reason.
435///
436/// ```
437/// use nanalogue_core::{InputModsBuilder, Error, RequiredTag, ThresholdState};
438/// use std::str::FromStr;
439///
440/// let input_options = InputModsBuilder::default()
441///     .tag(RequiredTag::from_str("m")?)
442///     .mod_strand("bc".into())
443///     .mod_prob_filter(ThresholdState::GtEq(0))
444///     .trim_read_ends_mod(10)
445///     .base_qual_filter_mod(10)
446///     .mod_region("chr3:4000-5000".into())
447///     .build()?;
448///
449/// # Ok::<(), Error>(())
450/// ```
451///
452/// A [`RequiredTag`] variant that sets `region_bed3` instead of `mod_region`.
453/// If `mod_region` is set, then a downstream function must parse a BAM file,
454/// and then convert `mod_region` into `region_bed3` using the header.
455/// If the user is confident that they know the index number of a contig,
456/// then they can directly set `region_bed3`. In the example below, the user
457/// knows `chr3` corresponds to `4`, so they set the region directly.
458///
459/// ```
460/// use bedrs::prelude::Bed3;
461/// use nanalogue_core::{InputModsBuilder, Error, RequiredTag, ThresholdState};
462/// use std::str::FromStr;
463///
464/// let input_options = InputModsBuilder::<RequiredTag>::default()
465///     .tag(RequiredTag::from_str("m")?)
466///     .mod_strand("bc".into())
467///     .mod_prob_filter(ThresholdState::GtEq(0))
468///     .trim_read_ends_mod(10)
469///     .base_qual_filter_mod(10)
470///     .region_bed3(Bed3::<i32, u64>::new(4, 4000, 5000))
471///     .build()?;
472///
473/// # Ok::<(), Error>(())
474/// ```
475///
476/// Setting both `region_bed3` and `mod_region` should result in an error,
477/// even if they both refer to the same region. This is because setting both
478/// can result in an undefined state, so we do not allow this.
479///
480/// ```should_panic
481/// use bedrs::prelude::Bed3;
482/// use nanalogue_core::{InputModsBuilder, Error, RequiredTag, ThresholdState};
483/// use std::str::FromStr;
484///
485/// let input_options = InputModsBuilder::<RequiredTag>::default()
486///     .mod_prob_filter(ThresholdState::GtEq(0))
487///     .region_bed3(Bed3::<i32, u64>::new(4, 4000, 5000))
488///     .mod_region("chr3:4000-5000".into())
489///     .build()?;
490///
491/// # Ok::<(), Error>(())
492/// ```
493///
494/// Following example is for an [`OptionalTag`] variant with the `tag` omitted,
495/// and many other fields omitted as well, which take on a default value.
496///
497/// ```
498/// use nanalogue_core::{InputModsBuilder, Error, OptionalTag, ThresholdState};
499/// use std::str::FromStr;
500///
501/// let input_options = InputModsBuilder::<OptionalTag>::default()
502///     .mod_prob_filter(ThresholdState::GtEq(0))
503///     .build()?;
504///
505/// # Ok::<(), Error>(())
506/// ```
507#[derive(Builder, Debug, Args, Clone, Serialize, Deserialize)]
508#[builder(default, build_fn(error = "Error", validate = "Self::validate"))]
509#[serde(default)]
510#[non_exhaustive]
511pub struct InputMods<S: TagState + Args + FromArgMatches + Default> {
512    /// modified tag
513    #[clap(flatten)]
514    pub tag: S,
515    /// modified strand, set this to `bc` or `bc_comp`, meaning
516    /// on basecalled strand or its complement. Some technologies
517    /// like `PacBio` or `ONT` duplex can call mod data on both a strand
518    /// and its complementary DNA and store it in the record corresponding
519    /// to the strand, so you can use this filter to select only for
520    /// mod data on a strand or its complement. Please note that this
521    /// filter is different from selecting for forward or reverse
522    /// aligned reads using the BAM flags.
523    #[clap(long)]
524    #[builder(field(
525        ty = "String",
526        build = "(!self.mod_strand.is_empty()).then(|| RestrictModCalledStrand::from_str(&self.mod_strand)).transpose()?"
527    ))]
528    pub mod_strand: Option<RestrictModCalledStrand>,
529    /// Filter to reject mods before analysis. Specify as low,high where
530    /// both are fractions to reject modifications where the probabilities (p)
531    /// are in this range e.g. "0.4,0.6" rejects 0.4 <= p <= 0.6.
532    /// You can use this to reject 'weak' modification calls before analysis
533    /// i.e. those with probabilities close to 0.5.
534    /// NOTE: (1) Whether this filtration is applied or not, mods < 0.5
535    /// are considered unmodified and >= 0.5 are considered modified by our program.
536    /// (2) mod probabilities are stored as a number from 0-255 in the modBAM format,
537    /// so we internally convert 0.0-1.0 to 0-255. Default: reject nothing.
538    #[clap(long, value_parser=ThresholdState::from_str_ordpair_fraction, default_value = "")]
539    pub mod_prob_filter: ThresholdState,
540    /// Filter this many bp at the start and
541    /// end of a read before any mod operations.
542    /// Please note that the units here are bp and
543    /// not units of base being queried.
544    #[clap(long, default_value_t)]
545    pub trim_read_ends_mod: usize,
546    /// Exclude bases whose base quality is below
547    /// this threshold before any mod operation, defaults to 0 i.e. unused.
548    /// NOTE: (1) This step is only applied before modification operations, and
549    /// not before any other operations.
550    /// (2) No offsets such as +33 are needed here.
551    /// (3) Modifications on reads where base quality information
552    /// is not available are all rejected if this is non-zero.
553    #[clap(long, default_value_t)]
554    pub base_qual_filter_mod: u8,
555    /// Only keep modification data from this region
556    #[clap(long)]
557    #[builder(field(
558        ty = "String",
559        build = "(!self.mod_region.is_empty()).then(|| GenomicRegion::from_str(&self.mod_region)).transpose()?"
560    ))]
561    pub mod_region: Option<GenomicRegion>,
562    /// Only keep modification data from this region.
563    /// We do not expose this to the user, but infer it from
564    /// the other options set by the user.
565    /// We cannot populate this directly at the time of CLI parsing,
566    /// as we need to look at the BAM header to convert a contig name
567    /// into a numeric contig id.
568    #[clap(skip)]
569    #[builder(setter(strip_option))]
570    pub region_bed3: Option<Bed3<i32, u64>>,
571}
572
573impl<S: TagState + Args + FromArgMatches + Default> InputModsBuilder<S> {
574    /// Validate [`InputMods`] creation through the Builder method.
575    ///
576    /// [`InputMods`] can be created in many ways:
577    /// - directly populating the fields as they are public.
578    /// - through the CLI-machinery
579    /// - through the builder.
580    ///
581    /// While we cannot check a user's usage through the first method,
582    /// we can check constructions through methods 2 and 3 above.
583    /// The CLI construction checking is handled by `clap`, and the builder
584    /// checking is done through this function.
585    /// The construction checks are slightly different as what may be feasible
586    /// through a CLI and through code are different.
587    /// So we have this validate method to check construction through
588    /// method number 3 above.
589    fn validate(&self) -> Result<(), Error> {
590        // self.mod_region is a `String` for the Builder before conversion into an `Option<_>`
591        if (!self.mod_region.is_empty()) && self.region_bed3.is_some() {
592            Err(Error::BuilderValidation(
593                "cannot set `mod_region` and `region_bed3` simultaneously!".to_owned(),
594            ))
595        } else {
596            Ok(())
597        }
598    }
599}
600
601/// Implements defaults for `InputMods`
602impl<S: TagState + Args + FromArgMatches + Default> Default for InputMods<S> {
603    fn default() -> Self {
604        InputMods::<S> {
605            tag: S::default(),
606            mod_strand: None,
607            mod_prob_filter: ThresholdState::GtEq(0),
608            trim_read_ends_mod: 0,
609            base_qual_filter_mod: 0,
610            mod_region: None,
611            region_bed3: None,
612        }
613    }
614}
615
616/// Retrieves options for modification input
617pub trait InputModOptions {
618    /// retrieves tag
619    fn tag(&self) -> Option<ModChar> {
620        unimplemented!()
621    }
622    /// retrieves option to set basecalled strand or opposite in mod retrieval
623    fn mod_strand(&self) -> Option<RestrictModCalledStrand> {
624        unimplemented!()
625    }
626    /// returns probability filter
627    fn mod_prob_filter(&self) -> ThresholdState {
628        unimplemented!()
629    }
630    /// returns read end trimming
631    fn trim_read_ends_mod(&self) -> usize {
632        unimplemented!()
633    }
634    /// returns threshold for filtering base PHRED quality
635    fn base_qual_filter_mod(&self) -> u8 {
636        unimplemented!()
637    }
638}
639
640/// Retrieves options for region
641pub trait InputRegionOptions {
642    /// returns region requested
643    fn region_filter(&self) -> &Option<Bed3<i32, u64>> {
644        unimplemented!()
645    }
646    /// returns region requested but region in genomic string format
647    fn region_filter_genomic_string(&self) -> Option<GenomicRegion> {
648        unimplemented!()
649    }
650    /// sets region requested
651    fn set_region_filter(&mut self, _value: Option<Bed3<i32, u64>>) {
652        unimplemented!()
653    }
654    /// returns true if full overlap with region is requested as opposed to
655    /// only partial overlap. defaults to false.
656    fn is_full_overlap(&self) -> bool {
657        false
658    }
659    /// converts region from genomic string representation to bed3 representation
660    ///
661    /// # Errors
662    /// Returns error if conversion fails
663    fn convert_region_to_bed3(&mut self, header: bam::HeaderView) -> Result<(), Error> {
664        match (
665            self.region_filter_genomic_string().is_some(),
666            self.region_filter().is_some(),
667        ) {
668            (false, false) => self.set_region_filter(None),
669            (true, false) => {
670                let genomic_region = self
671                    .region_filter_genomic_string()
672                    .expect("checked above that this is Some");
673                self.set_region_filter(Some(genomic_region.try_to_bed3(&header)?));
674            }
675            (false, true) => {}
676            (true, true) => {
677                return Err(Error::InvalidState(
678                    "cannot set a region as both a `GenomicRegion` and a `Bed3`".to_owned(),
679                ));
680            }
681        }
682        Ok(())
683    }
684}
685
686impl<S: TagState + Args + FromArgMatches + Default> InputModOptions for InputMods<S> {
687    fn tag(&self) -> Option<ModChar> {
688        self.tag.tag()
689    }
690    fn mod_strand(&self) -> Option<RestrictModCalledStrand> {
691        self.mod_strand
692    }
693    fn mod_prob_filter(&self) -> ThresholdState {
694        self.mod_prob_filter
695    }
696    fn trim_read_ends_mod(&self) -> usize {
697        self.trim_read_ends_mod
698    }
699    fn base_qual_filter_mod(&self) -> u8 {
700        self.base_qual_filter_mod
701    }
702}
703
704impl<S: TagState + Args + FromArgMatches + Default> InputRegionOptions for InputMods<S> {
705    fn region_filter_genomic_string(&self) -> Option<GenomicRegion> {
706        self.mod_region.clone()
707    }
708    fn region_filter(&self) -> &Option<Bed3<i32, u64>> {
709        &self.region_bed3
710    }
711    fn set_region_filter(&mut self, value: Option<Bed3<i32, u64>>) {
712        self.region_bed3 = value;
713    }
714}
715
716/// can return tag without encasing in an option in the `RequiredTag` variant
717impl InputMods<RequiredTag> {
718    /// retrieves tag
719    #[must_use]
720    pub fn required_tag(&self) -> ModChar {
721        self.tag.tag
722    }
723}
724
725impl InputRegionOptions for InputBam {
726    fn region_filter_genomic_string(&self) -> Option<GenomicRegion> {
727        self.region.clone()
728    }
729    fn region_filter(&self) -> &Option<Bed3<i32, u64>> {
730        &self.region_bed3
731    }
732    fn set_region_filter(&mut self, value: Option<Bed3<i32, u64>>) {
733        self.region_bed3 = value;
734    }
735    fn is_full_overlap(&self) -> bool {
736        self.full_region
737    }
738}
739
740/// This struct contains the options input to our
741/// modification-data-windowing functions
742///
743/// # Example
744/// ```
745/// use nanalogue_core::{Error, InputWindowingBuilder};
746///
747/// let window = InputWindowingBuilder::default()
748///     .win(20)
749///     .step(10).build()?;
750/// # Ok::<(), Error>(())
751/// ```
752#[derive(Builder, Debug, Args, Clone, Copy, Serialize, Deserialize)]
753#[builder(default, build_fn(error = "Error"))]
754#[serde(default)]
755#[non_exhaustive]
756pub struct InputWindowing {
757    /// size of window in units of base being queried i.e.
758    /// if you are looking for cytosine modifications, then
759    /// a window of a value 300 means create windows each with
760    /// 300 cytosines irrespective of their modification status.
761    #[clap(long)]
762    #[builder(field(ty = "usize", build = "NonZeroUsize::try_from(self.win)?"))]
763    pub win: NonZeroUsize,
764    /// step window by this size in units of base being queried.
765    #[clap(long)]
766    #[builder(field(ty = "usize", build = "NonZeroUsize::try_from(self.step)?"))]
767    pub step: NonZeroUsize,
768}
769
770/// Implements a default for `InputWindowing`.
771/// NOTE the defaults of 1 for each are just for ease of programming.
772/// We do not expose these defaults to the command-line user.
773impl Default for InputWindowing {
774    fn default() -> Self {
775        InputWindowing {
776            win: NonZeroUsize::new(1).expect("no error"),
777            step: NonZeroUsize::new(1).expect("no error"),
778        }
779    }
780}
781
782/// This enum contains inputs to display sequences with various options
783#[derive(Debug, Clone, Default, Copy, Serialize, Deserialize)]
784#[non_exhaustive]
785pub enum SeqDisplayOptions {
786    /// Do not display any sequence, default
787    #[default]
788    No,
789    /// Display the entire sequence without any subsetting
790    Full {
791        /// Option to display basecalling qualities
792        show_base_qual: bool,
793    },
794    /// Display the sequence within a region
795    Region {
796        /// Option to display basecalling qualities
797        show_base_qual: bool,
798        /// Option to show bases in lowercase if they are insertions
799        show_ins_lowercase: bool,
800        /// Option to show modified bases as Z (or z depending on other options)
801        show_mod_z: bool,
802        /// Region over which sequence must be shown
803        region: Bed3<i32, u64>,
804    },
805}
806
807#[cfg(test)]
808mod tag_struct_tests {
809    use super::*;
810
811    #[test]
812    fn optional_tag_default() {
813        let optional_tag = OptionalTag::default();
814        assert!(optional_tag.tag.is_none());
815        assert_eq!(optional_tag.tag(), None);
816    }
817
818    #[test]
819    fn optional_tag_with_value() {
820        let mod_char = ModChar::new('m');
821        let optional_tag = OptionalTag {
822            tag: Some(mod_char),
823        };
824        assert_eq!(optional_tag.tag(), Some(mod_char));
825    }
826
827    #[test]
828    fn required_tag_default() {
829        let required_tag = RequiredTag::default();
830        assert_eq!(required_tag.tag(), Some(ModChar::default()));
831    }
832
833    #[test]
834    fn required_tag_with_value() {
835        let mod_char = ModChar::new('C');
836        let required_tag = RequiredTag { tag: mod_char };
837        assert_eq!(required_tag.tag(), Some(mod_char));
838    }
839
840    #[test]
841    fn optional_tag_from_str() {
842        let tag = OptionalTag::from_str("m").unwrap();
843        assert_eq!(tag.tag.unwrap().val(), 'm');
844    }
845
846    #[test]
847    fn required_tag_from_str() {
848        let tag = RequiredTag::from_str("m").unwrap();
849        assert_eq!(tag.tag.val(), 'm');
850    }
851}
852
853#[cfg(test)]
854mod input_windowing_tests {
855    use super::*;
856
857    #[test]
858    fn input_windowing_default() {
859        let windowing = InputWindowing::default();
860        assert_eq!(windowing.win, NonZeroUsize::new(1).unwrap());
861        assert_eq!(windowing.step, NonZeroUsize::new(1).unwrap());
862    }
863
864    #[test]
865    fn input_windowing_custom_values() {
866        let windowing = InputWindowing {
867            win: NonZeroUsize::new(300).unwrap(),
868            step: NonZeroUsize::new(150).unwrap(),
869        };
870        assert_eq!(windowing.win, NonZeroUsize::new(300).unwrap());
871        assert_eq!(windowing.step, NonZeroUsize::new(150).unwrap());
872    }
873
874    #[test]
875    fn input_windowing_builder() {
876        let _: InputWindowing = InputWindowingBuilder::default()
877            .win(20)
878            .step(10)
879            .build()
880            .unwrap();
881    }
882}
883
884#[cfg(test)]
885mod input_mods_required_tag_tests {
886    use super::*;
887
888    #[test]
889    fn input_mods_required_tag_fn_tag() {
890        let mod_char = ModChar::new('C');
891        let input_mods = InputMods::<RequiredTag> {
892            tag: RequiredTag { tag: mod_char },
893            mod_strand: None,
894            mod_prob_filter: ThresholdState::GtEq(0),
895            trim_read_ends_mod: 0,
896            base_qual_filter_mod: 0,
897            mod_region: None,
898            region_bed3: None,
899        };
900
901        assert_eq!(input_mods.required_tag(), mod_char);
902    }
903
904    #[test]
905    fn input_mods_builder_required_tag_with_mod_region() {
906        let _: InputMods<RequiredTag> = InputModsBuilder::default()
907            .tag(RequiredTag::from_str("m").unwrap())
908            .mod_strand("bc".into())
909            .mod_prob_filter(ThresholdState::GtEq(0))
910            .trim_read_ends_mod(10)
911            .base_qual_filter_mod(10)
912            .mod_region("chr3:4000-5000".into())
913            .build()
914            .unwrap();
915    }
916
917    #[test]
918    fn input_mods_builder_required_tag_with_region_bed3() {
919        let _: InputMods<RequiredTag> = InputModsBuilder::<RequiredTag>::default()
920            .tag(RequiredTag::from_str("m").unwrap())
921            .mod_strand("bc".into())
922            .mod_prob_filter(ThresholdState::GtEq(0))
923            .trim_read_ends_mod(10)
924            .base_qual_filter_mod(10)
925            .region_bed3(Bed3::<i32, u64>::new(4, 4000, 5000))
926            .build()
927            .unwrap();
928    }
929
930    #[test]
931    fn input_mods_builder_optional_tag_minimal() {
932        let _: InputMods<OptionalTag> = InputModsBuilder::<OptionalTag>::default()
933            .mod_prob_filter(ThresholdState::GtEq(0))
934            .build()
935            .unwrap();
936    }
937}
938
939#[cfg(test)]
940mod input_bam_tests {
941    use super::*;
942    use bedrs::Coordinates as _;
943    use indoc::indoc;
944
945    #[test]
946    fn input_bam_is_full_overlap() {
947        // Test default (false)
948        let input_bam_default = InputBam::default();
949        assert!(!input_bam_default.is_full_overlap());
950
951        // Test explicit false
952        let input_bam_false = InputBam {
953            full_region: false,
954            ..Default::default()
955        };
956        assert!(!input_bam_false.is_full_overlap());
957
958        // Test true
959        let input_bam_true = InputBam {
960            full_region: true,
961            ..Default::default()
962        };
963        assert!(input_bam_true.is_full_overlap());
964    }
965
966    #[test]
967    fn input_bam_builder() {
968        let _: InputBam = InputBamBuilder::default()
969            .bam_path(PathOrURLOrStdin::Path("/some/path/to/bam.bam".into()))
970            .min_seq_len(30000u64)
971            .min_align_len(20000i64)
972            .read_id("some-id")
973            .read_filter("primary_forward,secondary_forward".into())
974            .sample_fraction(F32Bw0and1::one())
975            .mapq_filter(20)
976            .exclude_mapq_unavail(true)
977            .region("chr4:1000-2000".into())
978            .full_region(true)
979            .build()
980            .unwrap();
981    }
982
983    #[test]
984    fn input_bam_convert_region_to_bed3_none() {
985        let mut input_bam = InputBam::default();
986
987        let header_view = bam::HeaderView::from_bytes(indoc! {b"@HD\tVN:1.6\tSO:coordinate
988        @SQ\tSN:chr1\tLN:248956422\n"});
989
990        input_bam.convert_region_to_bed3(header_view).unwrap();
991        assert!(input_bam.region_bed3.is_none());
992    }
993
994    #[test]
995    fn input_bam_convert_region_to_bed3_with_region() {
996        let mut input_bam = InputBam {
997            region: Some(GenomicRegion::from_str("chr2:3400-3600").unwrap()),
998            ..Default::default()
999        };
1000
1001        let header_view = bam::HeaderView::from_bytes(indoc! {b"@HD\tVN:1.6\tSO:coordinate
1002                @SQ\tSN:chr1\tLN:3000
1003                @SQ\tSN:chr2\tLN:4000\n"});
1004
1005        input_bam.convert_region_to_bed3(header_view).unwrap();
1006        assert!(input_bam.region_bed3.is_some());
1007
1008        let bed3 = input_bam.region_bed3.unwrap();
1009        assert_eq!(bed3.chr(), &1);
1010        assert_eq!(bed3.start(), 3400);
1011        assert_eq!(bed3.end(), 3600);
1012    }
1013
1014    #[test]
1015    #[should_panic(expected = "InvalidRegion")]
1016    fn input_bam_convert_region_to_bed3_invalid_region() {
1017        let mut input_bam = InputBam {
1018            region: Some(GenomicRegion::from_str("chr2:4400-4600").expect("no error")),
1019            ..Default::default()
1020        };
1021        let header_view = bam::HeaderView::from_bytes(indoc! {b"@HD\tVN:1.6\tSO:coordinate
1022                @SQ\tSN:chr1\tLN:3000
1023                @SQ\tSN:chr2\tLN:4000\n"});
1024
1025        input_bam.convert_region_to_bed3(header_view).unwrap();
1026    }
1027
1028    #[test]
1029    #[should_panic(expected = "InvalidRegion")]
1030    fn input_bam_convert_region_to_bed3_invalid_open_ended_region() {
1031        let mut input_bam = InputBam {
1032            region: Some(GenomicRegion::from_str("chr2:4600-").expect("no error")),
1033            ..Default::default()
1034        };
1035        let header_view = bam::HeaderView::from_bytes(indoc! {b"@HD\tVN:1.6\tSO:coordinate
1036                @SQ\tSN:chr1\tLN:3000
1037                @SQ\tSN:chr2\tLN:4000\n"});
1038
1039        input_bam.convert_region_to_bed3(header_view).unwrap();
1040    }
1041
1042    #[test]
1043    #[should_panic(expected = "InvalidAlignCoords")]
1044    fn input_bam_convert_region_to_bed3_invalid_contig() {
1045        let mut input_bam = InputBam {
1046            region: Some(GenomicRegion::from_str("chr3:1000-2000").expect("no error")),
1047            ..Default::default()
1048        };
1049        let header_view = bam::HeaderView::from_bytes(indoc! {b"@HD\tVN:1.6\tSO:coordinate
1050                @SQ\tSN:chr1\tLN:3000
1051                @SQ\tSN:chr2\tLN:4000\n"});
1052
1053        input_bam.convert_region_to_bed3(header_view).unwrap();
1054    }
1055
1056    #[test]
1057    fn input_bam_convert_region_to_bed3_already_set_bed3() {
1058        // When region_bed3 is already set and region is None, should not convert
1059        let mut input_bam = InputBam {
1060            region: None,
1061            region_bed3: Some(Bed3::<i32, u64>::new(1, 1000, 2000)),
1062            ..Default::default()
1063        };
1064
1065        let header_view = bam::HeaderView::from_bytes(indoc! {b"@HD\tVN:1.6\tSO:coordinate
1066                @SQ\tSN:chr1\tLN:3000
1067                @SQ\tSN:chr2\tLN:4000\n"});
1068
1069        input_bam.convert_region_to_bed3(header_view).unwrap();
1070
1071        // Bed3 should remain unchanged
1072        assert!(input_bam.region_bed3.is_some());
1073        let bed3 = input_bam.region_bed3.unwrap();
1074        assert_eq!(bed3.chr(), &1);
1075        assert_eq!(bed3.start(), 1000);
1076        assert_eq!(bed3.end(), 2000);
1077    }
1078
1079    #[test]
1080    #[should_panic(expected = "InvalidState")]
1081    fn input_bam_convert_region_to_bed3_both_set() {
1082        // When both region and region_bed3 are set, should error
1083        let mut input_bam = InputBam {
1084            region: Some(GenomicRegion::from_str("chr2:1500-2500").unwrap()),
1085            region_bed3: Some(Bed3::<i32, u64>::new(1, 1000, 2000)),
1086            ..Default::default()
1087        };
1088
1089        let header_view = bam::HeaderView::from_bytes(indoc! {b"@HD\tVN:1.6\tSO:coordinate
1090                @SQ\tSN:chr1\tLN:3000
1091                @SQ\tSN:chr2\tLN:4000\n"});
1092
1093        input_bam.convert_region_to_bed3(header_view).unwrap();
1094    }
1095}
1096
1097#[cfg(test)]
1098mod validate_builder_functions {
1099    use super::*;
1100
1101    // Tests for InputBamBuilder::validate conflicts
1102
1103    #[test]
1104    #[should_panic(expected = "BuilderValidation")]
1105    fn input_bam_builder_full_region_without_region_fails() {
1106        let _: InputBam = InputBamBuilder::default()
1107            .bam_path(PathOrURLOrStdin::Path("/some/path.bam".into()))
1108            .full_region(true)
1109            .build()
1110            .unwrap();
1111    }
1112
1113    #[test]
1114    #[should_panic(expected = "BuilderValidation")]
1115    fn input_bam_builder_both_region_and_region_bed3_fails() {
1116        let _: InputBam = InputBamBuilder::default()
1117            .bam_path(PathOrURLOrStdin::Path("/some/path.bam".into()))
1118            .region("chr1:1000-2000".into())
1119            .region_bed3(Bed3::<i32, u64>::new(0, 1000, 2000))
1120            .build()
1121            .unwrap();
1122    }
1123
1124    #[test]
1125    #[should_panic(expected = "BuilderValidation")]
1126    fn input_bam_builder_read_id_and_read_id_list_fails() {
1127        let _: InputBam = InputBamBuilder::default()
1128            .bam_path(PathOrURLOrStdin::Path("/some/path.bam".into()))
1129            .read_id("some-id")
1130            .read_id_list("/some/file.txt")
1131            .build()
1132            .unwrap();
1133    }
1134
1135    #[test]
1136    #[should_panic(expected = "BuilderValidation")]
1137    fn input_bam_builder_read_id_and_read_id_set_fails() {
1138        let mut read_ids = HashSet::new();
1139        let _: bool = read_ids.insert("read1".to_owned());
1140        let _: InputBam = InputBamBuilder::default()
1141            .bam_path(PathOrURLOrStdin::Path("/some/path.bam".into()))
1142            .read_id("some-id")
1143            .read_id_set(read_ids)
1144            .build()
1145            .unwrap();
1146    }
1147
1148    #[test]
1149    #[should_panic(expected = "BuilderValidation")]
1150    fn input_bam_builder_read_id_list_and_read_id_set_fails() {
1151        let mut read_ids = HashSet::new();
1152        let _: bool = read_ids.insert("read1".to_owned());
1153        let _: InputBam = InputBamBuilder::default()
1154            .bam_path(PathOrURLOrStdin::Path("/some/path.bam".into()))
1155            .read_id_list("/some/file.txt")
1156            .read_id_set(read_ids)
1157            .build()
1158            .unwrap();
1159    }
1160
1161    #[test]
1162    #[should_panic(expected = "BuilderValidation")]
1163    fn input_bam_builder_all_three_read_id_options_fails() {
1164        let mut read_ids = HashSet::new();
1165        let _: bool = read_ids.insert("read1".to_owned());
1166        let _: InputBam = InputBamBuilder::default()
1167            .bam_path(PathOrURLOrStdin::Path("/some/path.bam".into()))
1168            .read_id("some-id")
1169            .read_id_list("/some/file.txt")
1170            .read_id_set(read_ids)
1171            .build()
1172            .unwrap();
1173    }
1174
1175    // Tests for InputModsBuilder::validate conflicts
1176
1177    #[test]
1178    #[should_panic(expected = "BuilderValidation")]
1179    fn input_mods_builder_both_mod_region_and_region_bed3_fails() {
1180        let _: InputMods<OptionalTag> = InputModsBuilder::<OptionalTag>::default()
1181            .mod_prob_filter(ThresholdState::GtEq(0))
1182            .mod_region("chr1:1000-2000".into())
1183            .region_bed3(Bed3::<i32, u64>::new(0, 1000, 2000))
1184            .build()
1185            .unwrap();
1186    }
1187
1188    #[test]
1189    #[should_panic(expected = "BuilderValidation")]
1190    fn input_mods_builder_required_tag_both_regions_fails() {
1191        let _: InputMods<RequiredTag> = InputModsBuilder::<RequiredTag>::default()
1192            .tag(RequiredTag::from_str("m").unwrap())
1193            .mod_prob_filter(ThresholdState::GtEq(0))
1194            .mod_region("chr1:1000-2000".into())
1195            .region_bed3(Bed3::<i32, u64>::new(0, 1000, 2000))
1196            .build()
1197            .unwrap();
1198    }
1199}