nanalogue_core/
cli.rs

1//! # Command line interface (CLI) options, including input processing options
2//!
3//! This file provides some global options in the command line interface.
4use crate::{
5    Error, F32Bw0and1, GenomicRegion, ModChar, PathOrURLOrStdin, ReadStates,
6    RestrictModCalledStrand, ThresholdState,
7};
8use bedrs::prelude::Bed3;
9use clap::{Args, FromArgMatches};
10use derive_builder::Builder;
11use rust_htslib::bam;
12use serde::{Deserialize, Serialize};
13use std::collections::HashSet;
14use std::num::{NonZeroU32, NonZeroUsize};
15use std::str::FromStr;
16
17/// Options to parse the input bam file and the filters that should be applied to the bam file.
18///
19/// This struct is parsed to create command line arguments and then passed to many functions.
20/// We have copied and edited a similar struct from the fibertools-rs repository.
21/// You can build this through [`InputBamBuilder`].
22/// In CLI mode, `clap` populates this struct.
23/// This and the [`InputMods`] struct are used to set almost all input options
24/// to many of our functions that process BAM/modBAM files.
25///
26/// # Examples
27///
28/// We first begin with an example to build the struct.
29/// The next example shows how this struct and [`InputMods`] can be used
30/// to construct inputs to one of our BAM processing functions.
31///
32/// Sample way to build the struct. Some of the parameters are optional
33/// and can be left unset which would give them default values.
34/// We do not check if the specified bam path or URL exists as there are
35/// use cases where files are generated before the `InputBam` object is used.
36/// Not all options are listed here; for a full list, please see all the
37/// methods of the builder.
38///
39/// ```
40/// use nanalogue_core::{Error, F32Bw0and1, InputBamBuilder, PathOrURLOrStdin};
41///
42/// let bam = InputBamBuilder::default()
43///     .bam_path(PathOrURLOrStdin::Path("/some/path/to/bam.bam".into()))
44///     .min_seq_len(30000u64)
45///     .min_align_len(20000i64)
46///     .read_id("some-id")
47///     .read_filter("primary_forward,secondary_forward".into())
48///     .sample_fraction(F32Bw0and1::new(1.0).expect("no error"))
49///     .mapq_filter(20)
50///     .exclude_mapq_unavail(true)
51///     .region("chr4:1000-2000".into())
52///     .full_region(true)
53///     .build()?;
54/// # Ok::<(), Error>(())
55/// ```
56///
57/// This struct and the [`InputMods`] struct allow us to set input options
58/// for BAM/modBAM calculations. An example is shown below where the [`crate::read_info::run`]
59/// command is called to process data from a BAM file with some input options.
60///
61/// ```
62/// use nanalogue_core::{BamRcRecords, BamPreFilt, Error, InputBamBuilder, InputModsBuilder,
63///     OptionalTag, PathOrURLOrStdin, ThresholdState, nanalogue_bam_reader, read_info};
64///
65/// let mut bam = InputBamBuilder::default()
66///     .bam_path(PathOrURLOrStdin::Path("./examples/example_1.bam".into()))
67///     .region("dummyI".into())
68///     .build()?;
69/// let mut mods = InputModsBuilder::<OptionalTag>::default()
70///     .mod_prob_filter(ThresholdState::GtEq(0))
71///     .build()?;
72///
73/// let mut buffer = Vec::new();
74/// let mut reader = nanalogue_bam_reader(&bam.bam_path.to_string())?;
75/// let bam_rc_records = BamRcRecords::new(&mut reader, &mut bam, &mut mods)?;
76/// read_info::run(
77///     &mut buffer,
78///     bam_rc_records.rc_records
79///         .filter(|r| r.as_ref().map_or(true, |v| v.pre_filt(&bam))),
80///     mods,
81///     None,
82/// )?;
83/// assert!(str::from_utf8(buffer.as_slice())?
84///     .contains("5d10eb9a-aae1-4db8-8ec6-7ebb34d32575"));
85/// # Ok::<(), Error>(())
86/// ```
87///
88/// ## Examples resulting in errors
89///
90/// Full region without actually setting a region
91///
92/// ```should_panic
93/// use nanalogue_core::{Error, InputBamBuilder, PathOrURLOrStdin};
94///
95/// let bam = InputBamBuilder::default()
96///     .bam_path(PathOrURLOrStdin::Path("/some/path/to/bam.bam".into()))
97///     .read_id("some-id")
98///     .full_region(true)
99///     .build()?;
100/// # Ok::<(), Error>(())
101/// ```
102///
103/// Setting both `region` and `region_bed3`. `region` can be converted to
104/// `region_bed3` using [`GenomicRegion::try_to_bed3`] and a BAM header.
105///
106/// ```should_panic
107/// use bedrs::prelude::Bed3;
108/// use nanalogue_core::{Error, InputBamBuilder, PathOrURLOrStdin};
109///
110/// let bam = InputBamBuilder::default()
111///     .bam_path(PathOrURLOrStdin::Path("/some/path/to/bam.bam".into()))
112///     .read_id("some-id")
113///     .region("chr4:1000-2000".into())
114///     .region_bed3(Bed3::<i32,u64>::new(3, 1000, 2000))
115///     .build()?;
116/// # Ok::<(), Error>(())
117/// ```
118///
119/// Setting more than one of `read_id`, `read_id_list` and `read_id_set`.
120/// * `read_id` means filter to retain only this read.
121/// * `read_id_list` is a path to a file with a list of read ids.
122/// * `read_id_set` is a set of read ids supplied directly.
123///
124/// ```
125/// use bedrs::prelude::Bed3;
126/// use nanalogue_core::{Error, InputBamBuilder, PathOrURLOrStdin};
127/// use std::collections::HashSet;
128///
129/// let _ = InputBamBuilder::default()
130///     .bam_path(PathOrURLOrStdin::Path("/some/path/to/bam.bam".into()))
131///     .read_id("some-id")
132///     .read_id_list("/some/file.txt")
133///     .build().unwrap_err();
134///
135/// let mut read_id_set = HashSet::<String>::new();
136/// read_id_set.insert("some-read-a".to_owned());
137/// read_id_set.insert("some-read-b".to_owned());
138///
139/// let _ = InputBamBuilder::default()
140///     .bam_path(PathOrURLOrStdin::Path("/some/path/to/bam.bam".into()))
141///     .read_id_list("/some/file.txt")
142///     .read_id_set(read_id_set.clone())
143///     .build().unwrap_err();
144///
145/// let _ = InputBamBuilder::default()
146///     .bam_path(PathOrURLOrStdin::Path("/some/path/to/bam.bam".into()))
147///     .read_id("some-id")
148///     .read_id_set(read_id_set)
149///     .build().unwrap_err();
150/// ```
151#[derive(Builder, Debug, Args, Clone, Serialize, Deserialize)]
152#[serde(default)]
153#[builder(default, build_fn(error = "Error", validate = "Self::validate"))]
154#[non_exhaustive]
155pub struct InputBam {
156    /// Input BAM file. Set to a local file path, or set to - to read from stdin,
157    /// or set to a URL to read from a remote file. If using stdin and piping in
158    /// from `samtools view`, always include the header with the `-h` option.
159    pub bam_path: PathOrURLOrStdin,
160    /// Exclude reads whose sequence length in the BAM file is
161    /// below this value. Defaults to 0.
162    #[clap(long, default_value_t)]
163    #[builder(setter(into))]
164    pub min_seq_len: u64,
165    /// Exclude reads whose alignment length in the BAM file is
166    /// below this value. Defaults to unused.
167    #[clap(long)]
168    #[builder(setter(into, strip_option))]
169    pub min_align_len: Option<i64>,
170    /// Only include this read id, defaults to unused i.e. all reads are used.
171    /// NOTE: if there are multiple alignments corresponding
172    /// to this read id, all of them are used.
173    #[clap(long, conflicts_with = "read_id_list")]
174    #[builder(setter(into, strip_option))]
175    pub read_id: Option<String>,
176    /// Path to file containing list of read IDs (one per line).
177    /// Lines starting with '#' are treated as comments and ignored.
178    /// Cannot be used together with --read-id.
179    #[clap(long, conflicts_with = "read_id")]
180    #[builder(setter(into, strip_option))]
181    pub read_id_list: Option<String>,
182    /// Internal `HashSet` of read IDs loaded from `read_id_list` file.
183    /// This is populated automatically and not exposed to users.
184    #[clap(skip)]
185    #[builder(setter(into, strip_option))]
186    pub read_id_set: Option<HashSet<String>>,
187    /// Number of threads used during some aspects of program execution
188    #[clap(long, default_value_t = NonZeroU32::new(2).expect("no error"))]
189    #[builder(setter(into))]
190    pub threads: NonZeroU32,
191    /// Include "zero-length" sequences e.g. sequences with "*" in the sequence
192    /// field. By default, these sequences are excluded to avoid processing errors.
193    /// If this flag is set, these reads are included irrespective of any
194    /// minimum sequence or align length criteria the user may have set.
195    /// WARNINGS: (1) Some functions of the codebase may break or produce incorrect
196    /// results if you use this flag. (2) due to a technical reason, we need a DNA sequence
197    /// in the sequence field and cannot infer sequence length from other sources
198    /// e.g. CIGAR strings.
199    #[clap(long, default_value_t)]
200    pub include_zero_len: bool,
201    /// Only retain reads of this type. Allowed types are `primary_forward`,
202    /// `primary_reverse`, `secondary_forward`, `secondary_reverse`, `supplementary_forward`,
203    /// `supplementary_reverse` and unmapped. Specify more than one type if needed
204    /// separated by commas, in which case reads of any type in list are retained.
205    /// Defaults to retain reads of all types.
206    #[clap(long)]
207    #[builder(field(
208        ty = "String",
209        build = "(!self.read_filter.is_empty()).then(|| ReadStates::from_str(&self.read_filter)).transpose()?"
210    ))]
211    pub read_filter: Option<ReadStates>,
212    /// Subsample BAM to retain only this fraction of total number of reads,
213    /// defaults to 1.0. The sampling algorithm considers every read according
214    /// to the specified probability, so due to this, you may not always get
215    /// the same number of reads e.g. if you set `-s 0.05` in a file with 1000 reads,
216    /// you will get 50 +- sqrt(50) reads.
217    /// NOTE: a new subsample is drawn every time as the seed is not fixed.
218    /// If you want reproducibility, consider piping the output of `samtools view -s`
219    /// to our program.
220    #[clap(short, long, default_value_t = F32Bw0and1::one())]
221    pub sample_fraction: F32Bw0and1,
222    /// Exclude reads whose MAPQ (Mapping quality of position) is below this value.
223    /// Defaults to zero i.e. do not exclude any read.
224    #[clap(long, default_value_t)]
225    #[builder(setter(into))]
226    pub mapq_filter: u8,
227    /// Exclude sequences with MAPQ unavailable.
228    /// In the BAM format, a value of 255 in this column means MAPQ is unavailable.
229    /// These reads are allowed by default, set this flag to exclude.
230    #[clap(long, default_value_t)]
231    pub exclude_mapq_unavail: bool,
232    /// Only keep reads passing through this region. If a BAM index is available
233    /// with a name same as the BAM file but with the .bai suffix, the operation
234    /// of selecting such reads will be faster. If you are using standard input
235    /// as your input e.g. you are piping in the output from samtools, then
236    /// you cannot use an index as a BAM filename is not available.
237    #[clap(long)]
238    #[builder(field(
239        ty = "String",
240        build = "(!self.region.is_empty()).then(|| GenomicRegion::from_str(&self.region)).transpose()?"
241    ))]
242    pub region: Option<GenomicRegion>,
243    /// Only keep read data from this region.
244    /// This is an internal option not exposed to the user, we will set it
245    /// based on the other options that the user sets.
246    #[clap(skip)]
247    #[builder(setter(into, strip_option))]
248    pub region_bed3: Option<Bed3<i32, u64>>,
249    /// Only keep reads if they pass through the specified region in full.
250    /// Related to the input `--region`; has no effect if that is not set.
251    #[clap(long, default_value_t, requires = "region")]
252    pub full_region: bool,
253}
254
255impl InputBamBuilder {
256    /// Validate [`InputBam`] creation through the Builder method.
257    ///
258    /// [`InputBam`] can be created in many ways:
259    /// - directly populating the fields as they are public.
260    /// - through the CLI-machinery
261    /// - through the builder.
262    ///
263    /// While we cannot check a user's usage through the first method,
264    /// we can check constructions through methods 2 and 3 above.
265    /// The CLI construction checking is handled by `clap`, and the builder
266    /// checking is done through this function.
267    /// The construction checks are slightly different as what may be feasible
268    /// through a CLI and through code are different.
269    /// So we have this validate method to check construction through
270    /// method number 3 above.
271    #[expect(
272        clippy::arithmetic_side_effects,
273        reason = "1 + 1 + 1 is not gonna overflow"
274    )]
275    fn validate(&self) -> Result<(), Error> {
276        match (
277            !self.region.is_empty(),
278            self.region_bed3.is_some(),
279            self.full_region,
280        ) {
281            (false, false, Some(true)) => {
282                return Err(Error::BuilderValidation("InputBamBuilder: cannot set `full_region` without setting `region` or `region_bed3`".to_owned()));
283            }
284            (true, true, _) => {
285                return Err(Error::BuilderValidation(
286                    "InputBamBuilder: cannot set both `region` and `region_bed3`".to_owned(),
287                ));
288            }
289            _ => {}
290        }
291
292        if u8::from(self.read_id.is_some())
293            + u8::from(self.read_id_list.is_some())
294            + u8::from(self.read_id_set.is_some())
295            > 1
296        {
297            Err(Error::BuilderValidation(
298                "InputBamBuilder: cannot set >1 of `read_id`, `read_id_list` and `read_id_set`"
299                    .to_owned(),
300            ))
301        } else {
302            Ok(())
303        }
304    }
305}
306
307/// Implements a default class for `InputBAM`
308impl Default for InputBam {
309    fn default() -> Self {
310        InputBam {
311            bam_path: PathOrURLOrStdin::Stdin,
312            min_seq_len: 0,
313            min_align_len: None,
314            read_id: None,
315            read_id_list: None,
316            read_id_set: None,
317            threads: NonZeroU32::new(2).expect("no error"),
318            include_zero_len: false,
319            read_filter: None,
320            sample_fraction: F32Bw0and1::one(),
321            mapq_filter: 0,
322            exclude_mapq_unavail: false,
323            region: None,
324            region_bed3: None,
325            full_region: false,
326        }
327    }
328}
329
330/// This struct contains an optional modification tag
331#[derive(Debug, Default, Args, Clone, Copy, Serialize, Deserialize)]
332#[non_exhaustive]
333pub struct OptionalTag {
334    /// modified tag
335    #[clap(long)]
336    pub tag: Option<ModChar>,
337}
338
339impl FromStr for OptionalTag {
340    type Err = Error;
341
342    /// process the modification type from a string.
343    ///
344    /// ```
345    /// use nanalogue_core::{Error, ModChar, OptionalTag};
346    /// use std::str::FromStr;
347    ///
348    /// let tag = OptionalTag::from_str("m")?;
349    /// assert_eq!(tag.tag.unwrap().val(), 'm');
350    /// # Ok::<(), Error>(())
351    /// ```
352    fn from_str(mod_type: &str) -> Result<Self, Self::Err> {
353        Ok(OptionalTag {
354            tag: Some(ModChar::from_str(mod_type)?),
355        })
356    }
357}
358
359/// This struct contains a required modification tag
360#[derive(Debug, Default, Args, Clone, Copy, Serialize, Deserialize)]
361#[non_exhaustive]
362pub struct RequiredTag {
363    /// modified tag
364    #[clap(long)]
365    pub tag: ModChar,
366}
367
368impl FromStr for RequiredTag {
369    type Err = Error;
370
371    /// process the modification type from a string.
372    ///
373    /// ```
374    /// use nanalogue_core::{Error, ModChar, RequiredTag};
375    /// use std::str::FromStr;
376    ///
377    /// let tag = RequiredTag::from_str("m")?;
378    /// assert_eq!(tag.tag.val(), 'm');
379    /// # Ok::<(), Error>(())
380    /// ```
381    fn from_str(mod_type: &str) -> Result<Self, Self::Err> {
382        Ok(RequiredTag {
383            tag: ModChar::from_str(mod_type)?,
384        })
385    }
386}
387
388/// Trait that returns a modification tag
389pub trait TagState {
390    /// Returns the modification tag of the tag state in an option
391    fn tag(&self) -> Option<ModChar> {
392        unimplemented!();
393    }
394}
395
396impl TagState for OptionalTag {
397    fn tag(&self) -> Option<ModChar> {
398        self.tag
399    }
400}
401
402impl TagState for RequiredTag {
403    fn tag(&self) -> Option<ModChar> {
404        Some(self.tag)
405    }
406}
407
408/// This struct contains the options input to our
409/// modification-data functions with restrictions on data received.
410///
411/// If you want to build this, use [`InputModsBuilder`].
412/// In CLI mode, `clap` populates this struct.
413/// This and the [`InputBam`] struct are used to set almost all input options
414/// to many of our functions that process BAM/modBAM files.
415///
416/// # Examples
417///
418/// Please look at the documentation of [`InputBam`] to see a fully-worked
419/// example to set up inputs to perform a calculation.
420///
421/// Following example uses many fields and is for a [`RequiredTag`] variant.
422/// You can omit some of them depending on your use case. `mod_region` must
423/// can be converted to `region_bed3` using [`GenomicRegion::try_to_bed3`]
424/// with a suitable BAM header before this struct can be used with most of
425/// our functions. We do not force this in the builder route as we may not
426/// want to do this for some reason.
427///
428/// ```
429/// use nanalogue_core::{InputModsBuilder, Error, RequiredTag, ThresholdState};
430/// use std::str::FromStr;
431///
432/// let input_options = InputModsBuilder::default()
433///     .tag(RequiredTag::from_str("m")?)
434///     .mod_strand("bc".into())
435///     .mod_prob_filter(ThresholdState::GtEq(0))
436///     .trim_read_ends_mod(10)
437///     .base_qual_filter_mod(10)
438///     .mod_region("chr3:4000-5000".into())
439///     .build()?;
440///
441/// # Ok::<(), Error>(())
442/// ```
443///
444/// A [`RequiredTag`] variant that sets `region_bed3` instead of `mod_region`.
445/// If `mod_region` is set, then a downstream function must parse a BAM file,
446/// and then convert `mod_region` into `region_bed3` using the header.
447/// If the user is confident that they know the index number of a contig,
448/// then they can directly set `region_bed3`. In the example below, the user
449/// knows `chr3` corresponds to `4`, so they set the region directly.
450///
451/// ```
452/// use bedrs::prelude::Bed3;
453/// use nanalogue_core::{InputModsBuilder, Error, RequiredTag, ThresholdState};
454/// use std::str::FromStr;
455///
456/// let input_options = InputModsBuilder::<RequiredTag>::default()
457///     .tag(RequiredTag::from_str("m")?)
458///     .mod_strand("bc".into())
459///     .mod_prob_filter(ThresholdState::GtEq(0))
460///     .trim_read_ends_mod(10)
461///     .base_qual_filter_mod(10)
462///     .region_bed3(Bed3::<i32, u64>::new(4, 4000, 5000))
463///     .build()?;
464///
465/// # Ok::<(), Error>(())
466/// ```
467///
468/// Setting both `region_bed3` and `mod_region` should result in an error,
469/// even if they both refer to the same region. This is because setting both
470/// can result in an undefined state, so we do not allow this.
471///
472/// ```should_panic
473/// use bedrs::prelude::Bed3;
474/// use nanalogue_core::{InputModsBuilder, Error, RequiredTag, ThresholdState};
475/// use std::str::FromStr;
476///
477/// let input_options = InputModsBuilder::<RequiredTag>::default()
478///     .mod_prob_filter(ThresholdState::GtEq(0))
479///     .region_bed3(Bed3::<i32, u64>::new(4, 4000, 5000))
480///     .mod_region("chr3:4000-5000".into())
481///     .build()?;
482///
483/// # Ok::<(), Error>(())
484/// ```
485///
486/// Following example is for an [`OptionalTag`] variant with the `tag` omitted,
487/// and many other fields omitted as well, which take on a default value.
488///
489/// ```
490/// use nanalogue_core::{InputModsBuilder, Error, OptionalTag, ThresholdState};
491/// use std::str::FromStr;
492///
493/// let input_options = InputModsBuilder::<OptionalTag>::default()
494///     .mod_prob_filter(ThresholdState::GtEq(0))
495///     .build()?;
496///
497/// # Ok::<(), Error>(())
498/// ```
499#[derive(Builder, Debug, Args, Clone, Serialize, Deserialize)]
500#[builder(default, build_fn(error = "Error", validate = "Self::validate"))]
501#[serde(default)]
502#[non_exhaustive]
503pub struct InputMods<S: TagState + Args + FromArgMatches + Default> {
504    /// modified tag
505    #[clap(flatten)]
506    pub tag: S,
507    /// modified strand, set this to `bc` or `bc_comp`, meaning
508    /// on basecalled strand or its complement. Some technologies
509    /// like `PacBio` or `ONT` duplex can call mod data on both a strand
510    /// and its complementary DNA and store it in the record corresponding
511    /// to the strand, so you can use this filter to select only for
512    /// mod data on a strand or its complement. Please note that this
513    /// filter is different from selecting for forward or reverse
514    /// aligned reads using the BAM flags.
515    #[clap(long)]
516    #[builder(field(
517        ty = "String",
518        build = "(!self.mod_strand.is_empty()).then(|| RestrictModCalledStrand::from_str(&self.mod_strand)).transpose()?"
519    ))]
520    pub mod_strand: Option<RestrictModCalledStrand>,
521    /// Filter to reject mods before analysis. Specify as low,high where
522    /// both are fractions to reject modifications where the probabilities (p)
523    /// are in this range e.g. "0.4,0.6" rejects 0.4 <= p <= 0.6.
524    /// You can use this to reject 'weak' modification calls before analysis
525    /// i.e. those with probabilities close to 0.5.
526    /// NOTE: (1) Whether this filtration is applied or not, mods < 0.5
527    /// are considered unmodified and >= 0.5 are considered modified by our program.
528    /// (2) mod probabilities are stored as a number from 0-255 in the modBAM format,
529    /// so we internally convert 0.0-1.0 to 0-255. Default: reject nothing.
530    #[clap(long, value_parser=ThresholdState::from_str_ordpair_fraction, default_value = "")]
531    pub mod_prob_filter: ThresholdState,
532    /// Filter this many bp at the start and
533    /// end of a read before any mod operations.
534    /// Please note that the units here are bp and
535    /// not units of base being queried.
536    #[clap(long, default_value_t)]
537    pub trim_read_ends_mod: usize,
538    /// Exclude bases whose base quality is below
539    /// this threshold before any mod operation, defaults to 0 i.e. unused.
540    /// NOTE: (1) This step is only applied before modification operations, and
541    /// not before any other operations.
542    /// (2) No offsets such as +33 are needed here.
543    /// (3) Modifications on reads where base quality information
544    /// is not available are all rejected if this is non-zero.
545    #[clap(long, default_value_t)]
546    pub base_qual_filter_mod: u8,
547    /// Only keep modification data from this region
548    #[clap(long)]
549    #[builder(field(
550        ty = "String",
551        build = "(!self.mod_region.is_empty()).then(|| GenomicRegion::from_str(&self.mod_region)).transpose()?"
552    ))]
553    pub mod_region: Option<GenomicRegion>,
554    /// Only keep modification data from this region.
555    /// We do not expose this to the user, but infer it from
556    /// the other options set by the user.
557    /// We cannot populate this directly at the time of CLI parsing,
558    /// as we need to look at the BAM header to convert a contig name
559    /// into a numeric contig id.
560    #[clap(skip)]
561    #[builder(setter(strip_option))]
562    pub region_bed3: Option<Bed3<i32, u64>>,
563}
564
565impl<S: TagState + Args + FromArgMatches + Default> InputModsBuilder<S> {
566    /// Validate [`InputMods`] creation through the Builder method.
567    ///
568    /// [`InputMods`] can be created in many ways:
569    /// - directly populating the fields as they are public.
570    /// - through the CLI-machinery
571    /// - through the builder.
572    ///
573    /// While we cannot check a user's usage through the first method,
574    /// we can check constructions through methods 2 and 3 above.
575    /// The CLI construction checking is handled by `clap`, and the builder
576    /// checking is done through this function.
577    /// The construction checks are slightly different as what may be feasible
578    /// through a CLI and through code are different.
579    /// So we have this validate method to check construction through
580    /// method number 3 above.
581    fn validate(&self) -> Result<(), Error> {
582        // self.mod_region is a `String` for the Builder before conversion into an `Option<_>`
583        if (!self.mod_region.is_empty()) && self.region_bed3.is_some() {
584            Err(Error::BuilderValidation(
585                "cannot set `mod_region` and `region_bed3` simultaneously!".to_owned(),
586            ))
587        } else {
588            Ok(())
589        }
590    }
591}
592
593/// Implements defaults for `InputMods`
594impl<S: TagState + Args + FromArgMatches + Default> Default for InputMods<S> {
595    fn default() -> Self {
596        InputMods::<S> {
597            tag: S::default(),
598            mod_strand: None,
599            mod_prob_filter: ThresholdState::GtEq(0),
600            trim_read_ends_mod: 0,
601            base_qual_filter_mod: 0,
602            mod_region: None,
603            region_bed3: None,
604        }
605    }
606}
607
608/// Retrieves options for modification input
609pub trait InputModOptions {
610    /// retrieves tag
611    fn tag(&self) -> Option<ModChar> {
612        unimplemented!()
613    }
614    /// retrieves option to set basecalled strand or opposite in mod retrieval
615    fn mod_strand(&self) -> Option<RestrictModCalledStrand> {
616        unimplemented!()
617    }
618    /// returns probability filter
619    fn mod_prob_filter(&self) -> ThresholdState {
620        unimplemented!()
621    }
622    /// returns read end trimming
623    fn trim_read_ends_mod(&self) -> usize {
624        unimplemented!()
625    }
626    /// returns threshold for filtering base PHRED quality
627    fn base_qual_filter_mod(&self) -> u8 {
628        unimplemented!()
629    }
630}
631
632/// Retrieves options for region
633pub trait InputRegionOptions {
634    /// returns region requested
635    fn region_filter(&self) -> &Option<Bed3<i32, u64>> {
636        unimplemented!()
637    }
638    /// returns region requested but region in genomic string format
639    fn region_filter_genomic_string(&self) -> Option<GenomicRegion> {
640        unimplemented!()
641    }
642    /// sets region requested
643    fn set_region_filter(&mut self, _value: Option<Bed3<i32, u64>>) {
644        unimplemented!()
645    }
646    /// returns true if full overlap with region is requested as opposed to
647    /// only partial overlap. defaults to false.
648    fn is_full_overlap(&self) -> bool {
649        false
650    }
651    /// converts region from genomic string representation to bed3 representation
652    ///
653    /// # Errors
654    /// Returns error if conversion fails
655    fn convert_region_to_bed3(&mut self, header: bam::HeaderView) -> Result<(), Error> {
656        match (
657            self.region_filter_genomic_string().is_some(),
658            self.region_filter().is_some(),
659        ) {
660            (false, false) => self.set_region_filter(None),
661            (true, false) => {
662                let genomic_region = self
663                    .region_filter_genomic_string()
664                    .expect("checked above that this is Some");
665                self.set_region_filter(Some(genomic_region.try_to_bed3(&header)?));
666            }
667            (false, true) => {}
668            (true, true) => {
669                return Err(Error::InvalidState(
670                    "cannot set a region as both a `GenomicRegion` and a `Bed3`".to_owned(),
671                ));
672            }
673        }
674        Ok(())
675    }
676}
677
678impl<S: TagState + Args + FromArgMatches + Default> InputModOptions for InputMods<S> {
679    fn tag(&self) -> Option<ModChar> {
680        self.tag.tag()
681    }
682    fn mod_strand(&self) -> Option<RestrictModCalledStrand> {
683        self.mod_strand
684    }
685    fn mod_prob_filter(&self) -> ThresholdState {
686        self.mod_prob_filter
687    }
688    fn trim_read_ends_mod(&self) -> usize {
689        self.trim_read_ends_mod
690    }
691    fn base_qual_filter_mod(&self) -> u8 {
692        self.base_qual_filter_mod
693    }
694}
695
696impl<S: TagState + Args + FromArgMatches + Default> InputRegionOptions for InputMods<S> {
697    fn region_filter_genomic_string(&self) -> Option<GenomicRegion> {
698        self.mod_region.clone()
699    }
700    fn region_filter(&self) -> &Option<Bed3<i32, u64>> {
701        &self.region_bed3
702    }
703    fn set_region_filter(&mut self, value: Option<Bed3<i32, u64>>) {
704        self.region_bed3 = value;
705    }
706}
707
708/// can return tag without encasing in an option in the `RequiredTag` variant
709impl InputMods<RequiredTag> {
710    /// retrieves tag
711    #[must_use]
712    pub fn required_tag(&self) -> ModChar {
713        self.tag.tag
714    }
715}
716
717impl InputRegionOptions for InputBam {
718    fn region_filter_genomic_string(&self) -> Option<GenomicRegion> {
719        self.region.clone()
720    }
721    fn region_filter(&self) -> &Option<Bed3<i32, u64>> {
722        &self.region_bed3
723    }
724    fn set_region_filter(&mut self, value: Option<Bed3<i32, u64>>) {
725        self.region_bed3 = value;
726    }
727    fn is_full_overlap(&self) -> bool {
728        self.full_region
729    }
730}
731
732/// This struct contains the options input to our
733/// modification-data-windowing functions
734///
735/// # Example
736/// ```
737/// use nanalogue_core::{Error, InputWindowingBuilder};
738///
739/// let window = InputWindowingBuilder::default()
740///     .win(20)
741///     .step(10).build()?;
742/// # Ok::<(), Error>(())
743/// ```
744#[derive(Builder, Debug, Args, Clone, Copy, Serialize, Deserialize)]
745#[builder(default, build_fn(error = "Error"))]
746#[serde(default)]
747#[non_exhaustive]
748pub struct InputWindowing {
749    /// size of window in units of base being queried i.e.
750    /// if you are looking for cytosine modifications, then
751    /// a window of a value 300 means create windows each with
752    /// 300 cytosines irrespective of their modification status.
753    #[clap(long)]
754    #[builder(field(ty = "usize", build = "NonZeroUsize::try_from(self.win)?"))]
755    pub win: NonZeroUsize,
756    /// step window by this size in units of base being queried.
757    #[clap(long)]
758    #[builder(field(ty = "usize", build = "NonZeroUsize::try_from(self.step)?"))]
759    pub step: NonZeroUsize,
760}
761
762/// Implements a default for `InputWindowing`.
763/// NOTE the defaults of 1 for each are just for ease of programming.
764/// We do not expose these defaults to the command-line user.
765impl Default for InputWindowing {
766    fn default() -> Self {
767        InputWindowing {
768            win: NonZeroUsize::new(1).expect("no error"),
769            step: NonZeroUsize::new(1).expect("no error"),
770        }
771    }
772}
773
774/// This enum contains inputs to display sequences with various options
775#[derive(Debug, Clone, Default, Copy, Serialize, Deserialize)]
776#[non_exhaustive]
777pub enum SeqDisplayOptions {
778    /// Do not display any sequence, default
779    #[default]
780    No,
781    /// Display the entire sequence without any subsetting
782    Full {
783        /// Option to display basecalling qualities
784        show_base_qual: bool,
785    },
786    /// Display the sequence within a region
787    Region {
788        /// Option to display basecalling qualities
789        show_base_qual: bool,
790        /// Option to show bases in lowercase if they are insertions
791        show_ins_lowercase: bool,
792        /// Option to show modified bases as Z (or z depending on other options)
793        show_mod_z: bool,
794        /// Region over which sequence must be shown
795        region: Bed3<i32, u64>,
796    },
797}
798
799#[cfg(test)]
800mod tag_struct_tests {
801    use super::*;
802
803    #[test]
804    fn optional_tag_default() {
805        let optional_tag = OptionalTag::default();
806        assert!(optional_tag.tag.is_none());
807        assert_eq!(optional_tag.tag(), None);
808    }
809
810    #[test]
811    fn optional_tag_with_value() {
812        let mod_char = ModChar::new('m');
813        let optional_tag = OptionalTag {
814            tag: Some(mod_char),
815        };
816        assert_eq!(optional_tag.tag(), Some(mod_char));
817    }
818
819    #[test]
820    fn required_tag_default() {
821        let required_tag = RequiredTag::default();
822        assert_eq!(required_tag.tag(), Some(ModChar::default()));
823    }
824
825    #[test]
826    fn required_tag_with_value() {
827        let mod_char = ModChar::new('C');
828        let required_tag = RequiredTag { tag: mod_char };
829        assert_eq!(required_tag.tag(), Some(mod_char));
830    }
831
832    #[test]
833    fn optional_tag_from_str() {
834        let tag = OptionalTag::from_str("m").unwrap();
835        assert_eq!(tag.tag.unwrap().val(), 'm');
836    }
837
838    #[test]
839    fn required_tag_from_str() {
840        let tag = RequiredTag::from_str("m").unwrap();
841        assert_eq!(tag.tag.val(), 'm');
842    }
843}
844
845#[cfg(test)]
846mod input_windowing_tests {
847    use super::*;
848
849    #[test]
850    fn input_windowing_default() {
851        let windowing = InputWindowing::default();
852        assert_eq!(windowing.win, NonZeroUsize::new(1).unwrap());
853        assert_eq!(windowing.step, NonZeroUsize::new(1).unwrap());
854    }
855
856    #[test]
857    fn input_windowing_custom_values() {
858        let windowing = InputWindowing {
859            win: NonZeroUsize::new(300).unwrap(),
860            step: NonZeroUsize::new(150).unwrap(),
861        };
862        assert_eq!(windowing.win, NonZeroUsize::new(300).unwrap());
863        assert_eq!(windowing.step, NonZeroUsize::new(150).unwrap());
864    }
865
866    #[test]
867    fn input_windowing_builder() {
868        let _: InputWindowing = InputWindowingBuilder::default()
869            .win(20)
870            .step(10)
871            .build()
872            .unwrap();
873    }
874}
875
876#[cfg(test)]
877mod input_mods_required_tag_tests {
878    use super::*;
879
880    #[test]
881    fn input_mods_required_tag_fn_tag() {
882        let mod_char = ModChar::new('C');
883        let input_mods = InputMods::<RequiredTag> {
884            tag: RequiredTag { tag: mod_char },
885            mod_strand: None,
886            mod_prob_filter: ThresholdState::GtEq(0),
887            trim_read_ends_mod: 0,
888            base_qual_filter_mod: 0,
889            mod_region: None,
890            region_bed3: None,
891        };
892
893        assert_eq!(input_mods.required_tag(), mod_char);
894    }
895
896    #[test]
897    fn input_mods_builder_required_tag_with_mod_region() {
898        let _: InputMods<RequiredTag> = InputModsBuilder::default()
899            .tag(RequiredTag::from_str("m").unwrap())
900            .mod_strand("bc".into())
901            .mod_prob_filter(ThresholdState::GtEq(0))
902            .trim_read_ends_mod(10)
903            .base_qual_filter_mod(10)
904            .mod_region("chr3:4000-5000".into())
905            .build()
906            .unwrap();
907    }
908
909    #[test]
910    fn input_mods_builder_required_tag_with_region_bed3() {
911        let _: InputMods<RequiredTag> = InputModsBuilder::<RequiredTag>::default()
912            .tag(RequiredTag::from_str("m").unwrap())
913            .mod_strand("bc".into())
914            .mod_prob_filter(ThresholdState::GtEq(0))
915            .trim_read_ends_mod(10)
916            .base_qual_filter_mod(10)
917            .region_bed3(Bed3::<i32, u64>::new(4, 4000, 5000))
918            .build()
919            .unwrap();
920    }
921
922    #[test]
923    fn input_mods_builder_optional_tag_minimal() {
924        let _: InputMods<OptionalTag> = InputModsBuilder::<OptionalTag>::default()
925            .mod_prob_filter(ThresholdState::GtEq(0))
926            .build()
927            .unwrap();
928    }
929}
930
931#[cfg(test)]
932mod input_bam_tests {
933    use super::*;
934    use bedrs::Coordinates as _;
935    use indoc::indoc;
936
937    #[test]
938    fn input_bam_is_full_overlap() {
939        // Test default (false)
940        let input_bam_default = InputBam::default();
941        assert!(!input_bam_default.is_full_overlap());
942
943        // Test explicit false
944        let input_bam_false = InputBam {
945            full_region: false,
946            ..Default::default()
947        };
948        assert!(!input_bam_false.is_full_overlap());
949
950        // Test true
951        let input_bam_true = InputBam {
952            full_region: true,
953            ..Default::default()
954        };
955        assert!(input_bam_true.is_full_overlap());
956    }
957
958    #[test]
959    fn input_bam_builder() {
960        let _: InputBam = InputBamBuilder::default()
961            .bam_path(PathOrURLOrStdin::Path("/some/path/to/bam.bam".into()))
962            .min_seq_len(30000u64)
963            .min_align_len(20000i64)
964            .read_id("some-id")
965            .read_filter("primary_forward,secondary_forward".into())
966            .sample_fraction(F32Bw0and1::one())
967            .mapq_filter(20)
968            .exclude_mapq_unavail(true)
969            .region("chr4:1000-2000".into())
970            .full_region(true)
971            .build()
972            .unwrap();
973    }
974
975    #[test]
976    fn input_bam_convert_region_to_bed3_none() {
977        let mut input_bam = InputBam::default();
978
979        let header_view = bam::HeaderView::from_bytes(indoc! {b"@HD\tVN:1.6\tSO:coordinate
980        @SQ\tSN:chr1\tLN:248956422\n"});
981
982        input_bam.convert_region_to_bed3(header_view).unwrap();
983        assert!(input_bam.region_bed3.is_none());
984    }
985
986    #[test]
987    fn input_bam_convert_region_to_bed3_with_region() {
988        let mut input_bam = InputBam {
989            region: Some(GenomicRegion::from_str("chr2:3400-3600").unwrap()),
990            ..Default::default()
991        };
992
993        let header_view = bam::HeaderView::from_bytes(indoc! {b"@HD\tVN:1.6\tSO:coordinate
994                @SQ\tSN:chr1\tLN:3000
995                @SQ\tSN:chr2\tLN:4000\n"});
996
997        input_bam.convert_region_to_bed3(header_view).unwrap();
998        assert!(input_bam.region_bed3.is_some());
999
1000        let bed3 = input_bam.region_bed3.unwrap();
1001        assert_eq!(bed3.chr(), &1);
1002        assert_eq!(bed3.start(), 3400);
1003        assert_eq!(bed3.end(), 3600);
1004    }
1005
1006    #[test]
1007    #[should_panic(expected = "InvalidRegion")]
1008    fn input_bam_convert_region_to_bed3_invalid_region() {
1009        let mut input_bam = InputBam {
1010            region: Some(GenomicRegion::from_str("chr2:4400-4600").expect("no error")),
1011            ..Default::default()
1012        };
1013        let header_view = bam::HeaderView::from_bytes(indoc! {b"@HD\tVN:1.6\tSO:coordinate
1014                @SQ\tSN:chr1\tLN:3000
1015                @SQ\tSN:chr2\tLN:4000\n"});
1016
1017        input_bam.convert_region_to_bed3(header_view).unwrap();
1018    }
1019
1020    #[test]
1021    #[should_panic(expected = "InvalidRegion")]
1022    fn input_bam_convert_region_to_bed3_invalid_open_ended_region() {
1023        let mut input_bam = InputBam {
1024            region: Some(GenomicRegion::from_str("chr2:4600-").expect("no error")),
1025            ..Default::default()
1026        };
1027        let header_view = bam::HeaderView::from_bytes(indoc! {b"@HD\tVN:1.6\tSO:coordinate
1028                @SQ\tSN:chr1\tLN:3000
1029                @SQ\tSN:chr2\tLN:4000\n"});
1030
1031        input_bam.convert_region_to_bed3(header_view).unwrap();
1032    }
1033
1034    #[test]
1035    #[should_panic(expected = "InvalidAlignCoords")]
1036    fn input_bam_convert_region_to_bed3_invalid_contig() {
1037        let mut input_bam = InputBam {
1038            region: Some(GenomicRegion::from_str("chr3:1000-2000").expect("no error")),
1039            ..Default::default()
1040        };
1041        let header_view = bam::HeaderView::from_bytes(indoc! {b"@HD\tVN:1.6\tSO:coordinate
1042                @SQ\tSN:chr1\tLN:3000
1043                @SQ\tSN:chr2\tLN:4000\n"});
1044
1045        input_bam.convert_region_to_bed3(header_view).unwrap();
1046    }
1047
1048    #[test]
1049    fn input_bam_convert_region_to_bed3_already_set_bed3() {
1050        // When region_bed3 is already set and region is None, should not convert
1051        let mut input_bam = InputBam {
1052            region: None,
1053            region_bed3: Some(Bed3::<i32, u64>::new(1, 1000, 2000)),
1054            ..Default::default()
1055        };
1056
1057        let header_view = bam::HeaderView::from_bytes(indoc! {b"@HD\tVN:1.6\tSO:coordinate
1058                @SQ\tSN:chr1\tLN:3000
1059                @SQ\tSN:chr2\tLN:4000\n"});
1060
1061        input_bam.convert_region_to_bed3(header_view).unwrap();
1062
1063        // Bed3 should remain unchanged
1064        assert!(input_bam.region_bed3.is_some());
1065        let bed3 = input_bam.region_bed3.unwrap();
1066        assert_eq!(bed3.chr(), &1);
1067        assert_eq!(bed3.start(), 1000);
1068        assert_eq!(bed3.end(), 2000);
1069    }
1070
1071    #[test]
1072    #[should_panic(expected = "InvalidState")]
1073    fn input_bam_convert_region_to_bed3_both_set() {
1074        // When both region and region_bed3 are set, should error
1075        let mut input_bam = InputBam {
1076            region: Some(GenomicRegion::from_str("chr2:1500-2500").unwrap()),
1077            region_bed3: Some(Bed3::<i32, u64>::new(1, 1000, 2000)),
1078            ..Default::default()
1079        };
1080
1081        let header_view = bam::HeaderView::from_bytes(indoc! {b"@HD\tVN:1.6\tSO:coordinate
1082                @SQ\tSN:chr1\tLN:3000
1083                @SQ\tSN:chr2\tLN:4000\n"});
1084
1085        input_bam.convert_region_to_bed3(header_view).unwrap();
1086    }
1087}
1088
1089#[cfg(test)]
1090mod validate_builder_functions {
1091    use super::*;
1092
1093    // Tests for InputBamBuilder::validate conflicts
1094
1095    #[test]
1096    #[should_panic(expected = "BuilderValidation")]
1097    fn input_bam_builder_full_region_without_region_fails() {
1098        let _: InputBam = InputBamBuilder::default()
1099            .bam_path(PathOrURLOrStdin::Path("/some/path.bam".into()))
1100            .full_region(true)
1101            .build()
1102            .unwrap();
1103    }
1104
1105    #[test]
1106    #[should_panic(expected = "BuilderValidation")]
1107    fn input_bam_builder_both_region_and_region_bed3_fails() {
1108        let _: InputBam = InputBamBuilder::default()
1109            .bam_path(PathOrURLOrStdin::Path("/some/path.bam".into()))
1110            .region("chr1:1000-2000".into())
1111            .region_bed3(Bed3::<i32, u64>::new(0, 1000, 2000))
1112            .build()
1113            .unwrap();
1114    }
1115
1116    #[test]
1117    #[should_panic(expected = "BuilderValidation")]
1118    fn input_bam_builder_read_id_and_read_id_list_fails() {
1119        let _: InputBam = InputBamBuilder::default()
1120            .bam_path(PathOrURLOrStdin::Path("/some/path.bam".into()))
1121            .read_id("some-id")
1122            .read_id_list("/some/file.txt")
1123            .build()
1124            .unwrap();
1125    }
1126
1127    #[test]
1128    #[should_panic(expected = "BuilderValidation")]
1129    fn input_bam_builder_read_id_and_read_id_set_fails() {
1130        let mut read_ids = HashSet::new();
1131        let _: bool = read_ids.insert("read1".to_owned());
1132        let _: InputBam = InputBamBuilder::default()
1133            .bam_path(PathOrURLOrStdin::Path("/some/path.bam".into()))
1134            .read_id("some-id")
1135            .read_id_set(read_ids)
1136            .build()
1137            .unwrap();
1138    }
1139
1140    #[test]
1141    #[should_panic(expected = "BuilderValidation")]
1142    fn input_bam_builder_read_id_list_and_read_id_set_fails() {
1143        let mut read_ids = HashSet::new();
1144        let _: bool = read_ids.insert("read1".to_owned());
1145        let _: InputBam = InputBamBuilder::default()
1146            .bam_path(PathOrURLOrStdin::Path("/some/path.bam".into()))
1147            .read_id_list("/some/file.txt")
1148            .read_id_set(read_ids)
1149            .build()
1150            .unwrap();
1151    }
1152
1153    #[test]
1154    #[should_panic(expected = "BuilderValidation")]
1155    fn input_bam_builder_all_three_read_id_options_fails() {
1156        let mut read_ids = HashSet::new();
1157        let _: bool = read_ids.insert("read1".to_owned());
1158        let _: InputBam = InputBamBuilder::default()
1159            .bam_path(PathOrURLOrStdin::Path("/some/path.bam".into()))
1160            .read_id("some-id")
1161            .read_id_list("/some/file.txt")
1162            .read_id_set(read_ids)
1163            .build()
1164            .unwrap();
1165    }
1166
1167    // Tests for InputModsBuilder::validate conflicts
1168
1169    #[test]
1170    #[should_panic(expected = "BuilderValidation")]
1171    fn input_mods_builder_both_mod_region_and_region_bed3_fails() {
1172        let _: InputMods<OptionalTag> = InputModsBuilder::<OptionalTag>::default()
1173            .mod_prob_filter(ThresholdState::GtEq(0))
1174            .mod_region("chr1:1000-2000".into())
1175            .region_bed3(Bed3::<i32, u64>::new(0, 1000, 2000))
1176            .build()
1177            .unwrap();
1178    }
1179
1180    #[test]
1181    #[should_panic(expected = "BuilderValidation")]
1182    fn input_mods_builder_required_tag_both_regions_fails() {
1183        let _: InputMods<RequiredTag> = InputModsBuilder::<RequiredTag>::default()
1184            .tag(RequiredTag::from_str("m").unwrap())
1185            .mod_prob_filter(ThresholdState::GtEq(0))
1186            .mod_region("chr1:1000-2000".into())
1187            .region_bed3(Bed3::<i32, u64>::new(0, 1000, 2000))
1188            .build()
1189            .unwrap();
1190    }
1191}