Skip to main content

nanalogue_core/
cli.rs

1//! # Command line interface (CLI) options, including input processing options
2//!
3//! This file provides some global options in the command line interface.
4use crate::{
5    Error, F32Bw0and1, GenomicRegion, ModChar, PathOrURLOrStdin, ReadStates,
6    RestrictModCalledStrand, ThresholdState,
7};
8use bedrs::prelude::Bed3;
9use clap::{Args, FromArgMatches};
10use derive_builder::Builder;
11use rust_htslib::bam;
12use serde::{Deserialize, Serialize};
13use std::collections::HashSet;
14use std::num::{NonZeroU32, NonZeroUsize};
15use std::str::FromStr;
16
17/// Options to parse the input bam file and the filters that should be applied to the bam file.
18///
19/// This struct is parsed to create command line arguments and then passed to many functions.
20/// This type is adapted from a similar struct in the published crate
21/// `fibertools-rs` v0.8.2 (<https://crates.io/crates/fibertools-rs>), whose
22/// published crate metadata declares the MIT license. See
23/// `THIRD_PARTY_NOTICES.md` for the full license text we rely on.
24/// You can build this through [`InputBamBuilder`].
25/// In CLI mode, `clap` populates this struct.
26/// This and the [`InputMods`] struct are used to set almost all input options
27/// to many of our functions that process BAM/modBAM files.
28///
29/// # Examples
30///
31/// We first begin with an example to build the struct.
32/// The next example shows how this struct and [`InputMods`] can be used
33/// to construct inputs to one of our BAM processing functions.
34///
35/// Sample way to build the struct. Some of the parameters are optional
36/// and can be left unset which would give them default values.
37/// We do not check if the specified bam path or URL exists as there are
38/// use cases where files are generated before the `InputBam` object is used.
39/// Not all options are listed here; for a full list, please see all the
40/// methods of the builder.
41///
42/// ```
43/// use nanalogue_core::{Error, F32Bw0and1, InputBamBuilder, PathOrURLOrStdin};
44///
45/// let bam = InputBamBuilder::default()
46///     .bam_path(PathOrURLOrStdin::Path("/some/path/to/bam.bam".into()))
47///     .min_seq_len(30000u64)
48///     .min_align_len(20000i64)
49///     .read_id("some-id")
50///     .read_filter("primary_forward,secondary_forward".into())
51///     .sample_fraction(F32Bw0and1::new(1.0).expect("no error"))
52///     .sample_seed(42u64)
53///     .mapq_filter(20)
54///     .exclude_mapq_unavail(true)
55///     .region("chr4:1000-2000".into())
56///     .full_region(true)
57///     .build()?;
58/// # Ok::<(), Error>(())
59/// ```
60///
61/// This struct and the [`InputMods`] struct allow us to set input options
62/// for BAM/modBAM calculations. An example is shown below where the [`crate::read_info::run`]
63/// command is called to process data from a BAM file with some input options.
64///
65/// ```
66/// use nanalogue_core::{BamRcRecords, BamPreFilt, Error, InputBamBuilder, InputModsBuilder,
67///     OptionalTag, PathOrURLOrStdin, ThresholdState, nanalogue_bam_reader, read_info};
68///
69/// let mut bam = InputBamBuilder::default()
70///     .bam_path(PathOrURLOrStdin::Path("./examples/example_1.bam".into()))
71///     .region("dummyI".into())
72///     .build()?;
73/// let mut mods = InputModsBuilder::<OptionalTag>::default()
74///     .mod_prob_filter(ThresholdState::GtEq(0))
75///     .build()?;
76///
77/// let mut buffer = Vec::new();
78/// let mut reader = nanalogue_bam_reader(&bam.bam_path.to_string())?;
79/// let bam_rc_records = BamRcRecords::new(&mut reader, &mut bam, &mut mods)?;
80/// read_info::run(
81///     &mut buffer,
82///     bam_rc_records.rc_records
83///         .filter(|r| r.as_ref().map_or(true, |v| v.pre_filt(&bam))),
84///     mods,
85///     None,
86/// )?;
87/// assert!(str::from_utf8(buffer.as_slice())?
88///     .contains("5d10eb9a-aae1-4db8-8ec6-7ebb34d32575"));
89/// # Ok::<(), Error>(())
90/// ```
91///
92/// ## Examples resulting in errors
93///
94/// Full region without actually setting a region
95///
96/// ```should_panic
97/// use nanalogue_core::{Error, InputBamBuilder, PathOrURLOrStdin};
98///
99/// let bam = InputBamBuilder::default()
100///     .bam_path(PathOrURLOrStdin::Path("/some/path/to/bam.bam".into()))
101///     .read_id("some-id")
102///     .full_region(true)
103///     .build()?;
104/// # Ok::<(), Error>(())
105/// ```
106///
107/// Setting both `region` and `region_bed3`. `region` can be converted to
108/// `region_bed3` using [`GenomicRegion::try_to_bed3`] and a BAM header.
109///
110/// ```should_panic
111/// use bedrs::prelude::Bed3;
112/// use nanalogue_core::{Error, InputBamBuilder, PathOrURLOrStdin};
113///
114/// let bam = InputBamBuilder::default()
115///     .bam_path(PathOrURLOrStdin::Path("/some/path/to/bam.bam".into()))
116///     .read_id("some-id")
117///     .region("chr4:1000-2000".into())
118///     .region_bed3(Bed3::<i32,u64>::new(3, 1000, 2000))
119///     .build()?;
120/// # Ok::<(), Error>(())
121/// ```
122///
123/// Setting more than one of `read_id`, `read_id_list` and `read_id_set`.
124/// * `read_id` means filter to retain only this read.
125/// * `read_id_list` is a path to a file with a list of read ids.
126/// * `read_id_set` is a set of read ids supplied directly.
127///
128/// ```
129/// use bedrs::prelude::Bed3;
130/// use nanalogue_core::{Error, InputBamBuilder, PathOrURLOrStdin};
131/// use std::collections::HashSet;
132///
133/// let _ = InputBamBuilder::default()
134///     .bam_path(PathOrURLOrStdin::Path("/some/path/to/bam.bam".into()))
135///     .read_id("some-id")
136///     .read_id_list("/some/file.txt")
137///     .build().unwrap_err();
138///
139/// let mut read_id_set = HashSet::<String>::new();
140/// read_id_set.insert("some-read-a".to_owned());
141/// read_id_set.insert("some-read-b".to_owned());
142///
143/// let _ = InputBamBuilder::default()
144///     .bam_path(PathOrURLOrStdin::Path("/some/path/to/bam.bam".into()))
145///     .read_id_list("/some/file.txt")
146///     .read_id_set(read_id_set.clone())
147///     .build().unwrap_err();
148///
149/// let _ = InputBamBuilder::default()
150///     .bam_path(PathOrURLOrStdin::Path("/some/path/to/bam.bam".into()))
151///     .read_id("some-id")
152///     .read_id_set(read_id_set)
153///     .build().unwrap_err();
154/// ```
155#[derive(Builder, Debug, Args, Clone, Serialize, Deserialize)]
156#[serde(default)]
157#[builder(default, build_fn(error = "Error", validate = "Self::validate"))]
158#[non_exhaustive]
159pub struct InputBam {
160    /// Input BAM file. Set to a local file path, or set to - to read from stdin,
161    /// or set to a URL to read from a remote file. If using stdin and piping in
162    /// from `samtools view`, always include the header with the `-h` option.
163    pub bam_path: PathOrURLOrStdin,
164    /// Exclude reads whose sequence length in the BAM file is
165    /// below this value. Defaults to 0.
166    #[clap(long, default_value_t)]
167    #[builder(setter(into))]
168    pub min_seq_len: u64,
169    /// Exclude reads whose alignment length in the BAM file is
170    /// below this value. Defaults to unused.
171    #[clap(long)]
172    #[builder(setter(into, strip_option))]
173    pub min_align_len: Option<i64>,
174    /// Only include this read id, defaults to unused i.e. all reads are used.
175    /// NOTE: if there are multiple alignments corresponding
176    /// to this read id, all of them are used.
177    #[clap(long, conflicts_with = "read_id_list")]
178    #[builder(setter(into, strip_option))]
179    pub read_id: Option<String>,
180    /// Path to file containing list of read IDs (one per line).
181    /// Lines starting with '#' are treated as comments and ignored.
182    /// Cannot be used together with --read-id.
183    #[clap(long, conflicts_with = "read_id")]
184    #[builder(setter(into, strip_option))]
185    pub read_id_list: Option<String>,
186    /// Internal `HashSet` of read IDs loaded from `read_id_list` file.
187    /// This is populated automatically and not exposed to users.
188    #[clap(skip)]
189    #[builder(setter(into, strip_option))]
190    pub read_id_set: Option<HashSet<String>>,
191    /// Number of threads used during some aspects of program execution
192    #[clap(long, default_value_t = NonZeroU32::new(2).expect("no error"))]
193    #[builder(setter(into))]
194    pub threads: NonZeroU32,
195    /// Include "zero-length" sequences e.g. sequences with "*" in the sequence
196    /// field. By default, these sequences are excluded to avoid processing errors.
197    /// If this flag is set, these reads are included irrespective of any
198    /// minimum sequence or align length criteria the user may have set.
199    /// WARNINGS: (1) Some functions of the codebase may break or produce incorrect
200    /// results if you use this flag. (2) due to a technical reason, we need a DNA sequence
201    /// in the sequence field and cannot infer sequence length from other sources
202    /// e.g. CIGAR strings.
203    #[clap(long, default_value_t)]
204    pub include_zero_len: bool,
205    /// Only retain reads of this type. Allowed types are `primary_forward`,
206    /// `primary_reverse`, `secondary_forward`, `secondary_reverse`, `supplementary_forward`,
207    /// `supplementary_reverse` and unmapped. Specify more than one type if needed
208    /// separated by commas, in which case reads of any type in list are retained.
209    /// Defaults to retain reads of all types.
210    #[clap(long)]
211    #[builder(field(
212        ty = "String",
213        build = "(!self.read_filter.is_empty()).then(|| ReadStates::from_str(&self.read_filter)).transpose()?"
214    ))]
215    pub read_filter: Option<ReadStates>,
216    /// Subsample BAM to retain only this fraction of total number of reads,
217    /// defaults to 1.0. The sampling algorithm considers every read according
218    /// to the specified probability, so due to this, you may not always get
219    /// the same number of reads e.g. if you set `-s 0.05` in a file with 1000 reads,
220    /// you will get 50 +- sqrt(50) reads.
221    /// By default, a new subsample is drawn every time as the seed is not fixed.
222    /// Set `--sample-seed` to get reproducible subsampling.
223    #[clap(short, long, default_value_t = F32Bw0and1::one())]
224    pub sample_fraction: F32Bw0and1,
225    /// Seed for reproducible subsampling. When set, the subsampling decision
226    /// for each read is deterministic based on the read name and the seed.
227    /// Different seeds produce different subsets. If not set, subsampling
228    /// is random and non-reproducible (the default behavior).
229    #[clap(long)]
230    #[builder(setter(into, strip_option))]
231    pub sample_seed: Option<u64>,
232    /// Exclude reads whose MAPQ (Mapping quality of position) is below this value.
233    /// Defaults to zero i.e. do not exclude any read.
234    #[clap(long, default_value_t)]
235    #[builder(setter(into))]
236    pub mapq_filter: u8,
237    /// Exclude sequences with MAPQ unavailable.
238    /// In the BAM format, a value of 255 in this column means MAPQ is unavailable.
239    /// These reads are allowed by default, set this flag to exclude.
240    #[clap(long, default_value_t)]
241    pub exclude_mapq_unavail: bool,
242    /// Only keep reads passing through this region. If a BAM index is available
243    /// with a name same as the BAM file but with the .bai suffix, the operation
244    /// of selecting such reads will be faster. If you are using standard input
245    /// as your input e.g. you are piping in the output from samtools, then
246    /// you cannot use an index as a BAM filename is not available.
247    #[clap(long)]
248    #[builder(field(
249        ty = "String",
250        build = "(!self.region.is_empty()).then(|| GenomicRegion::from_str(&self.region)).transpose()?"
251    ))]
252    pub region: Option<GenomicRegion>,
253    /// Only keep read data from this region.
254    /// This is an internal option not exposed to the user, we will set it
255    /// based on the other options that the user sets.
256    #[clap(skip)]
257    #[builder(setter(into, strip_option))]
258    pub region_bed3: Option<Bed3<i32, u64>>,
259    /// Only keep reads if they pass through the specified region in full.
260    /// Related to the input `--region`; has no effect if that is not set.
261    #[clap(long, default_value_t, requires = "region")]
262    pub full_region: bool,
263}
264
265impl InputBamBuilder {
266    /// Validate [`InputBam`] creation through the Builder method.
267    ///
268    /// [`InputBam`] can be created in many ways:
269    /// - directly populating the fields as they are public.
270    /// - through the CLI-machinery
271    /// - through the builder.
272    ///
273    /// While we cannot check a user's usage through the first method,
274    /// we can check constructions through methods 2 and 3 above.
275    /// The CLI construction checking is handled by `clap`, and the builder
276    /// checking is done through this function.
277    /// The construction checks are slightly different as what may be feasible
278    /// through a CLI and through code are different.
279    /// So we have this validate method to check construction through
280    /// method number 3 above.
281    #[expect(
282        clippy::arithmetic_side_effects,
283        reason = "1 + 1 + 1 is not gonna overflow"
284    )]
285    fn validate(&self) -> Result<(), Error> {
286        match (
287            !self.region.is_empty(),
288            self.region_bed3.is_some(),
289            self.full_region,
290        ) {
291            (false, false, Some(true)) => {
292                return Err(Error::BuilderValidation("InputBamBuilder: cannot set `full_region` without setting `region` or `region_bed3`".to_owned()));
293            }
294            (true, true, _) => {
295                return Err(Error::BuilderValidation(
296                    "InputBamBuilder: cannot set both `region` and `region_bed3`".to_owned(),
297                ));
298            }
299            _ => {}
300        }
301
302        if u8::from(self.read_id.is_some())
303            + u8::from(self.read_id_list.is_some())
304            + u8::from(self.read_id_set.is_some())
305            > 1
306        {
307            Err(Error::BuilderValidation(
308                "InputBamBuilder: cannot set >1 of `read_id`, `read_id_list` and `read_id_set`"
309                    .to_owned(),
310            ))
311        } else {
312            Ok(())
313        }
314    }
315}
316
317/// Implements a default class for `InputBAM`
318impl Default for InputBam {
319    fn default() -> Self {
320        InputBam {
321            bam_path: PathOrURLOrStdin::Stdin,
322            min_seq_len: 0,
323            min_align_len: None,
324            read_id: None,
325            read_id_list: None,
326            read_id_set: None,
327            threads: NonZeroU32::new(2).expect("no error"),
328            include_zero_len: false,
329            read_filter: None,
330            sample_fraction: F32Bw0and1::one(),
331            sample_seed: None,
332            mapq_filter: 0,
333            exclude_mapq_unavail: false,
334            region: None,
335            region_bed3: None,
336            full_region: false,
337        }
338    }
339}
340
341/// This struct contains an optional modification tag
342#[derive(Debug, Default, Args, Clone, Copy, Serialize, Deserialize)]
343#[non_exhaustive]
344pub struct OptionalTag {
345    /// modified tag
346    #[clap(long)]
347    pub tag: Option<ModChar>,
348}
349
350impl FromStr for OptionalTag {
351    type Err = Error;
352
353    /// process the modification type from a string.
354    ///
355    /// ```
356    /// use nanalogue_core::{Error, ModChar, OptionalTag};
357    /// use std::str::FromStr;
358    ///
359    /// let tag = OptionalTag::from_str("m")?;
360    /// assert_eq!(tag.tag.unwrap().val(), 'm');
361    /// # Ok::<(), Error>(())
362    /// ```
363    fn from_str(mod_type: &str) -> Result<Self, Self::Err> {
364        Ok(OptionalTag {
365            tag: Some(ModChar::from_str(mod_type)?),
366        })
367    }
368}
369
370/// This struct contains a required modification tag
371#[derive(Debug, Default, Args, Clone, Copy, Serialize, Deserialize)]
372#[non_exhaustive]
373pub struct RequiredTag {
374    /// modified tag
375    #[clap(long)]
376    pub tag: ModChar,
377}
378
379impl FromStr for RequiredTag {
380    type Err = Error;
381
382    /// process the modification type from a string.
383    ///
384    /// ```
385    /// use nanalogue_core::{Error, ModChar, RequiredTag};
386    /// use std::str::FromStr;
387    ///
388    /// let tag = RequiredTag::from_str("m")?;
389    /// assert_eq!(tag.tag.val(), 'm');
390    /// # Ok::<(), Error>(())
391    /// ```
392    fn from_str(mod_type: &str) -> Result<Self, Self::Err> {
393        Ok(RequiredTag {
394            tag: ModChar::from_str(mod_type)?,
395        })
396    }
397}
398
399/// Trait that returns a modification tag
400pub trait TagState {
401    /// Returns the modification tag of the tag state in an option
402    fn tag(&self) -> Option<ModChar>;
403}
404
405impl TagState for OptionalTag {
406    fn tag(&self) -> Option<ModChar> {
407        self.tag
408    }
409}
410
411impl TagState for RequiredTag {
412    fn tag(&self) -> Option<ModChar> {
413        Some(self.tag)
414    }
415}
416
417/// This struct contains the options input to our
418/// modification-data functions with restrictions on data received.
419///
420/// If you want to build this, use [`InputModsBuilder`].
421/// In CLI mode, `clap` populates this struct.
422/// This and the [`InputBam`] struct are used to set almost all input options
423/// to many of our functions that process BAM/modBAM files.
424///
425/// # Examples
426///
427/// Please look at the documentation of [`InputBam`] to see a fully-worked
428/// example to set up inputs to perform a calculation.
429///
430/// Following example uses many fields and is for a [`RequiredTag`] variant.
431/// You can omit some of them depending on your use case. `mod_region` must
432/// can be converted to `region_bed3` using [`GenomicRegion::try_to_bed3`]
433/// with a suitable BAM header before this struct can be used with most of
434/// our functions. We do not force this in the builder route as we may not
435/// want to do this for some reason.
436///
437/// ```
438/// use nanalogue_core::{InputModsBuilder, Error, RequiredTag, ThresholdState};
439/// use std::str::FromStr;
440///
441/// let input_options = InputModsBuilder::default()
442///     .tag(RequiredTag::from_str("m")?)
443///     .mod_strand("bc".into())
444///     .mod_prob_filter(ThresholdState::GtEq(0))
445///     .trim_read_ends_mod(10)
446///     .base_qual_filter_mod(10)
447///     .mod_region("chr3:4000-5000".into())
448///     .build()?;
449///
450/// # Ok::<(), Error>(())
451/// ```
452///
453/// A [`RequiredTag`] variant that sets `region_bed3` instead of `mod_region`.
454/// If `mod_region` is set, then a downstream function must parse a BAM file,
455/// and then convert `mod_region` into `region_bed3` using the header.
456/// If the user is confident that they know the index number of a contig,
457/// then they can directly set `region_bed3`. In the example below, the user
458/// knows `chr3` corresponds to `4`, so they set the region directly.
459///
460/// ```
461/// use bedrs::prelude::Bed3;
462/// use nanalogue_core::{InputModsBuilder, Error, RequiredTag, ThresholdState};
463/// use std::str::FromStr;
464///
465/// let input_options = InputModsBuilder::<RequiredTag>::default()
466///     .tag(RequiredTag::from_str("m")?)
467///     .mod_strand("bc".into())
468///     .mod_prob_filter(ThresholdState::GtEq(0))
469///     .trim_read_ends_mod(10)
470///     .base_qual_filter_mod(10)
471///     .region_bed3(Bed3::<i32, u64>::new(4, 4000, 5000))
472///     .build()?;
473///
474/// # Ok::<(), Error>(())
475/// ```
476///
477/// Setting both `region_bed3` and `mod_region` should result in an error,
478/// even if they both refer to the same region. This is because setting both
479/// can result in an undefined state, so we do not allow this.
480///
481/// ```should_panic
482/// use bedrs::prelude::Bed3;
483/// use nanalogue_core::{InputModsBuilder, Error, RequiredTag, ThresholdState};
484/// use std::str::FromStr;
485///
486/// let input_options = InputModsBuilder::<RequiredTag>::default()
487///     .mod_prob_filter(ThresholdState::GtEq(0))
488///     .region_bed3(Bed3::<i32, u64>::new(4, 4000, 5000))
489///     .mod_region("chr3:4000-5000".into())
490///     .build()?;
491///
492/// # Ok::<(), Error>(())
493/// ```
494///
495/// Following example is for an [`OptionalTag`] variant with the `tag` omitted,
496/// and many other fields omitted as well, which take on a default value.
497///
498/// ```
499/// use nanalogue_core::{InputModsBuilder, Error, OptionalTag, ThresholdState};
500/// use std::str::FromStr;
501///
502/// let input_options = InputModsBuilder::<OptionalTag>::default()
503///     .mod_prob_filter(ThresholdState::GtEq(0))
504///     .build()?;
505///
506/// # Ok::<(), Error>(())
507/// ```
508#[derive(Builder, Debug, Args, Clone, Serialize, Deserialize)]
509#[builder(default, build_fn(error = "Error", validate = "Self::validate"))]
510#[serde(default)]
511#[non_exhaustive]
512pub struct InputMods<S: TagState + Args + FromArgMatches + Default> {
513    /// modified tag
514    #[clap(flatten)]
515    pub tag: S,
516    /// modified strand, set this to `bc` or `bc_comp`, meaning
517    /// on basecalled strand or its complement. Some technologies
518    /// like `PacBio` or `ONT` duplex can call mod data on both a strand
519    /// and its complementary DNA and store it in the record corresponding
520    /// to the strand, so you can use this filter to select only for
521    /// mod data on a strand or its complement. Please note that this
522    /// filter is different from selecting for forward or reverse
523    /// aligned reads using the BAM flags.
524    #[clap(long)]
525    #[builder(field(
526        ty = "String",
527        build = "(!self.mod_strand.is_empty()).then(|| RestrictModCalledStrand::from_str(&self.mod_strand)).transpose()?"
528    ))]
529    pub mod_strand: Option<RestrictModCalledStrand>,
530    /// Filter to reject mods before analysis. Specify as low,high where
531    /// both are fractions to reject modifications where the probabilities (p)
532    /// are in this range e.g. "0.4,0.6" rejects 0.4 <= p <= 0.6.
533    /// You can use this to reject 'weak' modification calls before analysis
534    /// i.e. those with probabilities close to 0.5.
535    /// NOTE: (1) Whether this filtration is applied or not, mods < 0.5
536    /// are considered unmodified and >= 0.5 are considered modified by our program.
537    /// (2) mod probabilities are stored as a number from 0-255 in the modBAM format,
538    /// so we internally convert 0.0-1.0 to 0-255. Default: reject nothing.
539    #[clap(long, value_parser=ThresholdState::from_str_ordpair_fraction, default_value = "")]
540    pub mod_prob_filter: ThresholdState,
541    /// Filter this many bp at the start and
542    /// end of a read before any mod operations.
543    /// Please note that the units here are bp and
544    /// not units of base being queried.
545    #[clap(long, default_value_t)]
546    pub trim_read_ends_mod: usize,
547    /// Exclude bases whose base quality is below
548    /// this threshold before any mod operation, defaults to 0 i.e. unused.
549    /// NOTE: (1) This step is only applied before modification operations, and
550    /// not before any other operations.
551    /// (2) No offsets such as +33 are needed here.
552    /// (3) Modifications on reads where base quality information
553    /// is not available are all rejected if this is non-zero.
554    #[clap(long, default_value_t)]
555    pub base_qual_filter_mod: u8,
556    /// Only keep modification data from this region
557    #[clap(long)]
558    #[builder(field(
559        ty = "String",
560        build = "(!self.mod_region.is_empty()).then(|| GenomicRegion::from_str(&self.mod_region)).transpose()?"
561    ))]
562    pub mod_region: Option<GenomicRegion>,
563    /// Only keep modification data from this region.
564    /// We do not expose this to the user, but infer it from
565    /// the other options set by the user.
566    /// We cannot populate this directly at the time of CLI parsing,
567    /// as we need to look at the BAM header to convert a contig name
568    /// into a numeric contig id.
569    #[clap(skip)]
570    #[builder(setter(strip_option))]
571    pub region_bed3: Option<Bed3<i32, u64>>,
572}
573
574impl<S: TagState + Args + FromArgMatches + Default> InputModsBuilder<S> {
575    /// Validate [`InputMods`] creation through the Builder method.
576    ///
577    /// [`InputMods`] can be created in many ways:
578    /// - directly populating the fields as they are public.
579    /// - through the CLI-machinery
580    /// - through the builder.
581    ///
582    /// While we cannot check a user's usage through the first method,
583    /// we can check constructions through methods 2 and 3 above.
584    /// The CLI construction checking is handled by `clap`, and the builder
585    /// checking is done through this function.
586    /// The construction checks are slightly different as what may be feasible
587    /// through a CLI and through code are different.
588    /// So we have this validate method to check construction through
589    /// method number 3 above.
590    fn validate(&self) -> Result<(), Error> {
591        // self.mod_region is a `String` for the Builder before conversion into an `Option<_>`
592        if (!self.mod_region.is_empty()) && self.region_bed3.is_some() {
593            Err(Error::BuilderValidation(
594                "cannot set `mod_region` and `region_bed3` simultaneously!".to_owned(),
595            ))
596        } else {
597            Ok(())
598        }
599    }
600}
601
602/// Implements defaults for `InputMods`
603impl<S: TagState + Args + FromArgMatches + Default> Default for InputMods<S> {
604    fn default() -> Self {
605        InputMods::<S> {
606            tag: S::default(),
607            mod_strand: None,
608            mod_prob_filter: ThresholdState::GtEq(0),
609            trim_read_ends_mod: 0,
610            base_qual_filter_mod: 0,
611            mod_region: None,
612            region_bed3: None,
613        }
614    }
615}
616
617/// Retrieves options for modification input
618pub trait InputModOptions {
619    /// retrieves tag
620    fn tag(&self) -> Option<ModChar>;
621    /// retrieves option to set basecalled strand or opposite in mod retrieval
622    fn mod_strand(&self) -> Option<RestrictModCalledStrand>;
623    /// returns probability filter
624    fn mod_prob_filter(&self) -> ThresholdState;
625    /// returns read end trimming
626    fn trim_read_ends_mod(&self) -> usize;
627    /// returns threshold for filtering base PHRED quality
628    fn base_qual_filter_mod(&self) -> u8;
629}
630
631/// Retrieves options for region
632pub trait InputRegionOptions {
633    /// returns region requested
634    fn region_filter(&self) -> &Option<Bed3<i32, u64>>;
635    /// returns region requested but region in genomic string format
636    fn region_filter_genomic_string(&self) -> Option<GenomicRegion>;
637    /// sets region requested
638    fn set_region_filter(&mut self, _value: Option<Bed3<i32, u64>>);
639    /// returns true if full overlap with region is requested as opposed to
640    /// only partial overlap. defaults to false.
641    fn is_full_overlap(&self) -> bool {
642        false
643    }
644    /// converts region from genomic string representation to bed3 representation
645    ///
646    /// # Errors
647    /// Returns error if conversion fails
648    fn convert_region_to_bed3(&mut self, header: bam::HeaderView) -> Result<(), Error> {
649        match (
650            self.region_filter_genomic_string().is_some(),
651            self.region_filter().is_some(),
652        ) {
653            (false, false) => self.set_region_filter(None),
654            (true, false) => {
655                let genomic_region = self
656                    .region_filter_genomic_string()
657                    .expect("checked above that this is Some");
658                self.set_region_filter(Some(genomic_region.try_to_bed3(&header)?));
659            }
660            (false, true) => {}
661            (true, true) => {
662                return Err(Error::InvalidState(
663                    "cannot set a region as both a `GenomicRegion` and a `Bed3`".to_owned(),
664                ));
665            }
666        }
667        Ok(())
668    }
669}
670
671impl<S: TagState + Args + FromArgMatches + Default> InputModOptions for InputMods<S> {
672    fn tag(&self) -> Option<ModChar> {
673        self.tag.tag()
674    }
675    fn mod_strand(&self) -> Option<RestrictModCalledStrand> {
676        self.mod_strand
677    }
678    fn mod_prob_filter(&self) -> ThresholdState {
679        self.mod_prob_filter
680    }
681    fn trim_read_ends_mod(&self) -> usize {
682        self.trim_read_ends_mod
683    }
684    fn base_qual_filter_mod(&self) -> u8 {
685        self.base_qual_filter_mod
686    }
687}
688
689impl<S: TagState + Args + FromArgMatches + Default> InputRegionOptions for InputMods<S> {
690    fn region_filter_genomic_string(&self) -> Option<GenomicRegion> {
691        self.mod_region.clone()
692    }
693    fn region_filter(&self) -> &Option<Bed3<i32, u64>> {
694        &self.region_bed3
695    }
696    fn set_region_filter(&mut self, value: Option<Bed3<i32, u64>>) {
697        self.region_bed3 = value;
698    }
699}
700
701/// can return tag without encasing in an option in the `RequiredTag` variant
702impl InputMods<RequiredTag> {
703    /// retrieves tag
704    #[must_use]
705    pub fn required_tag(&self) -> ModChar {
706        self.tag.tag
707    }
708}
709
710impl InputRegionOptions for InputBam {
711    fn region_filter_genomic_string(&self) -> Option<GenomicRegion> {
712        self.region.clone()
713    }
714    fn region_filter(&self) -> &Option<Bed3<i32, u64>> {
715        &self.region_bed3
716    }
717    fn set_region_filter(&mut self, value: Option<Bed3<i32, u64>>) {
718        self.region_bed3 = value;
719    }
720    fn is_full_overlap(&self) -> bool {
721        self.full_region
722    }
723}
724
725/// This struct contains the options input to our
726/// modification-data-windowing functions
727///
728/// # Example
729/// ```
730/// use nanalogue_core::{Error, InputWindowingBuilder};
731///
732/// let window = InputWindowingBuilder::default()
733///     .win(20)
734///     .step(10).build()?;
735/// # Ok::<(), Error>(())
736/// ```
737#[derive(Builder, Debug, Args, Clone, Copy, Serialize, Deserialize)]
738#[builder(default, build_fn(error = "Error"))]
739#[serde(default)]
740#[non_exhaustive]
741pub struct InputWindowing {
742    /// size of window in units of base being queried i.e.
743    /// if you are looking for cytosine modifications, then
744    /// a window of a value 300 means create windows each with
745    /// 300 cytosines irrespective of their modification status.
746    #[clap(long)]
747    #[builder(field(ty = "usize", build = "NonZeroUsize::try_from(self.win)?"))]
748    pub win: NonZeroUsize,
749    /// step window by this size in units of base being queried.
750    #[clap(long)]
751    #[builder(field(ty = "usize", build = "NonZeroUsize::try_from(self.step)?"))]
752    pub step: NonZeroUsize,
753}
754
755/// Implements a default for `InputWindowing`.
756/// NOTE the defaults of 1 for each are just for ease of programming.
757/// We do not expose these defaults to the command-line user.
758impl Default for InputWindowing {
759    fn default() -> Self {
760        InputWindowing {
761            win: NonZeroUsize::new(1).expect("no error"),
762            step: NonZeroUsize::new(1).expect("no error"),
763        }
764    }
765}
766
767/// This enum contains inputs to display sequences with various options
768#[derive(Debug, Clone, Default, Copy, Serialize, Deserialize)]
769#[non_exhaustive]
770pub enum SeqDisplayOptions {
771    /// Do not display any sequence, default
772    #[default]
773    No,
774    /// Display the entire sequence without any subsetting
775    Full {
776        /// Option to display basecalling qualities
777        show_base_qual: bool,
778    },
779    /// Display the sequence within a region
780    Region {
781        /// Option to display basecalling qualities
782        show_base_qual: bool,
783        /// Option to show bases in lowercase if they are insertions
784        show_ins_lowercase: bool,
785        /// Option to show modified bases as Z (or z depending on other options)
786        show_mod_z: bool,
787        /// Region over which sequence must be shown
788        region: Bed3<i32, u64>,
789    },
790}
791
792#[cfg(test)]
793mod tag_struct_tests {
794    use super::*;
795
796    #[test]
797    fn optional_tag_default() {
798        let optional_tag = OptionalTag::default();
799        assert!(optional_tag.tag.is_none());
800        assert_eq!(optional_tag.tag(), None);
801    }
802
803    #[test]
804    fn optional_tag_with_value() {
805        let mod_char = ModChar::new('m');
806        let optional_tag = OptionalTag {
807            tag: Some(mod_char),
808        };
809        assert_eq!(optional_tag.tag(), Some(mod_char));
810    }
811
812    #[test]
813    fn required_tag_default() {
814        let required_tag = RequiredTag::default();
815        assert_eq!(required_tag.tag(), Some(ModChar::default()));
816    }
817
818    #[test]
819    fn required_tag_with_value() {
820        let mod_char = ModChar::new('C');
821        let required_tag = RequiredTag { tag: mod_char };
822        assert_eq!(required_tag.tag(), Some(mod_char));
823    }
824
825    #[test]
826    fn optional_tag_from_str() {
827        let tag = OptionalTag::from_str("m").unwrap();
828        assert_eq!(tag.tag.unwrap().val(), 'm');
829    }
830
831    #[test]
832    fn required_tag_from_str() {
833        let tag = RequiredTag::from_str("m").unwrap();
834        assert_eq!(tag.tag.val(), 'm');
835    }
836}
837
838#[cfg(test)]
839mod input_windowing_tests {
840    use super::*;
841
842    #[test]
843    fn input_windowing_default() {
844        let windowing = InputWindowing::default();
845        assert_eq!(windowing.win, NonZeroUsize::new(1).unwrap());
846        assert_eq!(windowing.step, NonZeroUsize::new(1).unwrap());
847    }
848
849    #[test]
850    fn input_windowing_custom_values() {
851        let windowing = InputWindowing {
852            win: NonZeroUsize::new(300).unwrap(),
853            step: NonZeroUsize::new(150).unwrap(),
854        };
855        assert_eq!(windowing.win, NonZeroUsize::new(300).unwrap());
856        assert_eq!(windowing.step, NonZeroUsize::new(150).unwrap());
857    }
858
859    #[test]
860    fn input_windowing_builder() {
861        let _: InputWindowing = InputWindowingBuilder::default()
862            .win(20)
863            .step(10)
864            .build()
865            .unwrap();
866    }
867}
868
869#[cfg(test)]
870mod input_mods_required_tag_tests {
871    use super::*;
872
873    #[test]
874    fn input_mods_required_tag_fn_tag() {
875        let mod_char = ModChar::new('C');
876        let input_mods = InputMods::<RequiredTag> {
877            tag: RequiredTag { tag: mod_char },
878            mod_strand: None,
879            mod_prob_filter: ThresholdState::GtEq(0),
880            trim_read_ends_mod: 0,
881            base_qual_filter_mod: 0,
882            mod_region: None,
883            region_bed3: None,
884        };
885
886        assert_eq!(input_mods.required_tag(), mod_char);
887    }
888
889    #[test]
890    fn input_mods_builder_required_tag_with_mod_region() {
891        let _: InputMods<RequiredTag> = InputModsBuilder::default()
892            .tag(RequiredTag::from_str("m").unwrap())
893            .mod_strand("bc".into())
894            .mod_prob_filter(ThresholdState::GtEq(0))
895            .trim_read_ends_mod(10)
896            .base_qual_filter_mod(10)
897            .mod_region("chr3:4000-5000".into())
898            .build()
899            .unwrap();
900    }
901
902    #[test]
903    fn input_mods_builder_required_tag_with_region_bed3() {
904        let _: InputMods<RequiredTag> = InputModsBuilder::<RequiredTag>::default()
905            .tag(RequiredTag::from_str("m").unwrap())
906            .mod_strand("bc".into())
907            .mod_prob_filter(ThresholdState::GtEq(0))
908            .trim_read_ends_mod(10)
909            .base_qual_filter_mod(10)
910            .region_bed3(Bed3::<i32, u64>::new(4, 4000, 5000))
911            .build()
912            .unwrap();
913    }
914
915    #[test]
916    fn input_mods_builder_optional_tag_minimal() {
917        let _: InputMods<OptionalTag> = InputModsBuilder::<OptionalTag>::default()
918            .mod_prob_filter(ThresholdState::GtEq(0))
919            .build()
920            .unwrap();
921    }
922}
923
924#[cfg(test)]
925mod input_bam_tests {
926    use super::*;
927    use bedrs::Coordinates as _;
928
929    #[test]
930    fn input_bam_is_full_overlap() {
931        // Test default (false)
932        let input_bam_default = InputBam::default();
933        assert!(!input_bam_default.is_full_overlap());
934
935        // Test explicit false
936        let input_bam_false = InputBam {
937            full_region: false,
938            ..Default::default()
939        };
940        assert!(!input_bam_false.is_full_overlap());
941
942        // Test true
943        let input_bam_true = InputBam {
944            full_region: true,
945            ..Default::default()
946        };
947        assert!(input_bam_true.is_full_overlap());
948    }
949
950    #[test]
951    fn input_bam_builder() {
952        let _: InputBam = InputBamBuilder::default()
953            .bam_path(PathOrURLOrStdin::Path("/some/path/to/bam.bam".into()))
954            .min_seq_len(30000u64)
955            .min_align_len(20000i64)
956            .read_id("some-id")
957            .read_filter("primary_forward,secondary_forward".into())
958            .sample_fraction(F32Bw0and1::one())
959            .mapq_filter(20)
960            .exclude_mapq_unavail(true)
961            .region("chr4:1000-2000".into())
962            .full_region(true)
963            .build()
964            .unwrap();
965    }
966
967    #[test]
968    fn input_bam_deserialize_rejects_disallowed_read_filter() {
969        let bad: Result<InputBam, _> =
970            serde_json::from_str(r#"{"read_filter":[1, 700], "include_zero_len": true}"#);
971        let _: serde_json::Error = bad.unwrap_err();
972    }
973
974    #[test]
975    fn input_bam_convert_region_to_bed3_none() {
976        let mut input_bam = InputBam::default();
977
978        let header_view = bam::HeaderView::from_bytes(
979            concat!(
980                "@HD\tVN:1.6\tSO:coordinate\n",
981                "@SQ\tSN:chr1\tLN:248956422\n"
982            )
983            .as_bytes(),
984        );
985
986        input_bam.convert_region_to_bed3(header_view).unwrap();
987        assert!(input_bam.region_bed3.is_none());
988    }
989
990    #[test]
991    fn input_bam_convert_region_to_bed3_with_region() {
992        let mut input_bam = InputBam {
993            region: Some(GenomicRegion::from_str("chr2:3400-3600").unwrap()),
994            ..Default::default()
995        };
996
997        let header_view = bam::HeaderView::from_bytes(
998            concat!(
999                "@HD\tVN:1.6\tSO:coordinate\n",
1000                "@SQ\tSN:chr1\tLN:3000\n",
1001                "@SQ\tSN:chr2\tLN:4000\n"
1002            )
1003            .as_bytes(),
1004        );
1005
1006        input_bam.convert_region_to_bed3(header_view).unwrap();
1007        assert!(input_bam.region_bed3.is_some());
1008
1009        let bed3 = input_bam.region_bed3.unwrap();
1010        assert_eq!(bed3.chr(), &1);
1011        assert_eq!(bed3.start(), 3400);
1012        assert_eq!(bed3.end(), 3600);
1013    }
1014
1015    #[test]
1016    #[should_panic(expected = "InvalidRegion")]
1017    fn input_bam_convert_region_to_bed3_invalid_region() {
1018        let mut input_bam = InputBam {
1019            region: Some(GenomicRegion::from_str("chr2:4400-4600").expect("no error")),
1020            ..Default::default()
1021        };
1022        let header_view = bam::HeaderView::from_bytes(
1023            concat!(
1024                "@HD\tVN:1.6\tSO:coordinate\n",
1025                "@SQ\tSN:chr1\tLN:3000\n",
1026                "@SQ\tSN:chr2\tLN:4000\n"
1027            )
1028            .as_bytes(),
1029        );
1030
1031        input_bam.convert_region_to_bed3(header_view).unwrap();
1032    }
1033
1034    #[test]
1035    #[should_panic(expected = "InvalidRegion")]
1036    fn input_bam_convert_region_to_bed3_invalid_open_ended_region() {
1037        let mut input_bam = InputBam {
1038            region: Some(GenomicRegion::from_str("chr2:4600-").expect("no error")),
1039            ..Default::default()
1040        };
1041        let header_view = bam::HeaderView::from_bytes(
1042            concat!(
1043                "@HD\tVN:1.6\tSO:coordinate\n",
1044                "@SQ\tSN:chr1\tLN:3000\n",
1045                "@SQ\tSN:chr2\tLN:4000\n"
1046            )
1047            .as_bytes(),
1048        );
1049
1050        input_bam.convert_region_to_bed3(header_view).unwrap();
1051    }
1052
1053    #[test]
1054    #[should_panic(expected = "InvalidAlignCoords")]
1055    fn input_bam_convert_region_to_bed3_invalid_contig() {
1056        let mut input_bam = InputBam {
1057            region: Some(GenomicRegion::from_str("chr3:1000-2000").expect("no error")),
1058            ..Default::default()
1059        };
1060        let header_view = bam::HeaderView::from_bytes(
1061            concat!(
1062                "@HD\tVN:1.6\tSO:coordinate\n",
1063                "@SQ\tSN:chr1\tLN:3000\n",
1064                "@SQ\tSN:chr2\tLN:4000\n"
1065            )
1066            .as_bytes(),
1067        );
1068
1069        input_bam.convert_region_to_bed3(header_view).unwrap();
1070    }
1071
1072    #[test]
1073    fn input_bam_convert_region_to_bed3_already_set_bed3() {
1074        // When region_bed3 is already set and region is None, should not convert
1075        let mut input_bam = InputBam {
1076            region: None,
1077            region_bed3: Some(Bed3::<i32, u64>::new(1, 1000, 2000)),
1078            ..Default::default()
1079        };
1080
1081        let header_view = bam::HeaderView::from_bytes(
1082            concat!(
1083                "@HD\tVN:1.6\tSO:coordinate\n",
1084                "@SQ\tSN:chr1\tLN:3000\n",
1085                "@SQ\tSN:chr2\tLN:4000\n"
1086            )
1087            .as_bytes(),
1088        );
1089
1090        input_bam.convert_region_to_bed3(header_view).unwrap();
1091
1092        // Bed3 should remain unchanged
1093        assert!(input_bam.region_bed3.is_some());
1094        let bed3 = input_bam.region_bed3.unwrap();
1095        assert_eq!(bed3.chr(), &1);
1096        assert_eq!(bed3.start(), 1000);
1097        assert_eq!(bed3.end(), 2000);
1098    }
1099
1100    #[test]
1101    #[should_panic(expected = "InvalidState")]
1102    fn input_bam_convert_region_to_bed3_both_set() {
1103        // When both region and region_bed3 are set, should error
1104        let mut input_bam = InputBam {
1105            region: Some(GenomicRegion::from_str("chr2:1500-2500").unwrap()),
1106            region_bed3: Some(Bed3::<i32, u64>::new(1, 1000, 2000)),
1107            ..Default::default()
1108        };
1109
1110        let header_view = bam::HeaderView::from_bytes(
1111            concat!(
1112                "@HD\tVN:1.6\tSO:coordinate\n",
1113                "@SQ\tSN:chr1\tLN:3000\n",
1114                "@SQ\tSN:chr2\tLN:4000\n"
1115            )
1116            .as_bytes(),
1117        );
1118
1119        input_bam.convert_region_to_bed3(header_view).unwrap();
1120    }
1121}
1122
1123#[cfg(test)]
1124mod input_region_options_tests {
1125    use super::*;
1126
1127    #[derive(Debug, Default)]
1128    struct TestInputRegionOptions {
1129        region_filter: Option<Bed3<i32, u64>>,
1130        region_filter_genomic_string: Option<GenomicRegion>,
1131    }
1132
1133    impl InputRegionOptions for TestInputRegionOptions {
1134        fn region_filter(&self) -> &Option<Bed3<i32, u64>> {
1135            &self.region_filter
1136        }
1137
1138        fn region_filter_genomic_string(&self) -> Option<GenomicRegion> {
1139            self.region_filter_genomic_string.clone()
1140        }
1141
1142        fn set_region_filter(&mut self, value: Option<Bed3<i32, u64>>) {
1143            self.region_filter = value;
1144        }
1145    }
1146
1147    #[test]
1148    fn input_region_options_is_full_overlap_false() {
1149        let test_obj = TestInputRegionOptions::default();
1150        assert!(!test_obj.is_full_overlap());
1151    }
1152}
1153
1154#[cfg(test)]
1155mod validate_builder_functions {
1156    use super::*;
1157
1158    // Tests for InputBamBuilder::validate conflicts
1159
1160    #[test]
1161    #[should_panic(expected = "BuilderValidation")]
1162    fn input_bam_builder_full_region_without_region_fails() {
1163        let _: InputBam = InputBamBuilder::default()
1164            .bam_path(PathOrURLOrStdin::Path("/some/path.bam".into()))
1165            .full_region(true)
1166            .build()
1167            .unwrap();
1168    }
1169
1170    #[test]
1171    #[should_panic(expected = "BuilderValidation")]
1172    fn input_bam_builder_both_region_and_region_bed3_fails() {
1173        let _: InputBam = InputBamBuilder::default()
1174            .bam_path(PathOrURLOrStdin::Path("/some/path.bam".into()))
1175            .region("chr1:1000-2000".into())
1176            .region_bed3(Bed3::<i32, u64>::new(0, 1000, 2000))
1177            .build()
1178            .unwrap();
1179    }
1180
1181    #[test]
1182    #[should_panic(expected = "BuilderValidation")]
1183    fn input_bam_builder_read_id_and_read_id_list_fails() {
1184        let _: InputBam = InputBamBuilder::default()
1185            .bam_path(PathOrURLOrStdin::Path("/some/path.bam".into()))
1186            .read_id("some-id")
1187            .read_id_list("/some/file.txt")
1188            .build()
1189            .unwrap();
1190    }
1191
1192    #[test]
1193    #[should_panic(expected = "BuilderValidation")]
1194    fn input_bam_builder_read_id_and_read_id_set_fails() {
1195        let mut read_ids = HashSet::new();
1196        let _: bool = read_ids.insert("read1".to_owned());
1197        let _: InputBam = InputBamBuilder::default()
1198            .bam_path(PathOrURLOrStdin::Path("/some/path.bam".into()))
1199            .read_id("some-id")
1200            .read_id_set(read_ids)
1201            .build()
1202            .unwrap();
1203    }
1204
1205    #[test]
1206    #[should_panic(expected = "BuilderValidation")]
1207    fn input_bam_builder_read_id_list_and_read_id_set_fails() {
1208        let mut read_ids = HashSet::new();
1209        let _: bool = read_ids.insert("read1".to_owned());
1210        let _: InputBam = InputBamBuilder::default()
1211            .bam_path(PathOrURLOrStdin::Path("/some/path.bam".into()))
1212            .read_id_list("/some/file.txt")
1213            .read_id_set(read_ids)
1214            .build()
1215            .unwrap();
1216    }
1217
1218    #[test]
1219    #[should_panic(expected = "BuilderValidation")]
1220    fn input_bam_builder_all_three_read_id_options_fails() {
1221        let mut read_ids = HashSet::new();
1222        let _: bool = read_ids.insert("read1".to_owned());
1223        let _: InputBam = InputBamBuilder::default()
1224            .bam_path(PathOrURLOrStdin::Path("/some/path.bam".into()))
1225            .read_id("some-id")
1226            .read_id_list("/some/file.txt")
1227            .read_id_set(read_ids)
1228            .build()
1229            .unwrap();
1230    }
1231
1232    // Tests for InputModsBuilder::validate conflicts
1233
1234    #[test]
1235    #[should_panic(expected = "BuilderValidation")]
1236    fn input_mods_builder_both_mod_region_and_region_bed3_fails() {
1237        let _: InputMods<OptionalTag> = InputModsBuilder::<OptionalTag>::default()
1238            .mod_prob_filter(ThresholdState::GtEq(0))
1239            .mod_region("chr1:1000-2000".into())
1240            .region_bed3(Bed3::<i32, u64>::new(0, 1000, 2000))
1241            .build()
1242            .unwrap();
1243    }
1244
1245    #[test]
1246    #[should_panic(expected = "BuilderValidation")]
1247    fn input_mods_builder_required_tag_both_regions_fails() {
1248        let _: InputMods<RequiredTag> = InputModsBuilder::<RequiredTag>::default()
1249            .tag(RequiredTag::from_str("m").unwrap())
1250            .mod_prob_filter(ThresholdState::GtEq(0))
1251            .mod_region("chr1:1000-2000".into())
1252            .region_bed3(Bed3::<i32, u64>::new(0, 1000, 2000))
1253            .build()
1254            .unwrap();
1255    }
1256}