nanalogue_core/cli.rs
1//! # Command line interface (CLI) options, including input processing options
2//!
3//! This file provides some global options in the command line interface.
4use crate::{
5 Error, F32Bw0and1, GenomicRegion, ModChar, PathOrURLOrStdin, ReadStates,
6 RestrictModCalledStrand, ThresholdState,
7};
8use bedrs::prelude::Bed3;
9use clap::{Args, FromArgMatches};
10use derive_builder::Builder;
11use rust_htslib::bam;
12use serde::{Deserialize, Serialize};
13use std::collections::HashSet;
14use std::num::{NonZeroU32, NonZeroUsize};
15use std::str::FromStr;
16
17/// Options to parse the input bam file and the filters that should be applied to the bam file.
18///
19/// This struct is parsed to create command line arguments and then passed to many functions.
20/// This type is adapted from a similar struct in the published crate
21/// `fibertools-rs` v0.8.2 (<https://crates.io/crates/fibertools-rs>), whose
22/// published crate metadata declares the MIT license. See
23/// `THIRD_PARTY_NOTICES.md` for the full license text we rely on.
24/// You can build this through [`InputBamBuilder`].
25/// In CLI mode, `clap` populates this struct.
26/// This and the [`InputMods`] struct are used to set almost all input options
27/// to many of our functions that process BAM/modBAM files.
28///
29/// # Examples
30///
31/// We first begin with an example to build the struct.
32/// The next example shows how this struct and [`InputMods`] can be used
33/// to construct inputs to one of our BAM processing functions.
34///
35/// Sample way to build the struct. Some of the parameters are optional
36/// and can be left unset which would give them default values.
37/// We do not check if the specified bam path or URL exists as there are
38/// use cases where files are generated before the `InputBam` object is used.
39/// Not all options are listed here; for a full list, please see all the
40/// methods of the builder.
41///
42/// ```
43/// use nanalogue_core::{Error, F32Bw0and1, InputBamBuilder, PathOrURLOrStdin};
44///
45/// let bam = InputBamBuilder::default()
46/// .bam_path(PathOrURLOrStdin::Path("/some/path/to/bam.bam".into()))
47/// .min_seq_len(30000u64)
48/// .min_align_len(20000i64)
49/// .read_id("some-id")
50/// .read_filter("primary_forward,secondary_forward".into())
51/// .sample_fraction(F32Bw0and1::new(1.0).expect("no error"))
52/// .sample_seed(42u64)
53/// .mapq_filter(20)
54/// .exclude_mapq_unavail(true)
55/// .region("chr4:1000-2000".into())
56/// .full_region(true)
57/// .build()?;
58/// # Ok::<(), Error>(())
59/// ```
60///
61/// This struct and the [`InputMods`] struct allow us to set input options
62/// for BAM/modBAM calculations. An example is shown below where the [`crate::read_info::run`]
63/// command is called to process data from a BAM file with some input options.
64///
65/// ```
66/// use nanalogue_core::{BamRcRecords, BamPreFilt, Error, InputBamBuilder, InputModsBuilder,
67/// OptionalTag, PathOrURLOrStdin, ThresholdState, nanalogue_bam_reader, read_info};
68///
69/// let mut bam = InputBamBuilder::default()
70/// .bam_path(PathOrURLOrStdin::Path("./examples/example_1.bam".into()))
71/// .region("dummyI".into())
72/// .build()?;
73/// let mut mods = InputModsBuilder::<OptionalTag>::default()
74/// .mod_prob_filter(ThresholdState::GtEq(0))
75/// .build()?;
76///
77/// let mut buffer = Vec::new();
78/// let mut reader = nanalogue_bam_reader(&bam.bam_path.to_string())?;
79/// let bam_rc_records = BamRcRecords::new(&mut reader, &mut bam, &mut mods)?;
80/// read_info::run(
81/// &mut buffer,
82/// bam_rc_records.rc_records
83/// .filter(|r| r.as_ref().map_or(true, |v| v.pre_filt(&bam))),
84/// mods,
85/// None,
86/// )?;
87/// assert!(str::from_utf8(buffer.as_slice())?
88/// .contains("5d10eb9a-aae1-4db8-8ec6-7ebb34d32575"));
89/// # Ok::<(), Error>(())
90/// ```
91///
92/// ## Examples resulting in errors
93///
94/// Full region without actually setting a region
95///
96/// ```should_panic
97/// use nanalogue_core::{Error, InputBamBuilder, PathOrURLOrStdin};
98///
99/// let bam = InputBamBuilder::default()
100/// .bam_path(PathOrURLOrStdin::Path("/some/path/to/bam.bam".into()))
101/// .read_id("some-id")
102/// .full_region(true)
103/// .build()?;
104/// # Ok::<(), Error>(())
105/// ```
106///
107/// Setting both `region` and `region_bed3`. `region` can be converted to
108/// `region_bed3` using [`GenomicRegion::try_to_bed3`] and a BAM header.
109///
110/// ```should_panic
111/// use bedrs::prelude::Bed3;
112/// use nanalogue_core::{Error, InputBamBuilder, PathOrURLOrStdin};
113///
114/// let bam = InputBamBuilder::default()
115/// .bam_path(PathOrURLOrStdin::Path("/some/path/to/bam.bam".into()))
116/// .read_id("some-id")
117/// .region("chr4:1000-2000".into())
118/// .region_bed3(Bed3::<i32,u64>::new(3, 1000, 2000))
119/// .build()?;
120/// # Ok::<(), Error>(())
121/// ```
122///
123/// Setting more than one of `read_id`, `read_id_list` and `read_id_set`.
124/// * `read_id` means filter to retain only this read.
125/// * `read_id_list` is a path to a file with a list of read ids.
126/// * `read_id_set` is a set of read ids supplied directly.
127///
128/// ```
129/// use bedrs::prelude::Bed3;
130/// use nanalogue_core::{Error, InputBamBuilder, PathOrURLOrStdin};
131/// use std::collections::HashSet;
132///
133/// let _ = InputBamBuilder::default()
134/// .bam_path(PathOrURLOrStdin::Path("/some/path/to/bam.bam".into()))
135/// .read_id("some-id")
136/// .read_id_list("/some/file.txt")
137/// .build().unwrap_err();
138///
139/// let mut read_id_set = HashSet::<String>::new();
140/// read_id_set.insert("some-read-a".to_owned());
141/// read_id_set.insert("some-read-b".to_owned());
142///
143/// let _ = InputBamBuilder::default()
144/// .bam_path(PathOrURLOrStdin::Path("/some/path/to/bam.bam".into()))
145/// .read_id_list("/some/file.txt")
146/// .read_id_set(read_id_set.clone())
147/// .build().unwrap_err();
148///
149/// let _ = InputBamBuilder::default()
150/// .bam_path(PathOrURLOrStdin::Path("/some/path/to/bam.bam".into()))
151/// .read_id("some-id")
152/// .read_id_set(read_id_set)
153/// .build().unwrap_err();
154/// ```
155#[derive(Builder, Debug, Args, Clone, Serialize, Deserialize)]
156#[serde(default)]
157#[builder(default, build_fn(error = "Error", validate = "Self::validate"))]
158#[non_exhaustive]
159pub struct InputBam {
160 /// Input BAM file. Set to a local file path, or set to - to read from stdin,
161 /// or set to a URL to read from a remote file. If using stdin and piping in
162 /// from `samtools view`, always include the header with the `-h` option.
163 pub bam_path: PathOrURLOrStdin,
164 /// Exclude reads whose sequence length in the BAM file is
165 /// below this value. Defaults to 0.
166 #[clap(long, default_value_t)]
167 #[builder(setter(into))]
168 pub min_seq_len: u64,
169 /// Exclude reads whose alignment length in the BAM file is
170 /// below this value. Defaults to unused.
171 #[clap(long)]
172 #[builder(setter(into, strip_option))]
173 pub min_align_len: Option<i64>,
174 /// Only include this read id, defaults to unused i.e. all reads are used.
175 /// NOTE: if there are multiple alignments corresponding
176 /// to this read id, all of them are used.
177 #[clap(long, conflicts_with = "read_id_list")]
178 #[builder(setter(into, strip_option))]
179 pub read_id: Option<String>,
180 /// Path to file containing list of read IDs (one per line).
181 /// Lines starting with '#' are treated as comments and ignored.
182 /// Cannot be used together with --read-id.
183 #[clap(long, conflicts_with = "read_id")]
184 #[builder(setter(into, strip_option))]
185 pub read_id_list: Option<String>,
186 /// Internal `HashSet` of read IDs loaded from `read_id_list` file.
187 /// This is populated automatically and not exposed to users.
188 #[clap(skip)]
189 #[builder(setter(into, strip_option))]
190 pub read_id_set: Option<HashSet<String>>,
191 /// Number of threads used during some aspects of program execution
192 #[clap(long, default_value_t = NonZeroU32::new(2).expect("no error"))]
193 #[builder(setter(into))]
194 pub threads: NonZeroU32,
195 /// Include "zero-length" sequences e.g. sequences with "*" in the sequence
196 /// field. By default, these sequences are excluded to avoid processing errors.
197 /// If this flag is set, these reads are included irrespective of any
198 /// minimum sequence or align length criteria the user may have set.
199 /// WARNINGS: (1) Some functions of the codebase may break or produce incorrect
200 /// results if you use this flag. (2) due to a technical reason, we need a DNA sequence
201 /// in the sequence field and cannot infer sequence length from other sources
202 /// e.g. CIGAR strings.
203 #[clap(long, default_value_t)]
204 pub include_zero_len: bool,
205 /// Only retain reads of this type. Allowed types are `primary_forward`,
206 /// `primary_reverse`, `secondary_forward`, `secondary_reverse`, `supplementary_forward`,
207 /// `supplementary_reverse` and unmapped. Specify more than one type if needed
208 /// separated by commas, in which case reads of any type in list are retained.
209 /// Defaults to retain reads of all types.
210 #[clap(long)]
211 #[builder(field(
212 ty = "String",
213 build = "(!self.read_filter.is_empty()).then(|| ReadStates::from_str(&self.read_filter)).transpose()?"
214 ))]
215 pub read_filter: Option<ReadStates>,
216 /// Subsample BAM to retain only this fraction of total number of reads,
217 /// defaults to 1.0. The sampling algorithm considers every read according
218 /// to the specified probability, so due to this, you may not always get
219 /// the same number of reads e.g. if you set `-s 0.05` in a file with 1000 reads,
220 /// you will get 50 +- sqrt(50) reads.
221 /// By default, a new subsample is drawn every time as the seed is not fixed.
222 /// Set `--sample-seed` to get reproducible subsampling.
223 #[clap(short, long, default_value_t = F32Bw0and1::one())]
224 pub sample_fraction: F32Bw0and1,
225 /// Seed for reproducible subsampling. When set, the subsampling decision
226 /// for each read is deterministic based on the read name and the seed.
227 /// Different seeds produce different subsets. If not set, subsampling
228 /// is random and non-reproducible (the default behavior).
229 #[clap(long)]
230 #[builder(setter(into, strip_option))]
231 pub sample_seed: Option<u64>,
232 /// Exclude reads whose MAPQ (Mapping quality of position) is below this value.
233 /// Defaults to zero i.e. do not exclude any read.
234 #[clap(long, default_value_t)]
235 #[builder(setter(into))]
236 pub mapq_filter: u8,
237 /// Exclude sequences with MAPQ unavailable.
238 /// In the BAM format, a value of 255 in this column means MAPQ is unavailable.
239 /// These reads are allowed by default, set this flag to exclude.
240 #[clap(long, default_value_t)]
241 pub exclude_mapq_unavail: bool,
242 /// Only keep reads passing through this region. If a BAM index is available
243 /// with a name same as the BAM file but with the .bai suffix, the operation
244 /// of selecting such reads will be faster. If you are using standard input
245 /// as your input e.g. you are piping in the output from samtools, then
246 /// you cannot use an index as a BAM filename is not available.
247 #[clap(long)]
248 #[builder(field(
249 ty = "String",
250 build = "(!self.region.is_empty()).then(|| GenomicRegion::from_str(&self.region)).transpose()?"
251 ))]
252 pub region: Option<GenomicRegion>,
253 /// Only keep read data from this region.
254 /// This is an internal option not exposed to the user, we will set it
255 /// based on the other options that the user sets.
256 #[clap(skip)]
257 #[builder(setter(into, strip_option))]
258 pub region_bed3: Option<Bed3<i32, u64>>,
259 /// Only keep reads if they pass through the specified region in full.
260 /// Related to the input `--region`; has no effect if that is not set.
261 #[clap(long, default_value_t, requires = "region")]
262 pub full_region: bool,
263}
264
265impl InputBamBuilder {
266 /// Validate [`InputBam`] creation through the Builder method.
267 ///
268 /// [`InputBam`] can be created in many ways:
269 /// - directly populating the fields as they are public.
270 /// - through the CLI-machinery
271 /// - through the builder.
272 ///
273 /// While we cannot check a user's usage through the first method,
274 /// we can check constructions through methods 2 and 3 above.
275 /// The CLI construction checking is handled by `clap`, and the builder
276 /// checking is done through this function.
277 /// The construction checks are slightly different as what may be feasible
278 /// through a CLI and through code are different.
279 /// So we have this validate method to check construction through
280 /// method number 3 above.
281 #[expect(
282 clippy::arithmetic_side_effects,
283 reason = "1 + 1 + 1 is not gonna overflow"
284 )]
285 fn validate(&self) -> Result<(), Error> {
286 match (
287 !self.region.is_empty(),
288 self.region_bed3.is_some(),
289 self.full_region,
290 ) {
291 (false, false, Some(true)) => {
292 return Err(Error::BuilderValidation("InputBamBuilder: cannot set `full_region` without setting `region` or `region_bed3`".to_owned()));
293 }
294 (true, true, _) => {
295 return Err(Error::BuilderValidation(
296 "InputBamBuilder: cannot set both `region` and `region_bed3`".to_owned(),
297 ));
298 }
299 _ => {}
300 }
301
302 if u8::from(self.read_id.is_some())
303 + u8::from(self.read_id_list.is_some())
304 + u8::from(self.read_id_set.is_some())
305 > 1
306 {
307 Err(Error::BuilderValidation(
308 "InputBamBuilder: cannot set >1 of `read_id`, `read_id_list` and `read_id_set`"
309 .to_owned(),
310 ))
311 } else {
312 Ok(())
313 }
314 }
315}
316
317/// Implements a default class for `InputBAM`
318impl Default for InputBam {
319 fn default() -> Self {
320 InputBam {
321 bam_path: PathOrURLOrStdin::Stdin,
322 min_seq_len: 0,
323 min_align_len: None,
324 read_id: None,
325 read_id_list: None,
326 read_id_set: None,
327 threads: NonZeroU32::new(2).expect("no error"),
328 include_zero_len: false,
329 read_filter: None,
330 sample_fraction: F32Bw0and1::one(),
331 sample_seed: None,
332 mapq_filter: 0,
333 exclude_mapq_unavail: false,
334 region: None,
335 region_bed3: None,
336 full_region: false,
337 }
338 }
339}
340
341/// This struct contains an optional modification tag
342#[derive(Debug, Default, Args, Clone, Copy, Serialize, Deserialize)]
343#[non_exhaustive]
344pub struct OptionalTag {
345 /// modified tag
346 #[clap(long)]
347 pub tag: Option<ModChar>,
348}
349
350impl FromStr for OptionalTag {
351 type Err = Error;
352
353 /// process the modification type from a string.
354 ///
355 /// ```
356 /// use nanalogue_core::{Error, ModChar, OptionalTag};
357 /// use std::str::FromStr;
358 ///
359 /// let tag = OptionalTag::from_str("m")?;
360 /// assert_eq!(tag.tag.unwrap().val(), 'm');
361 /// # Ok::<(), Error>(())
362 /// ```
363 fn from_str(mod_type: &str) -> Result<Self, Self::Err> {
364 Ok(OptionalTag {
365 tag: Some(ModChar::from_str(mod_type)?),
366 })
367 }
368}
369
370/// This struct contains a required modification tag
371#[derive(Debug, Default, Args, Clone, Copy, Serialize, Deserialize)]
372#[non_exhaustive]
373pub struct RequiredTag {
374 /// modified tag
375 #[clap(long)]
376 pub tag: ModChar,
377}
378
379impl FromStr for RequiredTag {
380 type Err = Error;
381
382 /// process the modification type from a string.
383 ///
384 /// ```
385 /// use nanalogue_core::{Error, ModChar, RequiredTag};
386 /// use std::str::FromStr;
387 ///
388 /// let tag = RequiredTag::from_str("m")?;
389 /// assert_eq!(tag.tag.val(), 'm');
390 /// # Ok::<(), Error>(())
391 /// ```
392 fn from_str(mod_type: &str) -> Result<Self, Self::Err> {
393 Ok(RequiredTag {
394 tag: ModChar::from_str(mod_type)?,
395 })
396 }
397}
398
399/// Trait that returns a modification tag
400pub trait TagState {
401 /// Returns the modification tag of the tag state in an option
402 fn tag(&self) -> Option<ModChar>;
403}
404
405impl TagState for OptionalTag {
406 fn tag(&self) -> Option<ModChar> {
407 self.tag
408 }
409}
410
411impl TagState for RequiredTag {
412 fn tag(&self) -> Option<ModChar> {
413 Some(self.tag)
414 }
415}
416
417/// This struct contains the options input to our
418/// modification-data functions with restrictions on data received.
419///
420/// If you want to build this, use [`InputModsBuilder`].
421/// In CLI mode, `clap` populates this struct.
422/// This and the [`InputBam`] struct are used to set almost all input options
423/// to many of our functions that process BAM/modBAM files.
424///
425/// # Examples
426///
427/// Please look at the documentation of [`InputBam`] to see a fully-worked
428/// example to set up inputs to perform a calculation.
429///
430/// Following example uses many fields and is for a [`RequiredTag`] variant.
431/// You can omit some of them depending on your use case. `mod_region` must
432/// can be converted to `region_bed3` using [`GenomicRegion::try_to_bed3`]
433/// with a suitable BAM header before this struct can be used with most of
434/// our functions. We do not force this in the builder route as we may not
435/// want to do this for some reason.
436///
437/// ```
438/// use nanalogue_core::{InputModsBuilder, Error, RequiredTag, ThresholdState};
439/// use std::str::FromStr;
440///
441/// let input_options = InputModsBuilder::default()
442/// .tag(RequiredTag::from_str("m")?)
443/// .mod_strand("bc".into())
444/// .mod_prob_filter(ThresholdState::GtEq(0))
445/// .trim_read_ends_mod(10)
446/// .base_qual_filter_mod(10)
447/// .mod_region("chr3:4000-5000".into())
448/// .build()?;
449///
450/// # Ok::<(), Error>(())
451/// ```
452///
453/// A [`RequiredTag`] variant that sets `region_bed3` instead of `mod_region`.
454/// If `mod_region` is set, then a downstream function must parse a BAM file,
455/// and then convert `mod_region` into `region_bed3` using the header.
456/// If the user is confident that they know the index number of a contig,
457/// then they can directly set `region_bed3`. In the example below, the user
458/// knows `chr3` corresponds to `4`, so they set the region directly.
459///
460/// ```
461/// use bedrs::prelude::Bed3;
462/// use nanalogue_core::{InputModsBuilder, Error, RequiredTag, ThresholdState};
463/// use std::str::FromStr;
464///
465/// let input_options = InputModsBuilder::<RequiredTag>::default()
466/// .tag(RequiredTag::from_str("m")?)
467/// .mod_strand("bc".into())
468/// .mod_prob_filter(ThresholdState::GtEq(0))
469/// .trim_read_ends_mod(10)
470/// .base_qual_filter_mod(10)
471/// .region_bed3(Bed3::<i32, u64>::new(4, 4000, 5000))
472/// .build()?;
473///
474/// # Ok::<(), Error>(())
475/// ```
476///
477/// Setting both `region_bed3` and `mod_region` should result in an error,
478/// even if they both refer to the same region. This is because setting both
479/// can result in an undefined state, so we do not allow this.
480///
481/// ```should_panic
482/// use bedrs::prelude::Bed3;
483/// use nanalogue_core::{InputModsBuilder, Error, RequiredTag, ThresholdState};
484/// use std::str::FromStr;
485///
486/// let input_options = InputModsBuilder::<RequiredTag>::default()
487/// .mod_prob_filter(ThresholdState::GtEq(0))
488/// .region_bed3(Bed3::<i32, u64>::new(4, 4000, 5000))
489/// .mod_region("chr3:4000-5000".into())
490/// .build()?;
491///
492/// # Ok::<(), Error>(())
493/// ```
494///
495/// Following example is for an [`OptionalTag`] variant with the `tag` omitted,
496/// and many other fields omitted as well, which take on a default value.
497///
498/// ```
499/// use nanalogue_core::{InputModsBuilder, Error, OptionalTag, ThresholdState};
500/// use std::str::FromStr;
501///
502/// let input_options = InputModsBuilder::<OptionalTag>::default()
503/// .mod_prob_filter(ThresholdState::GtEq(0))
504/// .build()?;
505///
506/// # Ok::<(), Error>(())
507/// ```
508#[derive(Builder, Debug, Args, Clone, Serialize, Deserialize)]
509#[builder(default, build_fn(error = "Error", validate = "Self::validate"))]
510#[serde(default)]
511#[non_exhaustive]
512pub struct InputMods<S: TagState + Args + FromArgMatches + Default> {
513 /// modified tag
514 #[clap(flatten)]
515 pub tag: S,
516 /// modified strand, set this to `bc` or `bc_comp`, meaning
517 /// on basecalled strand or its complement. Some technologies
518 /// like `PacBio` or `ONT` duplex can call mod data on both a strand
519 /// and its complementary DNA and store it in the record corresponding
520 /// to the strand, so you can use this filter to select only for
521 /// mod data on a strand or its complement. Please note that this
522 /// filter is different from selecting for forward or reverse
523 /// aligned reads using the BAM flags.
524 #[clap(long)]
525 #[builder(field(
526 ty = "String",
527 build = "(!self.mod_strand.is_empty()).then(|| RestrictModCalledStrand::from_str(&self.mod_strand)).transpose()?"
528 ))]
529 pub mod_strand: Option<RestrictModCalledStrand>,
530 /// Filter to reject mods before analysis. Specify as low,high where
531 /// both are fractions to reject modifications where the probabilities (p)
532 /// are in this range e.g. "0.4,0.6" rejects 0.4 <= p <= 0.6.
533 /// You can use this to reject 'weak' modification calls before analysis
534 /// i.e. those with probabilities close to 0.5.
535 /// NOTE: (1) Whether this filtration is applied or not, mods < 0.5
536 /// are considered unmodified and >= 0.5 are considered modified by our program.
537 /// (2) mod probabilities are stored as a number from 0-255 in the modBAM format,
538 /// so we internally convert 0.0-1.0 to 0-255. Default: reject nothing.
539 #[clap(long, value_parser=ThresholdState::from_str_ordpair_fraction, default_value = "")]
540 pub mod_prob_filter: ThresholdState,
541 /// Filter this many bp at the start and
542 /// end of a read before any mod operations.
543 /// Please note that the units here are bp and
544 /// not units of base being queried.
545 #[clap(long, default_value_t)]
546 pub trim_read_ends_mod: usize,
547 /// Exclude bases whose base quality is below
548 /// this threshold before any mod operation, defaults to 0 i.e. unused.
549 /// NOTE: (1) This step is only applied before modification operations, and
550 /// not before any other operations.
551 /// (2) No offsets such as +33 are needed here.
552 /// (3) Modifications on reads where base quality information
553 /// is not available are all rejected if this is non-zero.
554 #[clap(long, default_value_t)]
555 pub base_qual_filter_mod: u8,
556 /// Only keep modification data from this region
557 #[clap(long)]
558 #[builder(field(
559 ty = "String",
560 build = "(!self.mod_region.is_empty()).then(|| GenomicRegion::from_str(&self.mod_region)).transpose()?"
561 ))]
562 pub mod_region: Option<GenomicRegion>,
563 /// Only keep modification data from this region.
564 /// We do not expose this to the user, but infer it from
565 /// the other options set by the user.
566 /// We cannot populate this directly at the time of CLI parsing,
567 /// as we need to look at the BAM header to convert a contig name
568 /// into a numeric contig id.
569 #[clap(skip)]
570 #[builder(setter(strip_option))]
571 pub region_bed3: Option<Bed3<i32, u64>>,
572}
573
574impl<S: TagState + Args + FromArgMatches + Default> InputModsBuilder<S> {
575 /// Validate [`InputMods`] creation through the Builder method.
576 ///
577 /// [`InputMods`] can be created in many ways:
578 /// - directly populating the fields as they are public.
579 /// - through the CLI-machinery
580 /// - through the builder.
581 ///
582 /// While we cannot check a user's usage through the first method,
583 /// we can check constructions through methods 2 and 3 above.
584 /// The CLI construction checking is handled by `clap`, and the builder
585 /// checking is done through this function.
586 /// The construction checks are slightly different as what may be feasible
587 /// through a CLI and through code are different.
588 /// So we have this validate method to check construction through
589 /// method number 3 above.
590 fn validate(&self) -> Result<(), Error> {
591 // self.mod_region is a `String` for the Builder before conversion into an `Option<_>`
592 if (!self.mod_region.is_empty()) && self.region_bed3.is_some() {
593 Err(Error::BuilderValidation(
594 "cannot set `mod_region` and `region_bed3` simultaneously!".to_owned(),
595 ))
596 } else {
597 Ok(())
598 }
599 }
600}
601
602/// Implements defaults for `InputMods`
603impl<S: TagState + Args + FromArgMatches + Default> Default for InputMods<S> {
604 fn default() -> Self {
605 InputMods::<S> {
606 tag: S::default(),
607 mod_strand: None,
608 mod_prob_filter: ThresholdState::GtEq(0),
609 trim_read_ends_mod: 0,
610 base_qual_filter_mod: 0,
611 mod_region: None,
612 region_bed3: None,
613 }
614 }
615}
616
617/// Retrieves options for modification input
618pub trait InputModOptions {
619 /// retrieves tag
620 fn tag(&self) -> Option<ModChar>;
621 /// retrieves option to set basecalled strand or opposite in mod retrieval
622 fn mod_strand(&self) -> Option<RestrictModCalledStrand>;
623 /// returns probability filter
624 fn mod_prob_filter(&self) -> ThresholdState;
625 /// returns read end trimming
626 fn trim_read_ends_mod(&self) -> usize;
627 /// returns threshold for filtering base PHRED quality
628 fn base_qual_filter_mod(&self) -> u8;
629}
630
631/// Retrieves options for region
632pub trait InputRegionOptions {
633 /// returns region requested
634 fn region_filter(&self) -> &Option<Bed3<i32, u64>>;
635 /// returns region requested but region in genomic string format
636 fn region_filter_genomic_string(&self) -> Option<GenomicRegion>;
637 /// sets region requested
638 fn set_region_filter(&mut self, _value: Option<Bed3<i32, u64>>);
639 /// returns true if full overlap with region is requested as opposed to
640 /// only partial overlap. defaults to false.
641 fn is_full_overlap(&self) -> bool {
642 false
643 }
644 /// converts region from genomic string representation to bed3 representation
645 ///
646 /// # Errors
647 /// Returns error if conversion fails
648 fn convert_region_to_bed3(&mut self, header: bam::HeaderView) -> Result<(), Error> {
649 match (
650 self.region_filter_genomic_string().is_some(),
651 self.region_filter().is_some(),
652 ) {
653 (false, false) => self.set_region_filter(None),
654 (true, false) => {
655 let genomic_region = self
656 .region_filter_genomic_string()
657 .expect("checked above that this is Some");
658 self.set_region_filter(Some(genomic_region.try_to_bed3(&header)?));
659 }
660 (false, true) => {}
661 (true, true) => {
662 return Err(Error::InvalidState(
663 "cannot set a region as both a `GenomicRegion` and a `Bed3`".to_owned(),
664 ));
665 }
666 }
667 Ok(())
668 }
669}
670
671impl<S: TagState + Args + FromArgMatches + Default> InputModOptions for InputMods<S> {
672 fn tag(&self) -> Option<ModChar> {
673 self.tag.tag()
674 }
675 fn mod_strand(&self) -> Option<RestrictModCalledStrand> {
676 self.mod_strand
677 }
678 fn mod_prob_filter(&self) -> ThresholdState {
679 self.mod_prob_filter
680 }
681 fn trim_read_ends_mod(&self) -> usize {
682 self.trim_read_ends_mod
683 }
684 fn base_qual_filter_mod(&self) -> u8 {
685 self.base_qual_filter_mod
686 }
687}
688
689impl<S: TagState + Args + FromArgMatches + Default> InputRegionOptions for InputMods<S> {
690 fn region_filter_genomic_string(&self) -> Option<GenomicRegion> {
691 self.mod_region.clone()
692 }
693 fn region_filter(&self) -> &Option<Bed3<i32, u64>> {
694 &self.region_bed3
695 }
696 fn set_region_filter(&mut self, value: Option<Bed3<i32, u64>>) {
697 self.region_bed3 = value;
698 }
699}
700
701/// can return tag without encasing in an option in the `RequiredTag` variant
702impl InputMods<RequiredTag> {
703 /// retrieves tag
704 #[must_use]
705 pub fn required_tag(&self) -> ModChar {
706 self.tag.tag
707 }
708}
709
710impl InputRegionOptions for InputBam {
711 fn region_filter_genomic_string(&self) -> Option<GenomicRegion> {
712 self.region.clone()
713 }
714 fn region_filter(&self) -> &Option<Bed3<i32, u64>> {
715 &self.region_bed3
716 }
717 fn set_region_filter(&mut self, value: Option<Bed3<i32, u64>>) {
718 self.region_bed3 = value;
719 }
720 fn is_full_overlap(&self) -> bool {
721 self.full_region
722 }
723}
724
725/// This struct contains the options input to our
726/// modification-data-windowing functions
727///
728/// # Example
729/// ```
730/// use nanalogue_core::{Error, InputWindowingBuilder};
731///
732/// let window = InputWindowingBuilder::default()
733/// .win(20)
734/// .step(10).build()?;
735/// # Ok::<(), Error>(())
736/// ```
737#[derive(Builder, Debug, Args, Clone, Copy, Serialize, Deserialize)]
738#[builder(default, build_fn(error = "Error"))]
739#[serde(default)]
740#[non_exhaustive]
741pub struct InputWindowing {
742 /// size of window in units of base being queried i.e.
743 /// if you are looking for cytosine modifications, then
744 /// a window of a value 300 means create windows each with
745 /// 300 cytosines irrespective of their modification status.
746 #[clap(long)]
747 #[builder(field(ty = "usize", build = "NonZeroUsize::try_from(self.win)?"))]
748 pub win: NonZeroUsize,
749 /// step window by this size in units of base being queried.
750 #[clap(long)]
751 #[builder(field(ty = "usize", build = "NonZeroUsize::try_from(self.step)?"))]
752 pub step: NonZeroUsize,
753}
754
755/// Implements a default for `InputWindowing`.
756/// NOTE the defaults of 1 for each are just for ease of programming.
757/// We do not expose these defaults to the command-line user.
758impl Default for InputWindowing {
759 fn default() -> Self {
760 InputWindowing {
761 win: NonZeroUsize::new(1).expect("no error"),
762 step: NonZeroUsize::new(1).expect("no error"),
763 }
764 }
765}
766
767/// This enum contains inputs to display sequences with various options
768#[derive(Debug, Clone, Default, Copy, Serialize, Deserialize)]
769#[non_exhaustive]
770pub enum SeqDisplayOptions {
771 /// Do not display any sequence, default
772 #[default]
773 No,
774 /// Display the entire sequence without any subsetting
775 Full {
776 /// Option to display basecalling qualities
777 show_base_qual: bool,
778 },
779 /// Display the sequence within a region
780 Region {
781 /// Option to display basecalling qualities
782 show_base_qual: bool,
783 /// Option to show bases in lowercase if they are insertions
784 show_ins_lowercase: bool,
785 /// Option to show modified bases as Z (or z depending on other options)
786 show_mod_z: bool,
787 /// Region over which sequence must be shown
788 region: Bed3<i32, u64>,
789 },
790}
791
792#[cfg(test)]
793mod tag_struct_tests {
794 use super::*;
795
796 #[test]
797 fn optional_tag_default() {
798 let optional_tag = OptionalTag::default();
799 assert!(optional_tag.tag.is_none());
800 assert_eq!(optional_tag.tag(), None);
801 }
802
803 #[test]
804 fn optional_tag_with_value() {
805 let mod_char = ModChar::new('m');
806 let optional_tag = OptionalTag {
807 tag: Some(mod_char),
808 };
809 assert_eq!(optional_tag.tag(), Some(mod_char));
810 }
811
812 #[test]
813 fn required_tag_default() {
814 let required_tag = RequiredTag::default();
815 assert_eq!(required_tag.tag(), Some(ModChar::default()));
816 }
817
818 #[test]
819 fn required_tag_with_value() {
820 let mod_char = ModChar::new('C');
821 let required_tag = RequiredTag { tag: mod_char };
822 assert_eq!(required_tag.tag(), Some(mod_char));
823 }
824
825 #[test]
826 fn optional_tag_from_str() {
827 let tag = OptionalTag::from_str("m").unwrap();
828 assert_eq!(tag.tag.unwrap().val(), 'm');
829 }
830
831 #[test]
832 fn required_tag_from_str() {
833 let tag = RequiredTag::from_str("m").unwrap();
834 assert_eq!(tag.tag.val(), 'm');
835 }
836}
837
838#[cfg(test)]
839mod input_windowing_tests {
840 use super::*;
841
842 #[test]
843 fn input_windowing_default() {
844 let windowing = InputWindowing::default();
845 assert_eq!(windowing.win, NonZeroUsize::new(1).unwrap());
846 assert_eq!(windowing.step, NonZeroUsize::new(1).unwrap());
847 }
848
849 #[test]
850 fn input_windowing_custom_values() {
851 let windowing = InputWindowing {
852 win: NonZeroUsize::new(300).unwrap(),
853 step: NonZeroUsize::new(150).unwrap(),
854 };
855 assert_eq!(windowing.win, NonZeroUsize::new(300).unwrap());
856 assert_eq!(windowing.step, NonZeroUsize::new(150).unwrap());
857 }
858
859 #[test]
860 fn input_windowing_builder() {
861 let _: InputWindowing = InputWindowingBuilder::default()
862 .win(20)
863 .step(10)
864 .build()
865 .unwrap();
866 }
867}
868
869#[cfg(test)]
870mod input_mods_required_tag_tests {
871 use super::*;
872
873 #[test]
874 fn input_mods_required_tag_fn_tag() {
875 let mod_char = ModChar::new('C');
876 let input_mods = InputMods::<RequiredTag> {
877 tag: RequiredTag { tag: mod_char },
878 mod_strand: None,
879 mod_prob_filter: ThresholdState::GtEq(0),
880 trim_read_ends_mod: 0,
881 base_qual_filter_mod: 0,
882 mod_region: None,
883 region_bed3: None,
884 };
885
886 assert_eq!(input_mods.required_tag(), mod_char);
887 }
888
889 #[test]
890 fn input_mods_builder_required_tag_with_mod_region() {
891 let _: InputMods<RequiredTag> = InputModsBuilder::default()
892 .tag(RequiredTag::from_str("m").unwrap())
893 .mod_strand("bc".into())
894 .mod_prob_filter(ThresholdState::GtEq(0))
895 .trim_read_ends_mod(10)
896 .base_qual_filter_mod(10)
897 .mod_region("chr3:4000-5000".into())
898 .build()
899 .unwrap();
900 }
901
902 #[test]
903 fn input_mods_builder_required_tag_with_region_bed3() {
904 let _: InputMods<RequiredTag> = InputModsBuilder::<RequiredTag>::default()
905 .tag(RequiredTag::from_str("m").unwrap())
906 .mod_strand("bc".into())
907 .mod_prob_filter(ThresholdState::GtEq(0))
908 .trim_read_ends_mod(10)
909 .base_qual_filter_mod(10)
910 .region_bed3(Bed3::<i32, u64>::new(4, 4000, 5000))
911 .build()
912 .unwrap();
913 }
914
915 #[test]
916 fn input_mods_builder_optional_tag_minimal() {
917 let _: InputMods<OptionalTag> = InputModsBuilder::<OptionalTag>::default()
918 .mod_prob_filter(ThresholdState::GtEq(0))
919 .build()
920 .unwrap();
921 }
922}
923
924#[cfg(test)]
925mod input_bam_tests {
926 use super::*;
927 use bedrs::Coordinates as _;
928
929 #[test]
930 fn input_bam_is_full_overlap() {
931 // Test default (false)
932 let input_bam_default = InputBam::default();
933 assert!(!input_bam_default.is_full_overlap());
934
935 // Test explicit false
936 let input_bam_false = InputBam {
937 full_region: false,
938 ..Default::default()
939 };
940 assert!(!input_bam_false.is_full_overlap());
941
942 // Test true
943 let input_bam_true = InputBam {
944 full_region: true,
945 ..Default::default()
946 };
947 assert!(input_bam_true.is_full_overlap());
948 }
949
950 #[test]
951 fn input_bam_builder() {
952 let _: InputBam = InputBamBuilder::default()
953 .bam_path(PathOrURLOrStdin::Path("/some/path/to/bam.bam".into()))
954 .min_seq_len(30000u64)
955 .min_align_len(20000i64)
956 .read_id("some-id")
957 .read_filter("primary_forward,secondary_forward".into())
958 .sample_fraction(F32Bw0and1::one())
959 .mapq_filter(20)
960 .exclude_mapq_unavail(true)
961 .region("chr4:1000-2000".into())
962 .full_region(true)
963 .build()
964 .unwrap();
965 }
966
967 #[test]
968 fn input_bam_deserialize_rejects_disallowed_read_filter() {
969 let bad: Result<InputBam, _> =
970 serde_json::from_str(r#"{"read_filter":[1, 700], "include_zero_len": true}"#);
971 let _: serde_json::Error = bad.unwrap_err();
972 }
973
974 #[test]
975 fn input_bam_convert_region_to_bed3_none() {
976 let mut input_bam = InputBam::default();
977
978 let header_view = bam::HeaderView::from_bytes(
979 concat!(
980 "@HD\tVN:1.6\tSO:coordinate\n",
981 "@SQ\tSN:chr1\tLN:248956422\n"
982 )
983 .as_bytes(),
984 );
985
986 input_bam.convert_region_to_bed3(header_view).unwrap();
987 assert!(input_bam.region_bed3.is_none());
988 }
989
990 #[test]
991 fn input_bam_convert_region_to_bed3_with_region() {
992 let mut input_bam = InputBam {
993 region: Some(GenomicRegion::from_str("chr2:3400-3600").unwrap()),
994 ..Default::default()
995 };
996
997 let header_view = bam::HeaderView::from_bytes(
998 concat!(
999 "@HD\tVN:1.6\tSO:coordinate\n",
1000 "@SQ\tSN:chr1\tLN:3000\n",
1001 "@SQ\tSN:chr2\tLN:4000\n"
1002 )
1003 .as_bytes(),
1004 );
1005
1006 input_bam.convert_region_to_bed3(header_view).unwrap();
1007 assert!(input_bam.region_bed3.is_some());
1008
1009 let bed3 = input_bam.region_bed3.unwrap();
1010 assert_eq!(bed3.chr(), &1);
1011 assert_eq!(bed3.start(), 3400);
1012 assert_eq!(bed3.end(), 3600);
1013 }
1014
1015 #[test]
1016 #[should_panic(expected = "InvalidRegion")]
1017 fn input_bam_convert_region_to_bed3_invalid_region() {
1018 let mut input_bam = InputBam {
1019 region: Some(GenomicRegion::from_str("chr2:4400-4600").expect("no error")),
1020 ..Default::default()
1021 };
1022 let header_view = bam::HeaderView::from_bytes(
1023 concat!(
1024 "@HD\tVN:1.6\tSO:coordinate\n",
1025 "@SQ\tSN:chr1\tLN:3000\n",
1026 "@SQ\tSN:chr2\tLN:4000\n"
1027 )
1028 .as_bytes(),
1029 );
1030
1031 input_bam.convert_region_to_bed3(header_view).unwrap();
1032 }
1033
1034 #[test]
1035 #[should_panic(expected = "InvalidRegion")]
1036 fn input_bam_convert_region_to_bed3_invalid_open_ended_region() {
1037 let mut input_bam = InputBam {
1038 region: Some(GenomicRegion::from_str("chr2:4600-").expect("no error")),
1039 ..Default::default()
1040 };
1041 let header_view = bam::HeaderView::from_bytes(
1042 concat!(
1043 "@HD\tVN:1.6\tSO:coordinate\n",
1044 "@SQ\tSN:chr1\tLN:3000\n",
1045 "@SQ\tSN:chr2\tLN:4000\n"
1046 )
1047 .as_bytes(),
1048 );
1049
1050 input_bam.convert_region_to_bed3(header_view).unwrap();
1051 }
1052
1053 #[test]
1054 #[should_panic(expected = "InvalidAlignCoords")]
1055 fn input_bam_convert_region_to_bed3_invalid_contig() {
1056 let mut input_bam = InputBam {
1057 region: Some(GenomicRegion::from_str("chr3:1000-2000").expect("no error")),
1058 ..Default::default()
1059 };
1060 let header_view = bam::HeaderView::from_bytes(
1061 concat!(
1062 "@HD\tVN:1.6\tSO:coordinate\n",
1063 "@SQ\tSN:chr1\tLN:3000\n",
1064 "@SQ\tSN:chr2\tLN:4000\n"
1065 )
1066 .as_bytes(),
1067 );
1068
1069 input_bam.convert_region_to_bed3(header_view).unwrap();
1070 }
1071
1072 #[test]
1073 fn input_bam_convert_region_to_bed3_already_set_bed3() {
1074 // When region_bed3 is already set and region is None, should not convert
1075 let mut input_bam = InputBam {
1076 region: None,
1077 region_bed3: Some(Bed3::<i32, u64>::new(1, 1000, 2000)),
1078 ..Default::default()
1079 };
1080
1081 let header_view = bam::HeaderView::from_bytes(
1082 concat!(
1083 "@HD\tVN:1.6\tSO:coordinate\n",
1084 "@SQ\tSN:chr1\tLN:3000\n",
1085 "@SQ\tSN:chr2\tLN:4000\n"
1086 )
1087 .as_bytes(),
1088 );
1089
1090 input_bam.convert_region_to_bed3(header_view).unwrap();
1091
1092 // Bed3 should remain unchanged
1093 assert!(input_bam.region_bed3.is_some());
1094 let bed3 = input_bam.region_bed3.unwrap();
1095 assert_eq!(bed3.chr(), &1);
1096 assert_eq!(bed3.start(), 1000);
1097 assert_eq!(bed3.end(), 2000);
1098 }
1099
1100 #[test]
1101 #[should_panic(expected = "InvalidState")]
1102 fn input_bam_convert_region_to_bed3_both_set() {
1103 // When both region and region_bed3 are set, should error
1104 let mut input_bam = InputBam {
1105 region: Some(GenomicRegion::from_str("chr2:1500-2500").unwrap()),
1106 region_bed3: Some(Bed3::<i32, u64>::new(1, 1000, 2000)),
1107 ..Default::default()
1108 };
1109
1110 let header_view = bam::HeaderView::from_bytes(
1111 concat!(
1112 "@HD\tVN:1.6\tSO:coordinate\n",
1113 "@SQ\tSN:chr1\tLN:3000\n",
1114 "@SQ\tSN:chr2\tLN:4000\n"
1115 )
1116 .as_bytes(),
1117 );
1118
1119 input_bam.convert_region_to_bed3(header_view).unwrap();
1120 }
1121}
1122
1123#[cfg(test)]
1124mod input_region_options_tests {
1125 use super::*;
1126
1127 #[derive(Debug, Default)]
1128 struct TestInputRegionOptions {
1129 region_filter: Option<Bed3<i32, u64>>,
1130 region_filter_genomic_string: Option<GenomicRegion>,
1131 }
1132
1133 impl InputRegionOptions for TestInputRegionOptions {
1134 fn region_filter(&self) -> &Option<Bed3<i32, u64>> {
1135 &self.region_filter
1136 }
1137
1138 fn region_filter_genomic_string(&self) -> Option<GenomicRegion> {
1139 self.region_filter_genomic_string.clone()
1140 }
1141
1142 fn set_region_filter(&mut self, value: Option<Bed3<i32, u64>>) {
1143 self.region_filter = value;
1144 }
1145 }
1146
1147 #[test]
1148 fn input_region_options_is_full_overlap_false() {
1149 let test_obj = TestInputRegionOptions::default();
1150 assert!(!test_obj.is_full_overlap());
1151 }
1152}
1153
1154#[cfg(test)]
1155mod validate_builder_functions {
1156 use super::*;
1157
1158 // Tests for InputBamBuilder::validate conflicts
1159
1160 #[test]
1161 #[should_panic(expected = "BuilderValidation")]
1162 fn input_bam_builder_full_region_without_region_fails() {
1163 let _: InputBam = InputBamBuilder::default()
1164 .bam_path(PathOrURLOrStdin::Path("/some/path.bam".into()))
1165 .full_region(true)
1166 .build()
1167 .unwrap();
1168 }
1169
1170 #[test]
1171 #[should_panic(expected = "BuilderValidation")]
1172 fn input_bam_builder_both_region_and_region_bed3_fails() {
1173 let _: InputBam = InputBamBuilder::default()
1174 .bam_path(PathOrURLOrStdin::Path("/some/path.bam".into()))
1175 .region("chr1:1000-2000".into())
1176 .region_bed3(Bed3::<i32, u64>::new(0, 1000, 2000))
1177 .build()
1178 .unwrap();
1179 }
1180
1181 #[test]
1182 #[should_panic(expected = "BuilderValidation")]
1183 fn input_bam_builder_read_id_and_read_id_list_fails() {
1184 let _: InputBam = InputBamBuilder::default()
1185 .bam_path(PathOrURLOrStdin::Path("/some/path.bam".into()))
1186 .read_id("some-id")
1187 .read_id_list("/some/file.txt")
1188 .build()
1189 .unwrap();
1190 }
1191
1192 #[test]
1193 #[should_panic(expected = "BuilderValidation")]
1194 fn input_bam_builder_read_id_and_read_id_set_fails() {
1195 let mut read_ids = HashSet::new();
1196 let _: bool = read_ids.insert("read1".to_owned());
1197 let _: InputBam = InputBamBuilder::default()
1198 .bam_path(PathOrURLOrStdin::Path("/some/path.bam".into()))
1199 .read_id("some-id")
1200 .read_id_set(read_ids)
1201 .build()
1202 .unwrap();
1203 }
1204
1205 #[test]
1206 #[should_panic(expected = "BuilderValidation")]
1207 fn input_bam_builder_read_id_list_and_read_id_set_fails() {
1208 let mut read_ids = HashSet::new();
1209 let _: bool = read_ids.insert("read1".to_owned());
1210 let _: InputBam = InputBamBuilder::default()
1211 .bam_path(PathOrURLOrStdin::Path("/some/path.bam".into()))
1212 .read_id_list("/some/file.txt")
1213 .read_id_set(read_ids)
1214 .build()
1215 .unwrap();
1216 }
1217
1218 #[test]
1219 #[should_panic(expected = "BuilderValidation")]
1220 fn input_bam_builder_all_three_read_id_options_fails() {
1221 let mut read_ids = HashSet::new();
1222 let _: bool = read_ids.insert("read1".to_owned());
1223 let _: InputBam = InputBamBuilder::default()
1224 .bam_path(PathOrURLOrStdin::Path("/some/path.bam".into()))
1225 .read_id("some-id")
1226 .read_id_list("/some/file.txt")
1227 .read_id_set(read_ids)
1228 .build()
1229 .unwrap();
1230 }
1231
1232 // Tests for InputModsBuilder::validate conflicts
1233
1234 #[test]
1235 #[should_panic(expected = "BuilderValidation")]
1236 fn input_mods_builder_both_mod_region_and_region_bed3_fails() {
1237 let _: InputMods<OptionalTag> = InputModsBuilder::<OptionalTag>::default()
1238 .mod_prob_filter(ThresholdState::GtEq(0))
1239 .mod_region("chr1:1000-2000".into())
1240 .region_bed3(Bed3::<i32, u64>::new(0, 1000, 2000))
1241 .build()
1242 .unwrap();
1243 }
1244
1245 #[test]
1246 #[should_panic(expected = "BuilderValidation")]
1247 fn input_mods_builder_required_tag_both_regions_fails() {
1248 let _: InputMods<RequiredTag> = InputModsBuilder::<RequiredTag>::default()
1249 .tag(RequiredTag::from_str("m").unwrap())
1250 .mod_prob_filter(ThresholdState::GtEq(0))
1251 .mod_region("chr1:1000-2000".into())
1252 .region_bed3(Bed3::<i32, u64>::new(0, 1000, 2000))
1253 .build()
1254 .unwrap();
1255 }
1256}