nanalogue_core/cli.rs
1//! # Command line interface (CLI) options, including input processing options
2//!
3//! This file provides some global options in the command line interface.
4use crate::{
5 Error, F32Bw0and1, GenomicRegion, ModChar, PathOrURLOrStdin, ReadStates,
6 RestrictModCalledStrand, ThresholdState,
7};
8use bedrs::prelude::Bed3;
9use clap::{Args, FromArgMatches};
10use derive_builder::Builder;
11use rust_htslib::bam;
12use serde::{Deserialize, Serialize};
13use std::collections::HashSet;
14use std::num::{NonZeroU32, NonZeroUsize};
15use std::str::FromStr;
16
17/// Options to parse the input bam file and the filters that should be applied to the bam file.
18///
19/// This struct is parsed to create command line arguments and then passed to many functions.
20/// We have copied and edited a similar struct from the fibertools-rs repository.
21/// You can build this through [`InputBamBuilder`].
22/// In CLI mode, `clap` populates this struct.
23/// This and the [`InputMods`] struct are used to set almost all input options
24/// to many of our functions that process BAM/modBAM files.
25///
26/// # Examples
27///
28/// We first begin with an example to build the struct.
29/// The next example shows how this struct and [`InputMods`] can be used
30/// to construct inputs to one of our BAM processing functions.
31///
32/// Sample way to build the struct. Some of the parameters are optional
33/// and can be left unset which would give them default values.
34/// We do not check if the specified bam path or URL exists as there are
35/// use cases where files are generated before the `InputBam` object is used.
36/// Not all options are listed here; for a full list, please see all the
37/// methods of the builder.
38///
39/// ```
40/// use nanalogue_core::{Error, F32Bw0and1, InputBamBuilder, PathOrURLOrStdin};
41///
42/// let bam = InputBamBuilder::default()
43/// .bam_path(PathOrURLOrStdin::Path("/some/path/to/bam.bam".into()))
44/// .min_seq_len(30000u64)
45/// .min_align_len(20000i64)
46/// .read_id("some-id")
47/// .read_filter("primary_forward,secondary_forward".into())
48/// .sample_fraction(F32Bw0and1::new(1.0).expect("no error"))
49/// .sample_seed(42u64)
50/// .mapq_filter(20)
51/// .exclude_mapq_unavail(true)
52/// .region("chr4:1000-2000".into())
53/// .full_region(true)
54/// .build()?;
55/// # Ok::<(), Error>(())
56/// ```
57///
58/// This struct and the [`InputMods`] struct allow us to set input options
59/// for BAM/modBAM calculations. An example is shown below where the [`crate::read_info::run`]
60/// command is called to process data from a BAM file with some input options.
61///
62/// ```
63/// use nanalogue_core::{BamRcRecords, BamPreFilt, Error, InputBamBuilder, InputModsBuilder,
64/// OptionalTag, PathOrURLOrStdin, ThresholdState, nanalogue_bam_reader, read_info};
65///
66/// let mut bam = InputBamBuilder::default()
67/// .bam_path(PathOrURLOrStdin::Path("./examples/example_1.bam".into()))
68/// .region("dummyI".into())
69/// .build()?;
70/// let mut mods = InputModsBuilder::<OptionalTag>::default()
71/// .mod_prob_filter(ThresholdState::GtEq(0))
72/// .build()?;
73///
74/// let mut buffer = Vec::new();
75/// let mut reader = nanalogue_bam_reader(&bam.bam_path.to_string())?;
76/// let bam_rc_records = BamRcRecords::new(&mut reader, &mut bam, &mut mods)?;
77/// read_info::run(
78/// &mut buffer,
79/// bam_rc_records.rc_records
80/// .filter(|r| r.as_ref().map_or(true, |v| v.pre_filt(&bam))),
81/// mods,
82/// None,
83/// )?;
84/// assert!(str::from_utf8(buffer.as_slice())?
85/// .contains("5d10eb9a-aae1-4db8-8ec6-7ebb34d32575"));
86/// # Ok::<(), Error>(())
87/// ```
88///
89/// ## Examples resulting in errors
90///
91/// Full region without actually setting a region
92///
93/// ```should_panic
94/// use nanalogue_core::{Error, InputBamBuilder, PathOrURLOrStdin};
95///
96/// let bam = InputBamBuilder::default()
97/// .bam_path(PathOrURLOrStdin::Path("/some/path/to/bam.bam".into()))
98/// .read_id("some-id")
99/// .full_region(true)
100/// .build()?;
101/// # Ok::<(), Error>(())
102/// ```
103///
104/// Setting both `region` and `region_bed3`. `region` can be converted to
105/// `region_bed3` using [`GenomicRegion::try_to_bed3`] and a BAM header.
106///
107/// ```should_panic
108/// use bedrs::prelude::Bed3;
109/// use nanalogue_core::{Error, InputBamBuilder, PathOrURLOrStdin};
110///
111/// let bam = InputBamBuilder::default()
112/// .bam_path(PathOrURLOrStdin::Path("/some/path/to/bam.bam".into()))
113/// .read_id("some-id")
114/// .region("chr4:1000-2000".into())
115/// .region_bed3(Bed3::<i32,u64>::new(3, 1000, 2000))
116/// .build()?;
117/// # Ok::<(), Error>(())
118/// ```
119///
120/// Setting more than one of `read_id`, `read_id_list` and `read_id_set`.
121/// * `read_id` means filter to retain only this read.
122/// * `read_id_list` is a path to a file with a list of read ids.
123/// * `read_id_set` is a set of read ids supplied directly.
124///
125/// ```
126/// use bedrs::prelude::Bed3;
127/// use nanalogue_core::{Error, InputBamBuilder, PathOrURLOrStdin};
128/// use std::collections::HashSet;
129///
130/// let _ = InputBamBuilder::default()
131/// .bam_path(PathOrURLOrStdin::Path("/some/path/to/bam.bam".into()))
132/// .read_id("some-id")
133/// .read_id_list("/some/file.txt")
134/// .build().unwrap_err();
135///
136/// let mut read_id_set = HashSet::<String>::new();
137/// read_id_set.insert("some-read-a".to_owned());
138/// read_id_set.insert("some-read-b".to_owned());
139///
140/// let _ = InputBamBuilder::default()
141/// .bam_path(PathOrURLOrStdin::Path("/some/path/to/bam.bam".into()))
142/// .read_id_list("/some/file.txt")
143/// .read_id_set(read_id_set.clone())
144/// .build().unwrap_err();
145///
146/// let _ = InputBamBuilder::default()
147/// .bam_path(PathOrURLOrStdin::Path("/some/path/to/bam.bam".into()))
148/// .read_id("some-id")
149/// .read_id_set(read_id_set)
150/// .build().unwrap_err();
151/// ```
152#[derive(Builder, Debug, Args, Clone, Serialize, Deserialize)]
153#[serde(default)]
154#[builder(default, build_fn(error = "Error", validate = "Self::validate"))]
155#[non_exhaustive]
156pub struct InputBam {
157 /// Input BAM file. Set to a local file path, or set to - to read from stdin,
158 /// or set to a URL to read from a remote file. If using stdin and piping in
159 /// from `samtools view`, always include the header with the `-h` option.
160 pub bam_path: PathOrURLOrStdin,
161 /// Exclude reads whose sequence length in the BAM file is
162 /// below this value. Defaults to 0.
163 #[clap(long, default_value_t)]
164 #[builder(setter(into))]
165 pub min_seq_len: u64,
166 /// Exclude reads whose alignment length in the BAM file is
167 /// below this value. Defaults to unused.
168 #[clap(long)]
169 #[builder(setter(into, strip_option))]
170 pub min_align_len: Option<i64>,
171 /// Only include this read id, defaults to unused i.e. all reads are used.
172 /// NOTE: if there are multiple alignments corresponding
173 /// to this read id, all of them are used.
174 #[clap(long, conflicts_with = "read_id_list")]
175 #[builder(setter(into, strip_option))]
176 pub read_id: Option<String>,
177 /// Path to file containing list of read IDs (one per line).
178 /// Lines starting with '#' are treated as comments and ignored.
179 /// Cannot be used together with --read-id.
180 #[clap(long, conflicts_with = "read_id")]
181 #[builder(setter(into, strip_option))]
182 pub read_id_list: Option<String>,
183 /// Internal `HashSet` of read IDs loaded from `read_id_list` file.
184 /// This is populated automatically and not exposed to users.
185 #[clap(skip)]
186 #[builder(setter(into, strip_option))]
187 pub read_id_set: Option<HashSet<String>>,
188 /// Number of threads used during some aspects of program execution
189 #[clap(long, default_value_t = NonZeroU32::new(2).expect("no error"))]
190 #[builder(setter(into))]
191 pub threads: NonZeroU32,
192 /// Include "zero-length" sequences e.g. sequences with "*" in the sequence
193 /// field. By default, these sequences are excluded to avoid processing errors.
194 /// If this flag is set, these reads are included irrespective of any
195 /// minimum sequence or align length criteria the user may have set.
196 /// WARNINGS: (1) Some functions of the codebase may break or produce incorrect
197 /// results if you use this flag. (2) due to a technical reason, we need a DNA sequence
198 /// in the sequence field and cannot infer sequence length from other sources
199 /// e.g. CIGAR strings.
200 #[clap(long, default_value_t)]
201 pub include_zero_len: bool,
202 /// Only retain reads of this type. Allowed types are `primary_forward`,
203 /// `primary_reverse`, `secondary_forward`, `secondary_reverse`, `supplementary_forward`,
204 /// `supplementary_reverse` and unmapped. Specify more than one type if needed
205 /// separated by commas, in which case reads of any type in list are retained.
206 /// Defaults to retain reads of all types.
207 #[clap(long)]
208 #[builder(field(
209 ty = "String",
210 build = "(!self.read_filter.is_empty()).then(|| ReadStates::from_str(&self.read_filter)).transpose()?"
211 ))]
212 pub read_filter: Option<ReadStates>,
213 /// Subsample BAM to retain only this fraction of total number of reads,
214 /// defaults to 1.0. The sampling algorithm considers every read according
215 /// to the specified probability, so due to this, you may not always get
216 /// the same number of reads e.g. if you set `-s 0.05` in a file with 1000 reads,
217 /// you will get 50 +- sqrt(50) reads.
218 /// By default, a new subsample is drawn every time as the seed is not fixed.
219 /// Set `--sample-seed` to get reproducible subsampling.
220 #[clap(short, long, default_value_t = F32Bw0and1::one())]
221 pub sample_fraction: F32Bw0and1,
222 /// Seed for reproducible subsampling. When set, the subsampling decision
223 /// for each read is deterministic based on the read name and the seed.
224 /// Different seeds produce different subsets. If not set, subsampling
225 /// is random and non-reproducible (the default behavior).
226 #[clap(long)]
227 #[builder(setter(into, strip_option))]
228 pub sample_seed: Option<u64>,
229 /// Exclude reads whose MAPQ (Mapping quality of position) is below this value.
230 /// Defaults to zero i.e. do not exclude any read.
231 #[clap(long, default_value_t)]
232 #[builder(setter(into))]
233 pub mapq_filter: u8,
234 /// Exclude sequences with MAPQ unavailable.
235 /// In the BAM format, a value of 255 in this column means MAPQ is unavailable.
236 /// These reads are allowed by default, set this flag to exclude.
237 #[clap(long, default_value_t)]
238 pub exclude_mapq_unavail: bool,
239 /// Only keep reads passing through this region. If a BAM index is available
240 /// with a name same as the BAM file but with the .bai suffix, the operation
241 /// of selecting such reads will be faster. If you are using standard input
242 /// as your input e.g. you are piping in the output from samtools, then
243 /// you cannot use an index as a BAM filename is not available.
244 #[clap(long)]
245 #[builder(field(
246 ty = "String",
247 build = "(!self.region.is_empty()).then(|| GenomicRegion::from_str(&self.region)).transpose()?"
248 ))]
249 pub region: Option<GenomicRegion>,
250 /// Only keep read data from this region.
251 /// This is an internal option not exposed to the user, we will set it
252 /// based on the other options that the user sets.
253 #[clap(skip)]
254 #[builder(setter(into, strip_option))]
255 pub region_bed3: Option<Bed3<i32, u64>>,
256 /// Only keep reads if they pass through the specified region in full.
257 /// Related to the input `--region`; has no effect if that is not set.
258 #[clap(long, default_value_t, requires = "region")]
259 pub full_region: bool,
260}
261
262impl InputBamBuilder {
263 /// Validate [`InputBam`] creation through the Builder method.
264 ///
265 /// [`InputBam`] can be created in many ways:
266 /// - directly populating the fields as they are public.
267 /// - through the CLI-machinery
268 /// - through the builder.
269 ///
270 /// While we cannot check a user's usage through the first method,
271 /// we can check constructions through methods 2 and 3 above.
272 /// The CLI construction checking is handled by `clap`, and the builder
273 /// checking is done through this function.
274 /// The construction checks are slightly different as what may be feasible
275 /// through a CLI and through code are different.
276 /// So we have this validate method to check construction through
277 /// method number 3 above.
278 #[expect(
279 clippy::arithmetic_side_effects,
280 reason = "1 + 1 + 1 is not gonna overflow"
281 )]
282 fn validate(&self) -> Result<(), Error> {
283 match (
284 !self.region.is_empty(),
285 self.region_bed3.is_some(),
286 self.full_region,
287 ) {
288 (false, false, Some(true)) => {
289 return Err(Error::BuilderValidation("InputBamBuilder: cannot set `full_region` without setting `region` or `region_bed3`".to_owned()));
290 }
291 (true, true, _) => {
292 return Err(Error::BuilderValidation(
293 "InputBamBuilder: cannot set both `region` and `region_bed3`".to_owned(),
294 ));
295 }
296 _ => {}
297 }
298
299 if u8::from(self.read_id.is_some())
300 + u8::from(self.read_id_list.is_some())
301 + u8::from(self.read_id_set.is_some())
302 > 1
303 {
304 Err(Error::BuilderValidation(
305 "InputBamBuilder: cannot set >1 of `read_id`, `read_id_list` and `read_id_set`"
306 .to_owned(),
307 ))
308 } else {
309 Ok(())
310 }
311 }
312}
313
314/// Implements a default class for `InputBAM`
315impl Default for InputBam {
316 fn default() -> Self {
317 InputBam {
318 bam_path: PathOrURLOrStdin::Stdin,
319 min_seq_len: 0,
320 min_align_len: None,
321 read_id: None,
322 read_id_list: None,
323 read_id_set: None,
324 threads: NonZeroU32::new(2).expect("no error"),
325 include_zero_len: false,
326 read_filter: None,
327 sample_fraction: F32Bw0and1::one(),
328 sample_seed: None,
329 mapq_filter: 0,
330 exclude_mapq_unavail: false,
331 region: None,
332 region_bed3: None,
333 full_region: false,
334 }
335 }
336}
337
338/// This struct contains an optional modification tag
339#[derive(Debug, Default, Args, Clone, Copy, Serialize, Deserialize)]
340#[non_exhaustive]
341pub struct OptionalTag {
342 /// modified tag
343 #[clap(long)]
344 pub tag: Option<ModChar>,
345}
346
347impl FromStr for OptionalTag {
348 type Err = Error;
349
350 /// process the modification type from a string.
351 ///
352 /// ```
353 /// use nanalogue_core::{Error, ModChar, OptionalTag};
354 /// use std::str::FromStr;
355 ///
356 /// let tag = OptionalTag::from_str("m")?;
357 /// assert_eq!(tag.tag.unwrap().val(), 'm');
358 /// # Ok::<(), Error>(())
359 /// ```
360 fn from_str(mod_type: &str) -> Result<Self, Self::Err> {
361 Ok(OptionalTag {
362 tag: Some(ModChar::from_str(mod_type)?),
363 })
364 }
365}
366
367/// This struct contains a required modification tag
368#[derive(Debug, Default, Args, Clone, Copy, Serialize, Deserialize)]
369#[non_exhaustive]
370pub struct RequiredTag {
371 /// modified tag
372 #[clap(long)]
373 pub tag: ModChar,
374}
375
376impl FromStr for RequiredTag {
377 type Err = Error;
378
379 /// process the modification type from a string.
380 ///
381 /// ```
382 /// use nanalogue_core::{Error, ModChar, RequiredTag};
383 /// use std::str::FromStr;
384 ///
385 /// let tag = RequiredTag::from_str("m")?;
386 /// assert_eq!(tag.tag.val(), 'm');
387 /// # Ok::<(), Error>(())
388 /// ```
389 fn from_str(mod_type: &str) -> Result<Self, Self::Err> {
390 Ok(RequiredTag {
391 tag: ModChar::from_str(mod_type)?,
392 })
393 }
394}
395
396/// Trait that returns a modification tag
397pub trait TagState {
398 /// Returns the modification tag of the tag state in an option
399 fn tag(&self) -> Option<ModChar> {
400 unimplemented!();
401 }
402}
403
404impl TagState for OptionalTag {
405 fn tag(&self) -> Option<ModChar> {
406 self.tag
407 }
408}
409
410impl TagState for RequiredTag {
411 fn tag(&self) -> Option<ModChar> {
412 Some(self.tag)
413 }
414}
415
416/// This struct contains the options input to our
417/// modification-data functions with restrictions on data received.
418///
419/// If you want to build this, use [`InputModsBuilder`].
420/// In CLI mode, `clap` populates this struct.
421/// This and the [`InputBam`] struct are used to set almost all input options
422/// to many of our functions that process BAM/modBAM files.
423///
424/// # Examples
425///
426/// Please look at the documentation of [`InputBam`] to see a fully-worked
427/// example to set up inputs to perform a calculation.
428///
429/// Following example uses many fields and is for a [`RequiredTag`] variant.
430/// You can omit some of them depending on your use case. `mod_region` must
431/// can be converted to `region_bed3` using [`GenomicRegion::try_to_bed3`]
432/// with a suitable BAM header before this struct can be used with most of
433/// our functions. We do not force this in the builder route as we may not
434/// want to do this for some reason.
435///
436/// ```
437/// use nanalogue_core::{InputModsBuilder, Error, RequiredTag, ThresholdState};
438/// use std::str::FromStr;
439///
440/// let input_options = InputModsBuilder::default()
441/// .tag(RequiredTag::from_str("m")?)
442/// .mod_strand("bc".into())
443/// .mod_prob_filter(ThresholdState::GtEq(0))
444/// .trim_read_ends_mod(10)
445/// .base_qual_filter_mod(10)
446/// .mod_region("chr3:4000-5000".into())
447/// .build()?;
448///
449/// # Ok::<(), Error>(())
450/// ```
451///
452/// A [`RequiredTag`] variant that sets `region_bed3` instead of `mod_region`.
453/// If `mod_region` is set, then a downstream function must parse a BAM file,
454/// and then convert `mod_region` into `region_bed3` using the header.
455/// If the user is confident that they know the index number of a contig,
456/// then they can directly set `region_bed3`. In the example below, the user
457/// knows `chr3` corresponds to `4`, so they set the region directly.
458///
459/// ```
460/// use bedrs::prelude::Bed3;
461/// use nanalogue_core::{InputModsBuilder, Error, RequiredTag, ThresholdState};
462/// use std::str::FromStr;
463///
464/// let input_options = InputModsBuilder::<RequiredTag>::default()
465/// .tag(RequiredTag::from_str("m")?)
466/// .mod_strand("bc".into())
467/// .mod_prob_filter(ThresholdState::GtEq(0))
468/// .trim_read_ends_mod(10)
469/// .base_qual_filter_mod(10)
470/// .region_bed3(Bed3::<i32, u64>::new(4, 4000, 5000))
471/// .build()?;
472///
473/// # Ok::<(), Error>(())
474/// ```
475///
476/// Setting both `region_bed3` and `mod_region` should result in an error,
477/// even if they both refer to the same region. This is because setting both
478/// can result in an undefined state, so we do not allow this.
479///
480/// ```should_panic
481/// use bedrs::prelude::Bed3;
482/// use nanalogue_core::{InputModsBuilder, Error, RequiredTag, ThresholdState};
483/// use std::str::FromStr;
484///
485/// let input_options = InputModsBuilder::<RequiredTag>::default()
486/// .mod_prob_filter(ThresholdState::GtEq(0))
487/// .region_bed3(Bed3::<i32, u64>::new(4, 4000, 5000))
488/// .mod_region("chr3:4000-5000".into())
489/// .build()?;
490///
491/// # Ok::<(), Error>(())
492/// ```
493///
494/// Following example is for an [`OptionalTag`] variant with the `tag` omitted,
495/// and many other fields omitted as well, which take on a default value.
496///
497/// ```
498/// use nanalogue_core::{InputModsBuilder, Error, OptionalTag, ThresholdState};
499/// use std::str::FromStr;
500///
501/// let input_options = InputModsBuilder::<OptionalTag>::default()
502/// .mod_prob_filter(ThresholdState::GtEq(0))
503/// .build()?;
504///
505/// # Ok::<(), Error>(())
506/// ```
507#[derive(Builder, Debug, Args, Clone, Serialize, Deserialize)]
508#[builder(default, build_fn(error = "Error", validate = "Self::validate"))]
509#[serde(default)]
510#[non_exhaustive]
511pub struct InputMods<S: TagState + Args + FromArgMatches + Default> {
512 /// modified tag
513 #[clap(flatten)]
514 pub tag: S,
515 /// modified strand, set this to `bc` or `bc_comp`, meaning
516 /// on basecalled strand or its complement. Some technologies
517 /// like `PacBio` or `ONT` duplex can call mod data on both a strand
518 /// and its complementary DNA and store it in the record corresponding
519 /// to the strand, so you can use this filter to select only for
520 /// mod data on a strand or its complement. Please note that this
521 /// filter is different from selecting for forward or reverse
522 /// aligned reads using the BAM flags.
523 #[clap(long)]
524 #[builder(field(
525 ty = "String",
526 build = "(!self.mod_strand.is_empty()).then(|| RestrictModCalledStrand::from_str(&self.mod_strand)).transpose()?"
527 ))]
528 pub mod_strand: Option<RestrictModCalledStrand>,
529 /// Filter to reject mods before analysis. Specify as low,high where
530 /// both are fractions to reject modifications where the probabilities (p)
531 /// are in this range e.g. "0.4,0.6" rejects 0.4 <= p <= 0.6.
532 /// You can use this to reject 'weak' modification calls before analysis
533 /// i.e. those with probabilities close to 0.5.
534 /// NOTE: (1) Whether this filtration is applied or not, mods < 0.5
535 /// are considered unmodified and >= 0.5 are considered modified by our program.
536 /// (2) mod probabilities are stored as a number from 0-255 in the modBAM format,
537 /// so we internally convert 0.0-1.0 to 0-255. Default: reject nothing.
538 #[clap(long, value_parser=ThresholdState::from_str_ordpair_fraction, default_value = "")]
539 pub mod_prob_filter: ThresholdState,
540 /// Filter this many bp at the start and
541 /// end of a read before any mod operations.
542 /// Please note that the units here are bp and
543 /// not units of base being queried.
544 #[clap(long, default_value_t)]
545 pub trim_read_ends_mod: usize,
546 /// Exclude bases whose base quality is below
547 /// this threshold before any mod operation, defaults to 0 i.e. unused.
548 /// NOTE: (1) This step is only applied before modification operations, and
549 /// not before any other operations.
550 /// (2) No offsets such as +33 are needed here.
551 /// (3) Modifications on reads where base quality information
552 /// is not available are all rejected if this is non-zero.
553 #[clap(long, default_value_t)]
554 pub base_qual_filter_mod: u8,
555 /// Only keep modification data from this region
556 #[clap(long)]
557 #[builder(field(
558 ty = "String",
559 build = "(!self.mod_region.is_empty()).then(|| GenomicRegion::from_str(&self.mod_region)).transpose()?"
560 ))]
561 pub mod_region: Option<GenomicRegion>,
562 /// Only keep modification data from this region.
563 /// We do not expose this to the user, but infer it from
564 /// the other options set by the user.
565 /// We cannot populate this directly at the time of CLI parsing,
566 /// as we need to look at the BAM header to convert a contig name
567 /// into a numeric contig id.
568 #[clap(skip)]
569 #[builder(setter(strip_option))]
570 pub region_bed3: Option<Bed3<i32, u64>>,
571}
572
573impl<S: TagState + Args + FromArgMatches + Default> InputModsBuilder<S> {
574 /// Validate [`InputMods`] creation through the Builder method.
575 ///
576 /// [`InputMods`] can be created in many ways:
577 /// - directly populating the fields as they are public.
578 /// - through the CLI-machinery
579 /// - through the builder.
580 ///
581 /// While we cannot check a user's usage through the first method,
582 /// we can check constructions through methods 2 and 3 above.
583 /// The CLI construction checking is handled by `clap`, and the builder
584 /// checking is done through this function.
585 /// The construction checks are slightly different as what may be feasible
586 /// through a CLI and through code are different.
587 /// So we have this validate method to check construction through
588 /// method number 3 above.
589 fn validate(&self) -> Result<(), Error> {
590 // self.mod_region is a `String` for the Builder before conversion into an `Option<_>`
591 if (!self.mod_region.is_empty()) && self.region_bed3.is_some() {
592 Err(Error::BuilderValidation(
593 "cannot set `mod_region` and `region_bed3` simultaneously!".to_owned(),
594 ))
595 } else {
596 Ok(())
597 }
598 }
599}
600
601/// Implements defaults for `InputMods`
602impl<S: TagState + Args + FromArgMatches + Default> Default for InputMods<S> {
603 fn default() -> Self {
604 InputMods::<S> {
605 tag: S::default(),
606 mod_strand: None,
607 mod_prob_filter: ThresholdState::GtEq(0),
608 trim_read_ends_mod: 0,
609 base_qual_filter_mod: 0,
610 mod_region: None,
611 region_bed3: None,
612 }
613 }
614}
615
616/// Retrieves options for modification input
617pub trait InputModOptions {
618 /// retrieves tag
619 fn tag(&self) -> Option<ModChar> {
620 unimplemented!()
621 }
622 /// retrieves option to set basecalled strand or opposite in mod retrieval
623 fn mod_strand(&self) -> Option<RestrictModCalledStrand> {
624 unimplemented!()
625 }
626 /// returns probability filter
627 fn mod_prob_filter(&self) -> ThresholdState {
628 unimplemented!()
629 }
630 /// returns read end trimming
631 fn trim_read_ends_mod(&self) -> usize {
632 unimplemented!()
633 }
634 /// returns threshold for filtering base PHRED quality
635 fn base_qual_filter_mod(&self) -> u8 {
636 unimplemented!()
637 }
638}
639
640/// Retrieves options for region
641pub trait InputRegionOptions {
642 /// returns region requested
643 fn region_filter(&self) -> &Option<Bed3<i32, u64>> {
644 unimplemented!()
645 }
646 /// returns region requested but region in genomic string format
647 fn region_filter_genomic_string(&self) -> Option<GenomicRegion> {
648 unimplemented!()
649 }
650 /// sets region requested
651 fn set_region_filter(&mut self, _value: Option<Bed3<i32, u64>>) {
652 unimplemented!()
653 }
654 /// returns true if full overlap with region is requested as opposed to
655 /// only partial overlap. defaults to false.
656 fn is_full_overlap(&self) -> bool {
657 false
658 }
659 /// converts region from genomic string representation to bed3 representation
660 ///
661 /// # Errors
662 /// Returns error if conversion fails
663 fn convert_region_to_bed3(&mut self, header: bam::HeaderView) -> Result<(), Error> {
664 match (
665 self.region_filter_genomic_string().is_some(),
666 self.region_filter().is_some(),
667 ) {
668 (false, false) => self.set_region_filter(None),
669 (true, false) => {
670 let genomic_region = self
671 .region_filter_genomic_string()
672 .expect("checked above that this is Some");
673 self.set_region_filter(Some(genomic_region.try_to_bed3(&header)?));
674 }
675 (false, true) => {}
676 (true, true) => {
677 return Err(Error::InvalidState(
678 "cannot set a region as both a `GenomicRegion` and a `Bed3`".to_owned(),
679 ));
680 }
681 }
682 Ok(())
683 }
684}
685
686impl<S: TagState + Args + FromArgMatches + Default> InputModOptions for InputMods<S> {
687 fn tag(&self) -> Option<ModChar> {
688 self.tag.tag()
689 }
690 fn mod_strand(&self) -> Option<RestrictModCalledStrand> {
691 self.mod_strand
692 }
693 fn mod_prob_filter(&self) -> ThresholdState {
694 self.mod_prob_filter
695 }
696 fn trim_read_ends_mod(&self) -> usize {
697 self.trim_read_ends_mod
698 }
699 fn base_qual_filter_mod(&self) -> u8 {
700 self.base_qual_filter_mod
701 }
702}
703
704impl<S: TagState + Args + FromArgMatches + Default> InputRegionOptions for InputMods<S> {
705 fn region_filter_genomic_string(&self) -> Option<GenomicRegion> {
706 self.mod_region.clone()
707 }
708 fn region_filter(&self) -> &Option<Bed3<i32, u64>> {
709 &self.region_bed3
710 }
711 fn set_region_filter(&mut self, value: Option<Bed3<i32, u64>>) {
712 self.region_bed3 = value;
713 }
714}
715
716/// can return tag without encasing in an option in the `RequiredTag` variant
717impl InputMods<RequiredTag> {
718 /// retrieves tag
719 #[must_use]
720 pub fn required_tag(&self) -> ModChar {
721 self.tag.tag
722 }
723}
724
725impl InputRegionOptions for InputBam {
726 fn region_filter_genomic_string(&self) -> Option<GenomicRegion> {
727 self.region.clone()
728 }
729 fn region_filter(&self) -> &Option<Bed3<i32, u64>> {
730 &self.region_bed3
731 }
732 fn set_region_filter(&mut self, value: Option<Bed3<i32, u64>>) {
733 self.region_bed3 = value;
734 }
735 fn is_full_overlap(&self) -> bool {
736 self.full_region
737 }
738}
739
740/// This struct contains the options input to our
741/// modification-data-windowing functions
742///
743/// # Example
744/// ```
745/// use nanalogue_core::{Error, InputWindowingBuilder};
746///
747/// let window = InputWindowingBuilder::default()
748/// .win(20)
749/// .step(10).build()?;
750/// # Ok::<(), Error>(())
751/// ```
752#[derive(Builder, Debug, Args, Clone, Copy, Serialize, Deserialize)]
753#[builder(default, build_fn(error = "Error"))]
754#[serde(default)]
755#[non_exhaustive]
756pub struct InputWindowing {
757 /// size of window in units of base being queried i.e.
758 /// if you are looking for cytosine modifications, then
759 /// a window of a value 300 means create windows each with
760 /// 300 cytosines irrespective of their modification status.
761 #[clap(long)]
762 #[builder(field(ty = "usize", build = "NonZeroUsize::try_from(self.win)?"))]
763 pub win: NonZeroUsize,
764 /// step window by this size in units of base being queried.
765 #[clap(long)]
766 #[builder(field(ty = "usize", build = "NonZeroUsize::try_from(self.step)?"))]
767 pub step: NonZeroUsize,
768}
769
770/// Implements a default for `InputWindowing`.
771/// NOTE the defaults of 1 for each are just for ease of programming.
772/// We do not expose these defaults to the command-line user.
773impl Default for InputWindowing {
774 fn default() -> Self {
775 InputWindowing {
776 win: NonZeroUsize::new(1).expect("no error"),
777 step: NonZeroUsize::new(1).expect("no error"),
778 }
779 }
780}
781
782/// This enum contains inputs to display sequences with various options
783#[derive(Debug, Clone, Default, Copy, Serialize, Deserialize)]
784#[non_exhaustive]
785pub enum SeqDisplayOptions {
786 /// Do not display any sequence, default
787 #[default]
788 No,
789 /// Display the entire sequence without any subsetting
790 Full {
791 /// Option to display basecalling qualities
792 show_base_qual: bool,
793 },
794 /// Display the sequence within a region
795 Region {
796 /// Option to display basecalling qualities
797 show_base_qual: bool,
798 /// Option to show bases in lowercase if they are insertions
799 show_ins_lowercase: bool,
800 /// Option to show modified bases as Z (or z depending on other options)
801 show_mod_z: bool,
802 /// Region over which sequence must be shown
803 region: Bed3<i32, u64>,
804 },
805}
806
807#[cfg(test)]
808mod tag_struct_tests {
809 use super::*;
810
811 #[test]
812 fn optional_tag_default() {
813 let optional_tag = OptionalTag::default();
814 assert!(optional_tag.tag.is_none());
815 assert_eq!(optional_tag.tag(), None);
816 }
817
818 #[test]
819 fn optional_tag_with_value() {
820 let mod_char = ModChar::new('m');
821 let optional_tag = OptionalTag {
822 tag: Some(mod_char),
823 };
824 assert_eq!(optional_tag.tag(), Some(mod_char));
825 }
826
827 #[test]
828 fn required_tag_default() {
829 let required_tag = RequiredTag::default();
830 assert_eq!(required_tag.tag(), Some(ModChar::default()));
831 }
832
833 #[test]
834 fn required_tag_with_value() {
835 let mod_char = ModChar::new('C');
836 let required_tag = RequiredTag { tag: mod_char };
837 assert_eq!(required_tag.tag(), Some(mod_char));
838 }
839
840 #[test]
841 fn optional_tag_from_str() {
842 let tag = OptionalTag::from_str("m").unwrap();
843 assert_eq!(tag.tag.unwrap().val(), 'm');
844 }
845
846 #[test]
847 fn required_tag_from_str() {
848 let tag = RequiredTag::from_str("m").unwrap();
849 assert_eq!(tag.tag.val(), 'm');
850 }
851}
852
853#[cfg(test)]
854mod input_windowing_tests {
855 use super::*;
856
857 #[test]
858 fn input_windowing_default() {
859 let windowing = InputWindowing::default();
860 assert_eq!(windowing.win, NonZeroUsize::new(1).unwrap());
861 assert_eq!(windowing.step, NonZeroUsize::new(1).unwrap());
862 }
863
864 #[test]
865 fn input_windowing_custom_values() {
866 let windowing = InputWindowing {
867 win: NonZeroUsize::new(300).unwrap(),
868 step: NonZeroUsize::new(150).unwrap(),
869 };
870 assert_eq!(windowing.win, NonZeroUsize::new(300).unwrap());
871 assert_eq!(windowing.step, NonZeroUsize::new(150).unwrap());
872 }
873
874 #[test]
875 fn input_windowing_builder() {
876 let _: InputWindowing = InputWindowingBuilder::default()
877 .win(20)
878 .step(10)
879 .build()
880 .unwrap();
881 }
882}
883
884#[cfg(test)]
885mod input_mods_required_tag_tests {
886 use super::*;
887
888 #[test]
889 fn input_mods_required_tag_fn_tag() {
890 let mod_char = ModChar::new('C');
891 let input_mods = InputMods::<RequiredTag> {
892 tag: RequiredTag { tag: mod_char },
893 mod_strand: None,
894 mod_prob_filter: ThresholdState::GtEq(0),
895 trim_read_ends_mod: 0,
896 base_qual_filter_mod: 0,
897 mod_region: None,
898 region_bed3: None,
899 };
900
901 assert_eq!(input_mods.required_tag(), mod_char);
902 }
903
904 #[test]
905 fn input_mods_builder_required_tag_with_mod_region() {
906 let _: InputMods<RequiredTag> = InputModsBuilder::default()
907 .tag(RequiredTag::from_str("m").unwrap())
908 .mod_strand("bc".into())
909 .mod_prob_filter(ThresholdState::GtEq(0))
910 .trim_read_ends_mod(10)
911 .base_qual_filter_mod(10)
912 .mod_region("chr3:4000-5000".into())
913 .build()
914 .unwrap();
915 }
916
917 #[test]
918 fn input_mods_builder_required_tag_with_region_bed3() {
919 let _: InputMods<RequiredTag> = InputModsBuilder::<RequiredTag>::default()
920 .tag(RequiredTag::from_str("m").unwrap())
921 .mod_strand("bc".into())
922 .mod_prob_filter(ThresholdState::GtEq(0))
923 .trim_read_ends_mod(10)
924 .base_qual_filter_mod(10)
925 .region_bed3(Bed3::<i32, u64>::new(4, 4000, 5000))
926 .build()
927 .unwrap();
928 }
929
930 #[test]
931 fn input_mods_builder_optional_tag_minimal() {
932 let _: InputMods<OptionalTag> = InputModsBuilder::<OptionalTag>::default()
933 .mod_prob_filter(ThresholdState::GtEq(0))
934 .build()
935 .unwrap();
936 }
937}
938
939#[cfg(test)]
940mod input_bam_tests {
941 use super::*;
942 use bedrs::Coordinates as _;
943 use indoc::indoc;
944
945 #[test]
946 fn input_bam_is_full_overlap() {
947 // Test default (false)
948 let input_bam_default = InputBam::default();
949 assert!(!input_bam_default.is_full_overlap());
950
951 // Test explicit false
952 let input_bam_false = InputBam {
953 full_region: false,
954 ..Default::default()
955 };
956 assert!(!input_bam_false.is_full_overlap());
957
958 // Test true
959 let input_bam_true = InputBam {
960 full_region: true,
961 ..Default::default()
962 };
963 assert!(input_bam_true.is_full_overlap());
964 }
965
966 #[test]
967 fn input_bam_builder() {
968 let _: InputBam = InputBamBuilder::default()
969 .bam_path(PathOrURLOrStdin::Path("/some/path/to/bam.bam".into()))
970 .min_seq_len(30000u64)
971 .min_align_len(20000i64)
972 .read_id("some-id")
973 .read_filter("primary_forward,secondary_forward".into())
974 .sample_fraction(F32Bw0and1::one())
975 .mapq_filter(20)
976 .exclude_mapq_unavail(true)
977 .region("chr4:1000-2000".into())
978 .full_region(true)
979 .build()
980 .unwrap();
981 }
982
983 #[test]
984 fn input_bam_convert_region_to_bed3_none() {
985 let mut input_bam = InputBam::default();
986
987 let header_view = bam::HeaderView::from_bytes(indoc! {b"@HD\tVN:1.6\tSO:coordinate
988 @SQ\tSN:chr1\tLN:248956422\n"});
989
990 input_bam.convert_region_to_bed3(header_view).unwrap();
991 assert!(input_bam.region_bed3.is_none());
992 }
993
994 #[test]
995 fn input_bam_convert_region_to_bed3_with_region() {
996 let mut input_bam = InputBam {
997 region: Some(GenomicRegion::from_str("chr2:3400-3600").unwrap()),
998 ..Default::default()
999 };
1000
1001 let header_view = bam::HeaderView::from_bytes(indoc! {b"@HD\tVN:1.6\tSO:coordinate
1002 @SQ\tSN:chr1\tLN:3000
1003 @SQ\tSN:chr2\tLN:4000\n"});
1004
1005 input_bam.convert_region_to_bed3(header_view).unwrap();
1006 assert!(input_bam.region_bed3.is_some());
1007
1008 let bed3 = input_bam.region_bed3.unwrap();
1009 assert_eq!(bed3.chr(), &1);
1010 assert_eq!(bed3.start(), 3400);
1011 assert_eq!(bed3.end(), 3600);
1012 }
1013
1014 #[test]
1015 #[should_panic(expected = "InvalidRegion")]
1016 fn input_bam_convert_region_to_bed3_invalid_region() {
1017 let mut input_bam = InputBam {
1018 region: Some(GenomicRegion::from_str("chr2:4400-4600").expect("no error")),
1019 ..Default::default()
1020 };
1021 let header_view = bam::HeaderView::from_bytes(indoc! {b"@HD\tVN:1.6\tSO:coordinate
1022 @SQ\tSN:chr1\tLN:3000
1023 @SQ\tSN:chr2\tLN:4000\n"});
1024
1025 input_bam.convert_region_to_bed3(header_view).unwrap();
1026 }
1027
1028 #[test]
1029 #[should_panic(expected = "InvalidRegion")]
1030 fn input_bam_convert_region_to_bed3_invalid_open_ended_region() {
1031 let mut input_bam = InputBam {
1032 region: Some(GenomicRegion::from_str("chr2:4600-").expect("no error")),
1033 ..Default::default()
1034 };
1035 let header_view = bam::HeaderView::from_bytes(indoc! {b"@HD\tVN:1.6\tSO:coordinate
1036 @SQ\tSN:chr1\tLN:3000
1037 @SQ\tSN:chr2\tLN:4000\n"});
1038
1039 input_bam.convert_region_to_bed3(header_view).unwrap();
1040 }
1041
1042 #[test]
1043 #[should_panic(expected = "InvalidAlignCoords")]
1044 fn input_bam_convert_region_to_bed3_invalid_contig() {
1045 let mut input_bam = InputBam {
1046 region: Some(GenomicRegion::from_str("chr3:1000-2000").expect("no error")),
1047 ..Default::default()
1048 };
1049 let header_view = bam::HeaderView::from_bytes(indoc! {b"@HD\tVN:1.6\tSO:coordinate
1050 @SQ\tSN:chr1\tLN:3000
1051 @SQ\tSN:chr2\tLN:4000\n"});
1052
1053 input_bam.convert_region_to_bed3(header_view).unwrap();
1054 }
1055
1056 #[test]
1057 fn input_bam_convert_region_to_bed3_already_set_bed3() {
1058 // When region_bed3 is already set and region is None, should not convert
1059 let mut input_bam = InputBam {
1060 region: None,
1061 region_bed3: Some(Bed3::<i32, u64>::new(1, 1000, 2000)),
1062 ..Default::default()
1063 };
1064
1065 let header_view = bam::HeaderView::from_bytes(indoc! {b"@HD\tVN:1.6\tSO:coordinate
1066 @SQ\tSN:chr1\tLN:3000
1067 @SQ\tSN:chr2\tLN:4000\n"});
1068
1069 input_bam.convert_region_to_bed3(header_view).unwrap();
1070
1071 // Bed3 should remain unchanged
1072 assert!(input_bam.region_bed3.is_some());
1073 let bed3 = input_bam.region_bed3.unwrap();
1074 assert_eq!(bed3.chr(), &1);
1075 assert_eq!(bed3.start(), 1000);
1076 assert_eq!(bed3.end(), 2000);
1077 }
1078
1079 #[test]
1080 #[should_panic(expected = "InvalidState")]
1081 fn input_bam_convert_region_to_bed3_both_set() {
1082 // When both region and region_bed3 are set, should error
1083 let mut input_bam = InputBam {
1084 region: Some(GenomicRegion::from_str("chr2:1500-2500").unwrap()),
1085 region_bed3: Some(Bed3::<i32, u64>::new(1, 1000, 2000)),
1086 ..Default::default()
1087 };
1088
1089 let header_view = bam::HeaderView::from_bytes(indoc! {b"@HD\tVN:1.6\tSO:coordinate
1090 @SQ\tSN:chr1\tLN:3000
1091 @SQ\tSN:chr2\tLN:4000\n"});
1092
1093 input_bam.convert_region_to_bed3(header_view).unwrap();
1094 }
1095}
1096
1097#[cfg(test)]
1098mod validate_builder_functions {
1099 use super::*;
1100
1101 // Tests for InputBamBuilder::validate conflicts
1102
1103 #[test]
1104 #[should_panic(expected = "BuilderValidation")]
1105 fn input_bam_builder_full_region_without_region_fails() {
1106 let _: InputBam = InputBamBuilder::default()
1107 .bam_path(PathOrURLOrStdin::Path("/some/path.bam".into()))
1108 .full_region(true)
1109 .build()
1110 .unwrap();
1111 }
1112
1113 #[test]
1114 #[should_panic(expected = "BuilderValidation")]
1115 fn input_bam_builder_both_region_and_region_bed3_fails() {
1116 let _: InputBam = InputBamBuilder::default()
1117 .bam_path(PathOrURLOrStdin::Path("/some/path.bam".into()))
1118 .region("chr1:1000-2000".into())
1119 .region_bed3(Bed3::<i32, u64>::new(0, 1000, 2000))
1120 .build()
1121 .unwrap();
1122 }
1123
1124 #[test]
1125 #[should_panic(expected = "BuilderValidation")]
1126 fn input_bam_builder_read_id_and_read_id_list_fails() {
1127 let _: InputBam = InputBamBuilder::default()
1128 .bam_path(PathOrURLOrStdin::Path("/some/path.bam".into()))
1129 .read_id("some-id")
1130 .read_id_list("/some/file.txt")
1131 .build()
1132 .unwrap();
1133 }
1134
1135 #[test]
1136 #[should_panic(expected = "BuilderValidation")]
1137 fn input_bam_builder_read_id_and_read_id_set_fails() {
1138 let mut read_ids = HashSet::new();
1139 let _: bool = read_ids.insert("read1".to_owned());
1140 let _: InputBam = InputBamBuilder::default()
1141 .bam_path(PathOrURLOrStdin::Path("/some/path.bam".into()))
1142 .read_id("some-id")
1143 .read_id_set(read_ids)
1144 .build()
1145 .unwrap();
1146 }
1147
1148 #[test]
1149 #[should_panic(expected = "BuilderValidation")]
1150 fn input_bam_builder_read_id_list_and_read_id_set_fails() {
1151 let mut read_ids = HashSet::new();
1152 let _: bool = read_ids.insert("read1".to_owned());
1153 let _: InputBam = InputBamBuilder::default()
1154 .bam_path(PathOrURLOrStdin::Path("/some/path.bam".into()))
1155 .read_id_list("/some/file.txt")
1156 .read_id_set(read_ids)
1157 .build()
1158 .unwrap();
1159 }
1160
1161 #[test]
1162 #[should_panic(expected = "BuilderValidation")]
1163 fn input_bam_builder_all_three_read_id_options_fails() {
1164 let mut read_ids = HashSet::new();
1165 let _: bool = read_ids.insert("read1".to_owned());
1166 let _: InputBam = InputBamBuilder::default()
1167 .bam_path(PathOrURLOrStdin::Path("/some/path.bam".into()))
1168 .read_id("some-id")
1169 .read_id_list("/some/file.txt")
1170 .read_id_set(read_ids)
1171 .build()
1172 .unwrap();
1173 }
1174
1175 // Tests for InputModsBuilder::validate conflicts
1176
1177 #[test]
1178 #[should_panic(expected = "BuilderValidation")]
1179 fn input_mods_builder_both_mod_region_and_region_bed3_fails() {
1180 let _: InputMods<OptionalTag> = InputModsBuilder::<OptionalTag>::default()
1181 .mod_prob_filter(ThresholdState::GtEq(0))
1182 .mod_region("chr1:1000-2000".into())
1183 .region_bed3(Bed3::<i32, u64>::new(0, 1000, 2000))
1184 .build()
1185 .unwrap();
1186 }
1187
1188 #[test]
1189 #[should_panic(expected = "BuilderValidation")]
1190 fn input_mods_builder_required_tag_both_regions_fails() {
1191 let _: InputMods<RequiredTag> = InputModsBuilder::<RequiredTag>::default()
1192 .tag(RequiredTag::from_str("m").unwrap())
1193 .mod_prob_filter(ThresholdState::GtEq(0))
1194 .mod_region("chr1:1000-2000".into())
1195 .region_bed3(Bed3::<i32, u64>::new(0, 1000, 2000))
1196 .build()
1197 .unwrap();
1198 }
1199}