Skip to main content

bed_utils/
bed.rs

1pub mod io;
2pub mod map;
3
4mod bed_trait;
5pub use bed_trait::*;
6mod score;
7use rkyv::{Archive, Deserialize as RkyvDeserialize, Serialize as RkyvSerialize};
8pub use score::Score;
9mod strand;
10pub use strand::Strand;
11
12use std::{
13    fmt::{self, Write},
14    ops::Deref,
15    str::FromStr,
16};
17
18#[cfg(feature = "serde")]
19use serde::{Deserialize, Serialize};
20
21const DELIMITER: char = '\t';
22const MISSING_ITEM: &str = ".";
23
24/// A minimal BED record with only 3 fields.
25#[derive(
26    Archive, RkyvSerialize, RkyvDeserialize, Clone, Debug, Eq, PartialEq, Ord, PartialOrd, Hash,
27)]
28#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
29pub struct GenomicRange(String, u64, u64);
30
31impl GenomicRange {
32    pub fn new<C>(chrom: C, start: u64, end: u64) -> Self
33    where
34        C: Into<String>,
35    {
36        Self(chrom.into(), start, end)
37    }
38
39    /// Convert the record to a string representation: chr:start-end
40    pub fn pretty_show(&self) -> String {
41        format!("{}:{}-{}", self.0, self.1, self.2)
42    }
43}
44
45/// Convert string to GenomicRange. '\t', ':', and '-' are all considered as
46/// valid delimiters. So any of the following formats is valid:
47/// * chr1\t100\t200
48/// * chr1:100-200
49impl FromStr for GenomicRange {
50    type Err = ParseError;
51
52    fn from_str(s: &str) -> Result<Self, Self::Err> {
53        let mut fields = s.split(&['\t', ':', '-']);
54        let chrom = parse_chrom(&mut fields)?;
55        let start = parse_start(&mut fields)?;
56        let end = parse_end(&mut fields)?;
57        Ok(GenomicRange::new(chrom, start, end))
58    }
59}
60
61impl BEDLike for GenomicRange {
62    fn chrom(&self) -> &str {
63        &self.0
64    }
65    fn set_chrom(&mut self, chrom: &str) -> &mut Self {
66        self.0 = chrom.to_string();
67        self
68    }
69    fn start(&self) -> u64 {
70        self.1
71    }
72    fn set_start(&mut self, start: u64) -> &mut Self {
73        self.1 = start;
74        self
75    }
76    fn end(&self) -> u64 {
77        self.2
78    }
79    fn set_end(&mut self, end: u64) -> &mut Self {
80        self.2 = end;
81        self
82    }
83    fn name(&self) -> Option<&str> {
84        None
85    }
86    fn score(&self) -> Option<Score> {
87        None
88    }
89    fn strand(&self) -> Option<Strand> {
90        None
91    }
92}
93
94impl fmt::Display for GenomicRange {
95    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
96        write!(
97            f,
98            "{}{}{}{}{}",
99            self.chrom(),
100            DELIMITER,
101            self.start(),
102            DELIMITER,
103            self.end()
104        )?;
105        Ok(())
106    }
107}
108
109/// A standard BED record.
110#[derive(Archive, RkyvSerialize, RkyvDeserialize, Clone, Debug, Eq, PartialEq)]
111#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
112pub struct BED<const N: u8> {
113    chrom: String,
114    start: u64,
115    end: u64,
116    pub name: Option<String>,
117    pub score: Option<Score>,
118    pub strand: Option<Strand>,
119    pub optional_fields: OptionalFields,
120}
121
122impl<const N: u8> BED<N> {
123    pub fn new<C>(
124        chrom: C,
125        start: u64,
126        end: u64,
127        name: Option<String>,
128        score: Option<Score>,
129        strand: Option<Strand>,
130        optional_fields: OptionalFields,
131    ) -> Self
132    where
133        C: Into<String>,
134    {
135        Self {
136            chrom: chrom.into(),
137            start,
138            end,
139            name,
140            score,
141            strand,
142            optional_fields,
143        }
144    }
145}
146
147impl<const N: u8> BEDLike for BED<N> {
148    fn chrom(&self) -> &str {
149        &self.chrom
150    }
151    fn set_chrom(&mut self, chrom: &str) -> &mut Self {
152        self.chrom = chrom.to_string();
153        self
154    }
155    fn start(&self) -> u64 {
156        self.start
157    }
158    fn set_start(&mut self, start: u64) -> &mut Self {
159        self.start = start;
160        self
161    }
162    fn end(&self) -> u64 {
163        self.end
164    }
165    fn set_end(&mut self, end: u64) -> &mut Self {
166        self.end = end;
167        self
168    }
169    fn name(&self) -> Option<&str> {
170        self.name.as_deref()
171    }
172    fn score(&self) -> Option<Score> {
173        self.score
174    }
175    fn strand(&self) -> Option<Strand> {
176        self.strand
177    }
178}
179
180// Display trait
181impl<const N: u8> fmt::Display for BED<N> {
182    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
183        write!(
184            f,
185            "{}{}{}{}{}",
186            self.chrom(),
187            DELIMITER,
188            self.start(),
189            DELIMITER,
190            self.end()
191        )?;
192        if N > 3 {
193            write!(f, "{}{}", DELIMITER, self.name().unwrap_or(MISSING_ITEM))?;
194            if N > 4 {
195                f.write_char(DELIMITER)?;
196                if let Some(score) = self.score() {
197                    write!(f, "{}", score)?;
198                } else {
199                    f.write_str(MISSING_ITEM)?;
200                }
201
202                if N > 5 {
203                    f.write_char(DELIMITER)?;
204                    if let Some(strand) = self.strand() {
205                        write!(f, "{}", strand)?;
206                    } else {
207                        f.write_str(MISSING_ITEM)?;
208                    }
209                }
210            }
211        }
212        Ok(())
213    }
214}
215
216impl<const N: u8> FromStr for BED<N> {
217    type Err = ParseError;
218
219    fn from_str(s: &str) -> Result<Self, Self::Err> {
220        let mut fields = s.split(DELIMITER);
221        let chrom = parse_chrom(&mut fields)?;
222        let start = parse_start(&mut fields)?;
223        let end = parse_end(&mut fields)?;
224        let name = if N > 3 {
225            parse_name(&mut fields)?
226        } else {
227            None
228        };
229        let score = if N > 4 {
230            parse_score(&mut fields)?
231        } else {
232            None
233        };
234        let strand = if N > 5 {
235            parse_strand(&mut fields)?
236        } else {
237            None
238        };
239        Ok(BED::new(
240            chrom,
241            start,
242            end,
243            name,
244            score,
245            strand,
246            OptionalFields::default(),
247        ))
248    }
249}
250
251/// Generic BED record optional fields.
252#[derive(Archive, RkyvSerialize, RkyvDeserialize, Clone, Debug, Default, Eq, PartialEq)]
253#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
254pub struct OptionalFields(Vec<String>);
255
256impl Deref for OptionalFields {
257    type Target = [String];
258
259    fn deref(&self) -> &Self::Target {
260        &self.0
261    }
262}
263
264impl fmt::Display for OptionalFields {
265    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
266        for (i, field) in self.0.iter().enumerate() {
267            if i > 0 {
268                f.write_char(DELIMITER)?;
269            }
270
271            f.write_str(field)?;
272        }
273
274        Ok(())
275    }
276}
277
278impl From<Vec<String>> for OptionalFields {
279    fn from(fields: Vec<String>) -> Self {
280        Self(fields)
281    }
282}
283
284/// A NarrowPeak record is a BED6+4 format that is used to store called peaks.
285#[derive(Archive, RkyvSerialize, RkyvDeserialize, Clone, Debug, PartialEq)]
286#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
287pub struct NarrowPeak {
288    pub chrom: String,
289    pub start: u64,
290    pub end: u64,
291    pub name: Option<String>,
292    pub score: Option<Score>,
293    pub strand: Option<Strand>,
294    pub signal_value: f64,
295    pub p_value: Option<f64>,
296    pub q_value: Option<f64>,
297    pub peak: u64,
298}
299
300impl BEDLike for NarrowPeak {
301    fn chrom(&self) -> &str {
302        &self.chrom
303    }
304    fn set_chrom(&mut self, chrom: &str) -> &mut Self {
305        self.chrom = chrom.to_string();
306        self
307    }
308    fn start(&self) -> u64 {
309        self.start
310    }
311    fn set_start(&mut self, start: u64) -> &mut Self {
312        self.start = start;
313        self
314    }
315    fn end(&self) -> u64 {
316        self.end
317    }
318    fn set_end(&mut self, end: u64) -> &mut Self {
319        self.end = end;
320        self
321    }
322    fn name(&self) -> Option<&str> {
323        self.name.as_deref()
324    }
325    fn score(&self) -> Option<Score> {
326        self.score
327    }
328    fn strand(&self) -> Option<Strand> {
329        self.strand
330    }
331}
332
333impl fmt::Display for NarrowPeak {
334    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
335        write!(
336            f,
337            "{}{}{}{}{}{}{}",
338            self.chrom(),
339            DELIMITER,
340            self.start(),
341            DELIMITER,
342            self.end(),
343            DELIMITER,
344            self.name().unwrap_or(MISSING_ITEM),
345        )?;
346
347        f.write_char(DELIMITER)?;
348        if let Some(x) = self.score() {
349            write!(f, "{}", x)?;
350        } else {
351            f.write_str(MISSING_ITEM)?;
352        }
353        f.write_char(DELIMITER)?;
354        if let Some(x) = self.strand() {
355            write!(f, "{}", x)?;
356        } else {
357            f.write_str(MISSING_ITEM)?;
358        }
359        write!(
360            f,
361            "{}{}{}{}{}{}{}{}",
362            DELIMITER,
363            self.signal_value,
364            DELIMITER,
365            self.p_value.unwrap_or(-1.0),
366            DELIMITER,
367            self.q_value.unwrap_or(-1.0),
368            DELIMITER,
369            self.peak,
370        )?;
371
372        Ok(())
373    }
374}
375
376impl FromStr for NarrowPeak {
377    type Err = ParseError;
378
379    fn from_str(s: &str) -> Result<Self, Self::Err> {
380        let mut fields = s.split(DELIMITER);
381        Ok(Self {
382            chrom: parse_chrom(&mut fields)?.to_string(),
383            start: parse_start(&mut fields)?,
384            end: parse_end(&mut fields)?,
385            name: parse_name(&mut fields)?,
386            score: parse_score(&mut fields)?,
387            strand: parse_strand(&mut fields)?,
388            signal_value: fields.next().unwrap().parse().unwrap(),
389            p_value: parse_pvalue(&mut fields).unwrap(),
390            q_value: parse_pvalue(&mut fields).unwrap(),
391            peak: fields.next().unwrap().parse().unwrap(),
392        })
393    }
394}
395
396/// A BroadPeak record is a BED6+4 format that is used to store called peaks.
397#[derive(Archive, RkyvSerialize, RkyvDeserialize, Clone, Debug, PartialEq)]
398#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
399pub struct BroadPeak {
400    pub chrom: String,
401    pub start: u64,
402    pub end: u64,
403    pub name: Option<String>,
404    pub score: Option<Score>,
405    pub strand: Option<Strand>,
406    pub signal_value: f64,
407    pub p_value: Option<f64>,
408    pub q_value: Option<f64>,
409}
410
411impl BEDLike for BroadPeak {
412    fn chrom(&self) -> &str {
413        &self.chrom
414    }
415    fn set_chrom(&mut self, chrom: &str) -> &mut Self {
416        self.chrom = chrom.to_string();
417        self
418    }
419    fn start(&self) -> u64 {
420        self.start
421    }
422    fn set_start(&mut self, start: u64) -> &mut Self {
423        self.start = start;
424        self
425    }
426    fn end(&self) -> u64 {
427        self.end
428    }
429    fn set_end(&mut self, end: u64) -> &mut Self {
430        self.end = end;
431        self
432    }
433    fn name(&self) -> Option<&str> {
434        self.name.as_deref()
435    }
436    fn score(&self) -> Option<Score> {
437        self.score
438    }
439    fn strand(&self) -> Option<Strand> {
440        self.strand
441    }
442}
443
444impl fmt::Display for BroadPeak {
445    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
446        write!(
447            f,
448            "{}{}{}{}{}{}{}",
449            self.chrom(),
450            DELIMITER,
451            self.start(),
452            DELIMITER,
453            self.end(),
454            DELIMITER,
455            self.name().unwrap_or(MISSING_ITEM),
456        )?;
457
458        f.write_char(DELIMITER)?;
459        if let Some(x) = self.score() {
460            write!(f, "{}", x)?;
461        } else {
462            f.write_str(MISSING_ITEM)?;
463        }
464        f.write_char(DELIMITER)?;
465        if let Some(x) = self.strand() {
466            write!(f, "{}", x)?;
467        } else {
468            f.write_str(MISSING_ITEM)?;
469        }
470        write!(
471            f,
472            "{}{}{}{}{}{}",
473            DELIMITER,
474            self.signal_value,
475            DELIMITER,
476            self.p_value.unwrap_or(-1.0),
477            DELIMITER,
478            self.q_value.unwrap_or(-1.0),
479        )?;
480
481        Ok(())
482    }
483}
484
485impl FromStr for BroadPeak {
486    type Err = ParseError;
487
488    fn from_str(s: &str) -> Result<Self, Self::Err> {
489        let mut fields = s.split(DELIMITER);
490        Ok(Self {
491            chrom: parse_chrom(&mut fields)?.to_string(),
492            start: parse_start(&mut fields)?,
493            end: parse_end(&mut fields)?,
494            name: parse_name(&mut fields)?,
495            score: parse_score(&mut fields)?,
496            strand: parse_strand(&mut fields)?,
497            signal_value: fields.next().unwrap().parse().unwrap(),
498            p_value: parse_pvalue(&mut fields).unwrap(),
499            q_value: parse_pvalue(&mut fields).unwrap(),
500        })
501    }
502}
503
504/// The bedGraph format allows display of continuous-valued data in track format.
505/// This display type is useful for probability scores and transcriptome data.
506#[derive(Archive, RkyvSerialize, RkyvDeserialize, Clone, Debug, PartialEq)]
507#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
508pub struct BedGraph<V> {
509    pub chrom: String,
510    pub start: u64,
511    pub end: u64,
512    pub value: V,
513}
514
515impl<V> BedGraph<V> {
516    pub fn new<C>(chrom: C, start: u64, end: u64, value: V) -> Self
517    where
518        C: Into<String>,
519    {
520        Self {
521            chrom: chrom.into(),
522            start,
523            end,
524            value,
525        }
526    }
527
528    pub fn from_bed<B: BEDLike>(bed: &B, value: V) -> Self {
529        Self::new(bed.chrom(), bed.start(), bed.end(), value)
530    }
531}
532
533impl<V> BEDLike for BedGraph<V> {
534    fn chrom(&self) -> &str {
535        &self.chrom
536    }
537    fn set_chrom(&mut self, chrom: &str) -> &mut Self {
538        self.chrom = chrom.to_string();
539        self
540    }
541    fn start(&self) -> u64 {
542        self.start
543    }
544    fn set_start(&mut self, start: u64) -> &mut Self {
545        self.start = start;
546        self
547    }
548    fn end(&self) -> u64 {
549        self.end
550    }
551    fn set_end(&mut self, end: u64) -> &mut Self {
552        self.end = end;
553        self
554    }
555    fn name(&self) -> Option<&str> {
556        None
557    }
558    fn score(&self) -> Option<Score> {
559        None
560    }
561    fn strand(&self) -> Option<Strand> {
562        None
563    }
564}
565
566impl<V> fmt::Display for BedGraph<V>
567where
568    V: fmt::Display,
569{
570    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
571        write!(
572            f,
573            "{}{}{}{}{}{}{}",
574            self.chrom(),
575            DELIMITER,
576            self.start(),
577            DELIMITER,
578            self.end(),
579            DELIMITER,
580            self.value,
581        )
582    }
583}
584
585impl<V> FromStr for BedGraph<V>
586where
587    V: FromStr,
588    <V as FromStr>::Err: std::fmt::Debug,
589{
590    type Err = ParseError;
591
592    fn from_str(s: &str) -> Result<Self, Self::Err> {
593        let mut fields = s.split(DELIMITER);
594        Ok(Self {
595            chrom: parse_chrom(&mut fields)?.to_string(),
596            start: parse_start(&mut fields)?,
597            end: parse_end(&mut fields)?,
598            value: fields.next().unwrap().parse().unwrap(),
599        })
600    }
601}
602
603fn parse_chrom<'a, I>(fields: &mut I) -> Result<&'a str, ParseError>
604where
605    I: Iterator<Item = &'a str>,
606{
607    fields
608        .next()
609        .ok_or(ParseError::MissingReferenceSequenceName)
610}
611
612fn parse_start<'a, I>(fields: &mut I) -> Result<u64, ParseError>
613where
614    I: Iterator<Item = &'a str>,
615{
616    fields
617        .next()
618        .ok_or(ParseError::MissingStartPosition)
619        .and_then(|s| lexical::parse(s).map_err(ParseError::InvalidStartPosition))
620}
621
622fn parse_end<'a, I>(fields: &mut I) -> Result<u64, ParseError>
623where
624    I: Iterator<Item = &'a str>,
625{
626    fields
627        .next()
628        .ok_or(ParseError::MissingEndPosition)
629        .and_then(|s| lexical::parse(s).map_err(ParseError::InvalidEndPosition))
630}
631
632fn parse_name<'a, I>(fields: &mut I) -> Result<Option<String>, ParseError>
633where
634    I: Iterator<Item = &'a str>,
635{
636    fields
637        .next()
638        .ok_or(ParseError::MissingName)
639        .map(|s| match s {
640            MISSING_ITEM => None,
641            _ => Some(s.into()),
642        })
643}
644
645fn parse_score<'a, I>(fields: &mut I) -> Result<Option<Score>, ParseError>
646where
647    I: Iterator<Item = &'a str>,
648{
649    fields
650        .next()
651        .ok_or(ParseError::MissingScore)
652        .and_then(|s| match s {
653            MISSING_ITEM => Ok(None),
654            _ => s.parse().map(Some).map_err(ParseError::InvalidScore),
655        })
656}
657
658fn parse_strand<'a, I>(fields: &mut I) -> Result<Option<Strand>, ParseError>
659where
660    I: Iterator<Item = &'a str>,
661{
662    fields
663        .next()
664        .ok_or(ParseError::MissingStrand)
665        .and_then(|s| match s {
666            MISSING_ITEM => Ok(None),
667            _ => s.parse().map(Some).map_err(ParseError::InvalidStrand),
668        })
669}
670
671fn parse_pvalue<'a, I>(fields: &mut I) -> Result<Option<f64>, ParseError>
672where
673    I: Iterator<Item = &'a str>,
674{
675    fields.next().ok_or(ParseError::MissingScore).and_then(|s| {
676        let p = s.parse().unwrap();
677        if p < 0.0 {
678            Ok(None)
679        } else {
680            Ok(Some(p))
681        }
682    })
683}
684
685/// An error returned when a raw BED record fails to parse.
686#[derive(Clone, Debug, Eq, PartialEq)]
687pub enum ParseError {
688    /// The reference sequence name is missing.
689    MissingReferenceSequenceName,
690    /// The start position is missing.
691    MissingStartPosition,
692    /// The start position is invalid.
693    InvalidStartPosition(lexical::Error),
694    /// The end position is missing.
695    MissingEndPosition,
696    /// The end position is invalid.
697    InvalidEndPosition(lexical::Error),
698    /// The name is missing.
699    MissingName,
700    /// The score is missing.
701    MissingScore,
702    /// The score is invalid.
703    InvalidScore(score::ParseError),
704    /// The strand is missing.
705    MissingStrand,
706    /// The strand is invalid.
707    InvalidStrand(strand::ParseError),
708}
709
710#[cfg(test)]
711mod bed_tests {
712    use super::*;
713
714    #[test]
715    fn test_fmt() {
716        let fields = OptionalFields::default();
717        assert_eq!(fields.to_string(), "");
718
719        let fields = OptionalFields::from(vec![String::from("n")]);
720        assert_eq!(fields.to_string(), "n");
721
722        let fields = OptionalFields::from(vec![String::from("n"), String::from("d")]);
723        assert_eq!(fields.to_string(), "n\td");
724
725        let genomic_range = GenomicRange::new("chr1", 100, 200);
726        assert_eq!(
727            genomic_range,
728            GenomicRange::from_str("chr1\t100\t200").unwrap()
729        );
730        assert_eq!(
731            genomic_range,
732            GenomicRange::from_str("chr1-100-200").unwrap()
733        );
734        assert_eq!(
735            genomic_range,
736            GenomicRange::from_str("chr1:100-200").unwrap()
737        );
738        assert_eq!(
739            genomic_range,
740            GenomicRange::from_str("chr1:100:200").unwrap()
741        );
742        assert_eq!(
743            genomic_range,
744            GenomicRange::from_str(&genomic_range.pretty_show()).unwrap()
745        );
746    }
747}