bed_utils/
bed.rs

1pub mod io;
2pub mod map;
3
4mod bed_trait;
5pub use bed_trait::*;
6mod score;
7use bincode::{Decode, Encode};
8pub use score::Score;
9mod strand;
10pub use strand::Strand;
11
12use std::{fmt::{self, Write}, ops::Deref, str::FromStr};
13
/// Column separator used by the `Display` and `FromStr` impls in this file.
const DELIMITER: char = '\t';
/// Placeholder written for an absent name/score/strand, and recognized as
/// "absent" when those columns are parsed.
const MISSING_ITEM : &str = ".";
16
/// A minimal BED record with only 3 fields.
///
/// The tuple fields are, in order: chromosome name, start position and end
/// position. Because `Ord`/`PartialOrd` are derived, records compare by
/// chromosome name first (as a string), then by start, then by end.
#[derive(Encode, Decode, Clone, Debug, Eq, PartialEq, Ord, PartialOrd, Hash)]
pub struct GenomicRange(String, u64, u64);
20
21impl GenomicRange {
22    pub fn new<C>(chrom: C, start: u64, end: u64) -> Self
23    where
24        C: Into<String>,
25    { Self(chrom.into(), start, end) }
26
27    /// Convert the record to a string representation: chr:start-end
28    pub fn pretty_show(&self) -> String {
29        format!("{}:{}-{}", self.0, self.1, self.2)
30    }
31}
32
33/// Convert string to GenomicRange. '\t', ':', and '-' are all considered as
34/// valid delimiters. So any of the following formats is valid:
35/// * chr1\t100\t200
36/// * chr1:100-200
37impl FromStr for GenomicRange {
38    type Err = ParseError;
39
40    fn from_str(s: &str) -> Result<Self, Self::Err> {
41        let mut fields = s.split(&['\t', ':', '-']);
42        let chrom = parse_chrom(&mut fields)?;
43        let start = parse_start(&mut fields)?;
44        let end = parse_end(&mut fields)?;
45        Ok(GenomicRange::new(chrom, start, end))
46    }
47}
48
49impl BEDLike for GenomicRange {
50    fn chrom(&self) -> &str { &self.0 }
51    fn set_chrom(&mut self, chrom: &str) -> &mut Self {
52        self.0 = chrom.to_string();
53        self
54    }
55    fn start(&self) -> u64 { self.1 }
56    fn set_start(&mut self, start: u64) -> &mut Self {
57        self.1 = start;
58        self
59    }
60    fn end(&self) -> u64 { self.2 }
61    fn set_end(&mut self, end: u64) -> &mut Self {
62        self.2 = end;
63        self
64    }
65    fn name(&self) -> Option<&str> { None }
66    fn score(&self) -> Option<Score> { None }
67    fn strand(&self) -> Option<Strand> { None }
68}
69
70impl fmt::Display for GenomicRange {
71    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
72        write!(f, "{}{}{}{}{}", self.chrom(), DELIMITER, self.start(),
73            DELIMITER, self.end()
74        )?;
75        Ok(())
76    }
77}
78
79
/// A standard BED record.
///
/// The const parameter `N` selects how many leading columns the `Display`
/// and `FromStr` impls in this file handle: the name when `N > 3`, the score
/// when `N > 4` and the strand when `N > 5`.
#[derive(Encode, Decode, Clone, Debug, Eq, PartialEq)]
pub struct BED<const N: u8> {
    /// Chromosome name.
    chrom: String,
    /// Start position.
    start: u64,
    /// End position.
    end: u64,
    /// Record name; `None` is written as `.`.
    pub name: Option<String>,
    /// Record score; `None` is written as `.`.
    pub score: Option<Score>,
    /// Strand; `None` is written as `.`.
    pub strand: Option<Strand>,
    /// Additional columns. NOTE(review): the `Display` impl in this file does
    /// not write them and `FromStr` always leaves them empty — confirm that
    /// this is intended.
    pub optional_fields: OptionalFields,
}
91
92impl<const N: u8> BED<N> {
93    pub fn new<C>(chrom: C, start: u64, end: u64, name: Option<String>,
94        score: Option<Score>, strand: Option<Strand>, optional_fields: OptionalFields) -> Self
95    where
96        C: Into<String>,
97    { Self { chrom: chrom.into(), start, end, name, score, strand, optional_fields } }
98}
99
100impl<const N: u8> BEDLike for BED<N> {
101    fn chrom(&self) -> &str { &self.chrom }
102    fn set_chrom(&mut self, chrom: &str) -> &mut Self {
103        self.chrom = chrom.to_string();
104        self
105    }
106    fn start(&self) -> u64 { self.start }
107    fn set_start(&mut self, start: u64) -> &mut Self {
108        self.start = start;
109        self
110    }
111    fn end(&self) -> u64 { self.end }
112    fn set_end(&mut self, end: u64) -> &mut Self {
113        self.end = end;
114        self
115    }
116    fn name(&self) -> Option<&str> { self.name.as_deref() }
117    fn score(&self) -> Option<Score> { self.score }
118    fn strand(&self) -> Option<Strand> { self.strand }
119}
120
121// Display trait
122impl<const N: u8> fmt::Display for BED<N> {
123    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
124        write!(
125            f,
126            "{}{}{}{}{}",
127            self.chrom(),
128            DELIMITER,
129            self.start(),
130            DELIMITER,
131            self.end()
132        )?;
133        if N > 3 {
134            write!(f, "{}{}", DELIMITER, self.name().unwrap_or(MISSING_ITEM))?;
135            if N > 4 {
136                f.write_char(DELIMITER)?;
137                if let Some(score) = self.score() {
138                    write!(f, "{}", score)?;
139                } else { f.write_str(MISSING_ITEM)?; }
140
141                if N > 5 {
142                    f.write_char(DELIMITER)?;
143                    if let Some(strand) = self.strand() {
144                        write!(f, "{}", strand)?;
145                    } else { f.write_str(MISSING_ITEM)?; }
146                }
147            }
148        }
149        Ok(())
150    }
151}
152
153impl<const N: u8> FromStr for BED<N> {
154    type Err = ParseError;
155
156    fn from_str(s: &str) -> Result<Self, Self::Err> {
157        let mut fields = s.split(DELIMITER);
158        let chrom = parse_chrom(&mut fields)?;
159        let start = parse_start(&mut fields)?;
160        let end = parse_end(&mut fields)?;
161        let name = if N > 3 { parse_name(&mut fields)? } else { None };
162        let score = if N > 4 { parse_score(&mut fields)? } else { None };
163        let strand = if N > 5 { parse_strand(&mut fields)? } else { None };
164        Ok(BED::new(chrom, start, end, name, score, strand, OptionalFields::default()))
165    }
166}
167
/// Generic BED record optional fields.
///
/// A thin wrapper around a list of column strings, kept in the order given.
#[derive(Encode, Decode, Clone, Debug, Default, Eq, PartialEq)]
pub struct OptionalFields(Vec<String>);
171
172impl Deref for OptionalFields {
173    type Target = [String];
174
175    fn deref(&self) -> &Self::Target {
176        &self.0
177    }
178}
179
180impl fmt::Display for OptionalFields {
181    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
182        for (i, field) in self.0.iter().enumerate() {
183            if i > 0 {
184                f.write_char(DELIMITER)?;
185            }
186
187            f.write_str(field)?;
188        }
189
190        Ok(())
191    }
192}
193
194impl From<Vec<String>> for OptionalFields {
195    fn from(fields: Vec<String>) -> Self {
196        Self(fields)
197    }
198}
199
200
/// A NarrowPeak record is a BED6+4 format that is used to store called peaks.
///
/// Fields are listed in the column order used by the `Display` and `FromStr`
/// impls in this file.
#[derive(Encode, Decode, Clone, Debug, PartialEq)]
pub struct NarrowPeak {
    /// Chromosome name.
    pub chrom: String,
    /// Start position.
    pub start: u64,
    /// End position.
    pub end: u64,
    /// Peak name; `None` is written as `.`.
    pub name: Option<String>,
    /// Score; `None` is written as `.`.
    pub score: Option<Score>,
    /// Strand; `None` is written as `.`.
    pub strand: Option<Strand>,
    /// Signal value column.
    pub signal_value: f64,
    /// p-value column; `None` is written as `-1`, and a negative parsed value is read back as `None`.
    pub p_value: Option<f64>, 
    /// q-value column; `None` is written as `-1`, and a negative parsed value is read back as `None`.
    pub q_value: Option<f64>, 
    /// Last column. Presumably the summit offset relative to `start`, as in the ENCODE narrowPeak format — TODO confirm.
    pub peak: u64, 
}
215
216impl BEDLike for NarrowPeak {
217    fn chrom(&self) -> &str { &self.chrom }
218    fn set_chrom(&mut self, chrom: &str) -> &mut Self {
219        self.chrom = chrom.to_string();
220        self
221    }
222    fn start(&self) -> u64 { self.start }
223    fn set_start(&mut self, start: u64) -> &mut Self {
224        self.start = start;
225        self
226    }
227    fn end(&self) -> u64 { self.end }
228    fn set_end(&mut self, end: u64) -> &mut Self {
229        self.end = end;
230        self
231    }
232    fn name(&self) -> Option<&str> { self.name.as_deref() }
233    fn score(&self) -> Option<Score> { self.score }
234    fn strand(&self) -> Option<Strand> { self.strand }
235}
236
237impl fmt::Display for NarrowPeak {
238    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
239        write!(
240            f,
241            "{}{}{}{}{}{}{}",
242            self.chrom(),
243            DELIMITER, self.start(),
244            DELIMITER, self.end(),
245            DELIMITER, self.name().unwrap_or(MISSING_ITEM),
246        )?;
247
248        f.write_char(DELIMITER)?;
249        if let Some(x) = self.score() {
250            write!(f, "{}", x)?;
251        } else {
252            f.write_str(MISSING_ITEM)?;
253        }
254        f.write_char(DELIMITER)?;
255        if let Some(x) = self.strand() {
256            write!(f, "{}", x)?;
257        } else {
258            f.write_str(MISSING_ITEM)?;
259        }
260        write!(
261            f,
262            "{}{}{}{}{}{}{}{}",
263            DELIMITER, self.signal_value,
264            DELIMITER, self.p_value.unwrap_or(-1.0),
265            DELIMITER, self.q_value.unwrap_or(-1.0),
266            DELIMITER, self.peak,
267        )?;
268
269        Ok(())
270    }
271}
272
273impl FromStr for NarrowPeak {
274    type Err = ParseError;
275
276    fn from_str(s: &str) -> Result<Self, Self::Err> {
277        let mut fields = s.split(DELIMITER);
278        Ok(Self {
279            chrom: parse_chrom(&mut fields)?.to_string(),
280            start: parse_start(&mut fields)?,
281            end: parse_end(&mut fields)?,
282            name: parse_name(&mut fields)?,
283            score: parse_score(&mut fields)?,
284            strand: parse_strand(&mut fields)?,
285            signal_value: fields.next().unwrap().parse().unwrap(),
286            p_value: parse_pvalue(&mut fields).unwrap(),
287            q_value: parse_pvalue(&mut fields).unwrap(),
288            peak: fields.next().unwrap().parse().unwrap(),
289        })
290    }
291}
292
/// A BroadPeak record is a BED6+3 format that is used to store called peaks.
///
/// Fields are listed in the column order used by the `Display` and `FromStr`
/// impls in this file.
#[derive(Encode, Decode, Clone, Debug, PartialEq)]
pub struct BroadPeak {
    /// Chromosome name.
    pub chrom: String,
    /// Start position.
    pub start: u64,
    /// End position.
    pub end: u64,
    /// Peak name; `None` is written as `.`.
    pub name: Option<String>,
    /// Score; `None` is written as `.`.
    pub score: Option<Score>,
    /// Strand; `None` is written as `.`.
    pub strand: Option<Strand>,
    /// Signal value column.
    pub signal_value: f64,
    /// p-value column; `None` is written as `-1`, and a negative parsed value is read back as `None`.
    pub p_value: Option<f64>, 
    /// q-value column; `None` is written as `-1`, and a negative parsed value is read back as `None`.
    pub q_value: Option<f64>, 
}
306
307impl BEDLike for BroadPeak {
308    fn chrom(&self) -> &str { &self.chrom }
309    fn set_chrom(&mut self, chrom: &str) -> &mut Self {
310        self.chrom = chrom.to_string();
311        self
312    }
313    fn start(&self) -> u64 { self.start }
314    fn set_start(&mut self, start: u64) -> &mut Self {
315        self.start = start;
316        self
317    }
318    fn end(&self) -> u64 { self.end }
319    fn set_end(&mut self, end: u64) -> &mut Self {
320        self.end = end;
321        self
322    }
323    fn name(&self) -> Option<&str> { self.name.as_deref() }
324    fn score(&self) -> Option<Score> { self.score }
325    fn strand(&self) -> Option<Strand> { self.strand }
326}
327
328impl fmt::Display for BroadPeak {
329    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
330        write!(
331            f,
332            "{}{}{}{}{}{}{}",
333            self.chrom(),
334            DELIMITER, self.start(),
335            DELIMITER, self.end(),
336            DELIMITER, self.name().unwrap_or(MISSING_ITEM),
337        )?;
338
339        f.write_char(DELIMITER)?;
340        if let Some(x) = self.score() {
341            write!(f, "{}", x)?;
342        } else {
343            f.write_str(MISSING_ITEM)?;
344        }
345        f.write_char(DELIMITER)?;
346        if let Some(x) = self.strand() {
347            write!(f, "{}", x)?;
348        } else {
349            f.write_str(MISSING_ITEM)?;
350        }
351        write!(
352            f,
353            "{}{}{}{}{}{}",
354            DELIMITER, self.signal_value,
355            DELIMITER, self.p_value.unwrap_or(-1.0),
356            DELIMITER, self.q_value.unwrap_or(-1.0),
357        )?;
358
359        Ok(())
360    }
361}
362
363impl FromStr for BroadPeak {
364    type Err = ParseError;
365
366    fn from_str(s: &str) -> Result<Self, Self::Err> {
367        let mut fields = s.split(DELIMITER);
368        Ok(Self {
369            chrom: parse_chrom(&mut fields)?.to_string(),
370            start: parse_start(&mut fields)?,
371            end: parse_end(&mut fields)?,
372            name: parse_name(&mut fields)?,
373            score: parse_score(&mut fields)?,
374            strand: parse_strand(&mut fields)?,
375            signal_value: fields.next().unwrap().parse().unwrap(),
376            p_value: parse_pvalue(&mut fields).unwrap(),
377            q_value: parse_pvalue(&mut fields).unwrap(),
378        })
379    }
380}
381
/// The bedGraph format allows display of continuous-valued data in track format.
/// This display type is useful for probability scores and transcriptome data.
///
/// `V` is the type of the per-interval value stored in the fourth column.
#[derive(Encode, Decode, Clone, Debug, PartialEq)]
pub struct BedGraph<V> {
    /// Chromosome name.
    pub chrom: String,
    /// Start position.
    pub start: u64,
    /// End position.
    pub end: u64,
    /// Value attached to the interval.
    pub value: V,
}
391
392impl<V> BedGraph<V> {
393    pub fn new<C>(chrom: C, start: u64, end: u64, value: V) -> Self
394    where
395        C: Into<String>,
396    { Self { chrom: chrom.into(), start, end, value } }
397
398    pub fn from_bed<B: BEDLike>(bed: &B, value: V) -> Self {
399        Self::new(bed.chrom(), bed.start(), bed.end(), value)
400    }
401}
402
403impl<V> BEDLike for BedGraph<V> {
404    fn chrom(&self) -> &str { &self.chrom }
405    fn set_chrom(&mut self, chrom: &str) -> &mut Self {
406        self.chrom = chrom.to_string();
407        self
408    }
409    fn start(&self) -> u64 { self.start }
410    fn set_start(&mut self, start: u64) -> &mut Self {
411        self.start = start;
412        self
413    }
414    fn end(&self) -> u64 { self.end }
415    fn set_end(&mut self, end: u64) -> &mut Self {
416        self.end = end;
417        self
418    }
419    fn name(&self) -> Option<&str> { None }
420    fn score(&self) -> Option<Score> { None }
421    fn strand(&self) -> Option<Strand> { None }
422}
423
424impl<V> fmt::Display for BedGraph<V>
425where
426    V: fmt::Display,
427{
428    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result
429    {
430        write!(
431            f,
432            "{}{}{}{}{}{}{}",
433            self.chrom(),
434            DELIMITER, self.start(),
435            DELIMITER, self.end(),
436            DELIMITER, self.value,
437        )
438    }
439}
440
441impl<V> FromStr for BedGraph<V>
442where
443    V: FromStr,
444    <V as FromStr>::Err: std::fmt::Debug,
445{
446    type Err = ParseError;
447
448    fn from_str(s: &str) -> Result<Self, Self::Err>
449    {
450        let mut fields = s.split(DELIMITER);
451        Ok(Self {
452            chrom: parse_chrom(&mut fields)?.to_string(),
453            start: parse_start(&mut fields)?,
454            end: parse_end(&mut fields)?,
455            value: fields.next().unwrap().parse().unwrap(),
456        })
457    }
458}
459
460fn parse_chrom<'a, I>(fields: &mut I) -> Result<&'a str, ParseError>
461where
462    I: Iterator<Item = &'a str>,
463{
464    fields
465        .next()
466        .ok_or(ParseError::MissingReferenceSequenceName)
467}
468
469fn parse_start<'a, I>(fields: &mut I) -> Result<u64, ParseError>
470where
471    I: Iterator<Item = &'a str>,
472{
473    fields
474        .next()
475        .ok_or(ParseError::MissingStartPosition)
476        .and_then(|s| lexical::parse(s).map_err(ParseError::InvalidStartPosition))
477}
478
479fn parse_end<'a, I>(fields: &mut I) -> Result<u64, ParseError>
480where
481    I: Iterator<Item = &'a str>,
482{
483    fields
484        .next()
485        .ok_or(ParseError::MissingEndPosition)
486        .and_then(|s| lexical::parse(s).map_err(ParseError::InvalidEndPosition))
487}
488
489fn parse_name<'a, I>(fields: &mut I) -> Result<Option<String>, ParseError>
490where
491    I: Iterator<Item = &'a str>,
492{
493    fields
494        .next()
495        .ok_or(ParseError::MissingName)
496        .map(|s| match s {
497            MISSING_ITEM => None,
498            _ => Some(s.into()),
499        })
500}
501
502fn parse_score<'a, I>(fields: &mut I) -> Result<Option<Score>, ParseError>
503where
504    I: Iterator<Item = &'a str>,
505{
506    fields
507        .next()
508        .ok_or(ParseError::MissingScore)
509        .and_then(|s| match s {
510            MISSING_ITEM => Ok(None),
511            _ => s.parse().map(Some).map_err(ParseError::InvalidScore),
512        })
513}
514
515fn parse_strand<'a, I>(fields: &mut I) -> Result<Option<Strand>, ParseError>
516where
517    I: Iterator<Item = &'a str>,
518{
519    fields
520        .next()
521        .ok_or(ParseError::MissingStrand)
522        .and_then(|s| match s {
523            MISSING_ITEM => Ok(None),
524            _ => s.parse().map(Some).map_err(ParseError::InvalidStrand),
525        })
526}
527
528fn parse_pvalue<'a, I>(fields: &mut I) -> Result<Option<f64>, ParseError>
529where
530    I: Iterator<Item = &'a str>,
531{
532    fields
533        .next()
534        .ok_or(ParseError::MissingScore)
535        .and_then(|s| {
536            let p = s.parse().unwrap();
537            if p < 0.0 { Ok(None) } else { Ok(Some(p)) }
538        })
539}
540
541/// An error returned when a raw BED record fails to parse.
542#[derive(Clone, Debug, Eq, PartialEq)]
543pub enum ParseError {
544    /// The reference sequence name is missing.
545    MissingReferenceSequenceName,
546    /// The start position is missing.
547    MissingStartPosition,
548    /// The start position is invalid.
549    InvalidStartPosition(lexical::Error),
550    /// The end position is missing.
551    MissingEndPosition,
552    /// The end position is invalid.
553    InvalidEndPosition(lexical::Error),
554    /// The name is missing.
555    MissingName,
556    /// The score is missing.
557    MissingScore,
558    /// The score is invalid.
559    InvalidScore(score::ParseError),
560    /// The strand is missing.
561    MissingStrand,
562    /// The strand is invalid.
563    InvalidStrand(strand::ParseError),
564}
565
#[cfg(test)]
mod bed_tests {
    use super::*;

    /// Checks `OptionalFields` formatting and the text forms accepted when
    /// parsing a `GenomicRange`.
    #[test]
    fn test_fmt() {
        // Zero, one and two optional columns.
        assert_eq!(OptionalFields::default().to_string(), "");
        assert_eq!(
            OptionalFields::from(vec!["n".to_string()]).to_string(),
            "n"
        );
        assert_eq!(
            OptionalFields::from(vec!["n".to_string(), "d".to_string()]).to_string(),
            "n\td"
        );

        // Every accepted delimiter combination must give the same range.
        let expected = GenomicRange::new("chr1", 100, 200);
        let inputs = [
            "chr1\t100\t200",
            "chr1-100-200",
            "chr1:100-200",
            "chr1:100:200",
        ];
        for text in inputs.iter() {
            let parsed: GenomicRange = text.parse().unwrap();
            assert_eq!(expected, parsed);
        }

        // `pretty_show` output must parse back to the same range.
        let round_trip: GenomicRange = expected.pretty_show().parse().unwrap();
        assert_eq!(expected, round_trip);
    }
}
588}