tsg_core/graph/
node.rs

1use std::fmt;
2use std::str::FromStr;
3
4use crate::graph::Attribute;
5use ahash::HashMap;
6use anyhow::Context;
7use anyhow::Result;
8use bon::Builder;
9use bon::builder;
10use bstr::BString;
11use bstr::ByteSlice;
12use rayon::prelude::*;
13use serde_json::json;
14use std::io;
15use tracing::debug;
16
17/// Represents a simple interval with start and end positions.
18///
19/// An interval is defined by two positions:
20/// - `start`: The inclusive beginning position of the interval
21/// - `end`: The exclusive ending position of the interval
22///
23/// The interval spans from `start` (inclusive) to `end` (exclusive).
24#[derive(Debug, Builder, Clone)]
25pub struct Interval {
26    pub start: usize,
27    pub end: usize,
28}
29
30impl Interval {
31    /// Returns the length of the interval.
32    ///
33    /// The length is calculated as `end - start`.
34    ///
35    /// # Returns
36    /// The length of the interval as a `usize`.
37    pub fn span(&self) -> usize {
38        self.end - self.start
39    }
40}
41
42impl FromStr for Interval {
43    type Err = io::Error;
44
45    fn from_str(s: &str) -> Result<Self, Self::Err> {
46        let parts: Vec<&str> = s.split('-').collect();
47        if parts.len() != 2 {
48            return Err(io::Error::new(
49                io::ErrorKind::InvalidData,
50                format!("Invalid exon coordinates format: {}", s),
51            ));
52        }
53
54        let start = parts[0].parse::<usize>().map_err(|e| {
55            io::Error::new(
56                io::ErrorKind::InvalidData,
57                format!("Invalid start coordinate: {}", e),
58            )
59        })?;
60
61        let end = parts[1].parse::<usize>().map_err(|e| {
62            io::Error::new(
63                io::ErrorKind::InvalidData,
64                format!("Invalid end coordinate: {}", e),
65            )
66        })?;
67
68        Ok(Self { start, end })
69    }
70}
71
72#[derive(Debug, Builder, Clone, Default)]
73/// Represents a collection of exons, which are contiguous regions within genomic sequences.
74///
75/// Exons are the parts of a gene's DNA that code for proteins, and they're separated by
76/// non-coding regions called introns. This structure stores a collection of exons as intervals.
77///
78/// # Fields
79///
80/// * `exons` - A vector of intervals representing the positions of exons within a genomic sequence.
81pub struct Exons {
82    pub exons: Vec<Interval>,
83}
84
85impl FromStr for Exons {
86    type Err = io::Error;
87    fn from_str(s: &str) -> Result<Self, Self::Err> {
88        let exons = s
89            .split(',')
90            .map(|x| x.parse())
91            .collect::<Result<Vec<Interval>, Self::Err>>()?;
92        Ok(Exons { exons })
93    }
94}
95
96impl fmt::Display for Exons {
97    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
98        let exons = self
99            .exons
100            .iter()
101            .map(|x| format!("{}-{}", x.start, x.end))
102            .collect::<Vec<String>>()
103            .join(",");
104        write!(f, "{}", exons)
105    }
106}
107
108/// Methods for working with exon structures.
109///
110/// # Methods
111///
112/// - `introns()` - Calculates the intron intervals between exons
113/// - `is_empty()` - Checks if there are no exons
114/// - `len()` - Returns the number of exons
115/// - `span()` - Calculates the total number of bases covered by all exons
116/// - `first_exon()` - Gets a reference to the first exon
117/// - `last_exon()` - Gets a reference to the last exon
118///
119/// # Panics
120///
121/// - `first_exon()` will panic if there are no exons
122/// - `last_exon()` will panic if there are no exons
123impl Exons {
124    /// Returns a vector of intervals representing introns.
125    ///
126    /// Introns are the regions between consecutive exons. For each pair of adjacent exons,
127    /// an intron is created starting at the position immediately after the end of the first exon
128    /// and ending at the position immediately before the start of the second exon.
129    ///
130    /// # Returns
131    /// A `Vec<Interval>` containing all introns between exons in this structure.
132    pub fn introns(&self) -> Vec<Interval> {
133        let mut introns = Vec::with_capacity(self.exons.len().saturating_sub(1));
134        for i in 0..self.exons.len().saturating_sub(1) {
135            introns.push(Interval {
136                start: self.exons[i].end + 1,
137                end: self.exons[i + 1].start,
138            });
139        }
140        introns
141    }
142
143    /// Checks if the exon collection is empty.
144    ///
145    /// # Returns
146    /// `true` if there are no exons, `false` otherwise.
147    pub fn is_empty(&self) -> bool {
148        self.exons.is_empty()
149    }
150
151    /// Returns the number of exons.
152    ///
153    /// # Returns
154    /// The count of exons as a `usize`.
155    pub fn len(&self) -> usize {
156        self.exons.len()
157    }
158
159    /// Calculates the total span (combined length) of all exons.
160    ///
161    /// The span is computed by summing the lengths of all intervals,
162    /// where each interval length is calculated as `end - start + 1`.
163    ///
164    /// # Returns
165    /// The total span as a `usize`.
166    pub fn span(&self) -> usize {
167        self.exons.iter().map(|e| e.span()).sum()
168    }
169
170    /// Returns a reference to the first exon.
171    ///
172    /// # Panics
173    /// Panics if the exon collection is empty.
174    ///
175    /// # Returns
176    /// A reference to the first `Interval` in the exon collection.
177    pub fn first_exon(&self) -> &Interval {
178        &self.exons[0]
179    }
180
181    /// Returns a reference to the last exon.
182    ///
183    /// # Panics
184    /// Panics if the exon collection is empty.
185    ///
186    /// # Returns
187    /// A reference to the last `Interval` in the exon collection.
188    pub fn last_exon(&self) -> &Interval {
189        &self.exons[self.exons.len() - 1]
190    }
191}
192
193#[allow(clippy::duplicated_attributes)]
194#[derive(Debug, Clone, Builder, PartialEq)]
195#[builder(on(BString, into))] // This enables automatic conversion to BString from string types
196#[builder(on(ReadIdentity, into))] // This enables automatic conversion to ReadIdentity from strings
197pub struct ReadData {
198    pub id: BString,
199    pub identity: ReadIdentity,
200}
201
202impl fmt::Display for ReadData {
203    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
204        write!(f, "{}:{:?}", self.id, self.identity)
205    }
206}
207
208impl FromStr for ReadData {
209    type Err = io::Error;
210
211    fn from_str(s: &str) -> Result<Self, Self::Err> {
212        // <id>:<identity>
213        let fields: Vec<&str> = s.split(':').collect();
214        if fields.len() != 2 {
215            return Err(io::Error::new(
216                io::ErrorKind::InvalidData,
217                format!("Invalid read line format: {}", s),
218            ));
219        }
220
221        let id: BString = fields[0].into();
222        let identity = fields[1].parse()?;
223        Ok(Self { id, identity })
224    }
225}
226
227#[derive(Debug, Clone, PartialEq)]
228pub enum ReadIdentity {
229    SO, // source
230    IN, // intermediate
231    SI, // sink
232}
233
234impl fmt::Display for ReadIdentity {
235    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
236        match self {
237            ReadIdentity::SO => write!(f, "SO"),
238            ReadIdentity::IN => write!(f, "IN"),
239            ReadIdentity::SI => write!(f, "SI"),
240        }
241    }
242}
243
244impl FromStr for ReadIdentity {
245    type Err = io::Error;
246
247    fn from_str(s: &str) -> Result<Self, Self::Err> {
248        match s {
249            "SO" => Ok(ReadIdentity::SO),
250            "IN" => Ok(ReadIdentity::IN),
251            "SI" => Ok(ReadIdentity::SI),
252            _ => Err(io::Error::new(
253                io::ErrorKind::InvalidData,
254                format!("Invalid read identity: {}", s),
255            )),
256        }
257    }
258}
259
260impl From<&str> for ReadIdentity {
261    fn from(s: &str) -> Self {
262        s.parse().unwrap()
263    }
264}
265
266/// Represents DNA strand orientation
267#[derive(Debug, Clone, Copy, Default, PartialEq, Eq)]
268pub enum Strand {
269    #[default]
270    Forward,
271    Reverse,
272}
273
274impl FromStr for Strand {
275    type Err = anyhow::Error;
276
277    fn from_str(s: &str) -> Result<Self, Self::Err> {
278        match s {
279            "+" => Ok(Strand::Forward),
280            "-" => Ok(Strand::Reverse),
281            _ => Err(anyhow::anyhow!("Invalid strand: {}", s)),
282        }
283    }
284}
285
286impl fmt::Display for Strand {
287    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
288        match self {
289            Strand::Forward => write!(f, "+"),
290            Strand::Reverse => write!(f, "-"),
291        }
292    }
293}
294
295/// Node in the transcript segment graph
296#[derive(Debug, Clone, Default, Builder)]
297#[builder(on(BString, into))]
298pub struct NodeData {
299    pub id: BString,
300    #[builder(default)]
301    pub reference_id: BString,
302    #[builder(default)]
303    pub strand: Strand,
304    #[builder(default)]
305    pub exons: Exons,
306    #[builder(default)]
307    pub reads: Vec<ReadData>,
308    pub sequence: Option<BString>,
309    #[builder(default)]
310    pub attributes: HashMap<BString, Attribute>,
311}
312
313impl NodeData {
314    pub fn reference_start(&self) -> usize {
315        self.exons.first_exon().start
316    }
317    pub fn reference_end(&self) -> usize {
318        self.exons.last_exon().end
319    }
320    /// Converts the node data to a JSON representation
321    ///
322    /// # Arguments
323    /// * `attributes` - Optional additional attributes to include in the JSON
324    ///
325    /// # Returns
326    /// A JSON value representing the node
327    pub fn to_json(&self, attributes: Option<&[Attribute]>) -> Result<serde_json::Value> {
328        let mut data = json!({
329            "chrom": self.reference_id.to_str().unwrap(),
330            "ref_start": self.reference_start(),
331            "ref_end": self.reference_end(),
332            "strand": self.strand.to_string(),
333            "exons": format!("[{}]",  self.exons.to_string()),
334            "reads": self.reads.par_iter().map(|r| format!("{}", r) ).collect::<Vec<_>>(),
335            "id": self.id.to_str().unwrap(),
336        });
337
338        for attr in self.attributes.values() {
339            data[attr.tag.to_str().unwrap()] = match attr.attribute_type {
340                'f' => attr.as_float()?.into(),
341                'i' => attr.as_int()?.into(),
342                _ => attr.value.to_str().unwrap().into(),
343            };
344        }
345
346        if let Some(attributes) = attributes.as_ref() {
347            for attr in attributes.iter() {
348                data[attr.tag.to_str().unwrap()] = match attr.attribute_type {
349                    'f' => attr.as_float()?.into(),
350                    'i' => attr.as_int()?.into(),
351                    _ => attr.value.to_str().unwrap().into(),
352                };
353            }
354        }
355        let json = json!({"data": data});
356        Ok(json)
357    }
358
359    pub fn to_gtf(&self, attributes: Option<&[Attribute]>) -> Result<BString> {
360        // chr1    scannls exon    173867960       173867991       .       -       .       exon_id "001"; segment_id "0001"; ptc "1"; ptf "1.0"; transcript_id "3x1"; gene_id "3";
361        let mut res = vec![];
362        for (idx, exon) in self.exons.exons.iter().enumerate() {
363            let mut gtf = String::from("");
364            gtf.push_str(self.reference_id.to_str().unwrap());
365            gtf.push_str("\ttsg\texon\t");
366            gtf.push_str(&format!("{}\t{}\t", exon.start, exon.end));
367            gtf.push_str(".\t");
368            gtf.push_str(self.strand.to_string().as_str());
369            gtf.push_str("\t.\t");
370            gtf.push_str(format!("exon_id \"{:03}\"; ", idx + 1).as_str());
371
372            for attr in self.attributes.values() {
373                gtf.push_str(format!("{} \"{}\"; ", attr.tag, attr.value).as_str());
374            }
375
376            if let Some(attributes) = attributes.as_ref() {
377                for attr in attributes.iter() {
378                    gtf.push_str(format!("{} \"{}\"; ", attr.tag, attr.value).as_str());
379                }
380            }
381            res.push(gtf);
382        }
383        Ok(res.join("\n").into())
384    }
385}
386
387impl fmt::Display for NodeData {
388    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
389        write!(
390            f,
391            "N\t{}\t{}:{}:{}\t{}\t{}",
392            self.id,
393            self.reference_id,
394            self.strand,
395            self.exons,
396            self.reads
397                .iter()
398                .map(|r| r.to_string())
399                .collect::<Vec<_>>()
400                .join(","),
401            self.sequence.as_ref().unwrap_or(&"".into())
402        )
403    }
404}
405
406impl FromStr for NodeData {
407    type Err = io::Error;
408
409    fn from_str(s: &str) -> Result<Self, Self::Err> {
410        // N  <rid>:<id>  <chrom>:<strand>:<exons>  <reads>  [<seq>]
411        let fields: Vec<&str> = s.split_whitespace().collect();
412        if fields.len() < 4 {
413            return Err(io::Error::new(
414                io::ErrorKind::InvalidData,
415                format!("Invalid node line format: {}", s),
416            ));
417        }
418
419        debug!("Parsing node: {}", s);
420        let id: BString = fields[1].into();
421
422        let reference_and_exons: Vec<&str> = fields[2].split(":").collect();
423        let reference_id = reference_and_exons[0].into();
424        let strand = reference_and_exons[1].parse().map_err(|e| {
425            io::Error::new(
426                io::ErrorKind::InvalidData,
427                format!("Failed to parse strand: {}", e),
428            )
429        })?;
430        let exons = reference_and_exons[2].parse().map_err(|e| {
431            io::Error::new(
432                io::ErrorKind::InvalidData,
433                format!("Failed to parse exons: {}", e),
434            )
435        })?;
436
437        let reads = fields[3]
438            .split(',')
439            .map(|s| s.parse().context("failed to parse reads").unwrap())
440            .collect::<Vec<_>>();
441
442        let sequence = if fields.len() > 4 && !fields[4].is_empty() {
443            Some(fields[4].into())
444        } else {
445            None
446        };
447
448        Ok(NodeData {
449            id,
450            reference_id,
451            strand,
452            exons,
453            reads,
454            sequence,
455            ..Default::default()
456        })
457    }
458}
459
460#[cfg(test)]
461mod tests {
462    use super::*;
463    use ahash::HashMapExt;
464
465    #[test]
466    fn test_node_from_str() {
467        let node1 = NodeData::from_str("N\tn1\tchr1:+:1000-2000\tread1:SO").unwrap();
468        assert_eq!(node1.id, "n1");
469    }
470
471    #[test]
472    fn test_exons_introns() {
473        let exons = Exons::from_str("100-200,300-400,500-600").unwrap();
474        let introns = exons.introns();
475        assert_eq!(introns.len(), 2);
476        assert_eq!(introns[0].start, 201);
477        assert_eq!(introns[0].end, 300);
478        assert_eq!(introns[1].start, 401);
479        assert_eq!(introns[1].end, 500);
480    }
481
482    #[test]
483    fn test_exons_len() {
484        let exons = Exons::from_str("100-200,300-400,500-600").unwrap();
485        assert_eq!(exons.len(), 3);
486    }
487
488    #[test]
489    fn test_exons_span() {
490        let exons = Exons::from_str("100-200,300-400,500-600").unwrap();
491        // (200-100) + (400-300) + (600-500) = 100 + 100 + 100 = 300
492        assert_eq!(exons.span(), 300);
493    }
494
495    #[test]
496    fn test_exons_first_last() {
497        let exons = Exons::from_str("100-200,300-400,500-600").unwrap();
498        assert_eq!(exons.first_exon().start, 100);
499        assert_eq!(exons.first_exon().end, 200);
500        assert_eq!(exons.last_exon().start, 500);
501        assert_eq!(exons.last_exon().end, 600);
502    }
503
504    #[test]
505    fn test_node_reference_start_end() {
506        let node = NodeData::builder()
507            .id("node1")
508            .reference_id("chr1")
509            .exons(
510                Exons::builder()
511                    .exons(vec![
512                        Interval {
513                            start: 100,
514                            end: 200,
515                        },
516                        Interval {
517                            start: 300,
518                            end: 400,
519                        },
520                    ])
521                    .build(),
522            )
523            .build();
524        assert_eq!(node.reference_start(), 100);
525        assert_eq!(node.reference_end(), 400);
526    }
527
528    #[test]
529    fn test_node_to_json() -> Result<()> {
530        let node = NodeData {
531            id: "node1".into(),
532            reference_id: "chr1".into(),
533            strand: Strand::Forward,
534            exons: Exons {
535                exons: vec![
536                    Interval {
537                        start: 100,
538                        end: 200,
539                    },
540                    Interval {
541                        start: 300,
542                        end: 400,
543                    },
544                ],
545            },
546            reads: vec![
547                ReadData::builder().id("read1").identity("SO").build(),
548                ReadData::builder().id("read2").identity("IN").build(),
549            ],
550            attributes: {
551                let mut map = HashMap::new();
552                map.insert(
553                    "ptc".into(),
554                    Attribute {
555                        tag: "ptc".into(),
556                        attribute_type: 'Z',
557                        value: "1".into(),
558                    },
559                );
560                map.insert(
561                    "ptf".into(),
562                    Attribute {
563                        tag: "ptf".into(),
564                        attribute_type: 'Z',
565                        value: "0.0".into(),
566                    },
567                );
568                map
569            },
570            ..Default::default()
571        };
572
573        let json = node.to_json(None)?;
574        println!("{}", json);
575
576        // Check basic structure
577        assert!(json.get("data").is_some());
578        let data = json["data"].as_object().unwrap();
579
580        // Check fields
581        assert_eq!(data["chrom"], "chr1");
582        assert_eq!(data["ref_start"], 100);
583        assert_eq!(data["ref_end"], 400);
584        assert_eq!(data["strand"], "+");
585        assert_eq!(data["id"], "node1");
586        assert_eq!(data["ptc"], "1");
587        assert_eq!(data["ptf"], "0.0");
588
589        // Test with additional attributes
590        let additional_attrs = vec![Attribute {
591            tag: "is_head".into(),
592            attribute_type: 'Z',
593            value: "true".into(),
594        }];
595
596        let json_with_attrs = node.to_json(Some(&additional_attrs))?;
597        let data = json_with_attrs["data"].as_object().unwrap();
598        assert_eq!(data["is_head"], "true");
599
600        println!("{}", json_with_attrs);
601
602        Ok(())
603    }
604
605    #[test]
606    fn test_node_to_gtf() -> Result<()> {
607        let node = NodeData {
608            id: "node1".into(),
609            reference_id: "chr1".into(),
610            strand: Strand::Forward,
611            exons: Exons {
612                exons: vec![
613                    Interval {
614                        start: 100,
615                        end: 200,
616                    },
617                    Interval {
618                        start: 300,
619                        end: 400,
620                    },
621                ],
622            },
623            attributes: {
624                let mut map = HashMap::new();
625                map.insert(
626                    "segment_id".into(),
627                    Attribute {
628                        tag: "segment_id".into(),
629                        attribute_type: 'Z',
630                        value: "001".into(),
631                    },
632                );
633                map
634            },
635            ..Default::default()
636        };
637
638        let gtf = node.to_gtf(None)?;
639        let gtf_str = gtf.to_str().unwrap();
640        let lines: Vec<&str> = gtf_str.split('\n').collect();
641
642        assert_eq!(lines.len(), 2);
643        assert!(lines[0].starts_with("chr1\ttsg\texon\t100\t200\t.\t+\t.\texon_id \"001\""));
644        assert!(lines[0].contains("segment_id \"001\""));
645        assert!(lines[1].starts_with("chr1\ttsg\texon\t300\t400\t.\t+\t.\texon_id \"002\""));
646
647        // Test with additional attributes
648        let additional_attrs = vec![Attribute {
649            tag: "transcript_id".into(),
650            attribute_type: 'Z',
651            value: "1".into(),
652        }];
653
654        let gtf_with_attrs = node.to_gtf(Some(&additional_attrs))?;
655        let gtf_str = gtf_with_attrs.to_str().unwrap();
656        let lines: Vec<&str> = gtf_str.split('\n').collect();
657
658        assert!(lines[0].contains("transcript_id \"1\""));
659        assert!(lines[1].contains("transcript_id \"1\""));
660
661        Ok(())
662    }
663}