1use std::fmt;
2use std::str::FromStr;
3
4use crate::graph::Attribute;
5use ahash::HashMap;
6use anyhow::Context;
7use anyhow::Result;
8use bon::Builder;
9use bon::builder;
10use bstr::BString;
11use bstr::ByteSlice;
12use rayon::prelude::*;
13use serde_json::json;
14use std::io;
15use tracing::debug;
16
17#[derive(Debug, Builder, Clone)]
25pub struct Interval {
26 pub start: usize,
27 pub end: usize,
28}
29
30impl Interval {
31 pub fn span(&self) -> usize {
38 self.end - self.start
39 }
40}
41
42impl FromStr for Interval {
43 type Err = io::Error;
44
45 fn from_str(s: &str) -> Result<Self, Self::Err> {
46 let parts: Vec<&str> = s.split('-').collect();
47 if parts.len() != 2 {
48 return Err(io::Error::new(
49 io::ErrorKind::InvalidData,
50 format!("Invalid exon coordinates format: {}", s),
51 ));
52 }
53
54 let start = parts[0].parse::<usize>().map_err(|e| {
55 io::Error::new(
56 io::ErrorKind::InvalidData,
57 format!("Invalid start coordinate: {}", e),
58 )
59 })?;
60
61 let end = parts[1].parse::<usize>().map_err(|e| {
62 io::Error::new(
63 io::ErrorKind::InvalidData,
64 format!("Invalid end coordinate: {}", e),
65 )
66 })?;
67
68 Ok(Self { start, end })
69 }
70}
71
72#[derive(Debug, Builder, Clone, Default)]
73pub struct Exons {
82 pub exons: Vec<Interval>,
83}
84
85impl FromStr for Exons {
86 type Err = io::Error;
87 fn from_str(s: &str) -> Result<Self, Self::Err> {
88 let exons = s
89 .split(',')
90 .map(|x| x.parse())
91 .collect::<Result<Vec<Interval>, Self::Err>>()?;
92 Ok(Exons { exons })
93 }
94}
95
96impl fmt::Display for Exons {
97 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
98 let exons = self
99 .exons
100 .iter()
101 .map(|x| format!("{}-{}", x.start, x.end))
102 .collect::<Vec<String>>()
103 .join(",");
104 write!(f, "{}", exons)
105 }
106}
107
108impl Exons {
124 pub fn introns(&self) -> Vec<Interval> {
133 let mut introns = Vec::with_capacity(self.exons.len().saturating_sub(1));
134 for i in 0..self.exons.len().saturating_sub(1) {
135 introns.push(Interval {
136 start: self.exons[i].end + 1,
137 end: self.exons[i + 1].start,
138 });
139 }
140 introns
141 }
142
143 pub fn is_empty(&self) -> bool {
148 self.exons.is_empty()
149 }
150
151 pub fn len(&self) -> usize {
156 self.exons.len()
157 }
158
159 pub fn span(&self) -> usize {
167 self.exons.iter().map(|e| e.span()).sum()
168 }
169
170 pub fn first_exon(&self) -> &Interval {
178 &self.exons[0]
179 }
180
181 pub fn last_exon(&self) -> &Interval {
189 &self.exons[self.exons.len() - 1]
190 }
191}
192
193#[allow(clippy::duplicated_attributes)]
194#[derive(Debug, Clone, Builder, PartialEq)]
195#[builder(on(BString, into))] #[builder(on(ReadIdentity, into))] pub struct ReadData {
198 pub id: BString,
199 pub identity: ReadIdentity,
200}
201
202impl fmt::Display for ReadData {
203 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
204 write!(f, "{}:{:?}", self.id, self.identity)
205 }
206}
207
208impl FromStr for ReadData {
209 type Err = io::Error;
210
211 fn from_str(s: &str) -> Result<Self, Self::Err> {
212 let fields: Vec<&str> = s.split(':').collect();
214 if fields.len() != 2 {
215 return Err(io::Error::new(
216 io::ErrorKind::InvalidData,
217 format!("Invalid read line format: {}", s),
218 ));
219 }
220
221 let id: BString = fields[0].into();
222 let identity = fields[1].parse()?;
223 Ok(Self { id, identity })
224 }
225}
226
227#[derive(Debug, Clone, PartialEq)]
228pub enum ReadIdentity {
229 SO, IN, SI, }
233
234impl fmt::Display for ReadIdentity {
235 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
236 match self {
237 ReadIdentity::SO => write!(f, "SO"),
238 ReadIdentity::IN => write!(f, "IN"),
239 ReadIdentity::SI => write!(f, "SI"),
240 }
241 }
242}
243
244impl FromStr for ReadIdentity {
245 type Err = io::Error;
246
247 fn from_str(s: &str) -> Result<Self, Self::Err> {
248 match s {
249 "SO" => Ok(ReadIdentity::SO),
250 "IN" => Ok(ReadIdentity::IN),
251 "SI" => Ok(ReadIdentity::SI),
252 _ => Err(io::Error::new(
253 io::ErrorKind::InvalidData,
254 format!("Invalid read identity: {}", s),
255 )),
256 }
257 }
258}
259
260impl From<&str> for ReadIdentity {
261 fn from(s: &str) -> Self {
262 s.parse().unwrap()
263 }
264}
265
266#[derive(Debug, Clone, Copy, Default, PartialEq, Eq)]
268pub enum Strand {
269 #[default]
270 Forward,
271 Reverse,
272}
273
274impl FromStr for Strand {
275 type Err = anyhow::Error;
276
277 fn from_str(s: &str) -> Result<Self, Self::Err> {
278 match s {
279 "+" => Ok(Strand::Forward),
280 "-" => Ok(Strand::Reverse),
281 _ => Err(anyhow::anyhow!("Invalid strand: {}", s)),
282 }
283 }
284}
285
286impl fmt::Display for Strand {
287 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
288 match self {
289 Strand::Forward => write!(f, "+"),
290 Strand::Reverse => write!(f, "-"),
291 }
292 }
293}
294
295#[derive(Debug, Clone, Default, Builder)]
297#[builder(on(BString, into))]
298pub struct NodeData {
299 pub id: BString,
300 #[builder(default)]
301 pub reference_id: BString,
302 #[builder(default)]
303 pub strand: Strand,
304 #[builder(default)]
305 pub exons: Exons,
306 #[builder(default)]
307 pub reads: Vec<ReadData>,
308 pub sequence: Option<BString>,
309 #[builder(default)]
310 pub attributes: HashMap<BString, Attribute>,
311}
312
313impl NodeData {
314 pub fn reference_start(&self) -> usize {
315 self.exons.first_exon().start
316 }
317 pub fn reference_end(&self) -> usize {
318 self.exons.last_exon().end
319 }
320 pub fn to_json(&self, attributes: Option<&[Attribute]>) -> Result<serde_json::Value> {
328 let mut data = json!({
329 "chrom": self.reference_id.to_str().unwrap(),
330 "ref_start": self.reference_start(),
331 "ref_end": self.reference_end(),
332 "strand": self.strand.to_string(),
333 "exons": format!("[{}]", self.exons.to_string()),
334 "reads": self.reads.par_iter().map(|r| format!("{}", r) ).collect::<Vec<_>>(),
335 "id": self.id.to_str().unwrap(),
336 });
337
338 for attr in self.attributes.values() {
339 data[attr.tag.to_str().unwrap()] = match attr.attribute_type {
340 'f' => attr.as_float()?.into(),
341 'i' => attr.as_int()?.into(),
342 _ => attr.value.to_str().unwrap().into(),
343 };
344 }
345
346 if let Some(attributes) = attributes.as_ref() {
347 for attr in attributes.iter() {
348 data[attr.tag.to_str().unwrap()] = match attr.attribute_type {
349 'f' => attr.as_float()?.into(),
350 'i' => attr.as_int()?.into(),
351 _ => attr.value.to_str().unwrap().into(),
352 };
353 }
354 }
355 let json = json!({"data": data});
356 Ok(json)
357 }
358
359 pub fn to_gtf(&self, attributes: Option<&[Attribute]>) -> Result<BString> {
360 let mut res = vec![];
362 for (idx, exon) in self.exons.exons.iter().enumerate() {
363 let mut gtf = String::from("");
364 gtf.push_str(self.reference_id.to_str().unwrap());
365 gtf.push_str("\ttsg\texon\t");
366 gtf.push_str(&format!("{}\t{}\t", exon.start, exon.end));
367 gtf.push_str(".\t");
368 gtf.push_str(self.strand.to_string().as_str());
369 gtf.push_str("\t.\t");
370 gtf.push_str(format!("exon_id \"{:03}\"; ", idx + 1).as_str());
371
372 for attr in self.attributes.values() {
373 gtf.push_str(format!("{} \"{}\"; ", attr.tag, attr.value).as_str());
374 }
375
376 if let Some(attributes) = attributes.as_ref() {
377 for attr in attributes.iter() {
378 gtf.push_str(format!("{} \"{}\"; ", attr.tag, attr.value).as_str());
379 }
380 }
381 res.push(gtf);
382 }
383 Ok(res.join("\n").into())
384 }
385}
386
387impl fmt::Display for NodeData {
388 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
389 write!(
390 f,
391 "N\t{}\t{}:{}:{}\t{}\t{}",
392 self.id,
393 self.reference_id,
394 self.strand,
395 self.exons,
396 self.reads
397 .iter()
398 .map(|r| r.to_string())
399 .collect::<Vec<_>>()
400 .join(","),
401 self.sequence.as_ref().unwrap_or(&"".into())
402 )
403 }
404}
405
406impl FromStr for NodeData {
407 type Err = io::Error;
408
409 fn from_str(s: &str) -> Result<Self, Self::Err> {
410 let fields: Vec<&str> = s.split_whitespace().collect();
412 if fields.len() < 4 {
413 return Err(io::Error::new(
414 io::ErrorKind::InvalidData,
415 format!("Invalid node line format: {}", s),
416 ));
417 }
418
419 debug!("Parsing node: {}", s);
420 let id: BString = fields[1].into();
421
422 let reference_and_exons: Vec<&str> = fields[2].split(":").collect();
423 let reference_id = reference_and_exons[0].into();
424 let strand = reference_and_exons[1].parse().map_err(|e| {
425 io::Error::new(
426 io::ErrorKind::InvalidData,
427 format!("Failed to parse strand: {}", e),
428 )
429 })?;
430 let exons = reference_and_exons[2].parse().map_err(|e| {
431 io::Error::new(
432 io::ErrorKind::InvalidData,
433 format!("Failed to parse exons: {}", e),
434 )
435 })?;
436
437 let reads = fields[3]
438 .split(',')
439 .map(|s| s.parse().context("failed to parse reads").unwrap())
440 .collect::<Vec<_>>();
441
442 let sequence = if fields.len() > 4 && !fields[4].is_empty() {
443 Some(fields[4].into())
444 } else {
445 None
446 };
447
448 Ok(NodeData {
449 id,
450 reference_id,
451 strand,
452 exons,
453 reads,
454 sequence,
455 ..Default::default()
456 })
457 }
458}
459
460#[cfg(test)]
461mod tests {
462 use super::*;
463 use ahash::HashMapExt;
464
465 #[test]
466 fn test_node_from_str() {
467 let node1 = NodeData::from_str("N\tn1\tchr1:+:1000-2000\tread1:SO").unwrap();
468 assert_eq!(node1.id, "n1");
469 }
470
471 #[test]
472 fn test_exons_introns() {
473 let exons = Exons::from_str("100-200,300-400,500-600").unwrap();
474 let introns = exons.introns();
475 assert_eq!(introns.len(), 2);
476 assert_eq!(introns[0].start, 201);
477 assert_eq!(introns[0].end, 300);
478 assert_eq!(introns[1].start, 401);
479 assert_eq!(introns[1].end, 500);
480 }
481
482 #[test]
483 fn test_exons_len() {
484 let exons = Exons::from_str("100-200,300-400,500-600").unwrap();
485 assert_eq!(exons.len(), 3);
486 }
487
488 #[test]
489 fn test_exons_span() {
490 let exons = Exons::from_str("100-200,300-400,500-600").unwrap();
491 assert_eq!(exons.span(), 300);
493 }
494
495 #[test]
496 fn test_exons_first_last() {
497 let exons = Exons::from_str("100-200,300-400,500-600").unwrap();
498 assert_eq!(exons.first_exon().start, 100);
499 assert_eq!(exons.first_exon().end, 200);
500 assert_eq!(exons.last_exon().start, 500);
501 assert_eq!(exons.last_exon().end, 600);
502 }
503
504 #[test]
505 fn test_node_reference_start_end() {
506 let node = NodeData::builder()
507 .id("node1")
508 .reference_id("chr1")
509 .exons(
510 Exons::builder()
511 .exons(vec![
512 Interval {
513 start: 100,
514 end: 200,
515 },
516 Interval {
517 start: 300,
518 end: 400,
519 },
520 ])
521 .build(),
522 )
523 .build();
524 assert_eq!(node.reference_start(), 100);
525 assert_eq!(node.reference_end(), 400);
526 }
527
528 #[test]
529 fn test_node_to_json() -> Result<()> {
530 let node = NodeData {
531 id: "node1".into(),
532 reference_id: "chr1".into(),
533 strand: Strand::Forward,
534 exons: Exons {
535 exons: vec![
536 Interval {
537 start: 100,
538 end: 200,
539 },
540 Interval {
541 start: 300,
542 end: 400,
543 },
544 ],
545 },
546 reads: vec![
547 ReadData::builder().id("read1").identity("SO").build(),
548 ReadData::builder().id("read2").identity("IN").build(),
549 ],
550 attributes: {
551 let mut map = HashMap::new();
552 map.insert(
553 "ptc".into(),
554 Attribute {
555 tag: "ptc".into(),
556 attribute_type: 'Z',
557 value: "1".into(),
558 },
559 );
560 map.insert(
561 "ptf".into(),
562 Attribute {
563 tag: "ptf".into(),
564 attribute_type: 'Z',
565 value: "0.0".into(),
566 },
567 );
568 map
569 },
570 ..Default::default()
571 };
572
573 let json = node.to_json(None)?;
574 println!("{}", json);
575
576 assert!(json.get("data").is_some());
578 let data = json["data"].as_object().unwrap();
579
580 assert_eq!(data["chrom"], "chr1");
582 assert_eq!(data["ref_start"], 100);
583 assert_eq!(data["ref_end"], 400);
584 assert_eq!(data["strand"], "+");
585 assert_eq!(data["id"], "node1");
586 assert_eq!(data["ptc"], "1");
587 assert_eq!(data["ptf"], "0.0");
588
589 let additional_attrs = vec![Attribute {
591 tag: "is_head".into(),
592 attribute_type: 'Z',
593 value: "true".into(),
594 }];
595
596 let json_with_attrs = node.to_json(Some(&additional_attrs))?;
597 let data = json_with_attrs["data"].as_object().unwrap();
598 assert_eq!(data["is_head"], "true");
599
600 println!("{}", json_with_attrs);
601
602 Ok(())
603 }
604
605 #[test]
606 fn test_node_to_gtf() -> Result<()> {
607 let node = NodeData {
608 id: "node1".into(),
609 reference_id: "chr1".into(),
610 strand: Strand::Forward,
611 exons: Exons {
612 exons: vec![
613 Interval {
614 start: 100,
615 end: 200,
616 },
617 Interval {
618 start: 300,
619 end: 400,
620 },
621 ],
622 },
623 attributes: {
624 let mut map = HashMap::new();
625 map.insert(
626 "segment_id".into(),
627 Attribute {
628 tag: "segment_id".into(),
629 attribute_type: 'Z',
630 value: "001".into(),
631 },
632 );
633 map
634 },
635 ..Default::default()
636 };
637
638 let gtf = node.to_gtf(None)?;
639 let gtf_str = gtf.to_str().unwrap();
640 let lines: Vec<&str> = gtf_str.split('\n').collect();
641
642 assert_eq!(lines.len(), 2);
643 assert!(lines[0].starts_with("chr1\ttsg\texon\t100\t200\t.\t+\t.\texon_id \"001\""));
644 assert!(lines[0].contains("segment_id \"001\""));
645 assert!(lines[1].starts_with("chr1\ttsg\texon\t300\t400\t.\t+\t.\texon_id \"002\""));
646
647 let additional_attrs = vec![Attribute {
649 tag: "transcript_id".into(),
650 attribute_type: 'Z',
651 value: "1".into(),
652 }];
653
654 let gtf_with_attrs = node.to_gtf(Some(&additional_attrs))?;
655 let gtf_str = gtf_with_attrs.to_str().unwrap();
656 let lines: Vec<&str> = gtf_str.split('\n').collect();
657
658 assert!(lines[0].contains("transcript_id \"1\""));
659 assert!(lines[1].contains("transcript_id \"1\""));
660
661 Ok(())
662 }
663}