use cyanea_core::{CyaneaError, Result};
#[derive(Debug, Clone)]
pub struct PirRecord {
pub entry_type: String,
pub name: String,
pub description: String,
pub sequence: String,
}
pub fn parse_pir(input: &str) -> Result<Vec<PirRecord>> {
if input.trim().is_empty() {
return Ok(Vec::new());
}
let mut records = Vec::new();
let mut lines = input.lines().peekable();
while let Some(line) = lines.next() {
let line = line.trim();
if line.is_empty() {
continue;
}
if !line.starts_with('>') {
continue;
}
let header = &line[1..]; let (entry_type, name) = match header.split_once(';') {
Some((et, n)) => (et.trim().to_string(), n.trim().to_string()),
None => {
return Err(CyaneaError::Parse(format!(
"invalid PIR header (missing ';'): '{}'",
line
)));
}
};
let description = match lines.next() {
Some(desc) => desc.trim().to_string(),
None => {
return Err(CyaneaError::Parse(format!(
"unexpected end of input after PIR header for '{}'",
name
)));
}
};
let mut sequence = String::new();
let mut found_terminator = false;
while let Some(seq_line) = lines.next() {
let seq_line = seq_line.trim();
if seq_line.is_empty() {
continue;
}
if let Some(pos) = seq_line.find('*') {
let fragment: String = seq_line[..pos]
.chars()
.filter(|c| !c.is_whitespace())
.collect();
sequence.push_str(&fragment);
found_terminator = true;
break;
} else {
let fragment: String = seq_line
.chars()
.filter(|c| !c.is_whitespace())
.collect();
sequence.push_str(&fragment);
}
}
if !found_terminator {
return Err(CyaneaError::Parse(format!(
"missing sequence terminator '*' for PIR record '{}'",
name
)));
}
records.push(PirRecord {
entry_type,
name,
description,
sequence,
});
}
Ok(records)
}
pub fn write_pir(records: &[PirRecord]) -> String {
const LINE_WIDTH: usize = 60;
let mut out = String::new();
for rec in records {
out.push_str(&format!(">{};{}\n", rec.entry_type, rec.name));
out.push_str(&rec.description);
out.push('\n');
let seq = &rec.sequence;
let mut pos = 0;
while pos < seq.len() {
let end = std::cmp::min(pos + LINE_WIDTH, seq.len());
out.push_str(&seq[pos..end]);
if end == seq.len() {
out.push('*');
}
out.push('\n');
pos = end;
}
if seq.is_empty() {
out.push_str("*\n");
}
}
out
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn pir_round_trip() {
let rec = PirRecord {
entry_type: "P1".to_string(),
name: "CRAB_ANAPL".to_string(),
description: "Alpha A crystallin - Atlantic crab".to_string(),
sequence: "MFNIFFHDFKAPLNASSQVIFEKGESK".to_string(),
};
let written = write_pir(&[rec]);
let parsed = parse_pir(&written).unwrap();
assert_eq!(parsed.len(), 1);
assert_eq!(parsed[0].entry_type, "P1");
assert_eq!(parsed[0].name, "CRAB_ANAPL");
assert_eq!(parsed[0].description, "Alpha A crystallin - Atlantic crab");
assert_eq!(parsed[0].sequence, "MFNIFFHDFKAPLNASSQVIFEKGESK");
}
#[test]
fn pir_multi_record() {
let input = "\
>P1;CRAB_ANAPL
Alpha A crystallin - Atlantic crab
MFNIFFHDFK*
>P1;CRAB_BOVIN
Alpha crystallin - Bovine
MDIAIHHPWI*
>DL;DNA_TEST
Test DNA sequence
ACGTACGT*
";
let records = parse_pir(input).unwrap();
assert_eq!(records.len(), 3);
assert_eq!(records[0].name, "CRAB_ANAPL");
assert_eq!(records[0].sequence, "MFNIFFHDFK");
assert_eq!(records[1].name, "CRAB_BOVIN");
assert_eq!(records[1].sequence, "MDIAIHHPWI");
assert_eq!(records[2].entry_type, "DL");
assert_eq!(records[2].name, "DNA_TEST");
assert_eq!(records[2].sequence, "ACGTACGT");
}
#[test]
fn pir_various_entry_types() {
let input = "\
>F1;FRAG1
Protein fragment
ACDEF*
>DL;DNA1
Linear DNA
ACGT*
>N1;RNA1
RNA sequence
ACGU*
";
let records = parse_pir(input).unwrap();
assert_eq!(records.len(), 3);
assert_eq!(records[0].entry_type, "F1");
assert_eq!(records[1].entry_type, "DL");
assert_eq!(records[2].entry_type, "N1");
}
#[test]
fn pir_missing_terminator_error() {
let input = ">P1;TEST\nTest protein\nMASHAKE\n";
let result = parse_pir(input);
assert!(result.is_err());
let err = result.unwrap_err().to_string();
assert!(err.contains("terminator"));
}
#[test]
fn pir_empty_input() {
let records = parse_pir("").unwrap();
assert!(records.is_empty());
let records = parse_pir(" \n\n ").unwrap();
assert!(records.is_empty());
}
#[test]
fn pir_multiline_sequence() {
let input = "\
>P1;LONGSEQ
A long protein
MFNIFFHDFKAPLNASSQVIFEKGESKFVNQDVKEMEEQTQAFNKHSSGFGRNYQD
ADVDVAYGFSSGGPTIAAGK*
";
let records = parse_pir(input).unwrap();
assert_eq!(records.len(), 1);
assert_eq!(records[0].sequence.len(), 76);
assert!(records[0].sequence.starts_with("MFNIFFHDFK"));
assert!(records[0].sequence.ends_with("AAGK"));
}
}