use cyanea_core::Result;
/// One entry of an EMBL flat file.
///
/// All fields are plain owned text; nothing is validated against the EMBL
/// specification. `Default` yields a record with every field empty, and
/// `PartialEq`/`Eq` compare all fields, which makes whole-record assertions
/// (`assert_eq!(parsed, expected)`) possible.
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct EmblRecord {
    /// Entry name: the first token of the `ID` line.
    pub id: String,
    /// Accession text of the `AC` line, without the trailing `;`.
    pub accession: String,
    /// Free-text description; consecutive `DE` lines are joined with a single space.
    pub description: String,
    /// Sequence letters only, with layout blanks and base counters removed.
    pub sequence: String,
    /// `(feature key, location)` pairs taken from `FT` lines; qualifier
    /// lines (`/name=value`) are not stored.
    pub features: Vec<(String, String)>,
}
/// Parses EMBL flat-file text into a list of records.
///
/// Recognised line codes are `ID`, `AC`, `DE`, `FT` and `SQ`; a line starting
/// with `//` terminates the current record. Every other line is ignored.
///
/// * `ID` – the first token (up to `;` or whitespace) becomes the record id.
/// * `AC` – the first non-empty accession text wins; trailing `;` is removed.
/// * `DE` – lines are concatenated with a single space.
/// * `FT` – lines whose payload starts with `/` (qualifiers) are skipped;
///   otherwise the first token is the feature key and the remainder the
///   location.
/// * `SQ` – switches to sequence mode: until `//`, every ASCII letter on a
///   line is appended to the sequence and everything else is dropped.
///
/// A line code must be followed by a space or a tab. The amount of whitespace
/// between the code and the payload is not significant, so both the standard
/// fixed-column form (`"ID   NAME"`) and a compact form (`"ID NAME"`) are
/// accepted, and short or non-ASCII lines are handled without panicking.
///
/// A record is emitted only if it has a non-empty id or sequence. Blank or
/// whitespace-only input yields an empty vector.
///
/// # Errors
///
/// The current implementation never produces an `Err`; malformed lines are
/// skipped rather than reported.
pub fn parse_embl(input: &str) -> Result<Vec<EmblRecord>> {
    let mut records = Vec::new();
    let mut builder = RecordBuilder::new();
    let mut in_sequence = false;

    for line in input.lines() {
        if line.starts_with("//") {
            // Swap in a fresh builder so the finished one can be consumed.
            let finished = std::mem::replace(&mut builder, RecordBuilder::new());
            if finished.has_data() {
                records.push(finished.build());
            }
            in_sequence = false;
            continue;
        }

        if in_sequence {
            // Sequence rows carry blanks and a trailing base counter; keep letters only.
            builder
                .sequence
                .extend(line.chars().filter(char::is_ascii_alphabetic));
            continue;
        }

        if let Some(body) = embl_line_body(line, "ID") {
            builder.id = body
                .split(|c: char| c == ';' || c.is_whitespace())
                .next()
                .unwrap_or("")
                .to_string();
        } else if let Some(body) = embl_line_body(line, "AC") {
            // Only the first AC line that yields text is kept.
            if builder.accession.is_empty() {
                builder.accession = body
                    .trim_end()
                    .trim_end_matches(';')
                    .trim_end()
                    .to_string();
            }
        } else if let Some(body) = embl_line_body(line, "DE") {
            if !builder.description.is_empty() {
                builder.description.push(' ');
            }
            builder.description.push_str(body.trim_end());
        } else if let Some(body) = embl_line_body(line, "FT") {
            // Qualifier lines ("/gene=...") and empty FT lines carry no key/location pair.
            if !body.is_empty() && !body.starts_with('/') {
                let (key, location) = match body.split_once(char::is_whitespace) {
                    Some((key, rest)) => (key, rest.trim()),
                    None => (body, ""),
                };
                builder
                    .features
                    .push((key.to_string(), location.to_string()));
            }
        } else if line == "SQ" || embl_line_body(line, "SQ").is_some() {
            in_sequence = true;
        }
    }

    // Input may end without a closing "//".
    if builder.has_data() {
        records.push(builder.build());
    }
    Ok(records)
}

/// Returns the payload of `line` when it starts with the two-letter line
/// `code` followed by a space or tab, with the separating whitespace removed.
///
/// Slicing is done relative to the code itself rather than at a fixed byte
/// offset, so short lines and multi-byte characters cannot cause a panic.
fn embl_line_body<'a>(line: &'a str, code: &str) -> Option<&'a str> {
    let rest = line.strip_prefix(code)?;
    if rest.starts_with(|c: char| c == ' ' || c == '\t') {
        Some(rest.trim_start())
    } else {
        None
    }
}
pub fn write_embl(records: &[EmblRecord]) -> String {
let mut out = String::new();
for rec in records {
out.push_str(&format!("ID {}; SV 1; linear; DNA; STD; UNC; {} BP.\n",
rec.id, rec.sequence.len()));
out.push_str(&format!("AC {};\n", rec.accession));
out.push_str(&format!("DE {}\n", rec.description));
for (key, location) in &rec.features {
out.push_str(&format!("FT {:<16}{}\n", key, location));
}
out.push_str(&format!("SQ Sequence {} BP;\n", rec.sequence.len()));
let seq_bytes = rec.sequence.as_bytes();
let mut pos = 0;
while pos < seq_bytes.len() {
out.push_str(" ");
let line_end = std::cmp::min(pos + 60, seq_bytes.len());
let mut col = 0;
for i in pos..line_end {
out.push(seq_bytes[i] as char);
col += 1;
if col % 10 == 0 && i + 1 < line_end {
out.push(' ');
}
}
let chars_written = (line_end - pos) + ((line_end - pos).saturating_sub(1)) / 10;
let target_width = 60 + 5; for _ in chars_written..target_width {
out.push(' ');
}
out.push_str(&format!(" {}\n", line_end));
pos = line_end;
}
out.push_str("//\n");
}
out
}
/// Mutable scratch space that collects the fields of one record while its
/// lines are being read; converted into an [`EmblRecord`] by [`build`].
///
/// [`build`]: RecordBuilder::build
#[derive(Default)]
struct RecordBuilder {
    id: String,
    accession: String,
    description: String,
    sequence: String,
    features: Vec<(String, String)>,
}

impl RecordBuilder {
    /// Creates a builder with every field empty.
    fn new() -> Self {
        Self::default()
    }

    /// Reports whether anything worth emitting was collected: a record
    /// counts as present once it has an id or at least one sequence letter.
    fn has_data(&self) -> bool {
        !(self.id.is_empty() && self.sequence.is_empty())
    }

    /// Consumes the builder and hands its fields over to an `EmblRecord`.
    fn build(self) -> EmblRecord {
        let Self {
            id,
            accession,
            description,
            sequence,
            features,
        } = self;
        EmblRecord {
            id,
            accession,
            description,
            sequence,
            features,
        }
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    // Fixtures are assembled with `concat!`, one literal per EMBL line, so the
    // fixed-column layout (line code + three blanks, five-blank sequence
    // indent) is explicit and cannot be altered by source indentation.

    /// Writing a record and parsing the result must give back every field.
    #[test]
    fn embl_round_trip() {
        let rec = EmblRecord {
            id: "HSBGLOBIN".to_string(),
            accession: "V00497".to_string(),
            description: "Human beta-globin gene.".to_string(),
            sequence: "acgtacgtacgtacgt".to_string(),
            features: vec![("gene".to_string(), "1..16".to_string())],
        };

        let written = write_embl(&[rec]);
        let parsed = parse_embl(&written).unwrap();

        assert_eq!(parsed.len(), 1);
        assert_eq!(parsed[0].id, "HSBGLOBIN");
        assert_eq!(parsed[0].accession, "V00497");
        assert_eq!(parsed[0].description, "Human beta-globin gene.");
        assert_eq!(parsed[0].sequence, "acgtacgtacgtacgt");
        assert_eq!(parsed[0].features.len(), 1);
        assert_eq!(parsed[0].features[0].0, "gene");
        assert_eq!(parsed[0].features[0].1, "1..16");
    }

    /// Each `//` terminator closes one record; fields must not leak between records.
    #[test]
    fn embl_multi_record() {
        let input = concat!(
            "ID   REC1; SV 1; linear; DNA; STD; HUM; 10 BP.\n",
            "AC   X00001;\n",
            "DE   First record.\n",
            "SQ   Sequence 10 BP;\n",
            "     acgtacgtac        10\n",
            "//\n",
            "ID   REC2; SV 1; linear; DNA; STD; HUM; 8 BP.\n",
            "AC   X00002;\n",
            "DE   Second record.\n",
            "SQ   Sequence 8 BP;\n",
            "     tgcatgca         8\n",
            "//\n",
        );

        let records = parse_embl(input).unwrap();

        assert_eq!(records.len(), 2);
        assert_eq!(records[0].id, "REC1");
        assert_eq!(records[0].accession, "X00001");
        assert_eq!(records[0].sequence, "acgtacgtac");
        assert_eq!(records[1].id, "REC2");
        assert_eq!(records[1].accession, "X00002");
        assert_eq!(records[1].sequence, "tgcatgca");
    }

    /// Feature key/location lines are collected; qualifier lines are skipped.
    #[test]
    fn embl_feature_extraction() {
        let input = concat!(
            "ID   FEAT1; SV 1; linear; DNA; STD; HUM; 20 BP.\n",
            "AC   Y00001;\n",
            "DE   Feature test.\n",
            "FT   gene            1..20\n",
            "FT                   /gene=\"TP53\"\n",
            "FT   CDS             join(1..10,15..20)\n",
            "FT                   /protein_id=\"AAA001.1\"\n",
            "SQ   Sequence 20 BP;\n",
            "     acgtacgtac acgtacgtac        20\n",
            "//\n",
        );

        let records = parse_embl(input).unwrap();

        assert_eq!(records[0].features.len(), 2);
        assert_eq!(records[0].features[0].0, "gene");
        assert_eq!(records[0].features[0].1, "1..20");
        assert_eq!(records[0].features[1].0, "CDS");
        assert_eq!(records[0].features[1].1, "join(1..10,15..20)");
    }

    /// Empty and whitespace-only input produce no records and no error.
    #[test]
    fn embl_empty_input() {
        let records = parse_embl("").unwrap();
        assert!(records.is_empty());

        let records = parse_embl("  \n\n  ").unwrap();
        assert!(records.is_empty());
    }

    /// Group separators and the trailing base counter are stripped from sequence rows.
    #[test]
    fn embl_sequence_whitespace_extraction() {
        let input = concat!(
            "ID   SEQTEST; SV 1; linear; DNA; STD; UNC; 30 BP.\n",
            "AC   Z99999;\n",
            "DE   Whitespace test.\n",
            "SQ   Sequence 30 BP;\n",
            "     aaaaaaaaaa cccccccccc gggggggggg        30\n",
            "//\n",
        );

        let records = parse_embl(input).unwrap();

        assert_eq!(records[0].sequence, "aaaaaaaaaaccccccccccgggggggggg");
        assert_eq!(records[0].sequence.len(), 30);
    }
}