use serde::{Deserialize, Serialize};
use std::fmt::Display;
use std::path::PathBuf;
use super::algorithms::{canonicalize_json, md5, sha512t24u};
use super::alphabet::{AlphabetType, guess_alphabet};
/// Descriptive information about one sequence (name, length, digests and
/// alphabet), kept separately from the sequence bytes themselves.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SequenceMetadata {
    /// Sequence name.
    pub name: String,
    /// Optional free-text description; deserializes to `None` when the field
    /// is absent from the input.
    #[serde(default)]
    pub description: Option<String>,
    /// Sequence length (`digest_sequence` records the number of input bytes).
    pub length: usize,
    /// Digest string as returned by the `sha512t24u` helper.
    pub sha512t24u: String,
    /// Digest string as returned by the `md5` helper.
    pub md5: String,
    /// Alphabet classification of the sequence.
    pub alphabet: AlphabetType,
    /// Optional index information; see [`FaiMetadata`].
    pub fai: Option<FaiMetadata>,
}
impl Default for SequenceMetadata {
fn default() -> Self {
Self {
name: String::new(),
description: None,
length: 0,
sha512t24u: String::new(),
md5: String::new(),
alphabet: AlphabetType::Ascii,
fai: None,
}
}
}
/// Per-sequence index numbers attached to [`SequenceMetadata`].
///
/// NOTE(review): the field names line up with the offset / line-bases /
/// line-width columns of a FASTA `.fai` index; confirm that the values carry
/// the same meaning here.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct FaiMetadata {
    /// Presumably the byte offset of the sequence's first base — TODO confirm.
    pub offset: u64,
    /// Presumably the number of bases per line — TODO confirm.
    pub line_bases: u32,
    /// Presumably the number of bytes per line, terminator included — TODO confirm.
    pub line_bytes: u32,
}
/// A sequence that is either known only by its metadata or fully loaded.
#[derive(Debug, Clone)]
pub enum SequenceRecord {
    /// Metadata only; no sequence bytes are held.
    Stub(SequenceMetadata),
    /// Metadata together with the sequence bytes.
    Full {
        /// Descriptive information about the sequence.
        metadata: SequenceMetadata,
        /// The sequence bytes (raw or encoded; see `SequenceRecord::decode`).
        sequence: Vec<u8>,
    },
}
impl SequenceRecord {
    /// Returns the metadata, which both variants carry.
    pub fn metadata(&self) -> &SequenceMetadata {
        match self {
            Self::Stub(metadata) | Self::Full { metadata, .. } => metadata,
        }
    }

    /// Returns the stored sequence bytes, or `None` for a stub.
    pub fn sequence(&self) -> Option<&[u8]> {
        if let Self::Full { sequence, .. } = self {
            Some(sequence)
        } else {
            None
        }
    }

    /// Returns `true` when the record holds sequence bytes (`Full` variant).
    pub fn is_loaded(&self) -> bool {
        match self {
            Self::Full { .. } => true,
            Self::Stub(_) => false,
        }
    }

    /// Consumes the record and returns a `Full` record with the same metadata
    /// and the given bytes; any bytes previously held are dropped.
    pub fn with_data(self, sequence: Vec<u8>) -> Self {
        match self {
            Self::Stub(metadata) | Self::Full { metadata, .. } => Self::Full { metadata, sequence },
        }
    }

    /// Stores `sequence` in this record in place.
    ///
    /// A stub is turned into a `Full` record; a `Full` record keeps its
    /// metadata and has its bytes replaced.
    pub fn load_data(&mut self, sequence: Vec<u8>) {
        match self {
            Self::Full { sequence: held, .. } => *held = sequence,
            Self::Stub(stub_metadata) => {
                // Move the metadata out of the stub (a default value is left
                // behind momentarily) before overwriting the whole record.
                let metadata = std::mem::take(stub_metadata);
                *self = Self::Full { metadata, sequence };
            }
        }
    }

    /// Returns the sequence as text, or `None` for a stub or when the bytes
    /// are not valid UTF-8.
    ///
    /// * `Ascii` alphabet: the stored bytes are converted directly.
    /// * Other alphabets: when the buffer holds exactly `metadata.length`
    ///   bytes and is valid UTF-8 it is returned unchanged (presumably the
    ///   data was stored unencoded — confirm against the encoder); otherwise
    ///   the bytes are passed through `decode_substring_from_bytes` for the
    ///   range `0..metadata.length`.
    pub fn decode(&self) -> Option<String> {
        use super::alphabet::lookup_alphabet;
        use super::encoder::decode_substring_from_bytes;

        let (metadata, bytes) = match self {
            Self::Full { metadata, sequence } => (metadata, sequence),
            Self::Stub(_) => return None,
        };
        // Interprets the stored buffer itself as UTF-8 text.
        let stored_as_text = || String::from_utf8(bytes.clone()).ok();

        if metadata.alphabet == AlphabetType::Ascii {
            return stored_as_text();
        }

        let alphabet = lookup_alphabet(&metadata.alphabet);
        if bytes.len() == metadata.length {
            if let Some(text) = stored_as_text() {
                return Some(text);
            }
        }

        let decoded = decode_substring_from_bytes(bytes, 0, metadata.length, alphabet);
        String::from_utf8(decoded).ok()
    }
}
impl Display for SequenceRecord {
    /// Writes a one-line summary: name, length, alphabet and both digests.
    ///
    /// The digests are `String`s formatted with `{:02x?}`, so they are printed
    /// in `Debug` form, i.e. surrounded by double quotes.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let meta = self.metadata();
        write!(
            f,
            "SequenceRecord: {} (length: {}, alphabet: {}, ga4gh: {:02x?}, md5: {:02x?})",
            meta.name, meta.length, meta.alphabet, meta.sha512t24u, meta.md5
        )
    }
}
/// One digest per attribute array of a sequence collection, as computed by
/// `SeqColDigestLvl1::from_metadata`.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct SeqColDigestLvl1 {
    /// Digest of the array of `"SQ."`-prefixed sequence digests.
    pub sequences_digest: String,
    /// Digest of the array of sequence names.
    pub names_digest: String,
    /// Digest of the array of sequence lengths.
    pub lengths_digest: String,
}
impl SeqColDigestLvl1 {
    /// Computes the top-level digest of the collection.
    ///
    /// A JSON object with the keys `"names"` and `"sequences"` (holding the
    /// respective digests) is canonicalized and digested with `sha512t24u`.
    /// `lengths_digest` does not take part in this digest.
    pub fn to_digest(&self) -> String {
        use serde_json::{Map, Value};

        let inherent: Map<String, Value> = [
            ("names", &self.names_digest),
            ("sequences", &self.sequences_digest),
        ]
        .iter()
        .map(|&(key, digest)| (key.to_string(), Value::String(digest.clone())))
        .collect();

        let canonical = canonicalize_json(&Value::Object(inherent));
        sha512t24u(canonical.as_bytes())
    }

    /// Computes the three per-attribute digests from sequence metadata.
    ///
    /// Each attribute (sequence digests prefixed with `"SQ."`, names, lengths)
    /// is collected into a JSON array in input order, canonicalized, and
    /// digested with `sha512t24u`.
    pub fn from_metadata(metadata_vec: &[&SequenceMetadata]) -> Self {
        use serde_json::{Number, Value};

        // Canonicalize one attribute array and digest the resulting text.
        let digest_of = |items: Vec<Value>| {
            let canonical = canonicalize_json(&Value::Array(items));
            sha512t24u(canonical.as_bytes())
        };

        let sequences: Vec<Value> = metadata_vec
            .iter()
            .map(|md| Value::String(format!("SQ.{}", md.sha512t24u)))
            .collect();
        let names: Vec<Value> = metadata_vec
            .iter()
            .map(|md| Value::String(md.name.clone()))
            .collect();
        let lengths: Vec<Value> = metadata_vec
            .iter()
            .map(|md| Value::Number(Number::from(md.length)))
            .collect();

        SeqColDigestLvl1 {
            sequences_digest: digest_of(sequences),
            names_digest: digest_of(names),
            lengths_digest: digest_of(lengths),
        }
    }
}
/// Summary information about a sequence collection: its digests, size and,
/// optionally, an associated file path.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SequenceCollectionMetadata {
    /// Top-level collection digest (see `SeqColDigestLvl1::to_digest`).
    pub digest: String,
    /// Number of sequences in the collection.
    pub n_sequences: usize,
    /// Digest of the names array.
    pub names_digest: String,
    /// Digest of the sequence-digests array.
    pub sequences_digest: String,
    /// Digest of the lengths array.
    pub lengths_digest: String,
    /// Optional path associated with the collection, as supplied by the caller.
    pub file_path: Option<PathBuf>,
}
impl SequenceCollectionMetadata {
pub fn from_sequences(sequences: &[SequenceRecord], file_path: Option<PathBuf>) -> Self {
let metadata_refs: Vec<&SequenceMetadata> =
sequences.iter().map(|r| r.metadata()).collect();
let lvl1 = SeqColDigestLvl1::from_metadata(&metadata_refs);
let digest = lvl1.to_digest();
Self {
digest,
n_sequences: sequences.len(),
names_digest: lvl1.names_digest,
sequences_digest: lvl1.sequences_digest,
lengths_digest: lvl1.lengths_digest,
file_path,
}
}
pub fn from_collection(collection: &SequenceCollection) -> Self {
collection.metadata.clone()
}
pub fn to_lvl1(&self) -> SeqColDigestLvl1 {
SeqColDigestLvl1 {
sequences_digest: self.sequences_digest.clone(),
names_digest: self.names_digest.clone(),
lengths_digest: self.lengths_digest.clone(),
}
}
}
/// A set of sequence records together with the collection-level metadata.
#[derive(Debug, Clone)]
pub struct SequenceCollection {
    /// Digests and summary information for the collection.
    pub metadata: SequenceCollectionMetadata,
    /// The member records, in collection order.
    pub sequences: Vec<SequenceRecord>,
}
impl SequenceCollection {
    /// Builds a collection from `records`, computing its metadata with
    /// `SequenceCollectionMetadata::from_sequences` and no file path.
    pub fn from_records(records: Vec<SequenceRecord>) -> Self {
        Self {
            // Evaluated first, while `records` is only borrowed.
            metadata: SequenceCollectionMetadata::from_sequences(&records, None),
            sequences: records,
        }
    }
}
impl Display for SequenceCollection {
    /// Writes a header with the sequence count and collection digest, then a
    /// `First 3 sequences:` line followed by at most three records, one per
    /// line, each prefixed with `- `.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let total = self.sequences.len();
        write!(
            f,
            "SequenceCollection with {} sequences, digest: {}",
            total, self.metadata.digest
        )?;
        f.write_str("\nFirst 3 sequences:")?;
        self.sequences
            .iter()
            .take(3)
            .try_for_each(|record| write!(f, "\n- {}", record))
    }
}
/// Iterating over `&SequenceCollection` yields references to its records in
/// stored order.
impl<'a> IntoIterator for &'a SequenceCollection {
    type Item = &'a SequenceRecord;
    type IntoIter = std::slice::Iter<'a, SequenceRecord>;

    fn into_iter(self) -> Self::IntoIter {
        let records: &'a [SequenceRecord] = self.sequences.as_slice();
        records.iter()
    }
}
impl IntoIterator for SequenceCollection {
type Item = SequenceRecord;
type IntoIter = std::vec::IntoIter<SequenceRecord>;
fn into_iter(self) -> Self::IntoIter {
self.sequences.into_iter()
}
}
/// A sequence collection that is either known only by its metadata or held
/// together with its member records.
#[derive(Debug, Clone)]
pub enum SequenceCollectionRecord {
    /// Metadata only; the member records are not held.
    Stub(SequenceCollectionMetadata),
    /// Metadata together with the member records.
    Full {
        /// Digests and summary information for the collection.
        metadata: SequenceCollectionMetadata,
        /// The member records.
        sequences: Vec<SequenceRecord>,
    },
}
impl SequenceCollectionRecord {
    /// Returns the collection metadata, which both variants carry.
    pub fn metadata(&self) -> &SequenceCollectionMetadata {
        match self {
            Self::Stub(metadata) | Self::Full { metadata, .. } => metadata,
        }
    }

    /// Returns the member records, or `None` for a stub.
    pub fn sequences(&self) -> Option<&[SequenceRecord]> {
        if let Self::Full { sequences, .. } = self {
            Some(sequences.as_slice())
        } else {
            None
        }
    }

    /// Returns `true` when the member records are held (`Full` variant).
    pub fn has_sequences(&self) -> bool {
        self.sequences().is_some()
    }

    /// Consumes the record and returns a `Full` record with the same metadata
    /// and the given member records; any records previously held are dropped.
    ///
    /// The metadata is carried over unchanged; it is not recomputed from
    /// `sequences`.
    pub fn with_sequences(self, sequences: Vec<SequenceRecord>) -> Self {
        match self {
            Self::Stub(metadata) | Self::Full { metadata, .. } => Self::Full { metadata, sequences },
        }
    }

    /// Clones this record into a [`SequenceCollection`].
    ///
    /// A stub yields a collection with an empty `sequences` vector while its
    /// metadata (including `n_sequences`) is copied as-is.
    pub fn to_collection(&self) -> SequenceCollection {
        let sequences = self
            .sequences()
            .map(|records| records.to_vec())
            .unwrap_or_default();
        SequenceCollection {
            metadata: self.metadata().clone(),
            sequences,
        }
    }
}
impl From<SequenceCollection> for SequenceCollectionRecord {
fn from(collection: SequenceCollection) -> Self {
SequenceCollectionRecord::Full {
metadata: collection.metadata,
sequences: collection.sequences,
}
}
}
/// Builds a fully loaded [`SequenceRecord`] from a name and raw bytes.
///
/// The bytes are ASCII-uppercased first; both digests and the alphabet guess
/// are computed from the uppercased copy, which is also what the record
/// stores. `length` is the number of input bytes; `description` and `fai` are
/// left as `None`.
pub fn digest_sequence(name: &str, data: &[u8]) -> SequenceRecord {
    let sequence = data.to_ascii_uppercase();

    let sha_digest = sha512t24u(&sequence);
    let md5_digest = md5(&sequence);
    let alphabet = guess_alphabet(&sequence);

    SequenceRecord::Full {
        metadata: SequenceMetadata {
            name: name.to_owned(),
            description: None,
            length: data.len(),
            sha512t24u: sha_digest,
            md5: md5_digest,
            alphabet,
            fai: None,
        },
        sequence,
    }
}
/// Same as [`digest_sequence`], but also sets the record's `description`.
///
/// Passing `None` leaves the description as `None`.
pub fn digest_sequence_with_description(
    name: &str,
    description: Option<&str>,
    data: &[u8],
) -> SequenceRecord {
    let mut record = digest_sequence(name, data);
    match &mut record {
        SequenceRecord::Full { metadata, .. } => {
            metadata.description = description.map(str::to_owned);
        }
        // A stub is returned untouched.
        SequenceRecord::Stub(_) => {}
    }
    record
}
/// Parses one tab-separated RGSI line into [`SequenceMetadata`].
///
/// Accepted layouts:
///
/// * 5 columns: `name`, `length`, `alphabet`, `sha512t24u`, `md5`
/// * 6 columns: the same, followed by `description` (an empty description
///   column is stored as `None`)
///
/// Returns `None` for blank lines, for any other column count, and when the
/// `length` column is not a valid `usize` (which is also how non-data lines
/// with five or six columns are rejected). An alphabet that fails to parse is
/// stored as `AlphabetType::Unknown`. `fai` is always `None`.
///
/// A trailing `\r` / `\n` is removed before splitting, so a line passed with
/// its terminator still attached cannot leak it into the last column.
pub fn parse_rgsi_line(line: &str) -> Option<SequenceMetadata> {
    let line = line.trim_end_matches(|c: char| c == '\r' || c == '\n');
    if line.trim().is_empty() {
        return None;
    }

    let parts: Vec<&str> = line.split('\t').collect();
    let (name, length, alphabet, sha512t24u, md5, description) = match parts.as_slice() {
        &[name, length, alphabet, sha, md5] => (name, length, alphabet, sha, md5, None),
        &[name, length, alphabet, sha, md5, desc] => {
            (name, length, alphabet, sha, md5, Some(desc))
        }
        _ => return None,
    };

    Some(SequenceMetadata {
        name: name.to_string(),
        description: description
            .filter(|text| !text.is_empty())
            .map(String::from),
        length: length.parse().ok()?,
        alphabet: alphabet.parse().unwrap_or(AlphabetType::Unknown),
        sha512t24u: sha512t24u.to_string(),
        md5: md5.to_string(),
        fai: None,
    })
}