use std::collections::HashMap;
use std::fs::File;
use std::io::{self, BufRead, BufReader};
#[derive(Debug)]
pub struct MmcifParser {
categories: HashMap<String, Category>,
}
#[derive(Debug)]
pub struct Category {
pub headers: Vec<String>,
pub rows: Vec<Vec<String>>,
}
impl Default for MmcifParser {
fn default() -> Self {
Self::new()
}
}
impl MmcifParser {
pub fn new() -> Self {
Self {
categories: HashMap::new(),
}
}
pub fn parse_file(&mut self, path: &str) -> io::Result<()> {
let file = File::open(path)?;
let reader = BufReader::new(file);
self.parse_reader(reader)
}
pub fn parse_reader<R: BufRead>(&mut self, reader: R) -> io::Result<()> {
let mut current_category: Option<String> = None;
let mut current_headers: Vec<String> = Vec::new();
let mut in_loop = false;
for line in reader.lines() {
let line = line?;
let trimmed = line.trim();
if trimmed.is_empty() || trimmed.starts_with('#') {
continue;
}
if trimmed.starts_with("data_") {
continue;
}
if trimmed.starts_with("loop_") {
in_loop = true;
current_category = None;
current_headers.clear();
continue;
}
if trimmed.starts_with('_') {
let parts: Vec<&str> = trimmed.splitn(2, '.').collect();
if parts.len() != 2 {
continue;
}
let category_name = parts[0][1..].to_string(); let field_name = parts[1].split_whitespace().next().unwrap_or("").to_string();
if in_loop {
if current_category.is_none() {
current_category = Some(category_name.clone());
}
current_headers.push(field_name);
} else {
let remaining = parts[1].trim();
let value = if remaining.starts_with('"') && remaining.ends_with('"') {
remaining[1..remaining.len() - 1].to_string()
} else {
remaining
.split_whitespace()
.skip(1)
.collect::<Vec<&str>>()
.join(" ")
};
let category =
self.categories
.entry(category_name.clone())
.or_insert(Category {
headers: vec![field_name.clone()],
rows: Vec::new(),
});
category.rows.push(vec![value]);
}
continue;
}
if in_loop && !trimmed.starts_with('_') && !current_headers.is_empty() {
if let Some(category_name) = ¤t_category {
let values = parse_values(trimmed);
let category =
self.categories
.entry(category_name.clone())
.or_insert(Category {
headers: current_headers.clone(),
rows: Vec::new(),
});
category.rows.push(values);
}
}
}
Ok(())
}
pub fn get_category(&self, name: &str) -> Option<&Category> {
self.categories.get(name)
}
}
impl Category {
pub fn get_column(&self, header: &str) -> Option<Vec<&str>> {
let index = self.headers.iter().position(|h| h == header)?;
Some(
self.rows
.iter()
.filter_map(|row| row.get(index).map(|s| s.as_str()))
.collect(),
)
}
pub fn get_row(&self, index: usize) -> Option<HashMap<&str, &str>> {
self.rows.get(index).map(|row| {
self.headers
.iter()
.enumerate()
.filter_map(|(i, h)| row.get(i).map(|v| (h.as_str(), v.as_str())))
.collect()
})
}
}
fn parse_values(line: &str) -> Vec<String> {
let mut values = Vec::new();
let mut current_value = String::new();
let mut in_quotes = false;
for c in line.chars() {
match c {
'"' => {
in_quotes = !in_quotes;
if !in_quotes {
values.push(current_value.clone());
current_value.clear();
}
}
' ' | '\t' if !in_quotes => {
if !current_value.is_empty() {
values.push(current_value.clone());
current_value.clear();
}
}
_ => current_value.push(c),
}
}
if !current_value.is_empty() {
values.push(current_value);
}
values
}
#[cfg(test)]
mod tests {
use super::*;
use std::io::Cursor;
#[test]
fn test_parse_simple_mmcif() {
let data = r#"data_test
_entry.id 1ABC
loop_
_atom_site.group_PDB
_atom_site.id
_atom_site.type_symbol
ATOM 1 C
ATOM 2 N
"#;
let mut parser = MmcifParser::new();
parser.parse_reader(Cursor::new(data)).unwrap();
let atom_site = parser.get_category("atom_site").unwrap();
assert_eq!(atom_site.headers.len(), 3);
assert_eq!(atom_site.rows.len(), 2);
assert_eq!(
atom_site.get_column("group_PDB").unwrap(),
vec!["ATOM", "ATOM"]
);
}
#[test]
fn test_parse_quoted_values() {
let data = r#"_citation.title "Some quoted title with spaces"
loop_
_entity.type
_entity.description
"polymer" "First chain"
"non-polymer" "Second chain""#;
let mut parser = MmcifParser::new();
parser.parse_reader(Cursor::new(data)).unwrap();
let entity = parser.get_category("entity").unwrap();
assert_eq!(entity.rows.len(), 2);
assert_eq!(
entity.get_column("type").unwrap(),
vec!["polymer", "non-polymer"]
);
}
}