Skip to main content

sci_form/materials/
cif.rs

1//! CIF (Crystallographic Information File) import and export.
2//!
3//! Supports reading and writing CIF 1.1 format for crystal structures:
4//! - Cell parameters (`_cell_length_a`, `_cell_angle_alpha`, etc.)
5//! - Space group (`_symmetry_space_group_name_H-M`, `_symmetry_Int_Tables_number`)
6//! - Atom sites (`_atom_site_label`, `_atom_site_fract_x/y/z`, `_atom_site_type_symbol`)
7
8use super::cell::{CellParameters, UnitCell};
9
10/// A parsed CIF crystal structure.
11#[derive(Debug, Clone)]
12pub struct CifStructure {
13    /// Data block name (from `data_` line).
14    pub data_block: String,
15    /// Unit cell.
16    pub cell: UnitCell,
17    /// Cell parameters (a, b, c, α, β, γ).
18    pub cell_params: CellParameters,
19    /// Space group Hermann-Mauguin symbol (if present).
20    pub space_group_hm: Option<String>,
21    /// Space group ITC number (if present).
22    pub space_group_number: Option<u16>,
23    /// Atom sites with labels, elements, and fractional coordinates.
24    pub atom_sites: Vec<CifAtomSite>,
25}
26
27/// A single atom site from a CIF file.
28#[derive(Debug, Clone)]
29pub struct CifAtomSite {
30    /// Atom label (e.g., "Na1", "Cl2").
31    pub label: String,
32    /// Element symbol (e.g., "Na", "Cl").
33    pub type_symbol: String,
34    /// Atomic number.
35    pub atomic_number: u8,
36    /// Fractional coordinates.
37    pub frac_x: f64,
38    pub frac_y: f64,
39    pub frac_z: f64,
40    /// Occupancy (default 1.0).
41    pub occupancy: f64,
42}
43
44/// Parse a CIF-format string into a crystal structure.
45///
46/// Handles standard CIF 1.1 tags for cell parameters, space group, and atom sites.
47/// Numbers with uncertainty notation (e.g., "5.640(1)") are stripped of the parenthetical.
48pub fn parse_cif(input: &str) -> Result<CifStructure, String> {
49    let mut data_block = String::new();
50    let mut a = None;
51    let mut b = None;
52    let mut c = None;
53    let mut alpha = None;
54    let mut beta = None;
55    let mut gamma = None;
56    let mut sg_hm: Option<String> = None;
57    let mut sg_number: Option<u16> = None;
58    let mut atom_sites = Vec::new();
59
60    let lines: Vec<&str> = input.lines().collect();
61    let mut i = 0;
62
63    while i < lines.len() {
64        let line = lines[i].trim();
65
66        // Skip comments and empty lines
67        if line.is_empty() || line.starts_with('#') {
68            i += 1;
69            continue;
70        }
71
72        // Data block
73        if let Some(name) = line.strip_prefix("data_") {
74            data_block = name.trim().to_string();
75            i += 1;
76            continue;
77        }
78
79        // Cell parameters — single-value tags
80        if line.starts_with("_cell_length_a") {
81            a = parse_cif_float(extract_value(line, "_cell_length_a"));
82            i += 1;
83            continue;
84        }
85        if line.starts_with("_cell_length_b") {
86            b = parse_cif_float(extract_value(line, "_cell_length_b"));
87            i += 1;
88            continue;
89        }
90        if line.starts_with("_cell_length_c") {
91            c = parse_cif_float(extract_value(line, "_cell_length_c"));
92            i += 1;
93            continue;
94        }
95        if line.starts_with("_cell_angle_alpha") {
96            alpha = parse_cif_float(extract_value(line, "_cell_angle_alpha"));
97            i += 1;
98            continue;
99        }
100        if line.starts_with("_cell_angle_beta") {
101            beta = parse_cif_float(extract_value(line, "_cell_angle_beta"));
102            i += 1;
103            continue;
104        }
105        if line.starts_with("_cell_angle_gamma") {
106            gamma = parse_cif_float(extract_value(line, "_cell_angle_gamma"));
107            i += 1;
108            continue;
109        }
110
111        // Space group
112        if line.starts_with("_symmetry_space_group_name_H-M")
113            || line.starts_with("_space_group_name_H-M_alt")
114        {
115            let val = extract_value_quoted(line);
116            if !val.is_empty() {
117                sg_hm = Some(val);
118            }
119            i += 1;
120            continue;
121        }
122        if line.starts_with("_symmetry_Int_Tables_number")
123            || line.starts_with("_space_group_IT_number")
124        {
125            let val = extract_value_simple(line);
126            sg_number = val.parse::<u16>().ok();
127            i += 1;
128            continue;
129        }
130
131        // Atom site loop
132        if line == "loop_" {
133            // Check if next lines define atom_site columns
134            let mut col_names = Vec::new();
135            let mut j = i + 1;
136            while j < lines.len() {
137                let col_line = lines[j].trim();
138                if col_line.starts_with("_atom_site_") {
139                    col_names.push(col_line.to_string());
140                    j += 1;
141                } else {
142                    break;
143                }
144            }
145
146            if !col_names.is_empty() {
147                // Map column indices
148                let label_col = col_names.iter().position(|c| c == "_atom_site_label");
149                let type_col = col_names.iter().position(|c| c == "_atom_site_type_symbol");
150                let fx_col = col_names.iter().position(|c| c == "_atom_site_fract_x");
151                let fy_col = col_names.iter().position(|c| c == "_atom_site_fract_y");
152                let fz_col = col_names.iter().position(|c| c == "_atom_site_fract_z");
153                let occ_col = col_names.iter().position(|c| c == "_atom_site_occupancy");
154                let cx_col = col_names.iter().position(|c| c == "_atom_site_Cartn_x");
155                let cy_col = col_names.iter().position(|c| c == "_atom_site_Cartn_y");
156                let cz_col = col_names.iter().position(|c| c == "_atom_site_Cartn_z");
157
158                let has_frac = fx_col.is_some() && fy_col.is_some() && fz_col.is_some();
159                let has_cart = cx_col.is_some() && cy_col.is_some() && cz_col.is_some();
160
161                // Read data rows
162                while j < lines.len() {
163                    let data_line = lines[j].trim();
164                    if data_line.is_empty()
165                        || data_line.starts_with('#')
166                        || data_line.starts_with('_')
167                        || data_line == "loop_"
168                    {
169                        break;
170                    }
171
172                    let fields: Vec<&str> = split_cif_fields(data_line);
173                    if fields.len() < col_names.len() {
174                        j += 1;
175                        continue;
176                    }
177
178                    let label = label_col.map(|c| fields[c].to_string()).unwrap_or_default();
179                    let type_sym = type_col
180                        .map(|c| fields[c].to_string())
181                        .or_else(|| {
182                            // Extract element from label (e.g., "Na1" → "Na")
183                            Some(extract_element_from_label(&label))
184                        })
185                        .unwrap_or_default();
186
187                    let (fx, fy, fz) = if has_frac {
188                        (
189                            parse_cif_float(fields[fx_col.unwrap()]).unwrap_or(0.0),
190                            parse_cif_float(fields[fy_col.unwrap()]).unwrap_or(0.0),
191                            parse_cif_float(fields[fz_col.unwrap()]).unwrap_or(0.0),
192                        )
193                    } else if has_cart {
194                        // Will convert below after cell is built
195                        (
196                            parse_cif_float(fields[cx_col.unwrap()]).unwrap_or(0.0),
197                            parse_cif_float(fields[cy_col.unwrap()]).unwrap_or(0.0),
198                            parse_cif_float(fields[cz_col.unwrap()]).unwrap_or(0.0),
199                        )
200                    } else {
201                        (0.0, 0.0, 0.0)
202                    };
203
204                    let occ = occ_col
205                        .and_then(|c| parse_cif_float(fields[c]))
206                        .unwrap_or(1.0);
207
208                    let z = symbol_to_atomic_number(&type_sym);
209
210                    atom_sites.push((
211                        CifAtomSite {
212                            label,
213                            type_symbol: type_sym,
214                            atomic_number: z,
215                            frac_x: fx,
216                            frac_y: fy,
217                            frac_z: fz,
218                            occupancy: occ,
219                        },
220                        has_cart && !has_frac,
221                    ));
222
223                    j += 1;
224                }
225
226                i = j;
227                continue;
228            }
229
230            i += 1;
231            continue;
232        }
233
234        i += 1;
235    }
236
237    // Validate cell parameters
238    let a = a.ok_or("Missing _cell_length_a")?;
239    let b = b.ok_or("Missing _cell_length_b")?;
240    let c = c.ok_or("Missing _cell_length_c")?;
241    let alpha = alpha.unwrap_or(90.0);
242    let beta = beta.unwrap_or(90.0);
243    let gamma = gamma.unwrap_or(90.0);
244
245    let cell_params = CellParameters {
246        a,
247        b,
248        c,
249        alpha,
250        beta,
251        gamma,
252    };
253    let cell = UnitCell::from_parameters(&cell_params);
254
255    // Convert Cartesian atom sites to fractional if needed
256    let atom_sites: Vec<CifAtomSite> = atom_sites
257        .into_iter()
258        .map(|(mut site, is_cart)| {
259            if is_cart {
260                let frac = cell.cart_to_frac([site.frac_x, site.frac_y, site.frac_z]);
261                site.frac_x = frac[0];
262                site.frac_y = frac[1];
263                site.frac_z = frac[2];
264            }
265            site
266        })
267        .collect();
268
269    Ok(CifStructure {
270        data_block,
271        cell,
272        cell_params,
273        space_group_hm: sg_hm,
274        space_group_number: sg_number,
275        atom_sites,
276    })
277}
278
279/// Write a CIF-format string from a crystal structure.
280pub fn write_cif(structure: &CifStructure) -> String {
281    let mut out = String::new();
282
283    out.push_str(&format!("data_{}\n", structure.data_block));
284    out.push_str("#\n");
285
286    // Cell parameters
287    let p = &structure.cell_params;
288    out.push_str(&format!("_cell_length_a                    {:.6}\n", p.a));
289    out.push_str(&format!("_cell_length_b                    {:.6}\n", p.b));
290    out.push_str(&format!("_cell_length_c                    {:.6}\n", p.c));
291    out.push_str(&format!(
292        "_cell_angle_alpha                 {:.4}\n",
293        p.alpha
294    ));
295    out.push_str(&format!(
296        "_cell_angle_beta                  {:.4}\n",
297        p.beta
298    ));
299    out.push_str(&format!(
300        "_cell_angle_gamma                 {:.4}\n",
301        p.gamma
302    ));
303    out.push_str("#\n");
304
305    // Space group
306    if let Some(ref hm) = structure.space_group_hm {
307        out.push_str(&format!("_symmetry_space_group_name_H-M    '{}'\n", hm));
308    }
309    if let Some(num) = structure.space_group_number {
310        out.push_str(&format!("_symmetry_Int_Tables_number       {}\n", num));
311    }
312    out.push_str("#\n");
313
314    // Atom sites
315    if !structure.atom_sites.is_empty() {
316        out.push_str("loop_\n");
317        out.push_str("_atom_site_label\n");
318        out.push_str("_atom_site_type_symbol\n");
319        out.push_str("_atom_site_fract_x\n");
320        out.push_str("_atom_site_fract_y\n");
321        out.push_str("_atom_site_fract_z\n");
322        out.push_str("_atom_site_occupancy\n");
323        for site in &structure.atom_sites {
324            out.push_str(&format!(
325                "{:<8} {:<4} {:.6} {:.6} {:.6} {:.4}\n",
326                site.label, site.type_symbol, site.frac_x, site.frac_y, site.frac_z, site.occupancy
327            ));
328        }
329    }
330
331    out.push_str("#END\n");
332    out
333}
334
335// ─── Helpers ──────────────────────────────────────────────────────────────────
336
337/// Extract value part from a CIF tag line: `_tag  value` → `value`.
338fn extract_value<'a>(line: &'a str, tag: &str) -> &'a str {
339    line[tag.len()..].trim()
340}
341
342/// Extract simple (non-quoted) value from a tag line.
343fn extract_value_simple(line: &str) -> &str {
344    // Find first whitespace after the tag
345    if let Some(pos) = line.find(char::is_whitespace) {
346        line[pos..].trim()
347    } else {
348        ""
349    }
350}
351
352/// Extract possibly-quoted value (strip single quotes).
353fn extract_value_quoted(line: &str) -> String {
354    let val = extract_value_simple(line);
355    val.trim_matches('\'').trim_matches('"').trim().to_string()
356}
357
358/// Parse a CIF float, stripping uncertainty notation like "5.640(1)" → 5.640.
359fn parse_cif_float(s: &str) -> Option<f64> {
360    let s = s.trim();
361    if s == "?" || s == "." {
362        return None;
363    }
364    // Strip parenthetical uncertainty: "5.640(1)" → "5.640"
365    let clean = if let Some(paren) = s.find('(') {
366        &s[..paren]
367    } else {
368        s
369    };
370    clean.parse::<f64>().ok()
371}
372
373/// Split a CIF data line into fields, respecting single-quoted strings.
374fn split_cif_fields(line: &str) -> Vec<&str> {
375    let mut fields = Vec::new();
376    let bytes = line.as_bytes();
377    let mut i = 0;
378    let len = bytes.len();
379
380    while i < len {
381        // Skip whitespace
382        while i < len && bytes[i] == b' ' {
383            i += 1;
384        }
385        if i >= len {
386            break;
387        }
388
389        if bytes[i] == b'\'' {
390            // Quoted field
391            let start = i + 1;
392            i = start;
393            while i < len && bytes[i] != b'\'' {
394                i += 1;
395            }
396            fields.push(&line[start..i]);
397            if i < len {
398                i += 1; // skip closing quote
399            }
400        } else {
401            // Unquoted field
402            let start = i;
403            while i < len && bytes[i] != b' ' {
404                i += 1;
405            }
406            fields.push(&line[start..i]);
407        }
408    }
409
410    fields
411}
412
413/// Extract element symbol from an atom label (e.g., "Na1" → "Na", "Fe2+" → "Fe").
414fn extract_element_from_label(label: &str) -> String {
415    let mut sym = String::new();
416    let mut chars = label.chars();
417    if let Some(c) = chars.next() {
418        if c.is_ascii_uppercase() {
419            sym.push(c);
420            if let Some(c2) = chars.next() {
421                if c2.is_ascii_lowercase() {
422                    sym.push(c2);
423                }
424            }
425        }
426    }
427    if sym.is_empty() {
428        "X".to_string()
429    } else {
430        sym
431    }
432}
433
434/// Convert element symbol to atomic number.
435fn symbol_to_atomic_number(sym: &str) -> u8 {
436    match sym.trim() {
437        "H" => 1,
438        "He" => 2,
439        "Li" => 3,
440        "Be" => 4,
441        "B" => 5,
442        "C" => 6,
443        "N" => 7,
444        "O" => 8,
445        "F" => 9,
446        "Ne" => 10,
447        "Na" => 11,
448        "Mg" => 12,
449        "Al" => 13,
450        "Si" => 14,
451        "P" => 15,
452        "S" => 16,
453        "Cl" => 17,
454        "Ar" => 18,
455        "K" => 19,
456        "Ca" => 20,
457        "Sc" => 21,
458        "Ti" => 22,
459        "V" => 23,
460        "Cr" => 24,
461        "Mn" => 25,
462        "Fe" => 26,
463        "Co" => 27,
464        "Ni" => 28,
465        "Cu" => 29,
466        "Zn" => 30,
467        "Ga" => 31,
468        "Ge" => 32,
469        "As" => 33,
470        "Se" => 34,
471        "Br" => 35,
472        "Kr" => 36,
473        "Rb" => 37,
474        "Sr" => 38,
475        "Y" => 39,
476        "Zr" => 40,
477        "Nb" => 41,
478        "Mo" => 42,
479        "Tc" => 43,
480        "Ru" => 44,
481        "Rh" => 45,
482        "Pd" => 46,
483        "Ag" => 47,
484        "Cd" => 48,
485        "In" => 49,
486        "Sn" => 50,
487        "Sb" => 51,
488        "Te" => 52,
489        "I" => 53,
490        "Xe" => 54,
491        "Cs" => 55,
492        "Ba" => 56,
493        "La" => 57,
494        "Ce" => 58,
495        "Pr" => 59,
496        "Nd" => 60,
497        "Pm" => 61,
498        "Sm" => 62,
499        "Eu" => 63,
500        "Gd" => 64,
501        "Tb" => 65,
502        "Dy" => 66,
503        "Ho" => 67,
504        "Er" => 68,
505        "Tm" => 69,
506        "Yb" => 70,
507        "Lu" => 71,
508        "Hf" => 72,
509        "Ta" => 73,
510        "W" => 74,
511        "Re" => 75,
512        "Os" => 76,
513        "Ir" => 77,
514        "Pt" => 78,
515        "Au" => 79,
516        "Hg" => 80,
517        "Tl" => 81,
518        "Pb" => 82,
519        "Bi" => 83,
520        _ => 0,
521    }
522}
523
524#[cfg(test)]
525mod tests {
526    use super::*;
527
528    #[test]
529    fn test_parse_nacl_cif() {
530        let cif = r#"
531data_NaCl
532_cell_length_a                    5.640
533_cell_length_b                    5.640
534_cell_length_c                    5.640
535_cell_angle_alpha                 90.000
536_cell_angle_beta                  90.000
537_cell_angle_gamma                 90.000
538_symmetry_space_group_name_H-M    'F m -3 m'
539_symmetry_Int_Tables_number       225
540#
541loop_
542_atom_site_label
543_atom_site_type_symbol
544_atom_site_fract_x
545_atom_site_fract_y
546_atom_site_fract_z
547_atom_site_occupancy
548Na1      Na   0.000000 0.000000 0.000000 1.0000
549Cl1      Cl   0.500000 0.500000 0.500000 1.0000
550"#;
551        let structure = parse_cif(cif).unwrap();
552        assert_eq!(structure.data_block, "NaCl");
553        assert!((structure.cell_params.a - 5.640).abs() < 1e-6);
554        assert_eq!(structure.space_group_hm, Some("F m -3 m".to_string()));
555        assert_eq!(structure.space_group_number, Some(225));
556        assert_eq!(structure.atom_sites.len(), 2);
557        assert_eq!(structure.atom_sites[0].type_symbol, "Na");
558        assert_eq!(structure.atom_sites[0].atomic_number, 11);
559        assert_eq!(structure.atom_sites[1].type_symbol, "Cl");
560        assert_eq!(structure.atom_sites[1].atomic_number, 17);
561        assert!((structure.atom_sites[1].frac_x - 0.5).abs() < 1e-6);
562    }
563
564    #[test]
565    fn test_roundtrip_cif() {
566        let original = CifStructure {
567            data_block: "test".to_string(),
568            cell: UnitCell::from_parameters(&CellParameters {
569                a: 10.0,
570                b: 10.0,
571                c: 10.0,
572                alpha: 90.0,
573                beta: 90.0,
574                gamma: 90.0,
575            }),
576            cell_params: CellParameters {
577                a: 10.0,
578                b: 10.0,
579                c: 10.0,
580                alpha: 90.0,
581                beta: 90.0,
582                gamma: 90.0,
583            },
584            space_group_hm: Some("P m -3 m".to_string()),
585            space_group_number: Some(221),
586            atom_sites: vec![CifAtomSite {
587                label: "Fe1".to_string(),
588                type_symbol: "Fe".to_string(),
589                atomic_number: 26,
590                frac_x: 0.0,
591                frac_y: 0.0,
592                frac_z: 0.0,
593                occupancy: 1.0,
594            }],
595        };
596
597        let cif_text = write_cif(&original);
598        let parsed = parse_cif(&cif_text).unwrap();
599        assert_eq!(parsed.data_block, "test");
600        assert!((parsed.cell_params.a - 10.0).abs() < 1e-4);
601        assert_eq!(parsed.atom_sites.len(), 1);
602        assert_eq!(parsed.atom_sites[0].type_symbol, "Fe");
603        assert_eq!(parsed.atom_sites[0].atomic_number, 26);
604    }
605
606    #[test]
607    fn test_parse_cif_float_uncertainty() {
608        assert_eq!(parse_cif_float("5.640(1)"), Some(5.640));
609        assert_eq!(parse_cif_float("90.000"), Some(90.0));
610        assert_eq!(parse_cif_float("?"), None);
611    }
612
613    #[test]
614    fn test_extract_element_from_label() {
615        assert_eq!(extract_element_from_label("Na1"), "Na");
616        assert_eq!(extract_element_from_label("Fe2+"), "Fe");
617        assert_eq!(extract_element_from_label("O3"), "O");
618        assert_eq!(extract_element_from_label("C12"), "C");
619    }
620}