Skip to main content

openproteo_io/
lib.rs

1//! `openproteo-io` is the umbrella crate that ties together the open
2//! Rust mass-spec parsers (`opentfraw`, `opentimstdf`, `openwraw`)
3//! behind a uniform vendor-detection + mzML-conversion API.
4//!
5//! Each vendor parser is gated behind a Cargo feature
6//! (`thermo`, `bruker`, `waters`) and re-exported under
7//! [`vendor`]. The `all` meta-feature pulls in every supported
8//! vendor.
9//!
10//! Even with no features enabled, [`detect_format`] is available so
11//! callers can probe a path without paying the compile-time cost of a
12//! parser they will not use.
13
14#![forbid(unsafe_code)]
15
16use std::path::{Path, PathBuf};
17
18mod error;
19pub use error::{Error, Result};
20
21pub use openproteo_core as core;
22
23#[cfg(feature = "arrow")]
24pub use openproteo_core::arrow;
25
/// Feature-gated re-exports of the individual vendor parser crates.
///
/// A crate is only visible here when its Cargo feature is enabled, so a
/// path such as `vendor::opentfraw` resolves only in builds that turn
/// on `thermo`.
pub mod vendor {
    // Thermo parser; requires the `thermo` feature.
    #[cfg(feature = "thermo")]
    pub use opentfraw;
    // Bruker parser; requires the `bruker` feature.
    #[cfg(feature = "bruker")]
    pub use opentimstdf;
    // Waters parser; requires the `waters` feature.
    #[cfg(feature = "waters")]
    pub use openwraw;
}
35
/// Vendor / format family recognised on disk.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum VendorFormat {
    /// A single Thermo Fisher Finnigan `.raw` file.
    ThermoRaw,
    /// A Bruker timsTOF TDF bundle: a directory ending in `.d/` that
    /// holds both `analysis.tdf` and `analysis.tdf_bin`.
    BrukerTdf,
    /// A Waters MassLynx bundle: a directory ending in `.raw/` that
    /// holds `_HEADER.TXT`.
    WatersRaw,
}

impl VendorFormat {
    /// Short lowercase vendor name, intended for log lines and the CLI.
    ///
    /// Returns one of `"thermo"`, `"bruker"` or `"waters"`.
    pub fn name(self) -> &'static str {
        match self {
            VendorFormat::WatersRaw => "waters",
            VendorFormat::BrukerTdf => "bruker",
            VendorFormat::ThermoRaw => "thermo",
        }
    }
}
59
60/// Result of probing a filesystem path for a supported vendor format.
61#[derive(Debug, Clone)]
62pub struct Detected {
63    /// Canonical path to feed back into the matching vendor reader.
64    /// For directory-based formats this is the bundle directory; for
65    /// Thermo, the `.raw` file itself.
66    pub path: PathBuf,
67    /// Identified format.
68    pub format: VendorFormat,
69}
70
71/// Inspect `path` (file or directory) and return the matching vendor
72/// format, or `None` when none of the supported signatures match.
73///
74/// This function is always available, even with no features enabled,
75/// so a host application can decide which feature to enable at compile
76/// time based on a runtime probe.
77pub fn detect_format(path: &Path) -> Option<Detected> {
78    if path.is_dir() {
79        // Bruker .d/ first, then Waters .raw/.
80        if path.join("analysis.tdf").is_file() && path.join("analysis.tdf_bin").is_file() {
81            return Some(Detected {
82                path: path.to_path_buf(),
83                format: VendorFormat::BrukerTdf,
84            });
85        }
86        if path.join("_HEADER.TXT").is_file() {
87            return Some(Detected {
88                path: path.to_path_buf(),
89                format: VendorFormat::WatersRaw,
90            });
91        }
92        return None;
93    }
94    if path.is_file() {
95        if is_thermo_raw(path) {
96            return Some(Detected {
97                path: path.to_path_buf(),
98                format: VendorFormat::ThermoRaw,
99            });
100        }
101        return None;
102    }
103    None
104}
105
/// Checks whether `path` starts with the Thermo Finnigan `.raw` magic.
///
/// A matching file begins with a two-byte header word followed by the
/// string `Finnigan` encoded as UTF-16LE (16 bytes). Files that cannot
/// be opened, or that are shorter than those 18 bytes, are reported as
/// not matching.
fn is_thermo_raw(path: &Path) -> bool {
    use std::io::Read;

    // Bytes occupied by the leading header word, which is not inspected.
    const HEADER_WORD_LEN: usize = 2;
    // "Finnigan" as UTF-16LE: every ASCII byte is followed by a zero byte.
    const MAGIC: [u8; 16] = *b"F\0i\0n\0n\0i\0g\0a\0n\0";

    let mut head = [0u8; HEADER_WORD_LEN + MAGIC.len()];
    let filled = std::fs::File::open(path)
        .and_then(|mut file| file.read_exact(&mut head))
        .is_ok();

    filled && head[HEADER_WORD_LEN..] == MAGIC
}
127
128/// Convert a detected vendor file to mzML at `output`. Picks the
129/// correct vendor crate's `write_mzml` (or `write_indexed_mzml`) based
130/// on `indexed`.
131#[allow(clippy::needless_pass_by_value)] // for symmetry with detect_format
132pub fn convert_to_mzml(detected: Detected, output: &Path, indexed: bool) -> Result<()> {
133    use std::fs::File;
134    use std::io::BufWriter;
135    let f = File::create(output)?;
136    let mut w = BufWriter::new(f);
137    write_to(detected.format, &detected.path, &mut w, indexed)
138}
139
140/// Like [`convert_to_mzml`] but writes to an arbitrary writer instead
141/// of a path. Useful for streaming output to gzip, stdout, or any other
142/// sink.
143#[allow(clippy::needless_pass_by_value)]
144pub fn convert_to_mzml_writer<W: std::io::Write>(
145    detected: Detected,
146    writer: &mut W,
147    indexed: bool,
148) -> Result<()> {
149    write_to(detected.format, &detected.path, writer, indexed)
150}
151
152fn write_to(
153    format: VendorFormat,
154    path: &Path,
155    w: &mut impl std::io::Write,
156    indexed: bool,
157) -> Result<()> {
158    match format {
159        VendorFormat::ThermoRaw => {
160            #[cfg(feature = "thermo")]
161            {
162                thermo_convert(path, w, indexed)
163            }
164            #[cfg(not(feature = "thermo"))]
165            {
166                let _ = (path, w, indexed);
167                Err(Error::FeatureDisabled { vendor: "thermo" })
168            }
169        }
170        VendorFormat::BrukerTdf => {
171            #[cfg(feature = "bruker")]
172            {
173                if indexed {
174                    opentimstdf::mzml::write_indexed_mzml(path, w)?;
175                } else {
176                    opentimstdf::mzml::write_mzml(path, w)?;
177                }
178                Ok(())
179            }
180            #[cfg(not(feature = "bruker"))]
181            {
182                let _ = (path, w, indexed);
183                Err(Error::FeatureDisabled { vendor: "bruker" })
184            }
185        }
186        VendorFormat::WatersRaw => {
187            #[cfg(feature = "waters")]
188            {
189                if indexed {
190                    openwraw::mzml::write_indexed_mzml(path, w)?;
191                } else {
192                    openwraw::mzml::write_mzml(path, w)?;
193                }
194                Ok(())
195            }
196            #[cfg(not(feature = "waters"))]
197            {
198                let _ = (path, w, indexed);
199                Err(Error::FeatureDisabled { vendor: "waters" })
200            }
201        }
202    }
203}
204
/// Thermo-specific conversion: parses the `.raw` file at `path` and
/// writes it to `out` as mzML, indexed when `indexed` is `true`.
///
/// The file is opened twice: once by `RawFileReader::open_path`, and
/// once as a buffered byte source passed to the mzML writer. The file
/// name (or `"unknown.raw"` when it is missing or not valid UTF-8) is
/// forwarded to the writer.
#[cfg(feature = "thermo")]
fn thermo_convert(path: &Path, out: &mut impl std::io::Write, indexed: bool) -> Result<()> {
    // 2 MiB read buffer for the raw byte source.
    const READ_BUFFER_BYTES: usize = 2 << 20;

    let raw = opentfraw::RawFileReader::open_path(path)?;
    let file = std::fs::File::open(path)?;
    let mut source = std::io::BufReader::with_capacity(READ_BUFFER_BYTES, file);

    let filename = match path.file_name().and_then(std::ffi::OsStr::to_str) {
        Some(name) => name,
        None => "unknown.raw",
    };

    // NOTE(review): the meaning of the trailing `false` is defined by
    // `opentfraw::mzml` and is not visible from this file.
    if indexed {
        opentfraw::mzml::write_indexed_mzml(&raw, &mut source, out, filename, false)?;
        return Ok(());
    }
    opentfraw::mzml::write_mzml(&raw, &mut source, out, filename, false)?;
    Ok(())
}
222
223/// Open the appropriate vendor source for `detected`, collect every
224/// spectrum into a `Vec`, and return both the records and the
225/// run-level metadata. Used by tools that need a second pass over the
226/// data (conformance validation, `info` summaries, Arrow batching).
227///
228/// This dispatches to the same vendor code paths as
229/// [`convert_to_mzml`], so a feature-gated build that excludes a
230/// vendor will return an error here for that vendor.
231#[allow(clippy::needless_pass_by_value)]
232pub fn collect(
233    detected: Detected,
234) -> Result<(
235    Vec<openproteo_core::SpectrumRecord>,
236    openproteo_core::RunMetadata,
237)> {
238    #[allow(unused_imports)]
239    use openproteo_core::SpectrumSource;
240    match detected.format {
241        VendorFormat::ThermoRaw => {
242            #[cfg(feature = "thermo")]
243            {
244                use std::fs::File;
245                use std::io::BufReader;
246                let raw = opentfraw::RawFileReader::open_path(&detected.path)?;
247                let mut source = BufReader::with_capacity(2 << 20, File::open(&detected.path)?);
248                let filename = detected
249                    .path
250                    .file_name()
251                    .and_then(|n| n.to_str())
252                    .unwrap_or("unknown.raw");
253                let mut src =
254                    opentfraw::mzml::OpenTfRawSource::new(&raw, &mut source, filename, false);
255                let meta = src.run_metadata();
256                let recs: Vec<_> = src.iter_spectra().collect();
257                Ok((recs, meta))
258            }
259            #[cfg(not(feature = "thermo"))]
260            Err(Error::FeatureDisabled { vendor: "thermo" })
261        }
262        VendorFormat::BrukerTdf => {
263            #[cfg(feature = "bruker")]
264            {
265                let mut src = opentimstdf::mzml::TdfSource::open(&detected.path)?;
266                let meta = src.run_metadata();
267                let recs: Vec<_> = src.iter_spectra().collect();
268                Ok((recs, meta))
269            }
270            #[cfg(not(feature = "bruker"))]
271            Err(Error::FeatureDisabled { vendor: "bruker" })
272        }
273        VendorFormat::WatersRaw => {
274            #[cfg(feature = "waters")]
275            {
276                let mut src = openwraw::mzml::WatersSource::open(&detected.path)?;
277                let meta = src.run_metadata();
278                let recs: Vec<_> = src.iter_spectra().collect();
279                Ok((recs, meta))
280            }
281            #[cfg(not(feature = "waters"))]
282            Err(Error::FeatureDisabled { vendor: "waters" })
283        }
284    }
285}
286
287/// A trivial in-memory [`openproteo_core::SpectrumSource`] backed by a
288/// `Vec<SpectrumRecord>` + a [`openproteo_core::RunMetadata`]. Hand it
289/// to `openproteo_core::write_mzml` when you already have the records
290/// in hand and just want to emit mzML.
291pub struct VecSource {
292    pub metadata: openproteo_core::RunMetadata,
293    pub records: Vec<openproteo_core::SpectrumRecord>,
294}
295
296impl VecSource {
297    pub fn new(
298        metadata: openproteo_core::RunMetadata,
299        records: Vec<openproteo_core::SpectrumRecord>,
300    ) -> Self {
301        Self { metadata, records }
302    }
303}
304
305impl openproteo_core::SpectrumSource for VecSource {
306    fn run_metadata(&self) -> openproteo_core::RunMetadata {
307        self.metadata.clone()
308    }
309    fn iter_spectra<'s>(
310        &'s mut self,
311    ) -> Box<dyn Iterator<Item = openproteo_core::SpectrumRecord> + 's> {
312        Box::new(self.records.drain(..))
313    }
314    fn spectrum_count_hint(&self) -> Option<usize> {
315        Some(self.records.len())
316    }
317}
318
#[cfg(test)]
mod tests {
    use super::*;
    use std::io::Write;
    use std::sync::atomic::{AtomicUsize, Ordering};

    /// Returns a fresh path under the system temp directory. Nothing is
    /// created on disk.
    ///
    /// Uniqueness comes from the process id, a timestamp, and a
    /// per-process counter, so tests running in parallel threads never
    /// share a path and leftovers from an earlier run cannot collide.
    fn tempfile_path() -> PathBuf {
        static NEXT_ID: AtomicUsize = AtomicUsize::new(0);
        let seq = NEXT_ID.fetch_add(1, Ordering::Relaxed);
        let pid = std::process::id();
        let nanos = std::time::SystemTime::now()
            .duration_since(std::time::UNIX_EPOCH)
            .map_or(0, |elapsed| elapsed.as_nanos());
        std::env::temp_dir().join(format!("msio-test-{pid}-{nanos}-{seq}"))
    }

    /// Like [`tempfile_path`], but also creates the directory.
    fn tempfile_dir() -> PathBuf {
        let p = tempfile_path();
        std::fs::create_dir_all(&p).expect("create temp test directory");
        p
    }

    #[test]
    fn detect_returns_none_for_garbage_file() {
        let tmp = tempfile_path();
        std::fs::write(&tmp, b"hello").unwrap();
        assert!(detect_format(&tmp).is_none());
        let _ = std::fs::remove_file(&tmp);
    }

    #[test]
    fn detect_returns_thermo_for_finnigan_magic() {
        let tmp = tempfile_path();
        {
            // Scoped so the handle is closed before detection and cleanup.
            let mut f = std::fs::File::create(&tmp).unwrap();
            // 2-byte version word + "Finnigan" in UTF-16LE + trailing garbage.
            f.write_all(&[
                0x01, 0xa1, 0x46, 0x00, 0x69, 0x00, 0x6e, 0x00, 0x6e, 0x00, 0x69, 0x00, 0x67,
                0x00, 0x61, 0x00, 0x6e, 0x00, 0xff, 0xff,
            ])
            .unwrap();
        }
        let det = detect_format(&tmp).expect("detect");
        assert_eq!(det.format, VendorFormat::ThermoRaw);
        let _ = std::fs::remove_file(&tmp);
    }

    #[test]
    fn detect_returns_bruker_for_tdf_layout() {
        let tmp = tempfile_dir();
        std::fs::write(tmp.join("analysis.tdf"), b"").unwrap();
        std::fs::write(tmp.join("analysis.tdf_bin"), b"").unwrap();
        let det = detect_format(&tmp).expect("detect");
        assert_eq!(det.format, VendorFormat::BrukerTdf);
        let _ = std::fs::remove_dir_all(&tmp);
    }

    #[test]
    fn detect_returns_waters_for_header_layout() {
        let tmp = tempfile_dir();
        std::fs::write(tmp.join("_HEADER.TXT"), b"$$ FAKE\n").unwrap();
        let det = detect_format(&tmp).expect("detect");
        assert_eq!(det.format, VendorFormat::WatersRaw);
        let _ = std::fs::remove_dir_all(&tmp);
    }

    #[test]
    fn convert_unsupported_format_returns_typed_error() {
        // `detect_format` returns `None` for unsupported input, so
        // callers cannot reach `convert_to_mzml` with it. Construct the
        // public `Error` variants directly instead, which keeps this
        // test independent of the enabled vendor features.
        let e: Error = std::io::Error::other("boom").into();
        assert!(matches!(e, Error::Io(_)));
        let e = Error::FeatureDisabled { vendor: "thermo" };
        assert_eq!(
            e.to_string(),
            "openproteo-io was built without the 'thermo' feature"
        );
        let e = Error::UnsupportedFormat(PathBuf::from("/tmp/nope"));
        assert!(matches!(e, Error::UnsupportedFormat(_)));
    }
}