libreflow_api/fcs/
io.rs

1use crate::fcs::{EventData, Metadata, Sample};
2use atoi::atoi;
3use byteorder::ReadBytesExt;
4use derive_more::{Display, From};
5use nom::bytes::complete::{is_not, tag, take};
6use nom::combinator::map_res;
7use nom::error::ErrorKind;
8use nom::multi::fold_many1;
9use nom::sequence::{separated_pair, terminated, tuple};
10use nom::IResult;
11use regex::Regex;
12use std::collections::HashMap;
13use std::fs::File;
14use std::io::{BufReader, Read, Seek, SeekFrom};
15use std::num::ParseIntError;
16use std::ops::RangeInclusive;
17use std::path::Path;
18use std::str::FromStr;
19
20/// FCS IO Error
21#[derive(Display, From, Debug)]
22pub enum Error {
23    #[from]
24    IO(std::io::Error),
25
26    #[display("Invalid FCS version: {}", version)]
27    InvalidVersion {
28        version: String,
29    },
30
31    #[display("Invalid file type found. File must be fcs.")]
32    InvalidFileType,
33
34    #[display("Failed to parse header segment offset.")]
35    FailedHeaderOffsetParse,
36
37    #[display("Failed to parse text segment delimiter.")]
38    FailedDelimiterParse,
39
40    #[display("Metadata and header segment offsets don't match.")]
41    MetadataOffsetMismatch,
42
43    FailedMetadataParse,
44
45    #[from]
46    FailedIntParse(ParseIntError),
47
48    InvalidMetadata,
49
50    #[display("Invalid data mode: {data_mode} for version {version}")]
51    InvalidDataMode {
52        data_mode: String,
53        version: String,
54    },
55
56    #[display("Invalid data type: {kind} for version {version}")]
57    InvalidDataType {
58        kind: String,
59        version: String,
60    },
61
62    #[display("Could not find key: {key}, in FCS metadata")]
63    MetadataKeyNotFound {
64        key: String,
65    },
66
67    NoDataFound,
68
69    #[display("Invalid bit param length: {bit_length} for parameter index {index}")]
70    InvalidParamBitLength {
71        bit_length: usize,
72        index: usize,
73    },
74
75    InvalidByteOrder {
76        byte_order: String,
77    },
78
79    #[from]
80    FromUtf8Error(std::string::FromUtf8Error),
81}
82
/// FCS IO Result.
///
/// Shorthand for `core::result::Result` with the error type fixed to
/// [`Error`], used by every fallible function in this module.
type Result<T> = core::result::Result<T, Error>;
85
86/// Attempts to read FCS file and return Sample data
87pub fn read<P: AsRef<Path>>(path: P) -> Result<Sample> {
88    if path.as_ref().extension() != Some("fcs".as_ref()) {
89        return Err(Error::InvalidFileType);
90    }
91
92    let file = File::open(path)?;
93    let mut reader = BufReader::new(file);
94
95    let header = read_header(&mut reader)?;
96    let metadata = read_metadata(&mut reader, &header)?;
97    let event_data = read_event_data(&mut reader, &metadata)?;
98
99    Ok(Sample {
100        metadata,
101        event_data,
102    })
103}
104
/// Valid FCS versions
///
/// Produced by parsing the version string found at the start of the header.
#[derive(Debug, PartialEq)]
enum Version {
    /// Corresponds to the version string `"FCS3.1"`.
    FCS3_1,
    /// Corresponds to the version string `"FCS3.0"`.
    FCS3_0,
}
111
112impl FromStr for Version {
113    type Err = Error;
114
115    // Get version enum from string
116    fn from_str(s: &str) -> Result<Self> {
117        match s {
118            "FCS3.1" => Ok(Version::FCS3_1),
119            // for now, we only support 3.1, but we leave this as a placeholder
120            "FCS3.0" => Ok(Version::FCS3_0),
121            _ => Err(Error::InvalidVersion {
122                version: s.to_string(),
123            }),
124        }
125    }
126}
127
128impl Display for Version {
129    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
130        let str = match self {
131            Version::FCS3_1 => "FCS3.1".to_string(),
132            Version::FCS3_0 => "FCS3.0".to_string(),
133        };
134        write!(f, "{}", str)
135    }
136}
137
/// FCS header segment information.
///
/// Holds the version and the three pairs of byte offsets read from the
/// header; each pair is stored as `start..=end`.
struct Header {
    /// Version parsed from the first six bytes of the file.
    version: Version,
    /// First and last byte offset of the text segment.
    text_offsets: RangeInclusive<usize>,
    /// First and last byte offset of the data segment.
    data_offsets: RangeInclusive<usize>,
    /// First and last byte offset of the analysis segment.
    analysis_offsets: RangeInclusive<usize>,
}
145
146/// Read FCS header segment.
147fn read_header(reader: &mut BufReader<File>) -> Result<Header> {
148    let mut version_buffer = [0u8; 6];
149    reader.read_exact(&mut version_buffer)?;
150    let version = String::from_utf8(version_buffer.to_vec())?.parse::<Version>()?;
151
152    reader.seek(SeekFrom::Current(4))?; // skip 4 bytes encoding whitespace
153
154    let mut offset_buffer = [0u8; 48]; // 6 x 8 byte offsets
155    reader.read_exact(&mut offset_buffer)?;
156
157    let (offset_buffer, text_offsets) = parse_segment_offsets(&offset_buffer)?;
158    let (offset_buffer, data_offsets) = parse_segment_offsets(&offset_buffer)?;
159    let (_, analysis_offsets) = parse_segment_offsets(&offset_buffer)?;
160
161    Ok(Header {
162        version,
163        text_offsets,
164        data_offsets,
165        analysis_offsets,
166    })
167}
168
169/// Helper for parsing a single segment offset in header
170fn parse_segment_offsets(input: &[u8]) -> Result<(&[u8], RangeInclusive<usize>)> {
171    let (input, (start, stop)) = tuple((parse_offset_bytes, parse_offset_bytes))(input)
172        .map_err(|_| Error::FailedHeaderOffsetParse)?;
173
174    Ok((input, start..=stop))
175}
176
177/// Helper for parsing ascii encoded offset into an usize
178fn parse_offset_bytes(input: &[u8]) -> IResult<&[u8], usize> {
179    map_res(take(8usize), |bytes: &[u8]| {
180        atoi::<usize>(bytes.trim_ascii_start()).ok_or(ErrorKind::Fail)
181    })(input)
182}
183
/// Escaped delimiters in keys or values in the text segment are replaced with
/// this temporary string during parsing. This is done to simplify parsing.
/// The temporary string is replaced with a single delimiter after parsing.
///
/// NOTE(review): a key or value that literally contains this marker would be
/// rewritten as well; presumably such text does not occur in practice.
const DOUBLE_DELIMITER_TRANSFORM: &str = "@ESCAPED@";
188
/// Required non-parameter indexed keywords in the text segment.
///
/// Used during metadata validation as one of the sets of accepted keys.
const REQUIRED_KEYWORDS: [&str; 12] = [
    "$BEGINANALYSIS", // byte-offset to the beginning of analysis segment
    "$BEGINDATA",     // byte-offset of beginning of data segment
    "$BEGINSTEXT",    // byte-offset to beginning of text segment
    "$BYTEORD",       // byte order for data acquisition computer
    "$DATATYPE",      // type of data in data segment (ASCII, int, float)
    "$ENDANALYSIS",   // byte-offset to end of analysis segment
    "$ENDDATA",       // byte-offset to end of data segment
    "$ENDSTEXT",      // byte-offset to end of text segment
    "$MODE",          // data mode (list mode - preferred, histogram - deprecated)
    "$NEXTDATA",      // byte-offset to next data set in the file
    "$PAR",           // number of parameters in an event
    "$TOT",           // total number of events in the data set
];
204
/// Optional non-parameter indexed keywords in the text segment.
///
/// Used during metadata validation as one of the sets of accepted keys.
const OPTIONAL_KEYWORDS: [&str; 31] = [
    "$ABRT",          // events lost due to acquisition electronic coincidence
    "$BTIM",          // clock time at beginning of data acquisition
    "$CELLS",         // description of objects measured
    "$COM",           // comment
    "$CSMODE",        // cell subset mode, number of subsets an object may belong
    "$CSVBITS",       // number of bits used to encode cell subset identifier
    "$CYT",           // cytometer type
    "$CYTSN",         // cytometer serial number
    "$DATE",          // date of data acquisition
    "$ETIM",          // clock time at end of data acquisition
    "$EXP",           // investigator name initiating experiment
    "$FIL",           // name of data file containing data set
    "$GATE",          // number of gating parameters
    "$GATING",        // region combinations used for gating
    "$INST",          // institution where data was acquired
    "$LAST_MODIFIED", // timestamp of last modification
    "$LAST_MODIFIER", // person performing last modification
    "$LOST",          // number events lost due to computer busy
    "$OP",            // name of flow cytometry operator
    "$ORIGINALITY",   // information whether FCS data set has been modified or not
    "$PLATEID",       // plate identifier
    "$PLATENAME",     // plate name
    "$PROJ",          // project name
    "$SMNO",          // specimen (i.e., tube) label
    "$SPILLOVER",     // spillover matrix
    "$SRC",           // source of specimen (cell type, name, etc.)
    "$SYS",           // type of computer and OS
    "$TIMESTEP",      // time step for time parameter
    "$TR",            // trigger parameter and its threshold
    "$VOL",           // volume of sample run during data acquisition
    "$WELLID",        // well identifier
];
239
240/// Read FCS text segment.
241fn read_metadata(reader: &mut BufReader<File>, header: &Header) -> Result<Metadata> {
242    reader.seek(SeekFrom::Start(*header.text_offsets.start() as u64))?;
243    let mut metadata_buf = vec![0u8; *header.text_offsets.end() - *header.text_offsets.start()];
244    reader.read_exact(&mut metadata_buf)?;
245
246    let metadata_txt = String::from_utf8(metadata_buf)?;
247
248    let (metadata_txt, delimiter) =
249        parse_delimiter(&metadata_txt).map_err(|_| Error::FailedDelimiterParse)?;
250
251    // We handle double delimiters by replacing them with a temporary string.
252    // This is done simply because it's a pain to handle double delimiters
253    // when each key/value is separated by a single delimiter.
254    // We'll replace the temporary string with the delimiter after parsing.
255    let metadata_txt = metadata_txt.replace(&delimiter.repeat(2), DOUBLE_DELIMITER_TRANSFORM);
256
257    let (_, metadata) = fold_many1(
258        |input| parse_metadata_pairs(input, delimiter),
259        HashMap::new,
260        |mut acc: HashMap<String, String>, (key, value)| {
261            acc.insert(key, value);
262            acc
263        },
264    )(&metadata_txt)
265    .map_err(|_| Error::FailedMetadataParse)?;
266
267    metadata.is_valid()?;
268    cross_validate(&metadata, &header)?;
269    Ok(metadata)
270}
271
272/// Parse text segment delimiter
273fn parse_delimiter(input: &str) -> IResult<&str, &str> {
274    take(1u8)(input)
275}
276
277/// Metadata string parser
278fn parse_metadata_string<'a>(input: &'a str, delimiter: &str) -> IResult<&'a str, String> {
279    map_res(is_not(delimiter), |s: &str| {
280        // Here, we replace the temporary string with the delimiter after extracting
281        // the key or value string.
282        Ok::<String, std::io::Error>(s.replace(DOUBLE_DELIMITER_TRANSFORM, delimiter))
283    })(input)
284}
285
286/// Metadata key-value pair parser
287fn parse_metadata_pairs<'a>(input: &'a str, delimiter: &str) -> IResult<&'a str, (String, String)> {
288    separated_pair(
289        |input| parse_metadata_string(input, delimiter), // keys
290        tag(delimiter),                                  // delimiter separating the pair
291        terminated(
292            // values (terminated by delimiter or end of string)
293            |input| parse_metadata_string(input, delimiter),
294            tag(delimiter),
295        ),
296    )(input)
297}
298
299/// Check recovered segment offsets from metadata match those in header segment
300fn validate_metadata_offsets(
301    seg_start: usize,
302    seg_end: usize,
303    seg_offsets: &RangeInclusive<usize>,
304) -> Result<()> {
305    if seg_start != *seg_offsets.start() || seg_end != *seg_offsets.end() {
306        return Err(Error::InvalidMetadata);
307    }
308
309    Ok(())
310}
311
/// Keyed lookup that reports a missing key as an FCS IO error.
trait GetRequiredKey {
    /// Returns the value stored under `key`, or an error when it is absent.
    fn get_required_key(&self, key: &str) -> Result<&str>;
}
315
316impl GetRequiredKey for Metadata {
317    /// Attempt to get a required key from the metadata hashmap, but return an
318    /// FCS IO Result rather than option better error handling.
319    fn get_required_key(&self, key: &str) -> Result<&str> {
320        self.get(key)
321            .ok_or(Error::MetadataKeyNotFound {
322                key: key.to_string(),
323            })
324            .map(|s| s.as_str())
325    }
326}
327
/// Assert that types that implement this trait are valid
trait IsValid {
    /// Returns `Ok(())` when the value is valid and an error otherwise.
    fn is_valid(&self) -> Result<()>;
}
332
333impl IsValid for Metadata {
334    /// Assert that recovered metadata from the FCS text segment is valid.
335    fn is_valid(&self) -> Result<()> {
336        // this is a required key, so we just return an error if not found
337        let n_params = self.get_required_key("$PAR")?;
338
339        let n_digits = n_params.chars().count().to_string();
340        let parameter_indexed_regex = r"[PR]\d{1,".to_string() + &n_digits + "}[BENRDFGLOPSTVIW]";
341
342        // this is safe to unwrap since regex has to be valid
343        let param_keywords = Regex::new(&parameter_indexed_regex).unwrap();
344
345        // check that keys are valid
346        for key in self.keys() {
347            if !REQUIRED_KEYWORDS.contains(&key.as_str())
348                && !param_keywords.is_match(key)
349                && !OPTIONAL_KEYWORDS.contains(&key.as_str())
350            {
351                return Err(Error::InvalidMetadata);
352            }
353        }
354
355        Ok(())
356    }
357}
358
359/// Assert recovered metadata is consistent with header information
360fn cross_validate(metadata: &Metadata, header: &Header) -> Result<()> {
361    // check that data segment offsets from header match those in metadata
362    let begin_data = metadata.get_required_key("$BEGINDATA")?;
363    let end_data = metadata.get_required_key("$ENDDATA")?;
364    validate_metadata_offsets(
365        begin_data.parse::<usize>()?,
366        end_data.parse::<usize>()?,
367        &header.data_offsets,
368    )?;
369
370    // check that analysis segment offsets from header match those in metadata
371    let begin_analysis = metadata.get_required_key("$BEGINANALYSIS")?;
372    let end_analysis = metadata.get_required_key("$ENDANALYSIS")?;
373    validate_metadata_offsets(
374        begin_analysis.parse::<usize>()?,
375        end_analysis.parse::<usize>()?,
376        &header.analysis_offsets,
377    )?;
378
379    // validate some version specific metadata
380    match header.version {
381        Version::FCS3_1 => {
382            let data_mode = metadata.get_required_key("$MODE")?;
383            if data_mode != "L" {
384                return Err(Error::InvalidDataMode {
385                    data_mode: data_mode.to_string(),
386                    version: header.version.to_string(),
387                });
388            }
389
390            let data_type = metadata.get_required_key("$DATATYPE")?;
391            if data_type != "I" && data_type != "F" && data_type != "D" {
392                return Err(Error::InvalidDataType {
393                    kind: data_type.to_string(),
394                    version: header.version.to_string(),
395                });
396            }
397        }
398        Version::FCS3_0 => {
399            todo!()
400        }
401    }
402    Ok(())
403}
404
405/// Parse FCS data segment.
406fn read_event_data(
407    reader: &mut BufReader<std::fs::File>,
408    metadata: &Metadata,
409) -> Result<EventData> {
410    let n_params = metadata.get_required_key("$PAR")?.parse::<usize>()?;
411    let n_events = metadata.get_required_key("$TOT")?.parse::<usize>()?;
412    let capacity = n_params * n_events;
413
414    if capacity == 0 {
415        return Err(Error::NoDataFound);
416    }
417
418    let byte_order = metadata.get_required_key("$BYTEORD")?;
419    let data_type = metadata.get_required_key("$DATATYPE")?;
420    let data_start = metadata.get_required_key("$BEGINDATA")?.parse::<u64>()?;
421
422    reader.seek(SeekFrom::Start(data_start))?;
423    let mut events: Vec<f64>;
424    let mut data: HashMap<String, Vec<f64>> = HashMap::with_capacity(n_params);
425
426    match metadata.get_required_key("$MODE")? {
427        // List mode
428        "L" => {
429            for i in 1..=n_params {
430                match byte_order {
431                    "1,2,3,4" => {
432                        events = parse_events::<byteorder::LittleEndian>(
433                            reader, &data_type, n_events, metadata, i,
434                        )?;
435                    }
436                    "4,3,2,1" => {
437                        events = parse_events::<byteorder::BigEndian>(
438                            reader, &data_type, n_events, metadata, i,
439                        )?;
440                    }
441                    _ => {
442                        return Err(Error::InvalidByteOrder {
443                            byte_order: byte_order.to_string(),
444                        })
445                    }
446                }
447                let id = metadata.get_required_key(&format!("$P{}N", i))?;
448                data.insert(id.to_string(), events);
449            }
450            Ok(data)
451        }
452        "H" => todo!(),
453        _ => unreachable!(),
454    }
455}
456
457fn parse_events<B: byteorder::ByteOrder>(
458    reader: &mut BufReader<std::fs::File>,
459    data_type: &str,
460    n_events: usize,
461    metadata: &Metadata,
462    index: usize,
463) -> Result<Vec<f64>> {
464    let mut data: Vec<f64> = Vec::with_capacity(n_events);
465    match data_type {
466        // unsigned binary integer type
467        "I" => {
468            let bit_length = metadata
469                .get_required_key(&format!("P{}B", index))?
470                .parse::<usize>()?;
471            match bit_length {
472                16 => {
473                    for _ in 0..n_events {
474                        let event = reader.read_u16::<B>()? as f64;
475                        data.push(event);
476                    }
477                }
478                32 => {
479                    for _ in 0..n_events {
480                        let event = reader.read_u32::<B>()? as f64;
481                        data.push(event);
482                    }
483                }
484                64 => {
485                    for _ in 0..n_events {
486                        let event = reader.read_u64::<B>()? as f64;
487                        data.push(event);
488                    }
489                }
490                128 => {
491                    for _ in 0..n_events {
492                        let event = reader.read_u128::<B>()? as f64;
493                        data.push(event);
494                    }
495                }
496                _ => return Err(Error::InvalidParamBitLength { bit_length, index }),
497            }
498        }
499        // single precision floating point
500        "F" => {
501            for _ in 0..n_events {
502                let event = reader.read_f32::<B>()? as f64;
503                data.push(event);
504            }
505        }
506        // double precision floating point
507        "D" => {
508            for _ in 0..n_events {
509                let event = reader.read_f64::<B>()?;
510                data.push(event);
511            }
512        }
513        "A" => {
514            unimplemented!()
515        }
516        _ => unreachable!(),
517    }
518    Ok(data)
519}
520
// Unit tests for the FCS reader. Two of them read the fixture
// `tests/data/test_fcs_3_1.fcs`, addressed relative to the directory the
// tests are run from.
#[cfg(test)]
mod tests {
    use super::*;

    /// The header of the fixture yields the expected version and offsets.
    #[test]
    fn fcs_header_parser() -> Result<()> {
        let file = File::open("tests/data/test_fcs_3_1.fcs")?;
        let mut reader = BufReader::new(file);

        let header = read_header(&mut reader)?;

        assert_eq!(header.version, Version::FCS3_1);
        assert_eq!(header.text_offsets, 64..=1717);
        assert_eq!(header.data_offsets, 1718..=5201717);
        assert_eq!(header.analysis_offsets, 0..=0);

        Ok(())
    }

    /// Key/value parsing on an in-memory string, including doubled
    /// (escaped) delimiters inside a key and inside a value.
    #[test]
    fn fcs_metadata_parser() {
        let metadata_string =
            "\\Key1\\Value1\\Escaped\\\\Key2\\Value2\\Key3\\Escaped\\\\Value3\\Key 4\\Value-4\\";

        let true_metadata_map: HashMap<String, String> = HashMap::from_iter(vec![
            ("Key1".to_string(), "Value1".to_string()),
            ("Escaped\\Key2".to_string(), "Value2".to_string()),
            ("Key3".to_string(), "Escaped\\Value3".to_string()),
            ("Key 4".to_string(), "Value-4".to_string()),
        ]);

        // same steps as `read_metadata`: split off the delimiter, mask
        // doubled delimiters, then fold the pairs into a map
        let (metadata_string, delimiter) = parse_delimiter(&metadata_string).unwrap();
        let metadata_string_transformed =
            metadata_string.replace(delimiter.repeat(2).as_str(), DOUBLE_DELIMITER_TRANSFORM);

        let (_, metadata) = fold_many1(
            |input| parse_metadata_pairs(input, delimiter),
            HashMap::new,
            |mut acc: HashMap<String, String>, (key, value)| {
                acc.insert(key, value);
                acc
            },
        )(&metadata_string_transformed)
        .unwrap();

        assert_eq!(metadata, true_metadata_map);
    }

    /// End-to-end read of the fixture; checks only the number of parameter
    /// columns and the length of the first column, not the values.
    #[test]
    fn full_fcs_parser() -> Result<()> {
        let sample = read("tests/data/test_fcs_3_1.fcs")?;

        let n_params = sample.metadata.get_required_key("$PAR")?.parse::<usize>()?;
        let n_param_vecs = sample.event_data.len();
        assert_eq!(n_params, n_param_vecs);

        let n_events = sample.metadata.get_required_key("$TOT")?.parse::<usize>()?;
        let param_id = sample.metadata.get_required_key("$P1N")?;
        let param_data = sample.event_data.get(param_id).unwrap();
        assert_eq!(n_events, param_data.len());

        Ok(())
    }
}