mmi_parser/
lib.rs

1//! This crate exists to support the primary functions of the
2//! MMI parser command line tool.
3//!
4//!
5//! The primary reference for the field information is found
6//! [here](https://lhncbc.nlm.nih.gov/ii/tools/MetaMap/Docs/MMI_Output_2016.pdf)
7//! and relies on MetaMap 2016 or newer.
8//!
9//! The main functionality is encompassed in [`MmiOutput`], [`AaOutput`], and [`parse_record`].
10//!
11//! For questions on implementations of the parsing algorithms for specific sections,
12//! please consult the [source](https://github.com/UK-IPOP) which contains well-labeled
13//! and fairly documented functions to parse each type.
14
15extern crate core;
16
17use serde::{Deserialize, Serialize};
18use std::collections::HashMap;
19use std::fmt::{self, Display};
20use std::num::ParseIntError;
21use std::str::FromStr;
22use std::{error, result};
23
/// Error returned whenever a record, or one of its fields, holds a value
/// the parser does not know how to interpret.
#[derive(Debug)]
pub struct ValueError;

impl error::Error for ValueError {}

impl Display for ValueError {
    /// Writes the fixed, human-readable description of the error.
    fn fmt(&self, formatter: &mut fmt::Formatter) -> fmt::Result {
        formatter.write_str("Received an unexpected value")
    }
}
35
/// Crate-internal result alias whose error side is a boxed `std::error::Error`
/// trait object.
///
/// Boxing lets the parsing functions forward both [`ValueError`] and the
/// standard library's number-parsing errors through the `?` operator.
type Result<T> = std::result::Result<T, Box<dyn error::Error>>;
38
/// Breaks a raw record line into its pipe-delimited (`|`) fields.
///
/// Fields come back in their original order as slices borrowed from `text`;
/// empty fields are kept.
fn split_text(text: &str) -> Vec<&str> {
    let mut fields = Vec::new();
    fields.extend(text.split('|'));
    fields
}
44
45/// Labels the parts of the pipe-split string using MMI field labels.
46/// Returns a hashmap of field names as keys and their values from the vector.
47fn label_mmi_parts(parts: Vec<&str>) -> Result<HashMap<&str, &str>> {
48    if parts.len() != 10 {
49        println!(
50            "Record is not of the right length, expected 10 pipe-separated components, found {}",
51            parts.len()
52        );
53        return Err(Box::new(ValueError));
54    }
55    let mut map = HashMap::new();
56    map.insert("id", parts[0]);
57    map.insert("mmi", parts[1]);
58    map.insert("score", parts[2]);
59    map.insert("name", parts[3]);
60    map.insert("cui", parts[4]);
61    map.insert("semantic_types", parts[5]);
62    map.insert("triggers", parts[6]);
63    map.insert("location", parts[7]);
64    map.insert("positional_info", parts[8]);
65    map.insert("tree_codes", parts[9]);
66    Ok(map)
67}
68
/// Turns the semantic-types field (e.g. `[euka,helalo]`) into a list of
/// abbreviations.
///
/// Leading `[` and trailing `]` characters are stripped, then the remainder
/// is split on commas. An empty field therefore yields one empty string.
fn parse_semantic_types(semantic_types: &str) -> Vec<String> {
    let inner = semantic_types
        .trim_start_matches('[')
        .trim_end_matches(']');
    let mut abbreviations = Vec::new();
    for abbreviation in inner.split(',') {
        abbreviations.push(String::from(abbreviation));
    }
    abbreviations
}
75
/// Enumeration for Location options.
///
/// Each variant corresponds to one of the location labels accepted by this
/// type's [`FromStr`] implementation.
#[derive(PartialEq, Eq, Debug, Serialize, Deserialize)]
pub enum Location {
    /// Parsed from the label `TI` (presumably "title" — confirm against the MMI reference).
    TI,
    /// Parsed from the label `AB` (presumably "abstract" — confirm against the MMI reference).
    AB,
    /// Parsed from the label `TX` (presumably free text — confirm against the MMI reference).
    TX,
    /// Parsed from the combined label `TI;AB`.
    Tiab,
}
84
85impl FromStr for Location {
86    type Err = ValueError;
87    /// Parses a Location type from a string reference.
88    fn from_str(s: &str) -> std::result::Result<Location, ValueError> {
89        match s.to_uppercase().as_str() {
90            "TI" => Ok(Location::TI),
91            "AB" => Ok(Location::AB),
92            "TX" => Ok(Location::TX),
93            "TI;AB" => Ok(Location::Tiab),
94            _ => Err(ValueError),
95        }
96    }
97}
98
/// Splits the tree-codes field on semicolons.
///
/// Returns `None` when the field is empty (the record has no tree codes),
/// otherwise `Some` with every code as an owned string.
fn parse_tree_codes(codes: &str) -> Option<Vec<String>> {
    match codes {
        "" => None,
        listed => Some(listed.split(';').map(String::from).collect()),
    }
}
108
/// Utility function for splitting a string reference on a given pattern
/// while *ignoring* occurrences of the pattern inside double quotes.
///
/// This was necessary due to MMI output containing literal-quoted strings with
/// split characters ("," or "-") inside them.
///
/// Quote characters are kept in the returned parts. An empty input yields an
/// empty vector, and a trailing separator does not produce a trailing empty
/// part. Text after the last separator is always returned, including when it
/// ends with a closing quote.
fn split_with_quote_context(x: &str, pattern: char) -> Vec<String> {
    let mut is_in_quotes = false;
    let mut start_position = 0;
    let mut parts: Vec<String> = Vec::new();
    // `char_indices` yields byte offsets, which is what string slicing needs;
    // a plain character counter would mis-slice (or panic) on non-ASCII text.
    for (offset, c) in x.char_indices() {
        if c == '\"' {
            is_in_quotes = !is_in_quotes;
        } else if c == pattern && !is_in_quotes {
            parts.push(x[start_position..offset].to_string());
            start_position = offset + c.len_utf8();
        }
    }
    // Whatever follows the last separator is the final part.
    if start_position < x.len() {
        parts.push(x[start_position..].to_string());
    }
    parts
}
132
/// Struct to represent Trigger information.
///
/// One value is produced for every entry of the triggers field of an MMI
/// record; see [`Trigger::new`] for how the raw text pieces are converted.
#[derive(PartialEq, Eq, Debug, Serialize, Deserialize)]
pub struct Trigger {
    /// UMLS concept name
    pub name: String,
    /// location of text
    pub loc: Location,
    /// number of the utterance within the location (starting with 1)
    pub loc_position: i32,
    /// the actual text
    pub text: String,
    /// determined by MedPost Tagger or Lexical Lookup
    pub part_of_speech: String,
    /// True if text is considered negated by MetaMap
    pub negation: bool,
}
149
150/// Utility function to convert string reference to boolean.
151///
152/// Will error if string reference is not "1" or "0" because
153/// that is the expected output from MetaMap.
154fn parse_bool(x: &str) -> Result<bool> {
155    match x {
156        "1" => Ok(true),
157        "0" => Ok(false),
158        _ => {
159            println!("Unexpected boolean: {}", x);
160            Err(Box::new(ValueError))
161        }
162    }
163}
164
165impl Trigger {
166    /// New function to initialize a Trigger.
167    pub fn new(
168        n: &str,
169        loc: &str,
170        loc_pos: &str,
171        t: &str,
172        part_of_speech: &str,
173        neg: bool,
174    ) -> Trigger {
175        Trigger {
176            name: n.replace('\"', ""),
177            loc: Location::from_str(loc).expect("unable to parse Location"),
178            loc_position: loc_pos
179                .parse::<i32>()
180                .expect("unable to parse integer from location"),
181            text: t.replace('\"', ""),
182            part_of_speech: part_of_speech.replace('\"', ""),
183            negation: neg,
184        }
185    }
186}
187
188/// Parses [`Trigger`] instances from string reference.
189fn parse_triggers(info: &str) -> Result<Vec<Trigger>> {
190    let mut triggers: Vec<Trigger> = Vec::new();
191    let trigger_list = split_with_quote_context(info, ',');
192    for t in trigger_list {
193        let clean = t.trim_start_matches('[').trim_end_matches(']');
194        let parts = split_with_quote_context(clean, '-');
195        if parts.len() != 6 {
196            println!(
197                "Trigger format does not make sense, expected sextuple (6), got {} parts instead.",
198                &parts.len()
199            );
200            return Err(Box::new(ValueError));
201        } else {
202            // valid shape
203            let negation = parse_bool(&parts[5])?;
204            let trigger = Trigger::new(
205                &parts[0], &parts[1], &parts[2], &parts[3], &parts[4], negation,
206            );
207            triggers.push(trigger)
208        }
209    }
210    Ok(triggers)
211}
212
/// Splits on commas *not* inside brackets.
/// Similar to [split_with_quote_context] except applies to brackets instead of quotes.
///
/// Brackets are kept in the returned parts. An empty input yields an empty
/// vector, and a trailing comma does not produce a trailing empty part. Text
/// after the last top-level comma is always returned, whether or not it ends
/// with `]`.
fn split_with_bracket_context(x: &str) -> Vec<String> {
    let mut is_in_brackets = false;
    let mut start_position = 0;
    let mut parts: Vec<String> = Vec::new();
    // `char_indices` yields byte offsets, which is what string slicing needs.
    for (offset, c) in x.char_indices() {
        match c {
            '[' | ']' => is_in_brackets = !is_in_brackets,
            ',' if !is_in_brackets => {
                parts.push(x[start_position..offset].to_string());
                start_position = offset + 1;
            }
            _ => {}
        }
    }
    // Whatever follows the last top-level comma is the final part.
    if start_position < x.len() {
        parts.push(x[start_position..].to_string());
    }
    parts
}
236
/// Reads the slash-separated integers of one (optionally bracketed) entry.
/// Used in [parse_positional_info]
///
/// Leading `[` and trailing `]` are stripped before splitting on `/`. The
/// first piece that is not a valid `i32` aborts parsing with its error.
fn parse_bracketed_info(x: &str) -> result::Result<Vec<i32>, ParseIntError> {
    let inner = x.trim_start_matches('[').trim_end_matches(']');
    let mut numbers = Vec::new();
    for piece in inner.split('/') {
        numbers.push(piece.parse::<i32>()?);
    }
    Ok(numbers)
}
247
/// Positional Information type options
///
/// The variant is chosen by [categorize_positional_info] from the bracket and
/// comma flags computed by [tag_pos_info].
#[derive(Debug, PartialEq, Eq, Serialize, Deserialize)]
pub enum PositionalInfoType {
    /// No commas at all, neither inside nor outside brackets.
    A,
    /// Commas present outside brackets, and no brackets in the field.
    B,
    /// Brackets present, with commas outside them only.
    C,
    /// Brackets present, with commas both inside and outside them.
    D,
}
256
/// Scans a positional-information field and reports which of the structural
/// features used by conditions 9a-9d of the reference
/// [document](https://lhncbc.nlm.nih.gov/ii/tools/MetaMap/Docs/MMI_Output_2016.pdf)
/// it contains.
///
/// Returns `(has_brackets, has_comma_inside_brackets, has_comma_outside_brackets)`.
fn tag_pos_info(x: &str) -> (bool, bool, bool) {
    let mut inside = false;
    let mut saw_bracket = false;
    let mut comma_inside = false;
    let mut comma_outside = false;
    for symbol in x.chars() {
        match symbol {
            '[' => {
                saw_bracket = true;
                inside = true;
            }
            ']' => inside = false,
            ',' if inside => comma_inside = true,
            ',' => comma_outside = true,
            _ => {}
        }
    }
    (saw_bracket, comma_inside, comma_outside)
}
284
285/// Categorizes the positional information tagged from
286/// [tag_pos_info] into a specific category.
287fn categorize_positional_info(
288    has_brackets: bool,
289    has_comma_inside_brackets: bool,
290    has_comma_outside_brackets: bool,
291) -> Result<PositionalInfoType> {
292    if !has_comma_outside_brackets && !has_comma_inside_brackets {
293        Ok(PositionalInfoType::A)
294    } else if !has_brackets && has_comma_outside_brackets {
295        Ok(PositionalInfoType::B)
296    } else if has_brackets && has_comma_outside_brackets && !has_comma_inside_brackets {
297        Ok(PositionalInfoType::C)
298    } else if has_brackets && has_comma_outside_brackets && has_comma_inside_brackets {
299        Ok(PositionalInfoType::D)
300    } else {
301        println!("could not parse positional information.");
302        Err(Box::new(ValueError))
303    }
304}
305
/// Structure for Position representing start index, length, and Position Type.
///
/// Values are produced by [parse_positional_info], one per `start/length`
/// pair found in the positional-information field.
#[derive(PartialEq, Eq, Debug, Serialize, Deserialize)]
pub struct Position {
    /// Start position
    pub start: i32,
    /// Length of matched text
    pub length: i32,
    /// Type of match
    pub case: PositionalInfoType,
}
316
317impl Position {
318    /// Initialize new position.
319    pub fn new(start: i32, length: i32, case: PositionalInfoType) -> Position {
320        Position {
321            start,
322            length,
323            case,
324        }
325    }
326}
327
328/// Simple utility function to check whether
329pub fn check_parts(parts: &[&str]) -> Result<()> {
330    if parts.len() != 2 {
331        return Err(Box::new(ValueError));
332    }
333    Ok(())
334}
335
336pub fn parse_position_parts(position_str: &str, case: PositionalInfoType) -> Result<Position> {
337    let parts = position_str.split('/').collect::<Vec<&str>>();
338    check_parts(&parts)?;
339    let p1 = parts[0].parse::<i32>()?;
340    let p2 = parts[1].parse::<i32>()?;
341    Ok(Position::new(p1, p2, case))
342}
343
344/// Parses out a Vector of [`Position`] types from a string reference.
345fn parse_positional_info(info: &str) -> Result<Vec<Position>> {
346    let tags = tag_pos_info(info);
347    let category = categorize_positional_info(tags.0, tags.1, tags.2)?;
348    let mut positions: Vec<Position> = Vec::new();
349    match category {
350        PositionalInfoType::A => {
351            for section in info.split(';') {
352                let p = parse_position_parts(section, PositionalInfoType::A)?;
353                positions.push(p);
354            }
355            Ok(positions)
356        }
357        PositionalInfoType::B => {
358            for section in info.split(';') {
359                for subsection in section.split(',') {
360                    let p = parse_position_parts(subsection, PositionalInfoType::B)?;
361                    positions.push(p);
362                }
363            }
364            Ok(positions)
365        }
366        PositionalInfoType::C => {
367            for section in info.split(';') {
368                for subsection in section.split(',') {
369                    let parts = parse_bracketed_info(subsection)?;
370                    let p = Position::new(parts[0], parts[1], PositionalInfoType::C);
371                    positions.push(p);
372                }
373            }
374            Ok(positions)
375        }
376        PositionalInfoType::D => {
377            for section in info.split(';') {
378                for subsection in split_with_bracket_context(section) {
379                    for underground in subsection.split(',') {
380                        let parts = parse_bracketed_info(underground)?;
381                        let p = Position::new(parts[0], parts[1], PositionalInfoType::D);
382                        positions.push(p);
383                    }
384                }
385            }
386            Ok(positions)
387        }
388    }
389}
390
/// Main struct for entire library.
/// Represents an entire fielded MMI record as one type.
///
/// Built by [`MmiOutput::assemble`] from the ten labeled fields of a record.
#[derive(Debug, PartialEq, Serialize, Deserialize)]
pub struct MmiOutput {
    /// unique identifier
    pub id: String,
    /// always MMI
    pub mmi: String,
    /// score of concept relevance, 0-1000, 1000 being perfect
    pub score: f64,
    /// name of the concept matched
    pub name: String,
    /// CUI for identified UMLS concept
    pub cui: String,
    /// Semantic Type abbreviations
    pub semantic_types: Vec<String>,
    /// Triggers for MMI to flag this concept
    pub triggers: Vec<Trigger>,
    /// Location of concept
    pub location: Location,
    /// Positional information of concept
    pub positional_info: Vec<Position>,
    /// Optional MeSH [tree code(s)](https://www.nlm.nih.gov/mesh/meshhome.html);
    /// `None` when the field is empty
    pub tree_codes: Option<Vec<String>>,
}
416
417impl MmiOutput {
418    /// Parses a hashmap into MMiOutput field types.
419    /// Utilizes all other functionality defined in this module
420    /// to assemble/parse each field into its appropriate format and types.
421    ///
422    /// While this function is useful for building [`MmiOutput`] types,
423    /// [`parse_record`] will probably be **much** more practical since it
424    /// accepts a string reference and does the field tagging/mapping for you.
425    pub fn assemble(parts: HashMap<&str, &str>) -> Result<Self> {
426        // does not use `parts.get(<key>)` because WE made the keys so WE
427        // know they exist
428        let id = parts["id"].to_string();
429        let mmi = parts["mmi"].to_string();
430        let score = parts["score"].parse::<f64>()?;
431        let name = parts["name"].to_string();
432        let cui = parts["cui"].to_string();
433        let source_sem_types = parts["semantic_types"].to_string();
434        let semantic_types = parse_semantic_types(&source_sem_types);
435        let source_triggers = parts["triggers"].to_string();
436        let triggers = parse_triggers(&source_triggers)?;
437        let source_location = parts["location"].to_string();
438        let location = Location::from_str(&source_location)?;
439        let source_positions = parts["positional_info"].to_string();
440        let positional_info = parse_positional_info(&source_positions)?;
441        let source_tree_codes = parts["tree_codes"].to_string();
442        let tree_codes = parse_tree_codes(&source_tree_codes);
443        let mmi_output = MmiOutput {
444            id,
445            mmi,
446            score,
447            name,
448            cui,
449            semantic_types,
450            triggers,
451            location,
452            positional_info,
453            tree_codes,
454        };
455        Ok(mmi_output)
456    }
457}
458
/// Which type of abbreviation (AA) record exists, either AA or UA (user-defined)
///
/// Parsed case-insensitively from the labels `AA` and `UA` by this type's
/// [`FromStr`] implementation.
#[derive(PartialEq, Eq, Debug, Serialize, Deserialize)]
pub enum AbbreviationType {
    /// MetaMap Acronyms and Abbreviations
    AA,
    /// User defined Acronyms and Abbreviations
    UA,
}
467
468impl FromStr for AbbreviationType {
469    type Err = ValueError;
470    /// Parses an Abbreviation Type from a string reference.
471    fn from_str(s: &str) -> std::result::Result<AbbreviationType, ValueError> {
472        match s.to_uppercase().as_str() {
473            "AA" => Ok(AbbreviationType::AA),
474            "UA" => Ok(AbbreviationType::UA),
475            _ => Err(ValueError),
476        }
477    }
478}
479
/// Abbreviation and Acronym position information
///
/// [`AaOutput::assemble`] fills this from the record's positional field by
/// splitting it on `:`.
#[derive(PartialEq, Eq, Debug, Serialize, Deserialize)]
pub struct AaPosInfo {
    /// First number of the positional field (the part before `:`)
    pub start: i32,
    /// Second number of the positional field (the part after `:`)
    pub length: i32,
}
486
487impl AaPosInfo {
488    /// New function to create positional info type from two str references
489    pub fn new(s: i32, l: i32) -> Self {
490        AaPosInfo {
491            start: s,
492            length: l,
493        }
494    }
495}
496
/// Main "Secondary" struct of program
/// Acronyms and Abbreviations detected by MetaMap
///
/// Built by [`AaOutput::assemble`] from the nine labeled fields of a record.
#[derive(PartialEq, Eq, Debug, Serialize, Deserialize)]
pub struct AaOutput {
    /// Unique identifier
    pub id: String,
    /// Abbreviation type: either MetaMap defined or User-defined
    pub abbreviation_type: AbbreviationType,
    /// Short form of the acronym/abbreviation
    pub short_form: String,
    /// Long form or expansion
    pub long_form: String,
    /// number of tokens (including whitespace) in short form
    pub short_token_count: i32,
    /// number of characters in short form
    pub short_character_count: i32,
    /// number of tokens (including whitespace) in long form
    pub long_token_count: i32,
    /// number of characters in long form
    pub long_character_count: i32,
    /// starting position of short form followed by ":" followed by character length of short form
    pub positional_info: AaPosInfo,
}
520
521impl AaOutput {
522    /// Parses a hashmap into AaOutput field types.
523    /// Utilizes all other functionality defined in this module
524    /// to assemble/parse each field into its appropriate format and types.
525    ///
526    /// While this function is useful for building [`AaOutput`] types,
527    /// [`parse_record`] will probably be **much** more practical since it
528    /// accepts a string reference and does the field tagging/mapping for you.
529    pub fn assemble(parts: HashMap<&str, &str>) -> Result<Self> {
530        // does not use `parts.get(<key>)` because WE made the keys so WE
531        // know they exist
532        let id = parts["id"].to_string();
533        let abbreviation_type = AbbreviationType::from_str(parts["abbreviation_type"])?;
534        let short_form = parts["short_form"].to_string();
535        let long_form = parts["long_form"].to_string();
536        let short_token_count = parts["short_token_count"].parse::<i32>()?;
537        let short_character_count = parts["short_character_count"].parse::<i32>()?;
538        let long_token_count = parts["long_token_count"].parse::<i32>()?;
539        let long_character_count = parts["long_character_count"].parse::<i32>()?;
540        let position_parts = parts["positional_info"].split(':').collect::<Vec<&str>>();
541        let pp1 = position_parts[0].parse::<i32>()?;
542        let pp2 = position_parts[1].parse::<i32>()?;
543        let positional_info = AaPosInfo::new(pp1, pp2);
544        let aa_output = AaOutput {
545            id,
546            abbreviation_type,
547            short_form,
548            long_form,
549            short_token_count,
550            short_character_count,
551            long_token_count,
552            long_character_count,
553            positional_info,
554        };
555        Ok(aa_output)
556    }
557}
558
559/// Labels AA records with the corresponding field names
560pub fn label_aa_parts(parts: Vec<&str>) -> Result<HashMap<&str, &str>> {
561    if parts.len() != 9 {
562        return Err(Box::new(ValueError));
563    }
564    let mut map: HashMap<&str, &str> = HashMap::new();
565    map.insert("id", parts[0]);
566    map.insert("abbreviation_type", parts[1]);
567    map.insert("short_form", parts[2]);
568    map.insert("long_form", parts[3]);
569    map.insert("short_token_count", parts[4]);
570    map.insert("short_character_count", parts[5]);
571    map.insert("long_token_count", parts[6]);
572    map.insert("long_character_count", parts[7]);
573    map.insert("positional_info", parts[8]);
574    Ok(map)
575}
576
/// A parsed record, as returned by [`parse_record`].
#[derive(Serialize, Deserialize, Debug)]
pub enum Output {
    /// A fielded MMI record (second field is `MMI`).
    MMI(MmiOutput),
    /// An acronym/abbreviation record (second field is `AA` or `UA`).
    AA(AaOutput),
}
582
583/// A better alternative to [`MmiOutput::assemble`] or [`AaOutput::assemble`]
584/// Takes a string reference, splits it on vertical bar (pipe) characters,
585/// labels each item with its corresponding field name,
586/// passes labeled data into [`MmiOutput::assemble`] or [`AaOutput::assemble`].
587///
588/// This is used to scan over lines in fielded MMI output text files in the main CLI.
589/// It detects whether the record is MMI or not by looking at the second item in the pipe-delimited
590/// vector and whether it matches MMI, AA/UA, or neither.
591///
592/// Arguments:
593/// * text: a string reference representing a single line of MMI/AA output
594///
595/// Returns:
596/// * Result<Output, ValueError>: An enumeration with MMI::MmiOutput and AA::AaOutput options. Could return
597/// error if a valid option is not found in the second vector position.
598///
599/// This effectively converts *each* fielded MMI **line** into an [`Output`] of either MMI or AA type.
600/// For example:
601///
602/// ```rust
603/// use std::io::{BufReader, BufRead};
604/// use std::fs::File;
605///
606/// let file = File::open("data/MMI_sample.txt").unwrap();
607/// // or for AA records
608/// // let file = File::open("data/AA_sample.txt".unwrap());
609/// let reader = BufReader::new(file);
610///
611/// for line in reader.lines() {
612///     let record = line.unwrap();
613///     let result = mmi_parser::parse_record(record.as_str());
614///     println!("{:?}", result.unwrap()); // must use debug
615/// }
616
617/// ```
618pub fn parse_record(text: &str) -> Result<Output> {
619    let parts = split_text(text);
620    // only 2 valid length options, easy to stop early
621    if parts.len() != 10 && parts.len() != 9 {
622        return Err(Box::new(ValueError));
623    }
624    match parts[1].to_ascii_uppercase().as_str() {
625        "MMI" => {
626            let fields = label_mmi_parts(parts)?;
627            let output = MmiOutput::assemble(fields)?;
628            Ok(Output::MMI(output))
629        }
630        "AA" | "UA" => {
631            let fields = label_aa_parts(parts)?;
632            let output = AaOutput::assemble(fields)?;
633            Ok(Output::AA(output))
634        }
635        _ => Err(Box::new(ValueError)),
636    }
637}
638
639#[cfg(test)]
640mod tests {
641    use core::panic;
642
643    use super::*;
644
645    #[test]
646    fn test_parse_bool() {
647        assert!(parse_bool("1").unwrap());
648        assert!(!parse_bool("0").unwrap());
649        assert!(parse_bool("2").is_err());
650    }
651
652    #[test]
653    fn test_split_with_bracket_context() {
654        let s1 = "[4061/10,4075/11],[4061/10,4075/11]";
655        let r1 = split_with_bracket_context(s1);
656        assert_eq!(r1, vec!["[4061/10,4075/11]", "[4061/10,4075/11]"])
657    }
658
659    // this is a beefy integration test of the
660    // `tag_pos_info` and the `categorize_positional_info` functions
661    #[test]
662    fn test_pos_info_categorization() {
663        // ex 1 type C
664        let s1 = "[4061/10,4075/11],[4061/10,4075/11]";
665        let r1 = tag_pos_info(s1);
666        let cat = categorize_positional_info(r1.0, r1.1, r1.2);
667
668        assert_eq!(r1, (true, true, true));
669        assert_eq!(cat.unwrap(), PositionalInfoType::D);
670
671        let s1 = "117/5;122/4";
672        let r1 = tag_pos_info(s1);
673        let cat = categorize_positional_info(r1.0, r1.1, r1.2);
674
675        assert_eq!(r1, (false, false, false));
676        assert_eq!(cat.unwrap(), PositionalInfoType::A);
677
678        let s1 = "117/5";
679        let r1 = tag_pos_info(s1);
680        let cat = categorize_positional_info(r1.0, r1.1, r1.2);
681
682        assert_eq!(r1, (false, false, false));
683        assert_eq!(cat.unwrap(), PositionalInfoType::A);
684
685        let s1 = "117/5,122/4,113/2";
686        let r1 = tag_pos_info(s1);
687        let cat = categorize_positional_info(r1.0, r1.1, r1.2);
688
689        assert_eq!(r1, (false, false, true));
690        assert_eq!(cat.unwrap(), PositionalInfoType::B);
691
692        let s1 = "[122/4],[117/6]";
693        let r1 = tag_pos_info(s1);
694        let cat = categorize_positional_info(r1.0, r1.1, r1.2);
695
696        assert_eq!(r1, (true, false, true));
697        assert_eq!(cat.unwrap(), PositionalInfoType::C);
698
699        let r1 = categorize_positional_info(true, true, false);
700        assert!(r1.is_err());
701    }
702
703    #[test]
704    fn test_quote_splitter() {
705        let sample = "[\"Drug, NOS\"-tx-33-\"medicine\"-noun-0,\"Drug, NOS\"-tx-31-\"medicine\"-noun-0,\"Drug - NOS\"-tx-29-\"medication\"-noun-0,\"Drug, NOS\"-tx-5-\"drug\"-noun-0]";
706        let r = split_with_quote_context(sample, ',');
707        assert_eq!(r.len(), 4);
708        for x in r {
709            let r2 = split_with_quote_context(&x, '-');
710            assert_eq!(6, r2.len()); // sextuple
711        }
712    }
713
714    #[test]
715    fn test_split_text() {
716        let sample = "24119710|MMI|637.30|Isopoda|C0598806|[euka]|";
717        assert_eq!(
718            split_text(sample),
719            ["24119710", "MMI", "637.30", "Isopoda", "C0598806", "[euka]", ""]
720        );
721    }
722
723    #[test]
724    fn test_name_parts() {
725        let sample = "24119710|MMI|637.30|Isopoda|C0598806|[euka]|[\"Isopod\"-ab-1-\"isopod\"-adj-0,\"Isopoda\"-ti-1-\"Isopoda\"-noun-0]|TI;AB|228/6;136/7|B01.050.500.131.365.400";
726        let split = split_text(sample);
727        assert_eq!(label_mmi_parts(split).unwrap(), {
728            let mut map = HashMap::new();
729            map.insert("id", "24119710");
730            map.insert("mmi", "MMI");
731            map.insert("score", "637.30");
732            map.insert("name", "Isopoda");
733            map.insert("cui", "C0598806");
734            map.insert("semantic_types", "[euka]");
735            map.insert(
736                "triggers",
737                "[\"Isopod\"-ab-1-\"isopod\"-adj-0,\"Isopoda\"-ti-1-\"Isopoda\"-noun-0]",
738            );
739            map.insert("location", "TI;AB");
740            map.insert("positional_info", "228/6;136/7");
741            map.insert("tree_codes", "B01.050.500.131.365.400");
742            map
743        });
744        let split = split_text(sample);
745        assert!(label_mmi_parts(split[0..5].to_vec()).is_err());
746    }
747
748    #[test]
749    fn test_parse_semantic_types() {
750        let sample = "[euka,helalo]";
751        assert_eq!(parse_semantic_types(sample), ["euka", "helalo"]);
752    }
753
754    #[test]
755    fn test_location() {
756        let sample = "ti";
757        assert_eq!(
758            Location::from_str(sample.to_uppercase().as_str()).unwrap(),
759            Location::TI
760        );
761        let sample = "AB";
762        assert_eq!(Location::from_str(sample).unwrap(), Location::AB);
763        let sample = "TX";
764        assert_eq!(Location::from_str(sample).unwrap(), Location::TX);
765        let sample = "TI;AB";
766        assert_eq!(Location::from_str(sample).unwrap(), Location::Tiab);
767    }
768    #[test]
769    #[should_panic]
770    fn test_invalid_location() {
771        let sample = "BG";
772        assert_eq!(Location::from_str(sample).unwrap(), Location::Tiab);
773    }
774
775    #[test]
776    fn test_parse_tree_codes() {
777        let sample = "";
778        assert_eq!(parse_tree_codes(sample), None);
779        let sample = "B01.050.500.131.365.400";
780        assert_eq!(
781            parse_tree_codes(sample),
782            Some(vec![String::from("B01.050.500.131.365.400")])
783        );
784        let sample = "B01.050.500.131.365.400;B01.050.500.131.365.400";
785        assert_eq!(
786            parse_tree_codes(sample),
787            Some(vec![
788                "B01.050.500.131.365.400".to_string(),
789                "B01.050.500.131.365.400".to_string()
790            ])
791        );
792    }
793
794    #[test]
795    fn test_parse_positional_info() {
796        let sample = "228/6;136/7";
797        assert_eq!(
798            parse_positional_info(sample).unwrap(),
799            vec![
800                Position::new(228, 6, PositionalInfoType::A),
801                Position::new(136, 7, PositionalInfoType::A)
802            ]
803        );
804        let s1 = "[4061/10,4075/11],[4061/10,4075/11]";
805        assert_eq!(
806            parse_positional_info(s1).unwrap(),
807            vec![
808                Position::new(4061, 10, PositionalInfoType::D),
809                Position::new(4075, 11, PositionalInfoType::D),
810                Position::new(4061, 10, PositionalInfoType::D),
811                Position::new(4075, 11, PositionalInfoType::D),
812            ]
813        );
814        let s1 = "7059/5,7073/5";
815        assert_eq!(
816            parse_positional_info(s1).unwrap(),
817            vec![
818                Position::new(7059, 5, PositionalInfoType::B),
819                Position::new(7073, 5, PositionalInfoType::B),
820            ]
821        );
822        let s1 = "[1351/8],[1437/8]";
823        assert_eq!(
824            parse_positional_info(s1).unwrap(),
825            vec![
826                Position::new(1351, 8, PositionalInfoType::C),
827                Position::new(1437, 8, PositionalInfoType::C),
828            ]
829        );
830    }
831
832    #[test]
833    fn test_new_trigger() {
834        let t = ("hi", "tI;aB", "124", "fun times", "testing stuff", true);
835        let tt = Trigger::new(t.0, t.1, t.2, t.3, t.4, t.5);
836        let actual_tt = Trigger {
837            name: String::from("hi"),
838            loc: Location::Tiab,
839            loc_position: 124,
840            text: "fun times".to_string(),
841            part_of_speech: "testing stuff".to_string(),
842            negation: true,
843        };
844        assert_eq!(tt, actual_tt);
845    }
846
847    #[test]
848    fn test_parse_triggers() {
849        let sample = "[\"Crustacea\"-ti-1-\"Crustacea\"-noun-0]";
850        let result = parse_triggers(sample);
851        assert_eq!(
852            result.unwrap(),
853            [Trigger {
854                name: "Crustacea".to_string(),
855                loc: Location::TI,
856                loc_position: 1,
857                text: "Crustacea".to_string(),
858                part_of_speech: "noun".to_string(),
859                negation: false
860            }]
861        );
862        let s2 = "[\"Crustacea\"-ti-1-\"Crustacea\"-noun";
863        assert!(parse_triggers(s2).is_err());
864    }
865
866    #[test]
867    fn test_new_mmi() {
868        let mut map = HashMap::new();
869        map.insert("id", "24119710");
870        map.insert("mmi", "MMI");
871        map.insert("score", "637.30");
872        map.insert("name", "Isopoda");
873        map.insert("cui", "C0598806");
874        map.insert("semantic_types", "[euka]");
875        map.insert(
876            "triggers",
877            "[\"Isopod\"-ab-1-\"isopod\"-adj-0,\"Isopoda\"-ti-1-\"Isopoda\"-noun-0]",
878        );
879        map.insert("location", "TI;AB");
880        map.insert("positional_info", "228/6;136/7");
881        map.insert("tree_codes", "B01.050.500.131.365.400");
882        let expected = MmiOutput {
883            id: "24119710".to_string(),
884            mmi: "MMI".to_string(),
885            score: 637.30,
886            name: "Isopoda".to_string(),
887            cui: "C0598806".to_string(),
888            semantic_types: vec!["euka".to_string()],
889            triggers: vec![
890                Trigger {
891                    name: "Isopod".to_string(),
892                    loc: Location::AB,
893                    loc_position: 1,
894                    text: "isopod".to_string(),
895                    part_of_speech: "adj".to_string(),
896                    negation: false,
897                },
898                Trigger {
899                    name: "Isopoda".to_string(),
900                    loc: Location::TI,
901                    loc_position: 1,
902                    text: "Isopoda".to_string(),
903                    part_of_speech: "noun".to_string(),
904                    negation: false,
905                },
906            ],
907            location: Location::Tiab,
908            positional_info: vec![
909                Position {
910                    start: 228,
911                    length: 6,
912                    case: PositionalInfoType::A,
913                },
914                Position {
915                    start: 136,
916                    length: 7,
917                    case: PositionalInfoType::A,
918                },
919            ],
920            tree_codes: Some(vec!["B01.050.500.131.365.400".to_string()]),
921        };
922        assert_eq!(expected, MmiOutput::assemble(map).unwrap());
923    }
924
925    #[test]
926    fn test_parse_mmi_for_mmi() {
927        let s1 = "3124119710|MMI|637.30|Isopoda|C0598806|[euka]|[\"Isopod\"-ab-1-\"isopod\"-adj-0,\"Isopoda\"-ti-1-\"Isopoda\"-noun-0]|TI;AB|228/6;136/7|B01.050.500.131.365.400";
928        let expected = MmiOutput {
929            id: "3124119710".to_string(),
930            mmi: "MMI".to_string(),
931            score: 637.3,
932            name: "Isopoda".to_string(),
933            cui: "C0598806".to_string(),
934            semantic_types: vec!["euka".to_string()],
935            triggers: vec![
936                Trigger {
937                    name: "Isopod".to_string(),
938                    loc: Location::AB,
939                    loc_position: 1,
940                    text: "isopod".to_string(),
941                    part_of_speech: "adj".to_string(),
942                    negation: false,
943                },
944                Trigger {
945                    name: "Isopoda".to_string(),
946                    loc: Location::TI,
947                    loc_position: 1,
948                    text: "Isopoda".to_string(),
949                    part_of_speech: "noun".to_string(),
950                    negation: false,
951                },
952            ],
953            location: Location::Tiab,
954            positional_info: vec![
955                Position {
956                    start: 228,
957                    length: 6,
958                    case: PositionalInfoType::A,
959                },
960                Position {
961                    start: 136,
962                    length: 7,
963                    case: PositionalInfoType::A,
964                },
965            ],
966            tree_codes: Some(vec!["B01.050.500.131.365.400".to_string()]),
967        };
968        let parsed = match parse_record(s1).unwrap() {
969            Output::MMI(x) => x,
970            _ => panic!("stuff"),
971        };
972        assert_eq!(parsed, expected);
973    }
974
975    #[test]
976    fn test_parse_mmi_for_aa() {
977        let s1 = "23074487|AA|FY|fiscal years|1|2|3|12|9362:2";
978        let expected = match parse_record(s1).unwrap() {
979            Output::AA(x) => x,
980            _ => panic!("stuff"),
981        };
982        println!("{:?}", expected);
983    }
984
985    #[test]
986    #[should_panic]
987    fn test_panic_parse_mmi() {
988        let s1 = "asda|fake|other stuff|";
989        parse_record(s1).unwrap();
990    }
991
992    #[test]
993    fn test_abbreviation_type() {
994        assert_eq!(
995            AbbreviationType::AA,
996            AbbreviationType::from_str("AA").unwrap()
997        );
998        assert_eq!(
999            AbbreviationType::UA,
1000            AbbreviationType::from_str("UA").unwrap()
1001        );
1002        assert!(AbbreviationType::from_str("asfnkjsanf").is_err())
1003    }
1004
1005    #[test]
1006    fn test_parse_bracketed_info() {
1007        let t = parse_bracketed_info("[12/hi]");
1008        assert!(t.is_err());
1009    }
1010
1011    #[test]
1012    fn test_check_parts() {
1013        assert!(check_parts(&["hi", "bye"]).is_ok());
1014        assert!(check_parts(&["hi", "bye", "see ya"]).is_err());
1015    }
1016
1017    #[test]
1018    fn test_label_aa_parts() {
1019        let sample = vec!["hi", "by", "se", "yA", "later", "alligator"];
1020        assert!(label_aa_parts(sample).is_err());
1021    }
1022
1023    #[test]
1024    fn test_parse_record_fail() {
1025        assert!(parse_record("hi").is_err());
1026    }
1027}
mmi_parser/lib.rs

mmi_parser/
lib.rs