// ucd-parse 0.1.13
//
// A library for parsing data files in the Unicode character database.
// Documentation
use std::path::Path;

use crate::{
    common::{Codepoint, CodepointIter, UcdFile, UcdFileByCodepoint},
    error::Error,
};

/// A single row in the `CaseFolding.txt` file.
///
/// The contents of `CaseFolding.txt` are a convenience derived from both
/// `UnicodeData.txt` and `SpecialCasing.txt`.
///
/// Note that a single codepoint may be mapped multiple times. In particular,
/// a single codepoint might have distinct `CaseStatus::Simple` and
/// `CaseStatus::Full` mappings.
///
/// A row is normally built by parsing one line of the file through this
/// type's `FromStr` implementation (e.g. `line.parse::<CaseFold>()`).
#[derive(Clone, Debug, Default, Eq, PartialEq)]
pub struct CaseFold {
    /// The codepoint that is being mapped (the first field of the line).
    pub codepoint: Codepoint,
    /// The case status of this mapping (the second field of the line, one of
    /// the single-letter codes accepted by `CaseStatus`'s `FromStr` impl).
    pub status: CaseStatus,
    /// The actual case mapping, which is more than one codepoint if this is
    /// a "full" mapping. The codepoints appear in the same order as the
    /// whitespace-separated entries of the line's third field.
    pub mapping: Vec<Codepoint>,
}

impl UcdFile for CaseFold {
    /// Returns the relative path of the data file that `CaseFold` rows are
    /// read from, which is simply the file name `CaseFolding.txt`.
    fn relative_file_path() -> &'static Path {
        // Naming the literal makes it obvious which UCD file this type is
        // bound to.
        const FILE_NAME: &str = "CaseFolding.txt";
        Path::new(FILE_NAME)
    }
}

/// Exposes the codepoint(s) that a `CaseFold` row is keyed on.
impl UcdFileByCodepoint for CaseFold {
    /// Returns the iterator obtained from this row's `codepoint` field, i.e.
    /// the codepoint being mapped (not the codepoints in `mapping`).
    ///
    /// NOTE(review): presumably the iterator yields exactly that one
    /// codepoint; confirm against `Codepoint::into_iter` in `common`.
    fn codepoints(&self) -> CodepointIter {
        self.codepoint.into_iter()
    }
}

impl std::str::FromStr for CaseFold {
    type Err = Error;

    fn from_str(line: &str) -> Result<CaseFold, Error> {
        let re_parts = regex!(
            r"(?x)
                ^
                \s*(?P<codepoint>[^\s;]+)\s*;
                \s*(?P<status>[^\s;]+)\s*;
                \s*(?P<mapping>[^;]+)\s*;
                ",
        );

        let caps = match re_parts.captures(line.trim()) {
            Some(caps) => caps,
            None => return err!("invalid CaseFolding line: '{}'", line),
        };
        let mut mapping = vec![];
        for cp in caps["mapping"].split_whitespace() {
            mapping.push(cp.parse()?);
        }
        Ok(CaseFold {
            codepoint: caps["codepoint"].parse()?,
            status: caps["status"].parse()?,
            mapping,
        })
    }
}

/// The status of a particular case mapping.
///
/// Each variant corresponds to one of the single-letter status codes found
/// in the second field of a `CaseFolding.txt` line.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum CaseStatus {
    /// Case mappings shared by both "simple" and "full" mappings.
    Common,
    /// A case mapping that changes the number of codepoints.
    Full,
    /// A case mapping that doesn't change the number of codepoints, when it
    /// differs from `Full`.
    Simple,
    /// Special cases (currently only for Turkic mappings) that are typically
    /// excluded by default. Special cases don't change the number of
    /// codepoints, but may change the encoding (e.g., UTF-8) length in bytes.
    Special,
}

impl Default for CaseStatus {
    /// The default status is `CaseStatus::Common`.
    fn default() -> Self {
        Self::Common
    }
}

impl CaseStatus {
    /// Returns true if and only if this status indicates a case mapping that
    /// won't change the number of codepoints.
    ///
    /// Only `CaseStatus::Full` mappings may change the codepoint count, so
    /// every other status is reported as fixed.
    pub fn is_fixed(&self) -> bool {
        match self {
            CaseStatus::Full => false,
            CaseStatus::Common | CaseStatus::Simple | CaseStatus::Special => {
                true
            }
        }
    }
}

impl std::str::FromStr for CaseStatus {
    type Err = Error;

    /// Converts a single-letter status code into a `CaseStatus`.
    ///
    /// The accepted codes are `C` (common), `F` (full), `S` (simple) and `T`
    /// (special). Matching is exact: no trimming or case folding is applied.
    ///
    /// # Errors
    ///
    /// Returns an error naming the offending input for any other string.
    fn from_str(s: &str) -> Result<CaseStatus, Error> {
        let status = match s {
            "C" => CaseStatus::Common,
            "F" => CaseStatus::Full,
            "S" => CaseStatus::Simple,
            "T" => CaseStatus::Special,
            _ => {
                return err!(
                    "unrecognized case status: '{}' \
                     (must be one of C, F, S or T)",
                    s
                )
            }
        };
        Ok(status)
    }
}

#[cfg(test)]
mod tests {
    use super::{CaseFold, CaseStatus};

    /// Parses one `CaseFolding.txt` line, panicking if the parser rejects it.
    fn parse_row(line: &str) -> CaseFold {
        line.parse().unwrap()
    }

    /// A `C` row maps to `CaseStatus::Common` with a one-codepoint mapping.
    #[test]
    fn parse_common() {
        let CaseFold { codepoint, status, mapping } = parse_row(
            "0150; C; 0151; # LATIN CAPITAL LETTER O WITH DOUBLE ACUTE\n",
        );
        assert_eq!(codepoint, 0x0150);
        assert_eq!(status, CaseStatus::Common);
        assert_eq!(mapping, vec![0x0151]);
    }

    /// An `F` row keeps every codepoint of a multi-codepoint mapping, in
    /// order.
    #[test]
    fn parse_full() {
        let CaseFold { codepoint, status, mapping } = parse_row(
            "03B0; F; 03C5 0308 0301; # GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS\n",
        );
        assert_eq!(codepoint, 0x03B0);
        assert_eq!(status, CaseStatus::Full);
        assert_eq!(mapping, vec![0x03C5, 0x0308, 0x0301]);
    }

    /// An `S` row maps to `CaseStatus::Simple`.
    #[test]
    fn parse_simple() {
        let CaseFold { codepoint, status, mapping } = parse_row(
            "1F8F; S; 1F87; # GREEK CAPITAL LETTER ALPHA WITH DASIA AND PERISPOMENI AND PROSGEGRAMMENI\n",
        );
        assert_eq!(codepoint, 0x1F8F);
        assert_eq!(status, CaseStatus::Simple);
        assert_eq!(mapping, vec![0x1F87]);
    }

    /// A `T` row maps to `CaseStatus::Special`.
    #[test]
    fn parse_special() {
        let CaseFold { codepoint, status, mapping } =
            parse_row("0049; T; 0131; # LATIN CAPITAL LETTER I\n");
        assert_eq!(codepoint, 0x0049);
        assert_eq!(status, CaseStatus::Special);
        assert_eq!(mapping, vec![0x0131]);
    }
}