1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
//! Provide a lowercased diacritics-free version of a character or a string.
//!
//! For example return `e` for `é`.
//!
//! Secular's char lookup is an inlined lookup of a static table, which means it's possible to use it in performance sensitive code.
//!
//! Secular also performs (optionally) Unicode normalization.
//!
//! ## Declaration
//!
//! By default, diacritics removal is only done on ASCII chars, so that only a small table is included.
//!
//! If you want to handle the whole BMP, use the "bmp" feature (the binary will be bigger to
//! incorporate the whole mapping).
//!
//! Default import:
//!
//!```toml
//! [dependencies]
//! secular = "0.3"
//! ```
//!
//! For more characters (the BMP):
//!
//!```toml
//![dependencies]
//!secular = { version="0.3", features=["bmp"] }
//! ```
//!
//! With Unicode normalization functions (using the unicode-normalization crate):
//!
//!```toml
//![dependencies]
//!secular = { version="0.3", features=["normalization"] }
//! ```
//!
//! or
//!
//!```toml
//![dependencies]
//!secular = { version="0.3", features=["bmp","normalization"] }
//! ```
//!
//! This feature is optional so that you can avoid importing the unicode-normalization crate (note that it's used in many other crates so it's possible your text processing application already uses it).
//!
//! ## Usage
//!
//! On characters:
//!
//! ```
//! use secular::*;
//! let s = "Comunicações"; // normalized string (length=12)
//! let chars: Vec<char> = s.chars().collect();
//! assert_eq!(chars.len(), 12);
//! assert_eq!(chars[0], 'C');
//! assert_eq!(lower_lay_char(chars[0]), 'c');
//! assert_eq!(chars[8], 'ç');
//! assert_eq!(lower_lay_char(chars[8]), 'c');
//! ```
//!
//! On strings:
//!
//! ```
//! use secular::*;
//! let s = "Comunicações"; // unnormalized string (length=14)
//! assert_eq!(s.chars().count(), 14);
//! let s = normalized_lower_lay_string(s);
//! assert_eq!(s.chars().count(), 12);
//! assert_eq!(s, "comunicacoes");
//! ```

#[cfg(not(feature = "bmp"))]
mod data_ascii;
#[cfg(not(feature = "bmp"))]
use data_ascii::LAY_CHARS;

#[cfg(feature = "bmp")]
mod data_bmp;
#[cfg(feature = "bmp")]
use data_bmp::LAY_CHARS;

#[cfg(feature = "normalization")]
use unicode_normalization::{
    UnicodeNormalization,
};

/// try to return a lowercased diacritics-free version
/// of the character.
#[inline(always)]
pub fn lower_lay_char(c: char) -> char {
    if (c as usize) < LAY_CHARS.len() {
        unsafe {
            *LAY_CHARS.get_unchecked(c as usize)
        }
    } else {
        c
    }
}

/// replace every character with its lowercased diacritics-free equivalent
/// whenever possible.
/// By construct, the resulting string is guaranteed to have the same number
/// of characters as the input one (it may be smaller in bytes but not larger).
/// This function doesn't do any normalization. It's thus necessary to ensure
/// the string is already normalized.
pub fn lower_lay_string(s: &str) -> String {
    s.chars()
        .map(|c| lower_lay_char(c))
        .collect()
}

/// Normalize the string with the `nfc` adaptor of the unicode-normalization
/// crate, then pass every resulting character through [`lower_lay_char`].
///
/// Only available when the "normalization" feature is enabled.
#[cfg(feature = "normalization")]
pub fn normalized_lower_lay_string(s: &str) -> String {
    let composed = s.nfc();
    composed.map(lower_lay_char).collect()
}


// To test, run
//     cargo test --features="bmp, normalization"
#[cfg(all(test, feature="normalization"))]
mod tests {
    use super::*;

    // Both constants render as the same word, but they differ at the code
    // point level. They are spelled with explicit escapes so that an editor
    // or tool normalizing the source file cannot silently turn one form into
    // the other and invalidate the character counts asserted below.

    /// Precomposed form: U+00E7 (c with cedilla) and U+00F5 (o with tilde)
    /// are single code points, giving 12 characters.
    const COMPOSED: &str = "Comunica\u{e7}\u{f5}es";

    /// Decomposed form: `c` + U+0327 (combining cedilla) and `o` + U+0303
    /// (combining tilde), giving 14 characters.
    const DECOMPOSED: &str = "Comunicac\u{327}o\u{303}es";

    /// Character-level lookup on an already normalized string.
    #[test]
    fn test_lower_lay_char() {
        let chars: Vec<char> = COMPOSED.chars().collect();
        assert_eq!(chars.len(), 12);
        assert_eq!(chars[0], 'C');
        assert_eq!(lower_lay_char(chars[0]), 'c');
        assert_eq!(chars[8], '\u{e7}');
        assert_eq!(lower_lay_char(chars[8]), 'c');
    }

    /// String-level conversion must first compose the combining marks, then
    /// strip the diacritics.
    #[test]
    fn test_normalized_lower_lay_string() {
        assert_eq!(DECOMPOSED.chars().count(), 14);
        let s = normalized_lower_lay_string(DECOMPOSED);
        assert_eq!(s.chars().count(), 12);
        assert_eq!(s, "comunicacoes");
    }
}