1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146
//! Provide a lowercased diacritics-free version of a character or a string.
//!
//! For example return `e` for `é`.
//!
//! Secular's char lookup is an inlined lookup of a static table, which means it's possible
//! to use it in performance sensitive code.
//!
//! Secular also performs (optionally) Unicode normalization.
//!
//! ## Declaration
//!
//! By default, diacritics removal is only done on ascii chars, so as to include only a
//! smaller table.
//!
//! If you want to handle the whole BMP, use the "bmp" feature (the binary will be bigger to
//! incorporate the whole mapping).
//!
//! Default import:
//!
//! ```toml
//! [dependencies]
//! secular = "0.3"
//! ```
//!
//! For more characters (the BMP):
//!
//! ```toml
//! [dependencies]
//! secular = { version="0.3", features=["bmp"] }
//! ```
//!
//! With Unicode normalization functions (using the unicode-normalization crate):
//!
//! ```toml
//! [dependencies]
//! secular = { version="0.3", features=["normalization"] }
//! ```
//!
//! or
//!
//! ```toml
//! [dependencies]
//! secular = { version="0.3", features=["bmp","normalization"] }
//! ```
//!
//! This feature is optional so that you can avoid importing the unicode-normalization crate
//! (note that it's used in many other crates so it's possible your text processing
//! application already uses it).
//!
//! ## Usage
//!
//! On characters:
//!
//! ```
//! use secular::*;
//! let s = "Comunicações"; // normalized string (length=12)
//! let chars: Vec<char> = s.chars().collect();
//! assert_eq!(chars.len(), 12);
//! assert_eq!(chars[0], 'C');
//! assert_eq!(lower_lay_char(chars[0]), 'c');
//! assert_eq!(chars[8], 'ç');
//! assert_eq!(lower_lay_char(chars[8]), 'c');
//! ```
//!
//! On strings:
//!
//! ```
//! use secular::*;
//! // unnormalized string (length=14): the cedilla and the tilde are written as
//! // separate combining marks (U+0327, U+0303) following their base letters
//! let s = "Comunicac\u{327}o\u{303}es";
//! assert_eq!(s.chars().count(), 14);
//! let s = normalized_lower_lay_string(s);
//! assert_eq!(s.chars().count(), 12);
//! assert_eq!(s, "comunicacoes");
``` #[cfg(not(feature = "bmp"))] mod data_ascii; #[cfg(not(feature = "bmp"))] use data_ascii::LAY_CHARS; #[cfg(feature = "bmp")] mod data_bmp; #[cfg(feature = "bmp")] use data_bmp::LAY_CHARS; #[cfg(feature = "normalization")] use unicode_normalization::{ UnicodeNormalization, }; /// try to return a lowercased diacritics-free version /// of the character. #[inline(always)] pub fn lower_lay_char(c: char) -> char { if (c as usize) < LAY_CHARS.len() { unsafe { *LAY_CHARS.get_unchecked(c as usize) } } else { c } } /// replace every character with its lowercased diacritics-free equivalent /// whenever possible. /// By construct, the resulting string is guaranteed to have the same number /// of characters as the input one (it may be smaller in bytes but not larger). /// This function doesn't do any normalization. It's thus necessary to ensure /// the string is already normalized. pub fn lower_lay_string(s: &str) -> String { s.chars() .map(|c| lower_lay_char(c)) .collect() } /// normalize the string then replace every character with its /// lowercased diacritics-free equivalent whenever possible. #[cfg(feature = "normalization")] pub fn normalized_lower_lay_string(s: &str) -> String { s.nfc() .map(|c| lower_lay_char(c)) .collect() } // To test, run // cargo test --features="bmp, normalization" #[cfg(all(test, feature="normalization"))] mod tests { use super::*; #[test] fn test_lower_lay_char() { let s = "Comunicações"; // normalized string (length=12 characters) let chars: Vec<char> = s.chars().collect(); assert_eq!(chars.len(), 12); assert_eq!(chars[0], 'C'); assert_eq!(lower_lay_char(chars[0]), 'c'); assert_eq!(chars[8], 'ç'); assert_eq!(lower_lay_char(chars[8]), 'c'); } #[test] fn test_normalized_lower_lay_string() { let s = "Comunicações"; // unnormalized string (length=14 characters) assert_eq!(s.chars().count(), 14); let s = normalized_lower_lay_string(s); assert_eq!(s.chars().count(), 12); assert_eq!(s, "comunicacoes"); } }