ferray_strings/case.rs
1// ferray-strings: Case manipulation operations (REQ-5)
2//
3// Implements upper, lower, capitalize, title — elementwise on StringArray.
4//
5// `capitalize` and `title` match Python `str.capitalize`/`str.title`
6// (the semantics `numpy.char`/`numpy.strings` delegate to via `_vec_string`,
7// numpy/_core/strings.py:1239 capitalize, :1282 title). The word-initial
// character uses the Unicode TITLECASE mapping (not uppercase): the ligature
// U+FB01 titlecases to `Fi` (not `FI`), `ß` to `Ss` (not `SS`), and the Lt
// digraph U+01C5 titlecases to itself (not to the uppercase U+01C4). Tail
// characters use full lowercase with final-sigma context (word-final `Σ` ->
// `ς`). Rust's
12// `str::to_lowercase` already implements the Unicode Final_Sigma rule, so the
13// tail is routed through it; only the word-initial titlecase needs a table,
14// since Rust's stdlib has no `to_titlecase`.
15//
16// ## REQ status
17//
18// SHIPPED:
19// - REQ-5 case manipulation — `upper`, `lower`, `capitalize`, `title`
20// (all `pub fn` in this file). `upper`/`lower` route through Rust's
21// Unicode `to_uppercase`/`to_lowercase` (full case folding, e.g.
22// `ß` -> `SS`), matching CPython `str.upper`/`str.lower`.
23// `capitalize`/`title` reproduce CPython `str.capitalize`/`str.title`
24// (the semantics `numpy.char`/`numpy.strings` delegate to via
25// `_vec_string`): word-initial characters use the Unicode TITLECASE
26// mapping, tails use lowercase with the Final_Sigma rule.
27// (`swapcase` lives in `str_ops.rs`.)
28//
29// Consumers (non-test): re-exported from the crate root
30// (`ferray-strings/src/lib.rs` `pub use case::{capitalize, lower, title,
31// upper}`) and bound at the Python surface by the `#[pyfunction]` shims
32// generated via `bind_unary_string_op!(lower, fs::lower)`,
33// `(upper, fs::upper)`, `(capitalize, fs::capitalize)`, `(title, fs::title)`
34// in `ferray-python/src/char.rs`, which back `numpy.char`/`numpy.strings`.
35
36use ferray_core::dimension::Dimension;
37use ferray_core::error::FerrayResult;
38
39use crate::string_array::StringArray;
40
/// Returns the Unicode titlecase mapping of `c` as an owned string.
///
/// Only characters whose titlecase differs from their uppercase are listed
/// explicitly; for every other character titlecase == uppercase, so the
/// fallback arm uses `char::to_uppercase`.
///
/// The listed set is the one CPython reports for
/// `[chr(c) for c in range(0x110000) if chr(c).title() != chr(c).upper()]`,
/// each mapped to `chr(c).title()`: sharp s, the Latin Lt digraphs, the
/// Armenian and Latin ligatures, the Georgian letters that titlecase to
/// themselves, and the Greek letters with ypogegrammeni/prosgegrammeni.
///
/// Every non-ASCII code point is written as a `\u{…}` escape on purpose: the
/// exact code-point sequence matters (e.g. U+1FB7 maps to
/// U+0391 U+0342 U+0345, and U+01C4 maps to the single code point U+01C5), and
/// raw literals can be silently replaced by a canonically or
/// compatibility-equivalent sequence by editors and normalizing tools.
fn titlecase_char(c: char) -> String {
    let mapped: &str = match c {
        // Sharp s: titlecase is "Ss", uppercase would be "SS".
        '\u{DF}' => "Ss",

        // Latin digraphs: the uppercase, titlecase and lowercase forms of each
        // triple all titlecase to the middle (Lt) code point.
        '\u{1C4}'..='\u{1C6}' => "\u{1C5}",
        '\u{1C7}'..='\u{1C9}' => "\u{1C8}",
        '\u{1CA}'..='\u{1CC}' => "\u{1CB}",
        '\u{1F1}'..='\u{1F3}' => "\u{1F2}",

        // Armenian small ligature ech yiwn.
        '\u{587}' => "\u{535}\u{582}",

        // Georgian letters: they titlecase to themselves, so the uppercase
        // fallback must not be used for them.
        '\u{10D0}'..='\u{10FA}' | '\u{10FD}'..='\u{10FF}' => return c.to_string(),

        // Greek U+1F80..=U+1FAF: each row of sixteen holds eight small letters
        // followed by their eight titlecase counterparts, so setting bit 3
        // selects the titlecase form and leaves a titlecase letter unchanged.
        // The result always lies inside the same range, so `from_u32` cannot
        // fail; `unwrap_or(c)` only avoids a panic path.
        '\u{1F80}'..='\u{1FAF}' => {
            return char::from_u32(u32::from(c) | 0x8).unwrap_or(c).to_string();
        }

        // Greek alpha / eta / omega with ypogegrammeni (and accents).
        '\u{1FB2}' => "\u{1FBA}\u{345}",
        '\u{1FB3}' | '\u{1FBC}' => "\u{1FBC}",
        '\u{1FB4}' => "\u{386}\u{345}",
        '\u{1FB7}' => "\u{391}\u{342}\u{345}",
        '\u{1FC2}' => "\u{1FCA}\u{345}",
        '\u{1FC3}' | '\u{1FCC}' => "\u{1FCC}",
        '\u{1FC4}' => "\u{389}\u{345}",
        '\u{1FC7}' => "\u{397}\u{342}\u{345}",
        '\u{1FF2}' => "\u{1FFA}\u{345}",
        '\u{1FF3}' | '\u{1FFC}' => "\u{1FFC}",
        '\u{1FF4}' => "\u{38F}\u{345}",
        '\u{1FF7}' => "\u{3A9}\u{342}\u{345}",

        // Latin ligatures U+FB00..=U+FB06: only the first letter is capital.
        '\u{FB00}' => "Ff",
        '\u{FB01}' => "Fi",
        '\u{FB02}' => "Fl",
        '\u{FB03}' => "Ffi",
        '\u{FB04}' => "Ffl",
        '\u{FB05}' | '\u{FB06}' => "St",

        // Armenian ligatures U+FB13..=U+FB17.
        '\u{FB13}' => "\u{544}\u{576}",
        '\u{FB14}' => "\u{544}\u{565}",
        '\u{FB15}' => "\u{544}\u{56B}",
        '\u{FB16}' => "\u{54E}\u{576}",
        '\u{FB17}' => "\u{544}\u{56D}",

        // Everything else: titlecase equals (full) uppercase.
        _ => return c.to_uppercase().collect(),
    };
    mapped.to_owned()
}
191
/// Reports whether `c` counts as a cased character for `title`'s word-boundary
/// detection (Python's `str.title` treats a character as word-initial when the
/// character before it is not cased).
///
/// `char::is_lowercase` and `char::is_uppercase` cover the `Lowercase` and
/// `Uppercase` properties; the titlecase (`Lt`) letters belong to neither, so
/// they are listed here explicitly: U+01C5, U+01C8, U+01CB, U+01F2 and the
/// Greek titlecase letters.
fn is_cased(c: char) -> bool {
    // The `Lt` letters, which neither std predicate reports.
    let is_titlecase_letter = matches!(
        c,
        '\u{1C5}'
            | '\u{1C8}'
            | '\u{1CB}'
            | '\u{1F2}'
            | '\u{1F88}'..='\u{1F8F}'
            | '\u{1F98}'..='\u{1F9F}'
            | '\u{1FA8}'..='\u{1FAF}'
            | '\u{1FBC}'
            | '\u{1FCC}'
            | '\u{1FFC}'
    );
    is_titlecase_letter || c.is_uppercase() || c.is_lowercase()
}
216
217/// Convert each string element to uppercase.
218///
219/// # Errors
220/// Returns an error if the internal array construction fails.
221///
222/// # Examples
223/// ```ignore
224/// let a = strings::array(&["hello", "world"]).unwrap();
225/// let b = strings::upper(&a).unwrap();
226/// assert_eq!(b.as_slice(), &["HELLO", "WORLD"]);
227/// ```
228pub fn upper<D: Dimension>(a: &StringArray<D>) -> FerrayResult<StringArray<D>> {
229 a.map(str::to_uppercase)
230}
231
232/// Convert each string element to lowercase.
233///
234/// # Errors
235/// Returns an error if the internal array construction fails.
236pub fn lower<D: Dimension>(a: &StringArray<D>) -> FerrayResult<StringArray<D>> {
237 a.map(str::to_lowercase)
238}
239
240/// Capitalize each string element: the first character is titlecased and the
241/// rest is lowercased (final-sigma aware), matching Python `str.capitalize`.
242///
243/// The first character uses the Unicode titlecase mapping (`fi`->`Fi`,
244/// `ß`->`Ss`); the tail is the whole-string lowercase with the first
245/// character's lowercase span removed, so the final-sigma decision sees the
246/// (cased) first character as preceding context.
247///
248/// # Errors
249/// Returns an error if the internal array construction fails.
250pub fn capitalize<D: Dimension>(a: &StringArray<D>) -> FerrayResult<StringArray<D>> {
251 a.map(|s| {
252 let mut chars = s.chars();
253 match chars.next() {
254 None => String::new(),
255 Some(first) => {
256 // `str::to_lowercase` resolves every final-sigma against the
257 // whole-string context; the first character's lowercase is
258 // context-free (nothing precedes it), so slicing it off the
259 // front leaves the correctly-lowered tail.
260 let lowered = s.to_lowercase();
261 let first_lower_len: usize = first.to_lowercase().map(char::len_utf8).sum();
262 let tail = &lowered[first_lower_len..];
263 let head = titlecase_char(first);
264 format!("{head}{tail}")
265 }
266 }
267 })
268}
269
270/// Title-case each string element: each word starts with a titlecased
271/// character and all remaining cased characters are lowercased, matching
272/// Python `str.title` / `numpy.char.title`.
273///
274/// A character is word-initial when the preceding character is not cased
275/// (digits and punctuation are not cased, so a letter following one is
276/// titlecased). The tail uses the whole-string lowercase so a word-final
277/// `Σ` becomes `ς` (final sigma) per the Unicode Final_Sigma rule.
278///
279/// # Errors
280/// Returns an error if the internal array construction fails.
281pub fn title<D: Dimension>(a: &StringArray<D>) -> FerrayResult<StringArray<D>> {
282 a.map(|s| {
283 // Whole-string lowercase resolves final-sigma at every position; each
284 // original character maps to a span of `lowered` whose byte length
285 // equals `char::to_lowercase(c)` (final sigma `ς` and `σ` are both two
286 // bytes, so the only context-sensitive case preserves the span width).
287 let lowered = s.to_lowercase();
288 let mut result = String::with_capacity(lowered.len());
289 let mut previous_is_cased = false;
290 let mut lowered_offset = 0usize;
291 for ch in s.chars() {
292 let lower_len: usize = ch.to_lowercase().map(char::len_utf8).sum();
293 if is_cased(ch) && !previous_is_cased {
294 result.push_str(&titlecase_char(ch));
295 } else {
296 result.push_str(&lowered[lowered_offset..lowered_offset + lower_len]);
297 }
298 lowered_offset += lower_len;
299 previous_is_cased = is_cased(ch);
300 }
301 result
302 })
303}
304
#[cfg(test)]
mod tests {
    use super::*;
    use crate::string_array::array;

    #[test]
    fn test_upper() {
        let input = array(&["hello", "world"]).unwrap();
        let output = upper(&input).unwrap();
        assert_eq!(output.as_slice(), &["HELLO", "WORLD"]);
    }

    #[test]
    fn test_lower() {
        let input = array(&["HELLO", "World"]).unwrap();
        let output = lower(&input).unwrap();
        assert_eq!(output.as_slice(), &["hello", "world"]);
    }

    #[test]
    fn test_capitalize() {
        // Covers a plain word pair, mixed case, and the empty string.
        let input = array(&["hello world", "fOO BAR", ""]).unwrap();
        let output = capitalize(&input).unwrap();
        assert_eq!(output.as_slice(), &["Hello world", "Foo bar", ""]);
    }

    #[test]
    fn test_title() {
        let input = array(&["hello world", "foo bar baz"]).unwrap();
        let output = title(&input).unwrap();
        assert_eq!(output.as_slice(), &["Hello World", "Foo Bar Baz"]);
    }

    #[test]
    fn test_title_mixed_case() {
        let input = array(&["hELLO wORLD"]).unwrap();
        let output = title(&input).unwrap();
        assert_eq!(output.as_slice(), &["Hello World"]);
    }

    #[test]
    fn test_upper_empty() {
        let input = array(&[""]).unwrap();
        let output = upper(&input).unwrap();
        assert_eq!(output.as_slice(), &[""]);
    }

    // ---- Titlecase / final-sigma divergence regression (#915) ----------
    // Expected values come from Python `str.capitalize`/`str.title`, the
    // semantics numpy.char delegates to (R-CHAR-3). Inputs are written with
    // `\u{…}` escapes so the exact code points are unambiguous.

    #[test]
    fn test_capitalize_ligature_titlecase() {
        // Python: '\ufb01nery'.capitalize() == 'Finery'
        // (the ligature U+FB01 titlecases to "Fi", not "FI").
        let input = array(&["\u{FB01}nery"]).unwrap();
        let output = capitalize(&input).unwrap();
        assert_eq!(output.as_slice(), &["Finery"]);
    }

    #[test]
    fn test_capitalize_sharp_s_titlecase() {
        // Python: '\xdfeta'.capitalize() == 'Sseta' (U+00DF -> "Ss", not "SS").
        let input = array(&["\u{DF}eta"]).unwrap();
        let output = capitalize(&input).unwrap();
        assert_eq!(output.as_slice(), &["Sseta"]);
    }

    #[test]
    fn test_capitalize_digraph_titlecase() {
        // Python: '\u01c5ungla'.capitalize() == '\u01c5ungla'
        // (the Lt digraph U+01C5 titlecases to itself).
        let input = array(&["\u{1C5}ungla"]).unwrap();
        let output = capitalize(&input).unwrap();
        assert_eq!(output.as_slice(), &["\u{1C5}ungla"]);
    }

    #[test]
    fn test_capitalize_final_sigma() {
        // Python: 'ΟΔΟΣ'.capitalize() == 'Οδος' (word-final Σ -> ς).
        let input = array(&["\u{39F}\u{394}\u{39F}\u{3A3}"]).unwrap();
        let output = capitalize(&input).unwrap();
        assert_eq!(output.as_slice(), &["\u{39F}\u{3B4}\u{3BF}\u{3C2}"]);
    }

    #[test]
    fn test_title_ligature_titlecase() {
        // Python: '\ufb01le \ufb01le'.title() == 'File File'.
        let input = array(&["\u{FB01}le \u{FB01}le"]).unwrap();
        let output = title(&input).unwrap();
        assert_eq!(output.as_slice(), &["File File"]);
    }

    #[test]
    fn test_title_sharp_s_titlecase() {
        // Python: '\xdftra\xdfe \xdfeta'.title() == 'Sstra\xdfe Sseta'
        // (only the word-initial U+00DF is titlecased).
        let input = array(&["\u{DF}tra\u{DF}e \u{DF}eta"]).unwrap();
        let output = title(&input).unwrap();
        assert_eq!(output.as_slice(), &["Sstra\u{DF}e Sseta"]);
    }

    #[test]
    fn test_title_final_sigma() {
        // Python: 'ΟΔΟΣ ΟΔΟΣ'.title() == 'Οδος Οδος' (word-final Σ -> ς in
        // each word).
        let input =
            array(&["\u{39F}\u{394}\u{39F}\u{3A3} \u{39F}\u{394}\u{39F}\u{3A3}"]).unwrap();
        let output = title(&input).unwrap();
        assert_eq!(
            output.as_slice(),
            &["\u{39F}\u{3B4}\u{3BF}\u{3C2} \u{39F}\u{3B4}\u{3BF}\u{3C2}"]
        );
    }

    #[test]
    fn test_title_digit_word_boundary() {
        // Python: 'a1b c'.title() == 'A1B C'. A digit is not cased, so the
        // letter right after it starts a new word and is titlecased.
        let input = array(&["a1b c"]).unwrap();
        let output = title(&input).unwrap();
        assert_eq!(output.as_slice(), &["A1B C"]);
    }
}