Skip to main content

ferray_strings/
case.rs

1// ferray-strings: Case manipulation operations (REQ-5)
2//
3// Implements upper, lower, capitalize, title — elementwise on StringArray.
4//
5// `capitalize` and `title` match Python `str.capitalize`/`str.title`
6// (the semantics `numpy.char`/`numpy.strings` delegate to via `_vec_string`,
7// numpy/_core/strings.py:1239 capitalize, :1282 title). The word-initial
// character uses the Unicode TITLECASE mapping (not uppercase): the `fi`
// ligature (U+FB01) titlecases to the two letters `Fi` (not `FI`), `ß`
// (U+00DF) to `Ss` (not `SS`), and the Lt digraph `Dž` (U+01C5) titlecases
// to itself (not to the uppercase digraph U+01C4). Tail characters use full
11// lowercase with final-sigma context (word-final `Σ` -> `ς`). Rust's
12// `str::to_lowercase` already implements the Unicode Final_Sigma rule, so the
13// tail is routed through it; only the word-initial titlecase needs a table,
14// since Rust's stdlib has no `to_titlecase`.
15//
16// ## REQ status
17//
18// SHIPPED:
19//   - REQ-5 case manipulation — `upper`, `lower`, `capitalize`, `title`
20//     (all `pub fn` in this file). `upper`/`lower` route through Rust's
21//     Unicode `to_uppercase`/`to_lowercase` (full case folding, e.g.
22//     `ß` -> `SS`), matching CPython `str.upper`/`str.lower`.
23//     `capitalize`/`title` reproduce CPython `str.capitalize`/`str.title`
24//     (the semantics `numpy.char`/`numpy.strings` delegate to via
25//     `_vec_string`): word-initial characters use the Unicode TITLECASE
26//     mapping, tails use lowercase with the Final_Sigma rule.
27//     (`swapcase` lives in `str_ops.rs`.)
28//
29// Consumers (non-test): re-exported from the crate root
30// (`ferray-strings/src/lib.rs` `pub use case::{capitalize, lower, title,
31// upper}`) and bound at the Python surface by the `#[pyfunction]` shims
32// generated via `bind_unary_string_op!(lower, fs::lower)`,
33// `(upper, fs::upper)`, `(capitalize, fs::capitalize)`, `(title, fs::title)`
34// in `ferray-python/src/char.rs`, which back `numpy.char`/`numpy.strings`.
35
36use ferray_core::dimension::Dimension;
37use ferray_core::error::FerrayResult;
38
39use crate::string_array::StringArray;
40
/// Return the Unicode full titlecase mapping of `c` as an owned `String`.
///
/// Only the characters whose titlecase differs from their uppercase are
/// listed explicitly. For every other character titlecase == uppercase, so
/// the fallback arm uses `char::to_uppercase`.
///
/// The listed set is the one produced by CPython/Unicode (the numpy 2.4
/// oracle):
/// `[chr(c) for c in range(0x110000) if chr(c).title() != chr(c).upper()]`,
/// each mapped to `chr(c).title()`. It covers the case ligatures
/// (U+FB01 -> `Fi`, U+00DF -> `Ss`, ...), the Latin/Greek titlecase digraphs
/// (Lt characters that titlecase to themselves), and the
/// Armenian/Greek/Georgian forms.
///
/// Every expansion is written with `\u{...}` escapes rather than literal
/// glyphs. Several expansions are combining sequences (for example
/// U+1FB7 -> U+0391 U+0342 U+0345) whose code points would silently change
/// if the source file were ever passed through a Unicode normalizer, and the
/// digraph/ligature glyphs are visually indistinguishable from their ASCII
/// decompositions. Escapes pin the exact code points.
fn titlecase_char(c: char) -> String {
    let mapped: &str = match c {
        // Latin sharp s.
        '\u{00DF}' => "Ss",

        // Latin digraphs: all three case forms share the Lt form as titlecase.
        '\u{01C4}'..='\u{01C6}' => "\u{01C5}",
        '\u{01C7}'..='\u{01C9}' => "\u{01C8}",
        '\u{01CA}'..='\u{01CC}' => "\u{01CB}",
        '\u{01F1}'..='\u{01F3}' => "\u{01F2}",

        // Armenian small ligature ech yiwn.
        '\u{0587}' => "\u{0535}\u{0582}",

        // Georgian Mkhedruli letters uppercase to Mtavruli but titlecase to
        // themselves (U+10FB / U+10FC are deliberately excluded).
        '\u{10D0}'..='\u{10FA}' | '\u{10FD}'..='\u{10FF}' => return c.to_string(),

        // Greek letters with (pros)gegrammeni. In each 16-code-point block
        // the low eight are the small letters and the high eight are their
        // Lt counterparts, so setting bit 3 maps a small letter to its
        // titlecase form and leaves an Lt letter unchanged.
        '\u{1F80}'..='\u{1FAF}' => {
            return char::from_u32(u32::from(c) | 0x08).unwrap_or(c).to_string();
        }

        // Remaining Greek iota-subscript forms (alpha, eta, omega).
        '\u{1FB2}' => "\u{1FBA}\u{0345}",
        '\u{1FB3}' | '\u{1FBC}' => "\u{1FBC}",
        '\u{1FB4}' => "\u{0386}\u{0345}",
        '\u{1FB7}' => "\u{0391}\u{0342}\u{0345}",
        '\u{1FC2}' => "\u{1FCA}\u{0345}",
        '\u{1FC3}' | '\u{1FCC}' => "\u{1FCC}",
        '\u{1FC4}' => "\u{0389}\u{0345}",
        '\u{1FC7}' => "\u{0397}\u{0342}\u{0345}",
        '\u{1FF2}' => "\u{1FFA}\u{0345}",
        '\u{1FF3}' | '\u{1FFC}' => "\u{1FFC}",
        '\u{1FF4}' => "\u{038F}\u{0345}",
        '\u{1FF7}' => "\u{03A9}\u{0342}\u{0345}",

        // Latin ligatures.
        '\u{FB00}' => "Ff",
        '\u{FB01}' => "Fi",
        '\u{FB02}' => "Fl",
        '\u{FB03}' => "Ffi",
        '\u{FB04}' => "Ffl",
        '\u{FB05}' | '\u{FB06}' => "St",

        // Armenian ligatures.
        '\u{FB13}' => "\u{0544}\u{0576}",
        '\u{FB14}' => "\u{0544}\u{0565}",
        '\u{FB15}' => "\u{0544}\u{056B}",
        '\u{FB16}' => "\u{054E}\u{0576}",
        '\u{FB17}' => "\u{0544}\u{056D}",

        // Everything else: titlecase coincides with (full) uppercase.
        _ => return c.to_uppercase().collect(),
    };
    mapped.to_owned()
}
191
/// Report whether `c` carries the Unicode `Cased` property.
///
/// `title` uses this to locate word boundaries: Python's `str.title` treats
/// a character as word-initial when the character before it is not cased.
/// `char::is_lowercase` and `char::is_uppercase` account for the `Lowercase`
/// and `Uppercase` properties; the titlecase (`Lt`) letters satisfy neither,
/// so they are enumerated by hand (the set where
/// `category(chr(c)) == 'Lt'`): four Latin digraphs plus the Greek titlecase
/// ranges.
fn is_cased(c: char) -> bool {
    if c.is_lowercase() || c.is_uppercase() {
        return true;
    }

    // Latin titlecase digraphs.
    let latin_lt = matches!(c, '\u{1C5}' | '\u{1C8}' | '\u{1CB}' | '\u{1F2}');

    // Greek titlecase letters with prosgegrammeni.
    let greek_lt = matches!(
        c,
        '\u{1F88}'..='\u{1F8F}'
            | '\u{1F98}'..='\u{1F9F}'
            | '\u{1FA8}'..='\u{1FAF}'
            | '\u{1FBC}'
            | '\u{1FCC}'
            | '\u{1FFC}'
    );

    latin_lt || greek_lt
}
216
217/// Convert each string element to uppercase.
218///
219/// # Errors
220/// Returns an error if the internal array construction fails.
221///
222/// # Examples
223/// ```ignore
224/// let a = strings::array(&["hello", "world"]).unwrap();
225/// let b = strings::upper(&a).unwrap();
226/// assert_eq!(b.as_slice(), &["HELLO", "WORLD"]);
227/// ```
228pub fn upper<D: Dimension>(a: &StringArray<D>) -> FerrayResult<StringArray<D>> {
229    a.map(str::to_uppercase)
230}
231
232/// Convert each string element to lowercase.
233///
234/// # Errors
235/// Returns an error if the internal array construction fails.
236pub fn lower<D: Dimension>(a: &StringArray<D>) -> FerrayResult<StringArray<D>> {
237    a.map(str::to_lowercase)
238}
239
240/// Capitalize each string element: the first character is titlecased and the
241/// rest is lowercased (final-sigma aware), matching Python `str.capitalize`.
242///
243/// The first character uses the Unicode titlecase mapping (`fi`->`Fi`,
244/// `ß`->`Ss`); the tail is the whole-string lowercase with the first
245/// character's lowercase span removed, so the final-sigma decision sees the
246/// (cased) first character as preceding context.
247///
248/// # Errors
249/// Returns an error if the internal array construction fails.
250pub fn capitalize<D: Dimension>(a: &StringArray<D>) -> FerrayResult<StringArray<D>> {
251    a.map(|s| {
252        let mut chars = s.chars();
253        match chars.next() {
254            None => String::new(),
255            Some(first) => {
256                // `str::to_lowercase` resolves every final-sigma against the
257                // whole-string context; the first character's lowercase is
258                // context-free (nothing precedes it), so slicing it off the
259                // front leaves the correctly-lowered tail.
260                let lowered = s.to_lowercase();
261                let first_lower_len: usize = first.to_lowercase().map(char::len_utf8).sum();
262                let tail = &lowered[first_lower_len..];
263                let head = titlecase_char(first);
264                format!("{head}{tail}")
265            }
266        }
267    })
268}
269
270/// Title-case each string element: each word starts with a titlecased
271/// character and all remaining cased characters are lowercased, matching
272/// Python `str.title` / `numpy.char.title`.
273///
274/// A character is word-initial when the preceding character is not cased
275/// (digits and punctuation are not cased, so a letter following one is
276/// titlecased). The tail uses the whole-string lowercase so a word-final
277/// `Σ` becomes `ς` (final sigma) per the Unicode Final_Sigma rule.
278///
279/// # Errors
280/// Returns an error if the internal array construction fails.
281pub fn title<D: Dimension>(a: &StringArray<D>) -> FerrayResult<StringArray<D>> {
282    a.map(|s| {
283        // Whole-string lowercase resolves final-sigma at every position; each
284        // original character maps to a span of `lowered` whose byte length
285        // equals `char::to_lowercase(c)` (final sigma `ς` and `σ` are both two
286        // bytes, so the only context-sensitive case preserves the span width).
287        let lowered = s.to_lowercase();
288        let mut result = String::with_capacity(lowered.len());
289        let mut previous_is_cased = false;
290        let mut lowered_offset = 0usize;
291        for ch in s.chars() {
292            let lower_len: usize = ch.to_lowercase().map(char::len_utf8).sum();
293            if is_cased(ch) && !previous_is_cased {
294                result.push_str(&titlecase_char(ch));
295            } else {
296                result.push_str(&lowered[lowered_offset..lowered_offset + lower_len]);
297            }
298            lowered_offset += lower_len;
299            previous_is_cased = is_cased(ch);
300        }
301        result
302    })
303}
304
#[cfg(test)]
mod tests {
    use super::*;
    use crate::string_array::array;

    /// Builds an array from the bracketed inputs, applies the named case
    /// operation, and asserts the result slice equals the bracketed
    /// expectations.
    macro_rules! assert_case_op {
        ($op:ident, [$($input:expr),+] => [$($expected:expr),+]) => {{
            let source = array(&[$($input),+]).unwrap();
            let produced = $op(&source).unwrap();
            assert_eq!(produced.as_slice(), &[$($expected),+]);
        }};
    }

    #[test]
    fn test_upper() {
        assert_case_op!(upper, ["hello", "world"] => ["HELLO", "WORLD"]);
    }

    #[test]
    fn test_lower() {
        assert_case_op!(lower, ["HELLO", "World"] => ["hello", "world"]);
    }

    #[test]
    fn test_capitalize() {
        assert_case_op!(
            capitalize,
            ["hello world", "fOO BAR", ""] => ["Hello world", "Foo bar", ""]
        );
    }

    #[test]
    fn test_title() {
        assert_case_op!(
            title,
            ["hello world", "foo bar baz"] => ["Hello World", "Foo Bar Baz"]
        );
    }

    #[test]
    fn test_title_mixed_case() {
        assert_case_op!(title, ["hELLO wORLD"] => ["Hello World"]);
    }

    #[test]
    fn test_upper_empty() {
        assert_case_op!(upper, [""] => [""]);
    }

    // ---- Titlecase / final-sigma divergence regression (#915) ----------
    // Expected values come from Python `str.capitalize`/`str.title`, the
    // semantics numpy.char delegates to (R-CHAR-3).

    #[test]
    fn test_capitalize_ligature_titlecase() {
        // The U+FB01 ligature titlecases to the two letters "Fi", not "FI".
        assert_case_op!(capitalize, ["\u{FB01}nery"] => ["Finery"]);
    }

    #[test]
    fn test_capitalize_sharp_s_titlecase() {
        // U+00DF titlecases to "Ss", not "SS".
        assert_case_op!(capitalize, ["\u{DF}eta"] => ["Sseta"]);
    }

    #[test]
    fn test_capitalize_digraph_titlecase() {
        // The Lt digraph U+01C5 titlecases to itself.
        assert_case_op!(capitalize, ["\u{1C5}ungla"] => ["\u{1C5}ungla"]);
    }

    #[test]
    fn test_capitalize_final_sigma() {
        // Word-final capital sigma (U+03A3) lowers to final sigma (U+03C2).
        assert_case_op!(
            capitalize,
            ["\u{39F}\u{394}\u{39F}\u{3A3}"] => ["\u{39F}\u{3B4}\u{3BF}\u{3C2}"]
        );
    }

    #[test]
    fn test_title_ligature_titlecase() {
        // Each word-initial U+FB01 ligature becomes "Fi".
        assert_case_op!(title, ["\u{FB01}le \u{FB01}le"] => ["File File"]);
    }

    #[test]
    fn test_title_sharp_s_titlecase() {
        // Word-initial U+00DF becomes "Ss"; a mid-word U+00DF is unchanged.
        assert_case_op!(
            title,
            ["\u{DF}tra\u{DF}e \u{DF}eta"] => ["Sstra\u{DF}e Sseta"]
        );
    }

    #[test]
    fn test_title_final_sigma() {
        // Final sigma is resolved independently for each word.
        assert_case_op!(
            title,
            ["\u{39F}\u{394}\u{39F}\u{3A3} \u{39F}\u{394}\u{39F}\u{3A3}"]
                => ["\u{39F}\u{3B4}\u{3BF}\u{3C2} \u{39F}\u{3B4}\u{3BF}\u{3C2}"]
        );
    }

    #[test]
    fn test_title_digit_word_boundary() {
        // 'a1b c'.title() == 'A1B C': a digit is not cased, so the letter
        // after it is word-initial (titlecased).
        assert_case_op!(title, ["a1b c"] => ["A1B C"]);
    }
}