Skip to main content

ferray_strings/
strip.rs

1// ferray-strings: Stripping operations (REQ-7)
2//
3// Implements strip, lstrip, rstrip — elementwise on StringArray.
4//
5// ## REQ status
6//
7// SHIPPED:
8//   - REQ-7 stripping — `strip`, `lstrip`, `rstrip` (all `pub fn`).
//     With `chars = None` they trim the characters that CPython's default
//     `str.strip()` removes (Unicode `White_Space` plus the C0 separators
//     U+001C..U+001F); with an explicit `chars` they trim any
//     character in that set from the relevant end(s), matching
//     `numpy.strings.strip`/`lstrip`/`rstrip`. Audited.
13//
14// Consumers (non-test): re-exported from the crate root
15// (`ferray-strings/src/lib.rs` `pub use strip::{lstrip, rstrip, strip}`)
16// and bound at the Python surface by the `#[pyfunction]` shims generated via
17// `bind_strip!(strip, fs::strip)`, `(lstrip, fs::lstrip)`,
18// `(rstrip, fs::rstrip)` in `ferray-python/src/char.rs`, which back
19// `numpy.char`/`numpy.strings`.
20
21use ferray_core::dimension::Dimension;
22use ferray_core::error::FerrayResult;
23
24use crate::string_array::StringArray;
25
/// Turn `chars` into a membership test usable as a `str` trim pattern.
///
/// The returned closure yields `true` exactly for the characters that occur
/// in `chars`; an empty `chars` therefore matches nothing. Shared by
/// strip/lstrip/rstrip so the set-membership logic lives in one place (#280).
fn char_set_predicate(chars: &str) -> impl Fn(char) -> bool + '_ {
    // `str::contains` with a `char` pattern is a plain membership scan over
    // the borrowed input, so the closure only needs to hold the `&str`.
    move |candidate: char| chars.contains(candidate)
}
32
/// Decide whether `c` is removed by the default (no-`chars`) strip path.
///
/// The goal is parity with Python `str.strip` / `numpy.char.strip`
/// (numpy/_core/strings.py:1034 strip -> `str.strip`), which drop every
/// character for which `str.isspace()` holds.
///
/// Rust's `char::is_whitespace` follows the Unicode `White_Space` property.
/// Python's set is that property plus the four C0 information separators
/// U+001C..U+001F (FS/GS/RS/US): CPython counts them as whitespace
/// (Objects/unicodectype.c) while `White_Space` leaves them out. Checked
/// with `[hex(c) for c in range(0x110000) if (chr(c)).strip() == '']`,
/// which lists 29 code points; `is_whitespace` accounts for all of them
/// except those four and accepts nothing outside the list.
fn is_python_strip_whitespace(c: char) -> bool {
    match c {
        // FS, GS, RS, US: whitespace for Python, not for `White_Space`.
        '\u{1C}'..='\u{1F}' => true,
        other => other.is_whitespace(),
    }
}
47
48/// Strip leading and trailing characters from each string element.
49///
50/// If `chars` is `None`, strips whitespace. Otherwise strips any character
51/// present in the `chars` string.
52///
53/// # Errors
54/// Returns an error if the internal array construction fails.
55pub fn strip<D: Dimension>(
56    a: &StringArray<D>,
57    chars: Option<&str>,
58) -> FerrayResult<StringArray<D>> {
59    match chars {
60        None => a.map(|s| s.trim_matches(is_python_strip_whitespace).to_string()),
61        Some(ch) => {
62            let pred = char_set_predicate(ch);
63            a.map(|s| s.trim_matches(&pred).to_string())
64        }
65    }
66}
67
68/// Strip leading characters from each string element.
69///
70/// If `chars` is `None`, strips leading whitespace. Otherwise strips any
71/// character present in the `chars` string from the left.
72///
73/// # Errors
74/// Returns an error if the internal array construction fails.
75pub fn lstrip<D: Dimension>(
76    a: &StringArray<D>,
77    chars: Option<&str>,
78) -> FerrayResult<StringArray<D>> {
79    match chars {
80        None => a.map(|s| s.trim_start_matches(is_python_strip_whitespace).to_string()),
81        Some(ch) => {
82            let pred = char_set_predicate(ch);
83            a.map(|s| s.trim_start_matches(&pred).to_string())
84        }
85    }
86}
87
88/// Strip trailing characters from each string element.
89///
90/// If `chars` is `None`, strips trailing whitespace. Otherwise strips any
91/// character present in the `chars` string from the right.
92///
93/// # Errors
94/// Returns an error if the internal array construction fails.
95pub fn rstrip<D: Dimension>(
96    a: &StringArray<D>,
97    chars: Option<&str>,
98) -> FerrayResult<StringArray<D>> {
99    match chars {
100        None => a.map(|s| s.trim_end_matches(is_python_strip_whitespace).to_string()),
101        Some(ch) => {
102            let pred = char_set_predicate(ch);
103            a.map(|s| s.trim_end_matches(&pred).to_string())
104        }
105    }
106}
107
#[cfg(test)]
mod tests {
    use super::*;
    use crate::string_array::array;

    /// The C0 information separators FS, GS, RS and US (U+001C..U+001F),
    /// exercised by the regression tests for #916.
    const C0_SEPARATORS: [char; 4] = ['\u{1C}', '\u{1D}', '\u{1E}', '\u{1F}'];

    // ---- Default (whitespace) and explicit-`chars` paths ---------------

    #[test]
    fn test_strip_whitespace() {
        let input = array(&["  hello  ", "\tworld\n"]).unwrap();
        let stripped = strip(&input, None).unwrap();
        assert_eq!(stripped.as_slice(), &["hello", "world"]);
    }

    #[test]
    fn test_strip_chars() {
        let input = array(&["xxhelloxx", "xyworldyx"]).unwrap();
        let stripped = strip(&input, Some("xy")).unwrap();
        assert_eq!(stripped.as_slice(), &["hello", "world"]);
    }

    #[test]
    fn test_lstrip_whitespace() {
        // Only the left side is trimmed; trailing whitespace must survive.
        let input = array(&["  hello  ", "\tworld\n"]).unwrap();
        let stripped = lstrip(&input, None).unwrap();
        assert_eq!(stripped.as_slice(), &["hello  ", "world\n"]);
    }

    #[test]
    fn test_lstrip_chars() {
        let input = array(&["xxhello", "xyhello"]).unwrap();
        let stripped = lstrip(&input, Some("xy")).unwrap();
        assert_eq!(stripped.as_slice(), &["hello", "hello"]);
    }

    #[test]
    fn test_rstrip_whitespace() {
        // Only the right side is trimmed; leading whitespace must survive.
        let input = array(&["  hello  ", "\tworld\n"]).unwrap();
        let stripped = rstrip(&input, None).unwrap();
        assert_eq!(stripped.as_slice(), &["  hello", "\tworld"]);
    }

    #[test]
    fn test_rstrip_chars() {
        let input = array(&["helloxx", "worldyx"]).unwrap();
        let stripped = rstrip(&input, Some("xy")).unwrap();
        assert_eq!(stripped.as_slice(), &["hello", "world"]);
    }

    #[test]
    fn test_strip_empty_string() {
        // An empty element and an all-whitespace element both become "".
        let input = array(&["", "   "]).unwrap();
        let stripped = strip(&input, None).unwrap();
        assert_eq!(stripped.as_slice(), &["", ""]);
    }

    // ---- C0 separator whitespace divergence regression (#916) ----------
    // U+001C..U+001F count as whitespace for Python `str.strip` /
    // `numpy.char.strip` but not for Rust `str::trim`. The expected values
    // below follow `("\x1chi\x1c").strip() == "hi"` and friends (R-CHAR-3).

    #[test]
    fn test_strip_c0_separators() {
        for ch in C0_SEPARATORS {
            let padded = format!("{ch}hi{ch}");
            let input = array(&[padded.as_str()]).unwrap();
            let stripped = strip(&input, None).unwrap();
            assert_eq!(
                stripped.as_slice(),
                &["hi"],
                "strip failed for U+{:04X}",
                u32::from(ch)
            );
        }
    }

    #[test]
    fn test_lstrip_c0_separators() {
        for ch in C0_SEPARATORS {
            let padded = format!("{ch}ab");
            let input = array(&[padded.as_str()]).unwrap();
            let stripped = lstrip(&input, None).unwrap();
            assert_eq!(
                stripped.as_slice(),
                &["ab"],
                "lstrip failed for U+{:04X}",
                u32::from(ch)
            );
        }
    }

    #[test]
    fn test_rstrip_c0_separators() {
        for ch in C0_SEPARATORS {
            let padded = format!("ab{ch}");
            let input = array(&[padded.as_str()]).unwrap();
            let stripped = rstrip(&input, None).unwrap();
            assert_eq!(
                stripped.as_slice(),
                &["ab"],
                "rstrip failed for U+{:04X}",
                u32::from(ch)
            );
        }
    }

    #[test]
    fn test_strip_unicode_whitespace_still_works() {
        // Guard against regressing the Unicode whitespace that already
        // matched Python (U+0085 NEL, U+00A0 NBSP, U+000B VT, U+000C FF):
        // `("\x85\xa0\x0b\x0c hi \t\n\r ").strip() == "hi"`.
        let input = array(&["\u{85}\u{A0}\u{0B}\u{0C} hi \t\n\r "]).unwrap();
        let stripped = strip(&input, None).unwrap();
        assert_eq!(stripped.as_slice(), &["hi"]);
    }
}