// ferray-strings 0.5.0
//
// String operations on character arrays for ferray
// Documentation
// ferray-strings: Miscellaneous string operations
//
// str_len, swapcase, and elementwise comparison functions.
//
// ## REQ status
//
// These ops are not numbered in the `.design/ferray-strings.md` REQ list
// (which enumerates REQ-1..14 for the core surface); they are the
// `numpy.strings` extras tracked by their issue ids. All SHIPPED & audited:
//
// SHIPPED:
//   - `str_len` (`pub fn`, #518) -> `Array<i64, D>`: counts Unicode code
//     points (`s.chars().count()`), matching numpy's `str_len` ufunc which
//     writes a signed `npy_intp` from `buf.num_codepoints()` (NOT bytes).
//   - `swapcase` (`pub fn`, #515): per-character case inversion, matching
//     CPython `str.swapcase` / `numpy.strings.swapcase`.
//   - Elementwise comparisons (#516) `equal`, `not_equal`, `less`,
//     `greater`, `less_equal`, `greater_equal` (all `pub fn`) ->
//     `Array<bool, D>`, lexicographic by Unicode scalar value, matching
//     `numpy.strings`/`numpy.char` comparison ufuncs.
//
// Consumers (non-test): re-exported from the crate root
// (`ferray-strings/src/lib.rs` `pub use str_ops::{equal, greater,
// greater_equal, less, less_equal, not_equal, str_len, swapcase}`) and bound
// at the Python surface in `ferray-python/src/char.rs` — `str_len` and
// `swapcase` (`bind_unary_string_op!(swapcase, fs::swapcase)`) shims, and the
// comparison shims generated via `bind_string_compare!(equal, fs::equal)`
// (and the other five) — which back `numpy.char`/`numpy.strings`.

use ferray_core::Array;
use ferray_core::dimension::Dimension;
use ferray_core::error::FerrayResult;

use crate::string_array::StringArray;

/// Count the characters of every element. Matches
/// `numpy.strings.str_len` (#518).
///
/// The result has the same dimension as `a` and holds one `i64` per element.
/// Lengths are measured in `char`s (Unicode scalar values), not UTF-8 bytes.
///
/// # Errors
///
/// Propagates whatever `Array::from_vec` returns when building the output.
pub fn str_len<D: Dimension>(a: &StringArray<D>) -> FerrayResult<Array<i64, D>> {
    // numpy `string_ufuncs.cpp:118`: `*(npy_intp *)out = buf.num_codepoints();`
    // — signed `npy_intp` counting Unicode code points, NOT UTF-8 bytes.
    let lengths = a
        .iter()
        .map(|elem| elem.chars().count())
        .map(|n| n as i64)
        .collect::<Vec<i64>>();
    Array::from_vec(a.dim().clone(), lengths)
}

/// Swap the case of each character in every element. Matches
/// `numpy.strings.swapcase` (#515).
///
/// For each `char`: if `char::is_uppercase` holds it is replaced by its
/// lowercase mapping, otherwise if `char::is_lowercase` holds it is replaced
/// by its uppercase mapping, otherwise it is copied through unchanged. Case
/// mappings may expand to several characters (e.g. `ß` -> `SS`), so an
/// element can become longer than its input.
///
/// # Errors
///
/// Propagates whatever `StringArray::map` returns.
pub fn swapcase<D: Dimension>(a: &StringArray<D>) -> FerrayResult<StringArray<D>> {
    a.map(|s| {
        // Build each result in one buffer rather than allocating a temporary
        // `String` per character. `s.len()` is only the starting capacity;
        // multi-character case mappings may grow the buffer.
        let mut swapped = String::with_capacity(s.len());
        for c in s.chars() {
            if c.is_uppercase() {
                swapped.extend(c.to_lowercase());
            } else if c.is_lowercase() {
                swapped.extend(c.to_uppercase());
            } else {
                swapped.push(c);
            }
        }
        swapped
    })
}

// ---------------------------------------------------------------------------
// Elementwise string comparison (#516)
// ---------------------------------------------------------------------------

/// Elementwise string equality. Both arrays must have the same shape.
///
/// Elements of `a` and `b` are paired in iteration order and the output is
/// built with `a`'s dimension.
///
/// NOTE(review): shapes are not compared here, and `zip` stops at the shorter
/// input. A mismatch is only reported if `Array::from_vec` rejects the
/// resulting length — confirm callers validate shapes.
pub fn equal<D: Dimension>(a: &StringArray<D>, b: &StringArray<D>) -> FerrayResult<Array<bool, D>> {
    let pairs = a.iter().zip(b.iter());
    let flags: Vec<bool> = pairs.map(|(lhs, rhs)| lhs == rhs).collect();
    Array::from_vec(a.dim().clone(), flags)
}

/// Elementwise string inequality: `true` where the paired elements differ.
///
/// Elements of `a` and `b` are paired in iteration order and the output is
/// built with `a`'s dimension.
///
/// NOTE(review): shapes are not compared here, and `zip` stops at the shorter
/// input — confirm callers validate shapes.
pub fn not_equal<D: Dimension>(
    a: &StringArray<D>,
    b: &StringArray<D>,
) -> FerrayResult<Array<bool, D>> {
    let differs = std::iter::zip(a.iter(), b.iter())
        .map(|(lhs, rhs)| lhs != rhs)
        .collect::<Vec<bool>>();
    Array::from_vec(a.dim().clone(), differs)
}

/// Elementwise lexicographic less-than: `true` where the element of `a`
/// sorts strictly before the paired element of `b`.
///
/// Elements are paired in iteration order and the output is built with `a`'s
/// dimension.
///
/// NOTE(review): shapes are not compared here, and `zip` stops at the shorter
/// input — confirm callers validate shapes.
pub fn less<D: Dimension>(a: &StringArray<D>, b: &StringArray<D>) -> FerrayResult<Array<bool, D>> {
    let rhs_elems = b.iter();
    let mask: Vec<bool> = a
        .iter()
        .zip(rhs_elems)
        .map(|(lhs, rhs)| lhs < rhs)
        .collect();
    Array::from_vec(a.dim().clone(), mask)
}

/// Elementwise lexicographic greater-than: `true` where the element of `a`
/// sorts strictly after the paired element of `b`.
///
/// Elements are paired in iteration order and the output is built with `a`'s
/// dimension.
///
/// NOTE(review): shapes are not compared here, and `zip` stops at the shorter
/// input — confirm callers validate shapes.
pub fn greater<D: Dimension>(
    a: &StringArray<D>,
    b: &StringArray<D>,
) -> FerrayResult<Array<bool, D>> {
    let pairs = std::iter::zip(a.iter(), b.iter());
    let mask = pairs.map(|(lhs, rhs)| lhs > rhs).collect::<Vec<bool>>();
    Array::from_vec(a.dim().clone(), mask)
}

/// Elementwise lexicographic less-or-equal: `true` where the element of `a`
/// sorts before, or is equal to, the paired element of `b`.
///
/// Elements are paired in iteration order and the output is built with `a`'s
/// dimension.
///
/// NOTE(review): shapes are not compared here, and `zip` stops at the shorter
/// input — confirm callers validate shapes.
pub fn less_equal<D: Dimension>(
    a: &StringArray<D>,
    b: &StringArray<D>,
) -> FerrayResult<Array<bool, D>> {
    let lhs_elems = a.iter();
    let rhs_elems = b.iter();
    let mask: Vec<bool> = lhs_elems
        .zip(rhs_elems)
        .map(|(lhs, rhs)| lhs <= rhs)
        .collect();
    Array::from_vec(a.dim().clone(), mask)
}

/// Elementwise lexicographic greater-or-equal: `true` where the element of
/// `a` sorts after, or is equal to, the paired element of `b`.
///
/// Elements are paired in iteration order and the output is built with `a`'s
/// dimension.
///
/// NOTE(review): shapes are not compared here, and `zip` stops at the shorter
/// input — confirm callers validate shapes.
pub fn greater_equal<D: Dimension>(
    a: &StringArray<D>,
    b: &StringArray<D>,
) -> FerrayResult<Array<bool, D>> {
    let mask = a
        .iter()
        .zip(b.iter())
        .map(|(lhs, rhs)| lhs >= rhs)
        .collect::<Vec<bool>>();
    Array::from_vec(a.dim().clone(), mask)
}

#[cfg(test)]
mod tests {
    //! Unit tests for the miscellaneous string ops: `str_len`, `swapcase`,
    //! and the six elementwise comparisons.

    use super::*;
    use crate::string_array::array;

    #[test]
    fn test_str_len() {
        let a = array(&["hello", "", "abc", "hi"]).unwrap();
        let r = str_len(&a).unwrap();
        assert_eq!(r.as_slice().unwrap(), &[5, 0, 3, 2]);
    }

    /// Lengths are counted in `char`s, so multi-byte UTF-8 sequences count
    /// once each rather than once per byte.
    #[test]
    fn test_str_len_counts_chars_not_bytes() {
        // "héllo" (6 bytes), three CJK characters (9 bytes), one emoji (4 bytes).
        let a = array(&[
            "h\u{e9}llo",
            "\u{65e5}\u{672c}\u{8a9e}",
            "\u{1f980}",
        ])
        .unwrap();
        let r = str_len(&a).unwrap();
        assert_eq!(r.as_slice().unwrap(), &[5, 3, 1]);
    }

    #[test]
    fn test_swapcase() {
        let a = array(&["Hello World", "ABC", "abc", "123"]).unwrap();
        let r = swapcase(&a).unwrap();
        assert_eq!(r.as_slice(), &["hELLO wORLD", "abc", "ABC", "123"]);
    }

    #[test]
    fn test_equal() {
        let a = array(&["abc", "def", "ghi"]).unwrap();
        let b = array(&["abc", "xyz", "ghi"]).unwrap();
        let r = equal(&a, &b).unwrap();
        assert_eq!(r.as_slice().unwrap(), &[true, false, true]);
    }

    #[test]
    fn test_not_equal() {
        let a = array(&["abc", "def", "ghi"]).unwrap();
        let b = array(&["abc", "xyz", "ghi"]).unwrap();
        let r = not_equal(&a, &b).unwrap();
        assert_eq!(r.as_slice().unwrap(), &[false, true, false]);
    }

    #[test]
    fn test_less() {
        let a = array(&["abc", "xyz"]).unwrap();
        let b = array(&["abd", "abc"]).unwrap();
        let r = less(&a, &b).unwrap();
        assert_eq!(r.as_slice().unwrap(), &[true, false]);
    }

    /// A proper prefix sorts before the longer string; equal strings are not
    /// strictly less.
    #[test]
    fn test_less_prefix_and_equal() {
        let a = array(&["ab", "abc", "abc"]).unwrap();
        let b = array(&["abc", "ab", "abc"]).unwrap();
        let r = less(&a, &b).unwrap();
        assert_eq!(r.as_slice().unwrap(), &[true, false, false]);
    }

    #[test]
    fn test_greater() {
        let a = array(&["xyz", "abc"]).unwrap();
        let b = array(&["abc", "xyz"]).unwrap();
        let r = greater(&a, &b).unwrap();
        assert_eq!(r.as_slice().unwrap(), &[true, false]);
    }

    #[test]
    fn test_less_equal() {
        let a = array(&["abc", "abd", "b"]).unwrap();
        let b = array(&["abc", "abc", "ba"]).unwrap();
        let r = less_equal(&a, &b).unwrap();
        assert_eq!(r.as_slice().unwrap(), &[true, false, true]);
    }

    #[test]
    fn test_greater_equal() {
        let a = array(&["abc", "abc", "ba"]).unwrap();
        let b = array(&["abc", "abd", "b"]).unwrap();
        let r = greater_equal(&a, &b).unwrap();
        assert_eq!(r.as_slice().unwrap(), &[true, false, true]);
    }
}