// ferray-strings 0.5.0
//
// String operations on character arrays for ferray
// Documentation
// ferray-strings: Search operations (REQ-8, REQ-9, REQ-10)
//
// Implements find, rfind, index, rindex, count, startswith, endswith, replace —
// elementwise on StringArray.
//
// ## REQ status
//
// SHIPPED:
//   - REQ-8 replace — `replace` (`pub fn`): elementwise substring
//     replacement with an optional count, matching `numpy.strings.replace`.
//   - REQ-9 search predicates — `startswith`, `endswith` (`pub fn`) ->
//     `Array<bool, D>`, matching `numpy.strings.startswith`/`endswith`.
//   - REQ-10 search indices — `find`, `count` (`pub fn`) -> `Array<i64, D>`
//     (`find` returns `-1` when absent; both return **character** indices /
//     counts, the published `numpy.strings.find`/`count` contract writing a
//     signed `npy_intp`). The full search family `rfind`, `index`, `rindex`
//     ships alongside (`index`/`rindex` raise `ValueError` when `sub` is
//     absent, like CPython `str.index`/`rindex`). Audited NO DIVERGENCE.
//
// Consumers (non-test): re-exported from the crate root
// (`ferray-strings/src/lib.rs` `pub use search::{count, endswith, find,
// index, replace, rfind, rindex, startswith}`) and bound at the Python
// surface by the `#[pyfunction]` shims `count`, `find`, `startswith`,
// `endswith`, `replace`, `rfind` in `ferray-python/src/char.rs` (each
// calling the matching `fs::` function); the `index`/`rindex` shims build on
// `fs::find`/`fs::rfind`. These back `numpy.char`/`numpy.strings`.

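// The search functions (`find`, `rfind`, `index`, `rindex`, `count`) return
// `i64` arrays, following the signed-result contract of `numpy.strings.find`
// and friends (with `-1` for "not found"). Each `usize` character count is
// bounded by the string's byte length, which Rust caps at `isize::MAX`, so the
// `usize` -> `i64` casts are the published return type, not a wrap-around bug.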
#![allow(clippy::cast_possible_wrap)]

use ferray_core::Array;
use ferray_core::dimension::Dimension;
use ferray_core::error::{FerrayError, FerrayResult};

use crate::string_array::StringArray;

/// Locate the first occurrence of `sub` inside every element of `a`.
///
/// The result has the same shape as `a`. Each entry holds the position of the
/// earliest match, counted in characters (Unicode scalar values, not bytes),
/// or `-1` when the element does not contain `sub`.
///
/// # Errors
/// Propagates any error reported by `Array::from_vec` while building the
/// result array.
pub fn find<D: Dimension>(a: &StringArray<D>, sub: &str) -> FerrayResult<Array<i64, D>> {
    let positions: Vec<i64> = a.map_to_vec(|s| {
        s.find(sub).map_or(-1, |byte_offset| {
            // `str::find` reports a byte offset; counting the characters that
            // precede it yields the character index.
            s[..byte_offset].chars().count() as i64
        })
    });
    Array::from_vec(a.dim().clone(), positions)
}

/// Tally how many times `sub` occurs, without overlap, in every element of `a`.
///
/// The result has the same shape as `a`. The element type is signed `i64`
/// because numpy's `count` ufunc writes a signed `npy_intp` (int64) result.
///
/// # Errors
/// Propagates any error reported by `Array::from_vec` while building the
/// result array.
pub fn count<D: Dimension>(a: &StringArray<D>, sub: &str) -> FerrayResult<Array<i64, D>> {
    let tallies: Vec<i64> = a.map_to_vec(|s| {
        // `str::matches` yields successive non-overlapping matches.
        let occurrences = s.matches(sub).count();
        occurrences as i64
    });
    Array::from_vec(a.dim().clone(), tallies)
}

/// Check, element by element, whether the string begins with `prefix`.
///
/// The result is an `Array<bool, D>` with the same shape as `a`; an entry is
/// `true` exactly when the corresponding element starts with `prefix`.
///
/// # Errors
/// Propagates any error reported by `Array::from_vec` while building the
/// result array.
pub fn startswith<D: Dimension>(a: &StringArray<D>, prefix: &str) -> FerrayResult<Array<bool, D>> {
    let flags: Vec<bool> = a.map_to_vec(|elem| elem.starts_with(prefix));
    let shape = a.dim().clone();
    Array::from_vec(shape, flags)
}

/// Check, element by element, whether the string finishes with `suffix`.
///
/// The result is an `Array<bool, D>` with the same shape as `a`; an entry is
/// `true` exactly when the corresponding element ends with `suffix`.
///
/// # Errors
/// Propagates any error reported by `Array::from_vec` while building the
/// result array.
pub fn endswith<D: Dimension>(a: &StringArray<D>, suffix: &str) -> FerrayResult<Array<bool, D>> {
    let flags: Vec<bool> = a.map_to_vec(|elem| elem.ends_with(suffix));
    let shape = a.dim().clone();
    Array::from_vec(shape, flags)
}

/// Substitute `new` for occurrences of `old` in every element of `a`.
///
/// `max_count` limits how many occurrences are rewritten per element:
/// `Some(n)` replaces only the first `n` matches, while `None` replaces all
/// of them.
///
/// # Errors
/// Propagates any error reported by `StringArray::map` while building the
/// result array.
pub fn replace<D: Dimension>(
    a: &StringArray<D>,
    old: &str,
    new: &str,
    max_count: Option<usize>,
) -> FerrayResult<StringArray<D>> {
    a.map(|s| {
        if let Some(limit) = max_count {
            // Bounded replacement: stop after `limit` matches.
            s.replacen(old, new, limit)
        } else {
            s.replace(old, new)
        }
    })
}

/// Locate the last occurrence of `sub` inside every element of `a`.
///
/// The result has the same shape as `a`. Each entry holds the position of the
/// right-most match, counted in characters (not bytes), or `-1` when the
/// element does not contain `sub`. Mirrors `numpy.strings.rfind`.
///
/// # Errors
/// Propagates any error reported by `Array::from_vec` while building the
/// result array.
pub fn rfind<D: Dimension>(a: &StringArray<D>, sub: &str) -> FerrayResult<Array<i64, D>> {
    let positions: Vec<i64> = a.map_to_vec(|s| {
        let Some(byte_offset) = s.rfind(sub) else {
            return -1;
        };
        // `str::rfind` reports a byte offset; counting the characters that
        // precede it yields the character index.
        s[..byte_offset].chars().count() as i64
    });
    Array::from_vec(a.dim().clone(), positions)
}

/// Locate the first occurrence of `sub` in every element of `a`, treating a
/// missing substring as an error rather than returning `-1`.
///
/// On success the result has the same shape as `a` and holds character
/// (not byte) indices. Elements are scanned in iteration order and the scan
/// stops at the first element that lacks `sub`. Mirrors `numpy.strings.index`.
///
/// # Errors
/// - `FerrayError::InvalidValue` if any element does not contain `sub`; the
///   message names both the substring and the offending element.
pub fn index<D: Dimension>(a: &StringArray<D>, sub: &str) -> FerrayResult<Array<i64, D>> {
    let mut positions = Vec::with_capacity(a.iter().len());
    for s in a.iter() {
        let Some(byte_offset) = s.find(sub) else {
            return Err(FerrayError::invalid_value(format!(
                "index: substring {sub:?} not found in element {s:?}"
            )));
        };
        // Translate the byte offset into a character index.
        positions.push(s[..byte_offset].chars().count() as i64);
    }
    Array::from_vec(a.dim().clone(), positions)
}

/// Locate the last occurrence of `sub` in every element of `a`, treating a
/// missing substring as an error rather than returning `-1`.
///
/// On success the result has the same shape as `a` and holds character
/// (not byte) indices of the right-most match. Elements are scanned in
/// iteration order and the scan stops at the first element that lacks `sub`.
/// Mirrors `numpy.strings.rindex`.
///
/// # Errors
/// - `FerrayError::InvalidValue` if any element does not contain `sub`; the
///   message names both the substring and the offending element.
pub fn rindex<D: Dimension>(a: &StringArray<D>, sub: &str) -> FerrayResult<Array<i64, D>> {
    let mut positions = Vec::with_capacity(a.iter().len());
    for s in a.iter() {
        let Some(byte_offset) = s.rfind(sub) else {
            return Err(FerrayError::invalid_value(format!(
                "rindex: substring {sub:?} not found in element {s:?}"
            )));
        };
        // Translate the byte offset into a character index.
        positions.push(s[..byte_offset].chars().count() as i64);
    }
    Array::from_vec(a.dim().clone(), positions)
}

// Unit tests for the elementwise search functions. Besides the basic ASCII
// cases they pin the character-index contract on multi-byte input, the
// non-overlapping `count` contract, and the error path of `index` / `rindex`.
#[cfg(test)]
mod tests {
    use super::*;
    use crate::string_array::array;

    #[test]
    fn test_find() {
        let a = array(&["hello", "world", "bell"]).unwrap();
        let b = find(&a, "ll").unwrap();
        let data = b.as_slice().unwrap();
        assert_eq!(data, &[2, -1, 2]);
    }

    #[test]
    fn test_find_at_start() {
        let a = array(&["abc", "def"]).unwrap();
        let b = find(&a, "abc").unwrap();
        let data = b.as_slice().unwrap();
        assert_eq!(data, &[0, -1]);
    }

    #[test]
    fn test_find_empty_sub() {
        // An empty needle matches at the very start of the string.
        let a = array(&["hello"]).unwrap();
        let b = find(&a, "").unwrap();
        let data = b.as_slice().unwrap();
        assert_eq!(data, &[0]);
    }

    #[test]
    fn test_find_counts_characters_not_bytes() {
        // 'é' and 'ñ' are two bytes each in UTF-8; the reported index must
        // still be the character position.
        let a = array(&["héllo", "ñandú"]).unwrap();
        let b = find(&a, "l").unwrap();
        assert_eq!(b.as_slice().unwrap(), &[2_i64, -1]);
        let c = find(&a, "d").unwrap();
        assert_eq!(c.as_slice().unwrap(), &[-1_i64, 3]);
    }

    #[test]
    fn test_rfind() {
        let a = array(&["abcabc", "xyz", "éabé"]).unwrap();
        let b = rfind(&a, "ab").unwrap();
        assert_eq!(b.as_slice().unwrap(), &[3_i64, -1, 1]);
    }

    #[test]
    fn test_index_found() {
        let a = array(&["hello", "bell"]).unwrap();
        let b = index(&a, "ll").unwrap();
        assert_eq!(b.as_slice().unwrap(), &[2_i64, 2]);
    }

    #[test]
    fn test_index_missing_is_error() {
        // "world" does not contain "ll", so the whole call must fail.
        let a = array(&["hello", "world"]).unwrap();
        assert!(index(&a, "ll").is_err());
    }

    #[test]
    fn test_rindex_found() {
        let a = array(&["hello", "lol"]).unwrap();
        let b = rindex(&a, "l").unwrap();
        assert_eq!(b.as_slice().unwrap(), &[3_i64, 2]);
    }

    #[test]
    fn test_rindex_missing_is_error() {
        let a = array(&["hello", "world"]).unwrap();
        assert!(rindex(&a, "ll").is_err());
    }

    #[test]
    fn test_count() {
        let a = array(&["abcabc", "abc", "xyz"]).unwrap();
        let b = count(&a, "abc").unwrap();
        let data = b.as_slice().unwrap();
        assert_eq!(data, &[2_i64, 1, 0]);
    }

    #[test]
    fn test_count_non_overlapping() {
        // Matches never overlap: "aaa" contains "aa" once, not twice.
        let a = array(&["aaaa", "aaa"]).unwrap();
        let b = count(&a, "aa").unwrap();
        assert_eq!(b.as_slice().unwrap(), &[2_i64, 1]);
    }

    #[test]
    fn test_startswith() {
        let a = array(&["hello", "world", "help"]).unwrap();
        let b = startswith(&a, "hel").unwrap();
        let data = b.as_slice().unwrap();
        assert_eq!(data, &[true, false, true]);
    }

    #[test]
    fn test_endswith() {
        let a = array(&["hello", "world", "bello"]).unwrap();
        let b = endswith(&a, "llo").unwrap();
        let data = b.as_slice().unwrap();
        assert_eq!(data, &[true, false, true]);
    }

    #[test]
    fn test_replace_all() {
        let a = array(&["aabbcc", "aabba"]).unwrap();
        let b = replace(&a, "aa", "XX", None).unwrap();
        assert_eq!(b.as_slice(), &["XXbbcc", "XXbba"]);
    }

    #[test]
    fn test_replace_with_count() {
        let a = array(&["ababab"]).unwrap();
        let b = replace(&a, "ab", "X", Some(2)).unwrap();
        assert_eq!(b.as_slice(), &["XXab"]);
    }

    #[test]
    fn test_replace_with_zero_count() {
        // A limit of zero replaces nothing.
        let a = array(&["ababab"]).unwrap();
        let b = replace(&a, "ab", "X", Some(0)).unwrap();
        assert_eq!(b.as_slice(), &["ababab"]);
    }

    #[test]
    fn test_find_ac3() {
        // AC-3: strings::find(&a, "ll") returns correct indices (2 for "hello", -1 for "world")
        let a = array(&["hello", "world"]).unwrap();
        let b = find(&a, "ll").unwrap();
        let data = b.as_slice().unwrap();
        assert_eq!(data, &[2, -1]);
    }
}