scirs2_numpy/
strings.rs

1//! Types to support arrays of [ASCII][ascii] and [UCS4][ucs4] strings
2//!
3//! [ascii]: https://numpy.org/doc/stable/reference/c-api/dtype.html#c.NPY_STRING
4//! [ucs4]: https://numpy.org/doc/stable/reference/c-api/dtype.html#c.NPY_UNICODE
5
6use std::collections::hash_map::Entry;
7use std::fmt;
8use std::mem::size_of;
9use std::os::raw::c_char;
10use std::str;
11use std::sync::Mutex;
12
13use pyo3::sync::MutexExt;
14use pyo3::{
15    ffi::{Py_UCS1, Py_UCS4},
16    Bound, Py, Python,
17};
18use rustc_hash::FxHashMap;
19
20use crate::dtype::{clone_methods_impl, Element, PyArrayDescr, PyArrayDescrMethods};
21use crate::npyffi::PyDataType_SET_ELSIZE;
22use crate::npyffi::NPY_TYPES;
23
24/// A newtype wrapper around [`[u8; N]`][Py_UCS1] to handle [`byte` scalars][numpy-bytes] while satisfying coherence.
25///
26/// Note that when creating arrays of ASCII strings without an explicit `dtype`,
27/// NumPy will automatically determine the smallest possible array length at runtime.
28///
29/// For example,
30///
31/// ```python
32/// array = numpy.array([b"foo", b"bar", b"foobar"])
33/// ```
34///
35/// yields `S6` for `array.dtype`.
36///
37/// On the Rust side however, the length `N` of `PyFixedString<N>` must always be given
38/// explicitly and as a compile-time constant. For this work reliably, the Python code
39/// should set the `dtype` explicitly, e.g.
40///
41/// ```python
42/// numpy.array([b"foo", b"bar", b"foobar"], dtype='S12')
43/// ```
44///
45/// always matching `PyArray1<PyFixedString<12>>`.
46///
47/// # Example
48///
49/// ```rust
50/// # use pyo3::Python;
51/// use numpy::{PyArray1, PyUntypedArrayMethods, PyFixedString};
52///
53/// # Python::attach(|py| {
54/// let array = PyArray1::<PyFixedString<3>>::from_vec(py, vec![[b'f', b'o', b'o'].into()]);
55///
56/// assert!(array.dtype().to_string().contains("S3"));
57/// # });
58/// ```
59///
60/// [numpy-bytes]: https://numpy.org/doc/stable/reference/arrays.scalars.html#numpy.bytes_
61#[repr(transparent)]
62#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)]
63pub struct PyFixedString<const N: usize>(pub [Py_UCS1; N]);
64
65impl<const N: usize> fmt::Display for PyFixedString<N> {
66    fn fmt(&self, fmt: &mut fmt::Formatter<'_>) -> fmt::Result {
67        fmt.write_str(
68            str::from_utf8(&self.0)
69                .expect("Operation failed")
70                .trim_end_matches('\0'),
71        )
72    }
73}
74
75impl<const N: usize> From<[Py_UCS1; N]> for PyFixedString<N> {
76    fn from(val: [Py_UCS1; N]) -> Self {
77        Self(val)
78    }
79}
80
81unsafe impl<const N: usize> Element for PyFixedString<N> {
82    const IS_COPY: bool = true;
83
84    fn get_dtype(py: Python<'_>) -> Bound<'_, PyArrayDescr> {
85        static DTYPES: TypeDescriptors = TypeDescriptors::new();
86
87        unsafe { DTYPES.from_size(py, NPY_TYPES::NPY_STRING, b'|' as _, size_of::<Self>()) }
88    }
89
90    clone_methods_impl!(Self);
91}
92
93/// A newtype wrapper around [`[PyUCS4; N]`][Py_UCS4] to handle [`str_` scalars][numpy-str] while satisfying coherence.
94///
95/// Note that when creating arrays of Unicode strings without an explicit `dtype`,
96/// NumPy will automatically determine the smallest possible array length at runtime.
97///
98/// For example,
99///
100/// ```python
101/// numpy.array(["foo🐍", "bar🦀", "foobar"])
102/// ```
103///
104/// yields `U6` for `array.dtype`.
105///
106/// On the Rust side however, the length `N` of `PyFixedUnicode<N>` must always be given
107/// explicitly and as a compile-time constant. For this work reliably, the Python code
108/// should set the `dtype` explicitly, e.g.
109///
110/// ```python
111/// numpy.array(["foo🐍", "bar🦀", "foobar"], dtype='U12')
112/// ```
113///
114/// always matching `PyArray1<PyFixedUnicode<12>>`.
115///
116/// # Example
117///
118/// ```rust
119/// # use pyo3::Python;
120/// use numpy::{PyArray1, PyUntypedArrayMethods, PyFixedUnicode};
121///
122/// # Python::attach(|py| {
123/// let array = PyArray1::<PyFixedUnicode<3>>::from_vec(py, vec![[b'b' as _, b'a' as _, b'r' as _].into()]);
124///
125/// assert!(array.dtype().to_string().contains("U3"));
126/// # });
127/// ```
128///
129/// [numpy-str]: https://numpy.org/doc/stable/reference/arrays.scalars.html#numpy.str_
130#[repr(transparent)]
131#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)]
132pub struct PyFixedUnicode<const N: usize>(pub [Py_UCS4; N]);
133
134impl<const N: usize> fmt::Display for PyFixedUnicode<N> {
135    fn fmt(&self, fmt: &mut fmt::Formatter<'_>) -> fmt::Result {
136        for character in self.0 {
137            if character == 0 {
138                break;
139            }
140
141            write!(
142                fmt,
143                "{}",
144                char::from_u32(character).expect("Operation failed")
145            )?;
146        }
147
148        Ok(())
149    }
150}
151
152impl<const N: usize> From<[Py_UCS4; N]> for PyFixedUnicode<N> {
153    fn from(val: [Py_UCS4; N]) -> Self {
154        Self(val)
155    }
156}
157
158unsafe impl<const N: usize> Element for PyFixedUnicode<N> {
159    const IS_COPY: bool = true;
160
161    fn get_dtype(py: Python<'_>) -> Bound<'_, PyArrayDescr> {
162        static DTYPES: TypeDescriptors = TypeDescriptors::new();
163
164        unsafe { DTYPES.from_size(py, NPY_TYPES::NPY_UNICODE, b'=' as _, size_of::<Self>()) }
165    }
166
167    clone_methods_impl!(Self);
168}
169
170struct TypeDescriptors {
171    dtypes: Mutex<Option<FxHashMap<usize, Py<PyArrayDescr>>>>,
172}
173
174impl TypeDescriptors {
175    const fn new() -> Self {
176        Self {
177            dtypes: Mutex::new(None),
178        }
179    }
180
181    /// `npy_type` must be either `NPY_STRING` or `NPY_UNICODE` with matching `byteorder` and `size`
182    #[allow(clippy::wrong_self_convention)]
183    unsafe fn from_size<'py>(
184        &self,
185        py: Python<'py>,
186        npy_type: NPY_TYPES,
187        byteorder: c_char,
188        size: usize,
189    ) -> Bound<'py, PyArrayDescr> {
190        let mut dtypes = self
191            .dtypes
192            .lock_py_attached(py)
193            .expect("dtype cache poisoned");
194
195        let dtype = match dtypes.get_or_insert_with(Default::default).entry(size) {
196            Entry::Occupied(entry) => entry.into_mut(),
197            Entry::Vacant(entry) => {
198                let dtype = PyArrayDescr::new_from_npy_type(py, npy_type);
199
200                let descr = &mut *dtype.as_dtype_ptr();
201                PyDataType_SET_ELSIZE(py, descr, size.try_into().expect("Operation failed"));
202                descr.byteorder = byteorder;
203
204                entry.insert(dtype.into())
205            }
206        };
207
208        dtype.bind(py).to_owned()
209    }
210}
211
#[cfg(test)]
mod tests {
    use super::*;

    /// Trailing NUL padding is dropped; a string that fills the array is printed in full.
    #[test]
    fn format_fixed_string() {
        let padded = PyFixedString(*b"foo\0\0\0");
        assert_eq!(padded.to_string(), "foo");

        let full = PyFixedString(*b"foobar");
        assert_eq!(full.to_string(), "foobar");
    }

    /// Covers NUL-padded ASCII, code points outside the BMP, and an array without padding.
    #[test]
    fn format_fixed_unicode() {
        let padded = PyFixedUnicode(['f', 'o', 'o', '\0', '\0', '\0'].map(Py_UCS4::from));
        assert_eq!(padded.to_string(), "foo");

        let emoji = PyFixedUnicode(['🦀', '🐍', '\0', '\0', '\0', '\0'].map(Py_UCS4::from));
        assert_eq!(emoji.to_string(), "🦀🐍");

        let full = PyFixedUnicode(['f', 'o', 'o', 'b', 'a', 'r'].map(Py_UCS4::from));
        assert_eq!(full.to_string(), "foobar");
    }
}