#[cfg(not(Py_LIMITED_API))]
use crate::exceptions::PyUnicodeDecodeError;
use crate::ffi_ptr_ext::FfiPtrExt;
use crate::instance::Borrowed;
use crate::py_result_ext::PyResultExt;
use crate::types::bytes::PyBytesMethods;
use crate::types::PyBytes;
use crate::{ffi, Bound, Py, PyAny, PyResult, Python};
use std::borrow::Cow;
use std::ffi::CStr;
use std::{fmt, str};
/// Borrowed view of the raw code units backing a Python `str`.
///
/// Each variant wraps a slice whose element width corresponds to one of the
/// `PyUnicode_*BYTE_KIND` values that `Borrowed::<PyString>::data` matches on.
///
/// Only compiled when the limited API is not in use.
#[cfg(not(Py_LIMITED_API))]
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum PyStringData<'a> {
/// One byte per element (built for `PyUnicode_1BYTE_KIND`).
Ucs1(&'a [u8]),
/// Two bytes per element (built for `PyUnicode_2BYTE_KIND`).
Ucs2(&'a [u16]),
/// Four bytes per element (built for `PyUnicode_4BYTE_KIND`).
Ucs4(&'a [u32]),
}
#[cfg(not(Py_LIMITED_API))]
impl<'a> PyStringData<'a> {
    /// Views the backing code units as raw bytes.
    ///
    /// The result is `len * value_width_bytes()` bytes long. For the 2- and 4-byte
    /// variants the bytes are the in-memory representation of the `u16`/`u32`
    /// elements, i.e. they are in the host's native byte order.
    pub fn as_bytes(&self) -> &[u8] {
        let width = self.value_width_bytes();
        match self {
            Self::Ucs1(units) => units,
            // SAFETY: `units` is a live slice, so its pointer is valid for reads of
            // `units.len() * width` bytes, and `u8` has no alignment requirement.
            Self::Ucs2(units) => unsafe {
                std::slice::from_raw_parts(units.as_ptr().cast(), units.len() * width)
            },
            // SAFETY: same reasoning as for the 2-byte variant.
            Self::Ucs4(units) => unsafe {
                std::slice::from_raw_parts(units.as_ptr().cast(), units.len() * width)
            },
        }
    }

    /// Size in bytes of a single element of the wrapped slice: 1, 2 or 4.
    #[inline]
    pub fn value_width_bytes(&self) -> usize {
        match *self {
            Self::Ucs1(..) => std::mem::size_of::<u8>(),
            Self::Ucs2(..) => std::mem::size_of::<u16>(),
            Self::Ucs4(..) => std::mem::size_of::<u32>(),
        }
    }

    /// Decodes the raw data into a Rust string.
    ///
    /// * `Ucs1` data is validated as UTF-8 and borrowed on success.
    /// * `Ucs2` data is decoded as UTF-16 into an owned `String`.
    /// * `Ucs4` data is decoded one `char` per element into an owned `String`.
    ///
    /// # Errors
    ///
    /// Returns a `PyUnicodeDecodeError` (naming the `utf-8`, `utf-16` or `utf-32`
    /// codec respectively) when the data cannot be decoded. If building that
    /// exception itself fails, the resulting error is returned instead.
    ///
    /// NOTE(review): the 1-byte variant is interpreted as UTF-8 here, so any byte
    /// >= 0x80 that is not part of a valid UTF-8 sequence is rejected — confirm this
    /// matches how callers expect 1-byte `str` storage to be decoded.
    pub fn to_string(self, py: Python<'_>) -> PyResult<Cow<'a, str>> {
        match self {
            Self::Ucs1(data) => {
                let utf8_err = match str::from_utf8(data) {
                    Ok(valid) => return Ok(Cow::Borrowed(valid)),
                    Err(e) => e,
                };
                Err(PyUnicodeDecodeError::new_utf8(py, data, utf8_err)?.into())
            }
            Self::Ucs2(data) => {
                let utf16_err = match String::from_utf16(data) {
                    Ok(decoded) => return Ok(Cow::Owned(decoded)),
                    Err(e) => e,
                };
                // The exception constructor wants a C string, so NUL-terminate the
                // Rust error text before wrapping it in a `CStr`.
                let mut reason = utf16_err.to_string().into_bytes();
                reason.push(0);
                let raw = self.as_bytes();
                Err(PyUnicodeDecodeError::new(
                    py,
                    c"utf-16",
                    raw,
                    0..raw.len(),
                    CStr::from_bytes_with_nul(&reason).unwrap(),
                )?
                .into())
            }
            Self::Ucs4(data) => {
                // Collecting into `Option<String>` yields `None` as soon as one
                // element is not a valid Unicode scalar value.
                let decoded: Option<String> =
                    data.iter().map(|&unit| std::char::from_u32(unit)).collect();
                if let Some(decoded) = decoded {
                    return Ok(Cow::Owned(decoded));
                }
                let raw = self.as_bytes();
                Err(PyUnicodeDecodeError::new(
                    py,
                    c"utf-32",
                    raw,
                    0..raw.len(),
                    c"error converting utf-32",
                )?
                .into())
            }
        }
    }

    /// Decodes the raw data into a Rust string, substituting U+FFFD for anything
    /// that cannot be decoded.
    ///
    /// `Ucs1` goes through `String::from_utf8_lossy` (which may borrow), `Ucs2`
    /// through `String::from_utf16_lossy`, and `Ucs4` maps every element that is
    /// not a valid `char` to U+FFFD.
    pub fn to_string_lossy(self) -> Cow<'a, str> {
        match self {
            Self::Ucs1(data) => String::from_utf8_lossy(data),
            Self::Ucs2(data) => {
                let lossy = String::from_utf16_lossy(data);
                Cow::Owned(lossy)
            }
            Self::Ucs4(data) => {
                let lossy: String = data
                    .iter()
                    .map(|&unit| std::char::from_u32(unit).unwrap_or('\u{FFFD}'))
                    .collect();
                Cow::Owned(lossy)
            }
        }
    }
}
/// Type representing a Python `str` object.
///
/// Declared as a `#[repr(transparent)]` newtype around `PyAny`.
#[repr(transparent)]
pub struct PyString(PyAny);
// Native-type boilerplate for `PyString`: the macro is given `ffi::PyUnicode_Type` as the type
// object, the module/name pair `"builtins"` / `"str"`, and `ffi::PyUnicode_Check` as the check
// function. (The macro itself is defined outside this file.)
pyobject_native_type_core!(PyString, pyobject_native_static_type_object!(ffi::PyUnicode_Type), "builtins", "str", #checkfunction=ffi::PyUnicode_Check);
impl PyString {
    /// Creates a new Python `str` from the UTF-8 contents of `s`.
    ///
    /// NOTE(review): the raw result of `PyUnicode_FromStringAndSize` is handed to
    /// `assume_owned` without a null check here; presumably that helper panics on
    /// a null pointer — confirm against `FfiPtrExt`.
    pub fn new<'py>(py: Python<'py>, s: &str) -> Bound<'py, PyString> {
        let utf8 = s.as_bytes();
        let size = utf8.len() as ffi::Py_ssize_t;
        // SAFETY: the pointer/length pair describes the live `utf8` slice, and the
        // returned object is created by `PyUnicode_FromStringAndSize`, so treating
        // it as a `PyString` is sound.
        unsafe {
            let raw = ffi::PyUnicode_FromStringAndSize(utf8.as_ptr().cast(), size);
            raw.assume_owned(py).cast_into_unchecked()
        }
    }

    /// Creates a new Python `str` from a byte slice that is expected to be UTF-8.
    ///
    /// # Errors
    ///
    /// Returns the Python exception raised by `PyUnicode_FromStringAndSize` when
    /// the object cannot be created (the tests in this file expect a
    /// `UnicodeDecodeError` for the invalid input `b"\x80"`).
    pub fn from_bytes<'py>(py: Python<'py>, s: &[u8]) -> PyResult<Bound<'py, PyString>> {
        let size = s.len() as ffi::Py_ssize_t;
        // SAFETY: the pointer/length pair describes the live slice `s`; a null
        // result is converted into `Err` before any cast takes place.
        unsafe {
            let raw = ffi::PyUnicode_FromStringAndSize(s.as_ptr().cast(), size);
            raw.assume_owned_or_err(py).cast_into_unchecked()
        }
    }

    /// Creates a new Python `str` from `s` and interns it.
    ///
    /// The string is first created with `PyUnicode_FromStringAndSize`; if that
    /// succeeded it is passed to `PyUnicode_InternInPlace`, which may replace the
    /// pointer with the already-interned object.
    pub fn intern<'py>(py: Python<'py>, s: &str) -> Bound<'py, PyString> {
        let utf8 = s.as_bytes();
        let size = utf8.len() as ffi::Py_ssize_t;
        // SAFETY: the pointer/length pair describes the live `utf8` slice, and
        // interning is only attempted on a non-null object pointer.
        unsafe {
            let mut interned = ffi::PyUnicode_FromStringAndSize(utf8.as_ptr().cast(), size);
            if !interned.is_null() {
                ffi::PyUnicode_InternInPlace(&mut interned);
            }
            interned.assume_owned(py).cast_into_unchecked()
        }
    }

    /// Decodes `src` into a new Python `str` via `PyUnicode_FromEncodedObject`.
    ///
    /// `encoding` and `errors` are forwarded as C strings; `None` is forwarded as
    /// a null pointer, leaving the choice to Python.
    ///
    /// # Errors
    ///
    /// Returns whatever exception Python raises. The tests in this file show a
    /// `UnicodeDecodeError` for undecodable bytes and a `LookupError` for an
    /// unknown encoding or error-handler name.
    pub fn from_encoded_object<'py>(
        src: &Bound<'py, PyAny>,
        encoding: Option<&CStr>,
        errors: Option<&CStr>,
    ) -> PyResult<Bound<'py, PyString>> {
        let encoding_ptr = match encoding {
            Some(name) => name.as_ptr(),
            None => std::ptr::null(),
        };
        let errors_ptr = match errors {
            Some(name) => name.as_ptr(),
            None => std::ptr::null(),
        };
        // SAFETY: `src` is a live Python object and both pointers are either null
        // or point at NUL-terminated strings that outlive the call; a null result
        // is converted into `Err` before any cast takes place.
        unsafe {
            let decoded = ffi::PyUnicode_FromEncodedObject(src.as_ptr(), encoding_ptr, errors_ptr);
            decoded
                .assume_owned_or_err(src.py())
                .cast_into_unchecked()
        }
    }

    /// Creates a Python `str` from pre-compiled format arguments.
    ///
    /// When `args` is just a literal with nothing to interpolate, the literal is
    /// used directly. Otherwise, on Python 3.14+ without the limited API the
    /// output is streamed into a `PyUnicodeWriter`; on every other configuration
    /// it is formatted into a temporary Rust `String` first.
    #[inline]
    pub fn from_fmt<'py>(
        py: Python<'py>,
        args: fmt::Arguments<'_>,
    ) -> PyResult<Bound<'py, PyString>> {
        // Fast path: nothing to format.
        if let Some(literal) = args.as_str() {
            return Ok(PyString::new(py, literal));
        }

        #[cfg(all(Py_3_14, not(Py_LIMITED_API)))]
        {
            use crate::fmt::PyUnicodeWriter;
            use std::fmt::Write as _;

            let mut writer = PyUnicodeWriter::new(py)?;
            if writer.write_fmt(args).is_err() {
                // `fmt::Error` carries no detail; the writer holds the real error.
                return Err(writer.take_error().expect("expected error"));
            }
            writer.into_py_string()
        }

        #[cfg(any(not(Py_3_14), Py_LIMITED_API))]
        {
            let rendered = fmt::format(args);
            Ok(PyString::new(py, &rendered))
        }
    }
}
/// Methods for working with a Python `str`.
///
/// Implemented in this file for `Bound<'py, PyString>`. The `crate::sealed::Sealed`
/// supertrait presumably prevents implementations outside this crate — verify against
/// the `sealed` module.
#[doc(alias = "PyString")]
pub trait PyStringMethods<'py>: crate::sealed::Sealed {
/// Returns the string's contents as a borrowed UTF-8 `&str`.
///
/// Fails if Python cannot provide a UTF-8 representation (the tests in this file
/// exercise this with a lone surrogate).
///
/// Only available on Python 3.10+ or when not using the limited API.
#[cfg(any(Py_3_10, not(Py_LIMITED_API)))]
fn to_str(&self) -> PyResult<&str>;
/// Returns the string's contents as a `Cow<str>`: borrowed where `to_str` is
/// available, otherwise an owned copy produced via `encode_utf8`.
fn to_cow(&self) -> PyResult<Cow<'_, str>>;
/// Returns the string's contents, replacing anything that is not valid UTF-8
/// with U+FFFD instead of failing.
fn to_string_lossy(&self) -> Cow<'_, str>;
/// Encodes the string as UTF-8 and returns the result as a Python `bytes` object.
fn encode_utf8(&self) -> PyResult<Bound<'py, PyBytes>>;
/// Returns the raw code units backing the string as a `PyStringData`.
///
/// Not available under the limited API, GraalPy or PyPy.
///
/// # Safety
///
/// NOTE(review): the exact obligations on the caller are not stated in the visible
/// code; the implementation reads the object's kind, length and data pointer through
/// `ffi::PyUnicode_KIND` / `PyUnicode_GET_LENGTH` / `PyUnicode_DATA` — confirm the
/// intended contract before relying on it.
#[cfg(not(any(Py_LIMITED_API, GraalPy, PyPy)))]
unsafe fn data(&self) -> PyResult<PyStringData<'_>>;
}
/// `PyStringMethods` for an owned, GIL-bound reference. Everything except
/// `encode_utf8` is forwarded to the matching `Borrowed<PyString>` method.
impl<'py> PyStringMethods<'py> for Bound<'py, PyString> {
    #[cfg(any(Py_3_10, not(Py_LIMITED_API)))]
    fn to_str(&self) -> PyResult<&str> {
        let borrowed = self.as_borrowed();
        borrowed.to_str()
    }

    fn to_cow(&self) -> PyResult<Cow<'_, str>> {
        let borrowed = self.as_borrowed();
        borrowed.to_cow()
    }

    fn to_string_lossy(&self) -> Cow<'_, str> {
        let borrowed = self.as_borrowed();
        borrowed.to_string_lossy()
    }

    fn encode_utf8(&self) -> PyResult<Bound<'py, PyBytes>> {
        let py = self.py();
        // SAFETY: `self` is a live `str` object; a null result is turned into
        // `Err` before the cast, and a non-null result of
        // `PyUnicode_AsUTF8String` is a `bytes` object.
        unsafe {
            let encoded = ffi::PyUnicode_AsUTF8String(self.as_ptr());
            encoded
                .assume_owned_or_err(py)
                .cast_into_unchecked::<PyBytes>()
        }
    }

    #[cfg(not(any(Py_LIMITED_API, GraalPy, PyPy)))]
    unsafe fn data(&self) -> PyResult<PyStringData<'_>> {
        let borrowed = self.as_borrowed();
        // SAFETY: the caller upholds the contract of `PyStringMethods::data`,
        // which is exactly what the borrowed implementation requires.
        unsafe { borrowed.data() }
    }
}
/// String accessors on a borrowed `PyString`; results are tied to the borrow
/// lifetime `'a` rather than to a temporary.
impl<'a> Borrowed<'a, '_, PyString> {
    /// Returns the string's UTF-8 representation as obtained from
    /// `PyUnicode_AsUTF8AndSize`.
    ///
    /// # Errors
    ///
    /// Fetches and returns the pending Python exception when the FFI call yields
    /// a null pointer.
    #[cfg(any(Py_3_10, not(Py_LIMITED_API)))]
    pub(crate) fn to_str(self) -> PyResult<&'a str> {
        let mut len: ffi::Py_ssize_t = 0;
        // SAFETY: `self` is a live `str` object and `len` is a valid out-pointer.
        let utf8: *const u8 =
            unsafe { ffi::PyUnicode_AsUTF8AndSize(self.as_ptr(), &mut len).cast() };
        if utf8.is_null() {
            return Err(crate::PyErr::fetch(self.py()));
        }
        // SAFETY: on success Python reports `len` bytes of UTF-8 at `utf8`.
        // NOTE(review): the `'a` lifetime assumes the buffer lives as long as the
        // string object — confirm against the CPython documentation.
        let text = unsafe {
            let bytes = std::slice::from_raw_parts(utf8, len as usize);
            std::str::from_utf8_unchecked(bytes)
        };
        Ok(text)
    }

    /// Returns the string's contents as a `Cow<str>`.
    ///
    /// Where `to_str` is available the result borrows from the Python object;
    /// otherwise the string is encoded to a temporary `bytes` object and copied.
    pub(crate) fn to_cow(self) -> PyResult<Cow<'a, str>> {
        #[cfg(any(Py_3_10, not(Py_LIMITED_API)))]
        {
            let text = self.to_str()?;
            Ok(Cow::Borrowed(text))
        }

        #[cfg(not(any(Py_3_10, not(Py_LIMITED_API))))]
        {
            let encoded = self.encode_utf8()?;
            // SAFETY: `encode_utf8` succeeded, so the bytes are valid UTF-8.
            let text = unsafe { str::from_utf8_unchecked(encoded.as_bytes()) };
            Ok(Cow::Owned(text.to_owned()))
        }
    }

    /// Returns the string's contents, replacing undecodable parts with U+FFFD.
    ///
    /// Tries the borrowing `to_str` first where it exists. On failure (or where
    /// it is unavailable) the string is encoded as UTF-8 with the
    /// `surrogatepass` error handler and the bytes are decoded lossily.
    fn to_string_lossy(self) -> Cow<'a, str> {
        let ptr = self.as_ptr();
        let py = self.py();

        #[cfg(any(Py_3_10, not(Py_LIMITED_API)))]
        {
            if let Ok(valid) = self.to_str() {
                return Cow::Borrowed(valid);
            }
        }

        // SAFETY: `ptr` refers to a live `str` object and both C-string arguments
        // are NUL-terminated literals.
        // NOTE(review): a null result would reach `assume_owned`; presumably that
        // panics — confirm against `FfiPtrExt`.
        let encoded = unsafe {
            ffi::PyUnicode_AsEncodedString(ptr, c"utf-8".as_ptr(), c"surrogatepass".as_ptr())
                .assume_owned(py)
                .cast_into_unchecked::<PyBytes>()
        };
        let lossy = String::from_utf8_lossy(encoded.as_bytes());
        Cow::Owned(lossy.into_owned())
    }

    /// Returns the raw code units backing the string.
    ///
    /// Before Python 3.12 the string is first passed through `PyUnicode_READY`,
    /// and a non-zero result is reported as the pending Python exception.
    ///
    /// # Safety
    ///
    /// NOTE(review): the caller obligations are not spelled out in the visible
    /// code; this reads the object's length, data pointer and kind directly via
    /// the `ffi` accessors — confirm the intended contract.
    #[cfg(not(any(Py_LIMITED_API, GraalPy, PyPy)))]
    unsafe fn data(self) -> PyResult<PyStringData<'a>> {
        let ptr = self.as_ptr();
        unsafe {
            #[cfg(not(Py_3_12))]
            #[allow(deprecated)]
            {
                if ffi::PyUnicode_READY(ptr) != 0 {
                    return Err(crate::PyErr::fetch(self.py()));
                }
            }

            let len = ffi::PyUnicode_GET_LENGTH(ptr) as usize;
            let raw = ffi::PyUnicode_DATA(ptr);
            let kind = ffi::PyUnicode_KIND(ptr);

            // The kind decides the element width the data pointer is viewed with.
            let data = match kind {
                ffi::PyUnicode_1BYTE_KIND => {
                    PyStringData::Ucs1(std::slice::from_raw_parts(raw as *const u8, len))
                }
                ffi::PyUnicode_2BYTE_KIND => {
                    PyStringData::Ucs2(std::slice::from_raw_parts(raw as *const u16, len))
                }
                ffi::PyUnicode_4BYTE_KIND => {
                    PyStringData::Ucs4(std::slice::from_raw_parts(raw as *const u32, len))
                }
                _ => unreachable!(),
            };
            Ok(data)
        }
    }
}
/// String accessors on an unbound `Py<PyString>`. Each method binds the
/// reference with the supplied `py` token and forwards to the
/// `Borrowed<PyString>` implementation, so results borrow from `self`.
impl Py<PyString> {
    /// Returns the string's contents as a borrowed UTF-8 `&str`.
    ///
    /// Only available on Python 3.10+ or when not using the limited API.
    #[cfg(any(Py_3_10, not(Py_LIMITED_API)))]
    pub fn to_str<'a>(&'a self, py: Python<'_>) -> PyResult<&'a str> {
        let borrowed = self.bind_borrowed(py);
        borrowed.to_str()
    }

    /// Returns the string's contents as a `Cow<str>`.
    pub fn to_cow<'a>(&'a self, py: Python<'_>) -> PyResult<Cow<'a, str>> {
        let borrowed = self.bind_borrowed(py);
        borrowed.to_cow()
    }

    /// Returns the string's contents, replacing undecodable parts with U+FFFD.
    pub fn to_string_lossy<'a>(&'a self, py: Python<'_>) -> Cow<'a, str> {
        let borrowed = self.bind_borrowed(py);
        borrowed.to_string_lossy()
    }
}
/// `Bound<PyString> == str`: compares the Python string's contents with a Rust
/// string by forwarding to the `Borrowed<PyString>` comparison.
impl PartialEq<str> for Bound<'_, PyString> {
    #[inline]
    fn eq(&self, other: &str) -> bool {
        let lhs = self.as_borrowed();
        lhs == *other
    }
}
/// `Bound<PyString> == &str`: strips the extra reference and forwards to the
/// `Borrowed<PyString>` comparison against `str`.
impl PartialEq<&'_ str> for Bound<'_, PyString> {
    #[inline]
    fn eq(&self, other: &&str) -> bool {
        let rhs: &str = other;
        let lhs = self.as_borrowed();
        lhs == *rhs
    }
}
/// `str == Bound<PyString>`: forwards to the comparison between `str` and
/// `Borrowed<PyString>`.
impl PartialEq<Bound<'_, PyString>> for str {
    #[inline]
    fn eq(&self, other: &Bound<'_, PyString>) -> bool {
        let rhs = other.as_borrowed();
        *self == rhs
    }
}
/// `str == &Bound<PyString>`: forwards to the comparison between `str` and
/// `Borrowed<PyString>`.
impl PartialEq<&'_ Bound<'_, PyString>> for str {
    #[inline]
    fn eq(&self, other: &&Bound<'_, PyString>) -> bool {
        let rhs = other.as_borrowed();
        *self == rhs
    }
}
/// `&str == Bound<PyString>`: strips the extra reference and forwards to the
/// comparison between `str` and `Borrowed<PyString>`.
impl PartialEq<Bound<'_, PyString>> for &'_ str {
    #[inline]
    fn eq(&self, other: &Bound<'_, PyString>) -> bool {
        let lhs: &str = self;
        let rhs = other.as_borrowed();
        *lhs == rhs
    }
}
/// `&Bound<PyString> == str`: forwards to the `Borrowed<PyString>` comparison
/// against `&str`.
impl PartialEq<str> for &'_ Bound<'_, PyString> {
    #[inline]
    fn eq(&self, other: &str) -> bool {
        let lhs = self.as_borrowed();
        lhs == other
    }
}
/// `Borrowed<PyString> == str`: the comparison every other `PartialEq` impl
/// between `PyString` handles and Rust strings in this file ends up in.
///
/// Before Python 3.13 the Python string is converted with `to_cow` and compared
/// in Rust; a conversion failure counts as "not equal". From Python 3.13 on the
/// comparison is delegated to `PyUnicode_EqualToUTF8AndSize`.
impl PartialEq<str> for Borrowed<'_, '_, PyString> {
    #[inline]
    fn eq(&self, other: &str) -> bool {
        #[cfg(not(Py_3_13))]
        {
            match self.to_cow() {
                Ok(contents) => contents == other,
                Err(_) => false,
            }
        }

        #[cfg(Py_3_13)]
        {
            // SAFETY: `self` is a live `str` object and the pointer/length pair
            // describes the live `other` slice.
            let verdict = unsafe {
                ffi::PyUnicode_EqualToUTF8AndSize(
                    self.as_ptr(),
                    other.as_ptr().cast(),
                    other.len() as _,
                )
            };
            verdict == 1
        }
    }
}
/// `Borrowed<PyString> == &str`: strips the extra reference and forwards to the
/// comparison against `str`.
impl PartialEq<&str> for Borrowed<'_, '_, PyString> {
    #[inline]
    fn eq(&self, other: &&str) -> bool {
        let rhs: &str = other;
        *self == *rhs
    }
}
/// `str == Borrowed<PyString>`: swaps the operands and reuses the
/// `Borrowed<PyString> == str` comparison.
impl PartialEq<Borrowed<'_, '_, PyString>> for str {
    #[inline]
    fn eq(&self, other: &Borrowed<'_, '_, PyString>) -> bool {
        *other == *self
    }
}
/// `&str == Borrowed<PyString>`: swaps the operands, strips the extra reference
/// and reuses the `Borrowed<PyString> == str` comparison.
impl PartialEq<Borrowed<'_, '_, PyString>> for &'_ str {
    #[inline]
    fn eq(&self, other: &Borrowed<'_, '_, PyString>) -> bool {
        let lhs: &str = self;
        *other == *lhs
    }
}
#[cfg(test)]
mod tests {
use super::*;
use crate::{exceptions::PyLookupError, types::PyAnyMethods as _, IntoPyObject};
// `to_cow` on a string mixing ASCII with a character outside the BMP returns the
// original Rust text.
#[test]
fn test_to_cow_utf8() {
Python::attach(|py| {
let s = "ascii 🐈";
let py_string = PyString::new(py, s);
assert_eq!(s, py_string.to_cow().unwrap());
})
}
// A lone surrogate (built on the Python side via `eval`) makes `to_cow` return an error.
#[test]
fn test_to_cow_surrogate() {
Python::attach(|py| {
let py_string = py
.eval(cr"'\ud800'", None, None)
.unwrap()
.cast_into::<PyString>()
.unwrap();
assert!(py_string.to_cow().is_err());
})
}
// `to_cow` round-trips a string made up entirely of non-ASCII characters.
#[test]
fn test_to_cow_unicode() {
Python::attach(|py| {
let s = "哈哈🐈";
let py_string = PyString::new(py, s);
assert_eq!(s, py_string.to_cow().unwrap());
})
}
// `encode_utf8` yields exactly the UTF-8 bytes of the Rust string the object was built from.
#[test]
fn test_encode_utf8_unicode() {
Python::attach(|py| {
let s = "哈哈🐈";
let obj = PyString::new(py, s);
assert_eq!(s.as_bytes(), obj.encode_utf8().unwrap().as_bytes());
})
}
// `encode_utf8` reports an error for a string containing a lone surrogate.
#[test]
fn test_encode_utf8_surrogate() {
Python::attach(|py| {
let obj: Py<PyAny> = py.eval(cr"'\ud800'", None, None).unwrap().into();
assert!(obj
.bind(py)
.cast::<PyString>()
.unwrap()
.encode_utf8()
.is_err());
})
}
// `to_string_lossy` keeps the valid text and emits three U+FFFD characters in place of the
// single lone surrogate.
#[test]
fn test_to_string_lossy() {
Python::attach(|py| {
let py_string = py
.eval(cr"'🐈 Hello \ud800World'", None, None)
.unwrap()
.cast_into::<PyString>()
.unwrap();
assert_eq!(py_string.to_string_lossy(), "🐈 Hello ���World");
})
}
// `{:?}` formatting of a Python string produces the quoted, escaped form `'Hello\n'`.
#[test]
fn test_debug_string() {
Python::attach(|py| {
let s = "Hello\n".into_pyobject(py).unwrap();
assert_eq!(format!("{s:?}"), "'Hello\\n'");
})
}
// `{}` formatting of a Python string produces the raw contents, newline included.
#[test]
fn test_display_string() {
Python::attach(|py| {
let s = "Hello\n".into_pyobject(py).unwrap();
assert_eq!(format!("{s}"), "Hello\n");
})
}
// `from_encoded_object` with default encoding/errors rejects the invalid byte 0xFF with a
// `UnicodeDecodeError`; with the `ignore` error handler the byte is dropped instead.
#[test]
fn test_string_from_encoded_object() {
Python::attach(|py| {
let py_bytes = PyBytes::new(py, b"ab\xFFcd");
// Note: despite its name, this first `py_string` binding holds the returned error.
let py_string = PyString::from_encoded_object(&py_bytes, None, None).unwrap_err();
assert!(py_string
.get_type(py)
.is(py.get_type::<crate::exceptions::PyUnicodeDecodeError>()));
let py_string =
PyString::from_encoded_object(&py_bytes, None, Some(c"ignore")).unwrap();
let result = py_string.to_cow().unwrap();
assert_eq!(result, "abcd");
});
}
// Unknown names for either the encoding or the error handler surface as `LookupError`
// with Python's own message text.
#[test]
fn test_string_from_encoded_object_with_invalid_encoding_errors() {
Python::attach(|py| {
let py_bytes = PyBytes::new(py, b"abcd");
// Unknown encoding name.
let err = PyString::from_encoded_object(&py_bytes, Some(c"wat"), None).unwrap_err();
assert!(err.is_instance(py, &py.get_type::<PyLookupError>()));
assert_eq!(err.to_string(), "LookupError: unknown encoding: wat");
// Unknown error-handler name; the input contains an invalid byte (0xFF).
let err =
PyString::from_encoded_object(&PyBytes::new(py, b"ab\xFFcd"), None, Some(c"wat"))
.unwrap_err();
assert!(err.is_instance(py, &py.get_type::<PyLookupError>()));
assert_eq!(
err.to_string(),
"LookupError: unknown error handler name 'wat'"
);
});
}
// An ASCII-only string is exposed as `Ucs1` data, and both conversions borrow the text.
#[test]
#[cfg(not(any(Py_LIMITED_API, PyPy, GraalPy)))]
fn test_string_data_ucs1() {
Python::attach(|py| {
let s = PyString::new(py, "hello, world");
let data = unsafe { s.data().unwrap() };
assert_eq!(data, PyStringData::Ucs1(b"hello, world"));
assert_eq!(data.to_string(py).unwrap(), Cow::Borrowed("hello, world"));
assert_eq!(data.to_string_lossy(), Cow::Borrowed("hello, world"));
})
}
// 1-byte data containing 0xFE is not valid UTF-8: `to_string` must fail with a
// `UnicodeDecodeError` naming the `utf-8` codec, while `to_string_lossy` substitutes U+FFFD.
#[test]
#[cfg(not(any(Py_LIMITED_API, PyPy, GraalPy)))]
fn test_string_data_ucs1_invalid() {
Python::attach(|py| {
// Only the first two bytes are handed to Python (length argument is 2).
let buffer = b"f\xfe\0";
let ptr = unsafe {
crate::ffi::PyUnicode_FromKindAndData(
crate::ffi::PyUnicode_1BYTE_KIND as _,
buffer.as_ptr().cast(),
2,
)
};
assert!(!ptr.is_null());
let s = unsafe { ptr.assume_owned(py).cast_into_unchecked::<PyString>() };
let data = unsafe { s.data().unwrap() };
assert_eq!(data, PyStringData::Ucs1(b"f\xfe"));
let err = data.to_string(py).unwrap_err();
assert!(err.get_type(py).is(py.get_type::<PyUnicodeDecodeError>()));
assert!(err
.to_string()
.contains("'utf-8' codec can't decode byte 0xfe in position 1"));
assert_eq!(data.to_string_lossy(), Cow::Borrowed("f�"));
});
}
// A string containing the lone surrogate U+D800 is exposed as `Ucs2` data; the lossy
// conversion replaces the surrogate with U+FFFD.
#[test]
#[cfg(not(any(Py_LIMITED_API, PyPy, GraalPy)))]
fn test_string_data_ucs2() {
Python::attach(|py| {
let s = py.eval(c"'foo\\ud800'", None, None).unwrap();
let py_string = s.cast::<PyString>().unwrap();
let data = unsafe { py_string.data().unwrap() };
assert_eq!(data, PyStringData::Ucs2(&[102, 111, 111, 0xd800]));
assert_eq!(
data.to_string_lossy(),
Cow::Owned::<str>("foo�".to_string())
);
})
}
// 2-byte data ending in a lone surrogate: `to_string` must fail with a
// `UnicodeDecodeError` naming the `utf-16` codec, while `to_string_lossy` keeps the
// valid code unit and substitutes U+FFFD for the surrogate.
//
// Restricted to little-endian targets because the buffer spells out the code units
// byte by byte.
#[test]
#[cfg(all(not(any(Py_LIMITED_API, PyPy, GraalPy)), target_endian = "little"))]
fn test_string_data_ucs2_invalid() {
    Python::attach(|py| {
        // Little-endian code units 0xff22, 0xd800, 0x0000; only the first two are
        // handed to Python (length argument is 2).
        let buffer = b"\x22\xff\x00\xd8\x00\x00";
        let ptr = unsafe {
            crate::ffi::PyUnicode_FromKindAndData(
                crate::ffi::PyUnicode_2BYTE_KIND as _,
                buffer.as_ptr().cast(),
                2,
            )
        };
        assert!(!ptr.is_null());
        let s = unsafe { ptr.assume_owned(py).cast_into_unchecked::<PyString>() };
        let data = unsafe { s.data().unwrap() };
        assert_eq!(data, PyStringData::Ucs2(&[0xff22, 0xd800]));
        let err = data.to_string(py).unwrap_err();
        assert!(err.get_type(py).is(py.get_type::<PyUnicodeDecodeError>()));
        assert!(err
            .to_string()
            .contains("'utf-16' codec can't decode bytes in position 0-3"));
        // Code unit 0xff22 is U+FF22 (FULLWIDTH LATIN CAPITAL LETTER B), not ASCII
        // 'B'. The expectation is written with escapes so the two cannot be confused
        // or silently normalized into one another.
        assert_eq!(
            data.to_string_lossy(),
            Cow::Owned::<str>("\u{ff22}\u{fffd}".into())
        );
    });
}
// A string containing a character outside the BMP is exposed as `Ucs4` data, one element
// per code point, and converts back to the original text.
#[test]
#[cfg(not(any(Py_LIMITED_API, PyPy, GraalPy)))]
fn test_string_data_ucs4() {
Python::attach(|py| {
let s = "哈哈🐈";
let py_string = PyString::new(py, s);
let data = unsafe { py_string.data().unwrap() };
assert_eq!(data, PyStringData::Ucs4(&[21704, 21704, 128008]));
assert_eq!(data.to_string_lossy(), Cow::Owned::<str>(s.to_string()));
})
}
// 4-byte data containing the surrogate value 0xd800: `to_string` must fail with a
// `UnicodeDecodeError` naming the `utf-32` codec, while `to_string_lossy` substitutes U+FFFD.
// Restricted to little-endian targets because the buffer spells out the elements byte by byte.
#[test]
#[cfg(all(not(any(Py_LIMITED_API, PyPy, GraalPy)), target_endian = "little"))]
fn test_string_data_ucs4_invalid() {
Python::attach(|py| {
// Little-endian elements 0x20000, 0xd800, 0x0; only the first two are handed to Python.
let buffer = b"\x00\x00\x02\x00\x00\xd8\x00\x00\x00\x00\x00\x00";
let ptr = unsafe {
crate::ffi::PyUnicode_FromKindAndData(
crate::ffi::PyUnicode_4BYTE_KIND as _,
buffer.as_ptr().cast(),
2,
)
};
assert!(!ptr.is_null());
let s = unsafe { ptr.assume_owned(py).cast_into_unchecked::<PyString>() };
let data = unsafe { s.data().unwrap() };
assert_eq!(data, PyStringData::Ucs4(&[0x20000, 0xd800]));
let err = data.to_string(py).unwrap_err();
assert!(err.get_type(py).is(py.get_type::<PyUnicodeDecodeError>()));
assert!(err
.to_string()
.contains("'utf-32' codec can't decode bytes in position 0-7"));
assert_eq!(data.to_string_lossy(), Cow::Owned::<str>("𠀀�".into()));
});
}
// `from_bytes` accepts valid UTF-8 and reports a `UnicodeDecodeError` for the invalid
// single byte 0x80.
#[test]
#[cfg(not(any(Py_LIMITED_API, PyPy, GraalPy)))]
fn test_pystring_from_bytes() {
Python::attach(|py| {
let result = PyString::from_bytes(py, "\u{2122}".as_bytes());
assert!(result.is_ok());
let result = PyString::from_bytes(py, b"\x80");
assert!(result
.unwrap_err()
.get_type(py)
.is(py.get_type::<PyUnicodeDecodeError>()));
});
}
// Interning the same text twice yields the same object pointer; different text yields a
// different object.
#[test]
fn test_intern_string() {
Python::attach(|py| {
let py_string1 = PyString::intern(py, "foo");
assert_eq!(py_string1, "foo");
let py_string2 = PyString::intern(py, "foo");
assert_eq!(py_string2, "foo");
assert_eq!(py_string1.as_ptr(), py_string2.as_ptr());
let py_string3 = PyString::intern(py, "bar");
assert_eq!(py_string3, "bar");
assert_ne!(py_string1.as_ptr(), py_string3.as_ptr());
});
}
// `Py<PyString>::to_str` (where available) and `to_cow` both return the original text.
#[test]
fn test_py_to_str_utf8() {
Python::attach(|py| {
let s = "ascii 🐈";
let py_string = PyString::new(py, s).unbind();
#[cfg(any(Py_3_10, not(Py_LIMITED_API)))]
assert_eq!(s, py_string.to_str(py).unwrap());
assert_eq!(s, py_string.to_cow(py).unwrap());
})
}
// `Py<PyString>::to_str` (where available) and `to_cow` both fail on a lone surrogate.
#[test]
fn test_py_to_str_surrogate() {
Python::attach(|py| {
let py_string: Py<PyString> = py
.eval(cr"'\ud800'", None, None)
.unwrap()
.extract()
.unwrap();
#[cfg(any(Py_3_10, not(Py_LIMITED_API)))]
assert!(py_string.to_str(py).is_err());
assert!(py_string.to_cow(py).is_err());
})
}
// `Py<PyString>::to_string_lossy` emits three U+FFFD characters in place of the lone surrogate.
#[test]
fn test_py_to_string_lossy() {
Python::attach(|py| {
let py_string: Py<PyString> = py
.eval(cr"'🐈 Hello \ud800World'", None, None)
.unwrap()
.extract()
.unwrap();
assert_eq!(py_string.to_string_lossy(py), "🐈 Hello ���World");
})
}
// Exercises the `PartialEq` impls between Python string handles and Rust strings, in both
// operand orders and with/without an extra reference on either side.
#[test]
fn test_comparisons() {
Python::attach(|py| {
let s = "hello, world";
let py_string = PyString::new(py, s);
// `Bound<PyString>` (and `&Bound<PyString>`) against `&str` and `str`.
assert_eq!(py_string, "hello, world");
assert_eq!(py_string, s);
assert_eq!(&py_string, s);
assert_eq!(s, py_string);
assert_eq!(s, &py_string);
assert_eq!(py_string, *s);
assert_eq!(&py_string, *s);
assert_eq!(*s, py_string);
assert_eq!(*s, &py_string);
// The same matrix for `Borrowed<PyString>`; this shadows the `Bound` binding above.
let py_string = py_string.as_borrowed();
assert_eq!(py_string, s);
assert_eq!(&py_string, s);
assert_eq!(s, py_string);
assert_eq!(s, &py_string);
assert_eq!(py_string, *s);
assert_eq!(*s, py_string);
})
}
}