cpython/objects/
string.rs

1// Copyright (c) 2015 Daniel Grunwald
2//
3// Permission is hereby granted, free of charge, to any person obtaining a copy of this
4// software and associated documentation files (the "Software"), to deal in the Software
5// without restriction, including without limitation the rights to use, copy, modify, merge,
6// publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons
7// to whom the Software is furnished to do so, subject to the following conditions:
8//
9// The above copyright notice and this permission notice shall be included in all copies or
10// substantial portions of the Software.
11//
12// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
13// INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR
14// PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE
15// FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
16// OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
17// DEALINGS IN THE SOFTWARE.
18
19use libc::c_char;
20use std::borrow::Cow;
21use std::{char, mem, str};
22
23use super::{exc, PyObject};
24use crate::conversion::{FromPyObject, RefFromPyObject, ToPyObject};
25use crate::err::{self, PyErr, PyResult};
26use crate::ffi;
27use crate::python::{PyClone, Python, PythonObject, PythonObjectDowncastError, ToPythonPointer};
28
/// Represents a Python string.
/// Corresponds to `basestring` in Python 2, and `str` in Python 3.
pub struct PyString(PyObject);

// Python 2: the macro is invoked without a check function or type object;
// this file supplies `PythonObjectWithCheckedDowncast` and
// `PythonObjectWithTypeObject` for `PyString` by hand instead.
#[cfg(feature = "python27-sys")]
pyobject_newtype!(PyString);
// Python 3: `PyString` is tied to `PyUnicode_Check` / `PyUnicode_Type`.
#[cfg(feature = "python3-sys")]
pyobject_newtype!(PyString, PyUnicode_Check, PyUnicode_Type);
37
/// Represents a Python byte string.
/// Corresponds to `str` in Python 2, and `bytes` in Python 3.
pub struct PyBytes(PyObject);

// Same invocation for both Python versions: tied to `PyBytes_Check` / `PyBytes_Type`.
pyobject_newtype!(PyBytes, PyBytes_Check, PyBytes_Type);
43
/// Represents a Python unicode string.
/// Corresponds to `unicode` in Python 2, and `str` in Python 3.
///
/// This dedicated struct only exists when building for Python 2; with the
/// `python3-sys` feature, `PyUnicode` is a re-export of `PyString`.
#[cfg(feature = "python27-sys")]
pub struct PyUnicode(PyObject);

// Tied to `PyUnicode_Check` / `PyUnicode_Type`.
#[cfg(feature = "python27-sys")]
pyobject_newtype!(PyUnicode, PyUnicode_Check, PyUnicode_Type);
51
/// Represents a Python unicode string.
/// Corresponds to `unicode` in Python 2, and `str` in Python 3.
///
/// When building for Python 3 this is only another name for `PyString`.
#[cfg(feature = "python3-sys")]
pub use PyString as PyUnicode;
56
/// Checked downcasts to `PyString` on Python 2.
///
/// An object is accepted when `is_base_string` reports that its type is a
/// `str` or `unicode` subclass; otherwise a `PythonObjectDowncastError`
/// naming `"PyString"` and the object's actual type is returned.
#[cfg(feature = "python27-sys")]
impl crate::python::PythonObjectWithCheckedDowncast for PyString {
    /// Consumes `obj` and wraps it as a `PyString` if it is a string object.
    #[inline]
    fn downcast_from(
        py: Python<'_>,
        obj: PyObject,
    ) -> Result<PyString, PythonObjectDowncastError<'_>> {
        if !is_base_string(&obj) {
            let actual_type = obj.get_type(py);
            return Err(PythonObjectDowncastError::new(py, "PyString", actual_type));
        }
        Ok(PyString(obj))
    }

    /// Reinterprets a borrowed `PyObject` as a borrowed `PyString` if it is a
    /// string object.
    #[inline]
    fn downcast_borrow_from<'a, 'p>(
        py: Python<'p>,
        obj: &'a PyObject,
    ) -> Result<&'a PyString, PythonObjectDowncastError<'p>> {
        if !is_base_string(obj) {
            let actual_type = obj.get_type(py);
            return Err(PythonObjectDowncastError::new(py, "PyString", actual_type));
        }
        // SAFETY: the type check above succeeded, and `PyString` is a
        // single-field wrapper around `PyObject`, so the reference is
        // reinterpreted in place (this relies on both having the same layout).
        Ok(unsafe { &*(obj as *const PyObject as *const PyString) })
    }
}
93
/// Returns `true` if the type of `obj` has the `str`-subclass or the
/// `unicode`-subclass type flag set (Python 2 only).
#[cfg(feature = "python27-sys")]
#[inline]
fn is_base_string(obj: &PyObject) -> bool {
    unsafe {
        let type_ptr = ffi::Py_TYPE(obj.as_ptr());
        // Either flag is enough: byte strings and unicode strings both count.
        let string_flags = ffi::Py_TPFLAGS_STRING_SUBCLASS | ffi::Py_TPFLAGS_UNICODE_SUBCLASS;
        ffi::PyType_FastSubclass(type_ptr, string_flags) != 0
    }
}
104
/// On Python 2, the type object associated with `PyString` is `basestring`.
#[cfg(feature = "python27-sys")]
impl crate::python::PythonObjectWithTypeObject for PyString {
    /// Returns the type object built from `PyBaseString_Type`.
    #[inline]
    fn type_object(py: Python) -> super::PyType {
        unsafe {
            let base_string_type = &mut ffi::PyBaseString_Type;
            super::PyType::from_type_ptr(py, base_string_type)
        }
    }
}
114
/// Enum of possible Python string representations.
///
/// Each variant borrows the raw code units; none of them guarantees that the
/// data is valid in its encoding (see `to_string` / `to_string_lossy`).
#[derive(Clone, Copy, Debug)]
pub enum PyStringData<'a> {
    /// One byte per code point (values 0-255).
    /// Produced for Python 3 strings of kind `PyUnicode_1BYTE_KIND`.
    Latin1(&'a [u8]),
    /// UTF-8 encoded bytes.
    /// Produced from Rust `&str` and from Python 2 byte strings; in the
    /// latter case the bytes are not checked for validity.
    Utf8(&'a [u8]),
    /// 16-bit code units.
    /// Produced for Python 3 strings of kind `PyUnicode_2BYTE_KIND`.
    Utf16(&'a [u16]),
    /// 32-bit code units.
    /// Produced for Python 3 strings of kind `PyUnicode_4BYTE_KIND`.
    Utf32(&'a [u32]),
}
123
124impl<'a> From<&'a str> for PyStringData<'a> {
125    #[inline]
126    fn from(val: &'a str) -> PyStringData<'a> {
127        PyStringData::Utf8(val.as_bytes())
128    }
129}
130
131impl<'a> From<&'a [u16]> for PyStringData<'a> {
132    #[inline]
133    fn from(val: &'a [u16]) -> PyStringData<'a> {
134        PyStringData::Utf16(val)
135    }
136}
137
138impl<'a> From<&'a [u32]> for PyStringData<'a> {
139    #[inline]
140    fn from(val: &'a [u32]) -> PyStringData<'a> {
141        PyStringData::Utf32(val)
142    }
143}
144
145impl<'a> PyStringData<'a> {
146    /// Convert the Python string data to a Rust string.
147    ///
148    /// For UTF-8 and ASCII-only latin-1, returns a borrow into the original string data.
149    /// For Latin-1, UTF-16 and UTF-32, returns an owned string.
150    ///
151    /// Fails with UnicodeDecodeError if the string data isn't valid in its encoding.
152    pub fn to_string(self, py: Python) -> PyResult<Cow<'a, str>> {
153        match self {
154            PyStringData::Utf8(data) => match str::from_utf8(data) {
155                Ok(s) => Ok(Cow::Borrowed(s)),
156                Err(e) => Err(PyErr::from_instance(
157                    py,
158                    exc::UnicodeDecodeError::new_utf8(py, data, e)?,
159                )),
160            },
161            PyStringData::Latin1(data) => {
162                if data.is_ascii() {
163                    Ok(Cow::Borrowed(unsafe { str::from_utf8_unchecked(data) }))
164                } else {
165                    Ok(Cow::Owned(data.iter().map(|&b| b as char).collect()))
166                }
167            }
168            PyStringData::Utf16(data) => {
169                fn utf16_bytes(input: &[u16]) -> &[u8] {
170                    unsafe { mem::transmute(input) }
171                }
172                match String::from_utf16(data) {
173                    Ok(s) => Ok(Cow::Owned(s)),
174                    Err(_) => Err(PyErr::from_instance(
175                        py,
176                        exc::UnicodeDecodeError::new(
177                            py,
178                            cstr!("utf-16"),
179                            utf16_bytes(data),
180                            0..2 * data.len(),
181                            cstr!("invalid utf-16"),
182                        )?,
183                    )),
184                }
185            }
186            PyStringData::Utf32(data) => {
187                fn utf32_bytes(input: &[u32]) -> &[u8] {
188                    unsafe { mem::transmute(input) }
189                }
190                match data.iter().map(|&u| char::from_u32(u)).collect() {
191                    Some(s) => Ok(Cow::Owned(s)),
192                    None => Err(PyErr::from_instance(
193                        py,
194                        exc::UnicodeDecodeError::new(
195                            py,
196                            cstr!("utf-32"),
197                            utf32_bytes(data),
198                            0..4 * data.len(),
199                            cstr!("invalid utf-32"),
200                        )?,
201                    )),
202                }
203            }
204        }
205    }
206
207    /// Convert the Python string data to a Rust string.
208    ///
209    /// Returns a borrow into the original string data if possible.
210    ///
211    /// Data that isn't valid in its encoding will be replaced
212    /// with U+FFFD REPLACEMENT CHARACTER.
213    pub fn to_string_lossy(self) -> Cow<'a, str> {
214        match self {
215            PyStringData::Utf8(data) => String::from_utf8_lossy(data),
216            PyStringData::Latin1(data) => {
217                if data.is_ascii() {
218                    Cow::Borrowed(unsafe { str::from_utf8_unchecked(data) })
219                } else {
220                    Cow::Owned(data.iter().map(|&b| b as char).collect())
221                }
222            }
223            PyStringData::Utf16(data) => Cow::Owned(String::from_utf16_lossy(data)),
224            PyStringData::Utf32(data) => Cow::Owned(
225                data.iter()
226                    .map(|&u| char::from_u32(u).unwrap_or('\u{FFFD}'))
227                    .collect(),
228            ),
229        }
230    }
231}
232
233impl PyString {
234    /// Creates a new Python string object.
235    ///
236    /// On Python 2.7, this function will create a byte string if the
237    /// feature `py2-no-auto-unicode-promotion` is set, or the input
238    /// input string is ASCII-only; otherwise, the input string will be
239    /// converted to a unicode string.
240    /// Use `PyUnicode::new()` to always create a unicode string.
241    ///
242    /// On Python 3.x, this function always creates unicode `str` objects.
243    ///
244    /// Panics if out of memory.
245    pub fn new(py: Python, s: &str) -> PyString {
246        #[cfg(feature = "python27-sys")]
247        fn new_impl(py: Python, s: &str) -> PyString {
248            if cfg!(feature = "py2-no-auto-unicode-promotion") || s.is_ascii() {
249                PyBytes::new(py, s.as_bytes()).into_basestring()
250            } else {
251                PyUnicode::new(py, s).into_basestring()
252            }
253        }
254        #[cfg(feature = "python3-sys")]
255        fn new_impl(py: Python, s: &str) -> PyString {
256            let ptr = s.as_ptr() as *const c_char;
257            let len = s.len() as ffi::Py_ssize_t;
258            unsafe {
259                err::cast_from_owned_ptr_or_panic(py, ffi::PyUnicode_FromStringAndSize(ptr, len))
260            }
261        }
262        new_impl(py, s)
263    }
264
265    /// Gets the python string data in its underlying representation.
266    ///
267    /// For Python 2 byte strings, this function always returns `PyStringData::Utf8`,
268    /// even if the bytes are not valid UTF-8.
269    /// For unicode strings, returns the underlying representation used by Python.
270    pub fn data(&self, py: Python) -> PyStringData {
271        self.data_impl(py)
272    }
273
274    #[cfg(feature = "python27-sys")]
275    fn data_impl(&self, py: Python) -> PyStringData {
276        if let Ok(bytes) = self.0.cast_as::<PyBytes>(py) {
277            PyStringData::Utf8(bytes.data(py))
278        } else if let Ok(unicode) = self.0.cast_as::<PyUnicode>(py) {
279            unicode.data(py)
280        } else {
281            panic!("PyString is neither `str` nor `unicode`")
282        }
283    }
284
285    #[cfg(feature = "python3-sys")]
286    fn data_impl(&self, _py: Python) -> PyStringData {
287        let ptr = self.as_ptr();
288        unsafe {
289            let ready = ffi::PyUnicode_READY(ptr);
290            if ready < 0 {
291                // should fail only on OOM
292                ffi::PyErr_Print();
293                panic!("PyUnicode_READY failed");
294            }
295            let size = ffi::PyUnicode_GET_LENGTH(ptr) as usize;
296            let data = ffi::PyUnicode_DATA(ptr);
297            let kind = ffi::PyUnicode_KIND(ptr);
298            match kind {
299                ffi::PyUnicode_1BYTE_KIND => {
300                    PyStringData::Latin1(std::slice::from_raw_parts(data as *const u8, size))
301                }
302                ffi::PyUnicode_2BYTE_KIND => {
303                    PyStringData::Utf16(std::slice::from_raw_parts(data as *const u16, size))
304                }
305                ffi::PyUnicode_4BYTE_KIND => {
306                    PyStringData::Utf32(std::slice::from_raw_parts(data as *const u32, size))
307                }
308                _ => panic!("Unknown PyUnicode_KIND"),
309            }
310        }
311    }
312
313    /// Convert the `PyString` into a Rust string.
314    ///
315    /// On Python 2.7, if the `PyString` refers to a byte string,
316    /// it will be decoded using UTF-8.
317    ///
318    /// Returns a `UnicodeDecodeError` if the input is not valid unicode
319    /// (containing unpaired surrogates, or a Python 2.7 byte string that is
320    /// not valid UTF-8).
321    pub fn to_string(&self, py: Python) -> PyResult<Cow<str>> {
322        #[cfg(feature = "python3-sys")]
323        unsafe {
324            // On Python 3, we can use the UTF-8 representation stored
325            // inside the Python string.
326            // This should produce identical results to
327            // `self.data(py).to_string(py)` but avoids
328            // re-encoding the string on every to_string call.
329            let mut size: ffi::Py_ssize_t = 0;
330            let data = ffi::PyUnicode_AsUTF8AndSize(self.as_ptr(), &mut size);
331            if data.is_null() {
332                Err(PyErr::fetch(py))
333            } else {
334                let slice = std::slice::from_raw_parts(data as *const u8, size as usize);
335                Ok(Cow::Borrowed(std::str::from_utf8_unchecked(slice)))
336            }
337        }
338        #[cfg(feature = "python27-sys")]
339        {
340            self.data(py).to_string(py)
341        }
342    }
343
344    /// Convert the `PyString` into a Rust string.
345    ///
346    /// On Python 2.7, if the `PyString` refers to a byte string,
347    /// it will be decoded using UTF-8.
348    ///
349    /// Unpaired surrogates and (on Python 2.7) invalid UTF-8 sequences are
350    /// replaced with U+FFFD REPLACEMENT CHARACTER.
351    pub fn to_string_lossy(&self, py: Python) -> Cow<str> {
352        self.data(py).to_string_lossy()
353    }
354}
355
356impl PyBytes {
357    /// Creates a new Python byte string object.
358    /// The byte string is initialized by copying the data from the `&[u8]`.
359    ///
360    /// Panics if out of memory.
361    pub fn new(py: Python, s: &[u8]) -> PyBytes {
362        let ptr = s.as_ptr() as *const c_char;
363        let len = s.len() as ffi::Py_ssize_t;
364        unsafe { err::cast_from_owned_ptr_or_panic(py, ffi::PyBytes_FromStringAndSize(ptr, len)) }
365    }
366
367    /// Gets the Python string data as byte slice.
368    pub fn data(&self, _py: Python) -> &[u8] {
369        unsafe {
370            let buffer = ffi::PyBytes_AsString(self.as_ptr()) as *const u8;
371            let length = ffi::PyBytes_Size(self.as_ptr()) as usize;
372            std::slice::from_raw_parts(buffer, length)
373        }
374    }
375
376    /// Converts from `PyBytes` to `PyString`.
377    /// This method is only available on Python 2.
378    #[cfg(feature = "python27-sys")]
379    #[inline]
380    pub fn as_basestring(&self) -> &PyString {
381        unsafe { self.0.unchecked_cast_as() }
382    }
383
384    /// Converts from `PyBytes` to `PyString`.
385    /// This method is only available on Python 2.
386    #[cfg(feature = "python27-sys")]
387    #[inline]
388    pub fn into_basestring(self) -> PyString {
389        unsafe { self.0.unchecked_cast_into() }
390    }
391}
392
#[cfg(feature = "python27-sys")]
impl PyUnicode {
    /// Creates a new Python unicode string object from the UTF-8 contents of `s`.
    ///
    /// Panics if out of memory.
    pub fn new(py: Python, s: &str) -> PyUnicode {
        let utf8 = s.as_ptr() as *const c_char;
        let byte_len = s.len() as ffi::Py_ssize_t;
        unsafe {
            let raw = ffi::PyUnicode_FromStringAndSize(utf8, byte_len);
            err::cast_from_owned_ptr_or_panic(py, raw)
        }
    }

    /// Borrows this `PyUnicode` as a `PyString`.
    /// This method is only available on Python 2.
    /// (note: on Python 3, `PyUnicode` is a type alias for `PyString`)
    #[inline]
    pub fn as_basestring(&self) -> &PyString {
        let base: &PyString = unsafe { self.0.unchecked_cast_as() };
        base
    }

    /// Converts this `PyUnicode` into a `PyString`, consuming it.
    /// This method is only available on Python 2.
    /// (note: on Python 3, `PyUnicode` is a type alias for `PyString`)
    #[inline]
    pub fn into_basestring(self) -> PyString {
        let base: PyString = unsafe { self.0.unchecked_cast_into() };
        base
    }

    /// Gets the python string data in its underlying representation.
    pub fn data(&self, _py: Python) -> PyStringData {
        let obj = self.as_ptr();
        unsafe {
            let units = ffi::PyUnicode_AS_UNICODE(obj);
            let count = ffi::PyUnicode_GET_SIZE(obj) as usize;
            // The variant is picked by whichever `From` impl matches the
            // element type of the buffer.
            PyStringData::from(std::slice::from_raw_parts(units, count))
        }
    }

    /// Convert the `PyUnicode` into a Rust string.
    ///
    /// Returns a `UnicodeDecodeError` if the input is not valid unicode
    /// (containing unpaired surrogates).
    pub fn to_string(&self, py: Python) -> PyResult<Cow<str>> {
        let raw = self.data(py);
        raw.to_string(py)
    }

    /// Convert the `PyUnicode` into a Rust string.
    ///
    /// Unpaired surrogates are replaced with U+FFFD REPLACEMENT CHARACTER.
    pub fn to_string_lossy(&self, py: Python) -> Cow<str> {
        let raw = self.data(py);
        raw.to_string_lossy()
    }
}
444
445/// Converts Rust `str` to Python object.
446///
447/// On Python 2.7, this impl will create a byte string if the
448/// input string is ASCII-only; and a unicode string otherwise.
449/// Use `PyUnicode::new()` to always create a unicode string.
450///
451/// On Python 3.x, this function always creates unicode `str` objects.
452impl ToPyObject for str {
453    type ObjectType = PyString;
454
455    #[inline]
456    fn to_py_object(&self, py: Python) -> PyString {
457        PyString::new(py, self)
458    }
459}
460
461/// Converts Rust `str` to Python object.
462///
463/// On Python 2.7, this impl will create a byte string if the
464/// input string is ASCII-only; and a unicode string otherwise.
465/// Use `PyUnicode::new()` to always create a unicode string.
466///
467/// On Python 3.x, this function always creates unicode `str` objects.
468impl<'a> ToPyObject for Cow<'a, str> {
469    type ObjectType = PyString;
470
471    #[inline]
472    fn to_py_object(&self, py: Python) -> PyString {
473        PyString::new(py, self)
474    }
475}
476
477/// Converts Rust `str` to Python object.
478///
479/// On Python 2.7, this impl will create a byte string if the
480/// input string is ASCII-only; and a unicode string otherwise.
481/// Use `PyUnicode::new()` to always create a unicode string.
482///
483/// On Python 3.x, this function always creates unicode `str` objects.
484impl ToPyObject for String {
485    type ObjectType = PyString;
486
487    #[inline]
488    fn to_py_object(&self, py: Python) -> PyString {
489        PyString::new(py, self)
490    }
491}
492
493/// Allows extracting strings from Python objects.
494/// Accepts Python `str` and `unicode` objects.
495/// In Python 2.7, `str` is expected to be UTF-8 encoded.
496///
497/// Returns a `UnicodeDecodeError` if the input is not valid unicode
498/// (containing unpaired surrogates, or a Python 2.7 byte string that is
499/// not valid UTF-8).
500impl<'s> FromPyObject<'s> for Cow<'s, str> {
501    fn extract(py: Python, obj: &'s PyObject) -> PyResult<Self> {
502        obj.cast_as::<PyString>(py)?.to_string(py)
503    }
504}
505
506/// Allows extracting strings from Python objects.
507/// Accepts Python `str` and `unicode` objects.
508/// In Python 2.7, `str` is expected to be UTF-8 encoded.
509///
510/// Returns a `UnicodeDecodeError` if the input is not valid unicode
511/// (containing unpaired surrogates, or a Python 2.7 byte string that is
512/// not valid UTF-8).
513impl<'s> FromPyObject<'s> for String {
514    fn extract(py: Python, obj: &'s PyObject) -> PyResult<Self> {
515        obj.extract::<Cow<str>>(py).map(Cow::into_owned)
516    }
517}
518
519/// For Python `bytes`, returns a reference to the existing immutable string data.
520/// If the Python object is a single-dimensional [buffer] of format `c` or `B` (C: `char` or `unsigned char`),
521/// returns an owned copy of the data in the buffer.
522/// Otherwise, uses the sequence protocol and converts each individual element
523/// via `impl FromPyObject for u8`.
524///
525/// [buffer]: https://docs.python.org/3/c-api/buffer.html
526impl<'s> FromPyObject<'s> for Cow<'s, [u8]> {
527    fn extract(py: Python, obj: &'s PyObject) -> PyResult<Self> {
528        if let Ok(bytes) = obj.cast_as::<PyBytes>(py) {
529            Ok(Cow::Borrowed(bytes.data(py)))
530        } else {
531            super::sequence::extract_buffer_or_sequence(py, obj).map(Cow::Owned)
532        }
533    }
534}
535
536/// Allows extracting strings from Python objects.
537/// Accepts Python `str` and `unicode` objects.
538/// In Python 2.7, `str` is expected to be UTF-8 encoded.
539impl RefFromPyObject for str {
540    fn with_extracted<F, R>(py: Python, obj: &PyObject, f: F) -> PyResult<R>
541    where
542        F: FnOnce(&str) -> R,
543    {
544        let s = obj.extract::<Cow<str>>(py)?;
545        Ok(f(&s))
546    }
547}
548
549/// For Python `bytes`, returns a reference to the existing immutable string data.
550/// If the Python object is a single-dimensional [buffer] of format `c` or `B` (C: `char` or `unsigned char`),
551/// returns an owned copy of the data in the buffer.
552/// Otherwise, uses the sequence protocol and converts each individual element
553/// via `impl FromPyObject for u8`.
554///
555/// [buffer]: https://docs.python.org/3/c-api/buffer.html
556impl RefFromPyObject for [u8] {
557    fn with_extracted<F, R>(py: Python, obj: &PyObject, f: F) -> PyResult<R>
558    where
559        F: FnOnce(&[u8]) -> R,
560    {
561        let s = obj.extract::<Cow<[u8]>>(py)?;
562        Ok(f(&s))
563    }
564}
565
#[cfg(test)]
mod test {
    use super::{PyString, PyStringData};
    use crate::conversion::{RefFromPyObject, ToPyObject};
    use crate::python::{Python, PythonObject};

    /// A code point outside the Basic Multilingual Plane survives a round trip.
    #[test]
    fn test_non_bmp() {
        let gil = Python::acquire_gil();
        let py = gil.python();
        let s = "\u{1F30F}";
        let py_string = s.to_py_object(py).into_object();
        assert_eq!(s, py_string.extract::<String>(py).unwrap());
    }

    /// `RefFromPyObject for str` passes the extracted text to the callback.
    #[test]
    fn test_extract_str() {
        let gil = Python::acquire_gil();
        let py = gil.python();
        let s = "Hello Python";
        let py_string = s.to_py_object(py).into_object();
        let mut called = false;
        RefFromPyObject::with_extracted(py, &py_string, |s2: &str| {
            assert_eq!(s, s2);
            called = true;
        })
        .unwrap();
        assert!(called);
    }

    /// `RefFromPyObject for [u8]` passes the bytes of a Python byte string.
    #[test]
    fn test_extract_byte_str() {
        let gil = Python::acquire_gil();
        let py = gil.python();
        let py_bytes = py.eval("b'Hello'", None, None).unwrap();
        let mut called = false;
        RefFromPyObject::with_extracted(py, &py_bytes, |s2: &[u8]| {
            assert_eq!(b"Hello", s2);
            called = true;
        })
        .unwrap();
        assert!(called);
    }

    #[test]
    #[cfg(feature = "nightly")] // only works with specialization
    fn test_extract_byte_str_to_vec() {
        let gil = Python::acquire_gil();
        let py = gil.python();
        let py_bytes = py.eval("b'Hello'", None, None).unwrap();
        let v = py_bytes.extract::<Vec<u8>>(py).unwrap();
        assert_eq!(b"Hello", &v[..]);
    }

    /// U+00E4 is exposed as Latin-1 data on Python 3 and extracts to the same
    /// character in Rust.
    #[allow(unused_variables)] // when compiling for py2.7
    #[test]
    fn test_extract_umlaut() {
        let gil = Python::acquire_gil();
        let py = gil.python();
        let py_string = py.eval("u'x=\\u00e4'", None, None).unwrap();
        let data = py_string.cast_as::<PyString>(py).unwrap().data(py);
        #[cfg(feature = "python3-sys")]
        {
            if let PyStringData::Latin1(s) = data {
                assert_eq!([b'x', b'=', 0xe4], *s);
            } else {
                panic!("Expected PyStringData::Latin1");
            }
        }
        // Written as an escape so the expectation cannot be damaged by a
        // source-file encoding mix-up.
        assert_eq!("x=\u{e4}", py_string.extract::<String>(py).unwrap());
    }

    /// A lone surrogate is exposed as UTF-16 data on Python 3 and cannot be
    /// extracted as a Rust `String`.
    #[allow(unused_variables)] // when compiling for py2.7
    #[test]
    fn test_extract_lone_surrogate() {
        let gil = Python::acquire_gil();
        let py = gil.python();
        let py_string = py.eval("u'x=\\ud800'", None, None).unwrap();
        let data = py_string.cast_as::<PyString>(py).unwrap().data(py);
        #[cfg(feature = "python3-sys")]
        {
            if let PyStringData::Utf16(s) = data {
                assert_eq!(['x' as u16, '=' as u16, 0xd800], *s);
            } else {
                panic!("Expected PyStringData::Utf16");
            }
        }
        assert!(py_string.extract::<String>(py).is_err());
    }

    /// Lossy conversion replaces a lone surrogate with U+FFFD.
    #[test]
    fn test_extract_lone_surrogate_lossy() {
        let gil = Python::acquire_gil();
        let py = gil.python();
        let py_string = py.eval("u'x=\\ud800'", None, None).unwrap();
        let result = py_string
            .cast_as::<PyString>(py)
            .unwrap()
            .to_string_lossy(py);
        assert_eq!("x=\u{fffd}", result);
    }
}