rustpython_vm/builtins/
str.rs

1use super::{
2    PositionIterInternal, PyBytesRef, PyDict, PyTupleRef, PyType, PyTypeRef,
3    int::{PyInt, PyIntRef},
4    iter::{
5        IterStatus::{self, Exhausted},
6        builtins_iter,
7    },
8};
9use crate::common::lock::LazyLock;
10use crate::{
11    AsObject, Context, Py, PyExact, PyObject, PyObjectRef, PyPayload, PyRef, PyRefExact, PyResult,
12    TryFromBorrowedObject, VirtualMachine,
13    anystr::{self, AnyStr, AnyStrContainer, AnyStrWrapper, adjust_indices},
14    atomic_func,
15    cformat::cformat_string,
16    class::PyClassImpl,
17    common::str::{PyKindStr, StrData, StrKind},
18    convert::{IntoPyException, ToPyException, ToPyObject, ToPyResult},
19    format::{format, format_map},
20    function::{ArgIterable, ArgSize, FuncArgs, OptionalArg, OptionalOption, PyComparisonValue},
21    intern::PyInterned,
22    object::{MaybeTraverse, Traverse, TraverseFn},
23    protocol::{PyIterReturn, PyMappingMethods, PyNumberMethods, PySequenceMethods},
24    sequence::SequenceExt,
25    sliceable::{SequenceIndex, SliceableSequenceOp},
26    types::{
27        AsMapping, AsNumber, AsSequence, Comparable, Constructor, Hashable, IterNext, Iterable,
28        PyComparisonOp, Representable, SelfIter,
29    },
30};
31use alloc::{borrow::Cow, fmt};
32use ascii::{AsciiChar, AsciiStr, AsciiString};
33use bstr::ByteSlice;
34use core::{char, mem, ops::Range};
35use itertools::Itertools;
36use num_traits::ToPrimitive;
37use rustpython_common::{
38    ascii,
39    atomic::{self, PyAtomic, Radium},
40    format::{FormatSpec, FormatString, FromTemplate},
41    hash,
42    lock::PyMutex,
43    str::DeduceStrKind,
44    wtf8::{CodePoint, Wtf8, Wtf8Buf, Wtf8Chunk, Wtf8Concat},
45};
46use unic_ucd_bidi::BidiClass;
47use unic_ucd_category::GeneralCategory;
48use unic_ucd_ident::{is_xid_continue, is_xid_start};
49use unicode_casing::CharExt;
50
51impl<'a> TryFromBorrowedObject<'a> for String {
52    fn try_from_borrowed_object(vm: &VirtualMachine, obj: &'a PyObject) -> PyResult<Self> {
53        obj.try_value_with(|pystr: &PyUtf8Str| Ok(pystr.as_str().to_owned()), vm)
54    }
55}
56
57impl<'a> TryFromBorrowedObject<'a> for &'a str {
58    fn try_from_borrowed_object(vm: &VirtualMachine, obj: &'a PyObject) -> PyResult<Self> {
59        let pystr: &Py<PyUtf8Str> = TryFromBorrowedObject::try_from_borrowed_object(vm, obj)?;
60        Ok(pystr.as_str())
61    }
62}
63
64impl<'a> TryFromBorrowedObject<'a> for &'a Wtf8 {
65    fn try_from_borrowed_object(vm: &VirtualMachine, obj: &'a PyObject) -> PyResult<Self> {
66        let pystr: &Py<PyStr> = TryFromBorrowedObject::try_from_borrowed_object(vm, obj)?;
67        Ok(pystr.as_wtf8())
68    }
69}
70
/// Reference-counted handle to a [`PyStr`].
pub type PyStrRef = PyRef<PyStr>;
/// Reference-counted handle to a `PyUtf8Str`.
pub type PyUtf8StrRef = PyRef<PyUtf8Str>;
73
// Payload registered under the Python type name "str".
#[pyclass(module = false, name = "str")]
pub struct PyStr {
    // String contents; exposed in this file as WTF-8 together with a `StrKind`.
    data: StrData,
    // Cached hash value; holds `hash::SENTINEL` until the hash is first computed.
    hash: PyAtomic<hash::PyHash>,
}
79
80impl fmt::Debug for PyStr {
81    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
82        f.debug_struct("PyStr")
83            .field("value", &self.as_wtf8())
84            .field("kind", &self.data.kind())
85            .field("hash", &self.hash)
86            .finish()
87    }
88}
89
90impl AsRef<str> for PyStr {
91    #[track_caller] // <- can remove this once it doesn't panic
92    fn as_ref(&self) -> &str {
93        self.to_str().expect("str has surrogates")
94    }
95}
96
97impl AsRef<str> for Py<PyStr> {
98    #[track_caller] // <- can remove this once it doesn't panic
99    fn as_ref(&self) -> &str {
100        self.to_str().expect("str has surrogates")
101    }
102}
103
104impl AsRef<str> for PyStrRef {
105    #[track_caller] // <- can remove this once it doesn't panic
106    fn as_ref(&self) -> &str {
107        self.to_str().expect("str has surrogates")
108    }
109}
110
111impl AsRef<Wtf8> for PyStr {
112    fn as_ref(&self) -> &Wtf8 {
113        self.as_wtf8()
114    }
115}
116
117impl AsRef<Wtf8> for Py<PyStr> {
118    fn as_ref(&self) -> &Wtf8 {
119        self.as_wtf8()
120    }
121}
122
123impl AsRef<Wtf8> for PyStrRef {
124    fn as_ref(&self) -> &Wtf8 {
125        self.as_wtf8()
126    }
127}
128
129impl Wtf8Concat for PyStr {
130    #[inline]
131    fn fmt_wtf8(&self, buf: &mut Wtf8Buf) {
132        buf.push_wtf8(self.as_wtf8());
133    }
134}
135
136impl Wtf8Concat for Py<PyStr> {
137    #[inline]
138    fn fmt_wtf8(&self, buf: &mut Wtf8Buf) {
139        buf.push_wtf8(self.as_wtf8());
140    }
141}
142
143impl<'a> From<&'a AsciiStr> for PyStr {
144    fn from(s: &'a AsciiStr) -> Self {
145        s.to_owned().into()
146    }
147}
148
149impl From<AsciiString> for PyStr {
150    fn from(s: AsciiString) -> Self {
151        s.into_boxed_ascii_str().into()
152    }
153}
154
155impl From<Box<AsciiStr>> for PyStr {
156    fn from(s: Box<AsciiStr>) -> Self {
157        StrData::from(s).into()
158    }
159}
160
161impl From<AsciiChar> for PyStr {
162    fn from(ch: AsciiChar) -> Self {
163        AsciiString::from(ch).into()
164    }
165}
166
167impl<'a> From<&'a str> for PyStr {
168    fn from(s: &'a str) -> Self {
169        s.to_owned().into()
170    }
171}
172
173impl<'a> From<&'a Wtf8> for PyStr {
174    fn from(s: &'a Wtf8) -> Self {
175        s.to_owned().into()
176    }
177}
178
179impl From<String> for PyStr {
180    fn from(s: String) -> Self {
181        s.into_boxed_str().into()
182    }
183}
184
185impl From<Wtf8Buf> for PyStr {
186    fn from(w: Wtf8Buf) -> Self {
187        w.into_box().into()
188    }
189}
190
191impl From<char> for PyStr {
192    fn from(ch: char) -> Self {
193        StrData::from(ch).into()
194    }
195}
196
197impl From<CodePoint> for PyStr {
198    fn from(ch: CodePoint) -> Self {
199        StrData::from(ch).into()
200    }
201}
202
203impl From<StrData> for PyStr {
204    fn from(data: StrData) -> Self {
205        Self {
206            data,
207            hash: Radium::new(hash::SENTINEL),
208        }
209    }
210}
211
212impl<'a> From<alloc::borrow::Cow<'a, str>> for PyStr {
213    fn from(s: alloc::borrow::Cow<'a, str>) -> Self {
214        s.into_owned().into()
215    }
216}
217
218impl From<Box<str>> for PyStr {
219    #[inline]
220    fn from(value: Box<str>) -> Self {
221        StrData::from(value).into()
222    }
223}
224
225impl From<Box<Wtf8>> for PyStr {
226    #[inline]
227    fn from(value: Box<Wtf8>) -> Self {
228        StrData::from(value).into()
229    }
230}
231
232impl Default for PyStr {
233    fn default() -> Self {
234        Self {
235            data: StrData::default(),
236            hash: Radium::new(hash::SENTINEL),
237        }
238    }
239}
240
/// Formats the string by delegating to the `fmt` method of its WTF-8 contents.
impl fmt::Display for PyStr {
    #[inline]
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        self.as_wtf8().fmt(f)
    }
}
247
248pub trait AsPyStr<'a>
249where
250    Self: 'a,
251{
252    #[allow(
253        clippy::wrong_self_convention,
254        reason = "this trait is intentionally implemented for references"
255    )]
256    fn as_pystr(self, ctx: &Context) -> &'a Py<PyStr>;
257}
258
/// Identity conversion: a `&Py<PyStr>` is returned unchanged and `_ctx` is ignored.
impl<'a> AsPyStr<'a> for &'a Py<PyStr> {
    #[inline]
    fn as_pystr(self, _ctx: &Context) -> &'a Py<PyStr> {
        self
    }
}
265
266impl<'a> AsPyStr<'a> for &'a Py<PyUtf8Str> {
267    #[inline]
268    fn as_pystr(self, _ctx: &Context) -> &'a Py<PyStr> {
269        Py::<PyUtf8Str>::as_pystr(self)
270    }
271}
272
273impl<'a> AsPyStr<'a> for &'a PyStrRef {
274    #[inline]
275    fn as_pystr(self, _ctx: &Context) -> &'a Py<PyStr> {
276        self
277    }
278}
279
280impl<'a> AsPyStr<'a> for &'a PyUtf8StrRef {
281    #[inline]
282    fn as_pystr(self, _ctx: &Context) -> &'a Py<PyStr> {
283        Py::<PyUtf8Str>::as_pystr(self)
284    }
285}
286
287impl AsPyStr<'static> for &'static str {
288    #[inline]
289    fn as_pystr(self, ctx: &Context) -> &'static Py<PyStr> {
290        ctx.intern_str(self)
291    }
292}
293
294impl<'a> AsPyStr<'a> for &'a PyStrInterned {
295    #[inline]
296    fn as_pystr(self, _ctx: &Context) -> &'a Py<PyStr> {
297        self
298    }
299}
300
301impl<'a> AsPyStr<'a> for &'a PyUtf8StrInterned {
302    #[inline]
303    fn as_pystr(self, _ctx: &Context) -> &'a Py<PyStr> {
304        Py::<PyUtf8Str>::as_pystr(self)
305    }
306}
307
// Payload registered under the Python type name "str_iterator".
#[pyclass(module = false, name = "str_iterator", traverse = "manual")]
#[derive(Debug)]
pub struct PyStrIterator {
    // `.0` is the shared position-iterator state (status + code point position).
    // `.1` caches the byte offset of the next code point; `usize::MAX` means the
    // offset is unknown and must be recomputed from the position.
    internal: PyMutex<(PositionIterInternal<PyStrRef>, usize)>,
}
313
314unsafe impl Traverse for PyStrIterator {
315    fn traverse(&self, tracer: &mut TraverseFn<'_>) {
316        // No need to worry about deadlock, for inner is a PyStr and can't make ref cycle
317        self.internal.lock().0.traverse(tracer);
318    }
319}
320
321impl PyPayload for PyStrIterator {
322    fn class(ctx: &Context) -> &'static Py<PyType> {
323        ctx.types.str_iterator_type
324    }
325}
326
327#[pyclass(flags(DISALLOW_INSTANTIATION), with(IterNext, Iterable))]
328impl PyStrIterator {
329    #[pymethod]
330    fn __length_hint__(&self) -> usize {
331        self.internal.lock().0.length_hint(|obj| obj.char_len())
332    }
333
334    #[pymethod]
335    fn __setstate__(&self, state: PyObjectRef, vm: &VirtualMachine) -> PyResult<()> {
336        let mut internal = self.internal.lock();
337        internal.1 = usize::MAX;
338        internal
339            .0
340            .set_state(state, |obj, pos| pos.min(obj.char_len()), vm)
341    }
342
343    #[pymethod]
344    fn __reduce__(&self, vm: &VirtualMachine) -> PyTupleRef {
345        let func = builtins_iter(vm);
346        self.internal.lock().0.reduce(
347            func,
348            |x| x.clone().into(),
349            |vm| vm.ctx.empty_str.to_owned().into(),
350            vm,
351        )
352    }
353}
354
// Marker impl with no overrides. NOTE(review): presumably this makes `iter()`
// on the iterator return the iterator itself — confirm against `SelfIter`.
impl SelfIter for PyStrIterator {}
356
357impl IterNext for PyStrIterator {
358    fn next(zelf: &Py<Self>, vm: &VirtualMachine) -> PyResult<PyIterReturn> {
359        let mut internal = zelf.internal.lock();
360
361        if let IterStatus::Active(s) = &internal.0.status {
362            let value = s.as_wtf8();
363
364            if internal.1 == usize::MAX {
365                if let Some((offset, ch)) = value.code_point_indices().nth(internal.0.position) {
366                    internal.0.position += 1;
367                    internal.1 = offset + ch.len_wtf8();
368                    return Ok(PyIterReturn::Return(ch.to_pyobject(vm)));
369                }
370            } else if let Some(value) = value.get(internal.1..)
371                && let Some(ch) = value.code_points().next()
372            {
373                internal.0.position += 1;
374                internal.1 += ch.len_wtf8();
375                return Ok(PyIterReturn::Return(ch.to_pyobject(vm)));
376            }
377            internal.0.status = Exhausted;
378        }
379        Ok(PyIterReturn::StopIteration(None))
380    }
381}
382
// Arguments accepted by the `str` constructor; every one is optional and may
// be passed positionally or by keyword.
#[derive(FromArgs)]
pub struct StrArgs {
    // Object to convert; when missing, the constructor produces an empty string.
    #[pyarg(any, optional)]
    object: OptionalArg<PyObjectRef>,
    // Codec name; when present, `object` is decoded through the codec registry.
    #[pyarg(any, optional)]
    encoding: OptionalArg<PyUtf8StrRef>,
    // Error-handler name forwarded to the decoder (only read when `encoding` is given).
    #[pyarg(any, optional)]
    errors: OptionalArg<PyUtf8StrRef>,
}
392
393impl Constructor for PyStr {
394    type Args = StrArgs;
395
396    fn slot_new(cls: PyTypeRef, func_args: FuncArgs, vm: &VirtualMachine) -> PyResult {
397        // Optimization: return exact str as-is (only when no encoding/errors provided)
398        if cls.is(vm.ctx.types.str_type)
399            && func_args.args.len() == 1
400            && func_args.kwargs.is_empty()
401            && func_args.args[0].class().is(vm.ctx.types.str_type)
402        {
403            return Ok(func_args.args[0].clone());
404        }
405
406        let args: Self::Args = func_args.bind(vm)?;
407        let payload = Self::py_new(&cls, args, vm)?;
408        payload.into_ref_with_type(vm, cls).map(Into::into)
409    }
410
411    fn py_new(_cls: &Py<PyType>, args: Self::Args, vm: &VirtualMachine) -> PyResult<Self> {
412        match args.object {
413            OptionalArg::Present(input) => {
414                if let OptionalArg::Present(enc) = args.encoding {
415                    let s = vm.state.codec_registry.decode_text(
416                        input,
417                        enc.as_str(),
418                        args.errors.into_option(),
419                        vm,
420                    )?;
421                    Ok(Self::from(s.as_wtf8().to_owned()))
422                } else {
423                    let s = input.str(vm)?;
424                    Ok(Self::from(s.as_wtf8().to_owned()))
425                }
426            }
427            OptionalArg::Missing => Ok(Self::from(String::new())),
428        }
429    }
430}
431
432impl PyStr {
433    /// # Safety: Given `bytes` must be valid data for given `kind`
434    unsafe fn new_str_unchecked(data: Box<Wtf8>, kind: StrKind) -> Self {
435        unsafe { StrData::new_str_unchecked(data, kind) }.into()
436    }
437
438    unsafe fn new_with_char_len<T: DeduceStrKind + Into<Box<Wtf8>>>(s: T, char_len: usize) -> Self {
439        let kind = s.str_kind();
440        unsafe { StrData::new_with_char_len(s.into(), kind, char_len) }.into()
441    }
442
443    /// # Safety
444    /// Given `bytes` must be ascii
445    pub unsafe fn new_ascii_unchecked(bytes: Vec<u8>) -> Self {
446        unsafe { AsciiString::from_ascii_unchecked(bytes) }.into()
447    }
448
449    #[deprecated(note = "use PyStr::from(...).into_ref() instead")]
450    pub fn new_ref(zelf: impl Into<Self>, ctx: &Context) -> PyRef<Self> {
451        let zelf = zelf.into();
452        zelf.into_ref(ctx)
453    }
454
455    fn new_substr(&self, s: Wtf8Buf) -> Self {
456        let kind = if self.kind().is_ascii() || s.is_ascii() {
457            StrKind::Ascii
458        } else if self.kind().is_utf8() || s.is_utf8() {
459            StrKind::Utf8
460        } else {
461            StrKind::Wtf8
462        };
463        unsafe {
464            // SAFETY: kind is properly decided for substring
465            Self::new_str_unchecked(s.into(), kind)
466        }
467    }
468
469    #[inline]
470    pub const fn as_wtf8(&self) -> &Wtf8 {
471        self.data.as_wtf8()
472    }
473
474    pub const fn as_bytes(&self) -> &[u8] {
475        self.data.as_wtf8().as_bytes()
476    }
477
478    pub fn to_str(&self) -> Option<&str> {
479        self.data.as_str()
480    }
481
482    /// Returns `&str`
483    ///
484    /// # Panic
485    /// If the string contains surrogates.
486    #[inline]
487    #[track_caller]
488    pub fn expect_str(&self) -> &str {
489        self.to_str().expect("PyStr contains surrogates")
490    }
491
492    pub(crate) fn ensure_valid_utf8(&self, vm: &VirtualMachine) -> PyResult<()> {
493        if self.is_utf8() {
494            Ok(())
495        } else {
496            let start = self
497                .as_wtf8()
498                .code_points()
499                .position(|c| c.to_char().is_none())
500                .unwrap();
501            Err(vm.new_unicode_encode_error_real(
502                identifier!(vm, utf_8).to_owned(),
503                vm.ctx.new_str(self.data.clone()),
504                start,
505                start + 1,
506                vm.ctx.new_str("surrogates not allowed"),
507            ))
508        }
509    }
510
511    pub fn to_string_lossy(&self) -> Cow<'_, str> {
512        self.to_str()
513            .map(Cow::Borrowed)
514            .unwrap_or_else(|| self.as_wtf8().to_string_lossy())
515    }
516
517    pub const fn kind(&self) -> StrKind {
518        self.data.kind()
519    }
520
521    #[inline]
522    pub fn as_str_kind(&self) -> PyKindStr<'_> {
523        self.data.as_str_kind()
524    }
525
526    pub const fn is_utf8(&self) -> bool {
527        self.kind().is_utf8()
528    }
529
530    fn char_all<F>(&self, test: F) -> bool
531    where
532        F: Fn(char) -> bool,
533    {
534        match self.as_str_kind() {
535            PyKindStr::Ascii(s) => s.chars().all(|ch| test(ch.into())),
536            PyKindStr::Utf8(s) => s.chars().all(test),
537            PyKindStr::Wtf8(w) => w.code_points().all(|ch| ch.is_char_and(&test)),
538        }
539    }
540
541    fn repeat(zelf: PyRef<Self>, value: isize, vm: &VirtualMachine) -> PyResult<PyRef<Self>> {
542        if value == 0 && zelf.class().is(vm.ctx.types.str_type) {
543            // Special case: when some `str` is multiplied by `0`,
544            // returns the empty `str`.
545            return Ok(vm.ctx.empty_str.to_owned());
546        }
547        if (value == 1 || zelf.is_empty()) && zelf.class().is(vm.ctx.types.str_type) {
548            // Special case: when some `str` is multiplied by `1` or is the empty `str`,
549            // nothing really happens, we need to return an object itself
550            // with the same `id()` to be compatible with CPython.
551            // This only works for `str` itself, not its subclasses.
552            return Ok(zelf);
553        }
554        zelf.as_wtf8()
555            .as_bytes()
556            .mul(vm, value)
557            .map(|x| Self::from(unsafe { Wtf8Buf::from_bytes_unchecked(x) }).into_ref(&vm.ctx))
558    }
559
560    pub fn try_as_utf8<'a>(&'a self, vm: &VirtualMachine) -> PyResult<&'a PyUtf8Str> {
561        // Check if the string contains surrogates
562        self.ensure_valid_utf8(vm)?;
563        // If no surrogates, we can safely cast to PyStr
564        Ok(unsafe { &*(self as *const _ as *const PyUtf8Str) })
565    }
566}
567
568impl Py<PyStr> {
569    pub fn try_as_utf8<'a>(&'a self, vm: &VirtualMachine) -> PyResult<&'a Py<PyUtf8Str>> {
570        // Check if the string contains surrogates
571        self.ensure_valid_utf8(vm)?;
572        // If no surrogates, we can safely cast to PyStr
573        Ok(unsafe { &*(self as *const _ as *const Py<PyUtf8Str>) })
574    }
575}
576
577#[pyclass(
578    flags(BASETYPE, _MATCH_SELF),
579    with(
580        AsMapping,
581        AsNumber,
582        AsSequence,
583        Representable,
584        Hashable,
585        Comparable,
586        Iterable,
587        Constructor
588    )
589)]
590impl PyStr {
591    fn __add__(zelf: PyRef<Self>, other: PyObjectRef, vm: &VirtualMachine) -> PyResult {
592        if let Some(other) = other.downcast_ref::<Self>() {
593            let bytes = zelf.as_wtf8().py_add(other.as_wtf8());
594            Ok(unsafe {
595                // SAFETY: `kind` is safely decided
596                let kind = zelf.kind() | other.kind();
597                Self::new_str_unchecked(bytes.into(), kind)
598            }
599            .to_pyobject(vm))
600        } else if let Some(radd) = vm.get_method(other.clone(), identifier!(vm, __radd__)) {
601            // hack to get around not distinguishing number add from seq concat
602            radd?.call((zelf,), vm)
603        } else {
604            Err(vm.new_type_error(format!(
605                r#"can only concatenate str (not "{}") to str"#,
606                other.class().name()
607            )))
608        }
609    }
610
611    fn _contains(&self, needle: &PyObject, vm: &VirtualMachine) -> PyResult<bool> {
612        if let Some(needle) = needle.downcast_ref::<Self>() {
613            Ok(memchr::memmem::find(self.as_bytes(), needle.as_bytes()).is_some())
614        } else {
615            Err(vm.new_type_error(format!(
616                "'in <string>' requires string as left operand, not {}",
617                needle.class().name()
618            )))
619        }
620    }
621
622    fn __contains__(&self, needle: PyObjectRef, vm: &VirtualMachine) -> PyResult<bool> {
623        self._contains(&needle, vm)
624    }
625
626    fn _getitem(&self, needle: &PyObject, vm: &VirtualMachine) -> PyResult {
627        let item = match SequenceIndex::try_from_borrowed_object(vm, needle, "str")? {
628            SequenceIndex::Int(i) => self.getitem_by_index(vm, i)?.to_pyobject(vm),
629            SequenceIndex::Slice(slice) => self.getitem_by_slice(vm, slice)?.to_pyobject(vm),
630        };
631        Ok(item)
632    }
633
634    fn __getitem__(&self, needle: PyObjectRef, vm: &VirtualMachine) -> PyResult {
635        self._getitem(&needle, vm)
636    }
637
638    #[inline]
639    pub(crate) fn hash(&self, vm: &VirtualMachine) -> hash::PyHash {
640        match self.hash.load(atomic::Ordering::Relaxed) {
641            hash::SENTINEL => self._compute_hash(vm),
642            hash => hash,
643        }
644    }
645
646    #[cold]
647    fn _compute_hash(&self, vm: &VirtualMachine) -> hash::PyHash {
648        let hash_val = vm.state.hash_secret.hash_bytes(self.as_bytes());
649        debug_assert_ne!(hash_val, hash::SENTINEL);
650        // spell-checker:ignore cmpxchg
651        // like with char_len, we don't need a cmpxchg loop, since it'll always be the same value
652        self.hash.store(hash_val, atomic::Ordering::Relaxed);
653        hash_val
654    }
655
    /// Returns the length reported by the underlying `StrData` (bytes, going by
    /// this method's name).
    #[inline]
    pub fn byte_len(&self) -> usize {
        self.data.len()
    }
660
    /// Returns whether the underlying `StrData` is empty.
    #[inline]
    pub fn is_empty(&self) -> bool {
        self.data.is_empty()
    }
665
    /// Returns the character count reported by the underlying `StrData`.
    #[inline]
    pub fn char_len(&self) -> usize {
        self.data.char_len()
    }
670
    // True exactly when the string's kind is `StrKind::Ascii`; no per-character
    // scan is performed here.
    #[pymethod]
    #[inline(always)]
    pub const fn isascii(&self) -> bool {
        matches!(self.kind(), StrKind::Ascii)
    }
676
677    #[pymethod]
678    fn __sizeof__(&self) -> usize {
679        core::mem::size_of::<Self>() + self.byte_len() * core::mem::size_of::<u8>()
680    }
681
682    fn __mul__(zelf: PyRef<Self>, value: ArgSize, vm: &VirtualMachine) -> PyResult<PyRef<Self>> {
683        Self::repeat(zelf, value.into(), vm)
684    }
685
686    #[inline]
687    pub(crate) fn repr(&self, vm: &VirtualMachine) -> PyResult<String> {
688        use crate::literal::escape::UnicodeEscape;
689        UnicodeEscape::new_repr(self.as_wtf8())
690            .str_repr()
691            .to_string()
692            .ok_or_else(|| vm.new_overflow_error("string is too long to generate repr"))
693    }
694
695    #[pymethod]
696    fn lower(&self) -> Self {
697        match self.as_str_kind() {
698            PyKindStr::Ascii(s) => s.to_ascii_lowercase().into(),
699            PyKindStr::Utf8(s) => s.to_lowercase().into(),
700            PyKindStr::Wtf8(w) => w.to_lowercase().into(),
701        }
702    }
703
704    // casefold is much more aggressive than lower
705    #[pymethod]
706    fn casefold(&self) -> Self {
707        match self.as_str_kind() {
708            PyKindStr::Ascii(s) => caseless::default_case_fold_str(s.as_str()).into(),
709            PyKindStr::Utf8(s) => caseless::default_case_fold_str(s).into(),
710            PyKindStr::Wtf8(w) => w
711                .chunks()
712                .map(|c| match c {
713                    Wtf8Chunk::Utf8(s) => Wtf8Buf::from_string(caseless::default_case_fold_str(s)),
714                    Wtf8Chunk::Surrogate(c) => Wtf8Buf::from(c),
715                })
716                .collect::<Wtf8Buf>()
717                .into(),
718        }
719    }
720
721    #[pymethod]
722    fn upper(&self) -> Self {
723        match self.as_str_kind() {
724            PyKindStr::Ascii(s) => s.to_ascii_uppercase().into(),
725            PyKindStr::Utf8(s) => s.to_uppercase().into(),
726            PyKindStr::Wtf8(w) => w.to_uppercase().into(),
727        }
728    }
729
730    #[pymethod]
731    fn capitalize(&self) -> Wtf8Buf {
732        match self.as_str_kind() {
733            PyKindStr::Ascii(s) => {
734                let mut s = s.to_owned();
735                if let [first, rest @ ..] = s.as_mut_slice() {
736                    first.make_ascii_uppercase();
737                    ascii::AsciiStr::make_ascii_lowercase(rest.into());
738                }
739                s.into()
740            }
741            PyKindStr::Utf8(s) => {
742                let mut chars = s.chars();
743                let mut out = String::with_capacity(s.len());
744                if let Some(c) = chars.next() {
745                    out.extend(c.to_titlecase());
746                    out.push_str(&chars.as_str().to_lowercase());
747                }
748                out.into()
749            }
750            PyKindStr::Wtf8(s) => {
751                let mut out = Wtf8Buf::with_capacity(s.len());
752                let mut chars = s.code_points();
753                if let Some(ch) = chars.next() {
754                    match ch.to_char() {
755                        Some(ch) => out.extend(ch.to_titlecase()),
756                        None => out.push(ch),
757                    }
758                    out.push_wtf8(&chars.as_wtf8().to_lowercase());
759                }
760                out
761            }
762        }
763    }
764
765    #[pymethod]
766    fn split(zelf: &Py<Self>, args: SplitArgs, vm: &VirtualMachine) -> PyResult<Vec<PyObjectRef>> {
767        let elements = match zelf.as_str_kind() {
768            PyKindStr::Ascii(s) => s.py_split(
769                args,
770                vm,
771                || zelf.as_object().to_owned(),
772                |v, s, vm| {
773                    v.as_bytes()
774                        .split_str(s)
775                        .map(|s| unsafe { AsciiStr::from_ascii_unchecked(s) }.to_pyobject(vm))
776                        .collect()
777                },
778                |v, s, n, vm| {
779                    v.as_bytes()
780                        .splitn_str(n, s)
781                        .map(|s| unsafe { AsciiStr::from_ascii_unchecked(s) }.to_pyobject(vm))
782                        .collect()
783                },
784                |v, n, vm| {
785                    v.as_bytes().py_split_whitespace(n, |s| {
786                        unsafe { AsciiStr::from_ascii_unchecked(s) }.to_pyobject(vm)
787                    })
788                },
789            ),
790            PyKindStr::Utf8(s) => s.py_split(
791                args,
792                vm,
793                || zelf.as_object().to_owned(),
794                |v, s, vm| v.split(s).map(|s| vm.ctx.new_str(s).into()).collect(),
795                |v, s, n, vm| v.splitn(n, s).map(|s| vm.ctx.new_str(s).into()).collect(),
796                |v, n, vm| v.py_split_whitespace(n, |s| vm.ctx.new_str(s).into()),
797            ),
798            PyKindStr::Wtf8(w) => w.py_split(
799                args,
800                vm,
801                || zelf.as_object().to_owned(),
802                |v, s, vm| v.split(s).map(|s| vm.ctx.new_str(s).into()).collect(),
803                |v, s, n, vm| v.splitn(n, s).map(|s| vm.ctx.new_str(s).into()).collect(),
804                |v, n, vm| v.py_split_whitespace(n, |s| vm.ctx.new_str(s).into()),
805            ),
806        }?;
807        Ok(elements)
808    }
809
810    #[pymethod]
811    fn rsplit(zelf: &Py<Self>, args: SplitArgs, vm: &VirtualMachine) -> PyResult<Vec<PyObjectRef>> {
812        let mut elements = zelf.as_wtf8().py_split(
813            args,
814            vm,
815            || zelf.as_object().to_owned(),
816            |v, s, vm| v.rsplit(s).map(|s| vm.ctx.new_str(s).into()).collect(),
817            |v, s, n, vm| v.rsplitn(n, s).map(|s| vm.ctx.new_str(s).into()).collect(),
818            |v, n, vm| v.py_rsplit_whitespace(n, |s| vm.ctx.new_str(s).into()),
819        )?;
820        // Unlike Python rsplit, Rust rsplitn returns an iterator that
821        // starts from the end of the string.
822        elements.reverse();
823        Ok(elements)
824    }
825
826    #[pymethod]
827    fn strip(&self, chars: OptionalOption<PyStrRef>) -> Self {
828        match self.as_str_kind() {
829            PyKindStr::Ascii(s) => s
830                .py_strip(
831                    chars,
832                    |s, chars| {
833                        let s = s
834                            .as_str()
835                            .trim_matches(|c| memchr::memchr(c as _, chars.as_bytes()).is_some());
836                        unsafe { AsciiStr::from_ascii_unchecked(s.as_bytes()) }
837                    },
838                    |s| s.trim(),
839                )
840                .into(),
841            PyKindStr::Utf8(s) => s
842                .py_strip(
843                    chars,
844                    |s, chars| s.trim_matches(|c| chars.contains(c)),
845                    |s| s.trim(),
846                )
847                .into(),
848            PyKindStr::Wtf8(w) => w
849                .py_strip(
850                    chars,
851                    |s, chars| s.trim_matches(|c| chars.code_points().contains(&c)),
852                    |s| s.trim(),
853                )
854                .into(),
855        }
856    }
857
858    #[pymethod]
859    fn lstrip(
860        zelf: PyRef<Self>,
861        chars: OptionalOption<PyStrRef>,
862        vm: &VirtualMachine,
863    ) -> PyRef<Self> {
864        let s = zelf.as_wtf8();
865        let stripped = s.py_strip(
866            chars,
867            |s, chars| s.trim_start_matches(|c| chars.contains_code_point(c)),
868            |s| s.trim_start(),
869        );
870        if s == stripped {
871            zelf
872        } else {
873            vm.ctx.new_str(stripped)
874        }
875    }
876
877    #[pymethod]
878    fn rstrip(
879        zelf: PyRef<Self>,
880        chars: OptionalOption<PyStrRef>,
881        vm: &VirtualMachine,
882    ) -> PyRef<Self> {
883        let s = zelf.as_wtf8();
884        let stripped = s.py_strip(
885            chars,
886            |s, chars| s.trim_end_matches(|c| chars.contains_code_point(c)),
887            |s| s.trim_end(),
888        );
889        if s == stripped {
890            zelf
891        } else {
892            vm.ctx.new_str(stripped)
893        }
894    }
895
896    #[pymethod]
897    fn endswith(&self, options: anystr::StartsEndsWithArgs, vm: &VirtualMachine) -> PyResult<bool> {
898        let (affix, substr) =
899            match options.prepare(self.as_wtf8(), self.len(), |s, r| s.get_chars(r)) {
900                Some(x) => x,
901                None => return Ok(false),
902            };
903        substr.py_starts_ends_with(
904            &affix,
905            "endswith",
906            "str",
907            |s, x: &Py<Self>| s.ends_with(x.as_wtf8()),
908            vm,
909        )
910    }
911
912    #[pymethod]
913    fn startswith(
914        &self,
915        options: anystr::StartsEndsWithArgs,
916        vm: &VirtualMachine,
917    ) -> PyResult<bool> {
918        let (affix, substr) =
919            match options.prepare(self.as_wtf8(), self.len(), |s, r| s.get_chars(r)) {
920                Some(x) => x,
921                None => return Ok(false),
922            };
923        substr.py_starts_ends_with(
924            &affix,
925            "startswith",
926            "str",
927            |s, x: &Py<Self>| s.starts_with(x.as_wtf8()),
928            vm,
929        )
930    }
931
932    #[pymethod]
933    fn removeprefix(&self, pref: PyStrRef) -> Wtf8Buf {
934        self.as_wtf8()
935            .py_removeprefix(pref.as_wtf8(), pref.byte_len(), |s, p| s.starts_with(p))
936            .to_owned()
937    }
938
939    #[pymethod]
940    fn removesuffix(&self, suffix: PyStrRef) -> Wtf8Buf {
941        self.as_wtf8()
942            .py_removesuffix(suffix.as_wtf8(), suffix.byte_len(), |s, p| s.ends_with(p))
943            .to_owned()
944    }
945
946    #[pymethod]
947    fn isalnum(&self) -> bool {
948        !self.data.is_empty() && self.char_all(char::is_alphanumeric)
949    }
950
951    #[pymethod]
952    fn isnumeric(&self) -> bool {
953        !self.data.is_empty() && self.char_all(char::is_numeric)
954    }
955
956    #[pymethod]
957    fn isdigit(&self) -> bool {
958        // python's isdigit also checks if exponents are digits, these are the unicode codepoints for exponents
959        !self.data.is_empty()
960            && self.char_all(|c| {
961                c.is_ascii_digit()
962                    || matches!(c, '⁰' | '¹' | '²' | '³' | '⁴' | '⁵' | '⁶' | '⁷' | '⁸' | '⁹')
963            })
964    }
965
966    #[pymethod]
967    fn isdecimal(&self) -> bool {
968        !self.data.is_empty()
969            && self.char_all(|c| GeneralCategory::of(c) == GeneralCategory::DecimalNumber)
970    }
971
    // Printf-style formatting (`self % values`): delegates to `cformat_string`
    // with this string as the format template.
    fn __mod__(&self, values: PyObjectRef, vm: &VirtualMachine) -> PyResult<Wtf8Buf> {
        cformat_string(vm, self.as_wtf8(), values)
    }
975
976    #[pymethod]
977    fn format(&self, args: FuncArgs, vm: &VirtualMachine) -> PyResult<Wtf8Buf> {
978        let format_str =
979            FormatString::from_str(self.as_wtf8()).map_err(|e| e.to_pyexception(vm))?;
980        format(&format_str, &args, vm)
981    }
982
983    #[pymethod]
984    fn format_map(&self, mapping: PyObjectRef, vm: &VirtualMachine) -> PyResult<Wtf8Buf> {
985        let format_string =
986            FormatString::from_str(self.as_wtf8()).map_err(|err| err.to_pyexception(vm))?;
987        format_map(&format_string, &mapping, vm)
988    }
989
990    #[pymethod]
991    fn __format__(
992        zelf: PyRef<PyStr>,
993        spec: PyUtf8StrRef,
994        vm: &VirtualMachine,
995    ) -> PyResult<PyRef<PyStr>> {
996        if spec.is_empty() {
997            return if zelf.class().is(vm.ctx.types.str_type) {
998                Ok(zelf)
999            } else {
1000                zelf.as_object().str(vm)
1001            };
1002        }
1003        let zelf = zelf.try_into_utf8(vm)?;
1004        let s = FormatSpec::parse(spec.as_str())
1005            .and_then(|format_spec| {
1006                format_spec.format_string(&CharLenStr(zelf.as_str(), zelf.char_len()))
1007            })
1008            .map_err(|err| err.into_pyexception(vm))?;
1009        Ok(vm.ctx.new_str(s))
1010    }
1011
1012    #[pymethod]
1013    fn title(&self) -> Wtf8Buf {
1014        let mut title = Wtf8Buf::with_capacity(self.data.len());
1015        let mut previous_is_cased = false;
1016        for c_orig in self.as_wtf8().code_points() {
1017            let c = c_orig.to_char_lossy();
1018            if c.is_lowercase() {
1019                if !previous_is_cased {
1020                    title.extend(c.to_titlecase());
1021                } else {
1022                    title.push_char(c);
1023                }
1024                previous_is_cased = true;
1025            } else if c.is_uppercase() || c.is_titlecase() {
1026                if previous_is_cased {
1027                    title.extend(c.to_lowercase());
1028                } else {
1029                    title.push_char(c);
1030                }
1031                previous_is_cased = true;
1032            } else {
1033                previous_is_cased = false;
1034                title.push(c_orig);
1035            }
1036        }
1037        title
1038    }
1039
1040    #[pymethod]
1041    fn swapcase(&self) -> Wtf8Buf {
1042        let mut swapped_str = Wtf8Buf::with_capacity(self.data.len());
1043        for c_orig in self.as_wtf8().code_points() {
1044            let c = c_orig.to_char_lossy();
1045            // to_uppercase returns an iterator, to_ascii_uppercase returns the char
1046            if c.is_lowercase() {
1047                swapped_str.push_char(c.to_ascii_uppercase());
1048            } else if c.is_uppercase() {
1049                swapped_str.push_char(c.to_ascii_lowercase());
1050            } else {
1051                swapped_str.push(c_orig);
1052            }
1053        }
1054        swapped_str
1055    }
1056
1057    #[pymethod]
1058    fn isalpha(&self) -> bool {
1059        !self.data.is_empty() && self.char_all(char::is_alphabetic)
1060    }
1061
1062    #[pymethod]
1063    fn replace(&self, args: ReplaceArgs) -> Wtf8Buf {
1064        use core::cmp::Ordering;
1065
1066        let s = self.as_wtf8();
1067        let ReplaceArgs { old, new, count } = args;
1068
1069        match count.cmp(&0) {
1070            Ordering::Less => s.replace(old.as_wtf8(), new.as_wtf8()),
1071            Ordering::Equal => s.to_owned(),
1072            Ordering::Greater => {
1073                let s_is_empty = s.is_empty();
1074                let old_is_empty = old.is_empty();
1075
1076                if s_is_empty && !old_is_empty {
1077                    s.to_owned()
1078                } else if s_is_empty && old_is_empty {
1079                    new.as_wtf8().to_owned()
1080                } else {
1081                    s.replacen(old.as_wtf8(), new.as_wtf8(), count as usize)
1082                }
1083            }
1084        }
1085    }
1086
1087    #[pymethod]
1088    fn isprintable(&self) -> bool {
1089        self.char_all(|c| c == '\u{0020}' || rustpython_literal::char::is_printable(c))
1090    }
1091
1092    #[pymethod]
1093    fn isspace(&self) -> bool {
1094        use unic_ucd_bidi::bidi_class::abbr_names::*;
1095        !self.data.is_empty()
1096            && self.char_all(|c| {
1097                GeneralCategory::of(c) == GeneralCategory::SpaceSeparator
1098                    || matches!(BidiClass::of(c), WS | B | S)
1099            })
1100    }
1101
1102    // Return true if all cased characters in the string are lowercase and there is at least one cased character, false otherwise.
1103    #[pymethod]
1104    fn islower(&self) -> bool {
1105        match self.as_str_kind() {
1106            PyKindStr::Ascii(s) => s.py_islower(),
1107            PyKindStr::Utf8(s) => s.py_islower(),
1108            PyKindStr::Wtf8(w) => w.py_islower(),
1109        }
1110    }
1111
1112    // Return true if all cased characters in the string are uppercase and there is at least one cased character, false otherwise.
1113    #[pymethod]
1114    fn isupper(&self) -> bool {
1115        match self.as_str_kind() {
1116            PyKindStr::Ascii(s) => s.py_isupper(),
1117            PyKindStr::Utf8(s) => s.py_isupper(),
1118            PyKindStr::Wtf8(w) => w.py_isupper(),
1119        }
1120    }
1121
1122    #[pymethod]
1123    fn splitlines(&self, args: anystr::SplitLinesArgs, vm: &VirtualMachine) -> Vec<PyObjectRef> {
1124        let into_wrapper = |s: &Wtf8| self.new_substr(s.to_owned()).to_pyobject(vm);
1125        let mut elements = Vec::new();
1126        let mut last_i = 0;
1127        let self_str = self.as_wtf8();
1128        let mut enumerated = self_str.code_point_indices().peekable();
1129        while let Some((i, ch)) = enumerated.next() {
1130            let end_len = match ch.to_char_lossy() {
1131                '\n' => 1,
1132                '\r' => {
1133                    let is_rn = enumerated.next_if(|(_, ch)| *ch == '\n').is_some();
1134                    if is_rn { 2 } else { 1 }
1135                }
1136                '\x0b' | '\x0c' | '\x1c' | '\x1d' | '\x1e' | '\u{0085}' | '\u{2028}'
1137                | '\u{2029}' => ch.len_wtf8(),
1138                _ => continue,
1139            };
1140            let range = if args.keepends {
1141                last_i..i + end_len
1142            } else {
1143                last_i..i
1144            };
1145            last_i = i + end_len;
1146            elements.push(into_wrapper(&self_str[range]));
1147        }
1148        if last_i != self_str.len() {
1149            elements.push(into_wrapper(&self_str[last_i..]));
1150        }
1151        elements
1152    }
1153
1154    #[pymethod]
1155    fn join(
1156        zelf: PyRef<Self>,
1157        iterable: ArgIterable<PyStrRef>,
1158        vm: &VirtualMachine,
1159    ) -> PyResult<PyStrRef> {
1160        let iter = iterable.iter(vm)?;
1161        let joined = match iter.exactly_one() {
1162            Ok(first) => {
1163                let first = first?;
1164                if first.as_object().class().is(vm.ctx.types.str_type) {
1165                    return Ok(first);
1166                } else {
1167                    first.as_wtf8().to_owned()
1168                }
1169            }
1170            Err(iter) => zelf.as_wtf8().py_join(iter)?,
1171        };
1172        Ok(vm.ctx.new_str(joined))
1173    }
1174
1175    // FIXME: two traversals of str is expensive
1176    #[inline]
1177    fn _to_char_idx(r: &Wtf8, byte_idx: usize) -> usize {
1178        r[..byte_idx].code_points().count()
1179    }
1180
1181    #[inline]
1182    fn _find<F>(&self, args: FindArgs, find: F) -> Option<usize>
1183    where
1184        F: Fn(&Wtf8, &Wtf8) -> Option<usize>,
1185    {
1186        let (sub, range) = args.get_value(self.len());
1187        self.as_wtf8().py_find(sub.as_wtf8(), range, find)
1188    }
1189
1190    #[pymethod]
1191    fn find(&self, args: FindArgs) -> isize {
1192        self._find(args, |r, s| Some(Self::_to_char_idx(r, r.find(s)?)))
1193            .map_or(-1, |v| v as isize)
1194    }
1195
1196    #[pymethod]
1197    fn rfind(&self, args: FindArgs) -> isize {
1198        self._find(args, |r, s| Some(Self::_to_char_idx(r, r.rfind(s)?)))
1199            .map_or(-1, |v| v as isize)
1200    }
1201
1202    #[pymethod]
1203    fn index(&self, args: FindArgs, vm: &VirtualMachine) -> PyResult<usize> {
1204        self._find(args, |r, s| Some(Self::_to_char_idx(r, r.find(s)?)))
1205            .ok_or_else(|| vm.new_value_error("substring not found"))
1206    }
1207
1208    #[pymethod]
1209    fn rindex(&self, args: FindArgs, vm: &VirtualMachine) -> PyResult<usize> {
1210        self._find(args, |r, s| Some(Self::_to_char_idx(r, r.rfind(s)?)))
1211            .ok_or_else(|| vm.new_value_error("substring not found"))
1212    }
1213
1214    #[pymethod]
1215    fn partition(&self, sep: PyStrRef, vm: &VirtualMachine) -> PyResult {
1216        let (front, has_mid, back) = self.as_wtf8().py_partition(
1217            sep.as_wtf8(),
1218            || self.as_wtf8().splitn(2, sep.as_wtf8()),
1219            vm,
1220        )?;
1221        let partition = (
1222            self.new_substr(front),
1223            if has_mid {
1224                sep
1225            } else {
1226                vm.ctx.new_str(ascii!(""))
1227            },
1228            self.new_substr(back),
1229        );
1230        Ok(partition.to_pyobject(vm))
1231    }
1232
1233    #[pymethod]
1234    fn rpartition(&self, sep: PyStrRef, vm: &VirtualMachine) -> PyResult {
1235        let (back, has_mid, front) = self.as_wtf8().py_partition(
1236            sep.as_wtf8(),
1237            || self.as_wtf8().rsplitn(2, sep.as_wtf8()),
1238            vm,
1239        )?;
1240        Ok((
1241            self.new_substr(front),
1242            if has_mid {
1243                sep
1244            } else {
1245                vm.ctx.empty_str.to_owned()
1246            },
1247            self.new_substr(back),
1248        )
1249            .to_pyobject(vm))
1250    }
1251
1252    #[pymethod]
1253    fn istitle(&self) -> bool {
1254        if self.data.is_empty() {
1255            return false;
1256        }
1257
1258        let mut cased = false;
1259        let mut previous_is_cased = false;
1260        for c in self.as_wtf8().code_points().map(CodePoint::to_char_lossy) {
1261            if c.is_uppercase() || c.is_titlecase() {
1262                if previous_is_cased {
1263                    return false;
1264                }
1265                previous_is_cased = true;
1266                cased = true;
1267            } else if c.is_lowercase() {
1268                if !previous_is_cased {
1269                    return false;
1270                }
1271                previous_is_cased = true;
1272                cased = true;
1273            } else {
1274                previous_is_cased = false;
1275            }
1276        }
1277        cased
1278    }
1279
1280    #[pymethod]
1281    fn count(&self, args: FindArgs) -> usize {
1282        let (needle, range) = args.get_value(self.len());
1283        self.as_wtf8()
1284            .py_count(needle.as_wtf8(), range, |h, n| h.find_iter(n).count())
1285    }
1286
1287    #[pymethod]
1288    fn zfill(&self, width: isize) -> Wtf8Buf {
1289        unsafe {
1290            // SAFETY: this is safe-guaranteed because the original self.as_wtf8() is valid wtf8
1291            Wtf8Buf::from_bytes_unchecked(self.as_wtf8().py_zfill(width))
1292        }
1293    }
1294
1295    #[inline]
1296    fn _pad(
1297        &self,
1298        width: isize,
1299        fillchar: OptionalArg<PyStrRef>,
1300        pad: fn(&Wtf8, usize, CodePoint, usize) -> Wtf8Buf,
1301        vm: &VirtualMachine,
1302    ) -> PyResult<Wtf8Buf> {
1303        let fillchar = fillchar.map_or(Ok(' '.into()), |ref s| {
1304            s.as_wtf8().code_points().exactly_one().map_err(|_| {
1305                vm.new_type_error("The fill character must be exactly one character long")
1306            })
1307        })?;
1308        Ok(if self.len() as isize >= width {
1309            self.as_wtf8().to_owned()
1310        } else {
1311            pad(self.as_wtf8(), width as usize, fillchar, self.len())
1312        })
1313    }
1314
    // Implements `str.center` by running `_pad` with `AnyStr::py_center` as the
    // padding routine.
    #[pymethod]
    fn center(
        &self,
        width: isize,
        fillchar: OptionalArg<PyStrRef>,
        vm: &VirtualMachine,
    ) -> PyResult<Wtf8Buf> {
        self._pad(width, fillchar, AnyStr::py_center, vm)
    }
1324
    // Implements `str.ljust` by running `_pad` with `AnyStr::py_ljust` as the
    // padding routine.
    #[pymethod]
    fn ljust(
        &self,
        width: isize,
        fillchar: OptionalArg<PyStrRef>,
        vm: &VirtualMachine,
    ) -> PyResult<Wtf8Buf> {
        self._pad(width, fillchar, AnyStr::py_ljust, vm)
    }
1334
    // Implements `str.rjust` by running `_pad` with `AnyStr::py_rjust` as the
    // padding routine.
    #[pymethod]
    fn rjust(
        &self,
        width: isize,
        fillchar: OptionalArg<PyStrRef>,
        vm: &VirtualMachine,
    ) -> PyResult<Wtf8Buf> {
        self._pad(width, fillchar, AnyStr::py_rjust, vm)
    }
1344
1345    #[pymethod]
1346    fn expandtabs(&self, args: anystr::ExpandTabsArgs, vm: &VirtualMachine) -> PyResult<String> {
1347        // TODO: support WTF-8
1348        Ok(rustpython_common::str::expandtabs(
1349            self.try_as_utf8(vm)?.as_str(),
1350            args.tabsize(),
1351        ))
1352    }
1353
1354    #[pymethod]
1355    pub fn isidentifier(&self) -> bool {
1356        let Some(s) = self.to_str() else { return false };
1357        let mut chars = s.chars();
1358        let is_identifier_start = chars.next().is_some_and(|c| c == '_' || is_xid_start(c));
1359        // a string is not an identifier if it has whitespace or starts with a number
1360        is_identifier_start && chars.all(is_xid_continue)
1361    }
1362
1363    // https://docs.python.org/3/library/stdtypes.html#str.translate
1364    #[pymethod]
1365    fn translate(&self, table: PyObjectRef, vm: &VirtualMachine) -> PyResult<Wtf8Buf> {
1366        vm.get_method_or_type_error(table.clone(), identifier!(vm, __getitem__), || {
1367            format!("'{}' object is not subscriptable", table.class().name())
1368        })?;
1369
1370        let mut translated = Wtf8Buf::new();
1371        for cp in self.as_wtf8().code_points() {
1372            match table.get_item(&*cp.to_u32().to_pyobject(vm), vm) {
1373                Ok(value) => {
1374                    if let Some(text) = value.downcast_ref::<Self>() {
1375                        translated.push_wtf8(text.as_wtf8());
1376                    } else if let Some(bigint) = value.downcast_ref::<PyInt>() {
1377                        let mapped = bigint
1378                            .as_bigint()
1379                            .to_u32()
1380                            .and_then(CodePoint::from_u32)
1381                            .ok_or_else(|| {
1382                                vm.new_value_error("character mapping must be in range(0x110000)")
1383                            })?;
1384                        translated.push(mapped);
1385                    } else if !vm.is_none(&value) {
1386                        return Err(
1387                            vm.new_type_error("character mapping must return integer, None or str")
1388                        );
1389                    }
1390                }
1391                Err(e) if e.fast_isinstance(vm.ctx.exceptions.key_error) => translated.push(cp),
1392                Err(e) => return Err(e),
1393            }
1394        }
1395        Ok(translated)
1396    }
1397
1398    #[pystaticmethod]
1399    fn maketrans(
1400        dict_or_str: PyObjectRef,
1401        to_str: OptionalArg<PyStrRef>,
1402        none_str: OptionalArg<PyStrRef>,
1403        vm: &VirtualMachine,
1404    ) -> PyResult {
1405        let new_dict = vm.ctx.new_dict();
1406        if let OptionalArg::Present(to_str) = to_str {
1407            match dict_or_str.downcast::<Self>() {
1408                Ok(from_str) => {
1409                    if to_str.len() == from_str.len() {
1410                        for (c1, c2) in from_str
1411                            .as_wtf8()
1412                            .code_points()
1413                            .zip(to_str.as_wtf8().code_points())
1414                        {
1415                            new_dict.set_item(
1416                                &*vm.new_pyobj(c1.to_u32()),
1417                                vm.new_pyobj(c2.to_u32()),
1418                                vm,
1419                            )?;
1420                        }
1421                        if let OptionalArg::Present(none_str) = none_str {
1422                            for c in none_str.as_wtf8().code_points() {
1423                                new_dict.set_item(&*vm.new_pyobj(c.to_u32()), vm.ctx.none(), vm)?;
1424                            }
1425                        }
1426                        Ok(new_dict.to_pyobject(vm))
1427                    } else {
1428                        Err(vm.new_value_error(
1429                            "the first two maketrans arguments must have equal length",
1430                        ))
1431                    }
1432                }
1433                _ => Err(vm.new_type_error(
1434                    "first maketrans argument must be a string if there is a second argument",
1435                )),
1436            }
1437        } else {
1438            // dict_str must be a dict
1439            match dict_or_str.downcast::<PyDict>() {
1440                Ok(dict) => {
1441                    for (key, val) in dict {
1442                        // FIXME: ints are key-compatible
1443                        if let Some(num) = key.downcast_ref::<PyInt>() {
1444                            new_dict.set_item(
1445                                &*num.as_bigint().to_i32().to_pyobject(vm),
1446                                val,
1447                                vm,
1448                            )?;
1449                        } else if let Some(string) = key.downcast_ref::<Self>() {
1450                            if string.len() == 1 {
1451                                let num_value =
1452                                    string.as_wtf8().code_points().next().unwrap().to_u32();
1453                                new_dict.set_item(&*num_value.to_pyobject(vm), val, vm)?;
1454                            } else {
1455                                return Err(vm.new_value_error(
1456                                    "string keys in translate table must be of length 1",
1457                                ));
1458                            }
1459                        } else {
1460                            return Err(vm.new_type_error(
1461                                "keys in translate table must be strings or integers",
1462                            ));
1463                        }
1464                    }
1465                    Ok(new_dict.to_pyobject(vm))
1466                }
1467                _ => Err(vm.new_value_error(
1468                    "if you give only one argument to maketrans it must be a dict",
1469                )),
1470            }
1471        }
1472    }
1473
    // Implements `str.encode` by forwarding the optional `encoding` and `errors`
    // arguments to `encode_string`.
    #[pymethod]
    fn encode(zelf: PyRef<Self>, args: EncodeArgs, vm: &VirtualMachine) -> PyResult<PyBytesRef> {
        encode_string(zelf, args.encoding, args.errors, vm)
    }
1478
    // Implements `str.__getnewargs__`: returns a 1-tuple holding the string
    // contents converted to a Python object.
    #[pymethod]
    fn __getnewargs__(zelf: PyRef<Self>, vm: &VirtualMachine) -> PyObjectRef {
        (zelf.as_wtf8(),).to_pyobject(vm)
    }
1483
1484    #[pymethod]
1485    fn __str__(zelf: &Py<Self>, vm: &VirtualMachine) -> PyResult<PyStrRef> {
1486        if zelf.class().is(vm.ctx.types.str_type) {
1487            // Already exact str, just return a reference
1488            Ok(zelf.to_owned())
1489        } else {
1490            // Subclass, create a new exact str
1491            Ok(PyStr::from(zelf.data.clone()).into_ref(&vm.ctx))
1492        }
1493    }
1494}
1495
// Helpers defined directly on `PyRef<PyStr>` handles.
impl PyRef<PyStr> {
    // Forwards to `is_empty` on the underlying `PyStr`.
    pub fn is_empty(&self) -> bool {
        (**self).is_empty()
    }

    // Appends `other` to the string behind this reference.
    //
    // Nothing happens when `other` is empty. Otherwise the concatenation is built
    // in a fresh buffer; if this reference is the only strong reference, the
    // payload is overwritten in place and the cached hash is reset, and if not,
    // `self` is rebound to a newly created string object.
    pub fn concat_in_place(&mut self, other: &Wtf8, vm: &VirtualMachine) {
        if other.is_empty() {
            return;
        }
        // Sized for both operands up front.
        let mut s = Wtf8Buf::with_capacity(self.byte_len() + other.len());
        s.push_wtf8(self.as_ref());
        s.push_wtf8(other);
        if self.as_object().strong_count() == 1 {
            // SAFETY: strong_count()==1 guarantees unique ownership of this PyStr.
            // Mutating payload in place preserves semantics while avoiding PyObject reallocation.
            unsafe {
                let payload = self.payload() as *const PyStr as *mut PyStr;
                (*payload).data = PyStr::from(s).data;
                // The contents changed, so the stored hash is set back to the sentinel.
                (*payload)
                    .hash
                    .store(hash::SENTINEL, atomic::Ordering::Relaxed);
            }
        } else {
            *self = PyStr::from(s).into_ref(&vm.ctx);
        }
    }

    // Converts this reference into a `PyRef<PyUtf8Str>` after `ensure_valid_utf8`
    // succeeds; its error is propagated otherwise. The conversion itself is a
    // transmute of the reference.
    // NOTE(review): presumably sound because `PyUtf8Str` shares `PyStr`'s layout — confirm.
    pub fn try_into_utf8(self, vm: &VirtualMachine) -> PyResult<PyRef<PyUtf8Str>> {
        self.ensure_valid_utf8(vm)?;
        Ok(unsafe { mem::transmute::<Self, PyRef<PyUtf8Str>>(self) })
    }
}
1528
// A `&str` paired with a caller-supplied character count (field 1), so that
// `CharLen::char_len` can return the stored count.
struct CharLenStr<'a>(&'a str, usize);
// Dereferences to the wrapped `&str`.
impl core::ops::Deref for CharLenStr<'_> {
    type Target = str;

    fn deref(&self) -> &Self::Target {
        self.0
    }
}
// Reports the stored character count.
impl crate::common::format::CharLen for CharLenStr<'_> {
    fn char_len(&self) -> usize {
        self.1
    }
}
1542
// `repr()` support: forwards to `repr` on the string object.
impl Representable for PyStr {
    #[inline]
    fn repr_str(zelf: &Py<Self>, vm: &VirtualMachine) -> PyResult<String> {
        zelf.repr(vm)
    }
}
1549
// `hash()` support: forwards to `hash` on the string object and wraps the
// value in `Ok`.
impl Hashable for PyStr {
    #[inline]
    fn hash(zelf: &Py<Self>, vm: &VirtualMachine) -> PyResult<hash::PyHash> {
        Ok(zelf.hash(vm))
    }
}
1556
1557impl Comparable for PyStr {
1558    fn cmp(
1559        zelf: &Py<Self>,
1560        other: &PyObject,
1561        op: PyComparisonOp,
1562        _vm: &VirtualMachine,
1563    ) -> PyResult<PyComparisonValue> {
1564        if let Some(res) = op.identical_optimization(zelf, other) {
1565            return Ok(res.into());
1566        }
1567        let other = class_or_notimplemented!(Self, other);
1568        Ok(op.eval_ord(zelf.as_wtf8().cmp(other.as_wtf8())).into())
1569    }
1570}
1571
1572impl Iterable for PyStr {
1573    fn iter(zelf: PyRef<Self>, vm: &VirtualMachine) -> PyResult {
1574        Ok(PyStrIterator {
1575            internal: PyMutex::new((PositionIterInternal::new(zelf, 0), 0)),
1576        }
1577        .into_pyobject(vm))
1578    }
1579}
1580
// Mapping protocol for `str`: `len()` and subscripting.
impl AsMapping for PyStr {
    fn as_mapping() -> &'static PyMappingMethods {
        // Built lazily on first use; every slot not listed stays NOT_IMPLEMENTED.
        static AS_MAPPING: LazyLock<PyMappingMethods> = LazyLock::new(|| PyMappingMethods {
            length: atomic_func!(|mapping, _vm| Ok(PyStr::mapping_downcast(mapping).len())),
            subscript: atomic_func!(
                |mapping, needle, vm| PyStr::mapping_downcast(mapping)._getitem(needle, vm)
            ),
            ..PyMappingMethods::NOT_IMPLEMENTED
        });
        &AS_MAPPING
    }
}
1593
// Number protocol for `str`: `+` (concatenation) and `%` (printf-style formatting).
impl AsNumber for PyStr {
    fn as_number() -> &'static PyNumberMethods {
        static AS_NUMBER: PyNumberMethods = PyNumberMethods {
            // `a + b`: both operands must be `PyStr`, otherwise NotImplemented.
            add: Some(|a, b, vm| {
                let Some(a) = a.downcast_ref::<PyStr>() else {
                    return Ok(vm.ctx.not_implemented());
                };
                let Some(b) = b.downcast_ref::<PyStr>() else {
                    return Ok(vm.ctx.not_implemented());
                };
                let bytes = a.as_wtf8().py_add(b.as_wtf8());
                Ok(unsafe {
                    // The result's kind is the `|` combination of both operand kinds.
                    let kind = a.kind() | b.kind();
                    PyStr::new_str_unchecked(bytes.into(), kind)
                }
                .to_pyobject(vm))
            }),
            // `a % b`: only the left operand needs to be a `PyStr`.
            remainder: Some(|a, b, vm| {
                if let Some(a) = a.downcast_ref::<PyStr>() {
                    a.__mod__(b.to_owned(), vm).to_pyresult(vm)
                } else {
                    Ok(vm.ctx.not_implemented())
                }
            }),
            ..PyNumberMethods::NOT_IMPLEMENTED
        };
        &AS_NUMBER
    }
}
1623
// Sequence protocol for `str`: length, concatenation, repetition, indexing and
// membership tests.
impl AsSequence for PyStr {
    fn as_sequence() -> &'static PySequenceMethods {
        // Built lazily on first use; every slot not listed stays NOT_IMPLEMENTED.
        static AS_SEQUENCE: LazyLock<PySequenceMethods> = LazyLock::new(|| PySequenceMethods {
            length: atomic_func!(|seq, _vm| Ok(PyStr::sequence_downcast(seq).len())),
            // Concatenation forwards to `PyStr::__add__`.
            concat: atomic_func!(|seq, other, vm| {
                let zelf = PyStr::sequence_downcast(seq);
                PyStr::__add__(zelf.to_owned(), other.to_owned(), vm)
            }),
            // Repetition forwards to `PyStr::repeat`.
            repeat: atomic_func!(|seq, n, vm| {
                let zelf = PyStr::sequence_downcast(seq);
                PyStr::repeat(zelf.to_owned(), n, vm).map(|x| x.into())
            }),
            // Integer indexing forwards to `getitem_by_index`.
            item: atomic_func!(|seq, i, vm| {
                let zelf = PyStr::sequence_downcast(seq);
                zelf.getitem_by_index(vm, i).to_pyresult(vm)
            }),
            // `needle in seq` forwards to `_contains`.
            contains: atomic_func!(
                |seq, needle, vm| PyStr::sequence_downcast(seq)._contains(needle, vm)
            ),
            ..PySequenceMethods::NOT_IMPLEMENTED
        });
        &AS_SEQUENCE
    }
}
1648
// Arguments of `str.encode`; both may be passed positionally or by keyword and
// both are optional.
#[derive(FromArgs)]
struct EncodeArgs {
    // Codec name; `encode_string` substitutes the default encoding when absent.
    #[pyarg(any, default)]
    encoding: Option<PyUtf8StrRef>,
    // Error-handling argument, passed through to the codec registry.
    #[pyarg(any, default)]
    errors: Option<PyUtf8StrRef>,
}
1656
1657pub(crate) fn encode_string(
1658    s: PyStrRef,
1659    encoding: Option<PyUtf8StrRef>,
1660    errors: Option<PyUtf8StrRef>,
1661    vm: &VirtualMachine,
1662) -> PyResult<PyBytesRef> {
1663    let encoding = match encoding.as_ref() {
1664        None => crate::codecs::DEFAULT_ENCODING,
1665        Some(s) => s.as_str(),
1666    };
1667    vm.state.codec_registry.encode_text(s, encoding, errors, vm)
1668}
1669
// Associates the `PyStr` payload with the context's `str_type`.
impl PyPayload for PyStr {
    #[inline]
    fn class(ctx: &Context) -> &'static Py<PyType> {
        ctx.types.str_type
    }
}
1676
// Converts an owned `String` into a Python `str` object via `ctx.new_str`.
impl ToPyObject for String {
    fn to_pyobject(self, vm: &VirtualMachine) -> PyObjectRef {
        vm.ctx.new_str(self).into()
    }
}
1682
// Converts an owned `Wtf8Buf` into a Python `str` object via `ctx.new_str`.
impl ToPyObject for Wtf8Buf {
    fn to_pyobject(self, vm: &VirtualMachine) -> PyObjectRef {
        vm.ctx.new_str(self).into()
    }
}
1688
1689impl ToPyObject for char {
1690    fn to_pyobject(self, vm: &VirtualMachine) -> PyObjectRef {
1691        let cp = self as u32;
1692        if cp <= u8::MAX as u32 {
1693            vm.ctx.latin1_char(cp as u8).into()
1694        } else {
1695            vm.ctx.new_str(self).into()
1696        }
1697    }
1698}
1699
1700impl ToPyObject for CodePoint {
1701    fn to_pyobject(self, vm: &VirtualMachine) -> PyObjectRef {
1702        let cp = self.to_u32();
1703        if cp <= u8::MAX as u32 {
1704            vm.ctx.latin1_char(cp as u8).into()
1705        } else {
1706            vm.ctx.new_str(self).into()
1707        }
1708    }
1709}
1710
// Converts a borrowed `&str` into a Python `str` object via `ctx.new_str`.
impl ToPyObject for &str {
    fn to_pyobject(self, vm: &VirtualMachine) -> PyObjectRef {
        vm.ctx.new_str(self).into()
    }
}
1716
// Converts a borrowed `&String` into a Python `str` object; the string is
// cloned before being handed to `ctx.new_str`.
impl ToPyObject for &String {
    fn to_pyobject(self, vm: &VirtualMachine) -> PyObjectRef {
        vm.ctx.new_str(self.clone()).into()
    }
}
1722
// Converts a borrowed `&Wtf8` slice into a Python `str` object via `ctx.new_str`.
impl ToPyObject for &Wtf8 {
    fn to_pyobject(self, vm: &VirtualMachine) -> PyObjectRef {
        vm.ctx.new_str(self).into()
    }
}
1728
// Converts a borrowed `&Wtf8Buf` into a Python `str` object; the buffer is
// cloned before being handed to `ctx.new_str`.
impl ToPyObject for &Wtf8Buf {
    fn to_pyobject(self, vm: &VirtualMachine) -> PyObjectRef {
        vm.ctx.new_str(self.clone()).into()
    }
}
1734
// Converts a borrowed `&AsciiStr` into a Python `str` object via `ctx.new_str`.
impl ToPyObject for &AsciiStr {
    fn to_pyobject(self, vm: &VirtualMachine) -> PyObjectRef {
        vm.ctx.new_str(self).into()
    }
}
1740
// Converts an owned `AsciiString` into a Python `str` object via `ctx.new_str`.
impl ToPyObject for AsciiString {
    fn to_pyobject(self, vm: &VirtualMachine) -> PyObjectRef {
        vm.ctx.new_str(self).into()
    }
}
1746
// Converts an `AsciiChar` into a one-character Python `str` via
// `ctx.latin1_char`, using the character's byte value.
impl ToPyObject for AsciiChar {
    fn to_pyobject(self, vm: &VirtualMachine) -> PyObjectRef {
        vm.ctx.latin1_char(u8::from(self)).into()
    }
}
1752
// The generic `anystr::SplitArgs` specialised to string separators.
type SplitArgs = anystr::SplitArgs<PyStrRef>;
1754
// Positional arguments shared by `find`, `rfind`, `index`, `rindex` and `count`.
#[derive(FromArgs)]
pub struct FindArgs {
    // Substring to search for (required).
    #[pyarg(positional)]
    sub: PyStrRef,
    // Optional start bound, resolved by `adjust_indices`.
    #[pyarg(positional, default)]
    start: Option<PyIntRef>,
    // Optional end bound, resolved by `adjust_indices`.
    #[pyarg(positional, default)]
    end: Option<PyIntRef>,
}
1764
1765impl FindArgs {
1766    fn get_value(self, len: usize) -> (PyStrRef, core::ops::Range<usize>) {
1767        let range = adjust_indices(self.start, self.end, len);
1768        (self.sub, range)
1769    }
1770}
1771
// Arguments of `str.replace`.
#[derive(FromArgs)]
struct ReplaceArgs {
    // Substring to look for (positional).
    #[pyarg(positional)]
    old: PyStrRef,

    // Replacement text (positional).
    #[pyarg(positional)]
    new: PyStrRef,

    // Maximum number of replacements; defaults to -1, which `replace` treats as
    // "replace all".
    #[pyarg(any, default = -1)]
    count: isize,
}
1783
1784fn vectorcall_str(
1785    zelf_obj: &PyObject,
1786    args: Vec<PyObjectRef>,
1787    nargs: usize,
1788    kwnames: Option<&[PyObjectRef]>,
1789    vm: &VirtualMachine,
1790) -> PyResult {
1791    let zelf: &Py<PyType> = zelf_obj.downcast_ref().unwrap();
1792    let func_args = FuncArgs::from_vectorcall_owned(args, nargs, kwnames);
1793    (zelf.slots.new.load().unwrap())(zelf.to_owned(), func_args, vm)
1794}
1795
1796pub fn init(ctx: &'static Context) {
1797    PyStr::extend_class(ctx, ctx.types.str_type);
1798    ctx.types
1799        .str_type
1800        .slots
1801        .vectorcall
1802        .store(Some(vectorcall_str));
1803
1804    PyStrIterator::extend_class(ctx, ctx.types.str_iterator_type);
1805}
1806
1807impl SliceableSequenceOp for PyStr {
1808    type Item = CodePoint;
1809    type Sliced = Self;
1810
    // Returns the code point at character position `index` via `nth_char`.
    fn do_get(&self, index: usize) -> Self::Item {
        self.data.nth_char(index)
    }
1814
1815    fn do_slice(&self, range: Range<usize>) -> Self::Sliced {
1816        match self.as_str_kind() {
1817            PyKindStr::Ascii(s) => s[range].into(),
1818            PyKindStr::Utf8(s) => {
1819                let char_len = range.len();
1820                let out = rustpython_common::str::get_chars(s, range);
1821                // SAFETY: char_len is accurate
1822                unsafe { Self::new_with_char_len(out, char_len) }
1823            }
1824            PyKindStr::Wtf8(w) => {
1825                let char_len = range.len();
1826                let out = rustpython_common::str::get_codepoints(w, range);
1827                // SAFETY: char_len is accurate
1828                unsafe { Self::new_with_char_len(out, char_len) }
1829            }
1830        }
1831    }
1832
1833    fn do_slice_reverse(&self, range: Range<usize>) -> Self::Sliced {
1834        match self.as_str_kind() {
1835            PyKindStr::Ascii(s) => {
1836                let mut out = s[range].to_owned();
1837                out.as_mut_slice().reverse();
1838                out.into()
1839            }
1840            PyKindStr::Utf8(s) => {
1841                let char_len = range.len();
1842                let mut out = String::with_capacity(2 * char_len);
1843                out.extend(
1844                    s.chars()
1845                        .rev()
1846                        .skip(self.char_len() - range.end)
1847                        .take(range.len()),
1848                );
1849                // SAFETY: char_len is accurate
1850                unsafe { Self::new_with_char_len(out, range.len()) }
1851            }
1852            PyKindStr::Wtf8(w) => {
1853                let char_len = range.len();
1854                let mut out = Wtf8Buf::with_capacity(2 * char_len);
1855                out.extend(
1856                    w.code_points()
1857                        .rev()
1858                        .skip(self.char_len() - range.end)
1859                        .take(range.len()),
1860                );
1861                // SAFETY: char_len is accurate
1862                unsafe { Self::new_with_char_len(out, char_len) }
1863            }
1864        }
1865    }
1866
1867    fn do_stepped_slice(&self, range: Range<usize>, step: usize) -> Self::Sliced {
1868        match self.as_str_kind() {
1869            PyKindStr::Ascii(s) => s[range]
1870                .as_slice()
1871                .iter()
1872                .copied()
1873                .step_by(step)
1874                .collect::<AsciiString>()
1875                .into(),
1876            PyKindStr::Utf8(s) => {
1877                let char_len = (range.len() / step) + 1;
1878                let mut out = String::with_capacity(2 * char_len);
1879                out.extend(s.chars().skip(range.start).take(range.len()).step_by(step));
1880                // SAFETY: char_len is accurate
1881                unsafe { Self::new_with_char_len(out, char_len) }
1882            }
1883            PyKindStr::Wtf8(w) => {
1884                let char_len = (range.len() / step) + 1;
1885                let mut out = Wtf8Buf::with_capacity(2 * char_len);
1886                out.extend(
1887                    w.code_points()
1888                        .skip(range.start)
1889                        .take(range.len())
1890                        .step_by(step),
1891                );
1892                // SAFETY: char_len is accurate
1893                unsafe { Self::new_with_char_len(out, char_len) }
1894            }
1895        }
1896    }
1897
1898    fn do_stepped_slice_reverse(&self, range: Range<usize>, step: usize) -> Self::Sliced {
1899        match self.as_str_kind() {
1900            PyKindStr::Ascii(s) => s[range]
1901                .chars()
1902                .rev()
1903                .step_by(step)
1904                .collect::<AsciiString>()
1905                .into(),
1906            PyKindStr::Utf8(s) => {
1907                let char_len = (range.len() / step) + 1;
1908                // not ascii, so the codepoints have to be at least 2 bytes each
1909                let mut out = String::with_capacity(2 * char_len);
1910                out.extend(
1911                    s.chars()
1912                        .rev()
1913                        .skip(self.char_len() - range.end)
1914                        .take(range.len())
1915                        .step_by(step),
1916                );
1917                // SAFETY: char_len is accurate
1918                unsafe { Self::new_with_char_len(out, char_len) }
1919            }
1920            PyKindStr::Wtf8(w) => {
1921                let char_len = (range.len() / step) + 1;
1922                // not ascii, so the codepoints have to be at least 2 bytes each
1923                let mut out = Wtf8Buf::with_capacity(2 * char_len);
1924                out.extend(
1925                    w.code_points()
1926                        .rev()
1927                        .skip(self.char_len() - range.end)
1928                        .take(range.len())
1929                        .step_by(step),
1930                );
1931                // SAFETY: char_len is accurate
1932                unsafe { Self::new_with_char_len(out, char_len) }
1933            }
1934        }
1935    }
1936
1937    fn empty() -> Self::Sliced {
1938        Self::default()
1939    }
1940
1941    fn len(&self) -> usize {
1942        self.char_len()
1943    }
1944}
1945
1946impl AsRef<str> for PyRefExact<PyStr> {
1947    #[track_caller]
1948    fn as_ref(&self) -> &str {
1949        self.to_str().expect("str has surrogates")
1950    }
1951}
1952
1953impl AsRef<str> for PyExact<PyStr> {
1954    #[track_caller]
1955    fn as_ref(&self) -> &str {
1956        self.to_str().expect("str has surrogates")
1957    }
1958}
1959
1960impl AsRef<Wtf8> for PyRefExact<PyStr> {
1961    fn as_ref(&self) -> &Wtf8 {
1962        self.as_wtf8()
1963    }
1964}
1965
1966impl AsRef<Wtf8> for PyExact<PyStr> {
1967    fn as_ref(&self) -> &Wtf8 {
1968        self.as_wtf8()
1969    }
1970}
1971
1972impl AnyStrWrapper<Wtf8> for PyStrRef {
1973    fn as_ref(&self) -> Option<&Wtf8> {
1974        Some(self.as_wtf8())
1975    }
1976
1977    fn is_empty(&self) -> bool {
1978        self.data.is_empty()
1979    }
1980}
1981
1982impl AnyStrWrapper<str> for PyStrRef {
1983    fn as_ref(&self) -> Option<&str> {
1984        self.data.as_str()
1985    }
1986
1987    fn is_empty(&self) -> bool {
1988        self.data.is_empty()
1989    }
1990}
1991
1992impl AnyStrWrapper<AsciiStr> for PyStrRef {
1993    fn as_ref(&self) -> Option<&AsciiStr> {
1994        self.data.as_ascii()
1995    }
1996
1997    fn is_empty(&self) -> bool {
1998        self.data.is_empty()
1999    }
2000}
2001
2002#[repr(transparent)]
2003#[derive(Debug)]
2004pub struct PyUtf8Str(PyStr);
2005
2006impl fmt::Display for PyUtf8Str {
2007    #[inline]
2008    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
2009        self.0.fmt(f)
2010    }
2011}
2012
2013impl MaybeTraverse for PyUtf8Str {
2014    const HAS_TRAVERSE: bool = true;
2015    const HAS_CLEAR: bool = false;
2016
2017    fn try_traverse(&self, traverse_fn: &mut TraverseFn<'_>) {
2018        self.0.try_traverse(traverse_fn);
2019    }
2020
2021    fn try_clear(&mut self, _out: &mut Vec<PyObjectRef>) {
2022        // No clear needed for PyUtf8Str
2023    }
2024}
2025
2026impl PyPayload for PyUtf8Str {
2027    #[inline]
2028    fn class(ctx: &Context) -> &'static Py<PyType> {
2029        ctx.types.str_type
2030    }
2031
2032    const PAYLOAD_TYPE_ID: core::any::TypeId = core::any::TypeId::of::<PyStr>();
2033
2034    unsafe fn validate_downcastable_from(obj: &PyObject) -> bool {
2035        // SAFETY: we know the object is a PyStr in this context
2036        let wtf8 = unsafe { obj.downcast_unchecked_ref::<PyStr>() };
2037        wtf8.is_utf8()
2038    }
2039
2040    fn try_downcast_from(obj: &PyObject, vm: &VirtualMachine) -> PyResult<()> {
2041        let str = obj.try_downcast_ref::<PyStr>(vm)?;
2042        str.ensure_valid_utf8(vm)
2043    }
2044}
2045
2046impl<'a> From<&'a AsciiStr> for PyUtf8Str {
2047    fn from(s: &'a AsciiStr) -> Self {
2048        s.to_owned().into()
2049    }
2050}
2051
2052impl From<AsciiString> for PyUtf8Str {
2053    fn from(s: AsciiString) -> Self {
2054        s.into_boxed_ascii_str().into()
2055    }
2056}
2057
2058impl From<Box<AsciiStr>> for PyUtf8Str {
2059    fn from(s: Box<AsciiStr>) -> Self {
2060        let data = StrData::from(s);
2061        unsafe { Self::from_str_data_unchecked(data) }
2062    }
2063}
2064
2065impl From<AsciiChar> for PyUtf8Str {
2066    fn from(ch: AsciiChar) -> Self {
2067        AsciiString::from(ch).into()
2068    }
2069}
2070
2071impl<'a> From<&'a str> for PyUtf8Str {
2072    fn from(s: &'a str) -> Self {
2073        s.to_owned().into()
2074    }
2075}
2076
2077impl From<String> for PyUtf8Str {
2078    fn from(s: String) -> Self {
2079        s.into_boxed_str().into()
2080    }
2081}
2082
2083impl From<char> for PyUtf8Str {
2084    fn from(ch: char) -> Self {
2085        let data = StrData::from(ch);
2086        unsafe { Self::from_str_data_unchecked(data) }
2087    }
2088}
2089
2090impl<'a> From<alloc::borrow::Cow<'a, str>> for PyUtf8Str {
2091    fn from(s: alloc::borrow::Cow<'a, str>) -> Self {
2092        s.into_owned().into()
2093    }
2094}
2095
2096impl From<Box<str>> for PyUtf8Str {
2097    #[inline]
2098    fn from(value: Box<str>) -> Self {
2099        let data = StrData::from(value);
2100        unsafe { Self::from_str_data_unchecked(data) }
2101    }
2102}
2103
2104impl AsRef<Wtf8> for PyUtf8Str {
2105    #[inline]
2106    fn as_ref(&self) -> &Wtf8 {
2107        self.0.as_wtf8()
2108    }
2109}
2110
2111impl AsRef<str> for PyUtf8Str {
2112    #[inline]
2113    fn as_ref(&self) -> &str {
2114        self.as_str()
2115    }
2116}
2117
2118impl PyUtf8Str {
2119    // Create a new `PyUtf8Str` from `StrData` without validation.
2120    // This function must be only used in this module to create conversions.
2121    // # Safety: must be called with a valid UTF-8 string data.
2122    unsafe fn from_str_data_unchecked(data: StrData) -> Self {
2123        Self(PyStr::from(data))
2124    }
2125
2126    /// Returns the underlying WTF-8 slice (always valid UTF-8 for this type).
2127    #[inline]
2128    pub fn as_wtf8(&self) -> &Wtf8 {
2129        self.0.as_wtf8()
2130    }
2131
2132    /// Returns the underlying string slice.
2133    pub fn as_str(&self) -> &str {
2134        debug_assert!(
2135            self.0.is_utf8(),
2136            "PyUtf8Str invariant violated: inner string is not valid UTF-8"
2137        );
2138        // Safety: This is safe because the type invariant guarantees UTF-8 validity.
2139        unsafe { self.0.to_str().unwrap_unchecked() }
2140    }
2141
2142    #[inline]
2143    pub fn as_bytes(&self) -> &[u8] {
2144        self.as_str().as_bytes()
2145    }
2146
2147    #[inline]
2148    pub fn byte_len(&self) -> usize {
2149        self.0.byte_len()
2150    }
2151
2152    #[inline]
2153    pub fn is_empty(&self) -> bool {
2154        self.0.is_empty()
2155    }
2156
2157    #[inline]
2158    pub fn char_len(&self) -> usize {
2159        self.0.char_len()
2160    }
2161}
2162
2163impl Py<PyUtf8Str> {
2164    /// Upcast to PyStr.
2165    pub fn as_pystr(&self) -> &Py<PyStr> {
2166        unsafe {
2167            // Safety: PyUtf8Str is a wrapper around PyStr, so this cast is safe.
2168            &*(self as *const Self as *const Py<PyStr>)
2169        }
2170    }
2171
2172    /// Returns the underlying `&str`.
2173    #[inline]
2174    pub fn as_str(&self) -> &str {
2175        self.as_pystr().to_str().unwrap_or_else(|| {
2176            debug_assert!(false, "PyUtf8Str invariant violated");
2177            // Safety: PyUtf8Str guarantees valid UTF-8
2178            unsafe { core::hint::unreachable_unchecked() }
2179        })
2180    }
2181}
2182
2183impl PyRef<PyUtf8Str> {
2184    /// Convert to PyStrRef. Safe because PyUtf8Str is a subtype of PyStr.
2185    pub fn into_wtf8(self) -> PyStrRef {
2186        unsafe { mem::transmute::<Self, PyStrRef>(self) }
2187    }
2188}
2189
2190impl From<PyRef<PyUtf8Str>> for PyRef<PyStr> {
2191    fn from(s: PyRef<PyUtf8Str>) -> Self {
2192        s.into_wtf8()
2193    }
2194}
2195
2196impl PartialEq for PyUtf8Str {
2197    fn eq(&self, other: &Self) -> bool {
2198        self.as_str() == other.as_str()
2199    }
2200}
2201impl Eq for PyUtf8Str {}
2202
2203impl AnyStrContainer<str> for String {
2204    fn new() -> Self {
2205        Self::new()
2206    }
2207
2208    fn with_capacity(capacity: usize) -> Self {
2209        Self::with_capacity(capacity)
2210    }
2211
2212    fn push_str(&mut self, other: &str) {
2213        Self::push_str(self, other)
2214    }
2215}
2216
2217impl anystr::AnyChar for char {
2218    fn is_lowercase(self) -> bool {
2219        self.is_lowercase()
2220    }
2221
2222    fn is_uppercase(self) -> bool {
2223        self.is_uppercase()
2224    }
2225
2226    fn bytes_len(self) -> usize {
2227        self.len_utf8()
2228    }
2229}
2230
2231impl AnyStr for str {
2232    type Char = char;
2233    type Container = String;
2234
2235    fn to_container(&self) -> Self::Container {
2236        self.to_owned()
2237    }
2238
2239    fn as_bytes(&self) -> &[u8] {
2240        self.as_bytes()
2241    }
2242
2243    fn elements(&self) -> impl Iterator<Item = char> {
2244        Self::chars(self)
2245    }
2246
2247    fn get_bytes(&self, range: core::ops::Range<usize>) -> &Self {
2248        &self[range]
2249    }
2250
2251    fn get_chars(&self, range: core::ops::Range<usize>) -> &Self {
2252        rustpython_common::str::get_chars(self, range)
2253    }
2254
2255    fn is_empty(&self) -> bool {
2256        Self::is_empty(self)
2257    }
2258
2259    fn bytes_len(&self) -> usize {
2260        Self::len(self)
2261    }
2262
2263    fn py_split_whitespace<F>(&self, maxsplit: isize, convert: F) -> Vec<PyObjectRef>
2264    where
2265        F: Fn(&Self) -> PyObjectRef,
2266    {
2267        // CPython split_whitespace
2268        let mut splits = Vec::new();
2269        let mut last_offset = 0;
2270        let mut count = maxsplit;
2271        for (offset, _) in self.match_indices(|c: char| c.is_ascii_whitespace() || c == '\x0b') {
2272            if last_offset == offset {
2273                last_offset += 1;
2274                continue;
2275            }
2276            if count == 0 {
2277                break;
2278            }
2279            splits.push(convert(&self[last_offset..offset]));
2280            last_offset = offset + 1;
2281            count -= 1;
2282        }
2283        if last_offset != self.len() {
2284            splits.push(convert(&self[last_offset..]));
2285        }
2286        splits
2287    }
2288
2289    fn py_rsplit_whitespace<F>(&self, maxsplit: isize, convert: F) -> Vec<PyObjectRef>
2290    where
2291        F: Fn(&Self) -> PyObjectRef,
2292    {
2293        // CPython rsplit_whitespace
2294        let mut splits = Vec::new();
2295        let mut last_offset = self.len();
2296        let mut count = maxsplit;
2297        for (offset, _) in self.rmatch_indices(|c: char| c.is_ascii_whitespace() || c == '\x0b') {
2298            if last_offset == offset + 1 {
2299                last_offset -= 1;
2300                continue;
2301            }
2302            if count == 0 {
2303                break;
2304            }
2305            splits.push(convert(&self[offset + 1..last_offset]));
2306            last_offset = offset;
2307            count -= 1;
2308        }
2309        if last_offset != 0 {
2310            splits.push(convert(&self[..last_offset]));
2311        }
2312        splits
2313    }
2314}
2315
2316impl AnyStrContainer<Wtf8> for Wtf8Buf {
2317    fn new() -> Self {
2318        Self::new()
2319    }
2320
2321    fn with_capacity(capacity: usize) -> Self {
2322        Self::with_capacity(capacity)
2323    }
2324
2325    fn push_str(&mut self, other: &Wtf8) {
2326        self.push_wtf8(other)
2327    }
2328}
2329
2330impl anystr::AnyChar for CodePoint {
2331    fn is_lowercase(self) -> bool {
2332        self.is_char_and(char::is_lowercase)
2333    }
2334    fn is_uppercase(self) -> bool {
2335        self.is_char_and(char::is_uppercase)
2336    }
2337    fn bytes_len(self) -> usize {
2338        self.len_wtf8()
2339    }
2340}
2341
2342impl AnyStr for Wtf8 {
2343    type Char = CodePoint;
2344    type Container = Wtf8Buf;
2345
2346    fn to_container(&self) -> Self::Container {
2347        self.to_owned()
2348    }
2349
2350    fn as_bytes(&self) -> &[u8] {
2351        self.as_bytes()
2352    }
2353
2354    fn elements(&self) -> impl Iterator<Item = Self::Char> {
2355        self.code_points()
2356    }
2357
2358    fn get_bytes(&self, range: core::ops::Range<usize>) -> &Self {
2359        &self[range]
2360    }
2361
2362    fn get_chars(&self, range: core::ops::Range<usize>) -> &Self {
2363        rustpython_common::str::get_codepoints(self, range)
2364    }
2365
2366    fn bytes_len(&self) -> usize {
2367        self.len()
2368    }
2369
2370    fn is_empty(&self) -> bool {
2371        self.is_empty()
2372    }
2373
2374    fn py_split_whitespace<F>(&self, maxsplit: isize, convert: F) -> Vec<PyObjectRef>
2375    where
2376        F: Fn(&Self) -> PyObjectRef,
2377    {
2378        // CPython split_whitespace
2379        let mut splits = Vec::new();
2380        let mut last_offset = 0;
2381        let mut count = maxsplit;
2382        for (offset, _) in self
2383            .code_point_indices()
2384            .filter(|(_, c)| c.is_char_and(|c| c.is_ascii_whitespace() || c == '\x0b'))
2385        {
2386            if last_offset == offset {
2387                last_offset += 1;
2388                continue;
2389            }
2390            if count == 0 {
2391                break;
2392            }
2393            splits.push(convert(&self[last_offset..offset]));
2394            last_offset = offset + 1;
2395            count -= 1;
2396        }
2397        if last_offset != self.len() {
2398            splits.push(convert(&self[last_offset..]));
2399        }
2400        splits
2401    }
2402
2403    fn py_rsplit_whitespace<F>(&self, maxsplit: isize, convert: F) -> Vec<PyObjectRef>
2404    where
2405        F: Fn(&Self) -> PyObjectRef,
2406    {
2407        // CPython rsplit_whitespace
2408        let mut splits = Vec::new();
2409        let mut last_offset = self.len();
2410        let mut count = maxsplit;
2411        for (offset, _) in self
2412            .code_point_indices()
2413            .rev()
2414            .filter(|(_, c)| c.is_char_and(|c| c.is_ascii_whitespace() || c == '\x0b'))
2415        {
2416            if last_offset == offset + 1 {
2417                last_offset -= 1;
2418                continue;
2419            }
2420            if count == 0 {
2421                break;
2422            }
2423            splits.push(convert(&self[offset + 1..last_offset]));
2424            last_offset = offset;
2425            count -= 1;
2426        }
2427        if last_offset != 0 {
2428            splits.push(convert(&self[..last_offset]));
2429        }
2430        splits
2431    }
2432}
2433
2434impl AnyStrContainer<AsciiStr> for AsciiString {
2435    fn new() -> Self {
2436        Self::new()
2437    }
2438
2439    fn with_capacity(capacity: usize) -> Self {
2440        Self::with_capacity(capacity)
2441    }
2442
2443    fn push_str(&mut self, other: &AsciiStr) {
2444        Self::push_str(self, other)
2445    }
2446}
2447
2448impl anystr::AnyChar for ascii::AsciiChar {
2449    fn is_lowercase(self) -> bool {
2450        self.is_lowercase()
2451    }
2452
2453    fn is_uppercase(self) -> bool {
2454        self.is_uppercase()
2455    }
2456
2457    fn bytes_len(self) -> usize {
2458        1
2459    }
2460}
2461
2462const ASCII_WHITESPACES: [u8; 6] = [0x20, 0x09, 0x0a, 0x0c, 0x0d, 0x0b];
2463
2464impl AnyStr for AsciiStr {
2465    type Char = AsciiChar;
2466    type Container = AsciiString;
2467
2468    fn to_container(&self) -> Self::Container {
2469        self.to_ascii_string()
2470    }
2471
2472    fn as_bytes(&self) -> &[u8] {
2473        self.as_bytes()
2474    }
2475
2476    fn elements(&self) -> impl Iterator<Item = Self::Char> {
2477        self.chars()
2478    }
2479
2480    fn get_bytes(&self, range: core::ops::Range<usize>) -> &Self {
2481        &self[range]
2482    }
2483
2484    fn get_chars(&self, range: core::ops::Range<usize>) -> &Self {
2485        &self[range]
2486    }
2487
2488    fn bytes_len(&self) -> usize {
2489        self.len()
2490    }
2491
2492    fn is_empty(&self) -> bool {
2493        self.is_empty()
2494    }
2495
2496    fn py_split_whitespace<F>(&self, maxsplit: isize, convert: F) -> Vec<PyObjectRef>
2497    where
2498        F: Fn(&Self) -> PyObjectRef,
2499    {
2500        let mut splits = Vec::new();
2501        let mut count = maxsplit;
2502        let mut haystack = self;
2503        while let Some(offset) = haystack.as_bytes().find_byteset(ASCII_WHITESPACES) {
2504            if offset != 0 {
2505                if count == 0 {
2506                    break;
2507                }
2508                splits.push(convert(&haystack[..offset]));
2509                count -= 1;
2510            }
2511            haystack = &haystack[offset + 1..];
2512        }
2513        if !haystack.is_empty() {
2514            splits.push(convert(haystack));
2515        }
2516        splits
2517    }
2518
2519    fn py_rsplit_whitespace<F>(&self, maxsplit: isize, convert: F) -> Vec<PyObjectRef>
2520    where
2521        F: Fn(&Self) -> PyObjectRef,
2522    {
2523        // CPython rsplit_whitespace
2524        let mut splits = Vec::new();
2525        let mut count = maxsplit;
2526        let mut haystack = self;
2527        while let Some(offset) = haystack.as_bytes().rfind_byteset(ASCII_WHITESPACES) {
2528            if offset + 1 != haystack.len() {
2529                if count == 0 {
2530                    break;
2531                }
2532                splits.push(convert(&haystack[offset + 1..]));
2533                count -= 1;
2534            }
2535            haystack = &haystack[..offset];
2536        }
2537        if !haystack.is_empty() {
2538            splits.push(convert(haystack));
2539        }
2540        splits
2541    }
2542}
2543
2544/// The unique reference of interned PyStr
2545/// Always intended to be used as a static reference
2546pub type PyStrInterned = PyInterned<PyStr>;
2547
2548impl PyStrInterned {
2549    #[inline]
2550    pub fn to_exact(&'static self) -> PyRefExact<PyStr> {
2551        unsafe { PyRefExact::new_unchecked(self.to_owned()) }
2552    }
2553}
2554
2555impl core::fmt::Display for PyStrInterned {
2556    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
2557        self.data.fmt(f)
2558    }
2559}
2560
2561impl AsRef<str> for PyStrInterned {
2562    #[inline(always)]
2563    fn as_ref(&self) -> &str {
2564        self.to_str()
2565            .expect("Interned PyStr should always be valid UTF-8")
2566    }
2567}
2568
2569/// Interned PyUtf8Str — guaranteed UTF-8 at type level.
2570/// Same layout as `PyStrInterned` due to `#[repr(transparent)]` on both
2571/// `PyInterned<T>` and `PyUtf8Str`.
2572pub type PyUtf8StrInterned = PyInterned<PyUtf8Str>;
2573
2574impl PyUtf8StrInterned {
2575    /// Returns the underlying `&str`.
2576    #[inline]
2577    pub fn as_str(&self) -> &str {
2578        Py::<PyUtf8Str>::as_str(self)
2579    }
2580
2581    /// View as `PyStrInterned` (widening: UTF-8 → WTF-8).
2582    #[inline]
2583    pub fn as_interned_str(&self) -> &PyStrInterned {
2584        // Safety: PyUtf8Str is #[repr(transparent)] over PyStr,
2585        // so PyInterned<PyUtf8Str> has the same layout as PyInterned<PyStr>.
2586        unsafe { &*(self as *const Self as *const PyStrInterned) }
2587    }
2588
2589    /// Narrow a `PyStrInterned` to `PyUtf8StrInterned`.
2590    ///
2591    /// # Safety
2592    /// The caller must ensure that the interned string is valid UTF-8.
2593    #[inline]
2594    pub unsafe fn from_str_interned_unchecked(s: &PyStrInterned) -> &Self {
2595        unsafe { &*(s as *const PyStrInterned as *const Self) }
2596    }
2597}
2598
2599impl core::fmt::Display for PyUtf8StrInterned {
2600    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
2601        f.write_str(self.as_str())
2602    }
2603}
2604
2605impl AsRef<str> for PyUtf8StrInterned {
2606    #[inline(always)]
2607    fn as_ref(&self) -> &str {
2608        self.as_str()
2609    }
2610}
2611
#[cfg(test)]
mod tests {
    use super::*;
    use crate::Interpreter;
    use rustpython_common::wtf8::Wtf8Buf;

    /// `title()` produces the expected title-cased text for each input.
    #[test]
    fn str_title() {
        // Each case is `(expected_title, input)`.
        let cases = [
            (" Hello ", " hello "),
            ("Hello ", "hello "),
            ("Hello ", "Hello "),
            ("Format This As Title String", "fOrMaT thIs aS titLe String"),
            ("Format,This-As*Title;String", "fOrMaT,thIs-aS*titLe;String"),
            ("Getint", "getInt"),
            // spell-checker:disable-next-line
            ("Greek Ωppercases ...", "greek ωppercases ..."),
            // spell-checker:disable-next-line
            ("Greek ῼitlecases ...", "greek ῳitlecases ..."),
        ];
        for (expected, input) in cases {
            let titled = PyStr::from(input).title();
            assert_eq!(titled.as_str(), Ok(expected));
        }
    }

    /// `istitle()` accepts title-cased strings and rejects everything else.
    #[test]
    fn str_istitle() {
        let title_cased = [
            "A",
            "A Titlecased Line",
            "A\nTitlecased Line",
            "A Titlecased, Line",
            // spell-checker:disable-next-line
            "Greek Ωppercases ...",
            // spell-checker:disable-next-line
            "Greek ῼitlecases ...",
        ];
        for s in title_cased {
            assert!(PyStr::from(s).istitle());
        }

        let not_title_cased = [
            "",
            "a",
            "\n",
            "Not a capitalized String",
            "Not\ta Titlecase String",
            "Not--a Titlecase String",
            "NOT",
        ];
        for s in not_title_cased {
            assert!(!PyStr::from(s).istitle());
        }
    }

    /// A table built by `maketrans` drives `translate`; a non-table argument
    /// makes `translate` fail with `TypeError`.
    #[test]
    fn str_maketrans_and_translate() {
        Interpreter::without_stdlib(Default::default()).enter(|vm| {
            // 'a' maps to a string, 'b' to None, 'c' to a multi-char string.
            let mapping = vm.ctx.new_dict();
            mapping
                .set_item("a", vm.ctx.new_str("🎅").into(), vm)
                .unwrap();
            mapping.set_item("b", vm.ctx.none(), vm).unwrap();
            mapping
                .set_item("c", vm.ctx.new_str(ascii!("xda")).into(), vm)
                .unwrap();

            let trans_table =
                PyStr::maketrans(mapping.into(), OptionalArg::Missing, OptionalArg::Missing, vm)
                    .unwrap();

            let text = PyStr::from("abc");
            let translated = text.translate(trans_table, vm).unwrap();
            assert_eq!(translated, Wtf8Buf::from("🎅xda"));

            let failure = text.translate(vm.ctx.new_int(3).into(), vm);
            assert_eq!("TypeError", &*failure.unwrap_err().class().name(),);
        })
    }
}
rustpython_vm/builtins/str.rs

rustpython_vm/builtins/
str.rs