rustpython_vm/
bytesinner.rs

1use crate::{
2    anystr::{self, AnyStr, AnyStrContainer, AnyStrWrapper},
3    builtins::{
4        pystr, PyBaseExceptionRef, PyByteArray, PyBytes, PyBytesRef, PyInt, PyIntRef, PyStr,
5        PyStrRef, PyTypeRef,
6    },
7    byte::bytes_from_object,
8    cformat::cformat_bytes,
9    common::hash,
10    function::{ArgIterable, Either, OptionalArg, OptionalOption, PyComparisonValue},
11    identifier,
12    literal::escape::Escape,
13    protocol::PyBuffer,
14    sequence::{SequenceExt, SequenceMutExt},
15    types::PyComparisonOp,
16    AsObject, PyObject, PyObjectRef, PyPayload, PyResult, TryFromBorrowedObject, VirtualMachine,
17};
18use bstr::ByteSlice;
19use itertools::Itertools;
20use malachite_bigint::BigInt;
21use num_traits::ToPrimitive;
22
/// Owned byte storage whose methods implement Python-style bytes operations.
#[derive(Debug, Default, Clone)]
pub struct PyBytesInner {
    /// The raw byte contents.
    pub(super) elements: Vec<u8>,
}
27
28impl From<Vec<u8>> for PyBytesInner {
29    fn from(elements: Vec<u8>) -> PyBytesInner {
30        Self { elements }
31    }
32}
33
34impl<'a> TryFromBorrowedObject<'a> for PyBytesInner {
35    fn try_from_borrowed_object(vm: &VirtualMachine, obj: &'a PyObject) -> PyResult<Self> {
36        bytes_from_object(vm, obj).map(Self::from)
37    }
38}
39
/// Constructor arguments: an optional `source` object plus an optional
/// `encoding` and `errors` pair. Each may be passed positionally or by keyword.
#[derive(FromArgs)]
pub struct ByteInnerNewOptions {
    /// Object to build the bytes from (an int, a str, or another object).
    #[pyarg(any, optional)]
    pub source: OptionalArg<PyObjectRef>,
    /// Encoding name; only accepted together with a str `source`.
    #[pyarg(any, optional)]
    pub encoding: OptionalArg<PyStrRef>,
    /// Error-handler name forwarded to the string encoder.
    #[pyarg(any, optional)]
    pub errors: OptionalArg<PyStrRef>,
}
49
50impl ByteInnerNewOptions {
51    fn get_value_from_string(
52        s: PyStrRef,
53        encoding: PyStrRef,
54        errors: OptionalArg<PyStrRef>,
55        vm: &VirtualMachine,
56    ) -> PyResult<PyBytesInner> {
57        let bytes = pystr::encode_string(s, Some(encoding), errors.into_option(), vm)?;
58        Ok(bytes.as_bytes().to_vec().into())
59    }
60
61    fn get_value_from_source(source: PyObjectRef, vm: &VirtualMachine) -> PyResult<PyBytesInner> {
62        bytes_from_object(vm, &source).map(|x| x.into())
63    }
64
65    fn get_value_from_size(size: PyIntRef, vm: &VirtualMachine) -> PyResult<PyBytesInner> {
66        let size = size.as_bigint().to_isize().ok_or_else(|| {
67            vm.new_overflow_error("cannot fit 'int' into an index-sized integer".to_owned())
68        })?;
69        let size = if size < 0 {
70            return Err(vm.new_value_error("negative count".to_owned()));
71        } else {
72            size as usize
73        };
74        Ok(vec![0; size].into())
75    }
76
    /// Builds a bytes object of type `cls` from the constructor arguments.
    ///
    /// When only `source` is given, two shortcuts are tried first:
    /// an exact `PyBytes` source is returned as-is when `cls` is exactly `PyBytes`,
    /// and a `__bytes__` method on the source is called if present. In every other
    /// case the work is delegated to [`Self::get_bytearray_inner`].
    pub fn get_bytes(self, cls: PyTypeRef, vm: &VirtualMachine) -> PyResult<PyBytesRef> {
        let inner = match (&self.source, &self.encoding, &self.errors) {
            (OptionalArg::Present(obj), OptionalArg::Missing, OptionalArg::Missing) => {
                let obj = obj.clone();
                // Building an exact bytes from an exact bytes: return the same
                // object instead of copying it.
                let obj = if cls.is(PyBytes::class(&vm.ctx)) {
                    match obj.downcast_exact::<PyBytes>(vm) {
                        Ok(b) => return Ok(b.into_pyref()),
                        Err(obj) => obj,
                    }
                } else {
                    obj
                };

                if let Some(bytes_method) = vm.get_method(obj, identifier!(vm, __bytes__)) {
                    // Build the value from the object's `__bytes__` method.
                    // If it returns a bytes object and we are not constructing a
                    // subclass, that object is returned directly.
                    let bytes = bytes_method?.call((), vm)?;
                    let bytes = if cls.is(PyBytes::class(&vm.ctx)) {
                        match bytes.downcast::<PyBytes>() {
                            Ok(b) => return Ok(b),
                            Err(bytes) => bytes,
                        }
                    } else {
                        bytes
                    };
                    Some(PyBytesInner::try_from_borrowed_object(vm, &bytes))
                } else {
                    None
                }
            }
            _ => None,
        }
        // No shortcut applied: fall back to the general argument handling.
        .unwrap_or_else(|| self.get_bytearray_inner(vm))?;
        PyBytes::from(inner).into_ref_with_type(vm, cls)
    }
113
    /// Resolves the `(source, encoding, errors)` combination into byte contents.
    ///
    /// * `source` only: an int yields that many zero bytes, a str is rejected,
    ///   anything else goes through `get_value_from_source`.
    /// * `source` + `encoding`: `source` must be a str, which is then encoded.
    /// * no arguments: empty contents.
    /// * any other combination is an argument error.
    ///
    /// # Errors
    /// Argument-combination problems are reported as `TypeError` with the
    /// messages below; errors raised by the helper functions propagate as-is.
    pub fn get_bytearray_inner(self, vm: &VirtualMachine) -> PyResult<PyBytesInner> {
        const STRING_WITHOUT_ENCODING: &str = "string argument without an encoding";
        const ENCODING_WITHOUT_STRING: &str = "encoding without a string argument";

        // The match evaluates to `Result<PyBytesInner, &str>`; helper failures
        // leave the function early through `?` and never reach `map_err`.
        match (self.source, self.encoding, self.errors) {
            (OptionalArg::Present(obj), OptionalArg::Missing, OptionalArg::Missing) => {
                match_class!(match obj {
                    i @ PyInt => {
                        Ok(Self::get_value_from_size(i, vm)?)
                    }
                    _s @ PyStr => Err(STRING_WITHOUT_ENCODING),
                    obj => {
                        Ok(Self::get_value_from_source(obj, vm)?)
                    }
                })
            }
            (OptionalArg::Present(obj), OptionalArg::Present(encoding), errors) => {
                if let Ok(s) = obj.downcast::<PyStr>() {
                    Ok(Self::get_value_from_string(s, encoding, errors, vm)?)
                } else {
                    Err(ENCODING_WITHOUT_STRING)
                }
            }
            (OptionalArg::Missing, OptionalArg::Missing, OptionalArg::Missing) => {
                Ok(PyBytesInner::default())
            }
            (OptionalArg::Missing, OptionalArg::Present(_), _) => Err(ENCODING_WITHOUT_STRING),
            (OptionalArg::Missing, _, OptionalArg::Present(_)) => {
                Err("errors without a string argument")
            }
            (OptionalArg::Present(_), OptionalArg::Missing, OptionalArg::Present(_)) => {
                Err(STRING_WITHOUT_ENCODING)
            }
        }
        .map_err(|e| vm.new_type_error(e.to_owned()))
    }
150}
151
/// Positional arguments shared by the search methods (`count`, `find`):
/// the thing to look for and an optional `[start, end)` window.
#[derive(FromArgs)]
pub struct ByteInnerFindOptions {
    /// Either a byte sequence or a single integer byte value.
    #[pyarg(positional)]
    sub: Either<PyBytesInner, PyIntRef>,
    /// Optional start index.
    #[pyarg(positional, default)]
    start: Option<PyIntRef>,
    /// Optional end index.
    #[pyarg(positional, default)]
    end: Option<PyIntRef>,
}
161
162impl ByteInnerFindOptions {
163    pub fn get_value(
164        self,
165        len: usize,
166        vm: &VirtualMachine,
167    ) -> PyResult<(Vec<u8>, std::ops::Range<usize>)> {
168        let sub = match self.sub {
169            Either::A(v) => v.elements.to_vec(),
170            Either::B(int) => vec![int.as_bigint().byte_or(vm)?],
171        };
172        let range = anystr::adjust_indices(self.start, self.end, len);
173        Ok((sub, range))
174    }
175}
176
/// Positional arguments of the padding methods (`center`, `ljust`, `rjust`).
#[derive(FromArgs)]
pub struct ByteInnerPaddingOptions {
    /// Requested total width.
    #[pyarg(positional)]
    width: isize,
    /// Optional fill byte; must be a bytes-like object of length 1.
    #[pyarg(positional, optional)]
    fillchar: OptionalArg<PyObjectRef>,
}
184
185impl ByteInnerPaddingOptions {
186    fn get_value(self, fn_name: &str, vm: &VirtualMachine) -> PyResult<(isize, u8)> {
187        let fillchar = if let OptionalArg::Present(v) = self.fillchar {
188            try_as_bytes(v.clone(), |bytes| bytes.iter().copied().exactly_one().ok())
189                .flatten()
190                .ok_or_else(|| {
191                    vm.new_type_error(format!(
192                        "{}() argument 2 must be a byte string of length 1, not {}",
193                        fn_name,
194                        v.class().name()
195                    ))
196                })?
197        } else {
198            b' ' // default is space
199        };
200
201        Ok((self.width, fillchar))
202    }
203}
204
/// Arguments of `translate`: a translation table and an optional set of bytes
/// to delete.
#[derive(FromArgs)]
pub struct ByteInnerTranslateOptions {
    /// Translation table, or `None` for the identity mapping.
    #[pyarg(positional)]
    table: Option<PyObjectRef>,
    /// Bytes to drop from the input before translating.
    #[pyarg(any, optional)]
    delete: OptionalArg<PyObjectRef>,
}
212
213impl ByteInnerTranslateOptions {
214    pub fn get_value(self, vm: &VirtualMachine) -> PyResult<(Vec<u8>, Vec<u8>)> {
215        let table = self.table.map_or_else(
216            || Ok((0..=255).collect::<Vec<u8>>()),
217            |v| {
218                let bytes = v
219                    .try_into_value::<PyBytesInner>(vm)
220                    .ok()
221                    .filter(|v| v.elements.len() == 256)
222                    .ok_or_else(|| {
223                        vm.new_value_error(
224                            "translation table must be 256 characters long".to_owned(),
225                        )
226                    })?;
227                Ok(bytes.elements.to_vec())
228            },
229        )?;
230
231        let delete = match self.delete {
232            OptionalArg::Present(byte) => {
233                let byte: PyBytesInner = byte.try_into_value(vm)?;
234                byte.elements
235            }
236            _ => vec![],
237        };
238
239        Ok((table, delete))
240    }
241}
242
/// Split arguments specialised to a `PyBytesInner` separator.
pub type ByteInnerSplitOptions = anystr::SplitArgs<PyBytesInner>;
244
245impl PyBytesInner {
    /// Borrows the contents as a byte slice.
    #[inline]
    pub fn as_bytes(&self) -> &[u8] {
        &self.elements
    }
250
    /// Creates the `OverflowError` raised when a repr would be too long.
    fn new_repr_overflow_error(vm: &VirtualMachine) -> PyBaseExceptionRef {
        vm.new_overflow_error("bytes object is too large to make repr".to_owned())
    }
254
255    pub fn repr_with_name(&self, class_name: &str, vm: &VirtualMachine) -> PyResult<String> {
256        const DECORATION_LEN: isize = 2 + 3; // 2 for (), 3 for b"" => bytearray(b"")
257        let escape = crate::literal::escape::AsciiEscape::new_repr(&self.elements);
258        let len = escape
259            .layout()
260            .len
261            .and_then(|len| (len as isize).checked_add(DECORATION_LEN + class_name.len() as isize))
262            .ok_or_else(|| Self::new_repr_overflow_error(vm))? as usize;
263        let mut buf = String::with_capacity(len);
264        buf.push_str(class_name);
265        buf.push('(');
266        escape.bytes_repr().write(&mut buf).unwrap();
267        buf.push(')');
268        debug_assert_eq!(buf.len(), len);
269        Ok(buf)
270    }
271
272    pub fn repr_bytes(&self, vm: &VirtualMachine) -> PyResult<String> {
273        let escape = crate::literal::escape::AsciiEscape::new_repr(&self.elements);
274        let len = 3 + escape
275            .layout()
276            .len
277            .ok_or_else(|| Self::new_repr_overflow_error(vm))?;
278        let mut buf = String::with_capacity(len);
279        escape.bytes_repr().write(&mut buf).unwrap();
280        debug_assert_eq!(buf.len(), len);
281        Ok(buf)
282    }
283
    /// Number of bytes stored.
    #[inline]
    pub fn len(&self) -> usize {
        self.elements.len()
    }
288
    /// Capacity of the underlying vector.
    #[inline]
    pub fn capacity(&self) -> usize {
        self.elements.capacity()
    }
293
    /// `true` when no bytes are stored.
    #[inline]
    pub fn is_empty(&self) -> bool {
        self.elements.is_empty()
    }
298
    /// Compares the contents with `other` under the comparison `op`.
    ///
    /// `other` is read through `try_bytes_like`; when that fails the error is
    /// discarded and the result is whatever `PyComparisonValue::from_option`
    /// produces for `None`.
    pub fn cmp(
        &self,
        other: &PyObject,
        op: PyComparisonOp,
        vm: &VirtualMachine,
    ) -> PyComparisonValue {
        // TODO: bytes can compare with any object implemented buffer protocol
        // but not memoryview, and not equal if compare with unicode str(PyStr)
        PyComparisonValue::from_option(
            other
                .try_bytes_like(vm, |other| op.eval_ord(self.elements.as_slice().cmp(other)))
                .ok(),
        )
    }
313
    /// Hashes the contents with the VM's hash secret.
    pub fn hash(&self, vm: &VirtualMachine) -> hash::PyHash {
        vm.state.hash_secret.hash_bytes(&self.elements)
    }
317
    /// Returns the result of `py_add` on the contents and `other`
    /// (presumably the concatenation; see `py_add`).
    pub fn add(&self, other: &[u8]) -> Vec<u8> {
        self.elements.py_add(other)
    }
321
322    pub fn contains(
323        &self,
324        needle: Either<PyBytesInner, PyIntRef>,
325        vm: &VirtualMachine,
326    ) -> PyResult<bool> {
327        Ok(match needle {
328            Either::A(byte) => self.elements.contains_str(byte.elements.as_slice()),
329            Either::B(int) => self.elements.contains(&int.as_bigint().byte_or(vm)?),
330        })
331    }
332
333    pub fn isalnum(&self) -> bool {
334        !self.elements.is_empty()
335            && self
336                .elements
337                .iter()
338                .all(|x| char::from(*x).is_alphanumeric())
339    }
340
341    pub fn isalpha(&self) -> bool {
342        !self.elements.is_empty() && self.elements.iter().all(|x| char::from(*x).is_alphabetic())
343    }
344
345    pub fn isascii(&self) -> bool {
346        self.elements.iter().all(|x| char::from(*x).is_ascii())
347    }
348
349    pub fn isdigit(&self) -> bool {
350        !self.elements.is_empty()
351            && self
352                .elements
353                .iter()
354                .all(|x| char::from(*x).is_ascii_digit())
355    }
356
357    pub fn islower(&self) -> bool {
358        self.elements
359            .py_iscase(char::is_lowercase, char::is_uppercase)
360    }
361
362    pub fn isupper(&self) -> bool {
363        self.elements
364            .py_iscase(char::is_uppercase, char::is_lowercase)
365    }
366
367    pub fn isspace(&self) -> bool {
368        !self.elements.is_empty()
369            && self
370                .elements
371                .iter()
372                .all(|x| char::from(*x).is_ascii_whitespace())
373    }
374
375    pub fn istitle(&self) -> bool {
376        if self.elements.is_empty() {
377            return false;
378        }
379
380        let mut iter = self.elements.iter().peekable();
381        let mut prev_cased = false;
382
383        while let Some(c) = iter.next() {
384            let current = char::from(*c);
385            let next = if let Some(k) = iter.peek() {
386                char::from(**k)
387            } else if current.is_uppercase() {
388                return !prev_cased;
389            } else {
390                return prev_cased;
391            };
392
393            let is_cased = current.to_uppercase().next().unwrap() != current
394                || current.to_lowercase().next().unwrap() != current;
395            if (is_cased && next.is_uppercase() && !prev_cased)
396                || (!is_cased && next.is_lowercase())
397            {
398                return false;
399            }
400
401            prev_cased = is_cased;
402        }
403
404        true
405    }
406
    /// Returns a copy with ASCII uppercase letters converted to lowercase.
    pub fn lower(&self) -> Vec<u8> {
        self.elements.to_ascii_lowercase()
    }
410
    /// Returns a copy with ASCII lowercase letters converted to uppercase.
    pub fn upper(&self) -> Vec<u8> {
        self.elements.to_ascii_uppercase()
    }
414
415    pub fn capitalize(&self) -> Vec<u8> {
416        let mut new: Vec<u8> = Vec::with_capacity(self.elements.len());
417        if let Some((first, second)) = self.elements.split_first() {
418            new.push(first.to_ascii_uppercase());
419            second.iter().for_each(|x| new.push(x.to_ascii_lowercase()));
420        }
421        new
422    }
423
424    pub fn swapcase(&self) -> Vec<u8> {
425        let mut new: Vec<u8> = Vec::with_capacity(self.elements.len());
426        for w in &self.elements {
427            match w {
428                65..=90 => new.push(w.to_ascii_lowercase()),
429                97..=122 => new.push(w.to_ascii_uppercase()),
430                x => new.push(*x),
431            }
432        }
433        new
434    }
435
    /// Formats the contents as hexadecimal text by delegating to `bytes_to_hex`,
    /// forwarding the optional separator and `bytes_per_sep` arguments.
    pub fn hex(
        &self,
        sep: OptionalArg<Either<PyStrRef, PyBytesRef>>,
        bytes_per_sep: OptionalArg<isize>,
        vm: &VirtualMachine,
    ) -> PyResult<String> {
        bytes_to_hex(self.elements.as_slice(), sep, bytes_per_sep, vm)
    }
444
445    pub fn fromhex(string: &str, vm: &VirtualMachine) -> PyResult<Vec<u8>> {
446        let mut iter = string.bytes().enumerate();
447        let mut bytes: Vec<u8> = Vec::with_capacity(string.len() / 2);
448        let i = loop {
449            let (i, b) = match iter.next() {
450                Some(val) => val,
451                None => {
452                    return Ok(bytes);
453                }
454            };
455
456            if is_py_ascii_whitespace(b) {
457                continue;
458            }
459
460            let top = match b {
461                b'0'..=b'9' => b - b'0',
462                b'a'..=b'f' => 10 + b - b'a',
463                b'A'..=b'F' => 10 + b - b'A',
464                _ => break i,
465            };
466
467            let (i, b) = match iter.next() {
468                Some(val) => val,
469                None => break i + 1,
470            };
471
472            let bot = match b {
473                b'0'..=b'9' => b - b'0',
474                b'a'..=b'f' => 10 + b - b'a',
475                b'A'..=b'F' => 10 + b - b'A',
476                _ => break i,
477            };
478
479            bytes.push((top << 4) + bot);
480        };
481
482        Err(vm.new_value_error(format!(
483            "non-hexadecimal number found in fromhex() arg at position {i}"
484        )))
485    }
486
487    #[inline]
488    fn _pad(
489        &self,
490        options: ByteInnerPaddingOptions,
491        pad: fn(&[u8], usize, u8, usize) -> Vec<u8>,
492        vm: &VirtualMachine,
493    ) -> PyResult<Vec<u8>> {
494        let (width, fillchar) = options.get_value("center", vm)?;
495        Ok(if self.len() as isize >= width {
496            Vec::from(&self.elements[..])
497        } else {
498            pad(&self.elements, width as usize, fillchar, self.len())
499        })
500    }
501
502    pub fn center(
503        &self,
504        options: ByteInnerPaddingOptions,
505        vm: &VirtualMachine,
506    ) -> PyResult<Vec<u8>> {
507        self._pad(options, AnyStr::py_center, vm)
508    }
509
510    pub fn ljust(
511        &self,
512        options: ByteInnerPaddingOptions,
513        vm: &VirtualMachine,
514    ) -> PyResult<Vec<u8>> {
515        self._pad(options, AnyStr::py_ljust, vm)
516    }
517
518    pub fn rjust(
519        &self,
520        options: ByteInnerPaddingOptions,
521        vm: &VirtualMachine,
522    ) -> PyResult<Vec<u8>> {
523        self._pad(options, AnyStr::py_rjust, vm)
524    }
525
    /// Counts occurrences of the needle inside the requested range.
    ///
    /// The needle and range come from `options.get_value`; counting is delegated
    /// to `py_count`, with `find_iter(..).count()` as the per-slice counter.
    pub fn count(&self, options: ByteInnerFindOptions, vm: &VirtualMachine) -> PyResult<usize> {
        let (needle, range) = options.get_value(self.elements.len(), vm)?;
        Ok(self
            .elements
            .py_count(needle.as_slice(), range, |h, n| h.find_iter(n).count()))
    }
532
    /// Joins the items of `iterable` through `py_join`, called on this value's
    /// contents (presumably using them as the separator; see `py_join`).
    ///
    /// # Errors
    /// Propagates failures from obtaining the iterator and from `py_join`.
    pub fn join(
        &self,
        iterable: ArgIterable<PyBytesInner>,
        vm: &VirtualMachine,
    ) -> PyResult<Vec<u8>> {
        let iter = iterable.iter(vm)?;
        self.elements.py_join(iter)
    }
541
    /// Searches for the needle inside the requested range using the supplied
    /// `find` strategy (a `(haystack, needle) -> Option<usize>` function), so the
    /// same code path serves different search directions.
    ///
    /// Returns whatever `py_find` yields for the resolved needle and range.
    #[inline]
    pub fn find<F>(
        &self,
        options: ByteInnerFindOptions,
        find: F,
        vm: &VirtualMachine,
    ) -> PyResult<Option<usize>>
    where
        F: Fn(&[u8], &[u8]) -> Option<usize>,
    {
        let (needle, range) = options.get_value(self.elements.len(), vm)?;
        Ok(self.elements.py_find(&needle, range, find))
    }
555
556    pub fn maketrans(
557        from: PyBytesInner,
558        to: PyBytesInner,
559        vm: &VirtualMachine,
560    ) -> PyResult<Vec<u8>> {
561        if from.len() != to.len() {
562            return Err(
563                vm.new_value_error("the two maketrans arguments must have equal length".to_owned())
564            );
565        }
566        let mut res = vec![];
567
568        for i in 0..=255 {
569            res.push(if let Some(position) = from.elements.find_byte(i) {
570                to.elements[position]
571            } else {
572                i
573            });
574        }
575
576        Ok(res)
577    }
578
579    pub fn translate(
580        &self,
581        options: ByteInnerTranslateOptions,
582        vm: &VirtualMachine,
583    ) -> PyResult<Vec<u8>> {
584        let (table, delete) = options.get_value(vm)?;
585
586        let mut res = if delete.is_empty() {
587            Vec::with_capacity(self.elements.len())
588        } else {
589            Vec::new()
590        };
591
592        for i in &self.elements {
593            if !delete.contains(i) {
594                res.push(table[*i as usize]);
595            }
596        }
597
598        Ok(res)
599    }
600
    /// Returns a copy with leading and trailing bytes removed: the bytes in
    /// `chars` when given, otherwise whatever `trim()` considers whitespace.
    ///
    /// NOTE(review): the predicate casts its argument with `c as u8`, which
    /// suggests `trim_with` hands over a wider character type; confirm that
    /// non-ASCII bytes are matched byte-by-byte as Python expects.
    pub fn strip(&self, chars: OptionalOption<PyBytesInner>) -> Vec<u8> {
        self.elements
            .py_strip(
                chars,
                |s, chars| s.trim_with(|c| chars.contains(&(c as u8))),
                |s| s.trim(),
            )
            .to_vec()
    }
610
    /// Returns the sub-slice left after removing leading bytes: the bytes in
    /// `chars` when given, otherwise whatever `trim_start()` considers whitespace.
    ///
    /// NOTE(review): the predicate casts its argument with `c as u8`; confirm
    /// that non-ASCII bytes are matched byte-by-byte.
    pub fn lstrip(&self, chars: OptionalOption<PyBytesInner>) -> &[u8] {
        self.elements.py_strip(
            chars,
            |s, chars| s.trim_start_with(|c| chars.contains(&(c as u8))),
            |s| s.trim_start(),
        )
    }
618
    /// Returns the sub-slice left after removing trailing bytes: the bytes in
    /// `chars` when given, otherwise whatever `trim_end()` considers whitespace.
    ///
    /// NOTE(review): the predicate casts its argument with `c as u8`; confirm
    /// that non-ASCII bytes are matched byte-by-byte.
    pub fn rstrip(&self, chars: OptionalOption<PyBytesInner>) -> &[u8] {
        self.elements.py_strip(
            chars,
            |s, chars| s.trim_end_with(|c| chars.contains(&(c as u8))),
            |s| s.trim_end(),
        )
    }
626
    // new in Python 3.9
    /// Returns a copy of the result of `py_removeprefix`, which is given the
    /// prefix bytes, their length and a `starts_with` test.
    pub fn removeprefix(&self, prefix: PyBytesInner) -> Vec<u8> {
        self.elements
            .py_removeprefix(&prefix.elements, prefix.elements.len(), |s, p| {
                s.starts_with(p)
            })
            .to_vec()
    }
635
    // new in Python 3.9
    /// Returns a copy of the result of `py_removesuffix`, which is given the
    /// suffix bytes, their length and an `ends_with` test.
    pub fn removesuffix(&self, suffix: PyBytesInner) -> Vec<u8> {
        self.elements
            .py_removesuffix(&suffix.elements, suffix.elements.len(), |s, p| {
                s.ends_with(p)
            })
            .to_vec()
    }
644
    /// Splits the contents from the left and converts every piece with
    /// `convert`.
    ///
    /// `py_split` receives three strategies: split on a separator, split on a
    /// separator with a limit `n`, and split on whitespace.
    pub fn split<F>(
        &self,
        options: ByteInnerSplitOptions,
        convert: F,
        vm: &VirtualMachine,
    ) -> PyResult<Vec<PyObjectRef>>
    where
        F: Fn(&[u8], &VirtualMachine) -> PyObjectRef,
    {
        let elements = self.elements.py_split(
            options,
            vm,
            |v, s, vm| v.split_str(s).map(|v| convert(v, vm)).collect(),
            |v, s, n, vm| v.splitn_str(n, s).map(|v| convert(v, vm)).collect(),
            |v, n, vm| v.py_split_whitespace(n, |v| convert(v, vm)),
        )?;
        Ok(elements)
    }
663
    /// Splits the contents from the right and converts every piece with
    /// `convert`.
    ///
    /// The right-to-left strategies collect the pieces in reverse order, so the
    /// list is reversed before it is returned.
    pub fn rsplit<F>(
        &self,
        options: ByteInnerSplitOptions,
        convert: F,
        vm: &VirtualMachine,
    ) -> PyResult<Vec<PyObjectRef>>
    where
        F: Fn(&[u8], &VirtualMachine) -> PyObjectRef,
    {
        let mut elements = self.elements.py_split(
            options,
            vm,
            |v, s, vm| v.rsplit_str(s).map(|v| convert(v, vm)).collect(),
            |v, s, n, vm| v.rsplitn_str(n, s).map(|v| convert(v, vm)).collect(),
            |v, n, vm| v.py_rsplit_whitespace(n, |v| convert(v, vm)),
        )?;
        elements.reverse();
        Ok(elements)
    }
683
    /// Partitions the contents around `sub` via `py_partition`, splitting from
    /// the left into at most two pieces (`splitn_str(2, ..)`).
    ///
    /// Returns the `(bytes, bool, bytes)` triple produced by `py_partition`.
    pub fn partition(
        &self,
        sub: &PyBytesInner,
        vm: &VirtualMachine,
    ) -> PyResult<(Vec<u8>, bool, Vec<u8>)> {
        self.elements.py_partition(
            &sub.elements,
            || self.elements.splitn_str(2, &sub.elements),
            vm,
        )
    }
695
    /// Partitions the contents around `sub` via `py_partition`, splitting from
    /// the right into at most two pieces (`rsplitn_str(2, ..)`).
    ///
    /// Returns the `(bytes, bool, bytes)` triple produced by `py_partition`.
    pub fn rpartition(
        &self,
        sub: &PyBytesInner,
        vm: &VirtualMachine,
    ) -> PyResult<(Vec<u8>, bool, Vec<u8>)> {
        self.elements.py_partition(
            &sub.elements,
            || self.elements.rsplitn_str(2, &sub.elements),
            vm,
        )
    }
707
708    pub fn expandtabs(&self, options: anystr::ExpandTabsArgs) -> Vec<u8> {
709        let tabsize = options.tabsize();
710        let mut counter: usize = 0;
711        let mut res = vec![];
712
713        if tabsize == 0 {
714            return self
715                .elements
716                .iter()
717                .copied()
718                .filter(|x| *x != b'\t')
719                .collect();
720        }
721
722        for i in &self.elements {
723            if *i == b'\t' {
724                let len = tabsize - counter % tabsize;
725                res.extend_from_slice(&vec![b' '; len]);
726                counter += len;
727            } else {
728                res.push(*i);
729                if *i == b'\r' || *i == b'\n' {
730                    counter = 0;
731                } else {
732                    counter += 1;
733                }
734            }
735        }
736
737        res
738    }
739
    /// Splits the contents into lines via `py_bytes_splitlines`, converting each
    /// line with `into_wrapper`.
    pub fn splitlines<FW, W>(&self, options: anystr::SplitLinesArgs, into_wrapper: FW) -> Vec<W>
    where
        FW: Fn(&[u8]) -> W,
    {
        self.elements.py_bytes_splitlines(options, into_wrapper)
    }
746
    /// Returns the result of `py_zfill` for the requested `width`.
    pub fn zfill(&self, width: isize) -> Vec<u8> {
        self.elements.py_zfill(width)
    }
750
751    // len(self)>=1, from="", len(to)>=1, maxcount>=1
752    fn replace_interleave(&self, to: PyBytesInner, maxcount: Option<usize>) -> Vec<u8> {
753        let place_count = self.elements.len() + 1;
754        let count = maxcount.map_or(place_count, |v| std::cmp::min(v, place_count)) - 1;
755        let capacity = self.elements.len() + count * to.len();
756        let mut result = Vec::with_capacity(capacity);
757        let to_slice = to.elements.as_slice();
758        result.extend_from_slice(to_slice);
759        for c in &self.elements[..count] {
760            result.push(*c);
761            result.extend_from_slice(to_slice);
762        }
763        result.extend_from_slice(&self.elements[count..]);
764        result
765    }
766
767    fn replace_delete(&self, from: PyBytesInner, maxcount: Option<usize>) -> Vec<u8> {
768        let count = count_substring(self.elements.as_slice(), from.elements.as_slice(), maxcount);
769        if count == 0 {
770            // no matches
771            return self.elements.clone();
772        }
773
774        let result_len = self.len() - (count * from.len());
775        debug_assert!(self.len() >= count * from.len());
776
777        let mut result = Vec::with_capacity(result_len);
778        let mut last_end = 0;
779        let mut count = count;
780        for offset in self.elements.find_iter(&from.elements) {
781            result.extend_from_slice(&self.elements[last_end..offset]);
782            last_end = offset + from.len();
783            count -= 1;
784            if count == 0 {
785                break;
786            }
787        }
788        result.extend_from_slice(&self.elements[last_end..]);
789        result
790    }
791
792    pub fn replace_in_place(
793        &self,
794        from: PyBytesInner,
795        to: PyBytesInner,
796        maxcount: Option<usize>,
797    ) -> Vec<u8> {
798        let len = from.len();
799        let mut iter = self.elements.find_iter(&from.elements);
800
801        let mut new = if let Some(offset) = iter.next() {
802            let mut new = self.elements.clone();
803            new[offset..offset + len].clone_from_slice(to.elements.as_slice());
804            if maxcount == Some(1) {
805                return new;
806            } else {
807                new
808            }
809        } else {
810            return self.elements.clone();
811        };
812
813        let mut count = maxcount.unwrap_or(usize::MAX) - 1;
814        for offset in iter {
815            new[offset..offset + len].clone_from_slice(to.elements.as_slice());
816            count -= 1;
817            if count == 0 {
818                break;
819            }
820        }
821        new
822    }
823
824    fn replace_general(
825        &self,
826        from: PyBytesInner,
827        to: PyBytesInner,
828        maxcount: Option<usize>,
829        vm: &VirtualMachine,
830    ) -> PyResult<Vec<u8>> {
831        let count = count_substring(self.elements.as_slice(), from.elements.as_slice(), maxcount);
832        if count == 0 {
833            // no matches, return unchanged
834            return Ok(self.elements.clone());
835        }
836
837        // Check for overflow
838        //    result_len = self_len + count * (to_len-from_len)
839        debug_assert!(count > 0);
840        if to.len() as isize - from.len() as isize
841            > (isize::MAX - self.elements.len() as isize) / count as isize
842        {
843            return Err(vm.new_overflow_error("replace bytes is too long".to_owned()));
844        }
845        let result_len = (self.elements.len() as isize
846            + count as isize * (to.len() as isize - from.len() as isize))
847            as usize;
848
849        let mut result = Vec::with_capacity(result_len);
850        let mut last_end = 0;
851        let mut count = count;
852        for offset in self.elements.find_iter(&from.elements) {
853            result.extend_from_slice(&self.elements[last_end..offset]);
854            result.extend_from_slice(to.elements.as_slice());
855            last_end = offset + from.len();
856            count -= 1;
857            if count == 0 {
858                break;
859            }
860        }
861        result.extend_from_slice(&self.elements[last_end..]);
862        Ok(result)
863    }
864
    /// Returns a copy with occurrences of `from` replaced by `to`.
    ///
    /// A present, non-negative `maxcount` limits the number of replacements; a
    /// missing or negative one means "no limit". After the trivial cases are
    /// handled, the work is dispatched to one of four helpers depending on
    /// whether `from` is empty, `to` is empty, or both have the same length.
    ///
    /// # Errors
    /// `OverflowError` from `replace_general` when the result would be too long.
    pub fn replace(
        &self,
        from: PyBytesInner,
        to: PyBytesInner,
        maxcount: OptionalArg<isize>,
        vm: &VirtualMachine,
    ) -> PyResult<Vec<u8>> {
        // stringlib_replace in CPython
        let maxcount = match maxcount {
            OptionalArg::Present(maxcount) if maxcount >= 0 => {
                if maxcount == 0 || (self.elements.is_empty() && !from.is_empty()) {
                    // nothing to do; return the original bytes
                    return Ok(self.elements.clone());
                } else if self.elements.is_empty() && from.is_empty() {
                    // Empty input with an empty pattern: the result is just `to`.
                    return Ok(to.elements);
                }
                Some(maxcount as usize)
            }
            // Absent or negative count: no limit.
            _ => None,
        };

        // Handle zero-length special cases
        if from.elements.is_empty() {
            if to.elements.is_empty() {
                // nothing to do; return the original bytes
                return Ok(self.elements.clone());
            }
            // insert the 'to' bytes everywhere.
            //     >>> b"Python".replace(b"", b".")
            //     b'.P.y.t.h.o.n.'
            return Ok(self.replace_interleave(to, maxcount));
        }

        // Except for b"".replace(b"", b"A") == b"A" there is no way beyond this
        // point for an empty self bytes to generate a non-empty bytes
        // Special case so the remaining code always gets a non-empty bytes
        if self.elements.is_empty() {
            return Ok(self.elements.clone());
        }

        if to.elements.is_empty() {
            // delete all occurrences of 'from' bytes
            Ok(self.replace_delete(from, maxcount))
        } else if from.len() == to.len() {
            // Handle special case where both bytes have the same length
            Ok(self.replace_in_place(from, to, maxcount))
        } else {
            // Otherwise use the more generic algorithms
            self.replace_general(from, to, maxcount, vm)
        }
    }
916
917    pub fn title(&self) -> Vec<u8> {
918        let mut res = vec![];
919        let mut spaced = true;
920
921        for i in &self.elements {
922            match i {
923                65..=90 | 97..=122 => {
924                    if spaced {
925                        res.push(i.to_ascii_uppercase());
926                        spaced = false
927                    } else {
928                        res.push(i.to_ascii_lowercase());
929                    }
930                }
931                _ => {
932                    res.push(*i);
933                    spaced = true
934                }
935            }
936        }
937
938        res
939    }
940
    /// Delegates to `cformat_bytes`, handing it this object's bytes and `values`,
    /// and returns whatever bytes (or error) that helper produces.
    ///
    /// NOTE(review): presumably this is Python's `bytes % values` formatting with
    /// `self` acting as the format template — confirm against `cformat.rs`.
    pub fn cformat(&self, values: PyObjectRef, vm: &VirtualMachine) -> PyResult<Vec<u8>> {
        cformat_bytes(vm, self.elements.as_slice(), values)
    }
944
    /// Builds a new byte vector by forwarding to the `mul` sequence helper on
    /// the underlying `Vec<u8>`; `self` is left untouched.
    ///
    /// NOTE(review): presumably `SequenceExt::mul`, i.e. Python's `seq * n`
    /// repetition — the error conditions live in `sequence.rs`; confirm there.
    pub fn mul(&self, n: isize, vm: &VirtualMachine) -> PyResult<Vec<u8>> {
        self.elements.mul(vm, n)
    }
948
    /// In-place counterpart of `mul`: forwards to the `imul` sequence helper,
    /// which mutates the underlying `Vec<u8>` directly.
    ///
    /// NOTE(review): presumably `SequenceMutExt::imul`, i.e. Python's
    /// `seq *= n` — confirm semantics and error cases in `sequence.rs`.
    pub fn imul(&mut self, n: isize, vm: &VirtualMachine) -> PyResult<()> {
        self.elements.imul(vm, n)
    }
952
953    pub fn concat(&self, other: &PyObject, vm: &VirtualMachine) -> PyResult<Vec<u8>> {
954        let buffer = PyBuffer::try_from_borrowed_object(vm, other)?;
955        let borrowed = buffer.as_contiguous();
956        if let Some(other) = borrowed {
957            let mut v = Vec::with_capacity(self.elements.len() + other.len());
958            v.extend_from_slice(&self.elements);
959            v.extend_from_slice(&other);
960            Ok(v)
961        } else {
962            let mut v = self.elements.clone();
963            buffer.append_to(&mut v);
964            Ok(v)
965        }
966    }
967}
968
/// Runs `f` on the raw bytes of `obj` when it is a `PyBytes` or a
/// `PyByteArray`, returning the closure's result wrapped in `Some`.
///
/// Returns `None` (without calling `f`) for any other object type.
pub fn try_as_bytes<F, R>(obj: PyObjectRef, f: F) -> Option<R>
where
    F: Fn(&[u8]) -> R,
{
    match_class!(match obj {
        i @ PyBytes => Some(f(i.as_bytes())),
        // NOTE(review): `borrow_buf()` presumably returns a borrow guard over the
        // bytearray's storage that is held only for the duration of `f` — confirm.
        j @ PyByteArray => Some(f(&j.borrow_buf())),
        _ => None,
    })
}
979
980#[inline]
981fn count_substring(haystack: &[u8], needle: &[u8], maxcount: Option<usize>) -> usize {
982    let substrings = haystack.find_iter(needle);
983    if let Some(maxcount) = maxcount {
984        std::cmp::min(substrings.take(maxcount).count(), maxcount)
985    } else {
986        substrings.count()
987    }
988}
989
990pub trait ByteOr: ToPrimitive {
991    fn byte_or(&self, vm: &VirtualMachine) -> PyResult<u8> {
992        match self.to_u8() {
993            Some(value) => Ok(value),
994            None => Err(vm.new_value_error("byte must be in range(0, 256)".to_owned())),
995        }
996    }
997}
998
// `BigInt` uses the default `byte_or` unchanged; its `ToPrimitive` impl
// supplies the `to_u8` conversion the default method relies on.
impl ByteOr for BigInt {}
1000
// Lets `PyBytesInner` be used where an `AnyStrWrapper` is expected by
// exposing its byte vector as a plain `[u8]` slice.
impl AnyStrWrapper for PyBytesInner {
    type Str = [u8];
    // Borrows the stored bytes; no copy is made.
    fn as_ref(&self) -> &[u8] {
        &self.elements
    }
}
1007
1008impl AnyStrContainer<[u8]> for Vec<u8> {
1009    fn new() -> Self {
1010        Vec::new()
1011    }
1012
1013    fn with_capacity(capacity: usize) -> Self {
1014        Vec::with_capacity(capacity)
1015    }
1016
1017    fn push_str(&mut self, other: &[u8]) {
1018        self.extend(other)
1019    }
1020}
1021
// Byte values treated as separators by the whitespace-splitting methods:
// space (0x20), tab (0x09), line feed (0x0a), form feed (0x0c),
// carriage return (0x0d) and vertical tab (0x0b).
const ASCII_WHITESPACES: [u8; 6] = [0x20, 0x09, 0x0a, 0x0c, 0x0d, 0x0b];
1023
1024impl AnyStr for [u8] {
1025    type Char = u8;
1026    type Container = Vec<u8>;
1027
1028    fn element_bytes_len(_: u8) -> usize {
1029        1
1030    }
1031
1032    fn to_container(&self) -> Self::Container {
1033        self.to_vec()
1034    }
1035
1036    fn as_bytes(&self) -> &[u8] {
1037        self
1038    }
1039
1040    fn as_utf8_str(&self) -> Result<&str, std::str::Utf8Error> {
1041        std::str::from_utf8(self)
1042    }
1043
1044    fn chars(&self) -> impl Iterator<Item = char> {
1045        bstr::ByteSlice::chars(self)
1046    }
1047
1048    fn elements(&self) -> impl Iterator<Item = u8> {
1049        self.iter().copied()
1050    }
1051
1052    fn get_bytes(&self, range: std::ops::Range<usize>) -> &Self {
1053        &self[range]
1054    }
1055
1056    fn get_chars(&self, range: std::ops::Range<usize>) -> &Self {
1057        &self[range]
1058    }
1059
1060    fn is_empty(&self) -> bool {
1061        Self::is_empty(self)
1062    }
1063
1064    fn bytes_len(&self) -> usize {
1065        Self::len(self)
1066    }
1067
1068    fn py_split_whitespace<F>(&self, maxsplit: isize, convert: F) -> Vec<PyObjectRef>
1069    where
1070        F: Fn(&Self) -> PyObjectRef,
1071    {
1072        let mut splits = Vec::new();
1073        let mut count = maxsplit;
1074        let mut haystack = self;
1075        while let Some(offset) = haystack.find_byteset(ASCII_WHITESPACES) {
1076            if offset != 0 {
1077                if count == 0 {
1078                    break;
1079                }
1080                splits.push(convert(&haystack[..offset]));
1081                count -= 1;
1082            }
1083            haystack = &haystack[offset + 1..];
1084        }
1085        if !haystack.is_empty() {
1086            splits.push(convert(haystack));
1087        }
1088        splits
1089    }
1090
1091    fn py_rsplit_whitespace<F>(&self, maxsplit: isize, convert: F) -> Vec<PyObjectRef>
1092    where
1093        F: Fn(&Self) -> PyObjectRef,
1094    {
1095        let mut splits = Vec::new();
1096        let mut count = maxsplit;
1097        let mut haystack = self;
1098        while let Some(offset) = haystack.rfind_byteset(ASCII_WHITESPACES) {
1099            if offset + 1 != haystack.len() {
1100                if count == 0 {
1101                    break;
1102                }
1103                splits.push(convert(&haystack[offset + 1..]));
1104                count -= 1;
1105            }
1106            haystack = &haystack[..offset];
1107        }
1108        if !haystack.is_empty() {
1109            splits.push(convert(haystack));
1110        }
1111        splits
1112    }
1113}
1114
// Arguments consumed by `bytes_decode`.
//
// NOTE(review): `pyarg(any, default)` presumably allows each argument to be
// passed positionally or by keyword and leaves it `None` when omitted —
// confirm against the `FromArgs` derive documentation.
#[derive(FromArgs)]
pub struct DecodeArgs {
    // Codec name; `bytes_decode` substitutes the default encoding when `None`.
    #[pyarg(any, default)]
    encoding: Option<PyStrRef>,
    // Error-handling scheme, forwarded unchanged to the codec registry.
    #[pyarg(any, default)]
    errors: Option<PyStrRef>,
}
1122
1123pub fn bytes_decode(
1124    zelf: PyObjectRef,
1125    args: DecodeArgs,
1126    vm: &VirtualMachine,
1127) -> PyResult<PyStrRef> {
1128    let DecodeArgs { encoding, errors } = args;
1129    let encoding = encoding
1130        .as_ref()
1131        .map_or(crate::codecs::DEFAULT_ENCODING, |s| s.as_str());
1132    vm.state
1133        .codec_registry
1134        .decode_text(zelf, encoding, errors, vm)
1135}
1136
1137fn hex_impl_no_sep(bytes: &[u8]) -> String {
1138    let mut buf: Vec<u8> = vec![0; bytes.len() * 2];
1139    hex::encode_to_slice(bytes, buf.as_mut_slice()).unwrap();
1140    unsafe { String::from_utf8_unchecked(buf) }
1141}
1142
1143fn hex_impl(bytes: &[u8], sep: u8, bytes_per_sep: isize) -> String {
1144    let len = bytes.len();
1145
1146    let buf = if bytes_per_sep < 0 {
1147        let bytes_per_sep = std::cmp::min(len, (-bytes_per_sep) as usize);
1148        let chunks = (len - 1) / bytes_per_sep;
1149        let chunked = chunks * bytes_per_sep;
1150        let unchunked = len - chunked;
1151        let mut buf = vec![0; len * 2 + chunks];
1152        let mut j = 0;
1153        for i in (0..chunks).map(|i| i * bytes_per_sep) {
1154            hex::encode_to_slice(
1155                &bytes[i..i + bytes_per_sep],
1156                &mut buf[j..j + bytes_per_sep * 2],
1157            )
1158            .unwrap();
1159            j += bytes_per_sep * 2;
1160            buf[j] = sep;
1161            j += 1;
1162        }
1163        hex::encode_to_slice(&bytes[chunked..], &mut buf[j..j + unchunked * 2]).unwrap();
1164        buf
1165    } else {
1166        let bytes_per_sep = std::cmp::min(len, bytes_per_sep as usize);
1167        let chunks = (len - 1) / bytes_per_sep;
1168        let chunked = chunks * bytes_per_sep;
1169        let unchunked = len - chunked;
1170        let mut buf = vec![0; len * 2 + chunks];
1171        hex::encode_to_slice(&bytes[..unchunked], &mut buf[..unchunked * 2]).unwrap();
1172        let mut j = unchunked * 2;
1173        for i in (0..chunks).map(|i| i * bytes_per_sep + unchunked) {
1174            buf[j] = sep;
1175            j += 1;
1176            hex::encode_to_slice(
1177                &bytes[i..i + bytes_per_sep],
1178                &mut buf[j..j + bytes_per_sep * 2],
1179            )
1180            .unwrap();
1181            j += bytes_per_sep * 2;
1182        }
1183        buf
1184    };
1185
1186    unsafe { String::from_utf8_unchecked(buf) }
1187}
1188
1189pub fn bytes_to_hex(
1190    bytes: &[u8],
1191    sep: OptionalArg<Either<PyStrRef, PyBytesRef>>,
1192    bytes_per_sep: OptionalArg<isize>,
1193    vm: &VirtualMachine,
1194) -> PyResult<String> {
1195    if bytes.is_empty() {
1196        return Ok("".to_owned());
1197    }
1198
1199    if let OptionalArg::Present(sep) = sep {
1200        let bytes_per_sep = bytes_per_sep.unwrap_or(1);
1201        if bytes_per_sep == 0 {
1202            return Ok(hex_impl_no_sep(bytes));
1203        }
1204
1205        let s_guard;
1206        let b_guard;
1207        let sep = match &sep {
1208            Either::A(s) => {
1209                s_guard = s.as_str();
1210                s_guard.as_bytes()
1211            }
1212            Either::B(bytes) => {
1213                b_guard = bytes.as_bytes();
1214                b_guard
1215            }
1216        };
1217
1218        if sep.len() != 1 {
1219            return Err(vm.new_value_error("sep must be length 1.".to_owned()));
1220        }
1221        let sep = sep[0];
1222        if sep > 127 {
1223            return Err(vm.new_value_error("sep must be ASCII.".to_owned()));
1224        }
1225
1226        Ok(hex_impl(bytes, sep, bytes_per_sep))
1227    } else {
1228        Ok(hex_impl_no_sep(bytes))
1229    }
1230}
1231
/// Reports whether `b` is one of the six ASCII whitespace bytes: tab, line
/// feed, vertical tab, form feed, carriage return or space.
pub const fn is_py_ascii_whitespace(b: u8) -> bool {
    // 0x09..=0x0D is the contiguous run tab, LF, VT, FF, CR; 0x20 is space.
    matches!(b, 0x09..=0x0D | 0x20)
}
rustpython_vm/bytesinner.rs

rustpython_vm/
bytesinner.rs