rustpython_common/
str.rs

1// spell-checker:ignore uncomputed
2use crate::atomic::{PyAtomic, Radium};
3use crate::format::CharLen;
4use crate::wtf8::{CodePoint, Wtf8, Wtf8Buf};
5use ascii::{AsciiChar, AsciiStr, AsciiString};
6use core::fmt;
7use core::ops::{Bound, RangeBounds};
8use core::sync::atomic::Ordering::Relaxed;
9
/// Wide-character type; on every target except `wasm32` this is `libc::wchar_t`.
#[cfg(not(target_arch = "wasm32"))]
// The lowercase name deliberately mirrors the C type name.
#[allow(non_camel_case_types)]
pub type wchar_t = libc::wchar_t;
/// Wide-character type; on `wasm32` it is defined directly as `u32`.
#[cfg(target_arch = "wasm32")]
// The lowercase name deliberately mirrors the C type name.
#[allow(non_camel_case_types)]
pub type wchar_t = u32;
16
/// Utf8 + state.ascii (+ PyUnicode_Kind in future)
///
/// Variants are declared from the most to the least restrictive, so the
/// derived ordering yields `Ascii < Utf8 < Wtf8`.
#[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord)]
pub enum StrKind {
    /// Only ASCII code points can be encoded.
    Ascii,
    /// Only code points that convert to a `char` can be encoded.
    Utf8,
    /// Any code point can be encoded.
    Wtf8,
}
24
25impl core::ops::BitOr for StrKind {
26    type Output = Self;
27
28    fn bitor(self, other: Self) -> Self {
29        use StrKind::*;
30        match (self, other) {
31            (Wtf8, _) | (_, Wtf8) => Wtf8,
32            (Utf8, _) | (_, Utf8) => Utf8,
33            (Ascii, Ascii) => Ascii,
34        }
35    }
36}
37
38impl StrKind {
39    pub const fn is_ascii(&self) -> bool {
40        matches!(self, Self::Ascii)
41    }
42
43    pub const fn is_utf8(&self) -> bool {
44        matches!(self, Self::Ascii | Self::Utf8)
45    }
46
47    #[inline(always)]
48    pub fn can_encode(&self, code: CodePoint) -> bool {
49        match self {
50            Self::Ascii => code.is_ascii(),
51            Self::Utf8 => code.to_char().is_some(),
52            Self::Wtf8 => true,
53        }
54    }
55}
56
/// Implemented by string-like types that can report the [`StrKind`] of their
/// current contents.
pub trait DeduceStrKind {
    /// Returns the [`StrKind`] that classifies `self`.
    fn str_kind(&self) -> StrKind;
}
60
61impl DeduceStrKind for str {
62    fn str_kind(&self) -> StrKind {
63        if self.is_ascii() {
64            StrKind::Ascii
65        } else {
66            StrKind::Utf8
67        }
68    }
69}
70
71impl DeduceStrKind for Wtf8 {
72    fn str_kind(&self) -> StrKind {
73        if self.is_ascii() {
74            StrKind::Ascii
75        } else if self.is_utf8() {
76            StrKind::Utf8
77        } else {
78            StrKind::Wtf8
79        }
80    }
81}
82
83impl DeduceStrKind for String {
84    fn str_kind(&self) -> StrKind {
85        (**self).str_kind()
86    }
87}
88
89impl DeduceStrKind for Wtf8Buf {
90    fn str_kind(&self) -> StrKind {
91        (**self).str_kind()
92    }
93}
94
95impl<T: DeduceStrKind + ?Sized> DeduceStrKind for &T {
96    fn str_kind(&self) -> StrKind {
97        (**self).str_kind()
98    }
99}
100
101impl<T: DeduceStrKind + ?Sized> DeduceStrKind for Box<T> {
102    fn str_kind(&self) -> StrKind {
103        (**self).str_kind()
104    }
105}
106
/// Borrowed view of string data, with one variant per [`StrKind`] carrying
/// the matching borrowed string type.
#[derive(Debug)]
pub enum PyKindStr<'a> {
    /// Data viewed as an ASCII string.
    Ascii(&'a AsciiStr),
    /// Data viewed as a UTF-8 `str`.
    Utf8(&'a str),
    /// Data viewed as WTF-8.
    Wtf8(&'a Wtf8),
}
113
/// Owned string data together with its [`StrKind`] and a cached character
/// count.
#[derive(Debug, Clone)]
pub struct StrData {
    // The string contents.
    data: Box<Wtf8>,
    // Classification of `data`; the unsafe accessors rely on it being accurate.
    kind: StrKind,
    // Cached character count; may hold the "uncomputed" marker until first queried.
    len: StrLen,
}
120
// Atomically stored character count. The value `usize::MAX` is reserved as
// the "not computed yet" marker.
struct StrLen(PyAtomic<usize>);

impl From<usize> for StrLen {
    // Wraps `value` in a fresh atomic cell.
    #[inline(always)]
    fn from(value: usize) -> Self {
        Self(Radium::new(value))
    }
}
129
130impl fmt::Debug for StrLen {
131    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
132        let len = self.0.load(Relaxed);
133        if len == usize::MAX {
134            f.write_str("<uncomputed>")
135        } else {
136            len.fmt(f)
137        }
138    }
139}
140
141impl StrLen {
142    #[inline(always)]
143    fn zero() -> Self {
144        0usize.into()
145    }
146
147    #[inline(always)]
148    fn uncomputed() -> Self {
149        usize::MAX.into()
150    }
151}
152
153impl Clone for StrLen {
154    fn clone(&self) -> Self {
155        Self(self.0.load(Relaxed).into())
156    }
157}
158
159impl Default for StrData {
160    fn default() -> Self {
161        Self {
162            data: <Box<Wtf8>>::default(),
163            kind: StrKind::Ascii,
164            len: StrLen::zero(),
165        }
166    }
167}
168
impl From<Box<Wtf8>> for StrData {
    /// Classifies `value` and wraps it.
    fn from(value: Box<Wtf8>) -> Self {
        // doing the check is ~10x faster for ascii, and is actually only 2% slower worst case for
        // non-ascii; see https://github.com/RustPython/RustPython/pull/2586#issuecomment-844611532
        let kind = value.str_kind();
        // SAFETY: `kind` was just deduced from `value` itself.
        unsafe { Self::new_str_unchecked(value, kind) }
    }
}
177
impl From<Box<str>> for StrData {
    /// Classifies `value` and wraps it after converting it to `Box<Wtf8>`.
    #[inline]
    fn from(value: Box<str>) -> Self {
        // doing the check is ~10x faster for ascii, and is actually only 2% slower worst case for
        // non-ascii; see https://github.com/RustPython/RustPython/pull/2586#issuecomment-844611532
        let kind = value.str_kind();
        // SAFETY: `kind` was just deduced from `value` itself.
        unsafe { Self::new_str_unchecked(value.into(), kind) }
    }
}
187
188impl From<Box<AsciiStr>> for StrData {
189    #[inline]
190    fn from(value: Box<AsciiStr>) -> Self {
191        Self {
192            len: value.len().into(),
193            data: value.into(),
194            kind: StrKind::Ascii,
195        }
196    }
197}
198
impl From<AsciiChar> for StrData {
    /// Builds a one-character ASCII string by going through a boxed `AsciiStr`.
    fn from(ch: AsciiChar) -> Self {
        AsciiString::from(ch).into_boxed_ascii_str().into()
    }
}
204
205impl From<char> for StrData {
206    fn from(ch: char) -> Self {
207        if let Ok(ch) = ascii::AsciiChar::from_ascii(ch) {
208            ch.into()
209        } else {
210            Self {
211                data: ch.to_string().into(),
212                kind: StrKind::Utf8,
213                len: 1.into(),
214            }
215        }
216    }
217}
218
219impl From<CodePoint> for StrData {
220    fn from(ch: CodePoint) -> Self {
221        if let Some(ch) = ch.to_char() {
222            ch.into()
223        } else {
224            Self {
225                data: Wtf8Buf::from(ch).into(),
226                kind: StrKind::Wtf8,
227                len: 1.into(),
228            }
229        }
230    }
231}
232
233impl StrData {
234    /// # Safety
235    ///
236    /// Given `bytes` must be valid data for given `kind`
237    pub unsafe fn new_str_unchecked(data: Box<Wtf8>, kind: StrKind) -> Self {
238        let len = match kind {
239            StrKind::Ascii => data.len().into(),
240            _ => StrLen::uncomputed(),
241        };
242        Self { data, kind, len }
243    }
244
245    /// # Safety
246    ///
247    /// `char_len` must be accurate.
248    pub unsafe fn new_with_char_len(data: Box<Wtf8>, kind: StrKind, char_len: usize) -> Self {
249        Self {
250            data,
251            kind,
252            len: char_len.into(),
253        }
254    }
255
256    #[inline]
257    pub const fn as_wtf8(&self) -> &Wtf8 {
258        &self.data
259    }
260
261    // TODO: rename to to_str
262    #[inline]
263    pub fn as_str(&self) -> Option<&str> {
264        self.kind
265            .is_utf8()
266            .then(|| unsafe { core::str::from_utf8_unchecked(self.data.as_bytes()) })
267    }
268
269    pub fn as_ascii(&self) -> Option<&AsciiStr> {
270        self.kind
271            .is_ascii()
272            .then(|| unsafe { AsciiStr::from_ascii_unchecked(self.data.as_bytes()) })
273    }
274
275    pub const fn kind(&self) -> StrKind {
276        self.kind
277    }
278
279    #[inline]
280    pub fn as_str_kind(&self) -> PyKindStr<'_> {
281        match self.kind {
282            StrKind::Ascii => {
283                PyKindStr::Ascii(unsafe { AsciiStr::from_ascii_unchecked(self.data.as_bytes()) })
284            }
285            StrKind::Utf8 => {
286                PyKindStr::Utf8(unsafe { core::str::from_utf8_unchecked(self.data.as_bytes()) })
287            }
288            StrKind::Wtf8 => PyKindStr::Wtf8(&self.data),
289        }
290    }
291
292    #[inline]
293    pub fn len(&self) -> usize {
294        self.data.len()
295    }
296
297    pub fn is_empty(&self) -> bool {
298        self.data.is_empty()
299    }
300
301    #[inline]
302    pub fn char_len(&self) -> usize {
303        match self.len.0.load(Relaxed) {
304            usize::MAX => self._compute_char_len(),
305            len => len,
306        }
307    }
308
309    #[cold]
310    fn _compute_char_len(&self) -> usize {
311        let len = if let Some(s) = self.as_str() {
312            // utf8 chars().count() is optimized
313            s.chars().count()
314        } else {
315            self.data.code_points().count()
316        };
317        // len cannot be usize::MAX, since vec.capacity() < sys.maxsize
318        self.len.0.store(len, Relaxed);
319        len
320    }
321
322    pub fn nth_char(&self, index: usize) -> CodePoint {
323        match self.as_str_kind() {
324            PyKindStr::Ascii(s) => s[index].into(),
325            PyKindStr::Utf8(s) => s.chars().nth(index).unwrap().into(),
326            PyKindStr::Wtf8(w) => w.code_points().nth(index).unwrap(),
327        }
328    }
329}
330
impl core::fmt::Display for StrData {
    /// Formats by delegating to the formatting of the underlying data.
    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
        self.data.fmt(f)
    }
}
336
impl CharLen for StrData {
    /// Exposes the cached character count through the `CharLen` trait.
    fn char_len(&self) -> usize {
        // Method-call syntax prefers the inherent `StrData::char_len`, so this
        // is a delegation rather than a recursive call.
        self.char_len()
    }
}
342
/// Returns the part of `s` selected by `range`, where the bounds count
/// *characters* rather than bytes.
///
/// Returns `None` when the range reaches past the end of `s`, when its end
/// lies before its start, or when a bound cannot be expressed as an exclusive
/// `usize` index (for example an inclusive end of `usize::MAX`).
pub fn try_get_chars(s: &str, range: impl RangeBounds<usize>) -> Option<&str> {
    // Index of the first selected character.
    let start = match range.start_bound() {
        Bound::Included(&i) => i,
        Bound::Excluded(&i) => i.checked_add(1)?,
        Bound::Unbounded => 0,
    };
    // Skip the leading characters; running out of input means out of range.
    let mut chars = s.chars();
    for _ in 0..start {
        chars.next()?;
    }
    let tail = chars.as_str();
    // Number of characters to keep. Checked arithmetic turns a reversed range
    // or an overflowing bound into `None` instead of a panic or wrap-around.
    let range_len = match range.end_bound() {
        Bound::Included(&i) => i.checked_add(1)?.checked_sub(start)?,
        Bound::Excluded(&i) => i.checked_sub(start)?,
        Bound::Unbounded => return Some(tail),
    };
    char_range_end(tail, range_len).map(|end| &tail[..end])
}

/// Like [`try_get_chars`], but for ranges the caller knows to be valid.
///
/// # Panics
///
/// Panics when [`try_get_chars`] would return `None`.
pub fn get_chars(s: &str, range: impl RangeBounds<usize>) -> &str {
    try_get_chars(s, range).expect("character range out of bounds")
}

/// Returns the byte offset just past the first `n_chars` characters of `s`,
/// or `None` when `s` holds fewer than `n_chars` characters.
#[inline]
pub fn char_range_end(s: &str, n_chars: usize) -> Option<usize> {
    // Zero characters always end at offset 0.
    let Some(last_char_index) = n_chars.checked_sub(1) else {
        return Some(0);
    };
    s.char_indices()
        .nth(last_char_index)
        .map(|(index, c)| index + c.len_utf8())
}
377
378pub fn try_get_codepoints(w: &Wtf8, range: impl RangeBounds<usize>) -> Option<&Wtf8> {
379    let mut chars = w.code_points();
380    let start = match range.start_bound() {
381        Bound::Included(&i) => i,
382        Bound::Excluded(&i) => i + 1,
383        Bound::Unbounded => 0,
384    };
385    for _ in 0..start {
386        chars.next()?;
387    }
388    let s = chars.as_wtf8();
389    let range_len = match range.end_bound() {
390        Bound::Included(&i) => i + 1 - start,
391        Bound::Excluded(&i) => i - start,
392        Bound::Unbounded => return Some(s),
393    };
394    codepoint_range_end(s, range_len).map(|end| &s[..end])
395}
396
/// Like [`try_get_codepoints`], but for ranges the caller knows to be valid.
///
/// # Panics
///
/// Panics when [`try_get_codepoints`] returns `None`.
pub fn get_codepoints(w: &Wtf8, range: impl RangeBounds<usize>) -> &Wtf8 {
    try_get_codepoints(w, range).unwrap()
}
400
401#[inline]
402pub fn codepoint_range_end(s: &Wtf8, n_chars: usize) -> Option<usize> {
403    let i = match n_chars.checked_sub(1) {
404        Some(last_char_index) => {
405            let (index, c) = s.code_point_indices().nth(last_char_index)?;
406            index + c.len_wtf8()
407        }
408        None => 0,
409    };
410    Some(i)
411}
412
/// Left-pads `bytes` with ASCII `'0'` up to a total length of `width`.
///
/// A leading `+` or `-` stays in front and the zeros are inserted after it.
/// When `bytes` is already at least `width` long it is returned unchanged.
pub fn zfill(bytes: &[u8], width: usize) -> Vec<u8> {
    if width <= bytes.len() {
        return bytes.to_vec();
    }
    // Separate an optional one-byte sign prefix from the rest.
    let (sign, rest) = match bytes {
        [b'+' | b'-', ..] => bytes.split_at(1),
        _ => bytes.split_at(0),
    };
    // The result is exactly `width` bytes long, so allocate once.
    let mut filled = Vec::with_capacity(width);
    filled.extend_from_slice(sign);
    filled.resize(sign.len() + (width - bytes.len()), b'0');
    filled.extend_from_slice(rest);
    filled
}
430
431/// Convert a string to ascii compatible, escaping unicode-s into escape
432/// sequences.
433pub fn to_ascii(value: &Wtf8) -> AsciiString {
434    let mut ascii = Vec::new();
435    for cp in value.code_points() {
436        if cp.is_ascii() {
437            ascii.push(cp.to_u32() as u8);
438        } else {
439            let c = cp.to_u32();
440            let hex = if c < 0x100 {
441                format!("\\x{c:02x}")
442            } else if c < 0x10000 {
443                format!("\\u{c:04x}")
444            } else {
445                format!("\\U{c:08x}")
446            };
447            ascii.append(&mut hex.into_bytes());
448        }
449    }
450    unsafe { AsciiString::from_ascii_unchecked(ascii) }
451}
452
/// Wrapper around a [`CodePoint`] whose `Display` output is a backslash
/// escape of the code point's numeric value.
#[derive(Clone, Copy)]
pub struct UnicodeEscapeCodepoint(pub CodePoint);
455
456impl fmt::Display for UnicodeEscapeCodepoint {
457    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
458        let c = self.0.to_u32();
459        if c >= 0x10000 {
460            write!(f, "\\U{c:08x}")
461        } else if c >= 0x100 {
462            write!(f, "\\u{c:04x}")
463        } else {
464            write!(f, "\\x{c:02x}")
465        }
466    }
467}
468
/// Bounded, case-aware edit distance over byte strings.
pub mod levenshtein {
    /// Cost of an insertion, a deletion, or a substitution between unrelated bytes.
    pub const MOVE_COST: usize = 2;
    // Cost of substituting a byte for the same ASCII letter in the other case.
    const CASE_COST: usize = 1;
    // After stripping the common prefix and suffix, inputs longer than this are rejected.
    const MAX_STRING_SIZE: usize = 40;

    // Cost of replacing byte `a` with byte `b`.
    const fn substitution_cost(a: u8, b: u8) -> usize {
        if a == b {
            return 0;
        }
        // Bytes that differ in their low five bits cannot be a case pair.
        if (a & 31) != (b & 31) {
            return MOVE_COST;
        }
        if a.to_ascii_lowercase() == b.to_ascii_lowercase() {
            CASE_COST
        } else {
            MOVE_COST
        }
    }

    /// Computes the edit distance between `a` and `b`.
    ///
    /// Returns `max_cost + 1` as soon as the distance is known to exceed
    /// `max_cost`, and also when either input, after removing the shared
    /// prefix and suffix, is longer than the internal size limit.
    pub fn levenshtein_distance(a: &[u8], b: &[u8], max_cost: usize) -> usize {
        if a == b {
            return 0;
        }

        // Drop the shared prefix, then the shared suffix of what remains.
        let prefix = a.iter().zip(b).take_while(|(x, y)| x == y).count();
        let (a, b) = (&a[prefix..], &b[prefix..]);
        let suffix = a
            .iter()
            .rev()
            .zip(b.iter().rev())
            .take_while(|(x, y)| x == y)
            .count();
        let (a, b) = (&a[..a.len() - suffix], &b[..b.len() - suffix]);

        // If one side is exhausted, the rest of the other is pure insert/delete.
        if a.is_empty() || b.is_empty() {
            return (a.len() + b.len()) * MOVE_COST;
        }
        if a.len() > MAX_STRING_SIZE || b.len() > MAX_STRING_SIZE {
            return max_cost + 1;
        }

        // Keep the shorter input along the row held in `buffer`.
        let (short, long) = if b.len() < a.len() { (b, a) } else { (a, b) };

        // The length difference alone is a lower bound on the distance.
        if (long.len() - short.len()) * MOVE_COST > max_cost {
            return max_cost + 1;
        }

        let mut buffer = [0usize; MAX_STRING_SIZE];
        for (i, slot) in buffer[..short.len()].iter_mut().enumerate() {
            *slot = (i + 1) * MOVE_COST;
        }

        let mut result = 0usize;
        for (row, &long_byte) in long.iter().enumerate() {
            result = row * MOVE_COST;
            // `diagonal` trails one column behind in the previous row.
            let mut diagonal = result;
            let mut row_minimum = usize::MAX;
            for (slot, &short_byte) in buffer.iter_mut().zip(short) {
                let substitute = diagonal + substitution_cost(long_byte, short_byte);
                diagonal = *slot;
                let insert_delete = result.min(diagonal) + MOVE_COST;
                result = insert_delete.min(substitute);

                *slot = result;
                row_minimum = row_minimum.min(result);
            }
            // Every continuation of this row already costs more than allowed.
            if row_minimum > max_cost {
                return max_cost + 1;
            }
        }
        result
    }
}
558
/// Replace all tabs in a string with spaces, using the given tab size.
///
/// Each tab is replaced by as many spaces as are needed to reach the next
/// column that is a multiple of `tab_size`. The column counter restarts after
/// every `'\r'` or `'\n'`. With a `tab_size` of 0, tabs are removed and
/// nothing is inserted in their place.
pub fn expandtabs(input: &str, tab_size: usize) -> String {
    let mut expanded = String::with_capacity(input.len());
    // Zero-based column of the next character on the current line.
    let mut column = 0usize;
    for ch in input.chars() {
        match ch {
            '\t' => {
                // Guarding on zero avoids a modulo by zero and makes the tab vanish.
                if tab_size > 0 {
                    let pad = tab_size - column % tab_size;
                    for _ in 0..pad {
                        expanded.push(' ');
                    }
                    column += pad;
                }
            }
            '\r' | '\n' => {
                expanded.push(ch);
                column = 0;
            }
            _ => {
                expanded.push(ch);
                column += 1;
            }
        }
    }
    expanded
}
589
590/// Creates an [`AsciiStr`][ascii::AsciiStr] from a string literal, throwing a compile error if the
591/// literal isn't actually ascii.
592///
593/// ```compile_fail
594/// # use rustpython_common::str::ascii;
595/// ascii!("I ❤️ Rust & Python");
596/// ```
#[macro_export]
macro_rules! ascii {
    ($x:expr $(,)?) => {{
        // The check runs inside a `const` block, so a non-ASCII argument is
        // rejected at compile time.
        let s = const {
            let s: &str = $x;
            assert!(s.is_ascii(), "ascii!() argument is not an ascii string");
            s
        };
        // SAFETY: the assertion above established that `s` is entirely ASCII.
        unsafe { $crate::vendored::ascii::AsciiStr::from_ascii_unchecked(s.as_bytes()) }
    }};
}
608pub use ascii;
609
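// Decimal digit characters in ascending code point order, grouped in runs of ten
// (values 0 through 9). The lookup below binary-searches this table and takes the
// index modulo 10, so both the ordering and the runs of ten must be kept intact.
// TODO: this should probably live in a crate like unic or unicode-properties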
611const UNICODE_DECIMAL_VALUES: &[char] = &[
612    '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '٠', '١', '٢', '٣', '٤', '٥', '٦', '٧', '٨',
613    '٩', '۰', '۱', '۲', '۳', '۴', '۵', '۶', '۷', '۸', '۹', '߀', '߁', '߂', '߃', '߄', '߅', '߆', '߇',
614    '߈', '߉', '०', '१', '२', '३', '४', '५', '६', '७', '८', '९', '০', '১', '২', '৩', '৪', '৫', '৬',
615    '৭', '৮', '৯', '੦', '੧', '੨', '੩', '੪', '੫', '੬', '੭', '੮', '੯', '૦', '૧', '૨', '૩', '૪', '૫',
616    '૬', '૭', '૮', '૯', '୦', '୧', '୨', '୩', '୪', '୫', '୬', '୭', '୮', '୯', '௦', '௧', '௨', '௩', '௪',
617    '௫', '௬', '௭', '௮', '௯', '౦', '౧', '౨', '౩', '౪', '౫', '౬', '౭', '౮', '౯', '೦', '೧', '೨', '೩',
618    '೪', '೫', '೬', '೭', '೮', '೯', '൦', '൧', '൨', '൩', '൪', '൫', '൬', '൭', '൮', '൯', '෦', '෧', '෨',
619    '෩', '෪', '෫', '෬', '෭', '෮', '෯', '๐', '๑', '๒', '๓', '๔', '๕', '๖', '๗', '๘', '๙', '໐', '໑',
620    '໒', '໓', '໔', '໕', '໖', '໗', '໘', '໙', '༠', '༡', '༢', '༣', '༤', '༥', '༦', '༧', '༨', '༩', '၀',
621    '၁', '၂', '၃', '၄', '၅', '၆', '၇', '၈', '၉', '႐', '႑', '႒', '႓', '႔', '႕', '႖', '႗', '႘', '႙',
622    '០', '១', '២', '៣', '៤', '៥', '៦', '៧', '៨', '៩', '᠐', '᠑', '᠒', '᠓', '᠔', '᠕', '᠖', '᠗', '᠘',
623    '᠙', '᥆', '᥇', '᥈', '᥉', '᥊', '᥋', '᥌', '᥍', '᥎', '᥏', '᧐', '᧑', '᧒', '᧓', '᧔', '᧕', '᧖', '᧗',
624    '᧘', '᧙', '᪀', '᪁', '᪂', '᪃', '᪄', '᪅', '᪆', '᪇', '᪈', '᪉', '᪐', '᪑', '᪒', '᪓', '᪔', '᪕', '᪖',
625    '᪗', '᪘', '᪙', '᭐', '᭑', '᭒', '᭓', '᭔', '᭕', '᭖', '᭗', '᭘', '᭙', '᮰', '᮱', '᮲', '᮳', '᮴', '᮵',
626    '᮶', '᮷', '᮸', '᮹', '᱀', '᱁', '᱂', '᱃', '᱄', '᱅', '᱆', '᱇', '᱈', '᱉', '᱐', '᱑', '᱒', '᱓', '᱔',
627    '᱕', '᱖', '᱗', '᱘', '᱙', '꘠', '꘡', '꘢', '꘣', '꘤', '꘥', '꘦', '꘧', '꘨', '꘩', '꣐', '꣑', '꣒', '꣓',
628    '꣔', '꣕', '꣖', '꣗', '꣘', '꣙', '꤀', '꤁', '꤂', '꤃', '꤄', '꤅', '꤆', '꤇', '꤈', '꤉', '꧐', '꧑', '꧒',
629    '꧓', '꧔', '꧕', '꧖', '꧗', '꧘', '꧙', '꧰', '꧱', '꧲', '꧳', '꧴', '꧵', '꧶', '꧷', '꧸', '꧹', '꩐', '꩑',
630    '꩒', '꩓', '꩔', '꩕', '꩖', '꩗', '꩘', '꩙', '꯰', '꯱', '꯲', '꯳', '꯴', '꯵', '꯶', '꯷', '꯸', '꯹', '０',
631    '１', '２', '３', '４', '５', '６', '７', '８', '９', '𐒠', '𐒡', '𐒢', '𐒣', '𐒤', '𐒥', '𐒦', '𐒧',
632    '𐒨', '𐒩', '𑁦', '𑁧', '𑁨', '𑁩', '𑁪', '𑁫', '𑁬', '𑁭', '𑁮', '𑁯', '𑃰', '𑃱', '𑃲', '𑃳', '𑃴', '𑃵', '𑃶',
633    '𑃷', '𑃸', '𑃹', '𑄶', '𑄷', '𑄸', '𑄹', '𑄺', '𑄻', '𑄼', '𑄽', '𑄾', '𑄿', '𑇐', '𑇑', '𑇒', '𑇓', '𑇔', '𑇕',
634    '𑇖', '𑇗', '𑇘', '𑇙', '𑋰', '𑋱', '𑋲', '𑋳', '𑋴', '𑋵', '𑋶', '𑋷', '𑋸', '𑋹', '𑑐', '𑑑', '𑑒', '𑑓', '𑑔',
635    '𑑕', '𑑖', '𑑗', '𑑘', '𑑙', '𑓐', '𑓑', '𑓒', '𑓓', '𑓔', '𑓕', '𑓖', '𑓗', '𑓘', '𑓙', '𑙐', '𑙑', '𑙒', '𑙓',
636    '𑙔', '𑙕', '𑙖', '𑙗', '𑙘', '𑙙', '𑛀', '𑛁', '𑛂', '𑛃', '𑛄', '𑛅', '𑛆', '𑛇', '𑛈', '𑛉', '𑜰', '𑜱', '𑜲',
637    '𑜳', '𑜴', '𑜵', '𑜶', '𑜷', '𑜸', '𑜹', '𑣠', '𑣡', '𑣢', '𑣣', '𑣤', '𑣥', '𑣦', '𑣧', '𑣨', '𑣩', '𑱐', '𑱑',
638    '𑱒', '𑱓', '𑱔', '𑱕', '𑱖', '𑱗', '𑱘', '𑱙', '𑵐', '𑵑', '𑵒', '𑵓', '𑵔', '𑵕', '𑵖', '𑵗', '𑵘', '𑵙', '𖩠',
639    '𖩡', '𖩢', '𖩣', '𖩤', '𖩥', '𖩦', '𖩧', '𖩨', '𖩩', '𖭐', '𖭑', '𖭒', '𖭓', '𖭔', '𖭕', '𖭖', '𖭗', '𖭘', '𖭙',
640    '𝟎', '𝟏', '𝟐', '𝟑', '𝟒', '𝟓', '𝟔', '𝟕', '𝟖', '𝟗', '𝟘', '𝟙', '𝟚', '𝟛', '𝟜', '𝟝', '𝟞', '𝟟', '𝟠',
641    '𝟡', '𝟢', '𝟣', '𝟤', '𝟥', '𝟦', '𝟧', '𝟨', '𝟩', '𝟪', '𝟫', '𝟬', '𝟭', '𝟮', '𝟯', '𝟰', '𝟱', '𝟲', '𝟳',
642    '𝟴', '𝟵', '𝟶', '𝟷', '𝟸', '𝟹', '𝟺', '𝟻', '𝟼', '𝟽', '𝟾', '𝟿', '𞥐', '𞥑', '𞥒', '𞥓', '𞥔', '𞥕', '𞥖',
643    '𞥗', '𞥘', '𞥙',
644];
645
646pub fn char_to_decimal(ch: char) -> Option<u8> {
647    UNICODE_DECIMAL_VALUES
648        .binary_search(&ch)
649        .ok()
650        .map(|i| (i % 10) as u8)
651}
652
#[cfg(test)]
mod tests {
    use super::*;

    // `get_chars` must index by character, not by byte.
    #[test]
    fn test_get_chars() {
        // ASCII: character indices and byte indices coincide.
        let s = "0123456789";
        assert_eq!(get_chars(s, 3..7), "3456");
        assert_eq!(get_chars(s, 3..7), &s[3..7]);

        // Hangul: multi-byte characters.
        let s = "0유니코드 문자열9";
        assert_eq!(get_chars(s, 3..7), "코드 문");

        // Emoji: multi-byte characters.
        let s = "0😀😃😄😁😆😅😂🤣9";
        assert_eq!(get_chars(s, 3..7), "😄😁😆😅");
    }
}
rustpython_common/str.rs

rustpython_common/
str.rs