Skip to main content

obeli_sk_boa_string/
str.rs

1use super::iter::{CodePointsIter, Windows};
2use crate::{CodePoint, Iter, display::JsStrDisplayLossy, is_trimmable_whitespace};
3use std::{
4    hash::{Hash, Hasher},
5    slice::SliceIndex,
6};
7
/// Borrowed storage behind a [`JsStr`]: one of the two supported encodings.
#[derive(Clone, Copy, Debug)]
pub enum JsStrVariant<'a> {
    /// Latin1 data, stored as a slice of `u8` code units.
    Latin1(&'a [u8]),

    /// UTF-16 data, stored as a slice of `u16` code units.
    Utf16(&'a [u16]),
}
17
18impl JsStrVariant<'_> {
19    pub(crate) const fn len(&self) -> usize {
20        match self {
21            JsStrVariant::Latin1(data) => data.len(),
22            JsStrVariant::Utf16(data) => data.len(),
23        }
24    }
25}
26
27/// This is equivalent to Rust's `&str`.
28#[derive(Clone, Copy)]
29#[repr(align(8))]
30pub struct JsStr<'a> {
31    inner: JsStrVariant<'a>,
32}
33
34// SAFETY: Inner<'_> has only immutable references to Sync types (u8/u16), so this is safe.
35unsafe impl Sync for JsStr<'_> {}
36
37// SAFETY: It's read-only, sending this reference to another thread doesn't
38//         risk data races (there’s no mutation happening), so this is safe.
39unsafe impl Send for JsStr<'_> {}
40
41impl<'a> JsStr<'a> {
42    /// This represents an empty string.
43    pub const EMPTY: Self = Self::latin1("".as_bytes());
44
45    /// Creates a [`JsStr`] from codepoints that can fit in a `u8`.
46    #[inline]
47    #[must_use]
48    pub const fn latin1(value: &'a [u8]) -> Self {
49        Self {
50            inner: JsStrVariant::Latin1(value),
51        }
52    }
53
54    /// Creates a [`JsStr`] from utf16 encoded string.
55    #[inline]
56    #[must_use]
57    pub const fn utf16(value: &'a [u16]) -> Self {
58        Self {
59            inner: JsStrVariant::Utf16(value),
60        }
61    }
62
63    /// Get the length of the [`JsStr`].
64    #[inline]
65    #[must_use]
66    pub const fn len(&self) -> usize {
67        self.inner.len()
68    }
69
70    /// Return the inner [`JsStrVariant`] variant of the [`JsStr`].
71    #[inline]
72    #[must_use]
73    pub const fn variant(self) -> JsStrVariant<'a> {
74        self.inner
75    }
76
77    /// Check if the [`JsStr`] is latin1 encoded.
78    #[inline]
79    #[must_use]
80    pub const fn is_latin1(&self) -> bool {
81        matches!(self.inner, JsStrVariant::Latin1(_))
82    }
83
84    /// Returns [`u8`] slice if the [`JsStr`] is latin1 encoded, otherwise [`None`].
85    #[inline]
86    #[must_use]
87    pub const fn as_latin1(&self) -> Option<&[u8]> {
88        match &self.inner {
89            JsStrVariant::Latin1(v) => Some(v),
90            JsStrVariant::Utf16(_) => None,
91        }
92    }
93
94    /// Returns the same string slice but with a static reference, removing any
95    /// lifetime limits.
96    ///
97    /// # Safety
98    /// The caller is responsible to ensure the lifetime of this slice.
99    #[inline]
100    #[must_use]
101    pub unsafe fn as_static(self) -> JsStr<'static> {
102        let inner: JsStrVariant<'static> = match self.inner {
103            JsStrVariant::Latin1(v) => {
104                // SAFETY: Caller is responsible for ensuring the lifetime of this slice.
105                let static_v: &'static [u8] =
106                    unsafe { std::slice::from_raw_parts(v.as_ptr(), v.len()) };
107                JsStrVariant::<'static>::Latin1(static_v)
108            }
109            JsStrVariant::Utf16(v) => {
110                // SAFETY: Caller is responsible for ensuring the lifetime of this slice.
111                let static_v: &'static [u16] =
112                    unsafe { std::slice::from_raw_parts(v.as_ptr(), v.len()) };
113                JsStrVariant::<'static>::Utf16(static_v)
114            }
115        };
116        JsStr::<'static> { inner }
117    }
118
119    /// Iterate over the codepoints of the string.
120    #[inline]
121    #[must_use]
122    pub fn iter(self) -> Iter<'a> {
123        Iter::new(self)
124    }
125
126    /// Iterate over the codepoints of the string.
127    #[inline]
128    #[must_use]
129    pub fn windows(self, size: usize) -> Windows<'a> {
130        Windows::new(self, size)
131    }
132
133    /// Check if the [`JsStr`] is empty.
134    #[inline]
135    #[must_use]
136    pub fn is_empty(&self) -> bool {
137        self.len() == 0
138    }
139
140    /// Returns an element or subslice depending on the type of index, otherwise [`None`].
141    #[inline]
142    #[must_use]
143    pub fn get<I>(self, index: I) -> Option<I::Value>
144    where
145        I: JsSliceIndex<'a>,
146    {
147        JsSliceIndex::get(self, index)
148    }
149
150    /// Get the element at the given index.
151    ///
152    /// # Panics
153    ///
154    /// If the index is out of bounds.
155    #[inline]
156    #[must_use]
157    pub fn get_expect<I>(&self, index: I) -> I::Value
158    where
159        I: JsSliceIndex<'a>,
160    {
161        self.get(index).expect("Index out of bounds")
162    }
163
164    /// Returns an element or subslice depending on the type of index, without doing bounds check.
165    ///
166    /// # Safety
167    ///
168    /// Caller must ensure the index is not out of bounds
169    #[inline]
170    #[must_use]
171    pub unsafe fn get_unchecked<I>(self, index: I) -> I::Value
172    where
173        I: JsSliceIndex<'a>,
174    {
175        // Safety: Caller must ensure the index is not out of bounds
176        unsafe { JsSliceIndex::get_unchecked(self, index) }
177    }
178
179    /// Convert the [`JsStr`] into a [`Vec<U16>`].
180    #[inline]
181    #[must_use]
182    pub fn to_vec(&self) -> Vec<u16> {
183        match self.variant() {
184            JsStrVariant::Latin1(v) => v.iter().copied().map(u16::from).collect(),
185            JsStrVariant::Utf16(v) => v.to_vec(),
186        }
187    }
188
189    /// Returns true if needle is a prefix of the [`JsStr`].
190    #[inline]
191    #[must_use]
192    // We check the size, so this should never panic.
193    #[allow(clippy::missing_panics_doc)]
194    pub fn starts_with(&self, needle: JsStr<'_>) -> bool {
195        let n = needle.len();
196        self.len() >= n && needle == self.get(..n).expect("already checked size")
197    }
198    /// Returns `true` if `needle` is a suffix of the [`JsStr`].
199    #[inline]
200    #[must_use]
201    // We check the size, so this should never panic.
202    #[allow(clippy::missing_panics_doc)]
203    pub fn ends_with(&self, needle: JsStr<'_>) -> bool {
204        let (m, n) = (self.len(), needle.len());
205        m >= n && needle == self.get(m - n..).expect("already checked size")
206    }
207
208    /// Abstract operation `StringIndexOf ( string, searchValue, fromIndex )`
209    ///
210    /// Note: Instead of returning an isize with `-1` as the "not found" value, we make use of the
211    /// type system and return <code>[Option]\<usize\></code> with [`None`] as the "not found" value.
212    ///
213    /// More information:
214    ///  - [ECMAScript reference][spec]
215    ///
216    /// [spec]: https://tc39.es/ecma262/#sec-stringindexof
217    #[inline]
218    #[must_use]
219    pub fn index_of(&self, search_value: JsStr<'_>, from_index: usize) -> Option<usize> {
220        // 1. Assert: Type(string) is String.
221        // 2. Assert: Type(searchValue) is String.
222        // 3. Assert: fromIndex is a non-negative integer.
223
224        // 4. Let len be the length of string.
225        let len = self.len();
226
227        // 5. If searchValue is the empty String and fromIndex ≤ len, return fromIndex.
228        if search_value.is_empty() {
229            return if from_index <= len {
230                Some(from_index)
231            } else {
232                None
233            };
234        }
235
236        // 6. Let searchLen be the length of searchValue.
237        // 7. For each integer i starting with fromIndex such that i ≤ len - searchLen, in ascending order, do
238        // a. Let candidate be the substring of string from i to i + searchLen.
239        // b. If candidate is the same sequence of code units as searchValue, return i.
240        // 8. Return -1.
241        self.windows(search_value.len())
242            .skip(from_index)
243            .position(|s| s == search_value)
244            .map(|i| i + from_index)
245    }
246
247    /// Abstract operation `CodePointAt( string, position )`.
248    ///
249    /// The abstract operation `CodePointAt` takes arguments `string` (a String) and `position` (a
250    /// non-negative integer) and returns a Record with fields `[[CodePoint]]` (a code point),
251    /// `[[CodeUnitCount]]` (a positive integer), and `[[IsUnpairedSurrogate]]` (a Boolean). It
252    /// interprets string as a sequence of UTF-16 encoded code points, as described in 6.1.4, and reads
253    /// from it a single code point starting with the code unit at index `position`.
254    ///
255    /// More information:
256    ///  - [ECMAScript reference][spec]
257    ///
258    /// [spec]: https://tc39.es/ecma262/#sec-codepointat
259    ///
260    /// # Panics
261    ///
262    /// If `position` is smaller than size of string.
263    #[inline]
264    #[must_use]
265    pub fn code_point_at(&self, position: usize) -> CodePoint {
266        // 1. Let size be the length of string.
267        let size = self.len();
268
269        // 2. Assert: position ≥ 0 and position < size.
270        // position >= 0 ensured by position: usize
271        assert!(position < size);
272
273        match self.variant() {
274            JsStrVariant::Latin1(v) => {
275                let code_point = v.get(position).expect("Already checked the size");
276                CodePoint::Unicode(*code_point as char)
277            }
278            // 3. Let first be the code unit at index position within string.
279            // 4. Let cp be the code point whose numeric value is that of first.
280            // 5. If first is not a leading surrogate or trailing surrogate, then
281            // a. Return the Record { [[CodePoint]]: cp, [[CodeUnitCount]]: 1, [[IsUnpairedSurrogate]]: false }.
282            // 6. If first is a trailing surrogate or position + 1 = size, then
283            // a. Return the Record { [[CodePoint]]: cp, [[CodeUnitCount]]: 1, [[IsUnpairedSurrogate]]: true }.
284            // 7. Let second be the code unit at index position + 1 within string.
285            // 8. If second is not a trailing surrogate, then
286            // a. Return the Record { [[CodePoint]]: cp, [[CodeUnitCount]]: 1, [[IsUnpairedSurrogate]]: true }.
287            // 9. Set cp to ! UTF16SurrogatePairToCodePoint(first, second).
288            JsStrVariant::Utf16(v) => {
289                // We can skip the checks and instead use the `char::decode_utf16` function to take care of that for us.
290                let code_point = v
291                    .get(position..=position + 1)
292                    .unwrap_or(&v[position..=position]);
293
294                match char::decode_utf16(code_point.iter().copied())
295                    .next()
296                    .expect("code_point always has a value")
297                {
298                    Ok(c) => CodePoint::Unicode(c),
299                    Err(e) => CodePoint::UnpairedSurrogate(e.unpaired_surrogate()),
300                }
301            }
302        }
303    }
304
305    /// Abstract operation `StringToNumber ( str )`
306    ///
307    /// More information:
308    /// - [ECMAScript reference][spec]
309    ///
310    /// [spec]: https://tc39.es/ecma262/#sec-stringtonumber
311    #[inline]
312    #[must_use]
313    pub fn to_number(&self) -> f64 {
314        // 1. Let text be ! StringToCodePoints(str).
315        // 2. Let literal be ParseText(text, StringNumericLiteral).
316        let Ok(string) = self.to_std_string() else {
317            // 3. If literal is a List of errors, return NaN.
318            return f64::NAN;
319        };
320        // 4. Return StringNumericValue of literal.
321        let string = string.trim_matches(is_trimmable_whitespace);
322        match string {
323            "" => return 0.0,
324            "-Infinity" => return f64::NEG_INFINITY,
325            "Infinity" | "+Infinity" => return f64::INFINITY,
326            _ => {}
327        }
328
329        let mut s = string.bytes();
330        let base = match (s.next(), s.next()) {
331            (Some(b'0'), Some(b'b' | b'B')) => Some(2),
332            (Some(b'0'), Some(b'o' | b'O')) => Some(8),
333            (Some(b'0'), Some(b'x' | b'X')) => Some(16),
334            // Make sure that no further variants of "infinity" are parsed.
335            (Some(b'i' | b'I'), _) => {
336                return f64::NAN;
337            }
338            _ => None,
339        };
340
341        // Parse numbers that begin with `0b`, `0o` and `0x`.
342        if let Some(base) = base {
343            let string = &string[2..];
344            if string.is_empty() {
345                return f64::NAN;
346            }
347
348            // Fast path
349            if let Ok(value) = u32::from_str_radix(string, base) {
350                return f64::from(value);
351            }
352
353            // Slow path
354            let mut value: f64 = 0.0;
355            for c in s {
356                if let Some(digit) = char::from(c).to_digit(base) {
357                    value = value.mul_add(f64::from(base), f64::from(digit));
358                } else {
359                    return f64::NAN;
360                }
361            }
362            return value;
363        }
364
365        fast_float2::parse(string).unwrap_or(f64::NAN)
366    }
367
368    /// Gets an iterator of all the Unicode codepoints of a [`JsStr`].
369    #[inline]
370    #[must_use]
371    pub fn code_points(&self) -> CodePointsIter<'a> {
372        CodePointsIter::new(*self)
373    }
374
375    /// Checks if the [`JsStr`] contains a byte.
376    #[inline]
377    #[must_use]
378    pub fn contains(&self, element: u8) -> bool {
379        match self.variant() {
380            JsStrVariant::Latin1(v) => v.contains(&element),
381            JsStrVariant::Utf16(v) => v.contains(&u16::from(element)),
382        }
383    }
384
385    /// Gets an iterator of all the Unicode codepoints of a [`JsStr`], replacing
386    /// unpaired surrogates with the replacement character. This is faster than
387    /// using [`Self::code_points`].
388    #[inline]
389    pub fn code_points_lossy(self) -> impl Iterator<Item = char> + 'a {
390        char::decode_utf16(self.iter()).map(|res| res.unwrap_or('\u{FFFD}'))
391    }
392
393    /// Decodes a [`JsStr`] into a [`String`], returning an error if it contains any invalid data.
394    ///
395    /// # Errors
396    ///
397    /// [`FromUtf16Error`][std::string::FromUtf16Error] if it contains any invalid data.
398    #[inline]
399    pub fn to_std_string(&self) -> Result<String, std::string::FromUtf16Error> {
400        match self.variant() {
401            JsStrVariant::Latin1(v) => Ok(v.iter().copied().map(char::from).collect()),
402            JsStrVariant::Utf16(v) => String::from_utf16(v),
403        }
404    }
405
406    /// Decodes a [`JsStr`] into a [`String`], replacing invalid data with the
407    /// replacement character U+FFFD.
408    #[inline]
409    #[must_use]
410    pub fn to_std_string_lossy(&self) -> String {
411        self.display_lossy().to_string()
412    }
413
414    /// Gets a displayable lossy string.
415    ///
416    /// This may be faster and has fewer
417    /// allocations than `format!("{}", str.to_string_lossy())` when displaying.
418    #[inline]
419    #[must_use]
420    pub fn display_lossy(&self) -> JsStrDisplayLossy<'a> {
421        JsStrDisplayLossy::from(*self)
422    }
423}
424
425impl Hash for JsStr<'_> {
426    #[inline]
427    fn hash<H: Hasher>(&self, state: &mut H) {
428        // NOTE: The hash function has been inlined to ensure that a hash of latin1 and U16
429        // encoded strings remains the same if they have the same characters
430        match self.variant() {
431            JsStrVariant::Latin1(s) => {
432                state.write_usize(s.len());
433                for elem in s {
434                    state.write_u16(u16::from(*elem));
435                }
436            }
437            JsStrVariant::Utf16(s) => {
438                state.write_usize(s.len());
439                for elem in s {
440                    state.write_u16(*elem);
441                }
442            }
443        }
444    }
445}
446
447impl Ord for JsStr<'_> {
448    #[inline]
449    fn cmp(&self, other: &Self) -> std::cmp::Ordering {
450        match (self.variant(), other.variant()) {
451            (JsStrVariant::Latin1(x), JsStrVariant::Latin1(y)) => x.cmp(y),
452            (JsStrVariant::Utf16(x), JsStrVariant::Utf16(y)) => x.cmp(y),
453            _ => self.iter().cmp(other.iter()),
454        }
455    }
456}
457
458impl Eq for JsStr<'_> {}
459
460impl PartialEq for JsStr<'_> {
461    #[inline]
462    fn eq(&self, other: &Self) -> bool {
463        match (self.variant(), other.variant()) {
464            (JsStrVariant::Latin1(lhs), JsStrVariant::Latin1(rhs)) => return lhs == rhs,
465            (JsStrVariant::Utf16(lhs), JsStrVariant::Utf16(rhs)) => return lhs == rhs,
466            _ => {}
467        }
468        if self.len() != other.len() {
469            return false;
470        }
471        for (x, y) in self.iter().zip(other.iter()) {
472            if x != y {
473                return false;
474            }
475        }
476        true
477    }
478}
479
480impl PartialEq<str> for JsStr<'_> {
481    #[inline]
482    fn eq(&self, other: &str) -> bool {
483        match self.variant() {
484            JsStrVariant::Latin1(v) => v == other.as_bytes(),
485            JsStrVariant::Utf16(v) => other.encode_utf16().zip(v).all(|(a, b)| a == *b),
486        }
487    }
488}
489
490impl PartialEq<&str> for JsStr<'_> {
491    #[inline]
492    fn eq(&self, other: &&str) -> bool {
493        self == *other
494    }
495}
496
497impl<'a> PartialEq<JsStr<'a>> for [u16] {
498    #[inline]
499    fn eq(&self, other: &JsStr<'a>) -> bool {
500        if self.len() != other.len() {
501            return false;
502        }
503        for (x, y) in self.iter().copied().zip(other.iter()) {
504            if x != y {
505                return false;
506            }
507        }
508        true
509    }
510}
511
512impl std::fmt::Debug for JsStr<'_> {
513    #[inline]
514    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
515        f.debug_struct("JsStr").field("len", &self.len()).finish()
516    }
517}
518
519pub trait JsSliceIndex<'a>: SliceIndex<[u8]> + SliceIndex<[u16]> {
520    type Value;
521
522    fn get(_: JsStr<'a>, index: Self) -> Option<Self::Value>;
523
524    unsafe fn get_unchecked(value: JsStr<'a>, index: Self) -> Self::Value;
525}
526
527impl<'a> JsSliceIndex<'a> for usize {
528    type Value = u16;
529
530    #[inline]
531    fn get(value: JsStr<'a>, index: Self) -> Option<Self::Value> {
532        match value.variant() {
533            JsStrVariant::Latin1(v) => v.get(index).copied().map(u16::from),
534            JsStrVariant::Utf16(v) => v.get(index).copied(),
535        }
536    }
537
538    /// # Safety
539    ///
540    /// Caller must ensure the index is not out of bounds
541    #[inline]
542    unsafe fn get_unchecked(value: JsStr<'a>, index: Self) -> Self::Value {
543        // Safety: Caller must ensure the index is not out of bounds
544        unsafe {
545            match value.variant() {
546                JsStrVariant::Latin1(v) => u16::from(*v.get_unchecked(index)),
547                JsStrVariant::Utf16(v) => *v.get_unchecked(index),
548            }
549        }
550    }
551}
552
553impl<'a> JsSliceIndex<'a> for std::ops::Range<usize> {
554    type Value = JsStr<'a>;
555
556    #[inline]
557    fn get(value: JsStr<'a>, index: Self) -> Option<Self::Value> {
558        match value.variant() {
559            JsStrVariant::Latin1(v) => v.get(index).map(JsStr::latin1),
560            JsStrVariant::Utf16(v) => v.get(index).map(JsStr::utf16),
561        }
562    }
563
564    /// # Safety
565    ///
566    /// Caller must ensure the index is not out of bounds
567    #[inline]
568    unsafe fn get_unchecked(value: JsStr<'a>, index: Self) -> Self::Value {
569        // Safety: Caller must ensure the index is not out of bounds
570        unsafe {
571            match value.variant() {
572                JsStrVariant::Latin1(v) => JsStr::latin1(v.get_unchecked(index)),
573                JsStrVariant::Utf16(v) => JsStr::utf16(v.get_unchecked(index)),
574            }
575        }
576    }
577}
578
579impl<'a> JsSliceIndex<'a> for std::ops::RangeInclusive<usize> {
580    type Value = JsStr<'a>;
581
582    #[inline]
583    fn get(value: JsStr<'a>, index: Self) -> Option<Self::Value> {
584        match value.variant() {
585            JsStrVariant::Latin1(v) => v.get(index).map(JsStr::latin1),
586            JsStrVariant::Utf16(v) => v.get(index).map(JsStr::utf16),
587        }
588    }
589
590    /// # Safety
591    ///
592    /// Caller must ensure the index is not out of bounds
593    #[inline]
594    unsafe fn get_unchecked(value: JsStr<'a>, index: Self) -> Self::Value {
595        // Safety: Caller must ensure the index is not out of bounds
596        unsafe {
597            match value.variant() {
598                JsStrVariant::Latin1(v) => JsStr::latin1(v.get_unchecked(index)),
599                JsStrVariant::Utf16(v) => JsStr::utf16(v.get_unchecked(index)),
600            }
601        }
602    }
603}
604
605impl<'a> JsSliceIndex<'a> for std::ops::RangeFrom<usize> {
606    type Value = JsStr<'a>;
607
608    #[inline]
609    fn get(value: JsStr<'a>, index: Self) -> Option<Self::Value> {
610        match value.variant() {
611            JsStrVariant::Latin1(v) => v.get(index).map(JsStr::latin1),
612            JsStrVariant::Utf16(v) => v.get(index).map(JsStr::utf16),
613        }
614    }
615
616    /// # Safety
617    ///
618    /// Caller must ensure the index is not out of bounds
619    #[inline]
620    unsafe fn get_unchecked(value: JsStr<'a>, index: Self) -> Self::Value {
621        // Safety: Caller must ensure the index is not out of bounds
622        unsafe {
623            match value.variant() {
624                JsStrVariant::Latin1(v) => JsStr::latin1(v.get_unchecked(index)),
625                JsStrVariant::Utf16(v) => JsStr::utf16(v.get_unchecked(index)),
626            }
627        }
628    }
629}
630
631impl<'a> JsSliceIndex<'a> for std::ops::RangeTo<usize> {
632    type Value = JsStr<'a>;
633
634    #[inline]
635    fn get(value: JsStr<'a>, index: Self) -> Option<Self::Value> {
636        match value.variant() {
637            JsStrVariant::Latin1(v) => v.get(index).map(JsStr::latin1),
638            JsStrVariant::Utf16(v) => v.get(index).map(JsStr::utf16),
639        }
640    }
641
642    /// # Safety
643    ///
644    /// Caller must ensure the index is not out of bounds
645    #[inline]
646    unsafe fn get_unchecked(value: JsStr<'a>, index: Self) -> Self::Value {
647        // Safety: Caller must ensure the index is not out of bounds
648        unsafe {
649            match value.variant() {
650                JsStrVariant::Latin1(v) => JsStr::latin1(v.get_unchecked(index)),
651                JsStrVariant::Utf16(v) => JsStr::utf16(v.get_unchecked(index)),
652            }
653        }
654    }
655}
656
657impl<'a> JsSliceIndex<'a> for std::ops::RangeFull {
658    type Value = JsStr<'a>;
659
660    #[inline]
661    fn get(value: JsStr<'a>, _index: Self) -> Option<Self::Value> {
662        Some(value)
663    }
664
665    /// # Safety
666    ///
667    /// Caller must ensure the index is not out of bounds
668    #[inline]
669    unsafe fn get_unchecked(value: JsStr<'a>, _index: Self) -> Self::Value {
670        value
671    }
672}