Skip to main content

react_compiler_diagnostics/
js_string.rs

1//! A JavaScript string value. JS strings are sequences of UTF-16 code units
2//! with no validity requirement, so a value can contain unpaired surrogate
3//! halves that Rust's `String` cannot represent. `JsString` keeps the common
4//! valid case as UTF-8 and falls back to code units only when the value is
5//! ill-formed, so the compiler computes on true program values instead of
6//! replacement characters or escape hatches.
7//!
8//! Wire format: the babel bridge transports lone surrogates as
9//! `__SURROGATE_XXXX__` markers (see `sanitizeJsonSurrogates` in bridge.ts),
10//! because serde_json can neither parse nor emit a lone `\uXXXX` escape.
11//! Serde for `JsString` decodes and re-emits that marker form, which keeps the
12//! JS side of the bridge unchanged.
13
14use std::fmt;
15
16use serde::Serialize;
17
18/// Invariant: `Repr::Utf8` holds every well-formed value and `Repr::Wtf16`
19/// only ill-formed ones (at least one unpaired surrogate). The derived
20/// `PartialEq`/`Hash` are only sound under this invariant: a well-formed
21/// value smuggled into `Wtf16` would compare unequal to its `Utf8` twin. The
22/// representation is private so the invariant holds by construction; match on
23/// [`JsString::as_ref`] to branch on well-formedness.
24#[derive(Debug, Clone, PartialEq, Eq, Hash)]
25pub struct JsString(Repr);
26
/// Private storage for a `JsString`: UTF-8 text when the value is
/// well-formed, raw UTF-16 code units otherwise.
#[derive(Clone, Debug, Eq, Hash, PartialEq)]
enum Repr {
    /// Well-formed text (no unpaired surrogates), held as UTF-8.
    Utf8(String),
    /// Ill-formed text, held as its UTF-16 code units.
    Wtf16(Vec<u16>),
}
34
/// A borrowed view of a [`JsString`], for callers that need to handle the
/// well-formed and ill-formed cases differently.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum JsStringRef<'a> {
    /// The value is well-formed; this is its UTF-8 text.
    Utf8(&'a str),
    /// The value is ill-formed; these are its UTF-16 code units.
    Wtf16(&'a [u16]),
}
42
43impl JsString {
44    /// Build from UTF-16 code units, normalizing to UTF-8 when well-formed.
45    pub fn from_code_units(units: Vec<u16>) -> Self {
46        match String::from_utf16(&units) {
47            Ok(s) => JsString(Repr::Utf8(s)),
48            Err(_) => JsString(Repr::Wtf16(units)),
49        }
50    }
51
52    pub fn as_ref(&self) -> JsStringRef<'_> {
53        match &self.0 {
54            Repr::Utf8(s) => JsStringRef::Utf8(s),
55            Repr::Wtf16(units) => JsStringRef::Wtf16(units),
56        }
57    }
58
59    /// The UTF-8 view, when the value is well-formed.
60    pub fn as_str(&self) -> Option<&str> {
61        match &self.0 {
62            Repr::Utf8(s) => Some(s),
63            Repr::Wtf16(_) => None,
64        }
65    }
66
67    pub fn code_units(&self) -> Vec<u16> {
68        match &self.0 {
69            Repr::Utf8(s) => s.encode_utf16().collect(),
70            Repr::Wtf16(units) => units.clone(),
71        }
72    }
73
74    /// Length in UTF-16 code units (JS `String.prototype.length`).
75    pub fn len_utf16(&self) -> usize {
76        match &self.0 {
77            Repr::Utf8(s) => s.encode_utf16().count(),
78            Repr::Wtf16(units) => units.len(),
79        }
80    }
81
82    /// The value with unpaired surrogates replaced by U+FFFD, for consumers
83    /// whose string type cannot represent ill-formed values.
84    pub fn to_string_lossy(&self) -> String {
85        match &self.0 {
86            Repr::Utf8(s) => s.clone(),
87            Repr::Wtf16(units) => String::from_utf16_lossy(units),
88        }
89    }
90
91    /// Decode the bridge wire form: a UTF-8 string in which lone surrogates
92    /// appear as `__SURROGATE_XXXX__` markers (uppercase hex, mirroring what
93    /// `sanitizeJsonSurrogates` emits and `restoreJsonSurrogates` accepts).
94    ///
95    /// All scanning is byte-wise: a marker is 18 ASCII bytes, so byte-slice
96    /// comparisons cannot land on a UTF-8 char boundary the way `str` range
97    /// indexing can when multibyte text follows the prefix.
98    pub fn from_marker_string(s: &str) -> Self {
99        const PREFIX: &[u8] = b"__SURROGATE_";
100        const MARKER_LEN: usize = 18;
101        if !s.contains("__SURROGATE_") {
102            return JsString(Repr::Utf8(s.to_string()));
103        }
104        let bytes = s.as_bytes();
105        let mut units: Vec<u16> = Vec::with_capacity(s.len());
106        let mut pos = 0;
107        let mut segment_start = 0;
108        while let Some(found) = s[pos..].find("__SURROGATE_") {
109            let idx = pos + found;
110            let tail = &bytes[idx..];
111            let well_formed = tail.len() >= MARKER_LEN
112                && &tail[MARKER_LEN - 2..MARKER_LEN] == b"__"
113                && tail[PREFIX.len()..PREFIX.len() + 4]
114                    .iter()
115                    .all(|b| b.is_ascii_hexdigit() && !b.is_ascii_lowercase());
116            if well_formed {
117                let hex = std::str::from_utf8(&tail[PREFIX.len()..PREFIX.len() + 4])
118                    .expect("ascii hex is valid utf8");
119                let unit = u16::from_str_radix(hex, 16).expect("validated hex digits");
120                units.extend(s[segment_start..idx].encode_utf16());
121                units.push(unit);
122                pos = idx + MARKER_LEN;
123                segment_start = pos;
124            } else {
125                // Not a well-formed marker: keep the literal text and continue
126                // scanning after the prefix.
127                pos = idx + PREFIX.len();
128            }
129        }
130        units.extend(s[segment_start..].encode_utf16());
131        JsString::from_code_units(units)
132    }
133
134    /// Encode to the bridge wire form (markers for unpaired surrogates).
135    pub fn to_marker_string(&self) -> String {
136        match &self.0 {
137            Repr::Utf8(s) => s.clone(),
138            Repr::Wtf16(units) => {
139                let mut out = String::with_capacity(units.len() * 2);
140                let mut iter = units.iter().copied().peekable();
141                while let Some(unit) = iter.next() {
142                    match unit {
143                        0xD800..=0xDBFF => {
144                            if let Some(&next) = iter.peek() {
145                                if (0xDC00..=0xDFFF).contains(&next) {
146                                    iter.next();
147                                    let cp = 0x10000
148                                        + ((unit as u32 - 0xD800) << 10)
149                                        + (next as u32 - 0xDC00);
150                                    out.push(char::from_u32(cp).expect("valid supplementary"));
151                                    continue;
152                                }
153                            }
154                            out.push_str(&format!("__SURROGATE_{unit:04X}__"));
155                        }
156                        0xDC00..=0xDFFF => {
157                            out.push_str(&format!("__SURROGATE_{unit:04X}__"));
158                        }
159                        _ => {
160                            out.push(
161                                char::from_u32(unit as u32).expect("BMP non-surrogate is a char"),
162                            );
163                        }
164                    }
165                }
166                out
167            }
168        }
169    }
170
171    /// Render as JS-source-style escaped text, matching the form TS's debug
172    /// printer produces via JSON.stringify: unpaired surrogates print as
173    /// lowercase `\udXXX` escapes inside the otherwise UTF-8 text.
174    pub fn to_escaped_string(&self) -> String {
175        match &self.0 {
176            Repr::Utf8(s) => s.clone(),
177            Repr::Wtf16(units) => {
178                let mut out = String::with_capacity(units.len() * 2);
179                let mut iter = units.iter().copied().peekable();
180                while let Some(unit) = iter.next() {
181                    match unit {
182                        0xD800..=0xDBFF => {
183                            if let Some(&next) = iter.peek() {
184                                if (0xDC00..=0xDFFF).contains(&next) {
185                                    iter.next();
186                                    let cp = 0x10000
187                                        + ((unit as u32 - 0xD800) << 10)
188                                        + (next as u32 - 0xDC00);
189                                    out.push(char::from_u32(cp).expect("valid supplementary"));
190                                    continue;
191                                }
192                            }
193                            out.push_str(&format!("\\u{unit:04x}"));
194                        }
195                        0xDC00..=0xDFFF => {
196                            out.push_str(&format!("\\u{unit:04x}"));
197                        }
198                        _ => {
199                            out.push(
200                                char::from_u32(unit as u32).expect("BMP non-surrogate is a char"),
201                            );
202                        }
203                    }
204                }
205                out
206            }
207        }
208    }
209}
210
211impl From<String> for JsString {
212    fn from(s: String) -> Self {
213        // A Rust String is valid UTF-8 and so cannot contain an unpaired
214        // surrogate; constructing Utf8 directly preserves the invariant.
215        JsString(Repr::Utf8(s))
216    }
217}
218
219impl From<&str> for JsString {
220    fn from(s: &str) -> Self {
221        JsString(Repr::Utf8(s.to_string()))
222    }
223}
224
225impl PartialEq<str> for JsString {
226    fn eq(&self, other: &str) -> bool {
227        self.as_str() == Some(other)
228    }
229}
230
231impl PartialEq<&str> for JsString {
232    fn eq(&self, other: &&str) -> bool {
233        self.as_str() == Some(*other)
234    }
235}
236
237impl fmt::Display for JsString {
238    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
239        f.write_str(&self.to_escaped_string())
240    }
241}
242
243impl Serialize for JsString {
244    fn serialize<S: serde::Serializer>(&self, serializer: S) -> Result<S::Ok, S::Error> {
245        serializer.serialize_str(&self.to_marker_string())
246    }
247}
248
#[cfg(test)]
mod tests {
    use super::{JsString, JsStringRef};

    /// UTF-16 encoding of `text`, used to build expected code-unit vectors.
    fn utf16(text: &str) -> Vec<u16> {
        text.encode_utf16().collect()
    }

    #[test]
    fn as_ref_views_match_well_formedness() {
        let plain = JsString::from("plain");
        assert!(matches!(plain.as_ref(), JsStringRef::Utf8("plain")));

        let lone = JsString::from_code_units(vec![0xD83E]);
        assert!(matches!(lone.as_ref(), JsStringRef::Wtf16(&[0xD83E])));

        // Code units that form a well-formed string are normalized to the
        // UTF-8 representation, so the same logical string is the same value
        // no matter which constructor produced it.
        assert_eq!(JsString::from_code_units(utf16("plain")), plain);
    }

    #[test]
    fn marker_round_trip_preserves_lone_surrogates() {
        let wire = "__SURROGATE_D83E__";
        let js = JsString::from_marker_string(wire);
        assert_eq!(js.code_units(), vec![0xD83E]);
        assert_eq!(js.to_marker_string(), wire);
        assert_eq!(js.to_escaped_string(), "\\ud83e");
    }

    #[test]
    fn paired_halves_render_as_the_supplementary_character() {
        let clown = JsString::from_code_units(vec![0xD83E, 0xDD21]);
        assert_eq!(clown.as_str(), Some("\u{1F921}"));
    }

    #[test]
    fn plain_strings_stay_utf8_and_compare_with_str() {
        let directive = JsString::from("use memo");
        assert!(directive == "use memo");
        assert_eq!(directive.to_marker_string(), "use memo");
    }

    #[test]
    fn malformed_marker_text_is_kept_literally() {
        let text = "__SURROGATE_XYZ__";
        assert_eq!(JsString::from_marker_string(text).as_str(), Some(text));
    }

    #[test]
    fn multibyte_text_after_marker_prefix_does_not_panic() {
        // Multibyte characters directly after the prefix.
        let input = "__SURROGATE_\u{20AC}\u{20AC}";
        assert_eq!(JsString::from_marker_string(input).as_str(), Some(input));

        // Input that ends before the marker is complete.
        let truncated = "__SURROGATE_D8";
        assert_eq!(
            JsString::from_marker_string(truncated).as_str(),
            Some(truncated)
        );

        // A real marker surrounded by multibyte text on both sides.
        let mixed = "a\u{20AC}__SURROGATE_D83E__b\u{20AC}";
        let mut expected = utf16("a\u{20AC}");
        expected.push(0xD83E);
        expected.extend(utf16("b\u{20AC}"));
        assert_eq!(JsString::from_marker_string(mixed).code_units(), expected);
    }

    #[test]
    fn lowercase_hex_markers_are_not_decoded() {
        // Only uppercase hex comes from the bridge, so marker-shaped text
        // with lowercase digits is user text and has to be preserved as-is.
        let input = "__SURROGATE_d83e__";
        assert_eq!(JsString::from_marker_string(input).as_str(), Some(input));
    }
}