glulx_asm/
strings.rs

1// SPDX-License-Identifier: Apache-2.0 WITH LLVM-Exception
2// Copyright 2024 Daniel Fox Franke.
3
4//! [`Utf32String`] and [`MysteryString`].
5
6use alloc::borrow::Borrow;
7
8use bytes::{Buf, BufMut, Bytes, BytesMut};
9use core::{
10    fmt::{Debug, Display, Formatter, Write},
11    num::NonZeroUsize,
12};
13
14#[cfg(feature = "std")]
15use std::error::Error;
16
/// A string encoded as UTF-32.
///
/// Strings of this type can be serialized into a story file (via the
/// [`Item::Utf32String`](`crate::Item::Utf32String`) constructor) formatted
/// compatibly with the `streamstr` instruction. Constructors ensure that the
/// string is valid Unicode with no embedded nulls. Internally it's a [`Bytes`],
/// so cloning it is cheap.
///
/// The wrapped buffer holds only the character data, four bytes per
/// character; the prefix and null terminator are not stored (compare
/// `byte_len` with `byte_len_with_prefix_and_nul`).
///
/// This is not at all a full-featured alternative to
/// [`std::string::String`](`String`). `Utf32String`s are immutable once
/// constructed and not intended for anything other than being serialized into
/// a story file.
#[derive(Clone, PartialEq, Eq, Hash, Default)]
pub struct Utf32String(Bytes);
30
/// A string whose encoding is defined by the IO system, but *probably* treated
/// as Latin-1.
///
/// Strings of this type can be serialized into a story file (via the
/// [`Item::MysteryString`](`crate::Item::MysteryString`) constructor) formatted
/// compatibly with the `streamstr` instruction. Constructors ensure that it
/// will not contain any embedded nulls. Internally it's a [`Bytes`], so cloning
/// it is cheap.
///
/// This corresponds to a Glulx `E0` string, of which the spec says "the
/// encoding scheme is the business of the I/O system; in Glk, it will be the
/// Latin-1 character set". It is in any case required to be a single-byte
/// encoding which uses a zero byte as a terminator.
///
/// When building a `MysteryString` from a `char` iterator or using its
/// `Display` impl, Latin-1 is assumed. However, you can also build it from a
/// `u8` iterator, in which case no assumption is made about the encoding.
#[derive(Clone, PartialEq, Eq, Hash, Default)]
pub struct MysteryString(Bytes);
50
/// Error returned when constructing a [`Utf32String`] or [`MysteryString`] from
/// malformed input.
///
/// The error still carries a usable, lossy rendition of the input in
/// [`lossy`](Self::lossy), so callers can choose to proceed with it.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub struct StringConversionError<T> {
    /// The number of errors which were encountered when encoding the string.
    pub num_errors: NonZeroUsize,
    /// The index at which the first error was encountered.
    pub first_error: usize,
    /// A lossy representation of the string.
    pub lossy: T,
}

/// Renders a one-line, human-readable summary of the failure. The wording
/// switches between singular and plural depending on `num_errors`; the lossy
/// payload is never printed, so `T` needs no bounds here.
impl<T> Display for StringConversionError<T> {
    // `core::fmt::Result` (rather than `std::fmt::Result`) keeps this impl in
    // line with the module's other `core::fmt` imports and free of any
    // dependency on `std`.
    fn fmt(&self, f: &mut Formatter<'_>) -> core::fmt::Result {
        if self.num_errors.get() > 1 {
            write!(
                f,
                "string conversion encountered {} unrepresentable characters, the first one at index {}.",
                self.num_errors,
                self.first_error
            )
        } else {
            write!(
                f,
                "string conversion encountered an unrepresentable character at index {}.",
                self.first_error
            )
        }
    }
}
81
// Only available with the `std` feature, since that is where the `Error`
// trait is imported from. `T: Debug` is needed because `Error` requires the
// whole struct (including the lossy payload) to be `Debug`.
#[cfg(feature = "std")]
impl<T: Debug> Error for StringConversionError<T> {}
84
85impl Utf32String {
86    /// Construct a `Utf32String` from an iterator over `char`s (or over any type
87    /// that lets you borrow a `char`).
88    ///
89    /// If the string contains embedded nulls, an error is returned, but a lossy
90    /// version can be extracted from the error struct. The lossy string
91    /// replaces nulls with `U+2400 SYMBOL FOR NULL` (␀), which belongs to the
92    /// Control Pictures block, which is a really neat block that I bet you
93    /// didn't know existed.
94    pub fn from_chars<I, C>(chars: I) -> Result<Self, StringConversionError<Self>>
95    where
96        I: IntoIterator<Item = C>,
97        C: Borrow<char>,
98    {
99        let mut num_errors: usize = 0;
100        let mut first_error: usize = usize::MAX;
101
102        let iter = chars.into_iter();
103        let mut bm = BytesMut::with_capacity(4 * iter.size_hint().0);
104
105        for (i, cref) in iter.enumerate() {
106            let c = *cref.borrow();
107            if c == '\0' {
108                bm.put_u32('\u{2400}'.into());
109                num_errors += 1;
110                first_error = first_error.min(i);
111            } else {
112                bm.put_u32(c.into())
113            }
114        }
115
116        if let Some(num_errors) = NonZeroUsize::new(num_errors) {
117            Err(StringConversionError {
118                num_errors,
119                first_error,
120                lossy: Self(bm.freeze()),
121            })
122        } else {
123            Ok(Self(bm.freeze()))
124        }
125    }
126
127    /// Like [`from_chars`](`Self::from_chars`), but in case of error will
128    /// silently unwrap the error and return the lossy version.
129    pub fn from_chars_lossy<I, C>(chars: I) -> Self
130    where
131        I: IntoIterator<Item = C>,
132        C: Borrow<char>,
133    {
134        match Self::from_chars(chars) {
135            Ok(s) => s,
136            Err(e) => e.lossy,
137        }
138    }
139
140    /// Returns true if the string is empty.
141    pub fn is_empty(&self) -> bool {
142        self.0.is_empty()
143    }
144
145    /// Returns the length of the string in characters.
146    pub fn char_len(&self) -> usize {
147        self.0.len() / 4
148    }
149
150    /// Returns the length of the string in bytes, excluding prefix and null
151    /// terminator.
152    pub fn byte_len(&self) -> usize {
153        self.0.len()
154    }
155
156    /// Returns the length of the string in bytes, including prefix and null
157    /// terminator.
158    pub fn byte_len_with_prefix_and_nul(&self) -> usize {
159        self.0.len() + 8
160    }
161
162    /// Returns a clone of the underlying [`Bytes`].
163    pub fn to_bytes(&self) -> Bytes {
164        self.clone().into_bytes()
165    }
166
167    /// Unwraps the string into its underlying [`Bytes`].
168    pub fn into_bytes(self) -> Bytes {
169        self.0
170    }
171}
172
173impl Display for Utf32String {
174    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
175        let mut buf = self.0.clone();
176
177        while buf.has_remaining() {
178            let c: char = buf
179                .get_u32()
180                .try_into()
181                .expect("Utf32String should always contain valid characters");
182            f.write_char(c)?
183        }
184
185        Ok(())
186    }
187}
188
189impl Debug for Utf32String {
190    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
191        let s = self.to_string();
192        f.debug_tuple("Utf32String").field(&s).finish()
193    }
194}
195
196impl TryFrom<String> for Utf32String {
197    type Error = StringConversionError<Utf32String>;
198
199    fn try_from(value: String) -> Result<Self, Self::Error> {
200        Utf32String::from_chars(value.chars())
201    }
202}
203
204impl TryFrom<&String> for Utf32String {
205    type Error = StringConversionError<Utf32String>;
206
207    fn try_from(value: &String) -> Result<Self, Self::Error> {
208        Utf32String::from_chars(value.chars())
209    }
210}
211
212impl TryFrom<&str> for Utf32String {
213    type Error = StringConversionError<Utf32String>;
214
215    fn try_from(value: &str) -> Result<Self, Self::Error> {
216        Utf32String::from_chars(value.chars())
217    }
218}
219
220impl TryFrom<&[char]> for Utf32String {
221    type Error = StringConversionError<Utf32String>;
222
223    fn try_from(value: &[char]) -> Result<Self, Self::Error> {
224        Utf32String::from_chars(value)
225    }
226}
227
228impl MysteryString {
229    /// Constructs a `MysteryString` from an iterator over `char`s (or over any
230    /// type that lets you borrow a `char`).
231    ///
232    /// If the string contains embedded nulls or any character which cannot be
233    /// represented in Latin-1, an error is returned, but a lossy version can be
234    /// extracted from the error struct. The lossy string replaces nulls and
235    /// unrepresentable characters with `b'?'`.
236    pub fn from_chars<I, C>(chars: I) -> Result<Self, StringConversionError<Self>>
237    where
238        I: IntoIterator<Item = C>,
239        C: Borrow<char>,
240    {
241        let mut num_errors: usize = 0;
242        let mut first_error: usize = usize::MAX;
243
244        let iter = chars.into_iter();
245        let mut bm = BytesMut::with_capacity(4 * iter.size_hint().0);
246
247        for (i, cref) in iter.enumerate() {
248            match u8::try_from(*cref.borrow()) {
249                Ok(b) if b != 0 => {
250                    bm.put_u8(b);
251                }
252                _ => {
253                    bm.put_u8(b'?');
254                    num_errors += 1;
255                    first_error = first_error.min(i);
256                }
257            }
258        }
259
260        if let Some(num_errors) = NonZeroUsize::new(num_errors) {
261            Err(StringConversionError {
262                num_errors,
263                first_error,
264                lossy: Self(bm.freeze()),
265            })
266        } else {
267            Ok(Self(bm.freeze()))
268        }
269    }
270
271    /// Constructs a `MysteryString` from an iterator over `u8`s (or over any
272    /// type that lets you borrow a `u8`).
273    ///
274    /// If the string contains embedded nulls, an error is returned, but a lossy
275    /// version can be extracted from the error struct. The lossy string is
276    /// truncated at the first occurence of a null. (Unlike `from_chars`, this
277    /// constructor doesn't that the string is Latin-1 or even any ASCII
278    /// superset, so therefore it can't know what would be a reasonable
279    /// replacement character to substitute.)
280    pub fn from_bytes<I, C>(chars: I) -> Result<Self, StringConversionError<Self>>
281    where
282        I: IntoIterator<Item = C>,
283        C: Borrow<u8>,
284    {
285        let mut failed = false;
286        let mut num_errors: usize = 0;
287        let mut first_error: usize = usize::MAX;
288
289        let iter = chars.into_iter();
290        let mut bm = BytesMut::with_capacity(4 * iter.size_hint().0);
291
292        for (i, bref) in iter.enumerate() {
293            let b = *bref.borrow();
294
295            if b != 0 {
296                // We use this separate boolean rather than testing on
297                // num_errors == 0 so the optimizer can prove that it's
298                // monotonic. The num_errors increment could wrap if we're
299                // handed an infinite iterator.
300                if !failed {
301                    bm.put_u8(b);
302                }
303            } else {
304                failed = true;
305                num_errors += 1;
306                first_error = first_error.min(i);
307            }
308        }
309
310        if let Some(num_errors) = NonZeroUsize::new(num_errors) {
311            Err(StringConversionError {
312                num_errors,
313                first_error,
314                lossy: Self(bm.freeze()),
315            })
316        } else {
317            Ok(Self(bm.freeze()))
318        }
319    }
320
321    /// Like [`from_chars`](`Self::from_chars`), but in case of error will
322    /// silently unwrap the error and return the lossy version.
323    pub fn from_chars_lossy<I, C>(chars: I) -> Self
324    where
325        I: IntoIterator<Item = C>,
326        C: Borrow<char>,
327    {
328        match Self::from_chars(chars) {
329            Ok(s) => s,
330            Err(e) => e.lossy,
331        }
332    }
333
334    /// Like [`from_bytes`](`Self::from_bytes`), but in case of error will
335    /// silently unwrap the error and return the lossy version.
336    pub fn from_bytes_lossy<I, C>(chars: I) -> Self
337    where
338        I: IntoIterator<Item = C>,
339        C: Borrow<u8>,
340    {
341        match Self::from_bytes(chars) {
342            Ok(s) => s,
343            Err(e) => e.lossy,
344        }
345    }
346
347    /// Returns true if the string is empty.
348    pub fn is_empty(&self) -> bool {
349        self.0.is_empty()
350    }
351
352    /// Returns the length of the string, excluding prefix and null terminator.
353    pub fn len(&self) -> usize {
354        self.0.len()
355    }
356
357    /// Returns the length of the string, including prefix and null terminator.
358    pub fn len_with_prefix_and_nul(&self) -> usize {
359        self.len() + 2
360    }
361
362    /// Returns a clone of the string's underlying `Bytes`.
363    pub fn to_bytes(&self) -> Bytes {
364        self.clone().into_bytes()
365    }
366
367    /// Unwraps the string into its underlying `Bytes`.
368    pub fn into_bytes(self) -> Bytes {
369        self.0
370    }
371}
372
373impl Display for MysteryString {
374    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
375        let mut buf = self.0.clone();
376        while buf.has_remaining() {
377            let byte = buf.get_u8();
378            let c = char::from_u32(byte.into()).unwrap();
379            f.write_char(c)?
380        }
381
382        Ok(())
383    }
384}
385
386impl Debug for MysteryString {
387    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
388        let s = self.to_string();
389        f.debug_tuple("MysteryString").field(&s).finish()
390    }
391}
392
393impl TryFrom<String> for MysteryString {
394    type Error = StringConversionError<MysteryString>;
395
396    fn try_from(value: String) -> Result<Self, Self::Error> {
397        MysteryString::from_chars(value.chars())
398    }
399}
400
401impl TryFrom<&String> for MysteryString {
402    type Error = StringConversionError<MysteryString>;
403
404    fn try_from(value: &String) -> Result<Self, Self::Error> {
405        MysteryString::from_chars(value.chars())
406    }
407}
408
409impl TryFrom<&str> for MysteryString {
410    type Error = StringConversionError<MysteryString>;
411
412    fn try_from(value: &str) -> Result<Self, Self::Error> {
413        MysteryString::from_chars(value.chars())
414    }
415}
416
417impl TryFrom<Vec<u8>> for MysteryString {
418    type Error = StringConversionError<MysteryString>;
419
420    fn try_from(value: Vec<u8>) -> Result<Self, Self::Error> {
421        MysteryString::from_bytes(value)
422    }
423}
424
425impl TryFrom<&Vec<u8>> for MysteryString {
426    type Error = StringConversionError<MysteryString>;
427
428    fn try_from(value: &Vec<u8>) -> Result<Self, Self::Error> {
429        MysteryString::from_bytes(value)
430    }
431}
432
433impl TryFrom<&[u8]> for MysteryString {
434    type Error = StringConversionError<MysteryString>;
435
436    fn try_from(value: &[u8]) -> Result<Self, Self::Error> {
437        MysteryString::from_bytes(value)
438    }
439}