utf16string/
wstr.rs

1//! Implementations for the [`WStr`] type.
2//!
3//! The type itself lives in the `lib.rs` file to avoid having to have a public alias, but
4//! implementations live here.
5
6use std::fmt;
7
8use byteorder::{BigEndian, ByteOrder, LittleEndian};
9
10use crate::slicing::SliceIndex;
11use crate::utf16::{is_trailing_surrogate, validate_raw_utf16};
12use crate::{Utf16Error, WStr, WStrCharIndices, WStrChars};
13
14impl WStr<LittleEndian> {
15    /// Creates a new `&WStr<LE>`.
16    pub fn from_utf16le(raw: &[u8]) -> Result<&Self, Utf16Error> {
17        Self::from_utf16(raw)
18    }
19
20    /// Creates a new `&mut WStr<LE>`.
21    pub fn from_utf16le_mut(raw: &mut [u8]) -> Result<&mut Self, Utf16Error> {
22        Self::from_utf16_mut(raw)
23    }
24
25    /// Creates a new [WStr] with little-endian byte-ordering.
26    ///
27    /// This is a shortcut to easily create `WStr<LE>` without having to specify the type
28    /// explicitly.
29    ///
30    /// # Example
31    ///
32    /// ```
33    /// use utf16string::{LE, WStr};
34    ///
35    /// let b = b"h\x00i\x00";
36    /// let s: &WStr<LE> = unsafe { WStr::from_utf16_unchecked(b) };
37    /// let t = unsafe { WStr::from_utf16le_unchecked(b) };
38    /// assert_eq!(s, t);
39    /// ```
40    ///
41    /// # Safety
42    ///
43    /// You must guarantee that the buffer passed in is encoded correctly as UTF-16 with
44    /// little-endian byte-order, otherwise you will get undefined behaviour.
45    pub unsafe fn from_utf16le_unchecked(raw: &[u8]) -> &Self {
46        Self::from_utf16_unchecked(raw)
47    }
48
49    /// Creates a new `&mut WStr<LE>`.
50    ///
51    /// # Safety
52    ///
53    /// You must guarantee that the buffer passed in is encoded correctly as UTF-16 with
54    /// little-endian byte-order, otherwise you will get undefined behaviour.
55    pub unsafe fn from_utf16le_unchecked_mut(raw: &mut [u8]) -> &mut Self {
56        Self::from_utf16_unchecked_mut(raw)
57    }
58}
59
60impl WStr<BigEndian> {
61    /// Creates a new `&WStr<BE>`.
62    pub fn from_utf16be(raw: &[u8]) -> Result<&Self, Utf16Error> {
63        Self::from_utf16(raw)
64    }
65
66    /// Creates a new `&mut WStr<BE>`.
67    pub fn from_utf16be_mut(raw: &mut [u8]) -> Result<&mut Self, Utf16Error> {
68        Self::from_utf16_mut(raw)
69    }
70
71    /// Creates a new `&WStr<BE>` from an existing byte-slice with big-endian byte-ordering.
72    ///
73    /// This is a shortcut to easily create `WStr<BE>` without having to specify the type
74    /// explicitly.
75    ///
76    /// # Example
77    ///
78    /// ```
79    /// use utf16string::{BE, WStr};
80    ///
81    /// let b = b"h\x00i\x00";
82    /// let s: &WStr<BE> = unsafe { WStr::from_utf16_unchecked(b) };
83    /// let t = unsafe { WStr::from_utf16be_unchecked(b) };
84    /// assert_eq!(s, t);
85    /// ```
86    ///
87    /// # Safety
88    ///
89    /// You must guarantee that the buffer passed in is encoded correctly as UTF-16 with
90    /// big-endian byte-order, otherwise you will get undefined behaviour.
91    pub unsafe fn from_utf16be_unchecked(raw: &[u8]) -> &Self {
92        Self::from_utf16_unchecked(raw)
93    }
94
95    /// Creates a new `&mut WStr<BE>`.
96    ///
97    /// # Safety
98    ///
99    /// You must guarantee that the buffer passed in is encoded correctly as UTF-16 with
100    /// big-endian byte-order, otherwise you will get undefined behaviour.
101    pub unsafe fn from_utf16be_unchecked_mut(raw: &mut [u8]) -> &mut Self {
102        Self::from_utf16_unchecked_mut(raw)
103    }
104}
105
106impl<E> WStr<E>
107where
108    E: ByteOrder,
109{
110    /// Creates a new `&WStr<E>` from an existing UTF-16 byte-slice.
111    ///
112    /// If the byte-slice is not valid [`Utf16Error`] is returned.
113    pub fn from_utf16(raw: &[u8]) -> Result<&Self, Utf16Error> {
114        validate_raw_utf16::<E>(raw)?;
115        Ok(unsafe { Self::from_utf16_unchecked(raw) })
116    }
117
118    /// Creates a new `&mut WStr<E>` from an existing UTF-16 byte-slice.
119    ///
120    /// If the byte-slice is not valid [`Utf16Error`] is returned.
121    pub fn from_utf16_mut(raw: &mut [u8]) -> Result<&mut Self, Utf16Error> {
122        validate_raw_utf16::<E>(raw)?;
123        Ok(unsafe { Self::from_utf16_unchecked_mut(raw) })
124    }
125
126    /// Creates a new `&WStr<E>` from an existing UTF-16 byte-slice.
127    ///
128    /// # Safety
129    ///
130    /// You must guarantee that the buffer passed in is encoded correctly otherwise you will
131    /// get undefined behaviour.  Be aware of the byte-level endianess.
132    pub unsafe fn from_utf16_unchecked(raw: &[u8]) -> &Self {
133        &*(raw as *const [u8] as *const Self)
134    }
135
136    /// Like [`WStr::from_utf16_unchecked`] but return a mutable reference.
137    ///
138    /// # Safety
139    ///
140    /// You must guarantee that the buffer passed in is encoded correctly otherwise you will
141    /// get undefined behaviour.
142    pub unsafe fn from_utf16_unchecked_mut(raw: &mut [u8]) -> &mut Self {
143        &mut *(raw as *mut [u8] as *mut Self)
144    }
145
146    /// The length in bytes, not chars or graphemes.
147    #[inline]
148    pub fn len(&self) -> usize {
149        self.raw.len()
150    }
151
152    /// Returns `true` if the [WStr::len] is zero.
153    #[inline]
154    pub fn is_empty(&self) -> bool {
155        self.len() == 0
156    }
157
158    /// Returns `true` if the index into the bytes is on a char boundary.
159    #[inline]
160    pub fn is_char_boundary(&self, index: usize) -> bool {
161        if index == 0 || index == self.len() {
162            return true;
163        }
164        if index % 2 != 0 || index > self.len() {
165            return false;
166        }
167
168        // Since we always have a valid UTF-16 string in here we now are sure we always
169        // have a byte at index + 1.  The only invalid thing now is a trailing surrogate.
170        let code_unit = E::read_u16(&self.raw[index..]);
171        !is_trailing_surrogate(code_unit)
172    }
173
174    /// Converts to a byte slice.
175    #[inline]
176    pub fn as_bytes(&self) -> &[u8] {
177        &self.raw
178    }
179
180    /// Converts to a mutable byte slice.
181    ///
182    /// # Safety
183    ///
184    /// When mutating the bytes it must still be valid encoded UTF-16 with the correct
185    /// byte-order, otherwise you will get undefined behaviour.
186    #[inline]
187    pub unsafe fn as_bytes_mut(&mut self) -> &mut [u8] {
188        &mut self.raw
189    }
190
191    /// Converts to a raw pointer to the byte slice.
192    ///
193    /// This is currently not `const fn` because this is not yet stable with a trait bound.
194    #[inline]
195    pub fn as_ptr(&self) -> *const u8 {
196        self.raw.as_ptr()
197    }
198
199    /// Converts to a mutable raw pointer to the byte slice.
200    #[inline]
201    pub fn as_mut_ptr(&mut self) -> *mut u8 {
202        self.raw.as_mut_ptr()
203    }
204
205    /// Returns a subslice of `self`.
206    ///
207    /// The slice indices are on byte offsets of the underlying UTF-16 encoded buffer, if
208    /// the subslice is not on character boundaries or otherwise invalid this will return
209    /// [`None`].
210    #[inline]
211    pub fn get<I>(&self, index: I) -> Option<&<I as SliceIndex<WStr<E>>>::Output>
212    where
213        I: SliceIndex<WStr<E>>,
214    {
215        index.get(self)
216    }
217
218    /// Returns a mutable subslice of `self`.
219    ///
220    /// The slice indices are on byte offsets of the underlying UTF-16 encoded buffer, if
221    /// the subslice is not on character boundaries or otherwise invalid this will return
222    /// [`None`].
223    #[inline]
224    pub fn get_mut<I>(&mut self, index: I) -> Option<&mut <I as SliceIndex<WStr<E>>>::Output>
225    where
226        I: SliceIndex<WStr<E>>,
227    {
228        index.get_mut(self)
229    }
230
231    /// Returns a subslice of `self`.
232    ///
233    /// # Safety
234    ///
235    /// Like [`WStr::get`] but this results in undefined behaviour if the sublice is not on
236    /// character boundaries or otherwise invalid.
237    #[inline]
238    pub unsafe fn get_unchecked<I>(&self, index: I) -> &<I as SliceIndex<WStr<E>>>::Output
239    where
240        I: SliceIndex<WStr<E>>,
241    {
242        index.get_unchecked(self)
243    }
244
245    /// Returns a mutable subslice of `self`.
246    ///
247    /// # Safety
248    ///
249    /// Lice [`WStr::get_mut`] but this results in undefined behaviour if the subslice is
250    /// not on character boundaries or otherwise invalid.
251    #[inline]
252    pub unsafe fn get_unchecked_mut<I>(
253        &mut self,
254        index: I,
255    ) -> &mut <I as SliceIndex<WStr<E>>>::Output
256    where
257        I: SliceIndex<WStr<E>>,
258    {
259        index.get_unchecked_mut(self)
260    }
261
262    /// Returns an iterator of the [`char`]s of a string slice.
263    #[inline]
264    pub fn chars(&self) -> WStrChars<E> {
265        WStrChars {
266            chunks: self.raw.chunks_exact(2),
267            _endian: self._endian,
268        }
269    }
270
271    /// Returns and iterator over the [`char`]s of a string slice and their positions.
272    #[inline]
273    pub fn char_indices(&self) -> WStrCharIndices<E> {
274        WStrCharIndices {
275            chars: self.chars(),
276            index: 0,
277        }
278    }
279
280    /// Returns this [`WStr`] as a new owned [`String`].
281    pub fn to_utf8(&self) -> String {
282        self.chars().collect()
283    }
284
285    /// Returns `true` if all characters in the string are ASCII.
286    #[inline]
287    pub fn is_ascii(&self) -> bool {
288        self.as_bytes().is_ascii()
289    }
290}
291
292impl<E> AsRef<[u8]> for WStr<E>
293where
294    E: ByteOrder,
295{
296    #[inline]
297    fn as_ref(&self) -> &[u8] {
298        self.as_bytes()
299    }
300}
301
302impl<E> fmt::Display for WStr<E>
303where
304    E: ByteOrder,
305{
306    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
307        write!(f, "{}", self.to_utf8())
308    }
309}
310
#[cfg(test)]
mod tests {
    use super::*;

    /// "hello" encoded as UTF-16LE; the fixture shared by most tests.
    const HELLO_LE: &[u8; 10] = b"h\x00e\x00l\x00l\x00o\x00";

    /// "\u{10000}\u{78}" encoded as UTF-16LE: a surrogate pair followed by `x`.
    const PAIR_THEN_X_LE: &[u8; 6] = b"\x00\xd8\x00\xdcx\x00";

    #[test]
    fn test_wstr_from_utf16le() {
        let hello = WStr::from_utf16le(HELLO_LE).unwrap();
        assert_eq!(hello.to_utf8(), "hello");

        // Odd number of bytes
        assert!(WStr::from_utf16le(b"h\x00e\x00l\x00l\x00o").is_err());

        // Lone leading surrogate
        assert!(WStr::from_utf16le(b"\x00\xd8x\x00").is_err());

        // Lone trailing surrogate
        assert!(WStr::from_utf16le(b"\x00\xdcx\x00").is_err());
    }

    #[test]
    fn test_wstr_from_utf16le_unchecked() {
        let hello = unsafe { WStr::from_utf16le_unchecked(HELLO_LE) };
        assert_eq!(hello.to_utf8(), "hello");
    }

    #[test]
    fn test_wstr_len() {
        let hello = WStr::from_utf16le(HELLO_LE).unwrap();
        assert_eq!(hello.len(), HELLO_LE.len());
    }

    #[test]
    fn test_wstr_is_empty() {
        let hello = WStr::from_utf16le(HELLO_LE).unwrap();
        assert!(!hello.is_empty());

        let empty = WStr::from_utf16le(b"").unwrap();
        assert!(empty.is_empty());
    }

    #[test]
    fn test_wstr_is_char_boundary() {
        let s = WStr::from_utf16le(PAIR_THEN_X_LE).unwrap();

        // Expected answer for byte index 0 through 7; index 7 is out of range.
        let expected = [true, false, false, false, true, false, true, false];
        for (index, &want) in expected.iter().enumerate() {
            assert_eq!(s.is_char_boundary(index), want, "byte index {}", index);
        }
    }

    #[test]
    fn test_wstr_as_bytes() {
        let hello = WStr::from_utf16le(HELLO_LE).unwrap();
        assert_eq!(hello.as_bytes(), HELLO_LE);
    }

    #[test]
    fn test_wstr_as_bytes_mut() {
        let world = b"w\x00o\x00r\x00l\x00d\x00";
        let mut storage = HELLO_LE.to_vec();
        let s = WStr::from_utf16le_mut(&mut storage[..]).unwrap();
        unsafe {
            s.as_bytes_mut().copy_from_slice(world);
        }
        assert_eq!(storage.as_slice(), world);
    }

    #[test]
    fn test_wstr_get() {
        // This is implemented with get_unchecked() so this is also already tested.
        let hello = WStr::from_utf16le(HELLO_LE).unwrap();

        let prefix = hello.get(0..8).expect("expected Some(&WStr)");
        assert_eq!(prefix.as_bytes(), b"h\x00e\x00l\x00l\x00");

        // A range starting on an odd byte offset is not on a char boundary.
        assert!(hello.get(1..8).is_none());
    }

    #[test]
    fn test_wstr_get_mut() {
        // This is implemented with get_unchecked_mut() so this is also already tested.
        let mut storage = HELLO_LE.to_vec();
        let s = WStr::from_utf16le_mut(&mut storage[..]).unwrap();

        let first = s.get_mut(0..2).expect("expected Some(&mut Wstr)");
        unsafe {
            first.as_bytes_mut().copy_from_slice(b"x\x00");
        }

        assert_eq!(s.as_bytes(), b"x\x00e\x00l\x00l\x00o\x00");
    }

    #[test]
    fn test_wstr_slice() {
        let hello = WStr::from_utf16le(HELLO_LE).unwrap();
        let middle = &hello[2..8];
        assert_eq!(middle.as_bytes(), b"e\x00l\x00l\x00");
    }

    #[test]
    #[should_panic]
    fn test_wstr_bad_index() {
        let hello = WStr::from_utf16le(HELLO_LE).unwrap();
        let _r = &hello[2..7];
    }

    #[test]
    fn test_wstr_to_utf8() {
        let hello = WStr::from_utf16le(HELLO_LE).unwrap();
        let decoded: String = hello.to_utf8();
        assert_eq!(decoded, "hello");
    }

    #[test]
    fn test_wstr_is_ascii() {
        let hello = WStr::from_utf16le(HELLO_LE).unwrap();
        assert!(hello.is_ascii());

        let with_pair = WStr::from_utf16le(PAIR_THEN_X_LE).unwrap();
        assert!(!with_pair.is_ascii());
    }

    #[test]
    fn test_wstr_as_ref() {
        let hello = WStr::from_utf16le(HELLO_LE).unwrap();
        let bytes: &[u8] = hello.as_ref();
        assert_eq!(bytes, HELLO_LE);
    }

    #[test]
    fn test_display() {
        let hello = WStr::from_utf16le(HELLO_LE).unwrap();
        assert_eq!(format!("{}", hello), "hello");
    }
}