utf16string/
wstring.rs

1//! Implementations for the [`WString`] type.
2//!
3//! The type itself lives in the `lib.rs` file to avoid having to have a public alias, but
4//! implementations live here.
5
6use std::marker::PhantomData;
7use std::ops::{Deref, DerefMut};
8
9use byteorder::{BigEndian, ByteOrder, LittleEndian};
10
11use crate::utf16::{validate_raw_utf16, Utf16CharExt};
12use crate::{Utf16Error, WStr, WString};
13
14impl WString<LittleEndian> {
15    /// Creates a new [`WString`] from raw bytes in little-endian byte order.
16    pub fn from_utf16le(buf: Vec<u8>) -> Result<Self, Utf16Error> {
17        Self::from_utf16(buf)
18    }
19
20    /// Converts a vector of bytes to a [`WString`], not checking validity.
21    ///
22    /// # Safety
23    ///
24    /// You must ensure the vector contains already valid UTF-16 with little-endian
25    /// byte-order, otherwise you will get undefined behaviour.
26    #[inline]
27    pub unsafe fn from_utf16le_unchecked(buf: Vec<u8>) -> Self {
28        Self::from_utf16_unchecked(buf)
29    }
30}
31
32impl WString<BigEndian> {
33    /// Creates a new [`WString`] from raw bytes in big-endian byte-order.
34    pub fn from_utf16be(buf: Vec<u8>) -> Result<Self, Utf16Error> {
35        Self::from_utf16(buf)
36    }
37
38    /// Converts a vector of bytes to a [`WString`], not checking validity.
39    ///
40    /// # Safety
41    ///
42    /// You must ensure the vector contains already valid UTF-16 with big-endian byte-order,
43    /// otherwise you will get undefined behaviour.
44    #[inline]
45    pub unsafe fn from_utf16be_unchecked(buf: Vec<u8>) -> Self {
46        Self::from_utf16_unchecked(buf)
47    }
48}
49
50impl<E> WString<E>
51where
52    E: ByteOrder,
53{
54    /// Creates a new empty [`WString`].
55    #[inline]
56    pub fn new() -> Self {
57        Self {
58            buf: Vec::new(),
59            _endian: PhantomData,
60        }
61    }
62
63    /// Creates a new empty [`WString`] with a capacity.
64    #[inline]
65    pub fn with_capacity(capacity: usize) -> Self {
66        Self {
67            buf: Vec::with_capacity(capacity),
68            _endian: PhantomData,
69        }
70    }
71
72    /// Converts a vector of bytes to a [`WString`].
73    #[inline]
74    pub fn from_utf16(buf: Vec<u8>) -> Result<Self, Utf16Error> {
75        validate_raw_utf16::<E>(buf.as_slice())?;
76        Ok(Self {
77            buf,
78            _endian: PhantomData,
79        })
80    }
81
82    /// Converts a vector of bytes to a [`WString`], not checking validity.
83    ///
84    /// # Safety
85    ///
86    /// You must ensure the vector contains already valid UTF-16 in the correct byte-order,
87    /// otherwise you will get undefined behaviour.
88    #[inline]
89    pub unsafe fn from_utf16_unchecked(buf: Vec<u8>) -> Self {
90        Self {
91            buf,
92            _endian: PhantomData,
93        }
94    }
95
96    /// Converts this string into a byte vector.
97    #[inline]
98    pub fn into_bytes(self) -> Vec<u8> {
99        self.buf
100    }
101
102    /// Returns a `&WStr` slice containing the entire string.
103    #[inline]
104    pub fn as_wstr(&self) -> &WStr<E> {
105        self
106    }
107
108    /// Returns a `&mut WStr` slice containing the entire string.
109    #[inline]
110    pub fn as_mut_wstr(&mut self) -> &mut WStr<E> {
111        self
112    }
113
114    /// Appends a string slice onto the end of this string.
115    #[inline]
116    pub fn push_wstr(&mut self, string: &WStr<E>) {
117        self.buf.extend_from_slice(string.as_bytes())
118    }
119
120    /// Returns the capacity in bytes.
121    #[inline]
122    pub fn capacity(&self) -> usize {
123        self.buf.capacity()
124    }
125
126    /// Ensure that this string has spare capacity of at least `additional` bytes.
127    #[inline]
128    pub fn reserve(&mut self, additional: usize) {
129        self.buf.reserve(additional)
130    }
131
132    /// Shrinks the capacity of this string to match its length.
133    #[inline]
134    pub fn shrink_to_fit(&mut self) {
135        self.buf.shrink_to_fit()
136    }
137
138    /// Appends the given [`char`] to the end of this string.
139    #[inline]
140    pub fn push(&mut self, ch: char) {
141        let mut buf = [0u8; 4];
142        let byte_count = ch.encode_utf16_into::<E>(&mut buf);
143        self.buf.extend_from_slice(&buf[..byte_count]);
144    }
145
146    /// Shortens this string to the specified length.
147    ///
148    /// The `new_len` is specified in bytes and not characters, just as [WString::len]
149    /// returns the length in bytes.  If `new_len` is greater than the string's current
150    /// length, this has no effect.
151    ///
152    /// Note that this method has no effect on the allocated capacity of the string.
153    ///
154    /// # Panics
155    ///
156    /// Panics if `new_len` does not lie on a [char] boundary.
157    #[inline]
158    pub fn truncate(&mut self, new_len: usize) {
159        if new_len < self.len() {
160            assert!(
161                self.is_char_boundary(new_len),
162                "new WString length not on char boundary"
163            );
164            self.buf.truncate(new_len)
165        }
166    }
167
168    /// Removes the last character from the string buffer and returns it.
169    ///
170    /// Returns [`None`] if this string is empty.
171    #[inline]
172    pub fn pop(&mut self) -> Option<char> {
173        let ch = self.chars().next_back()?;
174        let newlen = self.len() - ch.encoded_utf16_len();
175        unsafe {
176            self.buf.set_len(newlen);
177        }
178        Some(ch)
179    }
180
181    /// Removes a [`char`] from this string at the given byte position and returns it.
182    ///
183    /// This is an `O(n)` operation as it requires copying every element in the buffer.
184    ///
185    /// # Panics
186    ///
187    /// Panics if `idx` is larger than or equal to the string's length, or if it does not
188    /// lie on a [`char`] boundary.
189    #[inline]
190    pub fn remove(&mut self, idx: usize) -> char {
191        let ch = match self[idx..].chars().next() {
192            Some(ch) => ch,
193            None => panic!("cannot remove a char from the end of a string"),
194        };
195        let next = idx + ch.encoded_utf16_len();
196        let len = self.len();
197        unsafe {
198            std::ptr::copy(
199                self.buf.as_ptr().add(next),
200                self.buf.as_mut_ptr().add(idx),
201                len - next,
202            );
203            self.buf.set_len(len - (next - idx));
204        }
205        ch
206    }
207
208    /// Retains only the characters specified by the predicate.
209    #[inline]
210    pub fn retain<F>(&mut self, mut f: F)
211    where
212        F: FnMut(char) -> bool,
213    {
214        let len = self.len();
215        let mut del_bytes = 0;
216        let mut idx = 0;
217
218        while idx < len {
219            let ch = unsafe { self.get_unchecked(idx..len).chars().next().unwrap() };
220            let ch_len = ch.encoded_utf16_len();
221
222            if !f(ch) {
223                del_bytes += ch_len;
224            } else if del_bytes > 0 {
225                unsafe {
226                    std::ptr::copy(
227                        self.buf.as_ptr().add(idx),
228                        self.buf.as_mut_ptr().add(idx - del_bytes),
229                        ch_len,
230                    );
231                }
232            }
233            idx += ch_len;
234        }
235
236        if del_bytes > 0 {
237            unsafe { self.buf.set_len(len - del_bytes) }
238        }
239    }
240
241    /// Inserts a [`char`] into this string at the given byte position.
242    ///
243    /// This is an `O(n)` operation as it requires copying every element in the buffer.
244    ///
245    /// # Panics
246    ///
247    /// Panics if `idx` is larger than the string's length or if it does not lie on a
248    /// [`char`] boundary.
249    #[inline]
250    pub fn insert(&mut self, idx: usize, ch: char) {
251        assert!(self.is_char_boundary(idx), "insert not on char boundary");
252        let mut buf = [0u8; 4];
253        let len = ch.encode_utf16_into::<E>(&mut buf);
254
255        unsafe {
256            self.insert_bytes(idx, &buf[..len]);
257        }
258    }
259
260    unsafe fn insert_bytes(&mut self, idx: usize, bytes: &[u8]) {
261        #![allow(unused_unsafe)]
262        let orig_len = self.len();
263        let len_bytes = bytes.len();
264        self.buf.reserve(len_bytes);
265
266        unsafe {
267            std::ptr::copy(
268                self.buf.as_ptr().add(idx),
269                self.buf.as_mut_ptr().add(idx + len_bytes),
270                orig_len - idx,
271            );
272            std::ptr::copy(bytes.as_ptr(), self.buf.as_mut_ptr().add(idx), len_bytes);
273            self.buf.set_len(orig_len + len_bytes);
274        }
275    }
276
277    /// Inserts a string slice into this string at the given byte position.
278    ///
279    /// This is an `O(n)` operation as it requires copying every element in the buffer.  The
280    /// string slice must have the same endianess.
281    ///
282    /// # Panics
283    ///
284    /// Panics if `idx` is larger than the string's length or if it does not lie on a
285    /// [`char`] boundary.
286    #[inline]
287    pub fn insert_wstr(&mut self, idx: usize, string: &WStr<E>) {
288        assert!(self.is_char_boundary(idx));
289        unsafe {
290            self.insert_bytes(idx, string.as_bytes());
291        }
292    }
293
294    /// Returns a mutable reference to the contents of this string.
295    ///
296    /// # Safety
297    ///
298    /// You must ensure that the bytes remain encoded in UTF-16 with the correct byte-order,
299    /// otherwise you will get undefined behaviour trying to use the string.
300    #[inline]
301    pub unsafe fn as_mut_vec(&mut self) -> &mut Vec<u8> {
302        &mut self.buf
303    }
304
305    /// Returns the length in bytes, not chars or graphemes.
306    #[inline]
307    pub fn len(&self) -> usize {
308        self.buf.len()
309    }
310
311    /// Returns `true` if the string has a [`WString::len`] of zero, `false` otherwise.
312    #[inline]
313    pub fn is_empty(&self) -> bool {
314        self.len() == 0
315    }
316
317    /// Splits the string into two at the given index.
318    ///
319    /// Returns a newly allocated [`WString`].  `self` contains bytes `[0..at]` and the
320    /// returned [WString] contains bytes `[at..len]]`.
321    ///
322    /// # Panics
323    ///
324    /// Panics if `at` is not on a character boundary or is beyond the end of the string.
325    #[inline]
326    #[must_use = "use `.truncate()` if you don't need the other half"]
327    pub fn split_off(&mut self, at: usize) -> WString<E> {
328        assert!(
329            self.is_char_boundary(at),
330            "split_off not on a char boundary"
331        );
332        let other = self.buf.split_off(at);
333        unsafe { WString::from_utf16_unchecked(other) }
334    }
335
336    /// Truncates this string, removing all contents.
337    ///
338    /// The length will be zero, but the capacity will remain unchanged.
339    #[inline]
340    pub fn clear(&mut self) {
341        self.buf.clear();
342    }
343}
344
345impl<E> Default for WString<E>
346where
347    E: ByteOrder,
348{
349    #[inline]
350    fn default() -> Self {
351        Self::new()
352    }
353}
354
355impl<E> Deref for WString<E>
356where
357    E: ByteOrder,
358{
359    type Target = WStr<E>;
360
361    #[inline]
362    fn deref(&self) -> &Self::Target {
363        unsafe { WStr::from_utf16_unchecked(self.buf.as_slice()) }
364    }
365}
366
367impl<E> DerefMut for WString<E>
368where
369    E: ByteOrder,
370{
371    fn deref_mut(&mut self) -> &mut Self::Target {
372        unsafe { WStr::from_utf16_unchecked_mut(self.buf.as_mut_slice()) }
373    }
374}
375
376impl<E> From<&str> for WString<E>
377where
378    E: ByteOrder,
379{
380    #[inline]
381    fn from(source: &str) -> Self {
382        let mut new = Self::with_capacity(source.len());
383        for ch in source.chars() {
384            new.push(ch);
385        }
386        new
387    }
388}
389
390impl<E> From<&mut str> for WString<E>
391where
392    E: ByteOrder,
393{
394    #[inline]
395    fn from(source: &mut str) -> Self {
396        let mut new = Self::with_capacity(source.len());
397        for ch in source.chars() {
398            new.push(ch);
399        }
400        new
401    }
402}
403
404impl<E> From<&String> for WString<E>
405where
406    E: ByteOrder,
407{
408    #[inline]
409    fn from(source: &String) -> Self {
410        Self::from(source.as_str())
411    }
412}
413
414#[cfg(test)]
415mod tests {
416    use byteorder::{BE, LE};
417
418    use super::*;
419
420    #[test]
421    fn test_new() {
422        let s: WString<LE> = WString::new();
423        assert_eq!(s.len(), 0);
424        assert_eq!(s.capacity(), 0);
425        assert_eq!(s.to_utf8(), "");
426    }
427
428    #[test]
429    fn test_with_capacity() {
430        let s: WString<LE> = WString::with_capacity(5);
431        assert_eq!(s.capacity(), 5);
432        assert_eq!(s.len(), 0);
433        assert_eq!(s.to_utf8(), "");
434    }
435
436    #[test]
437    fn test_from_utf16() {
438        let b = b"h\x00e\x00l\x00l\x00o\x00";
439        let s: WString<LE> = WString::from_utf16(b.to_vec()).unwrap();
440        assert_eq!(s.buf, b);
441        assert_eq!(s.to_utf8(), "hello");
442    }
443
444    #[test]
445    fn test_from_utf16_le_be() {
446        let b_le = b"h\x00e\x00l\x00l\x00o\x00";
447        let s_le = WString::from_utf16le(b_le.to_vec()).unwrap();
448        assert_eq!(s_le.to_utf8(), "hello");
449
450        let b_be = b"\x00h\x00e\x00l\x00l\x00o";
451        let s_be = WString::from_utf16be(b_be.to_vec()).unwrap();
452        assert_eq!(s_be.to_utf8(), "hello");
453    }
454
455    #[test]
456    fn test_from_utf16_unchecked() {
457        let b_le = b"h\x00e\x00l\x00l\x00o\x00";
458        let s_le: WString<LE> = unsafe { WString::from_utf16_unchecked(b_le.to_vec()) };
459        assert_eq!(s_le.to_utf8(), "hello");
460
461        let s_le = unsafe { WString::from_utf16le_unchecked(b_le.to_vec()) };
462        assert_eq!(s_le.to_utf8(), "hello");
463
464        let b_be = b"\x00h\x00e\x00l\x00l\x00o";
465        let s_be = unsafe { WString::from_utf16be_unchecked(b_be.to_vec()) };
466        assert_eq!(s_be.to_utf8(), "hello");
467    }
468
469    #[test]
470    fn test_from_str() {
471        let s: WString<LE> = WString::from("hello");
472        assert_eq!(s.as_bytes(), b"h\x00e\x00l\x00l\x00o\x00");
473
474        let s: WString<BE> = WString::from("hello");
475        assert_eq!(s.as_bytes(), b"\x00h\x00e\x00l\x00l\x00o");
476
477        let s: WString<LE> = From::from("hello");
478        assert_eq!(s.as_bytes(), b"h\x00e\x00l\x00l\x00o\x00");
479
480        let mut v = String::from("hello");
481        let s: WString<LE> = From::from(v.as_mut_str());
482        assert_eq!(s.as_bytes(), b"h\x00e\x00l\x00l\x00o\x00");
483    }
484
485    #[test]
486    fn test_from_string() {
487        let v = String::from("hello");
488
489        let s: WString<LE> = WString::from(&v);
490        assert_eq!(s.as_bytes(), b"h\x00e\x00l\x00l\x00o\x00");
491
492        let s: WString<LE> = From::from(&v);
493        assert_eq!(s.as_bytes(), b"h\x00e\x00l\x00l\x00o\x00");
494    }
495
496    #[test]
497    fn test_into_bytes() {
498        let b = b"h\x00e\x00l\x00l\x00o\x00";
499        let s = WString::from_utf16le(b.to_vec()).unwrap();
500        assert_eq!(s.into_bytes(), b);
501    }
502
503    #[test]
504    fn test_as_wstr() {
505        let b = b"h\x00e\x00l\x00l\x00o\x00";
506        let wstr = WStr::from_utf16le(b).unwrap();
507        let wstring = WString::from_utf16le(b.to_vec()).unwrap();
508        assert_eq!(wstr, wstring.as_wstr());
509    }
510
511    #[test]
512    fn test_as_mut_wstr() {
513        let b = b"h\x00e\x00l\x00l\x00o\x00";
514        let wstr = WStr::from_utf16le(b).unwrap();
515        let mut wstring = WString::from_utf16le(b.to_vec()).unwrap();
516        let m: &mut WStr<_> = wstring.as_mut_wstr();
517        assert_eq!(m, wstr);
518    }
519
520    #[test]
521    fn test_push_wstr() {
522        let b = b"h\x00e\x00l\x00l\x00o\x00";
523        let mut wstring = WString::from_utf16le(b.to_vec()).unwrap();
524        let b = b" \x00w\x00o\x00r\x00l\x00d\x00";
525        let wstr = WStr::from_utf16le(b).unwrap();
526        wstring.push_wstr(wstr);
527        assert_eq!(wstring.to_utf8(), "hello world");
528    }
529
530    #[test]
531    fn test_reserve() {
532        let mut s: WString<LE> = WString::with_capacity(0);
533        assert_eq!(s.capacity(), 0);
534        s.reserve(42);
535        assert!(s.capacity() >= 42);
536    }
537
538    #[test]
539    fn test_shrink_to_fit() {
540        let mut s: WString<LE> = WString::with_capacity(42);
541        assert!(s.capacity() >= 42);
542        s.shrink_to_fit();
543        assert_eq!(s.capacity(), 0);
544    }
545
546    #[test]
547    fn test_push() {
548        let mut s: WString<LE> = WString::new();
549        s.push('h');
550        s.push('i');
551        assert_eq!(s.as_bytes(), b"h\x00i\x00");
552        assert_eq!(s.to_utf8(), "hi");
553
554        s.push('\u{10000}');
555        assert_eq!(s.as_bytes(), b"h\x00i\x00\x00\xd8\x00\xdc");
556        assert_eq!(s.to_utf8(), "hi\u{10000}");
557    }
558
559    #[test]
560    fn test_truncate() {
561        let b = b"h\x00e\x00l\x00l\x00o\x00";
562        let mut s = WString::from_utf16le(b.to_vec()).unwrap();
563
564        s.truncate(20);
565        assert_eq!(s.to_utf8(), "hello");
566
567        s.truncate(4);
568        assert_eq!(s.to_utf8(), "he");
569    }
570
571    #[test]
572    #[should_panic]
573    fn test_truncate_no_char_boundary() {
574        let b = b"h\x00e\x00l\x00l\x00o\x00";
575        let mut s = WString::from_utf16le(b.to_vec()).unwrap();
576
577        s.truncate(1);
578    }
579
580    #[test]
581    fn test_pop() {
582        let b = b"a\x00\x00\xd8\x00\xdch\x00i\x00";
583        let mut s = WString::from_utf16le(b.to_vec()).unwrap();
584        assert_eq!(s.to_utf8(), "a\u{10000}hi");
585
586        assert_eq!(s.pop(), Some('i'));
587        assert_eq!(s.to_utf8(), "a\u{10000}h");
588
589        assert_eq!(s.pop(), Some('h'));
590        assert_eq!(s.to_utf8(), "a\u{10000}");
591
592        assert_eq!(s.pop(), Some('\u{10000}'));
593        assert_eq!(s.to_utf8(), "a");
594
595        assert_eq!(s.pop(), Some('a'));
596        assert!(s.is_empty());
597    }
598
599    #[test]
600    fn test_remove() {
601        let b = b"a\x00\x00\xd8\x00\xdch\x00i\x00";
602        let mut s = WString::from_utf16le(b.to_vec()).unwrap();
603
604        assert_eq!(s.remove(2), '\u{10000}');
605        assert_eq!(s.remove(2), 'h');
606        assert_eq!(s.to_utf8(), "ai");
607    }
608
609    #[test]
610    fn test_retain() {
611        let mut s: WString<LE> = From::from("h_e__ll_o");
612        s.retain(|c| c != '_');
613        assert_eq!(s.to_utf8(), "hello");
614    }
615
616    #[test]
617    fn test_insert() {
618        let mut s: WString<LE> = From::from("hllo");
619        s.insert(2, 'e');
620        assert_eq!(s.to_utf8(), "hello");
621    }
622
623    #[test]
624    fn test_insert_wstr() {
625        let mut s: WString<LE> = From::from("ho");
626        let slice: WString<LE> = From::from("ell");
627        s.insert_wstr(2, slice.as_wstr());
628        assert_eq!(s.to_string(), "hello");
629    }
630
631    #[test]
632    fn test_as_mut_vec() {
633        let mut s: WString<LE> = From::from("hello");
634        unsafe {
635            let v: &mut Vec<u8> = s.as_mut_vec();
636            v.extend(b" \x00w\x00o\x00r\x00l\x00d\x00");
637        }
638        assert_eq!(s.to_string(), "hello world");
639    }
640
641    #[test]
642    fn test_split_off() {
643        let mut s: WString<LE> = From::from("helloworld");
644        let t = s.split_off(10);
645        assert_eq!(s.to_string(), "hello");
646        assert_eq!(t.to_string(), "world");
647    }
648
649    #[test]
650    #[should_panic]
651    fn test_split_off_bad_index() {
652        let mut s: WString<LE> = From::from("hi");
653        let _t = s.split_off(1);
654    }
655
656    #[test]
657    fn test_clear() {
658        let mut s: WString<LE> = From::from("hello");
659        assert_eq!(s.to_string(), "hello");
660        let cap = s.capacity();
661
662        s.clear();
663        assert!(s.is_empty());
664        assert_eq!(s.capacity(), cap)
665    }
666
667    #[test]
668    fn test_deref() {
669        let b = b"h\x00e\x00l\x00l\x00o\x00";
670        let wstring = WString::from_utf16le(b.to_vec()).unwrap();
671        let wstr = WStr::from_utf16le(b).unwrap();
672        assert_eq!(wstring.deref(), wstr);
673    }
674
675    #[test]
676    fn test_deref_mut() {
677        let b = b"h\x00e\x00l\x00l\x00o\x00";
678        let v = Vec::from(&b[..]);
679        let mut s = WString::from_utf16le(v).unwrap();
680        let wstr = s.deref_mut();
681        unsafe {
682            let buf = wstr.as_bytes_mut();
683            buf.copy_from_slice(b"w\x00o\x00r\x00l\x00d\x00");
684        }
685        assert_eq!(s.to_utf8(), "world");
686    }
687}