utf16string/
lib.rs

1//! UTF-16 string types with little- or big-endian byte order.
2//!
3//! This crate provides two string types to handle UTF-16 encoded bytes directly as strings:
4//! [`WString`] and [`WStr`].  They are to UTF-16 exactly like [`String`] and [`str`] are to
5//! UTF-8.  Some of the concepts and functions here are rather tersely documented, in this
6//! case you can look up their equivalents on [`String`] or [`str`] and the behaviour should
7//! be exactly the same, only the underlying byte encoding is different.
8//!
9//! Thus [`WString`] is a type which owns the bytes containing the string.  Just like
10//! [`String`] and the underlying [`Vec`] it is built on, it distinguishes length
11//! ([`WString::len`]) and capacity ([`String::capacity`]).  Here length is the number of
12//! bytes used while capacity is the number of bytes the string can grow without
13//! reallocating.
14//!
15//! The [`WStr`] type does not own any bytes, it can only point to a slice of bytes
16//! containing valid UTF-16.  As such you will only ever use it as a reference like `&WStr`,
17//! just like you only use [`str`] as `&str`.
18//!
19//! The [`WString`] type implements `Deref<Target = WStr<ByteOrder>>`.
20//!
21//! # UTF-16 ByteOrder
22//!
23//! UTF-16 encodes to unsigned 16-bit integers ([`u16`]), denoting *code units*.  However
24//! different CPU architectures encode these [`u16`] integers using different byte order:
25//! *little-endian* and *big-endian*.  Thus when handling UTF-16 strings you need to be
26//! aware of the byte order of the encoding; the encoding variants are commonly known as
27//! UTF-16LE and UTF-16BE respectively.
28//!
29//! For this crate this means the types need to be aware of the byte order, which is done
30//! using the [`byteorder::ByteOrder`] trait as a generic parameter to the types:
31//! `WString<ByteOrder>` and `WStr<ByteOrder>` commonly written as `WString<E>` and
32//! `WStr<E>` where `E` stands for "endianness".
33//!
34//! This crate exports [`BigEndian`], [`BE`], [`LittleEndian`] and [`LE`] in case you need
35//! to denote the type:
36//!
37//! ```
38//! use utf16string::{BigEndian, BE, WString};
39//!
40//! let s0: WString<BigEndian> = WString::from("hello");
41//! assert_eq!(s0.len(), 10);
42//!
43//! let s1: WString<BE> = WString::from("hello");
44//! assert_eq!(s0, s1);
45//! ```
46//!
47//! As these types can often be a bit cumbersome to write they can often be inferred,
48//! especially with the help of the shorthand constructors like [`WString::from_utf16le`],
49//! [`WString::from_utf16be`], [`WStr::from_utf16le`], [`WStr::from_utf16be`] and related.
50//! For example:
51//!
52//! ```
53//! # use std::error::Error;
54//! use utf16string::{LE, WStr};
55//!
56//! # fn main() -> Result<(), Box<dyn Error>> {
57//! let b = b"h\x00e\x00l\x00l\x00o\x00";
58//!
59//! let s0: &WStr<LE> = WStr::from_utf16(b)?;
60//! let s1 = WStr::from_utf16le(b)?;
61//!
62//! assert_eq!(s0, s1);
63//! assert_eq!(s0.to_utf8(), "hello");
64//! #     Ok(())
65//! # }
66//! ```
67
68#![warn(
69    missing_docs,
70    missing_debug_implementations,
71    missing_copy_implementations,
72    unused_extern_crates,
73    unused_qualifications
74)]
75
76use std::marker::PhantomData;
77use std::slice::ChunksExact;
78
79use byteorder::ByteOrder;
80
81pub use byteorder::{BigEndian, LittleEndian, BE, LE};
82
83mod error;
84mod iters;
85mod slicing;
86mod utf16;
87mod wstr;
88mod wstring;
89
90#[doc(inline)]
91pub use crate::slicing::SliceIndex;
92
93/// Error for invalid UTF-16 encoded bytes.  Both fields are private to this crate.
94#[derive(Debug, Copy, Clone)]
95pub struct Utf16Error {
96    valid_up_to: usize, // NOTE(review): presumably bytes of valid input before the error (cf. `std::str::Utf8Error`) -- confirm in `error`
97    error_len: Option<u8>, // NOTE(review): presumably byte length of the invalid sequence, if known -- confirm in `error`
98}
99
100/// A UTF-16 [`String`]-like type with little- or big-endian byte order.
101///
102/// # Examples
103///
104/// ```
105/// # use std::error::Error;
106/// use utf16string::{LE, WString};
107///
108/// # fn main() -> Result<(), Box<dyn Error>> {
109/// let v = Vec::from(&b"h\x00e\x00l\x00l\x00o\x00"[..]);
110/// let s = WString::from_utf16le(v)?;
111///
112/// let chars: Vec<char> = s.chars().collect();
113/// assert_eq!(chars, vec!['h', 'e', 'l', 'l', 'o']);
114///
115/// assert_eq!(s.to_utf8(), "hello");
116/// #    Ok(())
117/// # }
118/// ```
119///
120/// Converting from valid Unicode is infallible:
121///
122/// ```
123/// use utf16string::{LE, WString};
124///
125/// let s0: WString<LE> = WString::from("hello");
126/// assert_eq!(s0.len(), 10);
127///
128/// let s1: WString<LE> = From::from("hello");
129/// assert_eq!(s0, s1);
130/// ```
131#[derive(Debug, Eq, PartialEq, Hash)]
132pub struct WString<E: 'static + ByteOrder> {
133    buf: Vec<u8>, // Owned backing bytes. NOTE(review): presumably always valid UTF-16 in byte order `E` -- confirm in `wstring`
134    _endian: PhantomData<&'static E>, // Zero-sized marker that ties the type to byte order `E`; stores no data.
135}
136
137/// A UTF-16 [`str`]-like type with little- or big-endian byte order.
138///
139/// This mostly behaves like [`str`] does for UTF-8 encoded byte slices, but works with
140/// UTF-16 encoded byte slices.  The endianness is determined by the type parameter.
141///
142/// # Examples
143///
144/// ```
145/// # use std::error::Error;
146/// use utf16string::{LE, WStr};
147///
148/// # fn main() -> Result<(), Box<dyn Error>> {
149/// let b = b"h\x00e\x00l\x00l\x00o\x00";
150/// let s: &WStr<LE> = WStr::from_utf16le(b)?;
151///
152/// let chars: Vec<char> = s.chars().collect();
153/// assert_eq!(chars, vec!['h', 'e', 'l', 'l', 'o']);
154///
155/// assert_eq!(s.to_utf8(), "hello");
156/// #    Ok(())
157/// # }
158/// ```
159
160#[derive(Debug, Eq, PartialEq, Hash)]
161#[repr(transparent)] // Layout is that of the only non-zero-sized field, `raw`. NOTE(review): presumably relied on for `&[u8]` -> `&WStr<E>` casts -- confirm in `wstr`
162pub struct WStr<E: 'static + ByteOrder> {
163    _endian: PhantomData<&'static E>, // Zero-sized marker that ties the type to byte order `E`; stores no data.
164    raw: [u8], // Unsized tail holding the encoded bytes, so `WStr` itself is unsized and only usable behind a pointer.
165}
166
167/// Iterator yielding [`char`] from a UTF-16 encoded byte slice.
168///
169/// The slice must contain valid UTF-16, otherwise this may panic or cause undefined
170/// behaviour.
171#[derive(Debug)]
172pub struct WStrChars<'a, E: 'static + ByteOrder> {
173    chunks: ChunksExact<'a, u8>, // Fixed-size chunks over the borrowed bytes. NOTE(review): presumably 2 bytes (one code unit) each -- confirm in `iters`
174    _endian: PhantomData<&'static E>, // Zero-sized marker selecting the byte order `E` used for decoding.
175}
176
177/// Iterator yielding `(index, char)` tuples from a UTF-16 encoded byte slice in byte order `E`.
178///
179/// The slice must contain valid UTF-16, otherwise this may panic or cause undefined
180/// behaviour.
181#[derive(Debug)]
182pub struct WStrCharIndices<'a, E: 'static + ByteOrder> {
183    chars: WStrChars<'a, E>, // Underlying char iterator that this type wraps.
184    index: usize, // NOTE(review): presumably the byte offset of the next char to be yielded -- confirm in `iters`
185}