//! A UTF-16 string type.
//!
//! This crate provides two string types to handle UTF-16 encoded bytes directly as strings:
//! [`WString`] and [`WStr`].  They are to UTF-16 exactly like [`String`] and [`str`] are to
//! UTF-8.  Some of the concepts and functions here are rather tersely documented; in that
//! case you can look up their equivalents on [`String`] or [`str`] and the behaviour
//! should be exactly the same, only the underlying byte encoding is different.
//!
//! Thus [`WString`] is a type which owns the bytes containing the string.  Just like
//! [`String`] and the underlying [`Vec`] it is built on, it distinguishes length
//! ([`WString::len`]) and capacity ([`WString::capacity`]).  Here length is the number of
//! bytes used while capacity is the number of bytes the string can grow to without
//! reallocating.
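//!
//! For example:
//!
//! ```
//! use utf16string::{LE, WString};
//!
//! let s: WString<LE> = WString::from_str("hi");
//!
//! // Two characters of two bytes each.
//! assert_eq!(s.len(), 4);
//!
//! // The capacity is always at least the length.
//! assert!(s.capacity() >= s.len());
//! ```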
//!
//! The [`WStr`] type does not own any bytes, it can only point to a slice of bytes
//! containing valid UTF-16.  As such you will only ever use it as a reference like `&WStr`,
//! just as you only ever use [`str`] as `&str`.
//!
//! The [`WString`] type implements `Deref<Target = WStr<ByteOrder>>`, so all the methods
//! on [`WStr`] are available on [`WString`] as well:
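//!
//! ```
//! use utf16string::{LE, WString};
//!
//! let s: WString<LE> = WString::from_str("hello");
//!
//! // `to_utf8` is a method of `WStr`, callable on `WString` via deref.
//! assert_eq!(s.to_utf8(), "hello");
//! ```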
//!
//! # UTF-16 ByteOrder
//!
//! UTF-16 encodes to unsigned 16-bit integers ([`u16`]), denoting *code units*.  However,
//! different CPU architectures encode these [`u16`] integers using different byte orders:
//! *little-endian* and *big-endian*.  Thus when handling UTF-16 strings you need to be
//! aware of the byte order of the encoding; the encoding variants are commonly known as
//! UTF-16LE and UTF-16BE respectively.
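//!
//! For example, the letter `A` (U+0041) is encoded as the bytes `41 00` in UTF-16LE and
//! as `00 41` in UTF-16BE:
//!
//! ```
//! use utf16string::WStr;
//!
//! let le = WStr::from_utf16le(b"A\x00").unwrap();
//! let be = WStr::from_utf16be(b"\x00A").unwrap();
//!
//! assert_eq!(le.to_utf8(), "A");
//! assert_eq!(be.to_utf8(), "A");
//! ```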
//!
//! For this crate this means the types need to be aware of the byte order, which is done
//! using the [`byteorder::ByteOrder`] trait as a generic parameter to the types:
//! `WString<ByteOrder>` and `WStr<ByteOrder>`, commonly written as `WString<E>` and
//! `WStr<E>` where `E` stands for "endianness".
//!
//! This crate exports [`BigEndian`], [`BE`], [`LittleEndian`] and [`LE`] in case you need
//! to denote the type:
//!
//! ```
//! use utf16string::{BigEndian, BE, WString};
//!
//! let s0: WString<BigEndian> = WString::from_str("hello");
//! assert_eq!(s0.len(), 10);
//!
//! let s1: WString<BE> = WString::from_str("hello");
//! assert_eq!(s0, s1);
//! ```
//!
//! As these types can be a bit cumbersome to write they can often be inferred,
//! especially with the help of the shorthand constructors like [`WString::from_utf16le`],
//! [`WString::from_utf16be`], [`WStr::from_utf16le`], [`WStr::from_utf16be`] and related.
//! For example:
//!
//! ```
//! # use std::error::Error;
//! use utf16string::{LE, WStr};
//!
//! # fn main() -> Result<(), Box<dyn Error>> {
//! let b = b"h\x00e\x00l\x00l\x00o\x00";
//!
//! let s0: &WStr<LE> = WStr::from_utf16(b)?;
//! let s1 = WStr::from_utf16le(b)?;
//!
//! assert_eq!(s0, s1);
//! assert_eq!(s0.to_utf8(), "hello");
//! #     Ok(())
//! # }
//! ```

#![deny(missing_docs, missing_debug_implementations)]

use std::marker::PhantomData;
use std::slice::ChunksExact;

use byteorder::ByteOrder;

pub use byteorder::{BigEndian, LittleEndian, BE, LE};

mod error;
mod iters;
mod slicing;
mod utf16;
mod wstr;
mod wstring;

#[doc(inline)]
pub use crate::slicing::SliceIndex;

/// Error for invalid UTF-16 encoded bytes.
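///
/// For example, a lone surrogate can never be valid UTF-16:
///
/// ```
/// use utf16string::WStr;
///
/// // 0xD800 is a high surrogate without a following low surrogate.
/// assert!(WStr::from_utf16le(b"\x00\xd8").is_err());
/// ```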
#[derive(Debug, Copy, Clone)]
pub struct Utf16Error {
    valid_up_to: usize,
    error_len: Option<u8>,
}

/// A UTF-16 [`String`]-like type with little- or big-endian byte order.
///
/// # Examples
///
/// ```
/// # use std::error::Error;
/// use utf16string::{LE, WString};
///
/// # fn main() -> Result<(), Box<dyn Error>> {
/// let v = Vec::from(&b"h\x00e\x00l\x00l\x00o\x00"[..]);
/// let s = WString::from_utf16le(v)?;
///
/// let chars: Vec<char> = s.chars().collect();
/// assert_eq!(chars, vec!['h', 'e', 'l', 'l', 'o']);
///
/// assert_eq!(s.to_utf8(), "hello");
/// #    Ok(())
/// # }
/// ```
///
/// Converting from valid Unicode is infallible:
///
/// ```
/// use utf16string::{LE, WString};
///
/// let s0: WString<LE> = WString::from_str("hello");
/// assert_eq!(s0.len(), 10);
///
/// let s1: WString<LE> = From::from("hello");
/// assert_eq!(s0, s1);
/// ```
#[derive(Debug, Eq, PartialEq, Hash)]
pub struct WString<E: 'static + ByteOrder> {
    buf: Vec<u8>,
    _endian: PhantomData<&'static E>,
}

/// A UTF-16 [`str`]-like type with little- or big-endian byte order.
///
/// This mostly behaves like [`str`] does for UTF-8 encoded byte slices, but works with
/// UTF-16 encoded byte slices.  The endianness is determined by the type parameter.
///
/// # Examples
///
/// ```
/// # use std::error::Error;
/// use utf16string::{LE, WStr};
///
/// # fn main() -> Result<(), Box<dyn Error>> {
/// let b = b"h\x00e\x00l\x00l\x00o\x00";
/// let s: &WStr<LE> = WStr::from_utf16le(b)?;
///
/// let chars: Vec<char> = s.chars().collect();
/// assert_eq!(chars, vec!['h', 'e', 'l', 'l', 'o']);
///
/// assert_eq!(s.to_utf8(), "hello");
/// #    Ok(())
/// # }
/// ```
#[derive(Debug, Eq, PartialEq, Hash)]
#[repr(transparent)]
pub struct WStr<E: 'static + ByteOrder> {
    _endian: PhantomData<&'static E>,
    raw: [u8],
}

/// Iterator yielding [`char`] from a UTF-16 encoded byte slice.
///
/// The slice must contain valid UTF-16, otherwise this may panic or cause undefined
/// behaviour.
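///
/// Created by [`WStr::chars`], for example:
///
/// ```
/// use utf16string::{LE, WStr};
///
/// let s: &WStr<LE> = WStr::from_utf16le(b"h\x00i\x00").unwrap();
/// let mut chars = s.chars();
///
/// assert_eq!(chars.next(), Some('h'));
/// assert_eq!(chars.next(), Some('i'));
/// assert_eq!(chars.next(), None);
/// ```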
#[derive(Debug)]
pub struct WStrChars<'a, E: 'static + ByteOrder> {
    chunks: ChunksExact<'a, u8>,
    _endian: PhantomData<&'static E>,
}

/// Iterator yielding `(index, char)` tuples from a UTF-16 encoded byte slice.
///
/// The slice must contain valid UTF-16, otherwise this may panic or cause undefined
/// behaviour.
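///
/// Created by [`WStr::char_indices`].  As with [`str::char_indices`] the index is the
/// byte position of the character in the underlying slice:
///
/// ```
/// use utf16string::{LE, WStr};
///
/// let s: &WStr<LE> = WStr::from_utf16le(b"h\x00i\x00").unwrap();
/// let pairs: Vec<(usize, char)> = s.char_indices().collect();
///
/// assert_eq!(pairs, vec![(0, 'h'), (2, 'i')]);
/// ```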
#[derive(Debug)]
pub struct WStrCharIndices<'a, E: 'static + ByteOrder> {
    chars: WStrChars<'a, E>,
    index: usize,
}