wtf8_rs/codepoint/
mod.rs

1//! A Unicode code point: from U+0000 to U+10FFFF.
2
3use core::fmt;
4use core::iter::{FusedIterator, Peekable};
5
6#[cfg(test)]
7mod tests;
8
/// A single Unicode code point, i.e. any value in `U+0000..=U+10FFFF`.
///
/// Contrast this with `char`, which models a Unicode *scalar value*:
/// a code point that is not a surrogate (`U+D800..=U+DFFF`).
/// A `CodePoint` may hold a surrogate.
#[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
pub struct CodePoint {
    /// Numeric value of the code point.
    value: u32,
}
18
19/// Format the code point as `U+` followed by four to six hexadecimal digits.
20/// Example: `U+1F4A9`
21impl fmt::Debug for CodePoint {
22    #[inline]
23    fn fmt(&self, formatter: &mut fmt::Formatter<'_>) -> fmt::Result {
24        write!(formatter, "U+{:04X}", self.value)
25    }
26}
27
28impl CodePoint {
29    /// Unsafely creates a new `CodePoint` without checking the value.
30    ///
31    /// # Safety
32    ///
33    /// Only safe if `value` is less than or equal to 0x10FFFF.
34    #[inline]
35    pub unsafe fn from_u32_unchecked(value: u32) -> CodePoint {
36        CodePoint { value }
37    }
38
39    /// Creates a new `CodePoint` if the value is a valid code point.
40    ///
41    /// Returns `None` if `value` is above 0x10FFFF.
42    #[inline]
43    pub fn from_u32(value: u32) -> Option<CodePoint> {
44        match value {
45            0..=0x10FFFF => Some(CodePoint { value }),
46            _ => None,
47        }
48    }
49
50    /// Creates a new `CodePoint` from a `char`.
51    ///
52    /// Since all Unicode scalar values are code points, this always succeeds.
53    #[inline]
54    pub fn from_char(value: char) -> CodePoint {
55        CodePoint {
56            value: value as u32,
57        }
58    }
59
60    /// Returns the numeric value of the code point.
61    #[inline]
62    pub fn to_u32(&self) -> u32 {
63        self.value
64    }
65
66    /// Optionally returns a Unicode scalar value for the code point.
67    ///
68    /// Returns `None` if the code point is a surrogate (from U+D800 to U+DFFF).
69    #[inline]
70    pub fn to_char(&self) -> Option<char> {
71        match self.value {
72            0xD800..=0xDFFF => None,
73            // Safety: value is known to be in char range, because it is not
74            // a surrogate, and is less than (#impl-Index<T>) as this is guaranteed
75            // by the type.
76            _ => Some(unsafe { char::from_u32_unchecked(self.value) }),
77        }
78    }
79
80    /// Returns a Unicode scalar value for the code point.
81    ///
82    /// Returns `'\u{FFFD}'` (the replacement character “�”)
83    /// if the code point is a surrogate (from U+D800 to U+DFFF).
84    #[inline]
85    pub fn to_char_lossy(&self) -> char {
86        self.to_char().unwrap_or('\u{FFFD}')
87    }
88
89    /// Decode potentially ill-formed UTF-16.
90    #[inline]
91    pub fn decode_utf16<I>(input: I) -> DecodeUtf16<I>
92    where
93        I: Iterator<Item = u16>,
94    {
95        DecodeUtf16 {
96            input: input.peekable(),
97        }
98    }
99
100    /// Encode potentially ill-formed UTF-16.
101    #[inline]
102    pub fn encode_utf16<I>(input: I) -> EncodeUtf16<I>
103    where
104        I: Iterator<Item = CodePoint>,
105    {
106        EncodeUtf16 { input, buf: None }
107    }
108}
109
110impl From<char> for CodePoint {
111    #[inline]
112    fn from(c: char) -> Self {
113        Self::from_char(c)
114    }
115}
116
/// Iterator adapter that decodes potentially ill-formed UTF-16 code units.
pub struct DecodeUtf16<I: Iterator<Item = u16>> {
    /// Source of code units, wrapped so the next unit can be inspected
    /// before deciding whether to consume it.
    input: Peekable<I>,
}
124impl<I> Iterator for DecodeUtf16<I>
125where
126    I: Iterator<Item = u16>,
127{
128    type Item = CodePoint;
129
130    #[inline]
131    fn next(&mut self) -> Option<CodePoint> {
132        let mut val = self.input.next()? as u32;
133
134        if let 0xD800..=0xDBFF = val {
135            if let Some(y @ 0xDC00..=0xDFFF) = self.input.peek().copied() {
136                val = 0x1_0000 | ((val - 0xD800) << 10) | (y as u32 - 0xDC00);
137                self.input.next();
138            }
139        }
140
141        // Safety: this can not be greater than 0x10FFFF by construction.
142        Some(unsafe { CodePoint::from_u32_unchecked(val) })
143    }
144
145    #[inline]
146    fn size_hint(&self) -> (usize, Option<usize>) {
147        let (l, h) = self.input.size_hint();
148        (l / 2, h)
149    }
150}
151impl<I> FusedIterator for DecodeUtf16<I> where I: FusedIterator<Item = u16> {}
152
153/// An iterator for encoding potentially ill-formed UTF-16.
154pub struct EncodeUtf16<I>
155where
156    I: Iterator<Item = CodePoint>,
157{
158    input: I,
159    buf: Option<u16>,
160}
161impl<I> Iterator for EncodeUtf16<I>
162where
163    I: Iterator<Item = CodePoint>,
164{
165    type Item = u16;
166
167    #[inline]
168    fn next(&mut self) -> Option<u16> {
169        if let Some(x) = self.buf.take() {
170            return Some(x);
171        }
172
173        let p = self.input.next()?.to_u32();
174        if p >= 0x1_0000 {
175            self.buf = Some(((p - 0x1_0000) & 0x3FF) as u16 | 0xDC00);
176            Some(((p - 0x1_0000) >> 10) as u16 | 0xD800)
177        } else {
178            Some(p as u16)
179        }
180    }
181
182    #[inline]
183    fn size_hint(&self) -> (usize, Option<usize>) {
184        let (l, h) = self.input.size_hint();
185        (
186            l.saturating_add(self.buf.is_some() as usize),
187            h.and_then(|x| x.checked_mul(2))
188                .and_then(|x| x.checked_add(self.buf.is_some() as usize)),
189        )
190    }
191}
192impl<I> FusedIterator for EncodeUtf16<I> where I: FusedIterator<Item = CodePoint> {}