utf_64/
string64.rs

1use crate::error::{Result, Utf64Error};
2use std::{
3    borrow::{Borrow, BorrowMut},
4    fmt,
5    hash::{Hash, Hasher},
6    iter::{Extend, FromIterator},
7    ops::{Add, AddAssign, Deref, DerefMut, Index, Range, RangeFrom, RangeFull, RangeTo},
8    str::FromStr,
9};
10
/// An owned, growable string stored in the UTF64 encoding.
///
/// UTF64 is fixed-width: every character takes exactly one 64-bit cell (8 bytes).
/// Within a cell, the high 32 bits hold the character's UTF-8 bytes, left-aligned and
/// padded with zeros, and the low 32 bits are reserved for later revisions of the
/// specification.
///
/// **Specification Version:** this type implements UTF64 v1.0, under which every
/// reserved (low 32) bit must be zero. Later versions may give those bits a meaning
/// while staying backward compatible.
///
/// # Examples
///
/// ```
/// use utf64::String64;
///
/// let s = String64::from("Hello, 世界!");
/// assert_eq!(s.len(), 10); // 10 characters
/// ```
#[derive(Clone, PartialEq, Eq)]
pub struct String64 {
    /// The UTF64 cells, one `u64` per character, in string order.
    data: Vec<u64>,
}
33
34impl String64 {
35    /// Creates a new empty `String64`.
36    pub fn new() -> Self {
37        Self { data: Vec::new() }
38    }
39
40    /// Creates a new `String64` with the specified capacity.
41    pub fn with_capacity(capacity: usize) -> Self {
42        Self {
43            data: Vec::with_capacity(capacity),
44        }
45    }
46
47    /// Returns the length of this `String64` in characters.
48    ///
49    /// Note: This is O(1) as each character is exactly one u64.
50    pub fn len(&self) -> usize {
51        self.data.len()
52    }
53
54    /// Returns `true` if this `String64` has a length of zero.
55    pub fn is_empty(&self) -> bool {
56        self.data.is_empty()
57    }
58
59    /// Returns a slice of the underlying u64 data.
60    pub fn as_slice(&self) -> &[u64] {
61        &self.data
62    }
63
64    /// Encodes a string slice into UTF64 format.
65    fn encode(s: &str) -> Result<Self> {
66        let mut data = Vec::with_capacity(s.chars().count());
67
68        for ch in s.chars() {
69            let mut utf8_buf = [0u8; 4];
70            let utf8_bytes = ch.encode_utf8(&mut utf8_buf).as_bytes();
71
72            // Pack UTF-8 bytes into upper 32 bits (big-endian style)
73            let mut upper_bits: u32 = 0;
74            for (i, &byte) in utf8_bytes.iter().enumerate() {
75                upper_bits |= (byte as u32) << (24 - (i * 8));
76            }
77
78            // Upper 32 bits = UTF-8, Lower 32 bits = reserved (0)
79            let utf64_char = (upper_bits as u64) << 32;
80            data.push(utf64_char);
81        }
82
83        Ok(Self { data })
84    }
85
86    /// Decodes this UTF64 string back to a standard Rust String.
87    pub fn to_string(&self) -> Result<String> {
88        let mut utf8_bytes = Vec::new();
89
90        for &utf64_char in &self.data {
91            // Check that reserved bits (lower 32) are zero
92            if (utf64_char & 0xFFFFFFFF) != 0 {
93                return Err(Utf64Error::NonZeroReservedBits);
94            }
95
96            // Extract upper 32 bits
97            let upper_bits = (utf64_char >> 32) as u32;
98
99            // Extract UTF-8 bytes (up to 4 bytes)
100            let bytes = [
101                ((upper_bits >> 24) & 0xFF) as u8,
102                ((upper_bits >> 16) & 0xFF) as u8,
103                ((upper_bits >> 8) & 0xFF) as u8,
104                (upper_bits & 0xFF) as u8,
105            ];
106
107            // Find the actual length of the UTF-8 sequence
108            // UTF-8 first byte tells us the length
109            let len = if bytes[0] == 0 {
110                return Err(Utf64Error::InvalidUtf64);
111            } else if bytes[0] < 0x80 {
112                1
113            } else if bytes[0] < 0xE0 {
114                2
115            } else if bytes[0] < 0xF0 {
116                3
117            } else {
118                4
119            };
120
121            utf8_bytes.extend_from_slice(&bytes[..len]);
122        }
123
124        String::from_utf8(utf8_bytes).map_err(|_| Utf64Error::InvalidUtf8)
125    }
126}
127
128impl Default for String64 {
129    fn default() -> Self {
130        Self::new()
131    }
132}
133
134impl From<&str> for String64 {
135    fn from(s: &str) -> Self {
136        Self::encode(s).expect("valid UTF-8 &str should always encode to UTF64")
137    }
138}
139
140impl From<String> for String64 {
141    fn from(s: String) -> Self {
142        Self::encode(&s).expect("valid UTF-8 String should always encode to UTF64")
143    }
144}
145
146impl FromStr for String64 {
147    type Err = Utf64Error;
148
149    fn from_str(s: &str) -> std::result::Result<Self, Self::Err> {
150        Self::encode(s)
151    }
152}
153
154impl fmt::Display for String64 {
155    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
156        match self.to_string() {
157            Ok(s) => write!(f, "{s}"),
158            Err(_) => write!(f, "<invalid UTF64>"),
159        }
160    }
161}
162
163impl fmt::Debug for String64 {
164    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
165        match self.to_string() {
166            Ok(s) => write!(f, "String64({s:?})"),
167            Err(_) => write!(f, "String64(<invalid>)"),
168        }
169    }
170}
171
172impl Hash for String64 {
173    fn hash<H: Hasher>(&self, state: &mut H) {
174        self.data.hash(state);
175    }
176}
177
178impl PartialOrd for String64 {
179    fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
180        Some(self.cmp(other))
181    }
182}
183
184impl Ord for String64 {
185    fn cmp(&self, other: &Self) -> std::cmp::Ordering {
186        // Lexicographic comparison by decoding to strings
187        match (self.to_string(), other.to_string()) {
188            (Ok(s1), Ok(s2)) => s1.cmp(&s2),
189            (Ok(_), Err(_)) => std::cmp::Ordering::Greater,
190            (Err(_), Ok(_)) => std::cmp::Ordering::Less,
191            (Err(_), Err(_)) => std::cmp::Ordering::Equal,
192        }
193    }
194}
195
196impl Index<usize> for String64 {
197    type Output = u64;
198
199    fn index(&self, index: usize) -> &Self::Output {
200        &self.data[index]
201    }
202}
203
204impl Index<Range<usize>> for String64 {
205    type Output = [u64];
206
207    fn index(&self, range: Range<usize>) -> &Self::Output {
208        &self.data[range]
209    }
210}
211
212impl Index<RangeFrom<usize>> for String64 {
213    type Output = [u64];
214
215    fn index(&self, range: RangeFrom<usize>) -> &Self::Output {
216        &self.data[range]
217    }
218}
219
220impl Index<RangeTo<usize>> for String64 {
221    type Output = [u64];
222
223    fn index(&self, range: RangeTo<usize>) -> &Self::Output {
224        &self.data[range]
225    }
226}
227
228impl Index<RangeFull> for String64 {
229    type Output = [u64];
230
231    fn index(&self, range: RangeFull) -> &Self::Output {
232        &self.data[range]
233    }
234}
235
/// Owning iterator over the characters of a `String64`.
///
/// Wraps the by-value iterator of the string's backing `Vec<u64>`.
pub struct IntoIter {
    /// Cells not yet yielded.
    data: std::vec::IntoIter<u64>,
}
240
241impl Iterator for IntoIter {
242    type Item = char;
243
244    fn next(&mut self) -> Option<Self::Item> {
245        self.data.next().map(|utf64_char| {
246            // Extract upper 32 bits and decode the UTF-8
247            let upper_bits = (utf64_char >> 32) as u32;
248            let bytes = [
249                ((upper_bits >> 24) & 0xFF) as u8,
250                ((upper_bits >> 16) & 0xFF) as u8,
251                ((upper_bits >> 8) & 0xFF) as u8,
252                (upper_bits & 0xFF) as u8,
253            ];
254
255            // Determine UTF-8 length and decode
256            let len = if bytes[0] < 0x80 {
257                1
258            } else if bytes[0] < 0xE0 {
259                2
260            } else if bytes[0] < 0xF0 {
261                3
262            } else {
263                4
264            };
265
266            std::str::from_utf8(&bytes[..len])
267                .ok()
268                .and_then(|s| s.chars().next())
269                .expect("valid UTF64 should decode to valid char")
270        })
271    }
272
273    fn size_hint(&self) -> (usize, Option<usize>) {
274        self.data.size_hint()
275    }
276}
277
278impl ExactSizeIterator for IntoIter {
279    fn len(&self) -> usize {
280        self.data.len()
281    }
282}
283
284impl IntoIterator for String64 {
285    type Item = char;
286    type IntoIter = IntoIter;
287
288    fn into_iter(self) -> Self::IntoIter {
289        IntoIter {
290            data: self.data.into_iter(),
291        }
292    }
293}
294
/// Borrowing iterator over the characters of a `&String64`.
///
/// Wraps a slice iterator over the string's cells, so the string itself is left intact.
pub struct Iter<'a> {
    /// Cells not yet yielded.
    data: std::slice::Iter<'a, u64>,
}
299
300impl<'a> Iterator for Iter<'a> {
301    type Item = char;
302
303    fn next(&mut self) -> Option<Self::Item> {
304        self.data.next().map(|&utf64_char| {
305            // Extract upper 32 bits and decode the UTF-8
306            let upper_bits = (utf64_char >> 32) as u32;
307            let bytes = [
308                ((upper_bits >> 24) & 0xFF) as u8,
309                ((upper_bits >> 16) & 0xFF) as u8,
310                ((upper_bits >> 8) & 0xFF) as u8,
311                (upper_bits & 0xFF) as u8,
312            ];
313
314            // Determine UTF-8 length and decode
315            let len = if bytes[0] < 0x80 {
316                1
317            } else if bytes[0] < 0xE0 {
318                2
319            } else if bytes[0] < 0xF0 {
320                3
321            } else {
322                4
323            };
324
325            std::str::from_utf8(&bytes[..len])
326                .ok()
327                .and_then(|s| s.chars().next())
328                .expect("valid UTF64 should decode to valid char")
329        })
330    }
331
332    fn size_hint(&self) -> (usize, Option<usize>) {
333        self.data.size_hint()
334    }
335}
336
337impl<'a> ExactSizeIterator for Iter<'a> {
338    fn len(&self) -> usize {
339        self.data.len()
340    }
341}
342
343impl<'a> IntoIterator for &'a String64 {
344    type Item = char;
345    type IntoIter = Iter<'a>;
346
347    fn into_iter(self) -> Self::IntoIter {
348        Iter {
349            data: self.data.iter(),
350        }
351    }
352}
353
354impl FromIterator<char> for String64 {
355    fn from_iter<T: IntoIterator<Item = char>>(iter: T) -> Self {
356        let mut s = String64::new();
357        s.extend(iter);
358        s
359    }
360}
361
362impl Extend<char> for String64 {
363    fn extend<T: IntoIterator<Item = char>>(&mut self, iter: T) {
364        for ch in iter {
365            let mut utf8_buf = [0u8; 4];
366            let utf8_bytes = ch.encode_utf8(&mut utf8_buf).as_bytes();
367
368            // Pack UTF-8 bytes into upper 32 bits
369            let mut upper_bits: u32 = 0;
370            for (i, &byte) in utf8_bytes.iter().enumerate() {
371                upper_bits |= (byte as u32) << (24 - (i * 8));
372            }
373
374            let utf64_char = (upper_bits as u64) << 32;
375            self.data.push(utf64_char);
376        }
377    }
378}
379
380impl Add<&str> for String64 {
381    type Output = String64;
382
383    fn add(mut self, rhs: &str) -> Self::Output {
384        self.extend(rhs.chars());
385        self
386    }
387}
388
389impl AddAssign<&str> for String64 {
390    fn add_assign(&mut self, rhs: &str) {
391        self.extend(rhs.chars());
392    }
393}
394
395impl PartialEq<str> for String64 {
396    fn eq(&self, other: &str) -> bool {
397        match self.to_string() {
398            Ok(s) => s == other,
399            Err(_) => false,
400        }
401    }
402}
403
404impl PartialEq<&str> for String64 {
405    fn eq(&self, other: &&str) -> bool {
406        self.eq(*other)
407    }
408}
409
410impl PartialEq<String> for String64 {
411    fn eq(&self, other: &String) -> bool {
412        self.eq(other.as_str())
413    }
414}
415
416impl AsRef<[u64]> for String64 {
417    fn as_ref(&self) -> &[u64] {
418        &self.data
419    }
420}
421
422impl TryFrom<String64> for String {
423    type Error = Utf64Error;
424
425    fn try_from(value: String64) -> Result<Self> {
426        value.to_string()
427    }
428}
429
430impl TryFrom<&String64> for String {
431    type Error = Utf64Error;
432
433    fn try_from(value: &String64) -> Result<Self> {
434        value.to_string()
435    }
436}
437
438impl Deref for String64 {
439    type Target = [u64];
440
441    fn deref(&self) -> &Self::Target {
442        &self.data
443    }
444}
445
446impl DerefMut for String64 {
447    fn deref_mut(&mut self) -> &mut Self::Target {
448        &mut self.data
449    }
450}
451
452impl Borrow<[u64]> for String64 {
453    fn borrow(&self) -> &[u64] {
454        &self.data
455    }
456}
457
458impl BorrowMut<[u64]> for String64 {
459    fn borrow_mut(&mut self) -> &mut [u64] {
460        &mut self.data
461    }
462}