bigbit/linkedbytes/lbstring/
mod.rs

1//! A Unicode string format implemented using Linked Bytes.
2//!
3//! This module is the home for [`LBString`][lbs] and [`LBCharsIter`][lbci], which implement the Linked Bytes string storage format, as seen in the official specification. This is, perhaps, the most widely used feature of BigBit, since it's useful even when you don't need the powerful big number storage. The documentation on the `LBString` page elaborates on that.
4//!
5//! [lbs]: struct.LBString.html "LBString — a Unicode string stored using the Linked Bytes format"
6//! [lbci]: struct.LBCharsIter.html "LBCharsIter — an iterator over the codepoints in an LBString"
7
8use super::{LBNum, LBSequence, LBNumRef};
9
10/// A Unicode string stored using the Linked Bytes format.
11///
12/// This is more compact than all of the current UTF formats (namely, UTF-1, 7, 8, 16, let alone 32), since no surrogate pairs are used. Instead, the Linked Bytes format is leveraged, with separate codepoints being stored as individual Linked Bytes numbers. Both the link/end bits of the bytes and length of the entire message, either via the null terminator (which still works since a linking 0 has the most significant bit set to 1 and cannot be confused with the null terminator when reinterpreted as `u8`) or via storing it separately (as Rust `String`s do), are available. This means that the UTF-32 number of each codepoint can be encoded using the usual Linked Bytes format, with the link bit cleared in a byte indicating that one character has ended and a new one is coming next.
13///
14/// # Usage
15/// Conversion from `String` or `&str`:
16/// ```
17/// # extern crate alloc;
18/// # use alloc::string::String;
19/// # use bigbit::LBString;
20/// static MY_STRING: &str = "My string!";
21/// let stdstring = String::from("This is a standard string!");
22///
23/// let my_string_lb = LBString::from(MY_STRING); // Creates an LBString from a string slice
24/// let stdstring_lb = LBString::from(stdstring); // Creates an LBString from a String
25/// let my_string_lb_2 = MY_STRING.chars().collect::<LBString>(); // Creates an LBString from an iterator
26///
27/// # assert_eq!(String::from(my_string_lb), MY_STRING);
28/// # assert_eq!(String::from(stdstring_lb), "This is a standard string!");
29/// # assert_eq!(String::from(my_string_lb_2), MY_STRING);
30/// ```
31#[derive(Clone, Debug)]
32pub struct LBString(LBSequence);
33impl LBString {
34    /// Returns an iterator over the codepoints in the string.
35    ///
36    /// This is the core method of this type. Most other methods use this to perform more complex operations, such as conversion from an `&str`.
37    #[inline(always)]
38    pub fn chars(&self) -> impl Iterator<Item = char> + '_ {
39        LBCharsIter::new(self)
40    }
41
42    /// Counts the number of **codepoints** stored.
43    ///
44    /// This will iterate through the entire string and count how many codepoints were resolved successfully. Currently, this is implemented as simply `self.chars().count()`.
45    #[inline(always)]
46    pub fn len(&self) -> usize {
47        self.chars().count()
48    }
49    /// Returns `true` if there are no codepoints stored, `false` otherwise.
50    #[inline(always)]
51    pub fn is_empty(&self) -> bool {
52        self.0.is_empty() // We can use the container length, since if it's 0, then it's pointless to try to iterate, otherwise there's guaranteed to be a codepoint.
53    }
54    /// Returns an immutable reference to the underlying sequence.
55    #[inline(always)]
56    pub const fn inner(&self) -> &LBSequence {
57        &self.0
58    }
59}
60impl core::iter::FromIterator<char> for LBString {
61    fn from_iter<I: IntoIterator<Item = char>>(iter: I) -> Self {
62        let mut result = Self(LBSequence::empty());
63        let mut lbn = LBNum::ZERO;
64        for c in iter {
65            lbn.make_zero(); // This is a specialized method for making the value zero without reallocating,
66                             // which makes it vital for larger strings.
67            lbn += u32::from(c);
68            result.0.inner_mut().extend(lbn.iter_le());
69        }
70        result
71    }
72}
73impl<'a> core::iter::FromIterator<&'a char> for LBString {
74    /// Convenience implementation for collections which iterate over references to items rather than the items themselves, to avoid repetitive `.copied()` in calling code.
75    #[inline(always)]
76    fn from_iter<I: IntoIterator<Item = &'a char>>(iter: I) -> Self {
77        iter.into_iter().copied().collect::<Self>()
78    }
79}
80impl core::fmt::Display for LBString {
81    fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
82        use core::fmt::Write;
83        for c in self.chars() {
84            if let Err(e) = f.write_char(c) {return Err(e);} // Stop right where we are if we can't write anything.
85        }
86        Ok(())
87    }
88}
89/// An iterator over the codepoints in an `LBString`.
90///
91/// This resolves the codepoints on the fly, as all lazy iterators do. Thus creating such an iterator is totally free.
92///
93/// The values are **not checked when resolving,** meaning that any invalid Unicode codepoints will be carried over into the result. The reason is that the validity of the values is ensured by the `LBString` type during creation. This means that any unsafe code which incorrectly modifies an `LBString` will most likely trigger a panic or an infinite loop.
94pub struct LBCharsIter<'a> {
95    inner: &'a LBString,
96    index: usize
97}
98impl<'a> LBCharsIter<'a> {
99    pub const fn new(s: &'a LBString) -> Self {
100        Self {inner: s, index: 0}
101    }
102}
103impl<'a> Iterator for LBCharsIter<'a> {
104    type Item = char;
105    fn next(&mut self) -> Option<char> { // If anything breaks, blame this tymethod (seriously, please do).
106        use core::{convert::TryInto, hint::unreachable_unchecked};
107        let mut chosen_range = self.index..self.index;
108        loop {
109            if let Some(v) = self.inner.inner().get(self.index) {
110                self.index += 1;
111                chosen_range.end = self.index;
112                if v.is_end() {break;}
113            } else {
114                return None;
115            }
116        }
117        // inner inner inner inner inner inner inner inner inner inner inner inner inner inner inner inner inner inner inner inner inner inner inner inner inner
118        // inner inner inner inner inner inner inner inner inner inner inner inner inner inner inner inner inner inner inner inner inner inner inner inner inner
119        // inner inner inner inner inner inner inner inner inner inner inner inner inner inner inner inner inner inner inner inner inner inner inner inner inner
120        // inner inner inner inner inner inner inner inner inner inner inner inner inner inner inner inner inner inner inner inner inner inner inner inner inner
121        let refnum = TryInto::<LBNumRef>::try_into(&self.inner.inner().inner()[chosen_range])
122            .unwrap_or_else(|_| unsafe {unreachable_unchecked()}); // Value validity is a safety guarantee for LBString, which is why we can simply
123                                                                   // invoke UB if it fails. Great!
124        let codepoint = TryInto::<u32>::try_into(refnum)
125            .unwrap_or_else(|_| unsafe {unreachable_unchecked()}); // Same thing here.
126        let codepoint = TryInto::<char>::try_into(codepoint)
127            .unwrap_or_else(|_| unsafe {unreachable_unchecked()}); // And here.
128        Some(codepoint)
129    }
130}