#[cfg(feature="serialize")]
use bitvec_serde::{order::Lsb0, vec::BitVec};
#[cfg(not(feature="serialize"))]
use bitvec::{order::Lsb0, vec::BitVec};
use smallvec::SmallVec;
#[cfg(feature="serialize")]
use serde::{Deserialize, Serialize};
/// Iterator that variable-byte encodes a `u32`, least significant 7-bit group
/// first.
///
/// Every byte carries 7 payload bits. The last byte produced (the most
/// significant group) has 128 added to it, which marks the end of the value.
#[derive(Copy, Clone, Debug)]
struct LSBEncoder {
    /// Bits that still have to be emitted; `None` once the final byte is out.
    value: Option<u32>,
}

impl LSBEncoder {
    /// Creates an encoder that will emit the bytes of `value`.
    #[inline]
    pub fn encode(value: u32) -> LSBEncoder {
        LSBEncoder { value: Some(value) }
    }
}

impl Iterator for LSBEncoder {
    type Item = u8;

    /// Returns the next byte, or `None` after the flagged final byte.
    fn next(&mut self) -> Option<Self::Item> {
        let value = self.value?;
        if value < 128 {
            // Final group: flag it with the high bit and finish.
            self.value = None;
            Some(value as u8 + 128)
        } else {
            // More groups follow: emit the low 7 bits unflagged.
            self.value = Some(value >> 7);
            Some((value & 127) as u8)
        }
    }

    /// Exact number of bytes that `next` will still produce.
    #[inline]
    fn size_hint(&self) -> (usize, Option<usize>) {
        let remaining = match self.value {
            None => 0,
            // Zero has no significant bits but is still written as one byte
            // (128), so it must not be reported as an empty iterator.
            Some(0) => 1,
            Some(v) => ((32 - v.leading_zeros() + 6) / 7) as usize,
        };
        (remaining, Some(remaining))
    }
}
impl From<MSBEncoder> for LSBEncoder {
#[inline]
fn from(e: MSBEncoder) -> LSBEncoder {
LSBEncoder {
value: e.value
}
}
}
// No `len` override: the provided `ExactSizeIterator::len` reports whatever
// `size_hint` of `LSBEncoder` returns.
impl ExactSizeIterator for LSBEncoder {}
// Marker only: once `next` has cleared `value` it keeps returning `None`.
impl core::iter::FusedIterator for LSBEncoder {}
/// Iterator that variable-byte encodes a `u32`, most significant 7-bit group
/// first.
///
/// The first byte produced has 128 added to it; all following bytes are plain
/// 7-bit groups. A value always produces `ceil(significant_bits / 7)` bytes
/// (one byte for zero), including groups whose bits are all zero.
#[derive(Copy, Clone, Debug)]
struct MSBEncoder {
    /// `true` once the first (flag carrying) byte has been produced.
    flagged: bool,
    /// `None` when exhausted.
    ///
    /// While `flagged` is `false` this is the plain value to encode. After the
    /// first byte it holds the not-yet-emitted low bits plus a single marker
    /// bit directly above them (at bit `7 * pending_groups`). The marker keeps
    /// the number of pending groups known even when those groups are all zero;
    /// it is never part of the output.
    value: Option<u32>,
}

impl MSBEncoder {
    /// Creates an encoder that will emit the bytes of `value`.
    #[inline]
    pub fn encode(value: u32) -> MSBEncoder {
        MSBEncoder {
            flagged: false,
            value: Some(value),
        }
    }

    /// Number of 7-bit groups needed to write `v`; zero still needs one group.
    #[inline]
    fn groups(v: u32) -> u32 {
        let bits = 32 - v.leading_zeros();
        if bits == 0 {
            1
        } else {
            (bits + 6) / 7
        }
    }
}

impl Iterator for MSBEncoder {
    type Item = u8;

    /// Returns the next byte, most significant group first.
    fn next(&mut self) -> Option<Self::Item> {
        let v = self.value?;
        // `pending_bits` is the number of payload bits left after this byte.
        let (byte, pending_bits) = if self.flagged {
            // The highest set bit is the marker; the group to emit is the
            // 7 bits directly below it. The marker is always present here
            // because it is stored whenever groups remain.
            let marker_pos = 31 - v.leading_zeros();
            let shift = marker_pos - 7;
            (((v >> shift) & 127) as u8, shift)
        } else {
            self.flagged = true;
            let shift = 7 * (Self::groups(v) - 1);
            // The top group is below 128 by construction, so adding the flag
            // cannot overflow a `u8`.
            (128 + (v >> shift) as u8, shift)
        };
        self.value = if pending_bits == 0 {
            None
        } else {
            let marker = 1u32 << pending_bits;
            Some((v & (marker - 1)) | marker)
        };
        Some(byte)
    }

    /// Exact number of bytes that `next` will still produce.
    #[inline]
    fn size_hint(&self) -> (usize, Option<usize>) {
        let remaining = match self.value {
            None => 0,
            // Marker position divided by the group width.
            Some(v) if self.flagged => (31 - v.leading_zeros()) / 7,
            Some(v) => Self::groups(v),
        };
        let remaining = remaining as usize;
        (remaining, Some(remaining))
    }
}
impl From<LSBEncoder> for MSBEncoder {
#[inline]
fn from(e: LSBEncoder) -> MSBEncoder {
MSBEncoder {
flagged: false,
value: e.value
}
}
}
// No `len` override: the provided `ExactSizeIterator::len` reports whatever
// `size_hint` of `MSBEncoder` returns.
impl ExactSizeIterator for MSBEncoder {}
// Marker only: once `next` has set `value` to `None` it keeps returning `None`.
impl core::iter::FusedIterator for MSBEncoder {}
/// Walks the characters of a string and yields, for each one, the distance to
/// the previous character as `(is_negative, LSBEncoder over the magnitude)`.
///
/// The first character has no predecessor and is reported as its own code
/// point with a `false` sign.
struct CharsEncoder<'a> {
    /// Code point of the character returned last, if any.
    prev: Option<u32>,
    /// Remaining characters of the input.
    chars: std::str::Chars<'a>
}

impl<'a> CharsEncoder<'a> {
    /// Starts encoding `s` from its first character.
    #[inline]
    pub fn encode(s: &str) -> CharsEncoder {
        CharsEncoder {
            chars: s.chars(),
            prev: None
        }
    }
}

impl<'a> Iterator for CharsEncoder<'a> {
    type Item=(bool, LSBEncoder);

    fn next(&mut self) -> Option<Self::Item> {
        let code = self.chars.next()? as u32;
        let item = match self.prev.replace(code) {
            // Going down: flag the gap as negative and encode its magnitude.
            Some(before) if code < before => (true, LSBEncoder::encode(before - code)),
            // Same character or going up.
            Some(before) => (false, LSBEncoder::encode(code - before)),
            // Very first character.
            None => (false, LSBEncoder::encode(code)),
        };
        Some(item)
    }

    /// One item is produced per character, so the bounds of the underlying
    /// `Chars` iterator apply unchanged.
    #[inline]
    fn size_hint(&self) -> (usize, Option<usize>) {
        self.chars.size_hint()
    }
}
/// Walks the characters of a string and yields, for each one, the distance to
/// the previous character as `(is_negative, MSBEncoder over the magnitude)`.
///
/// The first character has no predecessor and is reported as its own code
/// point with a `false` sign.
#[derive(Debug)]
struct MSBCharsEncoder<'a> {
    /// Code point of the character returned last, if any.
    prev: Option<u32>,
    /// Remaining characters of the input.
    chars: std::str::Chars<'a>
}

impl<'a> MSBCharsEncoder<'a> {
    /// Starts encoding `s` from its first character.
    #[inline]
    pub fn encode(s: &str) -> MSBCharsEncoder {
        MSBCharsEncoder {
            chars: s.chars(),
            prev: None
        }
    }
}

impl<'a> Iterator for MSBCharsEncoder<'a> {
    type Item=(bool, MSBEncoder);

    fn next(&mut self) -> Option<Self::Item> {
        let code = self.chars.next()? as u32;
        let item = match self.prev.replace(code) {
            // Going down: flag the gap as negative and encode its magnitude.
            Some(before) if code < before => (true, MSBEncoder::encode(before - code)),
            // Same character or going up.
            Some(before) => (false, MSBEncoder::encode(code - before)),
            // Very first character.
            None => (false, MSBEncoder::encode(code)),
        };
        Some(item)
    }

    /// One item is produced per character, so the bounds of the underlying
    /// `Chars` iterator apply unchanged.
    #[inline]
    fn size_hint(&self) -> (usize, Option<usize>) {
        self.chars.size_hint()
    }
}
/// A string stored as variable-byte encoded gaps between consecutive
/// characters.
///
/// `buffer` holds the magnitude of every gap (least significant 7-bit group
/// first, last byte of a gap flagged with 128) and `sign` holds one bit per
/// character telling whether that gap is negative.
///
/// With the `serialize` feature enabled the type additionally derives
/// `Serialize` and `Deserialize`.
// A single definition with `cfg_attr` replaces the two `cfg`-gated copies so
// the fields and the always-on derives cannot drift apart.
#[derive(Clone, Debug, Eq, Hash, PartialEq)]
#[cfg_attr(feature = "serialize", derive(Deserialize, Serialize))]
pub struct VarByteString {
    buffer: Vec<u8>,
    sign: BitVec<Lsb0, u8>
}
impl VarByteString {
    /// Number of bytes in the gap buffer.
    #[inline]
    pub fn buffer_len(&self) -> usize {
        self.buffer.len()
    }

    /// Number of sign bits stored.
    #[inline]
    pub fn sign_len(&self) -> usize {
        self.sign.len()
    }

    /// Payload size estimate in bytes: the gap buffer plus `sign_len / 8 + 1`
    /// bytes for the sign bits.
    ///
    /// Note that this is never zero: the `+ 1` is applied even when the number
    /// of sign bits is zero or an exact multiple of eight.
    #[inline]
    pub fn len(&self) -> usize {
        let sign_bytes = self.sign_len() / 8 + 1;
        self.buffer_len() + sign_bytes
    }

    /// Size estimate in bytes including the two container headers: the same
    /// payload estimate as `len` plus `size_of` of the `BitVec` and the `Vec`.
    #[inline]
    pub fn size(&self) -> usize {
        let headers = core::mem::size_of::<BitVec<Lsb0, u8>>() + core::mem::size_of::<Vec<u8>>();
        (self.sign_len() / 8) + 1 + self.buffer_len() + headers
    }

    /// Iterator over the decoded characters, starting at the beginning.
    #[inline]
    pub fn chars(&self) -> Chars {
        Chars {
            vbs: self,
            prev_char: 0,
            byte_cursor: 0,
            sign_cursor: 0
        }
    }

    /// Iterator over the signed gaps between consecutive characters.
    #[inline]
    pub fn gaps(&self) -> Gaps {
        Gaps {
            vbs: self,
            byte_cursor: 0,
            sign_cursor: 0
        }
    }

    /// Iterator over `(sign, raw encoded bytes)` of every gap.
    #[inline]
    pub fn gaps_bytes(&self) -> GapsBytes {
        GapsBytes {
            vbs: self,
            byte_cursor: 0,
            sign_cursor: 0
        }
    }
}
impl<'a> Into<String> for &'a VarByteString {
fn into(self) -> String {
let mut result = String::with_capacity(self.sign_len());
self.chars().for_each(|c| {
result.push(c);
});
result
}
}
impl<'a> Into<String> for VarByteString {
fn into(self) -> String {
let mut result = String::with_capacity(self.sign_len());
self.chars().for_each(|c| {
result.push(c);
});
result
}
}
/// Equality against anything that borrows as `str`, decided on the encoded
/// form without decoding `self`.
impl<S> core::cmp::PartialEq<S> for VarByteString where S: core::borrow::Borrow<str> {
    /// Returns `true` only when both sides have the same number of characters
    /// and every gap has the same sign and the same encoded bytes.
    fn eq(&self, other: &S) -> bool {
        let mut lhs = self.gaps_bytes();
        let mut rhs = CharsEncoder::encode(other.borrow());
        // Both sides are advanced in lock step instead of being zipped:
        // zipping stops at the shorter side and would report a mere prefix
        // (or the empty string) as equal.
        loop {
            match (lhs.next(), rhs.next()) {
                (Some((lhs_sign, lhs_bytes)), Some((rhs_sign, rhs_bytes))) => {
                    if lhs_sign != rhs_sign || !rhs_bytes.eq(lhs_bytes) {
                        return false;
                    }
                }
                (None, None) => return true,
                // One side ran out before the other: different lengths.
                _ => return false,
            }
        }
    }
}
impl<S> core::cmp::PartialOrd<S> for VarByteString where S: core::borrow::Borrow<str> {
fn partial_cmp(&self, other: &S) -> Option<core::cmp::Ordering> {
let mut var_bytes = self.gaps_bytes();
let mut last_cmp = core::cmp::Ordering::Less;
for (other_sign, other_encoded) in MSBCharsEncoder::encode(other.borrow()) {
if let Some((sign, bytes)) = var_bytes.next() {
if sign {
if other_sign {
last_cmp = if other_encoded.len() == bytes.len() {
other_encoded.cmp(bytes.into_iter().rev())
} else {
other_encoded.len().cmp(&bytes.len())
};
match last_cmp {
core::cmp::Ordering::Equal => (),
_ => break
}
} else {
last_cmp = core::cmp::Ordering::Less;
break
}
} else {
if other_sign {
last_cmp = core::cmp::Ordering::Greater;
break;
} else {
last_cmp = if other_encoded.len() == bytes.len() {
other_encoded.cmp(bytes.into_iter().rev()).reverse()
} else {
bytes.len().cmp(&other_encoded.len())
};
match last_cmp {
core::cmp::Ordering::Equal => (),
_ => break
}
}
}
} else {
last_cmp = core::cmp::Ordering::Less;
break;
}
}
Some(last_cmp)
}
}
/// Displays the decoded text.
impl core::fmt::Display for VarByteString {
    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
        // Stream the decoded characters straight into the formatter instead
        // of first collecting them into a temporary `String`.
        for c in self.chars() {
            core::fmt::Write::write_char(&mut *f, c)?;
        }
        Ok(())
    }
}
/// Encodes anything that borrows as `str` into a `VarByteString`.
impl<S> From<S> for VarByteString where S: core::borrow::Borrow<str> {
    fn from(s: S) -> VarByteString {
        let text = s.borrow();
        // The UTF-8 byte length is used as the initial capacity of both
        // containers.
        let mut buffer = Vec::with_capacity(text.len());
        let mut sign = BitVec::with_capacity(text.len());
        for (negative, gap_bytes) in CharsEncoder::encode(text) {
            // Magnitude bytes of this gap first, then its single sign bit.
            for byte in gap_bytes {
                buffer.push(byte);
            }
            sign.push(negative);
        }
        VarByteString { buffer, sign }
    }
}
/// Iterator over the decoded characters of a `VarByteString`.
pub struct Chars<'a> {
    /// Index of the next unread byte in the gap buffer.
    byte_cursor: usize,
    /// Index of the next unread sign bit.
    sign_cursor: usize,
    /// Code point of the character returned last (0 before the first one).
    prev_char: u32,
    vbs: &'a VarByteString,
}

impl<'a> Iterator for Chars<'a> {
    type Item=char;

    /// Decodes the next gap, applies it to the previous code point and
    /// returns the resulting character.
    ///
    /// Returns `None` when the buffer is exhausted, when it ends in the
    /// middle of a gap, or when the decoded code point is not a valid `char`
    /// (after which the iterator is exhausted).
    fn next(&mut self) -> Option<Self::Item> {
        let vbs = self.vbs;
        let mut shift = 0;
        let mut gap: u32 = 0;
        for &byte in vbs.buffer.get(self.byte_cursor..)? {
            self.byte_cursor += 1;
            if byte < 128 {
                // Unflagged byte: 7 more low-order bits of the current gap.
                gap += (byte as u32) << shift;
                shift += 7;
                continue;
            }
            // Flagged byte: last (most significant) group of this gap.
            gap += (byte as u32 - 128) << shift;
            if vbs.sign[self.sign_cursor] {
                self.prev_char -= gap;
            } else {
                self.prev_char += gap;
            }
            self.sign_cursor += 1;
            // Checked conversion instead of `from_u32_unchecked`: the fields
            // can also be filled by deserialization (`serialize` feature),
            // so a valid scalar value cannot be taken for granted here.
            let decoded = core::char::from_u32(self.prev_char);
            if decoded.is_none() {
                // Stop for good so the iterator stays fused.
                self.byte_cursor = vbs.buffer.len();
            }
            return decoded;
        }
        None
    }

    /// Remaining characters, counted as unread sign bits (one per character).
    #[inline]
    fn size_hint(&self) -> (usize, Option<usize>) {
        let remaining = self.vbs.sign.len().saturating_sub(self.sign_cursor);
        (remaining, Some(remaining))
    }
}

impl<'a> core::iter::FusedIterator for Chars<'a> {}

impl<'a> core::iter::ExactSizeIterator for Chars<'a> {
    /// Number of characters still to be returned (not the total count).
    fn len(&self) -> usize {
        self.vbs.sign.len().saturating_sub(self.sign_cursor)
    }
}
/// Iterator over the signed gaps between consecutive characters of a
/// `VarByteString`; the first item is the first character's code point.
pub struct Gaps<'a> {
    /// Index of the next unread byte in the gap buffer.
    byte_cursor: usize,
    /// Index of the next unread sign bit.
    sign_cursor: usize,
    vbs: &'a VarByteString,
}

impl<'a> Iterator for Gaps<'a> {
    type Item=i64;

    /// Decodes the next gap and returns it with its sign applied.
    ///
    /// Returns `None` when the buffer is exhausted or ends in the middle of a
    /// gap.
    fn next(&mut self) -> Option<Self::Item> {
        let vbs = self.vbs;
        let mut shift = 0;
        let mut magnitude: i64 = 0;
        for &byte in vbs.buffer.get(self.byte_cursor..)? {
            self.byte_cursor += 1;
            if byte < 128 {
                // Unflagged byte: 7 more low-order bits of the current gap.
                magnitude += (byte as i64) << shift;
                shift += 7;
                continue;
            }
            // Flagged byte: last (most significant) group of this gap.
            magnitude += (byte as i64 - 128) << shift;
            let negative = vbs.sign[self.sign_cursor];
            self.sign_cursor += 1;
            return Some(if negative { -magnitude } else { magnitude });
        }
        None
    }

    /// Remaining gaps, counted as unread sign bits (one per gap).
    #[inline]
    fn size_hint(&self) -> (usize, Option<usize>) {
        let remaining = self.vbs.sign.len().saturating_sub(self.sign_cursor);
        (remaining, Some(remaining))
    }
}

impl<'a> core::iter::FusedIterator for Gaps<'a> {}

impl<'a> core::iter::ExactSizeIterator for Gaps<'a> {
    /// Number of gaps still to be returned (not the total count).
    fn len(&self) -> usize {
        self.vbs.sign.len().saturating_sub(self.sign_cursor)
    }
}
/// Iterator over the raw encoding of every gap of a `VarByteString`:
/// `(is_negative, bytes)` with the bytes in stored order (least significant
/// group first, last byte flagged with 128).
pub struct GapsBytes<'a> {
    /// Index of the next unread byte in the gap buffer.
    byte_cursor: usize,
    /// Index of the next unread sign bit.
    sign_cursor: usize,
    vbs: &'a VarByteString
}

impl<'a> Iterator for GapsBytes<'a> {
    type Item = (bool, SmallVec<[u8; 5]>);

    /// Returns the sign and the encoded bytes of the next gap, or `None` once
    /// every sign bit has been consumed.
    ///
    /// Panics if the buffer holds no flagged byte for a remaining sign bit.
    fn next(&mut self) -> Option<Self::Item> {
        let vbs = self.vbs;
        if self.sign_cursor >= vbs.sign_len() {
            return None;
        }
        let sign = vbs.sign[self.sign_cursor];
        self.sign_cursor += 1;

        let pending = &vbs.buffer[self.byte_cursor..];
        // Unflagged bytes that precede the flagged terminator of this gap.
        let continuation = pending.iter().take_while(|b| **b < 128).count();
        let mut bytes = SmallVec::with_capacity(5);
        // Inclusive range: the flagged terminator belongs to the gap too.
        for &byte in &pending[..=continuation] {
            bytes.push(byte);
        }
        self.byte_cursor += continuation + 1;
        Some((sign, bytes))
    }

    /// Remaining gaps, counted as unread sign bits (one per gap).
    #[inline]
    fn size_hint(&self) -> (usize, Option<usize>) {
        let remaining = self.vbs.sign_len().saturating_sub(self.sign_cursor);
        (remaining, Some(remaining))
    }
}

impl<'a> core::iter::FusedIterator for GapsBytes<'a> {}

impl<'a> core::iter::ExactSizeIterator for GapsBytes<'a> {
    /// Number of gaps still to be returned (not the total count).
    fn len(&self) -> usize {
        self.vbs.sign_len().saturating_sub(self.sign_cursor)
    }
}
// Unit tests for the encoders, the iterators and the trait impls of
// `VarByteString`.
#[cfg(test)]
mod tests {
use super::*;
use smallvec::smallvec;
// Encoding a mixed ASCII/Thai string and converting it back into a `String`
// must reproduce the input.
#[test]
fn convert_back_forth() {
let val = "Some value นะครับนะ";
let var_bytes = VarByteString::from(val);
let back : String = var_bytes.into();
assert_eq!(val, back.as_str());
}
// `Display` output of a short string equals the original text.
#[test]
fn short_text_display() {
use core::fmt::Write as FmtWrite;
let val = "Some value นะครับนะ";
let var_bytes = VarByteString::from(val);
let mut disp = String::new();
write!(&mut disp, "{}", var_bytes).unwrap();
assert_eq!(val, disp);
}
// Same as `short_text_display`, with a longer mixed-language text.
#[test]
fn long_text_display() {
use core::fmt::Write as FmtWrite;
let val = "Some really long text and may contains some different language like \"คำภาษาไทยที่ใช้พื้นที่เยอะกว่าเนื้อความภาษาอังกฤษเสียอีก\".";
let var_bytes = VarByteString::from(val);
let mut disp = String::new();
write!(&mut disp, "{}", var_bytes).unwrap();
assert_eq!(val, disp);
}
// `gaps()` yields the first code point followed by the signed differences
// between consecutive characters.
#[test]
fn gaps() {
let val = "abaกขa";
let var_bytes = VarByteString::from(val);
let expected = vec![b'a' as i64, 1, -1, 3488, 1, -3489];
let gaps: Vec<i64> = var_bytes.gaps().collect();
assert_eq!(expected, gaps);
}
// `gaps_bytes()` yields, per gap, the sign flag and the stored bytes: low
// 7-bit group first, final byte carrying the +128 flag.
#[test]
fn gaps_bytes() {
let val = "abaกขa";
let var_bytes = VarByteString::from(val);
let expected = vec![(false, smallvec!(b'a' + 128 as u8)), (false, smallvec![1 + 128]), (true, smallvec![1 + 128]), (false, smallvec![32, 27 + 128]), (false, smallvec![1 + 128]), (true, smallvec![33, 27 + 128])];
let gaps: Vec<(bool, SmallVec<[u8; 5]>)> = var_bytes.gaps_bytes().collect();
assert_eq!(expected, gaps);
}
// Byte-level expectations for `MSBEncoder`: first byte flagged with 128,
// most significant group first. Each produced byte is checked against the
// expected vector by index.
#[test]
fn msb_encode_int() {
use smallvec::smallvec;
let max: SmallVec<[u8; 5]> = smallvec![0b1000_1111, 127, 127, 127, 127];
MSBEncoder::encode(u32::MAX).enumerate().for_each(|(i, byte)| {
assert_eq!(byte, max[i]);
});
let min: SmallVec<[u8; 1]> = smallvec![128];
MSBEncoder::encode(0).enumerate().for_each(|(i, byte)| {
assert_eq!(byte, min[i]);
});
let imin: SmallVec<[u8; 5]> = smallvec![0b1000_0111, 127, 127, 127, 127];
MSBEncoder::encode(i32::MAX as u32).enumerate().for_each(|(i, byte)| {
assert_eq!(byte, imin[i]);
});
let one: SmallVec<[u8; 1]> = smallvec![0b1000_0001];
MSBEncoder::encode(1).enumerate().for_each(|(i, byte)| {
assert_eq!(byte, one[i]);
});
let thousand: SmallVec<[u8; 2]> = smallvec![0b1000_0111, 0b0110_1000];
MSBEncoder::encode(1000).enumerate().for_each(|(i, byte)| {
assert_eq!(byte, thousand[i]);
});
let hundredthousand: SmallVec<[u8; 3]> = smallvec![0b1000_0110, 0b0000_1101, 0b0010_0000];
MSBEncoder::encode(100_000).enumerate().for_each(|(i, byte)| {
assert_eq!(byte, hundredthousand[i]);
});
let tenmillion: SmallVec<[u8; 4]> = smallvec![0b1000_0100, 0b0110_0010, 0b0010_1101 ,0];
MSBEncoder::encode(10_000_000).enumerate().for_each(|(i, byte)| {
assert_eq!(byte, tenmillion[i]);
});
}
// `VarByteString == &str` holds for the source text and fails for a
// different string.
#[test]
fn eq_str() {
let s = "abcba axa ทดสอบด้วยคำภาษาไทย";
let not = "bbb";
let vbs = VarByteString::from(s);
assert_eq!(vbs, s);
assert_ne!(vbs, not);
}
// Ordering of a `VarByteString` against `&str` values.
// Note: the assertions that only involve `s1`, `s3` and `s5` compare plain
// `&str` values with each other and do not exercise `VarByteString`.
#[test]
fn cmp_str() {
let s1 = "abc";
let s2 = "abd";
let s3 = "bbc";
let s5 = "กก ";
let s6 = "ก ก";
let s7 = " ก ก";
let vbs = VarByteString::from(s1);
assert!(vbs < s2);
assert!(s3 > s1);
assert_eq!(s1 > s3, false);
assert!(s1 < s5);
assert_eq!(s1 == s3, false);
let thai_vbs = VarByteString::from(s5);
assert!(thai_vbs > s1);
assert!(thai_vbs > s2);
assert!(thai_vbs > s3);
assert!(thai_vbs == s5);
assert!(thai_vbs > s6);
assert!(thai_vbs > s7);
assert!(vbs > s7);
}
// A `VarByteString` can be used as a `HashMap` key and looked up again.
#[test]
fn hash_obj() {
let original = "Some really long text and may contains some different language like \"คำภาษาไทยที่ใช้พื้นที่เยอะกว่าเนื้อความภาษาอังกฤษเสียอีก\".";
let encoded = VarByteString::from(original);
let mut hm = std::collections::HashMap::new();
hm.insert(encoded.clone(), 1);
assert_eq!(hm.get(&encoded), Some(&1));
}
}