use std::{
fmt,
io::{self, Seek, SeekFrom, Write},
mem,
};
use bitflags::bitflags;
use bstr::{BStr, ByteSlice};
mod hash;
const CHUNK_ID: &[u8; 4] = b"CQDB";
const BYTEORDER_CHECK: u32 = 0x62445371;
const NUM_TABLES: usize = 256;
bitflags! {
pub struct Flag: u32 {
const NONE = 0;
const ONEWAY = 0x00000001;
}
}
#[inline]
fn unpack_u32(buf: &[u8]) -> io::Result<u32> {
if buf.len() < 4 {
return Err(io::Error::new(
io::ErrorKind::UnexpectedEof,
"not enough data for unpacking u32",
));
}
Ok(u32::from_le_bytes([buf[0], buf[1], buf[2], buf[3]]))
}
#[inline(always)]
fn pack_u32(value: u32) -> [u8; 4] {
value.to_le_bytes()
}
#[derive(Clone)]
pub struct CQDB<'a> {
buffer: &'a [u8],
header: Header,
tables: [Table; NUM_TABLES],
bwd: Vec<u32>,
num: u32,
}
#[derive(Debug, Clone)]
#[repr(C)]
struct Header {
chunk_id: [u8; 4],
size: u32,
flag: u32,
byteorder: u32,
bwd_size: u32,
bwd_offset: u32,
}
#[derive(Debug, Clone, Default)]
struct Table {
size: usize,
num: u32,
bucket: Vec<Bucket>,
}
#[repr(C)]
struct TableRef {
offset: u32,
num: u32,
}
#[derive(Debug, Default, Clone, Copy)]
#[repr(C)]
struct Bucket {
hash: u32,
offset: u32,
}
pub struct CQDBWriter<T: Write + Seek> {
writer: T,
flag: Flag,
begin: u32,
current: u32,
tables: [Table; NUM_TABLES],
bwd: Vec<u32>,
bwd_num: u32,
bwd_size: u32,
}
impl<'a> fmt::Debug for CQDB<'a> {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
f.debug_struct("CQDB")
.field("header", &self.header)
.field("bwd", &self.bwd)
.field("num", &self.num)
.finish()
}
}
impl<T: Write + Seek + fmt::Debug> fmt::Debug for CQDBWriter<T> {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
f.debug_struct("CQDBWriter")
.field("writer", &self.writer)
.field("flag", &self.flag)
.field("begin", &self.begin)
.field("current", &self.current)
.field("bwd", &self.bwd)
.field("bwd_num", &self.bwd_num)
.field("bwd_size", &self.bwd_size)
.finish()
}
}
impl<'a> CQDB<'a> {
pub fn new(buf: &'a [u8]) -> io::Result<Self> {
if buf.len() < mem::size_of::<Header>() + mem::size_of::<TableRef>() * NUM_TABLES {
return Err(io::Error::new(io::ErrorKind::Other, "invalid file format"));
}
let magic = &buf[0..4];
if magic != CHUNK_ID {
return Err(io::Error::new(
io::ErrorKind::Other,
"invalid file format, magic mismatch",
));
}
let mut index = 4;
let chunk_size = unpack_u32(&buf[index..])?;
index += 4;
let flag = unpack_u32(&buf[index..])?;
index += 4;
let byte_order = unpack_u32(&buf[index..])?;
index += 4;
if byte_order != BYTEORDER_CHECK {
return Err(io::Error::new(
io::ErrorKind::Other,
"invalid file format, byte order mismatch",
));
}
let bwd_size = unpack_u32(&buf[index..])?;
index += 4;
let bwd_offset = unpack_u32(&buf[index..])?;
index += 4;
let header = Header {
chunk_id: *CHUNK_ID,
size: chunk_size,
flag,
byteorder: byte_order,
bwd_size,
bwd_offset,
};
let mut num_db = 0;
let mut tables: [Table; NUM_TABLES] = array_init::array_init(|_| Table::default());
for i in 0..NUM_TABLES {
let table_offset = unpack_u32(&buf[index..])?;
index += 4;
let table_num = unpack_u32(&buf[index..])?;
index += 4;
if table_offset > 0 {
let bucket = Self::read_bucket(buf, table_offset as usize, table_num as usize)?;
tables[i].bucket = bucket;
tables[i].num = table_num;
}
num_db += table_num / 2;
}
let bwd = if bwd_offset > 0 {
Self::read_backward_links(buf, bwd_offset as usize, num_db as usize)?
} else {
Vec::new()
};
Ok(Self {
buffer: buf,
header,
tables,
bwd,
num: num_db,
})
}
#[inline]
fn read_bucket(buf: &[u8], offset: usize, num: usize) -> io::Result<Vec<Bucket>> {
let mut buckets = Vec::with_capacity(num);
let mut index = offset;
for _ in 0..num {
let hash = unpack_u32(&buf[index..])?;
index += 4;
let offset = unpack_u32(&buf[index..])?;
index += 4;
buckets.push(Bucket { hash, offset });
}
Ok(buckets)
}
#[inline]
fn read_backward_links(buf: &[u8], offset: usize, num: usize) -> io::Result<Vec<u32>> {
let mut bwd = Vec::with_capacity(num);
let mut index = offset;
for _ in 0..num {
bwd.push(unpack_u32(&buf[index..])?);
index += 4;
}
Ok(bwd)
}
#[inline]
pub fn num(&self) -> u32 {
self.num
}
pub fn to_id(&self, s: &str) -> Option<u32> {
self.to_id_impl(s).unwrap_or_default()
}
#[inline]
fn to_id_impl(&self, s: &str) -> io::Result<Option<u32>> {
let hash = crate::hash::jhash(s.as_bytes(), s.len() as u32 + 1, 0);
let table_index = hash % NUM_TABLES as u32;
let table = &self.tables[table_index as usize];
if table.num > 0 && !table.bucket.is_empty() {
let n = table.num;
let mut k = (hash >> 8) % n;
loop {
let bucket = &table.bucket[k as usize];
if bucket.offset > 0 {
if bucket.hash == hash {
let mut index = bucket.offset as usize;
let value = unpack_u32(&self.buffer[index..])?;
index += 4;
let ksize = unpack_u32(&self.buffer[index..])? - 1;
index += 4;
let actual_str = &self.buffer[index..index + ksize as usize];
if s.as_bytes() == actual_str {
return Ok(Some(value));
}
}
} else {
break;
}
k = (k + 1) % n;
}
}
Ok(None)
}
pub fn to_str(&'a self, id: u32) -> Option<&'a BStr> {
self.to_str_impl(id).unwrap_or_default()
}
#[inline]
fn to_str_impl(&'a self, id: u32) -> io::Result<Option<&'a BStr>> {
if !self.bwd.is_empty() && (id as u32) < self.header.bwd_size {
let offset = self.bwd[id as usize];
if offset > 0 {
let mut index = offset as usize + 4;
let value_size = unpack_u32(&self.buffer[index..])? as usize - 1;
index += 4;
return Ok(Some(&self.buffer[index..index + value_size].as_bstr()));
}
}
Ok(None)
}
}
impl<T: Write + Seek> CQDBWriter<T> {
pub fn new(writer: T) -> io::Result<Self> {
Self::with_flag(writer, Flag::NONE)
}
pub fn with_flag(mut writer: T, flag: Flag) -> io::Result<Self> {
let begin = writer.seek(SeekFrom::Current(0))? as u32;
let current = (mem::size_of::<Header>() + mem::size_of::<TableRef>() * NUM_TABLES) as u32;
writer.seek(SeekFrom::Start((begin + current) as u64))?;
Ok(Self {
writer,
flag,
begin,
current,
tables: array_init::array_init(|_| Table::default()),
bwd: Vec::new(),
bwd_num: 0,
bwd_size: 0,
})
}
pub fn put<K: AsRef<[u8]>>(&mut self, key: K, id: u32) -> io::Result<()> {
let key = key.as_ref();
let key_size = key.len() as u32 + 1;
let hash = crate::hash::jhash(key, key_size, 0);
let table = &mut self.tables[hash as usize % 256];
self.writer.write_all(&pack_u32(id))?;
self.writer.write_all(&pack_u32(key_size as u32))?;
self.writer.write_all(key.as_bytes())?;
self.writer.write_all(b"\0")?;
if table.size <= table.num as usize {
table.size = (table.size + 1) * 2;
table.bucket.resize(table.size, Bucket::default());
}
table.bucket[table.num as usize].hash = hash;
table.bucket[table.num as usize].offset = self.current as u32;
table.num += 1;
if !self.flag.contains(Flag::ONEWAY) {
if self.bwd_size <= id {
let mut size = self.bwd_size;
while size <= id {
size = (size + 1) * 2;
}
self.bwd.resize(size as usize, 0);
self.bwd_size = size;
}
if self.bwd_num <= id {
self.bwd_num = id + 1;
}
self.bwd[id as usize] = self.current as u32;
}
self.current += 4 + 4 + key_size;
Ok(())
}
fn close(&mut self) -> io::Result<()> {
let mut header = Header {
chunk_id: *CHUNK_ID,
flag: self.flag.bits,
byteorder: BYTEORDER_CHECK,
bwd_offset: 0,
bwd_size: self.bwd_num,
size: 0,
};
for i in 0..NUM_TABLES {
let table = &self.tables[i];
if table.bucket.is_empty() {
continue;
}
let n = table.num * 2;
let mut dst = vec![Bucket::default(); n as usize];
for j in 0..table.num as usize {
let src = &table.bucket[j];
let mut k = (src.hash >> 8) % n;
while dst[k as usize].offset != 0 {
k = (k + 1) % n;
}
dst[k as usize].hash = src.hash;
dst[k as usize].offset = src.offset;
}
for k in 0..n as usize {
self.writer.write_all(&pack_u32(dst[k].hash))?;
self.writer.write_all(&pack_u32(dst[k].offset))?;
}
}
if !self.flag.contains(Flag::ONEWAY) && self.bwd_size > 0 {
let current_offset = self.writer.seek(SeekFrom::Current(0))? as u32;
header.bwd_offset = current_offset - self.begin;
for i in 0..self.bwd_num as usize {
self.writer.write_all(&pack_u32(self.bwd[i]))?;
}
}
let offset = self.writer.seek(SeekFrom::Current(0))? as u32;
header.size = offset - self.begin;
self.writer.seek(SeekFrom::Start(self.begin as u64))?;
self.writer.write_all(&header.chunk_id)?;
self.writer.write_all(&pack_u32(header.size))?;
self.writer.write_all(&pack_u32(header.flag))?;
self.writer.write_all(&pack_u32(header.byteorder))?;
self.writer.write_all(&pack_u32(header.bwd_size))?;
self.writer.write_all(&pack_u32(header.bwd_offset))?;
for i in 0..NUM_TABLES {
let table_num = self.tables[i].num;
let table_offset = if table_num > 0 {
self.current as u32
} else {
0
};
self.writer.write_all(&pack_u32(table_offset))?;
self.writer.write_all(&pack_u32(table_num * 2))?;
self.current += table_num * 2 * std::mem::size_of::<Bucket>() as u32;
}
self.writer.seek(SeekFrom::Start(offset as u64))?;
Ok(())
}
}
impl<T: Write + Seek> Drop for CQDBWriter<T> {
fn drop(&mut self) {
if let Ok(()) = self.close() {}
}
}