// cqdb — src/lib.rs
1//! Rust implementation of [Constant Quark Database](http://www.chokkan.org/software/cqdb/):
2//! a database library specialized for serialization and retrieval of static associations between strings and integer identifiers
3use std::{
4    fmt,
5    io::{self, Seek, SeekFrom, Write},
6    mem,
7};
8
9use bitflags::bitflags;
10use bstr::{BStr, ByteSlice};
11
12mod hash;
13
/// Chunk identifier written at the start of every CQDB file, "CQDB"
const CHUNK_ID: &[u8; 4] = b"CQDB";
/// Byte-order marker stored in the header; readers reject files whose
/// marker does not match, i.e. files written with a different endianness
const BYTEORDER_CHECK: u32 = 0x62445371;
/// Number of hash tables; records are distributed by the low byte of the hash
const NUM_TABLES: usize = 256;
17
bitflags! {
    /// CQDB writer flag, persisted verbatim in the chunk header's `flag` field
    #[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
    pub struct Flag: u32 {
        /// No flag, default
        const NONE = 0;
        /// A reverse lookup array is omitted (id -> string lookup is disabled)
        const ONEWAY = 0x00000001;
    }
}
28
/// Read a little-endian `u32` from `buf` starting at `offset`.
///
/// Takes one 4-byte subslice so there is a single bounds check rather than
/// four individual byte accesses. Panics if `offset + 4` exceeds the buffer;
/// callers are expected to have validated the buffer structure beforehand.
#[inline(always)]
fn read_u32_le(buf: &[u8], offset: usize) -> u32 {
    let w = &buf[offset..offset + 4];
    (w[0] as u32) | (w[1] as u32) << 8 | (w[2] as u32) << 16 | (w[3] as u32) << 24
}
37
/// Serialize a `u32` into its 4-byte little-endian on-disk representation.
#[inline(always)]
fn pack_u32(value: u32) -> [u8; 4] {
    u32::to_le_bytes(value)
}
42
/// Zero-copy hash table reference into the buffer.
///
/// Buckets are decoded on demand during lookup; the (offset, num) bounds are
/// validated once when the database is opened.
#[derive(Debug, Clone, Copy, Default)]
struct ReadTable {
    /// Offset into buffer where the bucket array starts (0 when the table is absent)
    offset: usize,
    /// Number of elements in the hash table
    num: u32,
}
51
/// Constant quark database (CQDB)
///
/// A read-only view over a serialized database buffer; lookups borrow
/// directly from `buffer` without copying.
#[derive(Clone)]
pub struct CQDB<'a> {
    /// Database file buffer
    buffer: &'a [u8],
    /// Chunk header
    header: Header,
    /// Hash tables (string -> id), zero-copy references into buffer
    tables: [ReadTable; NUM_TABLES],
    /// Offset to the backward link array in the buffer (0 if none)
    bwd_offset: usize,
    /// Number of key/data pairs
    num: u32,
}
66
/// CQDB chunk header
///
/// Serialized as the 4-byte magic followed by five little-endian `u32`
/// fields (24 bytes total) at the start of the chunk.
#[derive(Debug, Clone)]
#[repr(C)]
struct Header {
    /// Chunk identifier, "CQDB"
    chunk_id: [u8; 4],
    /// Chunk size including this header
    size: u32,
    /// Global flags
    flag: u32,
    /// Byte-order indicator
    byteorder: u32,
    /// Number of elements in the backward array
    bwd_size: u32,
    /// Offset to the backward array
    bwd_offset: u32,
}
84
/// A hash table under construction (used by the writer)
#[derive(Debug, Clone, Default)]
struct Table {
    /// Allocated length of `bucket` (grown geometrically in `put`)
    size: usize,
    /// Number of elements in the table
    num: u32,
    /// Array of Bucket
    bucket: Vec<Bucket>,
}
94
/// On-disk reference to one hash table, stored after the header
/// (NUM_TABLES of these, 8 bytes each); also used via `mem::size_of`
/// to compute the size of the fixed header region.
#[repr(C)]
struct TableRef {
    /// Offset to a hash table
    offset: u32,
    /// Number of elements in the hash table
    num: u32,
}
102
/// An element of a hash table
///
/// `repr(C)` with two `u32` fields so that, on little-endian targets, its
/// in-memory bytes match the on-disk bucket layout exactly (the writer
/// relies on this to serialize whole bucket arrays in one write).
#[derive(Debug, Default, Clone, Copy)]
#[repr(C)]
struct Bucket {
    /// Hash value of the record
    hash: u32,
    /// Offset address to the actual record
    offset: u32,
}
112
/// Writer for a constant quark database
///
/// Records are appended as they are `put`; the header, hash tables and
/// backlink array are written when the writer is finalized (on drop).
pub struct CQDBWriter<T: Write + Seek> {
    writer: T,
    /// Operation flag
    flag: Flag,
    /// Offset address to the head of this database
    begin: u32,
    /// Offset address to a new key/data pair
    current: u32,
    /// Hash tables (string -> id)
    tables: [Table; NUM_TABLES],
    /// Backlink array (indexed by id, value = record offset)
    bwd: Vec<u32>,
    /// Highest id stored plus one
    bwd_num: u32,
    /// Number of elements in the backlink array
    bwd_size: u32,
}
130
131impl<'a> fmt::Debug for CQDB<'a> {
132    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
133        f.debug_struct("CQDB")
134            .field("header", &self.header)
135            .field("bwd_offset", &self.bwd_offset)
136            .field("num", &self.num)
137            .finish()
138    }
139}
140
141impl<T: Write + Seek + fmt::Debug> fmt::Debug for CQDBWriter<T> {
142    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
143        f.debug_struct("CQDBWriter")
144            .field("writer", &self.writer)
145            .field("flag", &self.flag)
146            .field("begin", &self.begin)
147            .field("current", &self.current)
148            .field("bwd", &self.bwd)
149            .field("bwd_num", &self.bwd_num)
150            .field("bwd_size", &self.bwd_size)
151            .finish()
152    }
153}
154
155impl<'a> CQDB<'a> {
156    pub fn new(buf: &'a [u8]) -> io::Result<Self> {
157        let min_size = mem::size_of::<Header>() + mem::size_of::<TableRef>() * NUM_TABLES;
158        if buf.len() < min_size {
159            // The minimum size of a valid CQDB
160            return Err(io::Error::other("invalid file format"));
161        }
162        // Check the file chunkid
163        if &buf[0..4] != CHUNK_ID {
164            return Err(io::Error::other("invalid file format, magic mismatch"));
165        }
166        let chunk_size = read_u32_le(buf, 4);
167        let flag = read_u32_le(buf, 8);
168        let byte_order = read_u32_le(buf, 12);
169        // Check the consistency of byte order
170        if byte_order != BYTEORDER_CHECK {
171            return Err(io::Error::other("invalid file format, byte order mismatch"));
172        }
173        let bwd_size = read_u32_le(buf, 16);
174        let bwd_offset_raw = read_u32_le(buf, 20);
175        let header = Header {
176            chunk_id: *CHUNK_ID,
177            size: chunk_size,
178            flag,
179            byteorder: byte_order,
180            bwd_size,
181            bwd_offset: bwd_offset_raw,
182        };
183
184        // Parse table references (zero-copy: just store offset + count)
185        let mut num_db = 0u32;
186        let mut tables = [ReadTable::default(); NUM_TABLES];
187        let mut index = 24; // After 6 × u32 header fields
188        for table in &mut tables {
189            let table_offset = read_u32_le(buf, index) as usize;
190            index += 4;
191            let table_num = read_u32_le(buf, index);
192            index += 4;
193            if table_offset > 0 {
194                // Validate that bucket data fits within the buffer (checked arithmetic for overflow)
195                let end = (table_num as usize)
196                    .checked_mul(8)
197                    .and_then(|bytes| table_offset.checked_add(bytes));
198                match end {
199                    Some(end) if end <= buf.len() => {
200                        table.offset = table_offset;
201                        table.num = table_num;
202                    }
203                    _ => return Err(io::Error::other("invalid table data: out of bounds")),
204                }
205            }
206            // The number of records is the half of the table size
207            num_db += table_num / 2;
208        }
209
210        // Validate backward link array bounds
211        let bwd_offset = if bwd_offset_raw > 0 {
212            let off = bwd_offset_raw as usize;
213            let end = (bwd_size as usize)
214                .checked_mul(4)
215                .and_then(|bytes| off.checked_add(bytes));
216            match end {
217                Some(end) if end <= buf.len() => off,
218                _ => {
219                    return Err(io::Error::other(
220                        "invalid backward link data: out of bounds",
221                    ));
222                }
223            }
224        } else {
225            0
226        };
227
228        Ok(Self {
229            buffer: buf,
230            header,
231            tables,
232            bwd_offset,
233            num: num_db,
234        })
235    }
236
237    /// Get the number of associations in the database
238    #[inline]
239    pub fn num(&self) -> u32 {
240        self.num
241    }
242
243    /// Retrieve the identifier associated with a string
244    #[inline]
245    pub fn to_id(&self, s: &str) -> Option<u32> {
246        let hash = crate::hash::jhash(s.as_bytes(), s.len() as u32 + 1, 0);
247        let table = &self.tables[(hash % NUM_TABLES as u32) as usize];
248        if table.num > 0 {
249            let n = table.num;
250            let base = table.offset;
251            let mut k = (hash >> 8) % n;
252            loop {
253                // Single bounds check for both hash + offset (8 bytes)
254                let bk = &self.buffer[base + (k as usize) * 8..][..8];
255                let bucket_offset = u32::from_le_bytes([bk[4], bk[5], bk[6], bk[7]]);
256                if bucket_offset > 0 {
257                    let bucket_hash = u32::from_le_bytes([bk[0], bk[1], bk[2], bk[3]]);
258                    if bucket_hash == hash {
259                        // Record reads use offsets from file content — use checked access
260                        let rec_start = bucket_offset as usize;
261                        let rec = self.buffer.get(rec_start..rec_start + 8)?;
262                        let value = u32::from_le_bytes([rec[0], rec[1], rec[2], rec[3]]);
263                        let ksize = (u32::from_le_bytes([rec[4], rec[5], rec[6], rec[7]]) as usize)
264                            .checked_sub(1)?; // ksize includes NUL
265                        let key_end = rec_start.checked_add(8 + ksize)?;
266                        if s.as_bytes() == self.buffer.get(rec_start + 8..key_end)? {
267                            return Some(value);
268                        }
269                    }
270                } else {
271                    break;
272                }
273                k = (k + 1) % n;
274            }
275        }
276        None
277    }
278
279    /// Retrieve the string associated with an identifier
280    #[inline]
281    pub fn to_str(&'a self, id: u32) -> Option<&'a BStr> {
282        // Check if the current database supports the backward lookup
283        if self.bwd_offset > 0 && id < self.header.bwd_size {
284            // bwd array read is safe: bounds validated in new()
285            let offset = read_u32_le(self.buffer, self.bwd_offset + (id as usize) * 4);
286            if offset > 0 {
287                // Record reads use offsets from file content — use checked access
288                let index = offset as usize + 4; // Skip id field
289                let rec = self.buffer.get(index..index + 4)?;
290                let value_size = (u32::from_le_bytes([rec[0], rec[1], rec[2], rec[3]]) as usize)
291                    .checked_sub(1)?; // includes NUL
292                let start = index + 4;
293                let end = start.checked_add(value_size)?;
294                return Some(self.buffer.get(start..end)?.as_bstr());
295            }
296        }
297        None
298    }
299
300    /// An iterator visiting all id, string pairs in order.
301    pub fn iter(&'a self) -> Iter<'a> {
302        Iter { db: self, next: 0 }
303    }
304}
305
/// CQDB iterator yielding (id, string) pairs in ascending id order
pub struct Iter<'a> {
    /// Database being iterated
    db: &'a CQDB<'a>,
    /// Next id to yield
    next: u32,
}
311
312impl<'a> Iterator for Iter<'a> {
313    type Item = io::Result<(u32, &'a BStr)>;
314
315    fn next(&mut self) -> Option<Self::Item> {
316        let id = self.next;
317        if let Some(s) = self.db.to_str(id) {
318            self.next += 1;
319            return Some(Ok((id, s)));
320        }
321        None
322    }
323
324    fn size_hint(&self) -> (usize, Option<usize>) {
325        let remaining = if self.db.bwd_offset > 0 {
326            self.db.header.bwd_size.saturating_sub(self.next) as usize
327        } else {
328            0
329        };
330        (0, Some(remaining))
331    }
332}
333
impl<'a> IntoIterator for &'a CQDB<'a> {
    type Item = io::Result<(u32, &'a BStr)>;
    type IntoIter = Iter<'a>;

    /// Delegates to [`CQDB::iter`], enabling `for` loops over `&CQDB`.
    #[inline]
    fn into_iter(self) -> Self::IntoIter {
        self.iter()
    }
}
343
344impl<T: Write + Seek> CQDBWriter<T> {
345    /// Create a new CQDB writer
346    pub fn new(writer: T) -> io::Result<Self> {
347        Self::with_flag(writer, Flag::NONE)
348    }
349
350    /// Create a new CQDB writer with flag
351    pub fn with_flag(mut writer: T, flag: Flag) -> io::Result<Self> {
352        let begin = writer.stream_position()? as u32;
353        let current = (mem::size_of::<Header>() + mem::size_of::<TableRef>() * NUM_TABLES) as u32;
354        // Move the file pointer to the offset to the first key/data pair
355        writer.seek(SeekFrom::Start((begin + current) as u64))?;
356        Ok(Self {
357            writer,
358            flag,
359            begin,
360            current,
361            tables: std::array::from_fn(|_| Table::default()),
362            bwd: Vec::new(),
363            bwd_num: 0,
364            bwd_size: 0,
365        })
366    }
367
368    /// Put a string/identifier association to the database
369    pub fn put<K: AsRef<[u8]>>(&mut self, key: K, id: u32) -> io::Result<()> {
370        let key = key.as_ref();
371        let key_size = key.len() as u32 + 1; // includes NUL byte
372        let hash = crate::hash::jhash(key, key_size, 0);
373        let table = &mut self.tables[hash as usize % 256];
374        // Batch record write: [id(4) | key_size(4) | key | NUL]
375        let record_len = 8 + key.len() + 1;
376        if record_len <= 264 {
377            let mut buf = [0u8; 264]; // 8 header + max 255 key + NUL
378            buf[0..4].copy_from_slice(&pack_u32(id));
379            buf[4..8].copy_from_slice(&pack_u32(key_size));
380            buf[8..8 + key.len()].copy_from_slice(key);
381            // buf[8 + key.len()] is already 0 (NUL)
382            self.writer.write_all(&buf[..record_len])?;
383        } else {
384            // Fallback for very large keys
385            self.writer.write_all(&pack_u32(id))?;
386            self.writer.write_all(&pack_u32(key_size))?;
387            self.writer.write_all(key)?;
388            self.writer.write_all(b"\0")?;
389        }
390        // Expand the bucket if necessary
391        if table.size <= table.num as usize {
392            table.size = (table.size + 1) * 2;
393            table.bucket.resize(table.size, Bucket::default());
394        }
395        // Set the hash value and current offset position
396        table.bucket[table.num as usize].hash = hash;
397        table.bucket[table.num as usize].offset = self.current;
398        table.num += 1;
399        // Store the backlink if specified
400        if !self.flag.contains(Flag::ONEWAY) {
401            // Expand the backlink arrray if necessary
402            if self.bwd_size <= id {
403                let mut size = self.bwd_size;
404                while size <= id {
405                    size = (size + 1) * 2;
406                }
407                self.bwd.resize(size as usize, 0);
408                self.bwd_size = size;
409            }
410            if self.bwd_num <= id {
411                self.bwd_num = id + 1;
412            }
413            self.bwd[id as usize] = self.current;
414        }
415        // Increment the current position
416        self.current += 4 + 4 + key_size;
417        Ok(())
418    }
419
420    /// Close the writer, flush the file stream
421    fn close(&mut self) -> io::Result<()> {
422        let mut header = Header {
423            chunk_id: *CHUNK_ID,
424            flag: self.flag.bits(),
425            byteorder: BYTEORDER_CHECK,
426            bwd_offset: 0,
427            bwd_size: self.bwd_num,
428            size: 0,
429        };
430        // Store the hash tables. At this moment, the file pointer refers to
431        // the offset succeeding the last key/data pair.
432        // Reuse dst Vec across tables to avoid per-table heap allocation.
433        let mut dst: Vec<Bucket> = Vec::new();
434        #[cfg(not(target_endian = "little"))]
435        let mut write_buf: Vec<u8> = Vec::new();
436        for i in 0..NUM_TABLES {
437            let table = &self.tables[i];
438            // Do not write empty hash tables
439            if table.bucket.is_empty() {
440                continue;
441            }
442            // Actual bucket will have the double size; half elements
443            // in the bucket are kept empty.
444            let n = table.num * 2;
445            let n_usize = n as usize;
446            // Reuse dst: only grows, never deallocates between tables
447            dst.clear();
448            dst.resize(n_usize, Bucket::default());
449            // Put hash elements to the bucket with the open-address method
450            for j in 0..table.num as usize {
451                let src = &table.bucket[j];
452                let mut k = (src.hash >> 8) % n;
453                // Find a vacant element
454                while dst[k as usize].offset != 0 {
455                    k = (k + 1) % n;
456                }
457                // Store the hash element
458                dst[k as usize].hash = src.hash;
459                dst[k as usize].offset = src.offset;
460            }
461            // Write the entire bucket array for this table in one call.
462            // On LE platforms, Bucket repr(C) {u32, u32} matches the on-disk format.
463            #[cfg(target_endian = "little")]
464            {
465                let bytes =
466                    unsafe { std::slice::from_raw_parts(dst.as_ptr() as *const u8, n_usize * 8) };
467                self.writer.write_all(bytes)?;
468            }
469            #[cfg(not(target_endian = "little"))]
470            {
471                write_buf.clear();
472                write_buf.reserve(n_usize * 8);
473                for bucket in &dst[..n_usize] {
474                    write_buf.extend_from_slice(&pack_u32(bucket.hash));
475                    write_buf.extend_from_slice(&pack_u32(bucket.offset));
476                }
477                self.writer.write_all(&write_buf)?;
478            }
479        }
480        // Write the backlink array if specified
481        if !self.flag.contains(Flag::ONEWAY) && self.bwd_size > 0 {
482            // Store the offset to the head of this array
483            let current_offset = self.writer.stream_position()? as u32;
484            header.bwd_offset = current_offset - self.begin;
485            // Write all backward links in one call.
486            #[cfg(target_endian = "little")]
487            {
488                let bytes = unsafe {
489                    std::slice::from_raw_parts(
490                        self.bwd.as_ptr() as *const u8,
491                        self.bwd_num as usize * 4,
492                    )
493                };
494                self.writer.write_all(bytes)?;
495            }
496            #[cfg(not(target_endian = "little"))]
497            {
498                write_buf.clear();
499                write_buf.reserve(self.bwd_num as usize * 4);
500                for i in 0..self.bwd_num as usize {
501                    write_buf.extend_from_slice(&pack_u32(self.bwd[i]));
502                }
503                self.writer.write_all(&write_buf)?;
504            }
505        }
506        // Store the current position
507        let offset = self.writer.stream_position()? as u32;
508        header.size = offset - self.begin;
509        // Rewind the current position to the beginning
510        self.writer.seek(SeekFrom::Start(self.begin as u64))?;
511        // Write header + table references in a single batch (2072 bytes on stack)
512        let mut hdr_buf = [0u8; 24 + NUM_TABLES * 8];
513        hdr_buf[0..4].copy_from_slice(&header.chunk_id);
514        hdr_buf[4..8].copy_from_slice(&pack_u32(header.size));
515        hdr_buf[8..12].copy_from_slice(&pack_u32(header.flag));
516        hdr_buf[12..16].copy_from_slice(&pack_u32(header.byteorder));
517        hdr_buf[16..20].copy_from_slice(&pack_u32(header.bwd_size));
518        hdr_buf[20..24].copy_from_slice(&pack_u32(header.bwd_offset));
519        // Write references to hash tables. At this moment, self.current points
520        // to the offset succeeding the last key/data pair.
521        for i in 0..NUM_TABLES {
522            let table_num = self.tables[i].num;
523            // Offset to the hash table (or zero for non-existent tables)
524            let table_offset = if table_num > 0 { self.current } else { 0 };
525            let off = 24 + i * 8;
526            hdr_buf[off..off + 4].copy_from_slice(&pack_u32(table_offset));
527            // Bucket size is double the number of elements
528            hdr_buf[off + 4..off + 8].copy_from_slice(&pack_u32(table_num * 2));
529            // Advance the offset counter
530            self.current += table_num * 2 * std::mem::size_of::<Bucket>() as u32;
531        }
532        self.writer.write_all(&hdr_buf)?;
533        // Seek to the last position
534        self.writer.seek(SeekFrom::Start(offset as u64))?;
535        Ok(())
536    }
537}
538
539impl<T: Write + Seek> Drop for CQDBWriter<T> {
540    fn drop(&mut self) {
541        if let Ok(()) = self.close() {}
542    }
543}