cdb64/
writer.rs

1use std::{
2    fs::{File, OpenOptions},
3    hash::Hasher,
4    io::{Seek, SeekFrom, Write},
5    marker::PhantomData,
6    path::Path,
7};
8
9use crate::{
10    Error,
11    cdb::{Cdb, HEADER_SIZE, TableEntry},
12    hash::CdbHash,
13    util::write_tuple,
14};
15
/// In-memory bookkeeping for one record that has been written to the output.
///
/// Entries are collected while records are added and consumed when the hash
/// tables are laid out.
#[derive(Debug)]
struct Entry {
    /// Full 64-bit hash computed from the record's key.
    hash_val: u64,
    /// Byte position in the output at which the record starts.
    offset: u64,
}
21
22pub struct CdbWriter<W: Write + Seek, H: Hasher + Default = CdbHash> {
23    writer: W,
24    entries_by_table: [Vec<Entry>; 256],
25    is_finalized: bool,
26    current_data_offset: u64,
27    _hasher: PhantomData<H>,
28}
29
30impl<H: Hasher + Default> CdbWriter<File, H> {
31    pub fn create(path: impl AsRef<Path>) -> Result<Self, Error> {
32        let file = OpenOptions::new()
33            .write(true)
34            .create(true)
35            .truncate(true)
36            .open(path)?;
37
38        Self::new(file)
39    }
40}
41
42impl<W: Write + Seek, H: Hasher + Default> CdbWriter<W, H> {
43    pub fn new(mut writer: W) -> Result<Self, Error> {
44        writer.seek(SeekFrom::Start(0))?;
45        let header_placeholder = vec![0u8; HEADER_SIZE as usize];
46        writer.write_all(&header_placeholder)?;
47
48        Ok(CdbWriter {
49            writer,
50            entries_by_table: [const { Vec::new() }; 256],
51            is_finalized: false,
52            current_data_offset: HEADER_SIZE,
53            _hasher: PhantomData,
54        })
55    }
56
57    /// Inserts a key-value pair into the CDB database.
58    ///
59    /// # Arguments
60    ///
61    /// * `key` - A byte slice representing the key
62    /// * `value` - A byte slice representing the value
63    ///
64    /// # Behavior
65    ///
66    /// - **Duplicate Keys**: This method allows inserting duplicate keys. If the same key is inserted
67    ///   multiple times, all values will be stored. However, `Cdb::get()` will only return the first
68    ///   matching value it encounters during the hash table probe. Use iteration to retrieve all values
69    ///   for duplicate keys.
70    /// - **Empty Keys/Values**: Both empty keys (`b""`) and empty values are valid and supported.
71    ///
72    /// # Errors
73    ///
74    /// Returns `Error::WriterFinalized` if called after `finalize()`.
75    /// Returns `Error::Io` if an I/O error occurs during writing.
76    ///
77    /// # Examples
78    ///
79    /// ```
80    /// use cdb64::{CdbWriter, CdbHash};
81    /// use std::io::Cursor;
82    ///
83    /// let mut writer = CdbWriter::<_, CdbHash>::new(Cursor::new(Vec::new())).unwrap();
84    /// writer.put(b"key1", b"value1").unwrap();
85    /// writer.put(b"key1", b"value2").unwrap(); // Duplicate key is allowed
86    /// writer.put(b"", b"empty_key_value").unwrap(); // Empty key is allowed
87    /// writer.finalize().unwrap();
88    /// ```
89    pub fn put(&mut self, key: &[u8], value: &[u8]) -> Result<(), Error> {
90        if self.is_finalized {
91            return Err(Error::WriterFinalized);
92        }
93
94        self.writer
95            .seek(SeekFrom::Start(self.current_data_offset))?;
96        // Write key and value lengths as u64
97        write_tuple(&mut self.writer, key.len() as u64, value.len() as u64)?;
98        self.writer.write_all(key)?;
99        self.writer.write_all(value)?;
100
101        let mut hasher = H::default();
102        hasher.write(key);
103        let hash_val = hasher.finish();
104        let table_idx = (hash_val & 0xff) as usize;
105
106        self.entries_by_table[table_idx].push(Entry {
107            hash_val,
108            offset: self.current_data_offset,
109        });
110
111        // Adjust offset calculation: 16 bytes for (u64, u64) lengths
112        self.current_data_offset += 16 + key.len() as u64 + value.len() as u64;
113        Ok(())
114    }
115
116    fn write_footer_and_header(&mut self) -> Result<(), Error> {
117        if self.is_finalized {
118            return Ok(());
119        }
120
121        self.writer.flush()?;
122
123        let mut final_header_entries = [TableEntry::default(); 256];
124        let mut current_pos_for_hash_tables = self.current_data_offset;
125
126        for (i, entries_in_this_table) in self.entries_by_table.iter().enumerate() {
127            if entries_in_this_table.is_empty() {
128                final_header_entries[i] = TableEntry {
129                    offset: 0,
130                    length: 0,
131                };
132                continue;
133            }
134
135            let num_slots = entries_in_this_table.len() * 2;
136            let mut slots_data = vec![(0u64, 0u64); num_slots];
137
138            final_header_entries[i] = TableEntry {
139                offset: current_pos_for_hash_tables,
140                length: num_slots as u64, // num_slots is the count of (u64, u64) pairs
141            };
142
143            for entry in entries_in_this_table {
144                let mut slot_idx = (entry.hash_val >> 8) % (num_slots as u64);
145                loop {
146                    if slots_data[slot_idx as usize].1 == 0 {
147                        // .1 is offset, 0 means empty slot
148                        slots_data[slot_idx as usize] = (entry.hash_val, entry.offset);
149                        break;
150                    }
151                    slot_idx = (slot_idx + 1) % (num_slots as u64);
152                }
153            }
154
155            self.writer
156                .seek(SeekFrom::Start(current_pos_for_hash_tables))?;
157            for (hash_val, data_offset) in slots_data {
158                // Write two u64 values directly
159                self.writer.write_all(&hash_val.to_le_bytes())?;
160                self.writer.write_all(&data_offset.to_le_bytes())?;
161            }
162            // Each slot is (u64, u64), so 16 bytes per slot. num_slots is the count of such slots.
163            current_pos_for_hash_tables += (num_slots as u64) * 16;
164        }
165
166        self.writer.seek(SeekFrom::Start(0))?;
167        for table_entry in final_header_entries.iter() {
168            // Write two u64 values directly for the header
169            self.writer.write_all(&table_entry.offset.to_le_bytes())?;
170            self.writer.write_all(&table_entry.length.to_le_bytes())?;
171        }
172
173        self.is_finalized = true;
174
175        Ok(())
176    }
177
178    pub fn finalize(&mut self) -> Result<(), Error> {
179        self.write_footer_and_header()?;
180        self.writer.flush()?;
181        Ok(())
182    }
183
184    /// Consumes the CdbWriter and returns the underlying writer.
185    ///
186    /// This is useful for retrieving the written data, for example, when using `Cursor<Vec<u8>>`.
187    /// The writer is flushed before being returned.
188    ///
189    /// # Errors
190    ///
191    /// Returns `Error::WriterNotFinalized` if `finalize()` has not been called yet.
192    /// You must call `finalize()` before `into_inner()` to ensure the database is complete.
193    ///
194    /// # Examples
195    ///
196    /// ```
197    /// use cdb64::{CdbWriter, CdbHash};
198    /// use std::io::Cursor;
199    ///
200    /// let mut writer = CdbWriter::<_, CdbHash>::new(Cursor::new(Vec::new())).unwrap();
201    /// writer.put(b"key", b"value").unwrap();
202    /// writer.finalize().unwrap();
203    ///
204    /// let cursor = writer.into_inner().unwrap();
205    /// let data = cursor.into_inner();
206    /// assert!(data.len() > 0);
207    /// ```
208    pub fn into_inner(mut self) -> Result<W, Error> {
209        if !self.is_finalized {
210            // Ensure finalization before consuming.
211            // Alternatively, one could call self.finalize() here, but it requires &mut self.
212            // Forcing explicit finalize() call before into_inner() is cleaner.
213            return Err(Error::WriterNotFinalized);
214        }
215
216        self.writer.flush()?; // Ensure all data is written
217
218        Ok(self.writer)
219    }
220}
221
222impl<H: Hasher + Default> CdbWriter<File, H> {
223    /// Freezes the writer by finalizing it, flushing to disk, and reopening it as a `Cdb` reader.
224    ///
225    /// This method consumes the writer and provides a convenient way to transition from
226    /// writing to reading without manually managing files.
227    ///
228    /// # Arguments
229    ///
230    /// * `path_to_reopen` - The path to reopen the file as a reader. This should be the same
231    ///   path originally used to create the writer.
232    ///
233    /// # Errors
234    ///
235    /// Returns an error if finalization fails or if the file cannot be reopened.
236    ///
237    /// # Comparison with `finalize()` + `into_inner()`
238    ///
239    /// - Use `freeze()` when you want to immediately read from the same file after writing.
240    /// - Use `finalize()` + `into_inner()` when you need access to the underlying writer
241    ///   (e.g., to extract data from a `Cursor<Vec<u8>>`).
242    ///
243    /// # Examples
244    ///
245    /// ```
246    /// use cdb64::{CdbWriter, CdbHash};
247    /// use std::path::Path;
248    ///
249    /// let path = Path::new("my.cdb");
250    /// let mut writer = CdbWriter::<_, CdbHash>::create(path).unwrap();
251    /// writer.put(b"key", b"value").unwrap();
252    ///
253    /// // Freeze consumes the writer and returns a reader
254    /// let cdb = writer.freeze(path).unwrap();
255    /// assert_eq!(cdb.get(b"key").unwrap().unwrap(), b"value");
256    /// # // cleanup: remove the temporary file created by the example
257    /// # std::fs::remove_file(path).unwrap();
258    /// ```
259    // Freeze consumes the writer, finalizes it, and reopens it as a Cdb reader.
260    pub fn freeze(mut self, path_to_reopen: &Path) -> Result<Cdb<File, H>, Error> {
261        self.write_footer_and_header()?;
262        self.writer.flush()?;
263
264        Cdb::open(path_to_reopen).map_err(Error::Io)
265    }
266}