cdb64/writer.rs
1use std::{
2 fs::{File, OpenOptions},
3 hash::Hasher,
4 io::{Seek, SeekFrom, Write},
5 marker::PhantomData,
6 path::Path,
7};
8
9use crate::{
10 Error,
11 cdb::{Cdb, HEADER_SIZE, TableEntry},
12 hash::CdbHash,
13 util::write_tuple,
14};
15
/// Bookkeeping for one stored record, kept in memory until the hash tables are written.
#[derive(Debug)]
struct Entry {
    /// Full 64-bit hash of the record's key; its low 8 bits select one of the 256 tables.
    hash_val: u64,
    /// Absolute offset in the output at which the record (lengths, key, value) begins.
    offset: u64,
}
21
22pub struct CdbWriter<W: Write + Seek, H: Hasher + Default = CdbHash> {
23 writer: W,
24 entries_by_table: [Vec<Entry>; 256],
25 is_finalized: bool,
26 current_data_offset: u64,
27 _hasher: PhantomData<H>,
28}
29
30impl<H: Hasher + Default> CdbWriter<File, H> {
31 pub fn create(path: impl AsRef<Path>) -> Result<Self, Error> {
32 let file = OpenOptions::new()
33 .write(true)
34 .create(true)
35 .truncate(true)
36 .open(path)?;
37
38 Self::new(file)
39 }
40}
41
42impl<W: Write + Seek, H: Hasher + Default> CdbWriter<W, H> {
43 pub fn new(mut writer: W) -> Result<Self, Error> {
44 writer.seek(SeekFrom::Start(0))?;
45 let header_placeholder = vec![0u8; HEADER_SIZE as usize];
46 writer.write_all(&header_placeholder)?;
47
48 Ok(CdbWriter {
49 writer,
50 entries_by_table: [const { Vec::new() }; 256],
51 is_finalized: false,
52 current_data_offset: HEADER_SIZE,
53 _hasher: PhantomData,
54 })
55 }
56
57 /// Inserts a key-value pair into the CDB database.
58 ///
59 /// # Arguments
60 ///
61 /// * `key` - A byte slice representing the key
62 /// * `value` - A byte slice representing the value
63 ///
64 /// # Behavior
65 ///
66 /// - **Duplicate Keys**: This method allows inserting duplicate keys. If the same key is inserted
67 /// multiple times, all values will be stored. However, `Cdb::get()` will only return the first
68 /// matching value it encounters during the hash table probe. Use iteration to retrieve all values
69 /// for duplicate keys.
70 /// - **Empty Keys/Values**: Both empty keys (`b""`) and empty values are valid and supported.
71 ///
72 /// # Errors
73 ///
74 /// Returns `Error::WriterFinalized` if called after `finalize()`.
75 /// Returns `Error::Io` if an I/O error occurs during writing.
76 ///
77 /// # Examples
78 ///
79 /// ```
80 /// use cdb64::{CdbWriter, CdbHash};
81 /// use std::io::Cursor;
82 ///
83 /// let mut writer = CdbWriter::<_, CdbHash>::new(Cursor::new(Vec::new())).unwrap();
84 /// writer.put(b"key1", b"value1").unwrap();
85 /// writer.put(b"key1", b"value2").unwrap(); // Duplicate key is allowed
86 /// writer.put(b"", b"empty_key_value").unwrap(); // Empty key is allowed
87 /// writer.finalize().unwrap();
88 /// ```
89 pub fn put(&mut self, key: &[u8], value: &[u8]) -> Result<(), Error> {
90 if self.is_finalized {
91 return Err(Error::WriterFinalized);
92 }
93
94 self.writer
95 .seek(SeekFrom::Start(self.current_data_offset))?;
96 // Write key and value lengths as u64
97 write_tuple(&mut self.writer, key.len() as u64, value.len() as u64)?;
98 self.writer.write_all(key)?;
99 self.writer.write_all(value)?;
100
101 let mut hasher = H::default();
102 hasher.write(key);
103 let hash_val = hasher.finish();
104 let table_idx = (hash_val & 0xff) as usize;
105
106 self.entries_by_table[table_idx].push(Entry {
107 hash_val,
108 offset: self.current_data_offset,
109 });
110
111 // Adjust offset calculation: 16 bytes for (u64, u64) lengths
112 self.current_data_offset += 16 + key.len() as u64 + value.len() as u64;
113 Ok(())
114 }
115
116 fn write_footer_and_header(&mut self) -> Result<(), Error> {
117 if self.is_finalized {
118 return Ok(());
119 }
120
121 self.writer.flush()?;
122
123 let mut final_header_entries = [TableEntry::default(); 256];
124 let mut current_pos_for_hash_tables = self.current_data_offset;
125
126 for (i, entries_in_this_table) in self.entries_by_table.iter().enumerate() {
127 if entries_in_this_table.is_empty() {
128 final_header_entries[i] = TableEntry {
129 offset: 0,
130 length: 0,
131 };
132 continue;
133 }
134
135 let num_slots = entries_in_this_table.len() * 2;
136 let mut slots_data = vec![(0u64, 0u64); num_slots];
137
138 final_header_entries[i] = TableEntry {
139 offset: current_pos_for_hash_tables,
140 length: num_slots as u64, // num_slots is the count of (u64, u64) pairs
141 };
142
143 for entry in entries_in_this_table {
144 let mut slot_idx = (entry.hash_val >> 8) % (num_slots as u64);
145 loop {
146 if slots_data[slot_idx as usize].1 == 0 {
147 // .1 is offset, 0 means empty slot
148 slots_data[slot_idx as usize] = (entry.hash_val, entry.offset);
149 break;
150 }
151 slot_idx = (slot_idx + 1) % (num_slots as u64);
152 }
153 }
154
155 self.writer
156 .seek(SeekFrom::Start(current_pos_for_hash_tables))?;
157 for (hash_val, data_offset) in slots_data {
158 // Write two u64 values directly
159 self.writer.write_all(&hash_val.to_le_bytes())?;
160 self.writer.write_all(&data_offset.to_le_bytes())?;
161 }
162 // Each slot is (u64, u64), so 16 bytes per slot. num_slots is the count of such slots.
163 current_pos_for_hash_tables += (num_slots as u64) * 16;
164 }
165
166 self.writer.seek(SeekFrom::Start(0))?;
167 for table_entry in final_header_entries.iter() {
168 // Write two u64 values directly for the header
169 self.writer.write_all(&table_entry.offset.to_le_bytes())?;
170 self.writer.write_all(&table_entry.length.to_le_bytes())?;
171 }
172
173 self.is_finalized = true;
174
175 Ok(())
176 }
177
178 pub fn finalize(&mut self) -> Result<(), Error> {
179 self.write_footer_and_header()?;
180 self.writer.flush()?;
181 Ok(())
182 }
183
184 /// Consumes the CdbWriter and returns the underlying writer.
185 ///
186 /// This is useful for retrieving the written data, for example, when using `Cursor<Vec<u8>>`.
187 /// The writer is flushed before being returned.
188 ///
189 /// # Errors
190 ///
191 /// Returns `Error::WriterNotFinalized` if `finalize()` has not been called yet.
192 /// You must call `finalize()` before `into_inner()` to ensure the database is complete.
193 ///
194 /// # Examples
195 ///
196 /// ```
197 /// use cdb64::{CdbWriter, CdbHash};
198 /// use std::io::Cursor;
199 ///
200 /// let mut writer = CdbWriter::<_, CdbHash>::new(Cursor::new(Vec::new())).unwrap();
201 /// writer.put(b"key", b"value").unwrap();
202 /// writer.finalize().unwrap();
203 ///
204 /// let cursor = writer.into_inner().unwrap();
205 /// let data = cursor.into_inner();
206 /// assert!(data.len() > 0);
207 /// ```
208 pub fn into_inner(mut self) -> Result<W, Error> {
209 if !self.is_finalized {
210 // Ensure finalization before consuming.
211 // Alternatively, one could call self.finalize() here, but it requires &mut self.
212 // Forcing explicit finalize() call before into_inner() is cleaner.
213 return Err(Error::WriterNotFinalized);
214 }
215
216 self.writer.flush()?; // Ensure all data is written
217
218 Ok(self.writer)
219 }
220}
221
222impl<H: Hasher + Default> CdbWriter<File, H> {
223 /// Freezes the writer by finalizing it, flushing to disk, and reopening it as a `Cdb` reader.
224 ///
225 /// This method consumes the writer and provides a convenient way to transition from
226 /// writing to reading without manually managing files.
227 ///
228 /// # Arguments
229 ///
230 /// * `path_to_reopen` - The path to reopen the file as a reader. This should be the same
231 /// path originally used to create the writer.
232 ///
233 /// # Errors
234 ///
235 /// Returns an error if finalization fails or if the file cannot be reopened.
236 ///
237 /// # Comparison with `finalize()` + `into_inner()`
238 ///
239 /// - Use `freeze()` when you want to immediately read from the same file after writing.
240 /// - Use `finalize()` + `into_inner()` when you need access to the underlying writer
241 /// (e.g., to extract data from a `Cursor<Vec<u8>>`).
242 ///
243 /// # Examples
244 ///
245 /// ```
246 /// use cdb64::{CdbWriter, CdbHash};
247 /// use std::path::Path;
248 ///
249 /// let path = Path::new("my.cdb");
250 /// let mut writer = CdbWriter::<_, CdbHash>::create(path).unwrap();
251 /// writer.put(b"key", b"value").unwrap();
252 ///
253 /// // Freeze consumes the writer and returns a reader
254 /// let cdb = writer.freeze(path).unwrap();
255 /// assert_eq!(cdb.get(b"key").unwrap().unwrap(), b"value");
256 /// # // cleanup: remove the temporary file created by the example
257 /// # std::fs::remove_file(path).unwrap();
258 /// ```
259 // Freeze consumes the writer, finalizes it, and reopens it as a Cdb reader.
260 pub fn freeze(mut self, path_to_reopen: &Path) -> Result<Cdb<File, H>, Error> {
261 self.write_footer_and_header()?;
262 self.writer.flush()?;
263
264 Cdb::open(path_to_reopen).map_err(Error::Io)
265 }
266}