use serde::{Deserialize, Serialize};
use std::collections::HashMap;
use std::io::{BufWriter, Read, Write};
use std::path::Path;
33
34pub const MAGIC: [u8; 4] = *b"BRAG";
36
37pub const VERSION: u32 = 2;
39
40#[derive(Debug, Clone, Copy)]
42#[repr(C)]
43pub struct IndexHeader {
44 pub magic: [u8; 4],
46 pub version: u32,
48 pub doc_count: u64,
50 pub term_count: u64,
52 pub checksum: [u8; 32],
54 pub reserved: [u8; 8],
56}
57
58impl IndexHeader {
59 pub fn new(doc_count: u64, term_count: u64, checksum: [u8; 32]) -> Self {
61 Self { magic: MAGIC, version: VERSION, doc_count, term_count, checksum, reserved: [0; 8] }
62 }
63
64 pub fn validate(&self) -> Result<(), BinaryIndexError> {
66 if self.magic != MAGIC {
67 return Err(BinaryIndexError::InvalidMagic);
68 }
69 if self.version != VERSION {
70 return Err(BinaryIndexError::VersionMismatch {
71 expected: VERSION,
72 found: self.version,
73 });
74 }
75 Ok(())
76 }
77
78 #[allow(clippy::wrong_self_convention)]
80 pub fn to_bytes(&self) -> [u8; 64] {
81 let mut bytes = [0u8; 64];
82 let (a, rest) = bytes.split_at_mut(4);
83 a.copy_from_slice(&self.magic);
84 let (b, rest) = rest.split_at_mut(4);
85 b.copy_from_slice(&self.version.to_le_bytes());
86 let (c, rest) = rest.split_at_mut(8);
87 c.copy_from_slice(&self.doc_count.to_le_bytes());
88 let (d, rest) = rest.split_at_mut(8);
89 d.copy_from_slice(&self.term_count.to_le_bytes());
90 let (e, f) = rest.split_at_mut(32);
91 e.copy_from_slice(&self.checksum);
92 f.copy_from_slice(&self.reserved);
93 bytes
94 }
95
96 pub fn from_bytes(bytes: &[u8; 64]) -> Self {
98 let (magic_s, rest) = bytes.split_at(4);
99 let mut magic = [0u8; 4];
100 magic.copy_from_slice(magic_s);
101
102 let version = u32::from_le_bytes([rest[0], rest[1], rest[2], rest[3]]);
103 let (_, rest) = rest.split_at(4);
104 let doc_count = u64::from_le_bytes([
105 rest[0], rest[1], rest[2], rest[3], rest[4], rest[5], rest[6], rest[7],
106 ]);
107 let (_, rest) = rest.split_at(8);
108 let term_count = u64::from_le_bytes([
109 rest[0], rest[1], rest[2], rest[3], rest[4], rest[5], rest[6], rest[7],
110 ]);
111 let (_, rest) = rest.split_at(8);
112
113 let (checksum_s, reserved_s) = rest.split_at(32);
114 let mut checksum = [0u8; 32];
115 checksum.copy_from_slice(checksum_s);
116
117 let mut reserved = [0u8; 8];
118 reserved.copy_from_slice(reserved_s);
119
120 Self { magic, version, doc_count, term_count, checksum, reserved }
121 }
122}
123
124#[derive(Debug, Clone, Serialize, Deserialize)]
126pub struct DocumentEntry {
127 pub path: String,
129 pub fingerprint: [u8; 32],
131 pub length: u32,
133}
134
135#[derive(Debug, Clone, Copy)]
137pub struct Posting {
138 pub doc_id: u32,
140 pub tf: u16,
142}
143
144pub struct BinaryIndexWriter {
146 documents: Vec<DocumentEntry>,
148 terms: HashMap<String, Vec<Posting>>,
150}
151
152impl BinaryIndexWriter {
153 pub fn new() -> Self {
155 Self { documents: Vec::new(), terms: HashMap::new() }
156 }
157
158 pub fn add_document(&mut self, path: String, fingerprint: [u8; 32], length: u32) -> u32 {
160 let doc_id = self.documents.len() as u32;
161 self.documents.push(DocumentEntry { path, fingerprint, length });
162 doc_id
163 }
164
165 pub fn add_posting(&mut self, term: &str, doc_id: u32, tf: u16) {
167 self.terms.entry(term.to_string()).or_default().push(Posting { doc_id, tf });
168 }
169
170 pub fn write_to_file(&self, path: &Path) -> Result<(), BinaryIndexError> {
172 let mut file = std::fs::File::create(path)?;
173
174 let checksum = self.compute_checksum();
176
177 let header =
179 IndexHeader::new(self.documents.len() as u64, self.terms.len() as u64, checksum);
180 file.write_all(&header.to_bytes())?;
181
182 let docs_json = serde_json::to_vec(&self.documents)?;
184 file.write_all(&(docs_json.len() as u64).to_le_bytes())?;
185 file.write_all(&docs_json)?;
186
187 let mut sorted_terms: Vec<_> = self.terms.iter().collect();
189 sorted_terms.sort_by_key(|(k, _)| *k);
190
191 file.write_all(&(sorted_terms.len() as u64).to_le_bytes())?;
192 for (term, postings) in sorted_terms {
193 let term_bytes = term.as_bytes();
194 file.write_all(&(term_bytes.len() as u16).to_le_bytes())?;
195 file.write_all(term_bytes)?;
196 file.write_all(&(postings.len() as u32).to_le_bytes())?;
197 for posting in postings {
198 file.write_all(&posting.doc_id.to_le_bytes())?;
199 file.write_all(&posting.tf.to_le_bytes())?;
200 }
201 }
202
203 Ok(())
204 }
205
206 fn compute_checksum(&self) -> [u8; 32] {
208 use super::fingerprint::blake3_hash;
209
210 let mut data = Vec::new();
211 for doc in &self.documents {
212 data.extend_from_slice(doc.path.as_bytes());
213 data.extend_from_slice(&doc.fingerprint);
214 }
215 for (term, postings) in &self.terms {
216 data.extend_from_slice(term.as_bytes());
217 for posting in postings {
218 data.extend_from_slice(&posting.doc_id.to_le_bytes());
219 data.extend_from_slice(&posting.tf.to_le_bytes());
220 }
221 }
222 blake3_hash(&data)
223 }
224}
225
226impl Default for BinaryIndexWriter {
227 fn default() -> Self {
228 Self::new()
229 }
230}
231
232pub struct BinaryIndexReader {
234 header: IndexHeader,
236 documents: Vec<DocumentEntry>,
238 terms: Vec<(String, Vec<Posting>)>,
240}
241
242impl BinaryIndexReader {
243 pub fn load(path: &Path) -> Result<Self, BinaryIndexError> {
245 let mut file = std::fs::File::open(path)?;
246
247 let mut header_bytes = [0u8; 64];
249 file.read_exact(&mut header_bytes)?;
250 let header = IndexHeader::from_bytes(&header_bytes);
251 header.validate()?;
252
253 let mut doc_len_bytes = [0u8; 8];
255 file.read_exact(&mut doc_len_bytes)?;
256 let doc_len = u64::from_le_bytes(doc_len_bytes) as usize;
257
258 let mut docs_json = vec![0u8; doc_len];
259 file.read_exact(&mut docs_json)?;
260 let documents: Vec<DocumentEntry> = serde_json::from_slice(&docs_json)?;
261
262 let mut term_count_bytes = [0u8; 8];
264 file.read_exact(&mut term_count_bytes)?;
265 let term_count = u64::from_le_bytes(term_count_bytes) as usize;
266
267 let mut terms = Vec::with_capacity(term_count);
268 for _ in 0..term_count {
269 let mut term_len_bytes = [0u8; 2];
270 file.read_exact(&mut term_len_bytes)?;
271 let term_len = u16::from_le_bytes(term_len_bytes) as usize;
272
273 let mut term_bytes = vec![0u8; term_len];
274 file.read_exact(&mut term_bytes)?;
275 let term = String::from_utf8(term_bytes).map_err(|_| BinaryIndexError::InvalidUtf8)?;
276
277 let mut posting_count_bytes = [0u8; 4];
278 file.read_exact(&mut posting_count_bytes)?;
279 let posting_count = u32::from_le_bytes(posting_count_bytes) as usize;
280
281 let mut postings = Vec::with_capacity(posting_count);
282 for _ in 0..posting_count {
283 let mut doc_id_bytes = [0u8; 4];
284 let mut tf_bytes = [0u8; 2];
285 file.read_exact(&mut doc_id_bytes)?;
286 file.read_exact(&mut tf_bytes)?;
287 postings.push(Posting {
288 doc_id: u32::from_le_bytes(doc_id_bytes),
289 tf: u16::from_le_bytes(tf_bytes),
290 });
291 }
292
293 terms.push((term, postings));
294 }
295
296 Ok(Self { header, documents, terms })
297 }
298
299 pub fn get_document(&self, doc_id: u32) -> Option<&DocumentEntry> {
301 self.documents.get(doc_id as usize)
302 }
303
304 pub fn get_postings(&self, term: &str) -> Option<&[Posting]> {
306 match self.terms.binary_search_by_key(&term, |(t, _)| t.as_str()) {
307 Ok(idx) => Some(&self.terms[idx].1),
308 Err(_) => None,
309 }
310 }
311
312 pub fn doc_count(&self) -> usize {
314 self.documents.len()
315 }
316
317 pub fn term_count(&self) -> usize {
319 self.terms.len()
320 }
321}
322
323#[derive(Debug, thiserror::Error)]
325pub enum BinaryIndexError {
326 #[error("Invalid magic number")]
327 InvalidMagic,
328
329 #[error("Version mismatch: expected {expected}, found {found}")]
330 VersionMismatch { expected: u32, found: u32 },
331
332 #[error("IO error: {0}")]
333 Io(#[from] std::io::Error),
334
335 #[error("JSON error: {0}")]
336 Json(#[from] serde_json::Error),
337
338 #[error("Invalid UTF-8")]
339 InvalidUtf8,
340}
341
342#[cfg(test)]
343mod tests {
344 use super::*;
345 use tempfile::TempDir;
346
347 #[test]
348 fn test_header_roundtrip() {
349 let header = IndexHeader::new(100, 5000, [42u8; 32]);
350 let bytes = header.to_bytes();
351 let parsed = IndexHeader::from_bytes(&bytes);
352
353 assert_eq!(parsed.magic, MAGIC);
354 assert_eq!(parsed.version, VERSION);
355 assert_eq!(parsed.doc_count, 100);
356 assert_eq!(parsed.term_count, 5000);
357 }
358
359 #[test]
360 fn test_write_and_read() {
361 let temp = TempDir::new().expect("tempdir creation failed");
362 let index_path = temp.path().join("test.brag");
363
364 let mut writer = BinaryIndexWriter::new();
366 let doc_id = writer.add_document("test.txt".to_string(), [1u8; 32], 100);
367 writer.add_posting("hello", doc_id, 5);
368 writer.add_posting("world", doc_id, 3);
369 writer.write_to_file(&index_path).expect("unexpected failure");
370
371 let reader = BinaryIndexReader::load(&index_path).expect("unexpected failure");
373 assert_eq!(reader.doc_count(), 1);
374 assert_eq!(reader.term_count(), 2);
375
376 let postings = reader.get_postings("hello").expect("unexpected failure");
377 assert_eq!(postings.len(), 1);
378 assert_eq!(postings[0].doc_id, 0);
379 assert_eq!(postings[0].tf, 5);
380 }
381
382 #[test]
383 fn test_header_validation() {
384 let mut header = IndexHeader::new(100, 5000, [42u8; 32]);
385 assert!(header.validate().is_ok());
386
387 header.magic = *b"XXXX";
389 assert!(matches!(header.validate(), Err(BinaryIndexError::InvalidMagic)));
390 }
391
392 #[test]
393 fn test_header_version_mismatch() {
394 let mut header = IndexHeader::new(100, 5000, [42u8; 32]);
395 header.version = 999;
396 assert!(matches!(header.validate(), Err(BinaryIndexError::VersionMismatch { .. })));
397 }
398
399 #[test]
400 fn test_document_lookup() {
401 let temp = TempDir::new().expect("tempdir creation failed");
402 let index_path = temp.path().join("test.brag");
403
404 let mut writer = BinaryIndexWriter::new();
405 writer.add_document("doc1.txt".to_string(), [1u8; 32], 100);
406 writer.add_document("doc2.txt".to_string(), [2u8; 32], 200);
407 writer.write_to_file(&index_path).expect("unexpected failure");
408
409 let reader = BinaryIndexReader::load(&index_path).expect("unexpected failure");
410
411 let doc = reader.get_document(0).expect("unexpected failure");
412 assert_eq!(doc.path, "doc1.txt");
413 assert_eq!(doc.length, 100);
414
415 let doc = reader.get_document(1).expect("unexpected failure");
416 assert_eq!(doc.path, "doc2.txt");
417 assert_eq!(doc.length, 200);
418
419 assert!(reader.get_document(999).is_none());
420 }
421
422 #[test]
423 fn test_missing_term_returns_none() {
424 let temp = TempDir::new().expect("tempdir creation failed");
425 let index_path = temp.path().join("test.brag");
426
427 let mut writer = BinaryIndexWriter::new();
428 let doc_id = writer.add_document("test.txt".to_string(), [1u8; 32], 100);
429 writer.add_posting("exists", doc_id, 1);
430 writer.write_to_file(&index_path).expect("unexpected failure");
431
432 let reader = BinaryIndexReader::load(&index_path).expect("unexpected failure");
433 assert!(reader.get_postings("exists").is_some());
434 assert!(reader.get_postings("nonexistent").is_none());
435 }
436
437 #[test]
438 fn test_multiple_documents_same_term() {
439 let temp = TempDir::new().expect("tempdir creation failed");
440 let index_path = temp.path().join("test.brag");
441
442 let mut writer = BinaryIndexWriter::new();
443 let doc1 = writer.add_document("doc1.txt".to_string(), [1u8; 32], 100);
444 let doc2 = writer.add_document("doc2.txt".to_string(), [2u8; 32], 200);
445 writer.add_posting("common", doc1, 3);
446 writer.add_posting("common", doc2, 7);
447 writer.write_to_file(&index_path).expect("unexpected failure");
448
449 let reader = BinaryIndexReader::load(&index_path).expect("unexpected failure");
450 let postings = reader.get_postings("common").expect("unexpected failure");
451
452 assert_eq!(postings.len(), 2);
453 assert_eq!(postings[0].tf, 3);
454 assert_eq!(postings[1].tf, 7);
455 }
456
457 #[test]
458 fn test_binary_index_writer_default() {
459 let writer = BinaryIndexWriter::default();
460 let temp = TempDir::new().expect("tempdir creation failed");
462 let index_path = temp.path().join("empty.brag");
463 writer.write_to_file(&index_path).expect("unexpected failure");
464
465 let reader = BinaryIndexReader::load(&index_path).expect("unexpected failure");
466 assert_eq!(reader.doc_count(), 0);
467 assert_eq!(reader.term_count(), 0);
468 }
469
470 #[test]
471 fn test_document_entry_fields() {
472 let entry = DocumentEntry {
473 path: "/test/path.rs".to_string(),
474 fingerprint: [42u8; 32],
475 length: 1234,
476 };
477 assert_eq!(entry.path, "/test/path.rs");
478 assert_eq!(entry.fingerprint, [42u8; 32]);
479 assert_eq!(entry.length, 1234);
480 }
481
482 #[test]
483 fn test_posting_fields() {
484 let posting = Posting { doc_id: 5, tf: 10 };
485 assert_eq!(posting.doc_id, 5);
486 assert_eq!(posting.tf, 10);
487 }
488
489 #[test]
490 fn test_magic_and_version_constants() {
491 assert_eq!(MAGIC, *b"BRAG");
492 assert_eq!(VERSION, 2);
493 }
494
495 #[test]
496 fn test_header_reserved_is_zeroed() {
497 let header = IndexHeader::new(10, 20, [1u8; 32]);
498 assert_eq!(header.reserved, [0u8; 8]);
499 }
500
501 #[test]
502 fn test_error_display() {
503 let err = BinaryIndexError::InvalidMagic;
504 assert!(format!("{}", err).contains("Invalid magic"));
505
506 let err = BinaryIndexError::VersionMismatch { expected: 2, found: 1 };
507 assert!(format!("{}", err).contains('2'));
508 assert!(format!("{}", err).contains('1'));
509
510 let err = BinaryIndexError::InvalidUtf8;
511 assert!(format!("{}", err).contains("UTF-8"));
512 }
513
514 #[test]
515 fn test_load_nonexistent_file() {
516 let result = BinaryIndexReader::load(Path::new("/nonexistent/path.brag"));
517 assert!(result.is_err());
518 }
519
520 #[test]
521 fn test_add_document_returns_sequential_ids() {
522 let mut writer = BinaryIndexWriter::new();
523 let id0 = writer.add_document("doc0.txt".to_string(), [0u8; 32], 100);
524 let id1 = writer.add_document("doc1.txt".to_string(), [1u8; 32], 200);
525 let id2 = writer.add_document("doc2.txt".to_string(), [2u8; 32], 300);
526
527 assert_eq!(id0, 0);
528 assert_eq!(id1, 1);
529 assert_eq!(id2, 2);
530 }
531
532 #[test]
533 fn test_multiple_postings_same_term() {
534 let temp = TempDir::new().expect("tempdir creation failed");
535 let index_path = temp.path().join("multi.brag");
536
537 let mut writer = BinaryIndexWriter::new();
538 let doc = writer.add_document("doc.txt".to_string(), [1u8; 32], 100);
539 writer.add_posting("term", doc, 1);
540 writer.add_posting("term", doc, 2);
541 writer.add_posting("term", doc, 3);
542 writer.write_to_file(&index_path).expect("unexpected failure");
543
544 let reader = BinaryIndexReader::load(&index_path).expect("unexpected failure");
545 let postings = reader.get_postings("term").expect("unexpected failure");
546 assert_eq!(postings.len(), 3);
547 }
548
549 #[test]
550 fn test_terms_sorted_alphabetically() {
551 let temp = TempDir::new().expect("tempdir creation failed");
552 let index_path = temp.path().join("sorted.brag");
553
554 let mut writer = BinaryIndexWriter::new();
555 let doc = writer.add_document("doc.txt".to_string(), [1u8; 32], 100);
556 writer.add_posting("zebra", doc, 1);
557 writer.add_posting("alpha", doc, 1);
558 writer.add_posting("middle", doc, 1);
559 writer.write_to_file(&index_path).expect("unexpected failure");
560
561 let reader = BinaryIndexReader::load(&index_path).expect("unexpected failure");
562 assert!(reader.get_postings("alpha").is_some());
564 assert!(reader.get_postings("middle").is_some());
565 assert!(reader.get_postings("zebra").is_some());
566 }
567}