use crate::constants::magic;
use crate::error::ShardexError;
use crate::identifiers::DocumentId;
use crate::memory::FileHeader;
use bytemuck::{Pod, Zeroable};
/// Magic bytes for the text index file; an alias of `magic::TEXT_INDEX`
/// (the unit tests in this file expect the value `b"TIDX"`).
pub const TEXT_INDEX_MAGIC: &[u8; 4] = magic::TEXT_INDEX;
/// Magic bytes for the text data file; an alias of `magic::TEXT_DATA`
/// (the unit tests in this file expect the value `b"TDAT"`).
pub const TEXT_DATA_MAGIC: &[u8; 4] = magic::TEXT_DATA;
/// Version stamped into new `TextIndexHeader`s. `TextIndexHeader::validate` passes this
/// value as both arguments of `FileHeader::validate_version`.
pub const TEXT_INDEX_VERSION: u32 = 1;
/// Version stamped into new `TextDataHeader`s. `TextDataHeader::validate` passes this
/// value as both arguments of `FileHeader::validate_version`.
pub const TEXT_DATA_VERSION: u32 = 1;
/// Header record of the text index file.
///
/// Entry offsets are computed relative to this header: the first entry sits at
/// `TextIndexHeader::SIZE` and each further entry is `DocumentTextEntry::SIZE` bytes later
/// (see `offset_for_entry`).
///
/// `#[repr(C)]` pins the field order because the struct is reinterpreted as raw bytes
/// through `bytemuck` (`Pod`/`Zeroable` are implemented for it in this file).
///
/// NOTE(review): `entry_count` is a `u32` immediately followed by a `u64`. Depending on the
/// size and alignment of `FileHeader`, `repr(C)` may insert implicit padding after
/// `entry_count` and/or after `_padding`. `bytemuck::Pod` requires a layout without padding
/// bytes — confirm the actual layout before relying on byte-level reads of this struct.
#[derive(Debug, Clone, Copy, PartialEq)]
#[repr(C)]
pub struct TextIndexHeader {
/// Common file header; carries the magic bytes and version checked by `validate`.
pub file_header: FileHeader,
/// Number of entries; starts at 0 and is incremented by `add_entry`.
pub entry_count: u32,
/// Offset for the next entry; starts at `SIZE` and grows by `DocumentTextEntry::SIZE`
/// on every `add_entry`. `validate` rejects values smaller than `SIZE`.
pub next_entry_offset: u64,
/// Reserved bytes; `validate` requires all of them to be zero.
pub _padding: [u8; 12],
}
/// Fixed-size index record that associates a document with a span of text, described by a
/// start offset and a length.
///
/// `#[repr(C)]` pins the field order because the struct is reinterpreted as raw bytes
/// through `bytemuck`. The unit tests in this file expect the struct to be 32 bytes.
#[derive(Debug, Clone, Copy, PartialEq)]
#[repr(C)]
pub struct DocumentTextEntry {
/// Identifier of the document this entry belongs to.
pub document_id: DocumentId,
/// Start offset of the text span (presumably a byte offset into the text data file —
/// confirm against the code that writes entries).
pub text_offset: u64,
/// Length of the text span (presumably in bytes — confirm). `validate` requires it to be
/// non-zero and at most 100 * 1024 * 1024.
pub text_length: u64,
}
/// Header record of the text data file.
///
/// `#[repr(C)]` pins the field order because the struct is reinterpreted as raw bytes
/// through `bytemuck` (`Pod`/`Zeroable` are implemented for it in this file).
#[derive(Debug, Clone, Copy, PartialEq)]
#[repr(C)]
pub struct TextDataHeader {
/// Common file header; carries the magic bytes and version checked by `validate`.
pub file_header: FileHeader,
/// Sum of the `text_length` values passed to `add_text`; starts at 0.
pub total_text_size: u64,
/// Offset for the next text block; starts at `SIZE` and grows by `text_length + 8`
/// on every `add_text`. `validate` rejects values smaller than `SIZE`.
pub next_text_offset: u64,
/// Reserved bytes; `validate` requires all of them to be zero.
pub _padding: [u8; 8],
}
// SAFETY (to be confirmed by review): `bytemuck::Pod` demands a `Copy`, `#[repr(C)]` type for
// which every bit pattern is valid and whose layout contains no padding bytes; `Zeroable`
// demands that the all-zero bit pattern is valid. The three structs below are `Copy` and
// `#[repr(C)]` and consist of integers, byte arrays, `FileHeader` and `DocumentId`.
// Whether `FileHeader`/`DocumentId` are themselves `Pod` and whether the resulting layouts are
// free of implicit padding cannot be verified from this file alone.

// NOTE(review): a `u32` field is followed by a `u64` field in `TextIndexHeader`; check for
// implicit padding.
unsafe impl Pod for TextIndexHeader {}
unsafe impl Zeroable for TextIndexHeader {}
// The unit tests expect `DocumentTextEntry` to be exactly 32 bytes.
unsafe impl Pod for DocumentTextEntry {}
unsafe impl Zeroable for DocumentTextEntry {}
unsafe impl Pod for TextDataHeader {}
unsafe impl Zeroable for TextDataHeader {}
impl TextIndexHeader {
    /// Size of the header in bytes, as reported by `size_of`.
    pub const SIZE: usize = std::mem::size_of::<TextIndexHeader>();

    /// Builds the header of an empty index.
    ///
    /// The file header is created with `FileHeader::new_without_checksum`, the entry count
    /// is zero and the next entry offset points directly behind the header.
    pub fn new() -> Self {
        let header_len = Self::SIZE as u64;
        Self {
            file_header: FileHeader::new_without_checksum(TEXT_INDEX_MAGIC, TEXT_INDEX_VERSION, header_len),
            entry_count: 0,
            next_entry_offset: header_len,
            _padding: [0; 12],
        }
    }

    /// Builds the header of an empty index whose file header is created with
    /// `FileHeader::new(.., data)` instead of `FileHeader::new_without_checksum`.
    ///
    /// NOTE(review): presumably `FileHeader::new` derives a checksum from `data` — confirm
    /// in `FileHeader`.
    pub fn new_with_data(data: &[u8]) -> Self {
        // Build the plain header first, then swap in the data-aware file header.
        let base = Self::new();
        Self {
            file_header: FileHeader::new(TEXT_INDEX_MAGIC, TEXT_INDEX_VERSION, Self::SIZE as u64, data),
            ..base
        }
    }

    /// Checks the header for consistency.
    ///
    /// Checks run in this order: magic bytes, version, file-header structure,
    /// `next_entry_offset >= SIZE`, and finally that the reserved padding is all zero.
    ///
    /// # Errors
    ///
    /// Propagates errors from the `FileHeader` checks and returns
    /// `ShardexError::Corruption` for a too-small offset or non-zero padding.
    pub fn validate(&self) -> Result<(), ShardexError> {
        self.validate_magic()?;
        self.file_header.validate_version(TEXT_INDEX_VERSION, TEXT_INDEX_VERSION)?;
        self.file_header.validate_structure()?;

        let header_len = Self::SIZE as u64;
        if self.next_entry_offset < header_len {
            return Err(ShardexError::Corruption(format!(
                "Invalid next_entry_offset: {} is less than header size {}",
                self.next_entry_offset,
                Self::SIZE
            )));
        }

        let padding_dirty = self._padding.iter().any(|&byte| byte != 0);
        if padding_dirty {
            return Err(ShardexError::Corruption(String::from(
                "TextIndexHeader padding is not zero",
            )));
        }

        Ok(())
    }

    /// Checks that the file header carries `TEXT_INDEX_MAGIC`.
    pub fn validate_magic(&self) -> Result<(), ShardexError> {
        self.file_header.validate_magic(TEXT_INDEX_MAGIC)
    }

    /// Validates the file-header checksum against `data`.
    ///
    /// # Errors
    ///
    /// Any failure reported by `FileHeader::validate_checksum` is rewrapped as
    /// `ShardexError::Corruption` with a `TextIndexHeader`-specific prefix.
    pub fn validate_checksum(&self, data: &[u8]) -> Result<(), ShardexError> {
        match self.file_header.validate_checksum(data) {
            Ok(()) => Ok(()),
            Err(cause) => Err(ShardexError::Corruption(format!(
                "TextIndexHeader checksum validation failed: {}",
                cause
            ))),
        }
    }

    /// Forwards to `FileHeader::update_checksum` with `data`.
    pub fn update_checksum(&mut self, data: &[u8]) {
        self.file_header.update_checksum(data);
    }

    /// Returns the stored offset for the next entry.
    pub fn next_entry_offset(&self) -> u64 {
        self.next_entry_offset
    }

    /// Returns the offset of the entry with index `entry_index`:
    /// header size plus `entry_index` entry sizes.
    pub fn offset_for_entry(&self, entry_index: u32) -> u64 {
        let entries_before = u64::from(entry_index) * DocumentTextEntry::SIZE as u64;
        Self::SIZE as u64 + entries_before
    }

    /// Records one more entry: bumps the count and advances the next entry offset by one
    /// entry size. The additions are plain `+=` (no explicit overflow handling).
    pub fn add_entry(&mut self) {
        let entry_len = DocumentTextEntry::SIZE as u64;
        self.entry_count += 1;
        self.next_entry_offset += entry_len;
    }

    /// Returns `true` while no entry has been recorded.
    pub fn is_empty(&self) -> bool {
        self.entry_count == 0
    }

    /// Returns the number of bytes covered by all recorded entries.
    pub fn total_entries_size(&self) -> u64 {
        u64::from(self.entry_count) * DocumentTextEntry::SIZE as u64
    }
}
impl DocumentTextEntry {
    /// Size of one entry in bytes, as reported by `size_of`.
    pub const SIZE: usize = std::mem::size_of::<DocumentTextEntry>();

    /// Creates an entry from its three fields without validating them.
    pub fn new(document_id: DocumentId, text_offset: u64, text_length: u64) -> Self {
        Self { document_id, text_offset, text_length }
    }

    /// Checks that the entry describes a plausible text span.
    ///
    /// Checks run in this order:
    /// 1. `text_length` must not be zero.
    /// 2. `text_length` must not exceed 100 * 1024 * 1024.
    /// 3. `text_offset + text_length` must not overflow `u64`.
    /// 4. `text_offset + text_length` must not exceed 10^12.
    ///
    /// # Errors
    ///
    /// Returns `ShardexError::InvalidInput` describing the first failed check.
    pub fn validate(&self) -> Result<(), ShardexError> {
        const MAX_TEXT_SIZE: u64 = 100 * 1024 * 1024;
        const MAX_FILE_SIZE: u64 = 10_u64.pow(12);

        // Builds the error value; only invoked on a failing path.
        let reject = |field: &str, reason: String, suggestion: &str| -> Result<(), ShardexError> {
            Err(ShardexError::InvalidInput {
                field: field.to_string(),
                reason,
                suggestion: suggestion.to_string(),
            })
        };

        if self.text_length == 0 {
            return reject(
                "text_length",
                "Text length cannot be zero".to_string(),
                "Store non-empty text or remove the document",
            );
        }

        if self.text_length > MAX_TEXT_SIZE {
            return reject(
                "text_length",
                format!(
                    "Text length {} exceeds maximum allowed size {}",
                    self.text_length, MAX_TEXT_SIZE
                ),
                "Break large documents into smaller chunks",
            );
        }

        match self.end_offset() {
            None => reject(
                "text_offset",
                "Text offset + length overflows u64".to_string(),
                "Use smaller offset or length values",
            ),
            Some(end_offset) if end_offset > MAX_FILE_SIZE => reject(
                "text_offset",
                format!("Text end position {} exceeds file size limit", end_offset),
                "Use separate data files for very large datasets",
            ),
            Some(_) => Ok(()),
        }
    }

    /// Returns `true` when this entry's document id equals `document_id`.
    pub fn is_for_document(&self, document_id: DocumentId) -> bool {
        self.document_id == document_id
    }

    /// Returns the exclusive end of the span (`text_offset + text_length`), or `None` when
    /// that sum overflows `u64`.
    pub fn end_offset(&self) -> Option<u64> {
        self.text_offset.checked_add(self.text_length)
    }

    /// Returns `true` when the half-open spans of `self` and `other` intersect.
    ///
    /// If the end of either span cannot be computed (overflow), the result is `false`.
    pub fn overlaps_with(&self, other: &DocumentTextEntry) -> bool {
        match (self.end_offset(), other.end_offset()) {
            (Some(self_end), Some(other_end)) => {
                self.text_offset < other_end && other.text_offset < self_end
            }
            _ => false,
        }
    }
}
impl TextDataHeader {
    /// Size of the header in bytes, as reported by `size_of`.
    pub const SIZE: usize = std::mem::size_of::<TextDataHeader>();

    /// Builds the header of an empty text data file.
    ///
    /// The file header is created with `FileHeader::new_without_checksum`, the total text
    /// size is zero and the next text offset points directly behind the header.
    pub fn new() -> Self {
        let header_len = Self::SIZE as u64;
        Self {
            file_header: FileHeader::new_without_checksum(TEXT_DATA_MAGIC, TEXT_DATA_VERSION, header_len),
            total_text_size: 0,
            next_text_offset: header_len,
            _padding: [0; 8],
        }
    }

    /// Builds the header of an empty text data file whose file header is created with
    /// `FileHeader::new(.., data)` instead of `FileHeader::new_without_checksum`.
    ///
    /// NOTE(review): presumably `FileHeader::new` derives a checksum from `data` — confirm
    /// in `FileHeader`.
    pub fn new_with_data(data: &[u8]) -> Self {
        // Build the plain header first, then swap in the data-aware file header.
        let base = Self::new();
        Self {
            file_header: FileHeader::new(TEXT_DATA_MAGIC, TEXT_DATA_VERSION, Self::SIZE as u64, data),
            ..base
        }
    }

    /// Checks the header for consistency.
    ///
    /// Checks run in this order: magic bytes, version, file-header structure,
    /// `next_text_offset >= SIZE`, and finally that the reserved padding is all zero.
    ///
    /// # Errors
    ///
    /// Propagates errors from the `FileHeader` checks and returns
    /// `ShardexError::Corruption` for a too-small offset or non-zero padding.
    pub fn validate(&self) -> Result<(), ShardexError> {
        self.validate_magic()?;
        self.file_header.validate_version(TEXT_DATA_VERSION, TEXT_DATA_VERSION)?;
        self.file_header.validate_structure()?;

        let header_len = Self::SIZE as u64;
        if self.next_text_offset < header_len {
            return Err(ShardexError::Corruption(format!(
                "Invalid next_text_offset: {} is less than header size {}",
                self.next_text_offset,
                Self::SIZE
            )));
        }

        let padding_dirty = self._padding.iter().any(|&byte| byte != 0);
        if padding_dirty {
            return Err(ShardexError::Corruption(String::from(
                "TextDataHeader padding is not zero",
            )));
        }

        Ok(())
    }

    /// Checks that the file header carries `TEXT_DATA_MAGIC`.
    pub fn validate_magic(&self) -> Result<(), ShardexError> {
        self.file_header.validate_magic(TEXT_DATA_MAGIC)
    }

    /// Validates the file-header checksum against `data`.
    ///
    /// # Errors
    ///
    /// Any failure reported by `FileHeader::validate_checksum` is rewrapped as
    /// `ShardexError::Corruption` with a `TextDataHeader`-specific prefix.
    pub fn validate_checksum(&self, data: &[u8]) -> Result<(), ShardexError> {
        match self.file_header.validate_checksum(data) {
            Ok(()) => Ok(()),
            Err(cause) => Err(ShardexError::Corruption(format!(
                "TextDataHeader checksum validation failed: {}",
                cause
            ))),
        }
    }

    /// Forwards to `FileHeader::update_checksum` with `data`.
    pub fn update_checksum(&mut self, data: &[u8]) {
        self.file_header.update_checksum(data);
    }

    /// Returns the stored offset for the next text block.
    pub fn next_text_offset(&self) -> u64 {
        self.next_text_offset
    }

    /// Accounts for one stored text of `text_length`.
    ///
    /// The total text size grows by `text_length`; the next text offset grows by
    /// `text_length + 8`. The additions are plain arithmetic (no explicit overflow handling).
    pub fn add_text(&mut self, text_length: u64) {
        // NOTE(review): the 8 extra bytes per text are presumably framing such as a length
        // prefix — confirm against the code that writes text blocks.
        const PER_TEXT_OVERHEAD: u64 = 8;
        self.total_text_size += text_length;
        self.next_text_offset += text_length + PER_TEXT_OVERHEAD;
    }

    /// Returns `true` while the accumulated text size is zero.
    pub fn is_empty(&self) -> bool {
        self.total_text_size == 0
    }

    /// Returns `total_text_size` divided by the number of bytes between the end of the
    /// header and `next_text_offset`, or `0.0` when that span is empty or negative.
    pub fn utilization_ratio(&self) -> f64 {
        match self.next_text_offset.checked_sub(Self::SIZE as u64) {
            None | Some(0) => 0.0,
            Some(used_span) => self.total_text_size as f64 / used_span as f64,
        }
    }
}
impl Default for TextIndexHeader {
fn default() -> Self {
Self::new()
}
}
impl Default for TextDataHeader {
fn default() -> Self {
Self::new()
}
}
// Unit tests for the text index/data headers and the document text entry:
// construction, validation, offset arithmetic, bytemuck round-trips and layout checks.
#[cfg(test)]
mod tests {
use super::*;
use std::mem;
// A fresh index header has no entries, points just past itself, has zeroed padding and
// carries the index magic/version.
#[test]
fn test_text_index_header_creation() {
let header = TextIndexHeader::new();
assert_eq!(header.entry_count, 0);
assert_eq!(header.next_entry_offset, TextIndexHeader::SIZE as u64);
assert_eq!(header._padding, [0; 12]);
assert_eq!(header.file_header.magic, *TEXT_INDEX_MAGIC);
assert_eq!(header.file_header.version, TEXT_INDEX_VERSION);
}
// A fresh data header has no text, points just past itself, has zeroed padding and
// carries the data magic/version.
#[test]
fn test_text_data_header_creation() {
let header = TextDataHeader::new();
assert_eq!(header.total_text_size, 0);
assert_eq!(header.next_text_offset, TextDataHeader::SIZE as u64);
assert_eq!(header._padding, [0; 8]);
assert_eq!(header.file_header.magic, *TEXT_DATA_MAGIC);
assert_eq!(header.file_header.version, TEXT_DATA_VERSION);
}
// `DocumentTextEntry::new` stores its arguments unchanged.
#[test]
fn test_document_text_entry_creation() {
let doc_id = DocumentId::new();
let entry = DocumentTextEntry::new(doc_id, 1024, 512);
assert_eq!(entry.document_id, doc_id);
assert_eq!(entry.text_offset, 1024);
assert_eq!(entry.text_length, 512);
}
// Freshly created headers pass both the full validation and the magic-only check.
#[test]
fn test_header_validation() {
let index_header = TextIndexHeader::new();
let data_header = TextDataHeader::new();
assert!(index_header.validate().is_ok());
assert!(data_header.validate().is_ok());
assert!(index_header.validate_magic().is_ok());
assert!(data_header.validate_magic().is_ok());
}
// Entry validation accepts a normal span and rejects zero-length and oversized
// (200 * 1024 * 1024 > limit of 100 * 1024 * 1024) spans.
#[test]
fn test_document_text_entry_validation() {
let doc_id = DocumentId::new();
let valid_entry = DocumentTextEntry::new(doc_id, 1024, 512);
assert!(valid_entry.validate().is_ok());
let zero_length_entry = DocumentTextEntry::new(doc_id, 1024, 0);
assert!(zero_length_entry.validate().is_err());
let too_large_entry = DocumentTextEntry::new(doc_id, 1024, 200 * 1024 * 1024); assert!(too_large_entry.validate().is_err());
}
// Spans 100..150 and 125..175 overlap (in both directions); 100..150 and 200..250 do not.
#[test]
fn test_entry_overlap_detection() {
let doc_id = DocumentId::new();
let entry1 = DocumentTextEntry::new(doc_id, 100, 50); let entry2 = DocumentTextEntry::new(doc_id, 125, 50); let entry3 = DocumentTextEntry::new(doc_id, 200, 50);
assert!(entry1.overlaps_with(&entry2));
assert!(entry2.overlaps_with(&entry1));
assert!(!entry1.overlaps_with(&entry3));
assert!(!entry3.overlaps_with(&entry1));
}
// `add_entry` advances by one entry size; `add_text(512)` advances the data offset by
// 512 + 8 while the total text size grows by 512 only.
#[test]
fn test_header_add_operations() {
let mut index_header = TextIndexHeader::new();
let mut data_header = TextDataHeader::new();
assert!(index_header.is_empty());
assert!(data_header.is_empty());
index_header.add_entry();
assert_eq!(index_header.entry_count, 1);
assert_eq!(
index_header.next_entry_offset,
TextIndexHeader::SIZE as u64 + DocumentTextEntry::SIZE as u64
);
assert!(!index_header.is_empty());
data_header.add_text(512);
assert_eq!(data_header.total_text_size, 512);
assert_eq!(data_header.next_text_offset, TextDataHeader::SIZE as u64 + 512 + 8);
assert!(!data_header.is_empty());
}
// All three types survive a `bytes_of` / `pod_read_unaligned` round-trip and their byte
// views have exactly `SIZE` bytes.
#[test]
fn test_bytemuck_compatibility() {
let doc_id = DocumentId::new();
let index_header = TextIndexHeader::new();
let data_header = TextDataHeader::new();
let entry = DocumentTextEntry::new(doc_id, 1024, 512);
let index_bytes: &[u8] = bytemuck::bytes_of(&index_header);
let data_bytes: &[u8] = bytemuck::bytes_of(&data_header);
let entry_bytes: &[u8] = bytemuck::bytes_of(&entry);
assert_eq!(index_bytes.len(), TextIndexHeader::SIZE);
assert_eq!(data_bytes.len(), TextDataHeader::SIZE);
assert_eq!(entry_bytes.len(), DocumentTextEntry::SIZE);
let index_restored: TextIndexHeader = bytemuck::pod_read_unaligned(index_bytes);
let data_restored: TextDataHeader = bytemuck::pod_read_unaligned(data_bytes);
let entry_restored: DocumentTextEntry = bytemuck::pod_read_unaligned(entry_bytes);
assert_eq!(index_header, index_restored);
assert_eq!(data_header, data_restored);
assert_eq!(entry, entry_restored);
}
// Zeroed instances have all numeric fields (and the raw document id) equal to zero.
#[test]
fn test_zeroable_trait() {
let zero_index: TextIndexHeader = bytemuck::Zeroable::zeroed();
let zero_data: TextDataHeader = bytemuck::Zeroable::zeroed();
let zero_entry: DocumentTextEntry = bytemuck::Zeroable::zeroed();
assert_eq!(zero_index.entry_count, 0);
assert_eq!(zero_index.next_entry_offset, 0);
assert_eq!(zero_data.total_text_size, 0);
assert_eq!(zero_data.next_text_offset, 0);
assert_eq!(zero_entry.document_id.raw(), 0);
assert_eq!(zero_entry.text_offset, 0);
assert_eq!(zero_entry.text_length, 0);
}
// Layout expectations: lower bounds for the header sizes (the literal 80 is presumably the
// size of `FileHeader` — confirm), an exact 32-byte entry, at least 8-byte alignment, and
// sizes that are multiples of the alignment.
#[test]
fn test_memory_layout_sizes() {
let index_size = mem::size_of::<TextIndexHeader>();
let data_size = mem::size_of::<TextDataHeader>();
let entry_size = mem::size_of::<DocumentTextEntry>();
assert!(index_size >= 80 + 4 + 8); assert!(data_size >= 80 + 8 + 8); assert_eq!(entry_size, 32);
assert!(mem::align_of::<TextIndexHeader>() >= 8);
assert!(mem::align_of::<TextDataHeader>() >= 8);
assert!(mem::align_of::<DocumentTextEntry>() >= 8);
assert_eq!(TextIndexHeader::SIZE, index_size);
assert_eq!(TextDataHeader::SIZE, data_size);
assert_eq!(DocumentTextEntry::SIZE, entry_size);
assert_eq!(TextIndexHeader::SIZE % mem::align_of::<TextIndexHeader>(), 0);
assert_eq!(TextDataHeader::SIZE % mem::align_of::<TextDataHeader>(), 0);
assert_eq!(DocumentTextEntry::SIZE % mem::align_of::<DocumentTextEntry>(), 0);
}
// Pins the magic bytes, the version numbers and the `SIZE` constants.
#[test]
fn test_constants() {
assert_eq!(TEXT_INDEX_MAGIC, b"TIDX");
assert_eq!(TEXT_DATA_MAGIC, b"TDAT");
assert_eq!(TEXT_INDEX_VERSION, 1);
assert_eq!(TEXT_DATA_VERSION, 1);
assert_eq!(TextIndexHeader::SIZE, mem::size_of::<TextIndexHeader>());
assert_eq!(TextDataHeader::SIZE, mem::size_of::<TextDataHeader>());
assert_eq!(DocumentTextEntry::SIZE, mem::size_of::<DocumentTextEntry>());
}
// `is_for_document` matches only the stored id (this relies on two `DocumentId::new()`
// calls yielding different ids); `end_offset` is `None` when offset + length overflows.
#[test]
fn test_entry_helper_methods() {
let doc_id = DocumentId::new();
let entry = DocumentTextEntry::new(doc_id, 1000, 500);
assert!(entry.is_for_document(doc_id));
assert!(!entry.is_for_document(DocumentId::new()));
assert_eq!(entry.end_offset(), Some(1500));
let overflow_entry = DocumentTextEntry::new(doc_id, u64::MAX, 1);
assert_eq!(overflow_entry.end_offset(), None);
}
// Entry `i` lives at header size + i * entry size; the total entries size is
// entry_count * entry size.
#[test]
fn test_header_offset_calculations() {
let header = TextIndexHeader::new();
assert_eq!(header.offset_for_entry(0), TextIndexHeader::SIZE as u64);
assert_eq!(
header.offset_for_entry(1),
TextIndexHeader::SIZE as u64 + DocumentTextEntry::SIZE as u64
);
assert_eq!(
header.offset_for_entry(10),
TextIndexHeader::SIZE as u64 + 10 * DocumentTextEntry::SIZE as u64
);
assert_eq!(header.total_entries_size(), 0);
// `TextIndexHeader` is `Copy`, so this is an independent copy that can be modified.
let mut header_with_entries = header;
header_with_entries.entry_count = 5;
assert_eq!(
header_with_entries.total_entries_size(),
5 * DocumentTextEntry::SIZE as u64
);
}
// The utilization ratio is 0.0 for an empty header and total_text_size divided by the
// bytes used after the header otherwise (1000 / 1500 here).
#[test]
fn test_utilization_calculation() {
let mut header = TextDataHeader::new();
assert_eq!(header.utilization_ratio(), 0.0);
header.total_text_size = 1000;
header.next_text_offset = TextDataHeader::SIZE as u64 + 1500;
let expected_ratio = 1000.0 / 1500.0;
assert!((header.utilization_ratio() - expected_ratio).abs() < 0.001);
}
// `Default::default()` agrees with `new()` on every field compared here (the file headers
// are compared by magic and version only).
#[test]
fn test_default_implementations() {
let default_index = TextIndexHeader::default();
let default_data = TextDataHeader::default();
let new_index = TextIndexHeader::new();
let new_data = TextDataHeader::new();
assert_eq!(default_index.entry_count, new_index.entry_count);
assert_eq!(default_index.next_entry_offset, new_index.next_entry_offset);
assert_eq!(default_index._padding, new_index._padding);
assert_eq!(default_index.file_header.magic, new_index.file_header.magic);
assert_eq!(default_index.file_header.version, new_index.file_header.version);
assert_eq!(default_data.total_text_size, new_data.total_text_size);
assert_eq!(default_data.next_text_offset, new_data.next_text_offset);
assert_eq!(default_data._padding, new_data._padding);
assert_eq!(default_data.file_header.magic, new_data.file_header.magic);
assert_eq!(default_data.file_header.version, new_data.file_header.version);
}
}