use serde::{Deserialize, Serialize};
use sha2::{Digest, Sha256};
use std::fs::File;
use std::io::{BufReader, BufWriter, Write};
use std::path::{Path, PathBuf};
use crate::Error;
use crate::document::{DocumentTree, ReasoningIndex};
use crate::error::Result;
/// Version number written into every persisted wrapper. The load functions in
/// this file reject a wrapper whose `version` field differs from this value.
const FORMAT_VERSION: u32 = 1;
/// Metadata describing a persisted document.
///
/// Fields marked `#[serde(default)]` fall back to their `Default` value when
/// they are missing from the serialized input.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DocumentMeta {
/// Document identifier, as passed to [`DocumentMeta::new`].
pub id: String,
/// Document name, as passed to [`DocumentMeta::new`].
pub name: String,
/// Format label, as passed to [`DocumentMeta::new`] (the tests in this file use `"md"` and `"pdf"`).
pub format: String,
/// Source path; `None` until set via [`DocumentMeta::with_source_path`].
pub source_path: Option<PathBuf>,
/// Free-form description; `None` until set via [`DocumentMeta::with_description`].
pub description: Option<String>,
/// Page count, when available (`None` on construction).
pub page_count: Option<usize>,
/// Line count, when available (`None` on construction).
pub line_count: Option<usize>,
/// UTC timestamp taken when [`DocumentMeta::new`] ran.
pub created_at: chrono::DateTime<chrono::Utc>,
/// UTC timestamp refreshed by [`DocumentMeta::update_processing_stats`] and
/// [`DocumentMeta::mark_processed`]; equal to `created_at` on construction.
pub modified_at: chrono::DateTime<chrono::Utc>,
/// Fingerprint compared by [`DocumentMeta::needs_reprocessing`]. Starts as the zero
/// fingerprint and is left out of the serialized form while `is_zero` holds.
#[serde(
default,
skip_serializing_if = "crate::utils::fingerprint::Fingerprint::is_zero"
)]
pub content_fingerprint: crate::utils::fingerprint::Fingerprint,
/// Second fingerprint, set via [`DocumentMeta::with_logic_fingerprint`]. Starts as the
/// zero fingerprint and is left out of the serialized form while `is_zero` holds.
#[serde(
default,
skip_serializing_if = "crate::utils::fingerprint::Fingerprint::is_zero"
)]
pub logic_fingerprint: crate::utils::fingerprint::Fingerprint,
/// Version recorded by [`DocumentMeta::mark_processed`]. While this is `0`,
/// [`DocumentMeta::needs_reprocessing`] always returns `true`.
#[serde(default)]
pub processing_version: u32,
/// Node count recorded by [`DocumentMeta::update_processing_stats`].
#[serde(default)]
pub node_count: usize,
/// Summary token total recorded by [`DocumentMeta::update_processing_stats`].
#[serde(default)]
pub total_summary_tokens: usize,
/// Model name recorded during processing; left out of the serialized form when `None`.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub processing_model: Option<String>,
/// Duration in milliseconds recorded by [`DocumentMeta::update_processing_stats`].
#[serde(default)]
pub processing_duration_ms: u64,
}
impl DocumentMeta {
    /// Creates metadata with the given identity fields.
    ///
    /// Both timestamps are set to the current UTC time, both fingerprints to the zero
    /// fingerprint, every optional field to `None`, and every counter to `0`.
    pub fn new(id: impl Into<String>, name: impl Into<String>, format: impl Into<String>) -> Self {
        let timestamp = chrono::Utc::now();
        let unset_fingerprint = crate::utils::fingerprint::Fingerprint::zero;
        Self {
            id: id.into(),
            name: name.into(),
            format: format.into(),
            source_path: None,
            description: None,
            page_count: None,
            line_count: None,
            created_at: timestamp,
            modified_at: timestamp,
            content_fingerprint: unset_fingerprint(),
            logic_fingerprint: unset_fingerprint(),
            processing_version: 0,
            node_count: 0,
            total_summary_tokens: 0,
            processing_model: None,
            processing_duration_ms: 0,
        }
    }

    /// Builder: records the path the document was read from.
    pub fn with_source_path(self, path: impl Into<PathBuf>) -> Self {
        Self {
            source_path: Some(path.into()),
            ..self
        }
    }

    /// Builder: attaches a description.
    pub fn with_description(self, desc: impl Into<String>) -> Self {
        Self {
            description: Some(desc.into()),
            ..self
        }
    }

    /// Builder: sets the content fingerprint.
    pub fn with_fingerprint(self, fp: crate::utils::fingerprint::Fingerprint) -> Self {
        Self {
            content_fingerprint: fp,
            ..self
        }
    }

    /// Builder: sets the logic fingerprint.
    pub fn with_logic_fingerprint(self, fp: crate::utils::fingerprint::Fingerprint) -> Self {
        Self {
            logic_fingerprint: fp,
            ..self
        }
    }

    /// Builder: sets the processing version.
    pub fn with_processing_version(self, version: u32) -> Self {
        Self {
            processing_version: version,
            ..self
        }
    }

    /// Builder: sets the processing model name.
    pub fn with_processing_model(self, model: impl Into<String>) -> Self {
        Self {
            processing_model: Some(model.into()),
            ..self
        }
    }

    /// Stores the node count, summary token total and duration of a processing run,
    /// and refreshes `modified_at` to the current UTC time.
    pub fn update_processing_stats(
        &mut self,
        node_count: usize,
        summary_tokens: usize,
        duration_ms: u64,
    ) {
        self.modified_at = chrono::Utc::now();
        self.processing_duration_ms = duration_ms;
        self.total_summary_tokens = summary_tokens;
        self.node_count = node_count;
    }

    /// Records the content fingerprint, processing version and (optional) model of a
    /// completed run, and refreshes `modified_at` to the current UTC time.
    ///
    /// Passing `None` for `model` clears any previously stored model name.
    pub fn mark_processed(
        &mut self,
        fp: crate::utils::fingerprint::Fingerprint,
        version: u32,
        model: Option<&str>,
    ) {
        self.content_fingerprint = fp;
        self.processing_version = version;
        self.processing_model = model.map(str::to_owned);
        self.modified_at = chrono::Utc::now();
    }

    /// Reports whether the document has to be processed again.
    ///
    /// Returns `true` when the stored processing version is `0`, when it is lower than
    /// `current_version`, or when the stored content fingerprint differs from
    /// `current_fp`; otherwise `false`.
    pub fn needs_reprocessing(
        &self,
        current_fp: &crate::utils::fingerprint::Fingerprint,
        current_version: u32,
    ) -> bool {
        self.processing_version == 0
            || self.processing_version < current_version
            || &self.content_fingerprint != current_fp
    }
}
/// A document as it is stored: metadata, tree, page contents and an optional reasoning index.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct PersistedDocument {
/// Metadata for this document.
pub meta: DocumentMeta,
/// The document tree.
pub tree: DocumentTree,
/// Page contents appended through `add_page`; deserializes to an empty list when absent.
#[serde(default)]
pub pages: Vec<PageContent>,
/// Optional reasoning index; left out of the serialized form when `None`.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub reasoning_index: Option<ReasoningIndex>,
}
impl PersistedDocument {
    /// Wraps `meta` and `tree` into a document with no pages and no reasoning index.
    pub fn new(meta: DocumentMeta, tree: DocumentTree) -> Self {
        let pages = Vec::new();
        let reasoning_index = None;
        Self {
            meta,
            tree,
            pages,
            reasoning_index,
        }
    }

    /// Appends the text of page number `page` to the end of `pages`.
    pub fn add_page(&mut self, page: usize, content: impl Into<String>) {
        let content = content.into();
        self.pages.push(PageContent { page, content });
    }
}
/// Text content stored for a single page of a document.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct PageContent {
/// Page number, as supplied to `PersistedDocument::add_page`.
pub page: usize,
/// Text of the page.
pub content: String,
}
/// Envelope written around every persisted payload (documents and indexes alike).
#[derive(Debug, Serialize, Deserialize)]
struct PersistedWrapper {
/// Format version; the save functions write `FORMAT_VERSION` here.
version: u32,
/// `calculate_checksum` output for the compact JSON serialization of `payload`.
checksum: String,
/// The wrapped data, kept as a generic JSON value.
payload: serde_json::Value,
}
/// Switches that control how the save and load functions behave.
#[derive(Debug, Clone)]
pub struct PersistenceOptions {
    /// When `true`, saves write to a temporary file that is then renamed over the target.
    pub atomic_writes: bool,
    /// When `true`, loads recompute the payload checksum and reject a mismatch.
    pub verify_checksum: bool,
}

impl Default for PersistenceOptions {
    /// Enables both atomic writes and checksum verification.
    fn default() -> Self {
        let enabled = true;
        Self {
            atomic_writes: enabled,
            verify_checksum: enabled,
        }
    }
}

impl PersistenceOptions {
    /// Same as [`PersistenceOptions::default`].
    pub fn new() -> Self {
        Self::default()
    }

    /// Builder: turns atomic writes on or off, leaving the other option untouched.
    pub fn with_atomic_writes(self, enabled: bool) -> Self {
        Self {
            atomic_writes: enabled,
            ..self
        }
    }

    /// Builder: turns checksum verification on or off, leaving the other option untouched.
    pub fn with_verify_checksum(self, enabled: bool) -> Self {
        Self {
            verify_checksum: enabled,
            ..self
        }
    }
}
/// Returns the SHA-256 digest of `data`, formatted with `{:x}` (lowercase hexadecimal).
fn calculate_checksum(data: &[u8]) -> String {
    let digest = Sha256::digest(data);
    format!("{:x}", digest)
}
/// Saves `doc` to `path` using [`PersistenceOptions::default`].
///
/// See [`save_document_with_options`] for the file format and error cases.
pub fn save_document(path: &Path, doc: &PersistedDocument) -> Result<()> {
    let defaults = PersistenceOptions::default();
    save_document_with_options(path, doc, &defaults)
}
pub fn save_document_with_options(
path: &Path,
doc: &PersistedDocument,
options: &PersistenceOptions,
) -> Result<()> {
let payload_value =
serde_json::to_value(doc).map_err(|e| Error::Serialization(e.to_string()))?;
let payload_bytes =
serde_json::to_vec(&payload_value).map_err(|e| Error::Serialization(e.to_string()))?;
let checksum = calculate_checksum(&payload_bytes);
let wrapper = PersistedWrapper {
version: FORMAT_VERSION,
checksum,
payload: payload_value,
};
let json =
serde_json::to_string_pretty(&wrapper).map_err(|e| Error::Serialization(e.to_string()))?;
if options.atomic_writes {
let temp_path = path.with_extension("tmp");
if let Some(parent) = path.parent() {
std::fs::create_dir_all(parent).map_err(Error::Io)?;
}
{
let file = File::create(&temp_path).map_err(Error::Io)?;
let mut writer = BufWriter::new(file);
writer.write_all(json.as_bytes()).map_err(Error::Io)?;
writer.flush().map_err(Error::Io)?;
}
std::fs::rename(&temp_path, path).map_err(Error::Io)?;
} else {
std::fs::write(path, json).map_err(Error::Io)?;
}
Ok(())
}
/// Loads a document from `path` using [`PersistenceOptions::default`].
///
/// See [`load_document_with_options`] for the error cases.
pub fn load_document(path: &Path) -> Result<PersistedDocument> {
    let defaults = PersistenceOptions::default();
    load_document_with_options(path, &defaults)
}
pub fn load_document_with_options(
path: &Path,
options: &PersistenceOptions,
) -> Result<PersistedDocument> {
if !path.exists() {
return Err(Error::DocumentNotFound(path.display().to_string()));
}
let file = File::open(path).map_err(Error::Io)?;
let reader = BufReader::new(file);
let wrapper: PersistedWrapper = serde_json::from_reader(reader)
.map_err(|e| Error::Parse(format!("Failed to parse document: {}", e)))?;
if wrapper.version != FORMAT_VERSION {
return Err(Error::Parse(format!(
"Unsupported format version: {} (expected {})",
wrapper.version, FORMAT_VERSION
)));
}
if options.verify_checksum {
let payload_bytes = serde_json::to_vec(&wrapper.payload)
.map_err(|e| Error::Serialization(e.to_string()))?;
let expected_checksum = calculate_checksum(&payload_bytes);
if wrapper.checksum != expected_checksum {
return Err(Error::Parse(format!(
"Checksum mismatch: expected {}, got {}",
expected_checksum, wrapper.checksum
)));
}
}
let doc: PersistedDocument = serde_json::from_value(wrapper.payload)
.map_err(|e| Error::Parse(format!("Failed to deserialize document: {}", e)))?;
Ok(doc)
}
/// Saves the index `entries` to `path` using [`PersistenceOptions::default`].
///
/// See [`save_index_with_options`] for the file format and error cases.
pub fn save_index(path: &Path, entries: &[DocumentMeta]) -> Result<()> {
    let defaults = PersistenceOptions::default();
    save_index_with_options(path, entries, &defaults)
}
pub fn save_index_with_options(
path: &Path,
entries: &[DocumentMeta],
options: &PersistenceOptions,
) -> Result<()> {
let payload_value =
serde_json::to_value(entries).map_err(|e| Error::Serialization(e.to_string()))?;
let payload_bytes =
serde_json::to_vec(&payload_value).map_err(|e| Error::Serialization(e.to_string()))?;
let checksum = calculate_checksum(&payload_bytes);
let wrapper = PersistedWrapper {
version: FORMAT_VERSION,
checksum,
payload: payload_value,
};
let json =
serde_json::to_string_pretty(&wrapper).map_err(|e| Error::Serialization(e.to_string()))?;
if options.atomic_writes {
let temp_path = path.with_extension("tmp");
if let Some(parent) = path.parent() {
std::fs::create_dir_all(parent).map_err(Error::Io)?;
}
{
let file = File::create(&temp_path).map_err(Error::Io)?;
let mut writer = BufWriter::new(file);
writer.write_all(json.as_bytes()).map_err(Error::Io)?;
writer.flush().map_err(Error::Io)?;
}
std::fs::rename(&temp_path, path).map_err(Error::Io)?;
} else {
std::fs::write(path, json).map_err(Error::Io)?;
}
Ok(())
}
/// Loads the index stored at `path` using [`PersistenceOptions::default`].
///
/// See [`load_index_with_options`] for the missing-file behaviour and error cases.
pub fn load_index(path: &Path) -> Result<Vec<DocumentMeta>> {
    let defaults = PersistenceOptions::default();
    load_index_with_options(path, &defaults)
}
pub fn load_index_with_options(
path: &Path,
options: &PersistenceOptions,
) -> Result<Vec<DocumentMeta>> {
if !path.exists() {
return Ok(Vec::new());
}
let file = File::open(path).map_err(Error::Io)?;
let reader = BufReader::new(file);
let wrapper: PersistedWrapper = serde_json::from_reader(reader)
.map_err(|e| Error::Parse(format!("Failed to parse index: {}", e)))?;
if wrapper.version != FORMAT_VERSION {
return Err(Error::Parse(format!(
"Unsupported format version: {} (expected {})",
wrapper.version, FORMAT_VERSION
)));
}
if options.verify_checksum {
let payload_bytes = serde_json::to_vec(&wrapper.payload)
.map_err(|e| Error::Serialization(e.to_string()))?;
let expected_checksum = calculate_checksum(&payload_bytes);
if wrapper.checksum != expected_checksum {
return Err(Error::Parse(format!(
"Checksum mismatch: expected {}, got {}",
expected_checksum, wrapper.checksum
)));
}
}
let entries: Vec<DocumentMeta> = serde_json::from_value(wrapper.payload)
.map_err(|e| Error::Parse(format!("Failed to deserialize index: {}", e)))?;
Ok(entries)
}
pub fn save_document_to_bytes(doc: &PersistedDocument) -> Result<Vec<u8>> {
let payload_value =
serde_json::to_value(doc).map_err(|e| Error::Serialization(e.to_string()))?;
let payload_bytes =
serde_json::to_vec(&payload_value).map_err(|e| Error::Serialization(e.to_string()))?;
let checksum = calculate_checksum(&payload_bytes);
let wrapper = PersistedWrapper {
version: FORMAT_VERSION,
checksum,
payload: payload_value,
};
serde_json::to_vec(&wrapper).map_err(|e| Error::Serialization(e.to_string()))
}
/// Parses a document from wrapper bytes with checksum verification enabled.
///
/// See [`load_document_from_bytes_with_options`] for the error cases.
pub fn load_document_from_bytes(data: &[u8]) -> Result<PersistedDocument> {
    let verify_checksum = true;
    load_document_from_bytes_with_options(data, verify_checksum)
}
pub fn load_document_from_bytes_with_options(
data: &[u8],
verify_checksum: bool,
) -> Result<PersistedDocument> {
let wrapper: PersistedWrapper = serde_json::from_slice(data)
.map_err(|e| Error::Parse(format!("Failed to parse document: {}", e)))?;
if wrapper.version != FORMAT_VERSION {
return Err(Error::VersionMismatch(format!(
"Expected version {}, got {}",
FORMAT_VERSION, wrapper.version
)));
}
if verify_checksum {
let payload_bytes = serde_json::to_vec(&wrapper.payload)
.map_err(|e| Error::Serialization(e.to_string()))?;
let expected_checksum = calculate_checksum(&payload_bytes);
if wrapper.checksum != expected_checksum {
return Err(Error::ChecksumMismatch(format!(
"Expected {}, got {}",
expected_checksum, wrapper.checksum
)));
}
}
let doc: PersistedDocument = serde_json::from_value(wrapper.payload)
.map_err(|e| Error::Parse(format!("Failed to deserialize document: {}", e)))?;
Ok(doc)
}
pub fn save_index_to_bytes(entries: &[DocumentMeta]) -> Result<Vec<u8>> {
let payload_value =
serde_json::to_value(entries).map_err(|e| Error::Serialization(e.to_string()))?;
let payload_bytes =
serde_json::to_vec(&payload_value).map_err(|e| Error::Serialization(e.to_string()))?;
let checksum = calculate_checksum(&payload_bytes);
let wrapper = PersistedWrapper {
version: FORMAT_VERSION,
checksum,
payload: payload_value,
};
serde_json::to_vec(&wrapper).map_err(|e| Error::Serialization(e.to_string()))
}
/// Parses index entries from wrapper bytes with checksum verification enabled.
///
/// See [`load_index_from_bytes_with_options`] for the error cases.
pub fn load_index_from_bytes(data: &[u8]) -> Result<Vec<DocumentMeta>> {
    let verify_checksum = true;
    load_index_from_bytes_with_options(data, verify_checksum)
}
pub fn load_index_from_bytes_with_options(
data: &[u8],
verify_checksum: bool,
) -> Result<Vec<DocumentMeta>> {
let wrapper: PersistedWrapper = serde_json::from_slice(data)
.map_err(|e| Error::Parse(format!("Failed to parse index: {}", e)))?;
if wrapper.version != FORMAT_VERSION {
return Err(Error::VersionMismatch(format!(
"Expected version {}, got {}",
FORMAT_VERSION, wrapper.version
)));
}
if verify_checksum {
let payload_bytes = serde_json::to_vec(&wrapper.payload)
.map_err(|e| Error::Serialization(e.to_string()))?;
let expected_checksum = calculate_checksum(&payload_bytes);
if wrapper.checksum != expected_checksum {
return Err(Error::ChecksumMismatch(format!(
"Expected {}, got {}",
expected_checksum, wrapper.checksum
)));
}
}
let entries: Vec<DocumentMeta> = serde_json::from_value(wrapper.payload)
.map_err(|e| Error::Parse(format!("Failed to deserialize index: {}", e)))?;
Ok(entries)
}
#[cfg(test)]
mod tests {
    use super::*;
    use tempfile::TempDir;

    /// Builds a minimal document whose metadata carries the given id.
    fn create_test_doc(id: &str) -> PersistedDocument {
        PersistedDocument::new(
            DocumentMeta::new(id, "Test Doc", "md"),
            DocumentTree::new("Root", "Content"),
        )
    }

    /// A saved document can be loaded back with its metadata intact.
    #[test]
    fn test_save_and_load_document() {
        let dir = TempDir::new().unwrap();
        let file_path = dir.path().join("test.json");

        save_document(&file_path, &create_test_doc("doc-1")).unwrap();

        let restored = load_document(&file_path).unwrap();
        assert_eq!(restored.meta.id, "doc-1");
        assert_eq!(restored.meta.name, "Test Doc");
    }

    /// An atomic save leaves no `.tmp` sibling behind and produces a loadable file.
    #[test]
    fn test_atomic_write() {
        let dir = TempDir::new().unwrap();
        let file_path = dir.path().join("atomic.json");
        let opts = PersistenceOptions::new().with_atomic_writes(true);

        save_document_with_options(&file_path, &create_test_doc("doc-atomic"), &opts).unwrap();

        assert!(!file_path.with_extension("tmp").exists());
        let restored = load_document(&file_path).unwrap();
        assert_eq!(restored.meta.id, "doc-atomic");
    }

    /// Editing the payload on disk makes the default load fail with a parse error.
    #[test]
    fn test_checksum_verification() {
        let dir = TempDir::new().unwrap();
        let file_path = dir.path().join("checksum.json");
        save_document(&file_path, &create_test_doc("doc-checksum")).unwrap();

        let tampered = std::fs::read_to_string(&file_path)
            .unwrap()
            .replace("doc-checksum", "doc-corrupted");
        std::fs::write(&file_path, tampered).unwrap();

        let outcome = load_document(&file_path);
        assert!(outcome.is_err());
        assert!(matches!(outcome.unwrap_err(), Error::Parse(_)));
    }

    /// With verification off, a zeroed checksum is accepted; with it on, rejected.
    #[test]
    fn test_checksum_disabled() {
        let dir = TempDir::new().unwrap();
        let file_path = dir.path().join("no-checksum.json");
        let original = create_test_doc("doc-no-check");
        save_document(&file_path, &original).unwrap();

        let lenient = PersistenceOptions::new().with_verify_checksum(false);
        let first_load = load_document_with_options(&file_path, &lenient);
        assert!(first_load.is_ok());
        assert_eq!(first_load.unwrap().meta.id, "doc-no-check");

        // Overwrite the stored checksum with zeros while leaving the payload untouched.
        let on_disk = std::fs::read_to_string(&file_path).unwrap();
        let payload = serde_json::to_value(&original).unwrap();
        let real_checksum = calculate_checksum(&serde_json::to_vec(&payload).unwrap());
        let zeroed = on_disk.replace(
            &real_checksum,
            "0000000000000000000000000000000000000000000000000000000000000000",
        );
        std::fs::write(&file_path, zeroed).unwrap();

        assert!(load_document_with_options(&file_path, &lenient).is_ok());

        let strict = PersistenceOptions::new().with_verify_checksum(true);
        assert!(load_document_with_options(&file_path, &strict).is_err());
    }

    /// Loading a document from a path that does not exist reports "not found".
    #[test]
    fn test_load_nonexistent() {
        let outcome = load_document(Path::new("/nonexistent/path.json"));
        assert!(outcome.is_err());
        assert!(outcome.unwrap_err().is_not_found());
    }

    /// A saved index round-trips with entry order and fields preserved.
    #[test]
    fn test_save_and_load_index() {
        let dir = TempDir::new().unwrap();
        let file_path = dir.path().join("meta.bin");
        let entries = vec![
            DocumentMeta::new("doc-1", "Doc 1", "md"),
            DocumentMeta::new("doc-2", "Doc 2", "pdf"),
        ];

        save_index(&file_path, &entries).unwrap();

        let restored = load_index(&file_path).unwrap();
        assert_eq!(restored.len(), 2);
        assert_eq!(restored[0].id, "doc-1");
        assert_eq!(restored[1].format, "pdf");
    }

    /// Loading an index that was never written yields an empty list, not an error.
    #[test]
    fn test_load_empty_index() {
        let dir = TempDir::new().unwrap();
        let missing = dir.path().join("nonexistent.json");

        assert!(load_index(&missing).unwrap().is_empty());
    }

    /// Equal inputs hash equally, different inputs differ, and the output is 64 characters.
    #[test]
    fn test_checksum_calculation() {
        let first = calculate_checksum(b"test data");
        let repeat = calculate_checksum(b"test data");
        let other = calculate_checksum(b"different data");

        assert_eq!(first, repeat);
        assert_ne!(first, other);
        assert_eq!(first.len(), 64);
    }
}