mf_file/
document.rs

1use std::io;
2use std::io::{Seek, SeekFrom, Write};
3use std::path::{Path, PathBuf};
4
5use blake3::Hasher as Blake3;
6use serde::{Deserialize, Serialize};
7
8use crate::error::{FileError, Result};
9use crate::record::{crc32, read_u32_le, Reader, Writer, HEADER_LEN, REC_HDR};
10
// Fixed tail pointer: lets a reader locate the directory record's start
// offset quickly after `finalize`, avoiding a full scan of the file.
// The tail written by `finalize` is this magic followed by the directory
// offset as 8 little-endian bytes.
const TAIL_MAGIC: &[u8; 8] = b"MFFTAIL1"; // 8-byte magic + 8-byte directory offset (LE)
13
/// Segment type: describes the category of data stored in a container
/// segment.
///
/// NOTE(review): this enum is serialized as part of [`Directory`] via
/// bincode; reordering or removing variants presumably changes the encoded
/// form — confirm before touching the variant list.
#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq)]
pub enum SegmentType {
    Meta,
    Schema,
    Snapshot,
    Assets,
    History,
    Index,
    Directory,
}
25
/// Directory entry for one segment: records its type, offset, length and
/// CRC.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SegmentEntry {
    /// Category of the data stored in the segment.
    pub kind: SegmentType,
    /// Offset of the segment's record. `DocumentWriter::add_segment` stores
    /// the writer length observed just before the record is appended, and
    /// `DocumentReader::read_segment` passes this value to `Reader::get_at`.
    pub offset: u64,
    /// `REC_HDR` plus the payload length in bytes, as set by
    /// `DocumentWriter::add_segment`.
    pub length: u64,
    /// CRC32 of the payload bytes; re-checked by
    /// `DocumentReader::read_segment`.
    pub crc32: u32,
}
34
/// Master directory: the index of all segments plus a file-level hash.
///
/// It is bincode-encoded and appended as the last record by
/// `DocumentWriter::finalize`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Directory {
    /// Segment entries in the order they were added to the writer.
    pub entries: Vec<SegmentEntry>,
    /// `DocumentWriter::finalize` always writes 0 here; no other meaning is
    /// established in this file.
    pub flags: u32,
    /// BLAKE3 hash of the record bytes that precede the directory record;
    /// computed in `DocumentWriter::finalize` and verified in
    /// `DocumentReader::open`.
    pub file_hash: [u8; 32],
}
42
/// Document writer: appends segments to an append-only file and writes the
/// directory at the end.
pub struct DocumentWriter {
    // Underlying record writer that every segment and the directory go through.
    w: Writer,
    // Directory entries accumulated by `add_segment`, moved into the
    // `Directory` by `finalize`.
    segments: Vec<SegmentEntry>,
    // Path of the file being written; `finalize` reopens it to compute the hash.
    path: PathBuf,
}
49impl DocumentWriter {
50    // 开始写入
51    pub fn begin<P: AsRef<Path>>(path: P) -> Result<Self> {
52        let p = path.as_ref().to_path_buf();
53        Ok(Self { w: Writer::create(&p, 0)?, segments: Vec::new(), path: p })
54    }
55    // 追加一个段
56    pub fn add_segment(
57        &mut self,
58        kind: SegmentType,
59        payload: &[u8],
60    ) -> Result<()> {
61        let off = self.w.len();
62        let _ = self.w.append(payload)?;
63        let crc = crc32(payload);
64        self.segments.push(SegmentEntry {
65            kind,
66            offset: off,
67            length: (REC_HDR as u64) + payload.len() as u64,
68            crc32: crc,
69        });
70        Ok(())
71    }
72    // 完成写入:生成并写入目录,计算全文件哈希
73    pub fn finalize(mut self) -> Result<()> {
74        // 计算数据哈希
75        self.w.flush()?;
76        let mut hasher = Blake3::new();
77        let r = Reader::open(&self.path)?;
78        for bytes in r.iter() {
79            hasher.update(bytes);
80        }
81        let hash = *hasher.finalize().as_bytes();
82        // 写入目录记录
83        let dir =
84            Directory { entries: self.segments, flags: 0, file_hash: hash };
85        let bytes =
86            bincode::serde::encode_to_vec(&dir, bincode::config::standard())
87                .map_err(|e| io::Error::new(io::ErrorKind::Other, e))
88                .map_err(FileError::Io)?;
89        let dir_off = self.w.append(&bytes)?;
90        self.w.flush()?;
91
92        // 写入尾指针,不计入逻辑长度:MAGIC(8) + dir_off(8)
93        // 这样 Reader 扫描逻辑结尾仍停在目录记录处,但可通过物理文件尾部快速读取目录偏移
94        {
95            // 直接使用底层文件写入尾部,不更新 logical_end
96            let file = &mut self.w.file;
97            file.seek(SeekFrom::Start(self.w.logical_end))?;
98            file.write_all(TAIL_MAGIC)?;
99            file.write_all(&dir_off.to_le_bytes())?;
100            file.sync_data()?;
101        }
102        Ok(())
103    }
104}
105
/// Document reader: reads the trailing directory and provides access to
/// segments.
pub struct DocumentReader {
    // Underlying record reader over the opened file.
    r: Reader,
    // Directory decoded from the last record when the file was opened.
    dir: Directory,
}
111impl DocumentReader {
112    // 打开并读取目录
113    pub fn open<P: AsRef<Path>>(path: P) -> Result<Self> {
114        let r = Reader::open(path)?;
115        // 优先通过尾指针快速定位目录偏移
116        let mut last_off = HEADER_LEN as u64;
117        let phys_len = r.mmap.len();
118        if phys_len >= 16 {
119            let tail = &r.mmap[phys_len - 16..phys_len];
120            if &tail[..8] == TAIL_MAGIC {
121                let mut off_bytes = [0u8; 8];
122                off_bytes.copy_from_slice(&tail[8..16]);
123                let off = u64::from_le_bytes(off_bytes);
124                // 基本校验:offset 落在逻辑区间内且指向一条有效记录
125                if (off as usize) + REC_HDR <= r.logical_end as usize {
126                    let len =
127                        read_u32_le(&r.mmap[off as usize..off as usize + 4])
128                            as usize;
129                    let s = off as usize + REC_HDR;
130                    let e = s + len;
131                    if e <= r.logical_end as usize {
132                        let stored_crc = read_u32_le(
133                            &r.mmap[off as usize + 4..off as usize + 8],
134                        );
135                        if crc32(&r.mmap[s..e]) == stored_crc {
136                            last_off = off;
137                        }
138                    }
139                }
140            }
141        }
142        // 如尾指针缺失/非法,回退到顺序扫描
143        if last_off == (HEADER_LEN as u64) {
144            let mut p = HEADER_LEN;
145            let end = r.logical_end as usize;
146            let mut fallback_last = HEADER_LEN as u64;
147            while p + REC_HDR <= end {
148                let len = read_u32_le(&r.mmap[p..p + 4]) as usize;
149                if len == 0 {
150                    break;
151                }
152                let s = p + REC_HDR;
153                let e = s + len;
154                if e > end {
155                    break;
156                }
157                let stored_crc = read_u32_le(&r.mmap[p + 4..p + 8]);
158                if crc32(&r.mmap[s..e]) != stored_crc {
159                    break;
160                }
161                fallback_last = p as u64;
162                p = e;
163            }
164            last_off = fallback_last;
165        }
166        let dir_bytes = r.get_at(last_off)?;
167        let (dir, _) = bincode::serde::decode_from_slice::<Directory, _>(
168            dir_bytes,
169            bincode::config::standard(),
170        )
171        .map_err(|e| io::Error::new(io::ErrorKind::Other, e))
172        .map_err(FileError::Io)?;
173        // 校验除目录外的数据哈希
174        let mut hasher = Blake3::new();
175        let mut q = HEADER_LEN;
176        let end2 = last_off as usize;
177        while q + REC_HDR <= end2 {
178            let len = read_u32_le(&r.mmap[q..q + 4]) as usize;
179            if len == 0 {
180                break;
181            }
182            let s = q + REC_HDR;
183            let e = s + len;
184            if e > end2 {
185                break;
186            }
187            let stored_crc = read_u32_le(&r.mmap[q + 4..q + 8]);
188            if crc32(&r.mmap[s..e]) != stored_crc {
189                break;
190            }
191            hasher.update(&r.mmap[s..e]);
192            q = e;
193        }
194        let calc = *hasher.finalize().as_bytes();
195        if calc != dir.file_hash {
196            return Err(FileError::BadHeader);
197        }
198        Ok(Self { r, dir })
199    }
200    // 按类型读取段负载
201    pub fn read_segment(
202        &self,
203        kind: SegmentType,
204    ) -> Result<Option<&[u8]>> {
205        if let Some(entry) =
206            self.dir.entries.iter().rev().find(|e| e.kind == kind)
207        {
208            let bytes = self.r.get_at(entry.offset)?;
209            if crc32(bytes) != entry.crc32 {
210                return Err(FileError::CrcMismatch(entry.offset));
211            }
212            return Ok(Some(bytes));
213        }
214        Ok(None)
215    }
216}