docufort/
integrity.rs

1//! This module contains the integrity check function for a docufort file.
2//!
3//! This will read the file from the beginning to the end, checking the integrity of the file.
4//! It will attempt to correct any errors it finds in the data using any available ECC data.
5
6use std::io::SeekFrom;
7
8use crate::{core::{Block, BlockInputs, BlockState}, read::{read_magic_number, verify_configs}, recovery::{try_read_block, BlockReadSummary}, ComponentTag, CorruptDataSegment, FileLike, ReadWriteError};
9
10
11/// The struct returned when we were able to recover the file.
12///
13/// Includes statistics on the file and the last block state.
14#[derive(Debug)]
15pub struct IntegrityCheckOk{
16    pub last_block_state:Option<BlockState>,
17    ///Number of errors we fixed and wrote back to the file
18    ///Does not indicate number of bytes corrected
19    ///To estimate: ECC_LEN/2 is number of correctable errors per 255 bytes
20    ///So if we did not return Err::Corruption, there was always less than that many errors per 'ecc chunk'.
21    pub errors_corrected: usize,
22    ///Number of bytes of 'Content' (without ECC data counted) in the file.
23    pub data_contents: u64,
24    ///Number of bytes of 'Content' in the compressed form (no ECC counted).
25    pub data_size_on_disk: u64,
26    ///Number of Blocks in file
27    pub num_blocks:usize,
28    ///This is the index up to which we checked
29    ///It may be in the middle of a block
30    pub file_len_checked:u64,
31    ///These are all the content data segments that are not 'as written'
32    ///They can either be corrupted and have no ECC or
33    ///they can be corrupted beyond what ECC can do.
34    pub corrupted_segments: Vec<CorruptDataSegment>,
35    ///Contains the block start position and the time stamp found there
36    pub block_times: Vec<(u64,u64)>
37
38}
39#[derive(Debug)]
40pub enum IntegrityErr{
41    Other(ReadWriteError),
42    ///This only returns if a Component Header (or hash) is corrupted.
43    ///We cannot process the file any farther. We only read Front to Back so the position is all the farther we checked the file.
44    ///The file may still be able to succeed at tail recovery if this corruption is earlier than the second to last block.
45    ///If found in the last block, then a tail recovery would truncate this block.
46    ///Integrity check handles the last block, so if you have this error then somehow part of the file got corrupted, badly.
47    Corruption(u64,ComponentTag), // TODO: Make a hash recovery routine in the unlikely event the hash is corrupt and nothing else is.
48    ///This is really an implementation error, where we find the wrong 'pattern' of headers. This should only occur in testing ideally.
49    InvalidBlockStructure{start_of_bad_component:u64},
50    ///Either the MAGIC_NUMBER, the V1 tag, or the ECC_LEN don't match this compiled program.
51    ///Most likely would happen if you upgraded or have multiple docufort wrappers that use a different ECC_LEN
52    ///You should only open docufort files that were written with the current compiled software.
53    FileConfigMisMatch
54}
55impl From<std::io::Error> for IntegrityErr{
56    fn from(value: std::io::Error) -> Self {
57        Self::Other(value.into())
58    }
59}
60impl From<ReadWriteError> for IntegrityErr{
61    fn from(value: ReadWriteError) -> Self {
62        Self::Other(value)
63    }
64}
65impl std::fmt::Display for IntegrityErr {
66    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
67        match self {
68            IntegrityErr::Other(err) => write!(f, "Other error: {}", err),
69            IntegrityErr::Corruption(pos, tag) => write!(f, "Corruption detected at position {} for component {:?}", pos, tag),
70            IntegrityErr::InvalidBlockStructure { start_of_bad_component } =>
71                write!(f, "Invalid block structure detected at position {}", start_of_bad_component),
72            IntegrityErr::FileConfigMisMatch => write!(f, "File configuration mismatch"),
73        }
74    }
75}
76impl std::error::Error for IntegrityErr {
77    fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
78        match self {
79            IntegrityErr::Other(err) => Some(err),
80            _ => None,
81        }
82    }
83}
84/// This function will read a docufort file and check the integrity of the file.
85/// It will attempt to correct any errors it finds in the data using any available ECC data.
86/// If it finds a corruption that it cannot correct, it will return an error.
87/// If it finds a block that is not closed, it will return Ok, and the file_len_checked will be the position of the last complete segment.
88/// # Arguments
89/// * `file_path` - The path to the docufort file.
90/// # Returns
91/// A Result containing the summary of the check.
92/// ## Ok
93/// Contains the summary of the check.
94///
95/// Note: May return Ok if content is corrupted beyond ECC repair (or no ECC enabled). Check the `corrupted_segments` for details.
96/// This is because we can still read past the corruption and find the next block, and recover other data.
97/// This is not fatal to docufort, but it is a problem for the user's data.
98/// ## Err
99/// - File is not a docufort file
100/// - File is not written with the same configuration as this compiled program (ECC_LEN or version mismatch)
101/// - A Block Component is corrupted beyond repair, preventing further reading of the file
102/// - The block structure is invalid
103/// - An IO error occurred
104pub fn integrity_check_file<RW:FileLike, B: BlockInputs>(file: &mut RW) -> Result<IntegrityCheckOk, IntegrityErr> {
105    let mut file_len = file.len()?;
106    let mut errors_corrected = 0;
107    let mut data_contents = 0;
108    let mut data_size_on_disk = 0;
109    let mut num_blocks = 0;
110    let mut corrupted_segments = Vec::new();
111    let mut block_times = Vec::new();
112
113    if !verify_configs(file)?{return Err(IntegrityErr::FileConfigMisMatch)}
114    let mut last_state= None;
115    loop {
116        let cur_pos = file.seek(SeekFrom::Current(0))?;
117        let res = read_magic_number(file, true);
118        let after_read_pos = file.seek(SeekFrom::Current(0))?;
119        if cur_pos > file_len || after_read_pos > file_len || res.is_err() {//we read too far from when the fn was originally called.
120            //We set the file_len to reflect how far we have integrity checked
121            file_len = if cur_pos>file_len{file_len}else{cur_pos};
122            break;
123        }
124        errors_corrected += res?;
125        let bs = try_read_block::<_, B>(file, true,true)?;//if we get an error now, there is some non-integrity problem
126        last_state = Some(bs);
127        match last_state.as_ref().unwrap() {
128            BlockState::Closed(BlockReadSummary { errors_corrected: e, block,  corrupted_content_blocks, block_start, block_start_timestamp, .. }) => {
129                errors_corrected += e;
130                corrupted_segments.extend_from_slice(corrupted_content_blocks.as_slice());
131                match block {
132                    Block::A { middle, .. } => {
133                        if let Some(decomp_len) = middle.compressed {
134                            data_contents += decomp_len as u64;
135                            data_size_on_disk += middle.data_len as u64;
136                        }else{
137                            data_contents += middle.data_len as u64;
138                            data_size_on_disk += middle.data_len as u64;
139                        }
140                    },
141                    Block::B { middle, .. } => middle.iter().for_each(|(_,c)|{
142                        if let Some(decomp_len) = c.compressed {
143                            data_contents += decomp_len as u64;
144                            data_size_on_disk += c.data_len as u64;
145                        }else{
146                            data_contents += c.data_len as u64;
147                            data_size_on_disk += c.data_len as u64;
148                        }
149                    }),
150                }
151                num_blocks += 1;
152                block_times.push((*block_start,*block_start_timestamp))
153                // let BlockEnd { hash, .. } = block.clone().take_end();
154                // assert_eq!(&hash_as_read[..],hash.hash());//impl assertion since we are error correcting every block
155            },
156            BlockState::OpenABlock { truncate_at } |
157            BlockState::OpenBBlock { truncate_at, .. } => {
158                //We set the file_len to reflect how far we have integrity checked
159                file_len = *truncate_at;
160                break;
161            },
162            BlockState::IncompleteStartHeader { truncate_at } => {
163                //We set the file_len to reflect how far we have integrity checked
164                file_len = *truncate_at;
165                break;
166            },
167            BlockState::InvalidBlockStructure { end_of_last_good_component, .. } =>{
168                return Err(IntegrityErr::InvalidBlockStructure { start_of_bad_component: *end_of_last_good_component})
169            }
170            BlockState::ProbablyNotStartHeader { start_from } => {
171                return Err(IntegrityErr::Corruption(*start_from,ComponentTag::StartHeader))
172            }
173            BlockState::DataCorruption { component_start, component_tag,.. } => {
174                return Err(IntegrityErr::Corruption(*component_start,*component_tag))
175            },
176        }
177    }
178    Ok(IntegrityCheckOk {
179        last_block_state: last_state,
180        errors_corrected,
181        data_contents,
182        data_size_on_disk,
183        num_blocks,
184        file_len_checked: file_len,
185        corrupted_segments,
186        block_times
187    })
188}