1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
use crate::file::open_as_read;
use crate::FileWriter;
use digest::{Digest, Output};
use filepath::FilePath;
use memmap2::Mmap;
use rayon::prelude::*;
use sha3::Sha3_256;
use std::{fmt, fs::File, path::Path};

/// The FileReader struct represents a file reader that provides high-performance file reading capabilities.
/// It uses memory mapping for efficient access to file data.
///
/// A `FileReader` keeps the memory map created for the file together with the
/// path the file was opened from. The path is what `Display`/`Debug` print and
/// what is used when the file has to be reopened.
pub struct FileReader {
    /// Memory map of the file; every byte-level accessor reads from it.
    mmap: Box<Mmap>,
    /// Location of the mapped file, stored type-erased behind `AsRef<Path>`.
    path: Box<dyn AsRef<Path> + Send + Sync>,
}

impl fmt::Display for FileReader {
    /// Displays the path of the file.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        write!(f, "{}", self.path.as_ref().as_ref().display())
    }
}

impl fmt::Debug for FileReader {
    /// Displays the path of the file.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        write!(f, "{}", self.path.as_ref().as_ref().display())
    }
}

impl FileReader {
    /// Builds a `FileReader` by memory-mapping `file` and recording `path`.
    ///
    /// The path is copied into an owned buffer, so the argument does not need to
    /// outlive the reader.
    ///
    /// # Panics
    ///
    /// Panics if the file cannot be memory-mapped.
    fn new(file: &File, path: impl AsRef<Path> + Send + Sync) -> Self {
        // Creating the map is an `unsafe` operation in memmap2.
        // NOTE(review): nothing here prevents the file from being modified or truncated
        // while the map is alive — confirm that callers accept that risk.
        let mapping = match unsafe { Mmap::map(file) } {
            Ok(mapping) => mapping,
            Err(err) => panic!("Could not mmap file. Error: {}", err),
        };
        let owned_path = path.as_ref().to_path_buf();
        Self {
            mmap: Box::new(mapping),
            path: Box::new(owned_path),
        }
    }

    /// Creates a `FileReader` from an already opened [`File`].
    ///
    /// The path is resolved from the file handle itself and stored alongside the
    /// memory map.
    ///
    /// # Panics
    ///
    /// Panics if the path of `file` cannot be determined, or if the file cannot be
    /// memory-mapped.
    pub fn open_file(file: &File) -> Self {
        // The message names the reader: this is the reader-side constructor, and the
        // previous "writer file" wording pointed at the wrong component.
        let file_path = file
            .path()
            .unwrap_or_else(|err| panic!("Could not get path of reader file. Error: {}", err));
        Self::new(file, file_path)
    }

    /// Creates a `FileReader` for the file located at `path`.
    ///
    /// The file is opened for reading via `open_as_read` and then memory-mapped.
    /// The handle used for mapping is dropped when this function returns.
    pub fn open(path: impl AsRef<Path> + Send + Sync) -> Self {
        let handle = open_as_read(path.as_ref());
        Self::new(&handle, path)
    }

    /// Reads the entire file to a string.
    ///
    /// The file contents are decoded as UTF-8. Any byte sequence that is not valid
    /// UTF-8 is replaced with U+FFFD (the replacement character) rather than causing
    /// a failure, so this method never panics on malformed input.
    ///
    /// Previously every byte was cast to a `char` on its own, which turned each
    /// multi-byte UTF-8 character into several unrelated characters. Pure ASCII
    /// content is returned exactly as before.
    pub fn read_to_string(&self) -> String {
        String::from_utf8_lossy(self.bytes()).into_owned()
    }

    /// Returns a slice of bytes representing the file data.
    pub fn bytes(&self) -> &[u8] {
        &self.mmap[..]
    }

    /// Copies the mapped file contents into a newly allocated `Vec<u8>`.
    pub fn to_vec(&self) -> Vec<u8> {
        Vec::from(&self.mmap[..])
    }

    /// Opens the file for reading and returns the File object.
    pub fn file(&self) -> File {
        open_as_read(self.path.as_ref().as_ref())
    }

    /// Returns the memory-mapped file.
    pub fn mmap(&self) -> &Box<Mmap> {
        &self.mmap
    }

    /// Borrows the path this reader was created with.
    pub fn path(&self) -> &Path {
        // Unbox first, then ask the type-erased value for its `Path` view.
        let stored: &(dyn AsRef<Path> + Send + Sync) = &*self.path;
        stored.as_ref()
    }

    /// Opens a [`FileWriter`] on the same path as this reader.
    ///
    /// The reader itself is left untouched and keeps its memory map.
    pub fn to_writer(&self) -> FileWriter {
        let target = self.path.as_ref();
        FileWriter::open(&target)
    }

    /// Hashes the complete file contents with the digest algorithm `H`.
    ///
    /// Returns the finalized digest output for `H`.
    pub fn hash_with<H: Digest>(&self) -> Output<H> {
        let mut hasher = H::new();
        hasher.update(self.bytes());
        hasher.finalize()
    }

    /// Computes the SHA3-256 hash of the file data.
    pub fn hash(&self) -> Output<Sha3_256> {
        self.hash_with::<Sha3_256>()
    }

    /// Returns the SHA3-256 hash of the file contents as a hex string.
    ///
    /// Every digest byte is rendered as two lowercase hexadecimal digits, in order.
    pub fn hash_to_string(&self) -> String {
        self.hash()
            .iter()
            .map(|byte| format!("{:02x}", byte))
            .collect()
    }

    /// Checks whether the byte sequence `bytes` occurs in the file at index `i`.
    ///
    /// * `i` - index in the file at which the candidate match starts.
    /// * `byte` - the byte found at `i`; it is compared against the first byte of `bytes`.
    /// * `bytes` - the byte sequence to look for.
    ///
    /// Returns `Some(*i)` when `byte` equals the first byte of `bytes` and the file
    /// bytes that follow index `i` equal the rest of `bytes`. Returns `None` otherwise,
    /// including when `bytes` is empty or when the sequence would extend past the end
    /// of the file. Both of those cases used to panic with an out-of-bounds index.
    fn find_inner(&self, i: &usize, byte: &u8, bytes: &[u8]) -> Option<usize> {
        // An empty pattern has no first byte to compare against: report "no match".
        let (first, rest) = bytes.split_first()?;
        if byte != first {
            return None;
        }
        // `get` yields `None` instead of panicking when a partial match near the end
        // of the file would require reading beyond the mapped data.
        let tail_start = i.checked_add(1)?;
        let tail_end = tail_start.checked_add(rest.len())?;
        let tail = self.bytes().get(tail_start..tail_end)?;
        if tail == rest {
            Some(*i)
        } else {
            None
        }
    }

    /// Searches the file for the first occurrence of `bytes`.
    ///
    /// Every index of the file is tested in parallel as a possible start of the
    /// sequence. Returns the smallest matching index, or `None` when the sequence
    /// does not occur.
    pub fn find_bytes(&self, bytes: &impl AsRef<[u8]>) -> Option<usize> {
        let needle: &[u8] = bytes.as_ref();
        self.bytes()
            .par_iter()
            .enumerate()
            .find_map_first(|(position, candidate)| self.find_inner(&position, candidate, needle))
    }

    /// Searches the file for the last occurrence of `bytes`.
    ///
    /// Every index of the file is tested in parallel as a possible start of the
    /// sequence. Returns the largest matching index, or `None` when the sequence
    /// does not occur.
    pub fn rfind_bytes(&self, bytes: &impl AsRef<[u8]>) -> Option<usize> {
        let needle: &[u8] = bytes.as_ref();
        self.bytes()
            .par_iter()
            .enumerate()
            .find_map_last(|(position, candidate)| self.find_inner(&position, candidate, needle))
    }

    /// Collects the start index of every occurrence of `bytes` in the file.
    ///
    /// Every index of the file is tested in parallel as a possible start of the
    /// sequence. The returned vector is empty when there is no match.
    pub fn find_bytes_all(&self, bytes: &impl AsRef<[u8]>) -> Vec<usize> {
        let needle: &[u8] = bytes.as_ref();
        self.bytes()
            .par_iter()
            .enumerate()
            .filter_map(|(position, candidate)| self.find_inner(&position, candidate, needle))
            .collect()
    }

    /// Returns the start index of the `n`-th occurrence of `bytes` in the file.
    ///
    /// `n` is zero-based: `0` selects the first occurrence, `1` the second, and so on.
    /// Returns `None` when the file contains `n` or fewer occurrences.
    pub fn find_bytes_nth(&self, bytes: &impl AsRef<[u8]>, n: usize) -> Option<usize> {
        // Two strategies were considered: scan sequentially and stop at the nth match, or
        // find every match in parallel, sort the offsets, and pick the nth one. The second
        // is generally faster despite finding all matches first, because the search itself
        // can be parallelized.
        //
        // An earlier version stopped at the nth match found during the parallel search, but
        // the nth match found out of order is not guaranteed to be the nth match in the
        // file, hence the collect-then-sort approach below.
        let mut positions = self.find_bytes_all(bytes);
        // Each offset appears at most once, so an unstable sort cannot reorder anything observable.
        positions.par_sort_unstable();
        positions.into_iter().nth(n)
    }

    /// Compares two files by their hashes.
    /// It takes two file paths `file_path1` and `file_path2`, and returns true if the files are identical (based on their hashes), false otherwise.
    pub fn compare_files(
        file_path1: impl AsRef<Path> + Send + Sync,
        file_path2: impl AsRef<Path> + Send + Sync,
    ) -> bool {
        let file1_reader = FileReader::open(&file_path1);
        let file2_reader = FileReader::open(&file_path2);
        file1_reader.hash() == file2_reader.hash()
    }

    /// Compares the FileReader's file to another file by their hashes.
    /// It takes a file path `file_path`, and returns true if the files are identical (based on their hashes), false otherwise.
    pub fn compare_to(&self, file_path: impl AsRef<Path> + Send + Sync) -> bool {
        let file_reader = FileReader::open(&file_path);
        self.hash() == file_reader.hash()
    }

    /// Compares the FileReader's file to another file by their hashes.
    /// It takes a File object `file`, and returns true if the files are identical (based on their hashes), false otherwise.
    pub fn compare_to_file(&self, file: &File) -> bool {
        let file_reader = FileReader::open_file(&file);
        self.hash() == file_reader.hash()
    }

    /// Checks whether the file's digest under algorithm `T` equals `hash`.
    ///
    /// The file contents are hashed with `T` on every call. Returns `true` when the
    /// computed digest equals the supplied one, `false` otherwise.
    pub fn compare_hash<T: Digest>(&self, hash: &Output<T>) -> bool {
        let actual = self.hash_with::<T>();
        &actual == hash
    }
}

impl IntoIterator for FileReader {
    type Item = u8;
    type IntoIter = std::vec::IntoIter<Self::Item>;

    /// Consumes the reader and yields the bytes of the file one by one.
    ///
    /// The mapped contents are first copied into a `Vec<u8>`; the iterator owns
    /// that copy.
    fn into_iter(self) -> Self::IntoIter {
        let owned: Vec<u8> = Vec::from(self.bytes());
        owned.into_iter()
    }
}

impl IntoParallelIterator for FileReader {
    type Item = u8;
    type Iter = rayon::vec::IntoIter<Self::Item>;

    /// Consumes the reader and yields the bytes of the file as a parallel iterator.
    ///
    /// The mapped contents are first copied into a `Vec<u8>`; the iterator owns
    /// that copy.
    fn into_par_iter(self) -> Self::Iter {
        let owned: Vec<u8> = Vec::from(self.bytes());
        owned.into_par_iter()
    }
}

impl PartialEq for FileReader {
    /// Treats two readers as equal when the SHA3-256 hashes of their contents match.
    ///
    /// The stored paths are not compared.
    fn eq(&self, other: &Self) -> bool {
        let (lhs, rhs) = (self.hash(), other.hash());
        lhs == rhs
    }
}