normalized_hash/
lib.rs

1//! # normalized-hash
2//!
3//! [![badge github]][url github]
4//! [![badge crates.io]][url crates.io]
5//! [![badge docs.rs]][url docs.rs]
6//! [![badge license]][url license]
7//!
8//! [badge github]: https://img.shields.io/badge/github-FloGa%2Fnormalized--hasher-green
9//! [badge crates.io]: https://img.shields.io/crates/v/normalized-hash
10//! [badge docs.rs]: https://img.shields.io/docsrs/normalized-hash
11//! [badge license]: https://img.shields.io/crates/l/normalized-hash
12//!
13//! [url github]: https://github.com/FloGa/normalized-hasher/crates/normalized-hash
14//! [url crates.io]: https://crates.io/crates/normalized-hash
15//! [url docs.rs]: https://docs.rs/normalized-hash
16//! [url license]: https://github.com/FloGa/normalized-hasher/blob/develop/crates/normalized-hash/LICENSE
17//!
18//! Cross-platform hash algorithm.
19//!
20//! *This is the library crate. If you're looking for the binary crate instead, go
21//! to [`normalized-hasher`].*
22//!
23//! [`normalized-hasher`]: https://github.com/FloGa/normalized-hasher
24//!
25//! ## Summary
26//!
27//! This hashing algorithm allows consistent hashes even if you accidentally
28//! convert a file from using UNIX line endings (LF) to Windows line endings
29//! (CRLF). For a longish motivational speech about how such a thing can happen
30//! and why you should want to even care about such a case, head over to
31//! [`normalized-hasher`].
32//!
33//! ## Code Example
34//!
35//! ```rust no_run
36//! use std::path::PathBuf;
37//!
38//! use normalized_hash::Hasher;
39//!
40//! fn main() {
41//!     let file_in = PathBuf::from("input.txt");
42//!     let file_out = PathBuf::from("output.txt");
43//!
44//!     // Simple example with default options, without writing an output file
45//!     let hash = Hasher::new().hash_file(&file_in, None::<PathBuf>);
46//!     println!("{}", hash);
47//!
48//!     // More complex example, with writing output
49//!     let hash = Hasher::new()
50//!         .eol("\r\n")
51//!         .no_eof(true)
52//!         .hash_file(&file_in, Some(file_out));
53//!     println!("{}", hash);
54//! }
55//! ```
56
57use std::fs::File;
58use std::io::{BufRead, BufReader, BufWriter, Write};
59use std::path::Path;
60
61use sha2::{Digest, Sha256};
62
/// Line-ending-agnostic file hasher, configured through a fluent API.
///
/// Create an instance with [`Hasher::new`], adjust it with the builder methods and call
/// [`Hasher::hash_file`] to compute the hash of a file.
#[derive(Clone, Debug, PartialEq, Eq)]
pub struct Hasher {
    /// End-of-line sequence inserted between the normalized lines (and after the last line
    /// unless `no_eof` is set).
    eol: String,
    /// Whether whitespace characters are removed from every line before hashing.
    ignore_whitespaces: bool,
    /// Whether the trailing end-of-line sequence at the end of the file is omitted.
    no_eof: bool,
}
68
69impl Default for Hasher {
70    fn default() -> Self {
71        Self {
72            eol: "\n".to_string(),
73            ignore_whitespaces: false,
74            no_eof: false,
75        }
76    }
77}
78
79impl Hasher {
80    /// Create new Hasher instance with default options.
81    ///
82    /// # Defaults
83    ///
84    /// If not overwritten by the fluent API, the following defaults are valid:
85    ///
86    /// -   `eol`: `"\n"`
87    ///
88    ///     End-of-line sequence, will be appended to each normalized line for hashing.
89    ///
90    /// -   `ignore_whitespaces`: `false`
91    ///
92    ///     Ignore all whitespaces. This will remove all whitespaces from the input file when
93    ///     generating the hash.
94    ///
95    /// -   `no_eof`: `false`
96    ///
97    ///     Skip last end-of-line on end-of-file. If this is set to true, no trailing EOL will be
98    ///     appended at the end of the file.
99    ///
100    /// # Example
101    ///
102    /// ```
103    /// use normalized_hash::Hasher;
104    /// let hasher = Hasher::new();
105    /// ```
106    pub fn new() -> Self {
107        Default::default()
108    }
109
110    /// Change the eol sequence.
111    ///
112    /// This string will be appended to each normalized line for hashing.
113    ///
114    /// Defaults to `"\n"`.
115    ///
116    /// # Example
117    ///
118    /// ```
119    /// use normalized_hash::Hasher;
120    /// let hasher = Hasher::new().eol("\r\n");
121    /// ```
122    pub fn eol(mut self, eol: impl Into<String>) -> Self {
123        self.eol = eol.into();
124        self
125    }
126
127    /// Ignore all whitespaces.
128    ///
129    /// This will remove all whitespaces from the input file when generating the hash.
130    pub fn ignore_whitespaces(mut self, ignore_whitespaces: bool) -> Self {
131        self.ignore_whitespaces = ignore_whitespaces;
132        self
133    }
134
135    /// Skip last end-of-line on end-of-file.
136    ///
137    /// If this is set to true, no trailing EOL will be appended at the end of the file.
138    ///
139    /// Defaults to `false`.
140    ///
141    /// # Example
142    ///
143    /// ```
144    /// use normalized_hash::Hasher;
145    /// let hasher = Hasher::new().no_eof(true);
146    /// ```
147    pub fn no_eof(mut self, no_eof: bool) -> Self {
148        self.no_eof = no_eof;
149        self
150    }
151
152    /// Create hash from a text file, regardless of line endings.
153    ///
154    /// This function reads `file_in` linewise, replacing whatever line ending is present with a
155    /// single line feed character (`\n`). From this, it generates a hash code.
156    ///
157    /// Optionally, it is possible to write the normalized input to `file_out`.
158    ///
159    /// # Example
160    ///
161    /// ```no_run
162    /// use std::path::PathBuf;
163    ///     use normalized_hash::Hasher;
164    ///
165    ///     let hash_without_output = Hasher::new()
166    ///         .hash_file(PathBuf::from("input.txt"), None::<PathBuf>);
167    ///
168    ///     let hash_with_output = Hasher::new().hash_file(
169    ///         PathBuf::from("input.txt"),
170    ///         Some(PathBuf::from("output.txt"))
171    ///     );
172    /// ```
173    pub fn hash_file(
174        &self,
175        file_in: impl AsRef<Path>,
176        file_out: Option<impl AsRef<Path>>,
177    ) -> String {
178        let file_in = File::open(file_in).unwrap();
179        let file_in = BufReader::new(file_in);
180
181        let mut file_out = file_out.and_then(|file_out| {
182            let file_out = File::create(file_out).unwrap();
183            let file_out = BufWriter::new(file_out);
184            Some(file_out)
185        });
186
187        let mut hasher = Sha256::new();
188
189        let mut is_first_line = true;
190        for line in file_in.lines() {
191            let line = line.unwrap();
192
193            let line = if self.ignore_whitespaces {
194                line.replace(|c: char| c.is_whitespace(), "")
195            } else {
196                line
197            };
198
199            let line = if !is_first_line {
200                format!("{}{}", &self.eol, line)
201            } else {
202                line
203            };
204
205            hasher.update(&line);
206
207            if let Some(file_out) = &mut file_out {
208                file_out.write_all(line.as_bytes()).unwrap();
209            }
210
211            is_first_line = false;
212        }
213
214        if !self.no_eof {
215            hasher.update(&self.eol);
216
217            if let Some(file_out) = &mut file_out {
218                file_out.write_all(&self.eol.as_bytes()).unwrap();
219            }
220        }
221
222        let hash = hasher.finalize();
223
224        base16ct::lower::encode_string(&hash)
225    }
226}
227
#[cfg(test)]
mod tests {
    use std::error::Error;
    use std::ffi::OsString;
    use std::fs;
    use std::iter::zip;
    use std::ops::Add;

    use tempfile::NamedTempFile;

    use super::*;

    /// Fixture holding four variants of the same two-line text (CRLF / LF, with and without a
    /// trailing line ending) plus one output file per variant for the normalized result.
    struct TestEnv {
        file_with_crlf: NamedTempFile,
        file_with_crlf_noeof: NamedTempFile,
        file_with_lf: NamedTempFile,
        file_with_lf_noeof: NamedTempFile,

        normalized_file_with_crlf: NamedTempFile,
        normalized_file_with_crlf_noeof: NamedTempFile,
        normalized_file_with_lf: NamedTempFile,
        normalized_file_with_lf_noeof: NamedTempFile,
    }

    impl TestEnv {
        /// Create all temporary files and fill the input files with their respective variant of
        /// the sample text.
        fn new() -> Result<Self, std::io::Error> {
            let lines = vec!["A B", "C D"];

            // Join the sample lines with `separator`, optionally terminating the last line too.
            let render = |separator: &str, trailing: bool| -> String {
                let body = lines.join(separator);
                if trailing {
                    body.add(separator)
                } else {
                    body
                }
            };

            let mut env = TestEnv {
                file_with_crlf: NamedTempFile::new()?,
                file_with_crlf_noeof: NamedTempFile::new()?,
                file_with_lf: NamedTempFile::new()?,
                file_with_lf_noeof: NamedTempFile::new()?,

                normalized_file_with_crlf: NamedTempFile::new()?,
                normalized_file_with_crlf_noeof: NamedTempFile::new()?,
                normalized_file_with_lf: NamedTempFile::new()?,
                normalized_file_with_lf_noeof: NamedTempFile::new()?,
            };

            env.file_with_crlf
                .write_all(render("\r\n", true).as_bytes())?;
            env.file_with_crlf_noeof
                .write_all(render("\r\n", false).as_bytes())?;
            env.file_with_lf.write_all(render("\n", true).as_bytes())?;
            env.file_with_lf_noeof
                .write_all(render("\n", false).as_bytes())?;

            Ok(env)
        }

        /// Input files, in the same order as `get_output_files`.
        fn get_input_files(&self) -> Vec<&NamedTempFile> {
            let Self {
                file_with_crlf,
                file_with_crlf_noeof,
                file_with_lf,
                file_with_lf_noeof,
                ..
            } = self;

            vec![
                file_with_crlf,
                file_with_crlf_noeof,
                file_with_lf,
                file_with_lf_noeof,
            ]
        }

        /// Output files, in the same order as `get_input_files`.
        fn get_output_files(&self) -> Vec<&NamedTempFile> {
            let Self {
                normalized_file_with_crlf,
                normalized_file_with_crlf_noeof,
                normalized_file_with_lf,
                normalized_file_with_lf_noeof,
                ..
            } = self;

            vec![
                normalized_file_with_crlf,
                normalized_file_with_crlf_noeof,
                normalized_file_with_lf,
                normalized_file_with_lf_noeof,
            ]
        }

        /// Hash every input file with `hasher`, asserting that all of them yield the same hash
        /// and the same normalized content. Returns that common `(hash, content)` pair.
        fn hash_files(&self, hasher: &Hasher) -> Result<(String, String), Box<dyn Error>> {
            let mut reference: Option<(String, String)> = None;

            let pairs = zip(self.get_input_files(), self.get_output_files());
            for (source, target) in pairs {
                let hash = hasher.hash_file(source, Some(target));

                // The first result becomes the reference every later result is compared against.
                if reference.is_none() {
                    reference = Some((hash.clone(), fs::read_to_string(target)?));
                }

                if let Some((expected_hash, expected_content)) = &reference {
                    assert_eq!(&hash, expected_hash, "Hashes don't match");
                    assert_eq!(
                        &fs::read_to_string(target)?,
                        expected_content,
                        "Normalized files don't match"
                    );
                }
            }

            // There is always at least one input file, so the reference has been set.
            Ok(reference.unwrap_or_else(|| unreachable!()))
        }
    }

    #[test]
    fn check_empty_file() -> Result<(), Box<dyn Error>> {
        let empty = NamedTempFile::new()?;

        // Sanity check between hasher versions

        // Completely empty file
        let actual = Hasher::new().eol("").hash_file(&empty, None::<OsString>);
        assert_eq!(
            actual,
            "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855"
        );

        // Empty file ending in LF
        let actual = Hasher::new().hash_file(&empty, None::<OsString>);
        assert_eq!(
            actual,
            "01ba4719c80b6fe911b091a7c05124b64eeece964e09c058ef8f9805daca546b"
        );

        Ok(())
    }

    #[test]
    fn check_default_options() -> Result<(), Box<dyn Error>> {
        let env = TestEnv::new()?;
        let hasher = Hasher::new();
        let (_, actual) = env.hash_files(&hasher)?;

        let expected = fs::read_to_string(&env.file_with_lf)?;
        assert_eq!(expected, actual, "Normalized files do not have LF");

        Ok(())
    }

    #[test]
    fn check_with_custom_eol() -> Result<(), Box<dyn Error>> {
        let env = TestEnv::new()?;
        let hasher = Hasher::new().eol("\r\n");
        let (_, actual) = env.hash_files(&hasher)?;

        let expected = fs::read_to_string(&env.file_with_crlf)?;
        assert_eq!(expected, actual, "Normalized files do not have CRLF");

        Ok(())
    }

    #[test]
    fn check_without_eof() -> Result<(), Box<dyn Error>> {
        let env = TestEnv::new()?;
        let hasher = Hasher::new().no_eof(true);
        let (_, actual) = env.hash_files(&hasher)?;

        let expected = fs::read_to_string(&env.file_with_lf_noeof)?;
        assert_eq!(
            expected, actual,
            "Normalized files do not have LF without EOF"
        );

        Ok(())
    }

    #[test]
    fn check_ignore_spaces() -> Result<(), Box<dyn Error>> {
        let env = TestEnv::new()?;
        let hasher = Hasher::new().eol("").ignore_whitespaces(true).no_eof(true);
        let (expected_hash, expected_content) = env.hash_files(&hasher)?;

        // A file that already lacks every whitespace must hash like the stripped fixtures.
        let mut compact_input = NamedTempFile::new()?;
        let compact_output = NamedTempFile::new()?;

        compact_input.write_all("ABCD".as_bytes())?;

        let actual_hash = hasher.hash_file(&compact_input, Some(compact_output));

        assert_eq!(actual_hash, expected_hash, "Hashes don't match");

        let compact_content = fs::read_to_string(&compact_input)?;
        assert_eq!(
            compact_content, expected_content,
            "Normalized files do not ignore white spaces"
        );

        Ok(())
    }
}