oscar_tools/ops/
checksum.rs

//! File checksum computing and checksum file writing.
2use std::{
3    borrow::Cow,
4    fs::File,
5    io::{self, Write},
6    path::{Path, PathBuf},
7};
8
9use log::{debug, error, info, warn};
10use rayon::{iter::ParallelIterator, prelude::ParallelBridge};
11use sha2::{Digest, Sha384};
12
13use crate::error::Error;
14
15pub trait Checksum {
16    /// compute the hash of the file pointed by the filepath by using [io::copy] between a file handler and the hasher.
17    /// As such, it shouldn't make the program go OOM with big files, but it has not been tested.
18    /// Can return an error if there has been problems regarding IO.
19    #[inline]
20    fn get_hash<R>(reader: &mut R, hasher: &mut Sha384) -> Result<String, Error>
21    where
22        R: std::io::Read,
23    {
24        io::copy(reader, hasher)?;
25        let result = format!("{:x}", hasher.finalize_reset());
26        Ok(result)
27    }
28
29    /// corpus/lang/lang_part_x.jsonl
30    #[inline]
31    fn get_hash_path(src: &Path, hasher: &mut Sha384) -> Result<String, Error> {
32        let mut f = File::open(src)?;
33        Self::get_hash(&mut f, hasher)
34    }
35
36    /// this should operate on the wide-level.
37    fn checksum_folder(src: &Path, num_threads: usize) -> Result<(), Error> {
38        if num_threads != 1 {
39            rayon::ThreadPoolBuilder::new()
40                .num_threads(num_threads)
41                .build_global()?;
42        }
43
44        if src.is_file() {
45            // TODO #86442 merged
46            // return Err(io::Error::new(
47            //     io::ErrorKind::IsADirectory,
48            //     format!("{}", src),
49            // ));
50            error!("Checksum only works on folders!");
51            return Err(io::Error::new(io::ErrorKind::InvalidInput, format!("{:?}", src)).into());
52        }
53
54        let language_dirs = std::fs::read_dir(src)?.filter_map(|entry| {
55            // check entry validity
56            let entry = match entry {
57                Ok(e) => e.path(),
58                Err(e) => {
59                    error!("error with directory entry {:?}", e);
60                    return None;
61                }
62            };
63
64            // filter out files
65            if !entry.is_dir() {
66                warn!("{:?} is not a directory: ignoring checksum op", entry);
67                None
68            } else {
69                Some(entry)
70            }
71        });
72
73        let language_dirs_par = language_dirs.par_bridge();
74        language_dirs_par.for_each(|language_dir| match Self::get_write_hashes(&language_dir) {
75            Ok(_) => (),
76            Err(e) => error!("Error with directory {:?}: {:?}", language_dir, e),
77        });
78        Ok(())
79    }
80
81    #[inline]
82    /// convinience function for checksum_folder
83    /// TODO: move out of trait
84    fn get_write_hashes(src: &Path) -> Result<(), Error> {
85        debug!("Getting hashes for {:?}", src);
86        let hashes = Self::checksum_lang(src)?;
87        let checksum_filepath = src.to_path_buf().join("checksum.sha384");
88        debug!("writing checksums in {:?}", checksum_filepath);
89        let mut checksum_file = File::create(&checksum_filepath)?;
90        Self::write_checksum(&mut checksum_file, hashes)?;
91        Ok(())
92    }
93    fn write_checksum<W: Write>(
94        writer: &mut W,
95        hashes: Vec<(PathBuf, String)>,
96    ) -> Result<(), Error> {
97        for (path, hash) in hashes {
98            if let Some(filename) = path.file_name() {
99                let filename = if let Some(filename_string) = filename.to_str() {
100                    Cow::from(filename_string)
101                } else {
102                    let filename_string = filename.to_string_lossy();
103                    warn!(
104                        "could not convert path to string: {:?}, using {} in replacement.",
105                        filename, filename_string
106                    );
107                    filename_string
108                };
109                writeln!(writer, "{} {}", hash, filename)?;
110            } else {
111                warn!("Could not get filename for {:?}: ignoring in checksum. Add manually if necessary.", path);
112            }
113        }
114        Ok(())
115    }
116    /// this should operate on lang-level
117    fn checksum_lang(src: &Path) -> Result<Vec<(PathBuf, String)>, Error> {
118        let mut hasher = Sha384::new();
119        let mut hashes = Vec::new();
120        for filepath in std::fs::read_dir(src)? {
121            let filepath = filepath?.path();
122            debug!("hashing {:?}", filepath);
123            let hash = Self::get_hash_path(&filepath, &mut hasher)?;
124            hashes.push((filepath, hash));
125        }
126        Ok(hashes)
127    }
128}
129
#[cfg(test)]
mod tests {
    use sha2::Digest;

    use std::fs::File;
    use std::io::Write;
    use std::path::PathBuf;
    use tempfile::TempDir;

    use sha2::Sha384;

    use crate::error::Error;
    use crate::ops::Checksum;

    /// (language, file content) pairs shared by the corpus-level tests.
    const DUMMY_CORPUS: [(&str, &str); 4] = [
        ("fr", r#"{{"content":"foo_french"}}"#),
        ("en", r#"{{"content":"foo_english"}}"#),
        ("de", r#"{{"content":"foo_german"}}"#),
        ("es", r#"{{"content":"foo_spanish"}}"#),
    ];

    /// Build a dummy corpus in a temporary directory: one directory per
    /// language in [DUMMY_CORPUS], each holding a single `<lang>.jsonl` file.
    fn gen_dummy_corpus() -> Result<TempDir, Error> {
        let corpus_dir = tempfile::tempdir().unwrap();
        for (lang, content) in DUMMY_CORPUS {
            let lang_dir = corpus_dir.path().join(lang);
            std::fs::create_dir(&lang_dir)?;
            let lang_text_file = lang_dir.join(format!("{lang}.jsonl"));
            let mut f = File::create(&lang_text_file)?;
            write!(&mut f, "{content}")?;
        }
        Ok(corpus_dir)
    }

    #[test]
    fn test_write_checksum() {
        struct DummyChecksum;
        impl Checksum for DummyChecksum {}

        let hashes = vec![
            (PathBuf::from("fr.txt"), "hash_for_fr.txt".to_string()),
            (PathBuf::from("en.txt"), "hash_for_en.txt".to_string()),
            (PathBuf::from("es.txt"), "hash_for_es.txt".to_string()),
            (PathBuf::from("de.txt"), "hash_for_de.txt".to_string()),
        ];
        let expected = "hash_for_fr.txt fr.txt
hash_for_en.txt en.txt
hash_for_es.txt es.txt
hash_for_de.txt de.txt
";
        let mut checksum_writer = Vec::new();
        DummyChecksum::write_checksum(&mut checksum_writer, hashes).unwrap();
        let checksum_string = String::from_utf8(checksum_writer).unwrap();
        assert_eq!(expected, &checksum_string);
    }

    #[test]
    fn test_get_write_hashes() -> Result<(), Error> {
        struct DummyChecksum;
        impl Checksum for DummyChecksum {}

        let lang = tempfile::tempdir()?;
        let lang_corpus = lang.path().join("fr.txt");
        let text = "foo bar baz quux";
        let mut f = File::create(&lang_corpus)?;
        // write_all guarantees the whole buffer lands on disk
        // (plain write may be partial and its count must not be ignored).
        f.write_all(text.as_bytes())?;

        DummyChecksum::get_write_hashes(lang.path())?;

        let checksum_file = lang.path().join("checksum.sha384");
        let checksums = std::fs::read_to_string(&checksum_file)?;

        let mut x = checksums.split(' ').take(2);
        let (checksum, filename) = (x.next(), x.next());

        let mut hasher = Sha384::new();
        hasher.update(text.as_bytes());
        let expected_checksum = format!("{:x}", hasher.finalize_reset());
        // write_checksum terminates each line with a newline.
        let expected_filename = "fr.txt\n";

        assert_eq!(checksum.unwrap(), &expected_checksum);
        assert_eq!(filename.unwrap(), expected_filename);

        Ok(())
    }

    #[test]
    fn test_checksum_lang() -> Result<(), Error> {
        struct DummyChecksum;
        impl Checksum for DummyChecksum {}

        let corpus_dir = gen_dummy_corpus()?;

        for (lang, content) in DUMMY_CORPUS {
            // corpora are not split, so there's only one file (hence [0]).
            // We then take the hash (hence .1)
            let hash = &DummyChecksum::checksum_lang(&corpus_dir.path().join(lang))?[0].1;
            let expected = {
                let mut hasher = Sha384::new();
                let mut reader = content.as_bytes();
                DummyChecksum::get_hash(&mut reader, &mut hasher)?
            };

            assert_eq!(hash, &expected);
        }

        Ok(())
    }

    #[test]
    fn test_checksum_folder() -> Result<(), Error> {
        struct DummyChecksum;
        impl Checksum for DummyChecksum {}

        let corpus_dir = gen_dummy_corpus()?;
        let corpus_path = corpus_dir.path();
        DummyChecksum::checksum_folder(corpus_path, 1)?;

        // For every language directory, recompute the jsonl hashes and check
        // they match what checksum_folder wrote into checksum.sha384.
        for dir in std::fs::read_dir(corpus_path)? {
            let dir = dir?;
            let mut hashes: Vec<(String, String)> = Vec::new();
            let mut hashes_from_files: Vec<(String, String)> = Vec::new();
            let mut hasher = Sha384::new();
            for language_entry in std::fs::read_dir(dir.path())? {
                let current_path = language_entry?.path();
                let extension = current_path.extension().and_then(|x| x.to_str());
                match extension {
                    Some("jsonl") => {
                        let hash = DummyChecksum::get_hash_path(&current_path, &mut hasher)?;
                        let filename = current_path
                            .file_name()
                            .and_then(|f| f.to_str())
                            .unwrap()
                            .to_string();
                        hashes.push((filename, hash));
                    }
                    Some("sha384") => {
                        // Checksum lines look like "<hash> <filename>\n".
                        let checksums = std::fs::read_to_string(current_path)?;
                        let mut parts = checksums.split(' ');
                        let hash = parts.next().unwrap().to_string();
                        let filename = parts.next().unwrap().replace('\n', "");
                        hashes_from_files.push((filename, hash));
                    }
                    _ => (),
                }
            }

            assert_eq!(hashes, hashes_from_files);
        }

        Ok(())
    }
}