1use std::{
3 borrow::Cow,
4 fs::File,
5 io::{self, Write},
6 path::{Path, PathBuf},
7};
8
9use log::{debug, error, info, warn};
10use rayon::{iter::ParallelIterator, prelude::ParallelBridge};
11use sha2::{Digest, Sha384};
12
13use crate::error::Error;
14
15pub trait Checksum {
16 #[inline]
20 fn get_hash<R>(reader: &mut R, hasher: &mut Sha384) -> Result<String, Error>
21 where
22 R: std::io::Read,
23 {
24 io::copy(reader, hasher)?;
25 let result = format!("{:x}", hasher.finalize_reset());
26 Ok(result)
27 }
28
29 #[inline]
31 fn get_hash_path(src: &Path, hasher: &mut Sha384) -> Result<String, Error> {
32 let mut f = File::open(src)?;
33 Self::get_hash(&mut f, hasher)
34 }
35
36 fn checksum_folder(src: &Path, num_threads: usize) -> Result<(), Error> {
38 if num_threads != 1 {
39 rayon::ThreadPoolBuilder::new()
40 .num_threads(num_threads)
41 .build_global()?;
42 }
43
44 if src.is_file() {
45 error!("Checksum only works on folders!");
51 return Err(io::Error::new(io::ErrorKind::InvalidInput, format!("{:?}", src)).into());
52 }
53
54 let language_dirs = std::fs::read_dir(src)?.filter_map(|entry| {
55 let entry = match entry {
57 Ok(e) => e.path(),
58 Err(e) => {
59 error!("error with directory entry {:?}", e);
60 return None;
61 }
62 };
63
64 if !entry.is_dir() {
66 warn!("{:?} is not a directory: ignoring checksum op", entry);
67 None
68 } else {
69 Some(entry)
70 }
71 });
72
73 let language_dirs_par = language_dirs.par_bridge();
74 language_dirs_par.for_each(|language_dir| match Self::get_write_hashes(&language_dir) {
75 Ok(_) => (),
76 Err(e) => error!("Error with directory {:?}: {:?}", language_dir, e),
77 });
78 Ok(())
79 }
80
81 #[inline]
82 fn get_write_hashes(src: &Path) -> Result<(), Error> {
85 debug!("Getting hashes for {:?}", src);
86 let hashes = Self::checksum_lang(src)?;
87 let checksum_filepath = src.to_path_buf().join("checksum.sha384");
88 debug!("writing checksums in {:?}", checksum_filepath);
89 let mut checksum_file = File::create(&checksum_filepath)?;
90 Self::write_checksum(&mut checksum_file, hashes)?;
91 Ok(())
92 }
93 fn write_checksum<W: Write>(
94 writer: &mut W,
95 hashes: Vec<(PathBuf, String)>,
96 ) -> Result<(), Error> {
97 for (path, hash) in hashes {
98 if let Some(filename) = path.file_name() {
99 let filename = if let Some(filename_string) = filename.to_str() {
100 Cow::from(filename_string)
101 } else {
102 let filename_string = filename.to_string_lossy();
103 warn!(
104 "could not convert path to string: {:?}, using {} in replacement.",
105 filename, filename_string
106 );
107 filename_string
108 };
109 writeln!(writer, "{} {}", hash, filename)?;
110 } else {
111 warn!("Could not get filename for {:?}: ignoring in checksum. Add manually if necessary.", path);
112 }
113 }
114 Ok(())
115 }
116 fn checksum_lang(src: &Path) -> Result<Vec<(PathBuf, String)>, Error> {
118 let mut hasher = Sha384::new();
119 let mut hashes = Vec::new();
120 for filepath in std::fs::read_dir(src)? {
121 let filepath = filepath?.path();
122 debug!("hashing {:?}", filepath);
123 let hash = Self::get_hash_path(&filepath, &mut hasher)?;
124 hashes.push((filepath, hash));
125 }
126 Ok(hashes)
127 }
128}
129
#[cfg(test)]
mod tests {
    use sha2::Digest;

    use std::fs::File;
    use std::io::Write;
    use std::path::PathBuf;
    use tempfile::TempDir;

    use sha2::Sha384;

    use crate::error::Error;
    use crate::ops::Checksum;

    /// Implementor that relies exclusively on the trait's default methods.
    struct DummyChecksum;
    impl Checksum for DummyChecksum {}

    /// `(language, file content)` pairs used to build the dummy corpus.
    ///
    /// The strings are plain data (never used as format strings), so the doubled
    /// braces are written to disk as-is.
    const DUMMY_CORPUS: [(&str, &str); 4] = [
        ("fr", r#"{{"content":"foo_french"}}"#),
        ("en", r#"{{"content":"foo_english"}}"#),
        ("de", r#"{{"content":"foo_german"}}"#),
        ("es", r#"{{"content":"foo_spanish"}}"#),
    ];

    /// Creates a temporary corpus holding one `<lang>/<lang>.jsonl` file per entry
    /// of `DUMMY_CORPUS`. The directory is deleted when the returned guard is dropped.
    fn gen_dummy_corpus() -> Result<TempDir, Error> {
        let corpus_dir = tempfile::tempdir()?;

        for &(lang, content) in DUMMY_CORPUS.iter() {
            let lang_dir = corpus_dir.path().join(lang);
            std::fs::create_dir(&lang_dir)?;
            let mut f = File::create(lang_dir.join(format!("{lang}.jsonl")))?;
            f.write_all(content.as_bytes())?;
        }

        Ok(corpus_dir)
    }

    #[test]
    fn test_write_checksum() {
        let hashes = vec![
            (PathBuf::from("fr.txt"), "hash_for_fr.txt".to_string()),
            (PathBuf::from("en.txt"), "hash_for_en.txt".to_string()),
            (PathBuf::from("es.txt"), "hash_for_es.txt".to_string()),
            (PathBuf::from("de.txt"), "hash_for_de.txt".to_string()),
        ];
        let expected = "hash_for_fr.txt fr.txt\n\
                        hash_for_en.txt en.txt\n\
                        hash_for_es.txt es.txt\n\
                        hash_for_de.txt de.txt\n";

        let mut checksum_writer = Vec::new();
        DummyChecksum::write_checksum(&mut checksum_writer, hashes).unwrap();
        let checksum_string = String::from_utf8(checksum_writer).unwrap();
        assert_eq!(expected, &checksum_string);
    }

    #[test]
    fn test_get_write_hashes() -> Result<(), Error> {
        let lang = tempfile::tempdir()?;
        let text = "foo bar baz quux";
        // `fs::write` writes the whole buffer and closes the file before hashing.
        std::fs::write(lang.path().join("fr.txt"), text)?;

        DummyChecksum::get_write_hashes(lang.path())?;

        let checksums = std::fs::read_to_string(lang.path().join("checksum.sha384"))?;
        let mut fields = checksums.split(' ');
        let checksum = fields.next().expect("checksum line has no hash field");
        let filename = fields.next().expect("checksum line has no filename field");

        let mut hasher = Sha384::new();
        hasher.update(text.as_bytes());
        let expected_checksum = format!("{:x}", hasher.finalize_reset());

        assert_eq!(checksum, &expected_checksum);
        assert_eq!(filename, "fr.txt\n");

        Ok(())
    }

    #[test]
    fn test_checksum_lang() -> Result<(), Error> {
        let corpus_dir = gen_dummy_corpus()?;

        for &(lang, content) in DUMMY_CORPUS.iter() {
            // Each language directory contains exactly one file at this point.
            let hashes = DummyChecksum::checksum_lang(&corpus_dir.path().join(lang))?;
            let hash = &hashes[0].1;

            let expected = {
                let mut hasher = Sha384::new();
                let mut reader = content.as_bytes();
                DummyChecksum::get_hash(&mut reader, &mut hasher)?
            };

            assert_eq!(hash, &expected);
        }

        Ok(())
    }

    #[test]
    fn test_checksum_folder() -> Result<(), Error> {
        let corpus_dir = gen_dummy_corpus()?;
        let corpus_path = corpus_dir.path();
        // One thread: `checksum_folder` then leaves the global rayon pool untouched.
        DummyChecksum::checksum_folder(corpus_path, 1)?;

        for dir in std::fs::read_dir(corpus_path)? {
            let dir = dir?;
            // Hashes recomputed from the data files, as (file name, hash).
            let mut hashes: Vec<(String, String)> = Vec::new();
            // Hashes read back from the generated checksum file, as (file name, hash).
            let mut hashes_from_files: Vec<(String, String)> = Vec::new();
            let mut hasher = Sha384::new();

            for entry in std::fs::read_dir(dir.path())? {
                let current_path = entry?.path();
                match current_path.extension().and_then(|x| x.to_str()) {
                    Some("jsonl") => {
                        let hash = DummyChecksum::get_hash_path(&current_path, &mut hasher)?;
                        let filename = current_path
                            .file_name()
                            .and_then(|f| f.to_str())
                            .expect("corpus file names are valid UTF-8")
                            .to_owned();
                        hashes.push((filename, hash));
                    }
                    Some("sha384") => {
                        let checksums = std::fs::read_to_string(&current_path)?;
                        let mut fields = checksums.split(' ');
                        let hash = fields
                            .next()
                            .expect("checksum line has no hash field")
                            .to_string();
                        let filename = fields
                            .next()
                            .expect("checksum line has no filename field")
                            .replace('\n', "");
                        hashes_from_files.push((filename, hash));
                    }
                    _ => (),
                }
            }

            assert_eq!(hashes, hashes_from_files);
        }

        Ok(())
    }
}