integrity_checker/
database.rs

1use std::cmp::Ordering;
2use std::collections::BTreeMap;
3use std::default::Default;
4use std::fs::File;
5use std::io::{Read, Write};
6use std::path::{Path, PathBuf};
7use std::sync::{Arc, Mutex};
8
9use digest::{consts::U32, Digest, FixedOutput};
10use ignore::{WalkBuilder, WalkState};
11use time;
12
13use serde_json;
14
15use flate2::read::GzDecoder;
16use flate2::write::GzEncoder;
17use flate2::Compression;
18
19use blake2;
20use sha2::Sha512_256;
21
22use crate::base64;
23use crate::error;
24
25type Blake2b32 = blake2::Blake2b<U32>;
26
27#[derive(Debug, Clone, Copy, PartialEq, Eq)]
28pub struct Features {
29    pub sha2: bool,
30    pub blake2b: bool,
31}
32
33impl Default for Features {
34    fn default() -> Features {
35        Features {
36            sha2: true,
37            blake2b: false,
38        }
39    }
40}
41
42impl Features {
43    fn infer_from_database_checksum(checksum: &DatabaseChecksum) -> Features {
44        Features {
45            sha2: checksum.sha2.is_some(),
46            blake2b: checksum.blake2b.is_some(),
47        }
48    }
49}
50
51#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
52pub struct DatabaseChecksum {
53    #[serde(rename = "sha2-512/256")]
54    #[serde(skip_serializing_if = "Option::is_none")]
55    sha2: Option<HashSum>,
56    #[serde(skip_serializing_if = "Option::is_none")]
57    blake2b: Option<HashSum>,
58    size: u64,
59}
60
61impl DatabaseChecksum {
62    fn diff(&self, new: &Self) -> bool {
63        let changed = self.size != new.size;
64        let changed =
65            changed || (self.sha2.is_some() && new.sha2.is_some() && self.sha2 != new.sha2);
66        changed || (self.blake2b.is_some() && new.blake2b.is_some() && self.blake2b != new.blake2b)
67    }
68}
69
70impl From<Metrics> for DatabaseChecksum {
71    fn from(metrics: Metrics) -> Self {
72        DatabaseChecksum {
73            sha2: metrics.sha2,
74            blake2b: metrics.blake2b,
75            size: metrics.size,
76        }
77    }
78}
79
80#[derive(Debug, Clone, PartialEq, Eq, Default, Serialize, Deserialize)]
81pub struct Database(Entry);
82
83#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
84pub enum Entry {
85    Directory(BTreeMap<PathBuf, Entry>),
86    File(Metrics),
87}
88
89impl Default for Entry {
90    fn default() -> Entry {
91        Entry::Directory(BTreeMap::default())
92    }
93}
94
95#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
96pub struct Metrics {
97    #[serde(rename = "sha2-512/256")]
98    #[serde(skip_serializing_if = "Option::is_none")]
99    sha2: Option<HashSum>,
100    #[serde(skip_serializing_if = "Option::is_none")]
101    blake2b: Option<HashSum>,
102    size: u64,      // File size
103    nul: bool,      // Does the file contain a NUL byte?
104    nonascii: bool, // Does the file contain non-ASCII bytes?
105}
106
107#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
108pub struct HashSum(#[serde(with = "base64")] Vec<u8>);
109
110#[derive(Default)]
111struct EngineSize(u64);
112impl EngineSize {
113    fn input(&mut self, input: &[u8]) {
114        self.0 += input.len() as u64;
115    }
116    fn result(self) -> u64 {
117        self.0
118    }
119}
120
121#[derive(Default)]
122struct EngineNul(bool);
123impl EngineNul {
124    fn input(&mut self, input: &[u8]) {
125        self.0 = self.0 || input.iter().any(|x| *x == 0);
126    }
127    fn result(self) -> bool {
128        self.0
129    }
130}
131
132#[derive(Default)]
133struct EngineNonascii(bool);
134impl EngineNonascii {
135    fn input(&mut self, input: &[u8]) {
136        self.0 = self.0 || input.iter().any(|x| x & 0x80 != 0);
137    }
138    fn result(self) -> bool {
139        self.0
140    }
141}
142
143struct Engines {
144    sha2: Option<Sha512_256>,
145    blake2b: Option<Blake2b32>,
146    size: EngineSize,
147    nul: EngineNul,
148    nonascii: EngineNonascii,
149}
150
151impl Engines {
152    fn new(features: Features) -> Engines {
153        Engines {
154            sha2: if features.sha2 {
155                Some(Sha512_256::default())
156            } else {
157                None
158            },
159            blake2b: if features.blake2b {
160                Some(Blake2b32::new())
161            } else {
162                None
163            },
164            size: EngineSize::default(),
165            nul: EngineNul::default(),
166            nonascii: EngineNonascii::default(),
167        }
168    }
169}
170
171impl Engines {
172    fn input(&mut self, input: &[u8]) {
173        self.sha2.iter_mut().for_each(|e| e.update(input));
174        self.blake2b.iter_mut().for_each(|e| e.update(input));
175        self.size.input(input);
176        self.nul.input(input);
177        self.nonascii.input(input);
178    }
179    fn result(self) -> Metrics {
180        Metrics {
181            sha2: self
182                .sha2
183                .map(|e| HashSum(Vec::from(e.finalize_fixed().as_slice()))),
184            blake2b: self
185                .blake2b
186                .map(|e| HashSum(Vec::from(e.finalize().as_slice()))),
187            size: self.size.result(),
188            nul: self.nul.result(),
189            nonascii: self.nonascii.result(),
190        }
191    }
192}
193
194fn compute_metrics(path: impl AsRef<Path>, features: Features) -> Result<Metrics, error::Error> {
195    let mut f = File::open(path)?;
196
197    let mut engines = Engines::new(features);
198
199    let mut buffer = [0; 4096];
200    loop {
201        let n = f.read(&mut buffer[..])?;
202        if n == 0 {
203            break;
204        }
205        engines.input(&buffer[0..n]);
206    }
207    Ok(engines.result())
208}
209
210trait BTreeMapExt<K, V>
211where
212    K: Ord,
213    V: Default,
214{
215    fn get_default(&mut self, key: K) -> &mut V;
216}
217
218impl<K, V> BTreeMapExt<K, V> for BTreeMap<K, V>
219where
220    K: Ord + Clone,
221    V: Default,
222{
223    fn get_default(&mut self, key: K) -> &mut V {
224        self.entry(key).or_insert_with(|| V::default())
225    }
226}
227
228impl Entry {
229    fn insert(&mut self, path: PathBuf, file: Entry) {
230        // Inner nodes in the tree should always be directories. If
231        // the node is not a directory, that means we are inserting a
232        // duplicate file. However, this function is only called from
233        // the directory walker, which makes it impossible to observe
234        // any duplicates. (And the database, after construction, is
235        // always immutable.)
236        match self {
237            Entry::Directory(entries) => {
238                let mut components = path.components();
239                let count = components.clone().count();
240                let first =
241                    Path::new(components.next().expect("unreachable").as_os_str()).to_owned();
242                let rest = components.as_path().to_owned();
243                if count > 1 {
244                    let subentry = entries.get_default(first);
245                    subentry.insert(rest, file);
246                } else if entries.insert(first, file).is_some() {
247                    unreachable!(); // See above
248                }
249            }
250            Entry::File(_) => unreachable!(),
251        }
252    }
253
254    fn lookup(&self, path: &Path) -> Option<&Entry> {
255        match self {
256            Entry::Directory(entries) => {
257                let mut components = path.components();
258                let count = components.clone().count();
259                let first =
260                    Path::new(components.next().expect("unreachable").as_os_str()).to_owned();
261                let rest = components.as_path().to_owned();
262                if count > 1 {
263                    entries
264                        .get(&first)
265                        .and_then(|subentry| subentry.lookup(&rest))
266                } else {
267                    entries.get(&first)
268                }
269            }
270            Entry::File(_) => unreachable!(),
271        }
272    }
273}
274
275#[derive(Debug)]
276pub enum EntryDiff {
277    Directory(BTreeMap<PathBuf, EntryDiff>, DirectoryDiff),
278    File(MetricsDiff),
279    KindChanged,
280}
281
282#[derive(Debug)]
283pub struct DirectoryDiff {
284    added: u64,
285    removed: u64,
286    changed: u64,
287    unchanged: u64,
288}
289
290#[derive(Debug)]
291pub struct MetricsDiff {
292    changed_content: bool,
293    zeroed: bool,
294    changed_nul: bool,
295    changed_nonascii: bool,
296}
297
298#[derive(Debug, PartialEq)]
299pub enum DiffSummary {
300    NoChanges,
301    Changes,
302    Suspicious,
303}
304
305impl EntryDiff {
306    fn show_diff(&self, path: &Path, depth: usize) {
307        match self {
308            EntryDiff::Directory(entries, diff) => {
309                if diff.changed > 0 || diff.added > 0 || diff.removed > 0 {
310                    println!(
311                        "{}{}: {} changed, {} added, {} removed, {} unchanged",
312                        "| ".repeat(depth),
313                        path.display(),
314                        diff.changed,
315                        diff.added,
316                        diff.removed,
317                        diff.unchanged
318                    );
319                    for (key, entry) in entries.iter() {
320                        entry.show_diff(key, depth + 1);
321                    }
322                }
323            }
324            EntryDiff::File(diff) => {
325                if diff.zeroed || diff.changed_nul || diff.changed_nonascii {
326                    println!("{}{} changed", "| ".repeat(depth), path.display());
327                    if diff.zeroed {
328                        println!("{}> suspicious: file was truncated", "##".repeat(depth));
329                    }
330                    if diff.changed_nul {
331                        println!(
332                            "{}> suspicious: original had no NUL bytes, but now does",
333                            "##".repeat(depth)
334                        );
335                    }
336                    if diff.changed_nonascii {
337                        println!(
338                            "{}> suspicious: original had no non-ASCII bytes, but now does",
339                            "##".repeat(depth)
340                        );
341                    }
342                }
343            }
344            EntryDiff::KindChanged => {}
345        }
346    }
347
348    fn summarize_diff(&self) -> DiffSummary {
349        match self {
350            EntryDiff::Directory(entries, diff) => {
351                let initial = if diff.changed > 0 || diff.added > 0 || diff.removed > 0 {
352                    DiffSummary::Changes
353                } else {
354                    DiffSummary::NoChanges
355                };
356                entries
357                    .values()
358                    .map(|x| x.summarize_diff())
359                    .fold(initial, |acc, x| acc.meet(x))
360            }
361            EntryDiff::File(diff) => {
362                if diff.zeroed || diff.changed_nul || diff.changed_nonascii {
363                    DiffSummary::Suspicious
364                } else if diff.changed_content {
365                    DiffSummary::Changes
366                } else {
367                    DiffSummary::NoChanges
368                }
369            }
370            EntryDiff::KindChanged => DiffSummary::Changes,
371        }
372    }
373}
374
375impl DiffSummary {
376    fn meet(self, other: DiffSummary) -> DiffSummary {
377        if self == DiffSummary::Suspicious || other == DiffSummary::Suspicious {
378            DiffSummary::Suspicious
379        } else if self == DiffSummary::Changes || other == DiffSummary::Changes {
380            DiffSummary::Changes
381        } else {
382            DiffSummary::NoChanges
383        }
384    }
385}
386
387impl Entry {
388    fn diff(&self, other: &Entry) -> EntryDiff {
389        match (self, other) {
390            (Entry::Directory(old), Entry::Directory(new)) => {
391                let mut entries = BTreeMap::default();
392                let mut added = 0;
393                let mut removed = 0;
394                let mut changed = 0;
395                let mut unchanged = 0;
396
397                let mut old_iter = old.iter();
398                let mut new_iter = new.iter();
399                let mut old_entry = old_iter.next();
400                let mut new_entry = new_iter.next();
401                while old_entry.is_some() && new_entry.is_some() {
402                    let (old_key, old_value) = old_entry.unwrap();
403                    let (new_key, new_value) = new_entry.unwrap();
404                    match old_key.cmp(new_key) {
405                        Ordering::Less => {
406                            removed += 1;
407                            old_entry = old_iter.next();
408                        }
409                        Ordering::Greater => {
410                            added += 1;
411                            new_entry = new_iter.next();
412                        }
413                        Ordering::Equal => {
414                            let diff = old_value.diff(new_value);
415                            match diff {
416                                EntryDiff::Directory(_, ref stats) => {
417                                    added += stats.added;
418                                    removed += stats.removed;
419                                    changed += stats.changed;
420                                    unchanged += stats.unchanged;
421                                }
422                                EntryDiff::File(ref stats) => {
423                                    if stats.changed_content {
424                                        changed += 1;
425                                    } else {
426                                        unchanged += 1;
427                                    }
428                                }
429                                EntryDiff::KindChanged => {
430                                    changed += 1;
431                                }
432                            }
433                            entries.insert(old_key.clone(), diff);
434                            old_entry = old_iter.next();
435                            new_entry = new_iter.next();
436                        }
437                    }
438                }
439                removed += old_iter.count() as u64;
440                added += new_iter.count() as u64;
441                EntryDiff::Directory(
442                    entries,
443                    DirectoryDiff {
444                        added,
445                        removed,
446                        changed,
447                        unchanged,
448                    },
449                )
450            }
451            (Entry::File(old), Entry::File(new)) => {
452                let changed = old.size != new.size;
453                let changed =
454                    changed || (old.sha2.is_some() && new.sha2.is_some() && old.sha2 != new.sha2);
455                let changed = changed
456                    || (old.blake2b.is_some()
457                        && new.blake2b.is_some()
458                        && old.blake2b != new.blake2b);
459                EntryDiff::File(MetricsDiff {
460                    changed_content: changed,
461                    zeroed: old.size > 0 && new.size == 0,
462                    changed_nul: old.nul != new.nul,
463                    changed_nonascii: old.nonascii != new.nonascii,
464                })
465            }
466            (_, _) => EntryDiff::KindChanged,
467        }
468    }
469}
470
471const SEP: u8 = 0x0a; // separator \n (byte 0x0a) used in JSON encoding
472
473impl Database {
474    fn insert(&mut self, path: PathBuf, entry: Entry) {
475        self.0.insert(path, entry);
476    }
477
478    pub fn lookup(&self, path: &Path) -> Option<&Entry> {
479        self.0.lookup(path)
480    }
481
482    pub fn diff(&self, other: &Database) -> EntryDiff {
483        self.0.diff(&other.0)
484    }
485
486    pub fn build(
487        root: impl AsRef<Path>,
488        features: Features,
489        threads: usize,
490        verbose: bool,
491    ) -> Result<Database, error::Error> {
492        let total_bytes = Arc::new(Mutex::new(0));
493        let database = Arc::new(Mutex::new(Database::default()));
494        let start_time = time::Instant::now();
495
496        let parallel = threads > 1;
497        if parallel {
498            WalkBuilder::new(&root)
499                .threads(threads)
500                .build_parallel()
501                .run(|| {
502                    let total_bytes = total_bytes.clone();
503                    let database = database.clone();
504                    let root = root.as_ref().to_owned();
505                    Box::new(move |entry| {
506                        let entry = entry.unwrap(); // ?
507                        if entry.file_type().map_or(false, |t| t.is_file()) {
508                            let metrics = compute_metrics(entry.path(), features).unwrap(); // ?
509                            *total_bytes.lock().unwrap() += metrics.size;
510                            let result = Entry::File(metrics);
511                            let short_path = if entry.path() == root {
512                                Path::new(entry.path().file_name().expect("unreachable"))
513                            } else {
514                                entry.path().strip_prefix(&root).unwrap() // ?
515                            };
516                            database
517                                .lock()
518                                .unwrap()
519                                .insert(short_path.to_owned(), result);
520                        }
521                        WalkState::Continue
522                    })
523                });
524        } else {
525            let total_bytes = &mut (*total_bytes.lock().unwrap());
526            let database = &mut (*database.lock().unwrap());
527            for entry in WalkBuilder::new(&root).build() {
528                let entry = entry?;
529                if entry.file_type().map_or(false, |t| t.is_file()) {
530                    let metrics = compute_metrics(entry.path(), features)?;
531                    *total_bytes += metrics.size;
532                    let result = Entry::File(metrics);
533                    let short_path = if entry.path() == root.as_ref() {
534                        Path::new(entry.path().file_name().expect("unreachable"))
535                    } else {
536                        entry.path().strip_prefix(&root)?
537                    };
538                    database.insert(short_path.to_owned(), result);
539                }
540            }
541        }
542        let elapsed = start_time.elapsed().as_seconds_f64();
543        if verbose {
544            let total_bytes = *total_bytes.lock().unwrap();
545            println!(
546                "Database::build took {:.3} seconds on {} threads, read {} bytes, {:.1} MB/s",
547                elapsed,
548                threads,
549                total_bytes,
550                total_bytes as f64 / elapsed / 1e6
551            );
552        }
553        let database = &(*database.lock().unwrap());
554        Ok(database.clone())
555    }
556
557    pub fn show_diff(&self, other: &Database) -> DiffSummary {
558        let diff = self.diff(other);
559        diff.show_diff(Path::new("."), 0);
560        diff.summarize_diff()
561    }
562
563    pub fn check(
564        &self,
565        root: impl AsRef<Path>,
566        features: Features,
567        threads: usize,
568    ) -> Result<DiffSummary, error::Error> {
569        // FIXME: This is non-interactive, but vastly more simple than
570        // trying to implement the same functionality interactively.
571        let other = Database::build(root, features, threads, false)?;
572        Ok(self.show_diff(&other))
573    }
574
575    pub fn load_json(r: impl Read) -> Result<Database, error::Error> {
576        // Read entire contents to memory
577        let mut d = GzDecoder::new(r);
578
579        let mut bytes = Vec::new();
580        d.read_to_end(&mut bytes)?;
581        let bytes = bytes;
582
583        // Find position of separator
584        let index = match bytes.iter().position(|&x| x == SEP) {
585            Some(x) => x,
586            None => return Err(error::Error::ParseError),
587        };
588
589        // Decode expected checksums
590        let expected: DatabaseChecksum = serde_json::from_slice(&bytes[..index])?;
591        let features = Features::infer_from_database_checksum(&expected);
592
593        // Compute actual checksums of database
594        let mut engines = Engines::new(features);
595        engines.input(&bytes[index + 1..]);
596        let actual: DatabaseChecksum = engines.result().into();
597
598        if expected.diff(&actual) {
599            return Err(error::Error::ChecksumMismatch);
600        }
601
602        // Continue decoding database
603        Ok(serde_json::from_slice(&bytes[index + 1..])?)
604    }
605
606    pub fn dump_json<W>(&self, w: W, features: Features) -> Result<W, error::Error>
607    where
608        W: Write,
609    {
610        // Important: The encoded JSON **must not** contain the separator,
611        // or else the format will break
612
613        // Generate JSON-encoded database
614        let db_json = serde_json::to_vec(self)?;
615
616        // Compute checksums of encoded JSON
617        let mut engines = Engines::new(features);
618        engines.input(&db_json[..]);
619        let checksum: DatabaseChecksum = engines.result().into();
620        let checksum_json = serde_json::to_vec(&checksum)?;
621
622        // Make sure encoded JSON does not include separator
623        assert!(!checksum_json.contains(&SEP));
624
625        // Write checksum, separator and database
626        let mut e = GzEncoder::new(w, Compression::best());
627        e.write_all(&checksum_json[..])?;
628        e.write_all(&vec![SEP][..])?;
629        e.write_all(&db_json)?;
630        Ok(e.finish()?)
631    }
632}
633
634// impl std::fmt::Display for Database {
635//     fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
636//         for (path, entry) in self.0.iter() {
637//             match entry {
638//                 &Entry::File(ref hashes) => {
639//                     let hash: Vec<_> = hashes.sha2.0.iter().map(
640//                         |b| format!("{:02x}", b)).collect();
641//                     writeln!(f, "{} {}", hash.join(""), Path::new(path).display())?
642//                 }
643//             }
644//         }
645//         Ok(())
646//     }
647// }