1use std::cmp::Ordering;
2use std::collections::BTreeMap;
3use std::default::Default;
4use std::fs::File;
5use std::io::{Read, Write};
6use std::path::{Path, PathBuf};
7use std::sync::{Arc, Mutex};
8
9use digest::{consts::U32, Digest, FixedOutput};
10use ignore::{WalkBuilder, WalkState};
11use time;
12
13use serde_json;
14
15use flate2::read::GzDecoder;
16use flate2::write::GzEncoder;
17use flate2::Compression;
18
19use blake2;
20use sha2::Sha512_256;
21
22use crate::base64;
23use crate::error;
24
25type Blake2b32 = blake2::Blake2b<U32>;
26
27#[derive(Debug, Clone, Copy, PartialEq, Eq)]
28pub struct Features {
29 pub sha2: bool,
30 pub blake2b: bool,
31}
32
33impl Default for Features {
34 fn default() -> Features {
35 Features {
36 sha2: true,
37 blake2b: false,
38 }
39 }
40}
41
42impl Features {
43 fn infer_from_database_checksum(checksum: &DatabaseChecksum) -> Features {
44 Features {
45 sha2: checksum.sha2.is_some(),
46 blake2b: checksum.blake2b.is_some(),
47 }
48 }
49}
50
51#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
52pub struct DatabaseChecksum {
53 #[serde(rename = "sha2-512/256")]
54 #[serde(skip_serializing_if = "Option::is_none")]
55 sha2: Option<HashSum>,
56 #[serde(skip_serializing_if = "Option::is_none")]
57 blake2b: Option<HashSum>,
58 size: u64,
59}
60
61impl DatabaseChecksum {
62 fn diff(&self, new: &Self) -> bool {
63 let changed = self.size != new.size;
64 let changed =
65 changed || (self.sha2.is_some() && new.sha2.is_some() && self.sha2 != new.sha2);
66 changed || (self.blake2b.is_some() && new.blake2b.is_some() && self.blake2b != new.blake2b)
67 }
68}
69
70impl From<Metrics> for DatabaseChecksum {
71 fn from(metrics: Metrics) -> Self {
72 DatabaseChecksum {
73 sha2: metrics.sha2,
74 blake2b: metrics.blake2b,
75 size: metrics.size,
76 }
77 }
78}
79
80#[derive(Debug, Clone, PartialEq, Eq, Default, Serialize, Deserialize)]
81pub struct Database(Entry);
82
83#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
84pub enum Entry {
85 Directory(BTreeMap<PathBuf, Entry>),
86 File(Metrics),
87}
88
89impl Default for Entry {
90 fn default() -> Entry {
91 Entry::Directory(BTreeMap::default())
92 }
93}
94
95#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
96pub struct Metrics {
97 #[serde(rename = "sha2-512/256")]
98 #[serde(skip_serializing_if = "Option::is_none")]
99 sha2: Option<HashSum>,
100 #[serde(skip_serializing_if = "Option::is_none")]
101 blake2b: Option<HashSum>,
102 size: u64, nul: bool, nonascii: bool, }
106
107#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
108pub struct HashSum(#[serde(with = "base64")] Vec<u8>);
109
110#[derive(Default)]
111struct EngineSize(u64);
112impl EngineSize {
113 fn input(&mut self, input: &[u8]) {
114 self.0 += input.len() as u64;
115 }
116 fn result(self) -> u64 {
117 self.0
118 }
119}
120
121#[derive(Default)]
122struct EngineNul(bool);
123impl EngineNul {
124 fn input(&mut self, input: &[u8]) {
125 self.0 = self.0 || input.iter().any(|x| *x == 0);
126 }
127 fn result(self) -> bool {
128 self.0
129 }
130}
131
132#[derive(Default)]
133struct EngineNonascii(bool);
134impl EngineNonascii {
135 fn input(&mut self, input: &[u8]) {
136 self.0 = self.0 || input.iter().any(|x| x & 0x80 != 0);
137 }
138 fn result(self) -> bool {
139 self.0
140 }
141}
142
143struct Engines {
144 sha2: Option<Sha512_256>,
145 blake2b: Option<Blake2b32>,
146 size: EngineSize,
147 nul: EngineNul,
148 nonascii: EngineNonascii,
149}
150
151impl Engines {
152 fn new(features: Features) -> Engines {
153 Engines {
154 sha2: if features.sha2 {
155 Some(Sha512_256::default())
156 } else {
157 None
158 },
159 blake2b: if features.blake2b {
160 Some(Blake2b32::new())
161 } else {
162 None
163 },
164 size: EngineSize::default(),
165 nul: EngineNul::default(),
166 nonascii: EngineNonascii::default(),
167 }
168 }
169}
170
171impl Engines {
172 fn input(&mut self, input: &[u8]) {
173 self.sha2.iter_mut().for_each(|e| e.update(input));
174 self.blake2b.iter_mut().for_each(|e| e.update(input));
175 self.size.input(input);
176 self.nul.input(input);
177 self.nonascii.input(input);
178 }
179 fn result(self) -> Metrics {
180 Metrics {
181 sha2: self
182 .sha2
183 .map(|e| HashSum(Vec::from(e.finalize_fixed().as_slice()))),
184 blake2b: self
185 .blake2b
186 .map(|e| HashSum(Vec::from(e.finalize().as_slice()))),
187 size: self.size.result(),
188 nul: self.nul.result(),
189 nonascii: self.nonascii.result(),
190 }
191 }
192}
193
194fn compute_metrics(path: impl AsRef<Path>, features: Features) -> Result<Metrics, error::Error> {
195 let mut f = File::open(path)?;
196
197 let mut engines = Engines::new(features);
198
199 let mut buffer = [0; 4096];
200 loop {
201 let n = f.read(&mut buffer[..])?;
202 if n == 0 {
203 break;
204 }
205 engines.input(&buffer[0..n]);
206 }
207 Ok(engines.result())
208}
209
210trait BTreeMapExt<K, V>
211where
212 K: Ord,
213 V: Default,
214{
215 fn get_default(&mut self, key: K) -> &mut V;
216}
217
218impl<K, V> BTreeMapExt<K, V> for BTreeMap<K, V>
219where
220 K: Ord + Clone,
221 V: Default,
222{
223 fn get_default(&mut self, key: K) -> &mut V {
224 self.entry(key).or_insert_with(|| V::default())
225 }
226}
227
228impl Entry {
229 fn insert(&mut self, path: PathBuf, file: Entry) {
230 match self {
237 Entry::Directory(entries) => {
238 let mut components = path.components();
239 let count = components.clone().count();
240 let first =
241 Path::new(components.next().expect("unreachable").as_os_str()).to_owned();
242 let rest = components.as_path().to_owned();
243 if count > 1 {
244 let subentry = entries.get_default(first);
245 subentry.insert(rest, file);
246 } else if entries.insert(first, file).is_some() {
247 unreachable!(); }
249 }
250 Entry::File(_) => unreachable!(),
251 }
252 }
253
254 fn lookup(&self, path: &Path) -> Option<&Entry> {
255 match self {
256 Entry::Directory(entries) => {
257 let mut components = path.components();
258 let count = components.clone().count();
259 let first =
260 Path::new(components.next().expect("unreachable").as_os_str()).to_owned();
261 let rest = components.as_path().to_owned();
262 if count > 1 {
263 entries
264 .get(&first)
265 .and_then(|subentry| subentry.lookup(&rest))
266 } else {
267 entries.get(&first)
268 }
269 }
270 Entry::File(_) => unreachable!(),
271 }
272 }
273}
274
275#[derive(Debug)]
276pub enum EntryDiff {
277 Directory(BTreeMap<PathBuf, EntryDiff>, DirectoryDiff),
278 File(MetricsDiff),
279 KindChanged,
280}
281
282#[derive(Debug)]
283pub struct DirectoryDiff {
284 added: u64,
285 removed: u64,
286 changed: u64,
287 unchanged: u64,
288}
289
290#[derive(Debug)]
291pub struct MetricsDiff {
292 changed_content: bool,
293 zeroed: bool,
294 changed_nul: bool,
295 changed_nonascii: bool,
296}
297
298#[derive(Debug, PartialEq)]
299pub enum DiffSummary {
300 NoChanges,
301 Changes,
302 Suspicious,
303}
304
305impl EntryDiff {
306 fn show_diff(&self, path: &Path, depth: usize) {
307 match self {
308 EntryDiff::Directory(entries, diff) => {
309 if diff.changed > 0 || diff.added > 0 || diff.removed > 0 {
310 println!(
311 "{}{}: {} changed, {} added, {} removed, {} unchanged",
312 "| ".repeat(depth),
313 path.display(),
314 diff.changed,
315 diff.added,
316 diff.removed,
317 diff.unchanged
318 );
319 for (key, entry) in entries.iter() {
320 entry.show_diff(key, depth + 1);
321 }
322 }
323 }
324 EntryDiff::File(diff) => {
325 if diff.zeroed || diff.changed_nul || diff.changed_nonascii {
326 println!("{}{} changed", "| ".repeat(depth), path.display());
327 if diff.zeroed {
328 println!("{}> suspicious: file was truncated", "##".repeat(depth));
329 }
330 if diff.changed_nul {
331 println!(
332 "{}> suspicious: original had no NUL bytes, but now does",
333 "##".repeat(depth)
334 );
335 }
336 if diff.changed_nonascii {
337 println!(
338 "{}> suspicious: original had no non-ASCII bytes, but now does",
339 "##".repeat(depth)
340 );
341 }
342 }
343 }
344 EntryDiff::KindChanged => {}
345 }
346 }
347
348 fn summarize_diff(&self) -> DiffSummary {
349 match self {
350 EntryDiff::Directory(entries, diff) => {
351 let initial = if diff.changed > 0 || diff.added > 0 || diff.removed > 0 {
352 DiffSummary::Changes
353 } else {
354 DiffSummary::NoChanges
355 };
356 entries
357 .values()
358 .map(|x| x.summarize_diff())
359 .fold(initial, |acc, x| acc.meet(x))
360 }
361 EntryDiff::File(diff) => {
362 if diff.zeroed || diff.changed_nul || diff.changed_nonascii {
363 DiffSummary::Suspicious
364 } else if diff.changed_content {
365 DiffSummary::Changes
366 } else {
367 DiffSummary::NoChanges
368 }
369 }
370 EntryDiff::KindChanged => DiffSummary::Changes,
371 }
372 }
373}
374
375impl DiffSummary {
376 fn meet(self, other: DiffSummary) -> DiffSummary {
377 if self == DiffSummary::Suspicious || other == DiffSummary::Suspicious {
378 DiffSummary::Suspicious
379 } else if self == DiffSummary::Changes || other == DiffSummary::Changes {
380 DiffSummary::Changes
381 } else {
382 DiffSummary::NoChanges
383 }
384 }
385}
386
387impl Entry {
388 fn diff(&self, other: &Entry) -> EntryDiff {
389 match (self, other) {
390 (Entry::Directory(old), Entry::Directory(new)) => {
391 let mut entries = BTreeMap::default();
392 let mut added = 0;
393 let mut removed = 0;
394 let mut changed = 0;
395 let mut unchanged = 0;
396
397 let mut old_iter = old.iter();
398 let mut new_iter = new.iter();
399 let mut old_entry = old_iter.next();
400 let mut new_entry = new_iter.next();
401 while old_entry.is_some() && new_entry.is_some() {
402 let (old_key, old_value) = old_entry.unwrap();
403 let (new_key, new_value) = new_entry.unwrap();
404 match old_key.cmp(new_key) {
405 Ordering::Less => {
406 removed += 1;
407 old_entry = old_iter.next();
408 }
409 Ordering::Greater => {
410 added += 1;
411 new_entry = new_iter.next();
412 }
413 Ordering::Equal => {
414 let diff = old_value.diff(new_value);
415 match diff {
416 EntryDiff::Directory(_, ref stats) => {
417 added += stats.added;
418 removed += stats.removed;
419 changed += stats.changed;
420 unchanged += stats.unchanged;
421 }
422 EntryDiff::File(ref stats) => {
423 if stats.changed_content {
424 changed += 1;
425 } else {
426 unchanged += 1;
427 }
428 }
429 EntryDiff::KindChanged => {
430 changed += 1;
431 }
432 }
433 entries.insert(old_key.clone(), diff);
434 old_entry = old_iter.next();
435 new_entry = new_iter.next();
436 }
437 }
438 }
439 removed += old_iter.count() as u64;
440 added += new_iter.count() as u64;
441 EntryDiff::Directory(
442 entries,
443 DirectoryDiff {
444 added,
445 removed,
446 changed,
447 unchanged,
448 },
449 )
450 }
451 (Entry::File(old), Entry::File(new)) => {
452 let changed = old.size != new.size;
453 let changed =
454 changed || (old.sha2.is_some() && new.sha2.is_some() && old.sha2 != new.sha2);
455 let changed = changed
456 || (old.blake2b.is_some()
457 && new.blake2b.is_some()
458 && old.blake2b != new.blake2b);
459 EntryDiff::File(MetricsDiff {
460 changed_content: changed,
461 zeroed: old.size > 0 && new.size == 0,
462 changed_nul: old.nul != new.nul,
463 changed_nonascii: old.nonascii != new.nonascii,
464 })
465 }
466 (_, _) => EntryDiff::KindChanged,
467 }
468 }
469}
470
471const SEP: u8 = 0x0a; impl Database {
474 fn insert(&mut self, path: PathBuf, entry: Entry) {
475 self.0.insert(path, entry);
476 }
477
478 pub fn lookup(&self, path: &Path) -> Option<&Entry> {
479 self.0.lookup(path)
480 }
481
482 pub fn diff(&self, other: &Database) -> EntryDiff {
483 self.0.diff(&other.0)
484 }
485
486 pub fn build(
487 root: impl AsRef<Path>,
488 features: Features,
489 threads: usize,
490 verbose: bool,
491 ) -> Result<Database, error::Error> {
492 let total_bytes = Arc::new(Mutex::new(0));
493 let database = Arc::new(Mutex::new(Database::default()));
494 let start_time = time::Instant::now();
495
496 let parallel = threads > 1;
497 if parallel {
498 WalkBuilder::new(&root)
499 .threads(threads)
500 .build_parallel()
501 .run(|| {
502 let total_bytes = total_bytes.clone();
503 let database = database.clone();
504 let root = root.as_ref().to_owned();
505 Box::new(move |entry| {
506 let entry = entry.unwrap(); if entry.file_type().map_or(false, |t| t.is_file()) {
508 let metrics = compute_metrics(entry.path(), features).unwrap(); *total_bytes.lock().unwrap() += metrics.size;
510 let result = Entry::File(metrics);
511 let short_path = if entry.path() == root {
512 Path::new(entry.path().file_name().expect("unreachable"))
513 } else {
514 entry.path().strip_prefix(&root).unwrap() };
516 database
517 .lock()
518 .unwrap()
519 .insert(short_path.to_owned(), result);
520 }
521 WalkState::Continue
522 })
523 });
524 } else {
525 let total_bytes = &mut (*total_bytes.lock().unwrap());
526 let database = &mut (*database.lock().unwrap());
527 for entry in WalkBuilder::new(&root).build() {
528 let entry = entry?;
529 if entry.file_type().map_or(false, |t| t.is_file()) {
530 let metrics = compute_metrics(entry.path(), features)?;
531 *total_bytes += metrics.size;
532 let result = Entry::File(metrics);
533 let short_path = if entry.path() == root.as_ref() {
534 Path::new(entry.path().file_name().expect("unreachable"))
535 } else {
536 entry.path().strip_prefix(&root)?
537 };
538 database.insert(short_path.to_owned(), result);
539 }
540 }
541 }
542 let elapsed = start_time.elapsed().as_seconds_f64();
543 if verbose {
544 let total_bytes = *total_bytes.lock().unwrap();
545 println!(
546 "Database::build took {:.3} seconds on {} threads, read {} bytes, {:.1} MB/s",
547 elapsed,
548 threads,
549 total_bytes,
550 total_bytes as f64 / elapsed / 1e6
551 );
552 }
553 let database = &(*database.lock().unwrap());
554 Ok(database.clone())
555 }
556
557 pub fn show_diff(&self, other: &Database) -> DiffSummary {
558 let diff = self.diff(other);
559 diff.show_diff(Path::new("."), 0);
560 diff.summarize_diff()
561 }
562
563 pub fn check(
564 &self,
565 root: impl AsRef<Path>,
566 features: Features,
567 threads: usize,
568 ) -> Result<DiffSummary, error::Error> {
569 let other = Database::build(root, features, threads, false)?;
572 Ok(self.show_diff(&other))
573 }
574
575 pub fn load_json(r: impl Read) -> Result<Database, error::Error> {
576 let mut d = GzDecoder::new(r);
578
579 let mut bytes = Vec::new();
580 d.read_to_end(&mut bytes)?;
581 let bytes = bytes;
582
583 let index = match bytes.iter().position(|&x| x == SEP) {
585 Some(x) => x,
586 None => return Err(error::Error::ParseError),
587 };
588
589 let expected: DatabaseChecksum = serde_json::from_slice(&bytes[..index])?;
591 let features = Features::infer_from_database_checksum(&expected);
592
593 let mut engines = Engines::new(features);
595 engines.input(&bytes[index + 1..]);
596 let actual: DatabaseChecksum = engines.result().into();
597
598 if expected.diff(&actual) {
599 return Err(error::Error::ChecksumMismatch);
600 }
601
602 Ok(serde_json::from_slice(&bytes[index + 1..])?)
604 }
605
606 pub fn dump_json<W>(&self, w: W, features: Features) -> Result<W, error::Error>
607 where
608 W: Write,
609 {
610 let db_json = serde_json::to_vec(self)?;
615
616 let mut engines = Engines::new(features);
618 engines.input(&db_json[..]);
619 let checksum: DatabaseChecksum = engines.result().into();
620 let checksum_json = serde_json::to_vec(&checksum)?;
621
622 assert!(!checksum_json.contains(&SEP));
624
625 let mut e = GzEncoder::new(w, Compression::best());
627 e.write_all(&checksum_json[..])?;
628 e.write_all(&vec![SEP][..])?;
629 e.write_all(&db_json)?;
630 Ok(e.finish()?)
631 }
632}
633
634