add_determinism/add_det/handlers/
mod.rs

1/* SPDX-License-Identifier: GPL-3.0-or-later */
2
3pub mod ar;
4pub mod gzip;
5pub mod javadoc;
6pub mod pyc;
7pub mod zip;
8
9use anyhow::{bail, Context, Result};
10use log::{log, debug, info, warn, Level};
11use std::ascii::escape_default;
12use std::collections::HashMap;
13use std::fmt::Write;
14use std::ffi::OsStr;
15use std::fs;
16use std::fs::{File, Metadata};
17use std::io::{self, Seek};
18use std::os::unix::fs as unix_fs;
19use std::os::unix::fs::MetadataExt as _;
20use std::path::Path;
21use std::sync::Arc;
22use tempfile::NamedTempFile;
23use thiserror::Error;
24
25#[cfg(target_os = "linux")]
26use std::os::linux::fs::MetadataExt as _;
27#[cfg(target_os = "macos")]
28use std::os::macos::fs::MetadataExt as _;
29
30use super::config;
31
/// Format-level errors raised while parsing an input file.
///
/// `ProcessResult::convert_and_warn` reports an error that downcasts to this
/// type as `ProcessResult::BadFormat`; every other error is reported as
/// `ProcessResult::Error`.
#[derive(Error, Debug)]
pub enum Error {
    /// The input ended too early. Field 0 is the offset (printed in hex),
    /// field 1 the number of bytes that could not be taken there.
    #[error("unexpected EOF, cannot take {1} bytes at offset 0x{0:x}")]
    UnexpectedEOF(u64, usize),

    /// Unexpected magic bytes. Field 0 is the offset, field 1 the bytes that
    /// were found ("have") and field 2 the bytes that were expected ("exp.").
    /// Both byte strings are escaped with `asciify` for display.
    #[error("wrong magic at offset {0}\n        (have \"{}\", exp. \"{}\")",
            asciify(.1), asciify(.2))]
    BadMagic(u64, Vec<u8>, &'static [u8]),

    /// Any other problem, carried as a ready-made message.
    #[error("{0}")]
    Other(String),
}
44
// based on https://stackoverflow.com/a/52671523
/// Render arbitrary bytes as printable ASCII text.
///
/// Each byte is passed through [`std::ascii::escape_default`], so printable
/// ASCII is kept as is while quotes, backslashes, control characters and
/// non-ASCII bytes are turned into escape sequences such as `\"`, `\n` or
/// `\xc4`.
pub fn asciify<B: AsRef<[u8]>>(buf: B) -> String {
    let bytes = buf.as_ref();
    let mut text = String::with_capacity(bytes.len());

    for &byte in bytes {
        // The escape sequence consists of ASCII bytes only, so every byte
        // maps to exactly one char.
        text.extend(escape_default(byte).map(char::from));
    }

    text
}
54
/// Outcome of processing one path.
///
/// The declaration order is significant: the variants are compared with the
/// derived ordering and `extend_and_warn` keeps the greater of two results,
/// so later variants take precedence over earlier ones.
#[derive(Copy, Clone, Debug, PartialEq, Eq, PartialOrd, Ord)]
pub enum ProcessResult {
    /// Nothing was done with the path (e.g. a directory, or no handler
    /// matched). Not counted as a processed inode by `Stats::add_one`.
    Ignored,
    /// The file was processed, but no modifications were needed.
    Noop,
    /// The file had a single link and was replaced by a new file.
    Replaced,
    /// The file had multiple links and its contents were rewritten in place.
    Rewritten,
    /// Processing failed with a format error (see [`Error`]).
    BadFormat,
    /// Processing failed with any other kind of error.
    Error,
}
64
65impl ProcessResult {
66    pub fn convert_and_warn(input_path: &Path, result: Result<ProcessResult>) -> ProcessResult {
67        match result {
68            Err(err) => {
69                warn!("{}: failed to process: {}", input_path.display(), err);
70
71                if err.downcast_ref::<Error>().is_some() {
72                    ProcessResult::BadFormat
73                } else {
74                    ProcessResult::Error
75                }
76            }
77            Ok(res) => res
78        }
79    }
80
81    pub fn extend_and_warn(&mut self, input_path: &Path, result: Result<ProcessResult>) {
82        let converted = ProcessResult::convert_and_warn(input_path, result);
83
84        if (*self == ProcessResult::Replaced && converted == ProcessResult::Rewritten) ||
85            (*self == ProcessResult::Rewritten && converted == ProcessResult::Replaced) {
86            warn!("{}: different process result, hardlink count modified externally?",
87                  input_path.display());
88        }
89
90        if *self < converted {
91            *self = converted;
92        }
93    }
94}
95
/// Interface implemented by every file handler.
///
/// `make_handlers` constructs and initializes the handlers; `process_file`
/// then calls `filter` for each path and `process` for the paths that were
/// accepted.
pub trait Processor {
    /// Name of the handler, used in log messages.
    fn name(&self) -> &str;

    /// Optionally, do "global" setup of the processor.
    ///
    /// The default implementation does nothing. When this returns an error,
    /// `make_handlers` either skips the handler or fails, depending on
    /// `config.strict_handlers`.
    fn initialize(&mut self) -> Result<()> {
        Ok(())
    }

    /// Return true if the given path looks like it should be processed.
    fn filter(&self, path: &Path) -> Result<bool>;

    /// Process file and indicate whether modifications were made.
    fn process(&self, path: &Path) -> Result<ProcessResult>;
}
110
/// Counters describing one processing run.
#[derive(Clone, Debug, Default, PartialEq, Eq)]
pub struct Stats {
    /// Count of directories that were scanned. This includes both
    /// command-line arguments and subdirectories found in recursive
    /// processing.
    pub directories: u64,

    /// Count of file paths that were scanned. This includes both
    /// command-line arguments and paths found in recursive
    /// processing.
    pub files: u64,

    /// Count of inodes we actually processed. We maintain a cache of
    /// processed inode numbers, so a given inode is processed only
    /// once.
    pub inodes_processed: u64,

    /// Count of inodes modified. Split into inodes that were
    /// atomically replaced and inodes that were rewritten. We
    /// do a rewrite if there are hardlinks, to maintain them.
    pub inodes_replaced: u64,
    pub inodes_rewritten: u64,

    /// Files that we couldn't understand.
    /// The case where the file has the right extension, but e.g.
    /// bad magic, does *not* count.
    pub misunderstood: u64,

    /// Various errors other than bad format above.
    pub errors: u64,
}
142
143impl Stats {
144    pub fn new() -> Self { Default::default() }
145
146    pub fn add_one(&mut self, result: ProcessResult) {
147        match result {
148            ProcessResult::Ignored   => { return; }
149            ProcessResult::Noop      => {}
150            ProcessResult::Replaced  => { self.inodes_replaced += 1;  }
151            ProcessResult::Rewritten => { self.inodes_rewritten += 1; }
152            ProcessResult::BadFormat => { self.misunderstood += 1;    }
153            ProcessResult::Error     => { self.errors += 1;           }
154        }
155
156        self.inodes_processed += 1;
157    }
158
159    pub fn add(&mut self, other: &Stats) {
160        self.directories += other.directories;
161        self.files += other.files;
162        self.inodes_processed += other.inodes_processed;
163        self.inodes_replaced += other.inodes_replaced;
164        self.inodes_rewritten += other.inodes_rewritten;
165        self.misunderstood += other.misunderstood;
166        self.errors += other.errors;
167    }
168
169    pub fn summarize(&self) {
170        info!("Scanned {} directories and {} files,\n    \
171              processed {} inodes,\n    \
172              {} modified ({} replaced + {} rewritten),\n    \
173              {} unsupported format, {} errors\
174              ",
175              self.directories, self.files,
176              self.inodes_processed,
177              self.inodes_replaced + self.inodes_rewritten,
178              self.inodes_replaced, self.inodes_rewritten,
179              self.misunderstood, self.errors);
180    }
181}
182
/// Signature of a handler constructor: takes the shared configuration and
/// returns a boxed [`Processor`] that is `Send + Sync`.
pub type HandlerBoxed = fn(&Arc<config::Config>) -> Box<dyn Processor + Send + Sync>;
184
/// Table of all known handlers, one `(name, flag, constructor)` entry each.
///
/// `handler_names` returns the names, and `make_handlers` instantiates, in
/// table order, the entries whose name is listed in `config.handler_names`.
///
/// NOTE(review): the `bool` is not read anywhere in this file; presumably it
/// says whether the handler is enabled by default — confirm against the
/// configuration code.
pub const HANDLERS: &[(&str, bool, HandlerBoxed)] = &[
    ("ar",             true,  ar::Ar::boxed           ),
    ("jar",            true,  zip::Zip::boxed_jar     ),
    ("javadoc",        true,  javadoc::Javadoc::boxed ),
    ("gzip",           true,  gzip::Gzip::boxed       ),
    ("pyc",            true,  pyc::Pyc::boxed         ),
    ("zip",            true,  zip::Zip::boxed_zip     ),
    ("pyc-zero-mtime", false, pyc::PycZeroMtime::boxed),
];
194
195pub fn handler_names() -> Vec<&'static str> {
196    HANDLERS.iter()
197        .map(|(name, _, _)| *name)
198        .collect()
199}
200
201pub fn make_handlers(config: &Arc<config::Config>) -> Result<Vec<Box<dyn Processor + Send + Sync>>> {
202    let mut handlers: Vec<Box<dyn Processor + Send + Sync>> = vec![];
203
204    for (name, _, func) in HANDLERS {
205        if config.handler_names.contains(name) {
206            let mut handler = func(config);
207            match handler.initialize() {
208                Err(e) => {
209                    if config.strict_handlers {
210                        bail!("Cannot initialize handler {}: {}", handler.name(), e);
211                    }
212                    warn!("Handler {} skipped: {}", handler.name(), e);
213                }
214                Ok(()) => {
215                    debug!("Initialized handler {}.", handler.name());
216                    handlers.push(handler);
217                }
218            }
219        }
220    }
221
222    Ok(handlers)
223}
224
/// Create the (initially empty) cache of inodes that were already handled.
///
/// The key is the inode number. The value is a bitmask in which bit `n` is
/// set once the handler at index `n` of the handler list has been selected
/// for that inode.
pub fn inodes_seen() -> HashMap<u64, u8> {
    HashMap::default()
}
228
229pub fn do_print(config: &Arc<config::Config>) -> Result<()> {
230    let handler = pyc::Pyc::new(config);
231    let mut w = String::new();
232
233    for (n, input_path) in config.inputs.iter().enumerate() {
234        if n > 0 {
235            writeln!(w)?;  // separate outputs by empty line
236        }
237        handler.pretty_print(&mut w, input_path)?;
238    }
239
240    print!("{w}");
241
242    Ok(())
243}
244
245pub fn do_normal_work(config: &Arc<config::Config>) -> Result<Stats> {
246    let handlers = make_handlers(config)?;
247    let mut inodes_seen = inodes_seen();
248    let mut total = Stats::new();
249
250    for input_path in &config.inputs {
251        let stats = process_file_or_dir(&handlers, &mut inodes_seen, input_path, None);
252        total.add(&stats);
253    }
254
255    Ok(total)
256}
257
/// Optional callback that takes over the processing of a file.
///
/// When it is `Some`, `process_file` does not run the handlers itself;
/// instead it calls the callback once with the bitmask of the handlers that
/// matched and the path of the file.
pub type ProcessWrapper<'a> = Option<&'a dyn Fn(u8, &Path) -> Result<()>>;
259
260fn process_file(
261    handlers: &[Box<dyn Processor + Send + Sync>],
262    already_seen: &mut u8,
263    input_path: &Path,
264    process_wrapper: ProcessWrapper,
265) -> Result<ProcessResult> {
266
267    // When processing locally, this says whether modifications have
268    // been made. When processing remotely, we will send the result
269    // separately after asynchronous processing is finished.
270    let mut entry_mod = ProcessResult::Ignored;
271
272    let mut selected_handlers = 0;
273
274    for (n_processor, processor) in handlers.iter().enumerate() {
275        // The same inode can be linked under multiple names
276        // with different extensions. Thus, we check if the
277        // given processor already handled this file.
278        if *already_seen & (1 << n_processor) > 0 {
279            debug!("{}: already seen by {} handler",
280                   input_path.display(), processor.name());
281            continue;
282        }
283
284        let cond = processor.filter(input_path)?;
285        if cond {
286            debug!("{}: matched by handler {}", input_path.display(), processor.name());
287
288            selected_handlers |= 1 << n_processor;
289
290            if process_wrapper.is_none() {
291                let res = processor.process(input_path);
292                entry_mod.extend_and_warn(input_path, res);
293            }
294        }
295
296        *already_seen |= selected_handlers;
297    }
298
299    if selected_handlers > 0 {
300        if let Some(func) = process_wrapper {
301            assert!(entry_mod == ProcessResult::Ignored);
302            func(selected_handlers, input_path)?;
303        }
304    }
305
306    Ok(entry_mod)
307}
308
309fn process_entry(
310    handlers: &[Box<dyn Processor + Send + Sync>],
311    inodes_seen: &mut HashMap<u64, u8>,
312    process_wrapper: ProcessWrapper,
313    stats: &mut Stats,
314    entry: &walkdir::DirEntry,
315) -> Result<ProcessResult> {
316
317    debug!("Looking at {}…", entry.path().display());
318
319    let name = unwrap_os_string(entry.file_name())?;
320    if name.starts_with(".#.") && name.ends_with(".tmp") {
321        // This is our own temporary file. Ignore it.
322        return Ok(ProcessResult::Ignored);
323    }
324
325    let metadata = entry.metadata()?;
326    if metadata.is_dir() {
327        stats.directories += 1;
328        return Ok(ProcessResult::Ignored);
329    }
330
331    stats.files += 1;
332    if !metadata.is_file() {
333        debug!("{}: not a file", entry.path().display());
334        return Ok(ProcessResult::Ignored);
335    }
336
337    let inode = metadata.ino();
338    let mut already_seen = *inodes_seen.get(&inode).unwrap_or(&0);
339
340    let entry_mod = process_file(
341        handlers,
342        &mut already_seen,
343        entry.path(),
344        process_wrapper)?;
345
346    inodes_seen.insert(inode, already_seen); // This is the orig inode
347    if entry_mod != ProcessResult::Noop {
348        // The path might have been replaced with a new inode.
349        let metadata = entry.metadata()?;
350        let inode2 = metadata.ino();
351        if inode2 != inode {
352            // This is the new inode. We use the same set of bits in
353            // already_seen, because those handlers have already been
354            // applied to the contents of the new inode.
355            inodes_seen.insert(inode2, already_seen);
356        }
357    }
358
359    Ok(entry_mod)
360}
361
362pub fn process_file_or_dir(
363    handlers: &[Box<dyn Processor + Send + Sync>],
364    inodes_seen: &mut HashMap<u64, u8>,
365    input_path: &Path,
366    process_wrapper: ProcessWrapper,
367) -> Stats {
368
369    let mut stats = Stats::new();
370
371    for entry in walkdir::WalkDir::new(input_path)
372        .follow_links(false)
373        .into_iter() {
374            let entry = match entry {
375                Err(e) => {
376                    warn!("Failed to process: {e}");
377                    stats.errors += 1;
378                    continue;
379                }
380                Ok(entry) => entry
381            };
382
383            let res = process_entry(handlers, inodes_seen, process_wrapper, &mut stats, &entry);
384            stats.add_one(ProcessResult::convert_and_warn(entry.path(), res));
385        }
386
387    stats
388}
389
390fn unwrap_os_string(filename: &OsStr) -> Result<&str> {
391    match filename.to_str() {
392        Some(s) => Ok(s),
393        None => {
394            bail!("Invalid file name {:?}", filename);
395        }
396    }
397}
398
/// State for reading one input file and, if needed, writing its normalized
/// replacement through a temporary file.
///
/// A temporary output that is still held when the helper is dropped is
/// removed (see the `Drop` implementation).
pub struct InputOutputHelper<'a> {
    /// Path of the file being processed.
    pub input_path: &'a Path,
    /// Metadata of the input, read from the open file in `open`.
    pub input_metadata: Metadata,

    // this is set when .open_output is called
    pub output: Option<NamedTempFile>,

    /// Check mode: `finalize` only reports what would be done and leaves
    /// the input untouched.
    pub check: bool,
    pub verbose: bool,  // include logging about each modified file
}
409
410impl Drop for InputOutputHelper<'_> {
411    fn drop(&mut self) {
412        if let Some(f) = self.output.take() {
413            debug!("{}: discarding temporary copy", f.path().display());
414            if let Err(e) = f.close() {
415                if e.kind() != io::ErrorKind::NotFound {
416                    warn!("Failed to remove tempfile for {}: {}", self.input_path.display(), e);
417                }
418            }
419        }
420    }
421}
422
423impl<'a> InputOutputHelper<'a> {
424    pub fn open(
425        input_path: &'a Path,
426        check: bool,
427        verbose: bool,
428    ) -> Result<(Self, io::BufReader<File>)> {
429
430        let input = File::open(input_path)
431            .with_context(|| format!("Cannot open {input_path:?}"))?;
432
433        let input_metadata = input.metadata()?;
434        let input = io::BufReader::new(input);
435
436        let io = InputOutputHelper {
437            input_path,
438            input_metadata,
439            output: None,
440            check,
441            verbose,
442        };
443
444        Ok((io, input))
445    }
446
447    pub fn open_output(&mut self, need_real_file_for_check: bool) -> Result<()> {
448        assert!(self.output.is_none());
449
450        let tmpfile = if self.check && !need_real_file_for_check {
451            tempfile::Builder::new()
452                .disable_cleanup(true)
453                .make(|_| File::options()
454                      .read(true)
455                      .write(true)
456                      .open("/dev/null"))?
457        } else {
458            let prefix = format!(
459                ".#.{}",
460                self.input_path.file_name().and_then(|s| s.to_str()).unwrap_or("tmp")
461            );
462
463            if self.check {
464                NamedTempFile::with_prefix(prefix)?
465            } else {
466                // We need to create the temporary file in the same
467                // location as the real file so that rename works.
468                NamedTempFile::with_prefix_in(prefix, self.input_path.parent().unwrap())?
469            }
470        };
471
472        self.output = Some(tmpfile);
473
474        Ok(())
475    }
476
477    pub fn finalize(&mut self, have_mod: bool) -> Result<ProcessResult> {
478        let meta = &self.input_metadata;
479
480        if !have_mod {
481            Ok(ProcessResult::Noop)
482
483        } else if self.check {
484            // nothing to do, we're using a fake output
485            Ok(
486                if meta.nlink() == 1 {
487                    ProcessResult::Replaced
488                } else {
489                    ProcessResult::Rewritten
490                }
491            )
492
493        } else {
494            let output = self.output.as_mut().unwrap();
495
496            // If the original file has nlinks == 1, we atomically replace it.
497            // If it has multiple links, we reopen the original file and rewrite it.
498            // This way the inode number is retained and hard links are not broken.
499            if meta.nlink() == 1 {
500                log!(if self.verbose { Level::Info } else { Level::Debug },
501                     "{}: replacing with normalized version", self.input_path.display());
502
503                output.disable_cleanup(true);
504
505                output.as_file_mut().set_permissions(meta.permissions())?;
506                output.as_file_mut().set_modified(meta.modified()?)?;
507
508                if let Err(e) = unix_fs::lchown(output.path(), Some(meta.st_uid()), Some(meta.st_gid())) {
509                    if e.kind() == io::ErrorKind::PermissionDenied {
510                        warn!("{}: cannot change file ownership, ignoring", output.path().display());
511                    } else {
512                        bail!("{}: cannot change file ownership: {}", output.path().display(), e);
513                    }
514                }
515
516                fs::rename(output.path(), self.input_path)?;
517                self.output.take();   /* The output is now invalid */
518
519                Ok(ProcessResult::Replaced)
520
521            } else {
522                log!(if self.verbose { Level::Info } else { Level::Debug },
523                     "{}: rewriting with normalized contents", self.input_path.display());
524
525                let file = output.as_file_mut();
526
527                file.seek(io::SeekFrom::Start(0))?;
528
529                let mut input_writer = File::options().write(true).open(self.input_path)?;
530                let len = io::copy(file, &mut input_writer)?;
531                // truncate the file in case it was originally longer
532                input_writer.set_len(len)?;
533                input_writer.set_modified(meta.modified()?)?;
534
535                Ok(ProcessResult::Rewritten)
536            }
537        }
538    }
539}
540
#[cfg(test)]
mod tests {
    use super::*;

    /// `asciify` keeps printable ASCII and escapes everything else.
    #[test]
    fn filter_asciify() {
        let cases: [(&[u8], &str); 4] = [
            ("asdf".as_bytes(), "asdf"),
            ("\"\"".as_bytes(), "\\\"\\\""),
            ("\n\t\r".as_bytes(), "\\n\\t\\r"),
            ("zębina".as_bytes(), "z\\xc4\\x99bina"),
        ];

        for (input, expected) in cases {
            assert_eq!(asciify(input), expected);
        }

        // An array passed by value is accepted as well.
        assert_eq!(asciify([0; 4]), "\\x00\\x00\\x00\\x00");
    }
}
553}