1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
//! Provides the `first_and_rest` function, which returns the contents of the
//! first operand as a `Vec<u8>` (wrapped in a `Result`) together with an
//! `ExactSizeIterator` over the remaining operands, or `None` when there are
//! no operands at all. *Note:* treating the first operand differently from
//! the remaining ones has the unfortunate result of requiring different code
//! paths for translating UTF-16 files into UTF-8. That currently seems worth
//! the cost.
use crate::set::LaterOperand;
use anyhow::{Context, Result};
use bstr::io::BufReadExt;
use encoding_rs_io::{DecodeReaderBytes, DecodeReaderBytesBuilder};
use std::{
    fs,
    fs::File,
    io::{self, Read},
    ops::FnMut,
    path::{Path, PathBuf},
};

/// The Unix convention: a file argument that is exactly `-` stands for `stdin`.
///
/// Returns `true` only when `path` is the single character `-`; any other
/// path (including the empty path, `--`, or `./-`) returns `false`.
fn use_stdin(path: &Path) -> bool {
    path.as_os_str() == "-"
}
/// Return the contents of the first file named in `files` as a `Vec<u8>`, and
/// an `ExactSizeIterator` over the subsequent arguments.
#[must_use]
pub fn first_and_rest(files: &[PathBuf]) -> Option<(Result<Vec<u8>>, Remaining)> {
    fn all_of_stdin() -> Result<Vec<u8>> {
        let mut buffer = Vec::new();
        io::stdin().read_to_end(&mut buffer).context("Can't read file: <stdin>")?;
        Ok(decode_if_utf16(buffer))
    }

    match files {
        [] => None,
        [first, rest @ ..] => {
            let first_operand = if use_stdin(first) {
                all_of_stdin()
            } else {
                fs::read(first)
                    .with_context(|| format!("Can't read file: {}", first.display()))
                    .map(decode_if_utf16)
            };
            let rest = rest.to_vec();
            Some((first_operand, Remaining::from(rest)))
        }
    }
}

/// Transcode `candidate` to UTF-8 when it begins with a UTF-16 Byte Order Mark
/// (little- or big-endian); otherwise return `candidate` unchanged.
///
/// The decode is performed *without* BOM handling, so the BOM is decoded like
/// any other character: the returned bytes begin with the UTF-8 encoding of
/// U+FEFF rather than having the BOM stripped.
fn decode_if_utf16(candidate: Vec<u8>) -> Vec<u8> {
    match encoding_rs::Encoding::for_bom(&candidate) {
        Some((encoding, _bom_length))
            if encoding == encoding_rs::UTF_16LE || encoding == encoding_rs::UTF_16BE =>
        {
            // Note: `decode_without_bom_handling` changes malformed sequences
            // to the Unicode REPLACEMENT CHARACTER. Should we report an error
            // instead?
            let (utf8_text, _had_malformed_sequences) =
                encoding.decode_without_bom_handling(&candidate);
            utf8_text.into_owned().into_bytes()
        }
        // No BOM, or a BOM for some encoding other than UTF-16.
        _ => candidate,
    }
}

/// The first operand is read into memory in its entirety, but that's not
/// efficient for the second and subsequent operands.  The `Remaining`
/// structure is an `ExactSizeIterator` over those operands: each step takes
/// the next path and yields a `Result<NextOperand>` holding a reader for it.
pub struct Remaining {
    /// The paths that have not been yielded yet, in the order they were supplied.
    files: std::vec::IntoIter<PathBuf>,
}

impl From<Vec<PathBuf>> for Remaining {
    fn from(files: Vec<PathBuf>) -> Self {
        Remaining { files: files.into_iter() }
    }
}

impl Iterator for Remaining {
    type Item = Result<NextOperand>;

    /// Take the next path, if any, and try to open a reader for it. A failure
    /// to open the file is yielded as `Some(Err(..))`; `None` means every
    /// operand has been visited.
    fn next(&mut self) -> Option<Self::Item> {
        self.files.next().map(|path| reader_for(&path))
    }

    /// Report the exact number of operands left. `ExactSizeIterator` requires
    /// `size_hint` to be exact, and the default implementation would report
    /// `(0, None)`, so delegate to the underlying `vec::IntoIter`.
    fn size_hint(&self) -> (usize, Option<usize>) {
        self.files.size_hint()
    }
}

impl ExactSizeIterator for Remaining {
    /// The number of operands that have not been yielded yet.
    fn len(&self) -> usize {
        let pending = self.files.as_slice();
        pending.len()
    }
}

/// `NextOperand` is the success value of the `Remaining` iterator's `Item`
/// type (`Result<NextOperand>`). For a given file path, the `reader` field is
/// a reader for the file with that path, and `path_display` is the path
/// formatted for use in error messages.
pub struct NextOperand {
    /// The operand's name as it should appear in error messages.
    path_display: String,
    /// Buffered reader over the operand's contents.
    reader: Box<dyn io::BufRead>,
}

/// Build the `NextOperand` for a second or subsequent operand: a buffered
/// reader that sniffs for a BOM, strips it, and decodes UTF-16 input, paired
/// with the name to use in error messages (`<stdin>` when `path` is `-`).
///
/// Fails with a "Can't open file" context when the file cannot be opened.
///
/// I think this results in double-buffering, with one buffer within the
/// `DecodeReaderBytes` value, and another in the `BufReader` that wraps it. I
/// don't know how to work around that.
fn reader_for(path: &Path) -> Result<NextOperand> {
    /// Wrap `source` in the BOM-aware decoder and box the buffered result.
    fn decoding_reader<R: Read + 'static>(source: R) -> Box<dyn io::BufRead> {
        let decoded: DecodeReaderBytes<R, Vec<u8>> = DecodeReaderBytesBuilder::new()
            .bom_sniffing(true)
            .strip_bom(true)
            .utf8_passthru(true)
            .build(source);
        Box::new(io::BufReader::new(decoded))
    }

    if use_stdin(path) {
        return Ok(NextOperand {
            path_display: "<stdin>".to_string(),
            reader: decoding_reader(io::stdin().lock()),
        });
    }

    let path_display = path.display().to_string();
    let file = File::open(path).with_context(|| format!("Can't open file: {path_display}"))?;
    Ok(NextOperand { path_display, reader: decoding_reader(file) })
}
impl LaterOperand for NextOperand {
    /// A convenience wrapper around `bstr::for_byte_line`: calls
    /// `for_each_line` on every line of the operand and never asks to stop
    /// early. A read failure is returned with an "Error reading file" context
    /// naming the operand.
    fn for_byte_line(self, mut for_each_line: impl FnMut(&[u8])) -> Result<()> {
        let Self { path_display, mut reader } = self;
        let outcome = reader.for_byte_line(|line| {
            for_each_line(line);
            // `true` tells `bstr` to keep going to the next line.
            Ok(true)
        });
        outcome.with_context(|| format!("Error reading file: {path_display}"))
    }
}

#[allow(clippy::pedantic)]
#[cfg(test)]
mod test {
    //! Unit tests for `decode_if_utf16`.
    use super::*;

    /// U+FEFF; as a `&str` this is the three-byte UTF-8 encoding of the BOM.
    const UTF8_BOM: &str = "\u{FEFF}";

    /// Prefix `expected` with a UTF-8 BOM, which is what `decode_if_utf16`
    /// produces for input that started with a UTF-16 BOM.
    fn abominate(expected: &str) -> String {
        UTF8_BOM.to_string() + expected
    }

    /// Encode `source` as UTF-16LE, preceded by the little-endian BOM.
    /// Uses `encode_utf16` so that non-ASCII text (including characters that
    /// need surrogate pairs) is encoded correctly.
    fn to_utf_16le(source: &str) -> Vec<u8> {
        let mut result = b"\xff\xfe".to_vec();
        result.extend(source.encode_utf16().flat_map(u16::to_le_bytes));
        result
    }

    /// Encode `source` as UTF-16BE, preceded by the big-endian BOM.
    /// Uses `encode_utf16` so that non-ASCII text (including characters that
    /// need surrogate pairs) is encoded correctly.
    fn to_utf_16be(source: &str) -> Vec<u8> {
        let mut result = b"\xfe\xff".to_vec();
        result.extend(source.encode_utf16().flat_map(u16::to_be_bytes));
        result
    }

    #[test]
    fn utf_16le_is_translated_to_utf8() {
        let expected = "The cute red crab\n jumps over the lazy blue gopher\n";
        assert_eq!(decode_if_utf16(to_utf_16le(expected)), abominate(expected).as_bytes());
    }

    #[test]
    fn utf_16be_is_translated_to_utf8() {
        let expected = "The cute red crab\n jumps over the lazy blue gopher\n";
        assert_eq!(decode_if_utf16(to_utf_16be(expected)), abominate(expected).as_bytes());
    }

    #[test]
    fn non_ascii_utf_16_is_translated_to_utf8() {
        // Covers two-byte and three-byte UTF-8 characters and a surrogate pair.
        let expected = "Crème brûlée — 🦀\n";
        assert_eq!(decode_if_utf16(to_utf_16le(expected)), abominate(expected).as_bytes());
        assert_eq!(decode_if_utf16(to_utf_16be(expected)), abominate(expected).as_bytes());
    }

    #[test]
    fn input_without_utf_16_bom_is_unchanged() {
        let plain = b"plain UTF-8\n".to_vec();
        assert_eq!(decode_if_utf16(plain.clone()), plain);

        // A UTF-8 BOM is not a UTF-16 BOM, so the bytes pass through as they are.
        let with_utf8_bom = abominate("UTF-8 with a BOM\n").into_bytes();
        assert_eq!(decode_if_utf16(with_utf8_bom.clone()), with_utf8_bom);

        assert_eq!(decode_if_utf16(Vec::new()), Vec::<u8>::new());
    }
}