zet/
operands.rs

1//! Provides the `first_and_rest` function, which returns a `Vec<u8>` containing
2//! the contents of the first operand and an `ExactSizeIterator` over the
3//! remaining operands. *Note:* this different treatment of the first and
4//! remaining operands has the unfortunate result of requiring different code
5//! paths for translating UTF16 files into UTF8. That currently seems worth the
6//! cost.
7use crate::set::LaterOperand;
8use anyhow::{Context, Result};
9use bstr::io::BufReadExt;
10use encoding_rs_io::{DecodeReaderBytes, DecodeReaderBytesBuilder};
11use std::{
12    fs,
13    fs::File,
14    io::{self, Read},
15    ops::FnMut,
16    path::{Path, PathBuf},
17};
18
/// Report whether `path` is the conventional Unix placeholder for standard
/// input, i.e. exactly the single character `-`.
fn use_stdin(path: &Path) -> bool {
    // Compare the raw OS string: only the exact one-character name `-`
    // matches, so `./-`, `--` and the empty path do not.
    path.as_os_str() == "-"
}
23/// Return the contents of the first file named in `files` as a `Vec<u8>`, and
24/// an `ExactSizeIterator` over the subsequent arguments.
25#[must_use]
26pub fn first_and_rest(files: &[PathBuf]) -> Option<(Result<Vec<u8>>, Remaining)> {
27    fn all_of_stdin() -> Result<Vec<u8>> {
28        let mut buffer = Vec::new();
29        io::stdin().read_to_end(&mut buffer).context("Can't read file: <stdin>")?;
30        Ok(decode_if_utf16(buffer))
31    }
32
33    match files {
34        [] => None,
35        [first, rest @ ..] => {
36            let first_operand = if use_stdin(first) {
37                all_of_stdin()
38            } else {
39                fs::read(first)
40                    .with_context(|| format!("Can't read file: {}", first.display()))
41                    .map(decode_if_utf16)
42            };
43            let rest = rest.to_vec();
44            Some((first_operand, Remaining::from(rest)))
45        }
46    }
47}
48
49/// Decode UTF-16 to UTF-8 if we see a UTF-16 Byte Order Mark at the beginning of `candidate`.
50/// Otherwise return `candidate` unchanged
51fn decode_if_utf16(candidate: Vec<u8>) -> Vec<u8> {
52    // Translate UTF16 to UTF8
53    // Note: `decode_without_bom_handling` will change malformed sequences to the
54    // Unicode REPLACEMENT CHARACTER. Should we report an error instead?
55    //
56    // "with BOM handling" means that the UTF-16 BOM is translated to a UTF-8 BOM
57    //
58    if let Some((enc, _)) = encoding_rs::Encoding::for_bom(&candidate) {
59        if [encoding_rs::UTF_16LE, encoding_rs::UTF_16BE].contains(&enc) {
60            let (translated, _had_malformed_sequences) =
61                enc.decode_without_bom_handling(&candidate);
62            return translated.into_owned().into_bytes();
63        }
64    }
65    candidate
66}
67
/// The first operand is read into memory in its entirety, but that's not
/// efficient for the second and subsequent operands.  The `Remaining`
/// structure is an `ExactSizeIterator` over those operands.
///
/// `Debug` is derived so that values of this public type can be inspected in
/// logs and assertion failures; the output shows the paths not yet visited.
#[derive(Debug)]
pub struct Remaining {
    /// Owning iterator over the paths of the operands not yet yielded.
    files: std::vec::IntoIter<PathBuf>,
}
74
75impl From<Vec<PathBuf>> for Remaining {
76    fn from(files: Vec<PathBuf>) -> Self {
77        Remaining { files: files.into_iter() }
78    }
79}
80
81impl Iterator for Remaining {
82    type Item = Result<NextOperand>;
83    fn next(&mut self) -> Option<Self::Item> {
84        self.files.next().map(|path| reader_for(&path))
85    }
86}
87
88impl ExactSizeIterator for Remaining {
89    fn len(&self) -> usize {
90        self.files.len()
91    }
92}
93
/// `NextOperand` is the `Item` type for the `Remaining` iterator. For a given
/// file path, the `reader` field is a reader for the file with that path, and
/// `path_display` is the path formatted for use in error messages.
pub struct NextOperand {
    /// Human-readable name of the operand: the displayed path, or `<stdin>`
    /// when the operand is standard input.
    path_display: String,
    /// Buffered source of the operand's bytes; boxed so that file-backed and
    /// stdin-backed readers share one type.
    reader: Box<dyn io::BufRead>,
}
101
102/// The reader for a second or subsequent operand is a buffered reader with the
103/// ability to decode UTF-16 files. I think this results in double-buffering,
104/// with one buffer within the `DecodeReaderBytes` value, and another in the
105/// `BufReader` that wraps it. I don't know how to work around that.
106#[allow(trivial_casts)]
107fn reader_for(path: &Path) -> Result<NextOperand> {
108    fn decoder<R: Read>(f: R) -> DecodeReaderBytes<R, Vec<u8>> {
109        DecodeReaderBytesBuilder::new()
110            .bom_sniffing(true)
111            .strip_bom(true)
112            .utf8_passthru(true)
113            .build(f)
114    }
115    let (path_display, reader) = if use_stdin(path) {
116        let path_display = "<stdin>".to_string();
117        let reader = decoder(io::stdin().lock());
118        (path_display, Box::new(io::BufReader::new(reader)) as Box<dyn io::BufRead>)
119    } else {
120        let path_display = format!("{}", path.display());
121        let reader =
122            decoder(File::open(path).with_context(|| format!("Can't open file: {path_display}"))?);
123        (path_display, Box::new(io::BufReader::new(reader)) as Box<dyn io::BufRead>)
124    };
125    Ok(NextOperand { path_display, reader })
126}
127impl LaterOperand for NextOperand {
128    /// A convenience wrapper around `bstr::for_byte_line`
129    fn for_byte_line(self, mut for_each_line: impl FnMut(&[u8])) -> Result<()> {
130        let NextOperand { mut reader, path_display } = self;
131        reader
132            .for_byte_line(|line| {
133                for_each_line(line);
134                Ok(true)
135            })
136            .with_context(|| format!("Error reading file: {path_display}"))?;
137        Ok(())
138    }
139}
140
#[allow(clippy::pedantic)]
#[cfg(test)]
mod test {
    use super::*;

    /// U+FEFF, which encodes in UTF-8 as the "UTF-8 BOM".
    const UTF8_BOM: &str = "\u{FEFF}";

    /// Prefix `expected` with a BOM. `decode_if_utf16` keeps the BOM of its
    /// input, so decoded output is expected to start with one.
    fn abominate(expected: &str) -> String {
        [UTF8_BOM, expected].concat()
    }

    /// Encode `source` as little-endian UTF-16 preceded by the LE BOM. Going
    /// through `encode_utf16` keeps this correct for non-ASCII text too.
    fn to_utf_16le(source: &str) -> Vec<u8> {
        let mut result = b"\xff\xfe".to_vec();
        result.extend(source.encode_utf16().flat_map(u16::to_le_bytes));
        result
    }

    /// Encode `source` as big-endian UTF-16 preceded by the BE BOM. Going
    /// through `encode_utf16` keeps this correct for non-ASCII text too.
    fn to_utf_16be(source: &str) -> Vec<u8> {
        let mut result = b"\xfe\xff".to_vec();
        result.extend(source.encode_utf16().flat_map(u16::to_be_bytes));
        result
    }

    #[test]
    fn utf_16le_is_translated_to_utf8() {
        let expected = "The cute red crab\n jumps over the lazy blue gopher\n";
        assert_eq!(decode_if_utf16(to_utf_16le(expected)), abominate(expected).as_bytes());
    }

    #[test]
    fn utf_16be_is_translated_to_utf8() {
        let expected = "The cute red crab\n jumps over the lazy blue gopher\n";
        assert_eq!(decode_if_utf16(to_utf_16be(expected)), abominate(expected).as_bytes());
    }

    #[test]
    fn non_ascii_utf_16_is_translated_to_utf8() {
        // Covers a two-byte UTF-8 character and a surrogate pair.
        let expected = "na\u{EF}ve crab \u{1F980}\n";
        assert_eq!(decode_if_utf16(to_utf_16le(expected)), abominate(expected).as_bytes());
        assert_eq!(decode_if_utf16(to_utf_16be(expected)), abominate(expected).as_bytes());
    }

    #[test]
    fn input_without_utf_16_bom_is_unchanged() {
        let plain = b"no byte order mark here\n".to_vec();
        assert_eq!(decode_if_utf16(plain.clone()), plain);

        let with_utf8_bom = abominate("already UTF-8\n").into_bytes();
        assert_eq!(decode_if_utf16(with_utf8_bom.clone()), with_utf8_bom);

        assert_eq!(decode_if_utf16(Vec::new()), Vec::<u8>::new());
    }
}