hcklib/
core.rs

//! Core processing module
//!
//! It causes me great pain that I can't figure out how to split these methods up. The fact that we are relying on
//! lifetime coercion to reuse the `shuffler` vector really locks down the possible options.
//!
//! If we go with a dyn trait on the line splitter function it is appreciably slower.
7use crate::{
8    field_range::{FieldRange, RegexOrString},
9    line_parser::LineParser,
10    mmap::MmapChoice,
11    single_byte_delim_parser::SingleByteDelimParser,
12};
13use anyhow::Result;
14use bstr::ByteSlice;
15use flate2::read::MultiGzDecoder;
16use grep_cli::DecompressionReaderBuilder;
17use regex::bytes::Regex;
18use ripline::{
19    line_buffer::{LineBuffer, LineBufferReader},
20    lines::{self, LineIter},
21    LineTerminator,
22};
23use std::{
24    fs::File,
25    io::{self, BufRead, BufReader, Read, Write},
26    path::Path,
27};
28
29const DEFAULT_DELIM: &[u8] = b"\t";
30
/// The input types that `hck` can parse.
///
/// `P` is any path-like type; it is only inspected through [`AsRef<Path>`].
#[derive(Debug)]
pub enum HckInput<P: AsRef<Path>> {
    /// Read from the process's standard input.
    Stdin,
    /// Read from the file located at the wrapped path.
    Path(P),
}
36
37/// The config object for [`Core`].
38#[derive(Debug, Clone)]
39pub struct CoreConfig<'a> {
40    delimiter: &'a [u8],
41    output_delimiter: &'a [u8],
42    line_terminator: LineTerminator,
43    mmap_choice: MmapChoice,
44    is_parser_regex: bool,
45    try_decompress: bool,
46    raw_fields: Option<&'a str>,
47    raw_header_fields: Option<&'a [Regex]>,
48    raw_exclude: Option<&'a str>,
49    raw_exclude_headers: Option<&'a [Regex]>,
50    header_is_regex: bool,
51    parsed_delim: RegexOrString,
52}
53
54impl Default for CoreConfig<'_> {
55    fn default() -> Self {
56        Self {
57            delimiter: DEFAULT_DELIM,
58            output_delimiter: DEFAULT_DELIM,
59            line_terminator: LineTerminator::default(),
60            mmap_choice: unsafe { MmapChoice::auto() },
61            is_parser_regex: false,
62            try_decompress: false,
63            raw_fields: Some("1-"),
64            raw_header_fields: None,
65            raw_exclude: None,
66            raw_exclude_headers: None,
67            header_is_regex: false,
68            parsed_delim: RegexOrString::String(
69                std::str::from_utf8(DEFAULT_DELIM).unwrap().to_string(),
70            ),
71        }
72    }
73}
74
75impl CoreConfig<'_> {
76    /// Get the parsed delimiter
77    pub fn parsed_delim(&self) -> &RegexOrString {
78        &self.parsed_delim
79    }
80
81    /// Read the first line of an input and return it.
82    ///
83    /// It's up to the user to make sure that any consumed bytes are properly handed
84    /// off to the line parsers later on.
85    pub fn peek_first_line<P: AsRef<Path>>(
86        &self,
87        input: &HckInput<P>,
88    ) -> Result<Vec<u8>, io::Error> {
89        let mut buffer = String::new();
90        match input {
91            HckInput::Stdin => {
92                // TODO: work out how to decode just a byte slice
93                if self.try_decompress {
94                    unimplemented!("Header selections not supported when piping gzipped stdin")
95                }
96                io::stdin().read_line(&mut buffer)?;
97            }
98
99            HckInput::Path(path) => {
100                if self.try_decompress {
101                    let reader: Box<dyn Read> = if path
102                        .as_ref()
103                        .to_str()
104                        .map(|p| p.ends_with(".gz"))
105                        .unwrap_or(false)
106                    {
107                        Box::new(MultiGzDecoder::new(File::open(path)?))
108                    } else {
109                        Box::new(
110                            DecompressionReaderBuilder::new()
111                                // .matcher(matcher)
112                                .build(path)?,
113                        )
114                    };
115                    let mut reader = BufReader::new(reader);
116                    reader.read_line(&mut buffer)?;
117                } else {
118                    BufReader::new(File::open(path)?).read_line(&mut buffer)?;
119                }
120            }
121        }
122        Ok(lines::without_terminator(buffer.as_bytes(), self.line_terminator).to_owned())
123    }
124
125    /// Parse the raw user input fields and header fields. Returns any header bytes read and the parsed fields
126    pub fn parse_fields<P>(&self, input: &HckInput<P>) -> Result<(Option<Vec<u8>>, Vec<FieldRange>)>
127    where
128        P: AsRef<Path>,
129    {
130        // Parser the fields in the context of the files being looked at
131        let (mut extra, fields) = match (self.raw_fields, self.raw_header_fields) {
132            (Some(field_list), Some(header_fields)) => {
133                let first_line = self.peek_first_line(input)?;
134                let mut fields = FieldRange::from_list(field_list)?;
135                let header_fields = FieldRange::from_header_list(
136                    header_fields,
137                    first_line.as_bytes(),
138                    &self.parsed_delim,
139                    self.header_is_regex,
140                    false,
141                )?;
142                fields.extend(header_fields);
143                FieldRange::post_process_ranges(&mut fields);
144                (Some(first_line), fields)
145            }
146            (Some(field_list), None) => (None, FieldRange::from_list(field_list)?),
147            (None, Some(header_fields)) => {
148                let first_line = self.peek_first_line(input)?;
149                let fields = FieldRange::from_header_list(
150                    header_fields,
151                    first_line.as_bytes(),
152                    &self.parsed_delim,
153                    self.header_is_regex,
154                    false,
155                )?;
156                (Some(first_line), fields)
157            }
158            (None, None) => (None, FieldRange::from_list("1-")?),
159        };
160
161        let fields = match (&self.raw_exclude, &self.raw_exclude_headers) {
162            (Some(exclude), Some(exclude_header)) => {
163                let exclude = FieldRange::from_list(exclude)?;
164                let fields = FieldRange::exclude(fields, exclude);
165                let first_line = if let Some(first_line) = extra {
166                    first_line
167                } else {
168                    self.peek_first_line(input)?
169                };
170                let exclude_headers = FieldRange::from_header_list(
171                    exclude_header,
172                    first_line.as_bytes(),
173                    &self.parsed_delim,
174                    self.header_is_regex,
175                    true,
176                )?;
177                extra = Some(first_line);
178                FieldRange::exclude(fields, exclude_headers)
179            }
180            (Some(exclude), None) => {
181                let exclude = FieldRange::from_list(exclude)?;
182                FieldRange::exclude(fields, exclude)
183            }
184            (None, Some(exclude_header)) => {
185                let first_line = if let Some(first_line) = extra {
186                    first_line
187                } else {
188                    self.peek_first_line(input)?
189                };
190                let exclude_headers = FieldRange::from_header_list(
191                    exclude_header,
192                    first_line.as_bytes(),
193                    &self.parsed_delim,
194                    self.header_is_regex,
195                    true,
196                )?;
197                extra = Some(first_line);
198                FieldRange::exclude(fields, exclude_headers)
199            }
200            (None, None) => fields,
201        };
202        Ok((extra, fields))
203    }
204}
205
206/// A builder for the [`CoreConfig`] which drives [`Core`].
207#[derive(Clone, Debug)]
208pub struct CoreConfigBuilder<'a> {
209    config: CoreConfig<'a>,
210}
211
212impl<'a> CoreConfigBuilder<'a> {
213    pub fn new() -> Self {
214        Self {
215            config: CoreConfig::default(),
216        }
217    }
218
219    pub fn build(mut self) -> Result<CoreConfig<'a>> {
220        let delim = if self.config.is_parser_regex {
221            RegexOrString::Regex(Regex::new(self.config.delimiter.to_str()?)?)
222        } else {
223            let unescaped =
224                std::str::from_utf8(&grep_cli::unescape(self.config.delimiter.to_str()?))?
225                    .to_string();
226            RegexOrString::String(unescaped)
227        };
228        self.config.parsed_delim = delim;
229        Ok(self.config)
230    }
231
232    /// The substr to split lines on.
233    pub fn delimiter(mut self, delim: &'a [u8]) -> Self {
234        self.config.delimiter = delim;
235        self
236    }
237
238    /// The substr to use as the output delimiter
239    pub fn output_delimiter(mut self, delim: &'a [u8]) -> Self {
240        self.config.output_delimiter = delim;
241        self
242    }
243
244    /// The line terminator to use when looking for linebreaks and stripping linebreach chars.
245    pub fn line_terminator(mut self, term: LineTerminator) -> Self {
246        self.config.line_terminator = term;
247        self
248    }
249
250    /// Whether or not to try to use mmap mode
251    pub fn mmap(mut self, mmap_choice: MmapChoice) -> Self {
252        self.config.mmap_choice = mmap_choice;
253        self
254    }
255
256    /// Whether or not the parser is a regex
257    #[allow(clippy::wrong_self_convention)]
258    pub fn is_regex_parser(mut self, is_regex: bool) -> Self {
259        self.config.is_parser_regex = is_regex;
260        self
261    }
262
263    /// Try to decompress an input file
264    pub fn try_decompress(mut self, try_decompress: bool) -> Self {
265        self.config.try_decompress = try_decompress;
266        self
267    }
268
269    /// The raw user input fields to output
270    pub fn fields(mut self, fields: Option<&'a str>) -> Self {
271        self.config.raw_fields = fields;
272        self
273    }
274
275    /// The raw user input header to output
276    pub fn headers(mut self, headers: Option<&'a [Regex]>) -> Self {
277        self.config.raw_header_fields = headers;
278        self
279    }
280
281    /// The raw user input fields to exclude
282    pub fn exclude(mut self, exclude: Option<&'a str>) -> Self {
283        self.config.raw_exclude = exclude;
284        self
285    }
286
287    /// The raw user input headers to exclude
288    pub fn exclude_headers(mut self, exclude_headers: Option<&'a [Regex]>) -> Self {
289        self.config.raw_exclude_headers = exclude_headers;
290        self
291    }
292
293    /// Whether or not to treat the headers as regex
294    pub fn header_is_regex(mut self, header_is_regex: bool) -> Self {
295        self.config.header_is_regex = header_is_regex;
296        self
297    }
298}
299
300impl Default for CoreConfigBuilder<'_> {
301    fn default() -> Self {
302        Self::new()
303    }
304}
305
306/// The main processing loop
307pub struct Core<'a, L> {
308    /// The [`CoreConfig`] object that determines how [`Core`] is run
309    config: &'a CoreConfig<'a>,
310    /// The [`FieldRange`]'s to keep, in the order to output them
311    fields: &'a [FieldRange],
312    /// The reusable line parse that defines how to parse a line (regex or substr).
313    line_parser: L,
314    /// The reusable line buffer that holds bytes from reads
315    line_buffer: &'a mut LineBuffer,
316}
317
318impl<'a, L> Core<'a, L>
319where
320    L: LineParser<'a>,
321{
322    /// Create a new "core" the can be used to parse multiple inputs
323    pub fn new(
324        config: &'a CoreConfig,
325        fields: &'a [FieldRange],
326        line_parser: L,
327        line_buffer: &'a mut LineBuffer,
328    ) -> Self {
329        Self {
330            config,
331            fields,
332            line_parser,
333            line_buffer,
334        }
335    }
336
337    /// Check if no reordering of fields is happening
338    #[inline]
339    fn are_fields_pos_sorted(&self) -> bool {
340        let mut test = 0;
341        for field in self.fields {
342            if field.pos < test {
343                return false;
344            }
345            test = field.pos
346        }
347        true
348    }
349
350    /// Check if we can run in `fast mode`.
351    ///
352    /// delimiter is 1 byte, newline is 1 bytes, and we are not using a regex
353    fn allow_fastmode(&self) -> bool {
354        self.config.delimiter.len() == 1
355            && self.config.line_terminator.as_bytes().len() == 1
356            && !self.config.is_parser_regex
357            && self.are_fields_pos_sorted()
358    }
359
360    pub fn hck_input<P, W>(
361        &mut self,
362        input: HckInput<P>,
363        mut output: W,
364        header: Option<Vec<u8>>,
365    ) -> Result<(), io::Error>
366    where
367        P: AsRef<Path>,
368        W: Write,
369    {
370        // Dispatch to a given `hck_*` runner depending on configuration
371        match input {
372            HckInput::Stdin => {
373                if let Some(header) = header {
374                    self.hck_bytes(header.as_bytes(), &mut output)?;
375                }
376                let reader: Box<dyn Read> = if self.config.try_decompress {
377                    Box::new(MultiGzDecoder::new(io::stdin()))
378                } else {
379                    Box::new(io::stdin())
380                };
381                if self.allow_fastmode() {
382                    self.hck_reader_fast(reader, &mut output)
383                } else {
384                    self.hck_reader(reader, &mut output)
385                }
386            }
387            HckInput::Path(path) => {
388                if self.config.try_decompress {
389                    let reader: Box<dyn Read> = if path
390                        .as_ref()
391                        .to_str()
392                        .map(|p| p.ends_with(".gz"))
393                        .unwrap_or(false)
394                    {
395                        Box::new(MultiGzDecoder::new(File::open(&path)?))
396                    } else {
397                        Box::new(
398                            DecompressionReaderBuilder::new()
399                                // .matcher(matcher)
400                                .build(&path)?,
401                        )
402                    };
403                    if self.allow_fastmode() {
404                        self.hck_reader_fast(reader, &mut output)
405                    } else {
406                        self.hck_reader(reader, &mut output)
407                    }
408                } else {
409                    let file = File::open(&path)?;
410                    if let Some(mmap) = self.config.mmap_choice.open(&file, Some(&path)) {
411                        if self.allow_fastmode() {
412                            self.hck_bytes_fast(mmap.as_bytes(), &mut output)
413                        } else {
414                            self.hck_bytes(mmap.as_bytes(), &mut output)
415                        }
416                    } else if self.allow_fastmode() {
417                        self.hck_reader_fast(file, &mut output)
418                    } else {
419                        self.hck_reader(file, &mut output)
420                    }
421                }
422            }
423        }
424    }
425
426    /// Iterate over the lines in a slice of bytes.
427    ///
428    /// The input slice of bytes is assumed to end in a newline.
429    #[allow(clippy::missing_transmute_annotations)]
430    pub fn hck_bytes<W>(&mut self, bytes: &[u8], mut output: W) -> Result<(), io::Error>
431    where
432        W: Write,
433    {
434        let iter = LineIter::new(self.config.line_terminator.as_byte(), bytes.as_bytes());
435        let mut shuffler: Vec<Vec<&'static [u8]>> =
436            vec![vec![]; self.fields.iter().map(|f| f.pos).max().unwrap() + 1];
437        for line in iter {
438            let mut s: Vec<Vec<&[u8]>> = shuffler;
439            self.line_parser.parse_line(
440                lines::without_terminator(line, self.config.line_terminator),
441                &mut s,
442            );
443            let items = s.iter_mut().flat_map(|s| s.drain(..));
444            output.join_append(
445                self.config.output_delimiter,
446                items,
447                &self.config.line_terminator,
448            )?;
449            shuffler = unsafe { core::mem::transmute(s) };
450        }
451        Ok(())
452    }
453
454    /// Fast mode iteration over lines in a slice of bytes.
455    ///
456    /// This expects the seperator to be a single byte and the newline to be a singel byte.
457    ///
458    /// Instead of  seaching for linebreaks, then splitting up the line on the `sep`,
459    /// fast mode looks for either `sep` or `newline` at the same time, so instead of two passes
460    /// over the bytes we only make one pass.
461    pub fn hck_bytes_fast<W: Write>(&mut self, bytes: &[u8], output: W) -> Result<(), io::Error> {
462        let mut buffer_parser = SingleByteDelimParser::new(
463            self.config.line_terminator,
464            self.config.output_delimiter,
465            self.fields,
466            self.config.delimiter[0],
467        );
468        buffer_parser.process_buffer(bytes, output)?;
469        Ok(())
470    }
471
472    /// Fast mode iteration over lines in a reader.
473    ///
474    /// This expects the separator to be a single byte and the newline to be a single byte.
475    ///
476    /// Instead of  seaching for linebreaks, then splitting up the line on the `sep`,
477    /// fast mode looks for either `sep` or `newline` at the same time, so instead of two passes
478    /// over the bytes we only make one pass.
479    pub fn hck_reader_fast<R: Read, W: Write>(
480        &mut self,
481        reader: R,
482        mut output: W,
483    ) -> Result<(), io::Error> {
484        let mut reader = LineBufferReader::new(reader, self.line_buffer);
485        let mut buffer_parser = SingleByteDelimParser::new(
486            self.config.line_terminator,
487            self.config.output_delimiter,
488            self.fields,
489            self.config.delimiter[0],
490        );
491
492        while reader.fill()? {
493            buffer_parser.process_buffer(reader.buffer(), &mut output)?;
494            buffer_parser.reset();
495            reader.consume(reader.buffer().len());
496        }
497        Ok(())
498    }
499
500    /// Process lines from a reader.
501    #[allow(clippy::missing_transmute_annotations)]
502    pub fn hck_reader<R: Read, W: Write>(
503        &mut self,
504        reader: R,
505        mut output: W,
506    ) -> Result<(), io::Error> {
507        let mut reader = LineBufferReader::new(reader, self.line_buffer);
508        let mut shuffler: Vec<Vec<&'static [u8]>> =
509            vec![vec![]; self.fields.iter().map(|f| f.pos).max().unwrap() + 1];
510        while reader.fill()? {
511            let iter = LineIter::new(self.config.line_terminator.as_byte(), reader.buffer());
512
513            for line in iter {
514                let mut s: Vec<Vec<&[u8]>> = shuffler;
515                self.line_parser.parse_line(
516                    lines::without_terminator(line, self.config.line_terminator),
517                    &mut s,
518                );
519
520                let items = s.iter_mut().flat_map(|s| s.drain(..));
521                output.join_append(
522                    self.config.output_delimiter,
523                    items,
524                    &self.config.line_terminator,
525                )?;
526                shuffler = unsafe { core::mem::transmute(s) };
527            }
528            reader.consume(reader.buffer().len());
529        }
530        Ok(())
531    }
532}
533
534/// A trait for adding `join_append` to a writer.
535pub trait JoinAppend {
536    /// Given an input iterator of items, write them with a serparator and a newline.
537    fn join_append<'b>(
538        &mut self,
539        sep: &[u8],
540        items: impl Iterator<Item = &'b [u8]>,
541        term: &LineTerminator,
542    ) -> Result<(), io::Error>;
543}
544
545/// [`JoinAppend`] for [`Write`].
546impl<W: Write> JoinAppend for W {
547    /// Given an input iterator of items, write them with a serparator and a newline.
548    #[inline(always)]
549    fn join_append<'b>(
550        &mut self,
551        sep: &[u8],
552        mut items: impl Iterator<Item = &'b [u8]>,
553        term: &LineTerminator,
554    ) -> Result<(), io::Error> {
555        if let Some(item) = items.next() {
556            self.write_all(item)?;
557        }
558
559        for item in items {
560            self.write_all(sep)?;
561            self.write_all(item)?;
562        }
563        self.write_all(term.as_bytes())?;
564        Ok(())
565    }
566}