hcklib/
single_byte_delim_parser.rs

//! [`SingleByteDelimParser`] is a fast mode parser that is to be used when the
//! field separator character is a single byte. It works by using `memchr2` to
//! first look for both the line terminator and the separator in a single pass.
//! Once the furthest right field has been parsed it switches to searching for
//! just newlines.
6use std::{
7    cmp::min,
8    io::{self, Write},
9};
10
11use ripline::LineTerminator;
12
13use crate::{core::JoinAppend, field_range::FieldRange};
14
15/// A `SingleByteDelimParser` is a fast parser of fields from from a buffer.
16pub struct SingleByteDelimParser<'a> {
17    /// newline aligned buffer, must end in newline
18    line_terminator: LineTerminator,
19    output_delimiter: &'a [u8],
20    fields: &'a [FieldRange],
21    sep: u8,
22    /// The furthers right field
23    max_field: usize,
24    /// Current offset into the buffer
25    offset: usize,
26    newline: u8,
27    line: Vec<(usize, usize)>,
28}
29
30impl<'a> SingleByteDelimParser<'a> {
31    /// Create a [`SingleByteDelimParser`] to process buffers using the input configuration.
32    pub fn new(
33        line_terminator: LineTerminator,
34        output_delimiter: &'a [u8],
35        fields: &'a [FieldRange],
36        sep: u8,
37    ) -> Self {
38        Self {
39            line_terminator,
40            output_delimiter,
41            fields,
42            sep,
43            max_field: fields.last().map_or(usize::MAX, |f| f.high + 1),
44            offset: 0,
45            newline: line_terminator.as_byte(),
46            line: vec![],
47        }
48    }
49
50    /// Clear all fields of the [`SingleByteDelimParser`].
51    #[inline]
52    pub fn reset(&mut self) {
53        self.offset = 0;
54    }
55
56    /// Parse fields from the lines found in buffer and write them to `output`.
57    ///
58    /// **Note** The input buffer _must_ end with a newline.
59    #[inline]
60    pub fn process_buffer<W: Write>(
61        &mut self,
62        buffer: &[u8],
63        mut output: W,
64    ) -> Result<(), io::Error> {
65        // Advance pasts first newline
66        if let Some(byte) = buffer.first() {
67            if *byte == self.newline {
68                output.join_append(
69                    self.output_delimiter,
70                    std::iter::empty(),
71                    &self.line_terminator,
72                )?;
73                self.offset += 1;
74            }
75        }
76
77        while self.offset < buffer.len() {
78            self.fill_line(buffer)?;
79            let items = self.fields.iter().flat_map(|f| {
80                let slice = self
81                    .line
82                    .get(f.low..=min(f.high, self.line.len().saturating_sub(1)))
83                    .unwrap_or(&[]);
84                slice.iter().map(|(start, stop)| &buffer[*start..=*stop])
85            });
86            output.join_append(self.output_delimiter, items, &self.line_terminator)?;
87            self.line.clear();
88        }
89        Ok(())
90    }
91
92    /// Fill `line` with the start/end positions of found columns
93    /// The positions are relative to the held buffer
94    #[inline]
95    fn fill_line(&mut self, buffer: &[u8]) -> Result<(), io::Error> {
96        let mut field_count = 0;
97        let iter = memchr::memchr2_iter(self.sep, self.newline, &buffer[self.offset..]);
98
99        let mut line_offset = 0;
100        let mut found_newline = false;
101
102        for index in iter {
103            if buffer[self.offset + index] == self.sep {
104                field_count += 1;
105            } else {
106                found_newline = true;
107            }
108
109            self.line
110                .push((self.offset + line_offset, self.offset + index - 1));
111            line_offset = index + 1;
112
113            if found_newline || field_count == self.max_field {
114                break;
115            }
116        }
117
118        if !found_newline {
119            let end = memchr::memchr(self.newline, &buffer[self.offset + line_offset..])
120                .ok_or(io::ErrorKind::InvalidData)?;
121            self.offset += line_offset + end + 1;
122        } else {
123            self.offset += line_offset;
124        }
125        Ok(())
126    }
127}