hcklib/
line_parser.rs

1use crate::field_range::FieldRange;
2use bstr::ByteSlice;
3use regex::bytes::Regex;
4use std::cmp::max;
5
/// Common interface for the line parsers in this module.
///
/// An implementor splits a single input line into fields and distributes the
/// fields it selects across the slots of a caller-provided `shuffler`.
pub trait LineParser<'a> {
    /// Parses `line` and appends the extracted fields to `shuffler`.
    ///
    /// The slices pushed into `shuffler` borrow from `line` (lifetime `'b`);
    /// the `'a: 'b` bound requires the parser's own borrows to outlive it.
    fn parse_line<'b>(&self, line: &'b [u8], shuffler: &mut Vec<Vec<&'b [u8]>>)
    where
        'a: 'b;
}
13
14/// A line parser that works on fixed substrings
15pub struct SubStrLineParser<'a> {
16    field_ranges: &'a [FieldRange],
17    delimiter: &'a [u8],
18}
19
20impl<'a> SubStrLineParser<'a> {
21    pub fn new(field_ranges: &'a [FieldRange], delimiter: &'a [u8]) -> Self {
22        Self {
23            field_ranges,
24            delimiter,
25        }
26    }
27}
28impl<'a> LineParser<'a> for SubStrLineParser<'a> {
29    #[inline]
30    fn parse_line<'b>(&self, line: &'b [u8], shuffler: &mut Vec<Vec<&'b [u8]>>)
31    where
32        'a: 'b,
33    {
34        let mut parts = line.split_str(self.delimiter).peekable();
35        let mut iterator_index = 0;
36
37        // Iterate over our ranges and write any fields that are contained by them.
38        for &FieldRange { low, high, pos } in self.field_ranges {
39            // Advance up to low end of range
40            if low > iterator_index {
41                match parts.nth(low - iterator_index - 1) {
42                    Some(_part) => {
43                        iterator_index = low;
44                    }
45                    None => break,
46                }
47            }
48
49            // Advance through the range
50            for _ in max(low, iterator_index)..=high {
51                match parts.next() {
52                    Some(part) => {
53                        // Guaranteed to be in range since shuffler is created based on field pos anyways
54                        if let Some(reshuffled_range) = shuffler.get_mut(pos) {
55                            reshuffled_range.push(part)
56                        }
57                    }
58                    None => break,
59                }
60                iterator_index += 1;
61            }
62        }
63    }
64}
65
66/// A line parser that works on fixed substrings
67pub struct RegexLineParser<'a> {
68    field_ranges: &'a [FieldRange],
69    delimiter: &'a Regex,
70}
71
72impl<'a> RegexLineParser<'a> {
73    pub fn new(field_ranges: &'a [FieldRange], delimiter: &'a Regex) -> Self {
74        Self {
75            field_ranges,
76            delimiter,
77        }
78    }
79}
80impl<'a> LineParser<'a> for RegexLineParser<'a> {
81    #[inline]
82    fn parse_line<'b>(&self, line: &'b [u8], shuffler: &mut Vec<Vec<&'b [u8]>>)
83    where
84        'a: 'b,
85    {
86        let mut parts = self.delimiter.split(line).peekable();
87        let mut iterator_index = 0;
88
89        // Iterate over our ranges and write any fields that are contained by them.
90        for &FieldRange { low, high, pos } in self.field_ranges {
91            // Advance up to low end of range
92            if low > iterator_index {
93                match parts.nth(low - iterator_index - 1) {
94                    Some(_part) => {
95                        iterator_index = low;
96                    }
97                    None => break,
98                }
99            }
100
101            // Advance through the range
102            for _ in max(low, iterator_index)..=high {
103                match parts.next() {
104                    Some(part) => {
105                        // Guaranteed to be in range since shuffler is created based on field pos anyways
106                        if let Some(reshuffled_range) = shuffler.get_mut(pos) {
107                            reshuffled_range.push(part)
108                        } else {
109                            unreachable!()
110                        }
111                    }
112                    None => break,
113                }
114                iterator_index += 1;
115            }
116        }
117    }
118}