1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
use crate::field_range::FieldRange;
use bstr::ByteSlice;
use regex::bytes::Regex;
use std::cmp::max;

/// Methods for parsing a line into a reordered `shuffler`
///
/// `'a` is a lifetime carried by the implementing parser (the implementations in this file use
/// it for the field ranges and delimiter they borrow).
pub trait LineParser<'a> {
    /// Fills the shuffler with values parsed from the line.
    ///
    /// * `line` - raw bytes of the line to split into fields.
    /// * `shuffler` - output buckets; every slice stored in them carries the lifetime `'b` of
    ///   `line`.
    ///
    /// The `'a: 'b` bound requires the parser's lifetime `'a` to outlive the borrow of `line`.
    fn parse_line<'b>(&self, line: &'b [u8], shuffler: &mut Vec<Vec<&'b [u8]>>)
    where
        'a: 'b;
}

/// A line parser that works on fixed substrings
///
/// Lines are split on a literal byte-string delimiter.
pub struct SubStrLineParser<'a> {
    /// Field ranges to extract; `parse_line` walks them in slice order.
    field_ranges: &'a [FieldRange],
    /// Literal byte sequence that separates the fields of a line.
    delimiter: &'a [u8],
}

impl<'a> SubStrLineParser<'a> {
    pub fn new(field_ranges: &'a [FieldRange], delimiter: &'a [u8]) -> Self {
        Self {
            field_ranges,
            delimiter,
        }
    }
}
impl<'a> LineParser<'a> for SubStrLineParser<'a> {
    /// Splits `line` on the literal delimiter and pushes the selected fields into `shuffler`.
    ///
    /// Each `FieldRange { low, high, pos }` is treated as an inclusive, zero-based span of field
    /// indices (`low..=high`). Fields inside the span are appended to `shuffler[pos]` in the
    /// order they occur in the line. If `pos` is not a valid index into `shuffler`, the field is
    /// consumed but not stored.
    ///
    /// The line is traversed once, front to back. A range that starts before the current
    /// position only yields the fields that have not been consumed yet, so the ranges are
    /// presumably expected to be sorted by `low` (NOTE(review): confirm against the code that
    /// builds `field_ranges`).
    ///
    /// Parsing stops as soon as the line runs out of fields.
    #[inline]
    fn parse_line<'b>(&self, line: &'b [u8], shuffler: &mut Vec<Vec<&'b [u8]>>)
    where
        'a: 'b,
    {
        let mut parts = line.split_str(self.delimiter);
        // Index of the field that the next call to `parts.next()` will yield.
        let mut cursor = 0;

        // Iterate over our ranges and write any fields that are contained by them.
        for &FieldRange { low, high, pos } in self.field_ranges {
            // First field of this range that has not already been consumed.
            let first = max(low, cursor);

            // Skip the fields between the cursor and the start of the range.
            // `nth(n)` consumes `n + 1` items, hence the `- 1`.
            if first > cursor {
                if parts.nth(first - cursor - 1).is_none() {
                    // The line ended before this range; later ranges cannot match either.
                    return;
                }
                cursor = first;
            }

            // Collect every field of the range that is present in the line.
            for _ in first..=high {
                match parts.next() {
                    Some(part) => {
                        // Expected to be in range since shuffler is created based on field pos
                        if let Some(bucket) = shuffler.get_mut(pos) {
                            bucket.push(part);
                        }
                    }
                    // Line exhausted: nothing left for this or any following range.
                    None => return,
                }
                cursor += 1;
            }
        }
    }
}

/// A line parser that splits lines on a regular-expression delimiter
pub struct RegexLineParser<'a> {
    /// Field ranges to extract; `parse_line` walks them in slice order.
    field_ranges: &'a [FieldRange],
    /// Compiled byte regex whose matches separate the fields of a line.
    delimiter: &'a Regex,
}

impl<'a> RegexLineParser<'a> {
    pub fn new(field_ranges: &'a [FieldRange], delimiter: &'a Regex) -> Self {
        Self {
            field_ranges,
            delimiter,
        }
    }
}
impl<'a> LineParser<'a> for RegexLineParser<'a> {
    /// Splits `line` on matches of the delimiter regex and pushes the selected fields into
    /// `shuffler`.
    ///
    /// Each `FieldRange { low, high, pos }` is treated as an inclusive, zero-based span of field
    /// indices (`low..=high`). Fields inside the span are appended to `shuffler[pos]` in the
    /// order they occur in the line. If `pos` is not a valid index into `shuffler`, the field is
    /// consumed but not stored; this method does not panic on a too-short `shuffler`.
    ///
    /// The line is traversed once, front to back. A range that starts before the current
    /// position only yields the fields that have not been consumed yet, so the ranges are
    /// presumably expected to be sorted by `low` (NOTE(review): confirm against the code that
    /// builds `field_ranges`).
    ///
    /// Parsing stops as soon as the line runs out of fields.
    #[inline]
    fn parse_line<'b>(&self, line: &'b [u8], shuffler: &mut Vec<Vec<&'b [u8]>>)
    where
        'a: 'b,
    {
        let mut parts = self.delimiter.split(line);
        // Index of the field that the next call to `parts.next()` will yield.
        let mut cursor = 0;

        // Iterate over our ranges and write any fields that are contained by them.
        for &FieldRange { low, high, pos } in self.field_ranges {
            // First field of this range that has not already been consumed.
            let first = max(low, cursor);

            // Skip the fields between the cursor and the start of the range.
            // `nth(n)` consumes `n + 1` items, hence the `- 1`.
            if first > cursor {
                if parts.nth(first - cursor - 1).is_none() {
                    // The line ended before this range; later ranges cannot match either.
                    return;
                }
                cursor = first;
            }

            // Collect every field of the range that is present in the line.
            for _ in first..=high {
                match parts.next() {
                    Some(part) => {
                        // Expected to be in range since shuffler is created based on field pos.
                        // `shuffler` is supplied by the caller, so an out-of-range `pos` is
                        // skipped rather than treated as unreachable.
                        if let Some(bucket) = shuffler.get_mut(pos) {
                            bucket.push(part);
                        }
                    }
                    // Line exhausted: nothing left for this or any following range.
                    None => return,
                }
                cursor += 1;
            }
        }
    }
}