hcklib/
field_range.rs

1//! Parse ranges like `-2,5,8-10,13-`.
2//!
3//! # Examples
4//!
5//! TODO
6
7use bstr::ByteSlice;
8use regex::bytes::Regex;
9use std::{cmp::max, collections::VecDeque, str::FromStr};
10use thiserror::Error;
11
12/// The fartest right possible field
13const MAX: usize = usize::MAX;
14
15/// Errors for parsing / validating [`FieldRange`] strings.
16#[derive(Error, Debug, PartialEq)]
17pub enum FieldError {
18    #[error("Header not found: {0}")]
19    HeaderNotFound(String),
20    #[error("Fields and positions are numbered from 1: {0}")]
21    InvalidField(usize),
22    #[error("High end of range less than low end of range: {0}-{1}")]
23    InvalidOrder(usize, usize),
24    #[error("Failed to parse field: {0}")]
25    FailedParse(String),
26    #[error("No headers matched")]
27    NoHeadersMatched,
28}
29
30#[derive(Debug, Clone)]
31pub enum RegexOrString {
32    Regex(Regex),
33    String(String),
34}
35
36impl RegexOrString {
37    fn split<'a>(&'a self, line: &'a [u8]) -> Box<dyn Iterator<Item = &'a [u8]> + 'a> {
38        match self {
39            RegexOrString::Regex(r) => Box::new(r.split(line)),
40            RegexOrString::String(s) => Box::new(line.split_str(s)),
41        }
42    }
43}
44
45/// Represent a range of columns to keep.
46#[derive(PartialEq, Eq, PartialOrd, Ord, Debug, Copy, Clone)]
47pub struct FieldRange {
48    pub low: usize,
49    pub high: usize,
50    // The initial position of this range in the user input
51    pub pos: usize,
52}
53
54impl FromStr for FieldRange {
55    type Err = FieldError;
56
57    /// Convert a [`str`] into a [`FieldRange`]
58    fn from_str(s: &str) -> Result<FieldRange, FieldError> {
59        let mut parts = s.splitn(2, '-');
60
61        match (parts.next(), parts.next()) {
62            (Some(nm), None) => {
63                if let Ok(nm) = nm.parse::<usize>() {
64                    if nm > 0 {
65                        Ok(FieldRange {
66                            low: nm - 1,
67                            high: nm - 1,
68                            pos: 0,
69                        })
70                    } else {
71                        Err(FieldError::InvalidField(nm))
72                    }
73                } else {
74                    Err(FieldError::FailedParse(nm.to_owned()))
75                }
76            }
77            (Some(n), Some("")) => {
78                if let Ok(low) = n.parse::<usize>() {
79                    if low > 0 {
80                        Ok(FieldRange {
81                            low: low - 1,
82                            high: MAX - 1,
83                            pos: 0,
84                        })
85                    } else {
86                        Err(FieldError::InvalidField(low))
87                    }
88                } else {
89                    Err(FieldError::FailedParse(n.to_owned()))
90                }
91            }
92            (Some(""), Some(m)) => {
93                if let Ok(high) = m.parse::<usize>() {
94                    if high > 0 {
95                        Ok(FieldRange {
96                            low: 0,
97                            high: high - 1,
98                            pos: 0,
99                        })
100                    } else {
101                        Err(FieldError::InvalidField(high))
102                    }
103                } else {
104                    Err(FieldError::FailedParse(m.to_owned()))
105                }
106            }
107            (Some(n), Some(m)) => match (n.parse::<usize>(), m.parse::<usize>()) {
108                (Ok(low), Ok(high)) => {
109                    if low > 0 && low <= high {
110                        Ok(FieldRange {
111                            low: low - 1,
112                            high: high - 1,
113                            pos: 0,
114                        })
115                    } else if low == 0 {
116                        Err(FieldError::InvalidField(low))
117                    } else {
118                        Err(FieldError::InvalidOrder(low, high))
119                    }
120                }
121                _ => Err(FieldError::FailedParse(format!("{}-{}", n, m))),
122            },
123            _ => unreachable!(),
124        }
125    }
126}
127
128impl FieldRange {
129    pub const fn default() -> Self {
130        Self {
131            low: 0,
132            high: MAX - 1,
133            pos: 0,
134        }
135    }
136
137    /// Parse a comma separated list of fields and merge any overlaps
138    pub fn from_list(list: &str) -> Result<Vec<FieldRange>, FieldError> {
139        let mut ranges: Vec<FieldRange> = vec![];
140        for (i, item) in list.split(',').enumerate() {
141            let mut rnge: FieldRange = FromStr::from_str(item)?;
142            rnge.pos = i;
143            ranges.push(rnge);
144        }
145        FieldRange::post_process_ranges(&mut ranges);
146
147        Ok(ranges)
148    }
149
150    /// Get the indices of the headers that match any of the provided regex's.
151    pub fn from_header_list(
152        list: &[Regex],
153        header: &[u8],
154        delim: &RegexOrString,
155        header_is_regex: bool,
156        allow_missing: bool,
157    ) -> Result<Vec<FieldRange>, FieldError> {
158        let mut ranges = vec![];
159        let mut found = vec![false; list.len()];
160        for (i, header) in delim.split(header).enumerate() {
161            for (j, regex) in list.iter().enumerate() {
162                if !header_is_regex {
163                    if regex.as_str().as_bytes() == header {
164                        found[j] = true;
165                        ranges.push(FieldRange {
166                            low: i,
167                            high: i,
168                            pos: j,
169                        });
170                    }
171                } else if regex.is_match(header) {
172                    found[j] = true;
173                    ranges.push(FieldRange {
174                        low: i,
175                        high: i,
176                        pos: j,
177                    });
178                }
179            }
180        }
181
182        if !allow_missing {
183            if ranges.is_empty() {
184                return Err(FieldError::NoHeadersMatched);
185            }
186            for (i, was_found) in found.into_iter().enumerate() {
187                if !was_found {
188                    return Err(FieldError::HeaderNotFound(list[i].as_str().to_owned()));
189                }
190            }
191        }
192
193        FieldRange::post_process_ranges(&mut ranges);
194
195        Ok(ranges)
196    }
197
198    // TODO: There is a very subtle bug in this sorting / mereging code.
199    // The tests have been updated to reflect it. In addition to sometimes merging overlaps, sometimes not,
200    // this will merge everlaps even when their positions are out of order, i.e. I want to output pos 2, then pos 1,
201    // but this will merge that into a range covering 1-2 and forget the pos ordering.
202
203    /// Sort and merge overlaps in a set of [`Vec<FieldRange>`].
204    pub fn post_process_ranges(ranges: &mut Vec<FieldRange>) {
205        ranges.sort();
206
207        // merge overlapping ranges
208        let mut shifted = 0;
209        for i in 0..ranges.len() {
210            let j = i + 1;
211            if let Some(rng) = ranges.get_mut(i) {
212                rng.pos = rng.pos.saturating_sub(shifted);
213            }
214
215            while j < ranges.len()
216                && ranges[j].low <= ranges[i].high + 1
217                && ranges[j].pos.saturating_sub(1) == ranges[i].pos
218            {
219                let j_high = ranges.remove(j).high;
220                ranges[i].high = max(ranges[i].high, j_high);
221                shifted += 1;
222            }
223        }
224    }
225
226    /// Test if a value is contained in this range
227    pub fn contains(&self, value: usize) -> bool {
228        value >= self.low && value <= self.high
229    }
230
231    /// Test if two ranges overlap
232    pub fn overlap(&self, other: &Self) -> bool {
233        self.low <= other.high && self.high >= other.low
234    }
235
236    /// Remove ranges in exclude from fields.
237    ///
238    /// This assumes both fields and exclude are in ascending order by `low` value.
239    pub fn exclude(fields: Vec<FieldRange>, exclude: Vec<FieldRange>) -> Vec<FieldRange> {
240        let mut fields: VecDeque<_> = fields.into_iter().collect();
241        let mut result = vec![];
242        let mut exclude_iter = exclude.into_iter();
243        let mut exclusion = if let Some(ex) = exclude_iter.next() {
244            ex
245        } else {
246            // Early return, no exclusions
247            return fields.into_iter().collect();
248        };
249        let mut field = fields.pop_front().unwrap(); // Must have at least one field
250        loop {
251            // Determine if there is any overlap at all
252            if exclusion.overlap(&field) {
253                // Determine the type of overlap
254                match (
255                    exclusion.contains(field.low),
256                    exclusion.contains(field.high),
257                ) {
258                    // Field: XXXXXXXX
259                    // Exclu:      XXXXXXXX
260                    (false, true) => {
261                        if exclusion.low != 0 {
262                            field.high = exclusion.low - 1;
263                        }
264                    }
265
266                    // Field:    XXXXXXXX
267                    // Exclu: XXXXX
268                    (true, false) => {
269                        if exclusion.high != MAX - 1 {
270                            field.low = exclusion.high + 1;
271                        }
272                    }
273                    // Field:    XXXXX
274                    // Exclu: XXXXXXXXXX
275                    (true, true) => {
276                        // Skip since we are excluding all fields in this range
277                        if let Some(f) = fields.pop_front() {
278                            field = f;
279                        } else {
280                            break;
281                        }
282                    }
283
284                    // Field: XXXXXXXXXX
285                    // exclu:     XXXX
286                    (false, false) => {
287                        // Split the field
288                        // high side
289                        if exclusion.high != MAX - 1 {
290                            let mut high_field = field;
291                            high_field.low = exclusion.high + 1;
292                            fields.push_front(high_field)
293                        }
294
295                        // low side
296                        if exclusion.low != 0 {
297                            field.high = exclusion.low - 1;
298                        }
299                    }
300                }
301            } else if field.low > exclusion.high {
302                // if the exclusion is behind the field, advance the exclusion
303                if let Some(ex) = exclude_iter.next() {
304                    exclusion = ex;
305                } else {
306                    result.push(field);
307                    result.extend(fields);
308                    break;
309                }
310            } else if field.high < exclusion.low {
311                // if the exclusion is ahead of the field, push the field
312                result.push(field);
313                if let Some(f) = fields.pop_front() {
314                    field = f;
315                } else {
316                    break;
317                }
318            } else {
319                unreachable!()
320            }
321        }
322        result
323    }
324}
325
326#[cfg(test)]
327mod test {
328    use super::*;
329
330    #[test]
331    #[rustfmt::skip::macros(assert_eq)]
332    fn test_parse_fields_good() {
333        assert_eq!(vec![FieldRange { low: 0, high: 0, pos: 0}], FieldRange::from_list("1").unwrap());
334        assert_eq!(vec![FieldRange { low: 0, high: 0, pos: 0},  FieldRange { low: 3, high: 3, pos: 1}], FieldRange::from_list("1,4").unwrap());
335        assert_eq!(vec![FieldRange { low: 0, high: 1, pos: 0},  FieldRange { low: 3, high: usize::MAX - 1, pos: 1}], FieldRange::from_list("1,2,4-").unwrap());
336        assert_eq!(vec![FieldRange { low: 1, high: 2, pos: 0},  FieldRange { low: 3, high: usize::MAX - 1, pos: 1} ], FieldRange::from_list("2,3,4-").unwrap());
337        assert_eq!(vec![FieldRange { low: 0, high: 0, pos: 0},  FieldRange { low: 3, high: usize::MAX - 1, pos: 1}], FieldRange::from_list("1,4-,5-8").unwrap());
338        assert_eq!(vec![FieldRange { low: 0, high: 0, pos: 1},  FieldRange { low: 3, high: usize::MAX - 1, pos: 0}, FieldRange { low: 4, high: 7, pos: 2}], FieldRange::from_list("4-,1,5-8").unwrap());
339        assert_eq!(vec![FieldRange { low: 0, high: 3, pos: 0}], FieldRange::from_list("-4").unwrap());
340        assert_eq!(vec![FieldRange { low: 0, high: 7, pos: 0}], FieldRange::from_list("-4,5-8").unwrap());
341        assert_eq!(vec![FieldRange { low: 0, high: 0, pos: 1 }, FieldRange { low: 2, high: 2, pos: 0}, FieldRange { low: 2, high: 2, pos: 2}], FieldRange::from_list("3,1,3").unwrap());
342        // Note the slightly odd pos ordering that happens here. This is an artifact of post_process_ranges, which needs some love
343        assert_eq!(vec![FieldRange { low: 0, high: 1, pos: 0 }, FieldRange { low: 2, high: 2, pos: 1}, FieldRange { low: 3, high: 3, pos: 2}], FieldRange::from_list("1,2,3,4").unwrap());
344        assert_eq!(vec![FieldRange { low: 0, high: 1, pos: 0 }, FieldRange { low: 2, high: 2, pos: 2}, FieldRange { low: 3, high: 3, pos: 1}], FieldRange::from_list("1,2,4,3").unwrap());
345    }
346
347    #[test]
348    fn test_parse_fields_bad() {
349        assert!(FieldRange::from_list("0").is_err());
350        assert!(FieldRange::from_list("4-1").is_err());
351        assert!(FieldRange::from_list("cat").is_err());
352        assert!(FieldRange::from_list("1-dog").is_err());
353        assert!(FieldRange::from_list("mouse-4").is_err());
354    }
355
356    #[test]
357    fn test_parse_header_fields() {
358        let header = b"is_cat-isdog-wascow-was_is_apple-12345-!$%*(_)";
359        let delim = Regex::new("-").unwrap();
360        let delim = RegexOrString::Regex(delim);
361        let header_fields = vec![
362            Regex::new(r"^is_.*$").unwrap(),
363            Regex::new("dog").unwrap(),
364            Regex::new(r"\$%").unwrap(),
365        ];
366        let fields =
367            FieldRange::from_header_list(&header_fields, header, &delim, true, false).unwrap();
368        assert_eq!(
369            vec![
370                FieldRange {
371                    low: 0,
372                    high: 1,
373                    pos: 0
374                },
375                FieldRange {
376                    low: 5,
377                    high: 5,
378                    pos: 1
379                }
380            ],
381            fields
382        );
383    }
384
385    #[test]
386    fn test_parse_header_fields_literal() {
387        let header = b"is_cat-is-isdog-wascow-was_is_apple-12345-!$%*(_)";
388        let delim = Regex::new("-").unwrap();
389        let delim = RegexOrString::Regex(delim);
390        let header_fields = vec![Regex::new(r"is").unwrap()];
391        let fields =
392            FieldRange::from_header_list(&header_fields, header, &delim, false, false).unwrap();
393        assert_eq!(
394            vec![FieldRange {
395                low: 1,
396                high: 1,
397                pos: 0
398            },],
399            fields
400        );
401    }
402
403    #[test]
404    fn test_parse_header_fields_literal_header_not_found() {
405        let header = b"is_cat-is-isdog-wascow-was_is_apple-12345-!$%*(_)";
406        let delim = Regex::new("-").unwrap();
407        let delim = RegexOrString::Regex(delim);
408        let header_fields = vec![
409            Regex::new(r"^is_.*$").unwrap(),
410            Regex::new("dog").unwrap(),
411            Regex::new(r"\$%").unwrap(),
412            Regex::new(r"is").unwrap(),
413        ];
414        let result = FieldRange::from_header_list(&header_fields, header, &delim, false, false);
415        assert_eq!(
416            result.unwrap_err(),
417            FieldError::HeaderNotFound(String::from(r"^is_.*$"))
418        );
419    }
420
421    #[test]
422    #[rustfmt::skip::macros(assert_eq)]
423    fn test_exclude_simple() {
424        assert_eq!(
425            vec![
426                FieldRange { low: 1, high: MAX - 1, pos: 0}
427            ],
428            FieldRange::exclude(
429                vec![FieldRange { low: 0, high: MAX - 1, pos: 0}],
430                vec![FieldRange { low: 0, high: 0,       pos: 0}]
431            ),
432            "1"
433        );
434        assert_eq!(
435            vec![
436                FieldRange { low: 1, high: 2,       pos: 0},
437                FieldRange { low: 4, high: MAX - 1, pos: 0},
438            ],
439            FieldRange::exclude(
440                vec![FieldRange { low: 0, high: MAX - 1, pos: 0}],
441                vec![
442                    FieldRange { low: 0, high: 0,        pos: 0},
443                    FieldRange { low: 3, high: 3,        pos: 0}
444                ]
445            ),
446            "1,4"
447        );
448        assert_eq!(
449            vec![
450                FieldRange { low: 2, high: 2,            pos: 0},
451            ],
452            FieldRange::exclude(
453                vec![FieldRange { low: 0, high: MAX - 1, pos: 0}],
454                vec![
455                    FieldRange { low: 0, high: 1,              pos: 0},
456                    FieldRange { low: 3, high: usize::MAX - 1, pos: 1}
457                ]
458            ),
459            "1,2,4-"
460        );
461        assert_eq!(
462            vec![
463                FieldRange { low: 0, high: 0,              pos: 0},
464            ],
465            FieldRange::exclude(
466                vec![FieldRange { low: 0, high: MAX - 1, pos: 0}],
467                vec![
468                    FieldRange { low: 1, high: 2,       pos: 0},
469                    FieldRange { low: 3, high: MAX - 1, pos: 1}
470                ]
471            ),
472            "2,3,4-"
473        );
474        assert_eq!(
475            vec![
476                FieldRange { low: 1, high: 2,       pos: 0},
477            ],
478            FieldRange::exclude(
479                vec![FieldRange { low: 0, high: MAX - 1, pos: 0}],
480                vec![
481                    FieldRange { low: 0, high: 0,       pos: 0},
482                    FieldRange { low: 3, high: MAX - 1, pos: 1}
483                ]
484            ),
485            "1,4-,5-8"
486        );
487        assert_eq!(
488            vec![
489                FieldRange { low: 1, high: 2,       pos: 0},
490            ],
491            FieldRange::exclude(
492                vec![FieldRange { low: 0, high: MAX - 1, pos: 0}],
493                vec![
494                    FieldRange { low: 0, high: 0,       pos: 1},
495                    FieldRange { low: 3, high: MAX - 1, pos: 0},
496                    FieldRange { low: 4, high: 7,       pos: 2}
497                ]
498            ),
499            "4-,1,5-8"
500        );
501        assert_eq!(
502            vec![
503                FieldRange { low: 4, high: MAX - 1, pos: 0},
504            ],
505            FieldRange::exclude(
506                vec![FieldRange { low: 0, high: MAX - 1, pos: 0}],
507                vec![
508                    FieldRange { low: 0, high: 3,       pos: 0}
509                ]
510            ),
511            "-4"
512        );
513        assert_eq!(
514            vec![
515                FieldRange { low: 8, high: MAX - 1, pos: 0},
516            ],
517            FieldRange::exclude(
518                vec![FieldRange { low: 0, high: MAX - 1, pos: 0}],
519                vec![
520                    FieldRange { low: 0, high: 7,       pos: 0}
521                ]
522            ),
523            "-4,5-8"
524        );
525    }
526    #[test]
527    #[rustfmt::skip::macros(assert_eq)]
528    fn test_exclude_complex() {
529        assert_eq!(
530            vec![
531                FieldRange { low: 1, high: 3, pos: 0},
532                FieldRange { low: 7, high: 14, pos: 1},
533            ],
534            FieldRange::exclude(
535                vec![FieldRange { low: 0, high: 3, pos: 0}, FieldRange { low: 7, high: MAX - 1, pos: 1}],
536                vec![FieldRange { low: 0, high: 0, pos: 0}, FieldRange { low: 15, high: MAX - 1, pos: 0}]
537            ),
538            "-f1-4,8- : -e1,16-"
539        );
540        let empty: Vec<FieldRange> = vec![];
541        assert_eq!(
542            empty,
543            FieldRange::exclude(
544                vec![FieldRange { low: 0, high: MAX-1, pos: 0}],
545                vec![FieldRange { low: 0, high: MAX-1, pos: 0}]
546            ),
547            "-f1- : -e1-"
548        );
549        assert_eq!(
550            vec![
551                FieldRange { low: 0, high: 0, pos: 0},
552                FieldRange { low: 9, high: 9, pos: 3},
553            ],
554            FieldRange::exclude(
555                vec![FieldRange { low: 0, high: 0, pos: 0}, FieldRange { low: 3, high: 3, pos: 1 }, FieldRange { low: 7, high: 7, pos: 2}, FieldRange { low: 9, high: 9, pos: 3}],
556                vec![FieldRange { low: 3, high: 7, pos: 0}]
557            ),
558            "-f1,4,8,10 : -e4-8"
559        );
560        // Fields: XXXXXXXXXXX
561        // Exclud:      XXXXXXXXX
562        assert_eq!(
563            vec![
564                FieldRange { low: 0, high: 3, pos: 0},
565            ],
566            FieldRange::exclude(
567                vec![FieldRange { low: 0, high: 9, pos: 0}],
568                vec![FieldRange { low: 4, high: MAX - 1, pos: 0}]
569            ),
570            "-f1-10 : -e5-"
571        );
572        // Fields:      XXXXXXXXXXXXX
573        // Exclud:  XXXXXXXX
574        assert_eq!(
575            vec![
576                FieldRange { low: 15, high: 19, pos: 0},
577            ],
578            FieldRange::exclude(
579                vec![FieldRange { low: 9, high: 19, pos: 0}],
580                vec![FieldRange { low: 4, high: 14, pos: 0}]
581            ),
582            "-f10-20 : -e5-15"
583        );
584        // Fields: XXXXXXXXXXXXXX
585        // Exclud:    XXXXXXX
586        assert_eq!(
587            vec![
588                FieldRange { low: 9, high: 11, pos: 0},
589                FieldRange { low: 16, high: 19, pos: 0},
590            ],
591            FieldRange::exclude(
592                vec![FieldRange { low: 9, high: 19, pos: 0}],
593                vec![FieldRange { low: 12, high: 15, pos: 0}]
594            ),
595            "-f10-20 : -e13-16"
596        );
597        // Fields:      XXX
598        // Exclud:    XXXXXXX
599        assert_eq!(
600            empty,
601            FieldRange::exclude(
602                vec![FieldRange { low: 12, high: 15, pos: 0}],
603                vec![FieldRange { low: 9, high: 19, pos: 0}]
604            ),
605            "-f13-16 : -e10-20"
606        );
607        // Fields: XXXXXXXX      XXXXX
608        // Exclud:     XXXXXXXXXXXXX
609        assert_eq!(
610            vec![FieldRange { low: 4, high: 8, pos: 0 }, FieldRange { low: 25, high: 29, pos: 1}],
611            FieldRange::exclude(
612                vec![FieldRange { low: 4, high: 15, pos: 0}, FieldRange { low: 19, high: 29, pos: 1}],
613                vec![FieldRange { low: 9, high: 24, pos: 0}]
614            ),
615            "-f5-16,20-30 : -e10-25"
616        );
617    }
618}