Skip to main content

skim/
field.rs

1//! Field extraction and parsing utilities.
2//!
3//! This module provides utilities for parsing field ranges and extracting
4//! fields from text based on delimiters.
5
6use regex::Regex;
7use std::{
8    cmp::{max, min},
9    sync::LazyLock,
10};
11
// Grammar of a field range: an optional `left` bound, an optional `..` separator
// (`sep`) and an optional `right` bound. Each bound is a decimal integer with an
// optional leading `-`. Every group is optional, so the empty string matches too.
// The pattern is a constant, hence the `unwrap()` on compilation.
static FIELD_RANGE: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r"^(?P<left>-?\d+)?(?P<sep>\.\.)?(?P<right>-?\d+)?$").unwrap());
14
/// A selection of one or more delimiter-separated fields of a text.
///
/// Field numbers are 1-based. A negative number counts back from the last
/// field, so `-1` designates the last field and `-2` the one before it.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum FieldRange {
    /// Exactly one field, written `N`.
    Single(i32),
    /// Every field from the first one through the given one (inclusive), written `..N`.
    LeftInf(i32),
    /// Every field from the given one through the last one, written `N..`.
    RightInf(i32),
    /// Every field between the two given ones, both ends inclusive, written `N..M`.
    Both(i32, i32),
}
27
28impl FieldRange {
29    /// Parses a field range from a string (e.g., "1", "1..", "..10", "1..10")
30    #[allow(clippy::should_implement_trait)]
31    pub fn from_str(range: &str) -> Option<FieldRange> {
32        use self::FieldRange::*;
33
34        // "1", "1..", "..10", "1..10", etc.
35        let opt_caps = FIELD_RANGE.captures(range);
36        if let Some(caps) = opt_caps {
37            let opt_left = caps.name("left").map(|s| s.as_str().parse().unwrap_or(1));
38            let opt_right = caps.name("right").map(|s| s.as_str().parse().unwrap_or(-1));
39            let opt_sep = caps.name("sep").map(|s| s.as_str().to_string());
40
41            match (opt_left, opt_right) {
42                (None, None) => Some(RightInf(0)),
43                (Some(left), None) => {
44                    match opt_sep {
45                        None => Some(Single(left)),      // 1
46                        Some(_) => Some(RightInf(left)), // 1..
47                    }
48                }
49                (None, Some(right)) => {
50                    match opt_sep {
51                        None => Some(Single(right)),     // 1 (should not happen)
52                        Some(_) => Some(LeftInf(right)), // ..1 (should not happen)
53                    }
54                }
55                (Some(left), Some(right)) => Some(Both(left, right)), // 1..3
56            }
57        } else {
58            None
59        }
60    }
61
62    /// Converts a field range to an index pair (left, right).
63    ///
64    /// For example, 1..3 => (0, 4). Note that field range is inclusive while
65    /// the output index will exclude the right end.
66    pub fn to_index_pair(&self, length: usize) -> Option<(usize, usize)> {
67        use self::FieldRange::*;
68        match *self {
69            Single(num) => {
70                let num = FieldRange::translate_neg(num, length);
71                if num == 0 || num > length {
72                    None
73                } else {
74                    Some((num - 1, num))
75                }
76            }
77            LeftInf(right) => {
78                let right = FieldRange::translate_neg(right, length);
79                if length == 0 || right == 0 {
80                    None
81                } else {
82                    let right = min(right, length);
83                    Some((0, right))
84                }
85            }
86            RightInf(left) => {
87                let left = FieldRange::translate_neg(left, length);
88                if length == 0 || left > length {
89                    None
90                } else {
91                    let left = max(left, 1);
92                    Some((left - 1, length))
93                }
94            }
95            Both(left, right) => {
96                let left = FieldRange::translate_neg(left, length);
97                let right = FieldRange::translate_neg(right, length);
98                if length == 0 || right == 0 || left > right || left > length {
99                    None
100                } else {
101                    Some((max(left, 1) - 1, min(right, length)))
102                }
103            }
104        }
105    }
106
107    fn translate_neg(idx: i32, length: usize) -> usize {
108        let len = length as i32;
109        let idx = if idx < 0 { idx + len + 1 } else { idx };
110        max(0, idx) as usize
111    }
112}
113
114// ("|", "a|b||c") -> [(0, 2), (2, 4), (4, 5), (5, 6)]
115// explain: split to ["a|", "b|", "|", "c"]
116fn get_ranges_by_delimiter(delimiter: &Regex, text: &str) -> Vec<(usize, usize)> {
117    let mut ranges = Vec::new();
118    let mut last = 0;
119    for mat in delimiter.find_iter(text) {
120        ranges.push((last, mat.start()));
121        last = mat.end();
122    }
123    ranges.push((last, text.len()));
124    ranges
125}
126
127/// Extracts a substring from text based on a field range and delimiter.
128///
129/// For example, with delimiter = Regex::new(",").unwrap(), text "a,b,c", and field Single(2),
130/// this returns "b". Note that this is different from `to_index_pair`, it uses delimiters.
131pub fn get_string_by_field<'a>(delimiter: &Regex, text: &'a str, field: &FieldRange) -> Option<&'a str> {
132    let ranges = get_ranges_by_delimiter(delimiter, text);
133
134    if let Some((start, stop)) = field.to_index_pair(ranges.len()) {
135        let &(begin, _) = &ranges[start];
136        let &(_, end) = ranges.get(stop - 1).unwrap_or(&(text.len(), 0));
137        Some(&text[begin..end])
138    } else {
139        None
140    }
141}
142
143/// Extracts a substring from text by parsing a range string and using a delimiter
144pub fn get_string_by_range<'a>(delimiter: &Regex, text: &'a str, range: &str) -> Option<&'a str> {
145    FieldRange::from_str(range).and_then(|field| get_string_by_field(delimiter, text, &field))
146}
147
148/// Parses matching fields and returns a vector of byte ranges.
149///
150/// Given delimiter `,`, text: "a,b,c", and fields &[Single(2), LeftInf(2)],
151/// this returns [(2, 4), (0, 4)].
152pub fn parse_matching_fields(delimiter: &Regex, text: &str, fields: &[FieldRange]) -> Vec<(usize, usize)> {
153    let ranges = get_ranges_by_delimiter(delimiter, text);
154
155    let mut ret = Vec::new();
156    for field in fields {
157        if let Some((start, stop)) = field.to_index_pair(ranges.len()) {
158            let &(begin, _) = &ranges[start];
159            let &(end, _) = ranges.get(stop).unwrap_or(&(text.len(), 0));
160            ret.push((begin, end));
161        }
162    }
163    ret
164}
165
166/// Extracts the specified fields from text using the delimiter
167pub fn parse_transform_fields(delimiter: &Regex, text: &str, fields: &[FieldRange]) -> String {
168    let ranges = get_ranges_by_delimiter(delimiter, text);
169
170    let mut ret = String::new();
171    for field in fields {
172        if let Some((start, stop)) = field.to_index_pair(ranges.len()) {
173            let &(begin, _) = &ranges[start];
174            let &(end, _) = ranges.get(stop).unwrap_or(&(text.len(), 0));
175            ret.push_str(&text[begin..end]);
176        }
177    }
178    ret
179}
180
#[cfg(test)]
#[cfg_attr(coverage, coverage(off))]
mod test {
    use super::FieldRange::*;
    use super::*;
    use regex::Regex;

    /// Every supported textual form parses; non-numeric bounds are rejected.
    #[test]
    fn test_parse_range() {
        let cases: &[(&str, Option<FieldRange>)] = &[
            ("1", Some(Single(1))),
            ("-1", Some(Single(-1))),
            ("1..", Some(RightInf(1))),
            ("-1..", Some(RightInf(-1))),
            ("..1", Some(LeftInf(1))),
            ("..-1", Some(LeftInf(-1))),
            ("1..3", Some(Both(1, 3))),
            ("-1..-3", Some(Both(-1, -3))),
            ("..", Some(RightInf(0))),
            ("a..", None),
            ("..b", None),
            ("a..b", None),
        ];

        for (input, expected) in cases {
            assert_eq!(FieldRange::from_str(input), *expected, "range {input:?}");
        }
    }

    /// Index pairs computed against a text of ten fields.
    #[test]
    fn test_parse_field_range() {
        const LENGTH: usize = 10;

        let cases: &[(FieldRange, Option<(usize, usize)>)] = &[
            (Single(0), None),
            (Single(1), Some((0, 1))),
            (Single(10), Some((9, 10))),
            (Single(11), None),
            (Single(-1), Some((9, 10))),
            (Single(-10), Some((0, 1))),
            (Single(-11), None),
            //
            (LeftInf(0), None),
            (LeftInf(1), Some((0, 1))),
            (LeftInf(8), Some((0, 8))),
            (LeftInf(10), Some((0, 10))),
            (LeftInf(11), Some((0, 10))),
            (LeftInf(-1), Some((0, 10))),
            (LeftInf(-8), Some((0, 3))),
            (LeftInf(-9), Some((0, 2))),
            (LeftInf(-10), Some((0, 1))),
            (LeftInf(-11), None),
            //
            (RightInf(0), Some((0, 10))),
            (RightInf(1), Some((0, 10))),
            (RightInf(8), Some((7, 10))),
            (RightInf(10), Some((9, 10))),
            (RightInf(11), None),
            (RightInf(-1), Some((9, 10))),
            (RightInf(-8), Some((2, 10))),
            (RightInf(-9), Some((1, 10))),
            (RightInf(-10), Some((0, 10))),
            (RightInf(-11), Some((0, 10))),
            //
            (Both(0, 0), None),
            (Both(0, 1), Some((0, 1))),
            (Both(0, 10), Some((0, 10))),
            (Both(0, 11), Some((0, 10))),
            (Both(1, -1), Some((0, 10))),
            (Both(1, -9), Some((0, 2))),
            (Both(1, -10), Some((0, 1))),
            (Both(1, -11), None),
            (Both(-9, -9), Some((1, 2))),
            (Both(-9, -8), Some((1, 3))),
            (Both(-9, 0), None),
            (Both(-9, 1), None),
            (Both(-9, 2), Some((1, 2))),
            (Both(-1, 0), None),
            (Both(11, 20), None),
            (Both(-11, -11), None),
        ];

        for (field, expected) in cases {
            assert_eq!(field.to_index_pair(LENGTH), *expected, "field {field:?}");
        }
    }

    #[test]
    fn test_parse_transform_fields() {
        let comma = Regex::new(",").unwrap();
        let transform = |fields: &[FieldRange]| parse_transform_fields(&comma, "A,B,C,D,E,F", fields);

        assert_eq!(transform(&[Single(2), Single(4), Single(-1), Single(-7)]), "B,D,F");
        assert_eq!(transform(&[LeftInf(3), LeftInf(-6), LeftInf(-7)]), "A,B,C,A,");
        assert_eq!(
            transform(&[RightInf(5), RightInf(-2), RightInf(-1), RightInf(8)]),
            "E,FE,FF"
        );
        assert_eq!(
            transform(&[Both(3, 3), Both(-9, 2), Both(6, 10), Both(-9, -5)]),
            "C,A,B,FA,B,"
        );
    }

    #[test]
    fn test_parse_matching_fields() {
        let comma = Regex::new(",").unwrap();
        // Each of the four CJK characters takes 3 bytes in UTF-8, the rest 1 byte,
        // so the fields start at bytes 0, 4, 8, 12, 16 and 18.
        let text = "中,华,人,民,E,F";
        let matching = |fields: &[FieldRange]| parse_matching_fields(&comma, text, fields);

        assert_eq!(
            matching(&[Single(2), Single(4), Single(-1), Single(-7)]),
            vec![(4, 8), (12, 16), (18, 19)]
        );
        assert_eq!(
            matching(&[LeftInf(3), LeftInf(-6), LeftInf(-7)]),
            vec![(0, 12), (0, 4)]
        );
        assert_eq!(
            matching(&[RightInf(5), RightInf(-2), RightInf(-1), RightInf(7)]),
            vec![(16, 19), (16, 19), (18, 19)]
        );
        assert_eq!(
            matching(&[Both(3, 3), Both(-8, 2), Both(6, 10), Both(-8, -5)]),
            vec![(8, 12), (0, 8), (18, 19), (0, 8)]
        );
    }

    #[test]
    fn test_null_delimiter() {
        // A NUL byte works as a delimiter like any other character.
        let nul = Regex::new("\x00").unwrap();
        let text = "a\x00b\x00c";

        for (index, expected) in [(1, "a"), (2, "b"), (3, "c")] {
            assert_eq!(
                get_string_by_field(&nul, text, &Single(index)),
                Some(expected),
                "field {index}"
            );
        }

        // Matching ranges include the delimiter that follows the field.
        // Bytes: a(0) \0(1) b(2) \0(3) c(4)
        // Field 2 starts at byte 2 and its delimiter sits at byte 3 => (2, 4).
        assert_eq!(parse_matching_fields(&nul, text, &[Single(2)]), vec![(2, 4)]);

        // Field 1 starts at byte 0 with its delimiter at byte 1 => (0, 2).
        // Field 3 starts at byte 4 and has no delimiter after it => (4, 5).
        assert_eq!(
            parse_matching_fields(&nul, text, &[Single(1), Single(3)]),
            vec![(0, 2), (4, 5)]
        );
    }

    #[test]
    fn test_get_string_by_field() {
        let comma = Regex::new(",").unwrap();
        // The trailing comma produces an empty fourth field.
        let text = "a,b,c,";

        let cases: &[(FieldRange, Option<&str>)] = &[
            (Single(0), None),
            (Single(1), Some("a")),
            (Single(2), Some("b")),
            (Single(3), Some("c")),
            (Single(4), Some("")),
            (Single(5), None),
            (Single(6), None),
            (Single(-1), Some("")),
            (Single(-2), Some("c")),
            (Single(-3), Some("b")),
            (Single(-4), Some("a")),
            (Single(-5), None),
            (Single(-6), None),
            //
            (LeftInf(0), None),
            (LeftInf(1), Some("a")),
            (LeftInf(2), Some("a,b")),
            (LeftInf(3), Some("a,b,c")),
            (LeftInf(4), Some("a,b,c,")),
            (LeftInf(5), Some("a,b,c,")),
            (LeftInf(-5), None),
            (LeftInf(-4), Some("a")),
            (LeftInf(-3), Some("a,b")),
            (LeftInf(-2), Some("a,b,c")),
            (LeftInf(-1), Some("a,b,c,")),
            //
            (RightInf(0), Some("a,b,c,")),
            (RightInf(1), Some("a,b,c,")),
            (RightInf(2), Some("b,c,")),
            (RightInf(3), Some("c,")),
            (RightInf(4), Some("")),
            (RightInf(5), None),
            (RightInf(-5), Some("a,b,c,")),
            (RightInf(-4), Some("a,b,c,")),
            (RightInf(-3), Some("b,c,")),
            (RightInf(-2), Some("c,")),
            (RightInf(-1), Some("")),
            //
            (Both(0, 0), None),
            (Both(0, 1), Some("a")),
            (Both(0, 2), Some("a,b")),
            (Both(0, 3), Some("a,b,c")),
            (Both(0, 4), Some("a,b,c,")),
            (Both(0, 5), Some("a,b,c,")),
            (Both(1, 1), Some("a")),
            (Both(1, 2), Some("a,b")),
            (Both(1, 3), Some("a,b,c")),
            (Both(1, 4), Some("a,b,c,")),
            (Both(1, 5), Some("a,b,c,")),
            (Both(2, 5), Some("b,c,")),
            (Both(3, 5), Some("c,")),
            (Both(4, 5), Some("")),
            (Both(5, 5), None),
            (Both(6, 5), None),
            (Both(2, 3), Some("b,c")),
            (Both(3, 3), Some("c")),
            (Both(4, 3), None),
        ];

        for (field, expected) in cases {
            assert_eq!(get_string_by_field(&comma, text, field), *expected, "field {field:?}");
        }
    }
}