rcut_lib/
lib.rs

1//! `rcut` is a Rust replacement for GNU cut that supports UTF-8.
2//! Implementation details are exported for reusability in case users
3//! are interested in building their own char/word cutter.
4//!
5
6use std::{cmp, str};
7
8extern crate rtools_traits;
9use rtools_traits::{RtoolT, LineProcessorT};
10
11/// Cargo version specified in the Cargo.toml file
12const VERSION: &'static str = env!("CARGO_PKG_VERSION");
13
14/// Cargo version specified in the Cargo.toml file
15pub fn version() -> &'static str {
16    VERSION
17}
18
/// Parse a single range specification into an inclusive `(start, end)` pair.
///
/// Accepted forms are `N`, `N-`, `-M` and `N-M`:
///
/// * `N`   yields `(N, N)`
/// * `N-`  yields `(N, usize::MAX)` (open-ended)
/// * `-M`  yields `(1, M)`
/// * `N-M` yields `(N, M)`; no ordering check is done here, so `9-5`
///   yields `(9, 5)` and it is up to the caller to discard it
///
/// Positions are 1-based, so `0` is never a valid position.
///
/// # Panics
///
/// Panics when `char_part` is empty, is a lone `-`, contains anything that
/// does not parse as an unsigned integer (including a second `-`), or names
/// position `0`.
pub fn str_to_ranged_pair(char_part: &str) -> (usize, usize) {
    assert!(char_part != "-", "invalid range with no endpoint: -");

    // Parse one endpoint, rejecting non-numeric text and the non-existent
    // position 0 up front instead of letting it underflow later on.
    let parse_pos = |text: &str| -> usize {
        let pos = text.parse::<usize>().unwrap_or_else(|_| {
            panic!("invalid position `{}` in range `{}`", text, char_part)
        });
        assert!(
            pos >= 1,
            "positions are numbered from 1, got 0 in range `{}`",
            char_part
        );
        pos
    };

    match char_part.find('-') {
        // Single position: `N`
        None => {
            let pos = parse_pos(char_part);
            (pos, pos)
        }
        // Range: everything after the first dash is the end point, so a
        // second dash ends up in `end_str` and fails to parse.
        Some(dash_idx) => {
            let start_str = &char_part[..dash_idx];
            let end_str = &char_part[dash_idx + 1..];

            let start_pos = if start_str.is_empty() {
                1
            } else {
                parse_pos(start_str)
            };

            let end_pos = if end_str.is_empty() {
                std::usize::MAX
            } else {
                parse_pos(end_str)
            };

            (start_pos, end_pos)
        }
    }
}
46
47/// Extract list of comma-separated ranged pairs
48pub fn extract_ranged_pairs(ranged_pairs_str: &str) -> Vec<(usize, usize)> {
49    let unsorted_ranged_pairs: Vec<(usize, usize)> = ranged_pairs_str
50        .split(",")
51        .map(|char_part| str_to_ranged_pair(char_part))
52        .filter(|(start_pos, end_pos)| start_pos <= end_pos)
53        .collect();
54
55    unsorted_ranged_pairs
56}
57
/// Sort ranged pairs and merge those having adjacent or overlapping boundaries.
///
/// The input is sorted first, then folded left to right: a pair is merged
/// into the previous one when it starts no later than one position past the
/// previous end (so `3-4` and `5-` become `3-`), otherwise it starts a new
/// output range. The result is sorted and free of overlapping or touching
/// ranges.
pub fn merge_ranged_pairs(mut unsorted_ranged_pairs: Vec<(usize, usize)>) -> Vec<(usize, usize)> {
    // Merging in a single pass is only valid on sorted input
    unsorted_ranged_pairs.sort_unstable();

    let mut ranged_pairs: Vec<(usize, usize)> = Vec::with_capacity(unsorted_ranged_pairs.len());

    for (start_pos, end_pos) in unsorted_ranged_pairs {
        if let Some(last) = ranged_pairs.last_mut() {
            // `saturating_add` keeps an open-ended range (`usize::MAX`) from
            // overflowing; comparing this way round also avoids computing
            // `start_pos - 1`, which underflows for a start of 0.
            if start_pos <= last.1.saturating_add(1) {
                last.1 = cmp::max(last.1, end_pos);
                continue;
            }
        }

        ranged_pairs.push((start_pos, end_pos));
    }

    ranged_pairs
}
82
83/// Utility function to process ranged pairs (extract, and merge on demand)
84pub fn prepare_ranged_pairs(no_merge: bool, ranged_pairs_str: &str) -> Vec<(usize, usize)> {
85    let unsorted_ranged_pairs = extract_ranged_pairs(ranged_pairs_str);
86
87    let ranged_pairs = if no_merge {
88        unsorted_ranged_pairs
89    } else {
90        merge_ranged_pairs(unsorted_ranged_pairs)
91    };
92
93    ranged_pairs
94}
95
/// Context required by the character and byte line processors
pub trait CharContextT {
    /// The `(start, end)` ranges to extract from each line
    fn ranged_pairs(&self) -> &Vec<(usize, usize)>;
}
99
/// Context required by the field line processor
pub trait FieldContextT {
    /// The `(start, end)` field ranges to extract from each line
    fn ranged_pairs(&self) -> &Vec<(usize, usize)>;

    /// The delimiter that separates fields within a line
    fn delim(&self) -> &str;
}
105
/// Character/byte cutting context that borrows its list of ranges
pub struct CharContext<'a> {
    /// Borrowed `(start, end)` ranges to extract
    ranged_pairs: &'a Vec<(usize, usize)>,
}

impl<'a> CharContext<'a> {
    /// Build a context around an already prepared list of ranges
    pub fn new(ranged_pairs: &'a Vec<(usize, usize)>) -> Self {
        Self { ranged_pairs }
    }
}
117
118impl CharContextT for CharContext<'_> {
119    fn ranged_pairs(&self) -> &Vec<(usize, usize)> {
120        self.ranged_pairs
121    }
122}
123
/// Field cutting context that borrows its list of ranges and its delimiter
pub struct FieldContext<'a> {
    /// Borrowed `(start, end)` field ranges to extract
    ranged_pairs: &'a Vec<(usize, usize)>,
    /// Borrowed field delimiter
    delim: &'a str,
}

impl<'a> FieldContext<'a> {
    /// Build a context from a prepared list of ranges and a field delimiter
    pub fn new(ranged_pairs: &'a Vec<(usize, usize)>, delim: &'a str) -> Self {
        Self {
            ranged_pairs,
            delim,
        }
    }
}
137
138impl FieldContextT for FieldContext<'_> {
139    fn ranged_pairs(&self) -> &Vec<(usize, usize)> {
140        self.ranged_pairs
141    }
142
143    fn delim(&self) -> &str {
144        self.delim
145    }
146}
147
/// Stateless line processor that cuts a line by UTF-8 character position
pub struct CharUtf8LineProcessor {}
149
/// Extract chars from a UTF-8 line within given ranges.
///
/// Each `(start, end)` pair selects the 1-based, inclusive span of `char`s
/// from `start` through `end`. Ranges are applied in the order given, so
/// unsorted or overlapping ranges reorder or repeat characters. Any part of
/// a range that lies past the end of the line is ignored, as is a range whose
/// start lies after its end. Position 0 does not exist; a range starting at 0
/// therefore behaves as if it started at 1.
///
/// Returns the selected characters as UTF-8 bytes followed by a single `\n`.
pub fn process_line_by_char_utf8(line: &str, ranged_pairs: &Vec<(usize, usize)>) -> Vec<u8> {
    // Byte offset at which every char starts, plus the end of the line as a
    // sentinel, so that chars `i..j` (0-based) are bytes
    // `boundaries[i]..boundaries[j]`. This avoids re-encoding every char.
    let mut boundaries: Vec<usize> = line.char_indices().map(|(offset, _)| offset).collect();
    boundaries.push(line.len());
    let char_count = boundaries.len() - 1;

    let bytes = line.as_bytes();
    let mut out_bytes: Vec<u8> = Vec::with_capacity(bytes.len() + 1);

    for &(start_pos, end_pos) in ranged_pairs {
        // Convert the 1-based inclusive range into a 0-based half-open one,
        // clamped to the line. `saturating_sub` keeps a start of 0 from
        // underflowing.
        let first = start_pos.saturating_sub(1);
        let last = end_pos.min(char_count);

        if first < last {
            out_bytes.extend_from_slice(&bytes[boundaries[first]..boundaries[last]]);
        }
    }

    out_bytes.push(b'\n');
    out_bytes
}
173
174impl<C: CharContextT> LineProcessorT<C> for CharUtf8LineProcessor {
175    /// Extract parts of a UTF-8 encoded line
176    fn process(&self, line: &str, context: &C) -> Vec<u8> {
177        process_line_by_char_utf8(line, context.ranged_pairs())
178    }
179}
180
/// Stateless line processor that cuts a line by raw byte position
pub struct ByteLineProcessor {}
182
/// Extract bytes from a line within given ranges.
///
/// Each `(start, end)` pair selects the 1-based, inclusive span of bytes
/// from `start` through `end`. Ranges are applied in the order given, and
/// every range is considered independently: one that lies entirely past the
/// end of the line contributes nothing but does not stop later ranges from
/// being processed. A range whose start lies after its end selects nothing.
/// Position 0 does not exist; a range starting at 0 behaves as if it
/// started at 1.
///
/// This works on raw bytes, so it is only meaningful for single-byte
/// (ASCII) content: a range may split a multi-byte UTF-8 character, in which
/// case the returned bytes are not valid UTF-8.
///
/// Returns the selected bytes followed by a single `\n`.
pub fn process_line_by_byte(line: &str, ranged_pairs: &Vec<(usize, usize)>) -> Vec<u8> {
    let bytes = line.as_bytes();
    let mut out_bytes: Vec<u8> = Vec::with_capacity(bytes.len() + 1);

    for &(start_pos, end_pos) in ranged_pairs {
        // Convert the 1-based inclusive range into a 0-based half-open one,
        // clamped to the line. `saturating_sub` keeps a start of 0 from
        // underflowing.
        let first = start_pos.saturating_sub(1);
        let last = end_pos.min(bytes.len());

        // Ranges are not necessarily sorted, so an out-of-range one must be
        // skipped rather than ending the whole loop.
        if first < last {
            out_bytes.extend_from_slice(&bytes[first..last]);
        }
    }

    out_bytes.push(b'\n');
    out_bytes
}
208
209impl<C: CharContextT> LineProcessorT<C> for ByteLineProcessor {
210    /// Extract parts of an ASCII encoded line
211    fn process(&self, line: &str, context: &C) -> Vec<u8> {
212        process_line_by_byte(line, context.ranged_pairs())
213    }
214}
215
216pub struct CharProcessor {}
217
218impl<C: CharContextT, P: LineProcessorT<C>> RtoolT<C, P> for CharProcessor {}
219
/// Stateless line processor that cuts a line by delimiter-separated field
pub struct FieldUtf8LineProcessor {}
221
/// Extract fields from a UTF-8 line within given ranges.
///
/// The line is split on `delim`, and each `(start, end)` pair selects the
/// 1-based, inclusive span of fields from `start` through `end`. Ranges are
/// applied in the order given, and every range is considered independently:
/// one that lies entirely past the last field contributes nothing but does
/// not stop later ranges from being processed. A range whose start lies
/// after its end selects nothing. Position 0 does not exist; a range
/// starting at 0 behaves as if it started at 1.
///
/// Selected fields are joined with `delim` (an empty selected field still
/// counts, so a leading empty field produces a leading delimiter).
///
/// Returns the joined fields as bytes followed by a single `\n`.
pub fn process_line_by_field_utf8(
    line: &str,
    ranged_pairs: &Vec<(usize, usize)>,
    delim: &str,
) -> Vec<u8> {
    let fields: Vec<&str> = line.split(delim).collect();
    let field_count = fields.len();

    let mut out_bytes: Vec<u8> = Vec::with_capacity(line.len() + 1);
    let mut has_written = false;

    for &(start_pos, end_pos) in ranged_pairs {
        // Convert the 1-based inclusive range into a 0-based half-open one,
        // clamped to the available fields. `saturating_sub` keeps a start of
        // 0 from underflowing.
        let first = start_pos.saturating_sub(1);
        let last = end_pos.min(field_count);

        // Ranges are not necessarily sorted, so an out-of-range one must be
        // skipped rather than ending the whole loop.
        if first >= last {
            continue;
        }

        for field in &fields[first..last] {
            // Delimiter sits between fields, never before the first one
            if has_written {
                out_bytes.extend_from_slice(delim.as_bytes());
            }
            has_written = true;

            out_bytes.extend_from_slice(field.as_bytes());
        }
    }

    out_bytes.push(b'\n');
    out_bytes
}
261
262impl<C: FieldContextT> LineProcessorT<C> for FieldUtf8LineProcessor {
263    /// Extract parts of an ASCII encoded line
264    fn process(&self, line: &str, context: &C) -> Vec<u8> {
265        process_line_by_field_utf8(line, context.ranged_pairs(), context.delim())
266    }
267}
268
269pub struct FieldProcessor {}
270
271impl<C: FieldContextT, P: LineProcessorT<C>> RtoolT<C, P> for FieldProcessor {}
272
#[cfg(test)]
mod tests {
    use super::*;

    const _STR_RANGES_01: &str = "9,4,7,3,12,5-15";
    // Fifteen bird emoji, one `char` (four UTF-8 bytes) each. Written as
    // escapes so the literal survives any re-encoding of this file:
    // turkey, chicken, rooster, hatching chick, baby chick, front-facing
    // chick, bird, penguin, dove, eagle, duck, swan, owl, peacock, parrot.
    const _STR_BIRDS: &str = "\u{1F983}\u{1F414}\u{1F413}\u{1F423}\u{1F424}\u{1F425}\u{1F426}\u{1F427}\u{1F54A}\u{1F985}\u{1F986}\u{1F9A2}\u{1F989}\u{1F99A}\u{1F99C}";
    // Characters 9, 4, 7, 3, 12 and then 5 through 15 of `_STR_BIRDS`
    const _STR_BIRDS_OUTPUT: &str = "\u{1F54A}\u{1F423}\u{1F426}\u{1F413}\u{1F9A2}\u{1F424}\u{1F425}\u{1F426}\u{1F427}\u{1F54A}\u{1F985}\u{1F986}\u{1F9A2}\u{1F989}\u{1F99A}\u{1F99C}\n";
    const _STR_ALPHABET: &str = "abcdefghijklmnopqrstuvwxyz";
    const _STR_ALPHABET_OUTPUT: &str = "idgclefghijklmno\n";

    /// Run the field line processor on `line` with the given delimiter and ranges
    fn cut_fields(line: &str, delim: &str, ranged_pairs: Vec<(usize, usize)>) -> Vec<u8> {
        let line_processor = FieldUtf8LineProcessor {};
        line_processor.process(line, &FieldContext::new(&ranged_pairs, delim))
    }

    #[test]
    fn test_str_to_ranged_pair_valid_inputs() {
        assert_eq!(str_to_ranged_pair("1"), (1, 1));
        assert_eq!(str_to_ranged_pair("2"), (2, 2));
        assert_eq!(str_to_ranged_pair("-20"), (1, 20));
        assert_eq!(str_to_ranged_pair("20-"), (20, std::usize::MAX));
        assert_eq!(str_to_ranged_pair("3-7"), (3, 7));
    }

    #[test]
    #[should_panic]
    fn test_str_to_ranged_pair_empty_input() {
        str_to_ranged_pair("");
    }

    #[test]
    #[should_panic]
    fn test_str_to_ranged_pair_no_range() {
        str_to_ranged_pair("-");
    }

    #[test]
    #[should_panic]
    fn test_str_to_ranged_pair_invalid_char() {
        str_to_ranged_pair(";");
    }

    #[test]
    #[should_panic]
    fn test_str_to_ranged_pair_space() {
        str_to_ranged_pair(" ");
    }

    #[test]
    #[should_panic]
    fn test_str_to_ranged_pair_tab() {
        str_to_ranged_pair("\t");
    }

    #[test]
    fn test_extract_ranged_pairs_basic_valid_inputs() {
        assert_eq!(extract_ranged_pairs("1"), vec![(1, 1)]);
        assert_eq!(extract_ranged_pairs("1-8"), vec![(1, 8)]);
        assert_eq!(extract_ranged_pairs("5-9"), vec![(5, 9)]);
        // An inverted range selects nothing and is dropped
        assert_eq!(extract_ranged_pairs("9-5"), Vec::<(usize, usize)>::new());
        assert_eq!(extract_ranged_pairs("-5"), vec![(1, 5)]);
        assert_eq!(extract_ranged_pairs("5-"), vec![(5, std::usize::MAX)]);
    }

    #[test]
    fn test_extract_ranged_pairs_ensure_no_sorting() {
        assert_eq!(
            extract_ranged_pairs("3,4,5-"),
            vec![(3, 3), (4, 4), (5, std::usize::MAX)]
        );
        assert_eq!(
            extract_ranged_pairs("5-,3,4"),
            vec![(5, std::usize::MAX), (3, 3), (4, 4)]
        );
        assert_eq!(
            extract_ranged_pairs("6-10,5-"),
            vec![(6, 10), (5, std::usize::MAX)]
        );
        assert_eq!(
            extract_ranged_pairs("7,6-10,5-"),
            vec![(7, 7), (6, 10), (5, std::usize::MAX)]
        );
    }

    #[test]
    #[should_panic]
    fn test_extract_ranged_pairs_empty() {
        extract_ranged_pairs("");
    }

    #[test]
    #[should_panic]
    fn test_extract_ranged_pairs_bad_range() {
        extract_ranged_pairs("-");
    }

    #[test]
    fn test_merge_ranged_pairs() {
        let open_from_3 = vec![(3, std::usize::MAX)];

        // Adjacent, touching and overlapping ranges all collapse into one
        for spec in &["3,4,5-", "3-4,5-", "3-5,5-", "3-6,5-"] {
            assert_eq!(merge_ranged_pairs(extract_ranged_pairs(spec)), open_from_3);
        }

        assert_eq!(
            merge_ranged_pairs(extract_ranged_pairs("7,6-10,5-")),
            vec![(5, std::usize::MAX)]
        );
        // A gap of one position (11 is missing) keeps the ranges apart
        assert_eq!(
            merge_ranged_pairs(extract_ranged_pairs("3-7,8,2-10,12-20")),
            vec![(2, 10), (12, 20)]
        );
        assert_eq!(
            merge_ranged_pairs(extract_ranged_pairs("3-7,8,2-10,11-20")),
            vec![(2, 20)]
        );
    }

    #[test]
    fn test_process_line_utf8() {
        let char_processor = CharUtf8LineProcessor {};
        let ranged_pairs = extract_ranged_pairs(_STR_RANGES_01);
        assert_eq!(
            _STR_BIRDS_OUTPUT.as_bytes().to_vec(),
            char_processor.process(_STR_BIRDS, &CharContext::new(&ranged_pairs))
        );
    }

    #[test]
    fn test_process_line_ascii() {
        let char_processor = ByteLineProcessor {};
        let ranged_pairs = extract_ranged_pairs(_STR_RANGES_01);
        assert_eq!(
            _STR_ALPHABET_OUTPUT.as_bytes().to_vec(),
            char_processor.process(_STR_ALPHABET, &CharContext::new(&ranged_pairs))
        );
    }

    #[test]
    #[should_panic]
    fn test_process_line_ascii_panic() {
        // Cutting multi-byte text by byte position yields different bytes
        // than cutting by character, so this equality assertion fails; that
        // failure is the panic this test expects.
        let char_processor = ByteLineProcessor {};
        let ranged_pairs = extract_ranged_pairs(_STR_RANGES_01);
        assert_eq!(
            _STR_BIRDS_OUTPUT.as_bytes().to_vec(),
            char_processor.process(_STR_BIRDS, &CharContext::new(&ranged_pairs))
        );
    }

    #[test]
    fn test_process_lines_utf8_with_cursor() {
        use std::io::{BufReader, BufWriter, Cursor};

        // In-memory stand-ins for the input and output files
        // https://doc.rust-lang.org/std/io/struct.Cursor.html
        let input = BufReader::new(Cursor::new(_STR_BIRDS));
        let mut out_cursor = Cursor::new(Vec::<u8>::new());

        let ranged_pairs = extract_ranged_pairs(_STR_RANGES_01);
        let char_processor = CharProcessor {};
        // The temporary `BufWriter` is dropped (and thereby flushed) at the end
        // of this statement, releasing its borrow of the output cursor
        char_processor.process_lines(
            &CharUtf8LineProcessor {},
            input,
            &mut BufWriter::new(&mut out_cursor),
            &CharContext::new(&ranged_pairs),
        );

        // Everything written to the fake "file"
        let out = out_cursor.into_inner();
        assert_eq!(_STR_BIRDS_OUTPUT.as_bytes().to_vec(), out);
    }

    #[test]
    fn test_process_ascii_fields_for_line_ignored_delim() {
        // No delimiter in the line: there is only field 1, nothing is selected
        assert_eq!(
            b"\n".to_vec(),
            cut_fields("1234", ":", vec![(2, 2), (4, 6)])
        );
    }

    #[test]
    fn test_process_ascii_fields_for_line_leading_delim() {
        assert_eq!(
            b"1234\n".to_vec(),
            cut_fields(":1234", ":", vec![(2, 2), (4, 6)])
        );
    }

    #[test]
    fn test_process_ascii_fields_for_line_trailing_delim() {
        // Field 2 exists but is empty
        assert_eq!(
            b"\n".to_vec(),
            cut_fields("1234:", ":", vec![(2, 2), (4, 6)])
        );
    }

    #[test]
    fn test_process_ascii_fields_for_line_1st_field_empty() {
        let line = ":1:2:3";
        let delim = ":";

        assert_eq!(
            b":2\n".to_vec(),
            cut_fields(line, delim, vec![(1, 1), (3, 3)])
        );
        assert_eq!(
            b":2:3\n".to_vec(),
            cut_fields(line, delim, vec![(1, 1), (3, 3), (4, 4)])
        );
        assert_eq!(
            b":3\n".to_vec(),
            cut_fields(line, delim, vec![(1, 1), (4, 4)])
        );
        assert_eq!(
            b":2:3\n".to_vec(),
            cut_fields(line, delim, vec![(1, 1), (3, 4)])
        );
        assert_eq!(
            b":2:3\n".to_vec(),
            cut_fields(line, delim, vec![(1, 1), (3, 5)])
        );
    }

    #[test]
    fn test_process_utf8_fields_for_line_1st_field_empty() {
        // Fields: "", hatching chick, front-facing chick, rooster
        let line = ":\u{1F423}:\u{1F425}:\u{1F413}";
        let delim = ":";

        assert_eq!(
            ":\u{1F425}\n".as_bytes().to_vec(),
            cut_fields(line, delim, vec![(1, 1), (3, 3)])
        );
        assert_eq!(
            ":\u{1F425}:\u{1F413}\n".as_bytes().to_vec(),
            cut_fields(line, delim, vec![(1, 1), (3, 3), (4, 4)])
        );
        assert_eq!(
            ":\u{1F413}\n".as_bytes().to_vec(),
            cut_fields(line, delim, vec![(1, 1), (4, 4)])
        );
        assert_eq!(
            ":\u{1F425}:\u{1F413}\n".as_bytes().to_vec(),
            cut_fields(line, delim, vec![(1, 1), (3, 4)])
        );
        assert_eq!(
            ":\u{1F425}:\u{1F413}\n".as_bytes().to_vec(),
            cut_fields(line, delim, vec![(1, 1), (3, 5)])
        );
    }
}