Skip to main content

stryke/
par_lines.rs

1//! Memory-mapped parallel line iteration for `par_lines PATH, fn { ... }`.
2//! Splits the file into byte ranges aligned to line starts, then processes each chunk in parallel
3//! with rayon (each chunk scans its lines sequentially).
4
/// Split `data` into at most `max_chunks` contiguous byte ranges `[start, end)` that never cut a
/// line in half.
///
/// The buffer is first divided into `k = clamp(max_chunks, 1, data.len())` roughly equal pieces;
/// every interior boundary is then moved forward to the next line start (the byte right after a
/// `\n`), or to the end of the buffer when no further newline exists. Pieces that become empty
/// after this adjustment are dropped, so fewer than `k` ranges may be returned.
///
/// # Returns
///
/// Ranges in ascending order that together cover `0..data.len()` exactly once. Each range starts
/// at byte 0 or immediately after a `\n`. An empty `data` yields an empty vector; a
/// `max_chunks` of 0 is treated as 1.
pub fn line_aligned_chunks(data: &[u8], max_chunks: usize) -> Vec<(usize, usize)> {
    let len = data.len();
    if len == 0 {
        return Vec::new();
    }
    let k = max_chunks.max(1).min(len);

    let mut out = Vec::new();
    // Start of the chunk currently being built; always a line start (or 0).
    let mut start = 0usize;
    for i in 1..=k {
        let end = if i == k {
            // The last boundary is always the end of the buffer.
            len
        } else {
            // floor(i * len / k), computed in u128 so the product cannot overflow `usize`
            // (it easily does on 32-bit targets). `i < k`, so the quotient is `< len` and the
            // narrowing cast is lossless.
            let raw = ((i as u128 * len as u128) / k as u128) as usize;
            if raw <= start {
                // The previous boundary already skipped past this raw split while looking for a
                // newline; it is also the first line start at or after `raw`, so reuse it
                // instead of rescanning the same bytes.
                start
            } else if data[raw - 1] == b'\n' {
                // `raw > start >= 0`, so `raw - 1` is in bounds. Already on a line start.
                raw
            } else {
                // Advance to just past the next newline, or to the end if there is none.
                data[raw..]
                    .iter()
                    .position(|&b| b == b'\n')
                    .map_or(len, |off| raw + off + 1)
            }
        };
        if start < end {
            out.push((start, end));
            start = end;
        }
        if start == len {
            // Everything is covered; any remaining boundaries would only produce empty ranges.
            break;
        }
    }
    out
}
40
/// Number of lines in `data`, where lines are separated by `\n`.
///
/// An empty buffer has zero lines. A final line that is not terminated by `\n` still counts as
/// a line.
pub fn line_count_bytes(data: &[u8]) -> usize {
    match data.last() {
        None => 0,
        Some(&last) => {
            let newlines = data.iter().filter(|&&byte| byte == b'\n').count();
            // An unterminated trailing line contributes one extra line.
            newlines + usize::from(last != b'\n')
        }
    }
}
52
53/// Convert one line of bytes (no `\n`) to a Perl string; strips trailing `\r` for CRLF.
54pub fn line_to_perl_string(line: &[u8]) -> String {
55    let line = if line.ends_with(b"\r") && !line.is_empty() {
56        &line[..line.len() - 1]
57    } else {
58        line
59    };
60    crate::perl_decode::decode_utf8_or_latin1_line(line)
61}
62
#[cfg(test)]
mod tests {
    use super::*;

    /// Concatenate the bytes covered by `chunks`, in order.
    fn rebuild(data: &[u8], chunks: &[(usize, usize)]) -> Vec<u8> {
        let mut joined = Vec::new();
        for &(start, end) in chunks {
            joined.extend_from_slice(&data[start..end]);
        }
        joined
    }

    #[test]
    fn line_aligned_chunks_splits_without_breaking_lines() {
        let data = b"a\nbb\nccc\n";
        let chunks = line_aligned_chunks(data, 4);
        // The chunks must tile the input exactly.
        assert_eq!(rebuild(data, &chunks), data);
        // Every chunk that does not begin the buffer must begin right after a newline.
        for &(start, _) in chunks.iter().filter(|&&(start, _)| start > 0) {
            assert_eq!(data[start - 1], b'\n');
        }
    }

    #[test]
    fn line_count_bytes_matches_scan() {
        let cases: [(&[u8], usize); 4] = [(b"", 0), (b"a\nb", 2), (b"a\nb\n", 2), (b"a", 1)];
        for (input, expected) in cases {
            assert_eq!(line_count_bytes(input), expected);
        }
    }

    #[test]
    fn scan_lines_in_slice_three_lines() {
        let data = b"one\ntwo\nthree";
        let mut lines: Vec<&[u8]> = Vec::new();
        let mut rest: &[u8] = data;
        // Peel one line off the front per iteration; a final unterminated line ends the scan.
        while !rest.is_empty() {
            match rest.iter().position(|&b| b == b'\n') {
                Some(newline) => {
                    lines.push(&rest[..newline]);
                    rest = &rest[newline + 1..];
                }
                None => {
                    lines.push(rest);
                    break;
                }
            }
        }
        assert_eq!(lines, vec![&b"one"[..], &b"two"[..], &b"three"[..]]);
    }

    #[test]
    fn line_aligned_chunks_empty_input() {
        let chunks = line_aligned_chunks(&[], 8);
        assert!(chunks.is_empty());
    }

    #[test]
    fn line_aligned_chunks_single_byte() {
        assert_eq!(line_aligned_chunks(b"x", 4), vec![(0, 1)]);
    }

    #[test]
    fn line_aligned_chunks_max_chunks_zero_uses_one() {
        let data = b"a\nb\n";
        let chunks = line_aligned_chunks(data, 0);
        assert!(!chunks.is_empty());
        assert_eq!(rebuild(data, &chunks), data);
    }

    #[test]
    fn line_to_perl_string_strips_cr() {
        let decoded = line_to_perl_string(b"row\r");
        assert_eq!(decoded, "row");
    }

    #[test]
    fn line_to_perl_string_invalid_utf8_maps_octets() {
        let decoded = line_to_perl_string(&[0xff, 0xfe]);
        assert_eq!(decoded, "\u{00ff}\u{00fe}");
    }
}