coreutils_rs/fold/
core.rs

1use std::io::Write;
2
3/// Fold (wrap) lines to a given width.
4///
5/// Modes:
6/// - `bytes` mode (-b): count bytes, break at byte boundaries
7/// - default mode: count columns (tab = advance to next tab stop, backspace = decrement)
8///
9/// If `spaces` (-s): break at the last space within the width instead of mid-word.
10pub fn fold_bytes(
11    data: &[u8],
12    width: usize,
13    count_bytes: bool,
14    break_at_spaces: bool,
15    out: &mut impl Write,
16) -> std::io::Result<()> {
17    if data.is_empty() {
18        return Ok(());
19    }
20
21    if width == 0 {
22        return fold_width_zero(data, out);
23    }
24
25    // Fast path: byte mode without -s, use SIMD-accelerated scanning
26    if count_bytes && !break_at_spaces {
27        return fold_byte_fast(data, width, out);
28    }
29
30    let mut output = Vec::with_capacity(data.len() + data.len() / width);
31
32    if count_bytes {
33        fold_byte_mode(data, width, break_at_spaces, &mut output);
34    } else {
35        fold_column_mode(data, width, break_at_spaces, &mut output);
36    }
37
38    out.write_all(&output)
39}
40
41/// Width 0: GNU fold behavior — each byte becomes a newline.
42fn fold_width_zero(data: &[u8], out: &mut impl Write) -> std::io::Result<()> {
43    let output = vec![b'\n'; data.len()];
44    out.write_all(&output)
45}
46
47/// Fast fold by byte count without -s flag.
48/// Uses memchr to find newlines, bulk-copies runs, inserts breaks at exact positions.
49fn fold_byte_fast(data: &[u8], width: usize, out: &mut impl Write) -> std::io::Result<()> {
50    // Each line can have at most one extra newline inserted
51    let mut output = Vec::with_capacity(data.len() + data.len() / width + 1);
52    let mut pos: usize = 0;
53
54    while pos < data.len() {
55        // Find the next newline within the remaining data
56        let remaining = &data[pos..];
57
58        match memchr::memchr(b'\n', remaining) {
59            Some(nl_offset) => {
60                // Process the segment up to (and including) the newline
61                let segment = &data[pos..pos + nl_offset + 1];
62                fold_segment_bytes(&mut output, segment, width);
63                pos += nl_offset + 1;
64            }
65            None => {
66                // No more newlines: process the rest
67                fold_segment_bytes(&mut output, &data[pos..], width);
68                break;
69            }
70        }
71    }
72
73    out.write_all(&output)
74}
75
76/// Fold a single line segment (no internal newlines except possibly trailing) by bytes.
77#[inline]
78fn fold_segment_bytes(output: &mut Vec<u8>, segment: &[u8], width: usize) {
79    let mut start = 0;
80    while start + width < segment.len() {
81        // Check if the character at start+width is a newline (end of line)
82        if segment[start + width] == b'\n' {
83            output.extend_from_slice(&segment[start..start + width + 1]);
84            return;
85        }
86        output.extend_from_slice(&segment[start..start + width]);
87        output.push(b'\n');
88        start += width;
89    }
90    // Remaining bytes
91    if start < segment.len() {
92        output.extend_from_slice(&segment[start..]);
93    }
94}
95
96/// Fold by byte count with -s (break at spaces).
97/// When breaking at a space, uses copy_within instead of allocating a temporary Vec.
98fn fold_byte_mode(data: &[u8], width: usize, break_at_spaces: bool, output: &mut Vec<u8>) {
99    let mut col: usize = 0;
100    let mut last_space_out_pos: Option<usize> = None;
101
102    for &byte in data {
103        if byte == b'\n' {
104            output.push(b'\n');
105            col = 0;
106            last_space_out_pos = None;
107            continue;
108        }
109
110        if col >= width {
111            if break_at_spaces {
112                if let Some(sp_pos) = last_space_out_pos {
113                    // Insert newline after the space and shift trailing bytes forward
114                    let tail_start = sp_pos + 1;
115                    let tail_end = output.len();
116                    let after_len = tail_end - tail_start;
117                    output.push(0); // make room for the newline
118                    output.copy_within(tail_start..tail_end, tail_start + 1);
119                    output[tail_start] = b'\n';
120                    col = after_len;
121                    last_space_out_pos = None;
122                } else {
123                    output.push(b'\n');
124                    col = 0;
125                }
126            } else {
127                output.push(b'\n');
128                col = 0;
129            }
130        }
131
132        if break_at_spaces && (byte == b' ' || byte == b'\t') {
133            last_space_out_pos = Some(output.len());
134        }
135
136        output.push(byte);
137        col += 1;
138    }
139}
140
141/// Fold by column count (default mode, handles tabs, backspaces, and UTF-8).
142fn fold_column_mode(data: &[u8], width: usize, break_at_spaces: bool, output: &mut Vec<u8>) {
143    let mut pos = 0;
144
145    while pos < data.len() {
146        // Find the next newline using SIMD
147        let remaining = &data[pos..];
148        let line_end = memchr::memchr(b'\n', remaining).map(|p| pos + p);
149        let line_data = match line_end {
150            Some(nl) => &data[pos..nl],
151            None => &data[pos..],
152        };
153
154        // Fast path: pure ASCII, no tabs/backspaces — column == byte count
155        if is_ascii_simple(line_data) {
156            if line_data.len() <= width {
157                // Short line: no wrapping needed
158                output.extend_from_slice(line_data);
159            } else if break_at_spaces {
160                fold_ascii_line_spaces(line_data, width, output);
161            } else {
162                fold_segment_bytes(output, line_data, width);
163            }
164            if let Some(nl) = line_end {
165                output.push(b'\n');
166                pos = nl + 1;
167            } else {
168                break;
169            }
170            continue;
171        }
172
173        // Slow path: process character by character for this line
174        fold_one_line_column(line_data, width, break_at_spaces, output);
175        if let Some(nl) = line_end {
176            output.push(b'\n');
177            pos = nl + 1;
178        } else {
179            break;
180        }
181    }
182}
183
184/// Fast fold an ASCII line with -s (break at spaces).
185/// Since it's pure ASCII, column == byte position.
186fn fold_ascii_line_spaces(line: &[u8], width: usize, output: &mut Vec<u8>) {
187    let mut start = 0;
188    while start + width < line.len() {
189        // Look for the last space within the width
190        let chunk = &line[start..start + width];
191        match memchr::memrchr(b' ', chunk) {
192            Some(sp_offset) => {
193                // Break after the space (include the space, then newline)
194                let break_at = start + sp_offset + 1;
195                output.extend_from_slice(&line[start..break_at]);
196                output.push(b'\n');
197                start = break_at;
198            }
199            None => {
200                // No space found: hard break at width
201                output.extend_from_slice(&line[start..start + width]);
202                output.push(b'\n');
203                start += width;
204            }
205        }
206    }
207    // Remaining bytes
208    if start < line.len() {
209        output.extend_from_slice(&line[start..]);
210    }
211}
212
213/// Check if a line is pure ASCII with no tabs or backspaces.
214#[inline]
215fn is_ascii_simple(data: &[u8]) -> bool {
216    // All bytes must be ASCII printable (0x20..=0x7E) or space
217    data.iter().all(|&b| b >= 0x20 && b <= 0x7E)
218}
219
220/// Get the column width and byte length of a byte at `data[pos]`.
221/// Returns (column_width, byte_length) — always (1, 1) for non-special bytes.
222///
223/// GNU fold's multibyte path is guarded by:
224///   `#if HAVE_MBRTOC32 && (! defined __GLIBC__ || defined __UCLIBC__)`
225/// On glibc (every mainstream Linux distro), that condition is false, so
226/// fold counts bytes — one column per byte, same as -b mode.
227/// Tab, backspace, and CR are handled by the caller.
228#[inline]
229fn char_info(data: &[u8], pos: usize) -> (usize, usize) {
230    let b = data[pos];
231    if b < 0x80 {
232        // ASCII: tab/backspace handled by caller; control chars have 0 width
233        if b < 0x20 || b == 0x7f {
234            (0, 1)
235        } else {
236            (1, 1)
237        }
238    } else {
239        // High byte: count as 1 column, 1 byte (GNU glibc compat)
240        (1, 1)
241    }
242}
243
244/// Process a single line (no newlines) in column mode, writing to output.
245///
246/// Uses a scan-and-flush approach: tracks break points in the INPUT data,
247/// then writes complete segments. Avoids copy_within for -s mode.
248fn fold_one_line_column(line: &[u8], width: usize, break_at_spaces: bool, output: &mut Vec<u8>) {
249    let mut col: usize = 0;
250    // For -s mode: track last space in input, not output
251    let mut last_space_in: Option<usize> = None; // byte index in `line` AFTER the space
252    let mut col_at_space: usize = 0;
253    // CR/backspace change col non-linearly, invalidating `col - col_at_space`.
254    // When set, we must use recalc_column() to replay from the space marker.
255    let mut needs_recalc = false;
256    let mut seg_start: usize = 0; // start of current unflushed segment in `line`
257    let mut i = 0;
258
259    while i < line.len() {
260        let byte = line[i];
261
262        // Handle tab specially
263        if byte == b'\t' {
264            let tab_width = ((col / 8) + 1) * 8 - col;
265
266            if col + tab_width > width && tab_width > 0 {
267                // Need to break before this tab
268                if break_at_spaces {
269                    if let Some(sp_after) = last_space_in {
270                        // Flush up to and including the space, then newline
271                        output.extend_from_slice(&line[seg_start..sp_after]);
272                        output.push(b'\n');
273                        seg_start = sp_after;
274                        col = if needs_recalc {
275                            recalc_column(&line[sp_after..i])
276                        } else {
277                            col - col_at_space
278                        };
279                        last_space_in = None;
280                        needs_recalc = false;
281                        // Re-evaluate this tab with the new col — it may
282                        // still exceed width after the space break.
283                        continue;
284                    } else {
285                        output.extend_from_slice(&line[seg_start..i]);
286                        output.push(b'\n');
287                        seg_start = i;
288                        col = 0;
289                    }
290                } else {
291                    output.extend_from_slice(&line[seg_start..i]);
292                    output.push(b'\n');
293                    seg_start = i;
294                    col = 0;
295                }
296            }
297
298            if break_at_spaces {
299                last_space_in = Some(i + 1);
300                col_at_space = col + ((col / 8) + 1) * 8 - col;
301                needs_recalc = false;
302            }
303            col += ((col / 8) + 1) * 8 - col;
304            i += 1;
305            continue;
306        }
307
308        // Handle carriage return: resets column to 0 (GNU adjust_column compat).
309        // Invalidates `col - col_at_space` but keeps the space marker —
310        // GNU fold still breaks at the last space even after CR.
311        if byte == b'\r' {
312            col = 0;
313            if last_space_in.is_some() {
314                needs_recalc = true;
315            }
316            i += 1;
317            continue;
318        }
319
320        // Handle backspace: decrements column non-linearly.
321        // Invalidates `col - col_at_space` but keeps the space marker.
322        if byte == b'\x08' {
323            if col > 0 {
324                col -= 1;
325            }
326            if last_space_in.is_some() {
327                needs_recalc = true;
328            }
329            i += 1;
330            continue;
331        }
332
333        // Get character info (display width + byte length)
334        let (cw, byte_len) = char_info(line, i);
335
336        // Check if adding this character would exceed width
337        if col + cw > width && cw > 0 {
338            if break_at_spaces {
339                if let Some(sp_after) = last_space_in {
340                    output.extend_from_slice(&line[seg_start..sp_after]);
341                    output.push(b'\n');
342                    seg_start = sp_after;
343                    col = if needs_recalc {
344                        recalc_column(&line[sp_after..i])
345                    } else {
346                        col - col_at_space
347                    };
348                    last_space_in = None;
349                    needs_recalc = false;
350                    // Re-evaluate this character with the new col — it may
351                    // still exceed width after the space break.
352                    continue;
353                } else {
354                    output.extend_from_slice(&line[seg_start..i]);
355                    output.push(b'\n');
356                    seg_start = i;
357                    col = 0;
358                }
359            } else {
360                output.extend_from_slice(&line[seg_start..i]);
361                output.push(b'\n');
362                seg_start = i;
363                col = 0;
364            }
365        }
366
367        if break_at_spaces && byte == b' ' {
368            last_space_in = Some(i + 1);
369            col_at_space = col + cw;
370            needs_recalc = false;
371        }
372
373        col += cw;
374        i += byte_len;
375    }
376
377    // Flush remaining segment
378    if seg_start < line.len() {
379        output.extend_from_slice(&line[seg_start..]);
380    }
381}
382
383/// Recalculate column position by replaying a segment (handles tabs, CR, backspace).
384/// Used when non-linear column operations (CR, backspace) invalidate the fast
385/// `col - col_at_space` delta formula.
386fn recalc_column(data: &[u8]) -> usize {
387    let mut col = 0;
388    let mut i = 0;
389    while i < data.len() {
390        let b = data[i];
391        if b == b'\r' {
392            col = 0;
393            i += 1;
394        } else if b == b'\t' {
395            col = ((col / 8) + 1) * 8;
396            i += 1;
397        } else if b == b'\x08' {
398            if col > 0 {
399                col -= 1;
400            }
401            i += 1;
402        } else if b < 0x80 {
403            if b >= 0x20 && b != 0x7f {
404                col += 1;
405            }
406            i += 1;
407        } else {
408            let (cw, byte_len) = char_info(data, i);
409            col += cw;
410            i += byte_len;
411        }
412    }
413    col
414}
coreutils_rs/fold/core.rs

coreutils_rs/fold/
core.rs