// coreutils_rs/fold/core.rs — core implementation of the `fold` (line-wrapping) utility.
1use std::io::Write;
2
3/// Fold (wrap) lines to a given width.
4///
5/// Modes:
6/// - `bytes` mode (-b): count bytes, break at byte boundaries
7/// - default mode: count columns (tab = advance to next tab stop, backspace = decrement)
8///
9/// If `spaces` (-s): break at the last space within the width instead of mid-word.
10pub fn fold_bytes(
11    data: &[u8],
12    width: usize,
13    count_bytes: bool,
14    break_at_spaces: bool,
15    out: &mut impl Write,
16) -> std::io::Result<()> {
17    if data.is_empty() {
18        return Ok(());
19    }
20
21    if width == 0 {
22        return fold_width_zero(data, out);
23    }
24
25    // Fast path: byte mode, use SIMD-accelerated scanning
26    if count_bytes {
27        if break_at_spaces {
28            return fold_byte_fast_spaces(data, width, out);
29        } else {
30            return fold_byte_fast(data, width, out);
31        }
32    }
33
34    // Column mode without tabs: byte mode is equivalent (on glibc)
35    if memchr::memchr(b'\t', data).is_none() {
36        if break_at_spaces {
37            return fold_byte_fast_spaces(data, width, out);
38        } else {
39            return fold_byte_fast(data, width, out);
40        }
41    }
42
43    fold_column_mode_streaming(data, width, break_at_spaces, out)
44}
45
/// Width 0: GNU fold behavior — emit one newline per input byte.
fn fold_width_zero(data: &[u8], out: &mut impl Write) -> std::io::Result<()> {
    let newlines: Vec<u8> = std::iter::repeat(b'\n').take(data.len()).collect();
    out.write_all(&newlines)
}
51
/// Parallel threshold for fold byte mode: inputs of at least 32 MiB are
/// split into line-aligned chunks and folded on the rayon thread pool.
const FOLD_BYTE_PARALLEL_THRESHOLD: usize = 32 * 1024 * 1024;
54
55/// Fast fold by byte count without -s flag.
56/// Uses unsafe pointer copies and a pre-allocated 1MB output buffer.
57/// For files >= 32MB, uses rayon parallel processing on line-aligned chunks.
58fn fold_byte_fast(data: &[u8], width: usize, out: &mut impl Write) -> std::io::Result<()> {
59    if data.len() >= FOLD_BYTE_PARALLEL_THRESHOLD {
60        return fold_byte_fast_parallel(data, width, out);
61    }
62
63    const BUF_CAP: usize = 1024 * 1024 + 4096;
64    let mut buf: Vec<u8> = Vec::with_capacity(BUF_CAP);
65    let src = data.as_ptr();
66    let mut wp: usize = 0;
67    let mut base = buf.as_mut_ptr();
68    let mut seg_start = 0usize;
69
70    for nl_pos in memchr::memchr_iter(b'\n', data) {
71        let seg_len = nl_pos - seg_start;
72
73        if seg_len <= width {
74            let total = seg_len + 1;
75            if wp + total > BUF_CAP {
76                unsafe { buf.set_len(wp) };
77                out.write_all(&buf)?;
78                buf.clear();
79                wp = 0;
80                base = buf.as_mut_ptr();
81            }
82            unsafe {
83                std::ptr::copy_nonoverlapping(src.add(seg_start), base.add(wp), total);
84            }
85            wp += total;
86        } else {
87            let mut off = seg_start;
88            let end = nl_pos;
89            while off + width < end {
90                let chunk = width + 1;
91                if wp + chunk > BUF_CAP {
92                    unsafe { buf.set_len(wp) };
93                    out.write_all(&buf)?;
94                    buf.clear();
95                    wp = 0;
96                    base = buf.as_mut_ptr();
97                }
98                unsafe {
99                    std::ptr::copy_nonoverlapping(src.add(off), base.add(wp), width);
100                    *base.add(wp + width) = b'\n';
101                }
102                wp += chunk;
103                off += width;
104            }
105            let rem = end - off + 1;
106            if wp + rem > BUF_CAP {
107                unsafe { buf.set_len(wp) };
108                out.write_all(&buf)?;
109                buf.clear();
110                wp = 0;
111                base = buf.as_mut_ptr();
112            }
113            unsafe {
114                std::ptr::copy_nonoverlapping(src.add(off), base.add(wp), rem);
115            }
116            wp += rem;
117        }
118        seg_start = nl_pos + 1;
119    }
120
121    if seg_start < data.len() {
122        let mut off = seg_start;
123        let end = data.len();
124        while off + width < end {
125            let chunk = width + 1;
126            if wp + chunk > BUF_CAP {
127                unsafe { buf.set_len(wp) };
128                out.write_all(&buf)?;
129                buf.clear();
130                wp = 0;
131                base = buf.as_mut_ptr();
132            }
133            unsafe {
134                std::ptr::copy_nonoverlapping(src.add(off), base.add(wp), width);
135                *base.add(wp + width) = b'\n';
136            }
137            wp += chunk;
138            off += width;
139        }
140        if off < end {
141            let rem = end - off;
142            if wp + rem > BUF_CAP {
143                unsafe { buf.set_len(wp) };
144                out.write_all(&buf)?;
145                buf.clear();
146                wp = 0;
147                base = buf.as_mut_ptr();
148            }
149            unsafe {
150                std::ptr::copy_nonoverlapping(src.add(off), base.add(wp), rem);
151            }
152            wp += rem;
153        }
154    }
155
156    if wp > 0 {
157        unsafe { buf.set_len(wp) };
158        out.write_all(&buf)?;
159    }
160    Ok(())
161}
162
163/// Parallel fold by byte count. Splits at newline boundaries, processes in parallel.
164fn fold_byte_fast_parallel(data: &[u8], width: usize, out: &mut impl Write) -> std::io::Result<()> {
165    use rayon::prelude::*;
166
167    let num_chunks = rayon::current_num_threads().max(2);
168    let target_chunk_size = data.len() / num_chunks;
169    let mut chunks: Vec<&[u8]> = Vec::with_capacity(num_chunks + 1);
170    let mut pos: usize = 0;
171
172    for _ in 0..num_chunks - 1 {
173        if pos >= data.len() {
174            break;
175        }
176        let target_end = (pos + target_chunk_size).min(data.len());
177        let chunk_end = if target_end >= data.len() {
178            data.len()
179        } else {
180            match memchr::memchr(b'\n', &data[target_end..]) {
181                Some(off) => target_end + off + 1,
182                None => data.len(),
183            }
184        };
185        chunks.push(&data[pos..chunk_end]);
186        pos = chunk_end;
187    }
188    if pos < data.len() {
189        chunks.push(&data[pos..]);
190    }
191
192    let results: Vec<Vec<u8>> = chunks
193        .par_iter()
194        .map(|chunk| {
195            let mut buf = Vec::with_capacity(chunk.len() + chunk.len() / width + 256);
196            fold_byte_chunk(chunk, width, &mut buf);
197            buf
198        })
199        .collect();
200
201    for result in &results {
202        if !result.is_empty() {
203            out.write_all(result)?;
204        }
205    }
206    Ok(())
207}
208
/// Process a chunk for fold byte mode into a Vec<u8>.
/// Uses unsafe pointer copies for maximum throughput.
///
/// Appends folded output to `buf`; existing contents are preserved.
/// Caller must guarantee `width > 0` (it is used as a divisor below).
fn fold_byte_chunk(data: &[u8], width: usize, buf: &mut Vec<u8>) {
    if data.is_empty() {
        return;
    }

    // Upper bound on appended bytes: the input itself plus at most one
    // inserted '\n' per `width` input bytes, with slack. Reserving up front
    // lets the raw-pointer writes below skip per-write capacity checks.
    let needed = data.len() + data.len() / width + 256;
    buf.reserve(needed);
    let base = buf.as_mut_ptr();
    let src = data.as_ptr();
    let initial_len = buf.len();
    let mut wp: usize = initial_len; // write offset into `buf`'s storage
    let mut seg_start = 0usize; // start of the current input line

    for nl_pos in memchr::memchr_iter(b'\n', data) {
        let seg_len = nl_pos - seg_start;

        if seg_len <= width {
            // Line already fits: copy it verbatim, including its '\n'.
            let total = seg_len + 1;
            // SAFETY: within the capacity reserved above; src/dst disjoint.
            unsafe {
                std::ptr::copy_nonoverlapping(src.add(seg_start), base.add(wp), total);
            }
            wp += total;
        } else {
            // Long line: emit width-byte pieces, each followed by '\n'.
            let mut off = seg_start;
            let end = nl_pos;
            while off + width < end {
                // SAFETY: within the capacity reserved above; src/dst disjoint.
                unsafe {
                    std::ptr::copy_nonoverlapping(src.add(off), base.add(wp), width);
                    *base.add(wp + width) = b'\n';
                }
                wp += width + 1;
                off += width;
            }
            // Remainder of the line plus the original '\n'.
            let rem = end - off + 1;
            // SAFETY: within the capacity reserved above; src/dst disjoint.
            unsafe {
                std::ptr::copy_nonoverlapping(src.add(off), base.add(wp), rem);
            }
            wp += rem;
        }
        seg_start = nl_pos + 1;
    }

    // Handle final segment without trailing newline
    if seg_start < data.len() {
        let mut off = seg_start;
        let end = data.len();
        while off + width < end {
            // SAFETY: within the capacity reserved above; src/dst disjoint.
            unsafe {
                std::ptr::copy_nonoverlapping(src.add(off), base.add(wp), width);
                *base.add(wp + width) = b'\n';
            }
            wp += width + 1;
            off += width;
        }
        if off < end {
            let rem = end - off;
            // SAFETY: within the capacity reserved above; src/dst disjoint.
            unsafe {
                std::ptr::copy_nonoverlapping(src.add(off), base.add(wp), rem);
            }
            wp += rem;
        }
    }

    // SAFETY: exactly `wp` bytes of `buf` are initialized at this point.
    unsafe {
        buf.set_len(wp);
    }
}
278
279/// Fast fold by byte count with -s (break at spaces).
280/// For files >= 32MB, uses rayon parallel processing.
281fn fold_byte_fast_spaces(data: &[u8], width: usize, out: &mut impl Write) -> std::io::Result<()> {
282    if data.len() >= FOLD_BYTE_PARALLEL_THRESHOLD {
283        return fold_byte_spaces_parallel(data, width, out);
284    }
285
286    let mut outbuf: Vec<u8> = Vec::with_capacity(1024 * 1024 + 4096);
287    let mut pos: usize = 0;
288
289    for nl_pos in memchr::memchr_iter(b'\n', data) {
290        let segment = &data[pos..nl_pos];
291        fold_segment_bytes_spaces_buffered(segment, width, &mut outbuf);
292        outbuf.push(b'\n');
293        pos = nl_pos + 1;
294
295        if outbuf.len() >= 1024 * 1024 {
296            out.write_all(&outbuf)?;
297            outbuf.clear();
298        }
299    }
300
301    if pos < data.len() {
302        fold_segment_bytes_spaces_buffered(&data[pos..], width, &mut outbuf);
303    }
304
305    if !outbuf.is_empty() {
306        out.write_all(&outbuf)?;
307    }
308    Ok(())
309}
310
/// Parallel fold by byte count with -s.
///
/// Splits `data` into roughly equal, newline-aligned chunks (one per rayon
/// worker), folds each chunk independently, then writes the results in the
/// original order so output is identical to the sequential path.
fn fold_byte_spaces_parallel(
    data: &[u8],
    width: usize,
    out: &mut impl Write,
) -> std::io::Result<()> {
    use rayon::prelude::*;

    let num_chunks = rayon::current_num_threads().max(2);
    let target_chunk_size = data.len() / num_chunks;
    let mut chunks: Vec<&[u8]> = Vec::with_capacity(num_chunks + 1);
    let mut pos: usize = 0;

    // Carve out num_chunks-1 chunks, each extended to include the next '\n'
    // so no line straddles a boundary; the remainder forms the final chunk.
    for _ in 0..num_chunks - 1 {
        if pos >= data.len() {
            break;
        }
        let target_end = (pos + target_chunk_size).min(data.len());
        let chunk_end = if target_end >= data.len() {
            data.len()
        } else {
            match memchr::memchr(b'\n', &data[target_end..]) {
                Some(off) => target_end + off + 1,
                None => data.len(),
            }
        };
        chunks.push(&data[pos..chunk_end]);
        pos = chunk_end;
    }
    if pos < data.len() {
        chunks.push(&data[pos..]);
    }

    // Fold chunks in parallel; each produces its own output buffer.
    let results: Vec<Vec<u8>> = chunks
        .par_iter()
        .map(|chunk| {
            let mut buf = Vec::with_capacity(chunk.len() + chunk.len() / width + 256);
            fold_byte_spaces_chunk(chunk, width, &mut buf);
            buf
        })
        .collect();

    // Sequential, in-order writes preserve the original byte order.
    for result in &results {
        if !result.is_empty() {
            out.write_all(result)?;
        }
    }
    Ok(())
}
360
361/// Process a chunk for fold byte mode with -s into a Vec<u8>.
362fn fold_byte_spaces_chunk(data: &[u8], width: usize, outbuf: &mut Vec<u8>) {
363    let mut pos: usize = 0;
364
365    for nl_pos in memchr::memchr_iter(b'\n', data) {
366        let segment = &data[pos..nl_pos];
367        fold_segment_bytes_spaces_buffered(segment, width, outbuf);
368        outbuf.push(b'\n');
369        pos = nl_pos + 1;
370    }
371
372    if pos < data.len() {
373        fold_segment_bytes_spaces_buffered(&data[pos..], width, outbuf);
374    }
375}
376
/// Parallel threshold for fold column mode: inputs of at least 4 MiB are
/// split into line-aligned chunks and folded on the rayon thread pool.
const FOLD_PARALLEL_THRESHOLD: usize = 4 * 1024 * 1024;
379
380/// Streaming fold by column count — single-pass stream using memchr2.
381/// For large files (>4MB), uses rayon parallel processing on line-aligned chunks.
382/// Each chunk is processed independently since column resets at newlines.
383fn fold_column_mode_streaming(
384    data: &[u8],
385    width: usize,
386    break_at_spaces: bool,
387    out: &mut impl Write,
388) -> std::io::Result<()> {
389    if break_at_spaces {
390        return fold_column_mode_spaces_streaming(data, width, out);
391    }
392
393    if data.len() >= FOLD_PARALLEL_THRESHOLD {
394        return fold_column_parallel(data, width, out);
395    }
396
397    let mut outbuf: Vec<u8> = Vec::with_capacity(data.len() + data.len() / 4);
398    fold_column_chunk(data, width, &mut outbuf);
399    if !outbuf.is_empty() {
400        out.write_all(&outbuf)?;
401    }
402    Ok(())
403}
404
/// Parallel fold for column mode with tabs. Splits data into line-aligned chunks,
/// processes each in parallel with rayon, writes results in order.
///
/// Chunking is safe because the column counter resets at every newline, so
/// each newline-terminated chunk folds identically in isolation.
fn fold_column_parallel(data: &[u8], width: usize, out: &mut impl Write) -> std::io::Result<()> {
    use rayon::prelude::*;

    let num_chunks = rayon::current_num_threads().max(2);
    let target_chunk_size = data.len() / num_chunks;
    let mut chunks: Vec<&[u8]> = Vec::with_capacity(num_chunks + 1);
    let mut pos: usize = 0;

    // Carve out num_chunks-1 chunks, each extended to include the next '\n'
    // so no line straddles a boundary; the remainder forms the final chunk.
    for _ in 0..num_chunks - 1 {
        if pos >= data.len() {
            break;
        }
        let target_end = (pos + target_chunk_size).min(data.len());
        let chunk_end = if target_end >= data.len() {
            data.len()
        } else {
            match memchr::memchr(b'\n', &data[target_end..]) {
                Some(off) => target_end + off + 1,
                None => data.len(),
            }
        };
        chunks.push(&data[pos..chunk_end]);
        pos = chunk_end;
    }
    if pos < data.len() {
        chunks.push(&data[pos..]);
    }

    // Fold chunks in parallel; each produces its own output buffer.
    let results: Vec<Vec<u8>> = chunks
        .par_iter()
        .map(|chunk| {
            let mut buf = Vec::with_capacity(chunk.len() + chunk.len() / 4);
            fold_column_chunk(chunk, width, &mut buf);
            buf
        })
        .collect();

    // Sequential, in-order writes preserve the original byte order.
    for result in &results {
        if !result.is_empty() {
            out.write_all(result)?;
        }
    }
    Ok(())
}
451
452/// Process a chunk for fold column mode using unsafe pointer writes.
453/// Uses memchr2 SIMD scanning for tabs and newlines, with raw pointer output.
454fn fold_column_chunk(data: &[u8], width: usize, outbuf: &mut Vec<u8>) {
455    if data.is_empty() {
456        return;
457    }
458
459    // Worst case: every char at width boundary → output ≈ 2x input
460    let worst = data.len() * 2 + 4096;
461    outbuf.reserve(worst);
462
463    let src = data.as_ptr();
464    let out_base = outbuf.as_mut_ptr();
465    let initial_len = outbuf.len();
466    let mut wp: usize = initial_len;
467    let mut col: usize = 0;
468    let mut seg_start: usize = 0;
469    let mut i: usize = 0;
470
471    while i < data.len() {
472        match memchr::memchr2(b'\t', b'\n', &data[i..]) {
473            Some(off) => {
474                let special_pos = i + off;
475                let run_len = special_pos - i;
476
477                if col + run_len > width {
478                    loop {
479                        let remaining = special_pos - i;
480                        let fit = width - col;
481                        if fit >= remaining {
482                            col += remaining;
483                            i = special_pos;
484                            break;
485                        }
486                        let copy_len = i + fit - seg_start;
487                        unsafe {
488                            std::ptr::copy_nonoverlapping(
489                                src.add(seg_start),
490                                out_base.add(wp),
491                                copy_len,
492                            );
493                            wp += copy_len;
494                            *out_base.add(wp) = b'\n';
495                            wp += 1;
496                        }
497                        i += fit;
498                        seg_start = i;
499                        col = 0;
500                    }
501                } else {
502                    col += run_len;
503                    i = special_pos;
504                }
505
506                if data[i] == b'\n' {
507                    let copy_len = i + 1 - seg_start;
508                    unsafe {
509                        std::ptr::copy_nonoverlapping(
510                            src.add(seg_start),
511                            out_base.add(wp),
512                            copy_len,
513                        );
514                    }
515                    wp += copy_len;
516                    col = 0;
517                    i += 1;
518                    seg_start = i;
519                } else {
520                    let new_col = ((col >> 3) + 1) << 3;
521                    if new_col > width && col > 0 {
522                        let copy_len = i - seg_start;
523                        unsafe {
524                            std::ptr::copy_nonoverlapping(
525                                src.add(seg_start),
526                                out_base.add(wp),
527                                copy_len,
528                            );
529                            wp += copy_len;
530                            *out_base.add(wp) = b'\n';
531                            wp += 1;
532                        }
533                        seg_start = i;
534                        col = 0;
535                        continue;
536                    }
537                    col = new_col;
538                    i += 1;
539                }
540            }
541            None => {
542                let remaining = data.len() - i;
543                if col + remaining > width {
544                    loop {
545                        let rem_now = data.len() - i;
546                        let fit = width - col;
547                        if fit >= rem_now {
548                            break;
549                        }
550                        let copy_len = i + fit - seg_start;
551                        unsafe {
552                            std::ptr::copy_nonoverlapping(
553                                src.add(seg_start),
554                                out_base.add(wp),
555                                copy_len,
556                            );
557                            wp += copy_len;
558                            *out_base.add(wp) = b'\n';
559                            wp += 1;
560                        }
561                        i += fit;
562                        seg_start = i;
563                        col = 0;
564                    }
565                }
566                break;
567            }
568        }
569    }
570
571    if seg_start < data.len() {
572        let copy_len = data.len() - seg_start;
573        unsafe {
574            std::ptr::copy_nonoverlapping(src.add(seg_start), out_base.add(wp), copy_len);
575        }
576        wp += copy_len;
577    }
578
579    unsafe {
580        outbuf.set_len(wp);
581    }
582}
583
584/// Fold a byte segment (no newlines) with -s (break at spaces), buffered output.
585#[inline]
586fn fold_segment_bytes_spaces_buffered(segment: &[u8], width: usize, outbuf: &mut Vec<u8>) {
587    let mut start = 0;
588    while start + width < segment.len() {
589        let chunk = &segment[start..start + width];
590        match memchr::memrchr2(b' ', b'\t', chunk) {
591            Some(sp_offset) => {
592                let break_at = start + sp_offset + 1;
593                outbuf.extend_from_slice(&segment[start..break_at]);
594                outbuf.push(b'\n');
595                start = break_at;
596            }
597            None => {
598                outbuf.extend_from_slice(&segment[start..start + width]);
599                outbuf.push(b'\n');
600                start += width;
601            }
602        }
603    }
604    if start < segment.len() {
605        outbuf.extend_from_slice(&segment[start..]);
606    }
607}
608
609/// Streaming fold column mode with -s (break at spaces).
610/// Uses buffered output to minimize write syscalls.
611/// Fast path: if no tabs in data, column width == byte width, so we can
612/// use the simpler byte-mode space-breaking algorithm.
613fn fold_column_mode_spaces_streaming(
614    data: &[u8],
615    width: usize,
616    out: &mut impl Write,
617) -> std::io::Result<()> {
618    // If no tabs, column mode == byte mode (every byte has width 1)
619    // BS/CR/control chars could theoretically differ but are vanishingly rare
620    // in practice and the difference is negligible.
621    if memchr::memchr(b'\t', data).is_none() {
622        return fold_byte_fast_spaces(data, width, out);
623    }
624
625    let mut pos = 0;
626    let mut outbuf: Vec<u8> = Vec::with_capacity(1024 * 1024 + 4096);
627
628    for nl_pos in memchr::memchr_iter(b'\n', data) {
629        let line = &data[pos..nl_pos];
630        // Short-circuit: line fits in width AND has no tabs → no folding needed
631        if line.len() <= width && memchr::memchr(b'\t', line).is_none() {
632            outbuf.extend_from_slice(line);
633        } else {
634            fold_column_spaces_fast(line, width, &mut outbuf);
635        }
636        outbuf.push(b'\n');
637
638        if outbuf.len() >= 1024 * 1024 {
639            out.write_all(&outbuf)?;
640            outbuf.clear();
641        }
642
643        pos = nl_pos + 1;
644    }
645
646    // Handle final line without trailing newline
647    if pos < data.len() {
648        let line = &data[pos..];
649        if line.len() <= width && memchr::memchr(b'\t', line).is_none() {
650            outbuf.extend_from_slice(line);
651        } else {
652            fold_column_spaces_fast(line, width, &mut outbuf);
653        }
654    }
655
656    if !outbuf.is_empty() {
657        out.write_all(&outbuf)?;
658    }
659
660    Ok(())
661}
662
/// Fast column-mode fold for a single line with -s (break at spaces).
/// Uses memchr2 to find tabs and spaces in bulk, processing runs of regular
/// bytes without per-byte branching. Matches GNU fold's exact algorithm:
/// - `column > width` triggers break (strictly greater)
/// - Break at last blank: output INCLUDING the blank, remainder starts after it
/// - After break: recalculate column from remaining data, re-process current char
/// - All bytes width 1 except tab (next tab stop), BS (col-1), CR (col=0)
///
/// NOTE(review): despite the bullet above, the run-processing branch below
/// treats EVERY non-tab/non-space byte — including BS, CR and other control
/// bytes — as width 1; only `recalc_column` gives them special widths. Lines
/// containing BS/CR may therefore fold differently from GNU — confirm this
/// deviation is intentional (the caller's comments suggest it is accepted).
#[inline]
fn fold_column_spaces_fast(line: &[u8], width: usize, outbuf: &mut Vec<u8>) {
    let mut col: usize = 0;        // current column position
    let mut seg_start: usize = 0;  // start of the not-yet-emitted segment
    let mut last_space_after: usize = 0; // index just past the last blank seen
    let mut has_space = false;     // is there a blank in the current segment?
    let mut i: usize = 0;

    while i < line.len() {
        let b = line[i];
        if b == b'\t' {
            // Advance to the next multiple-of-8 tab stop.
            let new_col = ((col >> 3) + 1) << 3;
            if new_col > width && col > 0 {
                // Tab exceeds width — break
                if has_space {
                    outbuf.extend_from_slice(&line[seg_start..last_space_after]);
                    outbuf.push(b'\n');
                    seg_start = last_space_after;
                    // Replay the carried-over bytes to get an exact column.
                    col = recalc_column(&line[seg_start..i]);
                    has_space = false;
                    continue; // re-evaluate tab
                }
                outbuf.extend_from_slice(&line[seg_start..i]);
                outbuf.push(b'\n');
                seg_start = i;
                col = 0;
                continue; // re-evaluate tab with col=0
            }
            // Tab also counts as a breakable whitespace for -s (GNU compat)
            has_space = true;
            last_space_after = i + 1;
            col = new_col;
            i += 1;
        } else if b == b' ' {
            col += 1;
            if col > width {
                if has_space {
                    outbuf.extend_from_slice(&line[seg_start..last_space_after]);
                    outbuf.push(b'\n');
                    seg_start = last_space_after;
                    col = recalc_column(&line[seg_start..i]);
                    has_space = false;
                    continue; // re-evaluate this space
                }
                // No prior blank — break before this space (GNU: output buffer, rescan)
                outbuf.extend_from_slice(&line[seg_start..i]);
                outbuf.push(b'\n');
                seg_start = i;
                col = 1; // space starts the new line with width 1
                has_space = true;
                last_space_after = i + 1;
                i += 1;
                continue;
            }
            has_space = true;
            last_space_after = i + 1;
            i += 1;
        } else {
            // Find next tab or space using SIMD memchr2
            let run_end = match memchr::memchr2(b'\t', b' ', &line[i + 1..]) {
                Some(off) => i + 1 + off,
                None => line.len(),
            };

            // Process run of regular bytes: each has column width 1
            let run_remaining = run_end - i;
            if col + run_remaining <= width {
                // Entire run fits
                col += run_remaining;
                i = run_end;
            } else {
                // Run exceeds width — need to break
                let mut j = i;
                loop {
                    let rem = run_end - j;
                    if col + rem <= width {
                        col += rem;
                        i = run_end;
                        break;
                    }
                    if has_space {
                        // Break at last blank (includes the blank)
                        outbuf.extend_from_slice(&line[seg_start..last_space_after]);
                        outbuf.push(b'\n');
                        seg_start = last_space_after;
                        col = j - seg_start; // regular bytes only, each width 1
                        has_space = false;
                        continue; // re-check with new col
                    }
                    // No blank — hard break at width boundary
                    // NOTE(review): this assumes col <= width here; if
                    // `recalc_column` (tab replayed from column 0 can widen)
                    // ever yields col > width with no blank pending, this
                    // subtraction would underflow — verify unreachable.
                    let fit = width - col;
                    outbuf.extend_from_slice(&line[seg_start..j + fit]);
                    outbuf.push(b'\n');
                    j += fit;
                    seg_start = j;
                    col = 0;
                }
            }
        }
    }

    // Emit whatever remains of the line (no trailing newline here).
    if seg_start < line.len() {
        outbuf.extend_from_slice(&line[seg_start..]);
    }
}
775
/// Get the column width and byte length of a byte at `data[pos]`.
/// Returns (column_width, byte_length) — always (1, 1) for non-special bytes.
///
/// GNU fold's multibyte path is guarded by:
///   `#if HAVE_MBRTOC32 && (! defined __GLIBC__ || defined __UCLIBC__)`
/// On glibc (every mainstream Linux distro), that condition is false, so
/// fold counts bytes — one column per byte, same as -b mode. High bytes
/// (>= 0x80) therefore also count as one column each.
/// Tab, backspace, and CR are handled by the caller.
#[inline]
fn char_info(data: &[u8], pos: usize) -> (usize, usize) {
    match data[pos] {
        // ASCII control chars (and DEL) occupy zero columns.
        0x00..=0x1f | 0x7f => (0, 1),
        // Printable ASCII and high bytes: one column, one byte.
        _ => (1, 1),
    }
}
799
800/// Check if folding would produce identical output (all lines fit within width).
801/// Used by the binary for direct write-through optimization.
802pub fn fold_is_passthrough(data: &[u8], width: usize, count_bytes: bool) -> bool {
803    if width == 0 || data.is_empty() {
804        return data.is_empty();
805    }
806    // Column mode with tabs: can't easily determine passthrough
807    if !count_bytes && memchr::memchr(b'\t', data).is_some() {
808        return false;
809    }
810    let mut prev = 0;
811    for nl_pos in memchr::memchr_iter(b'\n', data) {
812        if nl_pos - prev > width {
813            return false;
814        }
815        prev = nl_pos + 1;
816    }
817    data.len() - prev <= width
818}
819
/// Recalculate column position by replaying a segment (handles tabs, CR, backspace).
/// Used when non-linear column operations (CR, backspace) invalidate the fast
/// `col - col_at_space` delta formula.
fn recalc_column(data: &[u8]) -> usize {
    let mut col = 0usize;
    for &b in data {
        match b {
            b'\r' => col = 0,                       // carriage return rewinds to col 0
            b'\t' => col = (col & !7) + 8,          // next multiple-of-8 tab stop
            0x08 => col = col.saturating_sub(1),    // backspace steps back one column
            0x00..=0x1f | 0x7f => {}                // other control chars: zero width
            _ => col += 1,                          // printable ASCII and high bytes
        }
    }
    col
}