coreutils_rs/expand/
core.rs

1use std::io::Write;
2
3/// Tab stop specification
4#[derive(Clone, Debug)]
5pub enum TabStops {
6    /// Regular interval (default 8)
7    Regular(usize),
8    /// Explicit list of tab stop positions (0-indexed columns)
9    List(Vec<usize>),
10}
11
12impl TabStops {
13    /// Calculate the number of spaces to the next tab stop from the given column.
14    #[inline]
15    fn spaces_to_next(&self, column: usize) -> usize {
16        match self {
17            TabStops::Regular(n) => {
18                if *n == 0 {
19                    return 0;
20                }
21                *n - (column % *n)
22            }
23            TabStops::List(stops) => {
24                // Find the first tab stop > current column
25                match stops.binary_search(&(column + 1)) {
26                    Ok(idx) => stops[idx] - column,
27                    Err(idx) => {
28                        if idx < stops.len() {
29                            stops[idx] - column
30                        } else {
31                            // Past all tab stops: GNU uses 1 space
32                            1
33                        }
34                    }
35                }
36            }
37        }
38    }
39
40    /// Get the next tab stop position after the given column.
41    #[inline]
42    fn next_tab_stop(&self, column: usize) -> usize {
43        column + self.spaces_to_next(column)
44    }
45}
46
47/// Parse a tab specification string (e.g., "4", "4,8,12", "4 8 12").
48pub fn parse_tab_stops(spec: &str) -> Result<TabStops, String> {
49    let spec = spec.trim();
50    if spec.is_empty() {
51        return Ok(TabStops::Regular(8));
52    }
53
54    // Check if it's a single number (regular interval)
55    if let Ok(n) = spec.parse::<usize>() {
56        if n == 0 {
57            return Err("tab size cannot be 0".to_string());
58        }
59        return Ok(TabStops::Regular(n));
60    }
61
62    // Parse as comma or space-separated list
63    let mut stops: Vec<usize> = Vec::new();
64    for part in spec.split([',', ' ']) {
65        let part = part.trim();
66        if part.is_empty() {
67            continue;
68        }
69        // Handle / prefix for repeating tab stops
70        if let Some(rest) = part.strip_prefix('/') {
71            let n: usize = rest
72                .parse()
73                .map_err(|_| format!("'{}' is not a valid number", part))?;
74            if n == 0 {
75                return Err("tab size cannot be 0".to_string());
76            }
77            let last = stops.last().copied().unwrap_or(0);
78            let mut pos = last + n;
79            while pos < 10000 {
80                stops.push(pos);
81                pos += n;
82            }
83            continue;
84        }
85        match part.parse::<usize>() {
86            Ok(n) => {
87                if !stops.is_empty() && n <= *stops.last().unwrap() {
88                    return Err("tab sizes must be ascending".to_string());
89                }
90                stops.push(n);
91            }
92            Err(_) => return Err(format!("'{}' is not a valid number", part)),
93        }
94    }
95
96    if stops.is_empty() {
97        return Err("tab specification is empty".to_string());
98    }
99
100    if stops.len() == 1 {
101        return Ok(TabStops::Regular(stops[0]));
102    }
103
104    Ok(TabStops::List(stops))
105}
106
107// Pre-computed spaces buffer for fast tab expansion (avoids per-tab allocation)
108// 4KB buffer covers even very large tab stops in a single memcpy
109const SPACES: [u8; 4096] = [b' '; 4096];
110
111/// Write N spaces to a Vec efficiently using pre-computed buffer.
112#[inline]
113fn push_spaces(output: &mut Vec<u8>, n: usize) {
114    let mut remaining = n;
115    while remaining > 0 {
116        let chunk = remaining.min(SPACES.len());
117        output.extend_from_slice(&SPACES[..chunk]);
118        remaining -= chunk;
119    }
120}
121
122/// Write N spaces to a Write stream using pre-computed buffer.
123#[inline]
124fn write_spaces(out: &mut impl Write, n: usize) -> std::io::Result<()> {
125    let mut remaining = n;
126    while remaining > 0 {
127        let chunk = remaining.min(SPACES.len());
128        out.write_all(&SPACES[..chunk])?;
129        remaining -= chunk;
130    }
131    Ok(())
132}
133
134/// Expand tabs to spaces using SIMD scanning.
135/// Dispatches to the optimal path based on tab configuration and data content.
136pub fn expand_bytes(
137    data: &[u8],
138    tabs: &TabStops,
139    initial_only: bool,
140    out: &mut impl Write,
141) -> std::io::Result<()> {
142    if data.is_empty() {
143        return Ok(());
144    }
145
146    // For regular tab stops, use fast SIMD paths.
147    // We combine the no-tabs and no-backspace checks to minimize full-data scans.
148    if let TabStops::Regular(tab_size) = tabs {
149        if initial_only {
150            // --initial mode: check for tabs first (cheap) to skip processing
151            if memchr::memchr(b'\t', data).is_none() {
152                return out.write_all(data);
153            }
154            return expand_initial_fast(data, *tab_size, out);
155        }
156
157        // Fast path: no tabs → write-through (avoids backspace scan too)
158        if memchr::memchr(b'\t', data).is_none() {
159            return out.write_all(data);
160        }
161
162        // Check for backspaces. If none, use the fast SIMD path.
163        if memchr::memchr(b'\x08', data).is_none() {
164            return expand_regular_fast(data, *tab_size, out);
165        }
166
167        // Has backspaces → fall through to generic
168        return expand_generic(data, tabs, initial_only, true, out);
169    }
170
171    // Tab list path: check for tabs first
172    if memchr::memchr(b'\t', data).is_none() {
173        return out.write_all(data);
174    }
175    let has_backspace = memchr::memchr(b'\x08', data).is_some();
176    expand_generic(data, tabs, initial_only, has_backspace, out)
177}
178
179/// Parallel threshold: files above this size use multi-threaded expand.
180/// Below this, single-threaded is faster (avoids thread pool overhead).
181const PARALLEL_THRESHOLD: usize = 4 * 1024 * 1024; // 4MB
182
183/// Fast expand for regular tab stops without -i flag.
184/// For large files (>4MB), uses rayon to process chunks in parallel.
185/// For smaller files, uses single-threaded SIMD processing.
186fn expand_regular_fast(data: &[u8], tab_size: usize, out: &mut impl Write) -> std::io::Result<()> {
187    debug_assert!(tab_size > 0, "tab_size must be > 0");
188
189    if data.len() >= PARALLEL_THRESHOLD {
190        expand_regular_parallel(data, tab_size, out)
191    } else {
192        expand_regular_single(data, tab_size, out)
193    }
194}
195
196/// Single-threaded expand: SIMD memchr2_iter scan of entire data.
197fn expand_regular_single(
198    data: &[u8],
199    tab_size: usize,
200    out: &mut impl Write,
201) -> std::io::Result<()> {
202    let is_pow2 = tab_size.is_power_of_two();
203    let mask = tab_size.wrapping_sub(1);
204
205    const INPUT_CHUNK: usize = 256 * 1024;
206    let worst_output = INPUT_CHUNK * tab_size + tab_size;
207    let mut output: Vec<u8> = Vec::with_capacity(worst_output);
208
209    let mut column: usize = 0;
210    let mut data_pos: usize = 0;
211
212    while data_pos < data.len() {
213        let chunk_end = if data_pos + INPUT_CHUNK >= data.len() {
214            data.len()
215        } else {
216            let search_end = data_pos + INPUT_CHUNK;
217            match memchr::memrchr(b'\n', &data[data_pos..search_end]) {
218                Some(off) => data_pos + off + 1,
219                None => search_end,
220            }
221        };
222
223        let chunk = &data[data_pos..chunk_end];
224        output.clear();
225
226        let out_ptr = output.as_mut_ptr();
227        let src = chunk.as_ptr();
228        let mut wp: usize = 0;
229        let mut pos: usize = 0;
230
231        for hit in memchr::memchr2_iter(b'\t', b'\n', chunk) {
232            let seg_len = hit - pos;
233            if seg_len > 0 {
234                unsafe {
235                    std::ptr::copy_nonoverlapping(src.add(pos), out_ptr.add(wp), seg_len);
236                }
237                wp += seg_len;
238                column += seg_len;
239            }
240
241            if unsafe { *src.add(hit) } == b'\n' {
242                unsafe {
243                    *out_ptr.add(wp) = b'\n';
244                }
245                wp += 1;
246                column = 0;
247            } else {
248                let rem = if is_pow2 {
249                    column & mask
250                } else {
251                    column % tab_size
252                };
253                let spaces = tab_size - rem;
254                unsafe {
255                    std::ptr::write_bytes(out_ptr.add(wp), b' ', spaces);
256                }
257                wp += spaces;
258                column += spaces;
259            }
260
261            pos = hit + 1;
262        }
263
264        if pos < chunk.len() {
265            let tail = chunk.len() - pos;
266            unsafe {
267                std::ptr::copy_nonoverlapping(src.add(pos), out_ptr.add(wp), tail);
268            }
269            wp += tail;
270            column += tail;
271        }
272
273        unsafe {
274            output.set_len(wp);
275        }
276        if wp > 0 {
277            out.write_all(&output)?;
278        }
279
280        data_pos = chunk_end;
281    }
282
283    Ok(())
284}
285
286/// Expand a chunk of data (must start/end on newline boundaries, except possibly
287/// the last chunk). Column starts at 0 for each chunk since chunks are line-aligned.
288/// Returns the expanded output as a Vec<u8>.
289/// Pre-allocates worst-case output to eliminate all capacity checks in the hot loop.
290fn expand_chunk(chunk: &[u8], tab_size: usize, is_pow2: bool, mask: usize) -> Vec<u8> {
291    // Worst case: every byte is a tab → tab_size× expansion.
292    // For most real data the expansion ratio is ~1.5x, so this over-allocates,
293    // but it eliminates all capacity checks from the hot loop.
294    let worst_case = chunk.len() * tab_size;
295    let mut output: Vec<u8> = Vec::with_capacity(worst_case);
296
297    let out_ptr = output.as_mut_ptr();
298    let src = chunk.as_ptr();
299    let mut wp: usize = 0;
300    let mut column: usize = 0;
301    let mut pos: usize = 0;
302
303    // Inner loop: zero capacity checks, zero Vec::len() reads, zero set_len() calls.
304    // Just raw pointer arithmetic.
305    for hit in memchr::memchr2_iter(b'\t', b'\n', chunk) {
306        let seg_len = hit - pos;
307        if seg_len > 0 {
308            unsafe {
309                std::ptr::copy_nonoverlapping(src.add(pos), out_ptr.add(wp), seg_len);
310            }
311            wp += seg_len;
312            column += seg_len;
313        }
314
315        if unsafe { *src.add(hit) } == b'\n' {
316            unsafe {
317                *out_ptr.add(wp) = b'\n';
318            }
319            wp += 1;
320            column = 0;
321        } else {
322            let rem = if is_pow2 {
323                column & mask
324            } else {
325                column % tab_size
326            };
327            let spaces = tab_size - rem;
328            unsafe {
329                std::ptr::write_bytes(out_ptr.add(wp), b' ', spaces);
330            }
331            wp += spaces;
332            column += spaces;
333        }
334
335        pos = hit + 1;
336    }
337
338    // Copy tail
339    if pos < chunk.len() {
340        let tail = chunk.len() - pos;
341        unsafe {
342            std::ptr::copy_nonoverlapping(src.add(pos), out_ptr.add(wp), tail);
343        }
344        wp += tail;
345    }
346
347    unsafe {
348        output.set_len(wp);
349    }
350    output
351}
352
353/// Parallel expand for large files (>4MB). Splits data into line-aligned chunks
354/// and processes them concurrently using rayon. Each chunk is expanded independently
355/// (column tracking resets at newline boundaries), then results are written in order.
356fn expand_regular_parallel(
357    data: &[u8],
358    tab_size: usize,
359    out: &mut impl Write,
360) -> std::io::Result<()> {
361    use rayon::prelude::*;
362
363    let is_pow2 = tab_size.is_power_of_two();
364    let mask = tab_size.wrapping_sub(1);
365
366    // Split data into line-aligned chunks for parallel processing.
367    // Use ~N chunks where N = number of CPUs for optimal load balance.
368    let num_chunks = rayon::current_num_threads().max(2);
369    let target_chunk_size = data.len() / num_chunks;
370    let mut chunks: Vec<&[u8]> = Vec::with_capacity(num_chunks + 1);
371    let mut pos: usize = 0;
372
373    for _ in 0..num_chunks - 1 {
374        if pos >= data.len() {
375            break;
376        }
377        let target_end = (pos + target_chunk_size).min(data.len());
378        // Find the next newline at or after target_end
379        let chunk_end = if target_end >= data.len() {
380            data.len()
381        } else {
382            match memchr::memchr(b'\n', &data[target_end..]) {
383                Some(off) => target_end + off + 1,
384                None => data.len(),
385            }
386        };
387        chunks.push(&data[pos..chunk_end]);
388        pos = chunk_end;
389    }
390    // Last chunk gets everything remaining
391    if pos < data.len() {
392        chunks.push(&data[pos..]);
393    }
394
395    // Process all chunks in parallel
396    let results: Vec<Vec<u8>> = chunks
397        .par_iter()
398        .map(|chunk| expand_chunk(chunk, tab_size, is_pow2, mask))
399        .collect();
400
401    // Write results in order
402    for result in &results {
403        if !result.is_empty() {
404            out.write_all(result)?;
405        }
406    }
407
408    Ok(())
409}
410
411/// Fast expand for --initial mode with regular tab stops.
412/// Only expands tabs in the leading whitespace of each line, bulk-copying the rest.
413/// Uses memchr (SIMD) to find line boundaries. Leading-whitespace expansion is scalar.
414/// Handles backspace per-line: lines containing \x08 fall back to generic expand.
415fn expand_initial_fast(data: &[u8], tab_size: usize, out: &mut impl Write) -> std::io::Result<()> {
416    debug_assert!(tab_size > 0, "tab_size must be > 0");
417    let tabs = TabStops::Regular(tab_size);
418    let mut pos: usize = 0;
419
420    while pos < data.len() {
421        // Find end of this line
422        let line_end = memchr::memchr(b'\n', &data[pos..])
423            .map(|off| pos + off + 1)
424            .unwrap_or(data.len());
425
426        let line = &data[pos..line_end];
427        debug_assert!(!line.is_empty());
428
429        // Fast skip: if line doesn't start with tab or space, write it whole
430        let first = line[0];
431        if first != b'\t' && first != b' ' {
432            out.write_all(line)?;
433            pos = line_end;
434            continue;
435        }
436
437        // If this line contains a backspace, fall back to generic for this line only
438        if memchr::memchr(b'\x08', line).is_some() {
439            expand_generic(line, &tabs, true, true, out)?;
440            pos = line_end;
441            continue;
442        }
443
444        // Expand only leading tabs/spaces in this line
445        let mut column: usize = 0;
446        let mut i = 0; // offset within line
447        while i < line.len() {
448            let byte = line[i];
449            if byte == b'\t' {
450                let spaces = tab_size - (column % tab_size);
451                write_spaces(out, spaces)?;
452                column += spaces;
453                i += 1;
454            } else if byte == b' ' {
455                // Batch consecutive spaces from source data
456                let space_start = i;
457                while i < line.len() && line[i] == b' ' {
458                    i += 1;
459                }
460                out.write_all(&line[space_start..i])?;
461                column += i - space_start;
462            } else {
463                // First non-blank: write the rest of the line unchanged
464                break;
465            }
466        }
467
468        // Write remainder of line as-is (zero-copy)
469        if i < line.len() {
470            out.write_all(&line[i..])?;
471        }
472
473        pos = line_end;
474    }
475
476    Ok(())
477}
478
479/// Generic expand with support for -i flag and tab lists.
480/// Uses memchr2 SIMD scanning when no backspaces are present.
481/// `has_backspace` hint avoids a redundant O(n) scan when the caller already knows.
482fn expand_generic(
483    data: &[u8],
484    tabs: &TabStops,
485    initial_only: bool,
486    has_backspace: bool,
487    out: &mut impl Write,
488) -> std::io::Result<()> {
489    const FLUSH_THRESHOLD: usize = 256 * 1024;
490    let cap = data.len().min(FLUSH_THRESHOLD) + data.len().min(FLUSH_THRESHOLD) / 8;
491    let mut output = Vec::with_capacity(cap);
492
493    // If no backspaces present and not initial-only, use SIMD bulk scanning
494    if !initial_only && !has_backspace {
495        let mut column: usize = 0;
496        let mut pos: usize = 0;
497
498        while pos < data.len() {
499            match memchr::memchr2(b'\t', b'\n', &data[pos..]) {
500                Some(offset) => {
501                    if offset > 0 {
502                        output.extend_from_slice(&data[pos..pos + offset]);
503                        column += offset;
504                    }
505                    let byte = data[pos + offset];
506                    pos += offset + 1;
507
508                    if byte == b'\n' {
509                        output.push(b'\n');
510                        column = 0;
511                    } else {
512                        let spaces = tabs.spaces_to_next(column);
513                        push_spaces(&mut output, spaces);
514                        column += spaces;
515                    }
516                    if output.len() >= FLUSH_THRESHOLD {
517                        out.write_all(&output)?;
518                        output.clear();
519                    }
520                }
521                None => {
522                    output.extend_from_slice(&data[pos..]);
523                    break;
524                }
525            }
526        }
527    } else {
528        // Byte-by-byte fallback for backspace handling or initial-only mode
529        let mut column: usize = 0;
530        let mut in_initial = true;
531
532        for &byte in data {
533            match byte {
534                b'\t' => {
535                    if initial_only && !in_initial {
536                        output.push(b'\t');
537                        column = tabs.next_tab_stop(column);
538                    } else {
539                        let spaces = tabs.spaces_to_next(column);
540                        push_spaces(&mut output, spaces);
541                        column += spaces;
542                    }
543                }
544                b'\n' => {
545                    output.push(b'\n');
546                    column = 0;
547                    in_initial = true;
548                    if output.len() >= FLUSH_THRESHOLD {
549                        out.write_all(&output)?;
550                        output.clear();
551                    }
552                }
553                b'\x08' => {
554                    output.push(b'\x08');
555                    if column > 0 {
556                        column -= 1;
557                    }
558                }
559                _ => {
560                    if initial_only && in_initial && byte != b' ' {
561                        in_initial = false;
562                    }
563                    output.push(byte);
564                    column += 1;
565                }
566            }
567        }
568    }
569
570    if !output.is_empty() {
571        out.write_all(&output)?;
572    }
573    Ok(())
574}
575
576/// Unexpand spaces to tabs.
577/// If `all` is true, convert all sequences of spaces; otherwise only leading spaces.
578pub fn unexpand_bytes(
579    data: &[u8],
580    tabs: &TabStops,
581    all: bool,
582    out: &mut impl Write,
583) -> std::io::Result<()> {
584    if data.is_empty() {
585        return Ok(());
586    }
587
588    // Fast path: no spaces or tabs → just copy through
589    if memchr::memchr2(b' ', b'\t', data).is_none() {
590        return out.write_all(data);
591    }
592
593    // For regular tab stops, use the optimized SIMD-scanning path
594    if let TabStops::Regular(tab_size) = tabs {
595        if memchr::memchr(b'\x08', data).is_none() {
596            return unexpand_regular_fast(data, *tab_size, all, out);
597        }
598    }
599
600    // Generic path for tab lists or data with backspaces
601    unexpand_generic(data, tabs, all, out)
602}
603
604/// Emit a run of blanks as the optimal combination of tabs and spaces.
605/// Matches GNU unexpand behavior: a single blank at a tab stop is only converted
606/// to a tab if more blanks follow, otherwise it stays as a space.
607#[inline]
608fn emit_blanks(
609    out: &mut impl Write,
610    start_col: usize,
611    count: usize,
612    tab_size: usize,
613) -> std::io::Result<()> {
614    if count == 0 {
615        return Ok(());
616    }
617    let end_col = start_col + count;
618    let mut col = start_col;
619
620    // Emit tabs for each tab stop we can reach
621    loop {
622        let next_tab = col + (tab_size - col % tab_size);
623        if next_tab > end_col {
624            break;
625        }
626        let blanks_consumed = next_tab - col;
627        if blanks_consumed >= 2 || next_tab < end_col {
628            // 2+ blanks to tab stop, OR 1 blank but more follow → emit tab
629            out.write_all(b"\t")?;
630            col = next_tab;
631        } else {
632            // 1 blank at tab stop with nothing after → keep as space
633            break;
634        }
635    }
636
637    // Emit remaining spaces
638    let remaining = end_col - col;
639    if remaining > 0 {
640        let mut r = remaining;
641        while r > 0 {
642            let chunk = r.min(SPACES.len());
643            out.write_all(&SPACES[..chunk])?;
644            r -= chunk;
645        }
646    }
647    Ok(())
648}
649
650/// Fast unexpand for regular tab stops without backspaces.
651/// Uses memchr SIMD scanning to skip non-special bytes in bulk.
652fn unexpand_regular_fast(
653    data: &[u8],
654    tab_size: usize,
655    all: bool,
656    out: &mut impl Write,
657) -> std::io::Result<()> {
658    if all {
659        return unexpand_regular_fast_all(data, tab_size, out);
660    }
661
662    let mut column: usize = 0;
663    let mut pos: usize = 0;
664    let mut in_initial = true;
665
666    while pos < data.len() {
667        if in_initial {
668            // Check for blanks to convert
669            if data[pos] == b' ' || data[pos] == b'\t' {
670                // Count consecutive blanks, tracking column advancement
671                let blank_start_col = column;
672                while pos < data.len() && (data[pos] == b' ' || data[pos] == b'\t') {
673                    if data[pos] == b'\t' {
674                        column += tab_size - column % tab_size;
675                    } else {
676                        column += 1;
677                    }
678                    pos += 1;
679                }
680                // Emit blanks as optimal tabs+spaces
681                emit_blanks(out, blank_start_col, column - blank_start_col, tab_size)?;
682                continue;
683            }
684            if data[pos] == b'\n' {
685                out.write_all(b"\n")?;
686                column = 0;
687                in_initial = true;
688                pos += 1;
689                continue;
690            }
691            // Non-blank: fall through to body mode below.
692        }
693
694        // Body of line: bulk copy until newline (default mode)
695        match memchr::memchr(b'\n', &data[pos..]) {
696            Some(offset) => {
697                out.write_all(&data[pos..pos + offset + 1])?;
698                column = 0;
699                in_initial = true;
700                pos += offset + 1;
701            }
702            None => {
703                out.write_all(&data[pos..])?;
704                return Ok(());
705            }
706        }
707    }
708
709    Ok(())
710}
711
712/// Fast unexpand -a for regular tab stops without backspaces.
713/// Buffers output into a Vec to avoid per-call write_all overhead.
714/// Handles single spaces efficiently (most common case: no tab conversion needed).
715fn unexpand_regular_fast_all(
716    data: &[u8],
717    tab_size: usize,
718    out: &mut impl Write,
719) -> std::io::Result<()> {
720    // Line-level fast path: process the file line by line.
721    // Lines with only single spaces (no tabs, no double spaces) don't need
722    // any conversion and can be copied as-is — this covers most natural text.
723    let mut output: Vec<u8> = Vec::with_capacity(data.len());
724    let mut pos: usize = 0;
725
726    for nl_pos in memchr::memchr_iter(b'\n', data) {
727        let line = &data[pos..nl_pos];
728
729        // Fast check: skip lines with no tabs and no consecutive spaces.
730        // Two separate SIMD scans are needed: memchr2 can't work because
731        // a space appearing before a tab would cause us to miss the tab.
732        let needs_processing =
733            memchr::memchr(b'\t', line).is_some() || memchr::memmem::find(line, b"  ").is_some();
734
735        if !needs_processing {
736            // No conversion needed: copy line as-is
737            output.extend_from_slice(line);
738        } else {
739            // Process this line through the blank-conversion logic
740            unexpand_line_all(line, tab_size, &mut output);
741        }
742        output.push(b'\n');
743
744        // Flush periodically
745        if output.len() >= 256 * 1024 {
746            out.write_all(&output)?;
747            output.clear();
748        }
749        pos = nl_pos + 1;
750    }
751
752    // Handle final line without trailing newline
753    if pos < data.len() {
754        let line = &data[pos..];
755        let needs_processing =
756            memchr::memchr(b'\t', line).is_some() || memchr::memmem::find(line, b"  ").is_some();
757        if !needs_processing {
758            output.extend_from_slice(line);
759        } else {
760            unexpand_line_all(line, tab_size, &mut output);
761        }
762    }
763
764    if !output.is_empty() {
765        out.write_all(&output)?;
766    }
767    Ok(())
768}
769
770/// Process a single line for unexpand -a, converting blank runs to tabs+spaces.
771#[inline]
772fn unexpand_line_all(line: &[u8], tab_size: usize, output: &mut Vec<u8>) {
773    let mut column: usize = 0;
774    let mut pos: usize = 0;
775
776    while pos < line.len() {
777        // Check for blanks to convert
778        if line[pos] == b' ' || line[pos] == b'\t' {
779            // Count consecutive blanks, tracking column advancement
780            let blank_start_col = column;
781            while pos < line.len() && (line[pos] == b' ' || line[pos] == b'\t') {
782                if line[pos] == b'\t' {
783                    column += tab_size - column % tab_size;
784                } else {
785                    column += 1;
786                }
787                pos += 1;
788            }
789            // Vec<u8> implements Write, so we can use emit_blanks directly.
790            emit_blanks(output, blank_start_col, column - blank_start_col, tab_size).unwrap();
791            continue;
792        }
793
794        // Non-blank: copy until next space or tab
795        match memchr::memchr2(b' ', b'\t', &line[pos..]) {
796            Some(offset) => {
797                output.extend_from_slice(&line[pos..pos + offset]);
798                column += offset;
799                pos += offset;
800            }
801            None => {
802                output.extend_from_slice(&line[pos..]);
803                break;
804            }
805        }
806    }
807}
808
809/// Generic unexpand with support for tab lists and backspaces.
810fn unexpand_generic(
811    data: &[u8],
812    tabs: &TabStops,
813    all: bool,
814    out: &mut impl Write,
815) -> std::io::Result<()> {
816    let tab_size = match tabs {
817        TabStops::Regular(n) => *n,
818        TabStops::List(_) => 0, // handled by is_tab_stop/next_tab_stop
819    };
820    let mut column: usize = 0;
821    let mut space_start_col: Option<usize> = None;
822    let mut in_initial = true;
823
824    for &byte in data {
825        match byte {
826            b' ' => {
827                if !all && !in_initial {
828                    out.write_all(b" ")?;
829                    column += 1;
830                } else {
831                    if space_start_col.is_none() {
832                        space_start_col = Some(column);
833                    }
834                    column += 1;
835                    // Don't convert to tab here — wait for end of blank run
836                }
837            }
838            b'\t' => {
839                if !all && !in_initial {
840                    // In non-converting mode, just emit the tab
841                    if let Some(start_col) = space_start_col.take() {
842                        let n = column - start_col;
843                        out.write_all(&SPACES[..n.min(SPACES.len())])?;
844                    }
845                    out.write_all(b"\t")?;
846                    column = tabs.next_tab_stop(column);
847                } else {
848                    if space_start_col.is_none() {
849                        space_start_col = Some(column);
850                    }
851                    column = tabs.next_tab_stop(column);
852                }
853            }
854            _ => {
855                // Flush pending blanks
856                if let Some(start_col) = space_start_col.take() {
857                    let count = column - start_col;
858                    if tab_size > 0 {
859                        emit_blanks(out, start_col, count, tab_size)?;
860                    } else {
861                        // Tab list: use is_tab_stop for conversion
862                        emit_blanks_tablist(out, start_col, count, tabs)?;
863                    }
864                }
865
866                if byte == b'\n' {
867                    out.write_all(b"\n")?;
868                    column = 0;
869                    in_initial = true;
870                } else if byte == b'\x08' {
871                    out.write_all(b"\x08")?;
872                    if column > 0 {
873                        column -= 1;
874                    }
875                } else {
876                    if in_initial {
877                        in_initial = false;
878                    }
879                    out.write_all(&[byte])?;
880                    column += 1;
881                }
882            }
883        }
884    }
885
886    if let Some(start_col) = space_start_col {
887        let count = column - start_col;
888        if tab_size > 0 {
889            emit_blanks(out, start_col, count, tab_size)?;
890        } else {
891            emit_blanks_tablist(out, start_col, count, tabs)?;
892        }
893    }
894
895    Ok(())
896}
897
898/// Emit blanks using a tab list (non-regular tab stops).
899/// After the last defined tab stop, only spaces are emitted (no more tabs).
900fn emit_blanks_tablist(
901    out: &mut impl Write,
902    start_col: usize,
903    count: usize,
904    tabs: &TabStops,
905) -> std::io::Result<()> {
906    if count == 0 {
907        return Ok(());
908    }
909    let end_col = start_col + count;
910    let mut col = start_col;
911
912    // Get the last defined tab stop to know when to stop converting to tabs
913    let last_stop = match tabs {
914        TabStops::List(stops) => stops.last().copied().unwrap_or(0),
915        TabStops::Regular(_) => usize::MAX,
916    };
917
918    while col < last_stop {
919        let next_tab = tabs.next_tab_stop(col);
920        if next_tab > end_col || next_tab > last_stop {
921            break;
922        }
923        let blanks_consumed = next_tab - col;
924        if blanks_consumed >= 2 || next_tab < end_col {
925            out.write_all(b"\t")?;
926            col = next_tab;
927        } else {
928            break;
929        }
930    }
931
932    let remaining = end_col - col;
933    if remaining > 0 {
934        let mut r = remaining;
935        while r > 0 {
936            let chunk = r.min(SPACES.len());
937            out.write_all(&SPACES[..chunk])?;
938            r -= chunk;
939        }
940    }
941    Ok(())
942}
coreutils_rs/expand/core.rs

coreutils_rs/expand/
core.rs