coreutils_rs/expand/
core.rs

1use std::io::Write;
2
3/// Tab stop specification
4#[derive(Clone, Debug)]
5pub enum TabStops {
6    /// Regular interval (default 8)
7    Regular(usize),
8    /// Explicit list of tab stop positions (0-indexed columns)
9    List(Vec<usize>),
10}
11
12impl TabStops {
13    /// Calculate the number of spaces to the next tab stop from the given column.
14    #[inline]
15    fn spaces_to_next(&self, column: usize) -> usize {
16        match self {
17            TabStops::Regular(n) => {
18                if *n == 0 {
19                    return 0;
20                }
21                *n - (column % *n)
22            }
23            TabStops::List(stops) => {
24                // Find the first tab stop > current column
25                match stops.binary_search(&(column + 1)) {
26                    Ok(idx) => stops[idx] - column,
27                    Err(idx) => {
28                        if idx < stops.len() {
29                            stops[idx] - column
30                        } else {
31                            // Past all tab stops: GNU uses 1 space
32                            1
33                        }
34                    }
35                }
36            }
37        }
38    }
39
40    /// Get the next tab stop position after the given column.
41    #[inline]
42    fn next_tab_stop(&self, column: usize) -> usize {
43        column + self.spaces_to_next(column)
44    }
45}
46
47/// Parse a tab specification string (e.g., "4", "4,8,12", "4 8 12").
48pub fn parse_tab_stops(spec: &str) -> Result<TabStops, String> {
49    let spec = spec.trim();
50    if spec.is_empty() {
51        return Ok(TabStops::Regular(8));
52    }
53
54    // Check if it's a single number (regular interval)
55    if let Ok(n) = spec.parse::<usize>() {
56        if n == 0 {
57            return Err("tab size cannot be 0".to_string());
58        }
59        return Ok(TabStops::Regular(n));
60    }
61
62    // Parse as comma or space-separated list
63    let mut stops: Vec<usize> = Vec::new();
64    for part in spec.split([',', ' ']) {
65        let part = part.trim();
66        if part.is_empty() {
67            continue;
68        }
69        // Handle / prefix for repeating tab stops
70        if let Some(rest) = part.strip_prefix('/') {
71            let n: usize = rest
72                .parse()
73                .map_err(|_| format!("'{}' is not a valid number", part))?;
74            if n == 0 {
75                return Err("tab size cannot be 0".to_string());
76            }
77            let last = stops.last().copied().unwrap_or(0);
78            let mut pos = last + n;
79            while pos < 10000 {
80                stops.push(pos);
81                pos += n;
82            }
83            continue;
84        }
85        match part.parse::<usize>() {
86            Ok(n) => {
87                if !stops.is_empty() && n <= *stops.last().unwrap() {
88                    return Err("tab sizes must be ascending".to_string());
89                }
90                stops.push(n);
91            }
92            Err(_) => return Err(format!("'{}' is not a valid number", part)),
93        }
94    }
95
96    if stops.is_empty() {
97        return Err("tab specification is empty".to_string());
98    }
99
100    if stops.len() == 1 {
101        return Ok(TabStops::Regular(stops[0]));
102    }
103
104    Ok(TabStops::List(stops))
105}
106
107// Pre-computed spaces buffer for fast tab expansion (avoids per-tab allocation)
108// 4KB buffer covers even very large tab stops in a single memcpy
109const SPACES: [u8; 4096] = [b' '; 4096];
110
111/// Write N spaces to a Vec efficiently using pre-computed buffer.
112#[inline]
113fn push_spaces(output: &mut Vec<u8>, n: usize) {
114    let mut remaining = n;
115    while remaining > 0 {
116        let chunk = remaining.min(SPACES.len());
117        output.extend_from_slice(&SPACES[..chunk]);
118        remaining -= chunk;
119    }
120}
121
122/// Write N spaces to a Write stream using pre-computed buffer.
123#[inline]
124fn write_spaces(out: &mut impl Write, n: usize) -> std::io::Result<()> {
125    let mut remaining = n;
126    while remaining > 0 {
127        let chunk = remaining.min(SPACES.len());
128        out.write_all(&SPACES[..chunk])?;
129        remaining -= chunk;
130    }
131    Ok(())
132}
133
134/// Expand tabs to spaces using SIMD scanning.
135/// Uses memchr2 to find tabs and newlines, bulk-copying everything between them.
136pub fn expand_bytes(
137    data: &[u8],
138    tabs: &TabStops,
139    initial_only: bool,
140    out: &mut impl Write,
141) -> std::io::Result<()> {
142    if data.is_empty() {
143        return Ok(());
144    }
145
146    // Fast path: no tabs in data → just copy through
147    if memchr::memchr(b'\t', data).is_none() {
148        return out.write_all(data);
149    }
150
151    // For regular tab stops, use fast SIMD paths
152    if let TabStops::Regular(tab_size) = tabs {
153        if initial_only {
154            // --initial mode processes line-by-line anyway, so handle backspace
155            // per-line instead of scanning the whole buffer.
156            return expand_initial_fast(data, *tab_size, out);
157        } else if memchr::memchr(b'\x08', data).is_none() {
158            return expand_regular_fast(data, *tab_size, out);
159        }
160    }
161
162    // Generic path for backspace handling or tab lists.
163    // For Regular tabs, we only reach here when the memchr check at line 157 found
164    // backspaces, so has_backspace=true avoids a redundant O(n) scan.
165    // For List tabs, we haven't scanned yet, so check now.
166    let has_backspace = match tabs {
167        TabStops::Regular(_) => true,
168        TabStops::List(_) => memchr::memchr(b'\x08', data).is_some(),
169    };
170    expand_generic(data, tabs, initial_only, has_backspace, out)
171}
172
173/// Fast expand for regular tab stops without -i flag.
174/// Accumulates output into a buffer and flushes periodically (every 256KB) to bound memory.
175/// Uses memchr2 SIMD scanning to skip non-tab/non-newline runs in bulk.
176fn expand_regular_fast(data: &[u8], tab_size: usize, out: &mut impl Write) -> std::io::Result<()> {
177    debug_assert!(tab_size > 0, "tab_size must be > 0");
178    const FLUSH_THRESHOLD: usize = 256 * 1024;
179    let cap = data.len().min(FLUSH_THRESHOLD) + data.len().min(FLUSH_THRESHOLD) / 8;
180    let mut output = Vec::with_capacity(cap);
181    let mut column: usize = 0;
182    let mut pos: usize = 0;
183
184    // For power-of-2 tab sizes, use bitwise AND instead of modulo (avoids DIV instruction)
185    let is_pow2 = tab_size.is_power_of_two();
186    let mask = tab_size - 1; // only valid when is_pow2
187
188    while pos < data.len() {
189        match memchr::memchr2(b'\t', b'\n', &data[pos..]) {
190            Some(offset) => {
191                // Copy non-special bytes in bulk
192                if offset > 0 {
193                    output.extend_from_slice(&data[pos..pos + offset]);
194                    column += offset;
195                }
196                let byte = data[pos + offset];
197                pos += offset + 1;
198
199                if byte == b'\n' {
200                    output.push(b'\n');
201                    column = 0;
202                } else {
203                    // Tab: append spaces to buffer
204                    let rem = if is_pow2 {
205                        column & mask
206                    } else {
207                        column % tab_size
208                    };
209                    let spaces = tab_size - rem;
210                    push_spaces(&mut output, spaces);
211                    column += spaces;
212                }
213
214                // Flush periodically to bound memory usage
215                if output.len() >= FLUSH_THRESHOLD {
216                    out.write_all(&output)?;
217                    output.clear();
218                }
219            }
220            None => {
221                output.extend_from_slice(&data[pos..]);
222                break;
223            }
224        }
225    }
226
227    if !output.is_empty() {
228        out.write_all(&output)?;
229    }
230    Ok(())
231}
232
233/// Fast expand for --initial mode with regular tab stops.
234/// Only expands tabs in the leading whitespace of each line, bulk-copying the rest.
235/// Uses memchr (SIMD) to find line boundaries. Leading-whitespace expansion is scalar.
236/// Handles backspace per-line: lines containing \x08 fall back to generic expand.
237fn expand_initial_fast(data: &[u8], tab_size: usize, out: &mut impl Write) -> std::io::Result<()> {
238    debug_assert!(tab_size > 0, "tab_size must be > 0");
239    let tabs = TabStops::Regular(tab_size);
240    let mut pos: usize = 0;
241
242    while pos < data.len() {
243        // Find end of this line
244        let line_end = memchr::memchr(b'\n', &data[pos..])
245            .map(|off| pos + off + 1)
246            .unwrap_or(data.len());
247
248        let line = &data[pos..line_end];
249        debug_assert!(!line.is_empty());
250
251        // Fast skip: if line doesn't start with tab or space, write it whole
252        let first = line[0];
253        if first != b'\t' && first != b' ' {
254            out.write_all(line)?;
255            pos = line_end;
256            continue;
257        }
258
259        // If this line contains a backspace, fall back to generic for this line only
260        if memchr::memchr(b'\x08', line).is_some() {
261            expand_generic(line, &tabs, true, true, out)?;
262            pos = line_end;
263            continue;
264        }
265
266        // Expand only leading tabs/spaces in this line
267        let mut column: usize = 0;
268        let mut i = 0; // offset within line
269        while i < line.len() {
270            let byte = line[i];
271            if byte == b'\t' {
272                let spaces = tab_size - (column % tab_size);
273                write_spaces(out, spaces)?;
274                column += spaces;
275                i += 1;
276            } else if byte == b' ' {
277                // Batch consecutive spaces from source data
278                let space_start = i;
279                while i < line.len() && line[i] == b' ' {
280                    i += 1;
281                }
282                out.write_all(&line[space_start..i])?;
283                column += i - space_start;
284            } else {
285                // First non-blank: write the rest of the line unchanged
286                break;
287            }
288        }
289
290        // Write remainder of line as-is (zero-copy)
291        if i < line.len() {
292            out.write_all(&line[i..])?;
293        }
294
295        pos = line_end;
296    }
297
298    Ok(())
299}
300
301/// Generic expand with support for -i flag and tab lists.
302/// Uses memchr2 SIMD scanning when no backspaces are present.
303/// `has_backspace` hint avoids a redundant O(n) scan when the caller already knows.
304fn expand_generic(
305    data: &[u8],
306    tabs: &TabStops,
307    initial_only: bool,
308    has_backspace: bool,
309    out: &mut impl Write,
310) -> std::io::Result<()> {
311    const FLUSH_THRESHOLD: usize = 256 * 1024;
312    let cap = data.len().min(FLUSH_THRESHOLD) + data.len().min(FLUSH_THRESHOLD) / 8;
313    let mut output = Vec::with_capacity(cap);
314
315    // If no backspaces present and not initial-only, use SIMD bulk scanning
316    if !initial_only && !has_backspace {
317        let mut column: usize = 0;
318        let mut pos: usize = 0;
319
320        while pos < data.len() {
321            match memchr::memchr2(b'\t', b'\n', &data[pos..]) {
322                Some(offset) => {
323                    if offset > 0 {
324                        output.extend_from_slice(&data[pos..pos + offset]);
325                        column += offset;
326                    }
327                    let byte = data[pos + offset];
328                    pos += offset + 1;
329
330                    if byte == b'\n' {
331                        output.push(b'\n');
332                        column = 0;
333                    } else {
334                        let spaces = tabs.spaces_to_next(column);
335                        push_spaces(&mut output, spaces);
336                        column += spaces;
337                    }
338                    if output.len() >= FLUSH_THRESHOLD {
339                        out.write_all(&output)?;
340                        output.clear();
341                    }
342                }
343                None => {
344                    output.extend_from_slice(&data[pos..]);
345                    break;
346                }
347            }
348        }
349    } else {
350        // Byte-by-byte fallback for backspace handling or initial-only mode
351        let mut column: usize = 0;
352        let mut in_initial = true;
353
354        for &byte in data {
355            match byte {
356                b'\t' => {
357                    if initial_only && !in_initial {
358                        output.push(b'\t');
359                        column = tabs.next_tab_stop(column);
360                    } else {
361                        let spaces = tabs.spaces_to_next(column);
362                        push_spaces(&mut output, spaces);
363                        column += spaces;
364                    }
365                }
366                b'\n' => {
367                    output.push(b'\n');
368                    column = 0;
369                    in_initial = true;
370                    if output.len() >= FLUSH_THRESHOLD {
371                        out.write_all(&output)?;
372                        output.clear();
373                    }
374                }
375                b'\x08' => {
376                    output.push(b'\x08');
377                    if column > 0 {
378                        column -= 1;
379                    }
380                }
381                _ => {
382                    if initial_only && in_initial && byte != b' ' {
383                        in_initial = false;
384                    }
385                    output.push(byte);
386                    column += 1;
387                }
388            }
389        }
390    }
391
392    if !output.is_empty() {
393        out.write_all(&output)?;
394    }
395    Ok(())
396}
397
398/// Unexpand spaces to tabs.
399/// If `all` is true, convert all sequences of spaces; otherwise only leading spaces.
400pub fn unexpand_bytes(
401    data: &[u8],
402    tabs: &TabStops,
403    all: bool,
404    out: &mut impl Write,
405) -> std::io::Result<()> {
406    if data.is_empty() {
407        return Ok(());
408    }
409
410    // Fast path: no spaces or tabs → just copy through
411    if memchr::memchr2(b' ', b'\t', data).is_none() {
412        return out.write_all(data);
413    }
414
415    // For regular tab stops, use the optimized SIMD-scanning path
416    if let TabStops::Regular(tab_size) = tabs {
417        if memchr::memchr(b'\x08', data).is_none() {
418            return unexpand_regular_fast(data, *tab_size, all, out);
419        }
420    }
421
422    // Generic path for tab lists or data with backspaces
423    unexpand_generic(data, tabs, all, out)
424}
425
426/// Emit a run of blanks as the optimal combination of tabs and spaces.
427/// Matches GNU unexpand behavior: a single blank at a tab stop is only converted
428/// to a tab if more blanks follow, otherwise it stays as a space.
429#[inline]
430fn emit_blanks(
431    out: &mut impl Write,
432    start_col: usize,
433    count: usize,
434    tab_size: usize,
435) -> std::io::Result<()> {
436    if count == 0 {
437        return Ok(());
438    }
439    let end_col = start_col + count;
440    let mut col = start_col;
441
442    // Emit tabs for each tab stop we can reach
443    loop {
444        let next_tab = col + (tab_size - col % tab_size);
445        if next_tab > end_col {
446            break;
447        }
448        let blanks_consumed = next_tab - col;
449        if blanks_consumed >= 2 || next_tab < end_col {
450            // 2+ blanks to tab stop, OR 1 blank but more follow → emit tab
451            out.write_all(b"\t")?;
452            col = next_tab;
453        } else {
454            // 1 blank at tab stop with nothing after → keep as space
455            break;
456        }
457    }
458
459    // Emit remaining spaces
460    let remaining = end_col - col;
461    if remaining > 0 {
462        let mut r = remaining;
463        while r > 0 {
464            let chunk = r.min(SPACES.len());
465            out.write_all(&SPACES[..chunk])?;
466            r -= chunk;
467        }
468    }
469    Ok(())
470}
471
472/// Fast unexpand for regular tab stops without backspaces.
473/// Uses memchr SIMD scanning to skip non-special bytes in bulk.
474fn unexpand_regular_fast(
475    data: &[u8],
476    tab_size: usize,
477    all: bool,
478    out: &mut impl Write,
479) -> std::io::Result<()> {
480    let mut column: usize = 0;
481    let mut pos: usize = 0;
482    let mut in_initial = true;
483
484    while pos < data.len() {
485        if in_initial || all {
486            // Check for blanks to convert
487            if data[pos] == b' ' || data[pos] == b'\t' {
488                // Count consecutive blanks, tracking column advancement
489                let blank_start_col = column;
490                while pos < data.len() && (data[pos] == b' ' || data[pos] == b'\t') {
491                    if data[pos] == b'\t' {
492                        column += tab_size - column % tab_size;
493                    } else {
494                        column += 1;
495                    }
496                    pos += 1;
497                }
498                // Emit blanks as optimal tabs+spaces
499                emit_blanks(out, blank_start_col, column - blank_start_col, tab_size)?;
500                continue;
501            }
502            if data[pos] == b'\n' {
503                out.write_all(b"\n")?;
504                column = 0;
505                in_initial = true;
506                pos += 1;
507                continue;
508            }
509            // Non-blank: switch to body mode
510            in_initial = false;
511        }
512
513        // Body of line: bulk copy until next interesting byte
514        if !all {
515            // Default mode: copy everything until newline
516            match memchr::memchr(b'\n', &data[pos..]) {
517                Some(offset) => {
518                    out.write_all(&data[pos..pos + offset + 1])?;
519                    column = 0;
520                    in_initial = true;
521                    pos += offset + 1;
522                }
523                None => {
524                    out.write_all(&data[pos..])?;
525                    return Ok(());
526                }
527            }
528        } else {
529            // all=true: copy until next space, tab, or newline
530            match memchr::memchr3(b' ', b'\t', b'\n', &data[pos..]) {
531                Some(offset) => {
532                    if offset > 0 {
533                        out.write_all(&data[pos..pos + offset])?;
534                        column += offset;
535                    }
536                    pos += offset;
537                }
538                None => {
539                    out.write_all(&data[pos..])?;
540                    return Ok(());
541                }
542            }
543        }
544    }
545
546    Ok(())
547}
548
549/// Generic unexpand with support for tab lists and backspaces.
550fn unexpand_generic(
551    data: &[u8],
552    tabs: &TabStops,
553    all: bool,
554    out: &mut impl Write,
555) -> std::io::Result<()> {
556    let tab_size = match tabs {
557        TabStops::Regular(n) => *n,
558        TabStops::List(_) => 0, // handled by is_tab_stop/next_tab_stop
559    };
560    let mut column: usize = 0;
561    let mut space_start_col: Option<usize> = None;
562    let mut in_initial = true;
563
564    for &byte in data {
565        match byte {
566            b' ' => {
567                if !all && !in_initial {
568                    out.write_all(b" ")?;
569                    column += 1;
570                } else {
571                    if space_start_col.is_none() {
572                        space_start_col = Some(column);
573                    }
574                    column += 1;
575                    // Don't convert to tab here — wait for end of blank run
576                }
577            }
578            b'\t' => {
579                if !all && !in_initial {
580                    // In non-converting mode, just emit the tab
581                    if let Some(start_col) = space_start_col.take() {
582                        let n = column - start_col;
583                        out.write_all(&SPACES[..n.min(SPACES.len())])?;
584                    }
585                    out.write_all(b"\t")?;
586                    column = tabs.next_tab_stop(column);
587                } else {
588                    if space_start_col.is_none() {
589                        space_start_col = Some(column);
590                    }
591                    column = tabs.next_tab_stop(column);
592                }
593            }
594            _ => {
595                // Flush pending blanks
596                if let Some(start_col) = space_start_col.take() {
597                    let count = column - start_col;
598                    if tab_size > 0 {
599                        emit_blanks(out, start_col, count, tab_size)?;
600                    } else {
601                        // Tab list: use is_tab_stop for conversion
602                        emit_blanks_tablist(out, start_col, count, tabs)?;
603                    }
604                }
605
606                if byte == b'\n' {
607                    out.write_all(b"\n")?;
608                    column = 0;
609                    in_initial = true;
610                } else if byte == b'\x08' {
611                    out.write_all(b"\x08")?;
612                    if column > 0 {
613                        column -= 1;
614                    }
615                } else {
616                    if in_initial {
617                        in_initial = false;
618                    }
619                    out.write_all(&[byte])?;
620                    column += 1;
621                }
622            }
623        }
624    }
625
626    if let Some(start_col) = space_start_col {
627        let count = column - start_col;
628        if tab_size > 0 {
629            emit_blanks(out, start_col, count, tab_size)?;
630        } else {
631            emit_blanks_tablist(out, start_col, count, tabs)?;
632        }
633    }
634
635    Ok(())
636}
637
638/// Emit blanks using a tab list (non-regular tab stops).
639/// After the last defined tab stop, only spaces are emitted (no more tabs).
640fn emit_blanks_tablist(
641    out: &mut impl Write,
642    start_col: usize,
643    count: usize,
644    tabs: &TabStops,
645) -> std::io::Result<()> {
646    if count == 0 {
647        return Ok(());
648    }
649    let end_col = start_col + count;
650    let mut col = start_col;
651
652    // Get the last defined tab stop to know when to stop converting to tabs
653    let last_stop = match tabs {
654        TabStops::List(stops) => stops.last().copied().unwrap_or(0),
655        TabStops::Regular(_) => usize::MAX,
656    };
657
658    while col < last_stop {
659        let next_tab = tabs.next_tab_stop(col);
660        if next_tab > end_col || next_tab > last_stop {
661            break;
662        }
663        let blanks_consumed = next_tab - col;
664        if blanks_consumed >= 2 || next_tab < end_col {
665            out.write_all(b"\t")?;
666            col = next_tab;
667        } else {
668            break;
669        }
670    }
671
672    let remaining = end_col - col;
673    if remaining > 0 {
674        let mut r = remaining;
675        while r > 0 {
676            let chunk = r.min(SPACES.len());
677            out.write_all(&SPACES[..chunk])?;
678            r -= chunk;
679        }
680    }
681    Ok(())
682}
coreutils_rs/expand/core.rs

coreutils_rs/expand/
core.rs