Skip to main content

coreutils_rs/expand/
core.rs

1use std::io::Write;
2
3/// Tab stop specification
4#[derive(Clone, Debug)]
5pub enum TabStops {
6    /// Regular interval (default 8)
7    Regular(usize),
8    /// Explicit list of tab stop positions (0-indexed columns)
9    List(Vec<usize>),
10}
11
12impl TabStops {
13    /// Calculate the number of spaces to the next tab stop from the given column.
14    #[inline]
15    fn spaces_to_next(&self, column: usize) -> usize {
16        match self {
17            TabStops::Regular(n) => {
18                if *n == 0 {
19                    return 0;
20                }
21                *n - (column % *n)
22            }
23            TabStops::List(stops) => {
24                // Find the first tab stop > current column
25                match stops.binary_search(&(column + 1)) {
26                    Ok(idx) => stops[idx] - column,
27                    Err(idx) => {
28                        if idx < stops.len() {
29                            stops[idx] - column
30                        } else {
31                            // Past all tab stops: GNU uses 1 space
32                            1
33                        }
34                    }
35                }
36            }
37        }
38    }
39
40    /// Get the next tab stop position after the given column.
41    #[inline]
42    fn next_tab_stop(&self, column: usize) -> usize {
43        column + self.spaces_to_next(column)
44    }
45}
46
47/// Parse a tab specification string (e.g., "4", "4,8,12", "4 8 12").
48pub fn parse_tab_stops(spec: &str) -> Result<TabStops, String> {
49    let spec = spec.trim();
50    if spec.is_empty() {
51        return Ok(TabStops::Regular(8));
52    }
53
54    // Check if it's a single number (regular interval)
55    if let Ok(n) = spec.parse::<usize>() {
56        if n == 0 {
57            return Err("tab size cannot be 0".to_string());
58        }
59        return Ok(TabStops::Regular(n));
60    }
61
62    // Parse as comma or space-separated list
63    let mut stops: Vec<usize> = Vec::new();
64    for part in spec.split([',', ' ']) {
65        let part = part.trim();
66        if part.is_empty() {
67            continue;
68        }
69        // Handle / prefix for repeating tab stops
70        if let Some(rest) = part.strip_prefix('/') {
71            let n: usize = rest
72                .parse()
73                .map_err(|_| format!("'{}' is not a valid number", part))?;
74            if n == 0 {
75                return Err("tab size cannot be 0".to_string());
76            }
77            let last = stops.last().copied().unwrap_or(0);
78            let mut pos = last + n;
79            while pos < 10000 {
80                stops.push(pos);
81                pos += n;
82            }
83            continue;
84        }
85        match part.parse::<usize>() {
86            Ok(n) => {
87                if !stops.is_empty() && n <= *stops.last().unwrap() {
88                    return Err("tab sizes must be ascending".to_string());
89                }
90                stops.push(n);
91            }
92            Err(_) => return Err(format!("'{}' is not a valid number", part)),
93        }
94    }
95
96    if stops.is_empty() {
97        return Err("tab specification is empty".to_string());
98    }
99
100    if stops.len() == 1 {
101        return Ok(TabStops::Regular(stops[0]));
102    }
103
104    Ok(TabStops::List(stops))
105}
106
107// Pre-computed spaces buffer for fast tab expansion (avoids per-tab allocation)
108// Larger buffer (256 bytes) means most tabs can be served in a single memcpy
109const SPACES: [u8; 256] = [b' '; 256];
110
111/// Write N spaces to a Vec efficiently using pre-computed buffer.
112#[inline]
113fn push_spaces(output: &mut Vec<u8>, n: usize) {
114    let mut remaining = n;
115    while remaining > 0 {
116        let chunk = remaining.min(SPACES.len());
117        output.extend_from_slice(&SPACES[..chunk]);
118        remaining -= chunk;
119    }
120}
121
122/// Write N spaces to a Write stream using pre-computed buffer.
123#[inline]
124fn write_spaces(out: &mut impl Write, n: usize) -> std::io::Result<()> {
125    let mut remaining = n;
126    while remaining > 0 {
127        let chunk = remaining.min(SPACES.len());
128        out.write_all(&SPACES[..chunk])?;
129        remaining -= chunk;
130    }
131    Ok(())
132}
133
134/// Expand tabs to spaces using SIMD scanning.
135/// Uses memchr2 to find tabs and newlines, bulk-copying everything between them.
136pub fn expand_bytes(
137    data: &[u8],
138    tabs: &TabStops,
139    initial_only: bool,
140    out: &mut impl Write,
141) -> std::io::Result<()> {
142    if data.is_empty() {
143        return Ok(());
144    }
145
146    // Fast path: no tabs in data → just copy through
147    if memchr::memchr(b'\t', data).is_none() {
148        return out.write_all(data);
149    }
150
151    // For regular tab stops, use fast SIMD paths
152    if let TabStops::Regular(tab_size) = tabs {
153        if initial_only {
154            // --initial mode processes line-by-line anyway, so handle backspace
155            // per-line instead of scanning the whole buffer.
156            return expand_initial_fast(data, *tab_size, out);
157        } else if memchr::memchr(b'\x08', data).is_none() {
158            return expand_regular_fast(data, *tab_size, out);
159        }
160    }
161
162    // Generic path for backspace handling or tab lists
163    expand_generic(data, tabs, initial_only, out)
164}
165
166/// Fast expand for regular tab stops without -i flag.
167/// Writes directly from source data for non-tab runs, avoiding intermediate buffer copies.
168fn expand_regular_fast(data: &[u8], tab_size: usize, out: &mut impl Write) -> std::io::Result<()> {
169    debug_assert!(tab_size > 0, "tab_size must be > 0");
170    let mut column: usize = 0;
171    let mut pos: usize = 0;
172
173    while pos < data.len() {
174        match memchr::memchr2(b'\t', b'\n', &data[pos..]) {
175            Some(offset) => {
176                // Write non-special bytes directly from source (zero-copy)
177                if offset > 0 {
178                    out.write_all(&data[pos..pos + offset])?;
179                    column += offset;
180                }
181                let byte = data[pos + offset];
182                pos += offset + 1;
183
184                if byte == b'\n' {
185                    out.write_all(b"\n")?;
186                    column = 0;
187                } else {
188                    // Tab: write spaces
189                    let spaces = tab_size - (column % tab_size);
190                    write_spaces(out, spaces)?;
191                    column += spaces;
192                }
193            }
194            None => {
195                out.write_all(&data[pos..])?;
196                break;
197            }
198        }
199    }
200
201    Ok(())
202}
203
204/// Fast expand for --initial mode with regular tab stops.
205/// Only expands tabs in the leading whitespace of each line, bulk-copying the rest.
206/// Uses memchr (SIMD) to find line boundaries. Leading-whitespace expansion is scalar.
207/// Handles backspace per-line: lines containing \x08 fall back to generic expand.
208fn expand_initial_fast(data: &[u8], tab_size: usize, out: &mut impl Write) -> std::io::Result<()> {
209    debug_assert!(tab_size > 0, "tab_size must be > 0");
210    let tabs = TabStops::Regular(tab_size);
211    let mut pos: usize = 0;
212
213    while pos < data.len() {
214        // Find end of this line
215        let line_end = memchr::memchr(b'\n', &data[pos..])
216            .map(|off| pos + off + 1)
217            .unwrap_or(data.len());
218
219        let line = &data[pos..line_end];
220        debug_assert!(!line.is_empty());
221
222        // Fast skip: if line doesn't start with tab or space, write it whole
223        let first = line[0];
224        if first != b'\t' && first != b' ' {
225            out.write_all(line)?;
226            pos = line_end;
227            continue;
228        }
229
230        // If this line contains a backspace, fall back to generic for this line only
231        if memchr::memchr(b'\x08', line).is_some() {
232            expand_generic(line, &tabs, true, out)?;
233            pos = line_end;
234            continue;
235        }
236
237        // Expand only leading tabs/spaces in this line
238        let mut column: usize = 0;
239        let mut i = 0; // offset within line
240        while i < line.len() {
241            let byte = line[i];
242            if byte == b'\t' {
243                let spaces = tab_size - (column % tab_size);
244                write_spaces(out, spaces)?;
245                column += spaces;
246                i += 1;
247            } else if byte == b' ' {
248                // Batch consecutive spaces from source data
249                let space_start = i;
250                while i < line.len() && line[i] == b' ' {
251                    i += 1;
252                }
253                out.write_all(&line[space_start..i])?;
254                column += i - space_start;
255            } else {
256                // First non-blank: write the rest of the line unchanged
257                break;
258            }
259        }
260
261        // Write remainder of line as-is (zero-copy)
262        if i < line.len() {
263            out.write_all(&line[i..])?;
264        }
265
266        pos = line_end;
267    }
268
269    Ok(())
270}
271
272/// Generic expand with support for -i flag and tab lists.
273fn expand_generic(
274    data: &[u8],
275    tabs: &TabStops,
276    initial_only: bool,
277    out: &mut impl Write,
278) -> std::io::Result<()> {
279    let mut output = Vec::with_capacity(data.len() + data.len() / 8);
280    let mut column: usize = 0;
281    let mut in_initial = true;
282
283    for &byte in data {
284        match byte {
285            b'\t' => {
286                if initial_only && !in_initial {
287                    output.push(b'\t');
288                    column = tabs.next_tab_stop(column);
289                } else {
290                    let spaces = tabs.spaces_to_next(column);
291                    push_spaces(&mut output, spaces);
292                    column += spaces;
293                }
294            }
295            b'\n' => {
296                output.push(b'\n');
297                column = 0;
298                in_initial = true;
299            }
300            b'\x08' => {
301                output.push(b'\x08');
302                if column > 0 {
303                    column -= 1;
304                }
305            }
306            _ => {
307                if initial_only && in_initial && byte != b' ' {
308                    in_initial = false;
309                }
310                output.push(byte);
311                column += 1;
312            }
313        }
314    }
315
316    out.write_all(&output)
317}
318
319/// Unexpand spaces to tabs.
320/// If `all` is true, convert all sequences of spaces; otherwise only leading spaces.
321pub fn unexpand_bytes(
322    data: &[u8],
323    tabs: &TabStops,
324    all: bool,
325    out: &mut impl Write,
326) -> std::io::Result<()> {
327    if data.is_empty() {
328        return Ok(());
329    }
330
331    // Fast path: no spaces or tabs → just copy through
332    if memchr::memchr2(b' ', b'\t', data).is_none() {
333        return out.write_all(data);
334    }
335
336    // For regular tab stops, use the optimized SIMD-scanning path
337    if let TabStops::Regular(tab_size) = tabs {
338        if memchr::memchr(b'\x08', data).is_none() {
339            return unexpand_regular_fast(data, *tab_size, all, out);
340        }
341    }
342
343    // Generic path for tab lists or data with backspaces
344    unexpand_generic(data, tabs, all, out)
345}
346
347/// Emit a run of blanks as the optimal combination of tabs and spaces.
348/// Matches GNU unexpand behavior: a single blank at a tab stop is only converted
349/// to a tab if more blanks follow, otherwise it stays as a space.
350#[inline]
351fn emit_blanks(
352    out: &mut impl Write,
353    start_col: usize,
354    count: usize,
355    tab_size: usize,
356) -> std::io::Result<()> {
357    if count == 0 {
358        return Ok(());
359    }
360    let end_col = start_col + count;
361    let mut col = start_col;
362
363    // Emit tabs for each tab stop we can reach
364    loop {
365        let next_tab = col + (tab_size - col % tab_size);
366        if next_tab > end_col {
367            break;
368        }
369        let blanks_consumed = next_tab - col;
370        if blanks_consumed >= 2 || next_tab < end_col {
371            // 2+ blanks to tab stop, OR 1 blank but more follow → emit tab
372            out.write_all(b"\t")?;
373            col = next_tab;
374        } else {
375            // 1 blank at tab stop with nothing after → keep as space
376            break;
377        }
378    }
379
380    // Emit remaining spaces
381    let remaining = end_col - col;
382    if remaining > 0 {
383        let mut r = remaining;
384        while r > 0 {
385            let chunk = r.min(SPACES.len());
386            out.write_all(&SPACES[..chunk])?;
387            r -= chunk;
388        }
389    }
390    Ok(())
391}
392
393/// Fast unexpand for regular tab stops without backspaces.
394/// Uses memchr SIMD scanning to skip non-special bytes in bulk.
395fn unexpand_regular_fast(
396    data: &[u8],
397    tab_size: usize,
398    all: bool,
399    out: &mut impl Write,
400) -> std::io::Result<()> {
401    let mut column: usize = 0;
402    let mut pos: usize = 0;
403    let mut in_initial = true;
404
405    while pos < data.len() {
406        if in_initial || all {
407            // Check for blanks to convert
408            if data[pos] == b' ' || data[pos] == b'\t' {
409                // Count consecutive blanks, tracking column advancement
410                let blank_start_col = column;
411                while pos < data.len() && (data[pos] == b' ' || data[pos] == b'\t') {
412                    if data[pos] == b'\t' {
413                        column += tab_size - column % tab_size;
414                    } else {
415                        column += 1;
416                    }
417                    pos += 1;
418                }
419                // Emit blanks as optimal tabs+spaces
420                emit_blanks(out, blank_start_col, column - blank_start_col, tab_size)?;
421                continue;
422            }
423            if data[pos] == b'\n' {
424                out.write_all(b"\n")?;
425                column = 0;
426                in_initial = true;
427                pos += 1;
428                continue;
429            }
430            // Non-blank: switch to body mode
431            in_initial = false;
432        }
433
434        // Body of line: bulk copy until next interesting byte
435        if !all {
436            // Default mode: copy everything until newline
437            match memchr::memchr(b'\n', &data[pos..]) {
438                Some(offset) => {
439                    out.write_all(&data[pos..pos + offset + 1])?;
440                    column = 0;
441                    in_initial = true;
442                    pos += offset + 1;
443                }
444                None => {
445                    out.write_all(&data[pos..])?;
446                    return Ok(());
447                }
448            }
449        } else {
450            // all=true: copy until next space, tab, or newline
451            match memchr::memchr3(b' ', b'\t', b'\n', &data[pos..]) {
452                Some(offset) => {
453                    if offset > 0 {
454                        out.write_all(&data[pos..pos + offset])?;
455                        column += offset;
456                    }
457                    pos += offset;
458                }
459                None => {
460                    out.write_all(&data[pos..])?;
461                    return Ok(());
462                }
463            }
464        }
465    }
466
467    Ok(())
468}
469
470/// Generic unexpand with support for tab lists and backspaces.
471fn unexpand_generic(
472    data: &[u8],
473    tabs: &TabStops,
474    all: bool,
475    out: &mut impl Write,
476) -> std::io::Result<()> {
477    let tab_size = match tabs {
478        TabStops::Regular(n) => *n,
479        TabStops::List(_) => 0, // handled by is_tab_stop/next_tab_stop
480    };
481    let mut column: usize = 0;
482    let mut space_start_col: Option<usize> = None;
483    let mut in_initial = true;
484
485    for &byte in data {
486        match byte {
487            b' ' => {
488                if !all && !in_initial {
489                    out.write_all(b" ")?;
490                    column += 1;
491                } else {
492                    if space_start_col.is_none() {
493                        space_start_col = Some(column);
494                    }
495                    column += 1;
496                    // Don't convert to tab here — wait for end of blank run
497                }
498            }
499            b'\t' => {
500                if !all && !in_initial {
501                    // In non-converting mode, just emit the tab
502                    if let Some(start_col) = space_start_col.take() {
503                        let n = column - start_col;
504                        out.write_all(&SPACES[..n.min(SPACES.len())])?;
505                    }
506                    out.write_all(b"\t")?;
507                    column = tabs.next_tab_stop(column);
508                } else {
509                    if space_start_col.is_none() {
510                        space_start_col = Some(column);
511                    }
512                    column = tabs.next_tab_stop(column);
513                }
514            }
515            _ => {
516                // Flush pending blanks
517                if let Some(start_col) = space_start_col.take() {
518                    let count = column - start_col;
519                    if tab_size > 0 {
520                        emit_blanks(out, start_col, count, tab_size)?;
521                    } else {
522                        // Tab list: use is_tab_stop for conversion
523                        emit_blanks_tablist(out, start_col, count, tabs)?;
524                    }
525                }
526
527                if byte == b'\n' {
528                    out.write_all(b"\n")?;
529                    column = 0;
530                    in_initial = true;
531                } else if byte == b'\x08' {
532                    out.write_all(b"\x08")?;
533                    if column > 0 {
534                        column -= 1;
535                    }
536                } else {
537                    if in_initial {
538                        in_initial = false;
539                    }
540                    out.write_all(&[byte])?;
541                    column += 1;
542                }
543            }
544        }
545    }
546
547    if let Some(start_col) = space_start_col {
548        let count = column - start_col;
549        if tab_size > 0 {
550            emit_blanks(out, start_col, count, tab_size)?;
551        } else {
552            emit_blanks_tablist(out, start_col, count, tabs)?;
553        }
554    }
555
556    Ok(())
557}
558
559/// Emit blanks using a tab list (non-regular tab stops).
560/// After the last defined tab stop, only spaces are emitted (no more tabs).
561fn emit_blanks_tablist(
562    out: &mut impl Write,
563    start_col: usize,
564    count: usize,
565    tabs: &TabStops,
566) -> std::io::Result<()> {
567    if count == 0 {
568        return Ok(());
569    }
570    let end_col = start_col + count;
571    let mut col = start_col;
572
573    // Get the last defined tab stop to know when to stop converting to tabs
574    let last_stop = match tabs {
575        TabStops::List(stops) => stops.last().copied().unwrap_or(0),
576        TabStops::Regular(_) => usize::MAX,
577    };
578
579    while col < last_stop {
580        let next_tab = tabs.next_tab_stop(col);
581        if next_tab > end_col || next_tab > last_stop {
582            break;
583        }
584        let blanks_consumed = next_tab - col;
585        if blanks_consumed >= 2 || next_tab < end_col {
586            out.write_all(b"\t")?;
587            col = next_tab;
588        } else {
589            break;
590        }
591    }
592
593    let remaining = end_col - col;
594    if remaining > 0 {
595        let mut r = remaining;
596        while r > 0 {
597            let chunk = r.min(SPACES.len());
598            out.write_all(&SPACES[..chunk])?;
599            r -= chunk;
600        }
601    }
602    Ok(())
603}