Skip to main content

coreutils_rs/expand/
core.rs

1use std::io::Write;
2
3/// Tab stop specification
4#[derive(Clone, Debug)]
5pub enum TabStops {
6    /// Regular interval (default 8)
7    Regular(usize),
8    /// Explicit list of tab stop positions (0-indexed columns)
9    List(Vec<usize>),
10}
11
12impl TabStops {
13    /// Calculate the number of spaces to the next tab stop from the given column.
14    #[inline]
15    fn spaces_to_next(&self, column: usize) -> usize {
16        match self {
17            TabStops::Regular(n) => {
18                if *n == 0 {
19                    return 0;
20                }
21                *n - (column % *n)
22            }
23            TabStops::List(stops) => {
24                // Find the first tab stop > current column
25                match stops.binary_search(&(column + 1)) {
26                    Ok(idx) => stops[idx] - column,
27                    Err(idx) => {
28                        if idx < stops.len() {
29                            stops[idx] - column
30                        } else {
31                            // Past all tab stops: GNU uses 1 space
32                            1
33                        }
34                    }
35                }
36            }
37        }
38    }
39
40    /// Get the next tab stop position after the given column.
41    #[inline]
42    fn next_tab_stop(&self, column: usize) -> usize {
43        column + self.spaces_to_next(column)
44    }
45}
46
47/// Parse a tab specification string (e.g., "4", "4,8,12", "4 8 12").
48pub fn parse_tab_stops(spec: &str) -> Result<TabStops, String> {
49    let spec = spec.trim();
50    if spec.is_empty() {
51        return Ok(TabStops::Regular(8));
52    }
53
54    // Check if it's a single number (regular interval)
55    if let Ok(n) = spec.parse::<usize>() {
56        if n == 0 {
57            return Err("tab size cannot be 0".to_string());
58        }
59        return Ok(TabStops::Regular(n));
60    }
61
62    // Parse as comma or space-separated list
63    let mut stops: Vec<usize> = Vec::new();
64    for part in spec.split([',', ' ']) {
65        let part = part.trim();
66        if part.is_empty() {
67            continue;
68        }
69        // Handle / prefix for repeating tab stops
70        if let Some(rest) = part.strip_prefix('/') {
71            let n: usize = rest
72                .parse()
73                .map_err(|_| format!("'{}' is not a valid number", part))?;
74            if n == 0 {
75                return Err("tab size cannot be 0".to_string());
76            }
77            let last = stops.last().copied().unwrap_or(0);
78            let mut pos = last + n;
79            while pos < 10000 {
80                stops.push(pos);
81                pos += n;
82            }
83            continue;
84        }
85        match part.parse::<usize>() {
86            Ok(n) => {
87                if !stops.is_empty() && n <= *stops.last().unwrap() {
88                    return Err("tab sizes must be ascending".to_string());
89                }
90                stops.push(n);
91            }
92            Err(_) => return Err(format!("'{}' is not a valid number", part)),
93        }
94    }
95
96    if stops.is_empty() {
97        return Err("tab specification is empty".to_string());
98    }
99
100    if stops.len() == 1 {
101        return Ok(TabStops::Regular(stops[0]));
102    }
103
104    Ok(TabStops::List(stops))
105}
106
107// Pre-computed spaces buffer for fast tab expansion (avoids per-tab allocation)
108// Larger buffer (256 bytes) means most tabs can be served in a single memcpy
109const SPACES: [u8; 256] = [b' '; 256];
110
111/// Write N spaces to a Vec efficiently using pre-computed buffer.
112#[inline]
113fn push_spaces(output: &mut Vec<u8>, n: usize) {
114    let mut remaining = n;
115    while remaining > 0 {
116        let chunk = remaining.min(SPACES.len());
117        output.extend_from_slice(&SPACES[..chunk]);
118        remaining -= chunk;
119    }
120}
121
122/// Expand tabs to spaces using SIMD scanning.
123/// Uses memchr2 to find tabs and newlines, bulk-copying everything between them.
124pub fn expand_bytes(
125    data: &[u8],
126    tabs: &TabStops,
127    initial_only: bool,
128    out: &mut impl Write,
129) -> std::io::Result<()> {
130    if data.is_empty() {
131        return Ok(());
132    }
133
134    // Fast path: no tabs in data → just copy through
135    if memchr::memchr(b'\t', data).is_none() {
136        return out.write_all(data);
137    }
138
139    // For regular tab stops with no -i flag, use the fast SIMD path
140    if let TabStops::Regular(tab_size) = tabs {
141        if !initial_only && memchr::memchr(b'\x08', data).is_none() {
142            return expand_regular_fast(data, *tab_size, out);
143        }
144    }
145
146    // Generic path for -i flag or tab lists
147    expand_generic(data, tabs, initial_only, out)
148}
149
150/// Fast expand for regular tab stops without -i flag.
151/// Writes directly from source data for non-tab runs, avoiding intermediate buffer copies.
152fn expand_regular_fast(data: &[u8], tab_size: usize, out: &mut impl Write) -> std::io::Result<()> {
153    let mut column: usize = 0;
154    let mut pos: usize = 0;
155
156    while pos < data.len() {
157        match memchr::memchr2(b'\t', b'\n', &data[pos..]) {
158            Some(offset) => {
159                // Write non-special bytes directly from source (zero-copy)
160                if offset > 0 {
161                    out.write_all(&data[pos..pos + offset])?;
162                    column += offset;
163                }
164                let byte = data[pos + offset];
165                pos += offset + 1;
166
167                if byte == b'\n' {
168                    out.write_all(b"\n")?;
169                    column = 0;
170                } else {
171                    // Tab: write spaces directly from const buffer
172                    let spaces = tab_size - (column % tab_size);
173                    out.write_all(&SPACES[..spaces])?;
174                    column += spaces;
175                }
176            }
177            None => {
178                out.write_all(&data[pos..])?;
179                break;
180            }
181        }
182    }
183
184    Ok(())
185}
186
187/// Generic expand with support for -i flag and tab lists.
188fn expand_generic(
189    data: &[u8],
190    tabs: &TabStops,
191    initial_only: bool,
192    out: &mut impl Write,
193) -> std::io::Result<()> {
194    let mut output = Vec::with_capacity(data.len() + data.len() / 8);
195    let mut column: usize = 0;
196    let mut in_initial = true;
197
198    for &byte in data {
199        match byte {
200            b'\t' => {
201                if initial_only && !in_initial {
202                    output.push(b'\t');
203                    column = tabs.next_tab_stop(column);
204                } else {
205                    let spaces = tabs.spaces_to_next(column);
206                    push_spaces(&mut output, spaces);
207                    column += spaces;
208                }
209            }
210            b'\n' => {
211                output.push(b'\n');
212                column = 0;
213                in_initial = true;
214            }
215            b'\x08' => {
216                output.push(b'\x08');
217                if column > 0 {
218                    column -= 1;
219                }
220            }
221            _ => {
222                if initial_only && in_initial && byte != b' ' {
223                    in_initial = false;
224                }
225                output.push(byte);
226                column += 1;
227            }
228        }
229    }
230
231    out.write_all(&output)
232}
233
234/// Unexpand spaces to tabs.
235/// If `all` is true, convert all sequences of spaces; otherwise only leading spaces.
236pub fn unexpand_bytes(
237    data: &[u8],
238    tabs: &TabStops,
239    all: bool,
240    out: &mut impl Write,
241) -> std::io::Result<()> {
242    if data.is_empty() {
243        return Ok(());
244    }
245
246    // Fast path: no spaces or tabs → just copy through
247    if memchr::memchr2(b' ', b'\t', data).is_none() {
248        return out.write_all(data);
249    }
250
251    // For regular tab stops, use the optimized SIMD-scanning path
252    if let TabStops::Regular(tab_size) = tabs {
253        if memchr::memchr(b'\x08', data).is_none() {
254            return unexpand_regular_fast(data, *tab_size, all, out);
255        }
256    }
257
258    // Generic path for tab lists or data with backspaces
259    unexpand_generic(data, tabs, all, out)
260}
261
262/// Emit a run of blanks as the optimal combination of tabs and spaces.
263/// Matches GNU unexpand behavior: a single blank at a tab stop is only converted
264/// to a tab if more blanks follow, otherwise it stays as a space.
265#[inline]
266fn emit_blanks(
267    out: &mut impl Write,
268    start_col: usize,
269    count: usize,
270    tab_size: usize,
271) -> std::io::Result<()> {
272    if count == 0 {
273        return Ok(());
274    }
275    let end_col = start_col + count;
276    let mut col = start_col;
277
278    // Emit tabs for each tab stop we can reach
279    loop {
280        let next_tab = col + (tab_size - col % tab_size);
281        if next_tab > end_col {
282            break;
283        }
284        let blanks_consumed = next_tab - col;
285        if blanks_consumed >= 2 || next_tab < end_col {
286            // 2+ blanks to tab stop, OR 1 blank but more follow → emit tab
287            out.write_all(b"\t")?;
288            col = next_tab;
289        } else {
290            // 1 blank at tab stop with nothing after → keep as space
291            break;
292        }
293    }
294
295    // Emit remaining spaces
296    let remaining = end_col - col;
297    if remaining > 0 {
298        let mut r = remaining;
299        while r > 0 {
300            let chunk = r.min(SPACES.len());
301            out.write_all(&SPACES[..chunk])?;
302            r -= chunk;
303        }
304    }
305    Ok(())
306}
307
308/// Fast unexpand for regular tab stops without backspaces.
309/// Uses memchr SIMD scanning to skip non-special bytes in bulk.
310fn unexpand_regular_fast(
311    data: &[u8],
312    tab_size: usize,
313    all: bool,
314    out: &mut impl Write,
315) -> std::io::Result<()> {
316    let mut column: usize = 0;
317    let mut pos: usize = 0;
318    let mut in_initial = true;
319
320    while pos < data.len() {
321        if in_initial || all {
322            // Check for blanks to convert
323            if data[pos] == b' ' || data[pos] == b'\t' {
324                // Count consecutive blanks, tracking column advancement
325                let blank_start_col = column;
326                while pos < data.len() && (data[pos] == b' ' || data[pos] == b'\t') {
327                    if data[pos] == b'\t' {
328                        column += tab_size - column % tab_size;
329                    } else {
330                        column += 1;
331                    }
332                    pos += 1;
333                }
334                // Emit blanks as optimal tabs+spaces
335                emit_blanks(out, blank_start_col, column - blank_start_col, tab_size)?;
336                continue;
337            }
338            if data[pos] == b'\n' {
339                out.write_all(b"\n")?;
340                column = 0;
341                in_initial = true;
342                pos += 1;
343                continue;
344            }
345            // Non-blank: switch to body mode
346            in_initial = false;
347        }
348
349        // Body of line: bulk copy until next interesting byte
350        if !all {
351            // Default mode: copy everything until newline
352            match memchr::memchr(b'\n', &data[pos..]) {
353                Some(offset) => {
354                    out.write_all(&data[pos..pos + offset + 1])?;
355                    column = 0;
356                    in_initial = true;
357                    pos += offset + 1;
358                }
359                None => {
360                    out.write_all(&data[pos..])?;
361                    return Ok(());
362                }
363            }
364        } else {
365            // all=true: copy until next space, tab, or newline
366            match memchr::memchr3(b' ', b'\t', b'\n', &data[pos..]) {
367                Some(offset) => {
368                    if offset > 0 {
369                        out.write_all(&data[pos..pos + offset])?;
370                        column += offset;
371                    }
372                    pos += offset;
373                }
374                None => {
375                    out.write_all(&data[pos..])?;
376                    return Ok(());
377                }
378            }
379        }
380    }
381
382    Ok(())
383}
384
385/// Generic unexpand with support for tab lists and backspaces.
386fn unexpand_generic(
387    data: &[u8],
388    tabs: &TabStops,
389    all: bool,
390    out: &mut impl Write,
391) -> std::io::Result<()> {
392    let tab_size = match tabs {
393        TabStops::Regular(n) => *n,
394        TabStops::List(_) => 0, // handled by is_tab_stop/next_tab_stop
395    };
396    let mut column: usize = 0;
397    let mut space_start_col: Option<usize> = None;
398    let mut in_initial = true;
399
400    for &byte in data {
401        match byte {
402            b' ' => {
403                if !all && !in_initial {
404                    out.write_all(b" ")?;
405                    column += 1;
406                } else {
407                    if space_start_col.is_none() {
408                        space_start_col = Some(column);
409                    }
410                    column += 1;
411                    // Don't convert to tab here — wait for end of blank run
412                }
413            }
414            b'\t' => {
415                if !all && !in_initial {
416                    // In non-converting mode, just emit the tab
417                    if let Some(start_col) = space_start_col.take() {
418                        let n = column - start_col;
419                        out.write_all(&SPACES[..n.min(SPACES.len())])?;
420                    }
421                    out.write_all(b"\t")?;
422                    column = tabs.next_tab_stop(column);
423                } else {
424                    if space_start_col.is_none() {
425                        space_start_col = Some(column);
426                    }
427                    column = tabs.next_tab_stop(column);
428                }
429            }
430            _ => {
431                // Flush pending blanks
432                if let Some(start_col) = space_start_col.take() {
433                    let count = column - start_col;
434                    if tab_size > 0 {
435                        emit_blanks(out, start_col, count, tab_size)?;
436                    } else {
437                        // Tab list: use is_tab_stop for conversion
438                        emit_blanks_tablist(out, start_col, count, tabs)?;
439                    }
440                }
441
442                if byte == b'\n' {
443                    out.write_all(b"\n")?;
444                    column = 0;
445                    in_initial = true;
446                } else if byte == b'\x08' {
447                    out.write_all(b"\x08")?;
448                    if column > 0 {
449                        column -= 1;
450                    }
451                } else {
452                    if in_initial {
453                        in_initial = false;
454                    }
455                    out.write_all(&[byte])?;
456                    column += 1;
457                }
458            }
459        }
460    }
461
462    if let Some(start_col) = space_start_col {
463        let count = column - start_col;
464        if tab_size > 0 {
465            emit_blanks(out, start_col, count, tab_size)?;
466        } else {
467            emit_blanks_tablist(out, start_col, count, tabs)?;
468        }
469    }
470
471    Ok(())
472}
473
474/// Emit blanks using a tab list (non-regular tab stops).
475/// After the last defined tab stop, only spaces are emitted (no more tabs).
476fn emit_blanks_tablist(
477    out: &mut impl Write,
478    start_col: usize,
479    count: usize,
480    tabs: &TabStops,
481) -> std::io::Result<()> {
482    if count == 0 {
483        return Ok(());
484    }
485    let end_col = start_col + count;
486    let mut col = start_col;
487
488    // Get the last defined tab stop to know when to stop converting to tabs
489    let last_stop = match tabs {
490        TabStops::List(stops) => stops.last().copied().unwrap_or(0),
491        TabStops::Regular(_) => usize::MAX,
492    };
493
494    while col < last_stop {
495        let next_tab = tabs.next_tab_stop(col);
496        if next_tab > end_col || next_tab > last_stop {
497            break;
498        }
499        let blanks_consumed = next_tab - col;
500        if blanks_consumed >= 2 || next_tab < end_col {
501            out.write_all(b"\t")?;
502            col = next_tab;
503        } else {
504            break;
505        }
506    }
507
508    let remaining = end_col - col;
509    if remaining > 0 {
510        let mut r = remaining;
511        while r > 0 {
512            let chunk = r.min(SPACES.len());
513            out.write_all(&SPACES[..chunk])?;
514            r -= chunk;
515        }
516    }
517    Ok(())
518}