Skip to main content

coreutils_rs/expand/
core.rs

1use std::io::Write;
2
/// Describes where tab stops fall on a line.
#[derive(Clone, Debug)]
pub enum TabStops {
    /// A stop every `n` columns (8 is the conventional default).
    Regular(usize),
    /// Explicit stop positions given as 0-indexed columns.
    ///
    /// Lookups use binary search, so the positions are expected to be in
    /// ascending order.
    List(Vec<usize>),
}

impl TabStops {
    /// Returns how many columns separate `column` from the first tab stop
    /// strictly after it.
    ///
    /// A zero-width regular interval yields 0. For an explicit list, any
    /// column at or beyond the final stop yields 1.
    #[inline]
    fn spaces_to_next(&self, column: usize) -> usize {
        match self {
            TabStops::Regular(0) => 0,
            TabStops::Regular(width) => *width - column % *width,
            TabStops::List(stops) => {
                // Whether `column + 1` is itself a stop (Ok) or merely an
                // insertion point (Err), the index names the first stop
                // that lies beyond `column`.
                let idx = match stops.binary_search(&(column + 1)) {
                    Ok(idx) | Err(idx) => idx,
                };
                // Past all tab stops: GNU uses 1 space
                stops.get(idx).map_or(1, |&stop| stop - column)
            }
        }
    }

    /// Reports whether `column` sits exactly on a tab stop.
    ///
    /// A zero-width regular interval has no tab stops at all.
    #[inline]
    fn is_tab_stop(&self, column: usize) -> bool {
        match self {
            TabStops::Regular(width) => *width != 0 && column.is_multiple_of(*width),
            TabStops::List(stops) => stops.binary_search(&column).is_ok(),
        }
    }

    /// Returns the column a tab character moves to when it starts at `column`.
    #[inline]
    fn next_tab_stop(&self, column: usize) -> usize {
        column + self.spaces_to_next(column)
    }
}
60
61/// Parse a tab specification string (e.g., "4", "4,8,12", "4 8 12").
62pub fn parse_tab_stops(spec: &str) -> Result<TabStops, String> {
63    let spec = spec.trim();
64    if spec.is_empty() {
65        return Ok(TabStops::Regular(8));
66    }
67
68    // Check if it's a single number (regular interval)
69    if let Ok(n) = spec.parse::<usize>() {
70        if n == 0 {
71            return Err("tab size cannot be 0".to_string());
72        }
73        return Ok(TabStops::Regular(n));
74    }
75
76    // Parse as comma or space-separated list
77    let mut stops: Vec<usize> = Vec::new();
78    for part in spec.split([',', ' ']) {
79        let part = part.trim();
80        if part.is_empty() {
81            continue;
82        }
83        // Handle / prefix for repeating tab stops
84        if let Some(rest) = part.strip_prefix('/') {
85            let n: usize = rest
86                .parse()
87                .map_err(|_| format!("'{}' is not a valid number", part))?;
88            if n == 0 {
89                return Err("tab size cannot be 0".to_string());
90            }
91            let last = stops.last().copied().unwrap_or(0);
92            let mut pos = last + n;
93            while pos < 10000 {
94                stops.push(pos);
95                pos += n;
96            }
97            continue;
98        }
99        match part.parse::<usize>() {
100            Ok(n) => {
101                if !stops.is_empty() && n <= *stops.last().unwrap() {
102                    return Err("tab sizes must be ascending".to_string());
103                }
104                stops.push(n);
105            }
106            Err(_) => return Err(format!("'{}' is not a valid number", part)),
107        }
108    }
109
110    if stops.is_empty() {
111        return Err("tab specification is empty".to_string());
112    }
113
114    if stops.len() == 1 {
115        return Ok(TabStops::Regular(stops[0]));
116    }
117
118    Ok(TabStops::List(stops))
119}
120
// Fixed run of ASCII spaces that writer-based padding can slice from, so no
// buffer has to be allocated per tab.
const SPACES: [u8; 64] = [b' '; 64];

/// Append `n` space bytes to the end of `output`.
#[inline]
fn push_spaces(output: &mut Vec<u8>, n: usize) {
    // Growing the vector with a fill byte appends exactly `n` spaces.
    output.resize(output.len() + n, b' ');
}
134
135/// Write N spaces to a writer efficiently using pre-computed buffer.
136#[inline]
137fn write_spaces(out: &mut impl Write, n: usize) -> std::io::Result<()> {
138    let mut remaining = n;
139    while remaining > 0 {
140        let chunk = remaining.min(SPACES.len());
141        out.write_all(&SPACES[..chunk])?;
142        remaining -= chunk;
143    }
144    Ok(())
145}
146
147/// Expand tabs to spaces using SIMD scanning.
148/// Uses memchr2 to find tabs and newlines, bulk-copying everything between them.
149pub fn expand_bytes(
150    data: &[u8],
151    tabs: &TabStops,
152    initial_only: bool,
153    out: &mut impl Write,
154) -> std::io::Result<()> {
155    if data.is_empty() {
156        return Ok(());
157    }
158
159    // Fast path: no tabs in data → just copy through
160    if memchr::memchr(b'\t', data).is_none() {
161        return out.write_all(data);
162    }
163
164    // For regular tab stops with no -i flag, use the fast SIMD path
165    if let TabStops::Regular(tab_size) = tabs {
166        if !initial_only && memchr::memchr(b'\x08', data).is_none() {
167            return expand_regular_fast(data, *tab_size, out);
168        }
169    }
170
171    // Generic path for -i flag or tab lists
172    expand_generic(data, tabs, initial_only, out)
173}
174
175/// Fast expand for regular tab stops without -i flag.
176/// Streams directly to writer using memchr2 SIMD to find tabs and newlines.
177/// Avoids allocating a large intermediate buffer.
178fn expand_regular_fast(data: &[u8], tab_size: usize, out: &mut impl Write) -> std::io::Result<()> {
179    let mut column: usize = 0;
180    let mut pos: usize = 0;
181
182    while pos < data.len() {
183        match memchr::memchr2(b'\t', b'\n', &data[pos..]) {
184            Some(offset) => {
185                // Bulk write everything before the special byte
186                if offset > 0 {
187                    out.write_all(&data[pos..pos + offset])?;
188                    column += offset;
189                }
190                let byte = data[pos + offset];
191                pos += offset + 1;
192
193                if byte == b'\n' {
194                    out.write_all(b"\n")?;
195                    column = 0;
196                } else {
197                    // Tab: write spaces directly to output
198                    let spaces = tab_size - (column % tab_size);
199                    write_spaces(out, spaces)?;
200                    column += spaces;
201                }
202            }
203            None => {
204                out.write_all(&data[pos..])?;
205                break;
206            }
207        }
208    }
209
210    Ok(())
211}
212
213/// Generic expand with support for -i flag and tab lists.
214fn expand_generic(
215    data: &[u8],
216    tabs: &TabStops,
217    initial_only: bool,
218    out: &mut impl Write,
219) -> std::io::Result<()> {
220    let mut output = Vec::with_capacity(data.len() + data.len() / 8);
221    let mut column: usize = 0;
222    let mut in_initial = true;
223
224    for &byte in data {
225        match byte {
226            b'\t' => {
227                if initial_only && !in_initial {
228                    output.push(b'\t');
229                    column = tabs.next_tab_stop(column);
230                } else {
231                    let spaces = tabs.spaces_to_next(column);
232                    push_spaces(&mut output, spaces);
233                    column += spaces;
234                }
235            }
236            b'\n' => {
237                output.push(b'\n');
238                column = 0;
239                in_initial = true;
240            }
241            b'\x08' => {
242                output.push(b'\x08');
243                if column > 0 {
244                    column -= 1;
245                }
246            }
247            _ => {
248                if initial_only && in_initial && byte != b' ' {
249                    in_initial = false;
250                }
251                output.push(byte);
252                column += 1;
253            }
254        }
255    }
256
257    out.write_all(&output)
258}
259
260/// Unexpand spaces to tabs.
261/// If `all` is true, convert all sequences of spaces; otherwise only leading spaces.
262pub fn unexpand_bytes(
263    data: &[u8],
264    tabs: &TabStops,
265    all: bool,
266    out: &mut impl Write,
267) -> std::io::Result<()> {
268    if data.is_empty() {
269        return Ok(());
270    }
271
272    let mut output = Vec::with_capacity(data.len());
273    let mut column: usize = 0;
274    let mut space_start_col: Option<usize> = None;
275    let mut in_initial = true;
276
277    for &byte in data {
278        match byte {
279            b' ' => {
280                if !all && !in_initial {
281                    output.push(b' ');
282                    column += 1;
283                } else {
284                    if space_start_col.is_none() {
285                        space_start_col = Some(column);
286                    }
287                    column += 1;
288                    if tabs.is_tab_stop(column) {
289                        output.push(b'\t');
290                        space_start_col = None;
291                    }
292                }
293            }
294            b'\t' => {
295                space_start_col = None;
296                output.push(b'\t');
297                column = tabs.next_tab_stop(column);
298            }
299            b'\n' => {
300                if let Some(start_col) = space_start_col.take() {
301                    push_spaces(&mut output, column - start_col);
302                }
303                output.push(b'\n');
304                column = 0;
305                in_initial = true;
306            }
307            b'\x08' => {
308                if let Some(start_col) = space_start_col.take() {
309                    push_spaces(&mut output, column - start_col);
310                }
311                output.push(b'\x08');
312                if column > 0 {
313                    column -= 1;
314                }
315            }
316            _ => {
317                if let Some(start_col) = space_start_col.take() {
318                    push_spaces(&mut output, column - start_col);
319                }
320                if in_initial {
321                    in_initial = false;
322                }
323                output.push(byte);
324                column += 1;
325            }
326        }
327    }
328
329    if let Some(start_col) = space_start_col {
330        push_spaces(&mut output, column - start_col);
331    }
332
333    out.write_all(&output)
334}