Skip to main content

coreutils_rs/expand/
core.rs

1use std::io::Write;
2
3/// Tab stop specification
4#[derive(Clone, Debug)]
5pub enum TabStops {
6    /// Regular interval (default 8)
7    Regular(usize),
8    /// Explicit list of tab stop positions (0-indexed columns)
9    List(Vec<usize>),
10}
11
12impl TabStops {
13    /// Calculate the number of spaces to the next tab stop from the given column.
14    #[inline]
15    fn spaces_to_next(&self, column: usize) -> usize {
16        match self {
17            TabStops::Regular(n) => {
18                if *n == 0 {
19                    return 0;
20                }
21                *n - (column % *n)
22            }
23            TabStops::List(stops) => {
24                // Find the first tab stop > current column
25                match stops.binary_search(&(column + 1)) {
26                    Ok(idx) => stops[idx] - column,
27                    Err(idx) => {
28                        if idx < stops.len() {
29                            stops[idx] - column
30                        } else {
31                            // Past all tab stops: GNU uses 1 space
32                            1
33                        }
34                    }
35                }
36            }
37        }
38    }
39
40    /// Check if the given column is at a tab stop position.
41    #[inline]
42    fn is_tab_stop(&self, column: usize) -> bool {
43        match self {
44            TabStops::Regular(n) => {
45                if *n == 0 {
46                    return false;
47                }
48                column.is_multiple_of(*n)
49            }
50            TabStops::List(stops) => stops.binary_search(&column).is_ok(),
51        }
52    }
53
54    /// Get the next tab stop position after the given column.
55    #[inline]
56    fn next_tab_stop(&self, column: usize) -> usize {
57        column + self.spaces_to_next(column)
58    }
59}
60
61/// Parse a tab specification string (e.g., "4", "4,8,12", "4 8 12").
62pub fn parse_tab_stops(spec: &str) -> Result<TabStops, String> {
63    let spec = spec.trim();
64    if spec.is_empty() {
65        return Ok(TabStops::Regular(8));
66    }
67
68    // Check if it's a single number (regular interval)
69    if let Ok(n) = spec.parse::<usize>() {
70        if n == 0 {
71            return Err("tab size cannot be 0".to_string());
72        }
73        return Ok(TabStops::Regular(n));
74    }
75
76    // Parse as comma or space-separated list
77    let mut stops: Vec<usize> = Vec::new();
78    for part in spec.split([',', ' ']) {
79        let part = part.trim();
80        if part.is_empty() {
81            continue;
82        }
83        // Handle / prefix for repeating tab stops
84        if let Some(rest) = part.strip_prefix('/') {
85            let n: usize = rest
86                .parse()
87                .map_err(|_| format!("'{}' is not a valid number", part))?;
88            if n == 0 {
89                return Err("tab size cannot be 0".to_string());
90            }
91            let last = stops.last().copied().unwrap_or(0);
92            let mut pos = last + n;
93            while pos < 10000 {
94                stops.push(pos);
95                pos += n;
96            }
97            continue;
98        }
99        match part.parse::<usize>() {
100            Ok(n) => {
101                if !stops.is_empty() && n <= *stops.last().unwrap() {
102                    return Err("tab sizes must be ascending".to_string());
103                }
104                stops.push(n);
105            }
106            Err(_) => return Err(format!("'{}' is not a valid number", part)),
107        }
108    }
109
110    if stops.is_empty() {
111        return Err("tab specification is empty".to_string());
112    }
113
114    if stops.len() == 1 {
115        return Ok(TabStops::Regular(stops[0]));
116    }
117
118    Ok(TabStops::List(stops))
119}
120
121// Pre-computed spaces buffer for fast tab expansion (avoids per-tab allocation)
122// Larger buffer (256 bytes) means most tabs can be served in a single memcpy
123const SPACES: [u8; 256] = [b' '; 256];
124
125/// Write N spaces to a Vec efficiently using pre-computed buffer.
126#[inline]
127fn push_spaces(output: &mut Vec<u8>, n: usize) {
128    let mut remaining = n;
129    while remaining > 0 {
130        let chunk = remaining.min(SPACES.len());
131        output.extend_from_slice(&SPACES[..chunk]);
132        remaining -= chunk;
133    }
134}
135
136/// Expand tabs to spaces using SIMD scanning.
137/// Uses memchr2 to find tabs and newlines, bulk-copying everything between them.
138pub fn expand_bytes(
139    data: &[u8],
140    tabs: &TabStops,
141    initial_only: bool,
142    out: &mut impl Write,
143) -> std::io::Result<()> {
144    if data.is_empty() {
145        return Ok(());
146    }
147
148    // Fast path: no tabs in data → just copy through
149    if memchr::memchr(b'\t', data).is_none() {
150        return out.write_all(data);
151    }
152
153    // For regular tab stops with no -i flag, use the fast SIMD path
154    if let TabStops::Regular(tab_size) = tabs {
155        if !initial_only && memchr::memchr(b'\x08', data).is_none() {
156            return expand_regular_fast(data, *tab_size, out);
157        }
158    }
159
160    // Generic path for -i flag or tab lists
161    expand_generic(data, tabs, initial_only, out)
162}
163
164/// Fast expand for regular tab stops without -i flag.
165/// Uses batched internal buffer to reduce write syscalls.
166fn expand_regular_fast(data: &[u8], tab_size: usize, out: &mut impl Write) -> std::io::Result<()> {
167    const FLUSH_THRESHOLD: usize = 256 * 1024;
168    let mut buf = Vec::with_capacity(data.len().min(FLUSH_THRESHOLD * 2) + data.len() / 8);
169    let mut column: usize = 0;
170    let mut pos: usize = 0;
171
172    while pos < data.len() {
173        match memchr::memchr2(b'\t', b'\n', &data[pos..]) {
174            Some(offset) => {
175                // Bulk copy everything before the special byte
176                if offset > 0 {
177                    buf.extend_from_slice(&data[pos..pos + offset]);
178                    column += offset;
179                }
180                let byte = data[pos + offset];
181                pos += offset + 1;
182
183                if byte == b'\n' {
184                    buf.push(b'\n');
185                    column = 0;
186                } else {
187                    // Tab: push spaces to buffer
188                    let spaces = tab_size - (column % tab_size);
189                    push_spaces(&mut buf, spaces);
190                    column += spaces;
191                }
192
193                // Flush when buffer is large enough
194                if buf.len() >= FLUSH_THRESHOLD {
195                    out.write_all(&buf)?;
196                    buf.clear();
197                }
198            }
199            None => {
200                buf.extend_from_slice(&data[pos..]);
201                break;
202            }
203        }
204    }
205
206    if !buf.is_empty() {
207        out.write_all(&buf)?;
208    }
209    Ok(())
210}
211
212/// Generic expand with support for -i flag and tab lists.
213fn expand_generic(
214    data: &[u8],
215    tabs: &TabStops,
216    initial_only: bool,
217    out: &mut impl Write,
218) -> std::io::Result<()> {
219    let mut output = Vec::with_capacity(data.len() + data.len() / 8);
220    let mut column: usize = 0;
221    let mut in_initial = true;
222
223    for &byte in data {
224        match byte {
225            b'\t' => {
226                if initial_only && !in_initial {
227                    output.push(b'\t');
228                    column = tabs.next_tab_stop(column);
229                } else {
230                    let spaces = tabs.spaces_to_next(column);
231                    push_spaces(&mut output, spaces);
232                    column += spaces;
233                }
234            }
235            b'\n' => {
236                output.push(b'\n');
237                column = 0;
238                in_initial = true;
239            }
240            b'\x08' => {
241                output.push(b'\x08');
242                if column > 0 {
243                    column -= 1;
244                }
245            }
246            _ => {
247                if initial_only && in_initial && byte != b' ' {
248                    in_initial = false;
249                }
250                output.push(byte);
251                column += 1;
252            }
253        }
254    }
255
256    out.write_all(&output)
257}
258
259/// Unexpand spaces to tabs.
260/// If `all` is true, convert all sequences of spaces; otherwise only leading spaces.
261pub fn unexpand_bytes(
262    data: &[u8],
263    tabs: &TabStops,
264    all: bool,
265    out: &mut impl Write,
266) -> std::io::Result<()> {
267    if data.is_empty() {
268        return Ok(());
269    }
270
271    let mut output = Vec::with_capacity(data.len());
272    let mut column: usize = 0;
273    let mut space_start_col: Option<usize> = None;
274    let mut in_initial = true;
275
276    for &byte in data {
277        match byte {
278            b' ' => {
279                if !all && !in_initial {
280                    output.push(b' ');
281                    column += 1;
282                } else {
283                    if space_start_col.is_none() {
284                        space_start_col = Some(column);
285                    }
286                    column += 1;
287                    if tabs.is_tab_stop(column) {
288                        output.push(b'\t');
289                        space_start_col = None;
290                    }
291                }
292            }
293            b'\t' => {
294                space_start_col = None;
295                output.push(b'\t');
296                column = tabs.next_tab_stop(column);
297            }
298            b'\n' => {
299                if let Some(start_col) = space_start_col.take() {
300                    push_spaces(&mut output, column - start_col);
301                }
302                output.push(b'\n');
303                column = 0;
304                in_initial = true;
305            }
306            b'\x08' => {
307                if let Some(start_col) = space_start_col.take() {
308                    push_spaces(&mut output, column - start_col);
309                }
310                output.push(b'\x08');
311                if column > 0 {
312                    column -= 1;
313                }
314            }
315            _ => {
316                if let Some(start_col) = space_start_col.take() {
317                    push_spaces(&mut output, column - start_col);
318                }
319                if in_initial {
320                    in_initial = false;
321                }
322                output.push(byte);
323                column += 1;
324            }
325        }
326    }
327
328    if let Some(start_col) = space_start_col {
329        push_spaces(&mut output, column - start_col);
330    }
331
332    out.write_all(&output)
333}