Skip to main content

coreutils_rs/fmt/
core.rs

1use std::io::{self, Read, Write};
2
3/// Configuration for the fmt command.
4pub struct FmtConfig {
5    /// Maximum line width (default 75).
6    pub width: usize,
7    /// Goal width for line filling (default 93% of width).
8    pub goal: usize,
9    /// Only split long lines, do not refill short lines.
10    pub split_only: bool,
11    /// Crown margin mode: preserve the indentation of the first two lines.
12    pub crown_margin: bool,
13    /// Tagged paragraph mode: first line indentation differs from subsequent lines.
14    pub tagged: bool,
15    /// Uniform spacing: one space between words, two after sentence-ending punctuation.
16    pub uniform_spacing: bool,
17    /// Only reformat lines beginning with this prefix.
18    pub prefix: Option<String>,
19}
20
21impl Default for FmtConfig {
22    fn default() -> Self {
23        let width = 75;
24        Self {
25            width,
26            goal: (width * 187) / 200,
27            split_only: false,
28            crown_margin: false,
29            tagged: false,
30            uniform_spacing: false,
31            prefix: None,
32        }
33    }
34}
35
36/// Reformat text from `input` and write the result to `output`.
37///
38/// Text is processed paragraph by paragraph in a streaming fashion.
39/// Each paragraph is formatted and written immediately, avoiding holding
40/// the entire file in memory.
41pub fn fmt_file<R: Read, W: Write>(
42    mut input: R,
43    output: &mut W,
44    config: &FmtConfig,
45) -> io::Result<()> {
46    // Read entire input into a contiguous buffer to avoid per-line String allocation.
47    let mut data = Vec::new();
48    input.read_to_end(&mut data)?;
49    fmt_data(&data, output, config)
50}
51
52/// Format in-memory data. Works on byte slices to avoid String allocation.
53pub fn fmt_data(data: &[u8], output: &mut impl Write, config: &FmtConfig) -> io::Result<()> {
54    // Convert to str once (fmt processes text, so UTF-8 is expected)
55    let text = match std::str::from_utf8(data) {
56        Ok(s) => s,
57        Err(_) => {
58            // Fallback: lossy conversion
59            let owned = String::from_utf8_lossy(data);
60            return fmt_str_owned(&owned, output, config);
61        }
62    };
63    fmt_str(text, output, config)
64}
65
66/// Format a string slice, processing paragraph by paragraph with zero-copy word extraction.
67fn fmt_str(text: &str, output: &mut impl Write, config: &FmtConfig) -> io::Result<()> {
68    let prefix_str = config.prefix.as_deref();
69    let mut para_start = 0;
70    let bytes = text.as_bytes();
71
72    // Scan through the text finding paragraph boundaries
73    let mut i = 0;
74    let _para_lines_start = 0; // byte offset where current paragraph starts
75
76    while i < bytes.len() {
77        // Find end of current line
78        let line_end = memchr::memchr(b'\n', &bytes[i..])
79            .map(|p| i + p)
80            .unwrap_or(bytes.len());
81
82        let line = &text[i..line_end];
83
84        // Strip \r if present
85        let line = line.strip_suffix('\r').unwrap_or(line);
86
87        // Handle prefix filter
88        if let Some(pfx) = prefix_str {
89            if !line.starts_with(pfx) {
90                // Flush current paragraph
91                if para_start < i {
92                    format_paragraph_str(text, para_start, i, config, output)?;
93                }
94                para_start = if line_end < bytes.len() {
95                    line_end + 1
96                } else {
97                    bytes.len()
98                };
99                // Emit verbatim
100                output.write_all(line.as_bytes())?;
101                output.write_all(b"\n")?;
102                i = para_start;
103                continue;
104            }
105        }
106
107        if line.trim().is_empty() {
108            // Blank line = paragraph boundary
109            if para_start < i {
110                format_paragraph_str(text, para_start, i, config, output)?;
111            }
112            output.write_all(b"\n")?;
113            para_start = if line_end < bytes.len() {
114                line_end + 1
115            } else {
116                bytes.len()
117            };
118        }
119
120        i = if line_end < bytes.len() {
121            line_end + 1
122        } else {
123            bytes.len()
124        };
125    }
126
127    // Flush remaining paragraph
128    if para_start < bytes.len() {
129        let remaining = text[para_start..].trim_end_matches('\n');
130        if !remaining.is_empty() {
131            format_paragraph_str(text, para_start, bytes.len(), config, output)?;
132        }
133    }
134
135    Ok(())
136}
137
138/// Fallback for non-UTF8 data (owned String from lossy conversion)
139fn fmt_str_owned(text: &str, output: &mut impl Write, config: &FmtConfig) -> io::Result<()> {
140    fmt_str(text, output, config)
141}
142
143/// Format a paragraph from a region of the source text [start..end).
144/// Extracts lines and words directly from the source text — zero String allocation.
145fn format_paragraph_str(
146    text: &str,
147    start: usize,
148    end: usize,
149    config: &FmtConfig,
150    output: &mut impl Write,
151) -> io::Result<()> {
152    let region = &text[start..end];
153    // Collect lines (without trailing newlines)
154    let lines: Vec<&str> = region
155        .split('\n')
156        .map(|l| l.strip_suffix('\r').unwrap_or(l))
157        .filter(|l| !l.is_empty())
158        .collect();
159
160    if lines.is_empty() {
161        return Ok(());
162    }
163
164    let prefix_str = config.prefix.as_deref();
165
166    // Strip the prefix from lines for indentation analysis.
167    let stripped_first = match prefix_str {
168        Some(pfx) => lines[0].strip_prefix(pfx).unwrap_or(lines[0]),
169        None => lines[0],
170    };
171
172    let stripped_second: &str = if lines.len() > 1 {
173        match prefix_str {
174            Some(pfx) => lines[1].strip_prefix(pfx).unwrap_or(lines[1]),
175            None => lines[1],
176        }
177    } else {
178        stripped_first
179    };
180
181    let first_indent = leading_indent(stripped_first);
182    let rest_indent = leading_indent(stripped_second);
183
184    let (first_line_indent, cont_indent) = if config.tagged || config.crown_margin {
185        (first_indent, rest_indent)
186    } else {
187        (first_indent, first_indent)
188    };
189
190    if config.split_only {
191        for line in &lines {
192            split_long_line(line, config, prefix_str, output)?;
193        }
194        return Ok(());
195    }
196
197    // Collect words directly from source text — zero-copy &str references.
198    let total_chars: usize = lines.iter().map(|l| l.len()).sum();
199    let mut all_words: Vec<&str> = Vec::with_capacity(total_chars / 5 + 16);
200    for line in &lines {
201        let s = match prefix_str {
202            Some(pfx) => line.strip_prefix(pfx).unwrap_or(line),
203            None => line,
204        };
205        all_words.extend(s.split_whitespace());
206    }
207
208    if all_words.is_empty() {
209        output.write_all(b"\n")?;
210        return Ok(());
211    }
212
213    let pfx = prefix_str.unwrap_or("");
214    reflow_paragraph(
215        &all_words,
216        pfx,
217        first_line_indent,
218        cont_indent,
219        config,
220        output,
221    )
222}
223
224/// Determine the leading whitespace (indentation) of a line.
225fn leading_indent(line: &str) -> &str {
226    let trimmed = line.trim_start();
227    &line[..line.len() - trimmed.len()]
228}
229
230/// Check if a word ends a sentence (ends with '.', '!', or '?').
231fn is_sentence_end(word: &str) -> bool {
232    matches!(word.as_bytes().last(), Some(b'.' | b'!' | b'?'))
233}
234
235/// Reflow words into lines that fit within the configured width.
236///
237/// Uses optimal line breaking with a cost function matching GNU fmt.
238/// Writes directly to the output writer, avoiding intermediate String allocation.
239/// Eliminates pre-computed arrays: sep_widths, word_lens, break_cost, has_more_lines
240/// are all computed inline to reduce memory footprint and improve cache performance.
241fn reflow_paragraph<W: Write>(
242    words: &[&str],
243    prefix: &str,
244    first_indent: &str,
245    cont_indent: &str,
246    config: &FmtConfig,
247    output: &mut W,
248) -> io::Result<()> {
249    if words.is_empty() {
250        return Ok(());
251    }
252
253    let n = words.len();
254    let first_base = prefix.len() + first_indent.len();
255    let cont_base = prefix.len() + cont_indent.len();
256    let goal = config.goal as i64;
257    let width = config.width;
258    let uniform = config.uniform_spacing;
259
260    const LINE_COST: i64 = 70 * 70;
261    const NOBREAK_COST: i64 = 600 * 600;
262    const SENTENCE_BONUS: i64 = 50 * 50;
263    const SENT_FLAG: u32 = 1 << 16;
264
265    // Pack word length + sentence-end flag into compact u32 array.
266    // bits 0-15: word length, bit 16: sentence-end flag.
267    // This is 4 bytes/word vs 16 bytes/word for fat pointers — much better cache usage.
268    let winfo: Vec<u32> = words
269        .iter()
270        .map(|w| {
271            let len = w.len() as u32;
272            let sent = if matches!(w.as_bytes().last(), Some(b'.' | b'!' | b'?')) {
273                SENT_FLAG
274            } else {
275                0
276            };
277            len | sent
278        })
279        .collect();
280
281    // 3 DP arrays: cost, best break point, line length
282    let mut dp_cost = vec![i64::MAX; n + 1];
283    let mut best = vec![0u32; n];
284    let mut line_len = vec![0i32; n + 1];
285    dp_cost[n] = 0;
286
287    // SAFETY: All array indices are provably in-bounds:
288    // - i ∈ [0, n-1] for winfo[i], best[i], dp_cost[i], line_len[i]
289    // - j ∈ [i, n-1] for winfo[j], j-1 ∈ [0, n-2] for winfo[j-1]
290    // - j+1 ∈ [1, n] for dp_cost[j+1] (n+1 elements), best[j+1] (n elements, only when j<n-1)
291    // - line_len has n+1 elements, accessed at i and j+1
292    let winfo_ptr = winfo.as_ptr();
293    let dp_cost_ptr = dp_cost.as_mut_ptr();
294    let best_ptr = best.as_mut_ptr();
295    let line_len_ptr = line_len.as_mut_ptr();
296
297    for i in (0..n).rev() {
298        let base = if i == 0 { first_base } else { cont_base };
299        let mut len = base + unsafe { (*winfo_ptr.add(i) & 0xFFFF) as usize };
300        let mut best_total = i64::MAX;
301        let mut best_j = i as u32;
302        let mut best_len = len as i32;
303
304        for j in i..n {
305            if j > i {
306                let sep = if uniform && unsafe { *winfo_ptr.add(j - 1) & SENT_FLAG != 0 } {
307                    2
308                } else {
309                    1
310                };
311                len += sep + unsafe { (*winfo_ptr.add(j) & 0xFFFF) as usize };
312            }
313
314            if len > width {
315                if j == i {
316                    let lc = if j == n - 1 {
317                        0i64
318                    } else {
319                        let bc = if unsafe { *winfo_ptr.add(j) & SENT_FLAG != 0 } {
320                            if uniform {
321                                LINE_COST - SENTENCE_BONUS
322                            } else {
323                                LINE_COST + NOBREAK_COST
324                            }
325                        } else {
326                            LINE_COST
327                        };
328                        let short_n = goal - len as i64;
329                        let short_cost = short_n * 10 * short_n * 10;
330                        let ragged_cost = if unsafe { *best_ptr.add(j + 1) as usize + 1 < n } {
331                            let ragged_n = len as i64 - unsafe { *line_len_ptr.add(j + 1) } as i64;
332                            ragged_n * 10 * ragged_n * 10 / 2
333                        } else {
334                            0
335                        };
336                        bc + short_cost + ragged_cost
337                    };
338                    let cj1 = unsafe { *dp_cost_ptr.add(j + 1) };
339                    if cj1 != i64::MAX {
340                        let total = lc + cj1;
341                        if total < best_total {
342                            best_total = total;
343                            best_j = j as u32;
344                            best_len = len as i32;
345                        }
346                    }
347                }
348                break;
349            }
350
351            let lc = if j == n - 1 {
352                0i64
353            } else {
354                let bc = if unsafe { *winfo_ptr.add(j) & SENT_FLAG != 0 } {
355                    if uniform {
356                        LINE_COST - SENTENCE_BONUS
357                    } else {
358                        LINE_COST + NOBREAK_COST
359                    }
360                } else {
361                    LINE_COST
362                };
363                let short_n = goal - len as i64;
364                let short_cost = short_n * 10 * short_n * 10;
365                let ragged_cost = if unsafe { *best_ptr.add(j + 1) as usize + 1 < n } {
366                    let ragged_n = len as i64 - unsafe { *line_len_ptr.add(j + 1) } as i64;
367                    ragged_n * 10 * ragged_n * 10 / 2
368                } else {
369                    0
370                };
371                bc + short_cost + ragged_cost
372            };
373
374            let cj1 = unsafe { *dp_cost_ptr.add(j + 1) };
375            if cj1 != i64::MAX {
376                let total = lc + cj1;
377                if total < best_total {
378                    best_total = total;
379                    best_j = j as u32;
380                    best_len = len as i32;
381                }
382            }
383        }
384
385        if best_total < i64::MAX {
386            unsafe {
387                *dp_cost_ptr.add(i) = best_total;
388                *best_ptr.add(i) = best_j;
389                *line_len_ptr.add(i) = best_len;
390            }
391        }
392    }
393
394    // Reconstruct the lines from the DP solution, writing directly to output.
395    let mut i = 0;
396    let mut is_first_line = true;
397
398    while i < n {
399        let j = best[i] as usize;
400
401        output.write_all(prefix.as_bytes())?;
402        if is_first_line {
403            output.write_all(first_indent.as_bytes())?;
404        } else {
405            output.write_all(cont_indent.as_bytes())?;
406        }
407        output.write_all(words[i].as_bytes())?;
408
409        for k in (i + 1)..=j {
410            if uniform && is_sentence_end(words[k - 1]) {
411                output.write_all(b"  ")?;
412            } else {
413                output.write_all(b" ")?;
414            }
415            output.write_all(words[k].as_bytes())?;
416        }
417        output.write_all(b"\n")?;
418
419        is_first_line = false;
420        i = j + 1;
421    }
422
423    Ok(())
424}
425
426/// Split a single long line at the width boundary without reflowing.
427/// Used in split-only mode (-s).
428fn split_long_line<W: Write>(
429    line: &str,
430    config: &FmtConfig,
431    prefix: Option<&str>,
432    output: &mut W,
433) -> io::Result<()> {
434    let stripped = match prefix {
435        Some(pfx) => line.strip_prefix(pfx).unwrap_or(line),
436        None => line,
437    };
438    let indent = leading_indent(stripped);
439    let pfx = prefix.unwrap_or("");
440
441    if line.len() <= config.width {
442        output.write_all(line.as_bytes())?;
443        output.write_all(b"\n")?;
444        return Ok(());
445    }
446
447    let s = match prefix {
448        Some(pfx) => line.strip_prefix(pfx).unwrap_or(line),
449        None => line,
450    };
451
452    let pfx_indent_len = pfx.len() + indent.len();
453    let mut cur_len = pfx_indent_len;
454    let mut first_word_on_line = true;
455
456    // Write initial prefix+indent
457    output.write_all(pfx.as_bytes())?;
458    output.write_all(indent.as_bytes())?;
459
460    for (i, word) in s.split_whitespace().enumerate() {
461        let sep_len = if first_word_on_line {
462            0
463        } else if config.uniform_spacing && i > 0 {
464            // Check the previous word for sentence end - we can't easily do this
465            // without tracking, so use 1 for safety
466            1
467        } else {
468            1
469        };
470
471        if !first_word_on_line && cur_len + sep_len + word.len() > config.width {
472            output.write_all(b"\n")?;
473            output.write_all(pfx.as_bytes())?;
474            output.write_all(indent.as_bytes())?;
475            cur_len = pfx_indent_len;
476            first_word_on_line = true;
477        }
478
479        if !first_word_on_line {
480            output.write_all(b" ")?;
481            cur_len += 1;
482        }
483        output.write_all(word.as_bytes())?;
484        cur_len += word.len();
485        first_word_on_line = false;
486    }
487
488    if !first_word_on_line {
489        output.write_all(b"\n")?;
490    }
491
492    Ok(())
493}