coreutils_rs/fmt/
core.rs

1use std::io::{self, Read, Write};
2
3/// Configuration for the fmt command.
4pub struct FmtConfig {
5    /// Maximum line width (default 75).
6    pub width: usize,
7    /// Goal width for line filling (default 93% of width).
8    pub goal: usize,
9    /// Only split long lines, do not refill short lines.
10    pub split_only: bool,
11    /// Crown margin mode: preserve the indentation of the first two lines.
12    pub crown_margin: bool,
13    /// Tagged paragraph mode: first line indentation differs from subsequent lines.
14    pub tagged: bool,
15    /// Uniform spacing: one space between words, two after sentence-ending punctuation.
16    pub uniform_spacing: bool,
17    /// Only reformat lines beginning with this prefix.
18    pub prefix: Option<String>,
19}
20
21impl Default for FmtConfig {
22    fn default() -> Self {
23        let width = 75;
24        Self {
25            width,
26            goal: (width * 187) / 200,
27            split_only: false,
28            crown_margin: false,
29            tagged: false,
30            uniform_spacing: false,
31            prefix: None,
32        }
33    }
34}
35
36/// Reformat text from `input` and write the result to `output`.
37///
38/// Text is processed paragraph by paragraph in a streaming fashion.
39/// Each paragraph is formatted and written immediately, avoiding holding
40/// the entire file in memory.
41pub fn fmt_file<R: Read, W: Write>(
42    mut input: R,
43    output: &mut W,
44    config: &FmtConfig,
45) -> io::Result<()> {
46    // Read entire input into a contiguous buffer to avoid per-line String allocation.
47    let mut data = Vec::new();
48    input.read_to_end(&mut data)?;
49    fmt_data(&data, output, config)
50}
51
52/// Format in-memory data. Works on byte slices to avoid String allocation.
53pub fn fmt_data(data: &[u8], output: &mut impl Write, config: &FmtConfig) -> io::Result<()> {
54    // Convert to str once (fmt processes text, so UTF-8 is expected)
55    let text = match std::str::from_utf8(data) {
56        Ok(s) => s,
57        Err(_) => {
58            // Fallback: lossy conversion
59            let owned = String::from_utf8_lossy(data);
60            return fmt_str_owned(&owned, output, config);
61        }
62    };
63    fmt_str(text, output, config)
64}
65
66/// Format a string slice, processing paragraph by paragraph with zero-copy word extraction.
67fn fmt_str(text: &str, output: &mut impl Write, config: &FmtConfig) -> io::Result<()> {
68    let prefix_str = config.prefix.as_deref();
69    let mut para_start = 0;
70    let bytes = text.as_bytes();
71
72    // Scan through the text finding paragraph boundaries
73    let mut i = 0;
74    let _para_lines_start = 0; // byte offset where current paragraph starts
75
76    while i < bytes.len() {
77        // Find end of current line
78        let line_end = memchr::memchr(b'\n', &bytes[i..])
79            .map(|p| i + p)
80            .unwrap_or(bytes.len());
81
82        let line = &text[i..line_end];
83
84        // Strip \r if present
85        let line = line.strip_suffix('\r').unwrap_or(line);
86
87        // Handle prefix filter
88        if let Some(pfx) = prefix_str {
89            if !line.starts_with(pfx) {
90                // Flush current paragraph
91                if para_start < i {
92                    format_paragraph_str(text, para_start, i, config, output)?;
93                }
94                para_start = if line_end < bytes.len() {
95                    line_end + 1
96                } else {
97                    bytes.len()
98                };
99                // Emit verbatim
100                output.write_all(line.as_bytes())?;
101                output.write_all(b"\n")?;
102                i = para_start;
103                continue;
104            }
105        }
106
107        if line.trim().is_empty() {
108            // Blank line = paragraph boundary
109            if para_start < i {
110                format_paragraph_str(text, para_start, i, config, output)?;
111            }
112            output.write_all(b"\n")?;
113            para_start = if line_end < bytes.len() {
114                line_end + 1
115            } else {
116                bytes.len()
117            };
118        }
119
120        i = if line_end < bytes.len() {
121            line_end + 1
122        } else {
123            bytes.len()
124        };
125    }
126
127    // Flush remaining paragraph
128    if para_start < bytes.len() {
129        let remaining = text[para_start..].trim_end_matches('\n');
130        if !remaining.is_empty() {
131            format_paragraph_str(text, para_start, bytes.len(), config, output)?;
132        }
133    }
134
135    Ok(())
136}
137
138/// Fallback for non-UTF8 data (owned String from lossy conversion)
139fn fmt_str_owned(text: &str, output: &mut impl Write, config: &FmtConfig) -> io::Result<()> {
140    fmt_str(text, output, config)
141}
142
143/// Format a paragraph from a region of the source text [start..end).
144/// Extracts lines and words directly from the source text — zero String allocation.
145fn format_paragraph_str(
146    text: &str,
147    start: usize,
148    end: usize,
149    config: &FmtConfig,
150    output: &mut impl Write,
151) -> io::Result<()> {
152    let region = &text[start..end];
153    // Collect lines (without trailing newlines)
154    let lines: Vec<&str> = region
155        .split('\n')
156        .map(|l| l.strip_suffix('\r').unwrap_or(l))
157        .filter(|l| !l.is_empty())
158        .collect();
159
160    if lines.is_empty() {
161        return Ok(());
162    }
163
164    let prefix_str = config.prefix.as_deref();
165
166    // Strip the prefix from lines for indentation analysis.
167    let stripped_first = match prefix_str {
168        Some(pfx) => lines[0].strip_prefix(pfx).unwrap_or(lines[0]),
169        None => lines[0],
170    };
171
172    let stripped_second: &str = if lines.len() > 1 {
173        match prefix_str {
174            Some(pfx) => lines[1].strip_prefix(pfx).unwrap_or(lines[1]),
175            None => lines[1],
176        }
177    } else {
178        stripped_first
179    };
180
181    let first_indent = leading_indent(stripped_first);
182    let rest_indent = leading_indent(stripped_second);
183
184    let (first_line_indent, cont_indent) = if config.tagged || config.crown_margin {
185        (first_indent, rest_indent)
186    } else {
187        (first_indent, first_indent)
188    };
189
190    if config.split_only {
191        for line in &lines {
192            split_long_line(line, config, prefix_str, output)?;
193        }
194        return Ok(());
195    }
196
197    // Collect words directly from source text — zero-copy &str references.
198    // Also track which words are sentence endings based on original spacing:
199    // A word ending in .?! is a sentence end if followed by 2+ spaces or end of line.
200    let total_chars: usize = lines.iter().map(|l| l.len()).sum();
201    let mut all_words: Vec<&str> = Vec::with_capacity(total_chars / 5 + 16);
202    let mut sentence_ends: Vec<bool> = Vec::with_capacity(total_chars / 5 + 16);
203
204    for line in &lines {
205        let s = match prefix_str {
206            Some(pfx) => line.strip_prefix(pfx).unwrap_or(line),
207            None => line,
208        };
209        collect_words_with_sentence_info(s, &mut all_words, &mut sentence_ends);
210    }
211
212    if all_words.is_empty() {
213        output.write_all(b"\n")?;
214        return Ok(());
215    }
216
217    let pfx = prefix_str.unwrap_or("");
218    reflow_paragraph(
219        &all_words,
220        &sentence_ends,
221        pfx,
222        first_line_indent,
223        cont_indent,
224        config,
225        output,
226    )
227}
228
229/// Determine the leading whitespace (indentation) of a line.
230fn leading_indent(line: &str) -> &str {
231    let trimmed = line.trim_start();
232    &line[..line.len() - trimmed.len()]
233}
234
235/// Check if a word has sentence-ending punctuation (ends with '.', '!', or '?',
236/// possibly followed by closing quotes/brackets).
237fn has_sentence_ending_punct(word: &str) -> bool {
238    let bytes = word.as_bytes();
239    // Walk backwards past closing punctuation
240    let mut i = bytes.len();
241    while i > 0 && matches!(bytes[i - 1], b'"' | b'\'' | b')' | b']') {
242        i -= 1;
243    }
244    i > 0 && matches!(bytes[i - 1], b'.' | b'!' | b'?')
245}
246
247/// Check if a word ends a sentence, considering original input context.
248/// GNU fmt rules: a word ending in .?! is a sentence end if:
249/// 1. It was followed by 2+ spaces in the original input, OR
250/// 2. It was at the end of a line
251/// Additionally, single uppercase letters (like "E.") are abbreviations, not sentence ends.
252fn is_sentence_end_contextual(word: &str, followed_by_double_space_or_eol: bool) -> bool {
253    if !has_sentence_ending_punct(word) {
254        return false;
255    }
256    if !followed_by_double_space_or_eol {
257        return false;
258    }
259    // Strip trailing punctuation to find the "core" word
260    let bytes = word.as_bytes();
261    let mut end = bytes.len();
262    while end > 0
263        && matches!(
264            bytes[end - 1],
265            b'.' | b'!' | b'?' | b'"' | b'\'' | b')' | b']'
266        )
267    {
268        end -= 1;
269    }
270    // A single uppercase letter followed by '.' is an abbreviation, not a sentence end
271    if end == 1 && bytes[0].is_ascii_uppercase() {
272        return false;
273    }
274    // Empty core (e.g., just "." or "...") is not a sentence end
275    end > 0
276}
277
278/// Word flags for GNU fmt cost model.
279/// Packed into the upper bits of the winfo u32 array for cache efficiency.
280const SENT_FLAG: u32 = 1 << 16; // sentence-final (period + double-space/eol context)
281const PERIOD_FLAG: u32 = 1 << 17; // has sentence-ending punct (.!?) regardless of context
282const PUNCT_FLAG: u32 = 1 << 18; // ends with non-period punctuation (,;:)
283const PAREN_FLAG: u32 = 1 << 19; // starts with opening paren/bracket
284
285/// Check if a word ends with non-period punctuation (,;:).
286fn has_non_period_punct(word: &str) -> bool {
287    let bytes = word.as_bytes();
288    let mut i = bytes.len();
289    // Skip trailing close quotes/parens
290    while i > 0 && matches!(bytes[i - 1], b'"' | b'\'' | b')' | b']') {
291        i -= 1;
292    }
293    i > 0 && matches!(bytes[i - 1], b',' | b';' | b':')
294}
295
296/// Collect words from a line, tracking sentence endings and word properties
297/// for the GNU fmt cost model.
298///
299/// Word properties tracked:
300/// - `final` (sentence-ending): .!? followed by 2+ spaces or at end of line
301/// - `period`: has .!? regardless of spacing context
302/// - `punct`: ends with ,;:
303/// - `paren`: starts with ([{
304fn collect_words_with_sentence_info<'a>(
305    line: &'a str,
306    words: &mut Vec<&'a str>,
307    sentence_ends: &mut Vec<bool>,
308) {
309    let bytes = line.as_bytes();
310    let len = bytes.len();
311    let mut i = 0;
312
313    // Skip leading whitespace
314    while i < len && bytes[i].is_ascii_whitespace() {
315        i += 1;
316    }
317
318    while i < len {
319        // Find end of word
320        let word_start = i;
321        while i < len && !bytes[i].is_ascii_whitespace() {
322            i += 1;
323        }
324        let word = &line[word_start..i];
325
326        // Count spaces after this word
327        let space_start = i;
328        while i < len && bytes[i].is_ascii_whitespace() {
329            i += 1;
330        }
331        let space_count = i - space_start;
332
333        // Sentence end: at end of line (i >= len) or followed by 2+ spaces
334        let at_eol = i >= len;
335        let double_space = space_count >= 2;
336
337        let is_sent_end = is_sentence_end_contextual(word, at_eol || double_space);
338
339        words.push(word);
340        sentence_ends.push(is_sent_end);
341    }
342}
343
344/// Reflow words into lines that fit within the configured width.
345///
346/// Uses optimal line breaking with a cost function matching GNU fmt.
347/// Writes directly to the output writer, avoiding intermediate String allocation.
348/// Eliminates pre-computed arrays: sep_widths, word_lens, break_cost, has_more_lines
349/// are all computed inline to reduce memory footprint and improve cache performance.
350fn reflow_paragraph<W: Write>(
351    words: &[&str],
352    sentence_ends: &[bool],
353    prefix: &str,
354    first_indent: &str,
355    cont_indent: &str,
356    config: &FmtConfig,
357    output: &mut W,
358) -> io::Result<()> {
359    if words.is_empty() {
360        return Ok(());
361    }
362
363    let n = words.len();
364    let first_base = prefix.len() + first_indent.len();
365    let cont_base = prefix.len() + cont_indent.len();
366    let goal = config.goal as i64;
367    let width = config.width;
368    debug_assert_eq!(sentence_ends.len(), words.len());
369
370    // GNU fmt cost model (from coreutils fmt.c):
371    // EQUIV(n)       = n²
372    // SHORT_COST(n)  = EQUIV(n*10) = 100n²
373    // RAGGED_COST(n) = SHORT_COST(n)/2 = 50n²
374    // LINE_COST      = EQUIV(70) = 4900
375    // SENTENCE_BONUS = EQUIV(50) = 2500
376    // NOBREAK_COST   = EQUIV(600) = 360000
377    // PUNCT_BONUS    = EQUIV(40) = 1600
378    // PAREN_BONUS    = EQUIV(40) = 1600
379    // WIDOW_COST(n)  = EQUIV(200)/(n+2) = 40000/(n+2)
380    // ORPHAN_COST(n) = EQUIV(150)/(n+2) = 22500/(n+2)
381    const SHORT_FACTOR: i64 = 100;
382    const RAGGED_FACTOR: i64 = 50;
383    const LINE_COST: i64 = 70 * 70;
384    const SENTENCE_BONUS: i64 = 50 * 50;
385    const NOBREAK_COST: i64 = 600 * 600;
386    const PUNCT_BONUS: i64 = 40 * 40;
387    const PAREN_BONUS: i64 = 40 * 40;
388
389    // Pack word length + flags into compact u32 array.
390    // bits 0-15: word length, bits 16-19: flags.
391    let winfo: Vec<u32> = words
392        .iter()
393        .enumerate()
394        .map(|(i, w)| {
395            debug_assert!(w.len() <= 0xFFFF, "word too long for winfo packing");
396            let len = w.len() as u32;
397            let mut flags = 0u32;
398            if sentence_ends.get(i).copied().unwrap_or(false) {
399                flags |= SENT_FLAG; // sentence-final (period + context)
400            }
401            if has_sentence_ending_punct(w) {
402                flags |= PERIOD_FLAG; // has .!? regardless of context
403            }
404            if has_non_period_punct(w) {
405                flags |= PUNCT_FLAG; // ends with ,;:
406            }
407            let bytes = w.as_bytes();
408            if !bytes.is_empty() && matches!(bytes[0], b'(' | b'[' | b'{') {
409                flags |= PAREN_FLAG; // starts with opening paren
410            }
411            len | flags
412        })
413        .collect();
414
415    // 3 DP arrays: cost, best break point, line length
416    let mut dp_cost = vec![i64::MAX; n + 1];
417    let mut best = vec![0u32; n];
418    let mut line_len = vec![0i32; n + 1];
419    dp_cost[n] = 0;
420
421    // SAFETY: All array indices are provably in-bounds:
422    // - i ∈ [0, n-1] for winfo[i], best[i], dp_cost[i], line_len[i]
423    // - j ∈ [i, n-1] for winfo[j], j-1 ∈ [0, n-2] for winfo[j-1]
424    // - j+1 ∈ [1, n] for dp_cost[j+1] (n+1 elements), best[j+1] (n elements, only when j<n-1)
425    // - line_len has n+1 elements, accessed at i and j+1
426    let winfo_ptr = winfo.as_ptr();
427    let dp_cost_ptr = dp_cost.as_mut_ptr();
428    let best_ptr = best.as_mut_ptr();
429    let line_len_ptr = line_len.as_mut_ptr();
430
431    for i in (0..n).rev() {
432        let base = if i == 0 { first_base } else { cont_base };
433        let mut len = base + unsafe { (*winfo_ptr.add(i) & 0xFFFF) as usize };
434        let mut best_total = i64::MAX;
435        let mut best_j = i as u32;
436        let mut best_len = len as i32;
437
438        for j in i..n {
439            if j > i {
440                // GNU fmt uses 2 spaces after sentence-ending punctuation
441                let sep = if unsafe { *winfo_ptr.add(j - 1) & SENT_FLAG != 0 } {
442                    2
443                } else {
444                    1
445                };
446                len += sep + unsafe { (*winfo_ptr.add(j) & 0xFFFF) as usize };
447            }
448
449            // Compute line cost for placing words i..=j on one line.
450            macro_rules! try_candidate {
451                () => {
452                    let lc = if j == n - 1 {
453                        0i64
454                    } else {
455                        // line_cost: SHORT_COST + RAGGED_COST
456                        let short_n = goal - len as i64;
457                        let short_cost = short_n * short_n * SHORT_FACTOR;
458                        let ragged_cost = if unsafe { *best_ptr.add(j + 1) as usize + 1 < n } {
459                            let ragged_n = len as i64 - unsafe { *line_len_ptr.add(j + 1) } as i64;
460                            ragged_n * ragged_n * RAGGED_FACTOR
461                        } else {
462                            0
463                        };
464                        short_cost + ragged_cost
465                    };
466
467                    // base_cost for the NEXT line starting at j+1 (GNU base_cost model).
468                    // Applies to all lines except last (which has lc=0 already).
469                    let bc = if j == n - 1 {
470                        0i64
471                    } else {
472                        let wj = unsafe { *winfo_ptr.add(j) };
473                        let wj1 = unsafe { *winfo_ptr.add(j + 1) };
474                        let mut cost = LINE_COST;
475
476                        // Context from word ending the current line (word j)
477                        if wj & PERIOD_FLAG != 0 {
478                            if wj & SENT_FLAG != 0 {
479                                cost -= SENTENCE_BONUS;
480                            } else {
481                                cost += NOBREAK_COST;
482                            }
483                        } else if wj & PUNCT_FLAG != 0 {
484                            cost -= PUNCT_BONUS;
485                        } else if j > 0 {
486                            // WIDOW_COST: short word after a sentence end
487                            let wjm1 = unsafe { *winfo_ptr.add(j - 1) };
488                            if wjm1 & SENT_FLAG != 0 {
489                                let word_len = (wj & 0xFFFF) as i64;
490                                cost += 40000 / (word_len + 2);
491                            }
492                        }
493
494                        // Context from word starting the next line (word j+1)
495                        if wj1 & PAREN_FLAG != 0 {
496                            cost -= PAREN_BONUS;
497                        } else if wj1 & SENT_FLAG != 0 {
498                            let word_len = (wj1 & 0xFFFF) as i64;
499                            cost += 22500 / (word_len + 2);
500                        }
501
502                        cost
503                    };
504
505                    let cj1 = unsafe { *dp_cost_ptr.add(j + 1) };
506                    if cj1 != i64::MAX {
507                        let total = lc + bc + cj1;
508                        if total < best_total {
509                            best_total = total;
510                            best_j = j as u32;
511                            best_len = len as i32;
512                        }
513                    }
514                };
515            }
516
517            if len > width {
518                if j == i {
519                    try_candidate!();
520                }
521                break;
522            }
523
524            try_candidate!();
525        }
526
527        if best_total < i64::MAX {
528            unsafe {
529                *dp_cost_ptr.add(i) = best_total;
530                *best_ptr.add(i) = best_j;
531                *line_len_ptr.add(i) = best_len;
532            }
533        }
534    }
535
536    // Reconstruct the lines from the DP solution, writing directly to output.
537    let mut i = 0;
538    let mut is_first_line = true;
539
540    while i < n {
541        let j = best[i] as usize;
542
543        output.write_all(prefix.as_bytes())?;
544        if is_first_line {
545            output.write_all(first_indent.as_bytes())?;
546        } else {
547            output.write_all(cont_indent.as_bytes())?;
548        }
549        output.write_all(words[i].as_bytes())?;
550
551        for k in (i + 1)..=j {
552            // GNU fmt uses 2 spaces after sentence-ending punctuation
553            // (only when the original input had 2+ spaces or end-of-line after it)
554            if sentence_ends[k - 1] {
555                output.write_all(b"  ")?;
556            } else {
557                output.write_all(b" ")?;
558            }
559            output.write_all(words[k].as_bytes())?;
560        }
561        output.write_all(b"\n")?;
562
563        is_first_line = false;
564        i = j + 1;
565    }
566
567    Ok(())
568}
569
570/// Split a single long line at the width boundary without reflowing.
571/// Used in split-only mode (-s).
572fn split_long_line<W: Write>(
573    line: &str,
574    config: &FmtConfig,
575    prefix: Option<&str>,
576    output: &mut W,
577) -> io::Result<()> {
578    let stripped = match prefix {
579        Some(pfx) => line.strip_prefix(pfx).unwrap_or(line),
580        None => line,
581    };
582    let indent = leading_indent(stripped);
583    let pfx = prefix.unwrap_or("");
584
585    if line.len() <= config.width {
586        output.write_all(line.as_bytes())?;
587        output.write_all(b"\n")?;
588        return Ok(());
589    }
590
591    let s = match prefix {
592        Some(pfx) => line.strip_prefix(pfx).unwrap_or(line),
593        None => line,
594    };
595
596    let pfx_indent_len = pfx.len() + indent.len();
597    let mut cur_len = pfx_indent_len;
598    let mut first_word_on_line = true;
599
600    // Write initial prefix+indent
601    output.write_all(pfx.as_bytes())?;
602    output.write_all(indent.as_bytes())?;
603
604    // GNU fmt -s splits long lines at the max width (not goal width).
605    // Break before a word would exceed the maximum line width.
606    for word in s.split_whitespace() {
607        if !first_word_on_line {
608            let new_len = cur_len + 1 + word.len();
609            if new_len > config.width {
610                output.write_all(b"\n")?;
611                output.write_all(pfx.as_bytes())?;
612                output.write_all(indent.as_bytes())?;
613                cur_len = pfx_indent_len;
614                first_word_on_line = true;
615            }
616        }
617
618        if !first_word_on_line {
619            output.write_all(b" ")?;
620            cur_len += 1;
621        }
622        output.write_all(word.as_bytes())?;
623        cur_len += word.len();
624        first_word_on_line = false;
625    }
626
627    if !first_word_on_line {
628        output.write_all(b"\n")?;
629    }
630
631    Ok(())
632}
coreutils_rs/fmt/core.rs

coreutils_rs/fmt/
core.rs