coreutils_rs/fmt/
core.rs

1use std::io::{self, Read, Write};
2
3/// Configuration for the fmt command.
4pub struct FmtConfig {
5    /// Maximum line width (default 75).
6    pub width: usize,
7    /// Goal width for line filling (default 93% of width).
8    pub goal: usize,
9    /// Only split long lines, do not refill short lines.
10    pub split_only: bool,
11    /// Crown margin mode: preserve the indentation of the first two lines.
12    pub crown_margin: bool,
13    /// Tagged paragraph mode: first line indentation differs from subsequent lines.
14    pub tagged: bool,
15    /// Uniform spacing: one space between words, two after sentence-ending punctuation.
16    pub uniform_spacing: bool,
17    /// Only reformat lines beginning with this prefix.
18    pub prefix: Option<String>,
19}
20
21impl Default for FmtConfig {
22    fn default() -> Self {
23        let width = 75;
24        Self {
25            width,
26            goal: (width * 187) / 200,
27            split_only: false,
28            crown_margin: false,
29            tagged: false,
30            uniform_spacing: false,
31            prefix: None,
32        }
33    }
34}
35
36/// Reformat text from `input` and write the result to `output`.
37///
38/// Text is processed paragraph by paragraph in a streaming fashion.
39/// Each paragraph is formatted and written immediately, avoiding holding
40/// the entire file in memory.
41pub fn fmt_file<R: Read, W: Write>(
42    mut input: R,
43    output: &mut W,
44    config: &FmtConfig,
45) -> io::Result<()> {
46    // Read entire input into a contiguous buffer to avoid per-line String allocation.
47    let mut data = Vec::new();
48    input.read_to_end(&mut data)?;
49    fmt_data(&data, output, config)
50}
51
52/// Format in-memory data. Works on byte slices to avoid String allocation.
53pub fn fmt_data(data: &[u8], output: &mut impl Write, config: &FmtConfig) -> io::Result<()> {
54    // Convert to str once (fmt processes text, so UTF-8 is expected)
55    let text = match std::str::from_utf8(data) {
56        Ok(s) => s,
57        Err(_) => {
58            // Fallback: lossy conversion
59            let owned = String::from_utf8_lossy(data);
60            return fmt_str_owned(&owned, output, config);
61        }
62    };
63    fmt_str(text, output, config)
64}
65
66/// Format a string slice, processing paragraph by paragraph with zero-copy word extraction.
67fn fmt_str(text: &str, output: &mut impl Write, config: &FmtConfig) -> io::Result<()> {
68    let prefix_str = config.prefix.as_deref();
69    let mut para_start = 0;
70    let bytes = text.as_bytes();
71
72    // Scan through the text finding paragraph boundaries
73    let mut i = 0;
74    let _para_lines_start = 0; // byte offset where current paragraph starts
75
76    while i < bytes.len() {
77        // Find end of current line
78        let line_end = memchr::memchr(b'\n', &bytes[i..])
79            .map(|p| i + p)
80            .unwrap_or(bytes.len());
81
82        let line = &text[i..line_end];
83
84        // Strip \r if present
85        let line = line.strip_suffix('\r').unwrap_or(line);
86
87        // Handle prefix filter
88        if let Some(pfx) = prefix_str {
89            if !line.starts_with(pfx) {
90                // Flush current paragraph
91                if para_start < i {
92                    format_paragraph_str(text, para_start, i, config, output)?;
93                }
94                para_start = if line_end < bytes.len() {
95                    line_end + 1
96                } else {
97                    bytes.len()
98                };
99                // Emit verbatim
100                output.write_all(line.as_bytes())?;
101                output.write_all(b"\n")?;
102                i = para_start;
103                continue;
104            }
105        }
106
107        if line.trim().is_empty() {
108            // Blank line = paragraph boundary
109            if para_start < i {
110                format_paragraph_str(text, para_start, i, config, output)?;
111            }
112            output.write_all(b"\n")?;
113            para_start = if line_end < bytes.len() {
114                line_end + 1
115            } else {
116                bytes.len()
117            };
118        }
119
120        i = if line_end < bytes.len() {
121            line_end + 1
122        } else {
123            bytes.len()
124        };
125    }
126
127    // Flush remaining paragraph
128    if para_start < bytes.len() {
129        let remaining = text[para_start..].trim_end_matches('\n');
130        if !remaining.is_empty() {
131            format_paragraph_str(text, para_start, bytes.len(), config, output)?;
132        }
133    }
134
135    Ok(())
136}
137
138/// Fallback for non-UTF8 data (owned String from lossy conversion)
139fn fmt_str_owned(text: &str, output: &mut impl Write, config: &FmtConfig) -> io::Result<()> {
140    fmt_str(text, output, config)
141}
142
143/// Format a paragraph from a region of the source text [start..end).
144/// Extracts lines and words directly from the source text — zero String allocation.
145fn format_paragraph_str(
146    text: &str,
147    start: usize,
148    end: usize,
149    config: &FmtConfig,
150    output: &mut impl Write,
151) -> io::Result<()> {
152    let region = &text[start..end];
153    // Collect lines (without trailing newlines)
154    let lines: Vec<&str> = region
155        .split('\n')
156        .map(|l| l.strip_suffix('\r').unwrap_or(l))
157        .filter(|l| !l.is_empty())
158        .collect();
159
160    if lines.is_empty() {
161        return Ok(());
162    }
163
164    let prefix_str = config.prefix.as_deref();
165
166    // Strip the prefix from lines for indentation analysis.
167    let stripped_first = match prefix_str {
168        Some(pfx) => lines[0].strip_prefix(pfx).unwrap_or(lines[0]),
169        None => lines[0],
170    };
171
172    let stripped_second: &str = if lines.len() > 1 {
173        match prefix_str {
174            Some(pfx) => lines[1].strip_prefix(pfx).unwrap_or(lines[1]),
175            None => lines[1],
176        }
177    } else {
178        stripped_first
179    };
180
181    let first_indent = leading_indent(stripped_first);
182    let rest_indent = leading_indent(stripped_second);
183
184    let (first_line_indent, cont_indent) = if config.tagged || config.crown_margin {
185        (first_indent, rest_indent)
186    } else {
187        (first_indent, first_indent)
188    };
189
190    if config.split_only {
191        for line in &lines {
192            split_line_optimal(line, config, prefix_str, output)?;
193        }
194        return Ok(());
195    }
196
197    // Collect words directly from source text — zero-copy &str references.
198    // Also track which words are sentence endings based on original spacing:
199    // A word ending in .?! is a sentence end if followed by 2+ spaces or end of line.
200    let total_chars: usize = lines.iter().map(|l| l.len()).sum();
201    let mut all_words: Vec<&str> = Vec::with_capacity(total_chars / 5 + 16);
202    let mut sentence_ends: Vec<bool> = Vec::with_capacity(total_chars / 5 + 16);
203
204    for line in &lines {
205        let s = match prefix_str {
206            Some(pfx) => line.strip_prefix(pfx).unwrap_or(line),
207            None => line,
208        };
209        collect_words_with_sentence_info(s, &mut all_words, &mut sentence_ends);
210    }
211
212    if all_words.is_empty() {
213        output.write_all(b"\n")?;
214        return Ok(());
215    }
216
217    let pfx = prefix_str.unwrap_or("");
218    reflow_paragraph(
219        &all_words,
220        &sentence_ends,
221        pfx,
222        first_line_indent,
223        cont_indent,
224        config,
225        output,
226    )
227}
228
229/// Determine the leading whitespace (indentation) of a line.
230fn leading_indent(line: &str) -> &str {
231    let trimmed = line.trim_start();
232    &line[..line.len() - trimmed.len()]
233}
234
235/// Check if a word has sentence-ending punctuation (ends with '.', '!', or '?',
236/// possibly followed by closing quotes/brackets).
237fn has_sentence_ending_punct(word: &str) -> bool {
238    let bytes = word.as_bytes();
239    // Walk backwards past closing punctuation
240    let mut i = bytes.len();
241    while i > 0 && matches!(bytes[i - 1], b'"' | b'\'' | b')' | b']') {
242        i -= 1;
243    }
244    i > 0 && matches!(bytes[i - 1], b'.' | b'!' | b'?')
245}
246
247/// Check if a word ends a sentence, considering original input context.
248/// GNU fmt rules: a word ending in .?! is a sentence end if:
249/// 1. It was followed by 2+ spaces in the original input, OR
250/// 2. It was at the end of a line
251/// Additionally, single uppercase letters (like "E.") are abbreviations, not sentence ends.
252fn is_sentence_end_contextual(word: &str, followed_by_double_space_or_eol: bool) -> bool {
253    if !has_sentence_ending_punct(word) {
254        return false;
255    }
256    if !followed_by_double_space_or_eol {
257        return false;
258    }
259    // Strip trailing punctuation to find the "core" word
260    let bytes = word.as_bytes();
261    let mut end = bytes.len();
262    while end > 0
263        && matches!(
264            bytes[end - 1],
265            b'.' | b'!' | b'?' | b'"' | b'\'' | b')' | b']'
266        )
267    {
268        end -= 1;
269    }
270    // A single uppercase letter followed by '.' is an abbreviation, not a sentence end
271    if end == 1 && bytes[0].is_ascii_uppercase() {
272        return false;
273    }
274    // Empty core (e.g., just "." or "...") is not a sentence end
275    end > 0
276}
277
278/// Word flags for GNU fmt cost model.
279/// Packed into the upper bits of the winfo u32 array for cache efficiency.
280const SENT_FLAG: u32 = 1 << 16; // sentence-final (period + double-space/eol context)
281const PERIOD_FLAG: u32 = 1 << 17; // has sentence-ending punct (.!?) regardless of context
282const PUNCT_FLAG: u32 = 1 << 18; // ends with non-period punctuation (,;:)
283const PAREN_FLAG: u32 = 1 << 19; // starts with opening paren/bracket
284
285/// Check if a word ends with non-period punctuation (,;:).
286fn has_non_period_punct(word: &str) -> bool {
287    let bytes = word.as_bytes();
288    let mut i = bytes.len();
289    // Skip trailing close quotes/parens
290    while i > 0 && matches!(bytes[i - 1], b'"' | b'\'' | b')' | b']') {
291        i -= 1;
292    }
293    i > 0 && matches!(bytes[i - 1], b',' | b';' | b':')
294}
295
296/// Collect words from a line, tracking sentence endings and word properties
297/// for the GNU fmt cost model.
298///
299/// Word properties tracked:
300/// - `final` (sentence-ending): .!? followed by 2+ spaces or at end of line
301/// - `period`: has .!? regardless of spacing context
302/// - `punct`: ends with ,;:
303/// - `paren`: starts with ([{
304fn collect_words_with_sentence_info<'a>(
305    line: &'a str,
306    words: &mut Vec<&'a str>,
307    sentence_ends: &mut Vec<bool>,
308) {
309    let bytes = line.as_bytes();
310    let len = bytes.len();
311    let mut i = 0;
312
313    // Skip leading whitespace
314    while i < len && bytes[i].is_ascii_whitespace() {
315        i += 1;
316    }
317
318    while i < len {
319        // Find end of word
320        let word_start = i;
321        while i < len && !bytes[i].is_ascii_whitespace() {
322            i += 1;
323        }
324        let word = &line[word_start..i];
325
326        // Count spaces after this word
327        let space_start = i;
328        while i < len && bytes[i].is_ascii_whitespace() {
329            i += 1;
330        }
331        let space_count = i - space_start;
332
333        // Sentence end: at end of line (i >= len) or followed by 2+ spaces
334        let at_eol = i >= len;
335        let double_space = space_count >= 2;
336
337        let is_sent_end = is_sentence_end_contextual(word, at_eol || double_space);
338
339        words.push(word);
340        sentence_ends.push(is_sent_end);
341    }
342}
343
344/// Reflow words into lines that fit within the configured width.
345///
346/// Uses optimal line breaking with a cost function matching GNU fmt.
347/// Writes directly to the output writer, avoiding intermediate String allocation.
348/// Eliminates pre-computed arrays: sep_widths, word_lens, break_cost, has_more_lines
349/// are all computed inline to reduce memory footprint and improve cache performance.
350fn reflow_paragraph<W: Write>(
351    words: &[&str],
352    sentence_ends: &[bool],
353    prefix: &str,
354    first_indent: &str,
355    cont_indent: &str,
356    config: &FmtConfig,
357    output: &mut W,
358) -> io::Result<()> {
359    if words.is_empty() {
360        return Ok(());
361    }
362
363    let n = words.len();
364    let first_base = prefix.len() + first_indent.len();
365    let cont_base = prefix.len() + cont_indent.len();
366    let goal = config.goal as i64;
367    let width = config.width;
368    debug_assert_eq!(sentence_ends.len(), words.len());
369
370    // GNU fmt cost model (from coreutils fmt.c):
371    // EQUIV(n)       = n²
372    // SHORT_COST(n)  = EQUIV(n*10) = 100n²
373    // RAGGED_COST(n) = SHORT_COST(n)/2 = 50n²
374    // LINE_COST      = EQUIV(70) = 4900
375    // SENTENCE_BONUS = EQUIV(50) = 2500
376    // NOBREAK_COST   = EQUIV(600) = 360000
377    // PUNCT_BONUS    = EQUIV(40) = 1600
378    // PAREN_BONUS    = EQUIV(40) = 1600
379    // WIDOW_COST(n)  = EQUIV(200)/(n+2) = 40000/(n+2)
380    // ORPHAN_COST(n) = EQUIV(150)/(n+2) = 22500/(n+2)
381    const SHORT_FACTOR: i64 = 100;
382    const RAGGED_FACTOR: i64 = 50;
383    const LINE_COST: i64 = 70 * 70;
384    const SENTENCE_BONUS: i64 = 50 * 50;
385    const NOBREAK_COST: i64 = 600 * 600;
386    const PUNCT_BONUS: i64 = 40 * 40;
387    const PAREN_BONUS: i64 = 40 * 40;
388
389    // Pack word length + flags into compact u32 array.
390    // bits 0-15: word length, bits 16-19: flags.
391    let winfo: Vec<u32> = words
392        .iter()
393        .enumerate()
394        .map(|(i, w)| {
395            debug_assert!(w.len() <= 0xFFFF, "word too long for winfo packing");
396            let len = w.len() as u32;
397            let mut flags = 0u32;
398            if sentence_ends.get(i).copied().unwrap_or(false) {
399                flags |= SENT_FLAG; // sentence-final (period + context)
400            }
401            if has_sentence_ending_punct(w) {
402                flags |= PERIOD_FLAG; // has .!? regardless of context
403            }
404            if has_non_period_punct(w) {
405                flags |= PUNCT_FLAG; // ends with ,;:
406            }
407            let bytes = w.as_bytes();
408            if !bytes.is_empty() && matches!(bytes[0], b'(' | b'[' | b'{') {
409                flags |= PAREN_FLAG; // starts with opening paren
410            }
411            // GNU fmt always marks the last word of a paragraph as sentence-final
412            // (period=true, final=true). This affects the cost model by adding
413            // ORPHAN_COST, discouraging short final lines (orphans).
414            if i == n - 1 {
415                flags |= SENT_FLAG | PERIOD_FLAG;
416            }
417            len | flags
418        })
419        .collect();
420
421    // 3 DP arrays: cost, best break point, line length
422    let mut dp_cost = vec![i64::MAX; n + 1];
423    let mut best = vec![0u32; n];
424    let mut line_len = vec![0i32; n + 1];
425    dp_cost[n] = 0;
426
427    // SAFETY: All array indices are provably in-bounds:
428    // - i ∈ [0, n-1] for winfo[i], best[i], dp_cost[i], line_len[i]
429    // - j ∈ [i, n-1] for winfo[j], j-1 ∈ [0, n-2] for winfo[j-1]
430    // - j+1 ∈ [1, n] for dp_cost[j+1] (n+1 elements), best[j+1] (n elements, only when j<n-1)
431    // - line_len has n+1 elements, accessed at i and j+1
432    let winfo_ptr = winfo.as_ptr();
433    let dp_cost_ptr = dp_cost.as_mut_ptr();
434    let best_ptr = best.as_mut_ptr();
435    let line_len_ptr = line_len.as_mut_ptr();
436
437    for i in (0..n).rev() {
438        let base = if i == 0 { first_base } else { cont_base };
439        let mut len = base + unsafe { (*winfo_ptr.add(i) & 0xFFFF) as usize };
440        let mut best_total = i64::MAX;
441        let mut best_j = i as u32;
442        let mut best_len = len as i32;
443
444        for j in i..n {
445            if j > i {
446                // GNU fmt uses 2 spaces after sentence-ending punctuation
447                let sep = if unsafe { *winfo_ptr.add(j - 1) & SENT_FLAG != 0 } {
448                    2
449                } else {
450                    1
451                };
452                len += sep + unsafe { (*winfo_ptr.add(j) & 0xFFFF) as usize };
453            }
454
455            // Compute line cost for placing words i..=j on one line.
456            macro_rules! try_candidate {
457                () => {
458                    let lc = if j == n - 1 {
459                        0i64
460                    } else {
461                        // line_cost: SHORT_COST + RAGGED_COST
462                        let short_n = goal - len as i64;
463                        let short_cost = short_n * short_n * SHORT_FACTOR;
464                        let ragged_cost = if unsafe { *best_ptr.add(j + 1) as usize + 1 < n } {
465                            let ragged_n = len as i64 - unsafe { *line_len_ptr.add(j + 1) } as i64;
466                            ragged_n * ragged_n * RAGGED_FACTOR
467                        } else {
468                            0
469                        };
470                        short_cost + ragged_cost
471                    };
472
473                    // base_cost for the NEXT line starting at j+1 (GNU base_cost model).
474                    // Applies to all lines except last (which has lc=0 already).
475                    let bc = if j == n - 1 {
476                        0i64
477                    } else {
478                        let wj = unsafe { *winfo_ptr.add(j) };
479                        let wj1 = unsafe { *winfo_ptr.add(j + 1) };
480                        let mut cost = LINE_COST;
481
482                        // Context from word ending the current line (word j)
483                        if wj & PERIOD_FLAG != 0 {
484                            if wj & SENT_FLAG != 0 {
485                                cost -= SENTENCE_BONUS;
486                            } else {
487                                cost += NOBREAK_COST;
488                            }
489                        } else if wj & PUNCT_FLAG != 0 {
490                            cost -= PUNCT_BONUS;
491                        } else if j > 0 {
492                            // WIDOW_COST: short word after a sentence end
493                            let wjm1 = unsafe { *winfo_ptr.add(j - 1) };
494                            if wjm1 & SENT_FLAG != 0 {
495                                let word_len = (wj & 0xFFFF) as i64;
496                                cost += 40000 / (word_len + 2);
497                            }
498                        }
499
500                        // Context from word starting the next line (word j+1)
501                        if wj1 & PAREN_FLAG != 0 {
502                            cost -= PAREN_BONUS;
503                        } else if wj1 & SENT_FLAG != 0 {
504                            let word_len = (wj1 & 0xFFFF) as i64;
505                            cost += 22500 / (word_len + 2);
506                        }
507
508                        cost
509                    };
510
511                    let cj1 = unsafe { *dp_cost_ptr.add(j + 1) };
512                    if cj1 != i64::MAX {
513                        let total = lc + bc + cj1;
514                        if total < best_total {
515                            best_total = total;
516                            best_j = j as u32;
517                            best_len = len as i32;
518                        }
519                    }
520                };
521            }
522
523            // GNU fmt uses strict less-than: lines must be < width, not <= width.
524            if len >= width {
525                if j == i {
526                    try_candidate!();
527                }
528                break;
529            }
530
531            try_candidate!();
532        }
533
534        if best_total < i64::MAX {
535            unsafe {
536                *dp_cost_ptr.add(i) = best_total;
537                *best_ptr.add(i) = best_j;
538                *line_len_ptr.add(i) = best_len;
539            }
540        }
541    }
542
543    // Reconstruct the lines from the DP solution, writing directly to output.
544    let mut i = 0;
545    let mut is_first_line = true;
546
547    while i < n {
548        let j = best[i] as usize;
549
550        output.write_all(prefix.as_bytes())?;
551        if is_first_line {
552            output.write_all(first_indent.as_bytes())?;
553        } else {
554            output.write_all(cont_indent.as_bytes())?;
555        }
556        output.write_all(words[i].as_bytes())?;
557
558        for k in (i + 1)..=j {
559            // GNU fmt uses 2 spaces after sentence-ending punctuation.
560            // Use winfo SENT_FLAG which includes the GNU convention of
561            // marking the last word of a paragraph as sentence-final.
562            if winfo[k - 1] & SENT_FLAG != 0 {
563                output.write_all(b"  ")?;
564            } else {
565                output.write_all(b" ")?;
566            }
567            output.write_all(words[k].as_bytes())?;
568        }
569        output.write_all(b"\n")?;
570
571        is_first_line = false;
572        i = j + 1;
573    }
574
575    Ok(())
576}
577
578/// Split a single input line using the optimal paragraph algorithm.
579/// Used in split-only mode (-s): short lines are preserved as-is,
580/// long lines are broken optimally (same algorithm as normal reflow).
581fn split_line_optimal<W: Write>(
582    line: &str,
583    config: &FmtConfig,
584    prefix: Option<&str>,
585    output: &mut W,
586) -> io::Result<()> {
587    let stripped = match prefix {
588        Some(pfx) => line.strip_prefix(pfx).unwrap_or(line),
589        None => line,
590    };
591    let indent = leading_indent(stripped);
592    let pfx = prefix.unwrap_or("");
593
594    // Short line: output as-is (no splitting needed).
595    // GNU fmt uses strict less-than: lines must be < width.
596    if line.len() < config.width {
597        output.write_all(line.as_bytes())?;
598        output.write_all(b"\n")?;
599        return Ok(());
600    }
601
602    let s = match prefix {
603        Some(pfx) => line.strip_prefix(pfx).unwrap_or(line),
604        None => line,
605    };
606
607    // Collect words and sentence info from this single line.
608    let mut words: Vec<&str> = Vec::new();
609    let mut sentence_ends: Vec<bool> = Vec::new();
610    collect_words_with_sentence_info(s, &mut words, &mut sentence_ends);
611
612    if words.is_empty() {
613        output.write_all(line.as_bytes())?;
614        output.write_all(b"\n")?;
615        return Ok(());
616    }
617
618    // Use the same optimal reflow as normal mode, treating this line as a paragraph.
619    reflow_paragraph(&words, &sentence_ends, pfx, indent, indent, config, output)
620}
coreutils_rs/fmt/core.rs

coreutils_rs/fmt/
core.rs