Skip to main content

coreutils_rs/fmt/
core.rs

1use std::io::{self, BufRead, Write};
2
3/// Configuration for the fmt command.
4pub struct FmtConfig {
5    /// Maximum line width (default 75).
6    pub width: usize,
7    /// Goal width for line filling (default 93% of width).
8    pub goal: usize,
9    /// Only split long lines, do not refill short lines.
10    pub split_only: bool,
11    /// Crown margin mode: preserve the indentation of the first two lines.
12    pub crown_margin: bool,
13    /// Tagged paragraph mode: first line indentation differs from subsequent lines.
14    pub tagged: bool,
15    /// Uniform spacing: one space between words, two after sentence-ending punctuation.
16    pub uniform_spacing: bool,
17    /// Only reformat lines beginning with this prefix.
18    pub prefix: Option<String>,
19}
20
21impl Default for FmtConfig {
22    fn default() -> Self {
23        let width = 75;
24        Self {
25            width,
26            goal: (width * 93) / 100,
27            split_only: false,
28            crown_margin: false,
29            tagged: false,
30            uniform_spacing: false,
31            prefix: None,
32        }
33    }
34}
35
36/// Reformat text from `input` and write the result to `output`.
37///
38/// Text is processed paragraph by paragraph (paragraphs are separated by blank lines).
39/// Each paragraph's words are reflowed to fit within the configured width using
40/// greedy line breaking.
41pub fn fmt_file<R: BufRead, W: Write>(
42    input: R,
43    output: &mut W,
44    config: &FmtConfig,
45) -> io::Result<()> {
46    let mut paragraphs: Vec<Vec<String>> = Vec::new();
47    let mut current: Vec<String> = Vec::new();
48
49    for line in input.lines() {
50        let line = line?;
51
52        // If a prefix is set, only reformat lines that start with it.
53        // Lines without the prefix are emitted verbatim.
54        if let Some(ref pfx) = config.prefix {
55            if !line.starts_with(pfx.as_str()) {
56                // Flush current paragraph before emitting non-prefix line.
57                if !current.is_empty() {
58                    paragraphs.push(current);
59                    current = Vec::new();
60                }
61                // Emit as a single-line paragraph marked for verbatim output.
62                // We use a special sentinel: a paragraph whose sole entry is the
63                // raw line prefixed with a NUL byte so we can distinguish it later.
64                paragraphs.push(vec![format!("\x00{}", line)]);
65                continue;
66            }
67        }
68
69        if line.trim().is_empty() {
70            if !current.is_empty() {
71                paragraphs.push(current);
72                current = Vec::new();
73            }
74            // Represent a blank line as an empty paragraph.
75            paragraphs.push(Vec::new());
76        } else {
77            current.push(line);
78        }
79    }
80    if !current.is_empty() {
81        paragraphs.push(current);
82    }
83
84    for para in &paragraphs {
85        // Blank line separator.
86        if para.is_empty() {
87            output.write_all(b"\n")?;
88            continue;
89        }
90
91        // Check for verbatim sentinel (non-prefix line).
92        if para.len() == 1 && para[0].starts_with('\x00') {
93            output.write_all(&para[0].as_bytes()[1..])?;
94            output.write_all(b"\n")?;
95            continue;
96        }
97
98        format_paragraph(para, config, output)?;
99    }
100
101    Ok(())
102}
103
104/// Determine the leading whitespace (indentation) of a line.
105fn leading_indent(line: &str) -> &str {
106    let trimmed = line.trim_start();
107    &line[..line.len() - trimmed.len()]
108}
109
110/// Check if a word ends a sentence (ends with '.', '!', or '?').
111fn is_sentence_end(word: &str) -> bool {
112    matches!(word.as_bytes().last(), Some(b'.' | b'!' | b'?'))
113}
114
115/// Extract words from a line, optionally stripping a prefix first.
116fn extract_words<'a>(line: &'a str, prefix: Option<&str>) -> Vec<&'a str> {
117    let s = match prefix {
118        Some(pfx) => line.strip_prefix(pfx).unwrap_or(line),
119        None => line,
120    };
121    s.split_whitespace().collect()
122}
123
124/// Format a single paragraph (a group of non-blank lines) and write it.
125fn format_paragraph<W: Write>(
126    lines: &[String],
127    config: &FmtConfig,
128    output: &mut W,
129) -> io::Result<()> {
130    if lines.is_empty() {
131        return Ok(());
132    }
133
134    let prefix_str = config.prefix.as_deref();
135
136    // Strip the prefix from lines for indentation analysis.
137    let stripped_first = match prefix_str {
138        Some(pfx) => lines[0].strip_prefix(pfx).unwrap_or(&lines[0]),
139        None => &lines[0],
140    };
141
142    let first_indent = leading_indent(stripped_first).to_string();
143
144    // Determine continuation indent.
145    let rest_indent = if lines.len() > 1 {
146        let stripped = match prefix_str {
147            Some(pfx) => lines[1].strip_prefix(pfx).unwrap_or(&lines[1]),
148            None => &lines[1],
149        };
150        leading_indent(stripped).to_string()
151    } else {
152        first_indent.clone()
153    };
154
155    // Choose indentation based on mode.
156    let (first_line_indent, cont_indent) = if config.tagged {
157        // Tagged paragraph: first line keeps its indent, rest uses second line's indent.
158        (first_indent.clone(), rest_indent.clone())
159    } else if config.crown_margin {
160        // Crown margin: preserve the first two lines' indentation exactly.
161        (first_indent.clone(), rest_indent.clone())
162    } else {
163        // Default: use the first line's indent for all lines.
164        (first_indent.clone(), first_indent.clone())
165    };
166
167    // In split-only mode, we do not rejoin words across lines.
168    if config.split_only {
169        for line in lines {
170            split_long_line(line, config, prefix_str, output)?;
171        }
172        return Ok(());
173    }
174
175    // Collect all words from the paragraph.
176    let mut all_words: Vec<&str> = Vec::new();
177    for line in lines {
178        all_words.extend(extract_words(line, prefix_str));
179    }
180
181    if all_words.is_empty() {
182        output.write_all(b"\n")?;
183        return Ok(());
184    }
185
186    // Build the prefix string to prepend to each output line.
187    let pfx = prefix_str.unwrap_or("");
188
189    // Reflow the words.
190    let result = reflow_paragraph(&all_words, pfx, &first_line_indent, &cont_indent, config);
191    output.write_all(result.as_bytes())?;
192    Ok(())
193}
194
195/// Reflow words into lines that fit within the configured width.
196///
197/// Uses optimal line breaking with a cost function matching GNU fmt:
198///   SHORT_COST(n) = (n * 10)^2  where n = goal - line_length
199///   RAGGED_COST(n) = (n * 10)^2 / 2  where n = line_length - next_line_length
200///   Last line has zero cost.
201fn reflow_paragraph(
202    words: &[&str],
203    prefix: &str,
204    first_indent: &str,
205    cont_indent: &str,
206    config: &FmtConfig,
207) -> String {
208    if words.is_empty() {
209        return String::new();
210    }
211
212    let n = words.len();
213    let first_base = prefix.len() + first_indent.len();
214    let cont_base = prefix.len() + cont_indent.len();
215    let goal = config.goal as i64;
216    let width = config.width;
217
218    // Compute separator widths between consecutive words on the same line.
219    let sep_widths: Vec<usize> = (0..n)
220        .map(|i| {
221            if i == 0 {
222                0
223            } else if config.uniform_spacing && is_sentence_end(words[i - 1]) {
224                2
225            } else {
226                1
227            }
228        })
229        .collect();
230
231    let word_lens: Vec<usize> = words.iter().map(|w| w.len()).collect();
232
233    // DP state:
234    // cost[i] = minimum cost to format words[i..n]
235    // best[i] = last word index of the optimal line starting at word i
236    // first_line_len[i] = length of the first line in the optimal layout of words[i..n]
237    // has_more_lines[i] = whether the sub-solution at i has more than one line
238    let mut cost = vec![i64::MAX; n + 1];
239    let mut best = vec![0usize; n];
240    let mut first_line_len = vec![0i64; n + 1];
241    let mut has_more_lines = vec![false; n + 1]; // true if sub-solution has >= 2 lines
242    cost[n] = 0;
243
244    for i in (0..n).rev() {
245        let base = if i == 0 { first_base } else { cont_base };
246        let mut len = base + word_lens[i];
247
248        for j in i..n {
249            if j > i {
250                len += sep_widths[j] + word_lens[j];
251            }
252
253            if len > width {
254                if j == i {
255                    let line_cost = if j == n - 1 {
256                        0
257                    } else {
258                        let short_n = goal - len as i64;
259                        let short_cost = short_n * 10 * short_n * 10;
260                        // Ragged cost only if the next sub-solution has more than one line
261                        // (i.e., next line is not the last line of the paragraph)
262                        let ragged_cost = if has_more_lines[j + 1] {
263                            let ragged_n = len as i64 - first_line_len[j + 1];
264                            ragged_n * 10 * ragged_n * 10 / 2
265                        } else {
266                            0
267                        };
268                        short_cost + ragged_cost
269                    };
270                    if cost[j + 1] != i64::MAX {
271                        let total = line_cost + cost[j + 1];
272                        if total < cost[i] {
273                            cost[i] = total;
274                            best[i] = j;
275                            first_line_len[i] = len as i64;
276                            has_more_lines[i] = j + 1 < n;
277                        }
278                    }
279                }
280                break;
281            }
282
283            let line_cost = if j == n - 1 {
284                0
285            } else {
286                let short_n = goal - len as i64;
287                let short_cost = short_n * 10 * short_n * 10;
288                // Ragged cost only when the next line is not the last line
289                let ragged_cost = if has_more_lines[j + 1] {
290                    let ragged_n = len as i64 - first_line_len[j + 1];
291                    ragged_n * 10 * ragged_n * 10 / 2
292                } else {
293                    0
294                };
295                short_cost + ragged_cost
296            };
297
298            if cost[j + 1] != i64::MAX {
299                let total = line_cost + cost[j + 1];
300                if total < cost[i] {
301                    cost[i] = total;
302                    best[i] = j;
303                    first_line_len[i] = len as i64;
304                    has_more_lines[i] = j + 1 < n;
305                }
306            }
307        }
308    }
309
310    // Reconstruct the lines from the DP solution.
311    let mut result = String::new();
312    let mut i = 0;
313    let mut is_first_line = true;
314
315    while i < n {
316        let j = best[i];
317        let base = if is_first_line {
318            format!("{}{}", prefix, first_indent)
319        } else {
320            format!("{}{}", prefix, cont_indent)
321        };
322
323        result.push_str(&base);
324        result.push_str(words[i]);
325        for k in (i + 1)..=j {
326            if config.uniform_spacing && is_sentence_end(words[k - 1]) {
327                result.push_str("  ");
328            } else {
329                result.push(' ');
330            }
331            result.push_str(words[k]);
332        }
333        result.push('\n');
334
335        is_first_line = false;
336        i = j + 1;
337    }
338
339    result
340}
341
342/// Split a single long line at the width boundary without reflowing.
343/// Used in split-only mode (-s).
344fn split_long_line<W: Write>(
345    line: &str,
346    config: &FmtConfig,
347    prefix: Option<&str>,
348    output: &mut W,
349) -> io::Result<()> {
350    let stripped = match prefix {
351        Some(pfx) => line.strip_prefix(pfx).unwrap_or(line),
352        None => line,
353    };
354    let indent = leading_indent(stripped).to_string();
355    let pfx = prefix.unwrap_or("");
356
357    if line.len() <= config.width {
358        output.write_all(line.as_bytes())?;
359        output.write_all(b"\n")?;
360        return Ok(());
361    }
362
363    // Split the line's words, preserving the original structure as much as possible.
364    let words: Vec<&str> = extract_words(line, prefix);
365    if words.is_empty() {
366        output.write_all(line.as_bytes())?;
367        output.write_all(b"\n")?;
368        return Ok(());
369    }
370
371    let mut cur_line = format!("{}{}", pfx, indent);
372    for (i, word) in words.iter().enumerate() {
373        let sep = if cur_line.len() == pfx.len() + indent.len() {
374            ""
375        } else if config.uniform_spacing && i > 0 && is_sentence_end(words[i - 1]) {
376            "  "
377        } else {
378            " "
379        };
380
381        if cur_line.len() + sep.len() + word.len() > config.width
382            && cur_line.len() > pfx.len() + indent.len()
383        {
384            output.write_all(cur_line.as_bytes())?;
385            output.write_all(b"\n")?;
386            cur_line = format!("{}{}", pfx, indent);
387        }
388
389        let sep = if cur_line.len() == pfx.len() + indent.len() {
390            ""
391        } else if config.uniform_spacing && i > 0 && is_sentence_end(words[i - 1]) {
392            "  "
393        } else {
394            " "
395        };
396        cur_line.push_str(sep);
397        cur_line.push_str(word);
398    }
399
400    if cur_line.len() > pfx.len() + indent.len() {
401        output.write_all(cur_line.as_bytes())?;
402        output.write_all(b"\n")?;
403    }
404
405    Ok(())
406}