Skip to main content

coreutils_rs/fmt/
core.rs

1use std::io::{self, Read, Write};
2
3/// Configuration for the fmt command.
4pub struct FmtConfig {
5    /// Maximum line width (default 75).
6    pub width: usize,
7    /// Goal width for line filling (default 93% of width).
8    pub goal: usize,
9    /// Only split long lines, do not refill short lines.
10    pub split_only: bool,
11    /// Crown margin mode: preserve the indentation of the first two lines.
12    pub crown_margin: bool,
13    /// Tagged paragraph mode: first line indentation differs from subsequent lines.
14    pub tagged: bool,
15    /// Uniform spacing: one space between words, two after sentence-ending punctuation.
16    pub uniform_spacing: bool,
17    /// Only reformat lines beginning with this prefix.
18    pub prefix: Option<String>,
19}
20
21impl Default for FmtConfig {
22    fn default() -> Self {
23        let width = 75;
24        Self {
25            width,
26            goal: (width * 187) / 200,
27            split_only: false,
28            crown_margin: false,
29            tagged: false,
30            uniform_spacing: false,
31            prefix: None,
32        }
33    }
34}
35
36/// 256-byte lookup table: 1 = ASCII whitespace (\t \n \x0B \x0C \r \x20), 0 = non-whitespace.
37/// Using a lookup table avoids branch-heavy `is_ascii_whitespace()` per byte.
38static WS_TABLE: [u8; 256] = {
39    let mut t = [0u8; 256];
40    t[b'\t' as usize] = 1;
41    t[b'\n' as usize] = 1;
42    t[0x0B] = 1; // vertical tab
43    t[0x0C] = 1; // form feed
44    t[b'\r' as usize] = 1;
45    t[b' ' as usize] = 1;
46    t
47};
48
49/// Fast whitespace check using lookup table.
50#[inline(always)]
51fn is_ws(b: u8) -> bool {
52    // SAFETY: b as usize is always in 0..256
53    unsafe { *WS_TABLE.get_unchecked(b as usize) != 0 }
54}
55
56/// Word flags for GNU fmt cost model.
57/// Packed into the upper bits of the winfo u32 array for cache efficiency.
58const SENT_FLAG: u32 = 1 << 16; // sentence-final (period + double-space/eol context)
59const PERIOD_FLAG: u32 = 1 << 17; // has sentence-ending punct (.!?) regardless of context
60const PUNCT_FLAG: u32 = 1 << 18; // ends with non-period punctuation (,;:)
61const PAREN_FLAG: u32 = 1 << 19; // starts with opening paren/bracket
62
63/// Reusable buffers for the entire formatting session.
64/// All data is offset-based (no borrowed references), enabling reuse across paragraphs.
65struct FmtCtx {
66    /// Word byte offsets into the source text. One u32 per word.
67    word_off: Vec<u32>,
68    /// Packed word info: bits 0-15 = length, bits 16-19 = flags.
69    /// Parallel to word_off. Built once during word collection, used directly in DP.
70    winfo: Vec<u32>,
71    /// DP cost array (n+1 elements).
72    dp_cost: Vec<i64>,
73    /// Best break-point for word i (n elements).
74    best: Vec<u32>,
75    /// Line length at break-point i (n+1 elements).
76    line_len: Vec<i32>,
77    /// Output line buffer for batched writes.
78    line_buf: Vec<u8>,
79}
80
81impl FmtCtx {
82    fn new() -> Self {
83        Self {
84            word_off: Vec::with_capacity(256),
85            winfo: Vec::with_capacity(256),
86            dp_cost: Vec::with_capacity(257),
87            best: Vec::with_capacity(256),
88            line_len: Vec::with_capacity(257),
89            line_buf: Vec::with_capacity(256),
90        }
91    }
92
93    #[inline(always)]
94    fn clear_words(&mut self) {
95        self.word_off.clear();
96        self.winfo.clear();
97    }
98}
99
100/// Reformat text from `input` and write the result to `output`.
101///
102/// Text is processed paragraph by paragraph in a streaming fashion.
103/// Each paragraph is formatted and written immediately, avoiding holding
104/// the entire file in memory.
105pub fn fmt_file<R: Read, W: Write>(
106    mut input: R,
107    output: &mut W,
108    config: &FmtConfig,
109) -> io::Result<()> {
110    let mut data = Vec::new();
111    input.read_to_end(&mut data)?;
112    fmt_data(&data, output, config)
113}
114
115/// Format in-memory data. Works on byte slices to avoid String allocation.
116pub fn fmt_data(data: &[u8], output: &mut impl Write, config: &FmtConfig) -> io::Result<()> {
117    let text = match std::str::from_utf8(data) {
118        Ok(s) => s,
119        Err(_) => {
120            let owned = String::from_utf8_lossy(data);
121            return fmt_str(&owned, output, config);
122        }
123    };
124    fmt_str(text, output, config)
125}
126
127/// Check if a byte range is all whitespace using our fast lookup table.
128#[inline(always)]
129fn is_blank_bytes(bytes: &[u8]) -> bool {
130    for &b in bytes {
131        if !is_ws(b) {
132            return false;
133        }
134    }
135    true
136}
137
138/// Format a string slice, processing paragraph by paragraph with zero-copy word extraction.
139fn fmt_str(text: &str, output: &mut impl Write, config: &FmtConfig) -> io::Result<()> {
140    let prefix_str = config.prefix.as_deref();
141    let mut para_start = 0;
142    let bytes = text.as_bytes();
143    let blen = bytes.len();
144    let mut ctx = FmtCtx::new();
145
146    let mut i = 0;
147
148    while i < blen {
149        // Find end of current line using memchr for SIMD-accelerated newline search
150        let line_end = memchr::memchr(b'\n', &bytes[i..])
151            .map(|p| i + p)
152            .unwrap_or(blen);
153
154        // Effective line end (strip \r)
155        let le = if line_end > i && bytes[line_end - 1] == b'\r' {
156            line_end - 1
157        } else {
158            line_end
159        };
160
161        // Handle prefix filter
162        if let Some(pfx) = prefix_str {
163            let line = &text[i..le];
164            if !line.starts_with(pfx) {
165                // Flush current paragraph
166                if para_start < i {
167                    format_paragraph(text, bytes, para_start, i, config, output, &mut ctx)?;
168                }
169                let next = if line_end < blen { line_end + 1 } else { blen };
170                para_start = next;
171                // Emit verbatim
172                output.write_all(&bytes[i..le])?;
173                output.write_all(b"\n")?;
174                i = next;
175                continue;
176            }
177        }
178
179        // Fast blank-line check using WS lookup table (no allocation)
180        if is_blank_bytes(&bytes[i..le]) {
181            // Blank line = paragraph boundary
182            if para_start < i {
183                format_paragraph(text, bytes, para_start, i, config, output, &mut ctx)?;
184            }
185            output.write_all(b"\n")?;
186            let next = if line_end < blen { line_end + 1 } else { blen };
187            para_start = next;
188        }
189
190        i = if line_end < blen { line_end + 1 } else { blen };
191    }
192
193    // Flush remaining paragraph
194    if para_start < blen {
195        let remaining = text[para_start..].trim_end_matches('\n');
196        if !remaining.is_empty() {
197            format_paragraph(text, bytes, para_start, blen, config, output, &mut ctx)?;
198        }
199    }
200
201    Ok(())
202}
203
204/// Format a paragraph from a region of the source text [start..end).
205/// Uses single-pass word extraction with offset-based storage.
206/// All flags are computed once during word collection, eliminating double punctuation analysis.
207fn format_paragraph(
208    text: &str,
209    bytes: &[u8],
210    start: usize,
211    end: usize,
212    config: &FmtConfig,
213    output: &mut impl Write,
214    ctx: &mut FmtCtx,
215) -> io::Result<()> {
216    let region = &bytes[start..end];
217    let prefix_str = config.prefix.as_deref();
218
219    // Single-pass line extraction using memchr for indentation analysis.
220    // We need the first two non-empty lines for indent detection.
221    let mut first_line: Option<&str> = None;
222    let mut second_line: Option<&str> = None;
223    {
224        let rlen = region.len();
225        let mut pos = 0;
226        while pos < rlen {
227            let nl = memchr::memchr(b'\n', &region[pos..])
228                .map(|p| pos + p)
229                .unwrap_or(rlen);
230            let mut le = nl;
231            if le > pos && region[le - 1] == b'\r' {
232                le -= 1;
233            }
234            if le > pos {
235                let line = &text[start + pos..start + le];
236                if first_line.is_none() {
237                    first_line = Some(line);
238                } else if second_line.is_none() {
239                    second_line = Some(line);
240                    break; // Only need first two lines for indent analysis
241                }
242            }
243            pos = nl + 1;
244        }
245    }
246
247    let fl = match first_line {
248        Some(l) => l,
249        None => return Ok(()),
250    };
251
252    let stripped_first = match prefix_str {
253        Some(pfx) => fl.strip_prefix(pfx).unwrap_or(fl),
254        None => fl,
255    };
256
257    let stripped_second = match second_line {
258        Some(l) => match prefix_str {
259            Some(pfx) => l.strip_prefix(pfx).unwrap_or(l),
260            None => l,
261        },
262        None => stripped_first,
263    };
264
265    let first_indent = leading_indent(stripped_first);
266    let rest_indent = leading_indent(stripped_second);
267
268    let (first_line_indent, cont_indent) = if config.tagged || config.crown_margin {
269        (first_indent, rest_indent)
270    } else {
271        (first_indent, first_indent)
272    };
273
274    if config.split_only {
275        // Split-only mode: process each line independently.
276        let rlen = region.len();
277        let mut pos = 0;
278        while pos < rlen {
279            let nl = memchr::memchr(b'\n', &region[pos..])
280                .map(|p| pos + p)
281                .unwrap_or(rlen);
282            let mut le = nl;
283            if le > pos && region[le - 1] == b'\r' {
284                le -= 1;
285            }
286            if le > pos {
287                let line = &text[start + pos..start + le];
288                split_line_optimal(line, config, prefix_str, output)?;
289            }
290            pos = nl + 1;
291        }
292        return Ok(());
293    }
294
295    // Collect words with full flag computation in a single pass.
296    // Words are stored as byte offsets (word_off) + packed winfo, avoiding Vec<&str>.
297    ctx.clear_words();
298    collect_words_from_region(bytes, region, start, prefix_str, ctx);
299
300    let n = ctx.word_off.len();
301    if n == 0 {
302        output.write_all(b"\n")?;
303        return Ok(());
304    }
305
306    // Mark last word as sentence-final (GNU fmt convention).
307    let last_idx = n - 1;
308    ctx.winfo[last_idx] |= SENT_FLAG | PERIOD_FLAG;
309
310    let pfx = prefix_str.unwrap_or("");
311    reflow_paragraph(
312        bytes,
313        pfx,
314        first_line_indent,
315        cont_indent,
316        config,
317        output,
318        ctx,
319    )
320}
321
322/// Determine the leading whitespace (indentation) of a line.
323fn leading_indent(line: &str) -> &str {
324    let trimmed = line.trim_start();
325    &line[..line.len() - trimmed.len()]
326}
327
328/// Collect words from all lines in a paragraph region, computing all flags in one pass.
329/// Stores byte offsets + packed winfo in ctx. No intermediate Vec<&str> or Vec<bool>.
330fn collect_words_from_region(
331    bytes: &[u8],
332    region: &[u8],
333    start: usize,
334    prefix: Option<&str>,
335    ctx: &mut FmtCtx,
336) {
337    let rlen = region.len();
338    let mut pos = 0;
339    let pfx_len = prefix.map_or(0, |p| p.len());
340
341    while pos < rlen {
342        let nl = memchr::memchr(b'\n', &region[pos..])
343            .map(|p| pos + p)
344            .unwrap_or(rlen);
345        let mut le = nl;
346        if le > pos && region[le - 1] == b'\r' {
347            le -= 1;
348        }
349        if le > pos {
350            let line_start = start + pos;
351            let line_end = start + le;
352
353            // Strip prefix
354            let ls = if pfx_len > 0 && le - pos >= pfx_len {
355                let pfx_bytes = prefix.unwrap().as_bytes();
356                if &bytes[line_start..line_start + pfx_len] == pfx_bytes {
357                    line_start + pfx_len
358                } else {
359                    line_start
360                }
361            } else {
362                line_start
363            };
364
365            collect_words_line(bytes, ls, line_end, ctx);
366        }
367        pos = nl + 1;
368    }
369}
370
371/// Collect words from a single line [ls..le) in the source bytes.
372/// Computes all flags (SENT, PERIOD, PUNCT, PAREN) during collection.
373/// Uses lookup-table whitespace scanning and unsafe ptr for bounds-check elision.
374#[inline(always)]
375fn collect_words_line(bytes: &[u8], ls: usize, le: usize, ctx: &mut FmtCtx) {
376    let ptr = bytes.as_ptr();
377    let mut i = ls;
378
379    // Skip leading whitespace
380    while i < le && unsafe { is_ws(*ptr.add(i)) } {
381        i += 1;
382    }
383
384    while i < le {
385        let word_start = i;
386        while i < le && unsafe { !is_ws(*ptr.add(i)) } {
387            i += 1;
388        }
389        let wlen = i - word_start;
390
391        // Count trailing spaces
392        let space_start = i;
393        while i < le && unsafe { is_ws(*ptr.add(i)) } {
394            i += 1;
395        }
396        let space_count = i - space_start;
397
398        // Compute all flags in one pass (single backward scan for punctuation)
399        let wb = unsafe { std::slice::from_raw_parts(ptr.add(word_start), wlen) };
400        let mut flags = 0u32;
401
402        let in_sent_ctx = i >= le || space_count >= 2;
403        flags |= classify_word_punct(wb, in_sent_ctx);
404        if wlen > 0 && matches!(wb[0], b'(' | b'[' | b'{') {
405            flags |= PAREN_FLAG;
406        }
407
408        ctx.word_off.push(word_start as u32);
409        ctx.winfo.push((wlen as u32) | flags);
410    }
411}
412
413/// Classify a word's trailing punctuation in a single backward scan.
414/// Combines what was previously two separate functions (analyze_word_punct +
415/// is_sentence_end_contextual) into one pass, avoiding redundant byte scanning.
416/// Returns the appropriate flag bits (PERIOD_FLAG, SENT_FLAG, PUNCT_FLAG).
417#[inline(always)]
418fn classify_word_punct(bytes: &[u8], in_sentence_context: bool) -> u32 {
419    let mut i = bytes.len();
420    // Strip trailing quotes/parens
421    while i > 0 && matches!(bytes[i - 1], b'"' | b'\'' | b')' | b']') {
422        i -= 1;
423    }
424    if i == 0 {
425        return 0;
426    }
427    let c = bytes[i - 1];
428    if c == b'.' || c == b'!' || c == b'?' {
429        let mut flags = PERIOD_FLAG;
430        if in_sentence_context {
431            // Strip sentence-ending punctuation to find core word
432            let mut end = i;
433            while end > 0 && matches!(bytes[end - 1], b'.' | b'!' | b'?') {
434                end -= 1;
435            }
436            // Single uppercase letter = abbreviation, not sentence end
437            if !(end == 1 && bytes[0].is_ascii_uppercase()) && end > 0 {
438                flags |= SENT_FLAG;
439            }
440        }
441        flags
442    } else if c == b',' || c == b';' || c == b':' {
443        PUNCT_FLAG
444    } else {
445        0
446    }
447}
448
449/// Reflow words into lines that fit within the configured width.
450///
451/// Uses optimal line breaking with a cost function matching GNU fmt.
452/// Words are referenced by offset (ctx.word_off + ctx.winfo), not by &str slices.
453/// Builds each output line in a buffer and writes once.
454#[allow(clippy::too_many_arguments)]
455fn reflow_paragraph<W: Write>(
456    bytes: &[u8],
457    prefix: &str,
458    first_indent: &str,
459    cont_indent: &str,
460    config: &FmtConfig,
461    output: &mut W,
462    ctx: &mut FmtCtx,
463) -> io::Result<()> {
464    let n = ctx.word_off.len();
465    if n == 0 {
466        return Ok(());
467    }
468
469    let first_base = prefix.len() + first_indent.len();
470    let cont_base = prefix.len() + cont_indent.len();
471    let goal = config.goal as i64;
472    let width = config.width;
473
474    // GNU fmt cost model constants
475    const SHORT_FACTOR: i64 = 100;
476    const RAGGED_FACTOR: i64 = 50;
477    const LINE_COST: i64 = 70 * 70;
478    const SENTENCE_BONUS: i64 = 50 * 50;
479    const NOBREAK_COST: i64 = 600 * 600;
480    const PUNCT_BONUS: i64 = 40 * 40;
481    const PAREN_BONUS: i64 = 40 * 40;
482
483    // Reuse DP buffers
484    ctx.dp_cost.clear();
485    ctx.dp_cost.resize(n + 1, i64::MAX);
486    ctx.dp_cost[n] = 0;
487
488    ctx.best.clear();
489    ctx.best.resize(n, 0);
490
491    ctx.line_len.clear();
492    ctx.line_len.resize(n + 1, 0);
493
494    // SAFETY: All array indices are provably in-bounds:
495    // - winfo has exactly n elements (one per word)
496    // - dp_cost has n+1 elements, accessed with indices 0..=n
497    // - best has n elements, accessed with indices 0..n-1
498    // - line_len has n+1 elements, accessed with indices 0..=n
499    let winfo_ptr = ctx.winfo.as_ptr();
500    let dp_cost_ptr = ctx.dp_cost.as_mut_ptr();
501    let best_ptr = ctx.best.as_mut_ptr();
502    let line_len_ptr = ctx.line_len.as_mut_ptr();
503
504    // Precomputed division tables to avoid expensive integer division in the inner loop.
505    // div40k[len] = 40000 / (len + 2), div22k[len] = 22500 / (len + 2).
506    // Word lengths above 126 are clamped (extremely rare, cost difference negligible).
507    const LUT_SIZE: usize = 128;
508    let div40k: [i64; LUT_SIZE] = {
509        let mut t = [0i64; LUT_SIZE];
510        let mut k = 0;
511        while k < LUT_SIZE {
512            t[k] = 40000 / (k as i64 + 2);
513            k += 1;
514        }
515        t
516    };
517    let div22k: [i64; LUT_SIZE] = {
518        let mut t = [0i64; LUT_SIZE];
519        let mut k = 0;
520        while k < LUT_SIZE {
521            t[k] = 22500 / (k as i64 + 2);
522            k += 1;
523        }
524        t
525    };
526
527    for i in (0..n).rev() {
528        let base = if i == 0 { first_base } else { cont_base };
529        let mut len = base + unsafe { (*winfo_ptr.add(i) & 0xFFFF) as usize };
530        let mut best_total = i64::MAX;
531        let mut best_j = i as u32;
532        let mut best_len = len as i32;
533
534        for j in i..n {
535            if j > i {
536                let sep = if unsafe { *winfo_ptr.add(j - 1) & SENT_FLAG != 0 } {
537                    2
538                } else {
539                    1
540                };
541                len += sep + unsafe { (*winfo_ptr.add(j) & 0xFFFF) as usize };
542            }
543
544            macro_rules! try_candidate {
545                () => {
546                    let lc = if j == n - 1 {
547                        0i64
548                    } else {
549                        let short_n = goal - len as i64;
550                        let short_cost = short_n * short_n * SHORT_FACTOR;
551                        let ragged_cost = if unsafe { *best_ptr.add(j + 1) as usize + 1 < n } {
552                            let ragged_n = len as i64 - unsafe { *line_len_ptr.add(j + 1) } as i64;
553                            ragged_n * ragged_n * RAGGED_FACTOR
554                        } else {
555                            0
556                        };
557                        short_cost + ragged_cost
558                    };
559
560                    let bc = if j == n - 1 {
561                        0i64
562                    } else {
563                        let wj = unsafe { *winfo_ptr.add(j) };
564                        let wj1 = unsafe { *winfo_ptr.add(j + 1) };
565                        let mut cost = LINE_COST;
566
567                        if wj & PERIOD_FLAG != 0 {
568                            if wj & SENT_FLAG != 0 {
569                                cost -= SENTENCE_BONUS;
570                            } else {
571                                cost += NOBREAK_COST;
572                            }
573                        } else if wj & PUNCT_FLAG != 0 {
574                            cost -= PUNCT_BONUS;
575                        } else if j > 0 {
576                            let wjm1 = unsafe { *winfo_ptr.add(j - 1) };
577                            if wjm1 & SENT_FLAG != 0 {
578                                let wl = ((wj & 0xFFFF) as usize).min(LUT_SIZE - 1);
579                                cost += div40k[wl];
580                            }
581                        }
582
583                        if wj1 & PAREN_FLAG != 0 {
584                            cost -= PAREN_BONUS;
585                        } else if wj1 & SENT_FLAG != 0 {
586                            let wl = ((wj1 & 0xFFFF) as usize).min(LUT_SIZE - 1);
587                            cost += div22k[wl];
588                        }
589
590                        cost
591                    };
592
593                    let cj1 = unsafe { *dp_cost_ptr.add(j + 1) };
594                    if cj1 != i64::MAX {
595                        let total = lc + bc + cj1;
596                        if total < best_total {
597                            best_total = total;
598                            best_j = j as u32;
599                            best_len = len as i32;
600                        }
601                    }
602                };
603            }
604
605            if len >= width {
606                if j == i {
607                    try_candidate!();
608                }
609                break;
610            }
611
612            try_candidate!();
613        }
614
615        if best_total < i64::MAX {
616            unsafe {
617                *dp_cost_ptr.add(i) = best_total;
618                *best_ptr.add(i) = best_j;
619                *line_len_ptr.add(i) = best_len;
620            }
621        }
622    }
623
624    // Reconstruct lines from DP solution.
625    // Reference words by byte offset into source, build each line in buffer.
626    let mut i = 0;
627    let mut is_first_line = true;
628    let line_buf = &mut ctx.line_buf;
629    let word_off = &ctx.word_off;
630    let winfo = &ctx.winfo;
631    let best = &ctx.best;
632
633    while i < n {
634        let j = best[i] as usize;
635
636        line_buf.clear();
637
638        line_buf.extend_from_slice(prefix.as_bytes());
639        if is_first_line {
640            line_buf.extend_from_slice(first_indent.as_bytes());
641        } else {
642            line_buf.extend_from_slice(cont_indent.as_bytes());
643        }
644
645        // First word on line
646        let off = word_off[i] as usize;
647        let wlen = (winfo[i] & 0xFFFF) as usize;
648        line_buf.extend_from_slice(&bytes[off..off + wlen]);
649
650        // Subsequent words on line
651        for k in (i + 1)..=j {
652            if winfo[k - 1] & SENT_FLAG != 0 {
653                line_buf.extend_from_slice(b"  ");
654            } else {
655                line_buf.push(b' ');
656            }
657            let off = word_off[k] as usize;
658            let wlen = (winfo[k] & 0xFFFF) as usize;
659            line_buf.extend_from_slice(&bytes[off..off + wlen]);
660        }
661        line_buf.push(b'\n');
662
663        output.write_all(line_buf)?;
664
665        is_first_line = false;
666        i = j + 1;
667    }
668
669    Ok(())
670}
671
672/// Split a single input line using the optimal paragraph algorithm.
673/// Used in split-only mode (-s): short lines are preserved as-is,
674/// long lines are broken optimally (same algorithm as normal reflow).
675fn split_line_optimal<W: Write>(
676    line: &str,
677    config: &FmtConfig,
678    prefix: Option<&str>,
679    output: &mut W,
680) -> io::Result<()> {
681    let stripped = match prefix {
682        Some(pfx) => line.strip_prefix(pfx).unwrap_or(line),
683        None => line,
684    };
685    let indent = leading_indent(stripped);
686    let pfx = prefix.unwrap_or("");
687
688    // Short line: output as-is (no splitting needed).
689    if line.len() < config.width {
690        output.write_all(line.as_bytes())?;
691        output.write_all(b"\n")?;
692        return Ok(());
693    }
694
695    let s = match prefix {
696        Some(pfx) => line.strip_prefix(pfx).unwrap_or(line),
697        None => line,
698    };
699
700    let bytes = line.as_bytes();
701    let mut ctx = FmtCtx::new();
702
703    // Find the offset of s within line
704    let s_start = s.as_ptr() as usize - line.as_ptr() as usize;
705    let s_end = s_start + s.len();
706    collect_words_line(bytes, s_start, s_end, &mut ctx);
707
708    if ctx.word_off.is_empty() {
709        output.write_all(line.as_bytes())?;
710        output.write_all(b"\n")?;
711        return Ok(());
712    }
713
714    // Mark last word as sentence-final
715    let last = ctx.winfo.len() - 1;
716    ctx.winfo[last] |= SENT_FLAG | PERIOD_FLAG;
717
718    reflow_paragraph(bytes, pfx, indent, indent, config, output, &mut ctx)
719}