Skip to main content

coreutils_rs/fmt/
core.rs

1use std::io::{self, Read, Write};
2
3/// Configuration for the fmt command.
4pub struct FmtConfig {
5    /// Maximum line width (default 75).
6    pub width: usize,
7    /// Goal width for line filling (default 93% of width).
8    pub goal: usize,
9    /// Only split long lines, do not refill short lines.
10    pub split_only: bool,
11    /// Crown margin mode: preserve the indentation of the first two lines.
12    pub crown_margin: bool,
13    /// Tagged paragraph mode: first line indentation differs from subsequent lines.
14    pub tagged: bool,
15    /// Uniform spacing: one space between words, two after sentence-ending punctuation.
16    pub uniform_spacing: bool,
17    /// Only reformat lines beginning with this prefix.
18    pub prefix: Option<String>,
19}
20
21impl Default for FmtConfig {
22    fn default() -> Self {
23        let width = 75;
24        Self {
25            width,
26            goal: (width * 187) / 200,
27            split_only: false,
28            crown_margin: false,
29            tagged: false,
30            uniform_spacing: false,
31            prefix: None,
32        }
33    }
34}
35
36/// 256-byte lookup table: 1 = ASCII whitespace (\t \n \x0B \x0C \r \x20), 0 = non-whitespace.
37/// Using a lookup table avoids branch-heavy `is_ascii_whitespace()` per byte.
38static WS_TABLE: [u8; 256] = {
39    let mut t = [0u8; 256];
40    t[b'\t' as usize] = 1;
41    t[b'\n' as usize] = 1;
42    t[0x0B] = 1; // vertical tab
43    t[0x0C] = 1; // form feed
44    t[b'\r' as usize] = 1;
45    t[b' ' as usize] = 1;
46    t
47};
48
49/// Fast whitespace check using lookup table.
50#[inline(always)]
51fn is_ws(b: u8) -> bool {
52    // SAFETY: b as usize is always in 0..256
53    unsafe { *WS_TABLE.get_unchecked(b as usize) != 0 }
54}
55
56/// Word flags for GNU fmt cost model.
57/// Packed into the upper bits of the winfo u32 array for cache efficiency.
58const SENT_FLAG: u32 = 1 << 16; // sentence-final (period + double-space/eol context)
59const PERIOD_FLAG: u32 = 1 << 17; // has sentence-ending punct (.!?) regardless of context
60const PUNCT_FLAG: u32 = 1 << 18; // ends with non-period punctuation (,;:)
61const PAREN_FLAG: u32 = 1 << 19; // starts with opening paren/bracket
62
63/// Precomputed division tables to avoid expensive integer division in the DP inner loop.
64/// div40k[len] = 40000 / (len + 2), div22k[len] = 22500 / (len + 2).
65/// Word lengths above 126 are clamped (extremely rare, cost difference negligible).
66const LUT_SIZE: usize = 128;
67
68static DIV40K: [i64; LUT_SIZE] = {
69    let mut t = [0i64; LUT_SIZE];
70    let mut k = 0;
71    while k < LUT_SIZE {
72        t[k] = 40000 / (k as i64 + 2);
73        k += 1;
74    }
75    t
76};
77
78static DIV22K: [i64; LUT_SIZE] = {
79    let mut t = [0i64; LUT_SIZE];
80    let mut k = 0;
81    while k < LUT_SIZE {
82        t[k] = 22500 / (k as i64 + 2);
83        k += 1;
84    }
85    t
86};
87
88/// Reusable buffers for the entire formatting session.
89/// All data is offset-based (no borrowed references), enabling reuse across paragraphs.
90struct FmtCtx {
91    /// Word byte offsets into the source text. One u32 per word.
92    word_off: Vec<u32>,
93    /// Packed word info: bits 0-15 = length, bits 16-19 = flags.
94    /// Parallel to word_off. Built once during word collection, used directly in DP.
95    winfo: Vec<u32>,
96}
97
98impl FmtCtx {
99    fn new() -> Self {
100        Self {
101            word_off: Vec::with_capacity(256),
102            winfo: Vec::with_capacity(256),
103        }
104    }
105
106    #[inline(always)]
107    fn clear_words(&mut self) {
108        self.word_off.clear();
109        self.winfo.clear();
110    }
111}
112
113/// Reformat text from `input` and write the result to `output`.
114///
115/// Text is processed paragraph by paragraph in a streaming fashion.
116/// Each paragraph is formatted and written immediately, avoiding holding
117/// the entire file in memory.
118pub fn fmt_file<R: Read, W: Write>(
119    mut input: R,
120    output: &mut W,
121    config: &FmtConfig,
122) -> io::Result<()> {
123    let mut data = Vec::new();
124    input.read_to_end(&mut data)?;
125    fmt_data(&data, output, config)
126}
127
128/// Format in-memory data. Works on byte slices directly — no UTF-8 validation pass.
129/// The formatter only inspects ASCII whitespace/punctuation, so raw bytes are fine.
130pub fn fmt_data(data: &[u8], output: &mut impl Write, config: &FmtConfig) -> io::Result<()> {
131    // Work directly on bytes — no UTF-8 validation needed.
132    // The formatter only examines ASCII whitespace and punctuation characters.
133    // Non-ASCII bytes are passed through verbatim (zero-copy).
134    fmt_bytes(data, output, config)
135}
136
137/// Check if a byte range is all whitespace using our fast lookup table.
138#[inline(always)]
139fn is_blank_bytes(bytes: &[u8]) -> bool {
140    for &b in bytes {
141        if !is_ws(b) {
142            return false;
143        }
144    }
145    true
146}
147
148/// Format raw bytes, processing paragraph by paragraph with zero-copy word extraction.
149/// Operates entirely on &[u8] — no UTF-8 conversion or validation at any point.
150/// Uses memchr_iter for SIMD-accelerated newline scanning across the entire input.
151fn fmt_bytes(data: &[u8], output: &mut impl Write, config: &FmtConfig) -> io::Result<()> {
152    let prefix_bytes = config.prefix.as_deref().map(|s| s.as_bytes());
153    let mut para_start = 0;
154    let blen = data.len();
155    let mut ctx = FmtCtx::new();
156    let mut line_start = 0;
157
158    // Use memchr_iter for SIMD-accelerated bulk newline scanning.
159    // This finds all newlines in one pass, avoiding repeated memchr calls
160    // that each re-scan overlapping regions.
161    for nl_pos in memchr::memchr_iter(b'\n', data) {
162        let line_end = nl_pos;
163
164        // Effective line end (strip \r)
165        let le = if line_end > line_start && data[line_end - 1] == b'\r' {
166            line_end - 1
167        } else {
168            line_end
169        };
170
171        // Handle prefix filter
172        if let Some(pfx) = prefix_bytes {
173            if !data[line_start..le].starts_with(pfx) {
174                // Flush current paragraph
175                if para_start < line_start {
176                    format_paragraph(data, para_start, line_start, config, output, &mut ctx)?;
177                }
178                para_start = nl_pos + 1;
179                // Emit verbatim
180                output.write_all(&data[line_start..le])?;
181                output.write_all(b"\n")?;
182                line_start = nl_pos + 1;
183                continue;
184            }
185        }
186
187        // Fast blank-line check using WS lookup table (no allocation)
188        if is_blank_bytes(&data[line_start..le]) {
189            // Blank line = paragraph boundary
190            if para_start < line_start {
191                format_paragraph(data, para_start, line_start, config, output, &mut ctx)?;
192            }
193            output.write_all(b"\n")?;
194            para_start = nl_pos + 1;
195        }
196
197        line_start = nl_pos + 1;
198    }
199
200    // Handle last line (no trailing newline) and flush remaining paragraph
201    if line_start < blen {
202        // There's content after the last newline (no trailing newline)
203        // It's part of the current paragraph
204    }
205
206    if para_start < blen {
207        // Check if remaining content is non-empty (ignoring trailing newlines)
208        let mut end = blen;
209        while end > para_start && data[end - 1] == b'\n' {
210            end -= 1;
211        }
212        if end > para_start {
213            format_paragraph(data, para_start, blen, config, output, &mut ctx)?;
214        }
215    }
216
217    Ok(())
218}
219
220/// Format a paragraph from a region of the source bytes [start..end).
221/// Uses single-pass word extraction with offset-based storage.
222/// All flags are computed once during word collection, eliminating double punctuation analysis.
223fn format_paragraph(
224    bytes: &[u8],
225    start: usize,
226    end: usize,
227    config: &FmtConfig,
228    output: &mut impl Write,
229    ctx: &mut FmtCtx,
230) -> io::Result<()> {
231    let region = &bytes[start..end];
232    let prefix_bytes = config.prefix.as_deref().map(|s| s.as_bytes());
233
234    // Single-pass line extraction using memchr for indentation analysis.
235    // We need the first two non-empty lines for indent detection.
236    let mut first_line: Option<(usize, usize)> = None; // (start_offset, end_offset) relative to `bytes`
237    let mut second_line: Option<(usize, usize)> = None;
238    {
239        let rlen = region.len();
240        let mut pos = 0;
241        while pos < rlen {
242            let nl = memchr::memchr(b'\n', &region[pos..])
243                .map(|p| pos + p)
244                .unwrap_or(rlen);
245            let mut le = nl;
246            if le > pos && region[le - 1] == b'\r' {
247                le -= 1;
248            }
249            if le > pos {
250                let line_range = (start + pos, start + le);
251                if first_line.is_none() {
252                    first_line = Some(line_range);
253                } else if second_line.is_none() {
254                    second_line = Some(line_range);
255                    break; // Only need first two lines for indent analysis
256                }
257            }
258            pos = nl + 1;
259        }
260    }
261
262    let (fl_start, fl_end) = match first_line {
263        Some(r) => r,
264        None => return Ok(()),
265    };
266
267    // Strip prefix from first line for indent analysis
268    let stripped_first_start = match prefix_bytes {
269        Some(pfx) if bytes[fl_start..fl_end].starts_with(pfx) => fl_start + pfx.len(),
270        _ => fl_start,
271    };
272
273    // Strip prefix from second line for indent analysis
274    let (stripped_second_start, stripped_second_end) = match second_line {
275        Some((s, e)) => match prefix_bytes {
276            Some(pfx) if bytes[s..e].starts_with(pfx) => (s + pfx.len(), e),
277            _ => (s, e),
278        },
279        None => (stripped_first_start, fl_end),
280    };
281
282    let first_indent_len = leading_ws_len(&bytes[stripped_first_start..fl_end]);
283    let rest_indent_len = leading_ws_len(&bytes[stripped_second_start..stripped_second_end]);
284
285    // Extract indent bytes (zero-copy slices from source)
286    let first_indent = &bytes[stripped_first_start..stripped_first_start + first_indent_len];
287    let rest_indent = &bytes[stripped_second_start..stripped_second_start + rest_indent_len];
288
289    let (first_line_indent, cont_indent) = if config.tagged || config.crown_margin {
290        (first_indent, rest_indent)
291    } else {
292        (first_indent, first_indent)
293    };
294
295    if config.split_only {
296        // Split-only mode: process each line independently.
297        let rlen = region.len();
298        let mut pos = 0;
299        while pos < rlen {
300            let nl = memchr::memchr(b'\n', &region[pos..])
301                .map(|p| pos + p)
302                .unwrap_or(rlen);
303            let mut le = nl;
304            if le > pos && region[le - 1] == b'\r' {
305                le -= 1;
306            }
307            if le > pos {
308                let line_start = start + pos;
309                let line_end = start + le;
310                split_line_optimal(
311                    bytes,
312                    line_start,
313                    line_end,
314                    config,
315                    prefix_bytes,
316                    output,
317                    ctx,
318                )?;
319            }
320            pos = nl + 1;
321        }
322        return Ok(());
323    }
324
325    let pfx = prefix_bytes.unwrap_or(b"");
326
327    // GNU fmt's MAXWORDS is 1,000,000 — process the entire paragraph in one shot.
328    const MAXWORDS: usize = 1_000_000;
329
330    // Streaming word collection + chunked DP: collect words in MAXWORDS-sized
331    // batches and process each chunk immediately. This avoids allocating a
332    // Vec of 1.5M+ words for huge single-paragraph files.
333    collect_and_reflow_chunked(
334        bytes,
335        region,
336        start,
337        prefix_bytes,
338        pfx,
339        first_line_indent,
340        cont_indent,
341        config,
342        output,
343        ctx,
344        MAXWORDS,
345    )
346}
347
348/// Determine the length of leading ASCII whitespace in a byte slice.
349/// Replaces the old `leading_indent(&str) -> &str` — works on raw bytes,
350/// no UTF-8 requirement.
351#[inline(always)]
352fn leading_ws_len(bytes: &[u8]) -> usize {
353    let mut i = 0;
354    while i < bytes.len() && is_ws(bytes[i]) && bytes[i] != b'\n' {
355        i += 1;
356    }
357    i
358}
359
360/// Collect words from a single line [ls..le) in the source bytes.
361/// Computes all flags (SENT, PERIOD, PUNCT, PAREN) during collection.
362/// Uses a simple byte loop for word boundary detection — for typical line
363/// lengths (< 200 bytes), this is faster than memchr2's SIMD setup overhead
364/// which dominates for short slices.
365#[inline(always)]
366fn collect_words_line(bytes: &[u8], ls: usize, le: usize, ctx: &mut FmtCtx) {
367    let ptr = bytes.as_ptr();
368    let mut i = ls;
369
370    // Skip leading whitespace
371    while i < le && unsafe { is_ws(*ptr.add(i)) } {
372        i += 1;
373    }
374
375    while i < le {
376        let word_start = i;
377
378        // Find end of word using direct byte scan.
379        // For typical line lengths (< 200 bytes), the byte loop is faster than
380        // memchr2 which has per-call SIMD setup overhead (~15ns).
381        while i < le && unsafe { !is_ws(*ptr.add(i)) } {
382            i += 1;
383        }
384        let wlen = i - word_start;
385
386        // Count trailing spaces
387        let space_start = i;
388        while i < le && unsafe { is_ws(*ptr.add(i)) } {
389            i += 1;
390        }
391        let space_count = i - space_start;
392
393        // Compute all flags in one pass
394        let wb = unsafe { std::slice::from_raw_parts(ptr.add(word_start), wlen) };
395        let mut flags = 0u32;
396
397        let in_sent_ctx = i >= le || space_count >= 2;
398        flags |= classify_word_punct(wb, in_sent_ctx);
399        if wlen > 0 && matches!(wb[0], b'(' | b'[' | b'{') {
400            flags |= PAREN_FLAG;
401        }
402
403        ctx.word_off.push(word_start as u32);
404        ctx.winfo.push((wlen as u32) | flags);
405    }
406}
407
408/// Classify a word's trailing punctuation in a single backward scan.
409/// Combines what was previously two separate functions (analyze_word_punct +
410/// is_sentence_end_contextual) into one pass, avoiding redundant byte scanning.
411/// Returns the appropriate flag bits (PERIOD_FLAG, SENT_FLAG, PUNCT_FLAG).
412#[inline(always)]
413fn classify_word_punct(bytes: &[u8], in_sentence_context: bool) -> u32 {
414    let mut i = bytes.len();
415    // Strip trailing quotes/parens
416    while i > 0 && matches!(bytes[i - 1], b'"' | b'\'' | b')' | b']') {
417        i -= 1;
418    }
419    if i == 0 {
420        return 0;
421    }
422    let c = bytes[i - 1];
423    if c == b'.' || c == b'!' || c == b'?' {
424        let mut flags = PERIOD_FLAG;
425        if in_sentence_context {
426            // Strip sentence-ending punctuation to find core word
427            let mut end = i;
428            while end > 0 && matches!(bytes[end - 1], b'.' | b'!' | b'?') {
429                end -= 1;
430            }
431            // Single uppercase letter = abbreviation, not sentence end
432            if !(end == 1 && bytes[0].is_ascii_uppercase()) && end > 0 {
433                flags |= SENT_FLAG;
434            }
435        }
436        flags
437    } else if c == b',' || c == b';' || c == b':' {
438        PUNCT_FLAG
439    } else {
440        0
441    }
442}
443
444/// Collect words from a paragraph and reflow in MAXWORDS-sized chunks.
445/// Combines word collection with chunked DP processing to avoid allocating
446/// a Vec of millions of words for huge single-paragraph files.
447#[allow(clippy::too_many_arguments)]
448fn collect_and_reflow_chunked(
449    bytes: &[u8],
450    region: &[u8],
451    start: usize,
452    prefix_filter: Option<&[u8]>,
453    prefix_out: &[u8],
454    first_indent: &[u8],
455    cont_indent: &[u8],
456    config: &FmtConfig,
457    output: &mut impl Write,
458    ctx: &mut FmtCtx,
459    max_words: usize,
460) -> io::Result<()> {
461    let rlen = region.len();
462    let pfx_len = prefix_filter.map_or(0, |p| p.len());
463    let mut pos = 0;
464    let mut is_first_chunk = true;
465
466    // DP buffers — lazily grown to actual word count, not pre-allocated to MAXWORDS.
467    let mut dp = DpBufs::new();
468
469    ctx.clear_words();
470
471    while pos < rlen {
472        let nl = memchr::memchr(b'\n', &region[pos..])
473            .map(|p| pos + p)
474            .unwrap_or(rlen);
475        let mut le = nl;
476        if le > pos && region[le - 1] == b'\r' {
477            le -= 1;
478        }
479        if le > pos {
480            let line_start = start + pos;
481            let line_end = start + le;
482
483            let ls = if pfx_len > 0 && le - pos >= pfx_len {
484                let pfx_bytes = prefix_filter.unwrap();
485                if &bytes[line_start..line_start + pfx_len] == pfx_bytes {
486                    line_start + pfx_len
487                } else {
488                    line_start
489                }
490            } else {
491                line_start
492            };
493
494            collect_words_line(bytes, ls, line_end, ctx);
495
496            // Flush chunks when we've accumulated enough words.
497            // Match GNU's approach: run DP, output all lines except the last
498            // partial line, keep the last line's words as overlap for the
499            // next chunk. This ensures natural line breaks at chunk boundaries.
500            while ctx.word_off.len() >= max_words {
501                // Mark last word of chunk as sentence-final
502                ctx.winfo[max_words - 1] |= SENT_FLAG | PERIOD_FLAG;
503
504                let fi = if is_first_chunk {
505                    first_indent
506                } else {
507                    cont_indent
508                };
509
510                // Run DP and output all lines except the last, return
511                // the index of the first word of the last line (kept for overlap).
512                let keep_from = reflow_chunk_partial(
513                    bytes,
514                    prefix_out,
515                    fi,
516                    cont_indent,
517                    config,
518                    output,
519                    &ctx.word_off[..max_words],
520                    &ctx.winfo[..max_words],
521                    &mut dp,
522                )?;
523
524                // Remove processed words, keep overlap from last line
525                let total = ctx.word_off.len();
526                let new_start = keep_from; // index within the chunk
527                let remaining_after_chunk = total - max_words;
528                let keep_count = max_words - new_start + remaining_after_chunk;
529
530                if keep_count > 0 && keep_count < total {
531                    // Shift: keep overlap words (new_start..max_words) + remaining (max_words..total)
532                    ctx.word_off.copy_within(new_start.., 0);
533                    ctx.winfo.copy_within(new_start.., 0);
534                    ctx.word_off.truncate(keep_count);
535                    ctx.winfo.truncate(keep_count);
536                    // Clear the sentence-final flag from what was the chunk boundary
537                    // (it's no longer the last word)
538                    let old_last = max_words - 1 - new_start;
539                    if old_last < ctx.winfo.len() {
540                        ctx.winfo[old_last] &= !(SENT_FLAG | PERIOD_FLAG);
541                    }
542                } else if keep_count == 0 {
543                    ctx.word_off.clear();
544                    ctx.winfo.clear();
545                }
546
547                is_first_chunk = false;
548            }
549        }
550        pos = nl + 1;
551    }
552
553    // Flush remaining words
554    let remaining = ctx.word_off.len();
555    if remaining > 0 {
556        // Mark last word as sentence-final
557        ctx.winfo[remaining - 1] |= SENT_FLAG | PERIOD_FLAG;
558
559        let fi = if is_first_chunk {
560            first_indent
561        } else {
562            cont_indent
563        };
564
565        reflow_chunk(
566            bytes,
567            prefix_out,
568            fi,
569            cont_indent,
570            config,
571            output,
572            &ctx.word_off[..remaining],
573            &ctx.winfo[..remaining],
574            &mut dp,
575        )?;
576    } else if is_first_chunk {
577        // No words collected at all
578        output.write_all(b"\n")?;
579    }
580
581    Ok(())
582}
583
584/// DP buffers — lazily grown to actual word count.
585/// Avoids the previous ~16MB upfront allocation for MAXWORDS=1,000,000.
586/// Also owns the output buffer for batched writes (avoids borrow conflicts
587/// when word_off/winfo are borrowed from FmtCtx while building output).
588struct DpBufs {
589    dp_cost: Vec<i64>,
590    best: Vec<u32>,
591    line_len: Vec<i32>,
592    out_buf: Vec<u8>,
593    break_cost: Vec<i64>,
594    word_len: Vec<u16>,
595    sep_width: Vec<u8>,
596}
597
598impl DpBufs {
599    fn new() -> Self {
600        Self {
601            dp_cost: Vec::with_capacity(257),
602            best: Vec::with_capacity(256),
603            line_len: Vec::with_capacity(257),
604            out_buf: Vec::with_capacity(8192),
605            break_cost: Vec::with_capacity(256),
606            word_len: Vec::with_capacity(256),
607            sep_width: Vec::with_capacity(256),
608        }
609    }
610
611    /// Ensure buffers are large enough for n words.
612    /// Uses unsafe set_len after reserve to avoid zeroing memory — the DP loop
613    /// always writes every element before reading it.
614    #[inline]
615    fn ensure_capacity(&mut self, n: usize) {
616        let needed = n + 1;
617        if self.dp_cost.len() < needed {
618            self.dp_cost.reserve(needed - self.dp_cost.len());
619            self.best.reserve(n.saturating_sub(self.best.len()));
620            self.line_len.reserve(needed - self.line_len.len());
621            // SAFETY: run_dp always writes dp_cost[0..=n], best[0..n-1], line_len[0..=n]
622            // before reading them. The values are all primitive types with no Drop.
623            unsafe {
624                self.dp_cost.set_len(needed);
625                self.best.set_len(n);
626                self.line_len.set_len(needed);
627            }
628        }
629    }
630}
631
632/// Reflow a chunk of words, outputting all lines EXCEPT the last one.
633/// Returns the index (within the chunk) of the first word of the last line.
634/// This allows the caller to keep those words as overlap for the next chunk,
635/// ensuring natural line breaks at chunk boundaries (matching GNU fmt behavior).
636///
637/// Avoids allocating a Vec<usize> for line_starts by walking the DP solution twice:
638/// first to find the last line start, then to output all lines before it.
639#[allow(clippy::too_many_arguments)]
640fn reflow_chunk_partial<W: Write>(
641    bytes: &[u8],
642    prefix: &[u8],
643    first_indent: &[u8],
644    cont_indent: &[u8],
645    config: &FmtConfig,
646    output: &mut W,
647    word_off: &[u32],
648    winfo: &[u32],
649    dp: &mut DpBufs,
650) -> io::Result<usize> {
651    let n = word_off.len();
652    if n == 0 {
653        return Ok(0);
654    }
655
656    // Run the full DP
657    run_dp(n, prefix, first_indent, cont_indent, config, winfo, dp);
658
659    // First pass: count lines and find the last line start
660    let mut line_count = 0usize;
661    let mut last_line_start = 0usize;
662    {
663        let mut i = 0;
664        while i < n {
665            line_count += 1;
666            last_line_start = i;
667            let j = dp.best[i] as usize;
668            i = j + 1;
669        }
670    }
671
672    if line_count <= 1 {
673        // Only one line in the chunk — output nothing, keep all words
674        return Ok(0);
675    }
676
677    // Second pass: output all lines except the last one
678    let out_buf = &mut dp.out_buf;
679    out_buf.clear();
680
681    let mut i = 0;
682    let mut li = 0;
683    while i < n {
684        let j = dp.best[i] as usize;
685        if i == last_line_start {
686            break; // Don't output the last line
687        }
688
689        out_buf.extend_from_slice(prefix);
690        if li == 0 {
691            out_buf.extend_from_slice(first_indent);
692        } else {
693            out_buf.extend_from_slice(cont_indent);
694        }
695
696        let off = word_off[i] as usize;
697        let wlen = (winfo[i] & 0xFFFF) as usize;
698        out_buf.extend_from_slice(&bytes[off..off + wlen]);
699
700        for k in (i + 1)..=j {
701            if winfo[k - 1] & SENT_FLAG != 0 {
702                out_buf.extend_from_slice(b"  ");
703            } else {
704                out_buf.push(b' ');
705            }
706            let off = word_off[k] as usize;
707            let wlen = (winfo[k] & 0xFFFF) as usize;
708            out_buf.extend_from_slice(&bytes[off..off + wlen]);
709        }
710        out_buf.push(b'\n');
711
712        li += 1;
713        i = j + 1;
714    }
715
716    // Single write for all output lines
717    output.write_all(out_buf)?;
718
719    // Return the index of the first word of the last line (for overlap)
720    Ok(last_line_start)
721}
722
723/// Run the backward DP pass on n words, filling dp.dp_cost, dp.best, dp.line_len.
724///
725/// Optimization: break costs at position j only depend on winfo[j-1], winfo[j], winfo[j+1],
726/// so we pre-compute them once in O(n) before the main O(n*W) DP loop. This removes all
727/// flag-checking branches from the hot inner loop, where each j is visited by ~W different
728/// values of i (W = words per line, typically 10-15).
729#[allow(clippy::too_many_arguments)]
730fn run_dp(
731    n: usize,
732    prefix: &[u8],
733    first_indent: &[u8],
734    cont_indent: &[u8],
735    config: &FmtConfig,
736    winfo: &[u32],
737    dp: &mut DpBufs,
738) {
739    let first_base = prefix.len() + first_indent.len();
740    let cont_base = prefix.len() + cont_indent.len();
741    let goal = config.goal as i64;
742    let width = config.width;
743
744    const SHORT_FACTOR: i64 = 100;
745    const RAGGED_FACTOR: i64 = 50;
746    const LINE_COST: i64 = 70 * 70;
747    const SENTENCE_BONUS: i64 = 50 * 50;
748    const NOBREAK_COST: i64 = 600 * 600;
749    const PUNCT_BONUS: i64 = 40 * 40;
750    const PAREN_BONUS: i64 = 40 * 40;
751
752    dp.ensure_capacity(n);
753    unsafe {
754        std::ptr::write_bytes(dp.dp_cost.as_mut_ptr(), 0xFF, n + 1);
755    }
756    dp.dp_cost[n] = 0;
757
758    // Pre-compute break costs: bc[j] = cost of breaking after word j.
759    // Only depends on winfo[j-1..=j+1], invariant across different line-start i.
760    // bc[n-1] = 0 (last word = end of paragraph, no break cost).
761    let bc = &mut dp.break_cost;
762    if bc.len() < n {
763        bc.resize(n, 0);
764    }
765    let winfo_ptr = winfo.as_ptr();
766    unsafe {
767        *bc.as_mut_ptr().add(n - 1) = 0;
768    }
769    for j in 0..n.saturating_sub(1) {
770        let wj = unsafe { *winfo_ptr.add(j) };
771        let wj1 = unsafe { *winfo_ptr.add(j + 1) };
772        let mut cost = LINE_COST;
773
774        if wj & PERIOD_FLAG != 0 {
775            if wj & SENT_FLAG != 0 {
776                cost -= SENTENCE_BONUS;
777            } else {
778                cost += NOBREAK_COST;
779            }
780        } else if wj & PUNCT_FLAG != 0 {
781            cost -= PUNCT_BONUS;
782        } else if j > 0 {
783            let wjm1 = unsafe { *winfo_ptr.add(j - 1) };
784            if wjm1 & SENT_FLAG != 0 {
785                let wl = ((wj & 0xFFFF) as usize).min(LUT_SIZE - 1);
786                cost += DIV40K[wl];
787            }
788        }
789
790        if wj1 & PAREN_FLAG != 0 {
791            cost -= PAREN_BONUS;
792        } else if wj1 & SENT_FLAG != 0 {
793            let wl = ((wj1 & 0xFFFF) as usize).min(LUT_SIZE - 1);
794            cost += DIV22K[wl];
795        }
796
797        unsafe {
798            *bc.as_mut_ptr().add(j) = cost;
799        }
800    }
801
802    // Pre-extract word lengths and separator widths to avoid repeated mask/flag ops
803    let word_len = &mut dp.word_len;
804    if word_len.len() < n {
805        word_len.resize(n, 0);
806    }
807    let sep_w = &mut dp.sep_width;
808    if sep_w.len() < n {
809        sep_w.resize(n, 0);
810    }
811    for j in 0..n {
812        let w = unsafe { *winfo_ptr.add(j) };
813        unsafe {
814            *word_len.as_mut_ptr().add(j) = (w & 0xFFFF) as u16;
815            *sep_w.as_mut_ptr().add(j) = if j > 0 && (*winfo_ptr.add(j - 1) & SENT_FLAG != 0) {
816                2u8
817            } else {
818                1u8
819            };
820        }
821    }
822
823    let dp_cost_ptr = dp.dp_cost.as_mut_ptr();
824    let best_ptr = dp.best.as_mut_ptr();
825    let line_len_ptr = dp.line_len.as_mut_ptr();
826    let bc_ptr = bc.as_ptr();
827    let wl_ptr = word_len.as_ptr();
828    let sw_ptr = sep_w.as_ptr();
829    let nm1 = n - 1;
830
831    for i in (0..n).rev() {
832        let base = if i == 0 { first_base } else { cont_base };
833        let mut len = base + unsafe { *wl_ptr.add(i) } as usize;
834        let mut best_total = i64::MAX;
835        let mut best_j = i as u32;
836        let mut best_len = len as i32;
837
838        // Inner loop: try each possible line ending j from i..n
839        // Break costs and word lengths are pre-computed — the hot path is
840        // just arithmetic (two squarings + additions + comparison).
841        for j in i..n {
842            if j > i {
843                len += unsafe { *sw_ptr.add(j) } as usize + unsafe { *wl_ptr.add(j) } as usize;
844            }
845
846            if len >= width {
847                if j == i {
848                    // Single word exceeds width — must accept it
849                    let cj1 = unsafe { *dp_cost_ptr.add(j + 1) };
850                    if cj1 >= 0 {
851                        best_total = cj1;
852                        best_j = j as u32;
853                        best_len = len as i32;
854                    }
855                }
856                break;
857            }
858
859            // Line cost: 0 for last line of paragraph, else shortfall + raggedness
860            let lc = if j == nm1 {
861                0i64
862            } else {
863                let short_n = goal - len as i64;
864                let short_cost = short_n * short_n * SHORT_FACTOR;
865                let next_best = unsafe { *best_ptr.add(j + 1) };
866                let ragged_cost = if (next_best as usize + 1) < n {
867                    let ragged_n = len as i64 - unsafe { *line_len_ptr.add(j + 1) } as i64;
868                    ragged_n * ragged_n * RAGGED_FACTOR
869                } else {
870                    0
871                };
872                short_cost + ragged_cost
873            };
874
875            let cj1 = unsafe { *dp_cost_ptr.add(j + 1) };
876            if cj1 >= 0 {
877                let total = lc + unsafe { *bc_ptr.add(j) } + cj1;
878                if total < best_total {
879                    best_total = total;
880                    best_j = j as u32;
881                    best_len = len as i32;
882                }
883            }
884        }
885
886        if best_total < i64::MAX {
887            unsafe {
888                *dp_cost_ptr.add(i) = best_total;
889                *best_ptr.add(i) = best_j;
890                *line_len_ptr.add(i) = best_len;
891            }
892        }
893    }
894}
895
896/// Reflow a chunk of words using pre-allocated DP buffers (outputs all lines).
897/// Builds each line in a reusable buffer, writes once per line.
898/// The downstream BufWriter handles batching of small writes into large syscalls.
899#[allow(clippy::too_many_arguments)]
900fn reflow_chunk<W: Write>(
901    bytes: &[u8],
902    prefix: &[u8],
903    first_indent: &[u8],
904    cont_indent: &[u8],
905    config: &FmtConfig,
906    output: &mut W,
907    word_off: &[u32],
908    winfo: &[u32],
909    dp: &mut DpBufs,
910) -> io::Result<()> {
911    let n = word_off.len();
912    if n == 0 {
913        return Ok(());
914    }
915
916    let out_buf = &mut dp.out_buf;
917    out_buf.clear();
918
919    // Fast path: if all words fit on one line, skip DP entirely.
920    // Compute total width = prefix + indent + sum(word_len) + separators.
921    let base = prefix.len() + first_indent.len();
922    let mut total_width = base;
923    for j in 0..n {
924        let wl = (winfo[j] & 0xFFFF) as usize;
925        if j > 0 {
926            total_width += if winfo[j - 1] & SENT_FLAG != 0 { 2 } else { 1 };
927        }
928        total_width += wl;
929    }
930    if total_width <= config.width {
931        // Single line — output directly without DP
932        out_buf.extend_from_slice(prefix);
933        out_buf.extend_from_slice(first_indent);
934        let off = word_off[0] as usize;
935        let wlen = (winfo[0] & 0xFFFF) as usize;
936        out_buf.extend_from_slice(&bytes[off..off + wlen]);
937        for k in 1..n {
938            if winfo[k - 1] & SENT_FLAG != 0 {
939                out_buf.extend_from_slice(b"  ");
940            } else {
941                out_buf.push(b' ');
942            }
943            let off = word_off[k] as usize;
944            let wlen = (winfo[k] & 0xFFFF) as usize;
945            out_buf.extend_from_slice(&bytes[off..off + wlen]);
946        }
947        out_buf.push(b'\n');
948        return output.write_all(out_buf);
949    }
950
951    run_dp(n, prefix, first_indent, cont_indent, config, winfo, dp);
952
953    // Build all output lines in one buffer, then write once.
954    let out_buf = &mut dp.out_buf;
955
956    let mut i = 0;
957    let mut is_first_line = true;
958    while i < n {
959        let j = dp.best[i] as usize;
960        out_buf.extend_from_slice(prefix);
961        if is_first_line {
962            out_buf.extend_from_slice(first_indent);
963        } else {
964            out_buf.extend_from_slice(cont_indent);
965        }
966        let off = word_off[i] as usize;
967        let wlen = (winfo[i] & 0xFFFF) as usize;
968        out_buf.extend_from_slice(&bytes[off..off + wlen]);
969        for k in (i + 1)..=j {
970            if winfo[k - 1] & SENT_FLAG != 0 {
971                out_buf.extend_from_slice(b"  ");
972            } else {
973                out_buf.push(b' ');
974            }
975            let off = word_off[k] as usize;
976            let wlen = (winfo[k] & 0xFFFF) as usize;
977            out_buf.extend_from_slice(&bytes[off..off + wlen]);
978        }
979        out_buf.push(b'\n');
980        is_first_line = false;
981        i = j + 1;
982    }
983
984    output.write_all(out_buf)
985}
986
987/// Split a single input line using the optimal paragraph algorithm.
988/// Used in split-only mode (-s): short lines are preserved as-is,
989/// long lines are broken optimally (same algorithm as normal reflow).
990/// Works entirely on &[u8] — no UTF-8 conversion.
991#[allow(clippy::too_many_arguments)]
992fn split_line_optimal<W: Write>(
993    bytes: &[u8],
994    line_start: usize,
995    line_end: usize,
996    config: &FmtConfig,
997    prefix: Option<&[u8]>,
998    output: &mut W,
999    ctx: &mut FmtCtx,
1000) -> io::Result<()> {
1001    let line_len = line_end - line_start;
1002    let pfx = prefix.unwrap_or(b"");
1003
1004    // Short line: output as-is (no splitting needed).
1005    if line_len < config.width {
1006        output.write_all(&bytes[line_start..line_end])?;
1007        output.write_all(b"\n")?;
1008        return Ok(());
1009    }
1010
1011    // Strip prefix for word collection
1012    let content_start = match prefix {
1013        Some(pfx_bytes) if bytes[line_start..line_end].starts_with(pfx_bytes) => {
1014            line_start + pfx_bytes.len()
1015        }
1016        _ => line_start,
1017    };
1018
1019    let indent_len = leading_ws_len(&bytes[content_start..line_end]);
1020    let indent = &bytes[content_start..content_start + indent_len];
1021
1022    ctx.clear_words();
1023    collect_words_line(bytes, content_start, line_end, ctx);
1024
1025    if ctx.word_off.is_empty() {
1026        output.write_all(&bytes[line_start..line_end])?;
1027        output.write_all(b"\n")?;
1028        return Ok(());
1029    }
1030
1031    // Mark last word as sentence-final
1032    let last = ctx.winfo.len() - 1;
1033    ctx.winfo[last] |= SENT_FLAG | PERIOD_FLAG;
1034
1035    // Use shared DP infrastructure instead of duplicated reflow_paragraph
1036    let n = ctx.word_off.len();
1037    let mut dp = DpBufs::new();
1038
1039    run_dp(n, pfx, indent, indent, config, &ctx.winfo, &mut dp);
1040
1041    // Build and write output line by line
1042    let line_buf = &mut dp.out_buf;
1043    let mut i = 0;
1044    while i < n {
1045        let j = dp.best[i] as usize;
1046        line_buf.clear();
1047        line_buf.extend_from_slice(pfx);
1048        line_buf.extend_from_slice(indent);
1049        let off = ctx.word_off[i] as usize;
1050        let wlen = (ctx.winfo[i] & 0xFFFF) as usize;
1051        line_buf.extend_from_slice(&bytes[off..off + wlen]);
1052        for k in (i + 1)..=j {
1053            if ctx.winfo[k - 1] & SENT_FLAG != 0 {
1054                line_buf.extend_from_slice(b"  ");
1055            } else {
1056                line_buf.push(b' ');
1057            }
1058            let off = ctx.word_off[k] as usize;
1059            let wlen = (ctx.winfo[k] & 0xFFFF) as usize;
1060            line_buf.extend_from_slice(&bytes[off..off + wlen]);
1061        }
1062        line_buf.push(b'\n');
1063        output.write_all(line_buf)?;
1064        i = j + 1;
1065    }
1066
1067    Ok(())
1068}