Skip to main content

coreutils_rs/fmt/
core.rs

1use std::io::{self, Read, Write};
2
3/// Configuration for the fmt command.
4pub struct FmtConfig {
5    /// Maximum line width (default 75).
6    pub width: usize,
7    /// Goal width for line filling (default 93% of width).
8    pub goal: usize,
9    /// Only split long lines, do not refill short lines.
10    pub split_only: bool,
11    /// Crown margin mode: preserve the indentation of the first two lines.
12    pub crown_margin: bool,
13    /// Tagged paragraph mode: first line indentation differs from subsequent lines.
14    pub tagged: bool,
15    /// Uniform spacing: one space between words, two after sentence-ending punctuation.
16    pub uniform_spacing: bool,
17    /// Only reformat lines beginning with this prefix.
18    pub prefix: Option<String>,
19}
20
21impl Default for FmtConfig {
22    fn default() -> Self {
23        let width = 75;
24        Self {
25            width,
26            goal: (width * 187) / 200,
27            split_only: false,
28            crown_margin: false,
29            tagged: false,
30            uniform_spacing: false,
31            prefix: None,
32        }
33    }
34}
35
36/// 256-byte lookup table: 1 = ASCII whitespace (\t \n \x0B \x0C \r \x20), 0 = non-whitespace.
37/// Using a lookup table avoids branch-heavy `is_ascii_whitespace()` per byte.
38static WS_TABLE: [u8; 256] = {
39    let mut t = [0u8; 256];
40    t[b'\t' as usize] = 1;
41    t[b'\n' as usize] = 1;
42    t[0x0B] = 1; // vertical tab
43    t[0x0C] = 1; // form feed
44    t[b'\r' as usize] = 1;
45    t[b' ' as usize] = 1;
46    t
47};
48
49/// Fast whitespace check using lookup table.
50#[inline(always)]
51fn is_ws(b: u8) -> bool {
52    // SAFETY: b as usize is always in 0..256
53    unsafe { *WS_TABLE.get_unchecked(b as usize) != 0 }
54}
55
56/// Word flags for GNU fmt cost model.
57/// Packed into the upper bits of the winfo u32 array for cache efficiency.
58const SENT_FLAG: u32 = 1 << 16; // sentence-final (period + double-space/eol context)
59const PERIOD_FLAG: u32 = 1 << 17; // has sentence-ending punct (.!?) regardless of context
60const PUNCT_FLAG: u32 = 1 << 18; // ends with non-period punctuation (,;:)
61const PAREN_FLAG: u32 = 1 << 19; // starts with opening paren/bracket
62
63/// Precomputed division tables to avoid expensive integer division in the DP inner loop.
64/// div40k[len] = 40000 / (len + 2), div22k[len] = 22500 / (len + 2).
65/// Word lengths above 126 are clamped (extremely rare, cost difference negligible).
66const LUT_SIZE: usize = 128;
67
68static DIV40K: [i64; LUT_SIZE] = {
69    let mut t = [0i64; LUT_SIZE];
70    let mut k = 0;
71    while k < LUT_SIZE {
72        t[k] = 40000 / (k as i64 + 2);
73        k += 1;
74    }
75    t
76};
77
78static DIV22K: [i64; LUT_SIZE] = {
79    let mut t = [0i64; LUT_SIZE];
80    let mut k = 0;
81    while k < LUT_SIZE {
82        t[k] = 22500 / (k as i64 + 2);
83        k += 1;
84    }
85    t
86};
87
88/// Reusable buffers for the entire formatting session.
89/// All data is offset-based (no borrowed references), enabling reuse across paragraphs.
90struct FmtCtx {
91    /// Word byte offsets into the source text. One u32 per word.
92    word_off: Vec<u32>,
93    /// Packed word info: bits 0-15 = length, bits 16-19 = flags.
94    /// Parallel to word_off. Built once during word collection, used directly in DP.
95    winfo: Vec<u32>,
96}
97
98impl FmtCtx {
99    fn new() -> Self {
100        Self {
101            word_off: Vec::with_capacity(256),
102            winfo: Vec::with_capacity(256),
103        }
104    }
105
106    #[inline(always)]
107    fn clear_words(&mut self) {
108        self.word_off.clear();
109        self.winfo.clear();
110    }
111}
112
113/// Reformat text from `input` and write the result to `output`.
114///
115/// Text is processed paragraph by paragraph in a streaming fashion.
116/// Each paragraph is formatted and written immediately, avoiding holding
117/// the entire file in memory.
118pub fn fmt_file<R: Read, W: Write>(
119    mut input: R,
120    output: &mut W,
121    config: &FmtConfig,
122) -> io::Result<()> {
123    let mut data = Vec::new();
124    input.read_to_end(&mut data)?;
125    fmt_data(&data, output, config)
126}
127
128/// Format in-memory data. Works on byte slices directly — no UTF-8 validation pass.
129/// The formatter only inspects ASCII whitespace/punctuation, so raw bytes are fine.
130pub fn fmt_data(data: &[u8], output: &mut impl Write, config: &FmtConfig) -> io::Result<()> {
131    // Work directly on bytes — no UTF-8 validation needed.
132    // The formatter only examines ASCII whitespace and punctuation characters.
133    // Non-ASCII bytes are passed through verbatim (zero-copy).
134    fmt_bytes(data, output, config)
135}
136
137/// Check if a byte range is all whitespace using our fast lookup table.
138#[inline(always)]
139fn is_blank_bytes(bytes: &[u8]) -> bool {
140    for &b in bytes {
141        if !is_ws(b) {
142            return false;
143        }
144    }
145    true
146}
147
148/// Format raw bytes, processing paragraph by paragraph with zero-copy word extraction.
149/// Operates entirely on &[u8] — no UTF-8 conversion or validation at any point.
150/// Uses memchr_iter for SIMD-accelerated newline scanning across the entire input.
151fn fmt_bytes(data: &[u8], output: &mut impl Write, config: &FmtConfig) -> io::Result<()> {
152    let prefix_bytes = config.prefix.as_deref().map(|s| s.as_bytes());
153    let mut para_start = 0;
154    let blen = data.len();
155    let mut ctx = FmtCtx::new();
156    let mut line_start = 0;
157
158    // Use memchr_iter for SIMD-accelerated bulk newline scanning.
159    // This finds all newlines in one pass, avoiding repeated memchr calls
160    // that each re-scan overlapping regions.
161    for nl_pos in memchr::memchr_iter(b'\n', data) {
162        let line_end = nl_pos;
163
164        // Effective line end (strip \r)
165        let le = if line_end > line_start && data[line_end - 1] == b'\r' {
166            line_end - 1
167        } else {
168            line_end
169        };
170
171        // Handle prefix filter
172        if let Some(pfx) = prefix_bytes {
173            if !data[line_start..le].starts_with(pfx) {
174                // Flush current paragraph
175                if para_start < line_start {
176                    format_paragraph(data, para_start, line_start, config, output, &mut ctx)?;
177                }
178                para_start = nl_pos + 1;
179                // Emit verbatim
180                output.write_all(&data[line_start..le])?;
181                output.write_all(b"\n")?;
182                line_start = nl_pos + 1;
183                continue;
184            }
185        }
186
187        // Fast blank-line check using WS lookup table (no allocation)
188        if is_blank_bytes(&data[line_start..le]) {
189            // Blank line = paragraph boundary
190            if para_start < line_start {
191                format_paragraph(data, para_start, line_start, config, output, &mut ctx)?;
192            }
193            output.write_all(b"\n")?;
194            para_start = nl_pos + 1;
195        }
196
197        line_start = nl_pos + 1;
198    }
199
200    // Handle last line (no trailing newline) and flush remaining paragraph
201    if line_start < blen {
202        // There's content after the last newline (no trailing newline)
203        // It's part of the current paragraph
204    }
205
206    if para_start < blen {
207        // Check if remaining content is non-empty (ignoring trailing newlines)
208        let mut end = blen;
209        while end > para_start && data[end - 1] == b'\n' {
210            end -= 1;
211        }
212        if end > para_start {
213            format_paragraph(data, para_start, blen, config, output, &mut ctx)?;
214        }
215    }
216
217    Ok(())
218}
219
220/// Format a paragraph from a region of the source bytes [start..end).
221/// Uses single-pass word extraction with offset-based storage.
222/// All flags are computed once during word collection, eliminating double punctuation analysis.
223fn format_paragraph(
224    bytes: &[u8],
225    start: usize,
226    end: usize,
227    config: &FmtConfig,
228    output: &mut impl Write,
229    ctx: &mut FmtCtx,
230) -> io::Result<()> {
231    let region = &bytes[start..end];
232    let prefix_bytes = config.prefix.as_deref().map(|s| s.as_bytes());
233
234    // Single-pass line extraction using memchr for indentation analysis.
235    // We need the first two non-empty lines for indent detection.
236    let mut first_line: Option<(usize, usize)> = None; // (start_offset, end_offset) relative to `bytes`
237    let mut second_line: Option<(usize, usize)> = None;
238    {
239        let rlen = region.len();
240        let mut pos = 0;
241        while pos < rlen {
242            let nl = memchr::memchr(b'\n', &region[pos..])
243                .map(|p| pos + p)
244                .unwrap_or(rlen);
245            let mut le = nl;
246            if le > pos && region[le - 1] == b'\r' {
247                le -= 1;
248            }
249            if le > pos {
250                let line_range = (start + pos, start + le);
251                if first_line.is_none() {
252                    first_line = Some(line_range);
253                } else if second_line.is_none() {
254                    second_line = Some(line_range);
255                    break; // Only need first two lines for indent analysis
256                }
257            }
258            pos = nl + 1;
259        }
260    }
261
262    let (fl_start, fl_end) = match first_line {
263        Some(r) => r,
264        None => return Ok(()),
265    };
266
267    // Strip prefix from first line for indent analysis
268    let stripped_first_start = match prefix_bytes {
269        Some(pfx) if bytes[fl_start..fl_end].starts_with(pfx) => fl_start + pfx.len(),
270        _ => fl_start,
271    };
272
273    // Strip prefix from second line for indent analysis
274    let (stripped_second_start, stripped_second_end) = match second_line {
275        Some((s, e)) => match prefix_bytes {
276            Some(pfx) if bytes[s..e].starts_with(pfx) => (s + pfx.len(), e),
277            _ => (s, e),
278        },
279        None => (stripped_first_start, fl_end),
280    };
281
282    let first_indent_len = leading_ws_len(&bytes[stripped_first_start..fl_end]);
283    let rest_indent_len = leading_ws_len(&bytes[stripped_second_start..stripped_second_end]);
284
285    // Extract indent bytes (zero-copy slices from source)
286    let first_indent = &bytes[stripped_first_start..stripped_first_start + first_indent_len];
287    let rest_indent = &bytes[stripped_second_start..stripped_second_start + rest_indent_len];
288
289    let (first_line_indent, cont_indent) = if config.tagged || config.crown_margin {
290        (first_indent, rest_indent)
291    } else {
292        (first_indent, first_indent)
293    };
294
295    if config.split_only {
296        // Split-only mode: process each line independently.
297        let rlen = region.len();
298        let mut pos = 0;
299        while pos < rlen {
300            let nl = memchr::memchr(b'\n', &region[pos..])
301                .map(|p| pos + p)
302                .unwrap_or(rlen);
303            let mut le = nl;
304            if le > pos && region[le - 1] == b'\r' {
305                le -= 1;
306            }
307            if le > pos {
308                let line_start = start + pos;
309                let line_end = start + le;
310                split_line_optimal(
311                    bytes,
312                    line_start,
313                    line_end,
314                    config,
315                    prefix_bytes,
316                    output,
317                    ctx,
318                )?;
319            }
320            pos = nl + 1;
321        }
322        return Ok(());
323    }
324
325    let pfx = prefix_bytes.unwrap_or(b"");
326
327    // GNU fmt's MAXWORDS is 1,000,000 — process the entire paragraph in one shot.
328    const MAXWORDS: usize = 1_000_000;
329
330    // Streaming word collection + chunked DP: collect words in MAXWORDS-sized
331    // batches and process each chunk immediately. This avoids allocating a
332    // Vec of 1.5M+ words for huge single-paragraph files.
333    collect_and_reflow_chunked(
334        bytes,
335        region,
336        start,
337        prefix_bytes,
338        pfx,
339        first_line_indent,
340        cont_indent,
341        config,
342        output,
343        ctx,
344        MAXWORDS,
345    )
346}
347
348/// Determine the length of leading ASCII whitespace in a byte slice.
349/// Replaces the old `leading_indent(&str) -> &str` — works on raw bytes,
350/// no UTF-8 requirement.
351#[inline(always)]
352fn leading_ws_len(bytes: &[u8]) -> usize {
353    let mut i = 0;
354    while i < bytes.len() && is_ws(bytes[i]) && bytes[i] != b'\n' {
355        i += 1;
356    }
357    i
358}
359
360/// Collect words from a single line [ls..le) in the source bytes.
361/// Computes all flags (SENT, PERIOD, PUNCT, PAREN) during collection.
362/// Uses a simple byte loop for word boundary detection — for typical line
363/// lengths (< 200 bytes), this is faster than memchr2's SIMD setup overhead
364/// which dominates for short slices.
365#[inline(always)]
366fn collect_words_line(bytes: &[u8], ls: usize, le: usize, ctx: &mut FmtCtx) {
367    let ptr = bytes.as_ptr();
368    let mut i = ls;
369
370    // Skip leading whitespace
371    while i < le && unsafe { is_ws(*ptr.add(i)) } {
372        i += 1;
373    }
374
375    while i < le {
376        let word_start = i;
377
378        // Find end of word using direct byte scan.
379        // For typical line lengths (< 200 bytes), the byte loop is faster than
380        // memchr2 which has per-call SIMD setup overhead (~15ns).
381        while i < le && unsafe { !is_ws(*ptr.add(i)) } {
382            i += 1;
383        }
384        let wlen = i - word_start;
385
386        // Count trailing spaces
387        let space_start = i;
388        while i < le && unsafe { is_ws(*ptr.add(i)) } {
389            i += 1;
390        }
391        let space_count = i - space_start;
392
393        // Compute all flags in one pass
394        let wb = unsafe { std::slice::from_raw_parts(ptr.add(word_start), wlen) };
395        let mut flags = 0u32;
396
397        let in_sent_ctx = i >= le || space_count >= 2;
398        flags |= classify_word_punct(wb, in_sent_ctx);
399        if wlen > 0 && matches!(wb[0], b'(' | b'[' | b'{') {
400            flags |= PAREN_FLAG;
401        }
402
403        ctx.word_off.push(word_start as u32);
404        ctx.winfo.push((wlen as u32) | flags);
405    }
406}
407
408/// Classify a word's trailing punctuation in a single backward scan.
409/// Combines what was previously two separate functions (analyze_word_punct +
410/// is_sentence_end_contextual) into one pass, avoiding redundant byte scanning.
411/// Returns the appropriate flag bits (PERIOD_FLAG, SENT_FLAG, PUNCT_FLAG).
412#[inline(always)]
413fn classify_word_punct(bytes: &[u8], in_sentence_context: bool) -> u32 {
414    let mut i = bytes.len();
415    // Strip trailing quotes/parens
416    while i > 0 && matches!(bytes[i - 1], b'"' | b'\'' | b')' | b']') {
417        i -= 1;
418    }
419    if i == 0 {
420        return 0;
421    }
422    let c = bytes[i - 1];
423    if c == b'.' || c == b'!' || c == b'?' {
424        let mut flags = PERIOD_FLAG;
425        if in_sentence_context {
426            // Strip sentence-ending punctuation to find core word
427            let mut end = i;
428            while end > 0 && matches!(bytes[end - 1], b'.' | b'!' | b'?') {
429                end -= 1;
430            }
431            // Single uppercase letter = abbreviation, not sentence end
432            if !(end == 1 && bytes[0].is_ascii_uppercase()) && end > 0 {
433                flags |= SENT_FLAG;
434            }
435        }
436        flags
437    } else if c == b',' || c == b';' || c == b':' {
438        PUNCT_FLAG
439    } else {
440        0
441    }
442}
443
444/// Collect words from a paragraph and reflow in MAXWORDS-sized chunks.
445/// Combines word collection with chunked DP processing to avoid allocating
446/// a Vec of millions of words for huge single-paragraph files.
447#[allow(clippy::too_many_arguments)]
448fn collect_and_reflow_chunked(
449    bytes: &[u8],
450    region: &[u8],
451    start: usize,
452    prefix_filter: Option<&[u8]>,
453    prefix_out: &[u8],
454    first_indent: &[u8],
455    cont_indent: &[u8],
456    config: &FmtConfig,
457    output: &mut impl Write,
458    ctx: &mut FmtCtx,
459    max_words: usize,
460) -> io::Result<()> {
461    let rlen = region.len();
462    let pfx_len = prefix_filter.map_or(0, |p| p.len());
463    let mut pos = 0;
464    let mut is_first_chunk = true;
465
466    // DP buffers — lazily grown to actual word count, not pre-allocated to MAXWORDS.
467    let mut dp = DpBufs::new();
468
469    ctx.clear_words();
470
471    while pos < rlen {
472        let nl = memchr::memchr(b'\n', &region[pos..])
473            .map(|p| pos + p)
474            .unwrap_or(rlen);
475        let mut le = nl;
476        if le > pos && region[le - 1] == b'\r' {
477            le -= 1;
478        }
479        if le > pos {
480            let line_start = start + pos;
481            let line_end = start + le;
482
483            let ls = if pfx_len > 0 && le - pos >= pfx_len {
484                let pfx_bytes = prefix_filter.unwrap();
485                if &bytes[line_start..line_start + pfx_len] == pfx_bytes {
486                    line_start + pfx_len
487                } else {
488                    line_start
489                }
490            } else {
491                line_start
492            };
493
494            collect_words_line(bytes, ls, line_end, ctx);
495
496            // Flush chunks when we've accumulated enough words.
497            // Match GNU's approach: run DP, output all lines except the last
498            // partial line, keep the last line's words as overlap for the
499            // next chunk. This ensures natural line breaks at chunk boundaries.
500            while ctx.word_off.len() >= max_words {
501                // Mark last word of chunk as sentence-final
502                ctx.winfo[max_words - 1] |= SENT_FLAG | PERIOD_FLAG;
503
504                let fi = if is_first_chunk {
505                    first_indent
506                } else {
507                    cont_indent
508                };
509
510                // Run DP and output all lines except the last, return
511                // the index of the first word of the last line (kept for overlap).
512                let keep_from = reflow_chunk_partial(
513                    bytes,
514                    prefix_out,
515                    fi,
516                    cont_indent,
517                    config,
518                    output,
519                    &ctx.word_off[..max_words],
520                    &ctx.winfo[..max_words],
521                    &mut dp,
522                )?;
523
524                // Remove processed words, keep overlap from last line
525                let total = ctx.word_off.len();
526                let new_start = keep_from; // index within the chunk
527                let remaining_after_chunk = total - max_words;
528                let keep_count = max_words - new_start + remaining_after_chunk;
529
530                if keep_count > 0 && keep_count < total {
531                    // Shift: keep overlap words (new_start..max_words) + remaining (max_words..total)
532                    ctx.word_off.copy_within(new_start.., 0);
533                    ctx.winfo.copy_within(new_start.., 0);
534                    ctx.word_off.truncate(keep_count);
535                    ctx.winfo.truncate(keep_count);
536                    // Clear the sentence-final flag from what was the chunk boundary
537                    // (it's no longer the last word)
538                    let old_last = max_words - 1 - new_start;
539                    if old_last < ctx.winfo.len() {
540                        ctx.winfo[old_last] &= !(SENT_FLAG | PERIOD_FLAG);
541                    }
542                } else if keep_count == 0 {
543                    ctx.word_off.clear();
544                    ctx.winfo.clear();
545                }
546
547                is_first_chunk = false;
548            }
549        }
550        pos = nl + 1;
551    }
552
553    // Flush remaining words
554    let remaining = ctx.word_off.len();
555    if remaining > 0 {
556        // Mark last word as sentence-final
557        ctx.winfo[remaining - 1] |= SENT_FLAG | PERIOD_FLAG;
558
559        let fi = if is_first_chunk {
560            first_indent
561        } else {
562            cont_indent
563        };
564
565        reflow_chunk(
566            bytes,
567            prefix_out,
568            fi,
569            cont_indent,
570            config,
571            output,
572            &ctx.word_off[..remaining],
573            &ctx.winfo[..remaining],
574            &mut dp,
575        )?;
576    } else if is_first_chunk {
577        // No words collected at all
578        output.write_all(b"\n")?;
579    }
580
581    Ok(())
582}
583
584/// DP buffers — lazily grown to actual word count.
585/// Avoids the previous ~16MB upfront allocation for MAXWORDS=1,000,000.
586/// Also owns the output buffer for batched writes (avoids borrow conflicts
587/// when word_off/winfo are borrowed from FmtCtx while building output).
588struct DpBufs {
589    dp_cost: Vec<i64>,
590    best: Vec<u32>,
591    line_len: Vec<i32>,
592    out_buf: Vec<u8>,
593}
594
595impl DpBufs {
596    fn new() -> Self {
597        // Pre-allocate for typical paragraph sizes (256 words covers most paragraphs).
598        // This avoids repeated small reallocations for the first few paragraphs.
599        Self {
600            dp_cost: Vec::with_capacity(257),
601            best: Vec::with_capacity(256),
602            line_len: Vec::with_capacity(257),
603            out_buf: Vec::with_capacity(8192),
604        }
605    }
606
607    /// Ensure buffers are large enough for n words.
608    /// Uses unsafe set_len after reserve to avoid zeroing memory — the DP loop
609    /// always writes every element before reading it.
610    #[inline]
611    fn ensure_capacity(&mut self, n: usize) {
612        let needed = n + 1;
613        if self.dp_cost.len() < needed {
614            self.dp_cost.reserve(needed - self.dp_cost.len());
615            self.best.reserve(n.saturating_sub(self.best.len()));
616            self.line_len.reserve(needed - self.line_len.len());
617            // SAFETY: run_dp always writes dp_cost[0..=n], best[0..n-1], line_len[0..=n]
618            // before reading them. The values are all primitive types with no Drop.
619            unsafe {
620                self.dp_cost.set_len(needed);
621                self.best.set_len(n);
622                self.line_len.set_len(needed);
623            }
624        }
625    }
626}
627
628/// Reflow a chunk of words, outputting all lines EXCEPT the last one.
629/// Returns the index (within the chunk) of the first word of the last line.
630/// This allows the caller to keep those words as overlap for the next chunk,
631/// ensuring natural line breaks at chunk boundaries (matching GNU fmt behavior).
632///
633/// Avoids allocating a Vec<usize> for line_starts by walking the DP solution twice:
634/// first to find the last line start, then to output all lines before it.
635#[allow(clippy::too_many_arguments)]
636fn reflow_chunk_partial<W: Write>(
637    bytes: &[u8],
638    prefix: &[u8],
639    first_indent: &[u8],
640    cont_indent: &[u8],
641    config: &FmtConfig,
642    output: &mut W,
643    word_off: &[u32],
644    winfo: &[u32],
645    dp: &mut DpBufs,
646) -> io::Result<usize> {
647    let n = word_off.len();
648    if n == 0 {
649        return Ok(0);
650    }
651
652    // Run the full DP
653    run_dp(n, prefix, first_indent, cont_indent, config, winfo, dp);
654
655    // First pass: count lines and find the last line start
656    let mut line_count = 0usize;
657    let mut last_line_start = 0usize;
658    {
659        let mut i = 0;
660        while i < n {
661            line_count += 1;
662            last_line_start = i;
663            let j = dp.best[i] as usize;
664            i = j + 1;
665        }
666    }
667
668    if line_count <= 1 {
669        // Only one line in the chunk — output nothing, keep all words
670        return Ok(0);
671    }
672
673    // Second pass: output all lines except the last one
674    let out_buf = &mut dp.out_buf;
675    out_buf.clear();
676
677    let mut i = 0;
678    let mut li = 0;
679    while i < n {
680        let j = dp.best[i] as usize;
681        if i == last_line_start {
682            break; // Don't output the last line
683        }
684
685        out_buf.extend_from_slice(prefix);
686        if li == 0 {
687            out_buf.extend_from_slice(first_indent);
688        } else {
689            out_buf.extend_from_slice(cont_indent);
690        }
691
692        let off = word_off[i] as usize;
693        let wlen = (winfo[i] & 0xFFFF) as usize;
694        out_buf.extend_from_slice(&bytes[off..off + wlen]);
695
696        for k in (i + 1)..=j {
697            if winfo[k - 1] & SENT_FLAG != 0 {
698                out_buf.extend_from_slice(b"  ");
699            } else {
700                out_buf.push(b' ');
701            }
702            let off = word_off[k] as usize;
703            let wlen = (winfo[k] & 0xFFFF) as usize;
704            out_buf.extend_from_slice(&bytes[off..off + wlen]);
705        }
706        out_buf.push(b'\n');
707
708        li += 1;
709        i = j + 1;
710    }
711
712    // Single write for all output lines
713    output.write_all(out_buf)?;
714
715    // Return the index of the first word of the last line (for overlap)
716    Ok(last_line_start)
717}
718
719/// Run the backward DP pass on n words, filling dp.dp_cost, dp.best, dp.line_len.
720#[allow(clippy::too_many_arguments)]
721fn run_dp(
722    n: usize,
723    prefix: &[u8],
724    first_indent: &[u8],
725    cont_indent: &[u8],
726    config: &FmtConfig,
727    winfo: &[u32],
728    dp: &mut DpBufs,
729) {
730    let first_base = prefix.len() + first_indent.len();
731    let cont_base = prefix.len() + cont_indent.len();
732    let goal = config.goal as i64;
733    let width = config.width;
734
735    const SHORT_FACTOR: i64 = 100;
736    const RAGGED_FACTOR: i64 = 50;
737    const LINE_COST: i64 = 70 * 70;
738    const SENTENCE_BONUS: i64 = 50 * 50;
739    const NOBREAK_COST: i64 = 600 * 600;
740    const PUNCT_BONUS: i64 = 40 * 40;
741    const PAREN_BONUS: i64 = 40 * 40;
742
743    dp.ensure_capacity(n);
744    // Use ptr::write_bytes for hardware memset (rep stosb) — 3-4x faster than
745    // fill() for i64::MAX which requires 8-byte stores since 0x7F != any repeated byte.
746    // We use -1 (all 0xFF bytes) as sentinel, then check cost >= 0 to detect valid entries.
747    // All valid DP costs are non-negative, so -1 works as "uninitialized".
748    unsafe {
749        std::ptr::write_bytes(dp.dp_cost.as_mut_ptr(), 0xFF, n + 1);
750    }
751    dp.dp_cost[n] = 0;
752
753    let winfo_ptr = winfo.as_ptr();
754    let dp_cost_ptr = dp.dp_cost.as_mut_ptr();
755    let best_ptr = dp.best.as_mut_ptr();
756    let line_len_ptr = dp.line_len.as_mut_ptr();
757
758    for i in (0..n).rev() {
759        let base = if i == 0 { first_base } else { cont_base };
760        let mut len = base + unsafe { (*winfo_ptr.add(i) & 0xFFFF) as usize };
761        let mut best_total = i64::MAX;
762        let mut best_j = i as u32;
763        let mut best_len = len as i32;
764
765        for j in i..n {
766            if j > i {
767                let sep = if unsafe { *winfo_ptr.add(j - 1) & SENT_FLAG != 0 } {
768                    2
769                } else {
770                    1
771                };
772                len += sep + unsafe { (*winfo_ptr.add(j) & 0xFFFF) as usize };
773            }
774
775            macro_rules! try_candidate {
776                () => {
777                    let lc = if j == n - 1 {
778                        0i64
779                    } else {
780                        let short_n = goal - len as i64;
781                        let short_cost = short_n * short_n * SHORT_FACTOR;
782                        let ragged_cost = if unsafe { *best_ptr.add(j + 1) as usize + 1 < n } {
783                            let ragged_n = len as i64 - unsafe { *line_len_ptr.add(j + 1) } as i64;
784                            ragged_n * ragged_n * RAGGED_FACTOR
785                        } else {
786                            0
787                        };
788                        short_cost + ragged_cost
789                    };
790
791                    let bc = if j == n - 1 {
792                        0i64
793                    } else {
794                        let wj = unsafe { *winfo_ptr.add(j) };
795                        let wj1 = unsafe { *winfo_ptr.add(j + 1) };
796                        let mut cost = LINE_COST;
797
798                        if wj & PERIOD_FLAG != 0 {
799                            if wj & SENT_FLAG != 0 {
800                                cost -= SENTENCE_BONUS;
801                            } else {
802                                cost += NOBREAK_COST;
803                            }
804                        } else if wj & PUNCT_FLAG != 0 {
805                            cost -= PUNCT_BONUS;
806                        } else if j > 0 {
807                            let wjm1 = unsafe { *winfo_ptr.add(j - 1) };
808                            if wjm1 & SENT_FLAG != 0 {
809                                let wl = ((wj & 0xFFFF) as usize).min(LUT_SIZE - 1);
810                                cost += DIV40K[wl];
811                            }
812                        }
813
814                        if wj1 & PAREN_FLAG != 0 {
815                            cost -= PAREN_BONUS;
816                        } else if wj1 & SENT_FLAG != 0 {
817                            let wl = ((wj1 & 0xFFFF) as usize).min(LUT_SIZE - 1);
818                            cost += DIV22K[wl];
819                        }
820
821                        cost
822                    };
823
824                    let cj1 = unsafe { *dp_cost_ptr.add(j + 1) };
825                    if cj1 >= 0 {
826                        let total = lc + bc + cj1;
827                        if total < best_total {
828                            best_total = total;
829                            best_j = j as u32;
830                            best_len = len as i32;
831                        }
832                    }
833                };
834            }
835
836            if len >= width {
837                if j == i {
838                    try_candidate!();
839                }
840                break;
841            }
842
843            try_candidate!();
844        }
845
846        if best_total < i64::MAX {
847            unsafe {
848                *dp_cost_ptr.add(i) = best_total;
849                *best_ptr.add(i) = best_j;
850                *line_len_ptr.add(i) = best_len;
851            }
852        }
853    }
854}
855
856/// Reflow a chunk of words using pre-allocated DP buffers (outputs all lines).
857/// Builds each line in a reusable buffer, writes once per line.
858/// The downstream BufWriter handles batching of small writes into large syscalls.
859#[allow(clippy::too_many_arguments)]
860fn reflow_chunk<W: Write>(
861    bytes: &[u8],
862    prefix: &[u8],
863    first_indent: &[u8],
864    cont_indent: &[u8],
865    config: &FmtConfig,
866    output: &mut W,
867    word_off: &[u32],
868    winfo: &[u32],
869    dp: &mut DpBufs,
870) -> io::Result<()> {
871    let n = word_off.len();
872    if n == 0 {
873        return Ok(());
874    }
875
876    run_dp(n, prefix, first_indent, cont_indent, config, winfo, dp);
877
878    // Build all output lines in one buffer, then write once.
879    let out_buf = &mut dp.out_buf;
880    out_buf.clear();
881
882    let mut i = 0;
883    let mut is_first_line = true;
884    while i < n {
885        let j = dp.best[i] as usize;
886        out_buf.extend_from_slice(prefix);
887        if is_first_line {
888            out_buf.extend_from_slice(first_indent);
889        } else {
890            out_buf.extend_from_slice(cont_indent);
891        }
892        let off = word_off[i] as usize;
893        let wlen = (winfo[i] & 0xFFFF) as usize;
894        out_buf.extend_from_slice(&bytes[off..off + wlen]);
895        for k in (i + 1)..=j {
896            if winfo[k - 1] & SENT_FLAG != 0 {
897                out_buf.extend_from_slice(b"  ");
898            } else {
899                out_buf.push(b' ');
900            }
901            let off = word_off[k] as usize;
902            let wlen = (winfo[k] & 0xFFFF) as usize;
903            out_buf.extend_from_slice(&bytes[off..off + wlen]);
904        }
905        out_buf.push(b'\n');
906        is_first_line = false;
907        i = j + 1;
908    }
909
910    output.write_all(out_buf)
911}
912
913/// Split a single input line using the optimal paragraph algorithm.
914/// Used in split-only mode (-s): short lines are preserved as-is,
915/// long lines are broken optimally (same algorithm as normal reflow).
916/// Works entirely on &[u8] — no UTF-8 conversion.
917#[allow(clippy::too_many_arguments)]
918fn split_line_optimal<W: Write>(
919    bytes: &[u8],
920    line_start: usize,
921    line_end: usize,
922    config: &FmtConfig,
923    prefix: Option<&[u8]>,
924    output: &mut W,
925    ctx: &mut FmtCtx,
926) -> io::Result<()> {
927    let line_len = line_end - line_start;
928    let pfx = prefix.unwrap_or(b"");
929
930    // Short line: output as-is (no splitting needed).
931    if line_len < config.width {
932        output.write_all(&bytes[line_start..line_end])?;
933        output.write_all(b"\n")?;
934        return Ok(());
935    }
936
937    // Strip prefix for word collection
938    let content_start = match prefix {
939        Some(pfx_bytes) if bytes[line_start..line_end].starts_with(pfx_bytes) => {
940            line_start + pfx_bytes.len()
941        }
942        _ => line_start,
943    };
944
945    let indent_len = leading_ws_len(&bytes[content_start..line_end]);
946    let indent = &bytes[content_start..content_start + indent_len];
947
948    ctx.clear_words();
949    collect_words_line(bytes, content_start, line_end, ctx);
950
951    if ctx.word_off.is_empty() {
952        output.write_all(&bytes[line_start..line_end])?;
953        output.write_all(b"\n")?;
954        return Ok(());
955    }
956
957    // Mark last word as sentence-final
958    let last = ctx.winfo.len() - 1;
959    ctx.winfo[last] |= SENT_FLAG | PERIOD_FLAG;
960
961    // Use shared DP infrastructure instead of duplicated reflow_paragraph
962    let n = ctx.word_off.len();
963    let mut dp = DpBufs::new();
964
965    run_dp(n, pfx, indent, indent, config, &ctx.winfo, &mut dp);
966
967    // Build and write output line by line
968    let line_buf = &mut dp.out_buf;
969    let mut i = 0;
970    while i < n {
971        let j = dp.best[i] as usize;
972        line_buf.clear();
973        line_buf.extend_from_slice(pfx);
974        line_buf.extend_from_slice(indent);
975        let off = ctx.word_off[i] as usize;
976        let wlen = (ctx.winfo[i] & 0xFFFF) as usize;
977        line_buf.extend_from_slice(&bytes[off..off + wlen]);
978        for k in (i + 1)..=j {
979            if ctx.winfo[k - 1] & SENT_FLAG != 0 {
980                line_buf.extend_from_slice(b"  ");
981            } else {
982                line_buf.push(b' ');
983            }
984            let off = ctx.word_off[k] as usize;
985            let wlen = (ctx.winfo[k] & 0xFFFF) as usize;
986            line_buf.extend_from_slice(&bytes[off..off + wlen]);
987        }
988        line_buf.push(b'\n');
989        output.write_all(line_buf)?;
990        i = j + 1;
991    }
992
993    Ok(())
994}