1use std::io::{self, Read, Write};
2
3pub struct FmtConfig {
5 pub width: usize,
7 pub goal: usize,
9 pub split_only: bool,
11 pub crown_margin: bool,
13 pub tagged: bool,
15 pub uniform_spacing: bool,
17 pub prefix: Option<String>,
19}
20
21impl Default for FmtConfig {
22 fn default() -> Self {
23 let width = 75;
24 Self {
25 width,
26 goal: (width * 187) / 200,
27 split_only: false,
28 crown_margin: false,
29 tagged: false,
30 uniform_spacing: false,
31 prefix: None,
32 }
33 }
34}
35
36static WS_TABLE: [u8; 256] = {
39 let mut t = [0u8; 256];
40 t[b'\t' as usize] = 1;
41 t[b'\n' as usize] = 1;
42 t[0x0B] = 1; t[0x0C] = 1; t[b'\r' as usize] = 1;
45 t[b' ' as usize] = 1;
46 t
47};
48
49#[inline(always)]
51fn is_ws(b: u8) -> bool {
52 unsafe { *WS_TABLE.get_unchecked(b as usize) != 0 }
54}
55
56const SENT_FLAG: u32 = 1 << 16; const PERIOD_FLAG: u32 = 1 << 17; const PUNCT_FLAG: u32 = 1 << 18; const PAREN_FLAG: u32 = 1 << 19; struct FmtCtx {
66 word_off: Vec<u32>,
68 winfo: Vec<u32>,
71 dp_cost: Vec<i64>,
73 best: Vec<u32>,
75 line_len: Vec<i32>,
77 line_buf: Vec<u8>,
79}
80
81impl FmtCtx {
82 fn new() -> Self {
83 Self {
84 word_off: Vec::with_capacity(256),
85 winfo: Vec::with_capacity(256),
86 dp_cost: Vec::with_capacity(257),
87 best: Vec::with_capacity(256),
88 line_len: Vec::with_capacity(257),
89 line_buf: Vec::with_capacity(256),
90 }
91 }
92
93 #[inline(always)]
94 fn clear_words(&mut self) {
95 self.word_off.clear();
96 self.winfo.clear();
97 }
98}
99
100pub fn fmt_file<R: Read, W: Write>(
106 mut input: R,
107 output: &mut W,
108 config: &FmtConfig,
109) -> io::Result<()> {
110 let mut data = Vec::new();
111 input.read_to_end(&mut data)?;
112 fmt_data(&data, output, config)
113}
114
115pub fn fmt_data(data: &[u8], output: &mut impl Write, config: &FmtConfig) -> io::Result<()> {
117 let text = match std::str::from_utf8(data) {
118 Ok(s) => s,
119 Err(_) => {
120 let owned = String::from_utf8_lossy(data);
121 return fmt_str(&owned, output, config);
122 }
123 };
124 fmt_str(text, output, config)
125}
126
127#[inline(always)]
129fn is_blank_bytes(bytes: &[u8]) -> bool {
130 for &b in bytes {
131 if !is_ws(b) {
132 return false;
133 }
134 }
135 true
136}
137
138fn fmt_str(text: &str, output: &mut impl Write, config: &FmtConfig) -> io::Result<()> {
140 let prefix_str = config.prefix.as_deref();
141 let mut para_start = 0;
142 let bytes = text.as_bytes();
143 let blen = bytes.len();
144 let mut ctx = FmtCtx::new();
145
146 let mut i = 0;
147
148 while i < blen {
149 let line_end = memchr::memchr(b'\n', &bytes[i..])
151 .map(|p| i + p)
152 .unwrap_or(blen);
153
154 let le = if line_end > i && bytes[line_end - 1] == b'\r' {
156 line_end - 1
157 } else {
158 line_end
159 };
160
161 if let Some(pfx) = prefix_str {
163 let line = &text[i..le];
164 if !line.starts_with(pfx) {
165 if para_start < i {
167 format_paragraph(text, bytes, para_start, i, config, output, &mut ctx)?;
168 }
169 let next = if line_end < blen { line_end + 1 } else { blen };
170 para_start = next;
171 output.write_all(&bytes[i..le])?;
173 output.write_all(b"\n")?;
174 i = next;
175 continue;
176 }
177 }
178
179 if is_blank_bytes(&bytes[i..le]) {
181 if para_start < i {
183 format_paragraph(text, bytes, para_start, i, config, output, &mut ctx)?;
184 }
185 output.write_all(b"\n")?;
186 let next = if line_end < blen { line_end + 1 } else { blen };
187 para_start = next;
188 }
189
190 i = if line_end < blen { line_end + 1 } else { blen };
191 }
192
193 if para_start < blen {
195 let remaining = text[para_start..].trim_end_matches('\n');
196 if !remaining.is_empty() {
197 format_paragraph(text, bytes, para_start, blen, config, output, &mut ctx)?;
198 }
199 }
200
201 Ok(())
202}
203
204fn format_paragraph(
208 text: &str,
209 bytes: &[u8],
210 start: usize,
211 end: usize,
212 config: &FmtConfig,
213 output: &mut impl Write,
214 ctx: &mut FmtCtx,
215) -> io::Result<()> {
216 let region = &bytes[start..end];
217 let prefix_str = config.prefix.as_deref();
218
219 let mut first_line: Option<&str> = None;
222 let mut second_line: Option<&str> = None;
223 {
224 let rlen = region.len();
225 let mut pos = 0;
226 while pos < rlen {
227 let nl = memchr::memchr(b'\n', ®ion[pos..])
228 .map(|p| pos + p)
229 .unwrap_or(rlen);
230 let mut le = nl;
231 if le > pos && region[le - 1] == b'\r' {
232 le -= 1;
233 }
234 if le > pos {
235 let line = &text[start + pos..start + le];
236 if first_line.is_none() {
237 first_line = Some(line);
238 } else if second_line.is_none() {
239 second_line = Some(line);
240 break; }
242 }
243 pos = nl + 1;
244 }
245 }
246
247 let fl = match first_line {
248 Some(l) => l,
249 None => return Ok(()),
250 };
251
252 let stripped_first = match prefix_str {
253 Some(pfx) => fl.strip_prefix(pfx).unwrap_or(fl),
254 None => fl,
255 };
256
257 let stripped_second = match second_line {
258 Some(l) => match prefix_str {
259 Some(pfx) => l.strip_prefix(pfx).unwrap_or(l),
260 None => l,
261 },
262 None => stripped_first,
263 };
264
265 let first_indent = leading_indent(stripped_first);
266 let rest_indent = leading_indent(stripped_second);
267
268 let (first_line_indent, cont_indent) = if config.tagged || config.crown_margin {
269 (first_indent, rest_indent)
270 } else {
271 (first_indent, first_indent)
272 };
273
274 if config.split_only {
275 let rlen = region.len();
277 let mut pos = 0;
278 while pos < rlen {
279 let nl = memchr::memchr(b'\n', ®ion[pos..])
280 .map(|p| pos + p)
281 .unwrap_or(rlen);
282 let mut le = nl;
283 if le > pos && region[le - 1] == b'\r' {
284 le -= 1;
285 }
286 if le > pos {
287 let line = &text[start + pos..start + le];
288 split_line_optimal(line, config, prefix_str, output)?;
289 }
290 pos = nl + 1;
291 }
292 return Ok(());
293 }
294
295 ctx.clear_words();
298 collect_words_from_region(bytes, region, start, prefix_str, ctx);
299
300 let n = ctx.word_off.len();
301 if n == 0 {
302 output.write_all(b"\n")?;
303 return Ok(());
304 }
305
306 let last_idx = n - 1;
308 ctx.winfo[last_idx] |= SENT_FLAG | PERIOD_FLAG;
309
310 let pfx = prefix_str.unwrap_or("");
311 reflow_paragraph(
312 bytes,
313 pfx,
314 first_line_indent,
315 cont_indent,
316 config,
317 output,
318 ctx,
319 )
320}
321
322fn leading_indent(line: &str) -> &str {
324 let trimmed = line.trim_start();
325 &line[..line.len() - trimmed.len()]
326}
327
328fn collect_words_from_region(
331 bytes: &[u8],
332 region: &[u8],
333 start: usize,
334 prefix: Option<&str>,
335 ctx: &mut FmtCtx,
336) {
337 let rlen = region.len();
338 let mut pos = 0;
339 let pfx_len = prefix.map_or(0, |p| p.len());
340
341 while pos < rlen {
342 let nl = memchr::memchr(b'\n', ®ion[pos..])
343 .map(|p| pos + p)
344 .unwrap_or(rlen);
345 let mut le = nl;
346 if le > pos && region[le - 1] == b'\r' {
347 le -= 1;
348 }
349 if le > pos {
350 let line_start = start + pos;
351 let line_end = start + le;
352
353 let ls = if pfx_len > 0 && le - pos >= pfx_len {
355 let pfx_bytes = prefix.unwrap().as_bytes();
356 if &bytes[line_start..line_start + pfx_len] == pfx_bytes {
357 line_start + pfx_len
358 } else {
359 line_start
360 }
361 } else {
362 line_start
363 };
364
365 collect_words_line(bytes, ls, line_end, ctx);
366 }
367 pos = nl + 1;
368 }
369}
370
371#[inline(always)]
375fn collect_words_line(bytes: &[u8], ls: usize, le: usize, ctx: &mut FmtCtx) {
376 let ptr = bytes.as_ptr();
377 let mut i = ls;
378
379 while i < le && unsafe { is_ws(*ptr.add(i)) } {
381 i += 1;
382 }
383
384 while i < le {
385 let word_start = i;
386 while i < le && unsafe { !is_ws(*ptr.add(i)) } {
387 i += 1;
388 }
389 let wlen = i - word_start;
390
391 let space_start = i;
393 while i < le && unsafe { is_ws(*ptr.add(i)) } {
394 i += 1;
395 }
396 let space_count = i - space_start;
397
398 let wb = unsafe { std::slice::from_raw_parts(ptr.add(word_start), wlen) };
400 let mut flags = 0u32;
401
402 let in_sent_ctx = i >= le || space_count >= 2;
403 flags |= classify_word_punct(wb, in_sent_ctx);
404 if wlen > 0 && matches!(wb[0], b'(' | b'[' | b'{') {
405 flags |= PAREN_FLAG;
406 }
407
408 ctx.word_off.push(word_start as u32);
409 ctx.winfo.push((wlen as u32) | flags);
410 }
411}
412
413#[inline(always)]
418fn classify_word_punct(bytes: &[u8], in_sentence_context: bool) -> u32 {
419 let mut i = bytes.len();
420 while i > 0 && matches!(bytes[i - 1], b'"' | b'\'' | b')' | b']') {
422 i -= 1;
423 }
424 if i == 0 {
425 return 0;
426 }
427 let c = bytes[i - 1];
428 if c == b'.' || c == b'!' || c == b'?' {
429 let mut flags = PERIOD_FLAG;
430 if in_sentence_context {
431 let mut end = i;
433 while end > 0 && matches!(bytes[end - 1], b'.' | b'!' | b'?') {
434 end -= 1;
435 }
436 if !(end == 1 && bytes[0].is_ascii_uppercase()) && end > 0 {
438 flags |= SENT_FLAG;
439 }
440 }
441 flags
442 } else if c == b',' || c == b';' || c == b':' {
443 PUNCT_FLAG
444 } else {
445 0
446 }
447}
448
449#[allow(clippy::too_many_arguments)]
455fn reflow_paragraph<W: Write>(
456 bytes: &[u8],
457 prefix: &str,
458 first_indent: &str,
459 cont_indent: &str,
460 config: &FmtConfig,
461 output: &mut W,
462 ctx: &mut FmtCtx,
463) -> io::Result<()> {
464 let n = ctx.word_off.len();
465 if n == 0 {
466 return Ok(());
467 }
468
469 let first_base = prefix.len() + first_indent.len();
470 let cont_base = prefix.len() + cont_indent.len();
471 let goal = config.goal as i64;
472 let width = config.width;
473
474 const SHORT_FACTOR: i64 = 100;
476 const RAGGED_FACTOR: i64 = 50;
477 const LINE_COST: i64 = 70 * 70;
478 const SENTENCE_BONUS: i64 = 50 * 50;
479 const NOBREAK_COST: i64 = 600 * 600;
480 const PUNCT_BONUS: i64 = 40 * 40;
481 const PAREN_BONUS: i64 = 40 * 40;
482
483 ctx.dp_cost.clear();
485 ctx.dp_cost.resize(n + 1, i64::MAX);
486 ctx.dp_cost[n] = 0;
487
488 ctx.best.clear();
489 ctx.best.resize(n, 0);
490
491 ctx.line_len.clear();
492 ctx.line_len.resize(n + 1, 0);
493
494 let winfo_ptr = ctx.winfo.as_ptr();
500 let dp_cost_ptr = ctx.dp_cost.as_mut_ptr();
501 let best_ptr = ctx.best.as_mut_ptr();
502 let line_len_ptr = ctx.line_len.as_mut_ptr();
503
504 const LUT_SIZE: usize = 128;
508 let div40k: [i64; LUT_SIZE] = {
509 let mut t = [0i64; LUT_SIZE];
510 let mut k = 0;
511 while k < LUT_SIZE {
512 t[k] = 40000 / (k as i64 + 2);
513 k += 1;
514 }
515 t
516 };
517 let div22k: [i64; LUT_SIZE] = {
518 let mut t = [0i64; LUT_SIZE];
519 let mut k = 0;
520 while k < LUT_SIZE {
521 t[k] = 22500 / (k as i64 + 2);
522 k += 1;
523 }
524 t
525 };
526
527 for i in (0..n).rev() {
528 let base = if i == 0 { first_base } else { cont_base };
529 let mut len = base + unsafe { (*winfo_ptr.add(i) & 0xFFFF) as usize };
530 let mut best_total = i64::MAX;
531 let mut best_j = i as u32;
532 let mut best_len = len as i32;
533
534 for j in i..n {
535 if j > i {
536 let sep = if unsafe { *winfo_ptr.add(j - 1) & SENT_FLAG != 0 } {
537 2
538 } else {
539 1
540 };
541 len += sep + unsafe { (*winfo_ptr.add(j) & 0xFFFF) as usize };
542 }
543
544 macro_rules! try_candidate {
545 () => {
546 let lc = if j == n - 1 {
547 0i64
548 } else {
549 let short_n = goal - len as i64;
550 let short_cost = short_n * short_n * SHORT_FACTOR;
551 let ragged_cost = if unsafe { *best_ptr.add(j + 1) as usize + 1 < n } {
552 let ragged_n = len as i64 - unsafe { *line_len_ptr.add(j + 1) } as i64;
553 ragged_n * ragged_n * RAGGED_FACTOR
554 } else {
555 0
556 };
557 short_cost + ragged_cost
558 };
559
560 let bc = if j == n - 1 {
561 0i64
562 } else {
563 let wj = unsafe { *winfo_ptr.add(j) };
564 let wj1 = unsafe { *winfo_ptr.add(j + 1) };
565 let mut cost = LINE_COST;
566
567 if wj & PERIOD_FLAG != 0 {
568 if wj & SENT_FLAG != 0 {
569 cost -= SENTENCE_BONUS;
570 } else {
571 cost += NOBREAK_COST;
572 }
573 } else if wj & PUNCT_FLAG != 0 {
574 cost -= PUNCT_BONUS;
575 } else if j > 0 {
576 let wjm1 = unsafe { *winfo_ptr.add(j - 1) };
577 if wjm1 & SENT_FLAG != 0 {
578 let wl = ((wj & 0xFFFF) as usize).min(LUT_SIZE - 1);
579 cost += div40k[wl];
580 }
581 }
582
583 if wj1 & PAREN_FLAG != 0 {
584 cost -= PAREN_BONUS;
585 } else if wj1 & SENT_FLAG != 0 {
586 let wl = ((wj1 & 0xFFFF) as usize).min(LUT_SIZE - 1);
587 cost += div22k[wl];
588 }
589
590 cost
591 };
592
593 let cj1 = unsafe { *dp_cost_ptr.add(j + 1) };
594 if cj1 != i64::MAX {
595 let total = lc + bc + cj1;
596 if total < best_total {
597 best_total = total;
598 best_j = j as u32;
599 best_len = len as i32;
600 }
601 }
602 };
603 }
604
605 if len >= width {
606 if j == i {
607 try_candidate!();
608 }
609 break;
610 }
611
612 try_candidate!();
613 }
614
615 if best_total < i64::MAX {
616 unsafe {
617 *dp_cost_ptr.add(i) = best_total;
618 *best_ptr.add(i) = best_j;
619 *line_len_ptr.add(i) = best_len;
620 }
621 }
622 }
623
624 let mut i = 0;
627 let mut is_first_line = true;
628 let line_buf = &mut ctx.line_buf;
629 let word_off = &ctx.word_off;
630 let winfo = &ctx.winfo;
631 let best = &ctx.best;
632
633 while i < n {
634 let j = best[i] as usize;
635
636 line_buf.clear();
637
638 line_buf.extend_from_slice(prefix.as_bytes());
639 if is_first_line {
640 line_buf.extend_from_slice(first_indent.as_bytes());
641 } else {
642 line_buf.extend_from_slice(cont_indent.as_bytes());
643 }
644
645 let off = word_off[i] as usize;
647 let wlen = (winfo[i] & 0xFFFF) as usize;
648 line_buf.extend_from_slice(&bytes[off..off + wlen]);
649
650 for k in (i + 1)..=j {
652 if winfo[k - 1] & SENT_FLAG != 0 {
653 line_buf.extend_from_slice(b" ");
654 } else {
655 line_buf.push(b' ');
656 }
657 let off = word_off[k] as usize;
658 let wlen = (winfo[k] & 0xFFFF) as usize;
659 line_buf.extend_from_slice(&bytes[off..off + wlen]);
660 }
661 line_buf.push(b'\n');
662
663 output.write_all(line_buf)?;
664
665 is_first_line = false;
666 i = j + 1;
667 }
668
669 Ok(())
670}
671
672fn split_line_optimal<W: Write>(
676 line: &str,
677 config: &FmtConfig,
678 prefix: Option<&str>,
679 output: &mut W,
680) -> io::Result<()> {
681 let stripped = match prefix {
682 Some(pfx) => line.strip_prefix(pfx).unwrap_or(line),
683 None => line,
684 };
685 let indent = leading_indent(stripped);
686 let pfx = prefix.unwrap_or("");
687
688 if line.len() < config.width {
690 output.write_all(line.as_bytes())?;
691 output.write_all(b"\n")?;
692 return Ok(());
693 }
694
695 let s = match prefix {
696 Some(pfx) => line.strip_prefix(pfx).unwrap_or(line),
697 None => line,
698 };
699
700 let bytes = line.as_bytes();
701 let mut ctx = FmtCtx::new();
702
703 let s_start = s.as_ptr() as usize - line.as_ptr() as usize;
705 let s_end = s_start + s.len();
706 collect_words_line(bytes, s_start, s_end, &mut ctx);
707
708 if ctx.word_off.is_empty() {
709 output.write_all(line.as_bytes())?;
710 output.write_all(b"\n")?;
711 return Ok(());
712 }
713
714 let last = ctx.winfo.len() - 1;
716 ctx.winfo[last] |= SENT_FLAG | PERIOD_FLAG;
717
718 reflow_paragraph(bytes, pfx, indent, indent, config, output, &mut ctx)
719}