use std::io::{self, Read, Write};
pub struct FmtConfig {
pub width: usize,
pub goal: usize,
pub split_only: bool,
pub crown_margin: bool,
pub tagged: bool,
pub uniform_spacing: bool,
pub prefix: Option<String>,
}
impl Default for FmtConfig {
fn default() -> Self {
let width = 75;
Self {
width,
goal: (width * 187) / 200,
split_only: false,
crown_margin: false,
tagged: false,
uniform_spacing: false,
prefix: None,
}
}
}
static WS_TABLE: [u8; 256] = {
let mut t = [0u8; 256];
t[b'\t' as usize] = 1;
t[b'\n' as usize] = 1;
t[0x0B] = 1; t[0x0C] = 1; t[b'\r' as usize] = 1;
t[b' ' as usize] = 1;
t
};
#[inline(always)]
fn is_ws(b: u8) -> bool {
unsafe { *WS_TABLE.get_unchecked(b as usize) != 0 }
}
const SENT_FLAG: u32 = 1 << 16; const PERIOD_FLAG: u32 = 1 << 17; const PUNCT_FLAG: u32 = 1 << 18; const PAREN_FLAG: u32 = 1 << 19;
const LUT_SIZE: usize = 128;
static DIV40K: [i64; LUT_SIZE] = {
let mut t = [0i64; LUT_SIZE];
let mut k = 0;
while k < LUT_SIZE {
t[k] = 40000 / (k as i64 + 2);
k += 1;
}
t
};
static DIV22K: [i64; LUT_SIZE] = {
let mut t = [0i64; LUT_SIZE];
let mut k = 0;
while k < LUT_SIZE {
t[k] = 22500 / (k as i64 + 2);
k += 1;
}
t
};
struct FmtCtx {
word_off: Vec<u32>,
winfo: Vec<u32>,
}
impl FmtCtx {
fn new() -> Self {
Self {
word_off: Vec::with_capacity(256),
winfo: Vec::with_capacity(256),
}
}
#[inline(always)]
fn clear_words(&mut self) {
self.word_off.clear();
self.winfo.clear();
}
}
pub fn fmt_file<R: Read, W: Write>(
mut input: R,
output: &mut W,
config: &FmtConfig,
) -> io::Result<()> {
let mut data = Vec::new();
input.read_to_end(&mut data)?;
fmt_data(&data, output, config)
}
pub fn fmt_data(data: &[u8], output: &mut impl Write, config: &FmtConfig) -> io::Result<()> {
fmt_bytes(data, output, config)
}
#[inline(always)]
fn is_blank_bytes(bytes: &[u8]) -> bool {
for &b in bytes {
if !is_ws(b) {
return false;
}
}
true
}
fn fmt_bytes(data: &[u8], output: &mut impl Write, config: &FmtConfig) -> io::Result<()> {
let prefix_bytes = config.prefix.as_deref().map(|s| s.as_bytes());
let mut para_start = 0;
let blen = data.len();
let mut ctx = FmtCtx::new();
let mut line_start = 0;
for nl_pos in memchr::memchr_iter(b'\n', data) {
let line_end = nl_pos;
let le = if line_end > line_start && data[line_end - 1] == b'\r' {
line_end - 1
} else {
line_end
};
if let Some(pfx) = prefix_bytes {
if !data[line_start..le].starts_with(pfx) {
if para_start < line_start {
format_paragraph(data, para_start, line_start, config, output, &mut ctx)?;
}
para_start = nl_pos + 1;
output.write_all(&data[line_start..le])?;
output.write_all(b"\n")?;
line_start = nl_pos + 1;
continue;
}
}
if is_blank_bytes(&data[line_start..le]) {
if para_start < line_start {
format_paragraph(data, para_start, line_start, config, output, &mut ctx)?;
}
output.write_all(b"\n")?;
para_start = nl_pos + 1;
}
line_start = nl_pos + 1;
}
if line_start < blen {
}
if para_start < blen {
let mut end = blen;
while end > para_start && data[end - 1] == b'\n' {
end -= 1;
}
if end > para_start {
format_paragraph(data, para_start, blen, config, output, &mut ctx)?;
}
}
Ok(())
}
fn format_paragraph(
bytes: &[u8],
start: usize,
end: usize,
config: &FmtConfig,
output: &mut impl Write,
ctx: &mut FmtCtx,
) -> io::Result<()> {
let region = &bytes[start..end];
let prefix_bytes = config.prefix.as_deref().map(|s| s.as_bytes());
let mut first_line: Option<(usize, usize)> = None; let mut second_line: Option<(usize, usize)> = None;
{
let rlen = region.len();
let mut pos = 0;
while pos < rlen {
let nl = memchr::memchr(b'\n', ®ion[pos..])
.map(|p| pos + p)
.unwrap_or(rlen);
let mut le = nl;
if le > pos && region[le - 1] == b'\r' {
le -= 1;
}
if le > pos {
let line_range = (start + pos, start + le);
if first_line.is_none() {
first_line = Some(line_range);
} else if second_line.is_none() {
second_line = Some(line_range);
break; }
}
pos = nl + 1;
}
}
let (fl_start, fl_end) = match first_line {
Some(r) => r,
None => return Ok(()),
};
let stripped_first_start = match prefix_bytes {
Some(pfx) if bytes[fl_start..fl_end].starts_with(pfx) => fl_start + pfx.len(),
_ => fl_start,
};
let (stripped_second_start, stripped_second_end) = match second_line {
Some((s, e)) => match prefix_bytes {
Some(pfx) if bytes[s..e].starts_with(pfx) => (s + pfx.len(), e),
_ => (s, e),
},
None => (stripped_first_start, fl_end),
};
let first_indent_len = leading_ws_len(&bytes[stripped_first_start..fl_end]);
let rest_indent_len = leading_ws_len(&bytes[stripped_second_start..stripped_second_end]);
let first_indent = &bytes[stripped_first_start..stripped_first_start + first_indent_len];
let rest_indent = &bytes[stripped_second_start..stripped_second_start + rest_indent_len];
let (first_line_indent, cont_indent) = if config.tagged || config.crown_margin {
(first_indent, rest_indent)
} else {
(first_indent, first_indent)
};
if config.split_only {
let rlen = region.len();
let mut pos = 0;
while pos < rlen {
let nl = memchr::memchr(b'\n', ®ion[pos..])
.map(|p| pos + p)
.unwrap_or(rlen);
let mut le = nl;
if le > pos && region[le - 1] == b'\r' {
le -= 1;
}
if le > pos {
let line_start = start + pos;
let line_end = start + le;
split_line_optimal(
bytes,
line_start,
line_end,
config,
prefix_bytes,
output,
ctx,
)?;
}
pos = nl + 1;
}
return Ok(());
}
let pfx = prefix_bytes.unwrap_or(b"");
const MAXWORDS: usize = 1_000_000;
collect_and_reflow_chunked(
bytes,
region,
start,
prefix_bytes,
pfx,
first_line_indent,
cont_indent,
config,
output,
ctx,
MAXWORDS,
)
}
#[inline(always)]
fn leading_ws_len(bytes: &[u8]) -> usize {
let mut i = 0;
while i < bytes.len() && is_ws(bytes[i]) && bytes[i] != b'\n' {
i += 1;
}
i
}
#[inline(always)]
fn collect_words_line(bytes: &[u8], ls: usize, le: usize, ctx: &mut FmtCtx) {
let ptr = bytes.as_ptr();
let mut i = ls;
while i < le && unsafe { is_ws(*ptr.add(i)) } {
i += 1;
}
while i < le {
let word_start = i;
while i < le && unsafe { !is_ws(*ptr.add(i)) } {
i += 1;
}
let wlen = i - word_start;
let space_start = i;
while i < le && unsafe { is_ws(*ptr.add(i)) } {
i += 1;
}
let space_count = i - space_start;
let wb = unsafe { std::slice::from_raw_parts(ptr.add(word_start), wlen) };
let mut flags = 0u32;
let in_sent_ctx = i >= le || space_count >= 2;
flags |= classify_word_punct(wb, in_sent_ctx);
if wlen > 0 && matches!(wb[0], b'(' | b'[' | b'{') {
flags |= PAREN_FLAG;
}
ctx.word_off.push(word_start as u32);
ctx.winfo.push((wlen as u32) | flags);
}
}
#[inline(always)]
fn classify_word_punct(bytes: &[u8], in_sentence_context: bool) -> u32 {
let mut i = bytes.len();
while i > 0 && matches!(bytes[i - 1], b'"' | b'\'' | b')' | b']') {
i -= 1;
}
if i == 0 {
return 0;
}
let c = bytes[i - 1];
if c == b'.' || c == b'!' || c == b'?' {
let mut flags = PERIOD_FLAG;
if in_sentence_context {
let mut end = i;
while end > 0 && matches!(bytes[end - 1], b'.' | b'!' | b'?') {
end -= 1;
}
if !(end == 1 && bytes[0].is_ascii_uppercase()) && end > 0 {
flags |= SENT_FLAG;
}
}
flags
} else if c == b',' || c == b';' || c == b':' {
PUNCT_FLAG
} else {
0
}
}
#[allow(clippy::too_many_arguments)]
fn collect_and_reflow_chunked(
bytes: &[u8],
region: &[u8],
start: usize,
prefix_filter: Option<&[u8]>,
prefix_out: &[u8],
first_indent: &[u8],
cont_indent: &[u8],
config: &FmtConfig,
output: &mut impl Write,
ctx: &mut FmtCtx,
max_words: usize,
) -> io::Result<()> {
let rlen = region.len();
let pfx_len = prefix_filter.map_or(0, |p| p.len());
let mut pos = 0;
let mut is_first_chunk = true;
let mut dp = DpBufs::new();
ctx.clear_words();
while pos < rlen {
let nl = memchr::memchr(b'\n', ®ion[pos..])
.map(|p| pos + p)
.unwrap_or(rlen);
let mut le = nl;
if le > pos && region[le - 1] == b'\r' {
le -= 1;
}
if le > pos {
let line_start = start + pos;
let line_end = start + le;
let ls = if pfx_len > 0 && le - pos >= pfx_len {
let pfx_bytes = prefix_filter.unwrap();
if &bytes[line_start..line_start + pfx_len] == pfx_bytes {
line_start + pfx_len
} else {
line_start
}
} else {
line_start
};
collect_words_line(bytes, ls, line_end, ctx);
while ctx.word_off.len() >= max_words {
ctx.winfo[max_words - 1] |= SENT_FLAG | PERIOD_FLAG;
let fi = if is_first_chunk {
first_indent
} else {
cont_indent
};
let keep_from = reflow_chunk_partial(
bytes,
prefix_out,
fi,
cont_indent,
config,
output,
&ctx.word_off[..max_words],
&ctx.winfo[..max_words],
&mut dp,
)?;
let total = ctx.word_off.len();
let new_start = keep_from; let remaining_after_chunk = total - max_words;
let keep_count = max_words - new_start + remaining_after_chunk;
if keep_count > 0 && keep_count < total {
ctx.word_off.copy_within(new_start.., 0);
ctx.winfo.copy_within(new_start.., 0);
ctx.word_off.truncate(keep_count);
ctx.winfo.truncate(keep_count);
let old_last = max_words - 1 - new_start;
if old_last < ctx.winfo.len() {
ctx.winfo[old_last] &= !(SENT_FLAG | PERIOD_FLAG);
}
} else if keep_count == 0 {
ctx.word_off.clear();
ctx.winfo.clear();
}
is_first_chunk = false;
}
}
pos = nl + 1;
}
let remaining = ctx.word_off.len();
if remaining > 0 {
ctx.winfo[remaining - 1] |= SENT_FLAG | PERIOD_FLAG;
let fi = if is_first_chunk {
first_indent
} else {
cont_indent
};
reflow_chunk(
bytes,
prefix_out,
fi,
cont_indent,
config,
output,
&ctx.word_off[..remaining],
&ctx.winfo[..remaining],
&mut dp,
)?;
} else if is_first_chunk {
output.write_all(b"\n")?;
}
Ok(())
}
struct DpBufs {
dp_cost: Vec<i64>,
best: Vec<u32>,
line_len: Vec<i32>,
out_buf: Vec<u8>,
break_cost: Vec<i64>,
word_len: Vec<u16>,
sep_width: Vec<u8>,
}
impl DpBufs {
fn new() -> Self {
Self {
dp_cost: Vec::with_capacity(257),
best: Vec::with_capacity(256),
line_len: Vec::with_capacity(257),
out_buf: Vec::with_capacity(8192),
break_cost: Vec::with_capacity(256),
word_len: Vec::with_capacity(256),
sep_width: Vec::with_capacity(256),
}
}
#[inline]
fn ensure_capacity(&mut self, n: usize) {
let needed = n + 1;
if self.dp_cost.len() < needed {
self.dp_cost.reserve(needed - self.dp_cost.len());
self.best.reserve(n.saturating_sub(self.best.len()));
self.line_len.reserve(needed - self.line_len.len());
unsafe {
self.dp_cost.set_len(needed);
self.best.set_len(n);
self.line_len.set_len(needed);
}
}
}
}
#[allow(clippy::too_many_arguments)]
fn reflow_chunk_partial<W: Write>(
bytes: &[u8],
prefix: &[u8],
first_indent: &[u8],
cont_indent: &[u8],
config: &FmtConfig,
output: &mut W,
word_off: &[u32],
winfo: &[u32],
dp: &mut DpBufs,
) -> io::Result<usize> {
let n = word_off.len();
if n == 0 {
return Ok(0);
}
run_dp(n, prefix, first_indent, cont_indent, config, winfo, dp);
let mut line_count = 0usize;
let mut last_line_start = 0usize;
{
let mut i = 0;
while i < n {
line_count += 1;
last_line_start = i;
let j = dp.best[i] as usize;
i = j + 1;
}
}
if line_count <= 1 {
return Ok(0);
}
let out_buf = &mut dp.out_buf;
out_buf.clear();
let mut i = 0;
let mut li = 0;
while i < n {
let j = dp.best[i] as usize;
if i == last_line_start {
break; }
out_buf.extend_from_slice(prefix);
if li == 0 {
out_buf.extend_from_slice(first_indent);
} else {
out_buf.extend_from_slice(cont_indent);
}
let off = word_off[i] as usize;
let wlen = (winfo[i] & 0xFFFF) as usize;
out_buf.extend_from_slice(&bytes[off..off + wlen]);
for k in (i + 1)..=j {
if winfo[k - 1] & SENT_FLAG != 0 {
out_buf.extend_from_slice(b" ");
} else {
out_buf.push(b' ');
}
let off = word_off[k] as usize;
let wlen = (winfo[k] & 0xFFFF) as usize;
out_buf.extend_from_slice(&bytes[off..off + wlen]);
}
out_buf.push(b'\n');
li += 1;
i = j + 1;
}
output.write_all(out_buf)?;
Ok(last_line_start)
}
#[allow(clippy::too_many_arguments)]
fn run_dp(
n: usize,
prefix: &[u8],
first_indent: &[u8],
cont_indent: &[u8],
config: &FmtConfig,
winfo: &[u32],
dp: &mut DpBufs,
) {
let first_base = prefix.len() + first_indent.len();
let cont_base = prefix.len() + cont_indent.len();
let goal = config.goal as i64;
let width = config.width;
const SHORT_FACTOR: i64 = 100;
const RAGGED_FACTOR: i64 = 50;
const LINE_COST: i64 = 70 * 70;
const SENTENCE_BONUS: i64 = 50 * 50;
const NOBREAK_COST: i64 = 600 * 600;
const PUNCT_BONUS: i64 = 40 * 40;
const PAREN_BONUS: i64 = 40 * 40;
dp.ensure_capacity(n);
unsafe {
std::ptr::write_bytes(dp.dp_cost.as_mut_ptr(), 0xFF, n + 1);
}
dp.dp_cost[n] = 0;
let bc = &mut dp.break_cost;
if bc.len() < n {
bc.resize(n, 0);
}
let winfo_ptr = winfo.as_ptr();
unsafe {
*bc.as_mut_ptr().add(n - 1) = 0;
}
for j in 0..n.saturating_sub(1) {
let wj = unsafe { *winfo_ptr.add(j) };
let wj1 = unsafe { *winfo_ptr.add(j + 1) };
let mut cost = LINE_COST;
if wj & PERIOD_FLAG != 0 {
if wj & SENT_FLAG != 0 {
cost -= SENTENCE_BONUS;
} else {
cost += NOBREAK_COST;
}
} else if wj & PUNCT_FLAG != 0 {
cost -= PUNCT_BONUS;
} else if j > 0 {
let wjm1 = unsafe { *winfo_ptr.add(j - 1) };
if wjm1 & SENT_FLAG != 0 {
let wl = ((wj & 0xFFFF) as usize).min(LUT_SIZE - 1);
cost += DIV40K[wl];
}
}
if wj1 & PAREN_FLAG != 0 {
cost -= PAREN_BONUS;
} else if wj1 & SENT_FLAG != 0 {
let wl = ((wj1 & 0xFFFF) as usize).min(LUT_SIZE - 1);
cost += DIV22K[wl];
}
unsafe {
*bc.as_mut_ptr().add(j) = cost;
}
}
let word_len = &mut dp.word_len;
if word_len.len() < n {
word_len.resize(n, 0);
}
let sep_w = &mut dp.sep_width;
if sep_w.len() < n {
sep_w.resize(n, 0);
}
for j in 0..n {
let w = unsafe { *winfo_ptr.add(j) };
unsafe {
*word_len.as_mut_ptr().add(j) = (w & 0xFFFF) as u16;
*sep_w.as_mut_ptr().add(j) = if j > 0 && (*winfo_ptr.add(j - 1) & SENT_FLAG != 0) {
2u8
} else {
1u8
};
}
}
let dp_cost_ptr = dp.dp_cost.as_mut_ptr();
let best_ptr = dp.best.as_mut_ptr();
let line_len_ptr = dp.line_len.as_mut_ptr();
let bc_ptr = bc.as_ptr();
let wl_ptr = word_len.as_ptr();
let sw_ptr = sep_w.as_ptr();
let nm1 = n - 1;
for i in (0..n).rev() {
let base = if i == 0 { first_base } else { cont_base };
let mut len = base + unsafe { *wl_ptr.add(i) } as usize;
let mut best_total = i64::MAX;
let mut best_j = i as u32;
let mut best_len = len as i32;
for j in i..n {
if j > i {
len += unsafe { *sw_ptr.add(j) } as usize + unsafe { *wl_ptr.add(j) } as usize;
}
if len >= width {
if j == i {
let cj1 = unsafe { *dp_cost_ptr.add(j + 1) };
if cj1 >= 0 {
best_total = cj1;
best_j = j as u32;
best_len = len as i32;
}
}
break;
}
let lc = if j == nm1 {
0i64
} else {
let short_n = goal - len as i64;
let short_cost = short_n * short_n * SHORT_FACTOR;
let next_best = unsafe { *best_ptr.add(j + 1) };
let ragged_cost = if (next_best as usize + 1) < n {
let ragged_n = len as i64 - unsafe { *line_len_ptr.add(j + 1) } as i64;
ragged_n * ragged_n * RAGGED_FACTOR
} else {
0
};
short_cost + ragged_cost
};
let cj1 = unsafe { *dp_cost_ptr.add(j + 1) };
if cj1 >= 0 {
let total = lc + unsafe { *bc_ptr.add(j) } + cj1;
if total < best_total {
best_total = total;
best_j = j as u32;
best_len = len as i32;
}
}
}
if best_total < i64::MAX {
unsafe {
*dp_cost_ptr.add(i) = best_total;
*best_ptr.add(i) = best_j;
*line_len_ptr.add(i) = best_len;
}
}
}
}
#[allow(clippy::too_many_arguments)]
fn reflow_chunk<W: Write>(
bytes: &[u8],
prefix: &[u8],
first_indent: &[u8],
cont_indent: &[u8],
config: &FmtConfig,
output: &mut W,
word_off: &[u32],
winfo: &[u32],
dp: &mut DpBufs,
) -> io::Result<()> {
let n = word_off.len();
if n == 0 {
return Ok(());
}
let out_buf = &mut dp.out_buf;
out_buf.clear();
let base = prefix.len() + first_indent.len();
let mut total_width = base;
for j in 0..n {
let wl = (winfo[j] & 0xFFFF) as usize;
if j > 0 {
total_width += if winfo[j - 1] & SENT_FLAG != 0 { 2 } else { 1 };
}
total_width += wl;
}
if total_width <= config.width {
out_buf.extend_from_slice(prefix);
out_buf.extend_from_slice(first_indent);
let off = word_off[0] as usize;
let wlen = (winfo[0] & 0xFFFF) as usize;
out_buf.extend_from_slice(&bytes[off..off + wlen]);
for k in 1..n {
if winfo[k - 1] & SENT_FLAG != 0 {
out_buf.extend_from_slice(b" ");
} else {
out_buf.push(b' ');
}
let off = word_off[k] as usize;
let wlen = (winfo[k] & 0xFFFF) as usize;
out_buf.extend_from_slice(&bytes[off..off + wlen]);
}
out_buf.push(b'\n');
return output.write_all(out_buf);
}
run_dp(n, prefix, first_indent, cont_indent, config, winfo, dp);
let out_buf = &mut dp.out_buf;
let mut i = 0;
let mut is_first_line = true;
while i < n {
let j = dp.best[i] as usize;
out_buf.extend_from_slice(prefix);
if is_first_line {
out_buf.extend_from_slice(first_indent);
} else {
out_buf.extend_from_slice(cont_indent);
}
let off = word_off[i] as usize;
let wlen = (winfo[i] & 0xFFFF) as usize;
out_buf.extend_from_slice(&bytes[off..off + wlen]);
for k in (i + 1)..=j {
if winfo[k - 1] & SENT_FLAG != 0 {
out_buf.extend_from_slice(b" ");
} else {
out_buf.push(b' ');
}
let off = word_off[k] as usize;
let wlen = (winfo[k] & 0xFFFF) as usize;
out_buf.extend_from_slice(&bytes[off..off + wlen]);
}
out_buf.push(b'\n');
is_first_line = false;
i = j + 1;
}
output.write_all(out_buf)
}
#[allow(clippy::too_many_arguments)]
fn split_line_optimal<W: Write>(
bytes: &[u8],
line_start: usize,
line_end: usize,
config: &FmtConfig,
prefix: Option<&[u8]>,
output: &mut W,
ctx: &mut FmtCtx,
) -> io::Result<()> {
let line_len = line_end - line_start;
let pfx = prefix.unwrap_or(b"");
if line_len < config.width {
output.write_all(&bytes[line_start..line_end])?;
output.write_all(b"\n")?;
return Ok(());
}
let content_start = match prefix {
Some(pfx_bytes) if bytes[line_start..line_end].starts_with(pfx_bytes) => {
line_start + pfx_bytes.len()
}
_ => line_start,
};
let indent_len = leading_ws_len(&bytes[content_start..line_end]);
let indent = &bytes[content_start..content_start + indent_len];
ctx.clear_words();
collect_words_line(bytes, content_start, line_end, ctx);
if ctx.word_off.is_empty() {
output.write_all(&bytes[line_start..line_end])?;
output.write_all(b"\n")?;
return Ok(());
}
let last = ctx.winfo.len() - 1;
ctx.winfo[last] |= SENT_FLAG | PERIOD_FLAG;
let n = ctx.word_off.len();
let mut dp = DpBufs::new();
run_dp(n, pfx, indent, indent, config, &ctx.winfo, &mut dp);
let line_buf = &mut dp.out_buf;
let mut i = 0;
while i < n {
let j = dp.best[i] as usize;
line_buf.clear();
line_buf.extend_from_slice(pfx);
line_buf.extend_from_slice(indent);
let off = ctx.word_off[i] as usize;
let wlen = (ctx.winfo[i] & 0xFFFF) as usize;
line_buf.extend_from_slice(&bytes[off..off + wlen]);
for k in (i + 1)..=j {
if ctx.winfo[k - 1] & SENT_FLAG != 0 {
line_buf.extend_from_slice(b" ");
} else {
line_buf.push(b' ');
}
let off = ctx.word_off[k] as usize;
let wlen = (ctx.winfo[k] & 0xFFFF) as usize;
line_buf.extend_from_slice(&bytes[off..off + wlen]);
}
line_buf.push(b'\n');
output.write_all(line_buf)?;
i = j + 1;
}
Ok(())
}