use std::io::Write;
#[derive(Clone, Debug)]
pub enum TabStops {
Regular(usize),
List(Vec<usize>),
}
impl TabStops {
#[inline]
fn spaces_to_next(&self, column: usize) -> usize {
match self {
TabStops::Regular(n) => {
if *n == 0 {
return 0;
}
*n - (column % *n)
}
TabStops::List(stops) => {
match stops.binary_search(&(column + 1)) {
Ok(idx) => stops[idx] - column,
Err(idx) => {
if idx < stops.len() {
stops[idx] - column
} else {
1
}
}
}
}
}
}
#[inline]
fn next_tab_stop(&self, column: usize) -> usize {
column + self.spaces_to_next(column)
}
}
pub fn parse_tab_stops(spec: &str) -> Result<TabStops, String> {
let spec = spec.trim();
if spec.is_empty() {
return Ok(TabStops::Regular(8));
}
if let Ok(n) = spec.parse::<usize>() {
if n == 0 {
return Err("tab size cannot be 0".to_string());
}
return Ok(TabStops::Regular(n));
}
let mut stops: Vec<usize> = Vec::new();
for part in spec.split([',', ' ']) {
let part = part.trim();
if part.is_empty() {
continue;
}
if let Some(rest) = part.strip_prefix('/') {
let n: usize = rest
.parse()
.map_err(|_| format!("'{}' is not a valid number", part))?;
if n == 0 {
return Err("tab size cannot be 0".to_string());
}
let last = stops.last().copied().unwrap_or(0);
let mut pos = last + n;
while pos < 10000 {
stops.push(pos);
pos += n;
}
continue;
}
match part.parse::<usize>() {
Ok(n) => {
if !stops.is_empty() && n <= *stops.last().unwrap() {
return Err("tab sizes must be ascending".to_string());
}
stops.push(n);
}
Err(_) => return Err(format!("'{}' is not a valid number", part)),
}
}
if stops.is_empty() {
return Err("tab specification is empty".to_string());
}
if stops.len() == 1 {
return Ok(TabStops::Regular(stops[0]));
}
Ok(TabStops::List(stops))
}
const SPACES: [u8; 4096] = [b' '; 4096];
#[inline]
fn push_spaces(output: &mut Vec<u8>, n: usize) {
let mut remaining = n;
while remaining > 0 {
let chunk = remaining.min(SPACES.len());
output.extend_from_slice(&SPACES[..chunk]);
remaining -= chunk;
}
}
#[inline]
fn write_spaces(out: &mut impl Write, n: usize) -> std::io::Result<()> {
let mut remaining = n;
while remaining > 0 {
let chunk = remaining.min(SPACES.len());
out.write_all(&SPACES[..chunk])?;
remaining -= chunk;
}
Ok(())
}
pub fn expand_bytes(
data: &[u8],
tabs: &TabStops,
initial_only: bool,
out: &mut impl Write,
) -> std::io::Result<()> {
if data.is_empty() {
return Ok(());
}
if let TabStops::Regular(tab_size) = tabs {
if initial_only {
if memchr::memchr(b'\t', data).is_none() {
return out.write_all(data);
}
return expand_initial_fast(data, *tab_size, out);
}
if memchr::memchr(b'\t', data).is_none() {
return out.write_all(data);
}
if memchr::memchr(b'\x08', data).is_none() {
return expand_regular_fast(data, *tab_size, out);
}
return expand_generic(data, tabs, initial_only, true, out);
}
if memchr::memchr(b'\t', data).is_none() {
return out.write_all(data);
}
let has_backspace = memchr::memchr(b'\x08', data).is_some();
expand_generic(data, tabs, initial_only, has_backspace, out)
}
const PARALLEL_THRESHOLD: usize = 4 * 1024 * 1024;
fn expand_regular_fast(data: &[u8], tab_size: usize, out: &mut impl Write) -> std::io::Result<()> {
debug_assert!(tab_size > 0, "tab_size must be > 0");
if data.len() >= PARALLEL_THRESHOLD {
expand_regular_parallel(data, tab_size, out)
} else {
expand_regular_single(data, tab_size, out)
}
}
fn expand_regular_single(
data: &[u8],
tab_size: usize,
out: &mut impl Write,
) -> std::io::Result<()> {
let is_pow2 = tab_size.is_power_of_two();
let mask = tab_size.wrapping_sub(1);
const BUF_CAP: usize = 2 * 1024 * 1024;
const FLUSH_AT: usize = BUF_CAP - 64 * 1024;
let mut output: Vec<u8> = Vec::with_capacity(BUF_CAP);
let src = data.as_ptr();
let mut column: usize = 0;
let mut pos: usize = 0;
let mut wp: usize = 0;
for hit in memchr::memchr2_iter(b'\t', b'\n', data) {
let seg_len = hit - pos;
if wp + seg_len + tab_size > BUF_CAP {
unsafe {
output.set_len(wp);
}
out.write_all(&output)?;
output.clear();
wp = 0;
if seg_len > BUF_CAP {
out.write_all(&data[pos..pos + seg_len])?;
column += seg_len;
pos = hit + 1;
let out_ptr = output.as_mut_ptr();
if unsafe { *src.add(hit) } == b'\n' {
unsafe {
*out_ptr.add(wp) = b'\n';
}
wp += 1;
column = 0;
} else {
let rem = if is_pow2 {
column & mask
} else {
column % tab_size
};
let spaces = tab_size - rem;
unsafe {
std::ptr::write_bytes(out_ptr.add(wp), b' ', spaces);
}
wp += spaces;
column += spaces;
}
continue;
}
}
let out_ptr = output.as_mut_ptr();
if seg_len > 0 {
unsafe {
std::ptr::copy_nonoverlapping(src.add(pos), out_ptr.add(wp), seg_len);
}
wp += seg_len;
column += seg_len;
}
if unsafe { *src.add(hit) } == b'\n' {
unsafe {
*out_ptr.add(wp) = b'\n';
}
wp += 1;
column = 0;
} else {
let rem = if is_pow2 {
column & mask
} else {
column % tab_size
};
let spaces = tab_size - rem;
unsafe {
std::ptr::write_bytes(out_ptr.add(wp), b' ', spaces);
}
wp += spaces;
column += spaces;
}
pos = hit + 1;
if wp >= FLUSH_AT {
unsafe {
output.set_len(wp);
}
out.write_all(&output)?;
output.clear();
wp = 0;
}
}
if pos < data.len() {
let tail = data.len() - pos;
if wp + tail > BUF_CAP {
unsafe {
output.set_len(wp);
}
out.write_all(&output)?;
output.clear();
wp = 0;
if tail > BUF_CAP {
out.write_all(&data[pos..])?;
return Ok(());
}
}
unsafe {
std::ptr::copy_nonoverlapping(src.add(pos), output.as_mut_ptr().add(wp), tail);
}
wp += tail;
}
if wp > 0 {
unsafe {
output.set_len(wp);
}
out.write_all(&output)?;
}
Ok(())
}
fn expand_chunk(chunk: &[u8], tab_size: usize, is_pow2: bool, mask: usize) -> Vec<u8> {
let worst_case = chunk.len() * tab_size;
let mut output: Vec<u8> = Vec::with_capacity(worst_case);
let out_ptr = output.as_mut_ptr();
let src = chunk.as_ptr();
let mut wp: usize = 0;
let mut column: usize = 0;
let mut pos: usize = 0;
for hit in memchr::memchr2_iter(b'\t', b'\n', chunk) {
let seg_len = hit - pos;
if seg_len > 0 {
unsafe {
std::ptr::copy_nonoverlapping(src.add(pos), out_ptr.add(wp), seg_len);
}
wp += seg_len;
column += seg_len;
}
if unsafe { *src.add(hit) } == b'\n' {
unsafe {
*out_ptr.add(wp) = b'\n';
}
wp += 1;
column = 0;
} else {
let rem = if is_pow2 {
column & mask
} else {
column % tab_size
};
let spaces = tab_size - rem;
unsafe {
std::ptr::write_bytes(out_ptr.add(wp), b' ', spaces);
}
wp += spaces;
column += spaces;
}
pos = hit + 1;
}
if pos < chunk.len() {
let tail = chunk.len() - pos;
unsafe {
std::ptr::copy_nonoverlapping(src.add(pos), out_ptr.add(wp), tail);
}
wp += tail;
}
unsafe {
output.set_len(wp);
}
output
}
fn expand_regular_parallel(
data: &[u8],
tab_size: usize,
out: &mut impl Write,
) -> std::io::Result<()> {
use rayon::prelude::*;
let is_pow2 = tab_size.is_power_of_two();
let mask = tab_size.wrapping_sub(1);
let num_chunks = rayon::current_num_threads().max(2);
let target_chunk_size = data.len() / num_chunks;
let mut chunks: Vec<&[u8]> = Vec::with_capacity(num_chunks + 1);
let mut pos: usize = 0;
for _ in 0..num_chunks - 1 {
if pos >= data.len() {
break;
}
let target_end = (pos + target_chunk_size).min(data.len());
let chunk_end = if target_end >= data.len() {
data.len()
} else {
match memchr::memchr(b'\n', &data[target_end..]) {
Some(off) => target_end + off + 1,
None => data.len(),
}
};
chunks.push(&data[pos..chunk_end]);
pos = chunk_end;
}
if pos < data.len() {
chunks.push(&data[pos..]);
}
let results: Vec<Vec<u8>> = chunks
.par_iter()
.map(|chunk| expand_chunk(chunk, tab_size, is_pow2, mask))
.collect();
for result in &results {
if !result.is_empty() {
out.write_all(result)?;
}
}
Ok(())
}
fn expand_initial_fast(data: &[u8], tab_size: usize, out: &mut impl Write) -> std::io::Result<()> {
debug_assert!(tab_size > 0, "tab_size must be > 0");
let tabs = TabStops::Regular(tab_size);
let mut pos: usize = 0;
while pos < data.len() {
let line_end = memchr::memchr(b'\n', &data[pos..])
.map(|off| pos + off + 1)
.unwrap_or(data.len());
let line = &data[pos..line_end];
debug_assert!(!line.is_empty());
let first = line[0];
if first != b'\t' && first != b' ' {
out.write_all(line)?;
pos = line_end;
continue;
}
if memchr::memchr(b'\x08', line).is_some() {
expand_generic(line, &tabs, true, true, out)?;
pos = line_end;
continue;
}
let mut column: usize = 0;
let mut i = 0; while i < line.len() {
let byte = line[i];
if byte == b'\t' {
let spaces = tab_size - (column % tab_size);
write_spaces(out, spaces)?;
column += spaces;
i += 1;
} else if byte == b' ' {
let space_start = i;
while i < line.len() && line[i] == b' ' {
i += 1;
}
out.write_all(&line[space_start..i])?;
column += i - space_start;
} else {
break;
}
}
if i < line.len() {
out.write_all(&line[i..])?;
}
pos = line_end;
}
Ok(())
}
fn expand_generic(
data: &[u8],
tabs: &TabStops,
initial_only: bool,
has_backspace: bool,
out: &mut impl Write,
) -> std::io::Result<()> {
const FLUSH_THRESHOLD: usize = 256 * 1024;
let cap = data.len().min(FLUSH_THRESHOLD) + data.len().min(FLUSH_THRESHOLD) / 8;
let mut output = Vec::with_capacity(cap);
if !initial_only && !has_backspace {
let mut column: usize = 0;
let mut pos: usize = 0;
while pos < data.len() {
match memchr::memchr2(b'\t', b'\n', &data[pos..]) {
Some(offset) => {
if offset > 0 {
output.extend_from_slice(&data[pos..pos + offset]);
column += offset;
}
let byte = data[pos + offset];
pos += offset + 1;
if byte == b'\n' {
output.push(b'\n');
column = 0;
} else {
let spaces = tabs.spaces_to_next(column);
push_spaces(&mut output, spaces);
column += spaces;
}
if output.len() >= FLUSH_THRESHOLD {
out.write_all(&output)?;
output.clear();
}
}
None => {
output.extend_from_slice(&data[pos..]);
break;
}
}
}
} else {
let mut column: usize = 0;
let mut in_initial = true;
for &byte in data {
match byte {
b'\t' => {
if initial_only && !in_initial {
output.push(b'\t');
column = tabs.next_tab_stop(column);
} else {
let spaces = tabs.spaces_to_next(column);
push_spaces(&mut output, spaces);
column += spaces;
}
}
b'\n' => {
output.push(b'\n');
column = 0;
in_initial = true;
if output.len() >= FLUSH_THRESHOLD {
out.write_all(&output)?;
output.clear();
}
}
b'\x08' => {
output.push(b'\x08');
if column > 0 {
column -= 1;
}
}
_ => {
if initial_only && in_initial && byte != b' ' {
in_initial = false;
}
output.push(byte);
column += 1;
}
}
}
}
if !output.is_empty() {
out.write_all(&output)?;
}
Ok(())
}
pub fn unexpand_is_passthrough(data: &[u8], tabs: &TabStops, all: bool) -> bool {
if data.is_empty() {
return true;
}
if memchr::memchr2(b' ', b'\t', data).is_none() {
return true;
}
if let TabStops::Regular(_) = tabs {
if all {
memchr::memchr(b'\t', data).is_none() && memchr::memmem::find(data, b" ").is_none()
} else {
!unexpand_default_needs_processing(data)
}
} else {
false
}
}
pub fn unexpand_bytes(
data: &[u8],
tabs: &TabStops,
all: bool,
out: &mut impl Write,
) -> std::io::Result<()> {
if data.is_empty() {
return Ok(());
}
if memchr::memchr2(b' ', b'\t', data).is_none() {
return out.write_all(data);
}
if let TabStops::Regular(tab_size) = tabs {
if all {
if memchr::memchr(b'\t', data).is_none() && memchr::memmem::find(data, b" ").is_none()
{
return out.write_all(data);
}
} else {
if !unexpand_default_needs_processing(data) {
return out.write_all(data);
}
}
if memchr::memchr(b'\x08', data).is_none() {
return unexpand_regular_fast(data, *tab_size, all, out);
}
}
unexpand_generic(data, tabs, all, out)
}
#[inline]
fn unexpand_default_needs_processing(data: &[u8]) -> bool {
if data[0] == b' ' || data[0] == b'\t' {
return true;
}
memchr::memmem::find(data, b"\n ").is_some() || memchr::memmem::find(data, b"\n\t").is_some()
}
#[inline]
fn emit_blanks(
out: &mut impl Write,
start_col: usize,
count: usize,
tab_size: usize,
) -> std::io::Result<()> {
if count == 0 {
return Ok(());
}
let end_col = start_col + count;
let mut col = start_col;
loop {
let next_tab = col + (tab_size - col % tab_size);
if next_tab > end_col {
break;
}
let blanks_consumed = next_tab - col;
if blanks_consumed >= 2 || next_tab < end_col {
out.write_all(b"\t")?;
col = next_tab;
} else {
break;
}
}
let remaining = end_col - col;
if remaining > 0 {
let mut r = remaining;
while r > 0 {
let chunk = r.min(SPACES.len());
out.write_all(&SPACES[..chunk])?;
r -= chunk;
}
}
Ok(())
}
fn unexpand_regular_fast(
data: &[u8],
tab_size: usize,
all: bool,
out: &mut impl Write,
) -> std::io::Result<()> {
if all {
return unexpand_regular_fast_all(data, tab_size, out);
}
let mut column: usize = 0;
let mut pos: usize = 0;
let mut in_initial = true;
while pos < data.len() {
if in_initial {
if data[pos] == b' ' || data[pos] == b'\t' {
let blank_start_col = column;
while pos < data.len() && (data[pos] == b' ' || data[pos] == b'\t') {
if data[pos] == b'\t' {
column += tab_size - column % tab_size;
} else {
column += 1;
}
pos += 1;
}
emit_blanks(out, blank_start_col, column - blank_start_col, tab_size)?;
continue;
}
if data[pos] == b'\n' {
out.write_all(b"\n")?;
column = 0;
in_initial = true;
pos += 1;
continue;
}
}
match memchr::memchr(b'\n', &data[pos..]) {
Some(offset) => {
out.write_all(&data[pos..pos + offset + 1])?;
column = 0;
in_initial = true;
pos += offset + 1;
}
None => {
out.write_all(&data[pos..])?;
return Ok(());
}
}
}
Ok(())
}
fn unexpand_regular_fast_all(
data: &[u8],
tab_size: usize,
out: &mut impl Write,
) -> std::io::Result<()> {
let mut output: Vec<u8> = Vec::with_capacity(data.len());
let mut pos: usize = 0;
for nl_pos in memchr::memchr_iter(b'\n', data) {
let line = &data[pos..nl_pos];
if memchr::memchr(b'\t', line).is_none() && memchr::memmem::find(line, b" ").is_none() {
output.extend_from_slice(line);
} else {
unexpand_line_all_fast(line, tab_size, &mut output);
}
output.push(b'\n');
if output.len() >= 1024 * 1024 {
out.write_all(&output)?;
output.clear();
}
pos = nl_pos + 1;
}
if pos < data.len() {
let line = &data[pos..];
if memchr::memchr(b'\t', line).is_none() && memchr::memmem::find(line, b" ").is_none() {
output.extend_from_slice(line);
} else {
unexpand_line_all_fast(line, tab_size, &mut output);
}
}
if !output.is_empty() {
out.write_all(&output)?;
}
Ok(())
}
#[inline]
fn unexpand_line_all_fast(line: &[u8], tab_size: usize, output: &mut Vec<u8>) {
let mut column: usize = 0;
let mut pos: usize = 0;
loop {
let blank_pos = {
let mut search = pos;
loop {
match memchr::memchr2(b' ', b'\t', &line[search..]) {
Some(off) => {
let abs = search + off;
if line[abs] == b'\t' {
break Some(abs);
}
if abs + 1 < line.len() && (line[abs + 1] == b' ' || line[abs + 1] == b'\t')
{
break Some(abs);
}
search = abs + 1;
}
None => break None,
}
}
};
match blank_pos {
Some(bp) => {
if bp > pos {
output.extend_from_slice(&line[pos..bp]);
column += bp - pos;
}
let blank_start_col = column;
let blank_start = bp;
pos = bp;
let mut has_tab = false;
while pos < line.len() && (line[pos] == b' ' || line[pos] == b'\t') {
if line[pos] == b'\t' {
has_tab = true;
column += tab_size - column % tab_size;
} else {
column += 1;
}
pos += 1;
}
if has_tab {
emit_blank_run_vec(output, &line[blank_start..pos], blank_start_col, tab_size);
} else {
let more_follow = pos < line.len();
emit_spaces_only_vec(
output,
blank_start_col,
pos - blank_start,
tab_size,
more_follow,
);
}
}
None => {
if pos < line.len() {
output.extend_from_slice(&line[pos..]);
}
break;
}
}
}
}
#[inline(always)]
fn emit_spaces_only_vec(
output: &mut Vec<u8>,
start_col: usize,
count: usize,
tab_size: usize,
more_follow: bool,
) {
if count == 0 {
return;
}
let end_col = start_col + count;
let mut col = start_col;
loop {
let next_tab = col + (tab_size - col % tab_size);
if next_tab > end_col {
break;
}
let blanks_to_tab = next_tab - col;
if blanks_to_tab >= 2 || next_tab < end_col || more_follow {
output.push(b'\t');
col = next_tab;
} else {
break;
}
}
let remaining = end_col - col;
if remaining > 0 {
let len = output.len();
output.resize(len + remaining, b' ');
}
}
#[inline(always)]
fn emit_blank_run_vec(output: &mut Vec<u8>, blanks: &[u8], start_col: usize, tab_size: usize) {
if memchr::memchr(b'\t', blanks).is_none() {
emit_spaces_only_vec(output, start_col, blanks.len(), tab_size, false);
return;
}
let mut col = start_col;
let mut pending_spaces: usize = 0;
let mut pending_start_col = start_col;
for (idx, &b) in blanks.iter().enumerate() {
if b == b'\t' {
let new_col = col + (tab_size - col % tab_size);
if pending_spaces > 0 {
let mut tc = pending_start_col;
loop {
let next_ts = tc + (tab_size - tc % tab_size);
if next_ts > new_col {
break;
}
output.push(b'\t');
tc = next_ts;
}
let gap = new_col - tc;
if gap > 0 {
let len = output.len();
output.resize(len + gap, b' ');
}
pending_spaces = 0;
} else {
output.push(b'\t');
}
col = new_col;
pending_start_col = col;
} else {
if pending_spaces == 0 {
pending_start_col = col;
}
pending_spaces += 1;
col += 1;
if col.is_multiple_of(tab_size) {
let more_follow = idx + 1 < blanks.len();
if pending_spaces >= 2 || more_follow {
output.push(b'\t');
pending_spaces = 0;
pending_start_col = col;
}
}
}
}
if pending_spaces > 0 {
let len = output.len();
output.resize(len + pending_spaces, b' ');
}
}
fn unexpand_generic(
data: &[u8],
tabs: &TabStops,
all: bool,
out: &mut impl Write,
) -> std::io::Result<()> {
let tab_size = match tabs {
TabStops::Regular(n) => *n,
TabStops::List(_) => 0, };
let mut column: usize = 0;
let mut space_start_col: Option<usize> = None;
let mut in_initial = true;
for &byte in data {
match byte {
b' ' => {
if !all && !in_initial {
out.write_all(b" ")?;
column += 1;
} else {
if space_start_col.is_none() {
space_start_col = Some(column);
}
column += 1;
}
}
b'\t' => {
if !all && !in_initial {
if let Some(start_col) = space_start_col.take() {
let n = column - start_col;
out.write_all(&SPACES[..n.min(SPACES.len())])?;
}
out.write_all(b"\t")?;
column = tabs.next_tab_stop(column);
} else {
if space_start_col.is_none() {
space_start_col = Some(column);
}
column = tabs.next_tab_stop(column);
}
}
_ => {
if let Some(start_col) = space_start_col.take() {
let count = column - start_col;
if tab_size > 0 {
emit_blanks(out, start_col, count, tab_size)?;
} else {
emit_blanks_tablist(out, start_col, count, tabs)?;
}
}
if byte == b'\n' {
out.write_all(b"\n")?;
column = 0;
in_initial = true;
} else if byte == b'\x08' {
out.write_all(b"\x08")?;
if column > 0 {
column -= 1;
}
} else {
if in_initial {
in_initial = false;
}
out.write_all(&[byte])?;
column += 1;
}
}
}
}
if let Some(start_col) = space_start_col {
let count = column - start_col;
if tab_size > 0 {
emit_blanks(out, start_col, count, tab_size)?;
} else {
emit_blanks_tablist(out, start_col, count, tabs)?;
}
}
Ok(())
}
fn emit_blanks_tablist(
out: &mut impl Write,
start_col: usize,
count: usize,
tabs: &TabStops,
) -> std::io::Result<()> {
if count == 0 {
return Ok(());
}
let end_col = start_col + count;
let mut col = start_col;
let last_stop = match tabs {
TabStops::List(stops) => stops.last().copied().unwrap_or(0),
TabStops::Regular(_) => usize::MAX,
};
while col < last_stop {
let next_tab = tabs.next_tab_stop(col);
if next_tab > end_col || next_tab > last_stop {
break;
}
let blanks_consumed = next_tab - col;
if blanks_consumed >= 2 || next_tab < end_col {
out.write_all(b"\t")?;
col = next_tab;
} else {
break;
}
}
let remaining = end_col - col;
if remaining > 0 {
let mut r = remaining;
while r > 0 {
let chunk = r.min(SPACES.len());
out.write_all(&SPACES[..chunk])?;
r -= chunk;
}
}
Ok(())
}