use std::io::BufRead;
use std::iter::Peekable;
use std::slice::Iter;
use unicode_width::UnicodeWidthChar;
use crate::FileOrStdReader;
use crate::FmtOptions;
fn char_width(c: char) -> usize {
if (c as usize) < 0xA0 {
1
} else {
UnicodeWidthChar::width(c).unwrap_or(1)
}
}
fn utf8_char_width(byte: u8) -> Option<usize> {
const ASCII_MAX: u8 = 0x7F;
const TWO_BYTE_START: u8 = 0xC2;
const TWO_BYTE_END: u8 = 0xDF;
const THREE_BYTE_START: u8 = 0xE0;
const THREE_BYTE_END: u8 = 0xEF;
const FOUR_BYTE_START: u8 = 0xF0;
const FOUR_BYTE_END: u8 = 0xF4;
if byte <= ASCII_MAX {
return Some(1);
}
if (TWO_BYTE_START..=TWO_BYTE_END).contains(&byte) {
return Some(2);
}
if (THREE_BYTE_START..=THREE_BYTE_END).contains(&byte) {
return Some(3);
}
if (FOUR_BYTE_START..=FOUR_BYTE_END).contains(&byte) {
return Some(4);
}
None
}
fn decode_char(bytes: &[u8], start: usize) -> (Option<char>, usize) {
let Some(&first) = bytes.get(start) else {
return (None, 1);
};
if first < 0x80 {
return (Some(first as char), 1);
}
let Some(width) = utf8_char_width(first) else {
return (None, 1);
};
if start + width > bytes.len() {
return (None, 1);
}
match std::str::from_utf8(&bytes[start..start + width]) {
Ok(s) => (s.chars().next(), width),
Err(_) => (None, 1),
}
}
struct DecodedCharInfo {
ch: Option<char>,
consumed: usize,
width: usize,
is_ascii: bool,
}
fn decode_char_info(bytes: &[u8], start: usize) -> DecodedCharInfo {
let (ch, consumed) = decode_char(bytes, start);
let (width, is_ascii) = match ch {
Some(c) => (char_width(c), c.is_ascii()),
None => (1, false),
};
DecodedCharInfo {
ch,
consumed,
width,
is_ascii,
}
}
fn byte_display_width(bytes: &[u8]) -> usize {
let mut width = 0;
let mut idx = 0;
while idx < bytes.len() {
let info = decode_char_info(bytes, idx);
width += info.width;
idx += info.consumed;
}
width
}
fn is_fmt_whitespace(c: char) -> bool {
matches!(c, ' ' | '\t' | '\n' | '\r' | '\x0B' | '\x0C')
}
fn is_fmt_whitespace_byte(b: u8) -> bool {
matches!(b, b' ' | b'\t' | b'\n' | b'\r' | 0x0B | 0x0C)
}
#[derive(Debug)]
pub enum Line {
FormatLine(FileLine),
NoFormatLine(Vec<u8>, bool),
}
impl Line {
fn get_formatline(self) -> FileLine {
match self {
Self::FormatLine(fl) => fl,
Self::NoFormatLine(..) => panic!("Found NoFormatLine when expecting FormatLine"),
}
}
fn get_noformatline(self) -> (Vec<u8>, bool) {
match self {
Self::NoFormatLine(s, b) => (s, b),
Self::FormatLine(..) => panic!("Found FormatLine when expecting NoFormatLine"),
}
}
}
#[derive(Debug)]
pub struct FileLine {
line: Vec<u8>,
indent_end: usize,
prefix_indent_end: usize,
indent_len: usize,
prefix_len: usize,
}
pub struct FileLines<'a> {
opts: &'a FmtOptions,
reader: &'a mut FileOrStdReader,
}
impl FileLines<'_> {
fn new<'b>(opts: &'b FmtOptions, reader: &'b mut FileOrStdReader) -> FileLines<'b> {
FileLines { opts, reader }
}
fn match_prefix(&self, line: &[u8]) -> (bool, usize) {
let Some(prefix) = &self.opts.prefix else {
return (true, 0);
};
FileLines::match_prefix_generic(prefix.as_bytes(), line, self.opts.xprefix)
}
fn match_anti_prefix(&self, line: &[u8]) -> bool {
let Some(anti_prefix) = &self.opts.anti_prefix else {
return true;
};
match FileLines::match_prefix_generic(anti_prefix.as_bytes(), line, self.opts.xanti_prefix)
{
(true, _) => false,
(_, _) => true,
}
}
fn match_prefix_generic(pfx: &[u8], line: &[u8], exact: bool) -> (bool, usize) {
if line.starts_with(pfx) {
return (true, 0);
}
if !exact {
let mut i = 0;
while i < line.len() {
if line[i..].starts_with(pfx) {
return (true, i);
} else if !is_fmt_whitespace_byte(line[i]) {
break;
}
i += 1;
}
}
(false, 0)
}
fn compute_indent(&self, bytes: &[u8], prefix_end: usize) -> (usize, usize, usize) {
let mut prefix_len = 0;
let mut indent_len = 0;
let mut indent_end = bytes.len();
let mut idx = 0;
while idx < bytes.len() {
if idx == prefix_end {
prefix_len = indent_len;
}
let byte = bytes[idx];
if idx >= prefix_end && !is_fmt_whitespace_byte(byte) {
indent_end = idx;
break;
}
if byte == b'\t' {
indent_len = (indent_len / self.opts.tabwidth + 1) * self.opts.tabwidth;
idx += 1;
continue;
}
let info = decode_char_info(bytes, idx);
indent_len += info.width;
idx += info.consumed;
}
if indent_end == bytes.len() {
indent_end = idx;
}
(indent_end, prefix_len, indent_len)
}
}
impl Iterator for FileLines<'_> {
type Item = Line;
fn next(&mut self) -> Option<Line> {
let mut buf = Vec::new();
match self.reader.read_until(b'\n', &mut buf) {
Ok(0) => return None,
Ok(_) => {}
Err(_) => return None,
}
if buf.ends_with(b"\n") {
buf.pop();
if buf.ends_with(b"\r") {
buf.pop();
}
}
let n = buf;
if n.iter().all(|&b| is_fmt_whitespace_byte(b)) {
return Some(Line::NoFormatLine(Vec::new(), true));
}
let (pmatch, poffset) = self.match_prefix(&n[..]);
if !pmatch {
return Some(Line::NoFormatLine(n, false));
}
if pmatch
&& n[poffset + self.opts.prefix.as_ref().map_or(0, |s| s.len())..]
.iter()
.all(|&b| is_fmt_whitespace_byte(b))
{
return Some(Line::NoFormatLine(n, false));
}
if !self.match_anti_prefix(&n[..]) {
return Some(Line::NoFormatLine(n, false));
}
let prefix_end = poffset + self.opts.prefix.as_ref().map_or(0, |s| s.len());
let (indent_end, prefix_len, indent_len) = self.compute_indent(&n[..], prefix_end);
Some(Line::FormatLine(FileLine {
line: n,
indent_end,
prefix_indent_end: poffset,
indent_len,
prefix_len,
}))
}
}
#[derive(Debug)]
pub struct Paragraph {
lines: Vec<Vec<u8>>,
pub init_str: Vec<u8>,
pub init_len: usize,
init_end: usize,
pub indent_str: Vec<u8>,
pub indent_len: usize,
indent_end: usize,
pub mail_header: bool,
}
pub struct ParagraphStream<'a> {
lines: Peekable<FileLines<'a>>,
next_mail: bool,
opts: &'a FmtOptions,
}
impl ParagraphStream<'_> {
pub fn new<'b>(opts: &'b FmtOptions, reader: &'b mut FileOrStdReader) -> ParagraphStream<'b> {
let lines = FileLines::new(opts, reader).peekable();
ParagraphStream {
lines,
next_mail: true,
opts,
}
}
fn is_mail_header(line: &FileLine) -> bool {
if line.indent_end > 0 {
false
} else {
let l_slice = &line.line[..];
if l_slice.starts_with(b"From ") {
true
} else {
let Some(colon_posn) = l_slice.iter().position(|&b| b == b':') else {
return false;
};
if colon_posn == 0 {
return false;
}
l_slice[..colon_posn]
.iter()
.all(|&b| (33..=126).contains(&(b as usize)) && b != b':')
}
}
}
}
impl Iterator for ParagraphStream<'_> {
type Item = Result<Paragraph, Vec<u8>>;
#[allow(clippy::cognitive_complexity)]
fn next(&mut self) -> Option<Result<Paragraph, Vec<u8>>> {
let noformat = match self.lines.peek()? {
Line::FormatLine(_) => false,
Line::NoFormatLine(_, _) => true,
};
if noformat {
let (s, nm) = self.lines.next().unwrap().get_noformatline();
self.next_mail = nm;
return Some(Err(s));
}
let mut init_str = Vec::new();
let mut init_end = 0;
let mut init_len = 0;
let mut indent_str = Vec::new();
let mut indent_end = 0;
let mut indent_len = 0;
let mut prefix_len = 0;
let mut prefix_indent_end = 0;
let mut p_lines = Vec::new();
let mut in_mail = false;
let mut second_done = false; loop {
let Some(Line::FormatLine(fl)) = self.lines.peek() else {
break;
};
if p_lines.is_empty() {
if self.opts.mail && self.next_mail && ParagraphStream::is_mail_header(fl) {
in_mail = true;
indent_str.extend_from_slice(b" ");
indent_len = 2;
} else {
if self.opts.crown || self.opts.tagged {
init_str.extend_from_slice(&fl.line[..fl.indent_end]);
init_len = fl.indent_len;
init_end = fl.indent_end;
} else {
second_done = true;
}
indent_str.extend_from_slice(&fl.line[..fl.indent_end]);
indent_len = fl.indent_len;
indent_end = fl.indent_end;
prefix_len = fl.prefix_len;
prefix_indent_end = fl.prefix_indent_end;
if self.opts.tagged {
indent_str.extend_from_slice(b" ");
indent_len += 4;
}
}
} else if in_mail {
if fl.indent_end == 0 || (self.opts.prefix.is_some() && fl.prefix_indent_end == 0) {
break; }
} else if !second_done {
if prefix_len != fl.prefix_len || prefix_indent_end != fl.prefix_indent_end {
break;
}
if self.opts.tagged
&& indent_len - 4 == fl.indent_len
&& indent_end == fl.indent_end
{
break;
}
indent_str.clear();
indent_str.extend_from_slice(&fl.line[..fl.indent_end]);
indent_len = fl.indent_len;
indent_end = fl.indent_end;
second_done = true;
} else {
if indent_end != fl.indent_end
|| prefix_indent_end != fl.prefix_indent_end
|| indent_len != fl.indent_len
|| prefix_len != fl.prefix_len
{
break;
}
}
p_lines.push(self.lines.next().unwrap().get_formatline().line);
if self.opts.split_only {
break;
}
}
self.next_mail = in_mail;
Some(Ok(Paragraph {
lines: p_lines,
init_str,
init_len,
init_end,
indent_str,
indent_len,
indent_end,
mail_header: in_mail,
}))
}
}
pub struct ParaWords<'a> {
opts: &'a FmtOptions,
para: &'a Paragraph,
words: Vec<WordInfo<'a>>,
}
impl<'a> ParaWords<'a> {
pub fn new(opts: &'a FmtOptions, para: &'a Paragraph) -> Self {
let mut pw = ParaWords {
opts,
para,
words: Vec::new(),
};
pw.create_words();
pw
}
fn create_words(&mut self) {
if self.para.mail_header {
self.words.extend(
self.para
.lines
.iter()
.flat_map(|x| {
x.split(|b| is_fmt_whitespace_byte(*b))
.filter(|segment| !segment.is_empty())
})
.map(|x| WordInfo {
word: x,
word_start: 0,
word_nchars: byte_display_width(x),
before_tab: None,
after_tab: 0,
sentence_start: false,
ends_punct: false,
new_line: false,
}),
);
} else {
self.words.extend(if self.opts.crown || self.opts.tagged {
WordSplit::new(self.opts, &self.para.lines[0][self.para.init_end..])
} else {
WordSplit::new(self.opts, &self.para.lines[0][self.para.indent_end..])
});
if self.para.lines.len() > 1 {
let indent_end = self.para.indent_end;
let opts = self.opts;
self.words.extend(
self.para
.lines
.iter()
.skip(1)
.flat_map(|x| WordSplit::new(opts, &x[indent_end..])),
);
}
}
}
pub fn words(&'a self) -> Iter<'a, WordInfo<'a>> {
self.words.iter()
}
}
struct WordSplit<'a> {
opts: &'a FmtOptions,
bytes: &'a [u8],
length: usize,
position: usize,
prev_punct: bool,
}
impl WordSplit<'_> {
fn analyze_tabs(&self, bytes: &[u8]) -> (Option<usize>, usize, Option<usize>) {
let mut beforetab = None;
let mut aftertab = 0;
let mut word_start = None;
for (idx, b) in bytes.iter().enumerate() {
if !is_fmt_whitespace_byte(*b) {
word_start = Some(idx);
break;
} else if *b == b'\t' {
if beforetab.is_none() {
beforetab = Some(aftertab);
aftertab = 0;
} else {
aftertab = (aftertab / self.opts.tabwidth + 1) * self.opts.tabwidth;
}
} else {
aftertab += 1;
}
}
(beforetab, aftertab, word_start)
}
fn new<'b>(opts: &'b FmtOptions, bytes: &'b [u8]) -> WordSplit<'b> {
let start = bytes
.iter()
.position(|&b| !is_fmt_whitespace_byte(b))
.unwrap_or(bytes.len());
let trimmed = &bytes[start..];
WordSplit {
opts,
bytes: trimmed,
length: trimmed.len(),
position: 0,
prev_punct: false,
}
}
fn is_punctuation_byte(b: u8) -> bool {
matches!(b, b'!' | b'.' | b'?')
}
fn scan_word_end(&self, word_start: usize) -> (usize, usize, Option<u8>) {
let mut word_nchars = 0;
let mut idx = word_start;
let mut last_ascii = None;
while idx < self.length {
let info = decode_char_info(self.bytes, idx);
let is_whitespace = info.is_ascii && info.ch.is_some_and(is_fmt_whitespace);
if is_whitespace {
break;
}
word_nchars += info.width;
if info.is_ascii {
last_ascii = info.ch.map(|c| c as u8);
} else {
last_ascii = None;
}
idx += info.consumed;
}
(idx, word_nchars, last_ascii)
}
}
pub struct WordInfo<'a> {
pub word: &'a [u8],
pub word_start: usize,
pub word_nchars: usize,
pub before_tab: Option<usize>,
pub after_tab: usize,
pub sentence_start: bool,
pub ends_punct: bool,
pub new_line: bool,
}
impl<'a> Iterator for WordSplit<'a> {
type Item = WordInfo<'a>;
fn next(&mut self) -> Option<WordInfo<'a>> {
if self.position >= self.length {
return None;
}
let old_position = self.position;
let new_line = old_position == 0;
let (before_tab, after_tab, word_start) =
if let (b, a, Some(s)) = self.analyze_tabs(&self.bytes[old_position..]) {
(b, a, s + old_position)
} else {
self.position = self.length;
return None;
};
let (next_position, word_nchars, last_ascii) = self.scan_word_end(word_start);
self.position = next_position;
let word_start_relative = word_start - old_position;
let is_start_of_sentence =
self.prev_punct && (before_tab.is_some() || word_start_relative > 1);
let ends_punct = last_ascii.is_some_and(WordSplit::is_punctuation_byte);
self.prev_punct = ends_punct;
let (word, word_start_relative, before_tab, after_tab) = if self.opts.uniform {
(&self.bytes[word_start..self.position], 0, None, 0)
} else {
(
&self.bytes[old_position..self.position],
word_start_relative,
before_tab,
after_tab,
)
};
Some(WordInfo {
word,
word_start: word_start_relative,
word_nchars,
before_tab,
after_tab,
sentence_start: is_start_of_sentence,
ends_punct,
new_line,
})
}
}