use std::borrow::Cow;
use std::fmt::Write;
/// Outcome of turning a byte buffer into text.
#[derive(Debug, Clone)]
pub struct EncodedText {
    /// The decoded text.
    pub text: String,
    /// Number of input bytes accounted for by the decoder (the decoders in
    /// this file report the full input length).
    pub bytes_consumed: usize,
    /// `true` when at least one replacement was recorded.
    pub had_errors: bool,
    /// How many substitutions were made for invalid input; the unit (sequences
    /// or bytes) is chosen by whichever decoder produced the value.
    pub replacements: usize,
}

impl EncodedText {
    /// Builds the result for input that decoded cleanly: no errors and zero
    /// replacements.
    #[must_use]
    pub fn ok(text: impl Into<String>, bytes_consumed: usize) -> Self {
        Self::with_errors(text, bytes_consumed, 0)
    }

    /// Builds a result carrying a replacement count.
    ///
    /// `had_errors` is derived from the count: it is set exactly when
    /// `replacements` is non-zero.
    #[must_use]
    pub fn with_errors(
        text: impl Into<String>,
        bytes_consumed: usize,
        replacements: usize,
    ) -> Self {
        let had_errors = replacements != 0;
        Self {
            text: text.into(),
            bytes_consumed,
            had_errors,
            replacements,
        }
    }
}
/// Decodes `bytes` as UTF-8, substituting U+FFFD for every invalid sequence.
///
/// `replacements` is the number of invalid sequences found in the *input*.
/// A U+FFFD character that is legitimately encoded in `bytes` is ordinary
/// text: it is preserved and is not counted as an error.
#[must_use]
pub fn decode_utf8_lossy(bytes: &[u8]) -> EncodedText {
    // Count from the input rather than by scanning the output for U+FFFD:
    // the output cannot distinguish inserted replacement characters from ones
    // that were already present in valid input.
    let mut replacements = 0;
    let mut rest = bytes;
    while let Err(err) = std::str::from_utf8(rest) {
        // Each reported error corresponds to exactly one U+FFFD emitted by
        // lossy decoding.
        replacements += 1;
        match err.error_len() {
            Some(len) => rest = &rest[err.valid_up_to() + len..],
            // Input ended inside a multi-byte sequence; nothing follows it.
            None => break,
        }
    }
    EncodedText {
        text: String::from_utf8_lossy(bytes).into_owned(),
        bytes_consumed: bytes.len(),
        had_errors: replacements > 0,
        replacements,
    }
}
/// Decodes `bytes` as UTF-8 without tolerating any invalid data.
///
/// # Errors
///
/// Returns the [`std::str::Utf8Error`] from validation when `bytes` is not
/// well-formed UTF-8.
pub fn decode_utf8_strict(bytes: &[u8]) -> Result<EncodedText, std::str::Utf8Error> {
    std::str::from_utf8(bytes).map(|valid| EncodedText::ok(valid, bytes.len()))
}
/// Decodes `bytes` as UTF-8, rendering every invalid byte as a `\xNN` escape
/// (two lowercase hex digits).
///
/// `replacements` in the result is the number of bytes that were escaped.
#[must_use]
#[allow(unsafe_code)]
pub fn decode_utf8_escape(bytes: &[u8]) -> EncodedText {
    let mut escaped = String::with_capacity(bytes.len());
    let mut escaped_bytes = 0;
    let mut rest = bytes;
    while !rest.is_empty() {
        let err = match std::str::from_utf8(rest) {
            Ok(tail) => {
                escaped.push_str(tail);
                break;
            }
            Err(err) => err,
        };
        let (valid, after_valid) = rest.split_at(err.valid_up_to());
        // SAFETY: `valid_up_to` marks the end of the longest prefix of `rest`
        // that the validator just accepted as well-formed UTF-8.
        escaped.push_str(unsafe { std::str::from_utf8_unchecked(valid) });
        // A sequence cut off by end-of-input reports no length; escape one
        // byte and let the next iteration deal with whatever follows.
        let bad_len = err.error_len().unwrap_or(1);
        let (bad, remainder) = after_valid.split_at(bad_len);
        for byte in bad {
            // Writing into a `String` cannot fail.
            let _ = write!(escaped, "\\x{byte:02x}");
            escaped_bytes += 1;
        }
        rest = remainder;
    }
    EncodedText::with_errors(escaped, bytes.len(), escaped_bytes)
}
/// Decodes `bytes` as UTF-8, silently dropping every invalid byte.
///
/// `replacements` in the result is the number of bytes that were dropped.
#[must_use]
#[allow(unsafe_code)]
pub fn decode_utf8_skip(bytes: &[u8]) -> EncodedText {
    let mut kept = String::with_capacity(bytes.len());
    let mut dropped = 0;
    let mut cursor = 0;
    loop {
        let pending = &bytes[cursor..];
        if pending.is_empty() {
            break;
        }
        match std::str::from_utf8(pending) {
            Ok(tail) => {
                kept.push_str(tail);
                break;
            }
            Err(err) => {
                let good = err.valid_up_to();
                // SAFETY: the validator accepted the first `good` bytes of
                // `pending` as well-formed UTF-8.
                kept.push_str(unsafe { std::str::from_utf8_unchecked(&pending[..good]) });
                // A sequence cut off by end-of-input reports no length; drop
                // one byte and re-validate from the next one.
                let bad = err.error_len().unwrap_or(1);
                dropped += bad;
                cursor += good + bad;
            }
        }
    }
    EncodedText::with_errors(kept, bytes.len(), dropped)
}
/// Rewrites every line break in `text` (CRLF, bare CR, or bare LF) as
/// `ending`.
///
/// Returns `Cow::Borrowed` when `text` already uses only the requested
/// ending (including when it has no line breaks at all), and an owned,
/// converted copy otherwise.
#[must_use]
pub fn normalize_line_endings(text: &str, ending: LineEndingStyle) -> Cow<'_, str> {
    let already_normalized = match ending {
        // Any '\r' means a CRLF or a bare CR is present.
        LineEndingStyle::Lf => !text.contains('\r'),
        // Any '\n' means a CRLF or a bare LF is present.
        LineEndingStyle::Cr => !text.contains('\n'),
        // Once the CRLF pairs are split out, a remaining '\r' or '\n' is a
        // bare break that still has to be converted. Checking only for the
        // presence of a CRLF would wrongly accept mixed-ending input.
        LineEndingStyle::CrLf => text
            .split("\r\n")
            .all(|segment| !segment.contains(|c: char| c == '\r' || c == '\n')),
    };
    if already_normalized {
        return Cow::Borrowed(text);
    }

    // Collapse everything to LF first (CRLF before CR so pairs are not turned
    // into two breaks), then expand to the requested ending.
    let unified = text.replace("\r\n", "\n").replace('\r', "\n");
    let target = ending.as_str();
    Cow::Owned(if target == "\n" {
        unified
    } else {
        unified.replace('\n', target)
    })
}
/// The line terminator conventions this module knows about.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
pub enum LineEndingStyle {
    /// A single line feed (`"\n"`); the default.
    #[default]
    Lf,
    /// Carriage return followed by line feed (`"\r\n"`).
    CrLf,
    /// A single carriage return (`"\r"`).
    Cr,
}

impl LineEndingStyle {
    /// The terminator as a string slice.
    #[must_use]
    pub const fn as_str(self) -> &'static str {
        match self {
            Self::Cr => "\r",
            Self::CrLf => "\r\n",
            Self::Lf => "\n",
        }
    }

    /// The terminator as raw bytes (the same characters as [`Self::as_str`]).
    #[must_use]
    pub const fn as_bytes(self) -> &'static [u8] {
        self.as_str().as_bytes()
    }

    /// The terminator for the compilation target: CRLF when built for
    /// Windows, LF otherwise. Decided at compile time via `cfg!(windows)`;
    /// no environment variable is read.
    #[must_use]
    pub const fn from_env() -> Self {
        if cfg!(windows) {
            Self::CrLf
        } else {
            Self::Lf
        }
    }
}
/// Reports the most frequent line ending in `text`.
///
/// Returns `None` when `text` contains no line breaks. On a tie, CRLF is
/// preferred over LF, and LF over CR.
#[must_use]
pub fn detect_line_ending(text: &str) -> Option<LineEndingStyle> {
    let (mut crlf, mut bare_lf, mut bare_cr) = (0usize, 0usize, 0usize);
    // '\r' and '\n' are single-byte in UTF-8 and never occur inside a
    // multi-byte sequence, so a byte scan sees exactly the same breaks as a
    // char scan.
    let mut prev_was_cr = false;
    for &byte in text.as_bytes() {
        match byte {
            b'\n' if prev_was_cr => crlf += 1,
            b'\n' => bare_lf += 1,
            // The previous CR was not completed by an LF.
            _ if prev_was_cr => bare_cr += 1,
            _ => {}
        }
        prev_was_cr = byte == b'\r';
    }
    // A CR in the final position has no successor and is therefore bare.
    if prev_was_cr {
        bare_cr += 1;
    }

    if crlf + bare_lf + bare_cr == 0 {
        return None;
    }
    let winner = if crlf >= bare_lf && crlf >= bare_cr {
        LineEndingStyle::CrLf
    } else if bare_lf >= bare_cr {
        LineEndingStyle::Lf
    } else {
        LineEndingStyle::Cr
    };
    Some(winner)
}
/// Guesses the text encoding from the POSIX locale environment variables.
///
/// `LC_ALL`, `LC_CTYPE` and `LANG` are consulted in that order; the first
/// one that is set to a non-empty, valid-Unicode value decides. An empty
/// value is treated like an unset variable, so it no longer hides a
/// lower-priority variable.
///
/// Classification, matched case-insensitively against the chosen value:
/// * contains `utf-8` / `utf8` → [`DetectedEncoding::Utf8`]
/// * names ISO-8859-1 exactly (not ISO-8859-10 … ISO-8859-16) →
///   [`DetectedEncoding::Latin1`]
/// * contains `1252` → [`DetectedEncoding::Windows1252`]
/// * no usable variable at all → [`DetectedEncoding::Utf8`]
/// * anything else → [`DetectedEncoding::Unknown`] carrying the raw value
#[must_use]
pub fn detect_encoding_from_env() -> DetectedEncoding {
    let locale = ["LC_ALL", "LC_CTYPE", "LANG"]
        .iter()
        .filter_map(|name| std::env::var(name).ok())
        .find(|value| !value.is_empty())
        .unwrap_or_default();
    let locale_lower = locale.to_lowercase();
    if locale_lower.contains("utf-8") || locale_lower.contains("utf8") {
        DetectedEncoding::Utf8
    } else if names_latin1(&locale_lower) {
        DetectedEncoding::Latin1
    } else if locale_lower.contains("1252") {
        DetectedEncoding::Windows1252
    } else if locale.is_empty() {
        DetectedEncoding::Utf8
    } else {
        DetectedEncoding::Unknown(locale)
    }
}

/// Returns `true` when the lowercased locale string names ISO-8859-1 itself,
/// i.e. the part number `1` is not followed by another digit.
fn names_latin1(locale_lower: &str) -> bool {
    ["iso-8859-1", "iso8859-1"].iter().any(|&name| {
        locale_lower.match_indices(name).any(|(start, found)| {
            // `found` is ASCII, so the end of the match is a char boundary.
            let tail = &locale_lower[start + found.len()..];
            !tail.starts_with(|c: char| c.is_ascii_digit())
        })
    })
}
/// A text encoding guess.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum DetectedEncoding {
    /// UTF-8.
    Utf8,
    /// ISO-8859-1.
    Latin1,
    /// Windows code page 1252.
    Windows1252,
    /// Not one of the recognised encodings; carries the unrecognised name.
    Unknown(String),
}

impl DetectedEncoding {
    /// Returns `true` only for [`DetectedEncoding::Utf8`].
    #[must_use]
    pub const fn is_utf8(&self) -> bool {
        match self {
            Self::Utf8 => true,
            Self::Latin1 | Self::Windows1252 | Self::Unknown(_) => false,
        }
    }
}
/// Removes ANSI/VT escape sequences from `text`.
///
/// Returns `Cow::Borrowed` when `text` contains no ESC character; otherwise
/// an owned copy with the following sequences removed:
///
/// * CSI — `ESC [`, then everything up to and including the first final byte
///   (`0x40..=0x7E`, i.e. `@` through `~`).
/// * OSC — `ESC ]`, then everything up to and including a BEL or the next
///   ESC (plus a directly following `\`, forming the ST terminator).
/// * Character-set designation — `ESC` + one of `(`, `)`, `*`, `+` + one
///   more character.
/// * Two-character sequences — `ESC` followed by an ASCII uppercase letter,
///   `=` or `>`.
///
/// For any other character after ESC, only the ESC itself is dropped.
#[must_use]
pub fn strip_ansi(text: &str) -> Cow<'_, str> {
    if !text.contains('\x1b') {
        return Cow::Borrowed(text);
    }
    let mut result = String::with_capacity(text.len());
    let mut chars = text.chars().peekable();
    while let Some(c) = chars.next() {
        if c != '\x1b' {
            result.push(c);
            continue;
        }
        match chars.peek().copied() {
            Some('[') => {
                chars.next();
                // Parameter and intermediate bytes are discarded; the first
                // byte in the final-byte range ends the sequence. Accepting
                // only letters here would let sequences such as `ESC[3~`
                // swallow the visible text that follows them.
                for param in chars.by_ref() {
                    if ('\x40'..='\x7e').contains(&param) {
                        break;
                    }
                }
            }
            Some(']') => {
                chars.next();
                while let Some(osc_char) = chars.next() {
                    match osc_char {
                        '\x07' => break,
                        '\x1b' => {
                            // `ESC \` is the two-character string terminator.
                            if chars.peek() == Some(&'\\') {
                                chars.next();
                            }
                            break;
                        }
                        _ => {}
                    }
                }
            }
            Some('(' | ')' | '*' | '+') => {
                // Drop the designator and the character-set id after it.
                chars.next();
                chars.next();
            }
            Some(next) if next.is_ascii_uppercase() || next == '=' || next == '>' => {
                chars.next();
            }
            // Unrecognised introducer or a trailing ESC: drop just the ESC.
            _ => {}
        }
    }
    Cow::Owned(result)
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Clean ASCII goes through lossy decoding with nothing flagged.
    #[test]
    fn decode_valid_utf8() {
        let decoded = decode_utf8_lossy(b"hello world");
        assert_eq!(decoded.replacements, 0);
        assert!(!decoded.had_errors);
        assert_eq!(decoded.text, "hello world");
    }

    /// Invalid bytes show up as U+FFFD and are reported as errors.
    #[test]
    fn decode_invalid_utf8_lossy() {
        let decoded = decode_utf8_lossy(b"hello\xff\xfeworld");
        assert!(decoded.had_errors);
        assert!(decoded.replacements > 0);
        assert!(decoded.text.contains('\u{FFFD}'));
    }

    /// Invalid bytes are rendered as `\xNN` escapes.
    #[test]
    fn decode_invalid_utf8_escape() {
        let decoded = decode_utf8_escape(b"hello\xffworld");
        assert!(decoded.had_errors);
        assert!(decoded.text.contains("\\xff"));
    }

    /// Invalid bytes are removed from the output.
    #[test]
    fn decode_invalid_utf8_skip() {
        let decoded = decode_utf8_skip(b"hello\xff\xfeworld");
        assert!(decoded.had_errors);
        assert_eq!(decoded.text, "helloworld");
    }

    /// CRLF input converted to LF.
    #[test]
    fn normalize_crlf_to_lf() {
        let normalized = normalize_line_endings("line1\r\nline2\r\nline3", LineEndingStyle::Lf);
        assert_eq!(normalized, "line1\nline2\nline3");
    }

    /// LF input converted to CRLF.
    #[test]
    fn normalize_lf_to_crlf() {
        let normalized = normalize_line_endings("line1\nline2\nline3", LineEndingStyle::CrLf);
        assert_eq!(normalized, "line1\r\nline2\r\nline3");
    }

    /// LF-only text is detected as LF.
    #[test]
    fn detect_line_ending_lf() {
        let detected = detect_line_ending("line1\nline2\n");
        assert_eq!(detected, Some(LineEndingStyle::Lf));
    }

    /// CRLF-only text is detected as CRLF.
    #[test]
    fn detect_line_ending_crlf() {
        let detected = detect_line_ending("line1\r\nline2\r\n");
        assert_eq!(detected, Some(LineEndingStyle::CrLf));
    }

    /// SGR colour sequences are removed and the text between them is kept.
    #[test]
    fn strip_ansi_csi() {
        let stripped = strip_ansi("\x1b[32mgreen\x1b[0m text");
        assert_eq!(stripped, "green text");
    }

    /// Input without ESC is handed back borrowed.
    #[test]
    fn strip_ansi_no_escape() {
        let stripped = strip_ansi("plain text");
        assert_eq!(stripped, "plain text");
        assert!(matches!(stripped, Cow::Borrowed(_)));
    }

    /// A BEL-terminated OSC (window title) sequence is removed.
    #[test]
    fn strip_ansi_osc() {
        let stripped = strip_ansi("\x1b]0;Window Title\x07normal text");
        assert_eq!(stripped, "normal text");
    }
}