use lazy_static::lazy_static;
/// Returns the smallest byte index `>= start` that lies on a UTF-8 character
/// boundary of `s`.
///
/// A `start` at or beyond the end of `s` yields `s.len()`.
pub fn next_char_boundary(s: &str, start: usize) -> usize {
    let end = s.len();
    // When `start >= end` the range is empty and we fall through to `end`.
    (start..end)
        .find(|&idx| s.is_char_boundary(idx))
        .unwrap_or(end)
}
/// Returns the largest byte index `<= start` that lies on a UTF-8 character
/// boundary of `s`.
///
/// A `start` beyond the end of `s` resolves to `s.len()`, since no index past
/// the end is a boundary.
pub fn prev_char_boundary(s: &str, start: usize) -> usize {
    // Index 0 is always a boundary, so the search cannot come up empty; the
    // fallback only exists to avoid an `unwrap`.
    (0..=start)
        .rev()
        .find(|&idx| s.is_char_boundary(idx))
        .unwrap_or(0)
}
/// Slices `s` between two byte indices, widening the range outwards so that
/// both ends fall on UTF-8 character boundaries.
///
/// The start is moved back to the previous boundary and the end is moved
/// forward to the next one. An inverted range, or an index greater than
/// `s.len()`, yields the empty string instead of panicking.
pub fn slice_at_char_boundaries(
    s: &str,
    start_byte_index: usize,
    end_byte_index: usize,
) -> &str {
    let len = s.len();
    let range_is_valid = start_byte_index <= end_byte_index
        && start_byte_index <= len
        && end_byte_index <= len;
    if !range_is_valid {
        return EMPTY_STRING;
    }

    let from = prev_char_boundary(s, start_byte_index);
    let to = next_char_boundary(s, end_byte_index);
    &s[from..to]
}
/// Returns the prefix of `s` that ends at the first character boundary at or
/// after `byte_index`.
///
/// Because the cut point is rounded *up*, the result may be up to three bytes
/// longer than `byte_index`; an index past the end returns the whole string.
pub fn slice_up_to_char_boundary(s: &str, byte_index: usize) -> &str {
    let cut = next_char_boundary(s, byte_index);
    &s[..cut]
}
/// Attempts to decode a single UTF-8 character from the start of `input`.
///
/// Prefixes of 1, 2, 3 and 4 bytes are tried in that order; the first one
/// that is valid UTF-8 wins. Returns the decoded character together with the
/// number of bytes it occupies, or `None` when `input` is empty or does not
/// start with a complete, valid UTF-8 sequence.
pub fn try_parse_utf8_char(input: &[u8]) -> Option<(char, usize)> {
    (1..=4usize).find_map(|width| {
        // `get` returns `None` when fewer than `width` bytes are available.
        let prefix = input.get(..width)?;
        let decoded = std::str::from_utf8(prefix).ok()?;
        decoded.chars().next().map(|chr| (chr, width))
    })
}
// Replacement glyph pushed by `replace_non_printable` for every character
// (or undecodable byte) it refuses to emit verbatim: U+2400, the Unicode
// "SYMBOL FOR NULL".
// NOTE(review): the value is known at compile time, so a plain `const` would
// do; it is left lazy here because users dereference it as `*NULL_SYMBOL`.
lazy_static! {
static ref NULL_SYMBOL: char = char::from_u32(0x2400).unwrap();
}
/// The empty string slice; returned by `slice_at_char_boundaries` when the
/// requested range is invalid.
pub const EMPTY_STRING: &str = "";
/// Default number of spaces a tab expands to (see the `Default` impl of
/// `ReplaceNonPrintableConfig`).
pub const TAB_WIDTH: usize = 4;
// Individual characters that `replace_non_printable` matches on.
const TAB_CHARACTER: char = '\t';
const LINE_FEED_CHARACTER: char = '\x0A';
// U+007F; lower bound of the second control-character range (U+007F..=U+009F).
const DELETE_CHARACTER: char = '\x7F';
// U+FEFF, the byte-order mark.
const BOM_CHARACTER: char = '\u{FEFF}';
// U+0000; lower bound of the first control-character range (U+0000..=U+001F).
const NULL_CHARACTER: char = '\x00';
// U+001F; upper bound of the first control-character range.
const UNIT_SEPARATOR_CHARACTER: char = '\u{001F}';
// U+009F; upper bound of the second control-character range.
const APPLICATION_PROGRAM_COMMAND_CHARACTER: char = '\u{009F}';
// Code-point ranges that `replace_non_printable` passes through unchanged
// even though they lie above its U+0700 cut-off.
// NOTE(review): judging by the names these are presumably Nerd Fonts icon
// sets (Devicons, Seti, Font Awesome, ...) - confirm against the font's
// published glyph ranges before editing any bound.
const NF_RANGE_DEVICONS: std::ops::RangeInclusive<char> =
'\u{e700}'..='\u{e8ef}';
const NF_RANGE_SETI: std::ops::RangeInclusive<char> = '\u{e5fa}'..='\u{e6b7}';
const NF_RANGE_FONT_AWESOME: std::ops::RangeInclusive<char> =
'\u{ed00}'..='\u{f2ff}';
const NF_RANGE_FONT_AWESOME_EXT: std::ops::RangeInclusive<char> =
'\u{e200}'..='\u{e2a9}';
const NF_RANGE_MATERIAL: std::ops::RangeInclusive<char> =
'\u{f0001}'..='\u{f1af0}';
const NF_RANGE_WEATHER: std::ops::RangeInclusive<char> =
'\u{e300}'..='\u{e3e3}';
const NF_RANGE_OCTICONS_1: std::ops::RangeInclusive<char> =
'\u{f400}'..='\u{f533}';
const NF_RANGE_OCTICONS_2: std::ops::RangeInclusive<char> =
'\u{2665}'..='\u{26a1}';
const NF_RANGE_POWERLINE_1: std::ops::RangeInclusive<char> =
'\u{e0a0}'..='\u{e0a2}';
const NF_RANGE_POWERLINE_2: std::ops::RangeInclusive<char> =
'\u{e0b0}'..='\u{e0b3}';
// Every range above, gathered so membership can be tested with one `any`.
const ALL_NF_RANGES: [&std::ops::RangeInclusive<char>; 10] = [
&NF_RANGE_DEVICONS,
&NF_RANGE_SETI,
&NF_RANGE_FONT_AWESOME,
&NF_RANGE_FONT_AWESOME_EXT,
&NF_RANGE_MATERIAL,
&NF_RANGE_WEATHER,
&NF_RANGE_OCTICONS_1,
&NF_RANGE_OCTICONS_2,
&NF_RANGE_POWERLINE_1,
&NF_RANGE_POWERLINE_2,
];
/// Switches controlling which substitutions `replace_non_printable` performs.
pub struct ReplaceNonPrintableConfig {
/// When `true`, each tab is replaced by `tab_width` spaces.
pub replace_tab: bool,
/// Number of spaces emitted per tab; must fit in an `i16`, otherwise
/// `replace_non_printable` panics when it meets a tab.
pub tab_width: usize,
/// When `true`, line feeds (`\n`) are dropped from the output.
pub replace_line_feed: bool,
/// When `true`, characters in U+0000..=U+001F and U+007F..=U+009F, plus the
/// byte-order mark U+FEFF, are replaced by the null symbol. Tabs and line
/// feeds only reach this rule when their own switch is off.
pub replace_control_characters: bool,
}
impl ReplaceNonPrintableConfig {
/// Sets the number of spaces a tab expands to and returns `self` so that
/// further setters can be chained on the same value.
pub fn tab_width(&mut self, tab_width: usize) -> &mut Self {
self.tab_width = tab_width;
self
}
}
impl Default for ReplaceNonPrintableConfig {
/// Enables every replacement and uses `TAB_WIDTH` spaces per tab.
fn default() -> Self {
Self {
replace_tab: true,
tab_width: TAB_WIDTH,
replace_line_feed: true,
replace_control_characters: true,
}
}
}
/// Converts raw bytes into a string that is safe to display, and records how
/// far each character moved as a result.
///
/// The input is decoded one UTF-8 character at a time. Depending on `config`,
/// tabs become `tab_width` spaces, line feeds are dropped and control
/// characters are replaced by the null symbol. CJK unified ideographs
/// (U+4E00..=U+9FFF) and the code points listed in `ALL_NF_RANGES` are kept;
/// any other character above U+0700 is replaced by the null symbol. A byte
/// that does not start a valid UTF-8 sequence is replaced by the null symbol
/// and skipped on its own.
///
/// Returns `(printable, offsets)`. `offsets` holds one entry per decoded
/// character (or undecodable byte): the cumulative shift, in characters, that
/// was in effect *before* that character was processed.
///
/// # Panics
///
/// Panics if a tab is replaced while `config.tab_width` does not fit in an
/// `i16`.
#[allow(clippy::missing_panics_doc)]
pub fn replace_non_printable(
    input: &[u8],
    config: &ReplaceNonPrintableConfig,
) -> (String, Vec<i16>) {
    let mut printable = String::with_capacity(input.len());
    let mut offsets = Vec::new();
    let mut shift: i16 = 0;
    let mut remaining = input;

    while !remaining.is_empty() {
        // The offset of a character is the shift accumulated so far.
        offsets.push(shift);

        let (chr, consumed) = match try_parse_utf8_char(remaining) {
            Some(decoded) => decoded,
            None => {
                // Not valid UTF-8 here: mark the byte and resynchronise on
                // the next one.
                printable.push(*NULL_SYMBOL);
                remaining = &remaining[1..];
                continue;
            }
        };
        remaining = &remaining[consumed..];

        // Arm order matters: the guarded tab / line-feed arms must come
        // before the control-character range that also contains them.
        match chr {
            // One character in, `tab_width` characters out.
            TAB_CHARACTER if config.replace_tab => {
                printable
                    .extend(std::iter::repeat(' ').take(config.tab_width));
                shift += i16::try_from(config.tab_width).unwrap() - 1;
            }
            // One character in, nothing out.
            LINE_FEED_CHARACTER if config.replace_line_feed => {
                shift -= 1;
            }
            NULL_CHARACTER..=UNIT_SEPARATOR_CHARACTER
            | DELETE_CHARACTER..=APPLICATION_PROGRAM_COMMAND_CHARACTER
            | BOM_CHARACTER
                if config.replace_control_characters =>
            {
                printable.push(*NULL_SYMBOL);
            }
            // CJK unified ideographs are kept.
            kept if ('\u{4E00}'..='\u{9FFF}').contains(&kept) => {
                printable.push(kept);
            }
            // So are the explicitly allow-listed ranges.
            kept if ALL_NF_RANGES.iter().any(|range| range.contains(&kept)) => {
                printable.push(kept);
            }
            // Everything else above U+0700 is masked.
            masked if masked > '\u{0700}' => {
                printable.push(*NULL_SYMBOL);
            }
            other => printable.push(other),
        }
    }

    (printable, offsets)
}
/// Threshold intended for comparison with the result of
/// `proportion_of_printable_ascii_characters`.
// NOTE(review): not referenced in this file; presumably callers treat a
// buffer scoring above it as text - confirm at the call sites.
pub const PRINTABLE_ASCII_THRESHOLD: f32 = 0.7;

/// Returns the fraction of bytes in `buffer` that are printable ASCII, i.e.
/// in the range `0x20..=0x7E` (space through `~`).
///
/// The result is `NaN` for an empty buffer (`0.0 / 0.0`), so every ordered
/// comparison against a threshold is `false` in that case.
pub fn proportion_of_printable_ascii_characters(buffer: &[u8]) -> f32 {
    let printable = buffer
        .iter()
        .filter(|&&byte| (b' '..=b'~').contains(&byte))
        .count();
    printable as f32 / buffer.len() as f32
}
/// Lines longer than this many bytes are truncated before being sanitised.
const MAX_LINE_LENGTH: usize = 300;

/// Truncates `line` to roughly `MAX_LINE_LENGTH` bytes and makes it printable
/// using the default `ReplaceNonPrintableConfig`.
///
/// The cut is rounded up to the next character boundary, so the truncated
/// text may be a few bytes longer than the limit. Returns the printable
/// string and the per-character offsets produced by `replace_non_printable`.
pub fn preprocess_line(line: &str) -> (String, Vec<i16>) {
    let visible = if line.len() <= MAX_LINE_LENGTH {
        line
    } else {
        slice_up_to_char_boundary(line, MAX_LINE_LENGTH)
    };
    let config = ReplaceNonPrintableConfig::default();
    replace_non_printable(visible.as_bytes(), &config)
}
/// Makes `matched_string` printable and shifts the given match ranges so
/// that they still point at the same characters in the printable text.
///
/// Each `(start, end)` pair is moved by the offset recorded for its index by
/// `preprocess_line`; the lookup for `end` is clamped to the last recorded
/// offset. Ranges are consumed in order and processing stops at the first one
/// whose `start` has no recorded offset (for example because it lies beyond
/// the truncated part of the line). With `match_ranges == None` the returned
/// vector is empty.
pub fn make_matched_string_printable(
    matched_string: &str,
    match_ranges: Option<&[(u32, u32)]>,
) -> (String, Vec<(u32, u32)>) {
    let (printable, offsets) = preprocess_line(matched_string);

    let ranges = match match_ranges {
        Some(ranges) => ranges,
        None => return (printable, Vec::new()),
    };

    let offset_count = u32::try_from(offsets.len()).unwrap();
    // Applies the offset stored in `slot` to `index`, checking that the
    // result is still a valid `u32`.
    let shifted = |index: u32, slot: usize| -> u32 {
        let moved = i64::from(index) + i64::from(offsets[slot]);
        u32::try_from(moved).unwrap()
    };

    let match_indices = ranges
        .iter()
        .take_while(|(start, _)| *start < offset_count)
        .map(|&(start, end)| {
            // `take_while` guarantees at least one offset exists here.
            let last_slot = offsets.len() - 1;
            (
                shifted(start, start as usize),
                shifted(end, (end as usize).min(last_slot)),
            )
        })
        .collect();

    (printable, match_indices)
}
/// Shortens `s` by cutting out its middle and inserting an ellipsis (U+2026)
/// when it is longer than `max_length` bytes.
///
/// Strings that already fit are returned unchanged. Otherwise the result is
/// the first and the last `max_length / 2 - 2` bytes of `s` (saturating at
/// zero) joined by the ellipsis. Both cut points are widened to the nearest
/// character boundary, so the result is always valid UTF-8 but may keep a few
/// more bytes than requested when a cut lands inside a multi-byte character.
pub fn shrink_with_ellipsis(s: &str, max_length: usize) -> String {
    if s.len() <= max_length {
        return s.to_string();
    }
    // `half_max_length <= max_length / 2 < s.len()`, so the subtraction
    // below cannot underflow.
    let half_max_length = (max_length / 2).saturating_sub(2);
    let first_half = slice_up_to_char_boundary(s, half_max_length);
    let second_half =
        slice_at_char_boundaries(s, s.len() - half_max_length, s.len());
    // The ellipsis is written as an escape so the separator survives any
    // re-encoding of this source file.
    format!("{first_half}\u{2026}{second_half}")
}
#[cfg(test)]
mod tests {
    use super::*;

    // Every non-ASCII or whitespace-sensitive literal in this module is
    // spelled with escapes (`\u{..}`, `\t`, `\x7F`) so that the expectations
    // stay exact even if the file is re-encoded or reformatted.
    //
    // "\u{1F44B}\u{1F30D}!" is two 4-byte characters followed by one ASCII
    // byte: character boundaries at 0, 4, 8 and 9.
    // "\u{2400}" is the null symbol emitted for replaced characters.

    /// Asserts that `next_char_boundary(input, start)` equals `expected`.
    fn test_next_char_boundary(input: &str, start: usize, expected: usize) {
        let actual = next_char_boundary(input, start);
        assert_eq!(actual, expected);
    }

    #[test]
    fn test_next_char_boundary_ascii() {
        test_next_char_boundary("Hello, World!", 0, 0);
        test_next_char_boundary("Hello, World!", 1, 1);
        test_next_char_boundary("Hello, World!", 13, 13);
        test_next_char_boundary("Hello, World!", 30, 13);
    }

    #[test]
    fn test_next_char_boundary_emoji() {
        test_next_char_boundary("\u{1F44B}\u{1F30D}!", 0, 0);
        test_next_char_boundary("\u{1F44B}\u{1F30D}!", 1, 4);
        test_next_char_boundary("\u{1F44B}\u{1F30D}!", 4, 4);
        test_next_char_boundary("\u{1F44B}\u{1F30D}!", 8, 8);
        test_next_char_boundary("\u{1F44B}\u{1F30D}!", 7, 8);
    }

    /// Asserts that `prev_char_boundary(input, start)` equals `expected`.
    fn test_previous_char_boundary(
        input: &str,
        start: usize,
        expected: usize,
    ) {
        let actual = prev_char_boundary(input, start);
        assert_eq!(actual, expected);
    }

    #[test]
    fn test_previous_char_boundary_ascii() {
        test_previous_char_boundary("Hello, World!", 0, 0);
        test_previous_char_boundary("Hello, World!", 1, 1);
        test_previous_char_boundary("Hello, World!", 5, 5);
    }

    #[test]
    fn test_previous_char_boundary_emoji() {
        test_previous_char_boundary("\u{1F44B}\u{1F30D}!", 0, 0);
        test_previous_char_boundary("\u{1F44B}\u{1F30D}!", 4, 4);
        test_previous_char_boundary("\u{1F44B}\u{1F30D}!", 6, 4);
        test_previous_char_boundary("\u{1F44B}\u{1F30D}!", 8, 8);
    }

    /// Asserts that slicing `input` between `start` and `end` yields
    /// `expected`.
    fn test_slice_at_char_boundaries(
        input: &str,
        start: usize,
        end: usize,
        expected: &str,
    ) {
        let actual = slice_at_char_boundaries(input, start, end);
        assert_eq!(actual, expected);
    }

    #[test]
    fn test_slice_at_char_boundaries_ascii() {
        test_slice_at_char_boundaries("Hello, World!", 0, 0, "");
        test_slice_at_char_boundaries("Hello, World!", 0, 1, "H");
        test_slice_at_char_boundaries("Hello, World!", 0, 13, "Hello, World!");
        // An end index past the string is rejected rather than clamped.
        test_slice_at_char_boundaries("Hello, World!", 0, 30, "");
    }

    #[test]
    fn test_slice_at_char_boundaries_emoji() {
        test_slice_at_char_boundaries("\u{1F44B}\u{1F30D}!", 0, 0, "");
        test_slice_at_char_boundaries(
            "\u{1F44B}\u{1F30D}!",
            0,
            4,
            "\u{1F44B}",
        );
        test_slice_at_char_boundaries(
            "\u{1F44B}\u{1F30D}!",
            0,
            8,
            "\u{1F44B}\u{1F30D}",
        );
        // An end inside the second character is rounded up to its end.
        test_slice_at_char_boundaries(
            "\u{1F44B}\u{1F30D}!",
            0,
            7,
            "\u{1F44B}\u{1F30D}",
        );
        test_slice_at_char_boundaries(
            "\u{1F44B}\u{1F30D}!",
            0,
            9,
            "\u{1F44B}\u{1F30D}!",
        );
    }

    /// Runs `replace_non_printable` with the default configuration but a tab
    /// width of 2, and compares the printable output with `expected`.
    fn test_replace_non_printable(input: &str, expected: &str) {
        let (actual, _offset) = replace_non_printable(
            input.as_bytes(),
            &ReplaceNonPrintableConfig::default().tab_width(2),
        );
        assert_eq!(actual, expected);
    }

    #[test]
    fn test_replace_non_printable_ascii() {
        test_replace_non_printable("Hello, World!", "Hello, World!");
    }

    #[test]
    fn test_replace_non_printable_tab() {
        // Tab width is 2 in this helper: every tab becomes two spaces.
        test_replace_non_printable("Hello\tWorld!", "Hello\u{20}\u{20}World!");
        // Leading tab expanded, trailing line feed dropped.
        test_replace_non_printable("\t-- AND\n", "\u{20}\u{20}-- AND");
    }

    #[test]
    fn test_replace_non_printable_line_feed() {
        test_replace_non_printable("Hello\nWorld!", "HelloWorld!");
    }

    #[test]
    fn test_replace_non_printable_null() {
        test_replace_non_printable("Hello\x00World!", "Hello\u{2400}World!");
        test_replace_non_printable("Hello World!\0", "Hello World!\u{2400}");
    }

    #[test]
    fn test_replace_non_printable_delete() {
        test_replace_non_printable("Hello\x7FWorld!", "Hello\u{2400}World!");
    }

    #[test]
    fn test_replace_non_printable_bom() {
        test_replace_non_printable("Hello\u{FEFF}World!", "Hello\u{2400}World!");
    }

    #[test]
    fn test_replace_non_printable_start_txt() {
        // Latin-1 letters (below U+0700) are kept; a trailing control
        // character is replaced by the null symbol.
        // NOTE(review): the control character is written as DEL here; if the
        // fixture this test was derived from ended in a different control
        // byte, substitute that escape.
        test_replace_non_printable(
            "\u{C0}\u{EC}\x7F",
            "\u{C0}\u{EC}\u{2400}",
        );
    }

    #[test]
    fn test_replace_non_printable_range_tab() {
        let input = b"Hello,\tWorld!";
        let (output, offsets) = replace_non_printable(
            input,
            &ReplaceNonPrintableConfig::default(),
        );
        // Default tab width is 4: one tab becomes four spaces, so every
        // character after it is shifted by 3.
        assert_eq!(output, "Hello,\u{20}\u{20}\u{20}\u{20}World!");
        assert_eq!(offsets, vec![0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 3, 3]);
    }

    #[test]
    fn test_replace_non_printable_range_line_feed() {
        let input = b"Hello,\nWorld!";
        let (output, offsets) = replace_non_printable(
            input,
            &ReplaceNonPrintableConfig::default().tab_width(2),
        );
        // The line feed is dropped, shifting everything after it by -1.
        assert_eq!(output, "Hello,World!");
        assert_eq!(offsets, vec![0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, -1, -1]);
    }

    #[test]
    fn test_replace_non_printable_no_range_changes() {
        // One-for-one replacements do not move anything.
        let input = b"Hello,\x00World!";
        let (output, offsets) = replace_non_printable(
            input,
            &ReplaceNonPrintableConfig::default().tab_width(2),
        );
        assert_eq!(output, "Hello,\u{2400}World!");
        assert_eq!(offsets, vec![0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]);

        let input = b"Hello,\x7FWorld!";
        let (output, offsets) = replace_non_printable(
            input,
            &ReplaceNonPrintableConfig::default().tab_width(2),
        );
        assert_eq!(output, "Hello,\u{2400}World!");
        assert_eq!(offsets, vec![0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]);
    }

    /// Asserts the printable-ASCII proportion of `input`'s bytes.
    fn test_proportion_of_printable_ascii_characters(
        input: &str,
        expected: f32,
    ) {
        let actual =
            proportion_of_printable_ascii_characters(input.as_bytes());
        assert_eq!(actual, expected);
    }

    #[test]
    fn test_proportion_of_printable_ascii_characters_ascii() {
        test_proportion_of_printable_ascii_characters("Hello, World!", 1.0);
        // 13 printable bytes out of 14.
        test_proportion_of_printable_ascii_characters(
            "Hello, World!\x00",
            0.9285714,
        );
        test_proportion_of_printable_ascii_characters(
            "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0A\x0B\x0C\x0D\x0E\x0F",
            0.0,
        );
    }

    /// Asserts the printable output of `preprocess_line(input)`.
    fn test_preprocess_line(input: &str, expected: &str) {
        let (actual, _offset) = preprocess_line(input);
        assert_eq!(actual, expected, "input: {:?}", input);
    }

    #[test]
    fn test_preprocess_line_cases() {
        test_preprocess_line("Hello, World!", "Hello, World!");
        test_preprocess_line("Hello, World!\n", "Hello, World!");
        test_preprocess_line("Hello, World!\x00", "Hello, World!\u{2400}");
        test_preprocess_line("Hello, World!\x7F", "Hello, World!\u{2400}");
        test_preprocess_line("Hello, World!\u{FEFF}", "Hello, World!\u{2400}");
        // Lines are truncated to 300 bytes before processing.
        test_preprocess_line(&"a".repeat(400), &"a".repeat(300));
    }
}