use crate::unicode_table;
use nom::{
branch::alt,
bytes::complete::{tag, take, take_till, take_until},
character::complete::one_of,
combinator::{map, map_opt, peek, recognize, value},
multi::{many0, many_m_n},
sequence::{delimited, preceded, separated_pair, terminated},
IResult, Parser,
};
use std::{
char,
collections::{HashMap, HashSet},
error::Error,
fmt::{Debug, Display},
io::{BufRead, Write},
ops::Not,
};
use uucore::error::{UError, UResult, USimpleError};
use uucore::show_warning;
#[derive(Debug, Clone)]
pub enum BadSequence {
MissingCharClassName,
MissingEquivalentClassChar,
MultipleCharRepeatInSet2,
CharRepeatInSet1,
InvalidRepeatCount(String),
EmptySet2WhenNotTruncatingSet1,
ClassExceptLowerUpperInSet2,
ClassInSet2NotMatchedBySet1,
Set1LongerSet2EndsInClass,
ComplementMoreThanOneUniqueInSet2,
BackwardsRange { end: u32, start: u32 },
MultipleCharInEquivalence(String),
}
impl Display for BadSequence {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
match self {
Self::MissingCharClassName => write!(f, "missing character class name '[::]'"),
Self::MissingEquivalentClassChar => {
write!(f, "missing equivalence class character '[==]'")
}
Self::MultipleCharRepeatInSet2 => {
write!(f, "only one [c*] repeat construct may appear in string2")
}
Self::CharRepeatInSet1 => {
write!(f, "the [c*] repeat construct may not appear in string1")
}
Self::InvalidRepeatCount(count) => {
write!(f, "invalid repeat count '{count}' in [c*n] construct")
}
Self::EmptySet2WhenNotTruncatingSet1 => {
write!(f, "when not truncating set1, string2 must be non-empty")
}
Self::ClassExceptLowerUpperInSet2 => {
write!(f, "when translating, the only character classes that may appear in set2 are 'upper' and 'lower'")
}
Self::ClassInSet2NotMatchedBySet1 => {
write!(f, "when translating, every 'upper'/'lower' in set2 must be matched by a 'upper'/'lower' in the same position in set1")
}
Self::Set1LongerSet2EndsInClass => {
write!(f, "when translating with string1 longer than string2,\nthe latter string must not end with a character class")
}
Self::ComplementMoreThanOneUniqueInSet2 => {
write!(f, "when translating with complemented character classes,\nstring2 must map all characters in the domain to one")
}
Self::BackwardsRange { end, start } => {
fn end_or_start_to_string(ut: &u32) -> String {
match char::from_u32(*ut) {
Some(ch @ '\x20'..='\x7E') => ch.escape_default().to_string(),
_ => {
format!("\\{ut:03o}")
}
}
}
write!(
f,
"range-endpoints of '{}-{}' are in reverse collating sequence order",
end_or_start_to_string(start),
end_or_start_to_string(end)
)
}
Self::MultipleCharInEquivalence(s) => write!(
f,
"{s}: equivalence class operand must be a single character"
),
}
}
}
impl Error for BadSequence {}
impl UError for BadSequence {}
#[derive(Debug, Clone, Copy)]
pub enum Class {
Alnum,
Alpha,
Blank,
Control,
Digit,
Graph,
Lower,
Print,
Punct,
Space,
Upper,
Xdigit,
}
#[derive(Debug, Clone, Copy)]
pub enum Sequence {
Char(u8),
CharRange(u8, u8),
CharStar(u8),
CharRepeat(u8, usize),
Class(Class),
}
impl Sequence {
pub fn flatten(&self) -> Box<dyn Iterator<Item = u8>> {
match self {
Self::Char(c) => Box::new(std::iter::once(*c)),
Self::CharRange(l, r) => Box::new(*l..=*r),
Self::CharStar(c) => Box::new(std::iter::repeat(*c)),
Self::CharRepeat(c, n) => Box::new(std::iter::repeat_n(*c, *n)),
Self::Class(class) => match class {
Class::Alnum => Box::new((b'0'..=b'9').chain(b'A'..=b'Z').chain(b'a'..=b'z')),
Class::Alpha => Box::new((b'A'..=b'Z').chain(b'a'..=b'z')),
Class::Blank => Box::new(unicode_table::BLANK.iter().copied()),
Class::Control => Box::new((0..=31).chain(std::iter::once(127))),
Class::Digit => Box::new(b'0'..=b'9'),
Class::Graph => Box::new(
(48..=57) .chain(65..=90) .chain(97..=122) .chain(33..=47)
.chain(58..=64)
.chain(91..=96)
.chain(123..=126)
.chain(std::iter::once(32)), ),
Class::Print => Box::new(
(48..=57) .chain(65..=90) .chain(97..=122) .chain(33..=47)
.chain(58..=64)
.chain(91..=96)
.chain(123..=126),
),
Class::Punct => Box::new((33..=47).chain(58..=64).chain(91..=96).chain(123..=126)),
Class::Space => Box::new(unicode_table::SPACES.iter().copied()),
Class::Xdigit => Box::new((b'0'..=b'9').chain(b'A'..=b'F').chain(b'a'..=b'f')),
Class::Lower => Box::new(b'a'..=b'z'),
Class::Upper => Box::new(b'A'..=b'Z'),
},
}
}
pub fn solve_set_characters(
set1_str: &[u8],
set2_str: &[u8],
complement_flag: bool,
truncate_set1_flag: bool,
translating: bool,
) -> Result<(Vec<u8>, Vec<u8>), BadSequence> {
let is_char_star = |s: &&Self| -> bool { matches!(s, Self::CharStar(_)) };
let set1 = Self::from_str(set1_str)?;
if set1.iter().filter(is_char_star).count() != 0 {
return Err(BadSequence::CharRepeatInSet1);
}
let mut set2 = Self::from_str(set2_str)?;
if set2.iter().filter(is_char_star).count() > 1 {
return Err(BadSequence::MultipleCharRepeatInSet2);
}
if translating
&& set2.iter().any(|&x| {
matches!(x, Self::Class(_))
&& !matches!(x, Self::Class(Class::Upper) | Self::Class(Class::Lower))
})
{
return Err(BadSequence::ClassExceptLowerUpperInSet2);
}
let mut set1_solved: Vec<u8> = set1.iter().flat_map(Self::flatten).collect();
if complement_flag {
set1_solved = (0..=u8::MAX).filter(|x| !set1_solved.contains(x)).collect();
}
let set1_len = set1_solved.len();
let set2_len = set2
.iter()
.filter_map(|s| match s {
Self::CharStar(_) => None,
r => Some(r),
})
.flat_map(Self::flatten)
.count();
let star_compensate_len = set1_len.saturating_sub(set2_len);
set2 = set2
.iter()
.filter_map(|s| match s {
Self::CharStar(0) => None,
Self::CharStar(c) => Some(Self::CharRepeat(*c, star_compensate_len)),
r => Some(*r),
})
.collect();
for (set2_pos, set2_item) in set2.iter().enumerate() {
if matches!(set2_item, Self::Class(_)) {
let mut set2_part_solved_len = 0;
if set2_pos >= 1 {
set2_part_solved_len =
set2.iter().take(set2_pos).flat_map(Self::flatten).count();
}
let mut class_matches = false;
for (set1_pos, set1_item) in set1.iter().enumerate() {
if matches!(set1_item, Self::Class(_)) {
let mut set1_part_solved_len = 0;
if set1_pos >= 1 {
set1_part_solved_len =
set1.iter().take(set1_pos).flat_map(Self::flatten).count();
}
if set1_part_solved_len == set2_part_solved_len {
class_matches = true;
break;
}
}
}
if !class_matches {
return Err(BadSequence::ClassInSet2NotMatchedBySet1);
}
}
}
let set2_solved: Vec<_> = set2.iter().flat_map(Self::flatten).collect();
let mut set2_uniques = set2_solved.clone();
set2_uniques.sort();
set2_uniques.dedup();
if set1.iter().any(|x| matches!(x, Self::Class(_)))
&& translating
&& complement_flag
&& (set2_uniques.len() > 1 || set2_solved.len() > set1_len)
{
return Err(BadSequence::ComplementMoreThanOneUniqueInSet2);
}
if set2_solved.len() < set1_solved.len()
&& !truncate_set1_flag
&& matches!(
set2.last().copied(),
Some(Self::Class(Class::Upper)) | Some(Self::Class(Class::Lower))
)
{
return Err(BadSequence::Set1LongerSet2EndsInClass);
}
if truncate_set1_flag {
set1_solved.truncate(set2_solved.len());
}
Ok((set1_solved, set2_solved))
}
}
impl Sequence {
pub fn from_str(input: &[u8]) -> Result<Vec<Self>, BadSequence> {
many0(alt((
Self::parse_char_range,
Self::parse_char_star,
Self::parse_char_repeat,
Self::parse_class,
Self::parse_char_equal,
map(Self::parse_backslash_or_char_with_warning, |s| {
Ok(Self::Char(s))
}),
)))
.parse(input)
.map(|(_, r)| r)
.unwrap()
.into_iter()
.collect::<Result<Vec<_>, _>>()
}
fn parse_octal(input: &[u8]) -> IResult<&[u8], u8> {
preceded(tag("\\"), Self::parse_octal_up_to_three_digits).parse(input)
}
fn parse_octal_with_warning(input: &[u8]) -> IResult<&[u8], u8> {
preceded(
tag("\\"),
alt((
Self::parse_octal_up_to_three_digits_with_warning,
Self::parse_octal_two_digits,
)),
)
.parse(input)
}
fn parse_octal_up_to_three_digits(input: &[u8]) -> IResult<&[u8], u8> {
map_opt(
recognize(many_m_n(1, 3, one_of("01234567"))),
|out: &[u8]| {
let str_to_parse = std::str::from_utf8(out).unwrap();
u8::from_str_radix(str_to_parse, 8).ok()
},
)
.parse(input)
}
fn parse_octal_up_to_three_digits_with_warning(input: &[u8]) -> IResult<&[u8], u8> {
map_opt(
recognize(many_m_n(1, 3, one_of("01234567"))),
|out: &[u8]| {
let str_to_parse = std::str::from_utf8(out).unwrap();
let result = u8::from_str_radix(str_to_parse, 8).ok();
if result.is_none() {
let origin_octal: &str = std::str::from_utf8(input).unwrap();
let actual_octal_tail: &str = std::str::from_utf8(&input[0..2]).unwrap();
let outstand_char: char = char::from_u32(input[2] as u32).unwrap();
show_warning!(
"the ambiguous octal escape \\{} is being\n interpreted as the 2-byte sequence \\0{}, {}",
origin_octal,
actual_octal_tail,
outstand_char
);
}
result
},
).parse(input)
}
fn parse_octal_two_digits(input: &[u8]) -> IResult<&[u8], u8> {
map_opt(
recognize(many_m_n(2, 2, one_of("01234567"))),
|out: &[u8]| u8::from_str_radix(std::str::from_utf8(out).unwrap(), 8).ok(),
)
.parse(input)
}
fn parse_backslash(input: &[u8]) -> IResult<&[u8], u8> {
preceded(tag("\\"), Self::single_char)
.parse(input)
.map(|(l, a)| {
let c = match a {
b'a' => unicode_table::BEL,
b'b' => unicode_table::BS,
b'f' => unicode_table::FF,
b'n' => unicode_table::LF,
b'r' => unicode_table::CR,
b't' => unicode_table::HT,
b'v' => unicode_table::VT,
x => x,
};
(l, c)
})
}
fn parse_backslash_or_char(input: &[u8]) -> IResult<&[u8], u8> {
alt((Self::parse_octal, Self::parse_backslash, Self::single_char)).parse(input)
}
fn parse_backslash_or_char_with_warning(input: &[u8]) -> IResult<&[u8], u8> {
alt((
Self::parse_octal_with_warning,
Self::parse_backslash,
Self::single_char,
))
.parse(input)
}
fn single_char(input: &[u8]) -> IResult<&[u8], u8> {
take(1usize)(input).map(|(l, a)| (l, a[0]))
}
fn parse_char_range(input: &[u8]) -> IResult<&[u8], Result<Self, BadSequence>> {
separated_pair(
Self::parse_backslash_or_char,
tag("-"),
Self::parse_backslash_or_char,
)
.parse(input)
.map(|(l, (a, b))| {
(l, {
let (start, end) = (u32::from(a), u32::from(b));
let range = start..=end;
if range.is_empty() {
Err(BadSequence::BackwardsRange { end, start })
} else {
Ok(Self::CharRange(start as u8, end as u8))
}
})
})
}
fn parse_char_star(input: &[u8]) -> IResult<&[u8], Result<Self, BadSequence>> {
delimited(tag("["), Self::parse_backslash_or_char, tag("*]"))
.parse(input)
.map(|(l, a)| (l, Ok(Self::CharStar(a))))
}
fn parse_char_repeat(input: &[u8]) -> IResult<&[u8], Result<Self, BadSequence>> {
delimited(
tag("["),
separated_pair(
Self::parse_backslash_or_char,
tag("*"),
take_till(|ue| matches!(ue, b']' | b'\\')),
),
tag("]"),
)
.parse(input)
.map(|(l, (c, cnt_str))| {
let s = String::from_utf8_lossy(cnt_str);
let result = if cnt_str.starts_with(b"0") {
match usize::from_str_radix(&s, 8) {
Ok(0) => Ok(Self::CharStar(c)),
Ok(count) => Ok(Self::CharRepeat(c, count)),
Err(_) => Err(BadSequence::InvalidRepeatCount(s.to_string())),
}
} else {
match s.parse::<usize>() {
Ok(0) => Ok(Self::CharStar(c)),
Ok(count) => Ok(Self::CharRepeat(c, count)),
Err(_) => Err(BadSequence::InvalidRepeatCount(s.to_string())),
}
};
(l, result)
})
}
fn parse_class(input: &[u8]) -> IResult<&[u8], Result<Self, BadSequence>> {
delimited(
tag("[:"),
alt((
map(
alt((
value(Self::Class(Class::Alnum), tag("alnum")),
value(Self::Class(Class::Alpha), tag("alpha")),
value(Self::Class(Class::Blank), tag("blank")),
value(Self::Class(Class::Control), tag("cntrl")),
value(Self::Class(Class::Digit), tag("digit")),
value(Self::Class(Class::Graph), tag("graph")),
value(Self::Class(Class::Lower), tag("lower")),
value(Self::Class(Class::Print), tag("print")),
value(Self::Class(Class::Punct), tag("punct")),
value(Self::Class(Class::Space), tag("space")),
value(Self::Class(Class::Upper), tag("upper")),
value(Self::Class(Class::Xdigit), tag("xdigit")),
)),
Ok,
),
value(Err(BadSequence::MissingCharClassName), tag("")),
)),
tag(":]"),
)
.parse(input)
}
fn parse_char_equal(input: &[u8]) -> IResult<&[u8], Result<Self, BadSequence>> {
preceded(
tag("[="),
(
alt((
value(Err(()), peek(tag("=]"))),
map(Self::parse_backslash_or_char, Ok),
)),
map(terminated(take_until("=]"), tag("=]")), |v: &[u8]| {
if v.is_empty() {
Ok(())
} else {
Err(v)
}
}),
),
)
.parse(input)
.map(|(l, (a, b))| {
(
l,
match (a, b) {
(Err(()), _) => Err(BadSequence::MissingEquivalentClassChar),
(Ok(c), Ok(())) => Ok(Self::Char(c)),
(Ok(c), Err(v)) => Err(BadSequence::MultipleCharInEquivalence(format!(
"{}{}",
String::from_utf8_lossy(&[c]).into_owned(),
String::from_utf8_lossy(v).into_owned()
))),
},
)
})
}
}
pub trait SymbolTranslator {
fn translate(&mut self, current: u8) -> Option<u8>;
fn chain<T>(self, other: T) -> ChainedSymbolTranslator<Self, T>
where
Self: Sized,
{
ChainedSymbolTranslator::<Self, T> {
stage_a: self,
stage_b: other,
}
}
}
pub struct ChainedSymbolTranslator<A, B> {
stage_a: A,
stage_b: B,
}
impl<A: SymbolTranslator, B: SymbolTranslator> SymbolTranslator for ChainedSymbolTranslator<A, B> {
fn translate(&mut self, current: u8) -> Option<u8> {
self.stage_a
.translate(current)
.and_then(|c| self.stage_b.translate(c))
}
}
#[derive(Debug)]
pub struct DeleteOperation {
set: Vec<u8>,
}
impl DeleteOperation {
pub fn new(set: Vec<u8>) -> Self {
Self { set }
}
}
impl SymbolTranslator for DeleteOperation {
fn translate(&mut self, current: u8) -> Option<u8> {
self.set.contains(¤t).not().then_some(current)
}
}
#[derive(Debug)]
pub struct TranslateOperation {
translation_map: HashMap<u8, u8>,
}
impl TranslateOperation {
pub fn new(set1: Vec<u8>, set2: Vec<u8>) -> Result<Self, BadSequence> {
if let Some(fallback) = set2.last().copied() {
Ok(Self {
translation_map: set1
.into_iter()
.zip(set2.into_iter().chain(std::iter::repeat(fallback)))
.collect::<HashMap<_, _>>(),
})
} else if set1.is_empty() && set2.is_empty() {
Ok(Self {
translation_map: HashMap::new(),
})
} else {
Err(BadSequence::EmptySet2WhenNotTruncatingSet1)
}
}
}
impl SymbolTranslator for TranslateOperation {
fn translate(&mut self, current: u8) -> Option<u8> {
Some(
self.translation_map
.get(¤t)
.copied()
.unwrap_or(current),
)
}
}
#[derive(Debug, Clone)]
pub struct SqueezeOperation {
set1: HashSet<u8>,
previous: Option<u8>,
}
impl SqueezeOperation {
pub fn new(set1: Vec<u8>) -> Self {
Self {
set1: set1.into_iter().collect(),
previous: None,
}
}
}
impl SymbolTranslator for SqueezeOperation {
fn translate(&mut self, current: u8) -> Option<u8> {
let next = if self.set1.contains(¤t) {
match self.previous {
Some(v) if v == current => None,
_ => Some(current),
}
} else {
Some(current)
};
self.previous = Some(current);
next
}
}
pub fn translate_input<T, R, W>(input: &mut R, output: &mut W, mut translator: T) -> UResult<()>
where
T: SymbolTranslator,
R: BufRead,
W: Write,
{
let mut buf = Vec::new();
let mut output_buf = Vec::new();
while let Ok(length) = input.read_until(b'\n', &mut buf) {
if length == 0 {
break; }
let filtered = buf.iter().filter_map(|&c| translator.translate(c));
output_buf.extend(filtered);
if let Err(e) = output.write_all(&output_buf) {
return Err(USimpleError::new(
1,
format!("{}: write error: {}", uucore::util_name(), e),
));
}
buf.clear();
output_buf.clear();
}
Ok(())
}