use std::{
borrow::Borrow,
cmp::Ordering,
fmt::{Debug, Display, Formatter, Result as FmtResult},
hash::{Hash, Hasher},
ops::{Deref, DerefMut},
};
use encoding_rs::{CoderResult, Encoder, EncoderResult, Encoding, UTF_8};
use serde::Serialize;
use thiserror::Error as ThisError;
use unicase::UniCase;
use unicode_properties::UnicodeGeneralCategory;
use unicode_segmentation::UnicodeSegmentation;
/// Category of an identifier, as decided by its leading character.
///
/// The mapping from leading character to class is implemented by the
/// `From<&Identifier>` conversion for this type.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Class {
    /// An identifier with none of the special leading characters.
    Ordinary,
    /// An identifier that starts with `$`.
    System,
    /// An identifier that starts with `#`.
    Scratch,
    /// An identifier that starts with `!`.
    Macro,
}
impl Class {
    /// Returns `true` for [`Class::Scratch`] and `false` for every other
    /// class.
    // NOTE(review): the name suggests this marks variables that must be
    // "left" between cases -- confirm against the callers.
    pub fn must_leave(self) -> bool {
        matches!(self, Self::Scratch)
    }
}
/// Classifies an identifier by its first character.
impl From<&Identifier> for Class {
    /// Maps a leading `$` to [`Class::System`], `#` to [`Class::Scratch`],
    /// and `!` to [`Class::Macro`].  Anything else, including an empty name,
    /// is [`Class::Ordinary`].
    fn from(id: &Identifier) -> Self {
        match id.0.chars().next() {
            Some('$') => Self::System,
            Some('#') => Self::Scratch,
            Some('!') => Self::Macro,
            _ => Self::Ordinary,
        }
    }
}
/// Character-level rules for which characters may appear in an identifier.
///
/// All methods take `self` by value; the implementation in this file is for
/// `char`.
pub trait IdentifierChar {
/// Returns `true` if `self` may be the first character of an identifier,
/// judged by the ASCII rules alone.
fn ascii_may_start_id(self) -> bool;
/// Returns `true` if `self` may be the first character of an identifier.
fn may_start_id(self) -> bool;
/// Returns `true` if `self` may appear after the first character of an
/// identifier, judged by the ASCII rules alone.
fn ascii_may_continue_id(self) -> bool;
/// Returns `true` if `self` may appear after the first character of an
/// identifier.
fn may_continue_id(self) -> bool;
}
impl IdentifierChar for char {
fn ascii_may_start_id(self) -> bool {
matches!(self, 'a'..='z' | 'A'..='Z' | '@' | '#' | '$' | '!')
}
fn may_start_id(self) -> bool {
if self < '\u{0080}' {
self.ascii_may_start_id()
} else {
use unicode_properties::GeneralCategoryGroup::*;
matches!(self.general_category_group(), Letter | Mark | Symbol)
&& self != char::REPLACEMENT_CHARACTER
}
}
fn ascii_may_continue_id(self) -> bool {
matches!(self, 'a'..='z' | 'A'..='Z' | '0'..='9' | '@' | '#' | '$' | '.' | '_')
}
fn may_continue_id(self) -> bool {
if self < '\u{0080}' {
self.ascii_may_continue_id()
} else {
use unicode_properties::GeneralCategoryGroup::*;
matches!(
self.general_category_group(),
Letter | Mark | Symbol | Number
) && self != char::REPLACEMENT_CHARACTER
}
}
}
/// Reasons a string is rejected as an identifier.
///
/// The `Display` text of each variant comes from its `#[error]` attribute.
#[derive(Clone, Debug, ThisError, PartialEq, Eq)]
pub enum Error {
/// The candidate string was empty.
#[error("Identifier cannot be empty string.")]
Empty,
/// The candidate string is a reserved word; the payload is that string.
#[error("\"{0}\" may not be used as an identifier because it is a reserved word.")]
Reserved(String),
/// The candidate string was exactly `!`.
#[error("\"!\" is not a valid identifier.")]
Bang,
/// The first character `c` of `string` is not allowed to start an
/// identifier.
#[error("{string:?} may not be used as an identifier because it begins with disallowed character {c:?}.")]
BadFirstCharacter { string: String, c: char },
/// A character `c` after the first one in `string` is not allowed in an
/// identifier.
#[error(
"{string:?} may not be used as an identifier because it contains disallowed character {c:?}."
)]
BadLaterCharacter { string: String, c: char },
/// The identifier `id` takes `length` bytes in `encoding`, more than `max`.
// NOTE(review): nothing in this file constructs this variant; presumably it
// is produced elsewhere in the crate -- confirm.
#[error("Identifier \"{id}\" is {length} bytes in the encoding in use ({encoding}), which exceeds the {max}-byte limit.")]
TooLong {
id: String,
length: usize,
encoding: &'static str,
max: usize,
},
/// The character `c` of `id` cannot be represented in `encoding`.
#[error("\"{id}\" may not be used as an identifier because the encoding in use ({encoding}) cannot represent \"{c}\".")]
NotEncodable {
id: String,
encoding: &'static str,
c: char,
},
/// A multiple response set name lacks its required leading character.
// NOTE(review): the variant is named `MissingAt` but the message asks for
// "$"; one of the two looks stale -- confirm which is intended.
#[error("Multiple response set name \"{0}\" does not begin with required \"$\".")]
MissingAt(Identifier),
}
/// The reserved words, none of which may be used as an identifier.
///
/// Use `ReservedWord::try_from(&str)` to recognize one; matching ignores
/// ASCII case.
///
/// The type is a plain field-less enum, so it derives the usual value-type
/// traits.  That lets callers copy it, compare it, and print it in
/// diagnostics or `assert_eq!`.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum ReservedWord {
    /// `AND`
    And,
    /// `OR`
    Or,
    /// `NOT`
    Not,
    /// `EQ`
    Eq,
    /// `GE`
    Ge,
    /// `GT`
    Gt,
    /// `LE`
    Le,
    /// `LT`
    Lt,
    /// `NE`
    Ne,
    /// `ALL`
    All,
    /// `BY`
    By,
    /// `TO`
    To,
    /// `WITH`
    With,
}
/// Recognizes a reserved word, ignoring ASCII case.
impl TryFrom<&str> for ReservedWord {
    type Error = ();

    /// Returns the reserved word spelled by `source`, or `Err(())` if
    /// `source` is not one.
    fn try_from(source: &str) -> Result<Self, Self::Error> {
        let bytes = source.as_bytes();
        // Every reserved word is 2 to 4 bytes long.
        if !(2..=4).contains(&bytes.len()) {
            return Err(());
        }

        // Uppercase into a fixed stack buffer so that the comparison needs no
        // allocation.  Non-ASCII bytes are left untouched and therefore never
        // match a keyword.
        let mut upper = [0u8; 4];
        upper[..bytes.len()].copy_from_slice(bytes);
        upper.make_ascii_uppercase();

        match &upper[..bytes.len()] {
            b"AND" => Ok(Self::And),
            b"OR" => Ok(Self::Or),
            b"NOT" => Ok(Self::Not),
            b"EQ" => Ok(Self::Eq),
            b"GE" => Ok(Self::Ge),
            b"GT" => Ok(Self::Gt),
            b"LE" => Ok(Self::Le),
            b"LT" => Ok(Self::Lt),
            b"NE" => Ok(Self::Ne),
            b"ALL" => Ok(Self::All),
            b"BY" => Ok(Self::By),
            b"TO" => Ok(Self::To),
            b"WITH" => Ok(Self::With),
            _ => Err(()),
        }
    }
}
/// Returns `true` if `s` converts to a [`ReservedWord`].
pub fn is_reserved_word(s: &str) -> bool {
    matches!(ReservedWord::try_from(s), Ok(_))
}
/// An identifier, stored as a `UniCase<String>`.
///
/// Equality, ordering, and hashing are all derived from the wrapped
/// `UniCase` value.  The field is public, so the type itself does not
/// guarantee validity; the constructors in `impl Identifier` are what
/// validate a name.
#[derive(Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub struct Identifier(pub UniCase<String>);
impl Identifier {
pub const MAX_LEN: usize = 64;
fn new_unchecked(s: impl Into<UniCase<String>>) -> Self {
let s: UniCase<String> = s.into();
debug_assert!(Self::check_plausible(&s).is_ok());
Identifier(s)
}
pub fn new(s: impl Into<UniCase<String>>) -> Result<Self, Error> {
Self::from_encoding(s, UTF_8)
}
pub fn codepage_to_unicode(&mut self) {
while self.len() > Self::MAX_LEN {
let (new_len, _) = self.as_str().grapheme_indices(true).next_back().unwrap();
self.0.truncate(new_len);
if self.0.is_empty() {
*self = Identifier::new("VAR").unwrap();
return;
}
}
}
pub fn from_encoding(
s: impl Into<UniCase<String>>,
encoding: &'static Encoding,
) -> Result<Identifier, Error> {
let s: UniCase<String> = s.into();
Self::check_plausible(&s)?;
let identifier = Identifier(s);
identifier.check_encoding(encoding)?;
Ok(identifier)
}
pub fn check_encoding(&self, encoding: &'static Encoding) -> Result<(), Error> {
let s = self.0.as_str();
let (_encoded, _, unencodable) = encoding.encode(s);
if unencodable {
let mut encoder = encoding.new_encoder();
let mut buf = Vec::with_capacity(
encoder
.max_buffer_length_from_utf8_without_replacement(s.len())
.unwrap(),
);
let EncoderResult::Unmappable(c) = encoder
.encode_from_utf8_to_vec_without_replacement(s, &mut buf, true)
.0
else {
unreachable!();
};
return Err(Error::NotEncodable {
id: s.into(),
encoding: encoding.name(),
c,
});
}
Ok(())
}
pub fn check_plausible(s: &str) -> Result<(), Error> {
if s.is_empty() {
return Err(Error::Empty);
}
if is_reserved_word(s) {
return Err(Error::Reserved(s.into()));
}
if s == "!" {
return Err(Error::Bang);
}
let mut i = s.chars();
let first = i.next().unwrap();
if !first.may_start_id() {
return Err(Error::BadFirstCharacter {
string: s.into(),
c: first,
});
}
for c in i {
if !c.may_continue_id() {
return Err(Error::BadLaterCharacter {
string: s.into(),
c,
});
}
}
Ok(())
}
pub fn matches_keyword(&self, keyword: &str) -> bool {
id_match_n_nonstatic(keyword, self.0.as_str(), 3)
}
pub fn matches_keyword_n(&self, keyword: &str, n: usize) -> bool {
id_match_n_nonstatic(keyword, self.0.as_str(), n)
}
pub fn must_be_ordinary(self) -> Result<Self, Error> {
match Class::from(&self) {
Class::Ordinary => Ok(self),
_ => {
let s = self.0.into_inner();
let first = s.chars().next().unwrap();
Err(Error::BadFirstCharacter {
string: s,
c: first,
})
}
}
}
pub fn class(&self) -> Class {
self.into()
}
pub fn as_str(&self) -> &str {
self.0.as_ref()
}
pub fn shortened(&self, encoding: &'static Encoding) -> Self {
let new_len = shortened_len(self, "", encoding, 8);
Self::new_unchecked(self.0[..new_len].to_string())
}
pub fn with_suffix(
&self,
suffix: &str,
encoding: &'static Encoding,
max_len: usize,
) -> Result<Self, Error> {
let prefix_len = shortened_len(self, suffix, encoding, max_len);
if prefix_len == 0 {
Self::new(suffix)
} else {
Self::new(format!("{}{suffix}", &self[..prefix_len]))
}
}
}
impl Serialize for Identifier {
fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
where
S: serde::Serializer,
{
self.0.as_str().serialize(serializer)
}
}
/// Encodes all of `src` with `encoder`, appending the output to `dst` and
/// growing `dst` as often as needed.
///
/// `last` is passed through to the encoder on every call.
///
/// The encoder writes only into the spare capacity that `dst` already has,
/// so each time it reports that the output is full this function reserves
/// more room and retries with the unread remainder of `src`.
fn encode_fully(encoder: &mut Encoder, mut src: &str, dst: &mut Vec<u8>, last: bool) {
    /// Smallest capacity to grow to.  Doubling alone gets stuck at zero: a
    /// `dst` without any capacity would request 0 extra bytes forever.
    const MIN_CAPACITY: usize = 16;

    while let (CoderResult::OutputFull, read, _) = encoder.encode_from_utf8_to_vec(src, dst, last) {
        src = &src[read..];
        // At least double the capacity.  `target` is never below
        // `dst.len()`, because `len <= capacity`, so the subtraction cannot
        // underflow.
        let target = dst.capacity().saturating_mul(2).max(MIN_CAPACITY);
        dst.reserve(target - dst.len());
    }
}
/// Returns how many leading bytes of `prefix` can be kept so that the kept
/// part followed by `suffix` takes at most `max_len` bytes in `encoding`.
///
/// The result always falls on a grapheme cluster boundary of `prefix` and is
/// 0 when not even the first cluster fits.
///
/// # Panics
///
/// Panics if `max_len` is greater than 64.
fn shortened_len(prefix: &str, suffix: &str, encoding: &'static Encoding, max_len: usize) -> usize {
    assert!(max_len <= 64);

    // Byte offsets in `prefix` at which each grapheme cluster ends, in
    // increasing order.  Both branches keep the last offset that still fits.
    let mut cluster_ends = prefix
        .grapheme_indices(true)
        .map(|(start, cluster)| start + cluster.len());

    if encoding != UTF_8 {
        // The encoded size has to be measured by actually encoding each
        // candidate prefix, followed by the suffix.
        let mut encoded = Vec::with_capacity(max_len);
        return cluster_ends
            .take_while(|&end| {
                let mut encoder = encoding.new_encoder();
                encoded.clear();
                encode_fully(&mut encoder, &prefix[..end], &mut encoded, false);
                // Skip encoding the suffix when the prefix alone is already
                // too long.
                if encoded.len() <= max_len {
                    encode_fully(&mut encoder, suffix, &mut encoded, true);
                }
                encoded.len() <= max_len
            })
            .last()
            .unwrap_or(0);
    }

    // UTF-8: encoded length equals `str` length, so no encoding is needed.
    if prefix.len() + suffix.len() <= max_len {
        return prefix.len();
    }
    if suffix.len() >= max_len {
        return 0;
    }
    let budget = max_len - suffix.len();
    cluster_ends
        .by_ref()
        .take_while(|&end| end <= budget)
        .last()
        .unwrap_or(0)
}
impl Deref for Identifier {
type Target = UniCase<String>;
fn deref(&self) -> &Self::Target {
&self.0
}
}
/// Compares an identifier with a plain string using `UniCase` equality.
impl PartialEq<str> for Identifier {
    fn eq(&self, other: &str) -> bool {
        self.0 == UniCase::new(other)
    }
}
/// Returns `true` if `token` matches `keyword` or an abbreviation of it at
/// least 3 bytes long.
///
/// Shorthand for [`id_match_n`] with `n` set to 3.
pub fn id_match(keyword: &'static str, token: &str) -> bool {
    const MIN_ABBREVIATION: usize = 3;
    id_match_n(keyword, token, MIN_ABBREVIATION)
}
/// Returns `true` if `token` matches `keyword` or an abbreviation of it at
/// least `n` bytes long.
///
/// This is [`id_match_n_nonstatic`] restricted to `'static` keywords.
pub fn id_match_n(keyword: &'static str, token: &str, n: usize) -> bool {
    let min_abbreviation = n;
    id_match_n_nonstatic(keyword, token, min_abbreviation)
}
/// Returns `true` if `token` matches `keyword` or an abbreviation of it at
/// least `n` bytes long, ignoring ASCII case.
///
/// An abbreviation is a proper prefix of `keyword` whose length is at least
/// `n`.  A token shorter than `n` therefore matches only if it equals the
/// whole keyword.
///
/// `keyword` is expected to be ASCII; debug builds assert this.
pub fn id_match_n_nonstatic(keyword: &str, token: &str, n: usize) -> bool {
    debug_assert!(keyword.is_ascii());

    let token_len = token.len();
    let is_abbreviation_length = n <= token_len && token_len < keyword.len();
    let candidate = if is_abbreviation_length {
        &keyword[..token_len]
    } else {
        keyword
    };
    candidate.eq_ignore_ascii_case(token)
}
/// Displays the identifier's text.
impl Display for Identifier {
    fn fmt(&self, f: &mut Formatter) -> FmtResult {
        // Going through `format_args!` means width, fill, and precision
        // flags given by the caller are not applied to the text.
        f.write_fmt(format_args!("{}", self.0))
    }
}
/// Debug-formats the identifier as its wrapped value, without a surrounding
/// `Identifier(..)`.
impl Debug for Identifier {
    fn fmt(&self, f: &mut Formatter<'_>) -> FmtResult {
        // Going through `format_args!` means flags given by the caller, such
        // as `#` or a width, are not forwarded to the wrapped value.
        f.write_fmt(format_args!("{:?}", self.0))
    }
}
/// Implemented by types that carry an identifier.
///
/// [`ByIdentifier`] uses the returned value for its equality, ordering,
/// hashing, and `Borrow` implementations.
pub trait HasIdentifier {
/// Returns the identifier of `self`.
fn identifier(&self) -> &UniCase<String>;
}
/// Wrapper around a `T` that is compared, ordered, and hashed by
/// `T::identifier()` alone.
///
/// The trait impls in this file look only at the identifier, so two wrapped
/// values with equal identifiers are equal even if the rest of `T` differs.
/// The `Borrow<UniCase<String>>` impl allows lookup by identifier in hashed
/// or ordered collections.
pub struct ByIdentifier<T>(pub T)
where
T: HasIdentifier;
impl<T> ByIdentifier<T>
where
T: HasIdentifier,
{
pub fn new(inner: T) -> Self {
Self(inner)
}
}
/// Two wrappers are equal when their identifiers are equal; the rest of `T`
/// is not looked at.
impl<T> PartialEq for ByIdentifier<T>
where
    T: HasIdentifier,
{
    fn eq(&self, other: &Self) -> bool {
        let (lhs, rhs) = (self.0.identifier(), other.0.identifier());
        lhs == rhs
    }
}
// Marker impl: the `PartialEq` impl compares identifiers with `eq`, so it is
// treated as a full equivalence relation here.
impl<T> Eq for ByIdentifier<T> where T: HasIdentifier {}
/// Ordering is total, so this always defers to the `Ord` impl.
impl<T> PartialOrd for ByIdentifier<T>
where
    T: HasIdentifier,
{
    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
        Some(Ord::cmp(self, other))
    }
}
/// Wrappers are ordered by their identifiers; the rest of `T` is not looked
/// at.
impl<T> Ord for ByIdentifier<T>
where
    T: HasIdentifier,
{
    fn cmp(&self, other: &Self) -> Ordering {
        Ord::cmp(self.0.identifier(), other.0.identifier())
    }
}
/// Hashes only the identifier, so the hash agrees with the `PartialEq` impl
/// and with the hash of the identifier itself.
impl<T> Hash for ByIdentifier<T>
where
    T: HasIdentifier,
{
    fn hash<H: Hasher>(&self, state: &mut H) {
        Hash::hash(self.0.identifier(), state)
    }
}
/// Exposes the identifier as the borrowed form of the wrapper, so that a
/// collection of `ByIdentifier<T>` can be queried with a `&UniCase<String>`.
impl<T> Borrow<UniCase<String>> for ByIdentifier<T>
where
    T: HasIdentifier,
{
    fn borrow(&self) -> &UniCase<String> {
        let Self(inner) = self;
        inner.identifier()
    }
}
/// Debug-formats the wrapped value directly, without a surrounding
/// `ByIdentifier(..)`.  Formatter flags are forwarded to the wrapped value.
impl<T> Debug for ByIdentifier<T>
where
    T: HasIdentifier + Debug,
{
    fn fmt(&self, f: &mut Formatter) -> FmtResult {
        Debug::fmt(&self.0, f)
    }
}
impl<T> Clone for ByIdentifier<T>
where
T: HasIdentifier + Clone,
{
fn clone(&self) -> Self {
Self(self.0.clone())
}
}
/// Gives shared access to the wrapped value.
///
/// Dereferencing never clones, so the only bound is the `HasIdentifier`
/// that the struct itself requires; payloads that are not `Clone` work too.
impl<T> Deref for ByIdentifier<T>
where
    T: HasIdentifier,
{
    type Target = T;

    fn deref(&self) -> &Self::Target {
        &self.0
    }
}

/// Gives mutable access to the wrapped value.
///
/// The wrapper's equality, ordering, and hash are all derived from
/// `T::identifier()`.  Changing the identifier through this reference while
/// the wrapper is stored in a hashed or ordered collection therefore
/// invalidates its position there.
impl<T> DerefMut for ByIdentifier<T>
where
    T: HasIdentifier,
{
    fn deref_mut(&mut self) -> &mut Self::Target {
        &mut self.0
    }
}
/// Serializes the wrapped value directly, as if the wrapper were not there.
///
/// Serializing only borrows the value, so `T` does not need to be `Clone`.
impl<T> Serialize for ByIdentifier<T>
where
    T: HasIdentifier + Serialize,
{
    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
    where
        S: serde::Serializer,
    {
        self.0.serialize(serializer)
    }
}
// Unit tests for `Identifier` and `ByIdentifier`.
//
// Several expectations depend on the exact byte lengths of the non-ASCII
// literals below, so those literals must not be normalized or re-encoded.
#[cfg(test)]
mod tests {
use std::collections::HashSet;
use encoding_rs::{Encoding, UTF_8, WINDOWS_1252};
use unicase::UniCase;
use crate::identifier::Identifier;
use super::{ByIdentifier, HasIdentifier};
// A value whose derived equality looks at both fields, used to show that
// `ByIdentifier` looks at the name only.
#[derive(PartialEq, Eq, Debug, Clone)]
struct SimpleVar {
name: Identifier,
value: i32,
}
impl HasIdentifier for SimpleVar {
fn identifier(&self) -> &UniCase<String> {
&self.name.0
}
}
// `ByIdentifier` equality, hashing, and lookup by borrowed identifier.
#[test]
fn identifier() {
let abcd = Identifier::new("abcd").unwrap();
let abcd1 = SimpleVar {
name: abcd.clone(),
value: 1,
};
let abcd2 = SimpleVar {
name: abcd,
value: 2,
};
// Same name, different value: unequal as plain values...
assert_ne!(abcd1, abcd2);
let abcd1_by_name = ByIdentifier::new(abcd1);
let abcd2_by_name = ByIdentifier::new(abcd2);
// ...but equal once wrapped.
assert_eq!(abcd1_by_name, abcd2_by_name);
let mut vars: HashSet<ByIdentifier<SimpleVar>> = HashSet::new();
assert!(vars.insert(ByIdentifier::new(abcd1_by_name.0.clone())));
// The second insert is rejected because the names collide.
assert!(!vars.insert(ByIdentifier::new(abcd2_by_name.0.clone())));
// Lookup by a bare `UniCase<String>` works through the `Borrow` impl and
// finds the value that was inserted first.
assert_eq!(
vars.get(&UniCase::new(String::from("abcd")))
.unwrap()
.0
.value,
1
);
}
// `Identifier::with_suffix`: columns are head, suffix, encoding, max_len,
// and the expected result.
#[test]
fn with_suffix() {
for (head, suffix, encoding, max_len, expected) in [
("abc", "xyz", UTF_8, 6, "abcxyz"),
("abcd", "xyz", UTF_8, 6, "abcxyz"),
// The suffix alone fills or exceeds max_len, so none of the head is kept.
("abcd", "uvwxyz", UTF_8, 6, "uvwxyz"),
("abc", "tuvwxyz", UTF_8, 6, "tuvwxyz"),
// Each accented letter is 2 bytes in UTF-8, so only one fits next to the
// 3-byte suffix.
("éèä", "xyz", UTF_8, 6, "éxyz"),
// In windows-1252 each accented letter is 1 byte, so all three fit.
("éèä", "xyz", WINDOWS_1252, 6, "éèäxyz"),
] {
let head = Identifier::new(head).unwrap();
let suffix = Identifier::new(suffix).unwrap();
let actual = head.with_suffix(&suffix, encoding, max_len).unwrap();
assert_eq!(&actual, expected);
}
}
// `Identifier::shortened` truncates to 8 bytes in the given encoding.
#[test]
fn shortened() {
for (long, expected_short, encoding) in [
("abc", "abc", UTF_8),
// Four 2-byte letters already use all 8 bytes in UTF-8.
("éèäîVarNameA", "éèäî", UTF_8),
("éèäîVarNameA", "éèäîVarN", WINDOWS_1252),
] {
let long = Identifier::new(long).unwrap();
let short = long.shortened(encoding);
assert_eq!(&short, expected_short);
}
}
// `Identifier::codepage_to_unicode` trims names longer than 64 UTF-8 bytes.
#[test]
fn codepage_to_unicode() {
fn check_unicode(identifier: &str, encoding: &'static Encoding, expected: &str) {
let identifier = Identifier::from_encoding(String::from(identifier), encoding).unwrap();
let mut actual = identifier.clone();
actual.codepage_to_unicode();
assert_eq!(actual.as_str(), expected);
}
check_unicode("abc", UTF_8, "abc");
check_unicode("éèäî", UTF_8, "éèäî");
// 32 two-byte letters are exactly 64 bytes: left unchanged.
check_unicode(
"éèäîéèäîéèäîéèäîéèäîéèäîéèäîéèäî",
WINDOWS_1252,
"éèäîéèäîéèäîéèäîéèäîéèäîéèäîéèäî",
);
// 65 bytes: the last letter is dropped, leaving 63 bytes.
check_unicode(
"xéèäîéèäîéèäîéèäîéèäîéèäîéèäîéèäî",
WINDOWS_1252,
"xéèäîéèäîéèäîéèäîéèäîéèäîéèäîéèä",
);
// 66 bytes: the last letter is dropped, leaving exactly 64 bytes.
check_unicode(
"xyéèäîéèäîéèäîéèäîéèäîéèäîéèäîéèäî",
WINDOWS_1252,
"xyéèäîéèäîéèäîéèäîéèäîéèäîéèäîéèä",
);
}
}