use crate::ast::Literal;
use crate::diagnostics::{Error as DiagnosticError, Result};
use crate::effects::Effect;
use crate::eval::value::{PrimitiveProcedure, PrimitiveImpl, ThreadSafeEnvironment, Value};
use std::collections::BTreeSet;
use std::fmt;
use std::sync::Arc;
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct CharSet {
chars: BTreeSet<char>,
}
impl CharSet {
pub fn new() -> Self {
Self {
chars: BTreeSet::new(),
}
}
pub fn from_chars<I>(chars: I) -> Self
where
I: IntoIterator<Item = char>,
{
Self {
chars: chars.into_iter().collect(),
}
}
pub fn from_string(s: &str) -> Self {
Self::from_chars(s.chars())
}
pub fn from_range(start: char, end: char) -> Self {
if start > end {
return Self::new();
}
let start_code = start as u32;
let end_code = end as u32;
let chars = (start_code..=end_code)
.filter_map(char::from_u32)
.collect();
Self { chars }
}
pub fn singleton(c: char) -> Self {
let mut chars = BTreeSet::new();
chars.insert(c);
Self { chars }
}
pub fn is_empty(&self) -> bool {
self.chars.is_empty()
}
pub fn size(&self) -> usize {
self.chars.len()
}
pub fn contains(&self, c: char) -> bool {
self.chars.contains(&c)
}
pub fn insert(&self, c: char) -> Self {
let mut new_chars = self.chars.clone();
new_chars.insert(c);
Self { chars: new_chars }
}
pub fn remove(&self, c: char) -> Self {
let mut new_chars = self.chars.clone();
new_chars.remove(&c);
Self { chars: new_chars }
}
pub fn union(&self, other: &Self) -> Self {
let chars = self.chars.union(&other.chars).cloned().collect();
Self { chars }
}
pub fn intersection(&self, other: &Self) -> Self {
let chars = self.chars.intersection(&other.chars).cloned().collect();
Self { chars }
}
pub fn difference(&self, other: &Self) -> Self {
let chars = self.chars.difference(&other.chars).cloned().collect();
Self { chars }
}
pub fn symmetric_difference(&self, other: &Self) -> Self {
let chars = self.chars.symmetric_difference(&other.chars).cloned().collect();
Self { chars }
}
pub fn complement(&self) -> Self {
let mut complement_chars = BTreeSet::new();
for code in 32..=126 {
if let Some(c) = char::from_u32(code) {
if !self.chars.contains(&c) {
complement_chars.insert(c);
}
}
}
let whitespace_chars = ['\t', '\n', '\r', ' '];
for &c in &whitespace_chars {
if !self.chars.contains(&c) {
complement_chars.insert(c);
}
}
Self { chars: complement_chars }
}
pub fn is_subset(&self, other: &Self) -> bool {
self.chars.is_subset(&other.chars)
}
pub fn is_equal(&self, other: &Self) -> bool {
self.chars == other.chars
}
pub fn iter(&self) -> impl Iterator<Item = &char> {
self.chars.iter()
}
pub fn to_vec(&self) -> Vec<char> {
self.chars.iter().cloned().collect()
}
pub fn filter<F>(&self, predicate: F) -> Self
where
F: Fn(char) -> bool,
{
let chars = self.chars.iter().filter(|&&c| predicate(c)).cloned().collect();
Self { chars }
}
pub fn count<F>(&self, predicate: F) -> usize
where
F: Fn(char) -> bool,
{
self.chars.iter().filter(|&&c| predicate(c)).count()
}
}
impl Default for CharSet {
fn default() -> Self {
Self::new()
}
}
impl fmt::Display for CharSet {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(f, "#<char-set")?;
if self.chars.is_empty() {
write!(f, " empty")?;
} else {
write!(f, " size={}", self.size())?;
if self.size() <= 10 {
write!(f, " {{")?;
for (i, &c) in self.chars.iter().enumerate() {
if i > 0 {
write!(f, " ")?;
}
if c.is_ascii_graphic() || c == ' ' {
write!(f, "{c}")?;
} else {
write!(f, "\\u{{{:04x}}}", c as u32)?;
}
}
write!(f, "}}")?;
}
}
write!(f, ">")
}
}
pub struct StandardCharSets;
impl StandardCharSets {
pub fn lower_case() -> CharSet {
CharSet::from_chars(('a'..='z').chain('à '..='ÿ').filter(|c| c.is_lowercase()))
}
pub fn upper_case() -> CharSet {
CharSet::from_chars(('A'..='Z').chain('À'..='Þ').filter(|c| c.is_uppercase()))
}
pub fn digit() -> CharSet {
CharSet::from_chars('0'..='9')
}
pub fn letter() -> CharSet {
Self::lower_case().union(&Self::upper_case())
}
pub fn letter_plus_digit() -> CharSet {
Self::letter().union(&Self::digit())
}
pub fn whitespace() -> CharSet {
CharSet::from_chars([' ', '\t', '\n', '\r', '\x0C', '\x0B'].iter().cloned())
}
pub fn punctuation() -> CharSet {
let punct_chars = "!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~";
CharSet::from_string(punct_chars)
}
pub fn graphic() -> CharSet {
CharSet::from_chars((33..=126).filter_map(char::from_u32))
}
pub fn printing() -> CharSet {
Self::graphic().union(&CharSet::singleton(' '))
}
pub fn ascii() -> CharSet {
CharSet::from_chars((0..=127).filter_map(char::from_u32))
}
pub fn empty() -> CharSet {
CharSet::new()
}
pub fn full() -> CharSet {
Self::ascii()
}
pub fn hex_digit() -> CharSet {
CharSet::from_string("0123456789ABCDEFabcdef")
}
pub fn blank() -> CharSet {
CharSet::from_chars([' ', '\t'].iter().cloned())
}
pub fn iso_control() -> CharSet {
CharSet::from_chars((0..=31).chain(127..=159).filter_map(char::from_u32))
}
}
pub fn create_charset_bindings(env: &Arc<ThreadSafeEnvironment>) {
bind_charset_predicates(env);
bind_charset_constructors(env);
bind_charset_operations(env);
bind_standard_charsets(env);
bind_charset_conversions(env);
}
fn bind_charset_predicates(env: &Arc<ThreadSafeEnvironment>) {
env.define("char-set?".to_string(), Value::Primitive(Arc::new(PrimitiveProcedure {
name: "char-set?".to_string(),
arity_min: 1,
arity_max: Some(1),
implementation: PrimitiveImpl::RustFn(primitive_char_set_p),
effects: vec![Effect::Pure],
})));
env.define("char-set=".to_string(), Value::Primitive(Arc::new(PrimitiveProcedure {
name: "char-set=".to_string(),
arity_min: 2,
arity_max: None,
implementation: PrimitiveImpl::RustFn(primitive_char_set_equal),
effects: vec![Effect::Pure],
})));
env.define("char-set<=".to_string(), Value::Primitive(Arc::new(PrimitiveProcedure {
name: "char-set<=".to_string(),
arity_min: 2,
arity_max: None,
implementation: PrimitiveImpl::RustFn(primitive_char_set_subset),
effects: vec![Effect::Pure],
})));
env.define("char-set-contains?".to_string(), Value::Primitive(Arc::new(PrimitiveProcedure {
name: "char-set-contains?".to_string(),
arity_min: 2,
arity_max: Some(2),
implementation: PrimitiveImpl::RustFn(primitive_char_set_contains),
effects: vec![Effect::Pure],
})));
}
fn bind_charset_constructors(env: &Arc<ThreadSafeEnvironment>) {
env.define("char-set".to_string(), Value::Primitive(Arc::new(PrimitiveProcedure {
name: "char-set".to_string(),
arity_min: 0,
arity_max: None,
implementation: PrimitiveImpl::RustFn(primitive_char_set),
effects: vec![Effect::Pure],
})));
env.define("list->char-set".to_string(), Value::Primitive(Arc::new(PrimitiveProcedure {
name: "list->char-set".to_string(),
arity_min: 1,
arity_max: Some(2),
implementation: PrimitiveImpl::RustFn(primitive_list_to_char_set),
effects: vec![Effect::Pure],
})));
env.define("string->char-set".to_string(), Value::Primitive(Arc::new(PrimitiveProcedure {
name: "string->char-set".to_string(),
arity_min: 1,
arity_max: Some(2),
implementation: PrimitiveImpl::RustFn(primitive_string_to_char_set),
effects: vec![Effect::Pure],
})));
env.define("char-set-filter".to_string(), Value::Primitive(Arc::new(PrimitiveProcedure {
name: "char-set-filter".to_string(),
arity_min: 2,
arity_max: Some(3),
implementation: PrimitiveImpl::RustFn(primitive_char_set_filter),
effects: vec![Effect::Pure],
})));
env.define("ucs-range->char-set".to_string(), Value::Primitive(Arc::new(PrimitiveProcedure {
name: "ucs-range->char-set".to_string(),
arity_min: 2,
arity_max: Some(4),
implementation: PrimitiveImpl::RustFn(primitive_ucs_range_to_char_set),
effects: vec![Effect::Pure],
})));
}
fn bind_charset_operations(env: &Arc<ThreadSafeEnvironment>) {
env.define("char-set-size".to_string(), Value::Primitive(Arc::new(PrimitiveProcedure {
name: "char-set-size".to_string(),
arity_min: 1,
arity_max: Some(1),
implementation: PrimitiveImpl::RustFn(primitive_char_set_size),
effects: vec![Effect::Pure],
})));
env.define("char-set-count".to_string(), Value::Primitive(Arc::new(PrimitiveProcedure {
name: "char-set-count".to_string(),
arity_min: 2,
arity_max: Some(2),
implementation: PrimitiveImpl::RustFn(primitive_char_set_count),
effects: vec![Effect::Pure],
})));
env.define("char-set-union".to_string(), Value::Primitive(Arc::new(PrimitiveProcedure {
name: "char-set-union".to_string(),
arity_min: 0,
arity_max: None,
implementation: PrimitiveImpl::RustFn(primitive_char_set_union),
effects: vec![Effect::Pure],
})));
env.define("char-set-intersection".to_string(), Value::Primitive(Arc::new(PrimitiveProcedure {
name: "char-set-intersection".to_string(),
arity_min: 0,
arity_max: None,
implementation: PrimitiveImpl::RustFn(primitive_char_set_intersection),
effects: vec![Effect::Pure],
})));
env.define("char-set-difference".to_string(), Value::Primitive(Arc::new(PrimitiveProcedure {
name: "char-set-difference".to_string(),
arity_min: 1,
arity_max: None,
implementation: PrimitiveImpl::RustFn(primitive_char_set_difference),
effects: vec![Effect::Pure],
})));
env.define("char-set-complement".to_string(), Value::Primitive(Arc::new(PrimitiveProcedure {
name: "char-set-complement".to_string(),
arity_min: 1,
arity_max: Some(1),
implementation: PrimitiveImpl::RustFn(primitive_char_set_complement),
effects: vec![Effect::Pure],
})));
env.define("char-set-xor".to_string(), Value::Primitive(Arc::new(PrimitiveProcedure {
name: "char-set-xor".to_string(),
arity_min: 0,
arity_max: None,
implementation: PrimitiveImpl::RustFn(primitive_char_set_xor),
effects: vec![Effect::Pure],
})));
}
fn bind_standard_charsets(env: &Arc<ThreadSafeEnvironment>) {
env.define("char-set:lower-case".to_string(), Value::CharSet(Arc::new(StandardCharSets::lower_case())));
env.define("char-set:upper-case".to_string(), Value::CharSet(Arc::new(StandardCharSets::upper_case())));
env.define("char-set:digit".to_string(), Value::CharSet(Arc::new(StandardCharSets::digit())));
env.define("char-set:letter".to_string(), Value::CharSet(Arc::new(StandardCharSets::letter())));
env.define("char-set:letter+digit".to_string(), Value::CharSet(Arc::new(StandardCharSets::letter_plus_digit())));
env.define("char-set:whitespace".to_string(), Value::CharSet(Arc::new(StandardCharSets::whitespace())));
env.define("char-set:punctuation".to_string(), Value::CharSet(Arc::new(StandardCharSets::punctuation())));
env.define("char-set:graphic".to_string(), Value::CharSet(Arc::new(StandardCharSets::graphic())));
env.define("char-set:printing".to_string(), Value::CharSet(Arc::new(StandardCharSets::printing())));
env.define("char-set:ascii".to_string(), Value::CharSet(Arc::new(StandardCharSets::ascii())));
env.define("char-set:empty".to_string(), Value::CharSet(Arc::new(StandardCharSets::empty())));
env.define("char-set:full".to_string(), Value::CharSet(Arc::new(StandardCharSets::full())));
env.define("char-set:hex-digit".to_string(), Value::CharSet(Arc::new(StandardCharSets::hex_digit())));
env.define("char-set:blank".to_string(), Value::CharSet(Arc::new(StandardCharSets::blank())));
env.define("char-set:iso-control".to_string(), Value::CharSet(Arc::new(StandardCharSets::iso_control())));
}
fn bind_charset_conversions(env: &Arc<ThreadSafeEnvironment>) {
env.define("char-set->list".to_string(), Value::Primitive(Arc::new(PrimitiveProcedure {
name: "char-set->list".to_string(),
arity_min: 1,
arity_max: Some(1),
implementation: PrimitiveImpl::RustFn(primitive_char_set_to_list),
effects: vec![Effect::Pure],
})));
env.define("char-set->string".to_string(), Value::Primitive(Arc::new(PrimitiveProcedure {
name: "char-set->string".to_string(),
arity_min: 1,
arity_max: Some(1),
implementation: PrimitiveImpl::RustFn(primitive_char_set_to_string),
effects: vec![Effect::Pure],
})));
}
fn get_charset(value: &Value) -> Result<&CharSet> {
match value {
Value::CharSet(charset) => Ok(charset),
_ => Err(Box::new(DiagnosticError::runtime_error(
"Expected character set".to_string(),
None,
))),
}
}
fn get_char(value: &Value) -> Result<char> {
match value {
Value::Literal(Literal::Character(c)) => Ok(*c),
_ => Err(Box::new(DiagnosticError::runtime_error(
"Expected character".to_string(),
None,
))),
}
}
fn primitive_char_set_p(args: &[Value]) -> Result<Value> {
if args.len() != 1 {
return Err(Box::new(DiagnosticError::runtime_error(
format!("char-set? expects 1 argument, got {}", args.len()),
None,
)));
}
Ok(Value::boolean(matches!(args[0], Value::CharSet(_))))
}
fn primitive_char_set_equal(args: &[Value]) -> Result<Value> {
if args.len() < 2 {
return Err(Box::new(DiagnosticError::runtime_error(
"char-set= requires at least 2 arguments".to_string(),
None,
)));
}
let first = get_charset(&args[0])?;
for arg in &args[1..] {
let charset = get_charset(arg)?;
if !first.is_equal(charset) {
return Ok(Value::boolean(false));
}
}
Ok(Value::boolean(true))
}
fn primitive_char_set_subset(args: &[Value]) -> Result<Value> {
if args.len() < 2 {
return Err(Box::new(DiagnosticError::runtime_error(
"char-set<= requires at least 2 arguments".to_string(),
None,
)));
}
for i in 0..args.len() - 1 {
let current = get_charset(&args[i])?;
let next = get_charset(&args[i + 1])?;
if !current.is_subset(next) {
return Ok(Value::boolean(false));
}
}
Ok(Value::boolean(true))
}
fn primitive_char_set_contains(args: &[Value]) -> Result<Value> {
if args.len() != 2 {
return Err(Box::new(DiagnosticError::runtime_error(
format!("char-set-contains? expects 2 arguments, got {}", args.len()),
None,
)));
}
let charset = get_charset(&args[0])?;
let c = get_char(&args[1])?;
Ok(Value::boolean(charset.contains(c)))
}
fn primitive_char_set(args: &[Value]) -> Result<Value> {
let mut chars = Vec::new();
for arg in args {
let c = get_char(arg)?;
chars.push(c);
}
Ok(Value::CharSet(Arc::new(CharSet::from_chars(chars))))
}
fn primitive_list_to_char_set(args: &[Value]) -> Result<Value> {
if args.is_empty() || args.len() > 2 {
return Err(Box::new(DiagnosticError::runtime_error(
format!("list->char-set expects 1 or 2 arguments, got {}", args.len()),
None,
)));
}
let char_list = args[0].as_list().ok_or_else(|| {
Box::new(DiagnosticError::runtime_error("Expected a list".to_string(), None))
})?;
let mut chars = Vec::new();
for value in char_list {
let c = get_char(&value)?;
chars.push(c);
}
let base_charset = if args.len() == 2 {
get_charset(&args[1])?.clone()
} else {
CharSet::new()
};
let new_charset = CharSet::from_chars(chars).union(&base_charset);
Ok(Value::CharSet(Arc::new(new_charset)))
}
fn primitive_string_to_char_set(args: &[Value]) -> Result<Value> {
if args.is_empty() || args.len() > 2 {
return Err(Box::new(DiagnosticError::runtime_error(
format!("string->char-set expects 1 or 2 arguments, got {}", args.len()),
None,
)));
}
let s = args[0].as_string().ok_or_else(|| {
Box::new(DiagnosticError::runtime_error("Expected a string".to_string(), None))
})?;
let base_charset = if args.len() == 2 {
get_charset(&args[1])?.clone()
} else {
CharSet::new()
};
let new_charset = CharSet::from_string(s).union(&base_charset);
Ok(Value::CharSet(Arc::new(new_charset)))
}
fn primitive_char_set_filter(args: &[Value]) -> Result<Value> {
if args.len() < 2 || args.len() > 3 {
return Err(Box::new(DiagnosticError::runtime_error(
format!("char-set-filter expects 2 or 3 arguments, got {}", args.len()),
None,
)));
}
Err(Box::new(DiagnosticError::runtime_error(
"char-set-filter with procedure predicates not yet implemented".to_string(),
None,
)))
}
fn primitive_ucs_range_to_char_set(args: &[Value]) -> Result<Value> {
if args.len() < 2 || args.len() > 4 {
return Err(Box::new(DiagnosticError::runtime_error(
format!("ucs-range->char-set expects 2-4 arguments, got {}", args.len()),
None,
)));
}
let start = args[0].as_integer().ok_or_else(|| {
Box::new(DiagnosticError::runtime_error("Expected integer for start".to_string(), None))
})? as u32;
let end = args[1].as_integer().ok_or_else(|| {
Box::new(DiagnosticError::runtime_error("Expected integer for end".to_string(), None))
})? as u32;
let error_on_invalid = if args.len() >= 3 {
args[2].is_truthy()
} else {
false
};
let base_charset = if args.len() == 4 {
get_charset(&args[3])?.clone()
} else {
CharSet::new()
};
if start > end {
return Ok(Value::CharSet(Arc::new(base_charset)));
}
let chars: Result<Vec<char>> = (start..end)
.map(|code| {
char::from_u32(code).ok_or_else(|| {
if error_on_invalid {
Box::new(DiagnosticError::runtime_error(
format!("Invalid Unicode code point: {code}"),
None,
))
} else {
Box::new(DiagnosticError::runtime_error("Invalid Unicode code point".to_string(), None))
}
})
})
.collect();
let chars = chars?;
let new_charset = CharSet::from_chars(chars).union(&base_charset);
Ok(Value::CharSet(Arc::new(new_charset)))
}
fn primitive_char_set_size(args: &[Value]) -> Result<Value> {
if args.len() != 1 {
return Err(Box::new(DiagnosticError::runtime_error(
format!("char-set-size expects 1 argument, got {}", args.len()),
None,
)));
}
let charset = get_charset(&args[0])?;
Ok(Value::integer(charset.size() as i64))
}
fn primitive_char_set_count(args: &[Value]) -> Result<Value> {
if args.len() != 2 {
return Err(Box::new(DiagnosticError::runtime_error(
format!("char-set-count expects 2 arguments, got {}", args.len()),
None,
)));
}
Err(Box::new(DiagnosticError::runtime_error(
"char-set-count with procedure predicates not yet implemented".to_string(),
None,
)))
}
fn primitive_char_set_union(args: &[Value]) -> Result<Value> {
if args.is_empty() {
return Ok(Value::CharSet(Arc::new(CharSet::new())));
}
let mut result = get_charset(&args[0])?.clone();
for arg in &args[1..] {
let charset = get_charset(arg)?;
result = result.union(charset);
}
Ok(Value::CharSet(Arc::new(result)))
}
fn primitive_char_set_intersection(args: &[Value]) -> Result<Value> {
if args.is_empty() {
return Ok(Value::CharSet(Arc::new(StandardCharSets::full())));
}
let mut result = get_charset(&args[0])?.clone();
for arg in &args[1..] {
let charset = get_charset(arg)?;
result = result.intersection(charset);
}
Ok(Value::CharSet(Arc::new(result)))
}
fn primitive_char_set_difference(args: &[Value]) -> Result<Value> {
if args.is_empty() {
return Err(Box::new(DiagnosticError::runtime_error(
"char-set-difference requires at least 1 argument".to_string(),
None,
)));
}
let mut result = get_charset(&args[0])?.clone();
for arg in &args[1..] {
let charset = get_charset(arg)?;
result = result.difference(charset);
}
Ok(Value::CharSet(Arc::new(result)))
}
fn primitive_char_set_complement(args: &[Value]) -> Result<Value> {
if args.len() != 1 {
return Err(Box::new(DiagnosticError::runtime_error(
format!("char-set-complement expects 1 argument, got {}", args.len()),
None,
)));
}
let charset = get_charset(&args[0])?;
let complement = charset.complement();
Ok(Value::CharSet(Arc::new(complement)))
}
fn primitive_char_set_xor(args: &[Value]) -> Result<Value> {
if args.is_empty() {
return Ok(Value::CharSet(Arc::new(CharSet::new())));
}
let mut result = get_charset(&args[0])?.clone();
for arg in &args[1..] {
let charset = get_charset(arg)?;
result = result.symmetric_difference(charset);
}
Ok(Value::CharSet(Arc::new(result)))
}
fn primitive_char_set_to_list(args: &[Value]) -> Result<Value> {
if args.len() != 1 {
return Err(Box::new(DiagnosticError::runtime_error(
format!("char-set->list expects 1 argument, got {}", args.len()),
None,
)));
}
let charset = get_charset(&args[0])?;
let chars: Vec<Value> = charset.iter()
.map(|&c| Value::Literal(Literal::Character(c)))
.collect();
Ok(Value::list(chars))
}
fn primitive_char_set_to_string(args: &[Value]) -> Result<Value> {
if args.len() != 1 {
return Err(Box::new(DiagnosticError::runtime_error(
format!("char-set->string expects 1 argument, got {}", args.len()),
None,
)));
}
let charset = get_charset(&args[0])?;
let s: String = charset.iter().collect();
Ok(Value::string(s))
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_charset_creation() {
let empty = CharSet::new();
assert!(empty.is_empty());
assert_eq!(empty.size(), 0);
let abc = CharSet::from_string("abc");
assert!(!abc.is_empty());
assert_eq!(abc.size(), 3);
assert!(abc.contains('a'));
assert!(abc.contains('b'));
assert!(abc.contains('c'));
assert!(!abc.contains('d'));
}
#[test]
fn test_charset_operations() {
let ab = CharSet::from_string("ab");
let bc = CharSet::from_string("bc");
let union = ab.union(&bc);
assert_eq!(union.size(), 3);
assert!(union.contains('a'));
assert!(union.contains('b'));
assert!(union.contains('c'));
let intersection = ab.intersection(&bc);
assert_eq!(intersection.size(), 1);
assert!(intersection.contains('b'));
let difference = ab.difference(&bc);
assert_eq!(difference.size(), 1);
assert!(difference.contains('a'));
}
#[test]
fn test_standard_charsets() {
let digits = StandardCharSets::digit();
assert_eq!(digits.size(), 10);
assert!(digits.contains('0'));
assert!(digits.contains('9'));
assert!(!digits.contains('a'));
let lower = StandardCharSets::lower_case();
assert!(lower.contains('a'));
assert!(lower.contains('z'));
assert!(!lower.contains('A'));
let upper = StandardCharSets::upper_case();
assert!(upper.contains('A'));
assert!(upper.contains('Z'));
assert!(!upper.contains('a'));
}
#[test]
fn test_charset_display() {
let empty = CharSet::new();
assert_eq!(format!("{}", empty), "#<char-set empty>");
let abc = CharSet::from_string("abc");
let display = format!("{}", abc);
assert!(display.contains("size=3"));
assert!(display.contains("{a b c}"));
}
}