use std::{
borrow::{Borrow, BorrowMut, Cow},
cmp::Ordering,
fmt::{Debug, Display, Formatter},
hash::Hash,
str::from_utf8,
};
use encoding_rs::{Encoding, UTF_8, mem::decode_latin1};
use itertools::Itertools;
use ordered_float::OrderedFloat;
use serde::{
Serialize,
ser::{SerializeSeq, SerializeTupleVariant},
};
use crate::{
dictionary::Dictionary,
format::DisplayPlain,
output::{
Item, Text,
pivot::{Axis3, Dimension, Group, PivotTable, value::Value},
},
variable::{VarType, VarWidth},
};
/// A string of raw bytes in an unspecified encoding.
///
/// Implementors only need to supply [`RawString::raw_string_bytes`]; every
/// other method has a default built on top of it.
pub trait RawString: Debug + PartialEq + Eq + PartialOrd + Ord + Hash {
    /// Returns the bytes that make up the string.
    fn raw_string_bytes(&self) -> &[u8];

    /// Compares `self` and `other` as if the shorter of the two were padded
    /// on the right with ASCII spaces to the length of the longer one.
    fn eq_ignore_trailing_spaces<R>(&self, other: &R) -> bool
    where
        R: RawString,
    {
        let lhs = self.raw_string_bytes().iter().copied();
        let rhs = other.raw_string_bytes().iter().copied();
        lhs.zip_longest(rhs).all(|pair| {
            // A missing byte on either side stands in for a space.
            let (a, b) = pair.or(b' ', b' ');
            a == b
        })
    }

    /// Returns true if the string can be resized to `new_len` bytes without
    /// dropping anything but spaces: growing is always allowed, shrinking
    /// only when every byte from `new_len` onward is a space.
    fn is_resizable(&self, new_len: usize) -> bool {
        if new_len >= self.len() {
            return true;
        }
        let dropped = &self.raw_string_bytes()[new_len..];
        dropped.iter().all(|&byte| byte == b' ')
    }

    /// Returns true if the string contains no bytes at all.
    fn is_empty(&self) -> bool {
        matches!(self.raw_string_bytes(), [])
    }

    /// Returns true if nothing is left after removing trailing spaces, i.e.
    /// the string is empty or consists only of spaces.
    fn is_spaces(&self) -> bool {
        let trimmed = self.without_trailing_spaces();
        trimmed.is_empty()
    }

    /// Returns the length of the string in bytes.
    fn len(&self) -> usize {
        let bytes = self.raw_string_bytes();
        bytes.len()
    }

    /// Borrows the string's bytes as a [`ByteStr`].
    fn as_ref(&self) -> &ByteStr {
        let bytes = self.raw_string_bytes();
        ByteStr::new(bytes)
    }

    /// Borrows the string with all trailing ASCII spaces removed.
    fn without_trailing_spaces(&self) -> &ByteStr {
        let bytes = self.raw_string_bytes();
        // One past the last non-space byte, or 0 if there is none.
        let end = bytes
            .iter()
            .rposition(|&byte| byte != b' ')
            .map_or(0, |last| last + 1);
        ByteStr::new(&bytes[..end])
    }

    /// Borrows the string and pairs the borrow with `encoding`.
    fn as_encoded(&self, encoding: &'static Encoding) -> WithEncoding<&ByteStr>
    where
        Self: Sized,
    {
        let borrowed = RawString::as_ref(self);
        WithEncoding::new(borrowed, encoding)
    }

    /// Consumes the string and pairs it with `encoding`.
    fn with_encoding(self, encoding: &'static Encoding) -> WithEncoding<Self>
    where
        Self: Sized,
    {
        WithEncoding::new(self, encoding)
    }
}
/// A [`RawString`] that can be modified in place.
pub trait MutRawString: RawString {
/// Changes the length of the string to `new_len`, reporting failure as a
/// [`ResizeError`].
///
/// The `ByteString` implementation in this file pads with spaces when
/// growing and refuses to truncate anything other than spaces.
fn resize(&mut self, new_len: usize) -> Result<(), ResizeError>;
/// Trims the end of the string in place.
///
/// The `ByteString` implementation in this file removes trailing ASCII
/// spaces.
fn trim_end(&mut self);
}
/// A string slice is viewed as its UTF-8 bytes.
impl RawString for &'_ str {
    fn raw_string_bytes(&self) -> &[u8] {
        str::as_bytes(self)
    }
}

/// An owned string is viewed as its UTF-8 bytes.
impl RawString for String {
    fn raw_string_bytes(&self) -> &[u8] {
        self.as_str().as_bytes()
    }
}

/// A borrowed `String` is viewed as its UTF-8 bytes.
impl RawString for &'_ String {
    fn raw_string_bytes(&self) -> &[u8] {
        self.as_str().as_bytes()
    }
}
/// A borrowed, dynamically sized string of raw bytes.
///
/// This is a `#[repr(transparent)]` wrapper around `[u8]`, so it is only
/// ever handled behind a reference such as `&ByteStr`.
#[derive(PartialEq, Eq, PartialOrd, Ord, Hash)]
#[repr(transparent)]
pub struct ByteStr(pub [u8]);
/// A borrowed byte string equals an owned one when their bytes are equal.
impl PartialEq<ByteString> for &ByteStr {
    fn eq(&self, other: &ByteString) -> bool {
        let lhs: &[u8] = self.raw_string_bytes();
        let rhs: &[u8] = other.raw_string_bytes();
        lhs == rhs
    }
}
impl ByteStr {
/// Reinterprets a byte slice as a `ByteStr` without copying.
pub fn new(s: &[u8]) -> &ByteStr {
// SAFETY: `ByteStr` is declared `#[repr(transparent)]` over `[u8]`, so a
// pointer to `[u8]` can be cast to a pointer to `ByteStr`; the returned
// reference borrows from `s`.
unsafe { &*(s as *const [u8] as *const ByteStr) }
}
}
impl RawString for &ByteStr {
fn raw_string_bytes(&self) -> &[u8] {
&self.0
}
}
impl Serialize for &ByteStr {
fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
where
S: serde::Serializer,
{
if let Ok(s) = str::from_utf8(&self.0) {
let (variant_index, variant) = if self.0.iter().all(|b| b.is_ascii()) {
(0, "Ascii")
} else {
(1, "Utf8")
};
let mut tuple =
serializer.serialize_tuple_variant("RawString", variant_index, variant, 1)?;
tuple.serialize_field(s)?;
tuple.end()
} else {
let mut tuple = serializer.serialize_tuple_variant("RawString", 2, "Windows1252", 1)?;
tuple.serialize_field(&decode_latin1(&self.0))?;
tuple.end()
}
}
}
/// Formats the bytes as a quoted string: as UTF-8 when they are valid UTF-8,
/// otherwise decoded with `decode_latin1`.
impl Debug for ByteStr {
    fn fmt(&self, f: &mut Formatter) -> std::fmt::Result {
        let text: Cow<'_, str> = match from_utf8(&self.0) {
            Ok(utf8) => Cow::Borrowed(utf8),
            Err(_) => decode_latin1(&self.0),
        };
        write!(f, "{text:?}")
    }
}
#[derive(Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub struct ByteCow<'a>(pub Cow<'a, [u8]>);
impl ByteCow<'_> {
pub fn into_owned(self) -> ByteString {
ByteString(self.0.into_owned())
}
}
impl Serialize for ByteCow<'_> {
fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
where
S: serde::Serializer,
{
ByteStr::new(&self.0).serialize(serializer)
}
}
impl RawString for ByteCow<'_> {
fn raw_string_bytes(&self) -> &[u8] {
&self.0
}
}
impl Debug for ByteCow<'_> {
fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
ByteStr::new(&self.0).fmt(f)
}
}
#[derive(Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub struct ByteStrArray<const N: usize>(pub [u8; N]);
impl<const N: usize> Serialize for ByteStrArray<N> {
fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
where
S: serde::Serializer,
{
ByteStr::new(&self.0).serialize(serializer)
}
}
impl<const N: usize> RawString for ByteStrArray<N> {
fn raw_string_bytes(&self) -> &[u8] {
&self.0
}
}
impl<const N: usize> Debug for ByteStrArray<N> {
fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
ByteStr::new(&self.0).fmt(f)
}
}
/// An owned, growable string of raw bytes.
#[derive(Clone, Default, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub struct ByteString(pub Vec<u8>);

impl ByteString {
    /// Creates a string made of `n` ASCII spaces.
    pub fn spaces(n: usize) -> Self {
        Self(vec![b' '; n])
    }

    /// Returns a value whose `Display` output is the bytes in upper-case
    /// hexadecimal, two digits per byte.
    pub fn display_hex(&self) -> HexBytes<'_> {
        HexBytes(&self.0)
    }

    /// Returns the bytes as an upper-case hexadecimal string.
    pub fn to_hex(&self) -> String {
        let hex = self.display_hex();
        hex.to_string()
    }
}

/// Displays a byte slice as upper-case hexadecimal, two digits per byte.
pub struct HexBytes<'a>(&'a [u8]);

impl<'a> Display for HexBytes<'a> {
    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
        self.0.iter().try_for_each(|byte| write!(f, "{byte:02X}"))
    }
}
/// Lets a `ByteString` be borrowed as a `ByteStr` over the same bytes.
impl Borrow<ByteStr> for ByteString {
    fn borrow(&self) -> &ByteStr {
        let bytes: &[u8] = self.0.as_slice();
        ByteStr::new(bytes)
    }
}
impl From<String> for ByteString {
fn from(value: String) -> Self {
value.into_bytes().into()
}
}
impl From<&'_ str> for ByteString {
fn from(value: &str) -> Self {
value.as_bytes().into()
}
}
impl From<Cow<'_, str>> for ByteString {
fn from(value: Cow<'_, str>) -> Self {
value.into_owned().into()
}
}
impl From<Cow<'_, [u8]>> for ByteString {
fn from(value: Cow<'_, [u8]>) -> Self {
value.into_owned().into()
}
}
impl From<Vec<u8>> for ByteString {
fn from(value: Vec<u8>) -> Self {
Self(value)
}
}
impl From<&[u8]> for ByteString {
fn from(value: &[u8]) -> Self {
Self(value.into())
}
}
impl From<&ByteString> for ByteString {
fn from(value: &ByteString) -> Self {
value.clone()
}
}
impl<const N: usize> From<&ByteStrArray<N>> for ByteString {
fn from(value: &ByteStrArray<N>) -> Self {
Self::from(value.raw_string_bytes())
}
}
impl<const N: usize> From<[u8; N]> for ByteString {
fn from(value: [u8; N]) -> Self {
value.as_slice().into()
}
}
impl RawString for ByteString {
fn raw_string_bytes(&self) -> &[u8] {
self.0.as_slice()
}
}
impl Serialize for ByteString {
fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
where
S: serde::Serializer,
{
if let Ok(s) = str::from_utf8(&self.0) {
let (variant_index, variant) = if self.0.iter().all(|b| b.is_ascii()) {
(0, "Ascii")
} else {
(1, "Utf8")
};
let mut tuple =
serializer.serialize_tuple_variant("RawString", variant_index, variant, 1)?;
tuple.serialize_field(s)?;
tuple.end()
} else {
let mut tuple = serializer.serialize_tuple_variant("RawString", 2, "Windows1252", 1)?;
tuple.serialize_field(&decode_latin1(&self.0))?;
tuple.end()
}
}
}
impl Debug for ByteString {
fn fmt(&self, f: &mut Formatter) -> std::fmt::Result {
let s =
from_utf8(self.0.borrow()).map_or_else(|_| decode_latin1(self.0.borrow()), Cow::from);
write!(f, "{s:?}")
}
}
impl MutRawString for ByteString {
fn resize(&mut self, new_len: usize) -> Result<(), ResizeError> {
match new_len.cmp(&self.0.len()) {
Ordering::Less => {
if !self.0[new_len..].iter().all(|b| *b == b' ') {
return Err(ResizeError::TooWide);
}
self.0.truncate(new_len);
}
Ordering::Equal => (),
Ordering::Greater => self.0.resize(new_len, b' '),
}
Ok(())
}
fn trim_end(&mut self) {
while self.0.pop_if(|c| *c == b' ').is_some() {}
}
}
mod encoded;
pub use encoded::{Encoded, EncodedString, WithEncoding};
/// A [`Datum`] whose string variant owns its bytes (`ByteString`) and is
/// wrapped in `WithEncoding`.
pub type OwnedDatum = Datum<WithEncoding<ByteString>>;
/// A single data value: either a number or a string of type `T`.
#[derive(Clone)]
pub enum Datum<T> {
/// A numeric value. `None` is the system-missing value, which is
/// formatted as `SYSMIS`.
Number(
Option<f64>,
),
/// A string value.
String(
T,
),
}
impl Datum<WithEncoding<ByteString>> {
    /// Creates a string datum from `s`, tagged as UTF-8.
    pub fn new_utf8(s: impl Into<String>) -> Self {
        let text: String = s.into();
        let bytes = ByteString::from(text);
        Self::String(bytes.with_encoding(UTF_8))
    }

    /// Calls `codepage_to_unicode` on the string, if this is a string datum.
    /// Numeric data is left alone.
    pub fn codepage_to_unicode(&mut self) {
        if let Self::String(string) = self {
            string.codepage_to_unicode();
        }
    }

    /// Strips the encoding wrapper, keeping only the raw bytes of a string.
    pub fn without_encoding(self) -> Datum<ByteString> {
        match self {
            Self::Number(number) => Datum::Number(number),
            Self::String(string) => Datum::String(string.into_inner()),
        }
    }
}
impl<'a> Datum<WithEncoding<ByteCow<'a>>> {
pub fn into_owned(self) -> Datum<WithEncoding<ByteString>> {
self.map_string(|s| s.into_owned())
}
}
impl<T> Datum<T>
where
T: EncodedString,
{
pub fn as_borrowed(&self) -> Datum<WithEncoding<&ByteStr>> {
self.as_ref().map_string(|s| s.as_encoded_byte_str())
}
pub fn cloned(&self) -> Datum<WithEncoding<ByteString>> {
self.as_ref().map_string(|s| s.cloned())
}
}
/// Numbers use `f64`'s `Debug` output, the system-missing value prints as
/// `SYSMIS`, and strings use the string type's own `Debug` output.
impl<B> Debug for Datum<B>
where
    B: Debug,
{
    fn fmt(&self, f: &mut Formatter) -> std::fmt::Result {
        match self {
            Self::String(string) => write!(f, "{string:?}"),
            Self::Number(None) => f.write_str("SYSMIS"),
            Self::Number(Some(value)) => write!(f, "{value:?}"),
        }
    }
}
/// The system-missing value prints as `SYSMIS`, other numbers go through
/// `display_plain`, and strings use the string type's own `Display`.
impl<T> Display for Datum<T>
where
    T: Display,
{
    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
        match self {
            Self::String(text) => Display::fmt(text, f),
            Self::Number(Some(value)) => value.display_plain().fmt(f),
            Self::Number(None) => f.write_str("SYSMIS"),
        }
    }
}
impl<B> Serialize for Datum<B>
where
B: Serialize,
{
fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
where
S: serde::Serializer,
{
match self {
Self::Number(number) => number.serialize(serializer),
Self::String(raw_string) => raw_string.serialize(serializer),
}
}
}
impl<T, R> PartialEq<Datum<R>> for Datum<T>
where
T: PartialEq<R>,
{
fn eq(&self, other: &Datum<R>) -> bool {
match (self, other) {
(Self::Number(Some(n1)), Datum::Number(Some(n2))) => {
OrderedFloat(*n1) == OrderedFloat(*n2)
}
(Self::Number(None), Datum::Number(None)) => true,
(Self::String(s1), Datum::String(s2)) => s1 == s2,
_ => false,
}
}
}
impl<T> Eq for Datum<T> where T: Eq {}
/// Orders every number before every string. Numbers are compared through
/// `OrderedFloat` with the system-missing value (`None`) first; strings use
/// the string type's own ordering.
impl<T, R> PartialOrd<Datum<R>> for Datum<T>
where
    T: PartialOrd<R>,
{
    fn partial_cmp(&self, other: &Datum<R>) -> Option<Ordering> {
        match (self, other) {
            (Self::String(lhs), Datum::String(rhs)) => lhs.partial_cmp(rhs),
            (Self::String(_), Datum::Number(_)) => Some(Ordering::Greater),
            (Self::Number(_), Datum::String(_)) => Some(Ordering::Less),
            (Self::Number(lhs), Datum::Number(rhs)) => {
                let lhs = lhs.map(OrderedFloat);
                let rhs = rhs.map(OrderedFloat);
                lhs.partial_cmp(&rhs)
            }
        }
    }
}

/// Total order built on the `PartialOrd` implementation.
impl<T> Ord for Datum<T>
where
    T: Ord,
{
    /// # Panics
    ///
    /// Panics if `partial_cmp` returns `None`.
    fn cmp(&self, other: &Self) -> Ordering {
        PartialOrd::partial_cmp(self, other).unwrap()
    }
}
/// Hashes the payload only: numbers through `OrderedFloat`, matching how
/// equality compares them, and strings through the string type's own hash.
impl<T> Hash for Datum<T>
where
    T: Hash,
{
    fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
        match self {
            Self::String(string) => Hash::hash(string, state),
            Self::Number(number) => {
                let key = number.map(OrderedFloat);
                Hash::hash(&key, state)
            }
        }
    }
}
impl<B> Datum<B> {
    /// Returns a datum that borrows this one's string. Numbers are copied.
    pub fn as_ref(&self) -> Datum<&B> {
        match self {
            Datum::Number(number) => Datum::Number(*number),
            Datum::String(string) => Datum::String(string),
        }
    }

    /// Converts the string payload with `f`, leaving numbers untouched.
    ///
    /// `f` is called at most once, so any `FnOnce` closure is accepted,
    /// including one that consumes values it captured.
    pub fn map_string<F, R>(self, f: F) -> Datum<R>
    where
        F: FnOnce(B) -> R,
    {
        match self {
            Datum::Number(number) => Datum::Number(number),
            Datum::String(string) => Datum::String(f(string)),
        }
    }

    /// Returns the system-missing value, `Number(None)`.
    pub const fn sysmis() -> Self {
        Self::Number(None)
    }

    /// Returns true if this is the system-missing value.
    pub const fn is_sysmis(&self) -> bool {
        matches!(self, Self::Number(None))
    }

    /// Returns true if this is a number (including system-missing).
    pub const fn is_number(&self) -> bool {
        matches!(self, Self::Number(_))
    }

    /// Returns true if this is a string.
    pub const fn is_string(&self) -> bool {
        matches!(self, Self::String(_))
    }

    /// Returns true if this is a number and `f` accepts it.
    pub fn is_number_and<F>(&self, f: F) -> bool
    where
        F: FnOnce(Option<f64>) -> bool,
    {
        match self {
            Self::Number(number) => f(*number),
            Self::String(_) => false,
        }
    }

    /// Returns true if this is a string, or if it is a number that `f`
    /// accepts.
    pub fn is_string_or<F>(&self, f: F) -> bool
    where
        F: FnOnce(Option<f64>) -> bool,
    {
        match self {
            Self::Number(number) => f(*number),
            Self::String(_) => true,
        }
    }

    /// Returns true if this is a string and `f` accepts it.
    pub fn is_string_and<F>(&self, f: F) -> bool
    where
        F: FnOnce(&B) -> bool,
    {
        match self {
            Self::String(string) => f(string),
            Self::Number(_) => false,
        }
    }

    /// Returns true if this is a number, or if it is a string that `f`
    /// accepts.
    pub fn is_number_or<F>(&self, f: F) -> bool
    where
        F: FnOnce(&B) -> bool,
    {
        match self {
            Self::String(string) => f(string),
            Self::Number(_) => true,
        }
    }

    /// Returns the numeric payload, or `None` if this is a string. The
    /// inner `Option` is `None` for the system-missing value.
    pub fn as_number(&self) -> Option<Option<f64>> {
        match self {
            Self::Number(number) => Some(*number),
            Self::String(_) => None,
        }
    }

    /// Borrows the string payload, or returns `None` if this is a number.
    pub fn as_string(&self) -> Option<&B> {
        match self {
            Self::Number(_) => None,
            Self::String(s) => Some(s),
        }
    }

    /// Takes the string payload, or returns `None` if this is a number.
    pub fn into_string(self) -> Option<B> {
        match self {
            Self::Number(_) => None,
            Self::String(s) => Some(s),
        }
    }

    /// Returns the variable type that matches this datum's variant.
    pub fn var_type(&self) -> VarType {
        match self {
            Self::Number(_) => VarType::Numeric,
            Self::String(_) => VarType::String,
        }
    }
}
impl<T> Datum<T>
where
T: RawString,
{
pub fn is_resizable(&self, width: VarWidth) -> bool {
match (self, width) {
(Self::Number(_), VarWidth::Numeric) => true,
(Self::String(s), VarWidth::String(new_width)) => s.is_resizable(new_width as usize),
_ => false,
}
}
pub fn width(&self) -> VarWidth {
match self {
Self::Number(_) => VarWidth::Numeric,
Self::String(s) => VarWidth::String(s.len().try_into().unwrap()),
}
}
pub fn eq_ignore_trailing_spaces<R>(&self, other: &Datum<R>) -> bool
where
R: RawString,
{
match (self, other) {
(Self::String(a), Datum::String(b)) => a.eq_ignore_trailing_spaces(b),
(Self::Number(a), Datum::Number(b)) => a == b,
_ => false,
}
}
pub fn is_spaces(&self) -> bool {
self.is_string_and(|s| s.is_spaces())
}
pub fn as_raw(&self) -> Datum<&ByteStr> {
self.as_ref().map_string(|s| s.as_ref())
}
pub fn as_encoded(&self, encoding: &'static Encoding) -> Datum<WithEncoding<&ByteStr>> {
self.as_ref().map_string(|s| s.as_encoded(encoding))
}
pub fn with_encoding(self, encoding: &'static Encoding) -> Datum<WithEncoding<T>> {
self.map_string(|s| s.with_encoding(encoding))
}
}
impl<B> Datum<B>
where
    B: EncodedString,
{
    /// Returns a wrapper whose `Display` output puts double quotes around
    /// string values.
    pub fn quoted<'a>(&'a self) -> QuotedDatum<'a, B> {
        QuotedDatum(self)
    }
}

/// Displays a borrowed [`Datum`] with string values wrapped in double
/// quotes. Numbers print the same way as `Datum`'s own `Display`.
pub struct QuotedDatum<'a, B>(&'a Datum<B>);

impl<'a, B> Display for QuotedDatum<'a, B>
where
    B: Display,
{
    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
        match self.0 {
            Datum::String(text) => write!(f, "\"{text}\""),
            Datum::Number(Some(number)) => number.display_plain().fmt(f),
            Datum::Number(None) => f.write_str("SYSMIS"),
        }
    }
}
/// The reasons that resizing a value can fail.
#[derive(Copy, Clone, Debug, PartialEq, Eq)]
pub enum ResizeError {
/// The datum and the requested width are of different types (one numeric,
/// the other string).
MixedTypes,
/// The string cannot be shortened because a byte that would be removed is
/// not a space.
TooWide,
}
impl<T> Datum<T> {
    /// Mutably borrows the string payload, or returns `None` for a number.
    pub fn as_string_mut(&mut self) -> Option<&mut T> {
        if let Self::String(string) = self {
            Some(string.borrow_mut())
        } else {
            None
        }
    }

    /// Trims the end of a string datum in place using the string's own
    /// `trim_end`. Does nothing for numbers.
    pub fn trim_end(&mut self)
    where
        T: MutRawString,
    {
        match self.as_string_mut() {
            Some(string) => string.trim_end(),
            None => (),
        }
    }

    /// Resizes the datum to `width`.
    ///
    /// A number with a numeric width is left as is. A string with a string
    /// width is resized by the string's own `resize`, whose error is passed
    /// through. Any other combination fails with
    /// [`ResizeError::MixedTypes`].
    pub fn resize(&mut self, width: VarWidth) -> Result<(), ResizeError>
    where
        T: MutRawString,
    {
        match (self, width) {
            (Self::String(string), VarWidth::String(new_width)) => {
                string.resize(new_width as usize)
            }
            (Self::Number(_), VarWidth::Numeric) => Ok(()),
            _ => Err(ResizeError::MixedTypes),
        }
    }
}
/// Wraps a number as a non-missing numeric datum.
impl<B> From<f64> for Datum<B> {
    fn from(number: f64) -> Self {
        Self::Number(Some(number))
    }
}

/// Wraps an optional number; `None` becomes the system-missing value.
impl<B> From<Option<f64>> for Datum<B> {
    fn from(value: Option<f64>) -> Self {
        Datum::Number(value)
    }
}

/// Borrows the string's UTF-8 bytes as a string datum.
impl<'a> From<&'a str> for Datum<&'a ByteStr> {
    fn from(value: &'a str) -> Self {
        let bytes: &'a [u8] = value.as_bytes();
        Self::String(ByteStr::new(bytes))
    }
}

/// Borrows the bytes as a string datum.
impl<'a> From<&'a [u8]> for Datum<&'a ByteStr> {
    fn from(value: &'a [u8]) -> Self {
        Datum::String(ByteStr::new(value))
    }
}
#[derive(Clone, Debug, Serialize)]
pub struct RawCase(
pub Vec<Datum<ByteString>>,
);
impl RawCase {
pub fn as_encoding(&self, encoding: &'static Encoding) -> Case<&'_ [Datum<ByteString>]> {
Case {
encoding,
data: &self.0,
}
}
pub fn with_encoding(self, encoding: &'static Encoding) -> Case<Vec<Datum<ByteString>>> {
Case {
encoding,
data: self.0,
}
}
}
pub struct Case<B>
where
B: Borrow<[Datum<ByteString>]>,
{
data: B,
encoding: &'static Encoding,
}
impl<B> Case<B>
where
B: Borrow<[Datum<ByteString>]>,
{
pub fn new(data: B, encoding: &'static Encoding) -> Self {
Self { data, encoding }
}
pub fn encoding(&self) -> &'static Encoding {
self.encoding
}
pub fn is_empty(&self) -> bool {
self.len() == 0
}
pub fn len(&self) -> usize {
self.data.borrow().len()
}
pub fn iter(&self) -> CaseIter<'_> {
self.into_iter()
}
}
impl Case<Vec<Datum<ByteString>>> {
pub fn into_unicode(self) -> Self {
if self.encoding == UTF_8 {
self
} else {
Self {
encoding: UTF_8,
data: self
.data
.into_iter()
.map(|datum| {
datum.map_string(|s| {
let mut s = s.with_encoding(self.encoding);
s.codepage_to_unicode();
s.into_inner()
})
})
.collect(),
}
}
}
}
pub fn cases_to_output<C, E>(dictionary: &Dictionary, cases: C) -> Vec<Item>
where
C: IntoIterator<Item = Result<Case<Vec<Datum<ByteString>>>, E>>,
E: Display,
{
let mut output = Vec::new();
let cases = cases.into_iter();
let variables =
Group::new("Variable").with_multiple(dictionary.variables.iter().map(|var| &**var));
let mut case_numbers = Group::new("Case").with_label_shown();
let mut data = Vec::new();
for case in cases {
match case {
Ok(case) => {
case_numbers.push(Value::new_integer(Some((case_numbers.len() + 1) as f64)));
data.push(
case.into_iter()
.map(|datum| Value::new_datum(&datum))
.collect::<Vec<_>>(),
);
}
Err(error) => {
output.push(Item::from(Text::new_log(error.to_string())));
}
}
}
if !data.is_empty() {
let mut pt = PivotTable::new([
(Axis3::X, Dimension::new(variables)),
(Axis3::Y, Dimension::new(case_numbers)),
]);
for (row_number, row) in data.into_iter().enumerate() {
for (column_number, datum) in row.into_iter().enumerate() {
pt.insert(&[column_number, row_number], datum);
}
}
output.push(pt.into());
}
output
}
/// Serializes the case as a sequence of its data values, each tagged with
/// the case's encoding.
impl<B> Serialize for Case<B>
where
    B: Borrow<[Datum<ByteString>]>,
{
    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
    where
        S: serde::Serializer,
    {
        let count = self.len();
        let mut seq = serializer.serialize_seq(Some(count))?;
        self.iter()
            .try_for_each(|datum| seq.serialize_element(&datum))?;
        seq.end()
    }
}
/// Iterator over the data in a borrowed [`Case`], yielding each datum
/// borrowed and tagged with the case's encoding.
pub struct CaseIter<'a> {
    encoding: &'static Encoding,
    iter: std::slice::Iter<'a, Datum<ByteString>>,
}

impl<'a> Iterator for CaseIter<'a> {
    type Item = Datum<WithEncoding<&'a ByteStr>>;

    fn next(&mut self) -> Option<Self::Item> {
        let encoding = self.encoding;
        self.iter.next().map(|datum| datum.as_encoded(encoding))
    }

    /// Forwards the exact bounds of the underlying slice iterator so that
    /// consumers such as `collect` can allocate the right size up front.
    fn size_hint(&self) -> (usize, Option<usize>) {
        self.iter.size_hint()
    }
}
impl<'a, B> IntoIterator for &'a Case<B>
where
B: Borrow<[Datum<ByteString>]>,
{
type Item = Datum<WithEncoding<&'a ByteStr>>;
type IntoIter = CaseIter<'a>;
fn into_iter(self) -> Self::IntoIter {
CaseIter {
encoding: self.encoding,
iter: self.data.borrow().iter(),
}
}
}
impl IntoIterator for Case<Vec<Datum<ByteString>>> {
type Item = Datum<WithEncoding<ByteString>>;
type IntoIter = CaseIntoIter;
fn into_iter(self) -> Self::IntoIter {
CaseIntoIter {
encoding: self.encoding,
iter: self.data.into_iter(),
}
}
}
/// Iterator over the data in an owned [`Case`], yielding each datum by
/// value and tagged with the case's encoding.
pub struct CaseIntoIter {
    encoding: &'static Encoding,
    iter: std::vec::IntoIter<Datum<ByteString>>,
}

impl Iterator for CaseIntoIter {
    type Item = Datum<WithEncoding<ByteString>>;

    fn next(&mut self) -> Option<Self::Item> {
        let encoding = self.encoding;
        self.iter.next().map(|datum| datum.with_encoding(encoding))
    }

    /// Forwards the exact bounds of the underlying vector iterator so that
    /// consumers such as `collect` can allocate the right size up front.
    fn size_hint(&self) -> (usize, Option<usize>) {
        self.iter.size_hint()
    }
}
/// Wraps a displayable value so that it prints surrounded by double quotes.
///
/// The inner value is written as is; embedded quotes are not escaped.
pub struct Quoted<T>(T)
where
    T: Display;

impl<T> Display for Quoted<T>
where
    T: Display,
{
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let Quoted(inner) = self;
        f.write_fmt(format_args!("\"{inner}\""))
    }
}