use alloc::{string::String, vec::Vec};
use core::{error, fmt};
#[cfg(feature = "pyo3")]
use pyo3::prelude::*;
#[derive(Debug, Clone, Copy, Hash, PartialEq, Eq, PartialOrd, Ord)]
#[non_exhaustive]
#[cfg_attr(feature = "pyo3", pyclass(module = "spimdisasm", eq))]
pub enum Encoding {
Ascii,
ShiftJis,
EucJp,
EucCn,
}
static BANNED_ESCAPE_CHARACTERS: [u8; 25] = [
0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x08, 0x0B, 0x0E, 0x0F, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1C, 0x1D, 0x1E, 0x1F, ];
static ESCAPE_CHARACTERS_SPECIAL_CASES: [u8; 3] = [
0x1A, 0x1B, 0x7F, ];
impl Encoding {
pub fn from_name(name: &str) -> Option<Self> {
match name {
"ASCII" => Some(Encoding::Ascii),
"SHIFT-JIS" => Some(Encoding::ShiftJis),
"EUC-JP" => Some(Encoding::EucJp),
"EUC-CN" | "GB2312" => Some(Encoding::EucCn),
_ => None,
}
}
pub const fn default() -> Self {
Encoding::Ascii
}
const fn as_encoding_rs(&self) -> &'static encoding_rs::Encoding {
match self {
Encoding::Ascii => {
encoding_rs::WINDOWS_1252
}
Encoding::ShiftJis => encoding_rs::SHIFT_JIS,
Encoding::EucJp => encoding_rs::EUC_JP,
Encoding::EucCn => encoding_rs::GBK,
}
}
pub(crate) fn check_valid(&self, bytes: &[u8]) -> Result<usize, DecodingError> {
let mut i = 0;
while i < bytes.len() && bytes[i] != 0 {
let char = bytes[i];
if char > 0x7F {
let sequence_length = match *self {
Encoding::Ascii => {
return Err(DecodingError::AsciiOutOfRange {
index: i,
character: char,
});
}
Encoding::ShiftJis => {
match char {
0x80 | 0xA0 | 0xFD..=0xFF => {
return Err(DecodingError::InvalidFirstByteOfMultibyte {
index: i,
character: char,
});
}
_ => 2,
}
}
Encoding::EucJp => {
match char {
0x8F => 3,
0x8C | 0x8D => {
i += 1;
continue;
}
0x80..=0x8D | 0x8F..=0xA0 | 0xA9..=0xAF | 0xF5..=0xFF => {
return Err(DecodingError::InvalidFirstByteOfMultibyte {
index: i,
character: char,
});
}
_ => 2,
}
}
Encoding::EucCn => {
match char {
0x80..=0xA0 | 0xAA..=0xAF | 0xF8..=0xFF => {
return Err(DecodingError::InvalidFirstByteOfMultibyte {
index: i,
character: char,
});
}
_ => 2,
}
}
};
if i + sequence_length > bytes.len() {
return Err(DecodingError::MultibyteNotLongEnough {
index: i,
character: char,
expected_sequence_length: sequence_length,
});
}
if self
.decode_to_string(&bytes[i..i + sequence_length])
.is_none()
{
let chars = match sequence_length {
1 => [bytes[i], 0, 0],
2 => [bytes[i], bytes[i + 1], 0],
3 => [bytes[i], bytes[i + 1], bytes[i + 2]],
_ => unreachable!(
"Oh no! We have a silly bug on the string decoding code! :c"
),
};
return Err(DecodingError::InvalidMultibyte { index: i, chars });
}
i += sequence_length;
continue;
} else {
if BANNED_ESCAPE_CHARACTERS.contains(&char) {
return Err(DecodingError::InvalidEscapeCharacter {
index: i,
character: char,
});
}
i += 1;
continue;
}
}
if i >= bytes.len() || bytes[i] != 0 {
return Err(DecodingError::TerminatorNotFound);
}
let mut check_start_offset = i;
let check_end_offset = bytes.len().min((check_start_offset & !3) + 4);
while check_start_offset < check_end_offset {
let char = bytes[check_start_offset];
if char != 0 {
return Err(DecodingError::InvalidPad {
index: i,
character: char,
});
}
check_start_offset += 1
}
i += 1;
Ok(i)
}
pub(crate) fn decode_to_string(&self, bytes: &[u8]) -> Option<String> {
if let (x, false) = self.as_encoding_rs().decode_without_bom_handling(bytes) {
Some(x.into_owned())
} else {
None
}
}
pub(crate) fn decode_to_strings_vec(self, bytes: &[u8]) -> Option<Vec<(String, bool)>> {
let mut ret = Vec::new();
for (s, finished) in DecoderIterator::new(self, bytes) {
ret.push((s, finished));
if finished {
return Some(
ret.into_iter()
.map(|(x, finished)| (x.into_string(), finished))
.collect(),
);
}
}
None
}
}
impl Default for Encoding {
fn default() -> Self {
Self::default()
}
}
#[derive(Debug, Clone, Hash, PartialEq, Eq, PartialOrd, Ord)]
struct DecoderIterator<'a> {
encoding: Encoding,
bytes: &'a [u8],
index: usize,
trailing_backslash: bool,
}
impl<'a> DecoderIterator<'a> {
fn new(encoding: Encoding, bytes: &'a [u8]) -> Self {
Self {
encoding,
bytes,
index: 0,
trailing_backslash: false,
}
}
pub(crate) fn next(&mut self) -> Option<(DecodingResult, bool)> {
let len = self.bytes.len();
#[cfg(feature = "pyo3")]
if len == 0 && self.index == 0 {
self.index = 1;
return Some((DecodingResult::DecodedString(String::new()), true));
}
if self.index >= len {
return None;
}
let c = self.bytes[self.index];
if self.trailing_backslash {
self.index += 1;
self.trailing_backslash = false;
return Some((DecodingResult::RawChar(c), self.index >= len));
}
if c > 0x7F {
if self.index + 1 < len {
let next_char = self.bytes[self.index + 1];
if next_char == 0x5C {
self.index += 1;
self.trailing_backslash = true;
return Some((DecodingResult::RawChar(c), self.index >= len));
}
}
}
if ESCAPE_CHARACTERS_SPECIAL_CASES.contains(&c)
|| (self.encoding == Encoding::EucJp && (c == 0x8C || c == 0x8D))
{
self.index += 1;
return Some((DecodingResult::RawChar(c), self.index >= len));
}
for i in self.index..len {
let c = self.bytes[i];
if c > 0x7F {
if i + 1 < len {
let next_char = self.bytes[i + 1];
if next_char == 0x5C {
let start = self.index;
self.index = i;
return self
.encoding
.decode_to_string(&self.bytes[start..i])
.map(|x| (DecodingResult::DecodedString(x), self.index >= len));
}
}
}
if ESCAPE_CHARACTERS_SPECIAL_CASES.contains(&c)
|| (self.encoding == Encoding::EucJp && (c == 0x8C || c == 0x8D))
{
let start = self.index;
self.index = i;
return self
.encoding
.decode_to_string(&self.bytes[start..i])
.map(|x| (DecodingResult::DecodedString(x), self.index >= len));
}
}
let start = self.index;
self.index = len;
self.encoding
.decode_to_string(&self.bytes[start..len])
.map(|x| (DecodingResult::DecodedString(x), true))
}
}
impl Iterator for DecoderIterator<'_> {
type Item = (DecodingResult, bool);
fn next(&mut self) -> Option<Self::Item> {
self.next()
}
}
#[derive(Debug, Clone, Hash, PartialEq, Eq, PartialOrd, Ord)]
enum DecodingResult {
DecodedString(String),
RawChar(u8),
}
impl DecodingResult {
fn into_string(self) -> String {
match self {
DecodingResult::DecodedString(x) => escape_string(&x),
DecodingResult::RawChar(c) => format!("\\x{c:02X}"),
}
}
}
#[derive(Debug, Clone, Copy, Hash, PartialEq, Eq, PartialOrd, Ord)]
#[non_exhaustive]
pub(crate) enum DecodingError {
InvalidEscapeCharacter {
index: usize,
character: u8,
},
AsciiOutOfRange {
index: usize,
character: u8,
},
MultibyteNotLongEnough {
index: usize,
character: u8,
expected_sequence_length: usize,
},
InvalidFirstByteOfMultibyte {
index: usize,
character: u8,
},
InvalidMultibyte {
index: usize,
chars: [u8; 3],
},
TerminatorNotFound,
InvalidPad {
index: usize,
character: u8,
},
}
impl fmt::Display for DecodingError {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
fmt::Debug::fmt(&self, f)
}
}
impl error::Error for DecodingError {}
pub(crate) fn escape_string(val: &str) -> String {
let mut escaped_buf = Vec::with_capacity(val.len());
let bytes = val.as_bytes();
for b in bytes {
match b {
0x07 => {
escaped_buf.push(b'\\');
escaped_buf.push(b'a');
}
b'\t' => {
escaped_buf.push(b'\\');
escaped_buf.push(b't');
}
b'\n' => {
escaped_buf.push(b'\\');
escaped_buf.push(b'n');
}
0x0C => {
escaped_buf.push(b'\\');
escaped_buf.push(b'f');
}
b'\r' => {
escaped_buf.push(b'\\');
escaped_buf.push(b'r');
}
b'\"' => {
escaped_buf.push(b'\\');
escaped_buf.push(b'"');
}
b'\\' => {
escaped_buf.push(b'\\');
escaped_buf.push(b'\\');
}
x => escaped_buf.push(*x),
}
}
String::from_utf8(escaped_buf).unwrap()
}
#[cfg(test)]
mod tests {
use super::*;
use pretty_assertions::assert_eq;
#[test]
fn check_valid_string_ending_in_word_of_zeroes() {
static BYTES: [u8; 56] = [
0x54, 0x68, 0x69, 0x73, 0x20, 0x67, 0x61, 0x6D, 0x65, 0x20, 0x69, 0x73, 0x20, 0x6E,
0x6F, 0x74, 0x20, 0x64, 0x65, 0x73, 0x69, 0x67, 0x6E, 0x65, 0x64, 0x7E, 0x6E, 0x66,
0x6F, 0x72, 0x20, 0x75, 0x73, 0x65, 0x20, 0x6F, 0x6E, 0x20, 0x74, 0x68, 0x69, 0x73,
0x20, 0x73, 0x79, 0x73, 0x74, 0x65, 0x6D, 0x2E, 0x7E, 0x7A, 0x00, 0x00, 0x00, 0x00,
];
let encoding = Encoding::ShiftJis;
let maybe_size = encoding.check_valid(&BYTES);
assert_eq!(maybe_size, Ok(53));
}
#[test]
fn check_valid_string_ending_tight() {
static BYTES: [u8; 56] = [
0x54, 0x68, 0x69, 0x73, 0x20, 0x67, 0x61, 0x6D, 0x65, 0x20, 0x69, 0x73, 0x20, 0x6E,
0x6F, 0x74, 0x20, 0x64, 0x65, 0x73, 0x69, 0x67, 0x6E, 0x65, 0x64, 0x7E, 0x6E, 0x66,
0x6F, 0x72, 0x20, 0x75, 0x73, 0x65, 0x20, 0x6F, 0x6E, 0x20, 0x74, 0x68, 0x69, 0x73,
0x20, 0x73, 0x79, 0x73, 0x74, 0x65, 0x6D, 0x2E, 0x7E, 0x7A, 0x20, 0x20, 0x20, 0x00,
];
let encoding = Encoding::ShiftJis;
let maybe_size = encoding.check_valid(&BYTES);
assert_eq!(maybe_size, Ok(56));
}
#[test]
fn check_valid_string_ending_tight_2() {
static BYTES: [u8; 8] = [0x4E, 0x41, 0x4E, 0x00, 0x3F, 0xF0, 0x00, 0x00];
let encoding = Encoding::ShiftJis;
let maybe_size = encoding.check_valid(&BYTES);
assert_eq!(maybe_size, Ok(4));
}
#[test]
fn check_valid_string_invalid() {
static BYTES: [u8; 56] = [
0x54, 0x68, 0x69, 0x73, 0x20, 0x67, 0x61, 0x6D, 0x65, 0x20, 0x69, 0x73, 0x20, 0x6E,
0x6F, 0x74, 0x20, 0x64, 0x65, 0x73, 0x69, 0x67, 0x6E, 0x65, 0x64, 0x7E, 0x6E, 0x66,
0x6F, 0x72, 0x20, 0x75, 0x73, 0x65, 0x20, 0x6F, 0x6E, 0x20, 0x74, 0x68, 0x69, 0x73,
0x20, 0x73, 0x79, 0x73, 0x74, 0x65, 0x6D, 0x2E, 0x7E, 0x7A, 0x20, 0x20, 0x20, 0x20,
];
let encoding = Encoding::ShiftJis;
let maybe_size = encoding.check_valid(&BYTES);
assert_eq!(maybe_size, Err(DecodingError::TerminatorNotFound));
}
#[test]
fn check_valid_string_invalid_2() {
static BYTES: [u8; 16] = [
0x80, 0x03, 0xD7, 0xCC, 0x80, 0x03, 0xD7, 0xCC, 0x80, 0x03, 0xD7, 0xB0, 0x00, 0x00, 0x00, 0x00, ];
let encoding = Encoding::ShiftJis;
let maybe_size = encoding.check_valid(&BYTES);
#[cfg(feature = "std")]
println!("{:?}", encoding.decode_to_string(&BYTES));
assert_eq!(
maybe_size,
Err(DecodingError::InvalidFirstByteOfMultibyte {
index: 0,
character: 0x80
})
);
}
#[test]
fn check_valid_string_invalid_3() {
static BYTES: [u8; 5] = [0x80, 0x2C, 0x5E, 0x68, 0x00];
let encoding = Encoding::ShiftJis;
let maybe_size = encoding.check_valid(&BYTES);
#[cfg(feature = "std")]
println!("{:?}", encoding.decode_to_string(&BYTES));
assert_eq!(
maybe_size,
Err(DecodingError::InvalidFirstByteOfMultibyte {
index: 0,
character: 0x80
})
);
}
#[test]
fn asdf() {
static BYTES: [u8; 276] = [
0x82, 0x60, 0x82, 0x61, 0x82, 0x62, 0x82, 0x63, 0x82, 0x64, 0x5F, 0x5F, 0x82, 0x81,
0x82, 0x82, 0x82, 0x83, 0x82, 0x84, 0x82, 0x85, 0x5F, 0x5F, 0x81, 0x9B, 0x81, 0x7E,
0x81, 0x99, 0x82, 0x65, 0x82, 0x66, 0x82, 0x67, 0x82, 0x68, 0x82, 0x69, 0x5F, 0x5F,
0x82, 0x86, 0x82, 0x87, 0x82, 0x88, 0x82, 0x89, 0x82, 0x8A, 0x5F, 0x5F, 0x81, 0x45,
0x81, 0x44, 0x81, 0x46, 0x82, 0x6A, 0x82, 0x6B, 0x82, 0x6C, 0x82, 0x6D, 0x82, 0x6E,
0x5F, 0x5F, 0x82, 0x8B, 0x82, 0x8C, 0x82, 0x8D, 0x82, 0x8E, 0x82, 0x8F, 0x5F, 0x5F,
0x81, 0x49, 0x81, 0x48, 0x81, 0x95, 0x82, 0x6F, 0x82, 0x70, 0x82, 0x71, 0x82, 0x72,
0x82, 0x73, 0x5F, 0x5F, 0x82, 0x90, 0x82, 0x91, 0x82, 0x92, 0x82, 0x93, 0x82, 0x94,
0x5F, 0x5F, 0x81, 0x96, 0x81, 0x5E, 0x81, 0x93, 0x82, 0x74, 0x82, 0x75, 0x82, 0x76,
0x82, 0x77, 0x82, 0x78, 0x5F, 0x5F, 0x82, 0x95, 0x82, 0x96, 0x82, 0x97, 0x82, 0x98,
0x82, 0x99, 0x5F, 0x5F, 0x81, 0x43, 0x81, 0x66, 0x81, 0x60, 0x82, 0x79, 0x5F, 0x5F,
0x5F, 0x5F, 0x5F, 0x5F, 0x5F, 0x5F, 0x5F, 0x5F, 0x82, 0x9A, 0x5F, 0x5F, 0x5F, 0x5F,
0x5F, 0x5F, 0x5F, 0x5F, 0x5F, 0x5F, 0x5F, 0x5F, 0x5F, 0x5F, 0x5F, 0x5F, 0x82, 0x4F,
0x82, 0x50, 0x82, 0x51, 0x82, 0x52, 0x82, 0x53, 0x5F, 0x5F, 0x82, 0x54, 0x82, 0x55,
0x82, 0x56, 0x82, 0x57, 0x82, 0x58, 0x5F, 0x5F, 0x33, 0x30, 0x33, 0x31, 0x33, 0x32,
0x5F, 0x5F, 0x5F, 0x5F, 0x5F, 0x5F, 0x5F, 0x5F, 0x5F, 0x5F, 0x5F, 0x5F, 0x5F, 0x5F,
0x5F, 0x5F, 0x5F, 0x5F, 0x5F, 0x5F, 0x5F, 0x5F, 0x5F, 0x5F, 0x34, 0x30, 0x34, 0x31,
0x34, 0x32, 0x5F, 0x5F, 0x5F, 0x5F, 0x5F, 0x5F, 0x5F, 0x5F, 0x5F, 0x5F, 0x5F, 0x5F,
0x5F, 0x5F, 0x5F, 0x5F, 0x5F, 0x5F, 0x5F, 0x5F, 0x5F, 0x5F, 0x5F, 0x5F, 0x35, 0x30,
0x35, 0x31, 0x35, 0x32, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
];
let encoding = Encoding::ShiftJis;
let decoded = encoding.decode_to_strings_vec(&BYTES);
assert!(decoded.is_some());
let decoded = decoded.unwrap();
#[cfg(feature = "std")]
for (x, f) in &decoded {
println!("{} {}", f, x);
}
assert_eq!(decoded.len(), 1);
assert_eq!(decoded[0].0, "ABCDE__abcde__○×☆FGHIJ__fghij__・.:KLMNO__klmno__!?&PQRST__pqrst__*/%UVWXY__uvwxy__,’~Z__________z________________01234__56789__303132________________________404142________________________505152\0\0\0\0\0\0");
}
}
#[cfg(feature = "pyo3")]
pub(crate) mod python_bindings {
use super::*;
#[pymethods]
impl Encoding {
#[staticmethod]
#[pyo3(name = "from_name")]
pub fn py_from_name(name: &str) -> Option<Self> {
Self::from_name(name)
}
}
}