use crate::GedcomError;
use encoding_rs::{Encoding, ISO_8859_15, UTF_16BE, UTF_16LE, WINDOWS_1252};
/// Character encodings this module can detect, decode and encode for GEDCOM data.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum GedcomEncoding {
    /// UTF-8, with or without a leading byte-order mark.
    Utf8,
    /// UTF-16, little-endian byte order.
    Utf16Le,
    /// UTF-16, big-endian byte order.
    Utf16Be,
    /// ISO-8859-1 (Latin-1); this module decodes and encodes it through the
    /// `encoding_rs` Windows-1252 codec.
    Iso8859_1,
    /// ISO-8859-15 (Latin-9).
    Iso8859_15,
    /// ASCII; decoded and encoded exactly like UTF-8 by this module.
    Ascii,
    /// ANSEL, handled by the hand-written tables in this module.
    Ansel,
    /// No specific encoding is known.
    Unknown,
}

impl std::fmt::Display for GedcomEncoding {
    /// Writes the conventional label of the encoding, e.g. `UTF-16LE`.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let label = match *self {
            Self::Utf8 => "UTF-8",
            Self::Utf16Le => "UTF-16LE",
            Self::Utf16Be => "UTF-16BE",
            Self::Iso8859_1 => "ISO-8859-1",
            Self::Iso8859_15 => "ISO-8859-15",
            Self::Ascii => "ASCII",
            Self::Ansel => "ANSEL",
            Self::Unknown => "Unknown",
        };
        f.write_str(label)
    }
}
/// Determines the encoding of raw GEDCOM bytes.
///
/// Detection order:
/// 1. a UTF-8, UTF-16LE or UTF-16BE byte-order mark at the very start,
/// 2. the value of a `CHAR` line found by `detect_encoding_from_char_tag`,
/// 3. the byte-pattern heuristics of `detect_encoding_by_content`.
#[must_use]
pub fn detect_encoding(bytes: &[u8]) -> GedcomEncoding {
    if bytes.starts_with(&[0xEF, 0xBB, 0xBF]) {
        return GedcomEncoding::Utf8;
    }
    if bytes.starts_with(&[0xFF, 0xFE]) {
        return GedcomEncoding::Utf16Le;
    }
    if bytes.starts_with(&[0xFE, 0xFF]) {
        return GedcomEncoding::Utf16Be;
    }

    // No BOM: prefer the declared CHAR value, and only guess as a last resort.
    detect_encoding_from_char_tag(bytes).unwrap_or_else(|| detect_encoding_by_content(bytes))
}
/// Looks for a `<level> CHAR <value>` line among the first 50 lines and maps
/// its value through `parse_encoding_value`.
///
/// The input is read as UTF-8 when it is valid UTF-8; otherwise only the first
/// 4096 bytes are decoded with the Windows-1252 codec, which is enough to read
/// an ASCII-compatible header. Matching is case-insensitive.
///
/// Returns `None` when no `CHAR` line is found, or when the first `CHAR` line
/// carries a value `parse_encoding_value` does not recognise.
fn detect_encoding_from_char_tag(bytes: &[u8]) -> Option<GedcomEncoding> {
    const MAX_HEADER_LINES: usize = 50;
    const FALLBACK_SAMPLE_BYTES: usize = 4096;

    // Borrow the input instead of copying it: only a handful of header lines
    // are ever inspected, so the whole file must not be cloned or upper-cased.
    let fallback;
    let content: &str = match std::str::from_utf8(bytes) {
        Ok(text) => text,
        Err(_) => {
            let sample = &bytes[..bytes.len().min(FALLBACK_SAMPLE_BYTES)];
            fallback = WINDOWS_1252.decode(sample).0;
            &*fallback
        }
    };

    for line in content.lines().take(MAX_HEADER_LINES) {
        // Upper-case one line at a time; case mapping never adds or removes
        // line breaks, so this matches upper-casing the whole text first.
        let upper = line.to_uppercase();
        let mut fields = upper.split_whitespace();
        if let (Some(_level), Some("CHAR"), Some(value)) =
            (fields.next(), fields.next(), fields.next())
        {
            // The first CHAR line decides, even if its value is unknown.
            return parse_encoding_value(value);
        }
    }
    None
}
/// Maps the value of a GEDCOM `CHAR` line to a [`GedcomEncoding`].
///
/// Matching is case-insensitive. `UNICODE` and plain `UTF-16` are treated as
/// little-endian UTF-16. `ANSI` is mapped to [`GedcomEncoding::Iso8859_1`]
/// rather than ASCII: files labelled `ANSI` commonly contain 8-bit characters
/// (presumably Windows code page 1252), and the ASCII path decodes as strict
/// UTF-8, which rejects those bytes.
///
/// Returns `None` for values that are not recognised.
fn parse_encoding_value(value: &str) -> Option<GedcomEncoding> {
    let encoding = match value.to_uppercase().as_str() {
        "UTF-8" | "UTF8" => GedcomEncoding::Utf8,
        "UTF-16" | "UTF16" | "UNICODE" | "UTF-16LE" | "UTF16LE" => GedcomEncoding::Utf16Le,
        "UTF-16BE" | "UTF16BE" => GedcomEncoding::Utf16Be,
        "ISO-8859-1" | "ISO8859-1" | "LATIN1" | "ISO_8859-1" | "ANSI" => {
            GedcomEncoding::Iso8859_1
        }
        "ISO-8859-15" | "ISO8859-15" | "LATIN9" | "ISO_8859-15" => GedcomEncoding::Iso8859_15,
        "ASCII" => GedcomEncoding::Ascii,
        "ANSEL" => GedcomEncoding::Ansel,
        _ => return None,
    };
    Some(encoding)
}
/// Guesses the encoding from the raw bytes when neither a byte-order mark nor
/// a `CHAR` line settled it.
///
/// The UTF-16 check runs first. ASCII text stored as UTF-16 is a sequence of
/// `(ascii, 0x00)` or `(0x00, ascii)` pairs, and such a sequence is also valid
/// UTF-8 made only of bytes below 128. Testing UTF-8 validity first would
/// therefore report BOM-less UTF-16 as ASCII and never reach this heuristic.
///
/// The heuristic looks at the first 100 two-byte units of inputs that are at
/// least four bytes long: a unit whose second byte is zero and whose first is
/// ASCII points to little-endian, the mirror image to big-endian, and a result
/// is only returned when exactly one of the two patterns occurs.
///
/// Otherwise: valid UTF-8 made only of bytes below 128 is `Ascii`, other valid
/// UTF-8 is `Utf8`, and everything else falls back to `Iso8859_1`.
fn detect_encoding_by_content(bytes: &[u8]) -> GedcomEncoding {
    if bytes.len() >= 4 {
        let units = || bytes.chunks_exact(2).take(100);
        let looks_little_endian = units().any(|unit| unit[1] == 0 && unit[0].is_ascii());
        let looks_big_endian = units().any(|unit| unit[0] == 0 && unit[1].is_ascii());
        match (looks_little_endian, looks_big_endian) {
            (true, false) => return GedcomEncoding::Utf16Le,
            (false, true) => return GedcomEncoding::Utf16Be,
            // Neither pattern, or both (ambiguous): keep going.
            _ => {}
        }
    }

    match std::str::from_utf8(bytes) {
        Ok(_) if bytes.is_ascii() => GedcomEncoding::Ascii,
        Ok(_) => GedcomEncoding::Utf8,
        Err(_) => GedcomEncoding::Iso8859_1,
    }
}
/// Detects the encoding of `bytes` with [`detect_encoding`] and decodes them.
///
/// Returns the decoded text together with the encoding reported by
/// [`decode_with_encoding`].
///
/// # Errors
///
/// Propagates any error returned by [`decode_with_encoding`].
pub fn decode_gedcom_bytes(bytes: &[u8]) -> Result<(String, GedcomEncoding), GedcomError> {
    decode_with_encoding(bytes, detect_encoding(bytes))
}
pub fn decode_with_encoding(
bytes: &[u8],
encoding: GedcomEncoding,
) -> Result<(String, GedcomEncoding), GedcomError> {
let result = match encoding {
GedcomEncoding::Utf8 | GedcomEncoding::Ascii => {
let bytes =
if bytes.len() >= 3 && bytes[0] == 0xEF && bytes[1] == 0xBB && bytes[2] == 0xBF {
&bytes[3..]
} else {
bytes
};
String::from_utf8(bytes.to_vec())
.map_err(|e| GedcomError::EncodingError(format!("Invalid UTF-8: {e}")))?
}
GedcomEncoding::Utf16Le => decode_utf16(bytes, UTF_16LE)?,
GedcomEncoding::Utf16Be => decode_utf16(bytes, UTF_16BE)?,
GedcomEncoding::Iso8859_1 => {
let (decoded, _, had_errors) = WINDOWS_1252.decode(bytes);
if had_errors {
return Err(GedcomError::EncodingError(
"Invalid ISO-8859-1 sequence".to_string(),
));
}
decoded.into_owned()
}
GedcomEncoding::Iso8859_15 => {
let (decoded, _, had_errors) = ISO_8859_15.decode(bytes);
if had_errors {
return Err(GedcomError::EncodingError(
"Invalid ISO-8859-15 sequence".to_string(),
));
}
decoded.into_owned()
}
GedcomEncoding::Ansel => decode_ansel(bytes),
GedcomEncoding::Unknown => {
if let Ok(s) = String::from_utf8(bytes.to_vec()) {
return Ok((s, GedcomEncoding::Utf8));
}
let (decoded, _, _) = WINDOWS_1252.decode(bytes);
decoded.into_owned()
}
};
Ok((result, encoding))
}
/// Decodes UTF-16 `bytes` with the given `encoding_rs` codec.
///
/// A leading two-byte byte-order mark (`FF FE` or `FE FF`) is removed before
/// decoding, whichever codec was requested.
///
/// # Errors
///
/// Returns [`GedcomError::EncodingError`] naming the codec when the decoder
/// reports malformed input.
fn decode_utf16(bytes: &[u8], encoding: &'static Encoding) -> Result<String, GedcomError> {
    let payload = match bytes {
        [0xFF, 0xFE, rest @ ..] | [0xFE, 0xFF, rest @ ..] => rest,
        _ => bytes,
    };

    let (text, _, malformed) = encoding.decode(payload);
    if malformed {
        Err(GedcomError::EncodingError(format!(
            "Invalid {} sequence",
            encoding.name()
        )))
    } else {
        Ok(text.into_owned())
    }
}
/// Returns the Unicode combining character for an ANSEL diacritic byte, or
/// `None` when `byte` is not one of the diacritic codes listed below.
fn ansel_combining_mark(byte: u8) -> Option<char> {
    const MARKS: &[(u8, char)] = &[
        (0xE0, '\u{0309}'),
        (0xE1, '\u{0300}'),
        (0xE2, '\u{0301}'),
        (0xE3, '\u{0302}'),
        (0xE4, '\u{0303}'),
        (0xE5, '\u{0304}'),
        (0xE6, '\u{0306}'),
        (0xE7, '\u{0307}'),
        (0xE8, '\u{0308}'),
        (0xE9, '\u{030C}'),
        (0xEA, '\u{030A}'),
        (0xEB, '\u{FE20}'),
        (0xEC, '\u{FE21}'),
        (0xED, '\u{0315}'),
        (0xEE, '\u{030B}'),
        (0xEF, '\u{0310}'),
        (0xF0, '\u{0327}'),
        (0xF1, '\u{0328}'),
        (0xF2, '\u{0323}'),
        (0xF3, '\u{0324}'),
        (0xF4, '\u{0325}'),
        (0xF5, '\u{0333}'),
        (0xF6, '\u{0332}'),
        (0xF7, '\u{0326}'),
        (0xF8, '\u{031C}'),
        (0xF9, '\u{032E}'),
        (0xFA, '\u{FE22}'),
        (0xFB, '\u{FE23}'),
        (0xFE, '\u{0313}'),
    ];

    MARKS
        .iter()
        .copied()
        .find(|&(code, _)| code == byte)
        .map(|(_, mark)| mark)
}
/// Returns the Unicode character for an ANSEL special (non-combining) byte,
/// or `None` when `byte` is not in the table below.
///
/// Both `0xC7` and `0xCF` map to U+00DF.
fn ansel_special_char(byte: u8) -> Option<char> {
    const SPECIALS: &[(u8, char)] = &[
        (0xA1, '\u{0141}'),
        (0xA2, '\u{00D8}'),
        (0xA3, '\u{0110}'),
        (0xA4, '\u{00DE}'),
        (0xA5, '\u{00C6}'),
        (0xA6, '\u{0152}'),
        (0xA7, '\u{02B9}'),
        (0xA8, '\u{00B7}'),
        (0xA9, '\u{266D}'),
        (0xAA, '\u{00AE}'),
        (0xAB, '\u{00B1}'),
        (0xAC, '\u{01A0}'),
        (0xAD, '\u{01AF}'),
        (0xAE, '\u{02BC}'),
        (0xB0, '\u{02BB}'),
        (0xB1, '\u{0142}'),
        (0xB2, '\u{00F8}'),
        (0xB3, '\u{0111}'),
        (0xB4, '\u{00FE}'),
        (0xB5, '\u{00E6}'),
        (0xB6, '\u{0153}'),
        (0xB7, '\u{02BA}'),
        (0xB8, '\u{0131}'),
        (0xB9, '\u{00A3}'),
        (0xBA, '\u{00F0}'),
        (0xBC, '\u{01A1}'),
        (0xBD, '\u{01B0}'),
        (0xC0, '\u{00B0}'),
        (0xC1, '\u{2113}'),
        (0xC2, '\u{2117}'),
        (0xC3, '\u{00A9}'),
        (0xC4, '\u{266F}'),
        (0xC5, '\u{00BF}'),
        (0xC6, '\u{00A1}'),
        (0xC7, '\u{00DF}'),
        (0xCF, '\u{00DF}'),
        (0xC8, '\u{20AC}'),
        (0xFC, '\u{200D}'),
        (0xFD, '\u{200C}'),
    ];

    SPECIALS
        .iter()
        .copied()
        .find(|&(code, _)| code == byte)
        .map(|(_, special)| special)
}
/// Decodes ANSEL `bytes` into a Unicode string.
///
/// ANSEL places diacritic bytes before the letter they modify, whereas
/// Unicode combining characters follow their base. Diacritics are therefore
/// buffered and emitted right after the next non-diacritic character, in the
/// order they were read. Diacritics left over at the end of the input are
/// appended as they are.
///
/// Bytes below `0x80` are taken as ASCII. Higher bytes are looked up with
/// `ansel_special_char`; those without an entry are mapped to the Unicode
/// code point with the same numeric value.
fn decode_ansel(bytes: &[u8]) -> String {
    let mut text = String::with_capacity(bytes.len());
    let mut waiting_marks: Vec<char> = Vec::new();

    for &byte in bytes {
        if let Some(mark) = ansel_combining_mark(byte) {
            waiting_marks.push(mark);
            continue;
        }

        let base = if byte < 0x80 {
            char::from(byte)
        } else {
            ansel_special_char(byte).unwrap_or_else(|| char::from(byte))
        };
        text.push(base);
        text.extend(waiting_marks.drain(..));
    }

    text.extend(waiting_marks);
    text
}
/// Encodes `content` as ANSEL bytes.
///
/// For every character, the Unicode combining marks that directly follow it
/// are written first (ANSEL puts diacritics before the base), followed by the
/// character itself:
/// - its single ANSEL byte when `unicode_to_ansel_base` knows one,
/// - otherwise the diacritic + letter sequence from
///   `unicode_precomposed_to_ansel`,
/// - otherwise the character's own byte if it is ASCII,
/// - otherwise `?`.
fn encode_ansel(content: &str) -> Vec<u8> {
    let mut encoded = Vec::with_capacity(content.len());
    let mut remaining = content.chars().peekable();

    while let Some(ch) = remaining.next() {
        // Consume the run of combining marks attached to `ch`.
        while let Some(mark) = remaining
            .peek()
            .and_then(|&next| unicode_combining_to_ansel(next))
        {
            encoded.push(mark);
            remaining.next();
        }

        match unicode_to_ansel_base(ch) {
            Some(byte) => encoded.push(byte),
            None => match unicode_precomposed_to_ansel(ch) {
                Some(sequence) => encoded.extend_from_slice(&sequence),
                None if ch.is_ascii() => encoded.push(ch as u8),
                None => encoded.push(b'?'),
            },
        }
    }

    encoded
}
/// Returns the single ANSEL byte for `ch`: the code from the special-character
/// table below, or the character's own byte when it is ASCII.
///
/// Returns `None` for every other character.
fn unicode_to_ansel_base(ch: char) -> Option<u8> {
    const SPECIALS: &[(char, u8)] = &[
        ('\u{0141}', 0xA1),
        ('\u{00D8}', 0xA2),
        ('\u{0110}', 0xA3),
        ('\u{00DE}', 0xA4),
        ('\u{00C6}', 0xA5),
        ('\u{0152}', 0xA6),
        ('\u{02B9}', 0xA7),
        ('\u{00B7}', 0xA8),
        ('\u{266D}', 0xA9),
        ('\u{00AE}', 0xAA),
        ('\u{00B1}', 0xAB),
        ('\u{01A0}', 0xAC),
        ('\u{01AF}', 0xAD),
        ('\u{02BC}', 0xAE),
        ('\u{02BB}', 0xB0),
        ('\u{0142}', 0xB1),
        ('\u{00F8}', 0xB2),
        ('\u{0111}', 0xB3),
        ('\u{00FE}', 0xB4),
        ('\u{00E6}', 0xB5),
        ('\u{0153}', 0xB6),
        ('\u{02BA}', 0xB7),
        ('\u{0131}', 0xB8),
        ('\u{00A3}', 0xB9),
        ('\u{00F0}', 0xBA),
        ('\u{01A1}', 0xBC),
        ('\u{01B0}', 0xBD),
        ('\u{00B0}', 0xC0),
        ('\u{2113}', 0xC1),
        ('\u{2117}', 0xC2),
        ('\u{00A9}', 0xC3),
        ('\u{266F}', 0xC4),
        ('\u{00BF}', 0xC5),
        ('\u{00A1}', 0xC6),
        ('\u{00DF}', 0xC7),
        ('\u{20AC}', 0xC8),
    ];

    if let Some((_, code)) = SPECIALS.iter().copied().find(|&(special, _)| special == ch) {
        return Some(code);
    }
    if ch.is_ascii() {
        Some(ch as u8)
    } else {
        None
    }
}
/// Returns the ANSEL diacritic byte for a Unicode combining character, or
/// `None` when `ch` is not one of the combining characters listed below.
fn unicode_combining_to_ansel(ch: char) -> Option<u8> {
    const MARKS: &[(char, u8)] = &[
        ('\u{0309}', 0xE0),
        ('\u{0300}', 0xE1),
        ('\u{0301}', 0xE2),
        ('\u{0302}', 0xE3),
        ('\u{0303}', 0xE4),
        ('\u{0304}', 0xE5),
        ('\u{0306}', 0xE6),
        ('\u{0307}', 0xE7),
        ('\u{0308}', 0xE8),
        ('\u{030C}', 0xE9),
        ('\u{030A}', 0xEA),
        ('\u{FE20}', 0xEB),
        ('\u{FE21}', 0xEC),
        ('\u{0315}', 0xED),
        ('\u{030B}', 0xEE),
        ('\u{0310}', 0xEF),
        ('\u{0327}', 0xF0),
        ('\u{0328}', 0xF1),
        ('\u{0323}', 0xF2),
        ('\u{0324}', 0xF3),
        ('\u{0325}', 0xF4),
        ('\u{0333}', 0xF5),
        ('\u{0332}', 0xF6),
        ('\u{0326}', 0xF7),
        ('\u{031C}', 0xF8),
        ('\u{032E}', 0xF9),
        ('\u{FE22}', 0xFA),
        ('\u{FE23}', 0xFB),
        ('\u{0313}', 0xFE),
    ];

    MARKS
        .iter()
        .copied()
        .find(|&(mark, _)| mark == ch)
        .map(|(_, code)| code)
}
fn unicode_precomposed_to_ansel(ch: char) -> Option<Vec<u8>> {
acute_to_ansel(ch)
.or_else(|| grave_to_ansel(ch))
.or_else(|| circumflex_to_ansel(ch))
.or_else(|| tilde_to_ansel(ch))
.or_else(|| umlaut_to_ansel(ch))
.or_else(|| caron_to_ansel(ch))
.or_else(|| ring_to_ansel(ch))
.or_else(|| cedilla_to_ansel(ch))
.or_else(|| ogonek_to_ansel(ch))
.or_else(|| macron_to_ansel(ch))
.or_else(|| breve_to_ansel(ch))
.or_else(|| dot_above_to_ansel(ch))
.or_else(|| double_acute_to_ansel(ch))
}
/// Maps a precomposed letter with an acute accent to the ANSEL sequence
/// `[0xE2, base letter]`; returns `None` for any other character.
fn acute_to_ansel(ch: char) -> Option<Vec<u8>> {
    const ACUTE: u8 = 0xE2;
    let base = match ch {
        'Á' => b'A',
        'á' => b'a',
        'É' => b'E',
        'é' => b'e',
        'Í' => b'I',
        'í' => b'i',
        'Ó' => b'O',
        'ó' => b'o',
        'Ú' => b'U',
        'ú' => b'u',
        'Ý' => b'Y',
        'ý' => b'y',
        'Ć' => b'C',
        'ć' => b'c',
        'Ń' => b'N',
        'ń' => b'n',
        'Ś' => b'S',
        'ś' => b's',
        'Ź' => b'Z',
        'ź' => b'z',
        _ => return None,
    };
    Some(vec![ACUTE, base])
}
/// Maps a precomposed letter with a grave accent to the ANSEL sequence
/// `[0xE1, base letter]`; returns `None` for any other character.
fn grave_to_ansel(ch: char) -> Option<Vec<u8>> {
    const GRAVE: u8 = 0xE1;
    let base = match ch {
        'À' => b'A',
        'à' => b'a',
        'È' => b'E',
        'è' => b'e',
        'Ì' => b'I',
        'ì' => b'i',
        'Ò' => b'O',
        'ò' => b'o',
        'Ù' => b'U',
        'ù' => b'u',
        _ => return None,
    };
    Some(vec![GRAVE, base])
}
/// Maps a precomposed letter with a circumflex to the ANSEL sequence
/// `[0xE3, base letter]`; returns `None` for any other character.
fn circumflex_to_ansel(ch: char) -> Option<Vec<u8>> {
    const CIRCUMFLEX: u8 = 0xE3;
    let base = match ch {
        'Â' => b'A',
        'â' => b'a',
        'Ê' => b'E',
        'ê' => b'e',
        'Î' => b'I',
        'î' => b'i',
        'Ô' => b'O',
        'ô' => b'o',
        'Û' => b'U',
        'û' => b'u',
        _ => return None,
    };
    Some(vec![CIRCUMFLEX, base])
}
/// Maps a precomposed letter with a tilde to the ANSEL sequence
/// `[0xE4, base letter]`; returns `None` for any other character.
fn tilde_to_ansel(ch: char) -> Option<Vec<u8>> {
    const TILDE: u8 = 0xE4;
    let base = match ch {
        'Ã' => b'A',
        'ã' => b'a',
        'Ñ' => b'N',
        'ñ' => b'n',
        'Õ' => b'O',
        'õ' => b'o',
        _ => return None,
    };
    Some(vec![TILDE, base])
}
/// Maps a precomposed letter with an umlaut/diaeresis to the ANSEL sequence
/// `[0xE8, base letter]`; returns `None` for any other character.
fn umlaut_to_ansel(ch: char) -> Option<Vec<u8>> {
    const UMLAUT: u8 = 0xE8;
    let base = match ch {
        'Ä' => b'A',
        'ä' => b'a',
        'Ë' => b'E',
        'ë' => b'e',
        'Ï' => b'I',
        'ï' => b'i',
        'Ö' => b'O',
        'ö' => b'o',
        'Ü' => b'U',
        'ü' => b'u',
        'Ÿ' => b'Y',
        'ÿ' => b'y',
        _ => return None,
    };
    Some(vec![UMLAUT, base])
}
/// Maps a precomposed letter with a caron (háček) to the ANSEL sequence
/// `[0xE9, base letter]`; returns `None` for any other character.
fn caron_to_ansel(ch: char) -> Option<Vec<u8>> {
    const CARON: u8 = 0xE9;
    let base = match ch {
        'Č' => b'C',
        'č' => b'c',
        'Ě' => b'E',
        'ě' => b'e',
        'Ř' => b'R',
        'ř' => b'r',
        'Š' => b'S',
        'š' => b's',
        'Ž' => b'Z',
        'ž' => b'z',
        _ => return None,
    };
    Some(vec![CARON, base])
}
/// Maps a precomposed letter with a ring above to the ANSEL sequence
/// `[0xEA, base letter]`; returns `None` for any other character.
fn ring_to_ansel(ch: char) -> Option<Vec<u8>> {
    const RING: u8 = 0xEA;
    let base = match ch {
        'Å' => b'A',
        'å' => b'a',
        'Ů' => b'U',
        'ů' => b'u',
        _ => return None,
    };
    Some(vec![RING, base])
}
/// Maps a precomposed letter with a cedilla to the ANSEL sequence
/// `[0xF0, base letter]`; returns `None` for any other character.
fn cedilla_to_ansel(ch: char) -> Option<Vec<u8>> {
    const CEDILLA: u8 = 0xF0;
    let base = match ch {
        'Ç' => b'C',
        'ç' => b'c',
        'Ş' => b'S',
        'ş' => b's',
        _ => return None,
    };
    Some(vec![CEDILLA, base])
}
/// Maps a precomposed letter with an ogonek to the ANSEL sequence
/// `[0xF1, base letter]`; returns `None` for any other character.
fn ogonek_to_ansel(ch: char) -> Option<Vec<u8>> {
    const OGONEK: u8 = 0xF1;
    let base = match ch {
        'Ą' => b'A',
        'ą' => b'a',
        'Ę' => b'E',
        'ę' => b'e',
        _ => return None,
    };
    Some(vec![OGONEK, base])
}
/// Maps a precomposed letter with a macron to the ANSEL sequence
/// `[0xE5, base letter]`; returns `None` for any other character.
fn macron_to_ansel(ch: char) -> Option<Vec<u8>> {
    const MACRON: u8 = 0xE5;
    let base = match ch {
        'Ā' => b'A',
        'ā' => b'a',
        'Ē' => b'E',
        'ē' => b'e',
        'Ī' => b'I',
        'ī' => b'i',
        'Ō' => b'O',
        'ō' => b'o',
        'Ū' => b'U',
        'ū' => b'u',
        _ => return None,
    };
    Some(vec![MACRON, base])
}
/// Maps a precomposed letter with a breve to the ANSEL sequence
/// `[0xE6, base letter]`; returns `None` for any other character.
fn breve_to_ansel(ch: char) -> Option<Vec<u8>> {
    const BREVE: u8 = 0xE6;
    let base = match ch {
        'Ă' => b'A',
        'ă' => b'a',
        _ => return None,
    };
    Some(vec![BREVE, base])
}
/// Maps a precomposed letter with a dot above to the ANSEL sequence
/// `[0xE7, base letter]`; returns `None` for any other character.
fn dot_above_to_ansel(ch: char) -> Option<Vec<u8>> {
    const DOT_ABOVE: u8 = 0xE7;
    let base = match ch {
        'Ż' => b'Z',
        'ż' => b'z',
        'Ġ' => b'G',
        'ġ' => b'g',
        _ => return None,
    };
    Some(vec![DOT_ABOVE, base])
}
/// Maps a precomposed letter with a double acute accent to the ANSEL sequence
/// `[0xEE, base letter]`; returns `None` for any other character.
fn double_acute_to_ansel(ch: char) -> Option<Vec<u8>> {
    const DOUBLE_ACUTE: u8 = 0xEE;
    let base = match ch {
        'Ő' => b'O',
        'ő' => b'o',
        'Ű' => b'U',
        'ű' => b'u',
        _ => return None,
    };
    Some(vec![DOUBLE_ACUTE, base])
}
/// Encodes `content` into bytes in the requested `encoding`.
///
/// - `Utf8`, `Ascii` and `Unknown` all produce the string's UTF-8 bytes, with
///   no byte-order mark.
/// - `Utf16Le` / `Utf16Be` produce a byte-order mark followed by the UTF-16
///   code units in the matching byte order.
/// - `Iso8859_1` uses the `encoding_rs` Windows-1252 encoder, `Iso8859_15` the
///   ISO-8859-15 encoder.
/// - `Ansel` is delegated to `encode_ansel`.
///
/// # Errors
///
/// Returns [`GedcomError::EncodingError`] when `content` contains characters
/// the Windows-1252 or ISO-8859-15 encoder reports as unmappable.
pub fn encode_to_bytes(content: &str, encoding: GedcomEncoding) -> Result<Vec<u8>, GedcomError> {
    let encoded = match encoding {
        GedcomEncoding::Utf8 | GedcomEncoding::Ascii | GedcomEncoding::Unknown => {
            content.as_bytes().to_vec()
        }
        GedcomEncoding::Utf16Le => {
            let mut out: Vec<u8> = vec![0xFF, 0xFE];
            out.extend(content.encode_utf16().flat_map(u16::to_le_bytes));
            out
        }
        GedcomEncoding::Utf16Be => {
            let mut out: Vec<u8> = vec![0xFE, 0xFF];
            out.extend(content.encode_utf16().flat_map(u16::to_be_bytes));
            out
        }
        GedcomEncoding::Iso8859_1 => {
            let (data, _, unmappable) = WINDOWS_1252.encode(content);
            if unmappable {
                return Err(GedcomError::EncodingError(
                    "Cannot encode to ISO-8859-1: contains unsupported characters".to_string(),
                ));
            }
            data.into_owned()
        }
        GedcomEncoding::Iso8859_15 => {
            let (data, _, unmappable) = ISO_8859_15.encode(content);
            if unmappable {
                return Err(GedcomError::EncodingError(
                    "Cannot encode to ISO-8859-15: contains unsupported characters".to_string(),
                ));
            }
            data.into_owned()
        }
        GedcomEncoding::Ansel => encode_ansel(content),
    };
    Ok(encoded)
}
// Unit tests for encoding detection, decoding, encoding and the ANSEL tables.
#[cfg(test)]
mod tests {
use super::*;
// --- Detection: byte-order marks ---
#[test]
fn test_detect_utf8_bom() {
// EF BB BF is the UTF-8 byte-order mark.
let bytes = [0xEF, 0xBB, 0xBF, b'0', b' ', b'H', b'E', b'A', b'D'];
assert_eq!(detect_encoding(&bytes), GedcomEncoding::Utf8);
}
#[test]
fn test_detect_utf16_le_bom() {
// FF FE is the UTF-16 little-endian byte-order mark.
let bytes = [0xFF, 0xFE, b'0', 0x00, b' ', 0x00];
assert_eq!(detect_encoding(&bytes), GedcomEncoding::Utf16Le);
}
#[test]
fn test_detect_utf16_be_bom() {
// FE FF is the UTF-16 big-endian byte-order mark.
let bytes = [0xFE, 0xFF, 0x00, b'0', 0x00, b' '];
assert_eq!(detect_encoding(&bytes), GedcomEncoding::Utf16Be);
}
// --- Detection: content heuristics and CHAR lines ---
#[test]
fn test_detect_ascii() {
// No BOM and no CHAR line: pure 7-bit content is reported as ASCII.
let bytes = b"0 HEAD\n1 GEDC\n2 VERS 5.5\n0 TRLR\n";
assert_eq!(detect_encoding(bytes), GedcomEncoding::Ascii);
}
#[test]
fn test_detect_utf8_from_char_tag() {
let bytes = b"0 HEAD\n1 CHAR UTF-8\n0 TRLR\n";
assert_eq!(detect_encoding(bytes), GedcomEncoding::Utf8);
}
#[test]
fn test_detect_iso8859_1_from_char_tag() {
let bytes = b"0 HEAD\n1 CHAR ISO-8859-1\n0 TRLR\n";
assert_eq!(detect_encoding(bytes), GedcomEncoding::Iso8859_1);
}
#[test]
fn test_detect_iso8859_15_from_char_tag() {
let bytes = b"0 HEAD\n1 CHAR ISO-8859-15\n0 TRLR\n";
assert_eq!(detect_encoding(bytes), GedcomEncoding::Iso8859_15);
}
// --- Decoding ---
#[test]
fn test_decode_utf8() {
let bytes = "0 HEAD\n1 NAME José García\n0 TRLR\n".as_bytes();
let (content, encoding) = decode_gedcom_bytes(bytes).unwrap();
assert_eq!(encoding, GedcomEncoding::Utf8);
assert!(content.contains("José García"));
}
#[test]
fn test_decode_utf8_with_bom() {
// The BOM must not appear in the decoded text.
let mut bytes = vec![0xEF, 0xBB, 0xBF];
bytes.extend_from_slice(b"0 HEAD\n1 NAME Test\n0 TRLR\n");
let (content, encoding) = decode_gedcom_bytes(&bytes).unwrap();
assert_eq!(encoding, GedcomEncoding::Utf8);
assert!(content.starts_with("0 HEAD"));
}
#[test]
fn test_decode_iso8859_1() {
// 0xE9 is "é" in ISO-8859-1.
let bytes = b"0 HEAD\n1 CHAR ISO-8859-1\n1 NAME Jos\xE9\n0 TRLR\n";
let (content, encoding) = decode_gedcom_bytes(bytes).unwrap();
assert_eq!(encoding, GedcomEncoding::Iso8859_1);
assert!(content.contains("José"));
}
#[test]
fn test_decode_iso8859_15() {
// 0xA4 is the euro sign in ISO-8859-15.
let bytes = b"0 HEAD\n1 CHAR ISO-8859-15\n1 NOTE 10\xA4\n0 TRLR\n";
let (content, encoding) = decode_gedcom_bytes(bytes).unwrap();
assert_eq!(encoding, GedcomEncoding::Iso8859_15);
assert!(content.contains("10€"));
}
#[test]
fn test_decode_utf16_le() {
let content = "0 HEAD\n1 NAME Test\n0 TRLR\n";
// Start with the little-endian BOM, then append the code units.
let mut bytes = vec![0xFF, 0xFE]; for c in content.encode_utf16() {
bytes.extend_from_slice(&c.to_le_bytes());
}
let (decoded, encoding) = decode_gedcom_bytes(&bytes).unwrap();
assert_eq!(encoding, GedcomEncoding::Utf16Le);
assert!(decoded.contains("HEAD"));
}
#[test]
fn test_decode_utf16_be() {
let content = "0 HEAD\n1 NAME Test\n0 TRLR\n";
// Start with the big-endian BOM, then append the code units.
let mut bytes = vec![0xFE, 0xFF]; for c in content.encode_utf16() {
bytes.extend_from_slice(&c.to_be_bytes());
}
let (decoded, encoding) = decode_gedcom_bytes(&bytes).unwrap();
assert_eq!(encoding, GedcomEncoding::Utf16Be);
assert!(decoded.contains("HEAD"));
}
// --- Encoding ---
#[test]
fn test_encode_to_utf8() {
let content = "0 HEAD\n1 NAME José\n0 TRLR\n";
let bytes = encode_to_bytes(content, GedcomEncoding::Utf8).unwrap();
assert_eq!(bytes, content.as_bytes());
}
#[test]
fn test_encode_to_utf16_le() {
let content = "Test";
let bytes = encode_to_bytes(content, GedcomEncoding::Utf16Le).unwrap();
// Bytes 0-1 are the BOM; bytes 2-3 are 'T' (0x54) in little-endian order.
assert_eq!(bytes[0], 0xFF); assert_eq!(bytes[1], 0xFE);
assert_eq!(bytes[2], 0x54);
assert_eq!(bytes[3], 0x00);
}
#[test]
fn test_encode_to_utf16_be() {
let content = "Test";
let bytes = encode_to_bytes(content, GedcomEncoding::Utf16Be).unwrap();
// Bytes 0-1 are the BOM; bytes 2-3 are 'T' (0x54) in big-endian order.
assert_eq!(bytes[0], 0xFE); assert_eq!(bytes[1], 0xFF);
assert_eq!(bytes[2], 0x00);
assert_eq!(bytes[3], 0x54);
}
// --- Round trips through encode_to_bytes and decode_gedcom_bytes ---
#[test]
fn test_roundtrip_utf16_le() {
let original = "0 HEAD\n1 NAME José García\n0 TRLR\n";
let encoded = encode_to_bytes(original, GedcomEncoding::Utf16Le).unwrap();
let (decoded, _) = decode_gedcom_bytes(&encoded).unwrap();
assert_eq!(decoded, original);
}
#[test]
fn test_roundtrip_utf16_be() {
let original = "0 HEAD\n1 NAME José García\n0 TRLR\n";
let encoded = encode_to_bytes(original, GedcomEncoding::Utf16Be).unwrap();
let (decoded, _) = decode_gedcom_bytes(&encoded).unwrap();
assert_eq!(decoded, original);
}
// --- Display ---
#[test]
fn test_encoding_display() {
assert_eq!(format!("{}", GedcomEncoding::Utf8), "UTF-8");
assert_eq!(format!("{}", GedcomEncoding::Utf16Le), "UTF-16LE");
assert_eq!(format!("{}", GedcomEncoding::Utf16Be), "UTF-16BE");
assert_eq!(format!("{}", GedcomEncoding::Iso8859_1), "ISO-8859-1");
assert_eq!(format!("{}", GedcomEncoding::Iso8859_15), "ISO-8859-15");
assert_eq!(format!("{}", GedcomEncoding::Ascii), "ASCII");
assert_eq!(format!("{}", GedcomEncoding::Ansel), "ANSEL");
assert_eq!(format!("{}", GedcomEncoding::Unknown), "Unknown");
}
// --- ANSEL decoding ---
#[test]
fn test_ansel_decode_basic() {
// Plain ASCII passes through unchanged.
let bytes = b"0 HEAD\n1 NAME John Smith\n0 TRLR\n";
let result = decode_ansel(bytes);
assert_eq!(result, "0 HEAD\n1 NAME John Smith\n0 TRLR\n");
}
#[test]
fn test_ansel_decode_special_chars() {
let bytes = &[0xA1, 0xA2, 0xB5, 0xB2];
let result = decode_ansel(bytes);
assert_eq!(result, "ŁØæø");
}
#[test]
fn test_ansel_decode_diacritics() {
// 0xE2 is the ANSEL acute accent; it precedes the letter it modifies.
let bytes = &[0xE2, b'e']; let result = decode_ansel(bytes);
assert_eq!(result, "e\u{0301}");
}
#[test]
fn test_ansel_decode_jose() {
let bytes = &[b'J', b'o', b's', 0xE2, b'e'];
let result = decode_ansel(bytes);
// The decoder emits the base letter followed by the combining mark.
assert_eq!(result, "Jose\u{0301}"); }
#[test]
fn test_ansel_decode_multiple_diacritics() {
// Circumflex (0xE3) then diaeresis (0xE8) before the base letter.
let bytes = &[0xE3, 0xE8, b'a'];
let result = decode_ansel(bytes);
// Marks follow the base in the order they were read.
assert_eq!(result, "a\u{0302}\u{0308}"); }
// --- ANSEL encoding ---
#[test]
fn test_ansel_encode_basic() {
let content = "John Smith";
let bytes = encode_ansel(content);
assert_eq!(bytes, b"John Smith");
}
#[test]
fn test_ansel_encode_special_chars() {
// Mixes a direct special character (Ł) with precomposed letters (ó, ź).
let content = "Łódź"; let bytes = encode_ansel(content);
assert_eq!(bytes, &[0xA1, 0xE2, b'o', b'd', 0xE2, b'z']);
}
#[test]
fn test_ansel_encode_precomposed() {
let content = "José García";
let bytes = encode_ansel(content);
assert_eq!(
bytes,
&[b'J', b'o', b's', 0xE2, b'e', b' ', b'G', b'a', b'r', b'c', 0xE2, b'i', b'a']
);
}
#[test]
fn test_ansel_roundtrip_special() {
// Decode special bytes, then encode the result and expect the same bytes.
let original_bytes = &[0xA1, 0xB1, 0xA5, 0xB5]; let decoded = decode_ansel(original_bytes);
assert_eq!(decoded, "ŁłÆæ");
let encoded = encode_ansel(&decoded);
assert_eq!(encoded, original_bytes);
}
// --- ANSEL through the public detection/decoding entry points ---
#[test]
fn test_ansel_with_char_tag() {
let bytes = b"0 HEAD\n1 CHAR ANSEL\n0 TRLR\n";
let encoding = detect_encoding(bytes);
assert_eq!(encoding, GedcomEncoding::Ansel);
}
#[test]
fn test_decode_gedcom_ansel() {
let mut bytes = b"0 HEAD\n1 CHAR ANSEL\n1 NAME Jos".to_vec();
// Append an ANSEL-encoded "é" (acute byte + 'e') and the rest of the file.
bytes.extend_from_slice(&[0xE2, b'e']); bytes.extend_from_slice(b"\n0 TRLR\n");
let (content, encoding) = decode_gedcom_bytes(&bytes).unwrap();
assert_eq!(encoding, GedcomEncoding::Ansel);
assert!(content.contains("Jose\u{0301}")); }
// --- CHAR value parsing ---
#[test]
fn test_parse_encoding_values() {
assert_eq!(parse_encoding_value("UTF-8"), Some(GedcomEncoding::Utf8));
// Matching is case-insensitive.
assert_eq!(parse_encoding_value("utf-8"), Some(GedcomEncoding::Utf8));
assert_eq!(parse_encoding_value("UTF8"), Some(GedcomEncoding::Utf8));
assert_eq!(
parse_encoding_value("ISO-8859-1"),
Some(GedcomEncoding::Iso8859_1)
);
assert_eq!(
parse_encoding_value("LATIN1"),
Some(GedcomEncoding::Iso8859_1)
);
assert_eq!(
parse_encoding_value("ISO-8859-15"),
Some(GedcomEncoding::Iso8859_15)
);
assert_eq!(
parse_encoding_value("LATIN9"),
Some(GedcomEncoding::Iso8859_15)
);
// "UNICODE" is treated as little-endian UTF-16.
assert_eq!(
parse_encoding_value("UNICODE"),
Some(GedcomEncoding::Utf16Le)
);
assert_eq!(parse_encoding_value("ASCII"), Some(GedcomEncoding::Ascii));
assert_eq!(parse_encoding_value("ANSEL"), Some(GedcomEncoding::Ansel));
// Unrecognised values yield None.
assert_eq!(parse_encoding_value("UNKNOWN"), None);
}
}