mod error;

use core::str::from_utf8;
use std::borrow::Cow;

pub use crate::error::DecodingError;

pub fn decode(bytes: &[u8]) -> Result<Cow<str>, DecodingError> {
    if let Ok(str) = from_utf8(bytes) {
        return Ok(Cow::Borrowed(str));
    }

    let mut decoded = Vec::with_capacity(bytes.len());
    let mut iter = bytes.iter();

    macro_rules! err {
        () => {
            return Err(DecodingError)
        };
    }

    macro_rules! next {
        () => {
            match iter.next() {
                Some(&byte) => byte,
                None => return Err(DecodingError),
            }
        };
    }

    macro_rules! next_continuation {
        () => {{
            let byte = next!();
            if is_continuation_byte(byte) {
                byte
            } else {
                return Err(DecodingError);
            }
        }};
    }

    loop {
        let first = match iter.next() {
            Some(&byte) => byte,
            None => break,
        };

        if first <= MAX_ASCII_CODEPOINT {
            decoded.push(first)
        } else {
            let width = match utf8_char_width(first) {
                Some(v) => v,
                None => err!(),
            };
            let second = next_continuation!();
            match width {
                2 => decoded.extend_from_slice(&[first, second]),
                3 => {
                    let third = next_continuation!();
                    match (first, second) {
                        (0xE0, 0xA0..=0xBF)
                        | (0xE1..=0xEC, 0x80..=0xBF)
                        | (0xED, 0x80..=0x9F)
                        | (0xEE..=0xEF, 0x80..=0xBF) => {
                            decoded.extend_from_slice(&[first, second, third]);
                        }
                        (0xED, 0xA0..=0xAF) => {
                            let fourth = next!();
                            if fourth != 0xED {
                                err!();
                            }
                            let fifth = next_continuation!();
                            if fifth < 0xB0 || 0xBF < fifth {
                                err!();
                            }
                            let sixth = next_continuation!();
                            decoded.extend_from_slice(&decode_surrogate_pair(
                                second, third, fifth, sixth,
                            ));
                        }
                        _ => err!(),
                    }
                }
                _ => err!(),
            }
        }
    }

    debug_assert!(from_utf8(&decoded).is_ok());
    Ok(Cow::Owned(unsafe { String::from_utf8_unchecked(decoded) }))
}

/// Combines an encoded surrogate pair into the four-byte UTF-8 encoding of the
/// code point it represents.
///
/// `second`/`third` are the trailing bytes of the high surrogate's three-byte
/// sequence and `fifth`/`sixth` those of the low surrogate; both lead bytes
/// carry no payload that is needed here.
fn decode_surrogate_pair(second: u8, third: u8, fifth: u8, sixth: u8) -> [u8; 4] {
    // Strip the surrogate bases so only the 10 payload bits of each remain.
    let high_bits = decode_surrogate(second, third) - 0xD800;
    let low_bits = decode_surrogate(fifth, sixth) - 0xDC00;
    decode_codepoint(0x10000 + ((high_bits << 10) | low_bits))
}

/// Rebuilds a surrogate code unit from the two trailing bytes of its
/// three-byte sequence.
///
/// The result always has the `0xD000` bits set; the six payload bits of
/// `second` and `third` fill the lower twelve bits.
fn decode_surrogate(second: u8, third: u8) -> u32 {
    let upper = u32::from(second & VAL_MASK);
    let lower = u32::from(third & VAL_MASK);
    0xD000 | (upper << 6) | lower
}

/// Lays `codepoint` out as a four-byte UTF-8 sequence.
///
/// Only the low 21 bits of `codepoint` are used: 3 bits go into the lead byte
/// and 6 bits into each of the three continuation bytes.
fn decode_codepoint(codepoint: u32) -> [u8; 4] {
    const STRT_TAG: u8 = 0b11110000;
    // Extracts `mask`-many low bits of `codepoint` after shifting it right.
    let bits = |shift: u32, mask: u32| ((codepoint >> shift) & mask) as u8;
    [
        STRT_TAG | bits(18, 0b111),
        CONT_TAG | bits(12, 0b11_1111),
        CONT_TAG | bits(6, 0b11_1111),
        CONT_TAG | bits(0, 0b11_1111),
    ]
}

/// Encodes `text` as CESU-8.
///
/// When [`is_valid`] reports that the UTF-8 bytes of `text` need no changes
/// they are returned borrowed. Otherwise a new buffer is built in which
/// sequences of up to three bytes are copied through and every four-byte
/// sequence is replaced by the six-byte encoding of its surrogate pair.
pub fn encode(text: &str) -> Cow<[u8]> {
    if is_valid(text) {
        return Cow::Borrowed(text.as_bytes());
    }

    let bytes = text.as_bytes();
    let mut encoded = Vec::with_capacity(encoded_capacity(text));
    let mut index = 0;

    while index < bytes.len() {
        let byte = bytes[index];
        if byte <= MAX_ASCII_CODEPOINT {
            encoded.push(byte);
            index += 1;
            continue;
        }

        // `index` only ever advances by whole sequences, so `byte` is the lead
        // byte of a character of a valid `str`.
        let width = utf8_char_width(byte).expect("lead byte of a valid `str` has a known width");
        let end = index + width;
        assert!(end <= bytes.len());

        if width <= CESU8_MAX_CHAR_WIDTH {
            encoded.extend_from_slice(&bytes[index..end]);
        } else {
            let codepoint = text[index..end]
                .chars()
                .next()
                .expect("slice covers exactly one character") as u32;
            let surrogate_pair = to_surrogate_pair(codepoint);
            encoded.extend_from_slice(&encode_surrogate_pair(surrogate_pair));
        }

        // Step past the sequence just emitted. Without this the loop would
        // re-read the same lead byte forever and grow `encoded` without bound.
        index = end;
    }

    Cow::Owned(encoded)
}

/// Encodes both halves of `surrogate_pair` as three bytes each and returns the
/// six bytes back to back, first surrogate first.
fn encode_surrogate_pair(surrogate_pair: [u16; 2]) -> [u8; 6] {
    let [high, low] = surrogate_pair;
    let mut out = [0u8; 6];
    out[..3].copy_from_slice(&encode_surrogate(high));
    out[3..].copy_from_slice(&encode_surrogate(low));
    out
}

/// Writes a 16-bit `surrogate` as a three-byte sequence: the top 4 bits go in
/// the lead byte and the remaining 12 bits are split across two continuation
/// bytes.
fn encode_surrogate(surrogate: u16) -> [u8; 3] {
    const STRT_TAG: u8 = 0b11100000;
    let top = (surrogate >> 12) as u8;
    let middle = ((surrogate >> 6) & 0b11_1111) as u8;
    let bottom = (surrogate & 0b11_1111) as u8;
    [STRT_TAG | top, CONT_TAG | middle, CONT_TAG | bottom]
}

/// Splits a supplementary-plane `codepoint` into its `[high, low]` surrogate
/// pair.
///
/// The caller must pass a value of at least `0x10000`; smaller values make the
/// initial subtraction underflow.
fn to_surrogate_pair(codepoint: u32) -> [u16; 2] {
    const SUPPLEMENTARY_BASE: u32 = 0x10000;
    const HIGH_BASE: u16 = 0xD800;
    const LOW_BASE: u16 = 0xDC00;
    const LOW_PAYLOAD: u32 = 0x3FF;

    let offset = codepoint - SUPPLEMENTARY_BASE;
    let high = HIGH_BASE | (offset >> 10) as u16;
    let low = LOW_BASE | (offset & LOW_PAYLOAD) as u16;
    [high, low]
}

/// Computes how many bytes `encode` will emit for `text`, so the output buffer
/// can be allocated once.
///
/// Sequences of up to `CESU8_MAX_CHAR_WIDTH` bytes keep their length; wider
/// ones count as the 6 bytes of an encoded surrogate pair.
fn encoded_capacity(text: &str) -> usize {
    let raw = text.as_bytes();
    let mut total = 0;
    let mut pos = 0;
    while let Some(&lead) = raw.get(pos) {
        let width = if lead <= MAX_ASCII_CODEPOINT {
            1
        } else {
            utf8_char_width(lead).unwrap()
        };
        total += if width > CESU8_MAX_CHAR_WIDTH { 6 } else { width };
        pos += width;
    }
    total
}

/// Largest byte value treated as a single-byte (ASCII) character; such bytes
/// are copied through unchanged by both `decode` and `encode`.
const MAX_ASCII_CODEPOINT: u8 = 0x7F;

/// Reports whether the UTF-8 bytes of `str` can be used as CESU-8 unchanged.
///
/// Returns `false` as soon as a byte is found that is neither a continuation
/// byte nor the lead byte of a sequence of at most `CESU8_MAX_CHAR_WIDTH`
/// bytes, and `true` if no such byte exists.
pub fn is_valid(str: &str) -> bool {
    str.bytes().all(|byte| {
        is_continuation_byte(byte)
            || utf8_char_width(byte).map_or(false, |width| width <= CESU8_MAX_CHAR_WIDTH)
    })
}

/// Widest UTF-8 sequence, in bytes, that `encode` copies through unchanged;
/// wider sequences are re-encoded as a surrogate pair.
const CESU8_MAX_CHAR_WIDTH: usize = 3;

/// Returns `true` when the two high bits of `byte` are `10`, i.e. when `byte`
/// lies in `0x80..=0xBF`.
fn is_continuation_byte(byte: u8) -> bool {
    let tag = byte & TAG_MASK;
    tag == CONT_TAG
}

/// Selects the six payload bits of a continuation byte.
const VAL_MASK: u8 = 0b0011_1111;
/// Selects the two tag bits of a byte.
const TAG_MASK: u8 = 0b1100_0000;
/// Tag bits (`10`) that mark a continuation byte.
const CONT_TAG: u8 = 0b1000_0000;

/// Returns the length in bytes of the UTF-8 sequence introduced by `byte`.
///
/// Yields `None` for bytes that cannot start a sequence: `0x80..=0xC1`
/// (which includes every continuation byte) and `0xF5..=0xFF`.
fn utf8_char_width(byte: u8) -> Option<usize> {
    if byte <= 0x7F {
        Some(1)
    } else if byte < 0xC2 {
        None
    } else if byte <= 0xDF {
        Some(2)
    } else if byte <= 0xEF {
        Some(3)
    } else if byte <= 0xF4 {
        Some(4)
    } else {
        None
    }
}