encoding-next 0.3.0

Character encoding support for Rust
Documentation
// This is a part of encoding-next.
// Copyright (c) 2013-2015, Kang Seonghoon.
// See README.md and LICENSE.txt for details.

//! Internal utilities.

use crate::types;
use std::convert::Into;
use std::default::Default;
use std::marker::PhantomData;
use std::{char, mem, str};

/// Unchecked conversion to `char`.
///
/// The caller must pass a valid Unicode scalar value, i.e. a value for which
/// `char::from_u32` returns `Some`. This is verified only through `debug_assert!`;
/// when debug assertions are disabled an invalid value leads to undefined behavior.
pub fn as_char(ch: u32) -> char {
    debug_assert!(char::from_u32(ch).is_some());
    // SAFETY: the caller guarantees that `ch` is a valid Unicode scalar value, which is
    // exactly the precondition of `from_u32_unchecked`.
    unsafe { char::from_u32_unchecked(ch) }
}

/// An iterator over a string's characters, each paired with the `(start, end)` byte
/// offsets it spans.
pub struct StrCharIndexIterator<'r> {
    /// Running byte offset; the next yielded character starts here.
    index: usize,
    /// The characters that have not been yielded yet.
    chars: str::Chars<'r>,
}

impl<'r> Iterator for StrCharIndexIterator<'r> {
    type Item = ((usize, usize), char);

    /// Yields the next character along with its half-open byte range,
    /// or `None` once the underlying characters are exhausted.
    #[inline]
    fn next(&mut self) -> Option<Self::Item> {
        let ch = self.chars.next()?;
        let start = self.index;
        // Advance by the UTF-8 width of the character so that `index` is now its end.
        self.index += ch.len_utf8();
        Some(((start, self.index), ch))
    }
}

/// An extension trait that provides the `index_iter` method.
pub trait StrCharIndex<'r> {
    /// Returns an iterator over the characters paired with their byte offset ranges.
    fn index_iter(&self) -> StrCharIndexIterator<'r>;
}

impl<'r> StrCharIndex<'r> for &'r str {
    /// Iterates over each character of the string slice together with the byte offset
    /// range it occupies; offsets are counted from the start of the slice.
    fn index_iter(&self) -> StrCharIndexIterator<'r> {
        let chars = self.chars();
        StrCharIndexIterator { index: 0, chars }
    }
}

/// A helper struct for the stateful decoder DSL.
///
/// It bundles everything a state function needs: the input buffer with a read cursor,
/// the output sink, a slot for a pending error and some caller-supplied data.
/// `St` is the state type returned by the helper methods; it is never stored,
/// hence the `PhantomData` marker.
pub struct StatefulDecoderHelper<'a, St, Data: 'a> {
    /// The current buffer.
    pub buf: &'a [u8],
    /// The current index to the buffer, i.e. the position of the next byte to read.
    pub pos: usize,
    /// The output buffer.
    pub output: &'a mut (dyn types::StringWriter + 'a),
    /// The last codec error. The caller will later collect this.
    pub err: Option<types::CodecError>,
    /// The additional data attached for the use from transition functions.
    pub data: &'a Data,
    /// A marker for the phantom type parameter `St`.
    _marker: PhantomData<St>,
}

impl<'a, St: Default, Data> StatefulDecoderHelper<'a, St, Data> {
    /// Makes a new decoder context out of given buffer and output callback.
    #[inline(always)]
    pub fn new(
        buf: &'a [u8],
        output: &'a mut (dyn types::StringWriter + 'a),
        data: &'a Data,
    ) -> StatefulDecoderHelper<'a, St, Data> {
        StatefulDecoderHelper {
            buf,
            pos: 0,
            output,
            err: None,
            data,
            _marker: PhantomData,
        }
    }

    /// Reads one byte from the buffer if any.
    #[inline(always)]
    pub fn read(&mut self) -> Option<u8> {
        match self.buf.get(self.pos) {
            Some(&c) => {
                self.pos += 1;
                Some(c)
            }
            None => None,
        }
    }

    /// Resets back to the initial state.
    /// This should be the last expr in the rules.
    #[inline(always)]
    pub fn reset(&self) -> St {
        Default::default()
    }

    /// Writes one Unicode scalar value to the output.
    /// There is intentionally no check for `c`, so the caller should ensure that it's valid.
    /// If this is the last expr in the rules, also resets back to the initial state.
    #[inline(always)]
    #[allow(clippy::transmute_int_to_char)]
    pub fn emit(&mut self, c: u32) -> St {
        self.output.write_char(unsafe { mem::transmute(c) });
        Default::default()
    }

    /// Writes a Unicode string to the output.
    /// If this is the last expr in the rules, also resets back to the initial state.
    #[inline(always)]
    pub fn emit_str(&mut self, s: &str) -> St {
        self.output.write_str(s);
        Default::default()
    }

    /// Issues a codec error with given message at the current position.
    /// If this is the last expr in the rules, also resets back to the initial state.
    #[inline(always)]
    pub fn err(&mut self, msg: &'static str) -> St {
        self.err = Some(types::CodecError {
            upto: self.pos as isize,
            cause: msg.into(),
        });
        Default::default()
    }

    /// Issues a codec error with given message at the current position minus `backup` bytes.
    /// If this is the last expr in the rules, also resets back to the initial state.
    ///
    /// This should be used to implement "prepending byte to the stream" in the Encoding spec,
    /// which corresponds to `ctx.backup_and_err(1, ...)`.
    #[inline(always)]
    pub fn backup_and_err(&mut self, backup: usize, msg: &'static str) -> St {
        let upto = self.pos as isize - backup as isize;
        self.err = Some(types::CodecError {
            upto,
            cause: msg.into(),
        });
        Default::default()
    }
}

/// Defines a stateful decoder from given state machine.
///
/// The invocation names a module (`module NAME;`), optionally lists `internal` items and
/// then describes the states in up to three groups: exactly one `initial` state, any number
/// of `checkpoint` states and any number of `transient` states. Each state takes a context
/// binding (`name: Context`) plus optional extra arguments, and consists of `case` rules
/// matching the next input byte and a `final` rule evaluated when the input ends.
///
/// The generated module contains:
///
/// - `State`, an enum with one variant per state (the initial state is its `Default`);
/// - `internal`, holding the `Context` type alias and the `internal` items;
/// - `start` and `transient`, holding one function per state;
/// - `raw_feed`, which runs the machine over an input slice and returns the resulting
///   state, the number of processed bytes and an optional error. The processed count
///   is only advanced when a step ends in the initial or a checkpoint state without error;
/// - `raw_finish`, which evaluates the `final` rule of the given state.
///
/// The second, simplified form has no checkpoint states and no explicit `final` rules:
/// the initial state finishes with `reset()` and every transient state finishes with
/// `err("incomplete sequence")`.
macro_rules! stateful_decoder {
    (
        module $stmod:ident; // should be unique from other existing identifiers
        $(internal $item:item)* // will only be visible from state functions
    initial:
        state $inist:ident($inictx:ident: Context) {
            $(case $($inilhs:pat),+ => $($inirhs:expr),+;)+
            final => $($inifin:expr),+;
        }
    checkpoint:
        $(state $ckst:ident($ckctx:ident: Context $(, $ckarg:ident: $ckty:ty)*) {
            $(case $($cklhs:pat),+ => $($ckrhs:expr),+;)+
            final => $($ckfin:expr),+;
        })*
    transient:
        $(state $st:ident($ctx:ident: Context $(, $arg:ident: $ty:ty)*) {
            $(case $($lhs:pat),+ => $($rhs:expr),+;)+
            final => $($fin:expr),+;
        })*
    ) => (
        #[allow(non_snake_case)]
        #[allow(clippy::upper_case_acronyms)]
        mod $stmod {
            pub use self::State::*;

            #[derive(PartialEq, Eq, Clone, Copy)]
            pub enum State {
                $inist,
                $(
                    $ckst(() $(, $ckty)*),
                )*
                $(
                    $st(() $(, $ty)*),
                )*
            }

            impl ::std::default::Default for State {
                #[inline(always)] fn default() -> State { $inist }
            }

            pub mod internal {
                pub type Context<'a, Data> = crate::util::StatefulDecoderHelper<'a, super::State, Data>;

                $($item)*
            }

            pub mod start {
                use super::internal::*;

                #[inline(always)]
                pub fn $inist<T>($inictx: &mut Context<T>) -> super::State {
                    // prohibits all kind of recursions, including self-recursions
                    #[allow(unused_imports)] use super::transient::*;
                    match $inictx.read() {
                        None => super::$inist,
                        Some(c) => match c { $($($inilhs)|+ => { $($inirhs);+ })+ },
                    }
                }

                $(
                    #[inline(always)]
                    pub fn $ckst<T>($ckctx: &mut Context<T> $(, $ckarg: $ckty)*) -> super::State {
                        // prohibits all kind of recursions, including self-recursions
                        #[allow(unused_imports)] use super::transient::*;
                        match $ckctx.read() {
                            None => super::$ckst(() $(, $ckarg)*),
                            Some(c) => match c { $($($cklhs)|+ => { $($ckrhs);+ })+ },
                        }
                    }
                )*
            }

            pub mod transient {
                use super::internal::*;

                #[inline(always)]
                #[allow(dead_code)]
                pub fn $inist<T>(_: &mut Context<T>) -> super::State {
                    super::$inist // do not recurse further
                }

                $(
                    #[inline(always)]
                    #[allow(dead_code)]
                    pub fn $ckst<T>(_: &mut Context<T> $(, $ckarg: $ckty)*) -> super::State {
                        super::$ckst(() $(, $ckarg)*) // do not recurse further
                    }
                )*

                $(
                    #[inline(always)]
                    pub fn $st<T>($ctx: &mut Context<T> $(, $arg: $ty)*) -> super::State {
                        // read through this state's own context binding, so that a transient
                        // state is free to name its context differently from the initial state
                        match $ctx.read() {
                            None => super::$st(() $(, $arg)*),
                            Some(c) => match c { $($($lhs)|+ => { $($rhs);+ })+ },
                        }
                    }
                )*
            }

            pub fn raw_feed<T>(mut st: State, input: &[u8], output: &mut dyn (crate::types::StringWriter),
                               data: &T) -> (State, usize, Option<crate::types::CodecError>) {
                output.writer_hint(input.len());

                let mut ctx = crate::util::StatefulDecoderHelper::new(input, output, data);
                let mut processed = 0;

                // the first step resumes from whatever state the previous call stopped at,
                // which is the only place where a transient state can be entered directly
                let st_ = match st {
                    $inist => $inist,
                    $(
                        $ckst(() $(, $ckarg)*) => start::$ckst(&mut ctx $(, $ckarg)*),
                    )*
                    $(
                        $st(() $(, $arg)*) => transient::$st(&mut ctx $(, $arg)*),
                    )*
                };
                match (ctx.err.take(), st_) {
                    (None, $inist) $(| (None, $ckst(..)))* => { st = st_; processed = ctx.pos; }
                    // XXX splitting the match case improves the performance somehow, but why?
                    (None, _) => { return (st_, processed, None); }
                    (Some(err), _) => { return (st_, processed, Some(err)); }
                }

                // from here on `st` is always the initial state or a checkpoint state
                while ctx.pos < ctx.buf.len() {
                    let st_ = match st {
                        $inist => start::$inist(&mut ctx),
                        $(
                            $ckst(() $(, $ckarg)*) => start::$ckst(&mut ctx $(, $ckarg)*),
                        )*
                        _ => unreachable!(),
                    };
                    match (ctx.err.take(), st_) {
                        (None, $inist) $(| (None, $ckst(..)))* => { st = st_; processed = ctx.pos; }
                        // XXX splitting the match case improves the performance somehow, but why?
                        (None, _) => { return (st_, processed, None); }
                        (Some(err), _) => { return (st_, processed, Some(err)); }
                    }
                }

                (st, processed, None)
            }

            pub fn raw_finish<T>(mut st: State, output: &mut dyn (crate::types::StringWriter),
                                 data: &T) -> (State, Option<crate::types::CodecError>) {
                #![allow(unused_mut, unused_variables)]
                // the `final` rules run against an empty buffer
                let mut ctx = crate::util::StatefulDecoderHelper::new(&[], output, data);
                let st = match ::std::mem::replace(&mut st, $inist) {
                    $inist => { let $inictx = &mut ctx; $($inifin);+ },
                    $(
                        $ckst(() $(, $ckarg)*) => { let $ckctx = &mut ctx; $($ckfin);+ },
                    )*
                    $(
                        $st(() $(, $arg)*) => { let $ctx = &mut ctx; $($fin);+ },
                    )*
                };
                (st, ctx.err.take())
            }
        }
    );

    // simplified rules: no checkpoint and default final actions
    (
        module $stmod:ident; // should be unique from other existing identifiers
        $(internal $item:item)* // will only be visible from state functions
    initial:
        state $inist:ident($inictx:ident: Context) {
            $(case $($inilhs:pat),+ => $($inirhs:expr),+;)+
        }
    transient:
        $(state $st:ident($ctx:ident: Context $(, $arg:ident: $ty:ty)*) {
            $(case $($lhs:pat),+ => $($rhs:expr),+;)+
        })*
    ) => (
        stateful_decoder! {
            module $stmod;
            $(internal $item)*
        initial:
            state $inist($inictx: Context) {
                $(case $($inilhs),+ => $($inirhs),+;)+
                final => $inictx.reset();
            }
        checkpoint:
        transient:
            $(state $st($ctx: Context $(, $arg: $ty)*) {
                $(case $($lhs),+ => $($rhs),+;)+
                final => $ctx.err("incomplete sequence");
            })*
        }
    );
}