use std::alloc::Layout;
use std::fmt::Debug;
use std::marker::PhantomData;
use std::ops::Range;
use tract_data::internal::*;
use crate::mmm::{EagerPackedInput, MMMInput};
#[derive(Clone, Debug, Eq, PartialEq, Hash)]
pub struct Packer {
    pub r: usize,
    alignment: usize,
    end_padding_record: usize,
}
impl Packer {
    pub fn new(nr: usize, alignment: usize, end_padding_record: usize) -> Packer {
        Packer { r: nr, alignment, end_padding_record }
    }
    #[inline]
    pub fn alignment(&self) -> usize {
        self.alignment
    }
    #[inline]
    pub fn panel_width(&self) -> usize {
        self.r
    }
    #[inline]
    pub fn len<D: DimLike>(&self, k: D, n: D) -> D {
        n.divceil(self.r) * self.single_panel_len(k)
    }
    #[inline]
    pub fn single_panel_len<D: DimLike>(&self, k: D) -> D {
        ((k + self.end_padding_record) * self.r).divceil(self.alignment()) * self.alignment()
    }
    #[inline]
    pub fn single_panel_layout(&self, k: usize, item_size: usize) -> Layout {
        assert!(k > 0);
        Layout::from_size_align(self.single_panel_len(k) * item_size, self.alignment()).unwrap()
    }
    pub fn pack_tensor(
        &self,
        t: &Tensor,
        k_axis: usize,
        mn_axis: usize,
    ) -> TractResult<Box<dyn MMMInput>> {
        let k = t.shape()[k_axis];
        let mn = t.shape()[mn_axis];
        let packed_len = self.len(k, mn);
        let panel_len = self.single_panel_len(k);
        let panel_bytes = panel_len * t.datum_type().size_of();
        let strides = t.strides();
        unsafe {
            let mut packed =
                Tensor::uninitialized_aligned_dt(t.datum_type(), &[packed_len], self.alignment())?;
            dispatch_copy!(Self::pack_t(t.datum_type())(
                self,
                packed.as_ptr_mut_unchecked(),
                t.as_ptr_unchecked(),
                mn,
                strides[k_axis],
                strides[mn_axis],
                0..k,
                0..mn
            ));
            Ok(Box::new(EagerPackedInput { packed, panel_bytes, mn, k, r: self.r }))
        }
    }
    pub fn pack_tensor_view(
        &self,
        t: &TensorView,
        k_axis: usize,
        mn_axis: usize,
    ) -> TractResult<Box<dyn MMMInput>> {
        let k = t.shape()[k_axis];
        let mn = t.shape()[mn_axis];
        let packed_len = self.len(k, mn);
        let panel_len = self.single_panel_len(k);
        let panel_bytes = panel_len * t.datum_type().size_of();
        let strides = t.strides();
        unsafe {
            let mut packed =
                Tensor::uninitialized_aligned_dt(t.datum_type(), &[packed_len], self.alignment())?;
            dispatch_copy!(Self::pack_t(t.datum_type())(
                self,
                packed.as_ptr_mut_unchecked(),
                t.as_ptr_unchecked(),
                mn,
                strides[k_axis],
                strides[mn_axis],
                0..k,
                0..mn
            ));
            Ok(Box::new(EagerPackedInput { packed, panel_bytes, mn, k, r: self.r }))
        }
    }
    pub unsafe fn pack<'a, 'b>(
        &self,
        pb: impl std::borrow::BorrowMut<TensorView<'a>>,
        b: impl std::borrow::Borrow<TensorView<'b>>,
        k_axis: usize,
        mn_axis: usize,
    ) {
        let k = b.borrow().shape()[k_axis];
        let mn = b.borrow().shape()[mn_axis];
        self.pack_segment(pb, b, k_axis, mn_axis, 0..k, 0..mn);
    }
    #[allow(clippy::too_many_arguments)]
    #[rustfmt::skip]
    pub unsafe fn pack_t<T: Datum + Copy>(
        &self,
        pb: *mut T,
        b: *const T,
        mn: usize,
        k_stride: isize,
        mn_stride: isize,
        k_range: Range<usize>,
        mn_range: Range<usize>,
    ) {
        if self.r == 1 && k_stride == 1 && mn == 1 {
            pb.copy_from_nonoverlapping(b.add(k_range.start), k_range.len())
        } else if mn_stride == 1 {
            let size_of = T::datum_type().size_of();
            let rbytes = self.r * size_of;
            let mn_valid_end = mn_range.end.min(mn);
            let mn_range_bytes = mn_range.start * size_of..mn_valid_end * size_of;
            let k_stride_bytes = k_stride * size_of as isize;
            let bb = b as *const u8;
            let pbb = pb as *mut u8;
            let panel_len = self.single_panel_len(k_range.len()) * size_of;
            match rbytes {
                16 => pack_mn_major::<[u8; 16]>(bb, pbb, panel_len, k_stride_bytes, mn_range_bytes, k_range),
                24 => pack_mn_major::<[u8; 24]>(bb, pbb, panel_len, k_stride_bytes, mn_range_bytes, k_range),
                32 => pack_mn_major::<[u8; 32]>(bb, pbb, panel_len, k_stride_bytes, mn_range_bytes, k_range),
                48 => pack_mn_major::<[u8; 48]>(bb, pbb, panel_len, k_stride_bytes, mn_range_bytes, k_range),
                64 => pack_mn_major::<[u8; 64]>(bb, pbb, panel_len, k_stride_bytes, mn_range_bytes, k_range),
                _ => {
                    let mut packer = self.write_with_k_outer(pb, k_range.len(), mn_range.len());
                    for k in k_range {
                        for x in mn_range.start..mn_valid_end {
                            packer.write(*b.offset(x as isize + k_stride * k as isize))
                        }
                        for _x in mn_valid_end..mn_range.end {
                            packer.write(T::default())
                        }
                    }
                }
            }
        } else if k_stride == 1 {
            let mut packer = self.write_with_k_inner(pb, k_range.len(), mn);
            let mn_valid_end = mn_range.end.min(mn);
            for x in mn_range.start..mn_valid_end {
                for k in k_range.clone() {
                    packer.write(*b.offset(x as isize * mn_stride + k as isize))
                }
            }
            } else {
            let mut packer = self.write_with_k_outer(pb, k_range.len(), mn);
            let mn_valid_end = mn_range.end.min(mn);
            for k in k_range {
                for x in mn_range.start..mn_valid_end {
                    packer.write(*b.offset(x as isize * mn_stride + k_stride * k as isize))
                }
                for _x in mn_valid_end..mn_range.end {
                    packer.write(T::default())
                }
            }
        }
    }
    #[inline]
    pub unsafe fn pack_segment<'a, 'b>(
        &self,
        mut pb: impl std::borrow::BorrowMut<TensorView<'a>>,
        b: impl std::borrow::Borrow<TensorView<'b>>,
        k_axis: usize,
        mn_axis: usize,
        k_range: Range<usize>,
        mn_range: Range<usize>,
    ) {
        debug_assert!(pb.borrow().len() >= self.len(k_range.len(), mn_range.len()));
        let pb = pb.borrow_mut();
        let b = b.borrow();
        let dt = pb.datum_type();
        dispatch_copy!(Self::pack_t(dt)(
            self,
            pb.as_ptr_mut_unchecked(),
            b.as_ptr_unchecked(),
            b.shape()[mn_axis],
            b.strides()[k_axis],
            b.strides()[mn_axis],
            k_range,
            mn_range
        ));
    }
    pub fn write_with_k_outer<'p, T: Copy + Debug>(
        &self,
        pb: *mut T,
        k: usize,
        mn: usize,
    ) -> KOutWriter<'p, T> {
        KOutWriter::new(pb, self.r, self.single_panel_len(k), mn, k)
    }
    pub fn write_single_panel_with_k_outer<'p, T: Copy + Debug>(
        &self,
        pb: *mut T,
    ) -> KOutSinglePanelWriter<'p, T> {
        KOutSinglePanelWriter::new(pb)
    }
    pub fn write_with_k_inner<'p, T: Copy + Debug>(
        &self,
        pb: *mut T,
        k: usize,
        mn: usize,
    ) -> KInWriter<'p, T> {
        let panel_len = self.single_panel_len(k);
        KInWriter::new(pb, panel_len, self.r, mn, k)
    }
}
pub trait PackingWriter<T: Copy> {
    fn write(&mut self, t: T);
}
#[derive(Debug)]
pub struct KOutSinglePanelWriter<'p, T>
where
    T: Copy + std::fmt::Debug,
{
    ptr: *mut T,
    _phantom: PhantomData<&'p T>,
}
impl<'p, T> KOutSinglePanelWriter<'p, T>
where
    T: Copy + std::fmt::Debug,
{
    pub fn new(ptr: *mut T) -> KOutSinglePanelWriter<'p, T> {
        KOutSinglePanelWriter { ptr, _phantom: PhantomData }
    }
}
impl<'p, T> PackingWriter<T> for KOutSinglePanelWriter<'p, T>
where
    T: Copy + std::fmt::Debug,
{
    #[inline(always)]
    fn write(&mut self, t: T) {
        unsafe {
            *self.ptr = t;
            self.ptr = self.ptr.offset(1);
        }
    }
}
#[derive(Debug)]
pub struct KOutWriter<'p, T>
where
    T: Copy + std::fmt::Debug,
{
    ptr: *mut T,
    panels: usize,
    panel_width: usize,
    last_panel_width: usize,
    remain: usize,
    current_panel: usize,
    next_panel: isize,
    next_lane: isize,
    _phantom: PhantomData<&'p T>,
}
impl<'p, T> KOutWriter<'p, T>
where
    T: Copy + std::fmt::Debug,
{
    pub fn new(
        ptr: *mut T,
        panel_width: usize,
        panel_len: usize,
        mn: usize,
        _k: usize,
    ) -> KOutWriter<'p, T> {
        let panels = (mn + panel_width - 1) / panel_width;
        let last_panel_width = mn - (panels - 1) * panel_width;
        KOutWriter {
            ptr,
            panels,
            panel_width,
            last_panel_width,
            remain: if panels > 1 { panel_width } else { last_panel_width },
            current_panel: 0,
            next_panel: (panel_len - panel_width) as isize,
            next_lane: (panel_width - last_panel_width) as isize
                - (panel_len * (panels - 1)) as isize,
            _phantom: PhantomData,
        }
    }
}
impl<'p, T> PackingWriter<T> for KOutWriter<'p, T>
where
    T: Copy + std::fmt::Debug,
{
    #[inline(always)]
    fn write(&mut self, t: T) {
        unsafe {
            *self.ptr = t;
            self.remain -= 1;
            self.ptr = self.ptr.offset(1);
            if self.remain == 0 {
                self.current_panel += 1;
                if self.current_panel == self.panels {
                    self.ptr = self.ptr.offset(self.next_lane);
                    self.current_panel = 0;
                } else {
                    self.ptr = self.ptr.offset(self.next_panel);
                }
                if self.current_panel == self.panels - 1 {
                    self.remain = self.last_panel_width;
                } else {
                    self.remain = self.panel_width;
                }
            }
        }
    }
}
#[derive(Debug)]
pub struct KInWriter<'p, T>
where
    T: Copy + Debug,
{
    ptr: *mut T,
    k: usize,
    panels: usize,
    panel_width: usize,
    last_panel_width: usize,
    remain_on_k: usize,
    remain_on_mn: usize,
    current_panel: usize,
    next_mn_offset: isize,
    next_panel_offset: isize,
    _phantom: PhantomData<&'p T>,
}
impl<'p, T> KInWriter<'p, T>
where
    T: Copy + Debug,
{
    pub fn new(
        ptr: *mut T,
        panel_len: usize,
        panel_width: usize,
        mn: usize,
        k: usize,
    ) -> KInWriter<'p, T> {
        let panels = (mn + panel_width - 1) / panel_width;
        let last_panel_width = mn - (panels - 1) * panel_width;
        KInWriter {
            ptr,
            k,
            panels,
            panel_width,
            last_panel_width,
            remain_on_k: k,
            remain_on_mn: if panels == 1 { last_panel_width } else { panel_width },
            current_panel: 0,
            next_mn_offset: 1 - (k * panel_width) as isize,
            next_panel_offset: panel_len as isize - (k * panel_width + panel_width - 1) as isize,
            _phantom: PhantomData,
        }
    }
}
impl<'p, T> PackingWriter<T> for KInWriter<'p, T>
where
    T: Copy + std::fmt::Debug,
{
    #[inline(always)]
    fn write(&mut self, t: T) {
        unsafe {
            *self.ptr = t;
            self.remain_on_k -= 1;
            self.ptr = self.ptr.add(self.panel_width);
            if self.remain_on_k == 0 {
                self.remain_on_k = self.k;
                self.remain_on_mn -= 1;
                if self.remain_on_mn > 0 {
                    self.ptr = self.ptr.offset(self.next_mn_offset);
                } else {
                    self.ptr = self.ptr.offset(self.next_panel_offset);
                    self.current_panel += 1;
                    if self.current_panel == self.panels - 1 {
                        self.remain_on_mn = self.last_panel_width;
                    } else {
                        self.remain_on_mn = self.panel_width;
                    }
                }
            }
        }
    }
}
#[inline(never)]
unsafe fn pack_mn_major<Chunk: Copy>(
    b: *const u8,
    packed: *mut u8,
    panel_len: usize,
    k_stride_bytes: isize,
    mn_range_bytes: Range<usize>,
    k_range: Range<usize>,
) {
    let mnr = std::mem::size_of::<Chunk>();
    let full_panes = mn_range_bytes.len() / mnr;
    let partial_pane = mn_range_bytes.len() % mnr;
    for k in 0..k_range.len() {
        let mut p_row = packed.add(k * mnr);
        let mut b_row =
            b.offset((k_range.start + k) as isize * k_stride_bytes + mn_range_bytes.start as isize);
        for _ in 0..full_panes {
            p_row.copy_from_nonoverlapping(b_row, mnr);
            p_row = p_row.add(panel_len);
            b_row = b_row.add(mnr);
        }
        if partial_pane > 0 {
            p_row.copy_from_nonoverlapping(b_row, partial_pane);
        }
    }
}
#[cfg(test)]
mod test {
    use std::ops::Range;
    use proptest::prelude::*;
    use tract_data::internal::num_integer::Integer;
    use tract_data::internal::tract_ndarray::Zip;
    use tract_data::internal::*;
    use tract_ndarray::prelude::*;
    #[derive(Debug)]
    struct PackProblem {
        k: usize,
        mn: usize,
        is_a: bool,
        r: usize,
        k_range: Range<usize>,
        mn_range: Range<usize>,
        align_panel: usize,
    }
    impl PackProblem {
        fn input(&self) -> Array2<u32> {
            let shape = if self.is_a { (self.mn, self.k) } else { (self.k, self.mn) };
            let data = (0..(self.k * self.mn) as u32).collect();
            Array2::from_shape_vec(shape, data).unwrap()
        }
        fn packer(&self) -> Array2<u32> {
            let panels = self.mn_range.len().divceil(self.r);
            let packer = super::Packer::new(self.r, self.align_panel, 0);
            let input = self.input().into_tensor();
            let panel_len = packer.single_panel_len(self.k_range.len());
            let mut output =
                Tensor::zero::<u32>(&[packer.len(self.k_range.len(), self.mn_range.len())])
                    .unwrap();
            unsafe {
                packer.pack_segment(
                    output.view_mut(),
                    input.view(),
                    self.is_a as usize,
                    !self.is_a as usize,
                    self.k_range.clone(),
                    self.mn_range.clone(),
                )
            };
            output.into_array::<u32>().unwrap().into_shape((panels, panel_len)).unwrap()
        }
        fn reference(&self) -> Array2<u32> {
            let input = self.input();
            let panels = self.mn_range.len().divceil(self.r);
            let len = Integer::next_multiple_of(&(self.k_range.len() * self.r), &self.align_panel);
            Array2::from_shape_fn([panels, len], |(panel, z)| {
                let k = z / self.r;
                let x = z % self.r;
                let mn = panel * self.r + x + self.mn_range.start;
                let k = k + self.k_range.start;
                let coords = if self.is_a { (mn, k) } else { (k, mn) };
                *input.get(coords).unwrap_or(&0)
            })
        }
        fn valid(&self) -> Array2<bool> {
            let panels = self.mn_range.len().divceil(self.r);
            let len = Integer::next_multiple_of(&(self.k_range.len() * self.r), &self.align_panel);
            Array2::from_shape_fn([panels, len], |(panel, z)| {
                let k = z / self.r;
                let x = z % self.r;
                let k = k + self.k_range.start;
                let mn = panel * self.r + x + self.mn_range.start;
                k < self.k_range.end.min(self.k) && mn < self.mn_range.end.min(self.mn)
            })
        }
        fn check(&self) {
            let mut packer = self.packer();
            let mut reference = self.reference();
            let valid = self.valid();
            Zip::from(&mut packer).and(&valid).for_each(|p, v| *p = if *v { *p } else { -1 as _ });
            Zip::from(&mut reference)
                .and(&valid)
                .for_each(|p, v| *p = if *v { *p } else { -1 as _ });
            assert_eq!(packer, reference);
        }
    }
    impl Arbitrary for PackProblem {
        type Parameters = ();
        type Strategy = BoxedStrategy<PackProblem>;
        fn arbitrary_with(_args: ()) -> Self::Strategy {
            (any::<bool>(), 1usize..9, 1usize..20, 1usize..20)
                .prop_flat_map(|(is_a, r, k, mn)| {
                    (
                        Just((is_a, r, k, mn)),
                        sub_range_strat(0..k),
                        sub_range_strat(0..mn),
                        1usize..5,
                    )
                })
                .prop_map(|((is_a, r, k, mn), k_range, mn_range, align_panel)| PackProblem {
                    k,
                    mn,
                    is_a,
                    r,
                    k_range,
                    mn_range,
                    align_panel,
                })
                .boxed()
        }
    }
    fn sub_range_strat(range: Range<usize>) -> BoxedStrategy<Range<usize>> {
        (0..range.len())
            .prop_flat_map(|cropped| (Just(cropped), 0..=cropped))
            .prop_map(move |(cropped, left)| range.start + left..range.end - (cropped - left))
            .boxed()
    }
    proptest::proptest! {
        #[test]
        fn prop(pb in any::<PackProblem>()) {
            pb.check();
        }
        #[test]
        fn subrange_prop(_range in sub_range_strat(0..20)) {
        }
    }
    #[test]
    fn simple_b_1() {
        PackProblem {
            k: 2,
            mn: 1,
            is_a: false,
            r: 1,
            k_range: 0..2,
            mn_range: 0..1,
            align_panel: 1,
        }
        .check();
    }
    #[test]
    fn simple_b_2() {
        PackProblem {
            k: 2,
            mn: 2,
            is_a: false,
            r: 1,
            k_range: 0..2,
            mn_range: 0..2,
            align_panel: 1,
        }
        .check()
    }
    #[test]
    fn simple_b_3() {
        PackProblem {
            k: 2,
            mn: 1,
            is_a: false,
            r: 4,
            k_range: 0..2,
            mn_range: 0..1,
            align_panel: 1,
        }
        .check();
    }
    #[test]
    fn simple_b_4() {
        PackProblem {
            k: 1,
            mn: 3,
            is_a: false,
            r: 2,
            k_range: 0..1,
            mn_range: 0..3,
            align_panel: 1,
        }
        .check();
    }
    #[test]
    fn simple_a_1() {
        PackProblem {
            k: 2,
            mn: 2,
            is_a: true,
            r: 1,
            k_range: 0..2,
            mn_range: 0..2,
            align_panel: 1,
        }
        .check();
    }
    #[test]
    fn simple_a_2() {
        PackProblem {
            k: 2,
            mn: 3,
            is_a: true,
            r: 2,
            k_range: 0..2,
            mn_range: 0..3,
            align_panel: 1,
        }
        .check();
    }
    #[test]
    fn range_k_0() {
        PackProblem {
            k: 2,
            mn: 1,
            is_a: false,
            r: 1,
            k_range: 1..2,
            mn_range: 0..1,
            align_panel: 1,
        }
        .check();
    }
    #[test]
    fn range_k_1() {
        PackProblem {
            k: 2,
            mn: 2,
            is_a: false,
            r: 1,
            k_range: 0..2,
            mn_range: 0..1,
            align_panel: 1,
        }
        .check();
    }
    #[test]
    fn range_k_2() {
        PackProblem {
            k: 2,
            mn: 1,
            is_a: false,
            r: 6,
            k_range: 1..2,
            mn_range: 0..1,
            align_panel: 1,
        }
        .check();
    }
    #[test]
    fn range_mn_0() {
        PackProblem {
            k: 1,
            mn: 2,
            is_a: false,
            r: 2,
            k_range: 0..1,
            mn_range: 0..1,
            align_panel: 1,
        }
        .check();
    }
    #[test]
    fn range_b_4() {
        PackProblem {
            k: 1,
            mn: 2,
            is_a: false,
            r: 6,
            k_range: 0..1,
            mn_range: 1..2,
            align_panel: 1,
        }
        .check();
    }
    #[test]
    fn range_b_5() {
        PackProblem {
            k: 1,
            mn: 7,
            is_a: false,
            r: 6,
            k_range: 0..1,
            mn_range: 1..7,
            align_panel: 1,
        }
        .check();
    }
    #[test]
    fn align_a_1() {
        PackProblem {
            k: 2,
            mn: 2,
            is_a: true,
            r: 1,
            k_range: 0..1,
            mn_range: 0..2,
            align_panel: 2,
        }
        .check();
    }
    #[test]
    fn align_b_1() {
        PackProblem {
            k: 1,
            mn: 1,
            is_a: false,
            r: 1,
            k_range: 0..1,
            mn_range: 0..1,
            align_panel: 2,
        }
        .check();
    }
    #[test]
    fn align_b_2() {
        PackProblem {
            k: 3,
            mn: 1,
            is_a: false,
            r: 1,
            k_range: 0..3,
            mn_range: 0..1,
            align_panel: 2,
        }
        .check();
    }
    #[test]
    fn align_b_3() {
        PackProblem {
            k: 1,
            mn: 1,
            is_a: false,
            r: 3,
            k_range: 0..1,
            mn_range: 0..1,
            align_panel: 2,
        }
        .check();
    }
    #[test]
    fn align_b_4() {
        PackProblem {
            k: 2,
            mn: 1,
            is_a: false,
            r: 1,
            k_range: 0..1,
            mn_range: 0..1,
            align_panel: 2,
        }
        .check();
    }
    #[test]
    fn align_b_5() {
        PackProblem {
            k: 1,
            mn: 5,
            is_a: false,
            r: 4,
            k_range: 0..1,
            mn_range: 0..5,
            align_panel: 3,
        }
        .check();
    }
}