use ndarray::{Array1, ArrayView1, ArrayViewMut1};

use crate::hogwild::Hogwild;
use crate::idx::WordIdx;
use crate::loss::log_logistic_loss;
use crate::train_model::{NegativeSamples, TrainIterFrom, TrainModel, Trainer};
use crate::vec_simd::scaled_add;

/// Stochastic gradient descent
///
/// This data type applies stochastic gradient descent on sentences.
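///
/// # Examples
///
/// A rough usage sketch: the construction of `model` and the shape of the
/// sentences depend on the concrete `Trainer`, so `model`, `sentences`,
/// and `lr` below are illustrative placeholders rather than part of this
/// crate's API.
///
/// ```ignore
/// let mut sgd = SGD::new(model);
/// for sentence in &sentences {
///     // A learning rate that decays over the course of training is
///     // commonly used; a fixed placeholder is passed here for brevity.
///     sgd.update_sentence(sentence, lr);
/// }
/// println!("average loss: {}", sgd.train_loss());
/// let trained = sgd.into_model();
/// ```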
#[derive(Clone)]
pub struct SGD<T> {
    loss: Hogwild<f32>,
    model: TrainModel<T>,
    n_examples: Hogwild<usize>,
    n_tokens_processed: Hogwild<usize>,
    sgd_impl: NegativeSamplingSGD,
}

impl<T> SGD<T>
where
    T: Trainer,
{
    /// Consume the SGD, returning the training model.
    pub fn into_model(self) -> TrainModel<T> {
        self.model
    }

    /// Construct a new SGD instance.
    pub fn new(model: TrainModel<T>) -> Self {
        let sgd_impl = NegativeSamplingSGD::new(model.config().negative_samples as usize);

        SGD {
            loss: Hogwild::default(),
            model,
            n_examples: Hogwild::default(),
            n_tokens_processed: Hogwild::default(),
            sgd_impl,
        }
    }

    /// Get the training model associated with this SGD.
    pub fn model(&self) -> &TrainModel<T> {
        &self.model
    }

    /// Get the number of tokens that are processed by this SGD.
    pub fn n_tokens_processed(&self) -> usize {
        *self.n_tokens_processed
    }

    /// Get the average training loss of this SGD.
    ///
    /// This returns the average training loss over all instances seen by
    /// this SGD instance since its construction.
    pub fn train_loss(&self) -> f32 {
        *self.loss / *self.n_examples as f32
    }

    /// Update the model parameters using the given sentence.
    ///
    /// This applies a gradient descent step on the sentence, with the given
    /// learning rate.
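    ///
    /// For example (illustrative only: the accepted sentence type is
    /// whatever the concrete trainer's `TrainIterFrom` implementation
    /// accepts, and `0.025` is just a placeholder learning rate):
    ///
    /// ```ignore
    /// sgd.update_sentence(&sentence, 0.025);
    /// ```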
    pub fn update_sentence<'b, S>(&mut self, sentence: &S, lr: f32)
    where
        S: ?Sized,
        T: TrainIterFrom<'b, S> + Trainer + NegativeSamples,
        for<'a> &'a T::Focus: IntoIterator<Item = u64>,
        T::Focus: WordIdx,
    {
        for (focus, contexts) in self.model.trainer().train_iter_from(sentence) {
            // Update the parameters for the focus token i and each of its
            // context tokens j.
            let input_embed = self.model.mean_input_embedding(&focus);

            for context in contexts {
                *self.loss += self.sgd_impl.sgd_step(
                    &mut self.model,
                    (&focus).into_iter(),
                    input_embed.view(),
                    context,
                    lr,
                );
                *self.n_examples += 1;
            }
            *self.n_tokens_processed += 1;
        }
    }
}

/// Log-logistic loss SGD with negative sampling.
///
/// This type implements gradient descent for log-logistic loss with negative
/// sampling (Mikolov et al., 2013).
///
/// In this approach, the training of word embeddings is framed as a
/// prediction task. The word vectors should be parametrized such that
/// words that co-occur with a given input get an estimated probability
/// close to 1.0, whereas words that do not co-occur with the input get
/// an estimated probability close to 0.0.
///
/// The probability is computed from the inner product of two word
/// vectors by applying the logistic function. The loss is the negative
/// log likelihood.
///
/// Since vocabularies are typically large, it is not feasible to update
/// the vectors of every word that does not co-occur with the input in
/// each step. Instead, such negatives are sampled, weighted by word
/// frequency.
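///
/// As a minimal sketch of the loss being optimized (plain Rust, not using
/// any of this crate's types): the probability of a pair is the logistic
/// function applied to the inner product of the two vectors, and the loss
/// is its negative log likelihood.
///
/// ```
/// let dot: f32 = 2.0; // inner product u · v of an input/output pair
/// let p = 1.0 / (1.0 + (-dot).exp()); // logistic probability
/// let positive_loss = -p.ln(); // loss if the pair actually co-occurs
/// let negative_loss = -(1.0 - p).ln(); // loss for a negative sample
/// assert!(positive_loss < negative_loss);
/// ```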
#[derive(Clone)]
pub struct NegativeSamplingSGD {
    negative_samples: usize,
}

impl NegativeSamplingSGD {
    /// Create a new loss function.
    pub fn new(negative_samples: usize) -> Self {
        NegativeSamplingSGD { negative_samples }
    }

    /// Perform a step of gradient descent.
    ///
    /// This method will estimate the probability of `output` and randomly
    /// chosen negative samples, given the input. It will then update the
    /// embeddings of the positive/negative outputs and the input (and its
    /// subwords).
    ///
    /// The function returns the sum of the losses over the positive
    /// example and the negative samples.
    pub fn sgd_step<T>(
        &mut self,
        model: &mut TrainModel<T>,
        input: impl IntoIterator<Item = u64>,
        input_embed: ArrayView1<f32>,
        output: usize,
        lr: f32,
    ) -> f32
    where
        T: NegativeSamples,
    {
        let mut loss = 0.0;
        let mut input_delta = Array1::zeros(input_embed.shape()[0]);

        // Update the output embedding of the positive instance.
        loss += self.update_output(
            model,
            input_embed.view(),
            input_delta.view_mut(),
            output,
            true,
            lr,
        );

        // Pick the negative examples and update their output embeddings.
        loss += self.negative_samples(model, input_embed, input_delta.view_mut(), output, lr);

        // Update the input embeddings with the accumulated gradient.
        for idx in input {
            let input_embed = model.input_embedding_mut(idx as usize);
            scaled_add(input_embed, input_delta.view(), 1.0);
        }

        loss
    }

    /// Pick, predict and update negative samples.
    fn negative_samples<T>(
        &mut self,
        model: &mut TrainModel<T>,
        input_embed: ArrayView1<f32>,
        mut input_delta: ArrayViewMut1<f32>,
        output: usize,
        lr: f32,
    ) -> f32
    where
        T: NegativeSamples,
    {
        let mut loss = 0f32;

        for _ in 0..self.negative_samples {
            let negative = model.trainer().negative_sample(output);
            // Update input and output for this negative sample.
            loss += self.update_output(
                model,
                input_embed.view(),
                input_delta.view_mut(),
                negative,
                false,
                lr,
            );
        }

        loss
    }

    /// Update an output embedding.
    ///
    /// This also accumulates an update for the input embedding.
    ///
    /// The method returns the loss for predicting the output.
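    ///
    /// Assuming `log_logistic_loss` follows the usual log-logistic
    /// formulation, the partial gradient is `1 - σ(u · v)` for a positive
    /// label and `-σ(u · v)` for a negative one; both the accumulated
    /// input delta and the output embedding are then moved along each
    /// other, scaled by `lr` times this gradient.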
    fn update_output<T>(
        &mut self,
        model: &mut TrainModel<T>,
        input_embed: ArrayView1<f32>,
        input_delta: ArrayViewMut1<f32>,
        output: usize,
        label: bool,
        lr: f32,
    ) -> f32 {
        let (loss, part_gradient) =
            log_logistic_loss(input_embed.view(), model.output_embedding(output), label);

        // Accumulate the input update: delta += lr * g * v_output, where g is
        // the partial gradient of the loss. The input embedding is not updated
        // immediately; the updates are accumulated in input_delta and applied
        // once at the end of the SGD step.
        scaled_add(
            input_delta,
            model.output_embedding(output),
            lr * part_gradient,
        );

        // Update the output embedding: v_output += lr * g * u_input.
        scaled_add(
            model.output_embedding_mut(output),
            input_embed.view(),
            lr * part_gradient,
        );

        loss
    }
}