use ndarray::{Array1, ArrayView1, ArrayViewMut1};

use crate::hogwild::Hogwild;
use crate::idx::WordIdx;
use crate::loss::log_logistic_loss;
use crate::train_model::{NegativeSamples, TrainIterFrom, TrainModel, Trainer};
use crate::vec_simd::scaled_add;

/// Stochastic gradient descent
///
/// This data type applies stochastic gradient descent on sentences.
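///
/// # Examples
///
/// A rough usage sketch: the construction of `model` and the shape of the
/// sentences depend on the concrete `Trainer`, so `model`, `sentences`,
/// and `lr` below are illustrative placeholders rather than part of this
/// crate's API.
///
/// ```ignore
/// let mut sgd = SGD::new(model);
/// for sentence in &sentences {
///     // A learning rate that decays over the course of training is
///     // commonly used; a fixed placeholder is passed here for brevity.
///     sgd.update_sentence(sentence, lr);
/// }
/// println!("average loss: {}", sgd.train_loss());
/// let trained = sgd.into_model();
/// ```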
#[derive(Clone)]
pub struct SGD<T> {
    loss: Hogwild<f32>,
    model: TrainModel<T>,
    n_examples: Hogwild<usize>,
    n_tokens_processed: Hogwild<usize>,
    sgd_impl: NegativeSamplingSGD,
}

impl<T> SGD<T>
where
    T: Trainer,
{
    /// Consume the SGD, returning the training model.
    pub fn into_model(self) -> TrainModel<T> {
        self.model
    }

    /// Construct a new SGD instance.
    pub fn new(model: TrainModel<T>) -> Self {
        let sgd_impl = NegativeSamplingSGD::new(model.config().negative_samples as usize);

        SGD {
            loss: Hogwild::default(),
            model,
            n_examples: Hogwild::default(),
            n_tokens_processed: Hogwild::default(),
            sgd_impl,
        }
    }

    /// Get the training model associated with this SGD.
    pub fn model(&self) -> &TrainModel<T> {
        &self.model
    }

    /// Get the number of tokens that are processed by this SGD.
    pub fn n_tokens_processed(&self) -> usize {
        *self.n_tokens_processed
    }

    /// Get the average training loss of this SGD.
    ///
    /// This returns the average training loss over all instances seen by
    /// this SGD instance since its construction.
    pub fn train_loss(&self) -> f32 {
        *self.loss / *self.n_examples as f32
    }

    /// Update the model parameters using the given sentence.
    ///
    /// This applies a gradient descent step on the sentence, with the given
    /// learning rate.
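    ///
    /// For example (illustrative only: the accepted sentence type is
    /// whatever the concrete trainer's `TrainIterFrom` implementation
    /// accepts, and `0.025` is just a placeholder learning rate):
    ///
    /// ```ignore
    /// sgd.update_sentence(&sentence, 0.025);
    /// ```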
    pub fn update_sentence<'b, S>(&mut self, sentence: &S, lr: f32)
    where
        S: ?Sized,
        T: TrainIterFrom<'b, S> + Trainer + NegativeSamples,
        for<'a> &'a T::Focus: IntoIterator<Item = u64>,
        T::Focus: WordIdx,
    {
        for (focus, contexts) in self.model.trainer().train_iter_from(sentence) {
            // Update the parameters for the focus token i and each of its
            // context tokens j.
            let input_embed = self.model.mean_input_embedding(&focus);

            for context in contexts {
                *self.loss += self.sgd_impl.sgd_step(
                    &mut self.model,
                    (&focus).into_iter(),
                    input_embed.view(),
                    context,
                    lr,
                );
                *self.n_examples += 1;
            }
            *self.n_tokens_processed += 1;
        }
    }
}

/// Log-logistic loss SGD with negative sampling.
///
/// This type implements gradient descent for log-logistic loss with negative
/// sampling (Mikolov et al., 2013).
///
/// In this approach, the training of word embeddings is framed as a
/// prediction task. The word vectors should be parametrized such that
/// words that co-occur with a given input get an estimated probability
/// close to 1.0, whereas words that do not co-occur with the input get
/// an estimated probability close to 0.0.
///
/// The probability is computed from the inner product of two word
/// vectors by applying the logistic function. The loss is the negative
/// log likelihood.
///
/// Since vocabularies are typically large, it is not feasible to update
/// the vectors of every word that does not co-occur with the input in
/// each step. Instead, such negatives are sampled, weighted by word
/// frequency.
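///
/// As a minimal sketch of the loss being optimized (plain Rust, not using
/// any of this crate's types): the probability of a pair is the logistic
/// function applied to the inner product of the two vectors, and the loss
/// is its negative log likelihood.
///
/// ```
/// let dot: f32 = 2.0; // inner product u · v of an input/output pair
/// let p = 1.0 / (1.0 + (-dot).exp()); // logistic probability
/// let positive_loss = -p.ln(); // loss if the pair actually co-occurs
/// let negative_loss = -(1.0 - p).ln(); // loss for a negative sample
/// assert!(positive_loss < negative_loss);
/// ```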
#[derive(Clone)]
pub struct NegativeSamplingSGD {
    negative_samples: usize,
}

impl NegativeSamplingSGD {
    /// Create a new loss function.
    pub fn new(negative_samples: usize) -> Self {
        NegativeSamplingSGD { negative_samples }
    }

    /// Perform a step of gradient descent.
    ///
    /// This method will estimate the probability of `output` and randomly
    /// chosen negative samples, given the input. It will then update the
    /// embeddings of the positive/negative outputs and the input (and its
    /// subwords).
    ///
    /// The function returns the sum of the losses over the positive
    /// example and the negative samples.
    pub fn sgd_step<T>(
        &mut self,
        model: &mut TrainModel<T>,
        input: impl IntoIterator<Item = u64>,
        input_embed: ArrayView1<f32>,
        output: usize,
        lr: f32,
    ) -> f32
    where
        T: NegativeSamples,
    {
        let mut loss = 0.0;
        let mut input_delta = Array1::zeros(input_embed.shape()[0]);

        // Update the output embedding of the positive instance.
        loss += self.update_output(
            model,
            input_embed.view(),
            input_delta.view_mut(),
            output,
            true,
            lr,
        );

        // Pick the negative examples and update their output embeddings.
        loss += self.negative_samples(model, input_embed, input_delta.view_mut(), output, lr);

        // Update the input embeddings with the accumulated gradient.
        for idx in input {
            let input_embed = model.input_embedding_mut(idx as usize);
            scaled_add(input_embed, input_delta.view(), 1.0);
        }

        loss
    }

    /// Pick, predict and update negative samples.
    fn negative_samples<T>(
        &mut self,
        model: &mut TrainModel<T>,
        input_embed: ArrayView1<f32>,
        mut input_delta: ArrayViewMut1<f32>,
        output: usize,
        lr: f32,
    ) -> f32
    where
        T: NegativeSamples,
    {
        let mut loss = 0f32;

        for _ in 0..self.negative_samples {
            let negative = model.trainer().negative_sample(output);
            // Update input and output for this negative sample.
            loss += self.update_output(
                model,
                input_embed.view(),
                input_delta.view_mut(),
                negative,
                false,
                lr,
            );
        }

        loss
    }

    /// Update an output embedding.
    ///
    /// This also accumulates an update for the input embedding.
    ///
    /// The method returns the loss for predicting the output.
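    ///
    /// Assuming `log_logistic_loss` follows the usual log-logistic
    /// formulation, the partial gradient is `1 - σ(u · v)` for a positive
    /// label and `-σ(u · v)` for a negative one; both the accumulated
    /// input delta and the output embedding are then moved along each
    /// other, scaled by `lr` times this gradient.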
    fn update_output<T>(
        &mut self,
        model: &mut TrainModel<T>,
        input_embed: ArrayView1<f32>,
        input_delta: ArrayViewMut1<f32>,
        output: usize,
        label: bool,
        lr: f32,
    ) -> f32 {
        let (loss, part_gradient) =
            log_logistic_loss(input_embed.view(), model.output_embedding(output), label);

        // Accumulate the input update: delta += lr * g * v_output, where g is
        // the partial gradient of the loss. The input embedding is not updated
        // immediately; the updates are accumulated in input_delta and applied
        // once at the end of the SGD step.
        scaled_add(
            input_delta,
            model.output_embedding(output),
            lr * part_gradient,
        );

        // Update the output embedding: v_output += lr * g * u_input.
        scaled_add(
            model.output_embedding_mut(output),
            input_embed.view(),
            lr * part_gradient,
        );

        loss
    }
}