use crate::{
    geom, CameraPoint, KeyPointWorldMatch, KeyPointsMatch, NormalizedKeyPoint, WorldPoint,
};
use approx::AbsDiffEq;
use core::cmp::Ordering;
use derive_more::{AsMut, AsRef, Deref, DerefMut, From, Into};
use nalgebra::{
    dimension::{U2, U3, U7},
    Isometry3, Matrix3, Matrix3x2, MatrixMN, Quaternion, Rotation3, Translation3, UnitQuaternion,
    Vector2, Vector3, Vector4, VectorN, SVD,
};
use sample_consensus::Model;

/// This contains a world pose, which is a pose of the world relative to the camera.
/// This maps [`WorldPoint`] into [`CameraPoint`], changing an absolute position into
/// a vector relative to the camera.
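///
/// For illustration, a minimal sketch of applying a pose (not compiled; it assumes
/// `WorldPoint` wraps a `Point3<f32>` as used elsewhere in this module):
///
/// ```no_build,no_run
/// use nalgebra::{Isometry3, Point3, Translation3, UnitQuaternion};
/// // A hypothetical pose: the world origin sits 1 unit in front of the camera, unrotated.
/// let pose = WorldPose(Isometry3::from_parts(
///     Translation3::new(0.0, 0.0, 1.0),
///     UnitQuaternion::identity(),
/// ));
/// // The world origin becomes a camera-space vector 1 unit along +z.
/// let camera_point = pose.transform(WorldPoint(Point3::origin()));
/// ```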
#[derive(Debug, Clone, Copy, PartialEq, AsMut, AsRef, Deref, DerefMut, From, Into)]
pub struct WorldPose(pub Isometry3<f32>);

impl Model<KeyPointWorldMatch> for WorldPose {
    fn residual(&self, data: &KeyPointWorldMatch) -> f32 {
        let WorldPose(iso) = *self;
        let KeyPointWorldMatch(image, world) = *data;

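        // Compare the bearing of the observed keypoint against the bearing towards the
        // world point transformed into camera space. The residual is one minus the cosine
        // of the angle between the two bearings (zero when they coincide).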
        let new_bearing = (iso * world.0).coords.normalize();
        let bearing_vector = image.to_homogeneous().normalize();
        1.0 - bearing_vector.dot(&new_bearing)
    }
}

impl WorldPose {
    /// Computes the difference between the image keypoint and the projected keypoint.
    pub fn projection_error(&self, correspondence: KeyPointWorldMatch) -> Vector2<f32> {
        let KeyPointWorldMatch(NormalizedKeyPoint(image), world) = correspondence;
        let NormalizedKeyPoint(projection) = self.project(world);
        image - projection
    }

    /// Projects the `WorldPoint` onto the camera as a `NormalizedKeyPoint`.
    pub fn project(&self, point: WorldPoint) -> NormalizedKeyPoint {
        self.transform(point).into()
    }

    /// Projects the [`WorldPoint`] into camera space as a [`CameraPoint`].
    pub fn transform(&self, WorldPoint(point): WorldPoint) -> CameraPoint {
        let WorldPose(iso) = *self;
        CameraPoint((iso * point).coords)
    }

    /// Computes the Jacobian of the projection with respect to the `WorldPose`.
    /// The Jacobian is in the format:
    /// ```no_build,no_run
    /// | dx/dtx dy/dtx |
    /// | dx/dty dy/dty |
    /// | dx/dtz dy/dtz |
    /// | dx/dqx dy/dqx |
    /// | dx/dqy dy/dqy |
    /// | dx/dqz dy/dqz |
    /// | dx/dqr dy/dqr |
    /// ```
    ///
    /// Where `t` refers to the translation vector and `q` refers to the unit quaternion.
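    ///
    /// The Jacobian is formed by the chain rule: the derivative of the camera-space point
    /// with respect to the pose parameters is multiplied by the derivative of the normalized
    /// image coordinate with respect to the camera-space point (the `dp_dtq * dk_dp` product
    /// below):
    ///
    /// ```no_build,no_run
    /// J = dP/d(t,q) * dK/dP
    /// ```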
    #[rustfmt::skip]
    pub fn projection_pose_jacobian(&self, point: WorldPoint) -> MatrixMN<f32, U7, U2> {
        let q = self.0.rotation.quaternion().coords;
        // World point (input)
        let p = point.0.coords;
        // Camera point (intermediate output)
        let pc = (self.0 * point.0).coords;

        // dP/dT (Jacobian of the camera point with respect to the translation component)
        let dp_dt = Matrix3::identity();

        // d/dQv (Qv x (Qv x P))
        let qv_qv_p = Matrix3::new(
            q.y * p.y + q.z * p.z,          q.y * p.x - 2.0 * q.x * p.y,    q.z * p.x - 2.0 * q.x * p.z,
            q.x * p.y - 2.0 * q.y * p.x,    q.x * p.x + q.z * p.z,          q.z * p.y - 2.0 * q.y * p.z,
            q.x * p.z - 2.0 * q.z * p.x,    q.y * p.z - 2.0 * q.z * p.y,    q.x * p.x + q.y * p.y
        );
        // d/dQv (Qv x P)
        let qv_p = Matrix3::new(
            0.0,    -p.z,   p.y,
            p.z,    0.0,    -p.x,
            -p.y,   p.x,    0.0,
        );
        // dP/dQv = d/dQv (2 * Qs * Qv x P + 2 * Qv x (Qv x P))
        // Jacobian of the camera point with respect to the vector component of the quaternion
        let dp_dqv = 2.0 * (q.w * qv_p + qv_qv_p);

        // dP/dQs = d/dQs (2 * Qs * Qv x P)
        // Jacobian of the camera point with respect to the real component of the quaternion
        let dp_ds = 2.0 * q.xyz().cross(&p);

        // dP/dT,Q (Jacobian of the 3d camera point with respect to the translation and quaternion)
        let dp_dtq = MatrixMN::<f32, U7, U3>::from_rows(&[
            dp_dt.row(0).into(),
            dp_dt.row(1).into(),
            dp_dt.row(2).into(),
            dp_dqv.row(0).into(),
            dp_dqv.row(1).into(),
            dp_dqv.row(2).into(),
            dp_ds.transpose(),
        ]);

        // 1 / pz
        let pcz = pc.z.recip();
        // - 1 / pz^2
        let npcz2 = -(pcz * pcz);

        // dK/dP (Jacobian of the normalized image coordinate with respect to the 3d camera point)
        // The projection is x = px / pz and y = py / pz, so the derivatives with respect to pz
        // are -px / pz^2 and -py / pz^2 respectively.
        let dk_dp = Matrix3x2::new(
            pcz,            0.0,
            0.0,            pcz,
            pc.x * npcz2,   pc.y * npcz2,
        );

        dp_dtq * dk_dp
    }

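    /// Converts the pose to a 7-vector laid out as `[tx, ty, tz, qx, qy, qz, qw]`:
    /// the translation followed by the quaternion coordinates with the real part last.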
    pub fn to_vec(&self) -> VectorN<f32, U7> {
        let Self(iso) = *self;
        let t = iso.translation.vector;
        let rc = iso.rotation.quaternion().coords;
        t.push(rc.x).push(rc.y).push(rc.z).push(rc.w)
    }

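    /// Recovers a pose from a 7-vector in the `[tx, ty, tz, qx, qy, qz, qw]` layout produced
    /// by [`WorldPose::to_vec`]. The quaternion is renormalized by
    /// `UnitQuaternion::from_quaternion`, so it does not need to be exactly unit length.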
    pub fn from_vec(v: VectorN<f32, U7>) -> Self {
        Self(Isometry3::from_parts(
            Translation3::from(Vector3::new(v[0], v[1], v[2])),
            UnitQuaternion::from_quaternion(Quaternion::from(Vector4::new(v[3], v[4], v[5], v[6]))),
        ))
    }
}

impl From<CameraPose> for WorldPose {
    fn from(camera: CameraPose) -> Self {
        Self(camera.inverse())
    }
}

/// This contains a camera pose, which is a pose of the camera relative to the world.
/// This transforms camera points (with depth as `z`) into world coordinates.
/// This also tells you where the camera is located and oriented in the world.
#[derive(Debug, Clone, Copy, PartialEq, AsMut, AsRef, Deref, DerefMut, From, Into)]
pub struct CameraPose(pub Isometry3<f32>);

impl From<WorldPose> for CameraPose {
    fn from(world: WorldPose) -> Self {
        Self(world.inverse())
    }
}

/// This contains a relative pose, which is a pose that transforms the [`CameraPoint`]
/// of one image into the corresponding [`CameraPoint`] of another image. This transforms
/// the point from the camera space of camera `A` to camera `B`.
///
/// Camera space for a given camera is defined as follows:
///
/// * Origin is the optical center
/// * Positive z axis is forwards
/// * Positive y axis is up
/// * Positive x axis is right
///
/// Note that this is a left-handed coordinate space.
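///
/// A minimal usage sketch with purely illustrative values (not compiled):
///
/// ```no_build,no_run
/// use nalgebra::{Isometry3, Translation3, UnitQuaternion, Vector3};
/// // Hypothetical pose: camera B shares camera A's orientation and sits 0.1 units to its
/// // right, so points expressed in B's space are shifted by -0.1 along x.
/// let pose = RelativeCameraPose(Isometry3::from_parts(
///     Translation3::new(-0.1, 0.0, 0.0),
///     UnitQuaternion::identity(),
/// ));
/// // A point 2 units in front of camera A, expressed in camera B's space.
/// let in_b = pose.transform(CameraPoint(Vector3::new(0.0, 0.0, 2.0)));
/// ```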
#[derive(Debug, Clone, Copy, PartialEq, AsMut, AsRef, Deref, DerefMut, From, Into)]
pub struct RelativeCameraPose(pub Isometry3<f32>);

impl RelativeCameraPose {
    /// The relative pose transforms a point in the camera space of camera `A`
    /// into the camera space of camera `B`.
    pub fn transform(&self, CameraPoint(point): CameraPoint) -> CameraPoint {
        let Self(iso) = *self;
        // Apply the full isometry (rotation followed by translation) to the camera point.
        CameraPoint(iso.rotation * point + iso.translation.vector)
    }
}

/// This stores an essential matrix, which satisfies the following constraint:
///
/// transpose(x') * E * x = 0
///
/// Where `x'` and `x` are homogeneous normalized image coordinates. You can get a
/// homogeneous normalized image coordinate by appending `1.0` to a `NormalizedKeyPoint`.
///
/// The essential matrix embodies the epipolar constraint between two images. Given that light
/// travels in a perfectly straight line (not exactly true, but nearly so over short distances)
/// and assuming a pinhole camera model, for any point on the camera sensor, the light source
/// for that point exists somewhere along a line extending out from the bearing (direction
/// of travel) of that point. For a normalized image coordinate, that bearing is `(x, y, 1.0)`.
/// That is because normalized image coordinates exist on a virtual plane (the sensor)
/// a distance `z = 1.0` from the optical center (the location of the focal point) where
/// the unit of distance is the focal length. The bearing, extended out from the optical
/// center through the point on the virtual plane, traces a ray through 3d space on which
/// the light source for that point must lie.
///
/// If you look at every point along that ray, each one of those points would show up
/// at a different location on the camera sensor of another image (if it is in view).
/// If you traced every point along the ray to where it appears on the other camera's
/// sensor (projecting the 3d points into normalized image coordinates), the projections
/// would form a straight line in that image. This line is called an epipolar line, and
/// every epipolar line in an image passes through the epipole, which is the projection
/// of the other camera's optical center onto that image.
///
/// The essential matrix makes it possible to create a vector that is perpendicular to all
/// bearings that are formed from the epipolar line on the second image's sensor. This is
/// done by computing `E * x`, where `x` is a homogeneous normalized image coordinate
/// from the first image. The resulting vector is perpendicular to the homogeneous
/// coordinate `x'` of the corresponding point in the second image, so their dot product
/// equals `0.0`.
/// This can be written as:
///
/// ```no_build,no_run
/// dot(transpose(E * x), x') = 0
/// ```
///
/// This can be re-written into the form given above:
///
/// ```no_build,no_run
/// transpose(x') * E * x = 0
/// ```
///
/// Here the first operation (`E * x`) creates the vector perpendicular to the bearings along
/// the epipolar line in the second image, and the second operation takes the dot product with
/// `x'`, which should result in `0`.
///
/// With an `EssentialMatrix`, you can retrieve the rotation and translation given
/// one normalized image coordinate and one bearing that is scaled to the depth
/// of the point relative to the current reconstruction. This kind of point can be computed
/// using [`WorldPose::transform`] to convert a [`WorldPoint`] to a [`CameraPoint`].
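///
/// A minimal sketch of checking the constraint (not compiled; the matrix below is the
/// essential matrix for a pure translation along `x` with no rotation, and the two points
/// are a correspondence consistent with that motion):
///
/// ```no_build,no_run
/// use nalgebra::{Matrix3, Point2};
/// let EssentialMatrix(e) = EssentialMatrix(Matrix3::new(
///     0.0, 0.0, 0.0,
///     0.0, 0.0, -1.0,
///     0.0, 1.0, 0.0,
/// ));
/// let x = Point2::new(0.0f32, 0.1).to_homogeneous();
/// let xp = Point2::new(0.5f32, 0.1).to_homogeneous();
/// // Evaluates to 0.0 (up to floating point error) because the correspondence
/// // satisfies the epipolar constraint.
/// let constraint = (xp.transpose() * e * x)[0];
/// ```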
#[derive(Debug, Clone, Copy, PartialEq, PartialOrd, AsMut, AsRef, Deref, DerefMut, From, Into)]
pub struct EssentialMatrix(pub Matrix3<f32>);

impl Model<KeyPointsMatch> for EssentialMatrix {
    fn residual(&self, data: &KeyPointsMatch) -> f32 {
        let Self(mat) = *self;
        let KeyPointsMatch(NormalizedKeyPoint(a), NormalizedKeyPoint(b)) = *data;

        // The result is a 1x1 matrix which we must get element 0 from.
        (a.to_homogeneous().transpose() * mat * b.to_homogeneous())[0]
    }
}

impl EssentialMatrix {
    /// Returns two possible rotations for the essential matrix along with a translation
    /// direction of unit length. The translation's length is unknown and must be
    /// solved for by using a prior.
    ///
    /// `max_iterations` is the maximum number of iterations that singular value decomposition
    /// will run on this matrix. Use this in soft realtime systems to cap the execution time.
    /// A `max_iterations` of `0` may execute indefinitely and is not recommended.
    pub fn possible_poses(
        &self,
        max_iterations: usize,
    ) -> Option<(Rotation3<f32>, Rotation3<f32>, Vector3<f32>)> {
        let Self(essential) = *self;

        // `W` from https://en.wikipedia.org/wiki/Essential_matrix#Finding_one_solution.
        let w = Matrix3::new(0.0, -1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0);
        // Transpose of `W` from https://en.wikipedia.org/wiki/Essential_matrix#Finding_one_solution.
        let wt = w.transpose();

        // Perform SVD.
        let svd = SVD::try_new(
            essential,
            true,
            true,
            f32::default_epsilon(),
            max_iterations,
        );
        // Extract only the U and V matrices from the SVD.
        let u_v = svd.map(|svd| {
            if let SVD {
                u: Some(u),
                v_t: Some(v_t),
                ..
            } = svd
            {
                (u, v_t.transpose())
            } else {
                panic!("Didn't get U and V matrix in SVD");
            }
        });
        // Force the determinants to be positive. The SVD produces orthogonal `U` and `V`
        // matrices whose determinants may be -1 rather than +1. Negating the last column
        // whenever the determinant is negative ensures that the products below are proper
        // rotations. This matches what TheiaSfM does in essential_matrix_utils.cc.
        let u_v = u_v.map(|(mut u, mut v)| {
            let fix_det = |m: &mut Matrix3<f32>| {
                if m.determinant() < 0.0 {
                    for n in m.column_mut(2).iter_mut() {
                        *n *= -1.0;
                    }
                }
            };
            fix_det(&mut u);
            fix_det(&mut v);
            (u, v)
        });
        // Compute the possible rotations and the normalized bearing.
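        // The two candidate rotations are U * transpose(W) * transpose(V) and U * W * transpose(V),
        // and the translation direction (up to sign and scale) is the third column of U.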
        u_v.map(|(u, v)| {
            let vt = v.transpose();
            (
                Rotation3::from_matrix_unchecked(u * wt * vt),
                Rotation3::from_matrix_unchecked(u * w * vt),
                u.column(2).into_owned().normalize(),
            )
        })
    }

    /// Returns the [`RelativeCameraPose`] that transforms a [`CameraPoint`] of image
    /// `A` (source of `a`) to the corresponding [`CameraPoint`] of image `B` (source of `b`).
    /// This determines the average expected translation from the points themselves and
    /// whether the points agree with the rotation (points must be in front of the camera).
    /// The function takes an iterator containing tuples in the form `(depth, a, b)`:
    ///
    /// * `depth` - The actual depth (`z` axis, not distance) of normalized keypoint `a`
    /// * `a` - A keypoint from image `A`
    /// * `b` - A keypoint from image `B`
    ///
    /// `self` must satisfy the constraint:
    ///
    /// ```no_build,no_run
    /// transpose(homogeneous(a)) * E * homogeneous(b) = 0
    /// ```
    ///
    /// Also, `a` and `b` must be a correspondence.
    ///
    /// This will take the average translation over the entire iterator. This is done
    /// to smooth out noise and outliers (if present).
    ///
    /// `consensus_ratio` is the ratio of points which must be in front of the camera for the model
    /// to be accepted and return `Some`. Otherwise, `None` is returned.
    ///
    /// `max_iterations` is the maximum number of iterations that singular value decomposition
    /// will run on this matrix. Use this in soft realtime systems to cap the execution time.
    /// A `max_iterations` of `0` may execute indefinitely and is not recommended.
    ///
    /// This does not communicate which points were outliers. To determine the outlier points,
    /// compute the [`CameraPoint`] of each correspondence using the returned pose and check
    /// whether it lies in front of the camera; the points that do not are the outliers.
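    ///
    /// A minimal usage sketch with purely illustrative values (not compiled). The essential
    /// matrix below corresponds to a pure translation along `x`, and the correspondences are
    /// consistent with that motion; in practice the matrix would come from a consensus
    /// algorithm and the depths from an existing reconstruction:
    ///
    /// ```no_build,no_run
    /// use nalgebra::{Matrix3, Point2};
    /// let essential = EssentialMatrix(Matrix3::new(
    ///     0.0, 0.0, 0.0,
    ///     0.0, 0.0, -1.0,
    ///     0.0, 1.0, 0.0,
    /// ));
    /// let correspondences = vec![
    ///     (2.0, NormalizedKeyPoint(Point2::new(0.0, 0.1)), NormalizedKeyPoint(Point2::new(0.5, 0.1))),
    ///     (4.0, NormalizedKeyPoint(Point2::new(0.2, -0.1)), NormalizedKeyPoint(Point2::new(0.45, -0.1))),
    /// ];
    /// if let Some(pose) = essential.solve_pose(0.5, 50, correspondences.into_iter()) {
    ///     // `pose` transforms camera points of image A into camera points of image B.
    /// }
    /// ```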
    pub fn solve_pose(
        &self,
        consensus_ratio: f32,
        max_iterations: usize,
        correspondences: impl Iterator<Item = (f32, NormalizedKeyPoint, NormalizedKeyPoint)>,
    ) -> Option<RelativeCameraPose> {
        // Get the possible rotations and the translation
        self.possible_poses(max_iterations)
            .and_then(|(rot_x, rot_y, t)| {
                // Find the translation direction for each rotation hypothesis (`x` and `y`).
                let tx_dir = rot_x.transpose() * t;
                let ty_dir = rot_y.transpose() * t;
                // Get the net translation scale of points that agree with a and b
                // in addition to the number of points that agree with a and b.
                let (xt, yt, xn, yn, total) = correspondences.fold(
                    (0.0, 0.0, 0usize, 0usize, 0usize),
                    |(mut xt, mut yt, mut xn, mut yn, total), (depth, a, b)| {
                        // Compute the CameraPoint of a.
                        let a_point = depth * a.with_depth(1.0).0;
                        let trans_and_agree = |rotation, t_dir| {
                            // Triangulate the position of the CameraPoint of b.
                            // We know the precise 3d position of the a point relative
                            // to camera A, but not relative to camera B, because the
                            // magnitude of the translation between the cameras is unknown.
                            // We only know the direction of the translation, the rotation
                            // of camera B relative to camera A, and the keypoint observed
                            // by camera B. We therefore start by rotating the point, then
                            // solve for the translation scale that best reprojects the
                            // untranslated point onto the observed keypoint. See the
                            // documentation for reproject_along_translation for more
                            // details on the process.
                            let untranslated: Vector3<f32> = rotation * a_point;
                            let translation_scale =
                                geom::reproject_along_translation(untranslated.xy(), b, t_dir);
                            // Now that we have the translation, we can just verify that the point
                            // is in front (z > 1.0) of the camera to see if it agrees with the model.
                            (
                                translation_scale,
                                translation_scale * t_dir.z + untranslated.z > 1.0,
                            )
                        };

                        // Do it for X.
                        if let (scale, true) = trans_and_agree(rot_x, tx_dir) {
                            xt += scale;
                            xn += 1;
                        }

                        // Do it for Y.
                        if let (scale, true) = trans_and_agree(rot_y, ty_dir) {
                            yt += scale;
                            yn += 1;
                        }

                        (xt, yt, xn, yn, total + 1)
                    },
                );
                // Ensure that the best one exceeds the consensus ratio.
                if (core::cmp::max(xn, yn) as f32 / total as f32) < consensus_ratio {
                    return None;
                }
                // TODO: Perhaps if it's closer than this we should assume the frame itself is an outlier.
                let (rot, trans) = match xn.cmp(&yn) {
                    Ordering::Equal => return None,
                    Ordering::Greater => (rot_x, xt / xn as f32 * tx_dir),
                    Ordering::Less => (rot_y, yt / yn as f32 * ty_dir),
                };
                Some(RelativeCameraPose(Isometry3::from_parts(
                    trans.into(),
                    UnitQuaternion::from_rotation_matrix(&rot),
                )))
            })
    }
}