SUBROUTINE MB02ND( M, N, L, RANK, THETA, C, LDC, X, LDX, Q, INUL,
     $                   TOL, RELTOL, IWORK, DWORK, LDWORK, BWORK,
     $                   IWARN, INFO )
C
C     PURPOSE
C
C     To solve the Total Least Squares (TLS) problem using a Partial
C     Singular Value Decomposition (PSVD) approach.
C     The TLS problem assumes an overdetermined set of linear equations
C     AX = B, where both the data matrix A as well as the observation
C     matrix B are inaccurate. The routine also solves determined and
C     underdetermined sets of equations by computing the minimum norm
C     solution.
C     It is assumed that all preprocessing measures (scaling, coordinate
C     transformations, whitening, ... ) of the data have been performed
C     in advance.
C
C     ARGUMENTS
C
C     Input/Output Parameters
C
C     M       (input) INTEGER
C             The number of rows in the data matrix A and the
C             observation matrix B.  M >= 0.
C
C     N       (input) INTEGER
C             The number of columns in the data matrix A.  N >= 0.
C
C     L       (input) INTEGER
C             The number of columns in the observation matrix B.
C             L >= 0.
C
C     RANK    (input/output) INTEGER
C             On entry, if RANK < 0, then the rank of the TLS
C             approximation [A+DA|B+DB] (r say) is computed by the
C             routine.
C             Otherwise, RANK must specify the value of r.
C             RANK <= min(M,N).
C             On exit, if RANK < 0 on entry and INFO = 0, then RANK
C             contains the computed rank of the TLS approximation
C             [A+DA|B+DB].
C             Otherwise, the user-supplied value of RANK may be
C             changed by the routine on exit if the RANK-th and the
C             (RANK+1)-th singular values of C = [A|B] are considered
C             to be equal, or if the upper triangular matrix F (as
C             defined in METHOD) is (numerically) singular.
C
C     THETA   (input/output) DOUBLE PRECISION
C             On entry, if RANK < 0, then the rank of the TLS
C             approximation [A+DA|B+DB] is computed using THETA as
C             (min(M,N+L) - d), where d is the number of singular
C             values of [A|B] <= THETA. THETA >= 0.0.
C             Otherwise, THETA is an initial estimate (t say) for
C             computing a lower bound on the RANK largest singular
C             values of [A|B]. If THETA < 0.0 on entry however, then
C             t is computed by the routine.
C             On exit, if RANK >= 0 on entry, then THETA contains the
C             computed bound such that precisely RANK singular values
C             of C = [A|B] are greater than THETA + TOL.
C             Otherwise, THETA is unchanged.
C
C     C       (input/output) DOUBLE PRECISION array, dimension (LDC,N+L)
C             On entry, the leading M-by-(N+L) part of this array must
C             contain the matrices A and B. Specifically, the first N
C             columns must contain the data matrix A and the last L
C             columns the observation matrix B (right-hand sides).
C             On exit, if INFO = 0, the first N+L components of the
C             columns of this array whose index i corresponds with
C             INUL(i) = .TRUE., are the possibly transformed (N+L-RANK)
C             base vectors of the right singular subspace corresponding
C             to the singular values of C = [A|B] which are less than or
C             equal to THETA. Specifically, if L = 0, or if RANK = 0 and
C             IWARN <> 2, these vectors are indeed the base vectors
C             above. Otherwise, these vectors form the matrix V2,
C             transformed as described in Step 4 of the PTLS algorithm
C             (see METHOD). The TLS solution is computed from these
C             vectors. The other columns of array C contain no useful
C             information.
C
C     LDC     INTEGER
C             The leading dimension of array C.  LDC >= max(1,M,N+L).
C
C     X       (output) DOUBLE PRECISION array, dimension (LDX,L)
C             If INFO = 0, the leading N-by-L part of this array
C             contains the solution X to the TLS problem specified by
C             A and B.
C
C     LDX     INTEGER
C             The leading dimension of array X.  LDX >= max(1,N).
C
C     Q       (output) DOUBLE PRECISION array, dimension
C             (max(1,2*min(M,N+L)-1))
C             This array contains the partially diagonalized bidiagonal
C             matrix J computed from C, at the moment that the desired
C             singular subspace has been found. Specifically, the
C             leading p = min(M,N+L) entries of Q contain the diagonal
C             elements q(1),q(2),...,q(p) and the entries Q(p+1),Q(p+2),
C             ...,Q(2*p-1) contain the superdiagonal elements e(1),e(2),
C             ...,e(p-1) of J.
C
C     INUL    (output) LOGICAL array, dimension (N+L)
C             The indices of the elements of this array with value
C             .TRUE. indicate the columns in C containing the base
C             vectors of the right singular subspace of C from which
C             the TLS solution has been computed.
C
C     Tolerances
C
C     TOL     DOUBLE PRECISION
C             This parameter defines the multiplicity of singular values
C             by considering all singular values within an interval of
C             length TOL as coinciding. TOL is used in checking how many
C             singular values are less than or equal to THETA. Also in
C             computing an appropriate upper bound THETA by a bisection
C             method, TOL is used as a stopping criterion defining the
C             minimum (absolute) subinterval width. TOL is also taken
C             as an absolute tolerance for negligible elements in the
C             QR/QL iterations. If the user sets TOL to be less than or
C             equal to 0, then the tolerance is taken as specified in
C             SLICOT Library routine MB04YD document.
C
C     RELTOL  DOUBLE PRECISION
C             This parameter specifies the minimum relative width of an
C             interval. When an interval is narrower than TOL, or than
C             RELTOL times the larger (in magnitude) endpoint, then it
C             is considered to be sufficiently small and bisection has
C             converged. If the user sets RELTOL to be less than
C             BASE * EPS, where BASE is machine radix and EPS is machine
C             precision (see LAPACK Library routine DLAMCH), then the
C             tolerance is taken as BASE * EPS.
C
C     Workspace
C
C     IWORK   INTEGER array, dimension (N+2*L)
C
C     DWORK   DOUBLE PRECISION array, dimension (LDWORK)
C             On exit, if INFO = 0, DWORK(1) returns the optimal value
C             of LDWORK, and DWORK(2) returns the reciprocal of the
C             condition number of the matrix F.
C
C     LDWORK  INTEGER
C             The length of the array DWORK.
C             LDWORK = max(2, max(M,N+L) + 2*min(M,N+L),
C                          min(M,N+L) + LW + max(6*(N+L)-5,
C                                                L*L+max(N+L,3*L)),
C             where
C             LW = (N+L)*(N+L-1)/2,  if M >= N+L,
C             LW = M*(N+L-(M-1)/2),  if M <  N+L.
C             For optimum performance LDWORK should be larger.
C
C             If LDWORK = -1, then a workspace query is assumed;
C             the routine only calculates the optimal size of the
C             DWORK array, returns this value as the first entry of
C             the DWORK array, and no error message related to LDWORK
C             is issued by XERBLA.
C
C     BWORK   LOGICAL array, dimension (N+L)
C
C     Warning Indicator
C
C     IWARN   INTEGER
C             = 0:  no warnings;
C             = 1:  if the rank of matrix C has been lowered because a
C                   singular value of multiplicity greater than 1 was
C                   found;
C             = 2:  if the rank of matrix C has been lowered because the
C                   upper triangular matrix F is (numerically) singular.
C
C     Error Indicator
C
C     INFO    INTEGER
C             = 0:  successful exit;
C             < 0:  if INFO = -i, the i-th argument had an illegal
C                   value;
C             = 1:  if the maximum number of QR/QL iteration steps
C                   (30*MIN(M,N)) has been exceeded;
C             = 2:  if the computed rank of the TLS approximation
C                   [A+DA|B+DB] exceeds MIN(M,N). Try increasing the
C                   value of THETA or set the value of RANK to min(M,N).
C
C     METHOD
C
C     The method used is the Partial Total Least Squares (PTLS) approach
C     proposed by Van Huffel and Vandewalle [5].
C
C     Let C = [A|B] denote the matrix formed by adjoining the columns of
C     B to the columns of A on the right.
C
C     Total Least Squares (TLS) definition:
C     -------------------------------------
C
C       Given matrices A and B, find a matrix X satisfying
C
C            (A + DA) X = B + DB,
C
C       where A and DA are M-by-N matrices, B and DB are M-by-L matrices
C       and X is an N-by-L matrix.
C       The solution X must be such that the Frobenius norm of [DA|DB]
C       is a minimum and each column of B + DB is in the range of
C       A + DA. Whenever the solution is not unique, the routine singles
C       out the minimum norm solution X.
C
C     Let V denote the right singular subspace of C. Since the TLS
C     solution can be computed from any orthogonal basis of the subspace
C     of V corresponding to the smallest singular values of C, the
C     Partial Singular Value Decomposition (PSVD) can be used instead of
C     the classical SVD. The dimension of this subspace of V may be
C     determined by the rank of C or by an upper bound for those
C     smallest singular values.
C
C     The PTLS algorithm proceeds as follows (see [2 - 5]):
C
C     Step 1: Bidiagonalization phase
C             -----------------------
C      (a) If M is large enough than N + L, transform C into upper
C          triangular form R by Householder transformations.
C      (b) Transform C (or R) into upper bidiagonal form
C          (p = min(M,N+L)):
C
C                     |q(1) e(1)  0   ...  0   |
C                (0)  | 0   q(2) e(2)      .   |
C               J   = | .                  .   |
C                     | .                e(p-1)|
C                     | 0             ... q(p) |
C
C          if M >= N + L, or lower bidiagonal form:
C
C                     |q(1)  0    0   ...  0     0   |
C                (0)  |e(1) q(2)  0        .     .   |
C               J   = | .                  .     .   |
C                     | .                 q(p)   .   |
C                     | 0             ... e(p-1) q(p)|
C
C          if M < N + L, using Householder transformations.
C          In the second case, transform the matrix to the upper
C          bidiagonal form by applying Givens rotations.
C      (c) Initialize the right singular base matrix with the identity
C          matrix.
C
C     Step 2: Partial diagonalization phase
C             -----------------------------
C     If the upper bound THETA is not given, then compute THETA such
C     that precisely p - RANK singular values (p=min(M,N+L)) of the
C     bidiagonal matrix are less than or equal to THETA, using a
C     bisection method [5]. Diagonalize the given bidiagonal matrix J
C     partially, using either QL iterations (if the upper left diagonal
C     element of the considered bidiagonal submatrix is smaller than the
C     lower right diagonal element) or QR iterations, such that J is
C     split into unreduced bidiagonal submatrices whose singular values
C     are either all larger than THETA or are all less than or equal
C     to THETA. Accumulate the Givens rotations in V.
C
C     Step 3: Back transformation phase
C             -------------------------
C     Apply the Householder transformations of Step 1(b) onto the base
C     vectors of V associated with the bidiagonal submatrices with all
C     singular values less than or equal to THETA.
C
C     Step 4: Computation of F and Y
C             ----------------------
C     Let V2 be the matrix of the columns of V corresponding to the
C     (N + L - RANK) smallest singular values of C.
C     Compute with Householder transformations the matrices F and Y
C     such that:
C
C                       |VH   Y|
C              V2 x Q = |      |
C                       |0    F|
C
C     where Q is an orthogonal matrix, VH is an N-by-(N-RANK) matrix,
C     Y is an N-by-L matrix and F is an L-by-L upper triangular matrix.
C     If F is singular, then reduce the value of RANK by one and repeat
C     Steps 2, 3 and 4.
C
C     Step 5: Computation of the TLS solution
C             -------------------------------
C     If F is non-singular then the solution X is obtained by solving
C     the following equations by forward elimination:
C
C              X F = -Y.
C
C     Notes:
C     If RANK is lowered in Step 4, some additional base vectors must
C     be computed in Step 2. The additional computations are kept to
C     a minimum.
C     If RANK is lowered in Step 4 but the multiplicity of the RANK-th
C     singular value is larger than 1, then the value of RANK is further
C     lowered with its multiplicity defined by the parameter TOL. This
C     is done at the beginning of Step 2 by calling SLICOT Library
C     routine MB03MD (from MB04YD), which estimates THETA using a
C     bisection method. If F in Step 4 is singular, then the computed
C     solution is infinite and hence does not satisfy the second TLS
C     criterion (see TLS definition). For these cases, Golub and
C     Van Loan [1] claim that the TLS problem has no solution. The
C     properties of these so-called nongeneric problems are described
C     in [6] and the TLS computations are generalized in order to solve
C     them. As proven in [6], the proposed generalization satisfies the
C     TLS criteria for any number L of observation vectors in B provided
C     that, in addition, the solution | X| is constrained to be
C                                     |-I|
C     orthogonal to all vectors of the form |w| which belong to the
C                                           |0|
C     space generated by the columns of the submatrix |Y|.
C                                                     |F|
C
C     REFERENCES
C
C     [1] Golub, G.H. and Van Loan, C.F.
C         An Analysis of the Total Least-Squares Problem.
C         SIAM J. Numer. Anal., 17, pp. 883-893, 1980.
C
C     [2] Van Huffel, S., Vandewalle, J. and Haegemans, A.
C         An Efficient and Reliable Algorithm for Computing the
C         Singular Subspace of a Matrix Associated with its Smallest
C         Singular Values.
C         J. Comput. and Appl. Math., 19, pp. 313-330, 1987.
C
C     [3] Van Huffel, S.
C         Analysis of the Total Least Squares Problem and its Use in
C         Parameter Estimation.
C         Doctoral dissertation, Dept. of Electr. Eng., Katholieke
C         Universiteit Leuven, Belgium, June 1987.
C
C     [4] Chan, T.F.
C         An Improved Algorithm for Computing the Singular Value
C         Decomposition.
C         ACM TOMS, 8, pp. 72-83, 1982.
C
C     [5] Van Huffel, S. and Vandewalle, J.
C         The Partial Total Least Squares Algorithm.
C         J. Comput. Appl. Math., 21, pp. 333-341, 1988.
C
C     [6] Van Huffel, S. and Vandewalle, J.
C         Analysis and Solution of the Nongeneric Total Least Squares
C         Problem.
C         SIAM J. Matr. Anal. and Appl., 9, pp. 360-372, 1988.
C
C     NUMERICAL ASPECTS
C
C     The computational efficiency of the PTLS algorithm compared with
C     the classical TLS algorithm (see [2 - 5]) is obtained by making
C     use of PSVD (see [1]) instead of performing the entire SVD.
C     Depending on the gap between the RANK-th and the (RANK+1)-th
C     singular values of C, the number (N + L - RANK) of base vectors to
C     be computed with respect to the column dimension (N + L) of C and
C     the desired accuracy RELTOL, the algorithm used by this routine is
C     approximately twice as fast as the classical TLS algorithm at the
C     expense of extra storage requirements, namely:
C       (N + L) x (N + L - 1)/2  if M >= N + L or
C       M x (N + L - (M - 1)/2)  if M <  N + L.
C     This is because the Householder transformations performed on the
C     rows of C in the bidiagonalization phase (see Step 1) must be kept
C     until the end (Step 5).
C
C     CONTRIBUTOR
C
C     Release 3.0: V. Sima, Katholieke Univ. Leuven, Belgium, Apr. 1997.
C     Supersedes Release 2.0 routine MB02BD by S. Van Huffel, Katholieke
C     University, Leuven, Belgium.
C
C     REVISIONS
C
C     V. Sima. June 30, 1997, Oct. 19, 2003, Feb. 15, 2004, Aug. 2011.
C
C     KEYWORDS
C
C     Least-squares approximation, singular subspace, singular value
C     decomposition, singular values, total least-squares.
C
C     ******************************************************************
C
C     .. Parameters ..
      DOUBLE PRECISION  ZERO, ONE, TWO
      PARAMETER         ( ZERO = 0.0D0, ONE = 1.0D0, TWO = 2.0D0 )
C     .. Scalar Arguments ..
      INTEGER           INFO, IWARN, L, LDC, LDWORK, LDX, M, N, RANK
      DOUBLE PRECISION  RELTOL, THETA, TOL
C     .. Array Arguments ..
      LOGICAL           BWORK(*), INUL(*)
      INTEGER           IWORK(*)
      DOUBLE PRECISION  C(LDC,*), DWORK(*), Q(*), X(LDX,*)
C     .. Local Scalars ..
      LOGICAL           LFIRST, LQUERY, SUFWRK, USEQR
      INTEGER           I, I1, IFAIL, IHOUSH, IJ, IOFF, ITAUP, ITAUQ,
     $                  IWARM, J, J1, JF, JV, JWORK, K, KF, KJ, LDF, LW,
     $                  MC, MINWRK, MJ, MNL, N1, NJ, NL, P, WRKOPT
      DOUBLE PRECISION  CS, EPS, FIRST, FNORM, HH, INPROD, RCOND, SN,
     $                  TEMP
C     .. Local Arrays ..
      DOUBLE PRECISION  DUMMY(2)
C     .. External Functions ..
      LOGICAL           LSAME
      INTEGER           ILAENV
      DOUBLE PRECISION  DLAMCH, DLANGE, DLANTR
      EXTERNAL          DLAMCH, DLANGE, DLANTR, ILAENV, LSAME
C     .. External Subroutines ..
      EXTERNAL          DCOPY, DGEBRD, DGEQRF, DGERQF, DLARF, DLARFG,
     $                  DLARTG, DLASET, DORMBR, DORMRQ, DTRCON, DTRSM,
     $                  MB04YD, XERBLA
C     .. Intrinsic Functions ..
      INTRINSIC         INT, MAX, MIN
C     .. Executable Statements ..
C
      IWARN = 0
      INFO  = 0
      NL = N + L
      K  = MAX( M, NL )
      P  = MIN( M, NL )
      IF ( M.GE.NL ) THEN
         LW = ( NL*( NL - 1 ) )/2
      ELSE
         LW = M*NL - ( M*( M - 1 ) )/2
      END IF
      JV = P + LW + MAX( 6*NL - 5, L*L + MAX( NL, 3*L ) )
C
C     Test the input scalar arguments.
C
      IF( M.LT.0 ) THEN
         INFO = -1
      ELSE IF( N.LT.0 ) THEN
         INFO = -2
      ELSE IF( L.LT.0 ) THEN
         INFO = -3
      ELSE IF( RANK.GT.MIN( M, N ) ) THEN
         INFO = -4
      ELSE IF( ( RANK.LT.0 ) .AND. ( THETA.LT.ZERO ) ) THEN
         INFO = -5
      ELSE IF( LDC.LT.MAX( 1, K ) ) THEN
         INFO = -7
      ELSE IF( LDX.LT.MAX( 1, N ) ) THEN
         INFO = -9
      ELSE
         USEQR = M.GE.MAX( NL, ILAENV( 6, 'DGESVD', 'N' // 'N', M, NL,
     $                                 0, 0 ) )
         LQUERY = LDWORK.EQ.-1
         MINWRK = MAX( 2, K + 2*P, JV )
         WRKOPT = MINWRK
         IF ( USEQR ) THEN
            MNL = NL
         ELSE
            MNL = M
         END IF
         IF( LQUERY ) THEN
            IF ( USEQR ) THEN
               CALL DGEQRF( M, NL, C, LDC, DWORK, DWORK, -1, IFAIL )
               WRKOPT = MAX( WRKOPT, NL + INT( DWORK(1) ) )
            END IF
            CALL DGEBRD( MNL, NL, C, LDC, Q, Q, DWORK, DWORK, DWORK, -1,
     $                   IFAIL )
            CALL DORMBR( 'P vectors', 'Left', 'No transpose', NL, NL,
     $                   MNL, DWORK, P, DWORK, DWORK, NL, DWORK(2), -1,
     $                   IFAIL )
            CALL DGERQF( L, NL, DWORK, NL, DWORK, DUMMY, -1, IFAIL )
            CALL DORMRQ( 'Right', 'Transpose', N, NL, L, DWORK, NL,
     $                   DWORK, DWORK, NL, DUMMY(2), -1, IFAIL )
            TEMP   = MAX( DWORK(2), DUMMY(1), DUMMY(2) )
            WRKOPT = MAX( WRKOPT, 2*P + INT( DWORK(1) ), 
     $                    P + P*NL + MAX( 6*NL-5,
     $                                    NL**2 + MAX( INT( TEMP ),
     $                                                 3*L ) ) )
         END IF
         IF( LDWORK.LT.MINWRK .AND. .NOT.LQUERY )
     $       INFO = -16
      END IF
C
      IF ( INFO.NE.0 ) THEN
C
C        Error return.
C
         CALL XERBLA( 'MB02ND', -INFO )
         RETURN
      ELSE IF( LQUERY ) THEN
         DWORK(1) = WRKOPT
         RETURN
      END IF
C
C     Quick return if possible.
C
      IF ( MIN( M, NL ).EQ.0 ) THEN
         IF ( M.EQ.0 ) THEN
            CALL DLASET( 'Full', NL, NL, ZERO, ONE, C, LDC )
            CALL DLASET( 'Full', N, L, ZERO, ZERO, X, LDX )
C
            DO 10 I = 1, NL
               INUL(I) = .TRUE.
   10       CONTINUE
C
         END IF
         IF ( RANK.GE.0 )
     $      THETA = ZERO
         RANK = 0
         DWORK(1) = TWO
         DWORK(2) = ONE
         RETURN
      END IF
C
      WRKOPT = 2
      N1 = N + 1
C
      EPS = DLAMCH( 'Precision' )
      LFIRST = .TRUE.
C
C     Initializations.
C
      DO 20 I = 1, P
         INUL(I)  = .FALSE.
         BWORK(I) = .FALSE.
   20 CONTINUE
C
      DO 40 I = P + 1, NL
         INUL(I)  = .TRUE.
         BWORK(I) = .FALSE.
   40 CONTINUE
C
C     Subroutine MB02ND solves a set of linear equations by a Total
C     Least Squares Approximation, based on the Partial SVD.
C
C     Step 1: Bidiagonalization phase
C             -----------------------
C     1.a): If M is large enough than N+L, transform C into upper
C           triangular form R by Householder transformations.
C
C     (Note: Comments in the code beginning "Workspace:" describe the
C     minimal amount of real workspace needed at that point in the
C     code, as well as the preferred amount for good performance.
C     NB refers to the optimal block size for the immediately
C     following subroutine, as returned by ILAENV.)
C
      IF ( USEQR ) THEN
C
C        Workspace: need   2*(N+L),
C                   prefer N+L + (N+L)*NB.
C
         ITAUQ = 1
         JWORK = ITAUQ + NL
         CALL DGEQRF( M, NL, C, LDC, DWORK(ITAUQ), DWORK(JWORK),
     $                LDWORK-JWORK+1, IFAIL )
         WRKOPT = MAX( WRKOPT, INT( DWORK(JWORK) )+JWORK-1 )
         IF ( NL.GT.1 )
     $      CALL DLASET( 'Lower', NL-1, NL-1, ZERO, ZERO, C(2,1), LDC )
      END IF
C
C     1.b): Transform C (or R) into bidiagonal form Q using Householder
C           transformations.
C     Workspace: need   2*min(M,N+L) + max(M,N+L),
C                prefer 2*min(M,N+L) + (M+N+L)*NB.
C
      ITAUP = 1
      ITAUQ = ITAUP + P
      JWORK = ITAUQ + P
      CALL DGEBRD( MNL, NL, C, LDC, Q, Q(P+1), DWORK(ITAUQ),
     $             DWORK(ITAUP), DWORK(JWORK), LDWORK-JWORK+1, IFAIL )
      WRKOPT = MAX( WRKOPT, INT( DWORK(JWORK) )+JWORK-1 )
C
C     If the matrix is lower bidiagonal, rotate to be upper bidiagonal
C     by applying Givens rotations on the left.
C
      IF ( M.LT.NL ) THEN
         IOFF = 0
C
         DO 60 I = 1, P - 1
            CALL DLARTG( Q(I), Q(P+I), CS, SN, TEMP )
            Q(I)   = TEMP
            Q(P+I) = SN*Q(I+1)
            Q(I+1) = CS*Q(I+1)
   60    CONTINUE
C
      ELSE
         IOFF = 1
      END IF
C
C     Store the Householder transformations performed onto the rows of C
C     in the extra storage locations DWORK(IHOUSH).
C     Workspace: need   LDW = min(M,N+L) + (N+L)*(N+L-1)/2, if M >= N+L,
C                       LDW = min(M,N+L) + M*(N+L-(M-1)/2), if M <  N+L;
C                prefer LDW = min(M,N+L) + (N+L)**2,        if M >= N+L,
C                       LDW = min(M,N+L) + M*(N+L),         if M <  N+L.
C
      IHOUSH = ITAUQ
      MC = NL - IOFF
      KF = IHOUSH + P*NL
      SUFWRK = LDWORK.GE.( KF + MAX( 6*NL-5,
     $                               NL**2 + MAX( NL, 3*L ) - 1 ) )
      IF ( SUFWRK ) THEN
C
C        Enough workspace for a fast algorithm.
C
         CALL DLACPY( 'Upper', P, NL, C, LDC, DWORK(IHOUSH), P )
         KJ = KF
         WRKOPT = MAX( WRKOPT, KF - 1 )
      ELSE
C
C        Not enough workspace for a fast algorithm.
C
         KJ = IHOUSH
C
         DO 80 NJ = 1, MIN( P, MC )
            J = MC - NJ + 1
            CALL DCOPY( J, C(NJ,NJ+IOFF), LDC, DWORK(KJ), 1 )
            KJ = KJ + J
   80    CONTINUE
C
      END IF
C
C     1.c): Initialize the right singular base matrix V with the
C           identity matrix (V overwrites C).
C
      CALL DLASET( 'Full', NL, NL, ZERO, ONE, C, LDC )
      JV = KJ
      IWARM = 0
C
C     REPEAT
C
C     Compute the Householder matrix Q and matrices F and Y such that
C     F is nonsingular.
C
C     Step 2: Partial diagonalization phase.
C             -----------------------------
C     Diagonalize the bidiagonal Q partially until convergence to
C     the desired right singular subspace.
C     Workspace: LDW + 6*(N+L)-5.
C
  100 CONTINUE
      JWORK = JV
      CALL MB04YD( 'No U', 'Update V', P, NL, RANK, THETA, Q, Q(P+1),
     $             DUMMY, 1, C, LDC, INUL, TOL, RELTOL, DWORK(JWORK),
     $             LDWORK-JWORK+1, IWARN, INFO )
      WRKOPT = MAX( WRKOPT, JWORK + 6*NL - 6 )
C
      IWARN = MAX( IWARN, IWARM )
      IF ( INFO.GT.0 )
     $   RETURN
C
C     Set pointers to the selected base vectors in the right singular
C     matrix of C.
C
      K = 0
C
      DO 120 I = 1, NL
         IF ( INUL(I) ) THEN
            K = K + 1
            IWORK(K) = I
         END IF
  120 CONTINUE
C
      IF ( K.LT.L ) THEN
C
C        Rank of the TLS approximation is larger than min(M,N).
C
         INFO = 2
         RETURN
      END IF
C
C     Step 3: Back transformation phase.
C             -------------------------
C     Apply in backward order the Householder transformations (stored
C     in DWORK(IHOUSH)) performed onto the rows of C during the
C     bidiagonalization phase, to the selected base vectors (specified
C     by INUL(I) = .TRUE.). Already transformed vectors are those for
C     which BWORK(I) = .TRUE..
C
      KF = K
      IF ( SUFWRK.AND.LFIRST ) THEN
C
C        Enough workspace for a fast algorithm and first pass.
C
         IJ = JV
C
         DO 140 J = 1, K
            CALL DCOPY (NL, C(1,IWORK(J)), 1, DWORK(IJ), 1 )
            IJ = IJ + NL
  140    CONTINUE
C
C        Workspace: need   LDW + (N+L)*K + K,
C                   prefer LDW + (N+L)*K + K*NB.
C
         IJ = JV
         JWORK = IJ + NL*K
         CALL DORMBR( 'P vectors', 'Left', 'No transpose', NL, K,
     $                MNL, DWORK(IHOUSH), P, DWORK(ITAUP), DWORK(IJ),
     $                NL, DWORK(JWORK), LDWORK-JWORK+1, IFAIL )
         WRKOPT = MAX( WRKOPT, INT( DWORK(JWORK) )+JWORK-1 )
C
         DO 160 I = 1, NL
            IF ( INUL(I) .AND. ( .NOT. BWORK(I) ) )
     $         BWORK(I) = .TRUE.
  160    CONTINUE
C
      ELSE
C
C        Not enough workspace for a fast algorithm or subsequent passes.
C
         DO 180 I = 1, NL
            IF ( INUL(I) .AND. ( .NOT. BWORK(I) ) ) THEN
               KJ = JV
C
               DO 170 NJ = MIN( P, MC ), 1, -1
                  J = MC - NJ + 1
                  KJ = KJ - J
                  FIRST = DWORK(KJ)
                  DWORK(KJ) = ONE
                  CALL DLARF( 'Left', J, 1, DWORK(KJ), 1,
     $                        DWORK(ITAUP+NJ-1), C(NJ+IOFF,I), LDC,
     $                        DWORK(JWORK) )
                  DWORK(KJ) = FIRST
  170          CONTINUE
C
               BWORK(I) = .TRUE.
            END IF
  180    CONTINUE
      END IF
C
      IF ( RANK.LE.0 )
     $   RANK = 0
      IF ( MIN( RANK, L ).EQ.0 ) THEN
         IF ( SUFWRK.AND.LFIRST )
     $      CALL DLACPY( 'Full', NL, K, DWORK(JV), NL, C, LDC )
         DWORK(1) = WRKOPT
         DWORK(2) = ONE
         RETURN
      END IF
C
C     Step 4: Compute matrices F and Y
C             ------------------------
C             using Householder transformation Q.
C
C     Compute the orthogonal matrix Q (in factorized form) and the
C     matrices F and Y using RQ factorization. It is assumed that,
C     generically, the last L rows of V2 matrix have full rank.
C     The code could not be the most efficient when RANK has been
C     lowered, because the already created zero pattern of the last
C     L rows of V2 matrix is not exploited.
C
      IF ( SUFWRK.AND.LFIRST ) THEN
C
C        Enough workspace for a fast algorithm and first pass.
C        Workspace: need   LDW1 + 2*L,
C                   prefer LDW1 + L + L*NB, where
C                          LDW1 = LDW + (N+L)*K;
C
         ITAUQ = JWORK
         JWORK = ITAUQ + L
         CALL DGERQF( L, K, DWORK(JV+N), NL, DWORK(ITAUQ), DWORK(JWORK),
     $                LDWORK-JWORK+1, INFO )
         WRKOPT = MAX( WRKOPT, INT( DWORK(JWORK) )+JWORK-1 )
C
C        Workspace: need   LDW1 + N+L,
C                   prefer LDW1 + L + N*NB.
C
         CALL DORMRQ( 'Right', 'Transpose', N, K, L, DWORK(JV+N), NL,
     $                DWORK(ITAUQ), DWORK(JV), NL, DWORK(JWORK),
     $                LDWORK-JWORK+1, INFO )
         WRKOPT = MAX( WRKOPT, INT( DWORK(JWORK) )+JWORK-1 )
C
         JF = JV + NL*(K-L) + N
         LDF = NL
         JWORK = JF + LDF*L - N
         CALL DLASET( 'Full', L, K-L, ZERO, ZERO, DWORK(JV+N), LDF )
         IF ( L.GT.1 )
     $      CALL DLASET( 'Lower', L-1, L-1, ZERO, ZERO, DWORK(JF+1),
     $                   LDF )
         IJ = JV
C
         DO 200 J = 1, K
            CALL DCOPY( NL, DWORK(IJ), 1, C(1,IWORK(J)), 1 )
            IJ = IJ + NL
  200    CONTINUE
C
      ELSE
C
C        Not enough workspace for a fast algorithm or subsequent passes.
C        Workspace: LDW2 + N+L, where LDW2 = LDW + L*L.
C
         I  = NL
         JF = JV
         LDF = L
         JWORK  = JF + LDF*L
         WRKOPT = MAX( WRKOPT, JWORK+NL-1 )
C
C        WHILE ( ( K >= 1 ) .AND. ( I > N ) ) DO
  220    CONTINUE
         IF ( ( K.GE.1 ) .AND. ( I.GT.N ) ) THEN
C
            DO 240 J = 1, K
               DWORK(JWORK+J-1) = C(I,IWORK(J))
  240       CONTINUE
C
C           Compute Householder transformation.
C
            CALL DLARFG( K, DWORK(JWORK+K-1), DWORK(JWORK), 1, TEMP )
            C(I,IWORK(K)) = DWORK(JWORK+K-1)
            IF ( TEMP.NE.ZERO ) THEN
C
C              Apply Householder transformation onto the selected base
C              vectors.
C
               DO 300 I1 = 1, I - 1
                  INPROD = C(I1,IWORK(K))
C
                  DO 260 J = 1, K - 1
                     INPROD = INPROD + DWORK(JWORK+J-1)*C(I1,IWORK(J))
  260             CONTINUE
C
                  HH = INPROD*TEMP
                  C(I1,IWORK(K)) = C(I1,IWORK(K)) - HH
C
                  DO 280 J = 1, K - 1
                     J1 = IWORK(J)
                     C(I1,J1) = C(I1,J1) - DWORK(JWORK+J-1)*HH
                     C(I,J1)  = ZERO
  280             CONTINUE
C
  300          CONTINUE
C
            END IF
            CALL DCOPY( I-N, C(N1,IWORK(K)), 1, DWORK(JF+(I-N-1)*L), 1 )
            K = K - 1
            I = I - 1
            GO TO 220
         END IF
C        END WHILE 220
      END IF
C
C     Estimate the reciprocal condition number of the matrix F.
C     If F singular, lower the rank of the TLS approximation.
C     Workspace: LDW1 + 3*L or
C                LDW2 + 3*L.
C
      CALL DTRCON( '1-norm', 'Upper', 'Non-unit', L, DWORK(JF), LDF,
     $             RCOND, DWORK(JWORK), IWORK(KF+1), INFO )
      WRKOPT = MAX( WRKOPT, JWORK + 3*L - 1 )
C
      DO 320 J = 1, L
         CALL DCOPY( N, C(1,IWORK(KF-L+J)), 1, X(1,J), 1 )
  320 CONTINUE
C
      FNORM = DLANTR( '1-norm', 'Upper', 'Non-unit', L, L, DWORK(JF),
     $                LDF, DWORK(JWORK) )
      IF ( RCOND.LE.EPS*FNORM ) THEN
         RANK = RANK - 1
         GO TO 340
      END IF
      IF ( FNORM.LE.EPS*DLANGE( '1-norm', N, L, X, LDX,
     $                             DWORK(JWORK) ) ) THEN
         RANK = RANK - L
         GO TO 340
      ELSE
         GO TO 400
      END IF
C
  340    CONTINUE
         IWARM = 2
         THETA = -ONE
         IF ( SUFWRK.AND.LFIRST ) THEN
C
C           Rearrange the stored Householder transformations for
C           subsequent passes, taking care to avoid overwriting.
C
            IF ( P.LT.NL ) THEN
               KJ = IHOUSH + NL*(NL - 1)
               MJ = IHOUSH +  P*(NL - 1)
C
               DO 360 NJ = 1, NL
                  DO 350 J = P - 1, 0, -1
                     DWORK(KJ+J) = DWORK(MJ+J)
  350             CONTINUE
                  KJ = KJ - NL
                  MJ = MJ - P
  360          CONTINUE
C
            END IF
            KJ = IHOUSH
            MJ = IHOUSH + NL*IOFF
C
            DO 380 NJ = 1, MIN( P, MC )
               DO 370 J = 0, MC - NJ
                  DWORK(KJ) = DWORK(MJ+J*P)
                  KJ = KJ + 1
  370          CONTINUE
               MJ = MJ + NL + 1
  380       CONTINUE
C
            JV = KJ
            LFIRST = .FALSE.
         END IF
         GO TO 100
C     UNTIL ( F nonsingular, i.e., RCOND.GT.EPS*FNORM or
C                                  FNORM.GT.EPS*norm(Y) )
  400 CONTINUE
C
C     Step 5: Compute TLS solution.
C             --------------------
C     Solve X F = -Y  by forward elimination  (F is upper triangular).
C
      CALL DTRSM( 'Right', 'Upper', 'No transpose', 'Non-unit', N, L,
     $            -ONE, DWORK(JF), LDF, X, LDX )
C
C     Set the optimal workspace and reciprocal condition number of F.
C
      DWORK(1) = WRKOPT
      DWORK(2) = RCOND
C
      RETURN
C *** Last line of MB02ND ***
      END