SUBROUTINE MB04XD( JOBU, JOBV, M, N, RANK, THETA, A, LDA, U, LDU,
     $                   V, LDV, Q, INUL, TOL, RELTOL, DWORK, LDWORK,
     $                   IWARN, INFO )
C
C     PURPOSE
C
C     To compute a basis for the left and/or right singular subspace of
C     an M-by-N matrix A corresponding to its smallest singular values.
C
C     ARGUMENTS
C
C     Mode Parameters
C
C     JOBU    CHARACTER*1
C             Specifies whether to compute the left singular subspace
C             as follows:
C             = 'N':  Do not compute the left singular subspace;
C             = 'A':  Return the (M - RANK) base vectors of the desired
C                     left singular subspace in U;
C             = 'S':  Return the first (min(M,N) - RANK) base vectors
C                     of the desired left singular subspace in U.
C
C     JOBV    CHARACTER*1
C             Specifies whether to compute the right singular subspace
C             as follows:
C             = 'N':  Do not compute the right singular subspace;
C             = 'A':  Return the (N - RANK) base vectors of the desired
C                     right singular subspace in V;
C             = 'S':  Return the first (min(M,N) - RANK) base vectors
C                     of the desired right singular subspace in V.
C
C     Input/Output Parameters
C
C     M       (input) INTEGER
C             The number of rows in matrix A.  M >= 0.
C
C     N       (input) INTEGER
C             The number of columns in matrix A.  N >= 0.
C
C     RANK    (input/output) INTEGER
C             On entry, if RANK < 0, then the rank of matrix A is
C             computed by the routine as the number of singular values
C             greater than THETA.
C             Otherwise, RANK must specify the rank of matrix A.
C             RANK <= min(M,N).
C             On exit, if RANK < 0 on entry, then RANK contains the
C             computed rank of matrix A. That is, the number of singular
C             values of A greater than THETA.
C             Otherwise, the user-supplied value of RANK may be changed
C             by the routine on exit if the RANK-th and the (RANK+1)-th
C             singular values of A are considered to be equal.
C             See also the description of parameter TOL below.
C
C     THETA   (input/output) DOUBLE PRECISION
C             On entry, if RANK < 0, then THETA must specify an upper
C             bound on the smallest singular values of A corresponding
C             to the singular subspace to be computed.  THETA >= 0.0.
C             Otherwise, THETA must specify an initial estimate (t say)
C             for computing an upper bound on the (min(M,N) - RANK)
C             smallest singular values of A. If THETA < 0.0, then t is
C             computed by the routine.
C             On exit, if RANK >= 0 on entry, then THETA contains the
C             computed upper bound such that precisely RANK singular
C             values of A are greater than THETA + TOL.
C             Otherwise, THETA is unchanged.
C
C     A       (input) DOUBLE PRECISION array, dimension (LDA,N)
C             The leading M-by-N part of this array must contain the
C             matrix A from which the basis of a desired singular
C             subspace is to be computed.
C             NOTE that this array is destroyed.
C
C     LDA     INTEGER
C             The leading dimension of array A.  LDA >= max(1,M).
C
C     U       (output) DOUBLE PRECISION array, dimension (LDU,*)
C             If JOBU = 'A', then the leading M-by-M part of this array
C             contains the (M - RANK) M-dimensional base vectors of the
C             desired left singular subspace of A corresponding to its
C             singular values less than or equal to THETA. These vectors
C             are stored in the i-th column(s) of U for which
C             INUL(i) = .TRUE., where i = 1,2,...,M.
C
C             If JOBU = 'S', then the leading M-by-min(M,N) part of this
C             array contains the first (min(M,N) - RANK) M-dimensional
C             base vectors of the desired left singular subspace of A
C             corresponding to its singular values less than or equal to
C             THETA. These vectors are stored in the i-th column(s) of U
C             for which INUL(i) = .TRUE., where i = 1,2,..., min(M,N).
C
C             Otherwise, U is not referenced (since JOBU = 'N') and can
C             be supplied as a dummy array (i.e. set parameter LDU = 1
C             and declare this array to be U(1,1) in the calling
C             program).
C
C     LDU     INTEGER
C             The leading dimension of array U.
C             LDU >= max(1,M) if JOBU = 'A' or JOBU = 'S',
C             LDU >= 1        if JOBU = 'N'.
C
C     V       (output) DOUBLE PRECISION array, dimension (LDV,*)
C             If JOBV = 'A', then the leading N-by-N part of this array
C             contains the (N - RANK) N-dimensional base vectors of the
C             desired right singular subspace of A corresponding to its
C             singular values less than or equal to THETA. These vectors
C             are stored in the i-th column(s) of V for which
C             INUL(i) = .TRUE., where i = 1,2,...,N.
C
C             If JOBV = 'S', then the leading N-by-min(M,N) part of this
C             array contains the first (min(M,N) - RANK) N-dimensional
C             base vectors of the desired right singular subspace of A
C             corresponding to its singular values less than or equal to
C             THETA. These vectors are stored in the i-th column(s) of V
C             for which INUL(i) = .TRUE., where i = 1,2,...,MIN( M,N).
C
C             Otherwise, V is not referenced (since JOBV = 'N') and can
C             be supplied as a dummy array (i.e. set parameter LDV = 1
C             and declare this array to be V(1,1) in the calling
C             program).
C
C     LDV     INTEGER
C             The leading dimension of array V.
C             LDV >= max(1,N) if JOBV = 'A' or JOBV = 'S',
C             LDV >= 1        if JOBV = 'N'.
C
C     Q       (output) DOUBLE PRECISION array, dimension (2*min(M,N)-1)
C             This array contains the partially diagonalized bidiagonal
C             matrix J computed from A, at the moment that the desired
C             singular subspace has been found. Specifically, the
C             leading p = min(M,N) entries of Q contain the diagonal
C             elements q(1),q(2),...,q(p) and the entries Q(p+1),
C             Q(p+2),...,Q(2*p-1) contain the superdiagonal elements
C             e(1),e(2),...,e(p-1) of J.
C
C     INUL    (output) LOGICAL array, dimension (max(M,N))
C             If JOBU <> 'N' or JOBV <> 'N', then the indices of the
C             elements of this array with value .TRUE. indicate the
C             columns in U and/or V containing the base vectors of the
C             desired left and/or right singular subspace of A. They
C             also equal the indices of the diagonal elements of the
C             bidiagonal submatrices in the array Q, which correspond
C             to the computed singular subspaces.
C
C     Tolerances
C
C     TOL     DOUBLE PRECISION
C             This parameter defines the multiplicity of singular values
C             by considering all singular values within an interval of
C             length TOL as coinciding. TOL is used in checking how many
C             singular values are less than or equal to THETA. Also in
C             computing an appropriate upper bound THETA by a bisection
C             method, TOL is used as a stopping criterion defining the
C             minimum (absolute) subinterval width. TOL is also taken
C             as an absolute tolerance for negligible elements in the
C             QR/QL iterations. If the user sets TOL to be less than or
C             equal to 0, then the tolerance is taken as specified in
C             SLICOT Library routine MB04YD document.
C
C     RELTOL  DOUBLE PRECISION
C             This parameter specifies the minimum relative width of an
C             interval. When an interval is narrower than TOL, or than
C             RELTOL times the larger (in magnitude) endpoint, then it
C             is considered to be sufficiently small and bisection has
C             converged. If the user sets RELTOL to be less than
C             BASE * EPS, where BASE is machine radix and EPS is machine
C             precision (see LAPACK Library routine DLAMCH), then the
C             tolerance is taken as BASE * EPS.
C
C     Workspace
C
C     DWORK   DOUBLE PRECISION array, dimension (LDWORK)
C             On exit, if INFO = 0, DWORK(1) returns the optimal value
C             of LDWORK.
C
C     LDWORK  INTEGER
C             The length of the array DWORK.
C             LDWORK = max(1, LDW + max(2*P + max(M,N), LDY)), where
C                  P = min(M,N);
C                LDW = max(2*N, N*(N+1)/2), if JOBU <> 'N' and M large
C                                                        enough than N;
C                LDW = 0,                   otherwise;
C                LDY = 8*P - 5, if JOBU <> 'N' or  JOBV <> 'N';
C                LDY = 6*P - 3, if JOBU =  'N' and JOBV =  'N'.
C             For optimum performance LDWORK should be larger.
C
C             If LDWORK = -1, then a workspace query is assumed;
C             the routine only calculates the optimal size of the
C             DWORK array, returns this value as the first entry of
C             the DWORK array, and no error message related to LDWORK
C             is issued by XERBLA.
C
C     Warning Indicator
C
C     IWARN   INTEGER
C             = 0:  no warning;
C             = 1:  if the rank of matrix A (as specified by the user)
C                   has been lowered because a singular value of
C                   multiplicity greater than 1 was found.
C
C     Error Indicator
C
C     INFO    INTEGER
C             = 0:  successful exit;
C             < 0:  if INFO = -i, the i-th argument had an illegal
C                   value;
C             = 1:  if the maximum number of QR/QL iteration steps
C                   (30*MIN(M,N)) has been exceeded.
C
C     METHOD
C
C     The method used is the Partial Singular Value Decomposition (PSVD)
C     approach proposed by Van Huffel, Vandewalle and Haegemans, which
C     is an efficient technique (see [1]) for computing the singular
C     subspace of a matrix corresponding to its smallest singular
C     values. It differs from the classical SVD algorithm [3] at three
C     points, which results in high efficiency. Firstly, the Householder
C     transformations of the bidiagonalization need only to be applied
C     on the base vectors of the desired singular subspaces; secondly,
C     the bidiagonal matrix need only be partially diagonalized; and
C     thirdly, the convergence rate of the iterative diagonalization can
C     be improved by an appropriate choice between QL and QR iterations.
C     (Note, however, that LAPACK Library routine DGESVD, for computing
C     SVD, also uses either QL and QR iterations.) Depending on the gap,
C     the desired numerical accuracy and the dimension of the desired
C     singular subspace, the PSVD can be up to three times faster than
C     the classical SVD algorithm.
C
C     The PSVD algorithm [1-2] for an M-by-N matrix A proceeds as
C     follows:
C
C     Step 1: Bidiagonalization phase
C             -----------------------
C      (a) If M is large enough than N, transform A into upper
C          triangular form R.
C
C      (b) Transform A (or R) into bidiagonal form:
C
C                |q(1) e(1)  0   ...  0   |
C           (0)  | 0   q(2) e(2)      .   |
C          J   = | .                  .   |
C                | .                e(N-1)|
C                | 0            ...  q(N) |
C
C     if M >= N, or
C
C                |q(1)  0    0   ...  0     0   |
C           (0)  |e(1) q(2)  0        .     .   |
C          J   = | .                  .     .   |
C                | .                 q(M-1) .   |
C                | 0             ... e(M-1) q(M)|
C
C     if M < N, using Householder transformations.
C     In the second case, transform the matrix to the upper bidiagonal
C     form by applying Givens rotations.
C
C      (c) If U is requested, initialize U with the identity matrix.
C          If V is requested, initialize V with the identity matrix.
C
C     Step 2: Partial diagonalization phase
C             -----------------------------
C     If the upper bound THETA is not given, then compute THETA such
C     that precisely (min(M,N) - RANK) singular values of the bidiagonal
C     matrix are less than or equal to THETA, using a bisection method
C     [4]. Diagonalize the given bidiagonal matrix J partially, using
C     either QR iterations (if the upper left diagonal element of the
C     considered bidiagonal submatrix is larger than the lower right
C     diagonal element) or QL iterations, such that J is split into
C     unreduced bidiagonal submatrices whose singular values are either
C     all larger than THETA or all less than or equal to THETA.
C     Accumulate the Givens rotations in U and/or V (if desired).
C
C     Step 3: Back transformation phase
C             -------------------------
C      (a) Apply the Householder transformations of Step 1(b) onto the
C          columns of U and/or V associated with the bidiagonal
C          submatrices with all singular values less than or equal to
C          THETA (if U and/or V is desired).
C
C      (b) If M is large enough than N, and U is desired, then apply the
C          Householder transformations of Step 1(a) onto each computed
C          column of U in Step 3(a).
C
C     REFERENCES
C
C     [1] Van Huffel, S., Vandewalle, J. and Haegemans, A.
C         An efficient and reliable algorithm for computing the singular
C         subspace of a matrix associated with its smallest singular
C         values.
C         J. Comput. and Appl. Math., 19, pp. 313-330, 1987.
C
C     [2] Van Huffel, S.
C         Analysis of the total least squares problem and its use in
C         parameter estimation.
C         Doctoral dissertation, Dept. of Electr. Eng., Katholieke
C         Universiteit Leuven, Belgium, June 1987.
C
C     [3] Chan, T.F.
C         An improved algorithm for computing the singular value
C         decomposition.
C         ACM TOMS, 8, pp. 72-83, 1982.
C
C     [4] Van Huffel, S. and Vandewalle, J.
C         The partial total least squares algorithm.
C         J. Comput. and Appl. Math., 21, pp. 333-341, 1988.
C
C     NUMERICAL ASPECTS
C
C     Using the PSVD a large reduction in computation time can be
C     gained in total least squares applications (cf [2 - 4]), in the
C     computation of the null space of a matrix and in solving
C     (non)homogeneous linear equations.
C
C     CONTRIBUTOR
C
C     Release 3.0: V. Sima, Katholieke Univ. Leuven, Belgium, June 1997.
C     Supersedes Release 2.0 routine MB04PD by S. Van Huffel, Katholieke
C     University Leuven, Belgium.
C
C     REVISIONS
C
C     V. Sima, July 10, 1997, Aug. 2011.
C
C     KEYWORDS
C
C     Bidiagonalization, singular subspace, singular value
C     decomposition, singular values.
C
C     ******************************************************************
C
C     .. Parameters ..
      DOUBLE PRECISION  ZERO, ONE
      PARAMETER         ( ZERO = 0.0D0, ONE = 1.0D0 )
C     .. Scalar Arguments ..
      CHARACTER         JOBU, JOBV
      INTEGER           INFO, IWARN, LDA, LDU, LDV, LDWORK, M, N, RANK
      DOUBLE PRECISION  RELTOL, THETA, TOL
C     .. Array Arguments ..
      LOGICAL           INUL(*)
      DOUBLE PRECISION  A(LDA,*), DWORK(*), Q(*), U(LDU,*), V(LDV,*)
C     .. Local Scalars ..
      CHARACTER*1       JOBUY, JOBVY
      LOGICAL           ALL, LJOBUA, LJOBUS, LJOBVA, LJOBVS, LQUERY, QR,
     $                  WANTU, WANTV
      INTEGER           I, IHOUSH, IJ, ITAU, ITAUP, ITAUQ, J, JU, JV,
     $                  JWORK, K, LDW, LDY, MA, MINWRK, P, PP1, WRKOPT
      DOUBLE PRECISION  CS, SN, TEMP
C     .. External Functions ..
      LOGICAL           LSAME
      INTEGER           ILAENV
      EXTERNAL          ILAENV, LSAME
C     .. External Subroutines ..
      EXTERNAL          DCOPY, DGEBRD, DGEQRF, DLARTG, DLASET, DLASR,
     $                  MB04XY, MB04YD, XERBLA
C     .. Intrinsic Functions ..
      INTRINSIC         INT, MAX, MIN
C     .. Executable Statements ..
C
      IWARN = 0
      INFO  = 0
      P = MIN( M, N )
      K = MAX( M, N )
C
C     Determine whether U and/or V are/is to be computed.
C
      LJOBUA = LSAME( JOBU, 'A' )
      LJOBUS = LSAME( JOBU, 'S' )
      LJOBVA = LSAME( JOBV, 'A' )
      LJOBVS = LSAME( JOBV, 'S' )
      WANTU  = LJOBUA.OR.LJOBUS
      WANTV  = LJOBVA.OR.LJOBVS
      LQUERY = LDWORK.EQ.-1
C
C     Test the input scalar arguments.
C
      IF( .NOT.WANTU .AND. .NOT.LSAME( JOBU, 'N' ) ) THEN
         INFO = -1
      ELSE IF( .NOT.WANTV .AND. .NOT.LSAME( JOBV, 'N' ) ) THEN
         INFO = -2
      ELSE IF( M.LT.0 ) THEN
         INFO = -3
      ELSE IF( N.LT.0 ) THEN
         INFO = -4
      ELSE IF( RANK.GT.P ) THEN
         INFO = -5
      ELSE IF( RANK.LT.0 .AND. THETA.LT.ZERO ) THEN
         INFO = -6
      ELSE IF( LDA.LT.MAX( 1, M ) ) THEN
         INFO = -8
      ELSE IF( ( .NOT.WANTU .AND. LDU.LT.1 )             .OR.
     $         (      WANTU .AND. LDU.LT.MAX( 1, M ) ) ) THEN
         INFO = -10
      ELSE IF( ( .NOT.WANTV .AND. LDV.LT.1 )             .OR.
     $         (      WANTV .AND. LDV.LT.MAX( 1, N ) ) ) THEN
         INFO = -12
      ELSE
C
C        Compute workspace.
C
         QR = M.GE.ILAENV( 6, 'DGESVD', 'N' // 'N', M, N, 0, 0 )
         IF ( QR.AND.WANTU ) THEN
            LDW = MAX( 2*N, N*( N + 1 )/2 )
         ELSE
            LDW = 0
         END IF
         IF ( WANTU.OR.WANTV ) THEN
            LDY = 8*P - 5
         ELSE
            LDY = 6*P - 3
         END IF
         MINWRK = MAX( 1, LDW + MAX( 2*P + K, LDY ) )
         IF( LDWORK.LT.MINWRK .AND. .NOT.LQUERY ) THEN
            INFO = -18
         ELSE IF( LQUERY ) THEN
            IF ( QR ) THEN
               CALL DGEQRF( M, N, A, LDA, DWORK, DWORK, -1, INFO )
               WRKOPT = MAX( MINWRK, N + INT( DWORK(1) ) )
               MA = N
            ELSE
               WRKOPT = MINWRK
               MA = M
            END IF
            CALL DGEBRD( MA, N, A, LDA, Q, Q, DWORK, DWORK, DWORK, -1,
     $                   INFO )
            WRKOPT = MAX( WRKOPT, LDW + 2*P + INT( DWORK(1) ) )
         END IF
      END IF
C
      IF ( INFO.NE.0 ) THEN
C
C        Error return.
C
         CALL XERBLA( 'MB04XD', -INFO )
         RETURN
      ELSE IF( LQUERY ) THEN
         DWORK(1) = WRKOPT
         RETURN
      END IF
C
C     Quick return if possible.
C
      IF ( P.EQ.0 ) THEN
         IF ( RANK.GE.0 )
     $      THETA = ZERO
         RANK = 0
         RETURN
      END IF
C
C     Initializations.
C
      PP1 = P + 1
      ALL = ( LJOBUA .AND. M.GT.N ) .OR. ( LJOBVA .AND. M.LT.N )
C
      IF ( ALL .AND. ( .NOT.QR ) ) THEN
C
         DO 20 I = 1, P
            INUL(I) = .FALSE.
   20    CONTINUE
C
         DO 40 I = PP1, K
            INUL(I) = .TRUE.
   40    CONTINUE
C
      ELSE
C
         DO 60 I = 1, K
            INUL(I) = .FALSE.
   60    CONTINUE
C
      END IF
C
C     Step 1: Bidiagonalization phase
C             -----------------------
C     (Note: Comments in the code beginning "Workspace:" describe the
C     minimal amount of real workspace needed at that point in the
C     code, as well as the preferred amount for good performance.
C     NB refers to the optimal block size for the immediately
C     following subroutine, as returned by ILAENV.)
C
      IF ( QR ) THEN
C
C        1.a.: M is large enough than N; transform A into upper
C              triangular form R by Householder transformations.
C
C        Workspace: need 2*N;  prefer N + N*NB.
C
         ITAU = 1
         JWORK = ITAU + N
         CALL DGEQRF( M, N, A, LDA, DWORK(ITAU), DWORK(JWORK),
     $                LDWORK-JWORK+1, INFO )
         WRKOPT = INT( DWORK(JWORK) )+JWORK-1
C
C        If (WANTU), store information on the Householder
C        transformations performed on the columns of A in N*(N+1)/2
C        extra storage locations DWORK(K), for K = 1,2,...,N*(N+1)/2.
C        (The first N locations store the scalar factors of Householder
C        transformations.)
C
C        Workspace: LDW = max(2*N, N*(N+1)/2).
C
         IF ( WANTU ) THEN
            IHOUSH = JWORK
            K = IHOUSH
            I = N
         ELSE
            K = 1
         END IF
C
         DO 100 J = 1, N - 1
            IF ( WANTU ) THEN
               I = I - 1
               CALL DCOPY( I, A(J+1,J), 1, DWORK(K), 1 )
               K = K + I
            END IF
C
            DO 80 IJ = J + 1, N
               A(IJ,J) = ZERO
   80       CONTINUE
C
  100    CONTINUE
C
         MA = N
         WRKOPT = MAX( WRKOPT, K )
      ELSE
C
C        Workspace: LDW = 0.
C
         K  = 1
         MA = M
         WRKOPT = 1
      END IF
C
C     1.b.: Transform A (or R) into bidiagonal form Q using Householder
C           transformations.
C
C     Workspace: need   LDW + 2*min(M,N) + max(M,N);
C                prefer LDW + 2*min(M,N) + (M+N)*NB.
C
      ITAUQ = K
      ITAUP = ITAUQ + P
      JWORK = ITAUP + P
      CALL DGEBRD( MA, N, A, LDA, Q, Q(PP1), DWORK(ITAUQ),
     $             DWORK(ITAUP), DWORK(JWORK), LDWORK-JWORK+1, INFO )
      WRKOPT = MAX( WRKOPT, INT( DWORK(JWORK) )+JWORK-1 )
C
C     1.c.: Initialize U (if WANTU) and V (if WANTV) with the identity
C           matrix.
C
      IF ( WANTU ) THEN
         IF ( ALL ) THEN
            JU = M
         ELSE
            JU = P
         END IF
         CALL DLASET( 'Full', M, JU, ZERO, ONE, U, LDU )
         JOBUY = 'U'
      ELSE
         JOBUY = 'N'
      END IF
      IF ( WANTV ) THEN
         IF ( ALL ) THEN
            JV = N
         ELSE
            JV = P
         END IF
         CALL DLASET( 'Full', N, JV, ZERO, ONE, V, LDV )
         JOBVY = 'U'
      ELSE
         JOBVY = 'N'
      END IF
C
C     If the matrix is lower bidiagonal, rotate to be upper bidiagonal
C     by applying Givens rotations on the left.
C
      IF ( M.LT.N ) THEN
C
         DO 120 I = 1, P - 1
            CALL DLARTG( Q(I), Q(P+I), CS, SN, TEMP )
            Q(I)   = TEMP
            Q(P+I) = SN*Q(I+1)
            Q(I+1) = CS*Q(I+1)
            IF ( WANTU ) THEN
C
C              Workspace: LDW + 4*min(M,N) - 2.
C
               DWORK(JWORK+I-1) = CS
               DWORK(JWORK+P+I-2) = SN
            END IF
  120    CONTINUE
C
C        Update left singular vectors if desired.
C
         IF( WANTU )
     $      CALL DLASR( 'Right', 'Variable pivot', 'Forward', M, JU,
     $                  DWORK(JWORK), DWORK(JWORK+P-1), U, LDU )
C
      END IF
C
C     Step 2: Partial diagonalization phase.
C             -----------------------------
C             Diagonalize the bidiagonal Q partially until convergence
C             to  the desired left and/or right singular subspace.
C
C              Workspace: LDW + 8*min(M,N) - 5, if WANTU or WANTV;
C              Workspace: LDW + 6*min(M,N) - 3, if JOBU = JOBV = 'N'.
C
      CALL MB04YD( JOBUY, JOBVY, M, N, RANK, THETA, Q, Q(PP1), U, LDU,
     $             V, LDV, INUL, TOL, RELTOL, DWORK(JWORK),
     $             LDWORK-JWORK+1, IWARN, INFO )
      IF ( WANTU.OR.WANTV ) THEN
         WRKOPT = MAX( WRKOPT, JWORK - 6 + 8*P )
      ELSE
         WRKOPT = MAX( WRKOPT, JWORK - 4 + 6*P )
      END IF
      IF ( INFO.GT.0 )
     $   RETURN
C
C     Step 3: Back transformation phase.
C             -------------------------
C     3.a.: Apply the Householder transformations of the bidiagonaliza-
C           tion onto the base vectors associated with the desired
C           bidiagonal submatrices.
C
C           Workspace: LDW + 2*min(M,N).
C
      CALL MB04XY( JOBU, JOBV, MA, N, A, LDA, DWORK(ITAUQ),
     $             DWORK(ITAUP), U, LDU, V, LDV, INUL, INFO )
C
C     3.b.: If A was reduced to upper triangular form R and JOBU = 'A'
C           or JOBU = 'S' apply the Householder transformations of the
C           triangularization of A onto the desired base vectors.
C
      IF ( QR.AND.WANTU ) THEN
         IF ( ALL ) THEN
C
            DO 140 I = PP1, M
               INUL(I) = .TRUE.
  140       CONTINUE
C
         END IF
         K = IHOUSH
         I = N
C
         DO 160 J = 1, N - 1
            I = I - 1
            CALL DCOPY( I, DWORK(K), 1, A(J+1,J), 1 )
            K = K + I
  160    CONTINUE
C
C        Workspace: MIN(M,N) + 1.
C
         JWORK = PP1
         CALL MB04XY( JOBU, 'No V', M, N, A, LDA, DWORK(ITAU),
     $                DWORK(ITAU), U, LDU, DWORK(JWORK), 1, INUL, INFO )
         WRKOPT = MAX( WRKOPT, PP1 )
      END IF
C
C     Set the optimal workspace.
C
      DWORK(1) = WRKOPT
      RETURN
C *** Last line of MB04XD ***
      END