function [alpha, c, wresid, wresid_norm, y_est, Regression] = ...
          varpro(y, w, alpha, n, ada, lb, ub, options)
%VARPRO Solve a separable nonlinear least squares problem.
% [alpha, c, wresid, wresid_norm, y_est, Regression] =
%             VARPRO(y, w, alpha, n, ada, lb, ub, options)
%
% Given a set of m observations y(1),...,y(m)
% this program computes a weighted least squares fit using the model
%
%    eta(alpha,c,t) = 
%            c_1 * phi_1 (alpha,t) + ...  + c_n * phi_n (alpha,t) 
% (possibly with an extra term  + phi_{n+1} (alpha,t) ).
%
% This program determines optimal values of the q nonlinear parameters
% alpha and the n linear parameters c, given observations y at m
% different values of the "time" t, and given a function ada that
% evaluates phi and (optionally) the derivatives of phi.
%
% On Input:
%
%   y    m x 1   vector containing the m observations.
%   w    m x 1   vector of weights used in the least squares
%                fit.  We minimize the norm of the weighted residual
%                vector r, where, for i=1:m,
%
%                r(i) = w(i) * (y(i) - eta(alpha, c, t(i,:))).
%
%                Therefore, w(i) should be set to 1 divided by
%                the standard deviation in the measurement y(i).  
%                If this number is unknown, set w(i) = 1.
%   alpha q x 1  initial estimates of the parameters alpha.
%                If alpha = [], Varpro assumes that the problem
%                is linear and returns estimates of the c parameters.
%   n            number of linear parameters c
%   ada          a function handle, described below.
%   lb    q x 1  lower bounds on the parameters alpha. 
%   (Optional)   (Omit this argument or use [] if there are
%                no lower bounds.)
%   ub    q x 1  upper bounds on the parameters alpha. 
%   (Optional)   (Omit this argument or use [] if there are
%                no upper bounds.)
%   options      The Matlab optimization parameter structure,
%   (Optional)   set by "optimset", to control convergence
%                tolerances, maximum number of function evaluations,
%                information displayed in the command window, etc. 
%                To use default options, omit this parameter.
%                To determine the default options, type
%                    options = optimset('lsqnonlin')
%                After doing this, the defaults can be modified;
%                to modify the display option, for example, type
%                    options = optimset('lsqnonlin');
%                    options = optimset(options,'Display','iter');
%
% On Output:
%
%  alpha  q x 1  estimates of the nonlinear parameters.
%  c      n x 1  estimates of the linear parameters.
%  wresid m x 1  weighted residual vector, with i-th component
%                w(i) * (y(i) - eta(alpha, c, t(i,:))).
%  wresid_norm   norm of wresid.
%  y_est  m x 1  the model estimates = eta(alpha, c, t(i,:))
%  Regression    a structure containing diagnostics about the model fit.
%                **************************************************
%                *                C a u t i o n:                  *
%                *   The theory that makes statistical            *
%                *   diagnostics useful is derived for            *
%                *   linear regression, with no upper- or         *
%                *   lower-bounds on variables.                   *
%                *   The relevance of these quantities to our     *
%                *   nonlinear model is determined by how well    *
%                *   the linearized model (Taylor series model)   *
%                *         eta(alpha_true, c_true)                *
%                *            +  Phi * (c  - c_true)              *
%                *            + dPhi * (alpha - alpha_true)       *
%                *   fits the data in the neighborhood of the     *
%                *   true values for alpha and c, where Phi       *
%                *   and dPhi contain the partial derivatives     *
%                *   of the model with respect to the c and       *
%                *   alpha parameters, respectively, and are      *
%                *   defined in ada.                              *
%                **************************************************
%  Regression.report:  
%                This structure includes information on the solution
%                process, including the number of iterations, 
%                termination criterion, and exitflag from lsqnonlin.
%                (Type 'help lsqnonlin' to see the exit conditions.)
%                Regression.report.rank is the computed rank of the 
%                matrix for the linear subproblem.  If this equals
%                n, then the linear coefficients are well-determined.
%                If it is less than n, then although the model might
%                fit the data well, other linear coefficients might
%                give just as good a fit.
%  Regression.sigma:        
%                The estimate of the standard deviation: the
%                weighted residual norm divided by the square root
%                of the number of degrees of freedom, m-n-q.
%                This is also called the "regression standard error".
%  Regression.RMS:
%                The "residual mean square" is equal to sigma^2:
%                RMS = wresid_norm^2 / (m-n-q)
%  Regression.coef_determ:
%                The "coefficient of determination" for the fit,
%                also called the square of the multiple correlation
%                coefficient, is sometimes called R^2.
%                It is computed as 1 - wresid_norm^2/CTSS,
%                where the "corrected total sum of squares"
%                CTSS is the norm-squared of W*(y-y_bar),
%                and the entries of y_bar are all equal to
%                (the sum of W_i^2 y_i) divided by (the sum of W_i^2).
%                A value of .95, for example, indicates that 95
%                percent of the CTSS is accounted for by the fit.
%
%  Regression.CovMx: (n+q) x (n+q)
%                This is the estimated variance/covariance matrix for
%                the parameters.  The linear parameters c are ordered
%                first, followed by the nonlinear parameters alpha.
%                This is empty if dPhi is not computed by ada.
%  Regression.CorMx: (n+q) x (n+q)
%                This is the estimated correlation matrix for the 
%                parameters.  The linear parameters c are ordered
%                first, followed by the nonlinear parameters alpha.
%                This is empty if dPhi is not computed by ada.
%  Regression.std_param: (n+q) x 1
%                This vector contains the estimate of the standard 
%                deviation for each parameter.
%                The k-th element is the square root of the k-th main 
%                diagonal element of Regression.CovMx.
%                This is empty if dPhi is not computed by ada.
%  Regression.t_ratio:   (n+q) x 1
%                The t-ratio for each parameter is equal to the
%                parameter estimate divided by its standard deviation.
%                (linear parameters c first, followed by alpha)
%                This is empty if dPhi is not computed by ada.
%  Regression.standardized_wresid:
%                The k-th component of the "standardized weighted 
%                residual" is the k-th component of the weighted 
%                residual divided by its standard deviation.
%                This is empty if dPhi is not computed by ada.
%
%---------------------------------------------------------------
% Specification of the function ada, which computes information
% related to Phi:
%
%   function [Phi,dPhi,Ind] = ada(alpha)
%
%     This function computes Phi and, if possible, dPhi.
%
%     On Input: 
%
%        alpha q x 1    contains the current value of the alpha parameters.
%
%        Note:  If more input arguments are needed, use the standard
%               Matlab syntax to accomplish this.  For example, if
%               the input arguments to ada are t, z, and alpha, then
%               before calling varpro, initialize t and z, and in calling 
%               varpro, replace "@ada" by "@(alpha)ada(t,z,alpha)".
%
%     On Output:
%
%        Phi   m x n1   where Phi(i,j) = phi_j(alpha,t_i).
%                       (n1 = n if there is no extra term; 
%                        n1 = n+1 if an extra term is used)
%        dPhi  m x p    where the columns contain partial derivative
%                       information for Phi and p is the number of 
%                       columns in Ind 
%                       (or dPhi = [] if derivatives are not available).
%        Ind   2 x p    Column k of dPhi contains the partial
%                       derivative of phi_j with respect to alpha_i, 
%                       evaluated at the current value of alpha, 
%                       where j = Ind(1,k) and i = Ind(2,k).
%                       Columns of dPhi that are always zero, independent
%                       of alpha, need not be stored. 
%        Example:  if  phi_1 is a function of alpha_2 and alpha_3, 
%                  and phi_2 is a function of alpha_1 and alpha_2, then 
%                  we can set
%                          Ind = [ 1 1 2 2
%                                  2 3 1 2 ]
%                  In this case, the p=4 columns of dPhi contain
%                          d phi_1 / d alpha_2,
%                          d phi_1 / d alpha_3,
%                          d phi_2 / d alpha_1,
%                          d phi_2 / d alpha_2,
%                  evaluated at each t_i.
%                  There are no restrictions on how the columns of
%                  dPhi are ordered, as long as Ind correctly specifies
%                  the ordering.
%
%        If derivatives dPhi are not available, then set dPhi = Ind = [].
%      
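%     Example (a minimal sketch; the function name expada and the
%     vectors t and y are illustrative, not part of varpro):
%     to fit the two-exponential model
%
%         eta = c_1*exp(-alpha_1*t) + c_2*exp(-alpha_2*t),
%
%     ada could be written as
%
%         function [Phi, dPhi, Ind] = expada(alpha, t)
%            Phi  = [exp(-alpha(1)*t), exp(-alpha(2)*t)];
%            dPhi = [-t.*Phi(:,1), -t.*Phi(:,2)];
%            Ind  = [1 2;      % column 1 holds d phi_1 / d alpha_1,
%                    1 2];     % column 2 holds d phi_2 / d alpha_2
%         end
%
%     and varpro could then be called with unit weights and initial
%     guesses alpha = [1;2] as
%
%         [alpha, c] = varpro(y, ones(size(y)), [1;2], 2, ...
%                             @(a) expada(a, t));
%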
%---------------------------------------------------------------
%
%  Varpro calls lsqnonlin, which solves a nonlinear least squares
%  problem with upper and lower bounds on the variables.  What
%  distinguishes varpro from lsqnonlin is that, for efficiency and
%  reliability, varpro has lsqnonlin iterate only on the
%  nonlinear parameters.  Given the information in Phi and dPhi, this 
%  requires an intricate but inexpensive computation of partial 
%  derivatives, and this is handled by the varpro function formJacobian.
%
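%  In outline: for each trial value of alpha, the linear subproblem
%
%      min_c || W*(y - Phi(alpha)*c) ||
%
%  is solved exactly (via the SVD, inside formJacobian), so lsqnonlin
%  sees only the reduced residual
%
%      r(alpha) = W*(y - Phi(alpha)*c(alpha))
%
%  and its Jacobian with respect to alpha alone.
%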
%  lsqnonlin is in Matlab's Optimization Toolbox.  Another solver
%  can be substituted if the toolbox is not available.
%
%  Any parameters that require upper or lower bounds should be put in
%  alpha, not c, even if they appear linearly in the model.
%  
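%  For illustration (a sketch; boundedada, t, y, and w are hypothetical):
%  to enforce c_1 >= 0 in the one-term model eta = c_1*exp(-alpha_1*t),
%  move c_1 into alpha as alpha(2), set n = 0, and supply the whole term
%  as the extra function phi_{n+1}:
%
%      function [Phi, dPhi, Ind] = boundedada(alpha, t)
%         e    = exp(-alpha(1)*t);
%         Phi  = alpha(2)*e;             % the extra term phi_{n+1}
%         dPhi = [-alpha(2)*t.*e, e];    % d/d alpha_1  and  d/d alpha_2
%         Ind  = [1 1;
%                 1 2];
%      end
%
%      [alpha, c] = varpro(y, w, [1; 1], 0, @(a) boundedada(a, t), ...
%                          [-Inf; 0], []);
%  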
%  The original Fortran implementation of the variable projection 
%  algorithm (ref. 2) was modified in 1977 by John Bolstad 
%  Computer Science Department, Serra House, Stanford University,
%  using ideas of Linda Kaufman (ref. 5) to speed up the 
%  computation of derivatives.  He also allowed weights on
%  the observations, and computed the covariance matrix.
%  Our Matlab version takes advantage of 30 years of improvements
%  in programming languages and minimization algorithms.  
%  In this version, we also allow upper and lower bounds on the 
%  nonlinear parameters.
%
%  It is hoped that this implementation will be of use to Matlab
%  users, but also that its simplicity will make it easier for the
%  algorithm to be implemented in other languages.
%
%  This program is documented in
%  Dianne P. O'Leary and Bert W. Rust,
%  Variable Projection for Nonlinear Least Squares Problems,
%  Computational Optimization and Applications (2012)
%  doi 10.1007/s10589-012-9492-9.
%
%  US National Inst. of Standards and Technology, 2010.
%
%  Main reference:
%
%    0.  Gene H. Golub and V. Pereyra, 'Separable nonlinear least   
%        squares: the variable projection method and its applications,'
%        Inverse Problems 19, R1-R26 (2003).
%         
%  See also these papers, cited in John Bolstad's Fortran code:
%                                                                       
%    1.  Gene H. Golub and V. Pereyra, 'The differentiation of      
%        pseudo-inverses and nonlinear least squares problems whose 
%        variables separate,' SIAM J. Numer. Anal. 10, 413-432      
%         (1973).                                                    
%    2.  ------, same title, Stanford C.S. Report 72-261, Feb. 1972.
%    3.  Michael R. Osborne, 'Some aspects of non-linear least      
%        squares calculations,' in Lootsma, Ed., 'Numerical Methods 
%        for Non-Linear Optimization,' Academic Press, London, 1972.
%    4.  Fred Krogh, 'Efficient implementation of a variable projection
%        algorithm for nonlinear least squares problems,'           
%        Comm. ACM 17:3, 167-169 (March, 1974).                    
%    5.  Linda Kaufman, 'A variable projection method for solving  
%        separable nonlinear least squares problems', B.I.T. 15,   
%        49-57 (1975).                                             
%    6.  C. Lawson and R. Hanson, Solving Least Squares Problems,
%        Prentice-Hall, Englewood Cliffs, N. J., 1974.          
%
%  These books discuss the statistical background:
%
%    7.  David A. Belsley, Edwin Kuh, and Roy E. Welsch, Regression 
%        Diagnostics, John Wiley & Sons, New York, 1980, Chap. 2.
%    8.  G.A.F. Seber and C.J. Wild, Nonlinear Regression,
%        John Wiley & Sons, New York, 1989, Sec. 2.1, 5.1, and 5.2.
%
%  Dianne P. O'Leary, NIST and University of Maryland, February 2011.
%  Bert W. Rust,      NIST                             February 2011.
%  Comments updated 07-2012.
%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

global mydebug myneglect   % test neglect of extra term in Jacobian

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%
% Initialization: Check input, set default parameters and options.
%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

[m,ell] = size(y);      % m = number of observations
[m1,ell1] = size(w);

if (m1 ~= m) || (ell > 1) || (ell1 > 1)
   error('y and w must be column vectors of the same length')
end

[q,ell] = size(alpha);   % q = number of nonlinear parameters

if (ell > 1)
   error('alpha must be a column vector containing initial guesses for nonlinear parameters')
end

if (nargin < 6)  
    lb = [];
else
    [q1,ell] = size(lb);
    if (q1 > 0) && (ell > 0)
       if (q1 ~= q) || (ell > 1)
          error('lb must be empty or a column vector of the same length as alpha')
       end
    end
end

if (nargin < 7)
    ub = [];
else
    [q1,ell] = size(ub);
    if (q1 > 0) && (ell > 0)
       if (q1 ~= q) || (ell > 1)
          error('ub must be empty or a column vector of the same length as alpha')
       end
    end
end

if (nargin < 8)
    options = optimset('lsqnonlin');  
end

if (~strcmp(options.Display,'off'))
    disp(sprintf('\n-------------------'))
    disp(sprintf('VARPRO is beginning.'))
end

W = spdiags(w,0,m,m);  % Create an m x m diagonal matrix from the vector w

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%
% Make the first call to ada and do some error checking.
%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

[Phi, dPhi, Ind] = feval(ada, alpha);

[m1,n1] = size(Phi);     % n1 = number of basis functions Phi.

[m2,n2] = size(dPhi);
[ell,n3] = size(Ind);

if (m1 ~= m2) && (m2 > 0)
   error('In user function ada: Phi and dPhi must have the same number of rows.')
end

if (n1 < n) || (n1 > n+1)
   error('In user function ada: The number of columns in Phi must be n or n+1.')
end

if (n2 > 0) && (ell ~= 2)
   error('In user function ada: Ind must have two rows.')
end

if (n2 > 0) && (n2 ~= n3)
   error('In user function ada: dPhi and Ind must have the same number of columns.')
end

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%
% Solve the least squares problem using lsqnonlin or, if there
% are no nonlinear parameters, using the SVD procedure in formJacobian.
%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

if (q > 0) % The problem is nonlinear.
   
   if ~isempty(dPhi)
      options = optimset(options,'Jacobian','on');
   end
 
   [alpha, wresid_norm2, wresid, exitflag,output] = ...
                  lsqnonlin(@f_lsq, alpha, lb, ub, options);
   [r, Jacobian, Phi, dPhi, y_est, rank] = f_lsq(alpha);
   wresid_norm = sqrt(wresid_norm2);
   Regression.report = output;
   Regression.report.rank = rank;
   Regression.report.exitflag = exitflag;

else       % The problem is linear.

   if (~strcmp(options.Display,'off'))
      disp(sprintf('VARPRO problem is linear, since length(alpha)=0.'))
   end

   [Jacobian, c, wresid, y_est, Regression.report.rank] =  ...
                                       formJacobian(alpha, Phi, dPhi);
   wresid_norm = norm(wresid);
   wresid_norm2 = wresid_norm * wresid_norm;

end % if q > 0
        
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%
%  Compute some statistical diagnostics for the solution.
%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

% Calculate the sample variance: the norm-squared of the weighted
%    residual divided by the number of degrees of freedom, m-n-q.

sigma2 = wresid_norm2 / (m-n-q);

% Compute  Regression.sigma:        
%                square-root of weighted residual norm squared divided 
%                by number of degrees of freedom.

Regression.sigma = sqrt(sigma2);

% Compute  Regression.coef_determ:
%                The coefficient of determination for the fit,
%                also called the square of the multiple correlation
%                coefficient, or R^2.
%                It is computed as 1 - wresid_norm^2/CTSS,
%                where the "corrected total sum of squares"
%                CTSS is the norm-squared of W*(y-y_bar),
%                and the entries of y_bar are all equal to
%                (the sum of W_i^2 y_i) divided by (the sum of W_i^2).

y_bar = sum(w.^2 .* y) / sum(w.^2);
CTSS = norm(W * (y - y_bar))^2;
Regression.coef_determ = 1 - wresid_norm^2 / CTSS;

% Compute  Regression.RMS = sigma^2:
%                the norm-squared of the weighted residual divided
%                by the number of degrees of freedom:
%                RMS = wresid_norm^2 / (m-n-q)

Regression.RMS = sigma2;                        

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%
%  Compute some additional statistical diagnostics for the 
%  solution, if the user requested it.
%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

if (nargout==6) 

   if (isempty(dPhi))

      Regression.CovMx = [];
      Regression.CorMx = [];
      Regression.std_param = [];
      Regression.t_ratio = [];
      Regression.standardized_wresid = [];

   else

% Calculate the covariance matrix CovMx, which is sigma^2 times the
% inverse of H'*H, where  
%              H = W*[Phi,J] 
% contains the partial derivatives of  W*eta  with
% respect to the parameters c and alpha, in that order.
    
      [xx,pp] = size(dPhi);
      J = zeros(m,q);
      for kk = 1:pp,
         j = Ind(1,kk);
         i = Ind(2,kk);
         if (j > n)
             J(:,i) = J(:,i) + dPhi(:,kk);
         else
             J(:,i) = J(:,i) + c(j) * dPhi(:,kk);
         end
      end
      [Qj,Rj,Pj] = qr(W*[Phi(:,1:n),J], 0);     % Uses compact pivoted QR.
      T2 = Rj \ (eye(size(Rj,1)));
      CovMx = sigma2 * T2 * T2';
      Regression.CovMx(Pj,Pj) = CovMx;  % Undo the pivoting permutation.
   
% Compute  Regression.CorMx:        
%                estimated correlation matrix (n+q) x (n+q) for the
%                parameters.  The linear parameters are ordered
%                first, followed by the nonlinear parameters.

      d = 1 ./ sqrt(diag(Regression.CovMx));
      D = spdiags(d,0,n+q,n+q);
      Regression.CorMx = D * Regression.CovMx * D;

% Compute  Regression.std_param:
%                The k-th element is the square root of the k-th main 
%                diagonal element of CovMx. 

      Regression.std_param = sqrt(diag(Regression.CovMx));

% Compute  Regression.t_ratio:
%                parameter estimates divided by their standard deviations.

      alpha = reshape(alpha, q, 1);
      Regression.t_ratio = [c; alpha] .* d;

% Compute  Regression.standardized_wresid:
%                The k-th component is the k-th component of the
%                weighted residual, divided by its standard deviation.
%                Let X = W*[Phi,J], 
%                    h(k) = k-th main diagonal element of covariance
%                           matrix for wresid
%                         = k-th main diagonal element of X*inv(X'*X)*X' 
%                         = k-th main diagonal element of Qj*Qj'.
%                Then the standard deviation is estimated by
%                sigma*sqrt(1-h(k)).

      temp = sum(Qj.^2, 2);   % temp(k) = h(k), the k-th diagonal element of Qj*Qj'
      Regression.standardized_wresid = wresid ./(Regression.sigma*sqrt(1-temp));

   end
end

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%
% End of statistical diagnostics computations.
% Print some final information if desired.
%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%


if (~strcmp(options.Display,'off'))
  disp(sprintf(' '))
  disp(sprintf('VARPRO Results:'))
  disp(sprintf(' Linear Parameters:'))
  disp(sprintf(' %12.6f ',c))
  
  disp(sprintf(' Nonlinear Parameters:'))
  disp(sprintf(' %12.6f ',alpha))
  disp(sprintf(' '))
  
  disp(sprintf(' Norm-squared of weighted residual  = %12.6f',wresid_norm2))
  disp(sprintf(' Norm-squared of data vector        = %12.6f',norm(w.*y)^2))
  disp(sprintf(' Norm         of weighted residual  = %12.6f',wresid_norm))
  disp(sprintf(' Norm         of data vector        = %12.6f',norm(w.*y)))
  disp(sprintf(' Expected error of observations     = %12.6f',Regression.sigma))
  disp(sprintf(' Coefficient of determination       = %12.6f',Regression.coef_determ))
  disp(sprintf('VARPRO is finished.'))
  disp(sprintf('-------------------\n'))
end
    
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%
% The computation is now completed.  
%
% varpro uses the following two functions, f_lsq and formJacobian.
%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%


%--------------------------- Beginning of f_lsq --------------------------

function [wr_trial, Jacobian, Phi_trial, dPhi_trial, y_est,myrank] = ...
                                                        f_lsq(alpha_trial)

% function [wr_trial,Jacobian,Phi_trial,dPhi_trial,y_est,myrank] = ...
%                                                       f_lsq(alpha_trial)
%
% This function is used by lsqnonlin to compute 
%          wr_trial   the current weighted residual
%          Jacobian   the Jacobian matrix for the nonlinear parameters
% It also computes
%          Phi_trial  the current Phi matrix
%          dPhi_trial the partial derivatives of Phi_trial (if available).
%          y_est      the model estimates of y
%          myrank     the rank of the matrix W*Phi in the linear problem.
%
% It uses the user-supplied function ada and the Varpro function formJacobian.

[Phi_trial, dPhi_trial, Ind] = feval(ada, alpha_trial);

[Jacobian, c, wr_trial, y_est,myrank] = ...
          formJacobian(alpha_trial, Phi_trial, dPhi_trial);

end %--------------------------- End of f_lsq ---------------------------

%----------------------- Beginning of formJacobian ----------------------

function [Jacobian, c, wresid, y_est,myrank] = formJacobian(alpha, Phi, dPhi)

% function [Jacobian, c, wresid, y_est,myrank] =  formJacobian(alpha, Phi, dPhi)
%
% This function computes the Jacobian, the linear parameters c,
% and the residual for the model with nonlinear parameters alpha.
% It is used by the functions Varpro and f_lsq.
%
% Notation: there are m data observations
%                     n1 basis functions (columns of Phi)
%                     n linear parameters c
%                       (n = n1, or n = n1 - 1)
%                     q nonlinear parameters alpha
%                     p nonzero columns of partial derivatives of Phi 
%
% Input:
%
%      alpha  q x 1   the nonlinear parameters,
%      Phi    m x n1  the basis functions Phi(alpha),
%      dPhi   m x p   the partial derivatives of Phi
%
% The variables W, y, q, m, n1, and n are shared with the enclosing
% function varpro through nested-function scoping.
%
% Output: 
%
%      Jacobian  m x q the Jacobian matrix, with Jacobian(i,k) = partial
%                      derivative of wresid(i) with respect to alpha(k).
%      c         n x 1 the optimal linear parameters for this choice of alpha.
%      wresid    m x 1 the weighted residual = W*(y - Phi*c).
%      y_est     m x 1 the model estimates = Phi*c (plus the extra
%                      term phi_{n+1}, if present).
%      myrank    1 x 1 the rank of the matrix W*Phi.


% First we determine the optimal linear parameters c for
% the given values of alpha, and the resulting residual.
%
% We use the singular value decomposition to solve the linear least
% squares problem
%
%    min_{c} || W resid ||.
%       resid =  y - Phi * c.
%
% If W*Phi has any singular value less than m*eps times its largest
% singular value, those small singular values are treated as zero.

[U,S,V] = svd(W*Phi(:,1:n));

% Three cases: Usually n > 1, but n = 1 and n = 0 require
% special handling (one or no linear parameters).

if (n > 1)
  s = diag(S);
elseif (n==1)
  s = S;
else    %  n = 0: no linear parameters, so no linear subproblem to solve.
   if isempty(Ind)
      Jacobian = [];
   else
      Jacobian = zeros(length(y),length(alpha));
      Jacobian(:,Ind(2,:)) = -W*dPhi;
   end
   c = [];
   y_est = Phi;
   wresid = W * (y - y_est);
   myrank = 1;
   return
end

tol = m * eps;
myrank = sum(s > tol*s(1) ); % number of singular values > tol*norm(W*Phi)
s = s(1:myrank);

if (myrank < n) && (~strcmp(options.Display,'off'))
   disp(sprintf('Warning from VARPRO:'))
   disp(sprintf('   The linear parameters are currently not well-determined.'))
   disp(sprintf('   The rank of the matrix in the subproblem is %d',myrank))
   disp(sprintf('   which is less than the n=%d linear parameters.',n))
end

yuse = y;
if (n < n1)
   yuse  = y - Phi(:,n1); % extra function Phi(:,n+1)
end
temp  = U(:,1:myrank)' * (W*yuse);    
c = V(:,1:myrank) * (temp./s);
y_est = Phi(:,1:n) * c;
wresid = W * (yuse - y_est);
if (n < n1)
   y_est = y_est + Phi(:,n1);
end

if (q == 0) || (isempty(dPhi))
   Jacobian = [];
   return
end

% Second, we compute the Jacobian.
% There are two pieces, which we call Jac1 and Jac2,
% with Jacobian = - (Jac1 + Jac2).
%
% The formula for Jac1 is  P D(W*Phi) pinv(W*Phi) y,
%             and Jac2 is  (pinv(W*Phi))^T (P D(W*Phi))^T y,
% where  P           is the projection onto the orthogonal complement
%                       of the range of W*Phi,
%        D(W*Phi)    is the m x n x q tensor of derivatives of W*Phi,
%        pinv(W*Phi) is the pseudo-inverse of W*Phi.
%  (See Golub&Pereyra (1973) equation (5.4).  We use their notational  
%   conventions for multiplications by tensors.)
%
% Golub&Pereyra (2003), p. R5 break these formulas down by columns:
%     The j-th column of Jac1 is P D_j pinv(W*Phi) y
%                             =  P D_j c             
% and the j-th column of Jac2 is (P D_j pinv(W*Phi))^T y,
%                             =  (pinv(W*Phi))^T D_j^T P^T y
%                             =  (pinv(W*Phi))^T D_j^T wresid.
% where D_j is the m x n matrix of partial derivatives of W*Phi
%     with respect to alpha(j).

% We begin the computation by precomputing 
%       WdPhi, which contains the derivatives of W*Phi, and 
%       WdPhi_r, which contains WdPhi' * wresid.

WdPhi = W * dPhi;
WdPhi_r = WdPhi' * wresid;
Jac1 = zeros(m, q);
T2 = zeros(n1, q);
ctemp = c;
if (n1 > n)
   ctemp = [ctemp; 1];
end

%   Now we work column-by-column, for j=1:q.
%
%   We form Jac1 = D(W*Phi) ctemp.
%   After the loop, this matrix is multiplied by 
%        P = U(:,myrank+1:m)*U(:,myrank+1:m)'
%   to complete the computation.
% 
%   We also form  T2 = (D_j(W*Phi))^T wresid  by unpacking
%   the information in WdPhi_r, using Ind.
%   After the loop, T2 is multiplied by the pseudoinverse
%       (pinv(W*Phi))^T = U(:,1:myrank) * diag(1./s) * V(:,1:myrank)'
%   to complete the computation of Jac2.
%   Note: if n1 > n, last row of T2 is not needed.   

for j=1:q,                            % for each nonlinear parameter alpha(j)
    range = find(Ind(2,:)==j);        % columns of WdPhi relevant to alpha(j)
    indrows = Ind(1,range);           % relevant rows of ctemp
    Jac1(:,j) = WdPhi(:,range) * ctemp(indrows);      
    T2(indrows,j) = WdPhi_r(range);
end


Jac1 = U(:,myrank+1:m) * (U(:,myrank+1:m)' * Jac1);

T2 = diag(1 ./ s(1:myrank)) * (V(:,1:myrank)' * T2(1:n,:));
Jac2 = U(:,1:myrank) * T2;

Jacobian = -(Jac1 + Jac2);

if (mydebug)
   disp(sprintf('VARPRO norm(neglected Jacobian)/norm(Jacobian) = %e',...
         norm(Jac2,'fro')/norm(Jacobian,'fro') ))
   if (myneglect)
       disp('neglecting')
       Jacobian = -Jac1;
   end
end

end %-------------------------- End of formJacobian ----------------------

end %------------------------------ End of varpro ------------------------