ThreeB 1.1
conjugate_gradient_only.h
#ifndef CONJUGATE_GRADIENT_ONLY
#define CONJUGATE_GRADIENT_ONLY
#include <TooN/TooN.h>
#include <algorithm>
#include <cmath>
#include <limits>
#include <utility>
#include <cstdlib>

///Class for performing optimization with Conjugate Gradient, where only the derivatives are available.
///@ingroup gStorm
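///
///A minimal usage sketch (the quadratic objective and its gradient functor
///below are illustrative assumptions, not part of this class):
///@code
/// //Gradient of f(x) = |x - (1, 2)|^2, i.e. 2*(x - (1, 2))
/// struct QuadraticGradient
/// {
///     TooN::Vector<2> operator()(const TooN::Vector<2>& x) const
///     {
///         return 2 * (x - TooN::makeVector(1.0, 2.0));
///     }
/// };
///
/// QuadraticGradient df;
/// TooN::Vector<2> start    = TooN::makeVector(0.0, 0.0);
/// TooN::Vector<2> max_move = TooN::makeVector(0.1, 0.1); //per-axis line search cap
/// ConjugateGradientOnly<2> cg(start, df, max_move);
/// while(cg.iterate(df))
/// {}
/// //cg.x now holds the best known minimizer
///@endcode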
template<int Size=-1, class Precision=double> struct ConjugateGradientOnly
{
    const int size;      ///< Dimensionality of the space.
    TooN::Vector<Size> g;      ///< Gradient vector used by the next call to iterate()
    TooN::Vector<Size> new_g;  ///< The new gradient set at the end of iterate()
    TooN::Vector<Size> h;      ///< Conjugate vector to be searched along in the next call to iterate()
    TooN::Vector<Size> minus_h;///< Negative of h; stored because it must be passed into a function by reference (so it cannot be a temporary)
    TooN::Vector<Size> old_g;  ///< Gradient vector used to compute \f$h\f$ in the last call to iterate()
    TooN::Vector<Size> old_h;  ///< Conjugate vector searched along in the last call to iterate()
    TooN::Vector<Size> x;      ///< Current position (best known point)
    TooN::Vector<Size> old_x;  ///< Previous point (not set at construction)

    Precision tolerance; ///< Tolerance used to determine if the optimization is complete. Defaults to the square root of machine precision.
    int       max_iterations; ///< Maximum number of iterations. Defaults to \c size * 100.

    TooN::Vector<Size> delta_max; ///< Maximum distance to travel along each axis in the line search

    Precision linesearch_tolerance; ///< Tolerance used to determine if the line search is complete. Defaults to the square root of machine precision.
    Precision linesearch_epsilon;   ///< Additive term in the tolerance to prevent excessive iterations if \f$x_\mathrm{optimal} = 0\f$. Known as \c ZEPS in Numerical Recipes. Defaults to 1e-20.


    int iterations; ///< Number of iterations performed

    ///Initialize the ConjugateGradientOnly class with sensible values.
    ///@param start Starting point, \e x
    ///@param deriv Function to compute \f$\nabla f(x)\f$
    ///@param d Maximum allowed movement (delta) along each axis
    template<class Deriv> ConjugateGradientOnly(const TooN::Vector<Size>& start, const Deriv& deriv, const TooN::Vector<Size>& d)
    : size(start.size()),
      g(size),new_g(size),h(size),minus_h(size),old_g(size),old_h(size),x(start),old_x(size),delta_max(d)
    {
        init(start, deriv);
    }

    ///Initialise given a starting point and the derivative at the starting point
    ///@param start starting point
    ///@param deriv derivative at starting point
    template<int S, class P, class B> void init(const TooN::Vector<Size>& start, const TooN::Vector<S, P, B>& deriv)
    {
        using std::numeric_limits;
        using std::sqrt;

        x = start;

        //Start with the conjugate direction aligned with
        //the gradient
        g = deriv;
        h = g;
        minus_h=-h;

        tolerance = sqrt(numeric_limits<Precision>::epsilon());
        max_iterations = size * 100;

        linesearch_tolerance = sqrt(numeric_limits<Precision>::epsilon());
        linesearch_epsilon = 1e-20;

        iterations=0;
    }

    ///Initialise given a starting point and a functor for computing derivatives
    ///@param start starting point
    ///@param deriv derivative computing functor
    template<class Deriv> void init(const TooN::Vector<Size>& start, const Deriv& deriv)
    {
        init(start, deriv(start));
    }
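
    //Illustrative only (the names cg, x0 and df are assumptions, not part of
    //the class): an existing optimizer can be re-initialised from a functor,
    //or from a gradient vector the caller has already computed:
    //    cg.init(x0, df);       //functor overload; calls df(x0) internally
    //    cg.init(x0, df(x0));   //vector overload; uses the supplied gradient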

    ///Perform a line search from the current point (x) along the current
    ///conjugate vector (h). The line search does not make use of function
    ///values, only derivatives. You probably do not want to call this
    ///function directly. See iterate() instead.
    ///This function updates:
    /// - x
    /// - new_g
    /// - iterations
    /// Note that the conjugate direction and gradient are not updated.
    /// If the search direction points against the gradient (i.e. it is not a
    /// descent direction), then essentially a zero sized step is taken.
    /// @param deriv Functor returning the derivative value at a given point.
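    ///
    ///Concretely, with search direction \f$d = -h\f$, the code below finds an
    ///approximate root of the directional derivative
    ///\f$\phi'(s) = \nabla f(x + s d) \cdot d\f$ by bracketing followed by
    ///bisection and false position (a description of this implementation, not
    ///of an externally specified algorithm).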
    template<class Deriv> void find_next_point(const Deriv& deriv)
    {
        iterations++;
        using std::abs;
        using std::min;
        //The search direction is minus_h, so the directional derivative
        //f1 = grad . (-h) should be negative for a descent direction.
        new_g = g;
        double f1 = g * minus_h;
        //If f1 is positive, then the conjugate vector points against the
        //newly computed gradient. This can only happen if there is an error
        //in the computation of the gradient (e.g. we're using a stochastic
        //method), so there is not much to do here except stop.
        if(f1 > 0)
        {
            //Return, so the caller can search in a direction conjugate to the current one.
            return;
        }
        //What step size takes us up to the maximum length
        Precision max_step = HUGE_VAL;
        for(int i=0; i < minus_h.size(); i++)
            max_step = min(max_step, abs(delta_max[i]/h[i]));

        //Taking the maximum step size may result in NaNs.
        //Try the maximum step size, and successively reduce it.
        Precision full_max_step = max_step;

        for(;;)
        {
            new_g = deriv(x + max_step * minus_h);

            if(!TooN::isnan(new_g))
                break;
            else
                max_step /= 2;

            //If the step size gets too small then
            //return as we can't really do anything
            if(max_step < full_max_step * linesearch_tolerance)
                return;
        }

        double f2 = new_g * minus_h;

        //If f2 is still negative, then even the largest allowed step does not
        //bracket the minimum. So take the full step, then keep going in the
        //same direction on the next iteration.
        if(f2 < 0)
        {
            //Take a full step
            x += max_step * minus_h;
            return;
        }

        //Now s1 = 0 and s2 = max_step bracket a root of the directional
        //derivative. The points x1 and x2 are represented by the scalars s1
        //and s2 along minus_h. Find the root using bisection and false position.
        double s1 = 0;
        double s2 = max_step;
        double s_new = max_step;

        int updates[2] = {0,0};

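        //updates[] records which end of the bracket was moved (1 = lower,
        //2 = upper) by the last two updates. False position is attempted only
        //when the two recorded updates moved opposite ends; otherwise the
        //interval is bisected so that the bracket is guaranteed to shrink.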
        while(abs(s1 - s2) > abs(s1 + s2) * linesearch_tolerance + linesearch_epsilon)
        {
            if(updates[0] != updates[1] && updates[0] != 0)
            {
                //Compute the new point using false position.
                s_new = s1 + f1 * (s2 - s1)/(f1 - f2);
                new_g = deriv(x + s_new * minus_h);
                double f_new = new_g*minus_h;

                updates[0] = updates[1];

                if(f_new == 0)
                    break;

                if(f_new < 0)
                {
                    s1 = s_new;
                    f1 = f_new;
                    updates[1] = 1;
                }
                else
                {
                    s2 = s_new;
                    f2 = f_new;
                    updates[1] = 2;
                }
            }
            else
            {
                //Compute the new point using bisection.
                s_new = (s1 + s2) / 2;

                new_g = deriv(x + s_new * minus_h);
                double f_new = new_g*minus_h;

                if(f_new < 0)
                {
                    s1 = s_new;
                    f1 = f_new;
                    updates[0] = 1;
                }
                else
                {
                    s2 = s_new;
                    f2 = f_new;
                    updates[0] = 2;
                }
            }
        }

        //Update the current position. The most recent gradient computation
        //(new_g) was made at s_new, so it corresponds to the new position.
        x += minus_h * s_new;

    }

    ///Check to see if iteration should stop. You probably do not want to use
    ///this function. See iterate() instead. This function updates nothing.
    bool finished()
    {
        using std::abs;
        return iterations > max_iterations || norm_inf(new_g) < tolerance;
    }

    ///After an iteration, update the gradient and conjugate direction using the
    ///Polak-Ribiere equations.
    ///This function updates:
    ///- g
    ///- old_g
    ///- h
    ///- old_h
    ///@param grad The derivatives of the function at \e x
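    ///
    ///With new gradient \f$g\f$ and previous gradient \f$g_\mathrm{old}\f$,
    ///the update computed below is
    ///\f[ \gamma = \frac{g \cdot g - g_\mathrm{old} \cdot g}{g_\mathrm{old} \cdot g_\mathrm{old}}, \qquad h = g + \gamma h_\mathrm{old}. \f]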
    void update_vectors_PR(const TooN::Vector<Size>& grad)
    {
        //Update the gradient and conjugate directions
        old_g = g;
        old_h = h;

        g = grad;
        Precision gamma = (g * g - old_g*g)/(old_g * old_g);
        h = g + gamma * old_h;
        minus_h=-h;
    }

    ///Use this function to iterate over the optimization. Note that after
    ///iterate returns false, g, h, old_g and old_h will not have been
    ///updated.
    ///This function updates:
    /// - x
    /// - new_g
    /// - iterations
    /// - g*
    /// - old_g*
    /// - h*
    /// - old_h*
    /// *'d variables are not updated on the last iteration.
    ///@param deriv Functor to compute derivatives at the specified point.
    ///@return Whether to continue.
    template<class Deriv> bool iterate(const Deriv& deriv)
    {
        find_next_point(deriv);

        if(!finished())
        {
            update_vectors_PR(new_g);
            return true;
        }
        else
            return false;
    }
};


#endif