Newton solver is changed from trust region to line search

author Wei-Lin Chiang <infwinston@gmail.com>

Sun, 19 Jul 2020 10:31:37 +0000 (18:31 +0800)

committer Wei-Lin Chiang <infwinston@gmail.com>

Wed, 22 Jul 2020 07:21:56 +0000 (15:21 +0800)
author Wei-Lin Chiang <infwinston@gmail.com>
Sun, 19 Jul 2020 10:31:37 +0000 (18:31 +0800)
committer Wei-Lin Chiang <infwinston@gmail.com>
Wed, 22 Jul 2020 07:21:56 +0000 (15:21 +0800)
diff --git a/Makefile b/Makefile

index b97c45baafdccf42d7c543c057ee9c1e03614ae1..49e8a988c5b4ac39658dbb24fed10e1a45aa9781 100644 (file)
--- a/Makefile
+++ b/Makefile
@@ -8,22 +8,22 @@ OS = $(shell uname)
  
  all: train predict
  
-lib: linear.o tron.o blas/blas.a
+lib: linear.o newton.o blas/blas.a
         if [ "$(OS)" = "Darwin" ]; then \
                 SHARED_LIB_FLAG="-dynamiclib -Wl,-install_name,liblinear.so.$(SHVER)"; \
         else \
                 SHARED_LIB_FLAG="-shared -Wl,-soname,liblinear.so.$(SHVER)"; \
         fi; \
-       $(CXX) $${SHARED_LIB_FLAG} linear.o tron.o blas/blas.a -o liblinear.so.$(SHVER)
+       $(CXX) $${SHARED_LIB_FLAG} linear.o newton.o blas/blas.a -o liblinear.so.$(SHVER)
  
-train: tron.o linear.o train.c blas/blas.a
-       $(CXX) $(CFLAGS) -o train train.c tron.o linear.o $(LIBS)
+train: newton.o linear.o train.c blas/blas.a
+       $(CXX) $(CFLAGS) -o train train.c newton.o linear.o $(LIBS)
  
-predict: tron.o linear.o predict.c blas/blas.a
-       $(CXX) $(CFLAGS) -o predict predict.c tron.o linear.o $(LIBS)
+predict: newton.o linear.o predict.c blas/blas.a
+       $(CXX) $(CFLAGS) -o predict predict.c newton.o linear.o $(LIBS)
  
-tron.o: tron.cpp tron.h
-       $(CXX) $(CFLAGS) -c -o tron.o tron.cpp
+newton.o: newton.cpp newton.h
+       $(CXX) $(CFLAGS) -c -o newton.o newton.cpp
  
  linear.o: linear.cpp linear.h
         $(CXX) $(CFLAGS) -c -o linear.o linear.cpp
@@ -34,4 +34,4 @@ blas/blas.a: blas/*.c blas/*.h
  clean:
         make -C blas clean
         make -C matlab clean
-       rm -f *~ tron.o linear.o train predict liblinear.so.$(SHVER)
+       rm -f *~ newton.o linear.o train predict liblinear.so.$(SHVER)
diff --git a/linear.cpp b/linear.cpp

index 2335406f6fa44878ff6761dbb2d9383f2d9d68f1..9ef2b9e1ae0b0f94f2b7a1cde53952fe87cd760a 100644 (file)
--- a/linear.cpp
+++ b/linear.cpp
@@ -5,7 +5,7 @@
  #include <stdarg.h>
  #include <locale.h>
  #include "linear.h"
-#include "tron.h"
+#include "newton.h"
  int liblinear_version = LIBLINEAR_VERSION;
  typedef signed char schar;
  template <class T> static inline void swap(T& x, T& y) { T t=x; x=y; y=t; }
@@ -102,74 +102,195 @@ public:
         }
  };
  
-class l2r_lr_fun: public function
+// L2-regularized empirical risk minimization
+// min_w w^Tw/2 + \sum C_i \xi(w^Tx_i), where \xi() is the loss
+
+class l2r_erm_fun: public function
  {
  public:
-       l2r_lr_fun(const problem *prob, const parameter *param, double *C);
-       ~l2r_lr_fun();
+       l2r_erm_fun(const problem *prob, const parameter *param, double *C);
+       ~l2r_erm_fun();
  
         double fun(double *w);
-       void grad(double *w, double *g);
-       void Hv(double *s, double *Hs);
-
+       double linesearch_and_update(double *w, double *d, double *f, double *g, double alpha);
         int get_nr_variable(void);
-       void get_diag_preconditioner(double *M);
  
-private:
+protected:
+       virtual double C_times_loss(int i, double wx_i) = 0;
         void Xv(double *v, double *Xv);
         void XTv(double *v, double *XTv);
  
         double *C;
-       double *z;
-       double *D;
         const problem *prob;
+       double *wx;
+       double *tmp; // a working array
+       double wTw;
         int regularize_bias;
  };
  
-l2r_lr_fun::l2r_lr_fun(const problem *prob, const parameter *param, double *C)
+l2r_erm_fun::l2r_erm_fun(const problem *prob, const parameter *param, double *C)
  {
         int l=prob->l;
  
         this->prob = prob;
  
-       z = new double[l];
-       D = new double[l];
+       wx = new double[l];
+       tmp = new double[l];
         this->C = C;
         this->regularize_bias = param->regularize_bias;
  }
  
-l2r_lr_fun::~l2r_lr_fun()
+l2r_erm_fun::~l2r_erm_fun()
  {
-       delete[] z;
-       delete[] D;
+       delete[] wx;
+       delete[] tmp;
  }
  
-
-double l2r_lr_fun::fun(double *w)
+double l2r_erm_fun::fun(double *w)
  {
         int i;
         double f=0;
-       double *y=prob->y;
         int l=prob->l;
         int w_size=get_nr_variable();
  
-       Xv(w, z);
+       wTw = 0;
+       Xv(w, wx);
  
         for(i=0;i<w_size;i++)
-               f += w[i]*w[i];
+               wTw += w[i]*w[i];
         if(regularize_bias == 0)
-               f -= w[w_size-1]*w[w_size-1];
-       f /= 2.0;
+               wTw -= w[w_size-1]*w[w_size-1];
         for(i=0;i<l;i++)
+               f += C_times_loss(i, wx[i]);
+       f = f + 0.5 * wTw;
+
+       return(f);
+}
+
+int l2r_erm_fun::get_nr_variable(void)
+{
+       return prob->n;
+}
+
+// On entry *f must be the function value of w
+// On exit w is updated and *f is the new function value
+double l2r_erm_fun::linesearch_and_update(double *w, double *s, double *f, double *g, double alpha)
+{
+       int i;
+       int l = prob->l;
+       double sTs = 0;
+       double wTs = 0;
+       double gTs = 0;
+       double eta = 0.01;
+       int w_size = get_nr_variable();
+       int max_num_linesearch = 20;
+       double fold = *f;
+       Xv(s, tmp);
+
+       for (i=0;i<w_size;i++)
+       {
+               sTs += s[i] * s[i];
+               wTs += s[i] * w[i];
+               gTs += s[i] * g[i];
+       }
+       if(regularize_bias == 0)
+       {
+               // bias not used in calculating (w + \alpha s)^T (w + \alpha s)
+               sTs -= s[w_size-1] * s[w_size-1];
+               wTs -= s[w_size-1] * w[w_size-1];
+       }
+
+       int num_linesearch = 0;
+       for(num_linesearch=0; num_linesearch < max_num_linesearch; num_linesearch++)
         {
-               double yz = y[i]*z[i];
-               if (yz >= 0)
-                       f += C[i]*log(1 + exp(-yz));
+               double loss = 0;
+               for(i=0;i<l;i++)
+               {
+                       double inner_product = tmp[i] * alpha + wx[i];
+                       loss += C_times_loss(i, inner_product);
+               }
+               *f = loss + (alpha * alpha * sTs + wTw) / 2.0 + alpha * wTs;
+               if (*f - fold <= eta * alpha * gTs)
+               {
+                       for (i=0;i<l;i++)
+                               wx[i] += alpha * tmp[i];
+                       break;
+               }
                 else
-                       f += C[i]*(-yz+log(1 + exp(yz)));
+                       alpha *= 0.5;
         }
  
-       return(f);
+       if (num_linesearch >= max_num_linesearch)
+       {
+               *f = fold;
+               return 0;
+       }
+       else
+               for (i=0;i<w_size;i++)
+                       w[i] += alpha * s[i];
+
+       wTw += alpha * alpha * sTs + 2* alpha * wTs;
+       return alpha;
+}
+
+void l2r_erm_fun::Xv(double *v, double *Xv)
+{
+       int i;
+       int l=prob->l;
+       feature_node **x=prob->x;
+
+       for(i=0;i<l;i++)
+               Xv[i]=sparse_operator::dot(v, x[i]);
+}
+
+void l2r_erm_fun::XTv(double *v, double *XTv)
+{
+       int i;
+       int l=prob->l;
+       int w_size=get_nr_variable();
+       feature_node **x=prob->x;
+
+       for(i=0;i<w_size;i++)
+               XTv[i]=0;
+       for(i=0;i<l;i++)
+               sparse_operator::axpy(v[i], x[i], XTv);
+}
+
+class l2r_lr_fun: public l2r_erm_fun
+{
+public:
+       l2r_lr_fun(const problem *prob, const parameter *param, double *C);
+       ~l2r_lr_fun();
+
+       void grad(double *w, double *g);
+       void Hv(double *s, double *Hs);
+
+       void get_diag_preconditioner(double *M);
+
+private:
+       double *D;
+       double C_times_loss(int i, double wx_i);
+};
+
+l2r_lr_fun::l2r_lr_fun(const problem *prob, const parameter *param, double *C):
+       l2r_erm_fun(prob, param, C)
+{
+       int l=prob->l;
+       D = new double[l];
+}
+
+l2r_lr_fun::~l2r_lr_fun()
+{
+       delete[] D;
+}
+
+double l2r_lr_fun::C_times_loss(int i, double wx_i)
+{
+       double ywx_i = wx_i * prob->y[i];
+       if (ywx_i >= 0)
+               return C[i]*log(1 + exp(-ywx_i));
+       else
+               return C[i]*(-ywx_i + log(1 + exp(ywx_i)));
  }
  
  void l2r_lr_fun::grad(double *w, double *g)
@@ -181,11 +302,11 @@ void l2r_lr_fun::grad(double *w, double *g)
  
         for(i=0;i<l;i++)
         {
-               z[i] = 1/(1 + exp(-y[i]*z[i]));
-               D[i] = z[i]*(1-z[i]);
-               z[i] = C[i]*(z[i]-1)*y[i];
+               tmp[i] = 1/(1 + exp(-y[i]*wx[i]));
+               D[i] = tmp[i]*(1-tmp[i]);
+               tmp[i] = C[i]*(tmp[i]-1)*y[i];
         }
-       XTv(z, g);
+       XTv(tmp, g);
  
         for(i=0;i<w_size;i++)
                 g[i] = w[i] + g[i];
@@ -193,11 +314,6 @@ void l2r_lr_fun::grad(double *w, double *g)
                 g[w_size-1] -= w[w_size-1];
  }
  
-int l2r_lr_fun::get_nr_variable(void)
-{
-       return prob->n;
-}
-
  void l2r_lr_fun::get_diag_preconditioner(double *M)
  {
         int i;
@@ -245,96 +361,45 @@ void l2r_lr_fun::Hv(double *s, double *Hs)
                 Hs[w_size-1] -= s[w_size-1];
  }
  
-void l2r_lr_fun::Xv(double *v, double *Xv)
-{
-       int i;
-       int l=prob->l;
-       feature_node **x=prob->x;
-
-       for(i=0;i<l;i++)
-               Xv[i]=sparse_operator::dot(v, x[i]);
-}
-
-void l2r_lr_fun::XTv(double *v, double *XTv)
-{
-       int i;
-       int l=prob->l;
-       int w_size=get_nr_variable();
-       feature_node **x=prob->x;
-
-       for(i=0;i<w_size;i++)
-               XTv[i]=0;
-       for(i=0;i<l;i++)
-               sparse_operator::axpy(v[i], x[i], XTv);
-}
-
-class l2r_l2_svc_fun: public function
+class l2r_l2_svc_fun: public l2r_erm_fun
  {
  public:
         l2r_l2_svc_fun(const problem *prob, const parameter *param, double *C);
         ~l2r_l2_svc_fun();
  
-       double fun(double *w);
         void grad(double *w, double *g);
         void Hv(double *s, double *Hs);
  
-       int get_nr_variable(void);
         void get_diag_preconditioner(double *M);
  
  protected:
-       void Xv(double *v, double *Xv);
         void subXTv(double *v, double *XTv);
  
-       double *C;
-       double *z;
         int *I;
         int sizeI;
-       const problem *prob;
-       int regularize_bias;
+
+private:
+       double C_times_loss(int i, double wx_i);
  };
  
-l2r_l2_svc_fun::l2r_l2_svc_fun(const problem *prob, const parameter *param, double *C)
+l2r_l2_svc_fun::l2r_l2_svc_fun(const problem *prob, const parameter *param, double *C):
+       l2r_erm_fun(prob, param, C)
  {
-       int l=prob->l;
-
-       this->prob = prob;
-
-       z = new double[l];
-       I = new int[l];
-       this->C = C;
-       this->regularize_bias = param->regularize_bias;
+       I = new int[prob->l];
  }
  
  l2r_l2_svc_fun::~l2r_l2_svc_fun()
  {
-       delete[] z;
         delete[] I;
  }
  
-double l2r_l2_svc_fun::fun(double *w)
+double l2r_l2_svc_fun::C_times_loss(int i, double wx_i)
  {
-       int i;
-       double f=0;
-       double *y=prob->y;
-       int l=prob->l;
-       int w_size=get_nr_variable();
-
-       Xv(w, z);
-
-       for(i=0;i<w_size;i++)
-               f += w[i]*w[i];
-       if(regularize_bias == 0)
-               f -= w[w_size-1]*w[w_size-1];
-       f /= 2.0;
-       for(i=0;i<l;i++)
-       {
-               z[i] = y[i]*z[i];
-               double d = 1-z[i];
-               if (d > 0)
-                       f += C[i]*d*d;
-       }
-
-       return(f);
+       double d = 1 - prob->y[i] * wx_i;
+       if (d > 0)
+               return C[i] * d * d;
+       else
+               return 0;
  }
  
  void l2r_l2_svc_fun::grad(double *w, double *g)
@@ -346,13 +411,16 @@ void l2r_l2_svc_fun::grad(double *w, double *g)
  
         sizeI = 0;
         for (i=0;i<l;i++)
-               if (z[i] < 1)
+       {
+               tmp[i] = wx[i] * y[i];
+               if (tmp[i] < 1)
                 {
-                       z[sizeI] = C[i]*y[i]*(z[i]-1);
+                       tmp[sizeI] = C[i]*y[i]*(tmp[i]-1);
                         I[sizeI] = i;
                         sizeI++;
                 }
-       subXTv(z, g);
+       }
+       subXTv(tmp, g);
  
         for(i=0;i<w_size;i++)
                 g[i] = w[i] + 2*g[i];
@@ -360,11 +428,6 @@ void l2r_l2_svc_fun::grad(double *w, double *g)
                 g[w_size-1] -= w[w_size-1];
  }
  
-int l2r_l2_svc_fun::get_nr_variable(void)
-{
-       return prob->n;
-}
-
  void l2r_l2_svc_fun::get_diag_preconditioner(double *M)
  {
         int i;
@@ -411,16 +474,6 @@ void l2r_l2_svc_fun::Hv(double *s, double *Hs)
                 Hs[w_size-1] -= s[w_size-1];
  }
  
-void l2r_l2_svc_fun::Xv(double *v, double *Xv)
-{
-       int i;
-       int l=prob->l;
-       feature_node **x=prob->x;
-
-       for(i=0;i<l;i++)
-               Xv[i]=sparse_operator::dot(v, x[i]);
-}
-
  void l2r_l2_svc_fun::subXTv(double *v, double *XTv)
  {
         int i;
@@ -438,12 +491,11 @@ class l2r_l2_svr_fun: public l2r_l2_svc_fun
  public:
         l2r_l2_svr_fun(const problem *prob, const parameter *param, double *C);
  
-       double fun(double *w);
         void grad(double *w, double *g);
  
  private:
+       double C_times_loss(int i, double wx_i);
         double p;
-       int regularize_bias;
  };
  
  l2r_l2_svr_fun::l2r_l2_svr_fun(const problem *prob, const parameter *param, double *C):
@@ -453,32 +505,14 @@ l2r_l2_svr_fun::l2r_l2_svr_fun(const problem *prob, const parameter *param, doub
         this->regularize_bias = param->regularize_bias;
  }
  
-double l2r_l2_svr_fun::fun(double *w)
+double l2r_l2_svr_fun::C_times_loss(int i, double wx_i)
  {
-       int i;
-       double f=0;
-       double *y=prob->y;
-       int l=prob->l;
-       int w_size=get_nr_variable();
-       double d;
-
-       Xv(w, z);
-
-       for(i=0;i<w_size;i++)
-               f += w[i]*w[i];
-       if(regularize_bias == 0)
-               f -= w[w_size-1]*w[w_size-1];
-       f /= 2;
-       for(i=0;i<l;i++)
-       {
-               d = z[i] - y[i];
-               if(d < -p)
-                       f += C[i]*(d+p)*(d+p);
-               else if(d > p)
-                       f += C[i]*(d-p)*(d-p);
-       }
-
-       return(f);
+       double d = wx_i - prob->y[i];
+       if(d < -p)
+               return C[i]*(d+p)*(d+p);
+       else if(d > p)
+               return C[i]*(d-p)*(d-p);
+       return 0;
  }
  
  void l2r_l2_svr_fun::grad(double *w, double *g)
@@ -492,24 +526,24 @@ void l2r_l2_svr_fun::grad(double *w, double *g)
         sizeI = 0;
         for(i=0;i<l;i++)
         {
-               d = z[i] - y[i];
+               d = wx[i] - y[i];
  
                 // generate index set I
                 if(d < -p)
                 {
-                       z[sizeI] = C[i]*(d+p);
+                       tmp[sizeI] = C[i]*(d+p);
                         I[sizeI] = i;
                         sizeI++;
                 }
                 else if(d > p)
                 {
-                       z[sizeI] = C[i]*(d-p);
+                       tmp[sizeI] = C[i]*(d-p);
                         I[sizeI] = i;
                         sizeI++;
                 }
  
         }
-       subXTv(z, g);
+       subXTv(tmp, g);
  
         for(i=0;i<w_size;i++)
                 g[i] = w[i] + 2*g[i];
@@ -2582,11 +2616,7 @@ static void group_classes(const problem *prob, int *nr_class_ret, int **label_re
  
  static void train_one(const problem *prob, const parameter *param, double *w, double Cp, double Cn)
  {
-       //inner and outer tolerances for TRON
         double eps = param->eps;
-       double eps_cg = 0.1;
-       if(param->init_sol != NULL)
-               eps_cg = 0.5;
  
         int pos = 0;
         int neg = 0;
@@ -2610,9 +2640,9 @@ static void train_one(const problem *prob, const parameter *param, double *w, do
                                         C[i] = Cn;
                         }
                         fun_obj=new l2r_lr_fun(prob, param, C);
-                       TRON tron_obj(fun_obj, primal_solver_tol, eps_cg);
-                       tron_obj.set_print_string(liblinear_print_string);
-                       tron_obj.tron(w);
+                       NEWTON newton_obj(fun_obj, primal_solver_tol);
+                       newton_obj.set_print_string(liblinear_print_string);
+                       newton_obj.newton(w);
                         delete fun_obj;
                         delete[] C;
                         break;
@@ -2628,9 +2658,9 @@ static void train_one(const problem *prob, const parameter *param, double *w, do
                                         C[i] = Cn;
                         }
                         fun_obj=new l2r_l2_svc_fun(prob, param, C);
-                       TRON tron_obj(fun_obj, primal_solver_tol, eps_cg);
-                       tron_obj.set_print_string(liblinear_print_string);
-                       tron_obj.tron(w);
+                       NEWTON newton_obj(fun_obj, primal_solver_tol);
+                       newton_obj.set_print_string(liblinear_print_string);
+                       newton_obj.newton(w);
                         delete fun_obj;
                         delete[] C;
                         break;
@@ -2673,9 +2703,9 @@ static void train_one(const problem *prob, const parameter *param, double *w, do
                                 C[i] = param->C;
  
                         fun_obj=new l2r_l2_svr_fun(prob, param, C);
-                       TRON tron_obj(fun_obj, param->eps);
-                       tron_obj.set_print_string(liblinear_print_string);
-                       tron_obj.tron(w);
+                       NEWTON newton_obj(fun_obj, param->eps);
+                       newton_obj.set_print_string(liblinear_print_string);
+                       newton_obj.newton(w);
                         delete fun_obj;
                         delete[] C;
                         break;
diff --git a/newton.cpp b/newton.cpp

new file mode 100644 (file)

index 0000000..3c1dd98
--- /dev/null
+++ b/newton.cpp
@@ -0,0 +1,245 @@
+#include <math.h>
+#include <stdio.h>
+#include <string.h>
+#include <stdarg.h>
+#include "newton.h"
+
+#ifndef min
+template <class T> static inline T min(T x,T y) { return (x<y)?x:y; }
+#endif
+
+#ifndef max
+template <class T> static inline T max(T x,T y) { return (x>y)?x:y; }
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+extern double dnrm2_(int *, double *, int *);
+extern double ddot_(int *, double *, int *, double *, int *);
+extern int daxpy_(int *, double *, double *, int *, double *, int *);
+extern int dscal_(int *, double *, double *, int *);
+
+#ifdef __cplusplus
+}
+#endif
+
+static void default_print(const char *buf)
+{
+       fputs(buf,stdout);
+       fflush(stdout);
+}
+
+// On entry *f must be the function value of w
+// On exit w is updated and *f is the new function value
+double function::linesearch_and_update(double *w, double *s, double *f, double *g, double alpha)
+{
+       double gTs = 0;
+       double eta = 0.01;
+       int n = get_nr_variable();
+       int max_num_linesearch = 20;
+       double *w_new = new double[n];
+       double fold = *f;
+
+       for (int i=0;i<n;i++)
+               gTs += s[i] * g[i];
+
+       int num_linesearch = 0;
+       for(num_linesearch=0; num_linesearch < max_num_linesearch; num_linesearch++)
+       {
+               for (int i=0;i<n;i++)
+                       w_new[i] = w[i] + alpha*s[i];
+               *f = fun(w_new);
+               if (*f - fold <= eta * alpha * gTs)
+                       break;
+               else
+                       alpha *= 0.5;
+       }
+
+       if (num_linesearch >= max_num_linesearch)
+       {
+               *f = fold;
+               return 0;
+       }
+       else
+               memcpy(w, w_new, sizeof(double)*n);
+
+       delete [] w_new;
+       return alpha;
+}
+
+void NEWTON::info(const char *fmt,...)
+{
+       char buf[BUFSIZ];
+       va_list ap;
+       va_start(ap,fmt);
+       vsprintf(buf,fmt,ap);
+       va_end(ap);
+       (*newton_print_string)(buf);
+}
+
+NEWTON::NEWTON(const function *fun_obj, double eps, double eps_cg, int max_iter)
+{
+       this->fun_obj=const_cast<function *>(fun_obj);
+       this->eps=eps;
+       this->eps_cg=eps_cg;
+       this->max_iter=max_iter;
+       newton_print_string = default_print;
+}
+
+NEWTON::~NEWTON()
+{
+}
+
+void NEWTON::newton(double *w)
+{
+       int n = fun_obj->get_nr_variable();
+       int i, cg_iter;
+       double step_size;
+       double f, fold, actred;
+       double init_step_size = 1;
+       int search = 1, iter = 1, inc = 1;
+       double *s = new double[n];
+       double *r = new double[n];
+       double *g = new double[n];
+
+       const double alpha_pcg = 0.01;
+       double *M = new double[n];
+
+       // calculate gradient norm at w=0 for stopping condition.
+       double *w0 = new double[n];
+       for (i=0; i<n; i++)
+               w0[i] = 0;
+       fun_obj->fun(w0);
+       fun_obj->grad(w0, g);
+       double gnorm0 = dnrm2_(&n, g, &inc);
+       delete [] w0;
+
+       f = fun_obj->fun(w);
+       info("init f %5.3e\n", f);
+       fun_obj->grad(w, g);
+       double gnorm = dnrm2_(&n, g, &inc);
+
+       if (gnorm <= eps*gnorm0)
+               search = 0;
+
+       double *w_new = new double[n];
+       while (iter <= max_iter && search)
+       {
+               fun_obj->get_diag_preconditioner(M);
+               for(i=0; i<n; i++)
+                       M[i] = (1-alpha_pcg) + alpha_pcg*M[i];
+               cg_iter = pcg(g, M, s, r);
+
+               fold = f;
+               step_size = fun_obj->linesearch_and_update(w, s, & f, g, init_step_size);
+
+               if (step_size == 0)
+               {
+                       info("WARNING: line search fails\n");
+                       break;
+               }
+
+               info("iter %2d f %5.3e |g| %5.3e CG %3d step_size %4.2e \n", iter, f, gnorm, cg_iter, step_size);
+
+               actred = fold - f;
+               iter++;
+
+               fun_obj->grad(w, g);
+
+               gnorm = dnrm2_(&n, g, &inc);
+               if (gnorm <= eps*gnorm0)
+                       break;
+               if (f < -1.0e+32)
+               {
+                       info("WARNING: f < -1.0e+32\n");
+                       break;
+               }
+               if (fabs(actred) <= 1.0e-12*fabs(f))
+               {
+                       info("WARNING: actred too small\n");
+                       break;
+               }
+       }
+
+       delete[] g;
+       delete[] r;
+       delete[] w_new;
+       delete[] s;
+       delete[] M;
+}
+
+int NEWTON::pcg(double *g, double *M, double *s, double *r)
+{
+       int i, inc = 1;
+       int n = fun_obj->get_nr_variable();
+       double one = 1;
+       double *d = new double[n];
+       double *Hd = new double[n];
+       double zTr, znewTrnew, alpha, beta, cgtol;
+       double *z = new double[n];
+       double Q = 0, newQ, Qdiff;
+
+       for (i=0; i<n; i++)
+       {
+               s[i] = 0;
+               r[i] = -g[i];
+               z[i] = r[i] / M[i];
+               d[i] = z[i];
+       }
+
+       zTr = ddot_(&n, z, &inc, r, &inc);
+       double gMinv_norm = sqrt(zTr);
+       cgtol = min(eps_cg, sqrt(gMinv_norm));
+       int cg_iter = 0;
+       int max_cg_iter = max(n, 5);
+
+       while (cg_iter < max_cg_iter)
+       {
+               cg_iter++;
+               fun_obj->Hv(d, Hd);
+
+               alpha = zTr/ddot_(&n, d, &inc, Hd, &inc);
+               daxpy_(&n, &alpha, d, &inc, s, &inc);
+               alpha = -alpha;
+               daxpy_(&n, &alpha, Hd, &inc, r, &inc);
+
+               // Using quadratic approximation as CG stopping criterion
+               newQ = -0.5*(ddot_(&n, s, &inc, r, &inc) - ddot_(&n, s, &inc, g, &inc));
+               Qdiff = newQ - Q;
+               if (newQ <= 0 && Qdiff <= 0)
+               {
+                       if (cg_iter * Qdiff >= cgtol * newQ)
+                               break;
+               }
+               else
+               {
+                       info("WARNING: quadratic approximation > 0 or increasing in CG\n");
+                       break;
+               }
+               Q = newQ;
+
+               for (i=0; i<n; i++)
+                       z[i] = r[i] / M[i];
+               znewTrnew = ddot_(&n, z, &inc, r, &inc);
+               beta = znewTrnew/zTr;
+               dscal_(&n, &beta, d, &inc);
+               daxpy_(&n, &one, z, &inc, d, &inc);
+               zTr = znewTrnew;
+       }
+
+       if (cg_iter == max_cg_iter)
+               info("WARNING: reaching maximal number of CG steps\n");
+
+       delete[] d;
+       delete[] Hd;
+       delete[] z;
+
+       return(cg_iter);
+}
+
+void NEWTON::set_print_string(void (*print_string) (const char *buf))
+{
+       newton_print_string = print_string;
+}
diff --git a/tron.h b/newton.h

similarity index 53%

rename from tron.h

rename to newton.h

index 524fa9c91cf49c5af4d57831e9f6a859fab96d9c..c28bee859ffcf841eb963889847bd01c0cb68534 100644 (file)
--- a/tron.h
+++ b/newton.h
@@ -1,5 +1,5 @@
-#ifndef _TRON_H
-#define _TRON_H
+#ifndef _NEWTON_H
+#define _NEWTON_H
  
  class function
  {
@@ -7,30 +7,31 @@ public:
         virtual double fun(double *w) = 0 ;
         virtual void grad(double *w, double *g) = 0 ;
         virtual void Hv(double *s, double *Hs) = 0 ;
-
         virtual int get_nr_variable(void) = 0 ;
         virtual void get_diag_preconditioner(double *M) = 0 ;
         virtual ~function(void){}
+
+       // base implementation in newton.cpp
+       virtual double linesearch_and_update(double *w, double *s, double *f, double *g, double alpha);
  };
  
-class TRON
+class NEWTON
  {
  public:
-       TRON(const function *fun_obj, double eps = 0.1, double eps_cg = 0.1, int max_iter = 1000);
-       ~TRON();
+       NEWTON(const function *fun_obj, double eps = 0.1, double eps_cg = 0.5, int max_iter = 1000);
+       ~NEWTON();
  
-       void tron(double *w);
+       void newton(double *w);
         void set_print_string(void (*i_print) (const char *buf));
  
  private:
-       int trpcg(double delta, double *g, double *M, double *s, double *r, bool *reach_boundary);
-       double norm_inf(int n, double *x);
+       int pcg(double *g, double *M, double *s, double *r);
  
         double eps;
         double eps_cg;
         int max_iter;
         function *fun_obj;
         void info(const char *fmt,...);
-       void (*tron_print_string)(const char *buf);
+       void (*newton_print_string)(const char *buf);
  };
  #endif
diff --git a/tron.cpp b/tron.cpp

deleted file mode 100644 (file)

index 2e0cb24..0000000
--- a/tron.cpp
+++ /dev/null
@@ -1,288 +0,0 @@
-#include <math.h>
-#include <stdio.h>
-#include <string.h>
-#include <stdarg.h>
-#include "tron.h"
-
-#ifndef min
-template <class T> static inline T min(T x,T y) { return (x<y)?x:y; }
-#endif
-
-#ifndef max
-template <class T> static inline T max(T x,T y) { return (x>y)?x:y; }
-#endif
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-extern double dnrm2_(int *, double *, int *);
-extern double ddot_(int *, double *, int *, double *, int *);
-extern int daxpy_(int *, double *, double *, int *, double *, int *);
-extern int dscal_(int *, double *, double *, int *);
-
-#ifdef __cplusplus
-}
-#endif
-
-static void default_print(const char *buf)
-{
-       fputs(buf,stdout);
-       fflush(stdout);
-}
-
-static double uTMv(int n, double *u, double *M, double *v)
-{
-       const int m = n-4;
-       double res = 0;
-       int i;
-       for (i=0; i<m; i+=5)
-               res += u[i]*M[i]*v[i]+u[i+1]*M[i+1]*v[i+1]+u[i+2]*M[i+2]*v[i+2]+
-                       u[i+3]*M[i+3]*v[i+3]+u[i+4]*M[i+4]*v[i+4];
-       for (; i<n; i++)
-               res += u[i]*M[i]*v[i];
-       return res;
-}
-
-void TRON::info(const char *fmt,...)
-{
-       char buf[BUFSIZ];
-       va_list ap;
-       va_start(ap,fmt);
-       vsprintf(buf,fmt,ap);
-       va_end(ap);
-       (*tron_print_string)(buf);
-}
-
-TRON::TRON(const function *fun_obj, double eps, double eps_cg, int max_iter)
-{
-       this->fun_obj=const_cast<function *>(fun_obj);
-       this->eps=eps;
-       this->eps_cg=eps_cg;
-       this->max_iter=max_iter;
-       tron_print_string = default_print;
-}
-
-TRON::~TRON()
-{
-}
-
-void TRON::tron(double *w)
-{
-       // Parameters for updating the iterates.
-       double eta0 = 1e-4, eta1 = 0.25, eta2 = 0.75;
-
-       // Parameters for updating the trust region size delta.
-       double sigma1 = 0.25, sigma2 = 0.5, sigma3 = 4;
-
-       int n = fun_obj->get_nr_variable();
-       int i, cg_iter;
-       double delta=0, sMnorm, one=1.0;
-       double alpha, f, fnew, prered, actred, gs;
-       int search = 1, iter = 1, inc = 1;
-       double *s = new double[n];
-       double *r = new double[n];
-       double *g = new double[n];
-
-       const double alpha_pcg = 0.01;
-       double *M = new double[n];
-
-       // calculate gradient norm at w=0 for stopping condition.
-       double *w0 = new double[n];
-       for (i=0; i<n; i++)
-               w0[i] = 0;
-       fun_obj->fun(w0);
-       fun_obj->grad(w0, g);
-       double gnorm0 = dnrm2_(&n, g, &inc);
-       delete [] w0;
-
-       f = fun_obj->fun(w);
-       fun_obj->grad(w, g);
-       double gnorm = dnrm2_(&n, g, &inc);
-
-       if (gnorm <= eps*gnorm0)
-               search = 0;
-
-       fun_obj->get_diag_preconditioner(M);
-       for(i=0; i<n; i++)
-               M[i] = (1-alpha_pcg) + alpha_pcg*M[i];
-       delta = sqrt(uTMv(n, g, M, g));
-
-       double *w_new = new double[n];
-       bool reach_boundary;
-       bool delta_adjusted = false;
-       while (iter <= max_iter && search)
-       {
-               cg_iter = trpcg(delta, g, M, s, r, &reach_boundary);
-
-               memcpy(w_new, w, sizeof(double)*n);
-               daxpy_(&n, &one, s, &inc, w_new, &inc);
-
-               gs = ddot_(&n, g, &inc, s, &inc);
-               prered = -0.5*(gs-ddot_(&n, s, &inc, r, &inc));
-               fnew = fun_obj->fun(w_new);
-
-               // Compute the actual reduction.
-               actred = f - fnew;
-
-               // On the first iteration, adjust the initial step bound.
-               sMnorm = sqrt(uTMv(n, s, M, s));
-               if (iter == 1 && !delta_adjusted)
-               {
-                       delta = min(delta, sMnorm);
-                       delta_adjusted = true;
-               }
-
-               // Compute prediction alpha*sMnorm of the step.
-               if (fnew - f - gs <= 0)
-                       alpha = sigma3;
-               else
-                       alpha = max(sigma1, -0.5*(gs/(fnew - f - gs)));
-
-               // Update the trust region bound according to the ratio of actual to predicted reduction.
-               if (actred < eta0*prered)
-                       delta = min(alpha*sMnorm, sigma2*delta);
-               else if (actred < eta1*prered)
-                       delta = max(sigma1*delta, min(alpha*sMnorm, sigma2*delta));
-               else if (actred < eta2*prered)
-                       delta = max(sigma1*delta, min(alpha*sMnorm, sigma3*delta));
-               else
-               {
-                       if (reach_boundary)
-                               delta = sigma3*delta;
-                       else
-                               delta = max(delta, min(alpha*sMnorm, sigma3*delta));
-               }
-
-               info("iter %2d act %5.3e pre %5.3e delta %5.3e f %5.3e |g| %5.3e CG %3d\n", iter, actred, prered, delta, f, gnorm, cg_iter);
-
-               if (actred > eta0*prered)
-               {
-                       iter++;
-                       memcpy(w, w_new, sizeof(double)*n);
-                       f = fnew;
-                       fun_obj->grad(w, g);
-                       fun_obj->get_diag_preconditioner(M);
-                       for(i=0; i<n; i++)
-                               M[i] = (1-alpha_pcg) + alpha_pcg*M[i];
-
-                       gnorm = dnrm2_(&n, g, &inc);
-                       if (gnorm <= eps*gnorm0)
-                               break;
-               }
-               if (f < -1.0e+32)
-               {
-                       info("WARNING: f < -1.0e+32\n");
-                       break;
-               }
-               if (prered <= 0)
-               {
-                       info("WARNING: prered <= 0\n");
-                       break;
-               }
-               if (fabs(actred) <= 1.0e-12*fabs(f) &&
-                   fabs(prered) <= 1.0e-12*fabs(f))
-               {
-                       info("WARNING: actred and prered too small\n");
-                       break;
-               }
-       }
-
-       delete[] g;
-       delete[] r;
-       delete[] w_new;
-       delete[] s;
-       delete[] M;
-}
-
-int TRON::trpcg(double delta, double *g, double *M, double *s, double *r, bool *reach_boundary)
-{
-       int i, inc = 1;
-       int n = fun_obj->get_nr_variable();
-       double one = 1;
-       double *d = new double[n];
-       double *Hd = new double[n];
-       double zTr, znewTrnew, alpha, beta, cgtol;
-       double *z = new double[n];
-
-       *reach_boundary = false;
-       for (i=0; i<n; i++)
-       {
-               s[i] = 0;
-               r[i] = -g[i];
-               z[i] = r[i] / M[i];
-               d[i] = z[i];
-       }
-
-       zTr = ddot_(&n, z, &inc, r, &inc);
-       cgtol = eps_cg*sqrt(zTr);
-       int cg_iter = 0;
-       int max_cg_iter = max(n, 5);
-
-       while (cg_iter < max_cg_iter)
-       {
-               if (sqrt(zTr) <= cgtol)
-                       break;
-               cg_iter++;
-               fun_obj->Hv(d, Hd);
-
-               alpha = zTr/ddot_(&n, d, &inc, Hd, &inc);
-               daxpy_(&n, &alpha, d, &inc, s, &inc);
-
-               double sMnorm = sqrt(uTMv(n, s, M, s));
-               if (sMnorm > delta)
-               {
-                       info("cg reaches trust region boundary\n");
-                       *reach_boundary = true;
-                       alpha = -alpha;
-                       daxpy_(&n, &alpha, d, &inc, s, &inc);
-
-                       double sTMd = uTMv(n, s, M, d);
-                       double sTMs = uTMv(n, s, M, s);
-                       double dTMd = uTMv(n, d, M, d);
-                       double dsq = delta*delta;
-                       double rad = sqrt(sTMd*sTMd + dTMd*(dsq-sTMs));
-                       if (sTMd >= 0)
-                               alpha = (dsq - sTMs)/(sTMd + rad);
-                       else
-                               alpha = (rad - sTMd)/dTMd;
-                       daxpy_(&n, &alpha, d, &inc, s, &inc);
-                       alpha = -alpha;
-                       daxpy_(&n, &alpha, Hd, &inc, r, &inc);
-                       break;
-               }
-               alpha = -alpha;
-               daxpy_(&n, &alpha, Hd, &inc, r, &inc);
-
-               for (i=0; i<n; i++)
-                       z[i] = r[i] / M[i];
-               znewTrnew = ddot_(&n, z, &inc, r, &inc);
-               beta = znewTrnew/zTr;
-               dscal_(&n, &beta, d, &inc);
-               daxpy_(&n, &one, z, &inc, d, &inc);
-               zTr = znewTrnew;
-       }
-
-       if (cg_iter == max_cg_iter)
-               info("WARNING: reaching maximal number of CG steps\n");
-
-       delete[] d;
-       delete[] Hd;
-       delete[] z;
-
-       return(cg_iter);
-}
-
-double TRON::norm_inf(int n, double *x)
-{
-       double dmax = fabs(x[0]);
-       for (int i=1; i<n; i++)
-               if (fabs(x[i]) >= dmax)
-                       dmax = fabs(x[i]);
-       return(dmax);
-}
-
-void TRON::set_print_string(void (*print_string) (const char *buf))
-{
-       tron_print_string = print_string;
-}
author	Wei-Lin Chiang <infwinston@gmail.com>
	Sun, 19 Jul 2020 10:31:37 +0000 (18:31 +0800)
committer	Wei-Lin Chiang <infwinston@gmail.com>
	Wed, 22 Jul 2020 07:21:56 +0000 (15:21 +0800)
Makefile		patch \| blob \| history
linear.cpp		patch \| blob \| history
newton.cpp	[new file with mode: 0644]	patch \| blob
newton.h	[moved from tron.h with 53% similarity]	patch \| blob \| history
tron.cpp	[deleted file]	patch \| blob \| history