From: Wei-Lin Chiang <infwinston@gmail.com>
Date: Sun, 19 Jul 2020 10:31:37 +0000 (+0800)
Subject: Newton solver is changed from trust region to line search
X-Git-Tag: v240~3
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=ec50a26c82cde2ec007b86fe968975375cb845e6;p=liblinear

Newton solver is changed from trust region to line search

Specific changes are as follows.

- file name change:

tron.cpp and tron.h are changed to newton.cpp and newton.h, respectively.

In newton.cpp, the subroutine trpcg is changed to pcg.

Some functions (in the class newton) used for trust region are removed.

- classes of functions in linear.cpp:

A new class of function l2r_erm_fun is added to cover lr and svm.
It assumes the following function form

min_w w^Tw/2 + \sum C_i \xi(w^Tx_i), where \xi() is the loss

Some common utilities in particular the line-search subroutine (see below) are
implemented here.

- line search subroutines:

A special line-search subroutine for l2r_erm_fun is implemented in linear.cpp
so for each step size the function value can be cheaply calculated.

A base implementation of line search is provided in newton.cpp, where it
calculates every function value from scratch.

We require that the line-search subroutine to update w and also maintain
the function value.

- CG stopping criterion

It's changed to check a quadratic approximation instead of the
residual. See the release notes of version 2.40.

- CG stopping tolerance

LIBLINEAR enlarged eps_cg if the initial primal solution is not
null. This was from Chu et al. (2015) to avoid too many CG steps
in warm start for parameter selection. We found that this setting
is no longer needed; see details in version 2.40 release notes.
---

diff --git a/Makefile b/Makefile
index b97c45b..49e8a98 100644
--- a/Makefile
+++ b/Makefile
@@ -8,22 +8,22 @@ OS = $(shell uname)
 
 all: train predict
 
-lib: linear.o tron.o blas/blas.a
+lib: linear.o newton.o blas/blas.a
 	if [ "$(OS)" = "Darwin" ]; then \
 		SHARED_LIB_FLAG="-dynamiclib -Wl,-install_name,liblinear.so.$(SHVER)"; \
 	else \
 		SHARED_LIB_FLAG="-shared -Wl,-soname,liblinear.so.$(SHVER)"; \
 	fi; \
-	$(CXX) $${SHARED_LIB_FLAG} linear.o tron.o blas/blas.a -o liblinear.so.$(SHVER)
+	$(CXX) $${SHARED_LIB_FLAG} linear.o newton.o blas/blas.a -o liblinear.so.$(SHVER)
 
-train: tron.o linear.o train.c blas/blas.a
-	$(CXX) $(CFLAGS) -o train train.c tron.o linear.o $(LIBS)
+train: newton.o linear.o train.c blas/blas.a
+	$(CXX) $(CFLAGS) -o train train.c newton.o linear.o $(LIBS)
 
-predict: tron.o linear.o predict.c blas/blas.a
-	$(CXX) $(CFLAGS) -o predict predict.c tron.o linear.o $(LIBS)
+predict: newton.o linear.o predict.c blas/blas.a
+	$(CXX) $(CFLAGS) -o predict predict.c newton.o linear.o $(LIBS)
 
-tron.o: tron.cpp tron.h
-	$(CXX) $(CFLAGS) -c -o tron.o tron.cpp
+newton.o: newton.cpp newton.h
+	$(CXX) $(CFLAGS) -c -o newton.o newton.cpp
 
 linear.o: linear.cpp linear.h
 	$(CXX) $(CFLAGS) -c -o linear.o linear.cpp
@@ -34,4 +34,4 @@ blas/blas.a: blas/*.c blas/*.h
 clean:
 	make -C blas clean
 	make -C matlab clean
-	rm -f *~ tron.o linear.o train predict liblinear.so.$(SHVER)
+	rm -f *~ newton.o linear.o train predict liblinear.so.$(SHVER)
diff --git a/linear.cpp b/linear.cpp
index 2335406..9ef2b9e 100644
--- a/linear.cpp
+++ b/linear.cpp
@@ -5,7 +5,7 @@
 #include <stdarg.h>
 #include <locale.h>
 #include "linear.h"
-#include "tron.h"
+#include "newton.h"
 int liblinear_version = LIBLINEAR_VERSION;
 typedef signed char schar;
 template <class T> static inline void swap(T& x, T& y) { T t=x; x=y; y=t; }
@@ -102,74 +102,195 @@ public:
 	}
 };
 
-class l2r_lr_fun: public function
+// L2-regularized empirical risk minimization
+// min_w w^Tw/2 + \sum C_i \xi(w^Tx_i), where \xi() is the loss
+
+class l2r_erm_fun: public function
 {
 public:
-	l2r_lr_fun(const problem *prob, const parameter *param, double *C);
-	~l2r_lr_fun();
+	l2r_erm_fun(const problem *prob, const parameter *param, double *C);
+	~l2r_erm_fun();
 
 	double fun(double *w);
-	void grad(double *w, double *g);
-	void Hv(double *s, double *Hs);
-
+	double linesearch_and_update(double *w, double *d, double *f, double *g, double alpha);
 	int get_nr_variable(void);
-	void get_diag_preconditioner(double *M);
 
-private:
+protected:
+	virtual double C_times_loss(int i, double wx_i) = 0;
 	void Xv(double *v, double *Xv);
 	void XTv(double *v, double *XTv);
 
 	double *C;
-	double *z;
-	double *D;
 	const problem *prob;
+	double *wx;
+	double *tmp; // a working array
+	double wTw;
 	int regularize_bias;
 };
 
-l2r_lr_fun::l2r_lr_fun(const problem *prob, const parameter *param, double *C)
+l2r_erm_fun::l2r_erm_fun(const problem *prob, const parameter *param, double *C)
 {
 	int l=prob->l;
 
 	this->prob = prob;
 
-	z = new double[l];
-	D = new double[l];
+	wx = new double[l];
+	tmp = new double[l];
 	this->C = C;
 	this->regularize_bias = param->regularize_bias;
 }
 
-l2r_lr_fun::~l2r_lr_fun()
+l2r_erm_fun::~l2r_erm_fun()
 {
-	delete[] z;
-	delete[] D;
+	delete[] wx;
+	delete[] tmp;
 }
 
-
-double l2r_lr_fun::fun(double *w)
+double l2r_erm_fun::fun(double *w)
 {
 	int i;
 	double f=0;
-	double *y=prob->y;
 	int l=prob->l;
 	int w_size=get_nr_variable();
 
-	Xv(w, z);
+	wTw = 0;
+	Xv(w, wx);
 
 	for(i=0;i<w_size;i++)
-		f += w[i]*w[i];
+		wTw += w[i]*w[i];
 	if(regularize_bias == 0)
-		f -= w[w_size-1]*w[w_size-1];
-	f /= 2.0;
+		wTw -= w[w_size-1]*w[w_size-1];
 	for(i=0;i<l;i++)
+		f += C_times_loss(i, wx[i]);
+	f = f + 0.5 * wTw;
+
+	return(f);
+}
+
+int l2r_erm_fun::get_nr_variable(void)
+{
+	return prob->n;
+}
+
+// On entry *f must be the function value of w
+// On exit w is updated and *f is the new function value
+double l2r_erm_fun::linesearch_and_update(double *w, double *s, double *f, double *g, double alpha)
+{
+	int i;
+	int l = prob->l;
+	double sTs = 0;
+	double wTs = 0;
+	double gTs = 0;
+	double eta = 0.01;
+	int w_size = get_nr_variable();
+	int max_num_linesearch = 20;
+	double fold = *f;
+	Xv(s, tmp);
+
+	for (i=0;i<w_size;i++)
+	{
+		sTs += s[i] * s[i];
+		wTs += s[i] * w[i];
+		gTs += s[i] * g[i];
+	}
+	if(regularize_bias == 0)
+	{
+		// bias not used in calculating (w + \alpha s)^T (w + \alpha s)
+		sTs -= s[w_size-1] * s[w_size-1];
+		wTs -= s[w_size-1] * w[w_size-1];
+	}
+
+	int num_linesearch = 0;
+	for(num_linesearch=0; num_linesearch < max_num_linesearch; num_linesearch++)
 	{
-		double yz = y[i]*z[i];
-		if (yz >= 0)
-			f += C[i]*log(1 + exp(-yz));
+		double loss = 0;
+		for(i=0;i<l;i++)
+		{
+			double inner_product = tmp[i] * alpha + wx[i];
+			loss += C_times_loss(i, inner_product);
+		}
+		*f = loss + (alpha * alpha * sTs + wTw) / 2.0 + alpha * wTs;
+		if (*f - fold <= eta * alpha * gTs)
+		{
+			for (i=0;i<l;i++)
+				wx[i] += alpha * tmp[i];
+			break;
+		}
 		else
-			f += C[i]*(-yz+log(1 + exp(yz)));
+			alpha *= 0.5;
 	}
 
-	return(f);
+	if (num_linesearch >= max_num_linesearch)
+	{
+		*f = fold;
+		return 0;
+	}
+	else
+		for (i=0;i<w_size;i++)
+			w[i] += alpha * s[i];
+
+	wTw += alpha * alpha * sTs + 2* alpha * wTs;
+	return alpha;
+}
+
+void l2r_erm_fun::Xv(double *v, double *Xv)
+{
+	int i;
+	int l=prob->l;
+	feature_node **x=prob->x;
+
+	for(i=0;i<l;i++)
+		Xv[i]=sparse_operator::dot(v, x[i]);
+}
+
+void l2r_erm_fun::XTv(double *v, double *XTv)
+{
+	int i;
+	int l=prob->l;
+	int w_size=get_nr_variable();
+	feature_node **x=prob->x;
+
+	for(i=0;i<w_size;i++)
+		XTv[i]=0;
+	for(i=0;i<l;i++)
+		sparse_operator::axpy(v[i], x[i], XTv);
+}
+
+class l2r_lr_fun: public l2r_erm_fun
+{
+public:
+	l2r_lr_fun(const problem *prob, const parameter *param, double *C);
+	~l2r_lr_fun();
+
+	void grad(double *w, double *g);
+	void Hv(double *s, double *Hs);
+
+	void get_diag_preconditioner(double *M);
+
+private:
+	double *D;
+	double C_times_loss(int i, double wx_i);
+};
+
+l2r_lr_fun::l2r_lr_fun(const problem *prob, const parameter *param, double *C):
+	l2r_erm_fun(prob, param, C)
+{
+	int l=prob->l;
+	D = new double[l];
+}
+
+l2r_lr_fun::~l2r_lr_fun()
+{
+	delete[] D;
+}
+
+double l2r_lr_fun::C_times_loss(int i, double wx_i)
+{
+	double ywx_i = wx_i * prob->y[i];
+	if (ywx_i >= 0)
+		return C[i]*log(1 + exp(-ywx_i));
+	else
+		return C[i]*(-ywx_i + log(1 + exp(ywx_i)));
 }
 
 void l2r_lr_fun::grad(double *w, double *g)
@@ -181,11 +302,11 @@ void l2r_lr_fun::grad(double *w, double *g)
 
 	for(i=0;i<l;i++)
 	{
-		z[i] = 1/(1 + exp(-y[i]*z[i]));
-		D[i] = z[i]*(1-z[i]);
-		z[i] = C[i]*(z[i]-1)*y[i];
+		tmp[i] = 1/(1 + exp(-y[i]*wx[i]));
+		D[i] = tmp[i]*(1-tmp[i]);
+		tmp[i] = C[i]*(tmp[i]-1)*y[i];
 	}
-	XTv(z, g);
+	XTv(tmp, g);
 
 	for(i=0;i<w_size;i++)
 		g[i] = w[i] + g[i];
@@ -193,11 +314,6 @@ void l2r_lr_fun::grad(double *w, double *g)
 		g[w_size-1] -= w[w_size-1];
 }
 
-int l2r_lr_fun::get_nr_variable(void)
-{
-	return prob->n;
-}
-
 void l2r_lr_fun::get_diag_preconditioner(double *M)
 {
 	int i;
@@ -245,96 +361,45 @@ void l2r_lr_fun::Hv(double *s, double *Hs)
 		Hs[w_size-1] -= s[w_size-1];
 }
 
-void l2r_lr_fun::Xv(double *v, double *Xv)
-{
-	int i;
-	int l=prob->l;
-	feature_node **x=prob->x;
-
-	for(i=0;i<l;i++)
-		Xv[i]=sparse_operator::dot(v, x[i]);
-}
-
-void l2r_lr_fun::XTv(double *v, double *XTv)
-{
-	int i;
-	int l=prob->l;
-	int w_size=get_nr_variable();
-	feature_node **x=prob->x;
-
-	for(i=0;i<w_size;i++)
-		XTv[i]=0;
-	for(i=0;i<l;i++)
-		sparse_operator::axpy(v[i], x[i], XTv);
-}
-
-class l2r_l2_svc_fun: public function
+class l2r_l2_svc_fun: public l2r_erm_fun
 {
 public:
 	l2r_l2_svc_fun(const problem *prob, const parameter *param, double *C);
 	~l2r_l2_svc_fun();
 
-	double fun(double *w);
 	void grad(double *w, double *g);
 	void Hv(double *s, double *Hs);
 
-	int get_nr_variable(void);
 	void get_diag_preconditioner(double *M);
 
 protected:
-	void Xv(double *v, double *Xv);
 	void subXTv(double *v, double *XTv);
 
-	double *C;
-	double *z;
 	int *I;
 	int sizeI;
-	const problem *prob;
-	int regularize_bias;
+
+private:
+	double C_times_loss(int i, double wx_i);
 };
 
-l2r_l2_svc_fun::l2r_l2_svc_fun(const problem *prob, const parameter *param, double *C)
+l2r_l2_svc_fun::l2r_l2_svc_fun(const problem *prob, const parameter *param, double *C):
+	l2r_erm_fun(prob, param, C)
 {
-	int l=prob->l;
-
-	this->prob = prob;
-
-	z = new double[l];
-	I = new int[l];
-	this->C = C;
-	this->regularize_bias = param->regularize_bias;
+	I = new int[prob->l];
 }
 
 l2r_l2_svc_fun::~l2r_l2_svc_fun()
 {
-	delete[] z;
 	delete[] I;
 }
 
-double l2r_l2_svc_fun::fun(double *w)
+double l2r_l2_svc_fun::C_times_loss(int i, double wx_i)
 {
-	int i;
-	double f=0;
-	double *y=prob->y;
-	int l=prob->l;
-	int w_size=get_nr_variable();
-
-	Xv(w, z);
-
-	for(i=0;i<w_size;i++)
-		f += w[i]*w[i];
-	if(regularize_bias == 0)
-		f -= w[w_size-1]*w[w_size-1];
-	f /= 2.0;
-	for(i=0;i<l;i++)
-	{
-		z[i] = y[i]*z[i];
-		double d = 1-z[i];
-		if (d > 0)
-			f += C[i]*d*d;
-	}
-
-	return(f);
+	double d = 1 - prob->y[i] * wx_i;
+	if (d > 0)
+		return C[i] * d * d;
+	else
+		return 0;
 }
 
 void l2r_l2_svc_fun::grad(double *w, double *g)
@@ -346,13 +411,16 @@ void l2r_l2_svc_fun::grad(double *w, double *g)
 
 	sizeI = 0;
 	for (i=0;i<l;i++)
-		if (z[i] < 1)
+	{
+		tmp[i] = wx[i] * y[i];
+		if (tmp[i] < 1)
 		{
-			z[sizeI] = C[i]*y[i]*(z[i]-1);
+			tmp[sizeI] = C[i]*y[i]*(tmp[i]-1);
 			I[sizeI] = i;
 			sizeI++;
 		}
-	subXTv(z, g);
+	}
+	subXTv(tmp, g);
 
 	for(i=0;i<w_size;i++)
 		g[i] = w[i] + 2*g[i];
@@ -360,11 +428,6 @@ void l2r_l2_svc_fun::grad(double *w, double *g)
 		g[w_size-1] -= w[w_size-1];
 }
 
-int l2r_l2_svc_fun::get_nr_variable(void)
-{
-	return prob->n;
-}
-
 void l2r_l2_svc_fun::get_diag_preconditioner(double *M)
 {
 	int i;
@@ -411,16 +474,6 @@ void l2r_l2_svc_fun::Hv(double *s, double *Hs)
 		Hs[w_size-1] -= s[w_size-1];
 }
 
-void l2r_l2_svc_fun::Xv(double *v, double *Xv)
-{
-	int i;
-	int l=prob->l;
-	feature_node **x=prob->x;
-
-	for(i=0;i<l;i++)
-		Xv[i]=sparse_operator::dot(v, x[i]);
-}
-
 void l2r_l2_svc_fun::subXTv(double *v, double *XTv)
 {
 	int i;
@@ -438,12 +491,11 @@ class l2r_l2_svr_fun: public l2r_l2_svc_fun
 public:
 	l2r_l2_svr_fun(const problem *prob, const parameter *param, double *C);
 
-	double fun(double *w);
 	void grad(double *w, double *g);
 
 private:
+	double C_times_loss(int i, double wx_i);
 	double p;
-	int regularize_bias;
 };
 
 l2r_l2_svr_fun::l2r_l2_svr_fun(const problem *prob, const parameter *param, double *C):
@@ -453,32 +505,14 @@ l2r_l2_svr_fun::l2r_l2_svr_fun(const problem *prob, const parameter *param, doub
 	this->regularize_bias = param->regularize_bias;
 }
 
-double l2r_l2_svr_fun::fun(double *w)
+double l2r_l2_svr_fun::C_times_loss(int i, double wx_i)
 {
-	int i;
-	double f=0;
-	double *y=prob->y;
-	int l=prob->l;
-	int w_size=get_nr_variable();
-	double d;
-
-	Xv(w, z);
-
-	for(i=0;i<w_size;i++)
-		f += w[i]*w[i];
-	if(regularize_bias == 0)
-		f -= w[w_size-1]*w[w_size-1];
-	f /= 2;
-	for(i=0;i<l;i++)
-	{
-		d = z[i] - y[i];
-		if(d < -p)
-			f += C[i]*(d+p)*(d+p);
-		else if(d > p)
-			f += C[i]*(d-p)*(d-p);
-	}
-
-	return(f);
+	double d = wx_i - prob->y[i];
+	if(d < -p)
+		return C[i]*(d+p)*(d+p);
+	else if(d > p)
+		return C[i]*(d-p)*(d-p);
+	return 0;
 }
 
 void l2r_l2_svr_fun::grad(double *w, double *g)
@@ -492,24 +526,24 @@ void l2r_l2_svr_fun::grad(double *w, double *g)
 	sizeI = 0;
 	for(i=0;i<l;i++)
 	{
-		d = z[i] - y[i];
+		d = wx[i] - y[i];
 
 		// generate index set I
 		if(d < -p)
 		{
-			z[sizeI] = C[i]*(d+p);
+			tmp[sizeI] = C[i]*(d+p);
 			I[sizeI] = i;
 			sizeI++;
 		}
 		else if(d > p)
 		{
-			z[sizeI] = C[i]*(d-p);
+			tmp[sizeI] = C[i]*(d-p);
 			I[sizeI] = i;
 			sizeI++;
 		}
 
 	}
-	subXTv(z, g);
+	subXTv(tmp, g);
 
 	for(i=0;i<w_size;i++)
 		g[i] = w[i] + 2*g[i];
@@ -2582,11 +2616,7 @@ static void group_classes(const problem *prob, int *nr_class_ret, int **label_re
 
 static void train_one(const problem *prob, const parameter *param, double *w, double Cp, double Cn)
 {
-	//inner and outer tolerances for TRON
 	double eps = param->eps;
-	double eps_cg = 0.1;
-	if(param->init_sol != NULL)
-		eps_cg = 0.5;
 
 	int pos = 0;
 	int neg = 0;
@@ -2610,9 +2640,9 @@ static void train_one(const problem *prob, const parameter *param, double *w, do
 					C[i] = Cn;
 			}
 			fun_obj=new l2r_lr_fun(prob, param, C);
-			TRON tron_obj(fun_obj, primal_solver_tol, eps_cg);
-			tron_obj.set_print_string(liblinear_print_string);
-			tron_obj.tron(w);
+			NEWTON newton_obj(fun_obj, primal_solver_tol);
+			newton_obj.set_print_string(liblinear_print_string);
+			newton_obj.newton(w);
 			delete fun_obj;
 			delete[] C;
 			break;
@@ -2628,9 +2658,9 @@ static void train_one(const problem *prob, const parameter *param, double *w, do
 					C[i] = Cn;
 			}
 			fun_obj=new l2r_l2_svc_fun(prob, param, C);
-			TRON tron_obj(fun_obj, primal_solver_tol, eps_cg);
-			tron_obj.set_print_string(liblinear_print_string);
-			tron_obj.tron(w);
+			NEWTON newton_obj(fun_obj, primal_solver_tol);
+			newton_obj.set_print_string(liblinear_print_string);
+			newton_obj.newton(w);
 			delete fun_obj;
 			delete[] C;
 			break;
@@ -2673,9 +2703,9 @@ static void train_one(const problem *prob, const parameter *param, double *w, do
 				C[i] = param->C;
 
 			fun_obj=new l2r_l2_svr_fun(prob, param, C);
-			TRON tron_obj(fun_obj, param->eps);
-			tron_obj.set_print_string(liblinear_print_string);
-			tron_obj.tron(w);
+			NEWTON newton_obj(fun_obj, param->eps);
+			newton_obj.set_print_string(liblinear_print_string);
+			newton_obj.newton(w);
 			delete fun_obj;
 			delete[] C;
 			break;
diff --git a/newton.cpp b/newton.cpp
new file mode 100644
index 0000000..3c1dd98
--- /dev/null
+++ b/newton.cpp
@@ -0,0 +1,245 @@
+#include <math.h>
+#include <stdio.h>
+#include <string.h>
+#include <stdarg.h>
+#include "newton.h"
+
+#ifndef min
+template <class T> static inline T min(T x,T y) { return (x<y)?x:y; }
+#endif
+
+#ifndef max
+template <class T> static inline T max(T x,T y) { return (x>y)?x:y; }
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+extern double dnrm2_(int *, double *, int *);
+extern double ddot_(int *, double *, int *, double *, int *);
+extern int daxpy_(int *, double *, double *, int *, double *, int *);
+extern int dscal_(int *, double *, double *, int *);
+
+#ifdef __cplusplus
+}
+#endif
+
+static void default_print(const char *buf)
+{
+	fputs(buf,stdout);
+	fflush(stdout);
+}
+
+// On entry *f must be the function value of w
+// On exit w is updated and *f is the new function value
+double function::linesearch_and_update(double *w, double *s, double *f, double *g, double alpha)
+{
+	double gTs = 0;
+	double eta = 0.01;
+	int n = get_nr_variable();
+	int max_num_linesearch = 20;
+	double *w_new = new double[n];
+	double fold = *f;
+
+	for (int i=0;i<n;i++)
+		gTs += s[i] * g[i];
+
+	int num_linesearch = 0;
+	for(num_linesearch=0; num_linesearch < max_num_linesearch; num_linesearch++)
+	{
+		for (int i=0;i<n;i++)
+			w_new[i] = w[i] + alpha*s[i];
+		*f = fun(w_new);
+		if (*f - fold <= eta * alpha * gTs)
+			break;
+		else
+			alpha *= 0.5;
+	}
+
+	if (num_linesearch >= max_num_linesearch)
+	{
+		*f = fold;
+		return 0;
+	}
+	else
+		memcpy(w, w_new, sizeof(double)*n);
+
+	delete [] w_new;
+	return alpha;
+}
+
+void NEWTON::info(const char *fmt,...)
+{
+	char buf[BUFSIZ];
+	va_list ap;
+	va_start(ap,fmt);
+	vsprintf(buf,fmt,ap);
+	va_end(ap);
+	(*newton_print_string)(buf);
+}
+
+NEWTON::NEWTON(const function *fun_obj, double eps, double eps_cg, int max_iter)
+{
+	this->fun_obj=const_cast<function *>(fun_obj);
+	this->eps=eps;
+	this->eps_cg=eps_cg;
+	this->max_iter=max_iter;
+	newton_print_string = default_print;
+}
+
+NEWTON::~NEWTON()
+{
+}
+
+void NEWTON::newton(double *w)
+{
+	int n = fun_obj->get_nr_variable();
+	int i, cg_iter;
+	double step_size;
+	double f, fold, actred;
+	double init_step_size = 1;
+	int search = 1, iter = 1, inc = 1;
+	double *s = new double[n];
+	double *r = new double[n];
+	double *g = new double[n];
+
+	const double alpha_pcg = 0.01;
+	double *M = new double[n];
+
+	// calculate gradient norm at w=0 for stopping condition.
+	double *w0 = new double[n];
+	for (i=0; i<n; i++)
+		w0[i] = 0;
+	fun_obj->fun(w0);
+	fun_obj->grad(w0, g);
+	double gnorm0 = dnrm2_(&n, g, &inc);
+	delete [] w0;
+
+	f = fun_obj->fun(w);
+	info("init f %5.3e\n", f);
+	fun_obj->grad(w, g);
+	double gnorm = dnrm2_(&n, g, &inc);
+
+	if (gnorm <= eps*gnorm0)
+		search = 0;
+
+	double *w_new = new double[n];
+	while (iter <= max_iter && search)
+	{
+		fun_obj->get_diag_preconditioner(M);
+		for(i=0; i<n; i++)
+			M[i] = (1-alpha_pcg) + alpha_pcg*M[i];
+		cg_iter = pcg(g, M, s, r);
+
+		fold = f;
+		step_size = fun_obj->linesearch_and_update(w, s, & f, g, init_step_size);
+
+		if (step_size == 0)
+		{
+			info("WARNING: line search fails\n");
+			break;
+		}
+
+		info("iter %2d f %5.3e |g| %5.3e CG %3d step_size %4.2e \n", iter, f, gnorm, cg_iter, step_size);
+
+		actred = fold - f;
+		iter++;
+
+		fun_obj->grad(w, g);
+
+		gnorm = dnrm2_(&n, g, &inc);
+		if (gnorm <= eps*gnorm0)
+			break;
+		if (f < -1.0e+32)
+		{
+			info("WARNING: f < -1.0e+32\n");
+			break;
+		}
+		if (fabs(actred) <= 1.0e-12*fabs(f))
+		{
+			info("WARNING: actred too small\n");
+			break;
+		}
+	}
+
+	delete[] g;
+	delete[] r;
+	delete[] w_new;
+	delete[] s;
+	delete[] M;
+}
+
+int NEWTON::pcg(double *g, double *M, double *s, double *r)
+{
+	int i, inc = 1;
+	int n = fun_obj->get_nr_variable();
+	double one = 1;
+	double *d = new double[n];
+	double *Hd = new double[n];
+	double zTr, znewTrnew, alpha, beta, cgtol;
+	double *z = new double[n];
+	double Q = 0, newQ, Qdiff;
+
+	for (i=0; i<n; i++)
+	{
+		s[i] = 0;
+		r[i] = -g[i];
+		z[i] = r[i] / M[i];
+		d[i] = z[i];
+	}
+
+	zTr = ddot_(&n, z, &inc, r, &inc);
+	double gMinv_norm = sqrt(zTr);
+	cgtol = min(eps_cg, sqrt(gMinv_norm));
+	int cg_iter = 0;
+	int max_cg_iter = max(n, 5);
+
+	while (cg_iter < max_cg_iter)
+	{
+		cg_iter++;
+		fun_obj->Hv(d, Hd);
+
+		alpha = zTr/ddot_(&n, d, &inc, Hd, &inc);
+		daxpy_(&n, &alpha, d, &inc, s, &inc);
+		alpha = -alpha;
+		daxpy_(&n, &alpha, Hd, &inc, r, &inc);
+
+		// Using quadratic approximation as CG stopping criterion
+		newQ = -0.5*(ddot_(&n, s, &inc, r, &inc) - ddot_(&n, s, &inc, g, &inc));
+		Qdiff = newQ - Q;
+		if (newQ <= 0 && Qdiff <= 0)
+		{
+			if (cg_iter * Qdiff >= cgtol * newQ)
+				break;
+		}
+		else
+		{
+			info("WARNING: quadratic approximation > 0 or increasing in CG\n");
+			break;
+		}
+		Q = newQ;
+
+		for (i=0; i<n; i++)
+			z[i] = r[i] / M[i];
+		znewTrnew = ddot_(&n, z, &inc, r, &inc);
+		beta = znewTrnew/zTr;
+		dscal_(&n, &beta, d, &inc);
+		daxpy_(&n, &one, z, &inc, d, &inc);
+		zTr = znewTrnew;
+	}
+
+	if (cg_iter == max_cg_iter)
+		info("WARNING: reaching maximal number of CG steps\n");
+
+	delete[] d;
+	delete[] Hd;
+	delete[] z;
+
+	return(cg_iter);
+}
+
+void NEWTON::set_print_string(void (*print_string) (const char *buf))
+{
+	newton_print_string = print_string;
+}
diff --git a/tron.h b/newton.h
similarity index 53%
rename from tron.h
rename to newton.h
index 524fa9c..c28bee8 100644
--- a/tron.h
+++ b/newton.h
@@ -1,5 +1,5 @@
-#ifndef _TRON_H
-#define _TRON_H
+#ifndef _NEWTON_H
+#define _NEWTON_H
 
 class function
 {
@@ -7,30 +7,31 @@ public:
 	virtual double fun(double *w) = 0 ;
 	virtual void grad(double *w, double *g) = 0 ;
 	virtual void Hv(double *s, double *Hs) = 0 ;
-
 	virtual int get_nr_variable(void) = 0 ;
 	virtual void get_diag_preconditioner(double *M) = 0 ;
 	virtual ~function(void){}
+
+	// base implementation in newton.cpp
+	virtual double linesearch_and_update(double *w, double *s, double *f, double *g, double alpha);
 };
 
-class TRON
+class NEWTON
 {
 public:
-	TRON(const function *fun_obj, double eps = 0.1, double eps_cg = 0.1, int max_iter = 1000);
-	~TRON();
+	NEWTON(const function *fun_obj, double eps = 0.1, double eps_cg = 0.5, int max_iter = 1000);
+	~NEWTON();
 
-	void tron(double *w);
+	void newton(double *w);
 	void set_print_string(void (*i_print) (const char *buf));
 
 private:
-	int trpcg(double delta, double *g, double *M, double *s, double *r, bool *reach_boundary);
-	double norm_inf(int n, double *x);
+	int pcg(double *g, double *M, double *s, double *r);
 
 	double eps;
 	double eps_cg;
 	int max_iter;
 	function *fun_obj;
 	void info(const char *fmt,...);
-	void (*tron_print_string)(const char *buf);
+	void (*newton_print_string)(const char *buf);
 };
 #endif
diff --git a/tron.cpp b/tron.cpp
deleted file mode 100644
index 2e0cb24..0000000
--- a/tron.cpp
+++ /dev/null
@@ -1,288 +0,0 @@
-#include <math.h>
-#include <stdio.h>
-#include <string.h>
-#include <stdarg.h>
-#include "tron.h"
-
-#ifndef min
-template <class T> static inline T min(T x,T y) { return (x<y)?x:y; }
-#endif
-
-#ifndef max
-template <class T> static inline T max(T x,T y) { return (x>y)?x:y; }
-#endif
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-extern double dnrm2_(int *, double *, int *);
-extern double ddot_(int *, double *, int *, double *, int *);
-extern int daxpy_(int *, double *, double *, int *, double *, int *);
-extern int dscal_(int *, double *, double *, int *);
-
-#ifdef __cplusplus
-}
-#endif
-
-static void default_print(const char *buf)
-{
-	fputs(buf,stdout);
-	fflush(stdout);
-}
-
-static double uTMv(int n, double *u, double *M, double *v)
-{
-	const int m = n-4;
-	double res = 0;
-	int i;
-	for (i=0; i<m; i+=5)
-		res += u[i]*M[i]*v[i]+u[i+1]*M[i+1]*v[i+1]+u[i+2]*M[i+2]*v[i+2]+
-			u[i+3]*M[i+3]*v[i+3]+u[i+4]*M[i+4]*v[i+4];
-	for (; i<n; i++)
-		res += u[i]*M[i]*v[i];
-	return res;
-}
-
-void TRON::info(const char *fmt,...)
-{
-	char buf[BUFSIZ];
-	va_list ap;
-	va_start(ap,fmt);
-	vsprintf(buf,fmt,ap);
-	va_end(ap);
-	(*tron_print_string)(buf);
-}
-
-TRON::TRON(const function *fun_obj, double eps, double eps_cg, int max_iter)
-{
-	this->fun_obj=const_cast<function *>(fun_obj);
-	this->eps=eps;
-	this->eps_cg=eps_cg;
-	this->max_iter=max_iter;
-	tron_print_string = default_print;
-}
-
-TRON::~TRON()
-{
-}
-
-void TRON::tron(double *w)
-{
-	// Parameters for updating the iterates.
-	double eta0 = 1e-4, eta1 = 0.25, eta2 = 0.75;
-
-	// Parameters for updating the trust region size delta.
-	double sigma1 = 0.25, sigma2 = 0.5, sigma3 = 4;
-
-	int n = fun_obj->get_nr_variable();
-	int i, cg_iter;
-	double delta=0, sMnorm, one=1.0;
-	double alpha, f, fnew, prered, actred, gs;
-	int search = 1, iter = 1, inc = 1;
-	double *s = new double[n];
-	double *r = new double[n];
-	double *g = new double[n];
-
-	const double alpha_pcg = 0.01;
-	double *M = new double[n];
-
-	// calculate gradient norm at w=0 for stopping condition.
-	double *w0 = new double[n];
-	for (i=0; i<n; i++)
-		w0[i] = 0;
-	fun_obj->fun(w0);
-	fun_obj->grad(w0, g);
-	double gnorm0 = dnrm2_(&n, g, &inc);
-	delete [] w0;
-
-	f = fun_obj->fun(w);
-	fun_obj->grad(w, g);
-	double gnorm = dnrm2_(&n, g, &inc);
-
-	if (gnorm <= eps*gnorm0)
-		search = 0;
-
-	fun_obj->get_diag_preconditioner(M);
-	for(i=0; i<n; i++)
-		M[i] = (1-alpha_pcg) + alpha_pcg*M[i];
-	delta = sqrt(uTMv(n, g, M, g));
-
-	double *w_new = new double[n];
-	bool reach_boundary;
-	bool delta_adjusted = false;
-	while (iter <= max_iter && search)
-	{
-		cg_iter = trpcg(delta, g, M, s, r, &reach_boundary);
-
-		memcpy(w_new, w, sizeof(double)*n);
-		daxpy_(&n, &one, s, &inc, w_new, &inc);
-
-		gs = ddot_(&n, g, &inc, s, &inc);
-		prered = -0.5*(gs-ddot_(&n, s, &inc, r, &inc));
-		fnew = fun_obj->fun(w_new);
-
-		// Compute the actual reduction.
-		actred = f - fnew;
-
-		// On the first iteration, adjust the initial step bound.
-		sMnorm = sqrt(uTMv(n, s, M, s));
-		if (iter == 1 && !delta_adjusted)
-		{
-			delta = min(delta, sMnorm);
-			delta_adjusted = true;
-		}
-
-		// Compute prediction alpha*sMnorm of the step.
-		if (fnew - f - gs <= 0)
-			alpha = sigma3;
-		else
-			alpha = max(sigma1, -0.5*(gs/(fnew - f - gs)));
-
-		// Update the trust region bound according to the ratio of actual to predicted reduction.
-		if (actred < eta0*prered)
-			delta = min(alpha*sMnorm, sigma2*delta);
-		else if (actred < eta1*prered)
-			delta = max(sigma1*delta, min(alpha*sMnorm, sigma2*delta));
-		else if (actred < eta2*prered)
-			delta = max(sigma1*delta, min(alpha*sMnorm, sigma3*delta));
-		else
-		{
-			if (reach_boundary)
-				delta = sigma3*delta;
-			else
-				delta = max(delta, min(alpha*sMnorm, sigma3*delta));
-		}
-
-		info("iter %2d act %5.3e pre %5.3e delta %5.3e f %5.3e |g| %5.3e CG %3d\n", iter, actred, prered, delta, f, gnorm, cg_iter);
-
-		if (actred > eta0*prered)
-		{
-			iter++;
-			memcpy(w, w_new, sizeof(double)*n);
-			f = fnew;
-			fun_obj->grad(w, g);
-			fun_obj->get_diag_preconditioner(M);
-			for(i=0; i<n; i++)
-				M[i] = (1-alpha_pcg) + alpha_pcg*M[i];
-
-			gnorm = dnrm2_(&n, g, &inc);
-			if (gnorm <= eps*gnorm0)
-				break;
-		}
-		if (f < -1.0e+32)
-		{
-			info("WARNING: f < -1.0e+32\n");
-			break;
-		}
-		if (prered <= 0)
-		{
-			info("WARNING: prered <= 0\n");
-			break;
-		}
-		if (fabs(actred) <= 1.0e-12*fabs(f) &&
-		    fabs(prered) <= 1.0e-12*fabs(f))
-		{
-			info("WARNING: actred and prered too small\n");
-			break;
-		}
-	}
-
-	delete[] g;
-	delete[] r;
-	delete[] w_new;
-	delete[] s;
-	delete[] M;
-}
-
-int TRON::trpcg(double delta, double *g, double *M, double *s, double *r, bool *reach_boundary)
-{
-	int i, inc = 1;
-	int n = fun_obj->get_nr_variable();
-	double one = 1;
-	double *d = new double[n];
-	double *Hd = new double[n];
-	double zTr, znewTrnew, alpha, beta, cgtol;
-	double *z = new double[n];
-
-	*reach_boundary = false;
-	for (i=0; i<n; i++)
-	{
-		s[i] = 0;
-		r[i] = -g[i];
-		z[i] = r[i] / M[i];
-		d[i] = z[i];
-	}
-
-	zTr = ddot_(&n, z, &inc, r, &inc);
-	cgtol = eps_cg*sqrt(zTr);
-	int cg_iter = 0;
-	int max_cg_iter = max(n, 5);
-
-	while (cg_iter < max_cg_iter)
-	{
-		if (sqrt(zTr) <= cgtol)
-			break;
-		cg_iter++;
-		fun_obj->Hv(d, Hd);
-
-		alpha = zTr/ddot_(&n, d, &inc, Hd, &inc);
-		daxpy_(&n, &alpha, d, &inc, s, &inc);
-
-		double sMnorm = sqrt(uTMv(n, s, M, s));
-		if (sMnorm > delta)
-		{
-			info("cg reaches trust region boundary\n");
-			*reach_boundary = true;
-			alpha = -alpha;
-			daxpy_(&n, &alpha, d, &inc, s, &inc);
-
-			double sTMd = uTMv(n, s, M, d);
-			double sTMs = uTMv(n, s, M, s);
-			double dTMd = uTMv(n, d, M, d);
-			double dsq = delta*delta;
-			double rad = sqrt(sTMd*sTMd + dTMd*(dsq-sTMs));
-			if (sTMd >= 0)
-				alpha = (dsq - sTMs)/(sTMd + rad);
-			else
-				alpha = (rad - sTMd)/dTMd;
-			daxpy_(&n, &alpha, d, &inc, s, &inc);
-			alpha = -alpha;
-			daxpy_(&n, &alpha, Hd, &inc, r, &inc);
-			break;
-		}
-		alpha = -alpha;
-		daxpy_(&n, &alpha, Hd, &inc, r, &inc);
-
-		for (i=0; i<n; i++)
-			z[i] = r[i] / M[i];
-		znewTrnew = ddot_(&n, z, &inc, r, &inc);
-		beta = znewTrnew/zTr;
-		dscal_(&n, &beta, d, &inc);
-		daxpy_(&n, &one, z, &inc, d, &inc);
-		zTr = znewTrnew;
-	}
-
-	if (cg_iter == max_cg_iter)
-		info("WARNING: reaching maximal number of CG steps\n");
-
-	delete[] d;
-	delete[] Hd;
-	delete[] z;
-
-	return(cg_iter);
-}
-
-double TRON::norm_inf(int n, double *x)
-{
-	double dmax = fabs(x[0]);
-	for (int i=1; i<n; i++)
-		if (fabs(x[i]) >= dmax)
-			dmax = fabs(x[i]);
-	return(dmax);
-}
-
-void TRON::set_print_string(void (*print_string) (const char *buf))
-{
-	tron_print_string = print_string;
-}