From ab84f1062fb46cb7c3759a5266d66f1a47a77a20 Mon Sep 17 00:00:00 2001
From: eaudex <eaudex@16e7d947-dcc2-db11-b54a-0017319806e7>
Date: Mon, 7 Mar 2011 08:21:31 +0000
Subject: [PATCH] In solve_l1r_lr(.), replace CDN with newGLMNET.

In solve_l1r_l2_svc(.),
change the stopping condition from inf-norm to 1-norm.
---
 linear.cpp | 355 +++++++++++++++++++++++++++++++++--------------------
 train.c    |   2 +-
 2 files changed, 222 insertions(+), 135 deletions(-)

diff --git a/linear.cpp b/linear.cpp
index a97b6bc..ded6345 100644
--- a/linear.cpp
+++ b/linear.cpp
@@ -1086,8 +1086,8 @@ static void solve_l1r_l2_svc(
 	double sigma = 0.01;
 	double d, G_loss, G, H;
 	double Gmax_old = INF;
-	double Gmax_new;
-	double Gmax_init;
+	double Gmax_new, Gnorm1_new;
+	double Gnorm1_init;
 	double d_old, d_diff;
 	double loss_old, loss_new;
 	double appxcond, cond;
@@ -1126,7 +1126,8 @@ static void solve_l1r_l2_svc(
 
 	while(iter < max_iter)
 	{
-		Gmax_new  = 0;
+		Gmax_new = 0;
+		Gnorm1_new = 0;
 
 		for(j=0; j<active_size; j++)
 		{
@@ -1182,6 +1183,7 @@ static void solve_l1r_l2_svc(
 				violation = fabs(Gn);
 
 			Gmax_new = max(Gmax_new, violation);
+			Gnorm1_new += violation;
 
 			// obtain Newton direction d
 			if(Gp <= H*w[j])
@@ -1280,12 +1282,12 @@ static void solve_l1r_l2_svc(
 		}
 
 		if(iter == 0)
-			Gmax_init = Gmax_new;
+			Gnorm1_init = Gnorm1_new;
 		iter++;
 		if(iter % 10 == 0)
 			info(".");
 
-		if(Gmax_new <= eps*Gmax_init)
+		if(Gnorm1_new <= eps*Gnorm1_init)
 		{
 			if(active_size == w_size)
 				break;
@@ -1347,7 +1349,7 @@ static void solve_l1r_l2_svc(
 //
 // solution will be put in w
 //
-// See Yuan et al. (2010) and appendix of LIBLINEAR paper, Fan et al. (2008)
+// See Yuan et al. (2011) and appendix of LIBLINEAR paper, Fan et al. (2008)
 
 #undef GETI
 #define GETI(i) (y[i]+1)
@@ -1359,100 +1361,93 @@ static void solve_l1r_lr(
 {
 	int l = prob_col->l;
 	int w_size = prob_col->n;
-	int j, s, iter = 0;
+	int j, s, newton_iter=0, iter=0;
+	int max_newton_iter = 100;
 	int max_iter = 1000;
-	int active_size = w_size;
 	int max_num_linesearch = 20;
+	int active_size;
+	int QP_active_size;
 
-	double x_min = 0;
+	double nu = 1e-12;
+	double inner_eps = 1;
 	double sigma = 0.01;
-	double d, G, H;
+	double w_norm=0, w_norm_new;
+	double z, G, H;
+	double Gnorm1_init;
 	double Gmax_old = INF;
-	double Gmax_new;
-	double Gmax_init;
-	double sum1, appxcond1;
-	double sum2, appxcond2;
-	double cond;
+	double Gmax_new, Gnorm1_new;
+	double QP_Gmax_old = INF;
+	double QP_Gmax_new, QP_Gnorm1_new;
+	double delta, negsum_xTd, cond;
 
 	int *index = new int[w_size];
 	schar *y = new schar[l];
+	double *Hdiag = new double[w_size];
+	double *Grad = new double[w_size];
+	double *wpd = new double[w_size];
+	double *xjneg_sum = new double[w_size];
+	double *xTd = new double[l];
 	double *exp_wTx = new double[l];
 	double *exp_wTx_new = new double[l];
-	double *xj_max = new double[w_size];
-	double *C_sum = new double[w_size];
-	double *xjneg_sum = new double[w_size];
-	double *xjpos_sum = new double[w_size];
+	double *tau = new double[l];
+	double *D = new double[l];
 	feature_node *x;
 
 	double C[3] = {Cn,0,Cp};
 
 	for(j=0; j<l; j++)
 	{
-		exp_wTx[j] = 1;
 		if(prob_col->y[j] > 0)
 			y[j] = 1;
 		else
 			y[j] = -1;
+
+		// assume initial w is 0
+		exp_wTx[j] = 1;
+		tau[j] = C[GETI(j)]*0.5;
+		D[j] = C[GETI(j)]*0.25;
 	}
 	for(j=0; j<w_size; j++)
 	{
 		w[j] = 0;
+		wpd[j] = w[j];
 		index[j] = j;
-		xj_max[j] = 0;
-		C_sum[j] = 0;
 		xjneg_sum[j] = 0;
-		xjpos_sum[j] = 0;
 		x = prob_col->x[j];
 		while(x->index != -1)
 		{
 			int ind = x->index-1;
-			double val = x->value;
-			x_min = min(x_min, val);
-			xj_max[j] = max(xj_max[j], val);
-			C_sum[j] += C[GETI(ind)];
 			if(y[ind] == -1)
-				xjneg_sum[j] += C[GETI(ind)]*val;
-			else
-				xjpos_sum[j] += C[GETI(ind)]*val;
+				xjneg_sum[j] += C[GETI(ind)]*x->value;
 			x++;
 		}
 	}
 
-	while(iter < max_iter)
+	while(newton_iter < max_newton_iter)
 	{
 		Gmax_new = 0;
-
-		for(j=0; j<active_size; j++)
-		{
-			int i = j+rand()%(active_size-j);
-			swap(index[i], index[j]);
-		}
+		Gnorm1_new = 0;
+		active_size = w_size;
 
 		for(s=0; s<active_size; s++)
 		{
 			j = index[s];
-			sum1 = 0;
-			sum2 = 0;
-			H = 0;
+			Hdiag[j] = nu;
+			Grad[j] = 0;
 
+			double tmp = 0;
 			x = prob_col->x[j];
 			while(x->index != -1)
 			{
 				int ind = x->index-1;
-				double exp_wTxind = exp_wTx[ind];
-				double tmp1 = x->value/(1+exp_wTxind);
-				double tmp2 = C[GETI(ind)]*tmp1;
-				double tmp3 = tmp2*exp_wTxind;
-				sum2 += tmp2;
-				sum1 += tmp3;
-				H += tmp1*tmp3;
+				Hdiag[j] += x->value*x->value*D[ind];
+				tmp += x->value*tau[ind];
 				x++;
 			}
+			Grad[j] = -tmp + xjneg_sum[j];
 
-			G = -sum2 + xjneg_sum[j];
-
-			double Gp = G+1;
-			double Gn = G-1;
+			double Gp = Grad[j]+1;
+			double Gn = Grad[j]-1;
 			double violation = 0;
 			if(w[j] == 0)
 			{
@@ -1460,6 +1455,7 @@ static void solve_l1r_lr(
 					violation = -Gp;
 				else if(Gn > 0)
 					violation = Gn;
+				//outer-level shrinking
 				else if(Gp>Gmax_old/l && Gn<-Gmax_old/l)
 				{
 					active_size--;
@@ -1474,126 +1470,214 @@ static void solve_l1r_lr(
 				violation = fabs(Gn);
 
 			Gmax_new = max(Gmax_new, violation);
+			Gnorm1_new += violation;
+		}
 
-			// obtain Newton direction d
-			if(Gp <= H*w[j])
-				d = -Gp/H;
-			else if(Gn >= H*w[j])
-				d = -Gn/H;
-			else
-				d = -w[j];
+		if(newton_iter == 0)
+			Gnorm1_init = Gnorm1_new;
 
-			if(fabs(d) < 1.0e-12)
-				continue;
+		if(Gnorm1_new <= eps*Gnorm1_init)
+			break;
 
-			d = min(max(d,-10.0),10.0);
+		iter = 0;
+		QP_Gmax_old = INF;
+		QP_active_size = active_size;
 
-			double delta = fabs(w[j]+d)-fabs(w[j]) + G*d;
-			int num_linesearch;
-			for(num_linesearch=0; num_linesearch < max_num_linesearch; num_linesearch++)
+		for(int i=0; i<l; i++)
+			xTd[i] = 0;
+
+		// optimize QP over wpd
+		while(iter < max_iter)
+		{
+			QP_Gmax_new = 0;
+			QP_Gnorm1_new = 0;
+
+			for(j=0; j<QP_active_size; j++)
 			{
-				cond = fabs(w[j]+d)-fabs(w[j]) - sigma*delta;
+				int i = j+rand()%(QP_active_size-j);
+				swap(index[i], index[j]);
+			}
+
+			for(s=0; s<QP_active_size; s++)
+			{
+				j = index[s];
+				H = Hdiag[j];
 
-				// approx line search works only if xij>=0
-				if(x_min >= 0)
+				x = prob_col->x[j];
+				G = Grad[j] + (wpd[j]-w[j])*nu;
+				while(x->index != -1)
 				{
-					double tmp = exp(d*xj_max[j]);
-					appxcond1 = log(1+sum1*(tmp-1)/xj_max[j]/C_sum[j])*C_sum[j] + cond - d*xjpos_sum[j];
-					appxcond2 = log(1+sum2*(1/tmp-1)/xj_max[j]/C_sum[j])*C_sum[j] + cond + d*xjneg_sum[j];
-					if(min(appxcond1,appxcond2) <= 0)
+					int ind = x->index-1;
+					G += x->value*D[ind]*xTd[ind];
+					x++;
+				}
+
+				double Gp = G+1;
+				double Gn = G-1;
+				double violation = 0;
+				if(wpd[j] == 0)
+				{
+					if(Gp < 0)
+						violation = -Gp;
+					else if(Gn > 0)
+						violation = Gn;
+					//inner-level shrinking
+					else if(Gp>QP_Gmax_old/l && Gn<-QP_Gmax_old/l)
 					{
-						x = prob_col->x[j];
-						while(x->index != -1)
-						{
-							exp_wTx[x->index-1] *= exp(d*x->value);
-							x++;
-						}
-						break;
+						QP_active_size--;
+						swap(index[s], index[QP_active_size]);
+						s--;
+						continue;
 					}
 				}
+				else if(wpd[j] > 0)
+					violation = fabs(Gp);
+				else
+					violation = fabs(Gn);
+
+				QP_Gmax_new = max(QP_Gmax_new, violation);
+				QP_Gnorm1_new += violation;
 
-				cond += d*xjneg_sum[j];
+				// obtain solution of one-variable problem
+				if(Gp <= H*wpd[j])
+					z = -Gp/H;
+				else if(Gn >= H*wpd[j])
+					z = -Gn/H;
+				else
+					z = -wpd[j];
+
+				if(fabs(z) < 1.0e-12)
+					continue;
+				z = min(max(z,-10.0),10.0);
+
+				wpd[j] += z;
 
-				int i = 0;
 				x = prob_col->x[j];
 				while(x->index != -1)
 				{
 					int ind = x->index-1;
-					double exp_dx = exp(d*x->value);
-					exp_wTx_new[i] = exp_wTx[ind]*exp_dx;
-					cond += C[GETI(ind)]*log((1+exp_wTx_new[i])/(exp_dx+exp_wTx_new[i]));
-					x++; i++;
+					xTd[ind] += x->value*z;
+					x++;
 				}
+			}
 
-				if(cond <= 0)
-				{
-					int i = 0;
-					x = prob_col->x[j];
-					while(x->index != -1)
-					{
-						int ind = x->index-1;
-						exp_wTx[ind] = exp_wTx_new[i];
-						x++; i++;
-					}
+			iter++;
+			//if(iter % 10 == 0)
+			//	info(".");
+
+			if(QP_Gnorm1_new <= inner_eps*Gnorm1_init)
+			{
+				//inner stopping
+				if(QP_active_size == active_size)
 					break;
-				}
+				//active set reactivation
 				else
 				{
-					d *= 0.5;
-					delta *= 0.5;
+					info("*");
+					QP_active_size = active_size;
+					QP_Gmax_old = INF;
+					continue;
 				}
 			}
 
-			w[j] += d;
+			QP_Gmax_old = QP_Gmax_new;
+		}
 
-			// recompute exp_wTx[] if line search takes too many steps
-			if(num_linesearch >= max_num_linesearch)
+		if(iter >= max_iter)
+			info("WARNING: reaching max number of inner iterations\n");
+
+		delta = 0;
+		w_norm_new = 0;
+		for(j=0; j<w_size; j++)
+		{
+			delta += Grad[j]*(wpd[j]-w[j]);
+			if(wpd[j] != 0)
+				w_norm_new += fabs(wpd[j]);
+		}
+		delta += (w_norm_new-w_norm);
+
+		negsum_xTd = 0;
+		for(int i=0; i<l; i++)
+			if(y[i] == -1)
+				negsum_xTd += C[GETI(i)]*xTd[i];
+
+		int num_linesearch;
+		for(num_linesearch=0; num_linesearch < max_num_linesearch; num_linesearch++)
+		{
+			cond = w_norm_new-w_norm - sigma*delta;
+			cond += negsum_xTd;
+
+			for(int i=0; i<l; i++)
 			{
-				info("#");
-				for(int i=0; i<l; i++)
-					exp_wTx[i] = 0;
+				double exp_xTd = exp(xTd[i]);
+				exp_wTx_new[i] = exp_wTx[i]*exp_xTd;
+				cond += C[GETI(i)]*log((1+exp_wTx_new[i])/(exp_xTd+exp_wTx_new[i]));
+			}
 
-				for(int i=0; i<w_size; i++)
+			if(cond <= 0)
+			{
+				w_norm = w_norm_new;
+				for(j=0; j<w_size; j++)
+					w[j] = wpd[j];
+				for(int i=0; i<l; i++)
 				{
-					if(w[i]==0) continue;
-					x = prob_col->x[i];
-					while(x->index != -1)
-					{
-						exp_wTx[x->index-1] += w[i]*x->value;
-						x++;
-					}
+					exp_wTx[i] = exp_wTx_new[i];
+					double tau_tmp = 1/(1+exp_wTx[i]);
+					tau[i] = C[GETI(i)]*tau_tmp;
+					D[i] = C[GETI(i)]*exp_wTx[i]*tau_tmp*tau_tmp;
 				}
-
+				break;
+			}
+			else
+			{
+				w_norm_new = 0;
+				for(j=0; j<w_size; j++)
+				{
+					wpd[j] = (w[j]+wpd[j])*0.5;
+					if(wpd[j] != 0)
+						w_norm_new += fabs(wpd[j]);
+				}
+				delta *= 0.5;
+				negsum_xTd *= 0.5;
 				for(int i=0; i<l; i++)
-					exp_wTx[i] = exp(exp_wTx[i]);
+					xTd[i] *= 0.5;
 			}
 		}
 
-		if(iter == 0)
-			Gmax_init = Gmax_new;
-		iter++;
-		if(iter % 10 == 0)
-			info(".");
-
-		if(Gmax_new <= eps*Gmax_init)
+		// Recompute some info due to too many line search steps
+		if(num_linesearch >= max_num_linesearch)
 		{
-			if(active_size == w_size)
-				break;
-			else
+			for(int i=0; i<l; i++)
+				exp_wTx[i] = 0;
+
+			for(int i=0; i<w_size; i++)
 			{
-				active_size = w_size;
-				info("*");
-				Gmax_old = INF;
-				continue;
+				if(w[i]==0) continue;
+				x = prob_col->x[i];
+				while(x->index != -1)
+				{
+					exp_wTx[x->index-1] += w[i]*x->value;
+					x++;
+				}
 			}
+
+			for(int i=0; i<l; i++)
+				exp_wTx[i] = exp(exp_wTx[i]);
 		}
 
+		if(iter == 1)
+			inner_eps *= 0.25;
+
+		newton_iter++;
 		Gmax_old = Gmax_new;
+
+		info("iter %3d  #CD cycles %d\n", newton_iter, iter);
 	}
 
-	info("\noptimization finished, #iter = %d\n", iter);
-	if(iter >= max_iter)
-		info("\nWARNING: reaching max number of iterations\n");
+	info("=========================\n");
+	info("optimization finished, #iter = %d\n", newton_iter);
+	if(newton_iter >= max_newton_iter)
+		info("WARNING: reaching max number of iterations\n");
 
 	// calculate objective value
 	
@@ -1616,12 +1700,15 @@ static void solve_l1r_lr(
 
 	delete [] index;
 	delete [] y;
+	delete [] Hdiag;
+	delete [] Grad;
+	delete [] wpd;
+	delete [] xjneg_sum;
+	delete [] xTd;
 	delete [] exp_wTx;
 	delete [] exp_wTx_new;
-	delete [] xj_max;
-	delete [] C_sum;
-	delete [] xjneg_sum;
-	delete [] xjpos_sum;
+	delete [] tau;
+	delete [] D;
 }
 
 // transpose matrix X from row format to column format
diff --git a/train.c b/train.c
index 60c8dae..4920313 100644
--- a/train.c
+++ b/train.c
@@ -33,7 +33,7 @@ void exit_with_help()
 	"	-s 1, 3, 4 and 7\n"
 	"		Dual maximal violation <= eps; similar to libsvm (default 0.1)\n"
 	"	-s 5 and 6\n"
-	"		|f'(w)|_inf <= eps*min(pos,neg)/l*|f'(w0)|_inf,\n"
+	"		|f'(w)|_1 <= eps*min(pos,neg)/l*|f'(w0)|_1,\n"
 	"		where f is the primal function (default 0.01)\n"
 	"-B bias : if bias >= 0, instance x becomes [x; bias]; if < 0, no bias term added (default -1)\n"
 	"-wi weight: weights adjust the parameter C of different classes (see README for details)\n"
-- 
2.40.0