From: cjlin
Date: Sun, 5 Sep 2010 13:36:05 +0000 (+0000)
Subject: Add L2-regularized logistic regression (dual)
X-Git-Tag: v170~6
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=978067018e7d3cb385e0942dbb042e16a82620e4;p=liblinear

Add L2-regularized logistic regression (dual) by implementing Algorithm 5 of Yu et al.
---

diff --git a/README b/README
index b59ce23..c9d6660 100644
--- a/README
+++ b/README
@@ -96,25 +96,27 @@ and mark
 
 Usage: train [options] training_set_file [model_file]
 options:
 -s type : set type of solver (default 1)
-    0 -- L2-regularized logistic regression
-    1 -- L2-regularized L2-loss support vector classification (dual)
-    2 -- L2-regularized L2-loss support vector classification (primal)
-    3 -- L2-regularized L1-loss support vector classification (dual)
-    4 -- multi-class support vector classification by Crammer and Singer
-    5 -- L1-regularized L2-loss support vector classification
-    6 -- L1-regularized logistic regression
+    0 -- L2-regularized logistic regression (primal)
+    1 -- L2-regularized L2-loss support vector classification (dual)
+    2 -- L2-regularized L2-loss support vector classification (primal)
+    3 -- L2-regularized L1-loss support vector classification (dual)
+    4 -- multi-class support vector classification by Crammer and Singer
+    5 -- L1-regularized L2-loss support vector classification
+    6 -- L1-regularized logistic regression
+    7 -- L2-regularized logistic regression (dual)
 -c cost : set the parameter C (default 1)
 -e epsilon : set tolerance of termination criterion
-    -s 0 and 2
-        |f'(w)|_2 <= eps*min(pos,neg)/l*|f'(w0)|_2,
-        where f is the primal function and pos/neg are # of
-        positive/negative data (default 0.01)
-    -s 1, 3, and 4
-        Dual maximal violation <= eps; similar to libsvm (default 0.1)
-    -s 5 and 6
-        |f'(w)|_inf <= eps*min(pos,neg)/l*|f'(w0)|_inf,
-        where f is the primal function (default 0.01)
+    -s 0 and 2
+        |f'(w)|_2 <= eps*min(pos,neg)/l*|f'(w0)|_2,
+        where f is the primal function and pos/neg are # of
+        positive/negative data (default 0.01)
+    -s 1, 3, 4 and 7
+        Dual maximal violation <= eps; similar to libsvm (default 0.1)
+    -s 5 and 6
+        |f'(w)|_inf <= eps*min(pos,neg)/l*|f'(w0)|_inf,
+        where f is the primal function (default 0.01)
 -B bias : if bias >= 0, instance x becomes [x; bias]; if < 0, no bias term added (default -1)
 -wi weight: weights adjust the parameter C of different classes (see README for details)
 -v n: n-fold cross validation mode
@@ -155,10 +157,15 @@ where Q is a matrix with Q_ij = y_i y_j x_i^T x_j.
 
+For L2-regularized logistic regression (-s 7), we solve
+
+min_alpha  0.5(alpha^T Q alpha) + \sum alpha_i*log(alpha_i) + \sum (C-alpha_i)*log(C-alpha_i) - a constant
+    s.t.   0 <= alpha_i <= C,
+
 If bias >= 0, w becomes [w; w_{n+1}] and x becomes [x; bias].
 
-The primal-dual relationship implies that -s 1 and -s 2 gives the same
-model.
+The primal-dual relationship implies that -s 1 and -s 2 give the same
+model, and -s 0 and -s 7 likewise give the same model.
 
 We implement 1-vs-the rest multi-class strategy. In training i
 vs. non_i, their C parameters are (weight from -wi)*C and C,
@@ -288,15 +295,16 @@ Library Usage
         double* weight;
     };
 
-    solver_type can be one of L2R_LR, L2R_L2LOSS_SVC_DUAL, L2R_L2LOSS_SVC, L2R_L1LOSS_SVC_DUAL, MCSVM_CS, L1R_L2LOSS_SVC, L1R_LR.
+    solver_type can be one of L2R_LR, L2R_L2LOSS_SVC_DUAL, L2R_L2LOSS_SVC, L2R_L1LOSS_SVC_DUAL, MCSVM_CS, L1R_L2LOSS_SVC, L1R_LR, L2R_LR_DUAL.
 
-    L2R_LR                L2-regularized logistic regression
+    L2R_LR                L2-regularized logistic regression (primal)
     L2R_L2LOSS_SVC_DUAL   L2-regularized L2-loss support vector classification (dual)
     L2R_L2LOSS_SVC        L2-regularized L2-loss support vector classification (primal)
     L2R_L1LOSS_SVC_DUAL   L2-regularized L1-loss support vector classification (dual)
     MCSVM_CS              multi-class support vector classification by Crammer and Singer
     L1R_L2LOSS_SVC        L1-regularized L2-loss support vector classification
     L1R_LR                L1-regularized logistic regression
+    L2R_LR_DUAL           L2-regularized logistic regression (dual)
 
     C is the cost of constraints violation.
     eps is the stopping criterion.
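A note on the formulation added to the README above: at any dual-feasible alpha the primal vector is recovered as w = \sum_i y_i alpha_i x_i, so alpha^T Q alpha = w^T w, which is why -s 0 and -s 7 converge to the same model. The standalone sketch below is illustration only, not part of this commit; the dense toy data and the helper name dual_objective are invented. It evaluates the -s 7 dual objective with the constant term dropped.

    /* Illustration only -- not liblinear code.
       Evaluates 0.5*(alpha^T Q alpha) + sum_i [alpha_i*log(alpha_i)
       + (C-alpha_i)*log(C-alpha_i)] for dense data, using
       w = sum_i y_i*alpha_i*x_i so that alpha^T Q alpha = w^T w. */
    #include <math.h>
    #include <stdio.h>
    #include <vector>

    static double dual_objective(const std::vector<std::vector<double> > &x,
                                 const std::vector<int> &y,
                                 const std::vector<double> &alpha, double C)
    {
        size_t l = x.size(), n = x[0].size(), i, j;
        std::vector<double> w(n, 0.0);
        for(i=0; i<l; i++)                 // w = sum_i y_i alpha_i x_i
            for(j=0; j<n; j++)
                w[j] += y[i]*alpha[i]*x[i][j];
        double v = 0;
        for(j=0; j<n; j++)                 // 0.5 * alpha^T Q alpha
            v += 0.5*w[j]*w[j];
        for(i=0; i<l; i++)                 // entropy-like terms
            v += alpha[i]*log(alpha[i]) + (C-alpha[i])*log(C-alpha[i]);
        return v;
    }

    int main()
    {
        std::vector<std::vector<double> > x(2, std::vector<double>(2, 0.0));
        x[0][0] = 1;                       // two toy instances
        x[1][1] = 1;
        std::vector<int> y;
        y.push_back(+1); y.push_back(-1);
        std::vector<double> alpha(2, 0.5); // strictly inside (0, C)
        printf("dual objective = %f\n", dual_objective(x, y, alpha, 1.0));
        return 0;
    }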
diff --git a/linear.cpp b/linear.cpp
index 9058c19..df52406 100644
--- a/linear.cpp
+++ b/linear.cpp
@@ -648,7 +648,7 @@ void Solver_MCSVM_CS::Solve(double *w)
 
     info("\noptimization finished, #iter = %d\n",iter);
     if (iter >= max_iter)
-        info("Warning: reaching max number of iterations\n");
+        info("\nWARNING: reaching max number of iterations\n");
 
     // calculate objective value
     double v = 0;
@@ -887,6 +887,174 @@ static void solve_l2r_l1l2_svc(
     delete [] index;
 }
 
+// A coordinate descent algorithm for
+// the dual of L2-regularized logistic regression problems
+//
+//  min_\alpha  0.5(\alpha^T Q \alpha) + \sum \alpha_i log(\alpha_i) + (upper_bound_i - \alpha_i) log(upper_bound_i - \alpha_i),
+//    s.t.      0 <= \alpha_i <= upper_bound_i,
+//
+//  where Qij = yi yj xi^T xj and
+//  upper_bound_i = Cp if y_i = 1
+//  upper_bound_i = Cn if y_i = -1
+//
+// Given:
+// x, y, Cp, Cn
+// eps is the stopping tolerance
+//
+// solution will be put in w
+//
+// See Algorithm 5 of Yu et al., MLJ 2010
+
+#undef GETI
+#define GETI(i) (y[i]+1)
+// To support weights for instances, use GETI(i) (i)
+
+void solve_l2r_lr_dual(const problem *prob, double *w, double eps, double Cp, double Cn)
+{
+    int l = prob->l;
+    int w_size = prob->n;
+    int i, s, iter = 0;
+    double *xTx = new double[l];
+    int max_iter = 1000;
+    int *index = new int[l];
+    double *alpha = new double[2*l]; // store alpha and C - alpha
+    schar *y = new schar[l];
+    int max_inner_iter = 100; // for inner Newton
+    double innereps = 1e-2;
+    double innereps_min = min(1e-8, eps);
+    double upper_bound[3] = {Cn, 0, Cp};
+
+    for(i=0; i<w_size; i++)
+        w[i] = 0;
+    for(i=0; i<l; i++)
+    {
+        if(prob->y[i] > 0)
+        {
+            y[i] = +1;
+        }
+        else
+        {
+            y[i] = -1;
+        }
+        alpha[2*i] = min(0.001*upper_bound[GETI(i)], 1e-8);
+        alpha[2*i+1] = upper_bound[GETI(i)] - alpha[2*i];
+
+        xTx[i] = 0;
+        feature_node *xi = prob->x[i];
+        while (xi->index != -1)
+        {
+            xTx[i] += (xi->value)*(xi->value);
+            w[xi->index-1] += y[i]*alpha[2*i]*xi->value;
+            xi++;
+        }
+        index[i] = i;
+    }
+
+    while (iter < max_iter)
+    {
+        for (i=0; i<l; i++)
+        {
+            int j = i+rand()%(l-i);
+            swap(index[i], index[j]);
+        }
+        int newton_iter = 0;
+        double Gmax = 0;
+        for (s=0; s<l; s++)
+        {
+            i = index[s];
+            schar yi = y[i];
+            double C = upper_bound[GETI(i)];
+            double ywTx = 0, xisq = xTx[i];
+            feature_node *xi = prob->x[i];
+            while (xi->index != -1)
+            {
+                ywTx += w[xi->index-1]*xi->value;
+                xi++;
+            }
+            ywTx *= y[i];
+            double a = xisq, b = ywTx;
+
+            // Decide to minimize g_1(z) or g_2(z)
+            int ind1 = 2*i, ind2 = 2*i+1, sign = 1;
+            if(0.5*a*(alpha[ind2]-alpha[ind1])+b < 0)
+            {
+                ind1 = 2*i+1;
+                ind2 = 2*i;
+                sign = -1;
+            }
+
+            // g_t(z) = z*log(z) + (C-z)*log(C-z) + 0.5a(z-alpha_old)^2 + sign*b(z-alpha_old)
+            double alpha_old = alpha[ind1];
+            double z = alpha_old;
+            if(C - z < 0.5 * C)
+                z = 0.1*z;
+            double gp = a*(z-alpha_old)+sign*b+log(z/(C-z));
+            Gmax = max(Gmax, fabs(gp));
+
+            // Newton method on the sub-problem
+            const double eta = 0.1; // xi in the paper
+            int inner_iter = 0;
+            while (inner_iter <= max_inner_iter)
+            {
+                if(fabs(gp) < innereps)
+                    break;
+                double gpp = a + C/(C-z)/z;
+                double tmpz = z - gp/gpp;
+                if(tmpz <= 0)
+                    z *= eta;
+                else // tmpz in (0, C)
+                    z = tmpz;
+                gp = a*(z-alpha_old)+sign*b+log(z/(C-z));
+                newton_iter++;
+                inner_iter++;
+            }
+
+            if(inner_iter > 0) // update w
+            {
+                alpha[ind1] = z;
+                alpha[ind2] = C-z;
+                xi = prob->x[i];
+                while (xi->index != -1)
+                {
+                    w[xi->index-1] += sign*(z-alpha_old)*yi*xi->value;
+                    xi++;
+                }
+            }
+        }
+
+        iter++;
+        if(iter % 10 == 0)
+            info(".");
+
+        if(Gmax < eps)
+            break;
+
+        if(newton_iter < l/10)
+            innereps = max(innereps_min, 0.1*innereps);
+
+    }
+
+    info("\noptimization finished, #iter = %d\n",iter);
+    if (iter >= max_iter)
+        info("\nWARNING: reaching max number of iterations\nUsing -s 0 may be faster (also see FAQ)\n\n");
+
+    // calculate objective value
+
+    double v = 0;
+    for(i=0; i<w_size; i++)
+        v += w[i] * w[i];
+    v *= 0.5;
+    for(i=0; i<l; i++)
+        v += alpha[2*i] * log(alpha[2*i]) + alpha[2*i+1] * log(alpha[2*i+1])
+            - upper_bound[GETI(i)] * log(upper_bound[GETI(i)]);
+    info("Objective value = %lf\n", v);
+
+    delete [] xTx;
+    delete [] alpha;
+    delete [] y;
+    delete [] index;
+}
+
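To see the inner loop above in isolation: for each chosen coordinate, Algorithm 5 minimizes the one-variable function

    g(z) = z*log(z) + (C-z)*log(C-z) + 0.5*a*(z-alpha_old)^2 + b*(z-alpha_old),  0 < z < C,

with a = x_i^T x_i and b = y_i w^T x_i times the chosen sign, so that g'(z) = a*(z-alpha_old) + b + log(z/(C-z)) and g''(z) = a + C/((C-z)*z). Below is a self-contained sketch of that Newton iteration, mirroring the constants above; illustration only, newton_subproblem is our name, not a liblinear function, and the caller must supply 0 < alpha_old < C.

    #include <math.h>

    /* Illustration only: Newton's method for the one-variable
       sub-problem solved inside solve_l2r_lr_dual above. */
    static double newton_subproblem(double a, double b, double C,
                                    double alpha_old, double innereps)
    {
        const double eta = 0.1;           /* step-shrinking factor */
        const int max_inner_iter = 100;
        double z = alpha_old;
        if(C - z < 0.5*C)                 /* start away from the C boundary */
            z = 0.1*z;
        double gp = a*(z-alpha_old) + b + log(z/(C-z));
        for(int it = 0; it < max_inner_iter && fabs(gp) >= innereps; it++)
        {
            double gpp = a + C/((C-z)*z); /* g''(z) > 0 on (0, C) */
            double tmpz = z - gp/gpp;     /* full Newton step */
            if(tmpz <= 0)
                z *= eta;                 /* step left the interval: shrink */
            else                          /* tmpz stays in (0, C) */
                z = tmpz;
            gp = a*(z-alpha_old) + b + log(z/(C-z));
        }
        return z;
    }

Shrinking by eta instead of projecting onto the boundary keeps z strictly inside (0, C), where log(z/(C-z)) stays finite.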
@@ ... @@
             if(param->weight_label[i] == label[j])
                 break;
         if(j == nr_class)
-            fprintf(stderr,"warning: class label %d specified in weight is not found\n", param->weight_label[i]);
+            fprintf(stderr,"WARNING: class label %d specified in weight is not found\n", param->weight_label[i]);
         else
             weighted_C[j] *= param->weight[i];
     }
@@ -1899,7 +2070,8 @@ int predict_probability(const struct model *model_, const struct feature_node *x
 
 static const char *solver_type_table[]=
 {
-    "L2R_LR", "L2R_L2LOSS_SVC_DUAL", "L2R_L2LOSS_SVC","L2R_L1LOSS_SVC_DUAL","MCSVM_CS", "L1R_L2LOSS_SVC","L1R_LR", NULL
+    "L2R_LR", "L2R_L2LOSS_SVC_DUAL", "L2R_L2LOSS_SVC", "L2R_L1LOSS_SVC_DUAL", "MCSVM_CS",
+    "L1R_L2LOSS_SVC", "L1R_LR", "L2R_LR_DUAL", NULL
 };
 
 int save_model(const char *model_file_name, const struct model *model_)
@@ -2100,7 +2272,8 @@ const char *check_parameter(const problem *prob, const parameter *param)
         && param->solver_type != L2R_L1LOSS_SVC_DUAL
         && param->solver_type != MCSVM_CS
         && param->solver_type != L1R_L2LOSS_SVC
-        && param->solver_type != L1R_LR)
+        && param->solver_type != L1R_LR
+        && param->solver_type != L2R_LR_DUAL)
         return "unknown solver type";
 
     return NULL;
@@ -2108,7 +2281,9 @@ const char *check_parameter(const problem *prob, const parameter *param)
 
 int check_probability_model(const struct model *model_)
 {
-    return (model_->param.solver_type==L2R_LR || model_->param.solver_type==L1R_LR);
+    return (model_->param.solver_type==L2R_LR ||
+            model_->param.solver_type==L2R_LR_DUAL ||
+            model_->param.solver_type==L1R_LR);
 }
 
 void set_print_string_function(void (*print_func)(const char*))
diff --git a/linear.h b/linear.h
index 9d9b170..2a1aa28 100644
--- a/linear.h
+++ b/linear.h
@@ -19,7 +19,7 @@ struct problem
     double bias;            /* < 0 if no bias term */
 };
 
-enum { L2R_LR, L2R_L2LOSS_SVC_DUAL, L2R_L2LOSS_SVC, L2R_L1LOSS_SVC_DUAL, MCSVM_CS, L1R_L2LOSS_SVC, L1R_LR }; /* solver_type */
+enum { L2R_LR, L2R_L2LOSS_SVC_DUAL, L2R_L2LOSS_SVC, L2R_L1LOSS_SVC_DUAL, MCSVM_CS, L1R_L2LOSS_SVC, L1R_LR, L2R_LR_DUAL }; /* solver_type */
 
 struct parameter
 {
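Because check_probability_model now accepts L2R_LR_DUAL, a model trained with the new solver supports probability outputs just like -s 0. A minimal sketch of that path through the public interface; illustration only, with invented toy data, assuming the v1.70-era API in which problem.y holds int labels, and omitting model cleanup and most error handling:

    #include <stdio.h>
    #include "linear.h"

    int main()
    {
        /* two sparse instances; indices are 1-based, -1 terminates */
        struct feature_node x0[] = {{1, 1.0}, {-1, 0}};
        struct feature_node x1[] = {{2, 1.0}, {-1, 0}};
        struct feature_node *x[] = {x0, x1};
        int y[] = {+1, -1};

        struct problem prob;
        prob.l = 2; prob.n = 2;
        prob.y = y; prob.x = x;
        prob.bias = -1;                   /* no bias term */

        struct parameter param;
        param.solver_type = L2R_LR_DUAL;
        param.C = 1;
        param.eps = 0.1;                  /* default for the dual solvers */
        param.nr_weight = 0;              /* no per-class weights */
        param.weight_label = NULL;
        param.weight = NULL;

        if(check_parameter(&prob, &param) != NULL)
            return 1;

        struct model *m = train(&prob, &param);
        double prob_estimates[2];         /* one entry per class */
        int label = predict_probability(m, x0, prob_estimates);
        printf("predicted %d, P(first label) = %g\n", label, prob_estimates[0]);
        return 0;
    }

The entries of prob_estimates follow the model's internal label order (model->label).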
criterion\n" " -s 0 and 2\n" " |f'(w)|_2 <= eps*min(pos,neg)/l*|f'(w0)|_2,\n" " where f is the primal function and pos/neg are # of\n" " positive/negative data (default 0.01)\n" - " -s 1, 3, and 4\n" + " -s 1, 3, 4 and 7\n" " Dual maximal violation <= eps; similar to libsvm (default 0.1)\n" " -s 5 and 6\n" " |f'(w)|_inf <= eps*min(pos,neg)/l*|f'(w0)|_inf,\n" @@ -229,7 +230,7 @@ void parse_command_line(int argc, char **argv, char *input_file_name, char *mode { if(param.solver_type == L2R_LR || param.solver_type == L2R_L2LOSS_SVC) param.eps = 0.01; - else if(param.solver_type == L2R_L2LOSS_SVC_DUAL || param.solver_type == L2R_L1LOSS_SVC_DUAL || param.solver_type == MCSVM_CS) + else if(param.solver_type == L2R_L2LOSS_SVC_DUAL || param.solver_type == L2R_L1LOSS_SVC_DUAL || param.solver_type == MCSVM_CS || param.solver_type == L2R_LR_DUAL) param.eps = 0.1; else if(param.solver_type == L1R_L2LOSS_SVC || param.solver_type == L1R_LR) param.eps = 0.01;