granicus.if.org Git - liblinear/commitdiff
Add L2-regularized logistic regression (dual)
author	cjlin <cjlin@16e7d947-dcc2-db11-b54a-0017319806e7>
Sun, 5 Sep 2010 13:36:05 +0000 (13:36 +0000)
committer	cjlin <cjlin@16e7d947-dcc2-db11-b54a-0017319806e7>
Sun, 5 Sep 2010 13:36:05 +0000 (13:36 +0000)
by implementing Alg 5 of Yu et al.

README
linear.cpp
linear.h
train.c

diff --git a/README b/README
index b59ce23acf63ade82e3a72cf0ffd7e4d00a0647c..c9d6660b6621cb7547df77967843be37a310f6ba 100644 (file)
--- a/README
+++ b/README
@@ -96,25 +96,26 @@ and mark
 
 Usage: train [options] training_set_file [model_file]
 options:
 -s type : set type of solver (default 1)
-        0 -- L2-regularized logistic regression
-        1 -- L2-regularized L2-loss support vector classification (dual)
-        2 -- L2-regularized L2-loss support vector classification (primal)
-        3 -- L2-regularized L1-loss support vector classification (dual)
-        4 -- multi-class support vector classification by Crammer and Singer
-        5 -- L1-regularized L2-loss support vector classification
-        6 -- L1-regularized logistic regression
+       0 -- L2-regularized logistic regression (primal)
+       1 -- L2-regularized L2-loss support vector classification (dual)
+       2 -- L2-regularized L2-loss support vector classification (primal)
+       3 -- L2-regularized L1-loss support vector classification (dual)
+       4 -- multi-class support vector classification by Crammer and Singer
+       5 -- L1-regularized L2-loss support vector classification
+       6 -- L1-regularized logistic regression
+       7 -- L2-regularized logistic regression (dual)
 -c cost : set the parameter C (default 1)
 -e epsilon : set tolerance of termination criterion
-        -s 0 and 2
-                |f'(w)|_2 <= eps*min(pos,neg)/l*|f'(w0)|_2,
-                where f is the primal function and pos/neg are # of
-                positive/negative data (default 0.01)
-        -s 1, 3, and 4
-                Dual maximal violation <= eps; similar to libsvm (default 0.1)
-        -s 5 and 6
-                |f'(w)|_inf <= eps*min(pos,neg)/l*|f'(w0)|_inf,
-        where f is the primal function (default 0.01)
+       -s 0 and 2
+               |f'(w)|_2 <= eps*min(pos,neg)/l*|f'(w0)|_2,
+               where f is the primal function and pos/neg are # of
+               positive/negative data (default 0.01)
+       -s 1, 3, 4 and 7
+               Dual maximal violation <= eps; similar to libsvm (default 0.1)
+       -s 5 and 6
+               |f'(w)|_inf <= eps*min(pos,neg)/l*|f'(w0)|_inf,
+               where f is the primal function (default 0.01)
 -B bias : if bias >= 0, instance x becomes [x; bias]; if < 0, no bias term added (default -1)
 -wi weight: weights adjust the parameter C of different classes (see README for details)
 -v n: n-fold cross validation mode
@@ -155,10 +157,15 @@ where
 
 Q is a matrix with Q_ij = y_i y_j x_i^T x_j.
 
+For L2-regularized logistic regression (-s 7), we solve
+
+min_alpha  0.5(alpha^T Q alpha) + \sum alpha_i*log(alpha_i) + \sum (C-alpha_i)*log(C-alpha_i) - a constant
+    s.t.   0 <= alpha_i <= C,
+
 If bias >= 0, w becomes [w; w_{n+1}] and x becomes [x; bias].
 
-The primal-dual relationship implies that -s 1 and -s 2 gives the same
-model.
+The primal-dual relationship implies that -s 1 and -s 2 give the same
+model, and -s 0 and -s 7 give the same.
 
 We implement 1-vs-the rest multi-class strategy. In training i
 vs. non_i, their C parameters are (weight from -wi)*C and C,
@@ -288,15 +295,16 @@ Library Usage
                 double* weight;
         };
 
-    solver_type can be one of L2R_LR, L2R_L2LOSS_SVC_DUAL, L2R_L2LOSS_SVC, L2R_L1LOSS_SVC_DUAL, MCSVM_CS, L1R_L2LOSS_SVC, L1R_LR.
+    solver_type can be one of L2R_LR, L2R_L2LOSS_SVC_DUAL, L2R_L2LOSS_SVC, L2R_L1LOSS_SVC_DUAL, MCSVM_CS, L1R_L2LOSS_SVC, L1R_LR, L2R_LR_DUAL.
 
-    L2R_LR                L2-regularized logistic regression
+    L2R_LR                L2-regularized logistic regression (primal)
     L2R_L2LOSS_SVC_DUAL   L2-regularized L2-loss support vector classification (dual)
     L2R_L2LOSS_SVC        L2-regularized L2-loss support vector classification (primal)
     L2R_L1LOSS_SVC_DUAL   L2-regularized L1-loss support vector classification (dual)
     MCSVM_CS              multi-class support vector classification by Crammer and Singer
     L1R_L2LOSS_SVC        L1-regularized L2-loss support vector classification
     L1R_LR                L1-regularized logistic regression
+    L2R_LR_DUAL           L2-regularized logistic regression (dual)
 
     C is the cost of constraints violation.
     eps is the stopping criterion.
diff --git a/linear.cpp b/linear.cpp
index 9058c194b81db6c3e214aa1ecc7d48c500cb6413..df52406955fd1d2061e0387539a456a7367cb8b1 100644 (file)
--- a/linear.cpp
+++ b/linear.cpp
@@ -648,7 +648,7 @@ void Solver_MCSVM_CS::Solve(double *w)
 
        info("\noptimization finished, #iter = %d\n",iter);
        if (iter >= max_iter)
-               info("Warning: reaching max number of iterations\n");
+               info("\nWARNING: reaching max number of iterations\n");
 
        // calculate objective value
        double v = 0;
@@ -887,6 +887,174 @@ static void solve_l2r_l1l2_svc(
        delete [] index;
 }
 
+// A coordinate descent algorithm for 
+// the dual of L2-regularized logistic regression problems
+//
+//  min_\alpha  0.5(\alpha^T Q \alpha) + \sum \alpha_i log (\alpha_i) + (upper_bound_i - alpha_i) log (upper_bound_i - alpha_i) ,
+//    s.t.      0 <= alpha_i <= upper_bound_i,
+// 
+//  where Qij = yi yj xi^T xj and 
+//  upper_bound_i = Cp if y_i = 1
+//  upper_bound_i = Cn if y_i = -1
+//
+// Given: 
+// x, y, Cp, Cn
+// eps is the stopping tolerance
+//
+// solution will be put in w
+//
+// See Algorithm 5 of Yu et al., MLJ 2010
+
+#undef GETI
+#define GETI(i) (y[i]+1) // maps y[i] in {-1,+1} to upper_bound index {0,2}
+// To support weights for instances, use GETI(i) (i)
+
+void solve_l2r_lr_dual(const problem *prob, double *w, double eps, double Cp, double Cn)
+{
+       int l = prob->l;
+       int w_size = prob->n;
+       int i, s, iter = 0;
+       double *xTx = new double[l]; // cached squared norms x_i^T x_i
+       int max_iter = 1000;
+       int *index = new int[l];                
+       double *alpha = new double[2*l]; // store alpha and C - alpha
+       schar *y = new schar[l];        
+       int max_inner_iter = 100; // for inner Newton
+       double innereps = 1e-2; 
+       double innereps_min = min(1e-8, eps);
+       double upper_bound[3] = {Cn, 0, Cp}; // indexed via GETI: y=-1 -> Cn, y=+1 -> Cp
+
+       for(i=0; i<w_size; i++)
+               w[i] = 0;
+       for(i=0; i<l; i++)
+       {
+               if(prob->y[i] > 0)
+               {
+                       y[i] = +1; 
+               }
+               else
+               {
+                       y[i] = -1;
+               }
+               alpha[2*i] = min(0.001*upper_bound[GETI(i)], 1e-8); // strictly inside (0, C): the log terms need alpha > 0
+               alpha[2*i+1] = upper_bound[GETI(i)] - alpha[2*i];
+
+               xTx[i] = 0;
+               feature_node *xi = prob->x[i];
+               while (xi->index != -1)
+               {
+                       xTx[i] += (xi->value)*(xi->value);
+                       w[xi->index-1] += y[i]*alpha[2*i]*xi->value; // w = sum_i y_i alpha_i x_i
+                       xi++;
+               }
+               index[i] = i;
+       }
+
+       while (iter < max_iter)
+       {
+               for (i=0; i<l; i++) // permute instance order (Fisher-Yates shuffle)
+               {
+                       int j = i+rand()%(l-i);
+                       swap(index[i], index[j]);
+               }
+               int newton_iter = 0;
+               double Gmax = 0;
+               for (s=0; s<l; s++)
+               {
+                       i = index[s];
+                       schar yi = y[i];
+                       double C = upper_bound[GETI(i)];
+                       double ywTx = 0, xisq = xTx[i];
+                       feature_node *xi = prob->x[i];
+                       while (xi->index != -1)
+                       {
+                               ywTx += w[xi->index-1]*xi->value;
+                               xi++;
+                       }
+                       ywTx *= y[i];
+                       double a = xisq, b = ywTx;
+
+                       // Decide to minimize g_1(z) or g_2(z)
+                       int ind1 = 2*i, ind2 = 2*i+1, sign = 1;
+                       if(0.5*a*(alpha[ind2]-alpha[ind1])+b < 0) 
+                       {
+                               ind1 = 2*i+1;
+                               ind2 = 2*i;
+                               sign = -1;
+                       }
+
+                       //  g_t(z) = z*log(z) + (C-z)*log(C-z) + 0.5a(z-alpha_old)^2 + sign*b(z-alpha_old)
+                       double alpha_old = alpha[ind1];
+                       double z = alpha_old;
+                       if(C - z < 0.5 * C) // keep the Newton start away from the C boundary
+                               z = 0.1*z;
+                       double gp = a*(z-alpha_old)+sign*b+log(z/(C-z));
+                       Gmax = max(Gmax, fabs(gp));
+
+                       // Newton method on the sub-problem
+                       const double eta = 0.1; // xi in the paper
+                       int inner_iter = 0;
+                       while (inner_iter <= max_inner_iter) 
+                       {
+                               if(fabs(gp) < innereps)
+                                       break;
+                               double gpp = a + C/(C-z)/z; // g_t''(z)
+                               double tmpz = z - gp/gpp;
+                               if(tmpz <= 0) 
+                                       z *= eta;
+                               else // tmpz in (0, C)
+                                       z = tmpz;
+                               gp = a*(z-alpha_old)+sign*b+log(z/(C-z));
+                               newton_iter++;
+                               inner_iter++;
+                       }
+
+                       if(inner_iter > 0) // update w
+                       {
+                               alpha[ind1] = z;
+                               alpha[ind2] = C-z;
+                               xi = prob->x[i];
+                               while (xi->index != -1)
+                               {
+                                       w[xi->index-1] += sign*(z-alpha_old)*yi*xi->value;
+                                       xi++;
+                               }  
+                       }
+               }
+
+               iter++;
+               if(iter % 10 == 0)
+                       info(".");
+
+               if(Gmax < eps) 
+                       break;
+
+               if(newton_iter < l/10) // cheap outer iteration -> tighten inner Newton tolerance
+                       innereps = max(innereps_min, 0.1*innereps);
+
+       }
+
+       info("\noptimization finished, #iter = %d\n",iter);
+       if (iter >= max_iter)
+               info("\nWARNING: reaching max number of iterations\nUsing -s 0 may be faster (also see FAQ)\n\n");
+
+       // calculate objective value
+       
+       double v = 0;
+       for(i=0; i<w_size; i++)
+               v += w[i] * w[i];
+       v *= 0.5;
+       for(i=0; i<l; i++)
+               v += alpha[2*i] * log(alpha[2*i]) + alpha[2*i+1] * log(alpha[2*i+1]) 
+                       - upper_bound[GETI(i)] * log(upper_bound[GETI(i)]);
+       info("Objective value = %lf\n", v);
+
+       delete [] xTx;
+       delete [] alpha;
+       delete [] y;
+       delete [] index;
+}
+
 // A coordinate descent algorithm for 
 // L1-regularized L2-loss support vector classification
 //
@@ -1629,6 +1797,9 @@ static void train_one(const problem *prob, const parameter *param, double *w, do
                        delete [] x_space;
                        break;
                }
+               case L2R_LR_DUAL:
+                       solve_l2r_lr_dual(prob, w, eps, Cp, Cn);
+                       break;
                default:
                        fprintf(stderr, "Error: unknown solver_type\n");
                        break;
@@ -1677,7 +1848,7 @@ model* train(const problem *prob, const parameter *param)
                        if(param->weight_label[i] == label[j])
                                break;
                if(j == nr_class)
-                       fprintf(stderr,"warning: class label %d specified in weight is not found\n", param->weight_label[i]);
+                       fprintf(stderr,"WARNING: class label %d specified in weight is not found\n", param->weight_label[i]);
                else
                        weighted_C[j] *= param->weight[i];
        }
@@ -1899,7 +2070,8 @@ int predict_probability(const struct model *model_, const struct feature_node *x
 
 static const char *solver_type_table[]=
 {
-       "L2R_LR", "L2R_L2LOSS_SVC_DUAL", "L2R_L2LOSS_SVC","L2R_L1LOSS_SVC_DUAL","MCSVM_CS", "L1R_L2LOSS_SVC","L1R_LR", NULL
+       "L2R_LR", "L2R_L2LOSS_SVC_DUAL", "L2R_L2LOSS_SVC", "L2R_L1LOSS_SVC_DUAL", "MCSVM_CS",
+       "L1R_L2LOSS_SVC", "L1R_LR", "L2R_LR_DUAL", NULL
 };
 
 int save_model(const char *model_file_name, const struct model *model_)
@@ -2100,7 +2272,8 @@ const char *check_parameter(const problem *prob, const parameter *param)
                && param->solver_type != L2R_L1LOSS_SVC_DUAL
                && param->solver_type != MCSVM_CS
                && param->solver_type != L1R_L2LOSS_SVC
-               && param->solver_type != L1R_LR)
+               && param->solver_type != L1R_LR
+               && param->solver_type != L2R_LR_DUAL)
                return "unknown solver type";
 
        return NULL;
@@ -2108,7 +2281,9 @@ const char *check_parameter(const problem *prob, const parameter *param)
 
 int check_probability_model(const struct model *model_)
 {
-       return (model_->param.solver_type==L2R_LR || model_->param.solver_type==L1R_LR);
+       return (model_->param.solver_type==L2R_LR ||
+                       model_->param.solver_type==L2R_LR_DUAL ||
+                       model_->param.solver_type==L1R_LR);
 }
 
 void set_print_string_function(void (*print_func)(const char*))
diff --git a/linear.h b/linear.h
index 9d9b170be6c2674906ff038baeea6bb30a6d323e..2a1aa28d2d4e5aab647dffe3ac0a66e2c55ba248 100644 (file)
--- a/linear.h
+++ b/linear.h
@@ -19,7 +19,7 @@ struct problem
        double bias;            /* < 0 if no bias term */  
 };
 
-enum { L2R_LR, L2R_L2LOSS_SVC_DUAL, L2R_L2LOSS_SVC, L2R_L1LOSS_SVC_DUAL, MCSVM_CS, L1R_L2LOSS_SVC, L1R_LR }; /* solver_type */
+enum { L2R_LR, L2R_L2LOSS_SVC_DUAL, L2R_L2LOSS_SVC, L2R_L1LOSS_SVC_DUAL, MCSVM_CS, L1R_L2LOSS_SVC, L1R_LR, L2R_LR_DUAL }; /* solver_type */
 
 struct parameter
 {
diff --git a/train.c b/train.c
index 98bb0a20888fd6142636b1e5d63619130c2796b7..c3ce195a03045d1b2b2d84fc7e3f39d12422badb 100644 (file)
--- a/train.c
+++ b/train.c
@@ -16,20 +16,21 @@ void exit_with_help()
        "Usage: train [options] training_set_file [model_file]\n"
        "options:\n"
        "-s type : set type of solver (default 1)\n"
-       "       0 -- L2-regularized logistic regression\n"
+       "       0 -- L2-regularized logistic regression (primal)\n"
        "       1 -- L2-regularized L2-loss support vector classification (dual)\n"     
        "       2 -- L2-regularized L2-loss support vector classification (primal)\n"
        "       3 -- L2-regularized L1-loss support vector classification (dual)\n"
        "       4 -- multi-class support vector classification by Crammer and Singer\n"
        "       5 -- L1-regularized L2-loss support vector classification\n"
        "       6 -- L1-regularized logistic regression\n"
+       "       7 -- L2-regularized logistic regression (dual)\n"
        "-c cost : set the parameter C (default 1)\n"
        "-e epsilon : set tolerance of termination criterion\n"
        "       -s 0 and 2\n" 
        "               |f'(w)|_2 <= eps*min(pos,neg)/l*|f'(w0)|_2,\n" 
        "               where f is the primal function and pos/neg are # of\n" 
        "               positive/negative data (default 0.01)\n"
-       "       -s 1, 3, and 4\n"
+       "       -s 1, 3, 4 and 7\n"
        "               Dual maximal violation <= eps; similar to libsvm (default 0.1)\n"
        "       -s 5 and 6\n"
        "               |f'(w)|_inf <= eps*min(pos,neg)/l*|f'(w0)|_inf,\n"
@@ -229,7 +230,7 @@ void parse_command_line(int argc, char **argv, char *input_file_name, char *mode
        {
                if(param.solver_type == L2R_LR || param.solver_type == L2R_L2LOSS_SVC)
                        param.eps = 0.01;
-               else if(param.solver_type == L2R_L2LOSS_SVC_DUAL || param.solver_type == L2R_L1LOSS_SVC_DUAL || param.solver_type == MCSVM_CS)
+               else if(param.solver_type == L2R_L2LOSS_SVC_DUAL || param.solver_type == L2R_L1LOSS_SVC_DUAL || param.solver_type == MCSVM_CS || param.solver_type == L2R_LR_DUAL)
                        param.eps = 0.1;
                else if(param.solver_type == L1R_L2LOSS_SVC || param.solver_type == L1R_LR)
                        param.eps = 0.01;