granicus.if.org Git - liblinear/commitdiff
Add L2-regularized logistic regression (dual)
author	cjlin <cjlin@16e7d947-dcc2-db11-b54a-0017319806e7>
Sun, 5 Sep 2010 13:36:05 +0000 (13:36 +0000)
committer	cjlin <cjlin@16e7d947-dcc2-db11-b54a-0017319806e7>
Sun, 5 Sep 2010 13:36:05 +0000 (13:36 +0000)
by implementing Alg 5 of Yu et al.

README
linear.cpp
linear.h
train.c

diff --git a/README b/README
index b59ce23acf63ade82e3a72cf0ffd7e4d00a0647c..c9d6660b6621cb7547df77967843be37a310f6ba 100644 (file)
--- a/README
+++ b/README
@@ -96,25 +96,26 @@ and mark
 
 Usage: train [options] training_set_file [model_file]
 options:
 -s type : set type of solver (default 1)
-        0 -- L2-regularized logistic regression
-        1 -- L2-regularized L2-loss support vector classification (dual)
-        2 -- L2-regularized L2-loss support vector classification (primal)
-        3 -- L2-regularized L1-loss support vector classification (dual)
-        4 -- multi-class support vector classification by Crammer and Singer
-        5 -- L1-regularized L2-loss support vector classification
-        6 -- L1-regularized logistic regression
+       0 -- L2-regularized logistic regression (primal)
+       1 -- L2-regularized L2-loss support vector classification (dual)
+       2 -- L2-regularized L2-loss support vector classification (primal)
+       3 -- L2-regularized L1-loss support vector classification (dual)
+       4 -- multi-class support vector classification by Crammer and Singer
+       5 -- L1-regularized L2-loss support vector classification
+       6 -- L1-regularized logistic regression
+       7 -- L2-regularized logistic regression (dual)
 -c cost : set the parameter C (default 1)
 -e epsilon : set tolerance of termination criterion
-        -s 0 and 2
-                |f'(w)|_2 <= eps*min(pos,neg)/l*|f'(w0)|_2,
-                where f is the primal function and pos/neg are # of
-                positive/negative data (default 0.01)
-        -s 1, 3, and 4
-                Dual maximal violation <= eps; similar to libsvm (default 0.1)
-        -s 5 and 6
-                |f'(w)|_inf <= eps*min(pos,neg)/l*|f'(w0)|_inf,
-        where f is the primal function (default 0.01)
+       -s 0 and 2
+               |f'(w)|_2 <= eps*min(pos,neg)/l*|f'(w0)|_2,
+               where f is the primal function and pos/neg are # of
+               positive/negative data (default 0.01)
+       -s 1, 3, 4 and 7
+               Dual maximal violation <= eps; similar to libsvm (default 0.1)
+       -s 5 and 6
+               |f'(w)|_inf <= eps*min(pos,neg)/l*|f'(w0)|_inf,
+               where f is the primal function (default 0.01)
 -B bias : if bias >= 0, instance x becomes [x; bias]; if < 0, no bias term added (default -1)
 -wi weight: weights adjust the parameter C of different classes (see README for details)
 -v n: n-fold cross validation mode
@@ -155,10 +157,15 @@ where
 
 Q is a matrix with Q_ij = y_i y_j x_i^T x_j.
 
+For L2-regularized logistic regression (-s 7), we solve
+
+min_alpha  0.5(alpha^T Q alpha) + \sum alpha_i*log(alpha_i) + \sum (C-alpha_i)*log(C-alpha_i) - a constant
+    s.t.   0 <= alpha_i <= C,
+
 If bias >= 0, w becomes [w; w_{n+1}] and x becomes [x; bias].
 
-The primal-dual relationship implies that -s 1 and -s 2 gives the same
-model.
+The primal-dual relationship implies that -s 1 and -s 2 give the same
+model, and -s 0 and -s 7 give the same.
 
 We implement 1-vs-the rest multi-class strategy. In training i
 vs. non_i, their C parameters are (weight from -wi)*C and C,
@@ -288,15 +295,16 @@ Library Usage
                 double* weight;
         };
 
-    solver_type can be one of L2R_LR, L2R_L2LOSS_SVC_DUAL, L2R_L2LOSS_SVC, L2R_L1LOSS_SVC_DUAL, MCSVM_CS, L1R_L2LOSS_SVC, L1R_LR.
+    solver_type can be one of L2R_LR, L2R_L2LOSS_SVC_DUAL, L2R_L2LOSS_SVC, L2R_L1LOSS_SVC_DUAL, MCSVM_CS, L1R_L2LOSS_SVC, L1R_LR, L2R_LR_DUAL.
 
-    L2R_LR                L2-regularized logistic regression
+    L2R_LR                L2-regularized logistic regression (primal)
     L2R_L2LOSS_SVC_DUAL   L2-regularized L2-loss support vector classification (dual)
     L2R_L2LOSS_SVC        L2-regularized L2-loss support vector classification (primal)
     L2R_L1LOSS_SVC_DUAL   L2-regularized L1-loss support vector classification (dual)
     MCSVM_CS              multi-class support vector classification by Crammer and Singer
     L1R_L2LOSS_SVC        L1-regularized L2-loss support vector classification
     L1R_LR                L1-regularized logistic regression
+    L2R_LR_DUAL           L2-regularized logistic regression (dual)
 
     C is the cost of constraints violation.
     eps is the stopping criterion.
diff --git a/linear.cpp b/linear.cpp
index 9058c194b81db6c3e214aa1ecc7d48c500cb6413..df52406955fd1d2061e0387539a456a7367cb8b1 100644 (file)
--- a/linear.cpp
+++ b/linear.cpp
@@ -648,7 +648,7 @@ void Solver_MCSVM_CS::Solve(double *w)
 
        info("\noptimization finished, #iter = %d\n",iter);
        if (iter >= max_iter)
-               info("Warning: reaching max number of iterations\n");
+               info("\nWARNING: reaching max number of iterations\n");
 
        // calculate objective value
        double v = 0;
@@ -887,6 +887,174 @@ static void solve_l2r_l1l2_svc(
        delete [] index;
 }
 
+// A coordinate descent algorithm for 
+// the dual of L2-regularized logistic regression problems
+//
+//  min_\alpha  0.5(\alpha^T Q \alpha) + \sum \alpha_i log (\alpha_i) + (upper_bound_i - alpha_i) log (upper_bound_i - alpha_i) ,
+//    s.t.      0 <= alpha_i <= upper_bound_i,
+// 
+//  where Qij = yi yj xi^T xj and 
+//  upper_bound_i = Cp if y_i = 1
+//  upper_bound_i = Cn if y_i = -1
+//
+// Given: 
+// x, y, Cp, Cn
+// eps is the stopping tolerance
+//
+// solution will be put in w
+//
+// See Algorithm 5 of Yu et al., MLJ 2010
+
+#undef GETI
+#define GETI(i) (y[i]+1) // maps y[i] in {-1,+1} to upper_bound index {0,2}
+// To support weights for instances, use GETI(i) (i)
+
+void solve_l2r_lr_dual(const problem *prob, double *w, double eps, double Cp, double Cn)
+{
+       int l = prob->l;
+       int w_size = prob->n;
+       int i, s, iter = 0;
+       double *xTx = new double[l]; // cached squared norms x_i^T x_i
+       int max_iter = 1000;
+       int *index = new int[l];                
+       double *alpha = new double[2*l]; // store alpha and C - alpha
+       schar *y = new schar[l];        
+       int max_inner_iter = 100; // for inner Newton
+       double innereps = 1e-2; 
+       double innereps_min = min(1e-8, eps);
+       double upper_bound[3] = {Cn, 0, Cp}; // indexed via GETI: y=-1 -> Cn, y=+1 -> Cp
+
+       for(i=0; i<w_size; i++)
+               w[i] = 0;
+       for(i=0; i<l; i++)
+       {
+               if(prob->y[i] > 0)
+               {
+                       y[i] = +1; 
+               }
+               else
+               {
+                       y[i] = -1;
+               }
+               alpha[2*i] = min(0.001*upper_bound[GETI(i)], 1e-8); // strictly inside (0, C): the log terms need alpha > 0
+               alpha[2*i+1] = upper_bound[GETI(i)] - alpha[2*i];
+
+               xTx[i] = 0;
+               feature_node *xi = prob->x[i];
+               while (xi->index != -1)
+               {
+                       xTx[i] += (xi->value)*(xi->value);
+                       w[xi->index-1] += y[i]*alpha[2*i]*xi->value; // w = sum_i y_i alpha_i x_i
+                       xi++;
+               }
+               index[i] = i;
+       }
+
+       while (iter < max_iter)
+       {
+               for (i=0; i<l; i++) // permute instance order (Fisher-Yates shuffle)
+               {
+                       int j = i+rand()%(l-i);
+                       swap(index[i], index[j]);
+               }
+               int newton_iter = 0;
+               double Gmax = 0;
+               for (s=0; s<l; s++)
+               {
+                       i = index[s];
+                       schar yi = y[i];
+                       double C = upper_bound[GETI(i)];
+                       double ywTx = 0, xisq = xTx[i];
+                       feature_node *xi = prob->x[i];
+                       while (xi->index != -1)
+                       {
+                               ywTx += w[xi->index-1]*xi->value;
+                               xi++;
+                       }
+                       ywTx *= y[i];
+                       double a = xisq, b = ywTx;
+
+                       // Decide to minimize g_1(z) or g_2(z)
+                       int ind1 = 2*i, ind2 = 2*i+1, sign = 1;
+                       if(0.5*a*(alpha[ind2]-alpha[ind1])+b < 0) 
+                       {
+                               ind1 = 2*i+1;
+                               ind2 = 2*i;
+                               sign = -1;
+                       }
+
+                       //  g_t(z) = z*log(z) + (C-z)*log(C-z) + 0.5a(z-alpha_old)^2 + sign*b(z-alpha_old)
+                       double alpha_old = alpha[ind1];
+                       double z = alpha_old;
+                       if(C - z < 0.5 * C) // keep the Newton start away from the C boundary
+                               z = 0.1*z;
+                       double gp = a*(z-alpha_old)+sign*b+log(z/(C-z));
+                       Gmax = max(Gmax, fabs(gp));
+
+                       // Newton method on the sub-problem
+                       const double eta = 0.1; // xi in the paper
+                       int inner_iter = 0;
+                       while (inner_iter <= max_inner_iter) 
+                       {
+                               if(fabs(gp) < innereps)
+                                       break;
+                               double gpp = a + C/(C-z)/z; // g_t''(z)
+                               double tmpz = z - gp/gpp;
+                               if(tmpz <= 0) 
+                                       z *= eta;
+                               else // tmpz in (0, C)
+                                       z = tmpz;
+                               gp = a*(z-alpha_old)+sign*b+log(z/(C-z));
+                               newton_iter++;
+                               inner_iter++;
+                       }
+
+                       if(inner_iter > 0) // update w
+                       {
+                               alpha[ind1] = z;
+                               alpha[ind2] = C-z;
+                               xi = prob->x[i];
+                               while (xi->index != -1)
+                               {
+                                       w[xi->index-1] += sign*(z-alpha_old)*yi*xi->value;
+                                       xi++;
+                               }  
+                       }
+               }
+
+               iter++;
+               if(iter % 10 == 0)
+                       info(".");
+
+               if(Gmax < eps) 
+                       break;
+
+               if(newton_iter < l/10) // cheap outer iteration -> tighten inner Newton tolerance
+                       innereps = max(innereps_min, 0.1*innereps);
+
+       }
+
+       info("\noptimization finished, #iter = %d\n",iter);
+       if (iter >= max_iter)
+               info("\nWARNING: reaching max number of iterations\nUsing -s 0 may be faster (also see FAQ)\n\n");
+
+       // calculate objective value
+       
+       double v = 0;
+       for(i=0; i<w_size; i++)
+               v += w[i] * w[i];
+       v *= 0.5;
+       for(i=0; i<l; i++)
+               v += alpha[2*i] * log(alpha[2*i]) + alpha[2*i+1] * log(alpha[2*i+1]) 
+                       - upper_bound[GETI(i)] * log(upper_bound[GETI(i)]);
+       info("Objective value = %lf\n", v);
+
+       delete [] xTx;
+       delete [] alpha;
+       delete [] y;
+       delete [] index;
+}
+
 // A coordinate descent algorithm for 
 // L1-regularized L2-loss support vector classification
 //
@@ -1629,6 +1797,9 @@ static void train_one(const problem *prob, const parameter *param, double *w, do
                        delete [] x_space;
                        break;
                }
+               case L2R_LR_DUAL:
+                       solve_l2r_lr_dual(prob, w, eps, Cp, Cn);
+                       break;
                default:
                        fprintf(stderr, "Error: unknown solver_type\n");
                        break;
@@ -1677,7 +1848,7 @@ model* train(const problem *prob, const parameter *param)
                        if(param->weight_label[i] == label[j])
                                break;
                if(j == nr_class)
-                       fprintf(stderr,"warning: class label %d specified in weight is not found\n", param->weight_label[i]);
+                       fprintf(stderr,"WARNING: class label %d specified in weight is not found\n", param->weight_label[i]);
                else
                        weighted_C[j] *= param->weight[i];
        }
@@ -1899,7 +2070,8 @@ int predict_probability(const struct model *model_, const struct feature_node *x
 
 static const char *solver_type_table[]=
 {
-       "L2R_LR", "L2R_L2LOSS_SVC_DUAL", "L2R_L2LOSS_SVC","L2R_L1LOSS_SVC_DUAL","MCSVM_CS", "L1R_L2LOSS_SVC","L1R_LR", NULL
+       "L2R_LR", "L2R_L2LOSS_SVC_DUAL", "L2R_L2LOSS_SVC", "L2R_L1LOSS_SVC_DUAL", "MCSVM_CS",
+       "L1R_L2LOSS_SVC", "L1R_LR", "L2R_LR_DUAL", NULL
 };
 
 int save_model(const char *model_file_name, const struct model *model_)
@@ -2100,7 +2272,8 @@ const char *check_parameter(const problem *prob, const parameter *param)
                && param->solver_type != L2R_L1LOSS_SVC_DUAL
                && param->solver_type != MCSVM_CS
                && param->solver_type != L1R_L2LOSS_SVC
-               && param->solver_type != L1R_LR)
+               && param->solver_type != L1R_LR
+               && param->solver_type != L2R_LR_DUAL)
                return "unknown solver type";
 
        return NULL;
@@ -2108,7 +2281,9 @@ const char *check_parameter(const problem *prob, const parameter *param)
 
 int check_probability_model(const struct model *model_)
 {
-       return (model_->param.solver_type==L2R_LR || model_->param.solver_type==L1R_LR);
+       return (model_->param.solver_type==L2R_LR ||
+                       model_->param.solver_type==L2R_LR_DUAL ||
+                       model_->param.solver_type==L1R_LR);
 }
 
 void set_print_string_function(void (*print_func)(const char*))
diff --git a/linear.h b/linear.h
index 9d9b170be6c2674906ff038baeea6bb30a6d323e..2a1aa28d2d4e5aab647dffe3ac0a66e2c55ba248 100644 (file)
--- a/linear.h
+++ b/linear.h
@@ -19,7 +19,7 @@ struct problem
        double bias;            /* < 0 if no bias term */  
 };
 
-enum { L2R_LR, L2R_L2LOSS_SVC_DUAL, L2R_L2LOSS_SVC, L2R_L1LOSS_SVC_DUAL, MCSVM_CS, L1R_L2LOSS_SVC, L1R_LR }; /* solver_type */
+enum { L2R_LR, L2R_L2LOSS_SVC_DUAL, L2R_L2LOSS_SVC, L2R_L1LOSS_SVC_DUAL, MCSVM_CS, L1R_L2LOSS_SVC, L1R_LR, L2R_LR_DUAL }; /* solver_type */
 
 struct parameter
 {
diff --git a/train.c b/train.c
index 98bb0a20888fd6142636b1e5d63619130c2796b7..c3ce195a03045d1b2b2d84fc7e3f39d12422badb 100644 (file)
--- a/train.c
+++ b/train.c
@@ -16,20 +16,21 @@ void exit_with_help()
        "Usage: train [options] training_set_file [model_file]\n"
        "options:\n"
        "-s type : set type of solver (default 1)\n"
-       "       0 -- L2-regularized logistic regression\n"
+       "       0 -- L2-regularized logistic regression (primal)\n"
        "       1 -- L2-regularized L2-loss support vector classification (dual)\n"     
        "       2 -- L2-regularized L2-loss support vector classification (primal)\n"
        "       3 -- L2-regularized L1-loss support vector classification (dual)\n"
        "       4 -- multi-class support vector classification by Crammer and Singer\n"
        "       5 -- L1-regularized L2-loss support vector classification\n"
        "       6 -- L1-regularized logistic regression\n"
+       "       7 -- L2-regularized logistic regression (dual)\n"
        "-c cost : set the parameter C (default 1)\n"
        "-e epsilon : set tolerance of termination criterion\n"
        "       -s 0 and 2\n" 
        "               |f'(w)|_2 <= eps*min(pos,neg)/l*|f'(w0)|_2,\n" 
        "               where f is the primal function and pos/neg are # of\n" 
        "               positive/negative data (default 0.01)\n"
-       "       -s 1, 3, and 4\n"
+       "       -s 1, 3, 4 and 7\n"
        "               Dual maximal violation <= eps; similar to libsvm (default 0.1)\n"
        "       -s 5 and 6\n"
        "               |f'(w)|_inf <= eps*min(pos,neg)/l*|f'(w0)|_inf,\n"
@@ -229,7 +230,7 @@ void parse_command_line(int argc, char **argv, char *input_file_name, char *mode
        {
                if(param.solver_type == L2R_LR || param.solver_type == L2R_L2LOSS_SVC)
                        param.eps = 0.01;
-               else if(param.solver_type == L2R_L2LOSS_SVC_DUAL || param.solver_type == L2R_L1LOSS_SVC_DUAL || param.solver_type == MCSVM_CS)
+               else if(param.solver_type == L2R_L2LOSS_SVC_DUAL || param.solver_type == L2R_L1LOSS_SVC_DUAL || param.solver_type == MCSVM_CS || param.solver_type == L2R_LR_DUAL)
                        param.eps = 0.1;
                else if(param.solver_type == L1R_L2LOSS_SVC || param.solver_type == L1R_LR)
                        param.eps = 0.01;