From: cjlin
Date: Sun, 5 Sep 2010 13:36:05 +0000 (+0000)
Subject: Add L2-regularized logistic regression (dual)
X-Git-Tag: v170~6
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=978067018e7d3cb385e0942dbb042e16a82620e4;p=liblinear

Add L2-regularized logistic regression (dual) by implementing Algorithm 5 of Yu et al.
---

diff --git a/README b/README
index b59ce23..c9d6660 100644
--- a/README
+++ b/README
@@ -96,25 +96,27 @@ and mark
 
 Usage: train [options] training_set_file [model_file]
 options:
 -s type : set type of solver (default 1)
-    0 -- L2-regularized logistic regression
-    1 -- L2-regularized L2-loss support vector classification (dual)
-    2 -- L2-regularized L2-loss support vector classification (primal)
-    3 -- L2-regularized L1-loss support vector classification (dual)
-    4 -- multi-class support vector classification by Crammer and Singer
-    5 -- L1-regularized L2-loss support vector classification
-    6 -- L1-regularized logistic regression
+    0 -- L2-regularized logistic regression (primal)
+    1 -- L2-regularized L2-loss support vector classification (dual)
+    2 -- L2-regularized L2-loss support vector classification (primal)
+    3 -- L2-regularized L1-loss support vector classification (dual)
+    4 -- multi-class support vector classification by Crammer and Singer
+    5 -- L1-regularized L2-loss support vector classification
+    6 -- L1-regularized logistic regression
+    7 -- L2-regularized logistic regression (dual)
 -c cost : set the parameter C (default 1)
 -e epsilon : set tolerance of termination criterion
-    -s 0 and 2
-        |f'(w)|_2 <= eps*min(pos,neg)/l*|f'(w0)|_2,
-        where f is the primal function and pos/neg are # of
-        positive/negative data (default 0.01)
-    -s 1, 3, and 4
-        Dual maximal violation <= eps; similar to libsvm (default 0.1)
-    -s 5 and 6
-        |f'(w)|_inf <= eps*min(pos,neg)/l*|f'(w0)|_inf,
-        where f is the primal function (default 0.01)
+    -s 0 and 2
+        |f'(w)|_2 <= eps*min(pos,neg)/l*|f'(w0)|_2,
+        where f is the primal function and pos/neg are # of
+        positive/negative data (default 0.01)
+    -s 1, 3, 4 and 7
+        Dual maximal violation <= eps; similar to libsvm (default 0.1)
+    -s 5 and 6
+        |f'(w)|_inf <= eps*min(pos,neg)/l*|f'(w0)|_inf,
+        where f is the primal function (default 0.01)
 -B bias : if bias >= 0, instance x becomes [x; bias]; if < 0, no bias term added (default -1)
 -wi weight: weights adjust the parameter C of different classes (see README for details)
 -v n: n-fold cross validation mode
@@ -155,10 +157,15 @@ where Q is a matrix with Q_ij = y_i y_j x_i^T x_j.
 
+For L2-regularized logistic regression (-s 7), we solve
+
+min_alpha  0.5(alpha^T Q alpha) + \sum alpha_i*log(alpha_i) + \sum (C-alpha_i)*log(C-alpha_i) - a constant
+    s.t.   0 <= alpha_i <= C,
+
 If bias >= 0, w becomes [w; w_{n+1}] and x becomes [x; bias].
 
-The primal-dual relationship implies that -s 1 and -s 2 gives the same
-model.
+The primal-dual relationship implies that -s 1 and -s 2 give the same
+model, and -s 0 and -s 7 likewise give the same model.
 
 We implement 1-vs-the rest multi-class strategy. In training i
 vs. non_i, their C parameters are (weight from -wi)*C and C,
@@ -288,15 +295,16 @@ Library Usage
         double* weight;
     };
 
-    solver_type can be one of L2R_LR, L2R_L2LOSS_SVC_DUAL, L2R_L2LOSS_SVC, L2R_L1LOSS_SVC_DUAL, MCSVM_CS, L1R_L2LOSS_SVC, L1R_LR.
+    solver_type can be one of L2R_LR, L2R_L2LOSS_SVC_DUAL, L2R_L2LOSS_SVC, L2R_L1LOSS_SVC_DUAL, MCSVM_CS, L1R_L2LOSS_SVC, L1R_LR, L2R_LR_DUAL.
 
-    L2R_LR                L2-regularized logistic regression
+    L2R_LR                L2-regularized logistic regression (primal)
     L2R_L2LOSS_SVC_DUAL   L2-regularized L2-loss support vector classification (dual)
     L2R_L2LOSS_SVC        L2-regularized L2-loss support vector classification (primal)
     L2R_L1LOSS_SVC_DUAL   L2-regularized L1-loss support vector classification (dual)
     MCSVM_CS              multi-class support vector classification by Crammer and Singer
     L1R_L2LOSS_SVC        L1-regularized L2-loss support vector classification
     L1R_LR                L1-regularized logistic regression
+    L2R_LR_DUAL           L2-regularized logistic regression (dual)
 
     C is the cost of constraints violation.
     eps is the stopping criterion.
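A note on the formulation added to the README above: at any dual-feasible alpha the primal vector is recovered as w = \sum_i y_i alpha_i x_i, so alpha^T Q alpha = w^T w, which is why -s 0 and -s 7 converge to the same model. The standalone sketch below is illustration only, not part of this commit; the dense toy data and the helper name dual_objective are invented. It evaluates the -s 7 dual objective with the constant term dropped.

    /* Illustration only -- not liblinear code.
       Evaluates 0.5*(alpha^T Q alpha) + sum_i [alpha_i*log(alpha_i)
       + (C-alpha_i)*log(C-alpha_i)] for dense data, using
       w = sum_i y_i*alpha_i*x_i so that alpha^T Q alpha = w^T w. */
    #include <math.h>
    #include <stdio.h>
    #include <vector>

    static double dual_objective(const std::vector<std::vector<double> > &x,
                                 const std::vector<int> &y,
                                 const std::vector<double> &alpha, double C)
    {
        size_t l = x.size(), n = x[0].size(), i, j;
        std::vector<double> w(n, 0.0);
        for(i=0; i<l; i++)                 // w = sum_i y_i alpha_i x_i
            for(j=0; j<n; j++)
                w[j] += y[i]*alpha[i]*x[i][j];
        double v = 0;
        for(j=0; j<n; j++)                 // 0.5 * alpha^T Q alpha
            v += 0.5*w[j]*w[j];
        for(i=0; i<l; i++)                 // entropy-like terms
            v += alpha[i]*log(alpha[i]) + (C-alpha[i])*log(C-alpha[i]);
        return v;
    }

    int main()
    {
        std::vector<std::vector<double> > x(2, std::vector<double>(2, 0.0));
        x[0][0] = 1;                       // two toy instances
        x[1][1] = 1;
        std::vector<int> y;
        y.push_back(+1); y.push_back(-1);
        std::vector<double> alpha(2, 0.5); // strictly inside (0, C)
        printf("dual objective = %f\n", dual_objective(x, y, alpha, 1.0));
        return 0;
    }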
diff --git a/linear.cpp b/linear.cpp
index 9058c19..df52406 100644
--- a/linear.cpp
+++ b/linear.cpp
@@ -648,7 +648,7 @@ void Solver_MCSVM_CS::Solve(double *w)
 
     info("\noptimization finished, #iter = %d\n",iter);
     if (iter >= max_iter)
-        info("Warning: reaching max number of iterations\n");
+        info("\nWARNING: reaching max number of iterations\n");
 
     // calculate objective value
     double v = 0;
@@ -887,6 +887,174 @@ static void solve_l2r_l1l2_svc(
     delete [] index;
 }
 
+// A coordinate descent algorithm for
+// the dual of L2-regularized logistic regression problems
+//
+//  min_\alpha  0.5(\alpha^T Q \alpha) + \sum \alpha_i log(\alpha_i) + (upper_bound_i - \alpha_i) log(upper_bound_i - \alpha_i),
+//    s.t.      0 <= \alpha_i <= upper_bound_i,
+//
+//  where Qij = yi yj xi^T xj and
+//  upper_bound_i = Cp if y_i = 1
+//  upper_bound_i = Cn if y_i = -1
+//
+// Given:
+// x, y, Cp, Cn
+// eps is the stopping tolerance
+//
+// solution will be put in w
+//
+// See Algorithm 5 of Yu et al., MLJ 2010
+
+#undef GETI
+#define GETI(i) (y[i]+1)
+// To support weights for instances, use GETI(i) (i)
+
+void solve_l2r_lr_dual(const problem *prob, double *w, double eps, double Cp, double Cn)
+{
+    int l = prob->l;
+    int w_size = prob->n;
+    int i, s, iter = 0;
+    double *xTx = new double[l];
+    int max_iter = 1000;
+    int *index = new int[l];
+    double *alpha = new double[2*l]; // store alpha and C - alpha
+    schar *y = new schar[l];
+    int max_inner_iter = 100; // for inner Newton
+    double innereps = 1e-2;
+    double innereps_min = min(1e-8, eps);
+    double upper_bound[3] = {Cn, 0, Cp};
+
+    for(i=0; i<w_size; i++)
+        w[i] = 0;
+    for(i=0; i<l; i++)
+    {
+        if(prob->y[i] > 0)
+        {
+            y[i] = +1;
+        }
+        else
+        {
+            y[i] = -1;
+        }
+        alpha[2*i] = min(0.001*upper_bound[GETI(i)], 1e-8);
+        alpha[2*i+1] = upper_bound[GETI(i)] - alpha[2*i];
+
+        xTx[i] = 0;
+        feature_node *xi = prob->x[i];
+        while (xi->index != -1)
+        {
+            xTx[i] += (xi->value)*(xi->value);
+            w[xi->index-1] += y[i]*alpha[2*i]*xi->value;
+            xi++;
+        }
+        index[i] = i;
+    }
+
+    while (iter < max_iter)
+    {
+        for (i=0; i<l; i++)
+        {
+            int j = i+rand()%(l-i);
+            swap(index[i], index[j]);
+        }
+        int newton_iter = 0;
+        double Gmax = 0;
+        for (s=0; s<l; s++)
+        {
+            i = index[s];
+            schar yi = y[i];
+            double C = upper_bound[GETI(i)];
+            double ywTx = 0, xisq = xTx[i];
+            feature_node *xi = prob->x[i];
+            while (xi->index != -1)
+            {
+                ywTx += w[xi->index-1]*xi->value;
+                xi++;
+            }
+            ywTx *= y[i];
+            double a = xisq, b = ywTx;
+
+            // Decide to minimize g_1(z) or g_2(z)
+            int ind1 = 2*i, ind2 = 2*i+1, sign = 1;
+            if(0.5*a*(alpha[ind2]-alpha[ind1])+b < 0)
+            {
+                ind1 = 2*i+1;
+                ind2 = 2*i;
+                sign = -1;
+            }
+
+            // g_t(z) = z*log(z) + (C-z)*log(C-z) + 0.5a(z-alpha_old)^2 + sign*b(z-alpha_old)
+            double alpha_old = alpha[ind1];
+            double z = alpha_old;
+            if(C - z < 0.5 * C)
+                z = 0.1*z;
+            double gp = a*(z-alpha_old)+sign*b+log(z/(C-z));
+            Gmax = max(Gmax, fabs(gp));
+
+            // Newton method on the sub-problem
+            const double eta = 0.1; // xi in the paper
+            int inner_iter = 0;
+            while (inner_iter <= max_inner_iter)
+            {
+                if(fabs(gp) < innereps)
+                    break;
+                double gpp = a + C/(C-z)/z;
+                double tmpz = z - gp/gpp;
+                if(tmpz <= 0)
+                    z *= eta;
+                else // tmpz in (0, C)
+                    z = tmpz;
+                gp = a*(z-alpha_old)+sign*b+log(z/(C-z));
+                newton_iter++;
+                inner_iter++;
+            }
+
+            if(inner_iter > 0) // update w
+            {
+                alpha[ind1] = z;
+                alpha[ind2] = C-z;
+                xi = prob->x[i];
+                while (xi->index != -1)
+                {
+                    w[xi->index-1] += sign*(z-alpha_old)*yi*xi->value;
+                    xi++;
+                }
+            }
+        }
+
+        iter++;
+        if(iter % 10 == 0)
+            info(".");
+
+        if(Gmax < eps)
+            break;
+
+        if(newton_iter < l/10)
+            innereps = max(innereps_min, 0.1*innereps);
+
+    }
+
+    info("\noptimization finished, #iter = %d\n",iter);
+    if (iter >= max_iter)
+        info("\nWARNING: reaching max number of iterations\nUsing -s 0 may be faster (also see FAQ)\n\n");
+
+    // calculate objective value
+
+    double v = 0;
+    for(i=0; i<w_size; i++)
+        v += w[i] * w[i];
+    v *= 0.5;
+    for(i=0; i<l; i++)
+        v += alpha[2*i] * log(alpha[2*i]) + alpha[2*i+1] * log(alpha[2*i+1])
+            - upper_bound[GETI(i)] * log(upper_bound[GETI(i)]);
+    info("Objective value = %lf\n", v);
+
+    delete [] xTx;
+    delete [] alpha;
+    delete [] y;
+    delete [] index;
+}
+
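To see the inner loop above in isolation: for each chosen coordinate, Algorithm 5 minimizes the one-variable function

    g(z) = z*log(z) + (C-z)*log(C-z) + 0.5*a*(z-alpha_old)^2 + b*(z-alpha_old),  0 < z < C,

with a = x_i^T x_i and b = y_i w^T x_i times the chosen sign, so that g'(z) = a*(z-alpha_old) + b + log(z/(C-z)) and g''(z) = a + C/((C-z)*z). Below is a self-contained sketch of that Newton iteration, mirroring the constants above; illustration only, newton_subproblem is our name, not a liblinear function, and the caller must supply 0 < alpha_old < C.

    #include <math.h>

    /* Illustration only: Newton's method for the one-variable
       sub-problem solved inside solve_l2r_lr_dual above. */
    static double newton_subproblem(double a, double b, double C,
                                    double alpha_old, double innereps)
    {
        const double eta = 0.1;           /* step-shrinking factor */
        const int max_inner_iter = 100;
        double z = alpha_old;
        if(C - z < 0.5*C)                 /* start away from the C boundary */
            z = 0.1*z;
        double gp = a*(z-alpha_old) + b + log(z/(C-z));
        for(int it = 0; it < max_inner_iter && fabs(gp) >= innereps; it++)
        {
            double gpp = a + C/((C-z)*z); /* g''(z) > 0 on (0, C) */
            double tmpz = z - gp/gpp;     /* full Newton step */
            if(tmpz <= 0)
                z *= eta;                 /* step left the interval: shrink */
            else                          /* tmpz stays in (0, C) */
                z = tmpz;
            gp = a*(z-alpha_old) + b + log(z/(C-z));
        }
        return z;
    }

Shrinking by eta instead of projecting onto the boundary keeps z strictly inside (0, C), where log(z/(C-z)) stays finite.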
@@ ... @@
             if(param->weight_label[i] == label[j])
                 break;
         if(j == nr_class)
-            fprintf(stderr,"warning: class label %d specified in weight is not found\n", param->weight_label[i]);
+            fprintf(stderr,"WARNING: class label %d specified in weight is not found\n", param->weight_label[i]);
         else
             weighted_C[j] *= param->weight[i];
     }
@@ -1899,7 +2070,8 @@ int predict_probability(const struct model *model_, const struct feature_node *x
 
 static const char *solver_type_table[]=
 {
-    "L2R_LR", "L2R_L2LOSS_SVC_DUAL", "L2R_L2LOSS_SVC","L2R_L1LOSS_SVC_DUAL","MCSVM_CS", "L1R_L2LOSS_SVC","L1R_LR", NULL
+    "L2R_LR", "L2R_L2LOSS_SVC_DUAL", "L2R_L2LOSS_SVC", "L2R_L1LOSS_SVC_DUAL", "MCSVM_CS",
+    "L1R_L2LOSS_SVC", "L1R_LR", "L2R_LR_DUAL", NULL
 };
 
 int save_model(const char *model_file_name, const struct model *model_)
@@ -2100,7 +2272,8 @@ const char *check_parameter(const problem *prob, const parameter *param)
         && param->solver_type != L2R_L1LOSS_SVC_DUAL
         && param->solver_type != MCSVM_CS
         && param->solver_type != L1R_L2LOSS_SVC
-        && param->solver_type != L1R_LR)
+        && param->solver_type != L1R_LR
+        && param->solver_type != L2R_LR_DUAL)
         return "unknown solver type";
 
     return NULL;
@@ -2108,7 +2281,9 @@ const char *check_parameter(const problem *prob, const parameter *param)
 
 int check_probability_model(const struct model *model_)
 {
-    return (model_->param.solver_type==L2R_LR || model_->param.solver_type==L1R_LR);
+    return (model_->param.solver_type==L2R_LR ||
+            model_->param.solver_type==L2R_LR_DUAL ||
+            model_->param.solver_type==L1R_LR);
 }
 
 void set_print_string_function(void (*print_func)(const char*))
diff --git a/linear.h b/linear.h
index 9d9b170..2a1aa28 100644
--- a/linear.h
+++ b/linear.h
@@ -19,7 +19,7 @@ struct problem
     double bias;            /* < 0 if no bias term */
 };
 
-enum { L2R_LR, L2R_L2LOSS_SVC_DUAL, L2R_L2LOSS_SVC, L2R_L1LOSS_SVC_DUAL, MCSVM_CS, L1R_L2LOSS_SVC, L1R_LR }; /* solver_type */
+enum { L2R_LR, L2R_L2LOSS_SVC_DUAL, L2R_L2LOSS_SVC, L2R_L1LOSS_SVC_DUAL, MCSVM_CS, L1R_L2LOSS_SVC, L1R_LR, L2R_LR_DUAL }; /* solver_type */
 
 struct parameter
 {
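Because check_probability_model now accepts L2R_LR_DUAL, a model trained with the new solver supports probability outputs just like -s 0. A minimal sketch of that path through the public interface; illustration only, with invented toy data, assuming the v1.70-era API in which problem.y holds int labels, and omitting model cleanup and most error handling:

    #include <stdio.h>
    #include "linear.h"

    int main()
    {
        /* two sparse instances; indices are 1-based, -1 terminates */
        struct feature_node x0[] = {{1, 1.0}, {-1, 0}};
        struct feature_node x1[] = {{2, 1.0}, {-1, 0}};
        struct feature_node *x[] = {x0, x1};
        int y[] = {+1, -1};

        struct problem prob;
        prob.l = 2; prob.n = 2;
        prob.y = y; prob.x = x;
        prob.bias = -1;                   /* no bias term */

        struct parameter param;
        param.solver_type = L2R_LR_DUAL;
        param.C = 1;
        param.eps = 0.1;                  /* default for the dual solvers */
        param.nr_weight = 0;              /* no per-class weights */
        param.weight_label = NULL;
        param.weight = NULL;

        if(check_parameter(&prob, &param) != NULL)
            return 1;

        struct model *m = train(&prob, &param);
        double prob_estimates[2];         /* one entry per class */
        int label = predict_probability(m, x0, prob_estimates);
        printf("predicted %d, P(first label) = %g\n", label, prob_estimates[0]);
        return 0;
    }

The entries of prob_estimates follow the model's internal label order (model->label).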
criterion\n" " -s 0 and 2\n" " |f'(w)|_2 <= eps*min(pos,neg)/l*|f'(w0)|_2,\n" " where f is the primal function and pos/neg are # of\n" " positive/negative data (default 0.01)\n" - " -s 1, 3, and 4\n" + " -s 1, 3, 4 and 7\n" " Dual maximal violation <= eps; similar to libsvm (default 0.1)\n" " -s 5 and 6\n" " |f'(w)|_inf <= eps*min(pos,neg)/l*|f'(w0)|_inf,\n" @@ -229,7 +230,7 @@ void parse_command_line(int argc, char **argv, char *input_file_name, char *mode { if(param.solver_type == L2R_LR || param.solver_type == L2R_L2LOSS_SVC) param.eps = 0.01; - else if(param.solver_type == L2R_L2LOSS_SVC_DUAL || param.solver_type == L2R_L1LOSS_SVC_DUAL || param.solver_type == MCSVM_CS) + else if(param.solver_type == L2R_L2LOSS_SVC_DUAL || param.solver_type == L2R_L1LOSS_SVC_DUAL || param.solver_type == MCSVM_CS || param.solver_type == L2R_LR_DUAL) param.eps = 0.1; else if(param.solver_type == L1R_L2LOSS_SVC || param.solver_type == L1R_LR) param.eps = 0.01;