granicus.if.org Git - liblinear/commitdiff
Add a new option -R for not regularizing the bias
author Wei-Lin Chiang <infwinston@gmail.com>
Thu, 16 Jul 2020 14:25:44 +0000 (22:25 +0800)
committer Wei-Lin Chiang <infwinston@gmail.com>
Thu, 16 Jul 2020 14:25:44 +0000 (22:25 +0800)
README
linear.cpp
linear.h
train.c

diff --git a/README b/README
index 51436804db732ebaffa8e2918fd6229cf6a6bec4..b81ad8c9d85bfaea2d70e3f0c980218c16095e56 100644 (file)
--- a/README
+++ b/README
@@ -136,6 +136,8 @@ options:
                |f'(alpha)|_1 <= eps |f'(alpha0)|,
                where f is the dual function (default 0.1)
 -B bias : if bias >= 0, instance x becomes [x; bias]; if < 0, no bias term added (default -1)
+-R : not regularize the bias; must with -B 1 to have the bias; DON'T use this unless you know what it is
+       (for -s 0, 2, 5, 6, 11)
 -wi weight: weights adjust the parameter C of different classes (see README for details)
 -v n: n-fold cross validation mode
 -C : find parameters (C for -s 0, 2 and C, p for -s 11)
diff --git a/linear.cpp b/linear.cpp
index e2965eb56dab3eac62c4c8d496f152dddd4b924d..2335406f6fa44878ff6761dbb2d9383f2d9d68f1 100644 (file)
--- a/linear.cpp
+++ b/linear.cpp
@@ -105,7 +105,7 @@ public:
 class l2r_lr_fun: public function
 {
 public:
-       l2r_lr_fun(const problem *prob, double *C);
+       l2r_lr_fun(const problem *prob, const parameter *param, double *C);
        ~l2r_lr_fun();
 
        double fun(double *w);
@@ -123,9 +123,10 @@ private:
        double *z;
        double *D;
        const problem *prob;
+       int regularize_bias;
 };
 
-l2r_lr_fun::l2r_lr_fun(const problem *prob, double *C)
+l2r_lr_fun::l2r_lr_fun(const problem *prob, const parameter *param, double *C)
 {
        int l=prob->l;
 
@@ -134,6 +135,7 @@ l2r_lr_fun::l2r_lr_fun(const problem *prob, double *C)
        z = new double[l];
        D = new double[l];
        this->C = C;
+       this->regularize_bias = param->regularize_bias;
 }
 
 l2r_lr_fun::~l2r_lr_fun()
@@ -155,6 +157,8 @@ double l2r_lr_fun::fun(double *w)
 
        for(i=0;i<w_size;i++)
                f += w[i]*w[i];
+       if(regularize_bias == 0)
+               f -= w[w_size-1]*w[w_size-1];
        f /= 2.0;
        for(i=0;i<l;i++)
        {
@@ -185,6 +189,8 @@ void l2r_lr_fun::grad(double *w, double *g)
 
        for(i=0;i<w_size;i++)
                g[i] = w[i] + g[i];
+       if(regularize_bias == 0)
+               g[w_size-1] -= w[w_size-1];
 }
 
 int l2r_lr_fun::get_nr_variable(void)
@@ -201,6 +207,8 @@ void l2r_lr_fun::get_diag_preconditioner(double *M)
 
        for (i=0; i<w_size; i++)
                M[i] = 1;
+       if(regularize_bias == 0)
+               M[w_size-1] = 0;
 
        for (i=0; i<l; i++)
        {
@@ -233,6 +241,8 @@ void l2r_lr_fun::Hv(double *s, double *Hs)
        }
        for(i=0;i<w_size;i++)
                Hs[i] = s[i] + Hs[i];
+       if(regularize_bias == 0)
+               Hs[w_size-1] -= s[w_size-1];
 }
 
 void l2r_lr_fun::Xv(double *v, double *Xv)
@@ -261,7 +271,7 @@ void l2r_lr_fun::XTv(double *v, double *XTv)
 class l2r_l2_svc_fun: public function
 {
 public:
-       l2r_l2_svc_fun(const problem *prob, double *C);
+       l2r_l2_svc_fun(const problem *prob, const parameter *param, double *C);
        ~l2r_l2_svc_fun();
 
        double fun(double *w);
@@ -280,9 +290,10 @@ protected:
        int *I;
        int sizeI;
        const problem *prob;
+       int regularize_bias;
 };
 
-l2r_l2_svc_fun::l2r_l2_svc_fun(const problem *prob, double *C)
+l2r_l2_svc_fun::l2r_l2_svc_fun(const problem *prob, const parameter *param, double *C)
 {
        int l=prob->l;
 
@@ -291,6 +302,7 @@ l2r_l2_svc_fun::l2r_l2_svc_fun(const problem *prob, double *C)
        z = new double[l];
        I = new int[l];
        this->C = C;
+       this->regularize_bias = param->regularize_bias;
 }
 
 l2r_l2_svc_fun::~l2r_l2_svc_fun()
@@ -311,6 +323,8 @@ double l2r_l2_svc_fun::fun(double *w)
 
        for(i=0;i<w_size;i++)
                f += w[i]*w[i];
+       if(regularize_bias == 0)
+               f -= w[w_size-1]*w[w_size-1];
        f /= 2.0;
        for(i=0;i<l;i++)
        {
@@ -342,6 +356,8 @@ void l2r_l2_svc_fun::grad(double *w, double *g)
 
        for(i=0;i<w_size;i++)
                g[i] = w[i] + 2*g[i];
+       if(regularize_bias == 0)
+               g[w_size-1] -= w[w_size-1];
 }
 
 int l2r_l2_svc_fun::get_nr_variable(void)
@@ -357,6 +373,8 @@ void l2r_l2_svc_fun::get_diag_preconditioner(double *M)
 
        for (i=0; i<w_size; i++)
                M[i] = 1;
+       if(regularize_bias == 0)
+               M[w_size-1] = 0;
 
        for (i=0; i<sizeI; i++)
        {
@@ -389,6 +407,8 @@ void l2r_l2_svc_fun::Hv(double *s, double *Hs)
        }
        for(i=0;i<w_size;i++)
                Hs[i] = s[i] + 2*Hs[i];
+       if(regularize_bias == 0)
+               Hs[w_size-1] -= s[w_size-1];
 }
 
 void l2r_l2_svc_fun::Xv(double *v, double *Xv)
@@ -416,19 +436,21 @@ void l2r_l2_svc_fun::subXTv(double *v, double *XTv)
 class l2r_l2_svr_fun: public l2r_l2_svc_fun
 {
 public:
-       l2r_l2_svr_fun(const problem *prob, double *C, double p);
+       l2r_l2_svr_fun(const problem *prob, const parameter *param, double *C);
 
        double fun(double *w);
        void grad(double *w, double *g);
 
 private:
        double p;
+       int regularize_bias;
 };
 
-l2r_l2_svr_fun::l2r_l2_svr_fun(const problem *prob, double *C, double p):
-       l2r_l2_svc_fun(prob, C)
+l2r_l2_svr_fun::l2r_l2_svr_fun(const problem *prob, const parameter *param, double *C):
+       l2r_l2_svc_fun(prob, param, C)
 {
-       this->p = p;
+       this->p = param->p;
+       this->regularize_bias = param->regularize_bias;
 }
 
 double l2r_l2_svr_fun::fun(double *w)
@@ -444,6 +466,8 @@ double l2r_l2_svr_fun::fun(double *w)
 
        for(i=0;i<w_size;i++)
                f += w[i]*w[i];
+       if(regularize_bias == 0)
+               f -= w[w_size-1]*w[w_size-1];
        f /= 2;
        for(i=0;i<l;i++)
        {
@@ -489,6 +513,8 @@ void l2r_l2_svr_fun::grad(double *w, double *g)
 
        for(i=0;i<w_size;i++)
                g[i] = w[i] + 2*g[i];
+       if(regularize_bias == 0)
+               g[w_size-1] -= w[w_size-1];
 }
 
 // A coordinate descent algorithm for
@@ -1400,6 +1426,9 @@ void solve_l2r_lr_dual(const problem *prob, double *w, double eps, double Cp, do
 // solution will be put in w
 //
 // See Yuan et al. (2010) and appendix of LIBLINEAR paper, Fan et al. (2008)
+//
+// To not regularize the bias (i.e., regularize_bias = 0), a constant feature = 1
+// must have been added to the original data. (see -B and -R option)
 
 #undef GETI
 #define GETI(i) (y[i]+1)
@@ -1407,7 +1436,7 @@ void solve_l2r_lr_dual(const problem *prob, double *w, double eps, double Cp, do
 
 static void solve_l1r_l2_svc(
        problem *prob_col, double *w, double eps,
-       double Cp, double Cn)
+       double Cp, double Cn, int regularize_bias)
 {
        int l = prob_col->l;
        int w_size = prob_col->n;
@@ -1497,49 +1526,66 @@ static void solve_l1r_l2_svc(
                        H *= 2;
                        H = max(H, 1e-12);
 
-                       double Gp = G+1;
-                       double Gn = G-1;
                        double violation = 0;
-                       if(w[j] == 0)
+                       double Gp = 0, Gn = 0;
+                       if(j == w_size-1 && regularize_bias == 0)
+                               violation = fabs(G);
+                       else
                        {
-                               if(Gp < 0)
-                                       violation = -Gp;
-                               else if(Gn > 0)
-                                       violation = Gn;
-                               else if(Gp>Gmax_old/l && Gn<-Gmax_old/l)
+                               Gp = G+1;
+                               Gn = G-1;
+                               if(w[j] == 0)
                                {
-                                       active_size--;
-                                       swap(index[s], index[active_size]);
-                                       s--;
-                                       continue;
+                                       if(Gp < 0)
+                                               violation = -Gp;
+                                       else if(Gn > 0)
+                                               violation = Gn;
+                                       else if(Gp>Gmax_old/l && Gn<-Gmax_old/l)
+                                       {
+                                               active_size--;
+                                               swap(index[s], index[active_size]);
+                                               s--;
+                                               continue;
+                                       }
                                }
+                               else if(w[j] > 0)
+                                       violation = fabs(Gp);
+                               else
+                                       violation = fabs(Gn);
                        }
-                       else if(w[j] > 0)
-                               violation = fabs(Gp);
-                       else
-                               violation = fabs(Gn);
-
                        Gmax_new = max(Gmax_new, violation);
                        Gnorm1_new += violation;
 
                        // obtain Newton direction d
-                       if(Gp < H*w[j])
-                               d = -Gp/H;
-                       else if(Gn > H*w[j])
-                               d = -Gn/H;
+                       if(j == w_size-1 && regularize_bias == 0)
+                               d = -G/H;
                        else
-                               d = -w[j];
+                       {
+                               if(Gp < H*w[j])
+                                       d = -Gp/H;
+                               else if(Gn > H*w[j])
+                                       d = -Gn/H;
+                               else
+                                       d = -w[j];
+                       }
 
                        if(fabs(d) < 1.0e-12)
                                continue;
 
-                       double delta = fabs(w[j]+d)-fabs(w[j]) + G*d;
+                       double delta;
+                       if(j == w_size-1 && regularize_bias == 0)
+                               delta = G*d;
+                       else
+                               delta = fabs(w[j]+d)-fabs(w[j]) + G*d;
                        d_old = 0;
                        int num_linesearch;
                        for(num_linesearch=0; num_linesearch < max_num_linesearch; num_linesearch++)
                        {
                                d_diff = d_old - d;
-                               cond = fabs(w[j]+d)-fabs(w[j]) - sigma*delta;
+                               if(j == w_size-1 && regularize_bias == 0)
+                                       cond = -sigma*delta;
+                               else
+                                       cond = fabs(w[j]+d)-fabs(w[j]) - sigma*delta;
 
                                appxcond = xj_sq[j]*d*d + G_loss*d + cond;
                                if(appxcond <= 0)
@@ -1654,6 +1700,8 @@ static void solve_l1r_l2_svc(
                        nnz++;
                }
        }
+       if (regularize_bias == 0)
+               v -= fabs(w[w_size-1]);
        for(j=0; j<l; j++)
                if(b[j] > 0)
                        v += C[GETI(j)]*b[j]*b[j];
@@ -1679,6 +1727,9 @@ static void solve_l1r_l2_svc(
 // solution will be put in w
 //
 // See Yuan et al. (2011) and appendix of LIBLINEAR paper, Fan et al. (2008)
+//
+// To not regularize the bias (i.e., regularize_bias = 0), a constant feature = 1
+// must have been added to the original data. (see -B and -R option)
 
 #undef GETI
 #define GETI(i) (y[i]+1)
@@ -1686,7 +1737,7 @@ static void solve_l1r_l2_svc(
 
 static void solve_l1r_lr(
        const problem *prob_col, double *w, double eps,
-       double Cp, double Cn)
+       double Cp, double Cn, int regularize_bias)
 {
        int l = prob_col->l;
        int w_size = prob_col->n;
@@ -1756,6 +1807,9 @@ static void solve_l1r_lr(
                        x++;
                }
        }
+       if (regularize_bias == 0)
+               w_norm -= fabs(w[w_size-1]);
+
        for(j=0; j<l; j++)
        {
                exp_wTx[j] = exp(exp_wTx[j]);
@@ -1787,29 +1841,33 @@ static void solve_l1r_lr(
                        }
                        Grad[j] = -tmp + xjneg_sum[j];
 
-                       double Gp = Grad[j]+1;
-                       double Gn = Grad[j]-1;
                        double violation = 0;
-                       if(w[j] == 0)
+                       if (j == w_size-1 && regularize_bias == 0)
+                               violation = fabs(Grad[j]);
+                       else
                        {
-                               if(Gp < 0)
-                                       violation = -Gp;
-                               else if(Gn > 0)
-                                       violation = Gn;
-                               //outer-level shrinking
-                               else if(Gp>Gmax_old/l && Gn<-Gmax_old/l)
+                               double Gp = Grad[j]+1;
+                               double Gn = Grad[j]-1;
+                               if(w[j] == 0)
                                {
-                                       active_size--;
-                                       swap(index[s], index[active_size]);
-                                       s--;
-                                       continue;
+                                       if(Gp < 0)
+                                               violation = -Gp;
+                                       else if(Gn > 0)
+                                               violation = Gn;
+                                       //outer-level shrinking
+                                       else if(Gp>Gmax_old/l && Gn<-Gmax_old/l)
+                                       {
+                                               active_size--;
+                                               swap(index[s], index[active_size]);
+                                               s--;
+                                               continue;
+                                       }
                                }
+                               else if(w[j] > 0)
+                                       violation = fabs(Gp);
+                               else
+                                       violation = fabs(Gn);
                        }
-                       else if(w[j] > 0)
-                               violation = fabs(Gp);
-                       else
-                               violation = fabs(Gn);
-
                        Gmax_new = max(Gmax_new, violation);
                        Gnorm1_new += violation;
                }
@@ -1853,40 +1911,48 @@ static void solve_l1r_lr(
                                        x++;
                                }
 
-                               double Gp = G+1;
-                               double Gn = G-1;
                                double violation = 0;
-                               if(wpd[j] == 0)
+                               if (j == w_size-1 && regularize_bias == 0)
                                {
-                                       if(Gp < 0)
-                                               violation = -Gp;
-                                       else if(Gn > 0)
-                                               violation = Gn;
-                                       //inner-level shrinking
-                                       else if(Gp>QP_Gmax_old/l && Gn<-QP_Gmax_old/l)
-                                       {
-                                               QP_active_size--;
-                                               swap(index[s], index[QP_active_size]);
-                                               s--;
-                                               continue;
-                                       }
+                                       // bias term not shrunken
+                                       violation = fabs(G);
+                                       z = -G/H;
                                }
-                               else if(wpd[j] > 0)
-                                       violation = fabs(Gp);
                                else
-                                       violation = fabs(Gn);
+                               {
+                                       double Gp = G+1;
+                                       double Gn = G-1;
+                                       if(wpd[j] == 0)
+                                       {
+                                               if(Gp < 0)
+                                                       violation = -Gp;
+                                               else if(Gn > 0)
+                                                       violation = Gn;
+                                               //inner-level shrinking
+                                               else if(Gp>QP_Gmax_old/l && Gn<-QP_Gmax_old/l)
+                                               {
+                                                       QP_active_size--;
+                                                       swap(index[s], index[QP_active_size]);
+                                                       s--;
+                                                       continue;
+                                               }
+                                       }
+                                       else if(wpd[j] > 0)
+                                               violation = fabs(Gp);
+                                       else
+                                               violation = fabs(Gn);
 
+                                       // obtain solution of one-variable problem
+                                       if(Gp < H*wpd[j])
+                                               z = -Gp/H;
+                                       else if(Gn > H*wpd[j])
+                                               z = -Gn/H;
+                                       else
+                                               z = -wpd[j];
+                               }
                                QP_Gmax_new = max(QP_Gmax_new, violation);
                                QP_Gnorm1_new += violation;
 
-                               // obtain solution of one-variable problem
-                               if(Gp < H*wpd[j])
-                                       z = -Gp/H;
-                               else if(Gn > H*wpd[j])
-                                       z = -Gn/H;
-                               else
-                                       z = -wpd[j];
-
                                if(fabs(z) < 1.0e-12)
                                        continue;
                                z = min(max(z,-10.0),10.0);
@@ -1927,6 +1993,8 @@ static void solve_l1r_lr(
                        if(wpd[j] != 0)
                                w_norm_new += fabs(wpd[j]);
                }
+               if (regularize_bias == 0)
+                       w_norm_new -= fabs(wpd[w_size-1]);
                delta += (w_norm_new-w_norm);
 
                negsum_xTd = 0;
@@ -1969,6 +2037,8 @@ static void solve_l1r_lr(
                                        if(wpd[j] != 0)
                                                w_norm_new += fabs(wpd[j]);
                                }
+                               if (regularize_bias == 0)
+                                       w_norm_new -= fabs(wpd[w_size-1]);
                                delta *= 0.5;
                                negsum_xTd *= 0.5;
                                for(int i=0; i<l; i++)
@@ -2017,6 +2087,8 @@ static void solve_l1r_lr(
                        v += fabs(w[j]);
                        nnz++;
                }
+       if (regularize_bias == 0)
+               v -= fabs(w[w_size-1]);
        for(j=0; j<l; j++)
                if(y[j] == 1)
                        v += C[GETI(j)]*log(1+1/exp_wTx[j]);
@@ -2537,7 +2609,7 @@ static void train_one(const problem *prob, const parameter *param, double *w, do
                                else
                                        C[i] = Cn;
                        }
-                       fun_obj=new l2r_lr_fun(prob, C);
+                       fun_obj=new l2r_lr_fun(prob, param, C);
                        TRON tron_obj(fun_obj, primal_solver_tol, eps_cg);
                        tron_obj.set_print_string(liblinear_print_string);
                        tron_obj.tron(w);
@@ -2555,7 +2627,7 @@ static void train_one(const problem *prob, const parameter *param, double *w, do
                                else
                                        C[i] = Cn;
                        }
-                       fun_obj=new l2r_l2_svc_fun(prob, C);
+                       fun_obj=new l2r_l2_svc_fun(prob, param, C);
                        TRON tron_obj(fun_obj, primal_solver_tol, eps_cg);
                        tron_obj.set_print_string(liblinear_print_string);
                        tron_obj.tron(w);
@@ -2574,7 +2646,7 @@ static void train_one(const problem *prob, const parameter *param, double *w, do
                        problem prob_col;
                        feature_node *x_space = NULL;
                        transpose(prob, &x_space ,&prob_col);
-                       solve_l1r_l2_svc(&prob_col, w, primal_solver_tol, Cp, Cn);
+                       solve_l1r_l2_svc(&prob_col, w, primal_solver_tol, Cp, Cn, param->regularize_bias);
                        delete [] prob_col.y;
                        delete [] prob_col.x;
                        delete [] x_space;
@@ -2585,7 +2657,7 @@ static void train_one(const problem *prob, const parameter *param, double *w, do
                        problem prob_col;
                        feature_node *x_space = NULL;
                        transpose(prob, &x_space ,&prob_col);
-                       solve_l1r_lr(&prob_col, w, primal_solver_tol, Cp, Cn);
+                       solve_l1r_lr(&prob_col, w, primal_solver_tol, Cp, Cn, param->regularize_bias);
                        delete [] prob_col.y;
                        delete [] prob_col.x;
                        delete [] x_space;
@@ -2600,7 +2672,7 @@ static void train_one(const problem *prob, const parameter *param, double *w, do
                        for(int i = 0; i < prob->l; i++)
                                C[i] = param->C;
 
-                       fun_obj=new l2r_l2_svr_fun(prob, C, param->p);
+                       fun_obj=new l2r_l2_svr_fun(prob, param, C);
                        TRON tron_obj(fun_obj, param->eps);
                        tron_obj.set_print_string(liblinear_print_string);
                        tron_obj.tron(w);
@@ -3559,6 +3631,18 @@ const char *check_parameter(const problem *prob, const parameter *param)
        if(prob->bias >= 0 && param->solver_type == ONECLASS_SVM)
                return "prob->bias >=0, but this is ignored in ONECLASS_SVM";
 
+       if(param->regularize_bias == 0)
+       {
+               if(prob->bias != 1.0)
+                       return "To not regularize bias, must specify -B 1 along with -R";
+               if(param->solver_type != L2R_LR
+                       && param->solver_type != L2R_L2LOSS_SVC
+                       && param->solver_type != L1R_L2LOSS_SVC
+                       && param->solver_type != L1R_LR
+                       && param->solver_type != L2R_L2LOSS_SVR)
+                       return "-R option supported only for solver L2R_LR, L2R_L2LOSS_SVC, L1R_L2LOSS_SVC, L1R_LR, and L2R_L2LOSS_SVR";
+       }
+
        if(param->solver_type != L2R_LR
                && param->solver_type != L2R_L2LOSS_SVC_DUAL
                && param->solver_type != L2R_L2LOSS_SVC
diff --git a/linear.h b/linear.h
index 368ba2f51b811d343cc307e3c7a1d33a68a1503d..0180f04b535bf28b8c7691141f168417b9a04759 100644 (file)
--- a/linear.h
+++ b/linear.h
@@ -38,6 +38,7 @@ struct parameter
        double p;
        double nu;
        double *init_sol;
+       int regularize_bias;
 };
 
 struct model
diff --git a/train.c b/train.c
index bd0af9465b218f5a619b791f7b03a5d5968fe74c..ef8fe70142dab5a5895e901ed9b7e205a118e8d4 100644 (file)
--- a/train.c
+++ b/train.c
@@ -50,6 +50,8 @@ void exit_with_help()
        "               |f'(alpha)|_1 <= eps |f'(alpha0)|,\n"
        "               where f is the dual function (default 0.1)\n"
        "-B bias : if bias >= 0, instance x becomes [x; bias]; if < 0, no bias term added (default -1)\n"
+       "-R : not regularize the bias; must with -B 1 to have the bias; DON'T use this unless you know what it is\n"
+       "       (for -s 0, 2, 5, 6, 11)\n"
        "-wi weight: weights adjust the parameter C of different classes (see README for details)\n"
        "-v n: n-fold cross validation mode\n"
        "-C : find parameters (C for -s 0, 2 and C, p for -s 11)\n"
@@ -218,6 +220,7 @@ void parse_command_line(int argc, char **argv, char *input_file_name, char *mode
        param.nu = 0.5;
        param.eps = INF; // see setting below
        param.nr_weight = 0;
+       param.regularize_bias = 1;
        param.weight_label = NULL;
        param.weight = NULL;
        param.init_sol = NULL;
@@ -291,6 +294,11 @@ void parse_command_line(int argc, char **argv, char *input_file_name, char *mode
                                i--;
                                break;
 
+                       case 'R':
+                               param.regularize_bias = 0;
+                               i--;
+                               break;
+
                        default:
                                fprintf(stderr,"unknown option: -%c\n", argv[i-1][1]);
                                exit_with_help();