1 #include <math.h>
2 #include <stdio.h>
3 #include <stdlib.h>
4 #include <string.h>
5 #include <stdarg.h>
6 #include <locale.h>
7 #include "linear.h"
8 #include "newton.h"
9 int liblinear_version = LIBLINEAR_VERSION;
10 typedef signed char schar;
11 template <class T> static inline void swap(T& x, T& y) { T t=x; x=y; y=t; }
12 #ifndef min
13 template <class T> static inline T min(T x,T y) { return (x<y)?x:y; }
14 #endif
15 #ifndef max
16 template <class T> static inline T max(T x,T y) { return (x>y)?x:y; }
17 #endif
18 template <class S, class T> static inline void clone(T*& dst, S* src, int n)
19 {
20         dst = new T[n];
21         memcpy((void *)dst,(void *)src,sizeof(T)*n);
22 }
23 #define INF HUGE_VAL
24 #define Malloc(type,n) (type *)malloc((n)*sizeof(type))
25
26 static void print_string_stdout(const char *s)
27 {
28         fputs(s,stdout);
29         fflush(stdout);
30 }
31 static void print_null(const char *s) {}
32
33 static void (*liblinear_print_string) (const char *) = &print_string_stdout;
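// Solver progress messages below are emitted through this pointer. It defaults
// to print_string_stdout; pointing it at print_null (e.g., via the
// set_print_string_function() interface declared in linear.h) silences them.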
34
35 #if 1
36 static void info(const char *fmt,...)
37 {
38         char buf[BUFSIZ];
39         va_list ap;
40         va_start(ap,fmt);
41         vsprintf(buf,fmt,ap);
42         va_end(ap);
43         (*liblinear_print_string)(buf);
44 }
45 #else
46 static void info(const char *fmt,...) {}
47 #endif
48 class sparse_operator
49 {
50 public:
51         static double nrm2_sq(const feature_node *x)
52         {
53                 double ret = 0;
54                 while(x->index != -1)
55                 {
56                         ret += x->value*x->value;
57                         x++;
58                 }
59                 return (ret);
60         }
61
62         static double dot(const double *s, const feature_node *x)
63         {
64                 double ret = 0;
65                 while(x->index != -1)
66                 {
67                         ret += s[x->index-1]*x->value;
68                         x++;
69                 }
70                 return (ret);
71         }
72
73         static double sparse_dot(const feature_node *x1, const feature_node *x2)
74         {
75                 double ret = 0;
76                 while(x1->index != -1 && x2->index != -1)
77                 {
78                         if(x1->index == x2->index)
79                         {
80                                 ret += x1->value * x2->value;
81                                 ++x1;
82                                 ++x2;
83                         }
84                         else
85                         {
86                                 if(x1->index > x2->index)
87                                         ++x2;
88                                 else
89                                         ++x1;
90                         }
91                 }
92                 return (ret);
93         }
94
95         static void axpy(const double a, const feature_node *x, double *y)
96         {
97                 while(x->index != -1)
98                 {
99                         y[x->index-1] += a*x->value;
100                         x++;
101                 }
102         }
103 };
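// Note on the sparse format these helpers assume: each instance is an array of
// feature_node {index, value} pairs with 1-based feature indices, terminated by
// a node whose index is -1.  For illustration, the dense vector (0.5, 0, 2.0)
// would be stored as
//     feature_node x[] = { {1, 0.5}, {3, 2.0}, {-1, 0.0} };
// which is why dot() and axpy() address the dense arrays as s[x->index-1] and
// y[x->index-1].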
104
105 // L2-regularized empirical risk minimization
106 // min_w w^Tw/2 + \sum C_i \xi(w^Tx_i), where \xi() is the loss
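// The concrete losses \xi() are supplied by the subclasses below, each scaled by
// the per-instance penalty C_i in C_times_loss():
//   l2r_lr_fun:     log(1 + exp(-y_i w^T x_i))         (logistic regression)
//   l2r_l2_svc_fun: max(0, 1 - y_i w^T x_i)^2          (L2-loss SVC)
//   l2r_l2_svr_fun: max(0, |w^T x_i - y_i| - p)^2      (L2-loss epsilon-SVR)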
107
108 class l2r_erm_fun: public function
109 {
110 public:
111         l2r_erm_fun(const problem *prob, const parameter *param, double *C);
112         ~l2r_erm_fun();
113
114         double fun(double *w);
115         double linesearch_and_update(double *w, double *d, double *f, double *g, double alpha);
116         int get_nr_variable(void);
117
118 protected:
119         virtual double C_times_loss(int i, double wx_i) = 0;
120         void Xv(double *v, double *Xv);
121         void XTv(double *v, double *XTv);
122
123         double *C;
124         const problem *prob;
125         double *wx;
126         double *tmp; // a working array
127         double wTw;
128         int regularize_bias;
129 };
130
131 l2r_erm_fun::l2r_erm_fun(const problem *prob, const parameter *param, double *C)
132 {
133         int l=prob->l;
134
135         this->prob = prob;
136
137         wx = new double[l];
138         tmp = new double[l];
139         this->C = C;
140         this->regularize_bias = param->regularize_bias;
141 }
142
143 l2r_erm_fun::~l2r_erm_fun()
144 {
145         delete[] wx;
146         delete[] tmp;
147 }
148
149 double l2r_erm_fun::fun(double *w)
150 {
151         int i;
152         double f=0;
153         int l=prob->l;
154         int w_size=get_nr_variable();
155
156         wTw = 0;
157         Xv(w, wx);
158
159         for(i=0;i<w_size;i++)
160                 wTw += w[i]*w[i];
161         if(regularize_bias == 0)
162                 wTw -= w[w_size-1]*w[w_size-1];
163         for(i=0;i<l;i++)
164                 f += C_times_loss(i, wx[i]);
165         f = f + 0.5 * wTw;
166
167         return(f);
168 }
169
170 int l2r_erm_fun::get_nr_variable(void)
171 {
172         return prob->n;
173 }
174
175 // On entry *f must be the function value of w
176 // On exit w is updated and *f is the new function value
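// The step is accepted by a simple backtracking (Armijo-type) rule: starting
// from the given alpha, it is halved until
//     f(w + alpha*s) - f(w) <= eta * alpha * g^T s     (eta = 0.01)
// holds, reusing the cached products X*w (wx) and X*s (tmp) so each trial costs
// O(l).  If no step passes within max_num_linesearch trials, 0 is returned and
// *f is restored to the old value.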
177 double l2r_erm_fun::linesearch_and_update(double *w, double *s, double *f, double *g, double alpha)
178 {
179         int i;
180         int l = prob->l;
181         double sTs = 0;
182         double wTs = 0;
183         double gTs = 0;
184         double eta = 0.01;
185         int w_size = get_nr_variable();
186         int max_num_linesearch = 20;
187         double fold = *f;
188         Xv(s, tmp);
189
190         for (i=0;i<w_size;i++)
191         {
192                 sTs += s[i] * s[i];
193                 wTs += s[i] * w[i];
194                 gTs += s[i] * g[i];
195         }
196         if(regularize_bias == 0)
197         {
198                 // bias not used in calculating (w + \alpha s)^T (w + \alpha s)
199                 sTs -= s[w_size-1] * s[w_size-1];
200                 wTs -= s[w_size-1] * w[w_size-1];
201         }
202
203         int num_linesearch = 0;
204         for(num_linesearch=0; num_linesearch < max_num_linesearch; num_linesearch++)
205         {
206                 double loss = 0;
207                 for(i=0;i<l;i++)
208                 {
209                         double inner_product = tmp[i] * alpha + wx[i];
210                         loss += C_times_loss(i, inner_product);
211                 }
212                 *f = loss + (alpha * alpha * sTs + wTw) / 2.0 + alpha * wTs;
213                 if (*f - fold <= eta * alpha * gTs)
214                 {
215                         for (i=0;i<l;i++)
216                                 wx[i] += alpha * tmp[i];
217                         break;
218                 }
219                 else
220                         alpha *= 0.5;
221         }
222
223         if (num_linesearch >= max_num_linesearch)
224         {
225                 *f = fold;
226                 return 0;
227         }
228         else
229                 for (i=0;i<w_size;i++)
230                         w[i] += alpha * s[i];
231
232         wTw += alpha * alpha * sTs + 2* alpha * wTs;
233         return alpha;
234 }
235
236 void l2r_erm_fun::Xv(double *v, double *Xv)
237 {
238         int i;
239         int l=prob->l;
240         feature_node **x=prob->x;
241
242         for(i=0;i<l;i++)
243                 Xv[i]=sparse_operator::dot(v, x[i]);
244 }
245
246 void l2r_erm_fun::XTv(double *v, double *XTv)
247 {
248         int i;
249         int l=prob->l;
250         int w_size=get_nr_variable();
251         feature_node **x=prob->x;
252
253         for(i=0;i<w_size;i++)
254                 XTv[i]=0;
255         for(i=0;i<l;i++)
256                 sparse_operator::axpy(v[i], x[i], XTv);
257 }
258
259 class l2r_lr_fun: public l2r_erm_fun
260 {
261 public:
262         l2r_lr_fun(const problem *prob, const parameter *param, double *C);
263         ~l2r_lr_fun();
264
265         void grad(double *w, double *g);
266         void Hv(double *s, double *Hs);
267
268         void get_diag_preconditioner(double *M);
269
270 private:
271         double *D;
272         double C_times_loss(int i, double wx_i);
273 };
274
275 l2r_lr_fun::l2r_lr_fun(const problem *prob, const parameter *param, double *C):
276         l2r_erm_fun(prob, param, C)
277 {
278         int l=prob->l;
279         D = new double[l];
280 }
281
282 l2r_lr_fun::~l2r_lr_fun()
283 {
284         delete[] D;
285 }
286
287 double l2r_lr_fun::C_times_loss(int i, double wx_i)
288 {
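        // Both branches evaluate C_i * log(1 + exp(-y_i w^T x_i)); splitting on
        // the sign of y_i w^T x_i keeps the argument of exp() non-positive, so
        // the computation does not overflow for large |w^T x_i|.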
289         double ywx_i = wx_i * prob->y[i];
290         if (ywx_i >= 0)
291                 return C[i]*log(1 + exp(-ywx_i));
292         else
293                 return C[i]*(-ywx_i + log(1 + exp(ywx_i)));
294 }
295
296 void l2r_lr_fun::grad(double *w, double *g)
297 {
298         int i;
299         double *y=prob->y;
300         int l=prob->l;
301         int w_size=get_nr_variable();
302
303         for(i=0;i<l;i++)
304         {
305                 tmp[i] = 1/(1 + exp(-y[i]*wx[i]));
306                 D[i] = tmp[i]*(1-tmp[i]);
307                 tmp[i] = C[i]*(tmp[i]-1)*y[i];
308         }
309         XTv(tmp, g);
310
311         for(i=0;i<w_size;i++)
312                 g[i] = w[i] + g[i];
313         if(regularize_bias == 0)
314                 g[w_size-1] -= w[w_size-1];
315 }
316
317 void l2r_lr_fun::get_diag_preconditioner(double *M)
318 {
319         int i;
320         int l = prob->l;
321         int w_size=get_nr_variable();
322         feature_node **x = prob->x;
323
324         for (i=0; i<w_size; i++)
325                 M[i] = 1;
326         if(regularize_bias == 0)
327                 M[w_size-1] = 0;
328
329         for (i=0; i<l; i++)
330         {
331                 feature_node *xi = x[i];
332                 while (xi->index!=-1)
333                 {
334                         M[xi->index-1] += xi->value*xi->value*C[i]*D[i];
335                         xi++;
336                 }
337         }
338 }
339
340 void l2r_lr_fun::Hv(double *s, double *Hs)
341 {
342         int i;
343         int l=prob->l;
344         int w_size=get_nr_variable();
345         feature_node **x=prob->x;
346
347         for(i=0;i<w_size;i++)
348                 Hs[i] = 0;
349         for(i=0;i<l;i++)
350         {
351                 feature_node * const xi=x[i];
352                 double xTs = sparse_operator::dot(s, xi);
353
354                 xTs = C[i]*D[i]*xTs;
355
356                 sparse_operator::axpy(xTs, xi, Hs);
357         }
358         for(i=0;i<w_size;i++)
359                 Hs[i] = s[i] + Hs[i];
360         if(regularize_bias == 0)
361                 Hs[w_size-1] -= s[w_size-1];
362 }
363
364 class l2r_l2_svc_fun: public l2r_erm_fun
365 {
366 public:
367         l2r_l2_svc_fun(const problem *prob, const parameter *param, double *C);
368         ~l2r_l2_svc_fun();
369
370         void grad(double *w, double *g);
371         void Hv(double *s, double *Hs);
372
373         void get_diag_preconditioner(double *M);
374
375 protected:
376         void subXTv(double *v, double *XTv);
377
378         int *I;
379         int sizeI;
380
381 private:
382         double C_times_loss(int i, double wx_i);
383 };
384
385 l2r_l2_svc_fun::l2r_l2_svc_fun(const problem *prob, const parameter *param, double *C):
386         l2r_erm_fun(prob, param, C)
387 {
388         I = new int[prob->l];
389 }
390
391 l2r_l2_svc_fun::~l2r_l2_svc_fun()
392 {
393         delete[] I;
394 }
395
396 double l2r_l2_svc_fun::C_times_loss(int i, double wx_i)
397 {
398         double d = 1 - prob->y[i] * wx_i;
399         if (d > 0)
400                 return C[i] * d * d;
401         else
402                 return 0;
403 }
404
405 void l2r_l2_svc_fun::grad(double *w, double *g)
406 {
407         int i;
408         double *y=prob->y;
409         int l=prob->l;
410         int w_size=get_nr_variable();
411
412         sizeI = 0;
413         for (i=0;i<l;i++)
414         {
415                 tmp[i] = wx[i] * y[i];
416                 if (tmp[i] < 1)
417                 {
418                         tmp[sizeI] = C[i]*y[i]*(tmp[i]-1);
419                         I[sizeI] = i;
420                         sizeI++;
421                 }
422         }
423         subXTv(tmp, g);
424
425         for(i=0;i<w_size;i++)
426                 g[i] = w[i] + 2*g[i];
427         if(regularize_bias == 0)
428                 g[w_size-1] -= w[w_size-1];
429 }
430
431 void l2r_l2_svc_fun::get_diag_preconditioner(double *M)
432 {
433         int i;
434         int w_size=get_nr_variable();
435         feature_node **x = prob->x;
436
437         for (i=0; i<w_size; i++)
438                 M[i] = 1;
439         if(regularize_bias == 0)
440                 M[w_size-1] = 0;
441
442         for (i=0; i<sizeI; i++)
443         {
444                 int idx = I[i];
445                 feature_node *xi = x[idx];
446                 while (xi->index!=-1)
447                 {
448                         M[xi->index-1] += xi->value*xi->value*C[idx]*2;
449                         xi++;
450                 }
451         }
452 }
453
454 void l2r_l2_svc_fun::Hv(double *s, double *Hs)
455 {
456         int i;
457         int w_size=get_nr_variable();
458         feature_node **x=prob->x;
459
460         for(i=0;i<w_size;i++)
461                 Hs[i]=0;
462         for(i=0;i<sizeI;i++)
463         {
464                 feature_node * const xi=x[I[i]];
465                 double xTs = sparse_operator::dot(s, xi);
466
467                 xTs = C[I[i]]*xTs;
468
469                 sparse_operator::axpy(xTs, xi, Hs);
470         }
471         for(i=0;i<w_size;i++)
472                 Hs[i] = s[i] + 2*Hs[i];
473         if(regularize_bias == 0)
474                 Hs[w_size-1] -= s[w_size-1];
475 }
476
477 void l2r_l2_svc_fun::subXTv(double *v, double *XTv)
478 {
479         int i;
480         int w_size=get_nr_variable();
481         feature_node **x=prob->x;
482
483         for(i=0;i<w_size;i++)
484                 XTv[i]=0;
485         for(i=0;i<sizeI;i++)
486                 sparse_operator::axpy(v[i], x[I[i]], XTv);
487 }
488
489 class l2r_l2_svr_fun: public l2r_l2_svc_fun
490 {
491 public:
492         l2r_l2_svr_fun(const problem *prob, const parameter *param, double *C);
493
494         void grad(double *w, double *g);
495
496 private:
497         double C_times_loss(int i, double wx_i);
498         double p;
499 };
500
501 l2r_l2_svr_fun::l2r_l2_svr_fun(const problem *prob, const parameter *param, double *C):
502         l2r_l2_svc_fun(prob, param, C)
503 {
504         this->p = param->p;
505         this->regularize_bias = param->regularize_bias;
506 }
507
508 double l2r_l2_svr_fun::C_times_loss(int i, double wx_i)
509 {
510         double d = wx_i - prob->y[i];
511         if(d < -p)
512                 return C[i]*(d+p)*(d+p);
513         else if(d > p)
514                 return C[i]*(d-p)*(d-p);
515         return 0;
516 }
517
518 void l2r_l2_svr_fun::grad(double *w, double *g)
519 {
520         int i;
521         double *y=prob->y;
522         int l=prob->l;
523         int w_size=get_nr_variable();
524         double d;
525
526         sizeI = 0;
527         for(i=0;i<l;i++)
528         {
529                 d = wx[i] - y[i];
530
531                 // generate index set I
532                 if(d < -p)
533                 {
534                         tmp[sizeI] = C[i]*(d+p);
535                         I[sizeI] = i;
536                         sizeI++;
537                 }
538                 else if(d > p)
539                 {
540                         tmp[sizeI] = C[i]*(d-p);
541                         I[sizeI] = i;
542                         sizeI++;
543                 }
544
545         }
546         subXTv(tmp, g);
547
548         for(i=0;i<w_size;i++)
549                 g[i] = w[i] + 2*g[i];
550         if(regularize_bias == 0)
551                 g[w_size-1] -= w[w_size-1];
552 }
553
554 // A coordinate descent algorithm for
555 // multi-class support vector machines by Crammer and Singer
556 //
557 //  min_{\alpha}  0.5 \sum_m ||w_m(\alpha)||^2 + \sum_i \sum_m e^m_i alpha^m_i
558 //    s.t.     \alpha^m_i <= C^m_i \forall m,i , \sum_m \alpha^m_i=0 \forall i
559 //
560 //  where e^m_i = 0 if y_i  = m,
561 //        e^m_i = 1 if y_i != m,
562 //  C^m_i = C if m  = y_i,
563 //  C^m_i = 0 if m != y_i,
564 //  and w_m(\alpha) = \sum_i \alpha^m_i x_i
565 //
566 // Given:
567 // x, y, C
568 // eps is the stopping tolerance
569 //
570 // solution will be put in w
571 //
572 // See Appendix of LIBLINEAR paper, Fan et al. (2008)
573
574 #define GETI(i) ((int) prob->y[i])
575 // To support weights for instances, use GETI(i) (i)
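// GETI(i) picks which entry of the weighted_C array bounds instance i; here the
// class label itself is the index, so C holds one value per class.  As the
// comment above hints, redefining the macro to plain (i) (with a length-l C
// array) would give per-instance weights.  The solvers further below redefine
// GETI the same way for their own C/diag/upper_bound tables.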
576
577 class Solver_MCSVM_CS
578 {
579         public:
580                 Solver_MCSVM_CS(const problem *prob, int nr_class, double *C, double eps=0.1, int max_iter=100000);
581                 ~Solver_MCSVM_CS();
582                 void Solve(double *w);
583         private:
584                 void solve_sub_problem(double A_i, int yi, double C_yi, int active_i, double *alpha_new);
585                 bool be_shrunk(int i, int m, int yi, double alpha_i, double minG);
586                 double *B, *C, *G;
587                 int w_size, l;
588                 int nr_class;
589                 int max_iter;
590                 double eps;
591                 const problem *prob;
592 };
593
594 Solver_MCSVM_CS::Solver_MCSVM_CS(const problem *prob, int nr_class, double *weighted_C, double eps, int max_iter)
595 {
596         this->w_size = prob->n;
597         this->l = prob->l;
598         this->nr_class = nr_class;
599         this->eps = eps;
600         this->max_iter = max_iter;
601         this->prob = prob;
602         this->B = new double[nr_class];
603         this->G = new double[nr_class];
604         this->C = weighted_C;
605 }
606
607 Solver_MCSVM_CS::~Solver_MCSVM_CS()
608 {
609         delete[] B;
610         delete[] G;
611 }
612
613 int compare_double(const void *a, const void *b)
614 {
615         if(*(double *)a > *(double *)b)
616                 return -1;
617         if(*(double *)a < *(double *)b)
618                 return 1;
619         return 0;
620 }
621
622 void Solver_MCSVM_CS::solve_sub_problem(double A_i, int yi, double C_yi, int active_i, double *alpha_new)
623 {
624         int r;
625         double *D;
626
627         clone(D, B, active_i);
628         if(yi < active_i)
629                 D[yi] += A_i*C_yi;
630         qsort(D, active_i, sizeof(double), compare_double);
631
632         double beta = D[0] - A_i*C_yi;
633         for(r=1;r<active_i && beta<r*D[r];r++)
634                 beta += D[r];
635         beta /= r;
636
637         for(r=0;r<active_i;r++)
638         {
639                 if(r == yi)
640                         alpha_new[r] = min(C_yi, (beta-B[r])/A_i);
641                 else
642                         alpha_new[r] = min((double)0, (beta - B[r])/A_i);
643         }
644         delete[] D;
645 }
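// What solve_sub_problem() computes, per instance i, is the minimizer of
//     0.5*A_i*sum_m alpha_m^2 + sum_m B_m alpha_m
//     s.t.  alpha_m <= C^m_i,  sum_m alpha_m = 0,
// whose solution has the form alpha_m = min(C^m_i, (beta - B_m)/A_i); the sort
// of D above is used to locate the threshold beta (see the appendix of the
// LIBLINEAR paper cited earlier).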
646
647 bool Solver_MCSVM_CS::be_shrunk(int i, int m, int yi, double alpha_i, double minG)
648 {
649         double bound = 0;
650         if(m == yi)
651                 bound = C[GETI(i)];
652         if(alpha_i == bound && G[m] < minG)
653                 return true;
654         return false;
655 }
656
657 void Solver_MCSVM_CS::Solve(double *w)
658 {
659         int i, m, s;
660         int iter = 0;
661         double *alpha =  new double[l*nr_class];
662         double *alpha_new = new double[nr_class];
663         int *index = new int[l];
664         double *QD = new double[l];
665         int *d_ind = new int[nr_class];
666         double *d_val = new double[nr_class];
667         int *alpha_index = new int[nr_class*l];
668         int *y_index = new int[l];
669         int active_size = l;
670         int *active_size_i = new int[l];
671         double eps_shrink = max(10.0*eps, 1.0); // stopping tolerance for shrinking
672         bool start_from_all = true;
673
674         // Initial alpha can be set here. Note that
675 // sum_m alpha[i*nr_class+m] = 0, for all i = 0,...,l-1
676         // alpha[i*nr_class+m] <= C[GETI(i)] if prob->y[i] == m
677         // alpha[i*nr_class+m] <= 0 if prob->y[i] != m
678         // If initial alpha isn't zero, uncomment the for loop below to initialize w
679         for(i=0;i<l*nr_class;i++)
680                 alpha[i] = 0;
681
682         for(i=0;i<w_size*nr_class;i++)
683                 w[i] = 0;
684         for(i=0;i<l;i++)
685         {
686                 for(m=0;m<nr_class;m++)
687                         alpha_index[i*nr_class+m] = m;
688                 feature_node *xi = prob->x[i];
689                 QD[i] = 0;
690                 while(xi->index != -1)
691                 {
692                         double val = xi->value;
693                         QD[i] += val*val;
694
695                         // Uncomment the for loop if initial alpha isn't zero
696                         // for(m=0; m<nr_class; m++)
697                         //      w[(xi->index-1)*nr_class+m] += alpha[i*nr_class+m]*val;
698                         xi++;
699                 }
700                 active_size_i[i] = nr_class;
701                 y_index[i] = (int)prob->y[i];
702                 index[i] = i;
703         }
704
705         while(iter < max_iter)
706         {
707                 double stopping = -INF;
708                 for(i=0;i<active_size;i++)
709                 {
710                         int j = i+rand()%(active_size-i);
711                         swap(index[i], index[j]);
712                 }
713                 for(s=0;s<active_size;s++)
714                 {
715                         i = index[s];
716                         double Ai = QD[i];
717                         double *alpha_i = &alpha[i*nr_class];
718                         int *alpha_index_i = &alpha_index[i*nr_class];
719
720                         if(Ai > 0)
721                         {
722                                 for(m=0;m<active_size_i[i];m++)
723                                         G[m] = 1;
724                                 if(y_index[i] < active_size_i[i])
725                                         G[y_index[i]] = 0;
726
727                                 feature_node *xi = prob->x[i];
728                                 while(xi->index!= -1)
729                                 {
730                                         double *w_i = &w[(xi->index-1)*nr_class];
731                                         for(m=0;m<active_size_i[i];m++)
732                                                 G[m] += w_i[alpha_index_i[m]]*(xi->value);
733                                         xi++;
734                                 }
735
736                                 double minG = INF;
737                                 double maxG = -INF;
738                                 for(m=0;m<active_size_i[i];m++)
739                                 {
740                                         if(alpha_i[alpha_index_i[m]] < 0 && G[m] < minG)
741                                                 minG = G[m];
742                                         if(G[m] > maxG)
743                                                 maxG = G[m];
744                                 }
745                                 if(y_index[i] < active_size_i[i])
746                                         if(alpha_i[(int) prob->y[i]] < C[GETI(i)] && G[y_index[i]] < minG)
747                                                 minG = G[y_index[i]];
748
749                                 for(m=0;m<active_size_i[i];m++)
750                                 {
751                                         if(be_shrunk(i, m, y_index[i], alpha_i[alpha_index_i[m]], minG))
752                                         {
753                                                 active_size_i[i]--;
754                                                 while(active_size_i[i]>m)
755                                                 {
756                                                         if(!be_shrunk(i, active_size_i[i], y_index[i],
757                                                                                         alpha_i[alpha_index_i[active_size_i[i]]], minG))
758                                                         {
759                                                                 swap(alpha_index_i[m], alpha_index_i[active_size_i[i]]);
760                                                                 swap(G[m], G[active_size_i[i]]);
761                                                                 if(y_index[i] == active_size_i[i])
762                                                                         y_index[i] = m;
763                                                                 else if(y_index[i] == m)
764                                                                         y_index[i] = active_size_i[i];
765                                                                 break;
766                                                         }
767                                                         active_size_i[i]--;
768                                                 }
769                                         }
770                                 }
771
772                                 if(active_size_i[i] <= 1)
773                                 {
774                                         active_size--;
775                                         swap(index[s], index[active_size]);
776                                         s--;
777                                         continue;
778                                 }
779
780                                 if(maxG-minG <= 1e-12)
781                                         continue;
782                                 else
783                                         stopping = max(maxG - minG, stopping);
784
785                                 for(m=0;m<active_size_i[i];m++)
786                                         B[m] = G[m] - Ai*alpha_i[alpha_index_i[m]] ;
787
788                                 solve_sub_problem(Ai, y_index[i], C[GETI(i)], active_size_i[i], alpha_new);
789                                 int nz_d = 0;
790                                 for(m=0;m<active_size_i[i];m++)
791                                 {
792                                         double d = alpha_new[m] - alpha_i[alpha_index_i[m]];
793                                         alpha_i[alpha_index_i[m]] = alpha_new[m];
794                                         if(fabs(d) >= 1e-12)
795                                         {
796                                                 d_ind[nz_d] = alpha_index_i[m];
797                                                 d_val[nz_d] = d;
798                                                 nz_d++;
799                                         }
800                                 }
801
802                                 xi = prob->x[i];
803                                 while(xi->index != -1)
804                                 {
805                                         double *w_i = &w[(xi->index-1)*nr_class];
806                                         for(m=0;m<nz_d;m++)
807                                                 w_i[d_ind[m]] += d_val[m]*xi->value;
808                                         xi++;
809                                 }
810                         }
811                 }
812
813                 iter++;
814                 if(iter % 10 == 0)
815                 {
816                         info(".");
817                 }
818
819                 if(stopping < eps_shrink)
820                 {
821                         if(stopping < eps && start_from_all == true)
822                                 break;
823                         else
824                         {
825                                 active_size = l;
826                                 for(i=0;i<l;i++)
827                                         active_size_i[i] = nr_class;
828                                 info("*");
829                                 eps_shrink = max(eps_shrink/2, eps);
830                                 start_from_all = true;
831                         }
832                 }
833                 else
834                         start_from_all = false;
835         }
836
837         info("\noptimization finished, #iter = %d\n",iter);
838         if (iter >= max_iter)
839                 info("\nWARNING: reaching max number of iterations\n");
840
841         // calculate objective value
842         double v = 0;
843         int nSV = 0;
844         for(i=0;i<w_size*nr_class;i++)
845                 v += w[i]*w[i];
846         v = 0.5*v;
847         for(i=0;i<l*nr_class;i++)
848         {
849                 v += alpha[i];
850                 if(fabs(alpha[i]) > 0)
851                         nSV++;
852         }
853         for(i=0;i<l;i++)
854                 v -= alpha[i*nr_class+(int)prob->y[i]];
855         info("Objective value = %lf\n",v);
856         info("nSV = %d\n",nSV);
857
858         delete [] alpha;
859         delete [] alpha_new;
860         delete [] index;
861         delete [] QD;
862         delete [] d_ind;
863         delete [] d_val;
864         delete [] alpha_index;
865         delete [] y_index;
866         delete [] active_size_i;
867 }
868
869 // A coordinate descent algorithm for
870 // L1-loss and L2-loss SVM dual problems
871 //
872 //  min_\alpha  0.5(\alpha^T (Q + D)\alpha) - e^T \alpha,
873 //    s.t.      0 <= \alpha_i <= upper_bound_i,
874 //
875 //  where Qij = yi yj xi^T xj and
876 //  D is a diagonal matrix
877 //
878 // In L1-SVM case:
879 //              upper_bound_i = Cp if y_i = 1
880 //              upper_bound_i = Cn if y_i = -1
881 //              D_ii = 0
882 // In L2-SVM case:
883 //              upper_bound_i = INF
884 //              D_ii = 1/(2*Cp) if y_i = 1
885 //              D_ii = 1/(2*Cn) if y_i = -1
886 //
887 // Given:
888 // x, y, Cp, Cn
889 // eps is the stopping tolerance
890 //
891 // solution will be put in w
892 //
893 // this function returns the number of iterations
894 //
895 // See Algorithm 3 of Hsieh et al., ICML 2008
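// Each coordinate step in the loop below reduces to the one-variable update
//     alpha_i <- min(max(alpha_i - G/QD_i, 0), upper_bound_i),
// where G is the dual gradient in coordinate i and QD_i = Q_ii + D_ii is
// precomputed; the projected gradient PG drives both the stopping test and the
// shrinking of variables stuck at their bounds.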
896
897 #undef GETI
898 #define GETI(i) (y[i]+1)
899 // To support weights for instances, use GETI(i) (i)
900
901 static int solve_l2r_l1l2_svc(const problem *prob, const parameter *param, double *w, double Cp, double Cn, int max_iter=500)
902 {
903         int l = prob->l;
904         int w_size = prob->n;
905         double eps = param->eps;
906         int solver_type = param->solver_type;
907         int i, s, iter = 0;
908         double C, d, G;
909         double *QD = new double[l];
910         int *index = new int[l];
911         double *alpha = new double[l];
912         schar *y = new schar[l];
913         int active_size = l;
914
915         // PG: projected gradient, for shrinking and stopping
916         double PG;
917         double PGmax_old = INF;
918         double PGmin_old = -INF;
919         double PGmax_new, PGmin_new;
920
921         // default solver_type: L2R_L2LOSS_SVC_DUAL
922         double diag[3] = {0.5/Cn, 0, 0.5/Cp};
923         double upper_bound[3] = {INF, 0, INF};
924         if(solver_type == L2R_L1LOSS_SVC_DUAL)
925         {
926                 diag[0] = 0;
927                 diag[2] = 0;
928                 upper_bound[0] = Cn;
929                 upper_bound[2] = Cp;
930         }
931
932         for(i=0; i<l; i++)
933         {
934                 if(prob->y[i] > 0)
935                 {
936                         y[i] = +1;
937                 }
938                 else
939                 {
940                         y[i] = -1;
941                 }
942         }
943
944         // Initial alpha can be set here. Note that
945         // 0 <= alpha[i] <= upper_bound[GETI(i)]
946         for(i=0; i<l; i++)
947                 alpha[i] = 0;
948
949         for(i=0; i<w_size; i++)
950                 w[i] = 0;
951         for(i=0; i<l; i++)
952         {
953                 QD[i] = diag[GETI(i)];
954
955                 feature_node * const xi = prob->x[i];
956                 QD[i] += sparse_operator::nrm2_sq(xi);
957                 sparse_operator::axpy(y[i]*alpha[i], xi, w);
958
959                 index[i] = i;
960         }
961
962         while (iter < max_iter)
963         {
964                 PGmax_new = -INF;
965                 PGmin_new = INF;
966
967                 for (i=0; i<active_size; i++)
968                 {
969                         int j = i+rand()%(active_size-i);
970                         swap(index[i], index[j]);
971                 }
972
973                 for (s=0; s<active_size; s++)
974                 {
975                         i = index[s];
976                         const schar yi = y[i];
977                         feature_node * const xi = prob->x[i];
978
979                         G = yi*sparse_operator::dot(w, xi)-1;
980
981                         C = upper_bound[GETI(i)];
982                         G += alpha[i]*diag[GETI(i)];
983
984                         PG = 0;
985                         if (alpha[i] == 0)
986                         {
987                                 if (G > PGmax_old)
988                                 {
989                                         active_size--;
990                                         swap(index[s], index[active_size]);
991                                         s--;
992                                         continue;
993                                 }
994                                 else if (G < 0)
995                                         PG = G;
996                         }
997                         else if (alpha[i] == C)
998                         {
999                                 if (G < PGmin_old)
1000                                 {
1001                                         active_size--;
1002                                         swap(index[s], index[active_size]);
1003                                         s--;
1004                                         continue;
1005                                 }
1006                                 else if (G > 0)
1007                                         PG = G;
1008                         }
1009                         else
1010                                 PG = G;
1011
1012                         PGmax_new = max(PGmax_new, PG);
1013                         PGmin_new = min(PGmin_new, PG);
1014
1015                         if(fabs(PG) > 1.0e-12)
1016                         {
1017                                 double alpha_old = alpha[i];
1018                                 alpha[i] = min(max(alpha[i] - G/QD[i], 0.0), C);
1019                                 d = (alpha[i] - alpha_old)*yi;
1020                                 sparse_operator::axpy(d, xi, w);
1021                         }
1022                 }
1023
1024                 iter++;
1025                 if(iter % 10 == 0)
1026                         info(".");
1027
1028                 if(PGmax_new - PGmin_new <= eps)
1029                 {
1030                         if(active_size == l)
1031                                 break;
1032                         else
1033                         {
1034                                 active_size = l;
1035                                 info("*");
1036                                 PGmax_old = INF;
1037                                 PGmin_old = -INF;
1038                                 continue;
1039                         }
1040                 }
1041                 PGmax_old = PGmax_new;
1042                 PGmin_old = PGmin_new;
1043                 if (PGmax_old <= 0)
1044                         PGmax_old = INF;
1045                 if (PGmin_old >= 0)
1046                         PGmin_old = -INF;
1047         }
1048
1049         info("\noptimization finished, #iter = %d\n",iter);
1050
1051         // calculate objective value
1052
1053         double v = 0;
1054         int nSV = 0;
1055         for(i=0; i<w_size; i++)
1056                 v += w[i]*w[i];
1057         for(i=0; i<l; i++)
1058         {
1059                 v += alpha[i]*(alpha[i]*diag[GETI(i)] - 2);
1060                 if(alpha[i] > 0)
1061                         ++nSV;
1062         }
1063         info("Objective value = %lf\n",v/2);
1064         info("nSV = %d\n",nSV);
1065
1066         delete [] QD;
1067         delete [] alpha;
1068         delete [] y;
1069         delete [] index;
1070
1071         return iter;
1072 }
1073
1074
1075 // A coordinate descent algorithm for
1076 // L1-loss and L2-loss epsilon-SVR dual problem
1077 //
1078 //  min_\beta  0.5\beta^T (Q + diag(lambda)) \beta - p \sum_{i=1}^l|\beta_i| + \sum_{i=1}^l yi\beta_i,
1079 //    s.t.      -upper_bound_i <= \beta_i <= upper_bound_i,
1080 //
1081 //  where Qij = xi^T xj and
1082 //  diag(lambda) is a diagonal matrix with lambda_i on its diagonal
1083 //
1084 // In L1-SVM case:
1085 //              upper_bound_i = C
1086 //              lambda_i = 0
1087 // In L2-SVM case:
1088 //              upper_bound_i = INF
1089 //              lambda_i = 1/(2*C)
1090 //
1091 // Given:
1092 // x, y, p, C
1093 // eps is the stopping tolerance
1094 //
1095 // solution will be put in w
1096 //
1097 // this function returns the number of iterations
1098 //
1099 // See Algorithm 4 of Ho and Lin, 2012
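// Because of the p*|beta_i| term the objective is only piecewise smooth, so the
// solver works with the one-sided derivatives Gp = G + p and Gn = G - p: the
// Newton direction is -Gp/H, -Gn/H, or -beta_i depending on where the minimum
// of the one-variable model lies, and the "violation" built from Gp/Gn replaces
// the plain gradient in the stopping and shrinking tests.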
1100
1101 #undef GETI
1102 #define GETI(i) (0)
1103 // To support weights for instances, use GETI(i) (i)
1104
1105 static int solve_l2r_l1l2_svr(const problem *prob, const parameter *param, double *w, int max_iter=500)
1106 {
1107         const int solver_type = param->solver_type;
1108         int l = prob->l;
1109         double C = param->C;
1110         double p = param->p;
1111         int w_size = prob->n;
1112         double eps = param->eps;
1113         int i, s, iter = 0;
1114         int active_size = l;
1115         int *index = new int[l];
1116
1117         double d, G, H;
1118         double Gmax_old = INF;
1119         double Gmax_new, Gnorm1_new;
1120         double Gnorm1_init = -1.0; // Gnorm1_init is initialized at the first iteration
1121         double *beta = new double[l];
1122         double *QD = new double[l];
1123         double *y = prob->y;
1124
1125         // L2R_L2LOSS_SVR_DUAL
1126         double lambda[1], upper_bound[1];
1127         lambda[0] = 0.5/C;
1128         upper_bound[0] = INF;
1129
1130         if(solver_type == L2R_L1LOSS_SVR_DUAL)
1131         {
1132                 lambda[0] = 0;
1133                 upper_bound[0] = C;
1134         }
1135
1136         // Initial beta can be set here. Note that
1137         // -upper_bound <= beta[i] <= upper_bound
1138         for(i=0; i<l; i++)
1139                 beta[i] = 0;
1140
1141         for(i=0; i<w_size; i++)
1142                 w[i] = 0;
1143         for(i=0; i<l; i++)
1144         {
1145                 feature_node * const xi = prob->x[i];
1146                 QD[i] = sparse_operator::nrm2_sq(xi);
1147                 sparse_operator::axpy(beta[i], xi, w);
1148
1149                 index[i] = i;
1150         }
1151
1152
1153         while(iter < max_iter)
1154         {
1155                 Gmax_new = 0;
1156                 Gnorm1_new = 0;
1157
1158                 for(i=0; i<active_size; i++)
1159                 {
1160                         int j = i+rand()%(active_size-i);
1161                         swap(index[i], index[j]);
1162                 }
1163
1164                 for(s=0; s<active_size; s++)
1165                 {
1166                         i = index[s];
1167                         G = -y[i] + lambda[GETI(i)]*beta[i];
1168                         H = QD[i] + lambda[GETI(i)];
1169
1170                         feature_node * const xi = prob->x[i];
1171                         G += sparse_operator::dot(w, xi);
1172
1173                         double Gp = G+p;
1174                         double Gn = G-p;
1175                         double violation = 0;
1176                         if(beta[i] == 0)
1177                         {
1178                                 if(Gp < 0)
1179                                         violation = -Gp;
1180                                 else if(Gn > 0)
1181                                         violation = Gn;
1182                                 else if(Gp>Gmax_old && Gn<-Gmax_old)
1183                                 {
1184                                         active_size--;
1185                                         swap(index[s], index[active_size]);
1186                                         s--;
1187                                         continue;
1188                                 }
1189                         }
1190                         else if(beta[i] >= upper_bound[GETI(i)])
1191                         {
1192                                 if(Gp > 0)
1193                                         violation = Gp;
1194                                 else if(Gp < -Gmax_old)
1195                                 {
1196                                         active_size--;
1197                                         swap(index[s], index[active_size]);
1198                                         s--;
1199                                         continue;
1200                                 }
1201                         }
1202                         else if(beta[i] <= -upper_bound[GETI(i)])
1203                         {
1204                                 if(Gn < 0)
1205                                         violation = -Gn;
1206                                 else if(Gn > Gmax_old)
1207                                 {
1208                                         active_size--;
1209                                         swap(index[s], index[active_size]);
1210                                         s--;
1211                                         continue;
1212                                 }
1213                         }
1214                         else if(beta[i] > 0)
1215                                 violation = fabs(Gp);
1216                         else
1217                                 violation = fabs(Gn);
1218
1219                         Gmax_new = max(Gmax_new, violation);
1220                         Gnorm1_new += violation;
1221
1222                         // obtain Newton direction d
1223                         if(Gp < H*beta[i])
1224                                 d = -Gp/H;
1225                         else if(Gn > H*beta[i])
1226                                 d = -Gn/H;
1227                         else
1228                                 d = -beta[i];
1229
1230                         if(fabs(d) < 1.0e-12)
1231                                 continue;
1232
1233                         double beta_old = beta[i];
1234                         beta[i] = min(max(beta[i]+d, -upper_bound[GETI(i)]), upper_bound[GETI(i)]);
1235                         d = beta[i]-beta_old;
1236
1237                         if(d != 0)
1238                                 sparse_operator::axpy(d, xi, w);
1239                 }
1240
1241                 if(iter == 0)
1242                         Gnorm1_init = Gnorm1_new;
1243                 iter++;
1244                 if(iter % 10 == 0)
1245                         info(".");
1246
1247                 if(Gnorm1_new <= eps*Gnorm1_init)
1248                 {
1249                         if(active_size == l)
1250                                 break;
1251                         else
1252                         {
1253                                 active_size = l;
1254                                 info("*");
1255                                 Gmax_old = INF;
1256                                 continue;
1257                         }
1258                 }
1259
1260                 Gmax_old = Gmax_new;
1261         }
1262
1263         info("\noptimization finished, #iter = %d\n", iter);
1264
1265         // calculate objective value
1266         double v = 0;
1267         int nSV = 0;
1268         for(i=0; i<w_size; i++)
1269                 v += w[i]*w[i];
1270         v = 0.5*v;
1271         for(i=0; i<l; i++)
1272         {
1273                 v += p*fabs(beta[i]) - y[i]*beta[i] + 0.5*lambda[GETI(i)]*beta[i]*beta[i];
1274                 if(beta[i] != 0)
1275                         nSV++;
1276         }
1277
1278         info("Objective value = %lf\n", v);
1279         info("nSV = %d\n",nSV);
1280
1281         delete [] beta;
1282         delete [] QD;
1283         delete [] index;
1284
1285         return iter;
1286 }
1287
1288
1289 // A coordinate descent algorithm for
1290 // the dual of L2-regularized logistic regression problems
1291 //
1292 //  min_\alpha  0.5(\alpha^T Q \alpha) + \sum \alpha_i log (\alpha_i) + (upper_bound_i - \alpha_i) log (upper_bound_i - \alpha_i),
1293 //    s.t.      0 <= \alpha_i <= upper_bound_i,
1294 //
1295 //  where Qij = yi yj xi^T xj and
1296 //  upper_bound_i = Cp if y_i = 1
1297 //  upper_bound_i = Cn if y_i = -1
1298 //
1299 // Given:
1300 // x, y, Cp, Cn
1301 // eps is the stopping tolerance
1302 //
1303 // solution will be put in w
1304 //
1305 // this function returns the number of iterations
1306 //
1307 // See Algorithm 5 of Yu et al., MLJ 2010
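// Implementation note: alpha is stored in pairs, alpha[2*i] for instance i and
// alpha[2*i+1] = upper_bound_i - alpha[2*i] for its complement, so both log
// terms of the dual objective are available directly.  Each coordinate is
// updated by a few inner Newton steps on the one-variable function g_t(z)
// (at most max_inner_iter), and innereps is tightened towards innereps_min
// once a pass needs only a few such steps.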
1308
1309 #undef GETI
1310 #define GETI(i) (y[i]+1)
1311 // To support weights for instances, use GETI(i) (i)
1312
1313 static int solve_l2r_lr_dual(const problem *prob, const parameter *param, double *w, double Cp, double Cn, int max_iter=500)
1314 {
1315         int l = prob->l;
1316         int w_size = prob->n;
1317         double eps = param->eps;
1318         int i, s, iter = 0;
1319         double *xTx = new double[l];
1320         int *index = new int[l];
1321         double *alpha = new double[2*l]; // store alpha and C - alpha
1322         schar *y = new schar[l];
1323         int max_inner_iter = 100; // for inner Newton
1324         double innereps = 1e-2;
1325         double innereps_min = min(1e-8, eps);
1326         double upper_bound[3] = {Cn, 0, Cp};
1327
1328         for(i=0; i<l; i++)
1329         {
1330                 if(prob->y[i] > 0)
1331                 {
1332                         y[i] = +1;
1333                 }
1334                 else
1335                 {
1336                         y[i] = -1;
1337                 }
1338         }
1339
1340         // Initial alpha can be set here. Note that
1341         // 0 < alpha[i] < upper_bound[GETI(i)]
1342         // alpha[2*i] + alpha[2*i+1] = upper_bound[GETI(i)]
1343         for(i=0; i<l; i++)
1344         {
1345                 alpha[2*i] = min(0.001*upper_bound[GETI(i)], 1e-8);
1346                 alpha[2*i+1] = upper_bound[GETI(i)] - alpha[2*i];
1347         }
1348
1349         for(i=0; i<w_size; i++)
1350                 w[i] = 0;
1351         for(i=0; i<l; i++)
1352         {
1353                 feature_node * const xi = prob->x[i];
1354                 xTx[i] = sparse_operator::nrm2_sq(xi);
1355                 sparse_operator::axpy(y[i]*alpha[2*i], xi, w);
1356                 index[i] = i;
1357         }
1358
1359         while (iter < max_iter)
1360         {
1361                 for (i=0; i<l; i++)
1362                 {
1363                         int j = i+rand()%(l-i);
1364                         swap(index[i], index[j]);
1365                 }
1366                 int newton_iter = 0;
1367                 double Gmax = 0;
1368                 for (s=0; s<l; s++)
1369                 {
1370                         i = index[s];
1371                         const schar yi = y[i];
1372                         double C = upper_bound[GETI(i)];
1373                         double ywTx = 0, xisq = xTx[i];
1374                         feature_node * const xi = prob->x[i];
1375                         ywTx = yi*sparse_operator::dot(w, xi);
1376                         double a = xisq, b = ywTx;
1377
1378                         // Decide to minimize g_1(z) or g_2(z)
1379                         int ind1 = 2*i, ind2 = 2*i+1, sign = 1;
1380                         if(0.5*a*(alpha[ind2]-alpha[ind1])+b < 0)
1381                         {
1382                                 ind1 = 2*i+1;
1383                                 ind2 = 2*i;
1384                                 sign = -1;
1385                         }
1386
1387                         //  g_t(z) = z*log(z) + (C-z)*log(C-z) + 0.5a(z-alpha_old)^2 + sign*b(z-alpha_old)
1388                         double alpha_old = alpha[ind1];
1389                         double z = alpha_old;
1390                         if(C - z < 0.5 * C)
1391                                 z = 0.1*z;
1392                         double gp = a*(z-alpha_old)+sign*b+log(z/(C-z));
1393                         Gmax = max(Gmax, fabs(gp));
1394
1395                         // Newton method on the sub-problem
1396                         const double eta = 0.1; // xi in the paper
1397                         int inner_iter = 0;
1398                         while (inner_iter <= max_inner_iter)
1399                         {
1400                                 if(fabs(gp) < innereps)
1401                                         break;
1402                                 double gpp = a + C/(C-z)/z;
1403                                 double tmpz = z - gp/gpp;
1404                                 if(tmpz <= 0)
1405                                         z *= eta;
1406                                 else // tmpz in (0, C)
1407                                         z = tmpz;
1408                                 gp = a*(z-alpha_old)+sign*b+log(z/(C-z));
1409                                 newton_iter++;
1410                                 inner_iter++;
1411                         }
1412
1413                         if(inner_iter > 0) // update w
1414                         {
1415                                 alpha[ind1] = z;
1416                                 alpha[ind2] = C-z;
1417                                 sparse_operator::axpy(sign*(z-alpha_old)*yi, xi, w);
1418                         }
1419                 }
1420
1421                 iter++;
1422                 if(iter % 10 == 0)
1423                         info(".");
1424
1425                 if(Gmax < eps)
1426                         break;
1427
1428                 if(newton_iter <= l/10)
1429                         innereps = max(innereps_min, 0.1*innereps);
1430
1431         }
1432
1433         info("\noptimization finished, #iter = %d\n",iter);
1434
1435         // calculate objective value
1436
1437         double v = 0;
1438         for(i=0; i<w_size; i++)
1439                 v += w[i] * w[i];
1440         v *= 0.5;
1441         for(i=0; i<l; i++)
1442                 v += alpha[2*i] * log(alpha[2*i]) + alpha[2*i+1] * log(alpha[2*i+1])
1443                         - upper_bound[GETI(i)] * log(upper_bound[GETI(i)]);
1444         info("Objective value = %lf\n", v);
1445
1446         delete [] xTx;
1447         delete [] alpha;
1448         delete [] y;
1449         delete [] index;
1450
1451         return iter;
1452 }
1453
1454 // A coordinate descent algorithm for
1455 // L1-regularized L2-loss support vector classification
1456 //
1457 //  min_w \sum |wj| + C \sum max(0, 1-yi w^T xi)^2,
1458 //
1459 // Given:
1460 // x, y, Cp, Cn
1461 // eps is the stopping tolerance
1462 //
1463 // solution will be put in w
1464 //
1465 // this function returns the number of iterations
1466 //
1467 // See Yuan et al. (2010) and appendix of LIBLINEAR paper, Fan et al. (2008)
1468 //
1469 // To not regularize the bias (i.e., regularize_bias = 0), a constant feature = 1
1470 // must have been added to the original data. (see -B and -R option)
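// Unlike the dual solvers above, this one iterates over features rather than
// instances, so it expects the data in column (feature-major) format (prob_col),
// prepared by the caller; note that each stored x->value is pre-multiplied by
// y_i inside this function.  The array b[i] = 1 - y_i w^T x_i is maintained
// incrementally with axpy() after every accepted step, which keeps the
// per-feature gradient and line search cheap.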
1471
1472 #undef GETI
1473 #define GETI(i) (y[i]+1)
1474 // To support weights for instances, use GETI(i) (i)
1475
1476 static int solve_l1r_l2_svc(const problem *prob_col, const parameter* param, double *w, double Cp, double Cn, double eps)
1477 {
1478         int l = prob_col->l;
1479         int w_size = prob_col->n;
1480         int regularize_bias = param->regularize_bias;
1481         int j, s, iter = 0;
1482         int max_iter = 1000;
1483         int active_size = w_size;
1484         int max_num_linesearch = 20;
1485
1486         double sigma = 0.01;
1487         double d, G_loss, G, H;
1488         double Gmax_old = INF;
1489         double Gmax_new, Gnorm1_new;
1490         double Gnorm1_init = -1.0; // Gnorm1_init is initialized at the first iteration
1491         double d_old, d_diff;
1492         double loss_old = 0, loss_new;
1493         double appxcond, cond;
1494
1495         int *index = new int[w_size];
1496         schar *y = new schar[l];
1497         double *b = new double[l]; // b = 1-ywTx
1498         double *xj_sq = new double[w_size];
1499         feature_node *x;
1500
1501         double C[3] = {Cn,0,Cp};
1502
1503         // Initial w can be set here.
1504         for(j=0; j<w_size; j++)
1505                 w[j] = 0;
1506
1507         for(j=0; j<l; j++)
1508         {
1509                 b[j] = 1;
1510                 if(prob_col->y[j] > 0)
1511                         y[j] = 1;
1512                 else
1513                         y[j] = -1;
1514         }
1515         for(j=0; j<w_size; j++)
1516         {
1517                 index[j] = j;
1518                 xj_sq[j] = 0;
1519                 x = prob_col->x[j];
1520                 while(x->index != -1)
1521                 {
1522                         int ind = x->index-1;
1523                         x->value *= y[ind]; // x->value stores yi*xij
1524                         double val = x->value;
1525                         b[ind] -= w[j]*val;
1526                         xj_sq[j] += C[GETI(ind)]*val*val;
1527                         x++;
1528                 }
1529         }
1530
1531         while(iter < max_iter)
1532         {
1533                 Gmax_new = 0;
1534                 Gnorm1_new = 0;
1535
1536                 for(j=0; j<active_size; j++)
1537                 {
1538                         int i = j+rand()%(active_size-j);
1539                         swap(index[i], index[j]);
1540                 }
1541
1542                 for(s=0; s<active_size; s++)
1543                 {
1544                         j = index[s];
1545                         G_loss = 0;
1546                         H = 0;
1547
1548                         x = prob_col->x[j];
1549                         while(x->index != -1)
1550                         {
1551                                 int ind = x->index-1;
1552                                 if(b[ind] > 0)
1553                                 {
1554                                         double val = x->value;
1555                                         double tmp = C[GETI(ind)]*val;
1556                                         G_loss -= tmp*b[ind];
1557                                         H += tmp*val;
1558                                 }
1559                                 x++;
1560                         }
1561                         G_loss *= 2;
1562
1563                         G = G_loss;
1564                         H *= 2;
1565                         H = max(H, 1e-12);
1566
1567                         double violation = 0;
1568                         double Gp = 0, Gn = 0;
1569                         if(j == w_size-1 && regularize_bias == 0)
1570                                 violation = fabs(G);
1571                         else
1572                         {
1573                                 Gp = G+1;
1574                                 Gn = G-1;
1575                                 if(w[j] == 0)
1576                                 {
1577                                         if(Gp < 0)
1578                                                 violation = -Gp;
1579                                         else if(Gn > 0)
1580                                                 violation = Gn;
1581                                         else if(Gp>Gmax_old/l && Gn<-Gmax_old/l)
1582                                         {
1583                                                 active_size--;
1584                                                 swap(index[s], index[active_size]);
1585                                                 s--;
1586                                                 continue;
1587                                         }
1588                                 }
1589                                 else if(w[j] > 0)
1590                                         violation = fabs(Gp);
1591                                 else
1592                                         violation = fabs(Gn);
1593                         }
1594                         Gmax_new = max(Gmax_new, violation);
1595                         Gnorm1_new += violation;
1596
1597                         // obtain Newton direction d
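			// for regularized coordinates, d minimizes the one-variable model
			// G*d + H*d*d/2 + |w[j]+d| - |w[j]| (a soft-thresholding step);
			// an unregularized bias term takes a plain Newton step instead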
1598                         if(j == w_size-1 && regularize_bias == 0)
1599                                 d = -G/H;
1600                         else
1601                         {
1602                                 if(Gp < H*w[j])
1603                                         d = -Gp/H;
1604                                 else if(Gn > H*w[j])
1605                                         d = -Gn/H;
1606                                 else
1607                                         d = -w[j];
1608                         }
1609
1610                         if(fabs(d) < 1.0e-12)
1611                                 continue;
1612
1613                         double delta;
1614                         if(j == w_size-1 && regularize_bias == 0)
1615                                 delta = G*d;
1616                         else
1617                                 delta = fabs(w[j]+d)-fabs(w[j]) + G*d;
1618                         d_old = 0;
1619                         int num_linesearch;
1620                         for(num_linesearch=0; num_linesearch < max_num_linesearch; num_linesearch++)
1621                         {
1622                                 d_diff = d_old - d;
1623                                 if(j == w_size-1 && regularize_bias == 0)
1624                                         cond = -sigma*delta;
1625                                 else
1626                                         cond = fabs(w[j]+d)-fabs(w[j]) - sigma*delta;
1627
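				// appxcond upper-bounds the true descent condition: the change in the
				// squared hinge loss is at most xj_sq[j]*d*d + G_loss*d, so appxcond <= 0
				// guarantees sufficient decrease without recomputing the loss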
1628                                 appxcond = xj_sq[j]*d*d + G_loss*d + cond;
1629                                 if(appxcond <= 0)
1630                                 {
1631                                         x = prob_col->x[j];
1632                                         sparse_operator::axpy(d_diff, x, b);
1633                                         break;
1634                                 }
1635
1636                                 if(num_linesearch == 0)
1637                                 {
1638                                         loss_old = 0;
1639                                         loss_new = 0;
1640                                         x = prob_col->x[j];
1641                                         while(x->index != -1)
1642                                         {
1643                                                 int ind = x->index-1;
1644                                                 if(b[ind] > 0)
1645                                                         loss_old += C[GETI(ind)]*b[ind]*b[ind];
1646                                                 double b_new = b[ind] + d_diff*x->value;
1647                                                 b[ind] = b_new;
1648                                                 if(b_new > 0)
1649                                                         loss_new += C[GETI(ind)]*b_new*b_new;
1650                                                 x++;
1651                                         }
1652                                 }
1653                                 else
1654                                 {
1655                                         loss_new = 0;
1656                                         x = prob_col->x[j];
1657                                         while(x->index != -1)
1658                                         {
1659                                                 int ind = x->index-1;
1660                                                 double b_new = b[ind] + d_diff*x->value;
1661                                                 b[ind] = b_new;
1662                                                 if(b_new > 0)
1663                                                         loss_new += C[GETI(ind)]*b_new*b_new;
1664                                                 x++;
1665                                         }
1666                                 }
1667
1668                                 cond = cond + loss_new - loss_old;
1669                                 if(cond <= 0)
1670                                         break;
1671                                 else
1672                                 {
1673                                         d_old = d;
1674                                         d *= 0.5;
1675                                         delta *= 0.5;
1676                                 }
1677                         }
1678
1679                         w[j] += d;
1680
1681                         // recompute b[] if line search takes too many steps
1682                         if(num_linesearch >= max_num_linesearch)
1683                         {
1684                                 info("#");
1685                                 for(int i=0; i<l; i++)
1686                                         b[i] = 1;
1687
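				// rebuild b[i] = 1 - y_i w^T x_i from scratch (x->value still stores y_i*x_ij)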
1688                                 for(int i=0; i<w_size; i++)
1689                                 {
1690                                         if(w[i]==0) continue;
1691                                         x = prob_col->x[i];
1692                                         sparse_operator::axpy(-w[i], x, b);
1693                                 }
1694                         }
1695                 }
1696
1697                 if(iter == 0)
1698                         Gnorm1_init = Gnorm1_new;
1699                 iter++;
1700                 if(iter % 10 == 0)
1701                         info(".");
1702
1703                 if(Gnorm1_new <= eps*Gnorm1_init)
1704                 {
1705                         if(active_size == w_size)
1706                                 break;
1707                         else
1708                         {
1709                                 active_size = w_size;
1710                                 info("*");
1711                                 Gmax_old = INF;
1712                                 continue;
1713                         }
1714                 }
1715
1716                 Gmax_old = Gmax_new;
1717         }
1718
1719         info("\noptimization finished, #iter = %d\n", iter);
1720         if(iter >= max_iter)
1721                 info("\nWARNING: reaching max number of iterations\n");
1722
1723         // calculate objective value
1724
1725         double v = 0;
1726         int nnz = 0;
1727         for(j=0; j<w_size; j++)
1728         {
1729                 x = prob_col->x[j];
1730                 while(x->index != -1)
1731                 {
1732                         x->value *= prob_col->y[x->index-1]; // restore x->value
1733                         x++;
1734                 }
1735                 if(w[j] != 0)
1736                 {
1737                         v += fabs(w[j]);
1738                         nnz++;
1739                 }
1740         }
1741         if (regularize_bias == 0)
1742                 v -= fabs(w[w_size-1]);
1743         for(j=0; j<l; j++)
1744                 if(b[j] > 0)
1745                         v += C[GETI(j)]*b[j]*b[j];
1746
1747         info("Objective value = %lf\n", v);
1748         info("#nonzeros/#features = %d/%d\n", nnz, w_size);
1749
1750         delete [] index;
1751         delete [] y;
1752         delete [] b;
1753         delete [] xj_sq;
1754
1755         return iter;
1756 }
1757
1758 // A coordinate descent algorithm for
1759 // L1-regularized logistic regression problems
1760 //
1761 //  min_w \sum |wj| + C \sum log(1+exp(-yi w^T xi)),
1762 //
1763 // Given:
1764 // x, y, Cp, Cn
1765 // eps is the stopping tolerance
1766 //
1767 // solution will be put in w
1768 //
1769 // this function returns the number of iterations
1770 //
1771 // See Yuan et al. (2011) and appendix of LIBLINEAR paper, Fan et al. (2008)
1772 //
1773 // When the bias is not regularized (i.e., regularize_bias = 0), a constant feature with value 1
1774 // must have been added to the original data (see the -B and -R options).
1775
1776 #undef GETI
1777 #define GETI(i) (y[i]+1)
1778 // To support weights for instances, define GETI(i) as (i)
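// GETI maps y[i] in {-1,+1} to {0,2}, so C[3] = {Cn, 0, Cp} below gives each
// instance its per-class penalty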
1779
1780 static int solve_l1r_lr(const problem *prob_col, const parameter *param, double *w, double Cp, double Cn, double eps)
1781 {
1782         int l = prob_col->l;
1783         int w_size = prob_col->n;
1784         int regularize_bias = param->regularize_bias;
1785         int j, s, newton_iter=0, iter=0;
1786         int max_newton_iter = 100;
1787         int max_iter = 1000;
1788         int max_num_linesearch = 20;
1789         int active_size;
1790         int QP_active_size;
1791
1792         double nu = 1e-12;
1793         double inner_eps = 1;
1794         double sigma = 0.01;
1795         double w_norm, w_norm_new;
1796         double z, G, H;
1797         double Gnorm1_init = -1.0; // Gnorm1_init is initialized at the first iteration
1798         double Gmax_old = INF;
1799         double Gmax_new, Gnorm1_new;
1800         double QP_Gmax_old = INF;
1801         double QP_Gmax_new, QP_Gnorm1_new;
1802         double delta, negsum_xTd, cond;
1803
1804         int *index = new int[w_size];
1805         schar *y = new schar[l];
1806         double *Hdiag = new double[w_size];
1807         double *Grad = new double[w_size];
1808         double *wpd = new double[w_size];
1809         double *xjneg_sum = new double[w_size];
1810         double *xTd = new double[l];
1811         double *exp_wTx = new double[l];
1812         double *exp_wTx_new = new double[l];
1813         double *tau = new double[l];
1814         double *D = new double[l];
1815         feature_node *x;
1816
1817         double C[3] = {Cn,0,Cp};
1818
1819         // Initial w can be set here.
1820         for(j=0; j<w_size; j++)
1821                 w[j] = 0;
1822
1823         for(j=0; j<l; j++)
1824         {
1825                 if(prob_col->y[j] > 0)
1826                         y[j] = 1;
1827                 else
1828                         y[j] = -1;
1829
1830                 exp_wTx[j] = 0;
1831         }
1832
1833         w_norm = 0;
1834         for(j=0; j<w_size; j++)
1835         {
1836                 w_norm += fabs(w[j]);
1837                 wpd[j] = w[j];
1838                 index[j] = j;
1839                 xjneg_sum[j] = 0;
1840                 x = prob_col->x[j];
1841                 while(x->index != -1)
1842                 {
1843                         int ind = x->index-1;
1844                         double val = x->value;
1845                         exp_wTx[ind] += w[j]*val;
1846                         if(y[ind] == -1)
1847                                 xjneg_sum[j] += C[GETI(ind)]*val;
1848                         x++;
1849                 }
1850         }
1851         if (regularize_bias == 0)
1852                 w_norm -= fabs(w[w_size-1]);
1853
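	// tau[i] = C_i*sigma(-w^T x_i) appears in the gradient;
	// D[i] = C_i*sigma(w^T x_i)*sigma(-w^T x_i) is the diagonal of the loss Hessian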
1854         for(j=0; j<l; j++)
1855         {
1856                 exp_wTx[j] = exp(exp_wTx[j]);
1857                 double tau_tmp = 1/(1+exp_wTx[j]);
1858                 tau[j] = C[GETI(j)]*tau_tmp;
1859                 D[j] = C[GETI(j)]*exp_wTx[j]*tau_tmp*tau_tmp;
1860         }
1861
1862         while(newton_iter < max_newton_iter)
1863         {
1864                 Gmax_new = 0;
1865                 Gnorm1_new = 0;
1866                 active_size = w_size;
1867
1868                 for(s=0; s<active_size; s++)
1869                 {
1870                         j = index[s];
1871                         Hdiag[j] = nu;
1872                         Grad[j] = 0;
1873
1874                         double tmp = 0;
1875                         x = prob_col->x[j];
1876                         while(x->index != -1)
1877                         {
1878                                 int ind = x->index-1;
1879                                 Hdiag[j] += x->value*x->value*D[ind];
1880                                 tmp += x->value*tau[ind];
1881                                 x++;
1882                         }
1883                         Grad[j] = -tmp + xjneg_sum[j];
1884
1885                         double violation = 0;
1886                         if (j == w_size-1 && regularize_bias == 0)
1887                                 violation = fabs(Grad[j]);
1888                         else
1889                         {
1890                                 double Gp = Grad[j]+1;
1891                                 double Gn = Grad[j]-1;
1892                                 if(w[j] == 0)
1893                                 {
1894                                         if(Gp < 0)
1895                                                 violation = -Gp;
1896                                         else if(Gn > 0)
1897                                                 violation = Gn;
1898                                         //outer-level shrinking
1899                                         else if(Gp>Gmax_old/l && Gn<-Gmax_old/l)
1900                                         {
1901                                                 active_size--;
1902                                                 swap(index[s], index[active_size]);
1903                                                 s--;
1904                                                 continue;
1905                                         }
1906                                 }
1907                                 else if(w[j] > 0)
1908                                         violation = fabs(Gp);
1909                                 else
1910                                         violation = fabs(Gn);
1911                         }
1912                         Gmax_new = max(Gmax_new, violation);
1913                         Gnorm1_new += violation;
1914                 }
1915
1916                 if(newton_iter == 0)
1917                         Gnorm1_init = Gnorm1_new;
1918
1919                 if(Gnorm1_new <= eps*Gnorm1_init)
1920                         break;
1921
1922                 iter = 0;
1923                 QP_Gmax_old = INF;
1924                 QP_active_size = active_size;
1925
1926                 for(int i=0; i<l; i++)
1927                         xTd[i] = 0;
1928
1929                 // optimize QP over wpd
1930                 while(iter < max_iter)
1931                 {
1932                         QP_Gmax_new = 0;
1933                         QP_Gnorm1_new = 0;
1934
1935                         for(j=0; j<QP_active_size; j++)
1936                         {
1937                                 int i = j+rand()%(QP_active_size-j);
1938                                 swap(index[i], index[j]);
1939                         }
1940
1941                         for(s=0; s<QP_active_size; s++)
1942                         {
1943                                 j = index[s];
1944                                 H = Hdiag[j];
1945
1946                                 x = prob_col->x[j];
1947                                 G = Grad[j] + (wpd[j]-w[j])*nu;
1948                                 while(x->index != -1)
1949                                 {
1950                                         int ind = x->index-1;
1951                                         G += x->value*D[ind]*xTd[ind];
1952                                         x++;
1953                                 }
1954
1955                                 double violation = 0;
1956                                 if (j == w_size-1 && regularize_bias == 0)
1957                                 {
1958                                         // bias term not shrunken
1959                                         violation = fabs(G);
1960                                         z = -G/H;
1961                                 }
1962                                 else
1963                                 {
1964                                         double Gp = G+1;
1965                                         double Gn = G-1;
1966                                         if(wpd[j] == 0)
1967                                         {
1968                                                 if(Gp < 0)
1969                                                         violation = -Gp;
1970                                                 else if(Gn > 0)
1971                                                         violation = Gn;
1972                                                 //inner-level shrinking
1973                                                 else if(Gp>QP_Gmax_old/l && Gn<-QP_Gmax_old/l)
1974                                                 {
1975                                                         QP_active_size--;
1976                                                         swap(index[s], index[QP_active_size]);
1977                                                         s--;
1978                                                         continue;
1979                                                 }
1980                                         }
1981                                         else if(wpd[j] > 0)
1982                                                 violation = fabs(Gp);
1983                                         else
1984                                                 violation = fabs(Gn);
1985
1986                                         // obtain solution of one-variable problem
1987                                         if(Gp < H*wpd[j])
1988                                                 z = -Gp/H;
1989                                         else if(Gn > H*wpd[j])
1990                                                 z = -Gn/H;
1991                                         else
1992                                                 z = -wpd[j];
1993                                 }
1994                                 QP_Gmax_new = max(QP_Gmax_new, violation);
1995                                 QP_Gnorm1_new += violation;
1996
1997                                 if(fabs(z) < 1.0e-12)
1998                                         continue;
1999                                 z = min(max(z,-10.0),10.0);
2000
2001                                 wpd[j] += z;
2002
2003                                 x = prob_col->x[j];
2004                                 sparse_operator::axpy(z, x, xTd);
2005                         }
2006
2007                         iter++;
2008
2009                         if(QP_Gnorm1_new <= inner_eps*Gnorm1_init)
2010                         {
2011                                 //inner stopping
2012                                 if(QP_active_size == active_size)
2013                                         break;
2014                                 //active set reactivation
2015                                 else
2016                                 {
2017                                         QP_active_size = active_size;
2018                                         QP_Gmax_old = INF;
2019                                         continue;
2020                                 }
2021                         }
2022
2023                         QP_Gmax_old = QP_Gmax_new;
2024                 }
2025
2026                 if(iter >= max_iter)
2027                         info("WARNING: reaching max number of inner iterations\n");
2028
2029                 delta = 0;
2030                 w_norm_new = 0;
2031                 for(j=0; j<w_size; j++)
2032                 {
2033                         delta += Grad[j]*(wpd[j]-w[j]);
2034                         if(wpd[j] != 0)
2035                                 w_norm_new += fabs(wpd[j]);
2036                 }
2037                 if (regularize_bias == 0)
2038                         w_norm_new -= fabs(wpd[w_size-1]);
2039                 delta += (w_norm_new-w_norm);
2040
2041                 negsum_xTd = 0;
2042                 for(int i=0; i<l; i++)
2043                         if(y[i] == -1)
2044                                 negsum_xTd += C[GETI(i)]*xTd[i];
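		// cond below accumulates the change in |w|_1 plus the change in the total
		// logistic loss, minus sigma*delta; each log term is the loss change written
		// for a +1 label, and negsum_xTd corrects the instances with label -1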
2045
2046                 int num_linesearch;
2047                 for(num_linesearch=0; num_linesearch < max_num_linesearch; num_linesearch++)
2048                 {
2049                         cond = w_norm_new - w_norm + negsum_xTd - sigma*delta;
2050
2051                         for(int i=0; i<l; i++)
2052                         {
2053                                 double exp_xTd = exp(xTd[i]);
2054                                 exp_wTx_new[i] = exp_wTx[i]*exp_xTd;
2055                                 cond += C[GETI(i)]*log((1+exp_wTx_new[i])/(exp_xTd+exp_wTx_new[i]));
2056                         }
2057
2058                         if(cond <= 0)
2059                         {
2060                                 w_norm = w_norm_new;
2061                                 for(j=0; j<w_size; j++)
2062                                         w[j] = wpd[j];
2063                                 for(int i=0; i<l; i++)
2064                                 {
2065                                         exp_wTx[i] = exp_wTx_new[i];
2066                                         double tau_tmp = 1/(1+exp_wTx[i]);
2067                                         tau[i] = C[GETI(i)]*tau_tmp;
2068                                         D[i] = C[GETI(i)]*exp_wTx[i]*tau_tmp*tau_tmp;
2069                                 }
2070                                 break;
2071                         }
2072                         else
2073                         {
2074                                 w_norm_new = 0;
2075                                 for(j=0; j<w_size; j++)
2076                                 {
2077                                         wpd[j] = (w[j]+wpd[j])*0.5;
2078                                         if(wpd[j] != 0)
2079                                                 w_norm_new += fabs(wpd[j]);
2080                                 }
2081                                 if (regularize_bias == 0)
2082                                         w_norm_new -= fabs(wpd[w_size-1]);
2083                                 delta *= 0.5;
2084                                 negsum_xTd *= 0.5;
2085                                 for(int i=0; i<l; i++)
2086                                         xTd[i] *= 0.5;
2087                         }
2088                 }
2089
2090                 // Recompute exp_wTx[] from scratch if the line search took too many steps
2091                 if(num_linesearch >= max_num_linesearch)
2092                 {
2093                         for(int i=0; i<l; i++)
2094                                 exp_wTx[i] = 0;
2095
2096                         for(int i=0; i<w_size; i++)
2097                         {
2098                                 if(w[i]==0) continue;
2099                                 x = prob_col->x[i];
2100                                 sparse_operator::axpy(w[i], x, exp_wTx);
2101                         }
2102
2103                         for(int i=0; i<l; i++)
2104                                 exp_wTx[i] = exp(exp_wTx[i]);
2105                 }
2106
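		// if the QP subproblem converged in a single CD pass, tighten its tolerance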
2107                 if(iter == 1)
2108                         inner_eps *= 0.25;
2109
2110                 newton_iter++;
2111                 Gmax_old = Gmax_new;
2112
2113                 info("iter %3d  #CD cycles %d\n", newton_iter, iter);
2114         }
2115
2116         info("=========================\n");
2117         info("optimization finished, #iter = %d\n", newton_iter);
2118         if(newton_iter >= max_newton_iter)
2119                 info("WARNING: reaching max number of iterations\n");
2120
2121         // calculate objective value
2122
2123         double v = 0;
2124         int nnz = 0;
2125         for(j=0; j<w_size; j++)
2126                 if(w[j] != 0)
2127                 {
2128                         v += fabs(w[j]);
2129                         nnz++;
2130                 }
2131         if (regularize_bias == 0)
2132                 v -= fabs(w[w_size-1]);
2133         for(j=0; j<l; j++)
2134                 if(y[j] == 1)
2135                         v += C[GETI(j)]*log(1+1/exp_wTx[j]);
2136                 else
2137                         v += C[GETI(j)]*log(1+exp_wTx[j]);
2138
2139         info("Objective value = %lf\n", v);
2140         info("#nonzeros/#features = %d/%d\n", nnz, w_size);
2141
2142         delete [] index;
2143         delete [] y;
2144         delete [] Hdiag;
2145         delete [] Grad;
2146         delete [] wpd;
2147         delete [] xjneg_sum;
2148         delete [] xTd;
2149         delete [] exp_wTx;
2150         delete [] exp_wTx_new;
2151         delete [] tau;
2152         delete [] D;
2153
2154         return newton_iter;
2155 }
2156
2157 struct heap {
2158         enum HEAP_TYPE { MIN, MAX };
2159         int _size;
2160         HEAP_TYPE _type;
2161         feature_node* a;
2162
2163         heap(int max_size, HEAP_TYPE type)
2164         {
2165                 _size = 0;
2166                 a = new feature_node[max_size];
2167                 _type = type;
2168         }
2169         ~heap()
2170         {
2171                 delete [] a;
2172         }
2173         bool cmp(const feature_node& left, const feature_node& right)
2174         {
2175                 if(_type == MIN)
2176                         return left.value > right.value;
2177                 else
2178                         return left.value < right.value;
2179         }
2180         int size()
2181         {
2182                 return _size;
2183         }
2184         void push(feature_node node)
2185         {
2186                 a[_size] = node;
2187                 _size++;
2188                 int i = _size-1;
2189                 while(i)
2190                 {
2191                         int p = (i-1)/2;
2192                         if(cmp(a[p], a[i]))
2193                         {
2194                                 swap(a[i], a[p]);
2195                                 i = p;
2196                         }
2197                         else
2198                                 break;
2199                 }
2200         }
2201         void pop()
2202         {
2203                 _size--;
2204                 a[0] = a[_size];
2205                 int i = 0;
2206                 while(i*2+1 < _size)
2207                 {
2208                         int l = i*2+1;
2209                         int r = i*2+2;
2210                         if(r < _size && cmp(a[l], a[r]))
2211                                 l = r;
2212                         if(cmp(a[i], a[l]))
2213                         {
2214                                 swap(a[i], a[l]);
2215                                 i = l;
2216                         }
2217                         else
2218                                 break;
2219                 }
2220         }
2221         feature_node top()
2222         {
2223                 return a[0];
2224         }
2225 };
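// Note: with cmp() as defined above, a fixed-size MIN heap keeps its smallest
// value at the top and therefore retains the largest pushed values, while a MAX
// heap retains the smallest; solve_oneclass_svm() below uses this to pick the
// most violating indices.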
2226
2227 // A two-level coordinate descent algorithm for
2228 // a scaled one-class SVM dual problem
2229 //
2230 //  min_\alpha  0.5(\alpha^T Q \alpha),
2231 //    s.t.      0 <= \alpha_i <= 1 and
2232 //              e^T \alpha = \nu l
2233 //
2234 //  where Qij = xi^T xj
2235 //
2236 // Given:
2237 // x, nu
2238 // eps is the stopping tolerance
2239 //
2240 // solution will be put in w and rho
2241 //
2242 // this function returns the number of iterations
2243 //
2244 // See Algorithm 7 in supplementary materials of Chou et al., SDM 2020.
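//
// Typical use goes through the public API rather than calling this function
// directly. A minimal sketch (assumes prob is already filled in and the other
// parameter fields are initialized to sensible values):
//
//     struct parameter param;
//     param.solver_type = ONECLASS_SVM;   // enum from linear.h
//     param.nu  = 0.1;                    // nu in (0,1)
//     param.eps = 0.01;                   // stopping tolerance
//     struct model *m = train(&prob, &param);   // dispatches to solve_oneclass_svm()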
2245
2246 static int solve_oneclass_svm(const problem *prob, const parameter *param, double *w, double *rho)
2247 {
2248         int l = prob->l;
2249         int w_size = prob->n;
2250         double eps = param->eps;
2251         double nu = param->nu;
2252         int i, j, s, iter = 0;
2253         double Gi, Gj;
2254         double Qij, quad_coef, delta, sum;
2255         double old_alpha_i;
2256         double *QD = new double[l];
2257         double *G = new double[l];
2258         int *index = new int[l];
2259         double *alpha = new double[l];
2260         int max_inner_iter;
2261         int max_iter = 1000;
2262         int active_size = l;
2263
2264         double negGmax;                 // max { -grad(f)_i | alpha_i < 1 }
2265         double negGmin;                 // min { -grad(f)_i | alpha_i > 0 }
2266
2267         int *most_violating_i = new int[l];
2268         int *most_violating_j = new int[l];
2269
2270         int n = (int)(nu*l);            // # of alpha's at upper bound
2271         for(i=0; i<n; i++)
2272                 alpha[i] = 1;
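	// after the loop i == n, so the fractional remainder nu*l - n goes to alpha[n]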
2273         if (n<l)
2274                 alpha[i] = nu*l-n;
2275         for(i=n+1; i<l; i++)
2276                 alpha[i] = 0;
2277
2278         for(i=0; i<w_size; i++)
2279                 w[i] = 0;
2280         for(i=0; i<l; i++)
2281         {
2282                 feature_node * const xi = prob->x[i];
2283                 QD[i] = sparse_operator::nrm2_sq(xi);
2284                 sparse_operator::axpy(alpha[i], xi, w);
2285
2286                 index[i] = i;
2287         }
2288
2289         while (iter < max_iter)
2290         {
2291                 negGmax = -INF;
2292                 negGmin = INF;
2293
2294                 for (s=0; s<active_size; s++)
2295                 {
2296                         i = index[s];
2297                         feature_node * const xi = prob->x[i];
2298                         G[i] = sparse_operator::dot(w, xi);
2299                         if (alpha[i] < 1)
2300                                 negGmax = max(negGmax, -G[i]);
2301                         if (alpha[i] > 0)
2302                                 negGmin = min(negGmin, -G[i]);
2303                 }
2304
2305                 if (negGmax - negGmin < eps)
2306                 {
2307                         if (active_size == l)
2308                                 break;
2309                         else
2310                         {
2311                                 active_size = l;
2312                                 info("*");
2313                                 continue;
2314                         }
2315                 }
2316
2317                 for(s=0; s<active_size; s++)
2318                 {
2319                         i = index[s];
2320                         if ((alpha[i] == 1 && -G[i] > negGmax) ||
2321                             (alpha[i] == 0 && -G[i] < negGmin))
2322                         {
2323                                 active_size--;
2324                                 swap(index[s], index[active_size]);
2325                                 s--;
2326                         }
2327                 }
2328
2329                 max_inner_iter = max(active_size/10, 1);
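		// select up to max_inner_iter most violating indices:
		// min_heap keeps the largest -G[i] among alpha[i] < 1 (candidates to increase),
		// max_heap keeps the smallest -G[j] among alpha[j] > 0 (candidates to decrease)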
2330                 struct heap min_heap = heap(max_inner_iter, heap::MIN);
2331                 struct heap max_heap = heap(max_inner_iter, heap::MAX);
2332                 struct feature_node node;
2333                 for(s=0; s<active_size; s++)
2334                 {
2335                         i = index[s];
2336                         node.index = i;
2337                         node.value = -G[i];
2338
2339                         if (alpha[i] < 1)
2340                         {
2341                                 if (min_heap.size() < max_inner_iter)
2342                                         min_heap.push(node);
2343                                 else if (min_heap.top().value < node.value)
2344                                 {
2345                                         min_heap.pop();
2346                                         min_heap.push(node);
2347                                 }
2348                         }
2349
2350                         if (alpha[i] > 0)
2351                         {
2352                                 if (max_heap.size() < max_inner_iter)
2353                                         max_heap.push(node);
2354                                 else if (max_heap.top().value > node.value)
2355                                 {
2356                                         max_heap.pop();
2357                                         max_heap.push(node);
2358                                 }
2359                         }
2360                 }
2361                 max_inner_iter = min(min_heap.size(), max_heap.size());
2362                 while (max_heap.size() > max_inner_iter)
2363                         max_heap.pop();
2364                 while (min_heap.size() > max_inner_iter)
2365                         min_heap.pop();
2366
2367                 for (s=max_inner_iter-1; s>=0; s--)
2368                 {
2369                         most_violating_i[s] = min_heap.top().index;
2370                         most_violating_j[s] = max_heap.top().index;
2371                         min_heap.pop();
2372                         max_heap.pop();
2373                 }
2374
2375                 for (s=0; s<max_inner_iter; s++)
2376                 {
2377                         i = most_violating_i[s];
2378                         j = most_violating_j[s];
2379
2380                         if ((alpha[i] == 0 && alpha[j] == 0) ||
2381                             (alpha[i] == 1 && alpha[j] == 1))
2382                                 continue;
2383
2384                         feature_node const * xi = prob->x[i];
2385                         feature_node const * xj = prob->x[j];
2386
2387                         Gi = sparse_operator::dot(w, xi);
2388                         Gj = sparse_operator::dot(w, xj);
2389
2390                         int violating_pair = 0;
2391                         if (alpha[i] < 1 && alpha[j] > 0 && -Gj + 1e-12 < -Gi)
2392                                 violating_pair = 1;
2393                         else
2394                                 if (alpha[i] > 0 && alpha[j] < 1 && -Gi + 1e-12 < -Gj)
2395                                         violating_pair = 1;
2396                         if (violating_pair == 0)
2397                                 continue;
2398
2399                         Qij = sparse_operator::sparse_dot(xi, xj);
2400                         quad_coef = QD[i] + QD[j] - 2*Qij;
2401                         if(quad_coef <= 0)
2402                                 quad_coef = 1e-12;
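			// unconstrained optimum of the two-variable subproblem along (e_j - e_i);
			// the clipping below keeps alpha in [0,1] while preserving alpha[i]+alpha[j]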
2403                         delta = (Gi - Gj) / quad_coef;
2404                         old_alpha_i = alpha[i];
2405                         sum = alpha[i] + alpha[j];
2406                         alpha[i] = alpha[i] - delta;
2407                         alpha[j] = alpha[j] + delta;
2408                         if (sum > 1)
2409                         {
2410                                 if (alpha[i] > 1)
2411                                 {
2412                                         alpha[i] = 1;
2413                                         alpha[j] = sum - 1;
2414                                 }
2415                         }
2416                         else
2417                         {
2418                                 if (alpha[j] < 0)
2419                                 {
2420                                         alpha[j] = 0;
2421                                         alpha[i] = sum;
2422                                 }
2423                         }
2424                         if (sum > 1)
2425                         {
2426                                 if (alpha[j] > 1)
2427                                 {
2428                                         alpha[j] = 1;
2429                                         alpha[i] = sum - 1;
2430                                 }
2431                         }
2432                         else
2433                         {
2434                                 if (alpha[i] < 0)
2435                                 {
2436                                         alpha[i] = 0;
2437                                         alpha[j] = sum;
2438                                 }
2439                         }
2440                         delta = alpha[i] - old_alpha_i;
2441                         sparse_operator::axpy(delta, xi, w);
2442                         sparse_operator::axpy(-delta, xj, w);
2443                 }
2444                 iter++;
2445                 if (iter % 10 == 0)
2446                         info(".");
2447         }
2448         info("\noptimization finished, #iter = %d\n",iter);
2449         if (iter >= max_iter)
2450                 info("\nWARNING: reaching max number of iterations\n\n");
2451
2452         // calculate objective value
2453         double v = 0;
2454         for(i=0; i<w_size; i++)
2455                 v += w[i]*w[i];
2456         int nSV = 0;
2457         for(i=0; i<l; i++)
2458         {
2459                 if (alpha[i] > 0)
2460                         ++nSV;
2461         }
2462         info("Objective value = %lf\n", v/2);
2463         info("nSV = %d\n", nSV);
2464
2465         // calculate rho
2466         double nr_free = 0;
2467         double ub = INF, lb = -INF, sum_free = 0;
2468         for(i=0; i<l; i++)
2469         {
2470                 double G = sparse_operator::dot(w, prob->x[i]);
2471                 if (alpha[i] == 1)
2472                         lb = max(lb, G);
2473                 else if (alpha[i] == 0)
2474                         ub = min(ub, G);
2475                 else
2476                 {
2477                         ++nr_free;
2478                         sum_free += G;
2479                 }
2480         }
2481
2482         if (nr_free > 0)
2483                 *rho = sum_free/nr_free;
2484         else
2485                 *rho = (ub + lb)/2;
2486
2487         info("rho = %lf\n", *rho);
2488
2489         delete [] QD;
2490         delete [] G;
2491         delete [] index;
2492         delete [] alpha;
2493         delete [] most_violating_i;
2494         delete [] most_violating_j;
2495
2496         return iter;
2497 }
2498
2499 // transpose matrix X from row format to column format
2500 static void transpose(const problem *prob, feature_node **x_space_ret, problem *prob_col)
2501 {
2502         int i;
2503         int l = prob->l;
2504         int n = prob->n;
2505         size_t nnz = 0;
2506         size_t *col_ptr = new size_t [n+1];
2507         feature_node *x_space;
2508         prob_col->l = l;
2509         prob_col->n = n;
2510         prob_col->y = new double[l];
2511         prob_col->x = new feature_node*[n];
2512
2513         for(i=0; i<l; i++)
2514                 prob_col->y[i] = prob->y[i];
2515
2516         for(i=0; i<n+1; i++)
2517                 col_ptr[i] = 0;
2518         for(i=0; i<l; i++)
2519         {
2520                 feature_node *x = prob->x[i];
2521                 while(x->index != -1)
2522                 {
2523                         nnz++;
2524                         col_ptr[x->index]++;
2525                         x++;
2526                 }
2527         }
2528         for(i=1; i<n+1; i++)
2529                 col_ptr[i] += col_ptr[i-1] + 1;
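	// prefix sums; the extra +1 per column reserves one slot for that column's
	// terminating node (index = -1) in x_space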
2530
2531         x_space = new feature_node[nnz+n];
2532         for(i=0; i<n; i++)
2533                 prob_col->x[i] = &x_space[col_ptr[i]];
2534
2535         for(i=0; i<l; i++)
2536         {
2537                 feature_node *x = prob->x[i];
2538                 while(x->index != -1)
2539                 {
2540                         int ind = x->index-1;
2541                         x_space[col_ptr[ind]].index = i+1; // starts from 1
2542                         x_space[col_ptr[ind]].value = x->value;
2543                         col_ptr[ind]++;
2544                         x++;
2545                 }
2546         }
2547         for(i=0; i<n; i++)
2548                 x_space[col_ptr[i]].index = -1;
2549
2550         *x_space_ret = x_space;
2551
2552         delete [] col_ptr;
2553 }
2554
2555 // label: label names, start: starting index of each class, count: #data of each class, perm: indices to the original data
2556 // perm, of length l, must be allocated before calling this subroutine
2557 static void group_classes(const problem *prob, int *nr_class_ret, int **label_ret, int **start_ret, int **count_ret, int *perm)
2558 {
2559         int l = prob->l;
2560         int max_nr_class = 16;
2561         int nr_class = 0;
2562         int *label = Malloc(int,max_nr_class);
2563         int *count = Malloc(int,max_nr_class);
2564         int *data_label = Malloc(int,l);
2565         int i;
2566
2567         for(i=0;i<l;i++)
2568         {
2569                 int this_label = (int)prob->y[i];
2570                 int j;
2571                 for(j=0;j<nr_class;j++)
2572                 {
2573                         if(this_label == label[j])
2574                         {
2575                                 ++count[j];
2576                                 break;
2577                         }
2578                 }
2579                 data_label[i] = j;
2580                 if(j == nr_class)
2581                 {
2582                         if(nr_class == max_nr_class)
2583                         {
2584                                 max_nr_class *= 2;
2585                                 label = (int *)realloc(label,max_nr_class*sizeof(int));
2586                                 count = (int *)realloc(count,max_nr_class*sizeof(int));
2587                         }
2588                         label[nr_class] = this_label;
2589                         count[nr_class] = 1;
2590                         ++nr_class;
2591                 }
2592         }
2593
2594         //
2595         // Labels are ordered by their first occurrence in the training set.
2596         // However, for two-class sets with -1/+1 labels in which -1 appears first,
2597         // we swap the labels so that internally the binary SVM treats the +1 instances as the positive class.
2598         //
2599         if (nr_class == 2 && label[0] == -1 && label[1] == 1)
2600         {
2601                 swap(label[0],label[1]);
2602                 swap(count[0],count[1]);
2603                 for(i=0;i<l;i++)
2604                 {
2605                         if(data_label[i] == 0)
2606                                 data_label[i] = 1;
2607                         else
2608                                 data_label[i] = 0;
2609                 }
2610         }
2611
2612         int *start = Malloc(int,nr_class);
2613         start[0] = 0;
2614         for(i=1;i<nr_class;i++)
2615                 start[i] = start[i-1]+count[i-1];
2616         for(i=0;i<l;i++)
2617         {
2618                 perm[start[data_label[i]]] = i;
2619                 ++start[data_label[i]];
2620         }
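	// start[] was advanced while filling perm, so rebuild it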
2621         start[0] = 0;
2622         for(i=1;i<nr_class;i++)
2623                 start[i] = start[i-1]+count[i-1];
2624
2625         *nr_class_ret = nr_class;
2626         *label_ret = label;
2627         *start_ret = start;
2628         *count_ret = count;
2629         free(data_label);
2630 }
2631
2632 static void train_one(const problem *prob, const parameter *param, double *w, double Cp, double Cn)
2633 {
2634         int solver_type = param->solver_type;
2635         int dual_solver_max_iter = 300;
2636         int iter;
2637
2638         bool is_regression = (solver_type==L2R_L2LOSS_SVR ||
2639                                 solver_type==L2R_L1LOSS_SVR_DUAL ||
2640                                 solver_type==L2R_L2LOSS_SVR_DUAL);
2641
2642         // Some solvers use Cp,Cn but not the C array; extending them is possible but not planned for now
2643         double *C = new double[prob->l];
2644         double primal_solver_tol = param->eps;
2645         if(is_regression)
2646         {
2647                 for(int i=0;i<prob->l;i++)
2648                         C[i] = param->C;
2649         }
2650         else
2651         {
2652                 int pos = 0;
2653                 for(int i=0;i<prob->l;i++)
2654                 {
2655                         if(prob->y[i] > 0)
2656                         {
2657                                 pos++;
2658                                 C[i] = Cp;
2659                         }
2660                         else
2661                                 C[i] = Cn;
2662                 }
2663                 int neg = prob->l - pos;
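		// scale the stopping tolerance by min(#pos,#neg)/l (at least 1/l), so the
		// primal solvers use a tighter relative tolerance on unbalanced data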
2664                 primal_solver_tol = param->eps*max(min(pos,neg), 1)/prob->l;
2665         }
2666
2667         switch(solver_type)
2668         {
2669                 case L2R_LR:
2670                 {
2671                         l2r_lr_fun fun_obj(prob, param, C);
2672                         NEWTON newton_obj(&fun_obj, primal_solver_tol);
2673                         newton_obj.set_print_string(liblinear_print_string);
2674                         newton_obj.newton(w);
2675                         break;
2676                 }
2677                 case L2R_L2LOSS_SVC:
2678                 {
2679                         l2r_l2_svc_fun fun_obj(prob, param, C);
2680                         NEWTON newton_obj(&fun_obj, primal_solver_tol);
2681                         newton_obj.set_print_string(liblinear_print_string);
2682                         newton_obj.newton(w);
2683                         break;
2684                 }
2685                 case L2R_L2LOSS_SVC_DUAL:
2686                 {
2687                         iter = solve_l2r_l1l2_svc(prob, param, w, Cp, Cn, dual_solver_max_iter);
2688                         if(iter >= dual_solver_max_iter)
2689                         {
2690                                 info("\nWARNING: reaching max number of iterations\nSwitching to use -s 2\n\n");
2691                                 // primal_solver_tol obtained from eps for dual may be too loose
2692                                 primal_solver_tol *= 0.1;
2693                                 l2r_l2_svc_fun fun_obj(prob, param, C);
2694                                 NEWTON newton_obj(&fun_obj, primal_solver_tol);
2695                                 newton_obj.set_print_string(liblinear_print_string);
2696                                 newton_obj.newton(w);
2697                         }
2698                         break;
2699                 }
2700                 case L2R_L1LOSS_SVC_DUAL:
2701                 {
2702                         solve_l2r_l1l2_svc(prob, param, w, Cp, Cn, dual_solver_max_iter);
2703                         break;
2704                 }
2705                 case L1R_L2LOSS_SVC:
2706                 {
2707                         problem prob_col;
2708                         feature_node *x_space = NULL;
2709                         transpose(prob, &x_space ,&prob_col);
2710                         solve_l1r_l2_svc(&prob_col, param, w, Cp, Cn, primal_solver_tol);
2711                         delete [] prob_col.y;
2712                         delete [] prob_col.x;
2713                         delete [] x_space;
2714                         break;
2715                 }
2716                 case L1R_LR:
2717                 {
2718                         problem prob_col;
2719                         feature_node *x_space = NULL;
2720                         transpose(prob, &x_space ,&prob_col);
2721                         solve_l1r_lr(&prob_col, param, w, Cp, Cn, primal_solver_tol);
2722                         delete [] prob_col.y;
2723                         delete [] prob_col.x;
2724                         delete [] x_space;
2725                         break;
2726                 }
2727                 case L2R_LR_DUAL:
2728                 {
2729                         iter = solve_l2r_lr_dual(prob, param, w, Cp, Cn, dual_solver_max_iter);
2730                         if(iter >= dual_solver_max_iter)
2731                         {
2732                                 info("\nWARNING: reaching max number of iterations\nSwitching to use -s 0\n\n");
2733                                 // primal_solver_tol obtained from eps for dual may be too loose
2734                                 primal_solver_tol *= 0.1;
2735                                 l2r_lr_fun fun_obj(prob, param, C);
2736                                 NEWTON newton_obj(&fun_obj, primal_solver_tol);
2737                                 newton_obj.set_print_string(liblinear_print_string);
2738                                 newton_obj.newton(w);
2739                         }
2740                         break;
2741                 }
2742                 case L2R_L2LOSS_SVR:
2743                 {
2744                         l2r_l2_svr_fun fun_obj(prob, param, C);
2745                         NEWTON newton_obj(&fun_obj, primal_solver_tol);
2746                         newton_obj.set_print_string(liblinear_print_string);
2747                         newton_obj.newton(w);
2748                         break;
2749
2750                 }
2751                 case L2R_L1LOSS_SVR_DUAL:
2752                 {
2753                         solve_l2r_l1l2_svr(prob, param, w, dual_solver_max_iter);
2754                         break;
2755                 }
2756                 case L2R_L2LOSS_SVR_DUAL:
2757                 {
2758                         iter = solve_l2r_l1l2_svr(prob, param, w, dual_solver_max_iter);
2759                         if(iter >= dual_solver_max_iter)
2760                         {
2761                                 info("\nWARNING: reaching max number of iterations\nSwitching to use -s 11\n\n");
2762                                 // primal_solver_tol obtained from eps for dual may be too loose
2763                                 primal_solver_tol *= 0.001;
2764                                 l2r_l2_svr_fun fun_obj(prob, param, C);
2765                                 NEWTON newton_obj(&fun_obj, primal_solver_tol);
2766                                 newton_obj.set_print_string(liblinear_print_string);
2767                                 newton_obj.newton(w);
2768                         }
2769                         break;
2770                 }
2771                 default:
2772                         fprintf(stderr, "ERROR: unknown solver_type\n");
2773                         break;
2774         }
2775
2776         delete[] C;
2777 }
2778
2779 // Calculate the initial C for parameter selection
2780 static double calc_start_C(const problem *prob, const parameter *param)
2781 {
2782         int i;
2783         double xTx, max_xTx;
2784         max_xTx = 0;
2785         for(i=0; i<prob->l; i++)
2786         {
2787                 xTx = 0;
2788                 feature_node *xi=prob->x[i];
2789                 while(xi->index != -1)
2790                 {
2791                         double val = xi->value;
2792                         xTx += val*val;
2793                         xi++;
2794                 }
2795                 if(xTx > max_xTx)
2796                         max_xTx = xTx;
2797         }
2798
2799         double min_C = 1.0;
2800         if(param->solver_type == L2R_LR)
2801                 min_C = 1.0 / (prob->l * max_xTx);
2802         else if(param->solver_type == L2R_L2LOSS_SVC)
2803                 min_C = 1.0 / (2 * prob->l * max_xTx);
2804         else if(param->solver_type == L2R_L2LOSS_SVR)
2805         {
2806                 double sum_y, loss, y_abs;
2807                 double delta2 = 0.1;
2808                 sum_y = 0, loss = 0;
2809                 for(i=0; i<prob->l; i++)
2810                 {
2811                         y_abs = fabs(prob->y[i]);
2812                         sum_y += y_abs;
2813                         loss += max(y_abs - param->p, 0.0) * max(y_abs - param->p, 0.0);
2814                 }
2815                 if(loss > 0)
2816                         min_C = delta2 * delta2 * loss / (8 * sum_y * sum_y * max_xTx);
2817                 else
2818                         min_C = INF;
2819         }
2820
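	// round min_C down to the nearest power of two as the starting C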
2821         return pow( 2, floor(log(min_C) / log(2.0)) );
2822 }
2823
2824 static double calc_max_p(const problem *prob)
2825 {
2826         int i;
2827         double max_p = 0.0;
2828         for(i = 0; i < prob->l; i++)
2829                 max_p = max(max_p, fabs(prob->y[i]));
2830
2831         return max_p;
2832 }
2833
2834 static void find_parameter_C(const problem *prob, parameter *param_tmp, double start_C, double max_C, double *best_C, double *best_score, const int *fold_start, const int *perm, const problem *subprob, int nr_fold)
2835 {
2836         // variables for CV
2837         int i;
2838         double *target = Malloc(double, prob->l);
2839
2840         // variables for warm start
2841         double ratio = 2;
2842         double **prev_w = Malloc(double*, nr_fold);
2843         for(i = 0; i < nr_fold; i++)
2844                 prev_w[i] = NULL;
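	// num_unchanged_w counts consecutive C values for which no fold's w moved
	// under warm start; the search stops early after 5 such values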
2845         int num_unchanged_w = 0;
2846         void (*default_print_string) (const char *) = liblinear_print_string;
2847
2848         if(param_tmp->solver_type == L2R_LR || param_tmp->solver_type == L2R_L2LOSS_SVC)
2849                 *best_score = 0.0;
2850         else if(param_tmp->solver_type == L2R_L2LOSS_SVR)
2851                 *best_score = INF;
2852         *best_C = start_C;
2853
2854         param_tmp->C = start_C;
2855         while(param_tmp->C <= max_C)
2856         {
2857                 // Output is disabled while running CV at a particular C
2858                 set_print_string_function(&print_null);
2859
2860                 for(i=0; i<nr_fold; i++)
2861                 {
2862                         int j;
2863                         int begin = fold_start[i];
2864                         int end = fold_start[i+1];
2865
2866                         param_tmp->init_sol = prev_w[i];
2867                         struct model *submodel = train(&subprob[i],param_tmp);
2868
2869                         int total_w_size;
2870                         if(submodel->nr_class == 2)
2871                                 total_w_size = subprob[i].n;
2872                         else
2873                                 total_w_size = subprob[i].n * submodel->nr_class;
2874
2875                         if(prev_w[i] == NULL)
2876                         {
2877                                 prev_w[i] = Malloc(double, total_w_size);
2878                                 for(j=0; j<total_w_size; j++)
2879                                         prev_w[i][j] = submodel->w[j];
2880                         }
2881                         else if(num_unchanged_w >= 0)
2882                         {
2883                                 double norm_w_diff = 0;
2884                                 for(j=0; j<total_w_size; j++)
2885                                 {
2886                                         norm_w_diff += (submodel->w[j] - prev_w[i][j])*(submodel->w[j] - prev_w[i][j]);
2887                                         prev_w[i][j] = submodel->w[j];
2888                                 }
2889                                 norm_w_diff = sqrt(norm_w_diff);
2890
2891                                 if(norm_w_diff > 1e-15)
2892                                         num_unchanged_w = -1;
2893                         }
2894                         else
2895                         {
2896                                 for(j=0; j<total_w_size; j++)
2897                                         prev_w[i][j] = submodel->w[j];
2898                         }
2899
2900                         for(j=begin; j<end; j++)
2901                                 target[perm[j]] = predict(submodel,prob->x[perm[j]]);
2902
2903                         free_and_destroy_model(&submodel);
2904                 }
2905                 set_print_string_function(default_print_string);
2906
2907                 if(param_tmp->solver_type == L2R_LR || param_tmp->solver_type == L2R_L2LOSS_SVC)
2908                 {
2909                         int total_correct = 0;
2910                         for(i=0; i<prob->l; i++)
2911                                 if(target[i] == prob->y[i])
2912                                         ++total_correct;
2913                         double current_rate = (double)total_correct/prob->l;
2914                         if(current_rate > *best_score)
2915                         {
2916                                 *best_C = param_tmp->C;
2917                                 *best_score = current_rate;
2918                         }
2919
2920                         info("log2c=%7.2f\trate=%g\n",log(param_tmp->C)/log(2.0),100.0*current_rate);
2921                 }
2922                 else if(param_tmp->solver_type == L2R_L2LOSS_SVR)
2923                 {
2924                         double total_error = 0.0;
2925                         for(i=0; i<prob->l; i++)
2926                         {
2927                                 double y = prob->y[i];
2928                                 double v = target[i];
2929                                 total_error += (v-y)*(v-y);
2930                         }
2931                         double current_error = total_error/prob->l;
2932                         if(current_error < *best_score)
2933                         {
2934                                 *best_C = param_tmp->C;
2935                                 *best_score = current_error;
2936                         }
2937
2938                         info("log2c=%7.2f\tp=%7.2f\tMean squared error=%g\n",log(param_tmp->C)/log(2.0),param_tmp->p,current_error);
2939                 }
2940
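                        // Early stopping for the C search: prev_w[i] keeps each fold's solution
                        // from the previous C value.  If the solutions of all folds stay
                        // numerically unchanged (norm difference <= 1e-15) for 5 consecutive
                        // C values, larger C would not change the result, so the loop breaks.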
2941                 num_unchanged_w++;
2942                 if(num_unchanged_w == 5)
2943                         break;
2944                 param_tmp->C = param_tmp->C*ratio;
2945         }
2946
2947         if(param_tmp->C > max_C)
2948                 info("WARNING: maximum C reached.\n");
2949         free(target);
2950         for(i=0; i<nr_fold; i++)
2951                 free(prev_w[i]);
2952         free(prev_w);
2953 }
2954
2955
2956 //
2957 // Interface functions
2958 //
2959 model* train(const problem *prob, const parameter *param)
2960 {
2961         int i,j;
2962         int l = prob->l;
2963         int n = prob->n;
2964         int w_size = prob->n;
2965         model *model_ = Malloc(model,1);
2966
2967         if(prob->bias>=0)
2968                 model_->nr_feature=n-1;
2969         else
2970                 model_->nr_feature=n;
2971         model_->param = *param;
2972         model_->bias = prob->bias;
2973
2974         if(check_regression_model(model_))
2975         {
2976                 model_->w = Malloc(double, w_size);
2977
2978                 if(param->init_sol != NULL)
2979                         for(i=0;i<w_size;i++)
2980                                 model_->w[i] = param->init_sol[i];
2981                 else
2982                         for(i=0;i<w_size;i++)
2983                                 model_->w[i] = 0;
2984
2985                 model_->nr_class = 2;
2986                 model_->label = NULL;
2987                 train_one(prob, param, model_->w, 0, 0);
2988         }
2989         else if(check_oneclass_model(model_))
2990         {
2991                 model_->w = Malloc(double, w_size);
2992                 model_->nr_class = 2;
2993                 model_->label = NULL;
2994                 solve_oneclass_svm(prob, param, model_->w, &(model_->rho));
2995         }
2996         else
2997         {
2998                 int nr_class;
2999                 int *label = NULL;
3000                 int *start = NULL;
3001                 int *count = NULL;
3002                 int *perm = Malloc(int,l);
3003
3004                 // group training data of the same class
3005                 group_classes(prob,&nr_class,&label,&start,&count,perm);
3006
3007                 model_->nr_class=nr_class;
3008                 model_->label = Malloc(int,nr_class);
3009                 for(i=0;i<nr_class;i++)
3010                         model_->label[i] = label[i];
3011
3012                 // calculate weighted C
3013                 double *weighted_C = Malloc(double, nr_class);
3014                 for(i=0;i<nr_class;i++)
3015                         weighted_C[i] = param->C;
3016                 for(i=0;i<param->nr_weight;i++)
3017                 {
3018                         for(j=0;j<nr_class;j++)
3019                                 if(param->weight_label[i] == label[j])
3020                                         break;
3021                         if(j == nr_class)
3022                                 fprintf(stderr,"WARNING: class label %d specified in weight is not found\n", param->weight_label[i]);
3023                         else
3024                                 weighted_C[j] *= param->weight[i];
3025                 }
3026
3027                 // constructing the subproblem
3028                 feature_node **x = Malloc(feature_node *,l);
3029                 for(i=0;i<l;i++)
3030                         x[i] = prob->x[perm[i]];
3031
3032                 int k;
3033                 problem sub_prob;
3034                 sub_prob.l = l;
3035                 sub_prob.n = n;
3036                 sub_prob.x = Malloc(feature_node *,sub_prob.l);
3037                 sub_prob.y = Malloc(double,sub_prob.l);
3038
3039                 for(k=0; k<sub_prob.l; k++)
3040                         sub_prob.x[k] = x[k];
3041
3042                 // multi-class SVM by Crammer and Singer
3043                 if(param->solver_type == MCSVM_CS)
3044                 {
3045                         model_->w=Malloc(double, n*nr_class);
3046                         for(i=0;i<nr_class;i++)
3047                                 for(j=start[i];j<start[i]+count[i];j++)
3048                                         sub_prob.y[j] = i;
3049                         Solver_MCSVM_CS Solver(&sub_prob, nr_class, weighted_C, param->eps);
3050                         Solver.Solve(model_->w);
3051                 }
3052                 else
3053                 {
3054                         if(nr_class == 2)
3055                         {
3056                                 model_->w=Malloc(double, w_size);
3057
3058                                 int e0 = start[0]+count[0];
3059                                 k=0;
3060                                 for(; k<e0; k++)
3061                                         sub_prob.y[k] = +1;
3062                                 for(; k<sub_prob.l; k++)
3063                                         sub_prob.y[k] = -1;
3064
3065                                 if(param->init_sol != NULL)
3066                                         for(i=0;i<w_size;i++)
3067                                                 model_->w[i] = param->init_sol[i];
3068                                 else
3069                                         for(i=0;i<w_size;i++)
3070                                                 model_->w[i] = 0;
3071
3072                                 train_one(&sub_prob, param, model_->w, weighted_C[0], weighted_C[1]);
3073                         }
3074                         else
3075                         {
3076                                 model_->w=Malloc(double, w_size*nr_class);
3077                                 double *w=Malloc(double, w_size);
3078                                 for(i=0;i<nr_class;i++)
3079                                 {
3080                                         int si = start[i];
3081                                         int ei = si+count[i];
3082
3083                                         k=0;
3084                                         for(; k<si; k++)
3085                                                 sub_prob.y[k] = -1;
3086                                         for(; k<ei; k++)
3087                                                 sub_prob.y[k] = +1;
3088                                         for(; k<sub_prob.l; k++)
3089                                                 sub_prob.y[k] = -1;
3090
3091                                         if(param->init_sol != NULL)
3092                                                 for(j=0;j<w_size;j++)
3093                                                         w[j] = param->init_sol[j*nr_class+i];
3094                                         else
3095                                                 for(j=0;j<w_size;j++)
3096                                                         w[j] = 0;
3097
3098                                         train_one(&sub_prob, param, w, weighted_C[i], param->C);
3099
3100                                         for(j=0;j<w_size;j++)
3101                                                 model_->w[j*nr_class+i] = w[j];
3102                                 }
3103                                 free(w);
3104                         }
3105
3106                 }
3107
3108                 free(x);
3109                 free(label);
3110                 free(start);
3111                 free(count);
3112                 free(perm);
3113                 free(sub_prob.x);
3114                 free(sub_prob.y);
3115                 free(weighted_C);
3116         }
3117         return model_;
3118 }
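
/*
 * Illustrative sketch (not part of the library): a minimal call sequence for
 * train().  Feature indices are 1-based and each instance ends with a node
 * whose index is -1.  The problem/parameter field names used below are
 * assumed to match the structs declared in linear.h of this release, and the
 * values (C = 1, eps = 0.01, ...) are arbitrary choices for the example.
 *
 *     struct feature_node x1[] = { {1,  1.0}, {2,  0.5}, {-1, 0} };
 *     struct feature_node x2[] = { {1, -1.0}, {2, -0.5}, {-1, 0} };
 *     struct feature_node *x[] = { x1, x2 };
 *     double y[] = { +1, -1 };
 *
 *     struct problem prob;
 *     prob.l = 2; prob.n = 2; prob.y = y; prob.x = x; prob.bias = -1;
 *
 *     struct parameter param;
 *     param.solver_type = L2R_L2LOSS_SVC;
 *     param.C = 1; param.eps = 0.01; param.p = 0.1; param.nu = 0.5;
 *     param.nr_weight = 0; param.weight_label = NULL; param.weight = NULL;
 *     param.init_sol = NULL; param.regularize_bias = 1;
 *
 *     const char *err = check_parameter(&prob, &param);
 *     if(err == NULL)
 *     {
 *         struct model *m = train(&prob, &param);
 *         double pred = predict(m, x1);   // expected to be +1 for this toy data
 *         free_and_destroy_model(&m);
 *     }
 */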
3119
3120 void cross_validation(const problem *prob, const parameter *param, int nr_fold, double *target)
3121 {
3122         int i;
3123         int *fold_start;
3124         int l = prob->l;
3125         int *perm = Malloc(int,l);
3126         if (nr_fold > l)
3127         {
3128                 nr_fold = l;
3129                 fprintf(stderr,"WARNING: # folds > # data. Will use # folds = # data instead (i.e., leave-one-out cross validation)\n");
3130         }
3131         fold_start = Malloc(int,nr_fold+1);
3132         for(i=0;i<l;i++) perm[i]=i;
3133         for(i=0;i<l;i++)
3134         {
3135                 int j = i+rand()%(l-i);
3136                 swap(perm[i],perm[j]);
3137         }
3138         for(i=0;i<=nr_fold;i++)
3139                 fold_start[i]=i*l/nr_fold;
3140
3141         for(i=0;i<nr_fold;i++)
3142         {
3143                 int begin = fold_start[i];
3144                 int end = fold_start[i+1];
3145                 int j,k;
3146                 struct problem subprob;
3147
3148                 subprob.bias = prob->bias;
3149                 subprob.n = prob->n;
3150                 subprob.l = l-(end-begin);
3151                 subprob.x = Malloc(struct feature_node*,subprob.l);
3152                 subprob.y = Malloc(double,subprob.l);
3153
3154                 k=0;
3155                 for(j=0;j<begin;j++)
3156                 {
3157                         subprob.x[k] = prob->x[perm[j]];
3158                         subprob.y[k] = prob->y[perm[j]];
3159                         ++k;
3160                 }
3161                 for(j=end;j<l;j++)
3162                 {
3163                         subprob.x[k] = prob->x[perm[j]];
3164                         subprob.y[k] = prob->y[perm[j]];
3165                         ++k;
3166                 }
3167                 struct model *submodel = train(&subprob,param);
3168                 for(j=begin;j<end;j++)
3169                         target[perm[j]] = predict(submodel,prob->x[perm[j]]);
3170                 free_and_destroy_model(&submodel);
3171                 free(subprob.x);
3172                 free(subprob.y);
3173         }
3174         free(fold_start);
3175         free(perm);
3176 }
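
/*
 * Illustrative sketch (not part of the library): estimating accuracy with
 * cross_validation().  `prob` and `param` are assumed to be set up as in the
 * train() sketch above; `target` must hold prob.l entries.
 *
 *     double *target = (double *) malloc(prob.l * sizeof(double));
 *     cross_validation(&prob, &param, 5, target);
 *     int correct = 0;
 *     for(int i = 0; i < prob.l; i++)
 *         if(target[i] == prob.y[i])
 *             ++correct;
 *     printf("CV accuracy = %g%%\n", 100.0 * correct / prob.l);
 *     free(target);
 */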
3177
3178
3179 void find_parameters(const problem *prob, const parameter *param, int nr_fold, double start_C, double start_p, double *best_C, double *best_p, double *best_score)
3180 {
3181         // prepare CV folds
3182
3183         int i;
3184         int *fold_start;
3185         int l = prob->l;
3186         int *perm = Malloc(int, l);
3187         struct problem *subprob = Malloc(problem,nr_fold);
3188
3189         if (nr_fold > l)
3190         {
3191                 nr_fold = l;
3192                 fprintf(stderr,"WARNING: # folds > # data. Will use # folds = # data instead (i.e., leave-one-out cross validation)\n");
3193         }
3194         fold_start = Malloc(int,nr_fold+1);
3195         for(i=0;i<l;i++) perm[i]=i;
3196         for(i=0;i<l;i++)
3197         {
3198                 int j = i+rand()%(l-i);
3199                 swap(perm[i],perm[j]);
3200         }
3201         for(i=0;i<=nr_fold;i++)
3202                 fold_start[i]=i*l/nr_fold;
3203
3204         for(i=0;i<nr_fold;i++)
3205         {
3206                 int begin = fold_start[i];
3207                 int end = fold_start[i+1];
3208                 int j,k;
3209
3210                 subprob[i].bias = prob->bias;
3211                 subprob[i].n = prob->n;
3212                 subprob[i].l = l-(end-begin);
3213                 subprob[i].x = Malloc(struct feature_node*,subprob[i].l);
3214                 subprob[i].y = Malloc(double,subprob[i].l);
3215
3216                 k=0;
3217                 for(j=0;j<begin;j++)
3218                 {
3219                         subprob[i].x[k] = prob->x[perm[j]];
3220                         subprob[i].y[k] = prob->y[perm[j]];
3221                         ++k;
3222                 }
3223                 for(j=end;j<l;j++)
3224                 {
3225                         subprob[i].x[k] = prob->x[perm[j]];
3226                         subprob[i].y[k] = prob->y[perm[j]];
3227                         ++k;
3228                 }
3229         }
3230
3231         struct parameter param_tmp = *param;
3232         *best_p = -1;
3233         if(param->solver_type == L2R_LR || param->solver_type == L2R_L2LOSS_SVC)
3234         {
3235                 if(start_C <= 0)
3236                         start_C = calc_start_C(prob, &param_tmp);
3237                 double max_C = 1024;
3238                 start_C = min(start_C, max_C);
3239                 double best_C_tmp, best_score_tmp;
3240
3241                 find_parameter_C(prob, &param_tmp, start_C, max_C, &best_C_tmp, &best_score_tmp, fold_start, perm, subprob, nr_fold);
3242
3243                 *best_C = best_C_tmp;
3244                 *best_score = best_score_tmp;
3245         }
3246         else if(param->solver_type == L2R_L2LOSS_SVR)
3247         {
3248                 double max_p = calc_max_p(prob);
3249                 int num_p_steps = 20;
3250                 double max_C = 1048576;
3251                 *best_score = INF;
3252
3253                 i = num_p_steps-1;
3254                 if(start_p > 0)
3255                         i = min((int)(start_p/(max_p/num_p_steps)), i);
3256                 for(; i >= 0; i--)
3257                 {
3258                         param_tmp.p = i*max_p/num_p_steps;
3259                         double start_C_tmp;
3260                         if(start_C <= 0)
3261                                 start_C_tmp = calc_start_C(prob, &param_tmp);
3262                         else
3263                                 start_C_tmp = start_C;
3264                         start_C_tmp = min(start_C_tmp, max_C);
3265                         double best_C_tmp, best_score_tmp;
3266
3267                         find_parameter_C(prob, &param_tmp, start_C_tmp, max_C, &best_C_tmp, &best_score_tmp, fold_start, perm, subprob, nr_fold);
3268
3269                         if(best_score_tmp < *best_score)
3270                         {
3271                                 *best_p = param_tmp.p;
3272                                 *best_C = best_C_tmp;
3273                                 *best_score = best_score_tmp;
3274                         }
3275                 }
3276         }
3277
3278         free(fold_start);
3279         free(perm);
3280         for(i=0; i<nr_fold; i++)
3281         {
3282                 free(subprob[i].x);
3283                 free(subprob[i].y);
3284         }
3285         free(subprob);
3286 }
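
/*
 * Illustrative sketch (not part of the library): parameter search with
 * find_parameters().  For L2R_LR / L2R_L2LOSS_SVC only C is searched and
 * best_p is returned as -1; for L2R_L2LOSS_SVR both C and p are searched.
 * Passing start_C <= 0 makes the library compute a starting C itself, and
 * start_p <= 0 searches the full p range.
 *
 *     double best_C, best_p, best_score;
 *     find_parameters(&prob, &param, 5, -1, -1, &best_C, &best_p, &best_score);
 *     // best_score is CV accuracy for the classification solvers and
 *     // CV mean squared error for L2R_L2LOSS_SVR.
 */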
3287
3288 double predict_values(const struct model *model_, const struct feature_node *x, double *dec_values)
3289 {
3290         int idx;
3291         int n;
3292         if(model_->bias>=0)
3293                 n=model_->nr_feature+1;
3294         else
3295                 n=model_->nr_feature;
3296         double *w=model_->w;
3297         int nr_class=model_->nr_class;
3298         int i;
3299         int nr_w;
3300         if(nr_class==2 && model_->param.solver_type != MCSVM_CS)
3301                 nr_w = 1;
3302         else
3303                 nr_w = nr_class;
3304
3305         const feature_node *lx=x;
3306         for(i=0;i<nr_w;i++)
3307                 dec_values[i] = 0;
3308         for(; (idx=lx->index)!=-1; lx++)
3309         {
3310                 // the dimension of testing data may exceed that of training, so indices beyond n are ignored
3311                 if(idx<=n)
3312                         for(i=0;i<nr_w;i++)
3313                                 dec_values[i] += w[(idx-1)*nr_w+i]*lx->value;
3314         }
3315         if(check_oneclass_model(model_))
3316                 dec_values[0] -= model_->rho;
3317
3318         if(nr_class==2)
3319         {
3320                 if(check_regression_model(model_))
3321                         return dec_values[0];
3322                 else if(check_oneclass_model(model_))
3323                         return (dec_values[0]>0)?1:-1;
3324                 else
3325                         return (dec_values[0]>0)?model_->label[0]:model_->label[1];
3326         }
3327         else
3328         {
3329                 int dec_max_idx = 0;
3330                 for(i=1;i<nr_class;i++)
3331                 {
3332                         if(dec_values[i] > dec_values[dec_max_idx])
3333                                 dec_max_idx = i;
3334                 }
3335                 return model_->label[dec_max_idx];
3336         }
3337 }
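
/*
 * Illustrative sketch (not part of the library): obtaining raw decision
 * values.  `model_` and `x` are assumed to be a trained model and a test
 * instance.  Allocating nr_class doubles is always sufficient (binary,
 * non-MCSVM_CS models fill only dec[0], whose sign selects label[0] or
 * label[1]); otherwise the values are ordered as in model_->label, and the
 * return value is the predicted label.
 *
 *     double *dec = (double *) malloc(model_->nr_class * sizeof(double));
 *     double predicted = predict_values(model_, x, dec);
 *     free(dec);
 */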
3338
3339 double predict(const model *model_, const feature_node *x)
3340 {
3341         double *dec_values = Malloc(double, model_->nr_class);
3342         double label=predict_values(model_, x, dec_values);
3343         free(dec_values);
3344         return label;
3345 }
3346
3347 double predict_probability(const struct model *model_, const struct feature_node *x, double* prob_estimates)
3348 {
3349         if(check_probability_model(model_))
3350         {
3351                 int i;
3352                 int nr_class=model_->nr_class;
3353                 int nr_w;
3354                 if(nr_class==2)
3355                         nr_w = 1;
3356                 else
3357                         nr_w = nr_class;
3358
3359                 double label=predict_values(model_, x, prob_estimates);
3360                 for(i=0;i<nr_w;i++)
3361                         prob_estimates[i]=1/(1+exp(-prob_estimates[i]));
3362
3363                 if(nr_class==2) // for binary classification
3364                         prob_estimates[1]=1.-prob_estimates[0];
3365                 else
3366                 {
3367                         double sum=0;
3368                         for(i=0; i<nr_class; i++)
3369                                 sum+=prob_estimates[i];
3370
3371                         for(i=0; i<nr_class; i++)
3372                                 prob_estimates[i]=prob_estimates[i]/sum;
3373                 }
3374
3375                 return label;
3376         }
3377         else
3378                 return 0;
3379 }
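
/*
 * Illustrative sketch (not part of the library): probability estimates are
 * available only for the logistic-regression solvers (see
 * check_probability_model() below).  prob_estimates must hold nr_class
 * doubles and is filled in the order of model_->label.
 *
 *     if(check_probability_model(model_))
 *     {
 *         double *pe = (double *) malloc(model_->nr_class * sizeof(double));
 *         double predicted = predict_probability(model_, x, pe);
 *         // pe[i] is the estimated probability of class model_->label[i]
 *         free(pe);
 *     }
 */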
3380
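// Maps solver_type enum values to their names for save_model()/load_model();
// the empty strings are placeholders for unused enum values so that the
// array index matches the enum value.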
3381 static const char *solver_type_table[]=
3382 {
3383         "L2R_LR", "L2R_L2LOSS_SVC_DUAL", "L2R_L2LOSS_SVC", "L2R_L1LOSS_SVC_DUAL", "MCSVM_CS",
3384         "L1R_L2LOSS_SVC", "L1R_LR", "L2R_LR_DUAL",
3385         "", "", "",
3386         "L2R_L2LOSS_SVR", "L2R_L2LOSS_SVR_DUAL", "L2R_L1LOSS_SVR_DUAL",
3387         "", "", "", "", "", "", "",
3388         "ONECLASS_SVM", NULL
3389 };
3390
3391 int save_model(const char *model_file_name, const struct model *model_)
3392 {
3393         int i;
3394         int nr_feature=model_->nr_feature;
3395         int n;
3396         const parameter& param = model_->param;
3397
3398         if(model_->bias>=0)
3399                 n=nr_feature+1;
3400         else
3401                 n=nr_feature;
3402         int w_size = n;
3403         FILE *fp = fopen(model_file_name,"w");
3404         if(fp==NULL) return -1;
3405
3406         char *old_locale = setlocale(LC_ALL, NULL);
3407         if (old_locale)
3408         {
3409                 old_locale = strdup(old_locale);
3410         }
3411         setlocale(LC_ALL, "C");
3412
3413         int nr_w;
3414         if(model_->nr_class==2 && model_->param.solver_type != MCSVM_CS)
3415                 nr_w=1;
3416         else
3417                 nr_w=model_->nr_class;
3418
3419         fprintf(fp, "solver_type %s\n", solver_type_table[param.solver_type]);
3420         fprintf(fp, "nr_class %d\n", model_->nr_class);
3421
3422         if(model_->label)
3423         {
3424                 fprintf(fp, "label");
3425                 for(i=0; i<model_->nr_class; i++)
3426                         fprintf(fp, " %d", model_->label[i]);
3427                 fprintf(fp, "\n");
3428         }
3429
3430         fprintf(fp, "nr_feature %d\n", nr_feature);
3431
3432         fprintf(fp, "bias %.17g\n", model_->bias);
3433
3434         if(check_oneclass_model(model_))
3435                 fprintf(fp, "rho %.17g\n", model_->rho);
3436
3437         fprintf(fp, "w\n");
3438         for(i=0; i<w_size; i++)
3439         {
3440                 int j;
3441                 for(j=0; j<nr_w; j++)
3442                         fprintf(fp, "%.17g ", model_->w[i*nr_w+j]);
3443                 fprintf(fp, "\n");
3444         }
3445
3446         setlocale(LC_ALL, old_locale);
3447         free(old_locale);
3448
3449         if (ferror(fp) != 0 || fclose(fp) != 0) return -1;
3450         else return 0;
3451 }
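
/*
 * The model file written above is plain text: one "keyword value" header line
 * per entry (solver_type, nr_class, the optional label list, nr_feature,
 * bias, and rho for one-class SVM), followed by a "w" line and then one row
 * of nr_w coefficients per feature (plus one extra row for the bias term
 * when bias >= 0).
 */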
3452
3453 //
3454 // FSCANF helps to handle fscanf failures.
3455 // Its do-while(0) block avoids the dangling-statement ambiguity when
3456 // if (...)
3457 //    FSCANF();
3458 // is used without braces.
3459 //
3460 #define FSCANF(_stream, _format, _var)do\
3461 {\
3462         if (fscanf(_stream, _format, _var) != 1)\
3463         {\
3464                 fprintf(stderr, "ERROR: fscanf failed to read the model\n");\
3465                 EXIT_LOAD_MODEL()\
3466         }\
3467 }while(0)
3468 // EXIT_LOAD_MODEL should NOT end with a semicolon.
3469 #define EXIT_LOAD_MODEL()\
3470 {\
3471         setlocale(LC_ALL, old_locale);\
3472         free(model_->label);\
3473         free(model_);\
3474         free(old_locale);\
3475         return NULL;\
3476 }
3477 struct model *load_model(const char *model_file_name)
3478 {
3479         FILE *fp = fopen(model_file_name,"r");
3480         if(fp==NULL) return NULL;
3481
3482         int i;
3483         int nr_feature;
3484         int n;
3485         int nr_class;
3486         double bias;
3487         double rho;
3488         model *model_ = Malloc(model,1);
3489         parameter& param = model_->param;
3490         // training-only parameters are not stored in the model file, so they are not read here; their array members are set to NULL for safety
3491         param.nr_weight = 0;
3492         param.weight_label = NULL;
3493         param.weight = NULL;
3494         param.init_sol = NULL;
3495
3496         model_->label = NULL;
3497
3498         char *old_locale = setlocale(LC_ALL, NULL);
3499         if (old_locale)
3500         {
3501                 old_locale = strdup(old_locale);
3502         }
3503         setlocale(LC_ALL, "C");
3504
3505         char cmd[81];
3506         while(1)
3507         {
3508                 FSCANF(fp,"%80s",cmd);
3509                 if(strcmp(cmd,"solver_type")==0)
3510                 {
3511                         FSCANF(fp,"%80s",cmd);
3512                         int i;
3513                         for(i=0;solver_type_table[i];i++)
3514                         {
3515                                 if(strcmp(solver_type_table[i],cmd)==0)
3516                                 {
3517                                         param.solver_type=i;
3518                                         break;
3519                                 }
3520                         }
3521                         if(solver_type_table[i] == NULL)
3522                         {
3523                                 fprintf(stderr,"unknown solver type.\n");
3524                                 EXIT_LOAD_MODEL()
3525                         }
3526                 }
3527                 else if(strcmp(cmd,"nr_class")==0)
3528                 {
3529                         FSCANF(fp,"%d",&nr_class);
3530                         model_->nr_class=nr_class;
3531                 }
3532                 else if(strcmp(cmd,"nr_feature")==0)
3533                 {
3534                         FSCANF(fp,"%d",&nr_feature);
3535                         model_->nr_feature=nr_feature;
3536                 }
3537                 else if(strcmp(cmd,"bias")==0)
3538                 {
3539                         FSCANF(fp,"%lf",&bias);
3540                         model_->bias=bias;
3541                 }
3542                 else if(strcmp(cmd,"rho")==0)
3543                 {
3544                         FSCANF(fp,"%lf",&rho);
3545                         model_->rho=rho;
3546                 }
3547                 else if(strcmp(cmd,"w")==0)
3548                 {
3549                         break;
3550                 }
3551                 else if(strcmp(cmd,"label")==0)
3552                 {
3553                         int nr_class = model_->nr_class;
3554                         model_->label = Malloc(int,nr_class);
3555                         for(int i=0;i<nr_class;i++)
3556                                 FSCANF(fp,"%d",&model_->label[i]);
3557                 }
3558                 else
3559                 {
3560                         fprintf(stderr,"unknown text in model file: [%s]\n",cmd);
3561                         EXIT_LOAD_MODEL()
3562                 }
3563         }
3564
3565         nr_feature=model_->nr_feature;
3566         if(model_->bias>=0)
3567                 n=nr_feature+1;
3568         else
3569                 n=nr_feature;
3570         int w_size = n;
3571         int nr_w;
3572         if(nr_class==2 && param.solver_type != MCSVM_CS)
3573                 nr_w = 1;
3574         else
3575                 nr_w = nr_class;
3576
3577         model_->w=Malloc(double, w_size*nr_w);
3578         for(i=0; i<w_size; i++)
3579         {
3580                 int j;
3581                 for(j=0; j<nr_w; j++)
3582                         FSCANF(fp, "%lf ", &model_->w[i*nr_w+j]);
3583         }
3584
3585         setlocale(LC_ALL, old_locale);
3586         free(old_locale);
3587
3588         if (ferror(fp) != 0 || fclose(fp) != 0) return NULL;
3589
3590         return model_;
3591 }
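
/*
 * Illustrative sketch (not part of the library): loading a saved model and
 * querying it.  load_model() returns NULL on any I/O or parse error.  The
 * file name "example.model" is hypothetical.
 *
 *     struct model *m = load_model("example.model");
 *     if(m != NULL)
 *     {
 *         int nr_class = get_nr_class(m);
 *         int *labels = (int *) malloc(nr_class * sizeof(int));
 *         get_labels(m, labels);
 *         free(labels);
 *         free_and_destroy_model(&m);
 *     }
 */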
3592
3593 int get_nr_feature(const model *model_)
3594 {
3595         return model_->nr_feature;
3596 }
3597
3598 int get_nr_class(const model *model_)
3599 {
3600         return model_->nr_class;
3601 }
3602
3603 void get_labels(const model *model_, int* label)
3604 {
3605         if (model_->label != NULL)
3606                 for(int i=0;i<model_->nr_class;i++)
3607                         label[i] = model_->label[i];
3608 }
3609
3610 // use inline here for better performance (around 20% faster than a non-inline version)
3611 static inline double get_w_value(const struct model *model_, int idx, int label_idx)
3612 {
3613         int nr_class = model_->nr_class;
3614         int solver_type = model_->param.solver_type;
3615         const double *w = model_->w;
3616
3617         if(idx < 0 || idx > model_->nr_feature)
3618                 return 0;
3619         if(check_regression_model(model_) || check_oneclass_model(model_))
3620                 return w[idx];
3621         else
3622         {
3623                 if(label_idx < 0 || label_idx >= nr_class)
3624                         return 0;
3625                 if(nr_class == 2 && solver_type != MCSVM_CS)
3626                 {
3627                         if(label_idx == 0)
3628                                 return w[idx];
3629                         else
3630                                 return -w[idx];
3631                 }
3632                 else
3633                         return w[idx*nr_class+label_idx];
3634         }
3635 }
3636
3637 // feat_idx: ranges from 1 to nr_feature
3638 // label_idx: ranges from 0 to nr_class-1 for classification models;
3639 //            for regression and one-class SVM models, label_idx is
3640 //            ignored.
3641 double get_decfun_coef(const struct model *model_, int feat_idx, int label_idx)
3642 {
3643         if(feat_idx > model_->nr_feature)
3644                 return 0;
3645         return get_w_value(model_, feat_idx-1, label_idx);
3646 }
3647
3648 double get_decfun_bias(const struct model *model_, int label_idx)
3649 {
3650         if(check_oneclass_model(model_))
3651         {
3652                 fprintf(stderr, "ERROR: get_decfun_bias cannot be called for a one-class SVM model\n");
3653                 return 0;
3654         }
3655         int bias_idx = model_->nr_feature;
3656         double bias = model_->bias;
3657         if(bias <= 0)
3658                 return 0;
3659         else
3660                 return bias*get_w_value(model_, bias_idx, label_idx);
3661 }
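
/*
 * Illustrative sketch (not part of the library): reconstructing the decision
 * value for class label[k] of a classification model from the accessors
 * above, assuming a dense, 1-indexed feature array feat[1..nr_feature]
 * (a hypothetical array for this example).
 *
 *     double dec = get_decfun_bias(model_, k);
 *     for(int j = 1; j <= get_nr_feature(model_); j++)
 *         dec += get_decfun_coef(model_, j, k) * feat[j];
 *     // For binary models (nr_class == 2, solver != MCSVM_CS) only one weight
 *     // vector is stored: k = 0 returns +w and k = 1 returns -w, so with
 *     // k = 0, dec > 0 favours label[0].
 */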
3662
3663 double get_decfun_rho(const struct model *model_)
3664 {
3665         if(check_oneclass_model(model_))
3666                 return model_->rho;
3667         else
3668         {
3669                 fprintf(stderr, "ERROR: get_decfun_rho can be called only for a one-class SVM model\n");
3670                 return 0;
3671         }
3672 }
3673
3674 void free_model_content(struct model *model_ptr)
3675 {
3676         if(model_ptr->w != NULL)
3677                 free(model_ptr->w);
3678         if(model_ptr->label != NULL)
3679                 free(model_ptr->label);
3680 }
3681
3682 void free_and_destroy_model(struct model **model_ptr_ptr)
3683 {
3684         struct model *model_ptr = *model_ptr_ptr;
3685         if(model_ptr != NULL)
3686         {
3687                 free_model_content(model_ptr);
3688                 free(model_ptr);
3689         }
3690 }
3691
3692 void destroy_param(parameter* param)
3693 {
3694         if(param->weight_label != NULL)
3695                 free(param->weight_label);
3696         if(param->weight != NULL)
3697                 free(param->weight);
3698         if(param->init_sol != NULL)
3699                 free(param->init_sol);
3700 }
3701
3702 const char *check_parameter(const problem *prob, const parameter *param)
3703 {
3704         if(param->eps <= 0)
3705                 return "eps <= 0";
3706
3707         if(param->C <= 0)
3708                 return "C <= 0";
3709
3710         if(param->p < 0)
3711                 return "p < 0";
3712
3713         if(prob->bias >= 0 && param->solver_type == ONECLASS_SVM)
3714                 return "prob->bias >=0, but this is ignored in ONECLASS_SVM";
3715
3716         if(param->regularize_bias == 0)
3717         {
3718                 if(prob->bias != 1.0)
3719                         return "To not regularize bias, must specify -B 1 along with -R";
3720                 if(param->solver_type != L2R_LR
3721                         && param->solver_type != L2R_L2LOSS_SVC
3722                         && param->solver_type != L1R_L2LOSS_SVC
3723                         && param->solver_type != L1R_LR
3724                         && param->solver_type != L2R_L2LOSS_SVR)
3725                         return "-R option supported only for solver L2R_LR, L2R_L2LOSS_SVC, L1R_L2LOSS_SVC, L1R_LR, and L2R_L2LOSS_SVR";
3726         }
3727
3728         if(param->solver_type != L2R_LR
3729                 && param->solver_type != L2R_L2LOSS_SVC_DUAL
3730                 && param->solver_type != L2R_L2LOSS_SVC
3731                 && param->solver_type != L2R_L1LOSS_SVC_DUAL
3732                 && param->solver_type != MCSVM_CS
3733                 && param->solver_type != L1R_L2LOSS_SVC
3734                 && param->solver_type != L1R_LR
3735                 && param->solver_type != L2R_LR_DUAL
3736                 && param->solver_type != L2R_L2LOSS_SVR
3737                 && param->solver_type != L2R_L2LOSS_SVR_DUAL
3738                 && param->solver_type != L2R_L1LOSS_SVR_DUAL
3739                 && param->solver_type != ONECLASS_SVM)
3740                 return "unknown solver type";
3741
3742         if(param->init_sol != NULL
3743                 && param->solver_type != L2R_LR
3744                 && param->solver_type != L2R_L2LOSS_SVC
3745                 && param->solver_type != L2R_L2LOSS_SVR)
3746                 return "Initial-solution specification supported only for solvers L2R_LR, L2R_L2LOSS_SVC, and L2R_L2LOSS_SVR";
3747
3748         return NULL;
3749 }
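
// check_parameter() is intended to be called before train() or
// cross_validation(); it returns NULL on success or a pointer to a static
// error string describing the first problem found.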
3750
3751 int check_probability_model(const struct model *model_)
3752 {
3753         return (model_->param.solver_type==L2R_LR ||
3754                         model_->param.solver_type==L2R_LR_DUAL ||
3755                         model_->param.solver_type==L1R_LR);
3756 }
3757
3758 int check_regression_model(const struct model *model_)
3759 {
3760         return (model_->param.solver_type==L2R_L2LOSS_SVR ||
3761                         model_->param.solver_type==L2R_L1LOSS_SVR_DUAL ||
3762                         model_->param.solver_type==L2R_L2LOSS_SVR_DUAL);
3763 }
3764
3765 int check_oneclass_model(const struct model *model_)
3766 {
3767         return model_->param.solver_type == ONECLASS_SVM;
3768 }
3769
3770 void set_print_string_function(void (*print_func)(const char*))
3771 {
3772         if (print_func == NULL)
3773                 liblinear_print_string = &print_string_stdout;
3774         else
3775                 liblinear_print_string = print_func;
3776 }
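
/*
 * Illustrative sketch (not part of the library): silencing the solver's
 * progress output from user code by installing a no-op printer (the name
 * quiet_print is hypothetical).
 *
 *     static void quiet_print(const char *s) { }       // at file scope
 *
 *     set_print_string_function(&quiet_print);          // before training
 *     set_print_string_function(NULL);                  // restore stdout output
 */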
3777