1 #include <math.h>
2 #include <stdio.h>
3 #include <stdlib.h>
4 #include <string.h>
5 #include <stdarg.h>
6 #include <locale.h>
7 #include "linear.h"
8 #include "newton.h"
9 int liblinear_version = LIBLINEAR_VERSION;
10 typedef signed char schar;
11 template <class T> static inline void swap(T& x, T& y) { T t=x; x=y; y=t; }
12 #ifndef min
13 template <class T> static inline T min(T x,T y) { return (x<y)?x:y; }
14 #endif
15 #ifndef max
16 template <class T> static inline T max(T x,T y) { return (x>y)?x:y; }
17 #endif
18 template <class S, class T> static inline void clone(T*& dst, S* src, int n)
19 {
20         dst = new T[n];
21         memcpy((void *)dst,(void *)src,sizeof(T)*n);
22 }
23 #define INF HUGE_VAL
24 #define Malloc(type,n) (type *)malloc((n)*sizeof(type))
25
26 static void print_string_stdout(const char *s)
27 {
28         fputs(s,stdout);
29         fflush(stdout);
30 }
31 static void print_null(const char *s) {}
32
33 static void (*liblinear_print_string) (const char *) = &print_string_stdout;
34
35 #if 1
36 static void info(const char *fmt,...)
37 {
38         char buf[BUFSIZ];
39         va_list ap;
40         va_start(ap,fmt);
41         vsprintf(buf,fmt,ap);
42         va_end(ap);
43         (*liblinear_print_string)(buf);
44 }
45 #else
46 static void info(const char *fmt,...) {}
47 #endif
48 class sparse_operator
49 {
50 public:
51         static double nrm2_sq(const feature_node *x)
52         {
53                 double ret = 0;
54                 while(x->index != -1)
55                 {
56                         ret += x->value*x->value;
57                         x++;
58                 }
59                 return (ret);
60         }
61
62         static double dot(const double *s, const feature_node *x)
63         {
64                 double ret = 0;
65                 while(x->index != -1)
66                 {
67                         ret += s[x->index-1]*x->value;
68                         x++;
69                 }
70                 return (ret);
71         }
72
73         static double sparse_dot(const feature_node *x1, const feature_node *x2)
74         {
75                 double ret = 0;
76                 while(x1->index != -1 && x2->index != -1)
77                 {
78                         if(x1->index == x2->index)
79                         {
80                                 ret += x1->value * x2->value;
81                                 ++x1;
82                                 ++x2;
83                         }
84                         else
85                         {
86                                 if(x1->index > x2->index)
87                                         ++x2;
88                                 else
89                                         ++x1;
90                         }
91                 }
92                 return (ret);
93         }
94
95         static void axpy(const double a, const feature_node *x, double *y)
96         {
97                 while(x->index != -1)
98                 {
99                         y[x->index-1] += a*x->value;
100                         x++;
101                 }
102         }
103 };
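
// Illustrative sketch (not part of the library): instances are stored as arrays of
// feature_node terminated by index == -1, and the helpers above walk that sparse
// format. Assuming such an array x and a dense vector v, one might write:
//
//   feature_node x[] = {{1, 0.5}, {3, -2.0}, {-1, 0.0}}; // sparse vector 0.5*e1 - 2.0*e3
//   double v[3] = {1.0, 1.0, 1.0};
//   double sq  = sparse_operator::nrm2_sq(x);   // 0.5*0.5 + (-2.0)*(-2.0) = 4.25
//   double dot = sparse_operator::dot(v, x);    // v[0]*0.5 + v[2]*(-2.0)  = -1.5
//   sparse_operator::axpy(2.0, x, v);           // v += 2.0*x  ->  {2.0, 1.0, -3.0}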
104
105 // L2-regularized empirical risk minimization
106 // min_w w^Tw/2 + \sum C_i \xi(w^Tx_i), where \xi() is the loss
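//
// The derived classes below supply \xi() through C_times_loss():
//   l2r_lr_fun:      C_i * log(1 + exp(-y_i w^T x_i))        (logistic loss)
//   l2r_l2_svc_fun:  C_i * max(0, 1 - y_i w^T x_i)^2         (squared hinge loss)
//   l2r_l2_svr_fun:  C_i * max(0, |w^T x_i - y_i| - p)^2     (epsilon-insensitive squared loss)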
107
108 class l2r_erm_fun: public function
109 {
110 public:
111         l2r_erm_fun(const problem *prob, const parameter *param, double *C);
112         ~l2r_erm_fun();
113
114         double fun(double *w);
115         double linesearch_and_update(double *w, double *d, double *f, double *g, double alpha);
116         int get_nr_variable(void);
117
118 protected:
119         virtual double C_times_loss(int i, double wx_i) = 0;
120         void Xv(double *v, double *Xv);
121         void XTv(double *v, double *XTv);
122
123         double *C;
124         const problem *prob;
125         double *wx;
126         double *tmp; // a working array
127         double wTw;
128         int regularize_bias;
129 };
130
131 l2r_erm_fun::l2r_erm_fun(const problem *prob, const parameter *param, double *C)
132 {
133         int l=prob->l;
134
135         this->prob = prob;
136
137         wx = new double[l];
138         tmp = new double[l];
139         this->C = C;
140         this->regularize_bias = param->regularize_bias;
141 }
142
143 l2r_erm_fun::~l2r_erm_fun()
144 {
145         delete[] wx;
146         delete[] tmp;
147 }
148
149 double l2r_erm_fun::fun(double *w)
150 {
151         int i;
152         double f=0;
153         int l=prob->l;
154         int w_size=get_nr_variable();
155
156         wTw = 0;
157         Xv(w, wx);
158
159         for(i=0;i<w_size;i++)
160                 wTw += w[i]*w[i];
161         if(regularize_bias == 0)
162                 wTw -= w[w_size-1]*w[w_size-1];
163         for(i=0;i<l;i++)
164                 f += C_times_loss(i, wx[i]);
165         f = f + 0.5 * wTw;
166
167         return(f);
168 }
169
170 int l2r_erm_fun::get_nr_variable(void)
171 {
172         return prob->n;
173 }
174
175 // On entry *f must be the function value of w
176 // On exit w is updated and *f is the new function value
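//
// The step is found by simple backtracking: starting from the given alpha, alpha is
// halved (at most max_num_linesearch times) until the sufficient-decrease condition
//   f(w + alpha*s) - f(w) <= eta * alpha * g^T s,   with eta = 0.01,
// holds. On success the accepted alpha is returned; on failure 0 is returned and
// w and *f are left unchanged.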
177 double l2r_erm_fun::linesearch_and_update(double *w, double *s, double *f, double *g, double alpha)
178 {
179         int i;
180         int l = prob->l;
181         double sTs = 0;
182         double wTs = 0;
183         double gTs = 0;
184         double eta = 0.01;
185         int w_size = get_nr_variable();
186         int max_num_linesearch = 20;
187         double fold = *f;
188         Xv(s, tmp);
189
190         for (i=0;i<w_size;i++)
191         {
192                 sTs += s[i] * s[i];
193                 wTs += s[i] * w[i];
194                 gTs += s[i] * g[i];
195         }
196         if(regularize_bias == 0)
197         {
198                 // bias not used in calculating (w + \alpha s)^T (w + \alpha s)
199                 sTs -= s[w_size-1] * s[w_size-1];
200                 wTs -= s[w_size-1] * w[w_size-1];
201         }
202
203         int num_linesearch = 0;
204         for(num_linesearch=0; num_linesearch < max_num_linesearch; num_linesearch++)
205         {
206                 double loss = 0;
207                 for(i=0;i<l;i++)
208                 {
209                         double inner_product = tmp[i] * alpha + wx[i];
210                         loss += C_times_loss(i, inner_product);
211                 }
212                 *f = loss + (alpha * alpha * sTs + wTw) / 2.0 + alpha * wTs;
213                 if (*f - fold <= eta * alpha * gTs)
214                 {
215                         for (i=0;i<l;i++)
216                                 wx[i] += alpha * tmp[i];
217                         break;
218                 }
219                 else
220                         alpha *= 0.5;
221         }
222
223         if (num_linesearch >= max_num_linesearch)
224         {
225                 *f = fold;
226                 return 0;
227         }
228         else
229                 for (i=0;i<w_size;i++)
230                         w[i] += alpha * s[i];
231
232         wTw += alpha * alpha * sTs + 2* alpha * wTs;
233         return alpha;
234 }
235
236 void l2r_erm_fun::Xv(double *v, double *Xv)
237 {
238         int i;
239         int l=prob->l;
240         feature_node **x=prob->x;
241
242         for(i=0;i<l;i++)
243                 Xv[i]=sparse_operator::dot(v, x[i]);
244 }
245
246 void l2r_erm_fun::XTv(double *v, double *XTv)
247 {
248         int i;
249         int l=prob->l;
250         int w_size=get_nr_variable();
251         feature_node **x=prob->x;
252
253         for(i=0;i<w_size;i++)
254                 XTv[i]=0;
255         for(i=0;i<l;i++)
256                 sparse_operator::axpy(v[i], x[i], XTv);
257 }
258
259 class l2r_lr_fun: public l2r_erm_fun
260 {
261 public:
262         l2r_lr_fun(const problem *prob, const parameter *param, double *C);
263         ~l2r_lr_fun();
264
265         void grad(double *w, double *g);
266         void Hv(double *s, double *Hs);
267
268         void get_diag_preconditioner(double *M);
269
270 private:
271         double *D;
272         double C_times_loss(int i, double wx_i);
273 };
274
275 l2r_lr_fun::l2r_lr_fun(const problem *prob, const parameter *param, double *C):
276         l2r_erm_fun(prob, param, C)
277 {
278         int l=prob->l;
279         D = new double[l];
280 }
281
282 l2r_lr_fun::~l2r_lr_fun()
283 {
284         delete[] D;
285 }
286
287 double l2r_lr_fun::C_times_loss(int i, double wx_i)
288 {
289         double ywx_i = wx_i * prob->y[i];
290         if (ywx_i >= 0)
291                 return C[i]*log(1 + exp(-ywx_i));
292         else
293                 return C[i]*(-ywx_i + log(1 + exp(ywx_i)));
294 }
295
296 void l2r_lr_fun::grad(double *w, double *g)
297 {
298         int i;
299         double *y=prob->y;
300         int l=prob->l;
301         int w_size=get_nr_variable();
302
303         for(i=0;i<l;i++)
304         {
305                 tmp[i] = 1/(1 + exp(-y[i]*wx[i]));
306                 D[i] = tmp[i]*(1-tmp[i]);
307                 tmp[i] = C[i]*(tmp[i]-1)*y[i];
308         }
309         XTv(tmp, g);
310
311         for(i=0;i<w_size;i++)
312                 g[i] = w[i] + g[i];
313         if(regularize_bias == 0)
314                 g[w_size-1] -= w[w_size-1];
315 }
316
317 void l2r_lr_fun::get_diag_preconditioner(double *M)
318 {
319         int i;
320         int l = prob->l;
321         int w_size=get_nr_variable();
322         feature_node **x = prob->x;
323
324         for (i=0; i<w_size; i++)
325                 M[i] = 1;
326         if(regularize_bias == 0)
327                 M[w_size-1] = 0;
328
329         for (i=0; i<l; i++)
330         {
331                 feature_node *xi = x[i];
332                 while (xi->index!=-1)
333                 {
334                         M[xi->index-1] += xi->value*xi->value*C[i]*D[i];
335                         xi++;
336                 }
337         }
338 }
339
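// Hessian-vector product for L2-regularized logistic regression. With
//   D_ii = sigma(y_i w^T x_i) * (1 - sigma(y_i w^T x_i))
// computed in grad() above, the routine below evaluates
//   Hs = s + X^T (C .* D .* (X s))
// (the identity term comes from the w^T w / 2 regularizer; it is dropped for the
// bias entry when regularize_bias == 0) without ever forming the Hessian explicitly.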
340 void l2r_lr_fun::Hv(double *s, double *Hs)
341 {
342         int i;
343         int l=prob->l;
344         int w_size=get_nr_variable();
345         feature_node **x=prob->x;
346
347         for(i=0;i<w_size;i++)
348                 Hs[i] = 0;
349         for(i=0;i<l;i++)
350         {
351                 feature_node * const xi=x[i];
352                 double xTs = sparse_operator::dot(s, xi);
353
354                 xTs = C[i]*D[i]*xTs;
355
356                 sparse_operator::axpy(xTs, xi, Hs);
357         }
358         for(i=0;i<w_size;i++)
359                 Hs[i] = s[i] + Hs[i];
360         if(regularize_bias == 0)
361                 Hs[w_size-1] -= s[w_size-1];
362 }
363
364 class l2r_l2_svc_fun: public l2r_erm_fun
365 {
366 public:
367         l2r_l2_svc_fun(const problem *prob, const parameter *param, double *C);
368         ~l2r_l2_svc_fun();
369
370         void grad(double *w, double *g);
371         void Hv(double *s, double *Hs);
372
373         void get_diag_preconditioner(double *M);
374
375 protected:
376         void subXTv(double *v, double *XTv);
377
378         int *I;
379         int sizeI;
380
381 private:
382         double C_times_loss(int i, double wx_i);
383 };
384
385 l2r_l2_svc_fun::l2r_l2_svc_fun(const problem *prob, const parameter *param, double *C):
386         l2r_erm_fun(prob, param, C)
387 {
388         I = new int[prob->l];
389 }
390
391 l2r_l2_svc_fun::~l2r_l2_svc_fun()
392 {
393         delete[] I;
394 }
395
396 double l2r_l2_svc_fun::C_times_loss(int i, double wx_i)
397 {
398         double d = 1 - prob->y[i] * wx_i;
399         if (d > 0)
400                 return C[i] * d * d;
401         else
402                 return 0;
403 }
404
405 void l2r_l2_svc_fun::grad(double *w, double *g)
406 {
407         int i;
408         double *y=prob->y;
409         int l=prob->l;
410         int w_size=get_nr_variable();
411
412         sizeI = 0;
413         for (i=0;i<l;i++)
414         {
415                 tmp[i] = wx[i] * y[i];
416                 if (tmp[i] < 1)
417                 {
418                         tmp[sizeI] = C[i]*y[i]*(tmp[i]-1);
419                         I[sizeI] = i;
420                         sizeI++;
421                 }
422         }
423         subXTv(tmp, g);
424
425         for(i=0;i<w_size;i++)
426                 g[i] = w[i] + 2*g[i];
427         if(regularize_bias == 0)
428                 g[w_size-1] -= w[w_size-1];
429 }
430
431 void l2r_l2_svc_fun::get_diag_preconditioner(double *M)
432 {
433         int i;
434         int w_size=get_nr_variable();
435         feature_node **x = prob->x;
436
437         for (i=0; i<w_size; i++)
438                 M[i] = 1;
439         if(regularize_bias == 0)
440                 M[w_size-1] = 0;
441
442         for (i=0; i<sizeI; i++)
443         {
444                 int idx = I[i];
445                 feature_node *xi = x[idx];
446                 while (xi->index!=-1)
447                 {
448                         M[xi->index-1] += xi->value*xi->value*C[idx]*2;
449                         xi++;
450                 }
451         }
452 }
453
454 void l2r_l2_svc_fun::Hv(double *s, double *Hs)
455 {
456         int i;
457         int w_size=get_nr_variable();
458         feature_node **x=prob->x;
459
460         for(i=0;i<w_size;i++)
461                 Hs[i]=0;
462         for(i=0;i<sizeI;i++)
463         {
464                 feature_node * const xi=x[I[i]];
465                 double xTs = sparse_operator::dot(s, xi);
466
467                 xTs = C[I[i]]*xTs;
468
469                 sparse_operator::axpy(xTs, xi, Hs);
470         }
471         for(i=0;i<w_size;i++)
472                 Hs[i] = s[i] + 2*Hs[i];
473         if(regularize_bias == 0)
474                 Hs[w_size-1] -= s[w_size-1];
475 }
476
477 void l2r_l2_svc_fun::subXTv(double *v, double *XTv)
478 {
479         int i;
480         int w_size=get_nr_variable();
481         feature_node **x=prob->x;
482
483         for(i=0;i<w_size;i++)
484                 XTv[i]=0;
485         for(i=0;i<sizeI;i++)
486                 sparse_operator::axpy(v[i], x[I[i]], XTv);
487 }
488
489 class l2r_l2_svr_fun: public l2r_l2_svc_fun
490 {
491 public:
492         l2r_l2_svr_fun(const problem *prob, const parameter *param, double *C);
493
494         void grad(double *w, double *g);
495
496 private:
497         double C_times_loss(int i, double wx_i);
498         double p;
499 };
500
501 l2r_l2_svr_fun::l2r_l2_svr_fun(const problem *prob, const parameter *param, double *C):
502         l2r_l2_svc_fun(prob, param, C)
503 {
504         this->p = param->p;
505         this->regularize_bias = param->regularize_bias;
506 }
507
508 double l2r_l2_svr_fun::C_times_loss(int i, double wx_i)
509 {
510         double d = wx_i - prob->y[i];
511         if(d < -p)
512                 return C[i]*(d+p)*(d+p);
513         else if(d > p)
514                 return C[i]*(d-p)*(d-p);
515         return 0;
516 }
517
518 void l2r_l2_svr_fun::grad(double *w, double *g)
519 {
520         int i;
521         double *y=prob->y;
522         int l=prob->l;
523         int w_size=get_nr_variable();
524         double d;
525
526         sizeI = 0;
527         for(i=0;i<l;i++)
528         {
529                 d = wx[i] - y[i];
530
531                 // generate index set I
532                 if(d < -p)
533                 {
534                         tmp[sizeI] = C[i]*(d+p);
535                         I[sizeI] = i;
536                         sizeI++;
537                 }
538                 else if(d > p)
539                 {
540                         tmp[sizeI] = C[i]*(d-p);
541                         I[sizeI] = i;
542                         sizeI++;
543                 }
544
545         }
546         subXTv(tmp, g);
547
548         for(i=0;i<w_size;i++)
549                 g[i] = w[i] + 2*g[i];
550         if(regularize_bias == 0)
551                 g[w_size-1] -= w[w_size-1];
552 }
553
554 // A coordinate descent algorithm for
555 // multi-class support vector machines by Crammer and Singer
556 //
557 //  min_{\alpha}  0.5 \sum_m ||w_m(\alpha)||^2 + \sum_i \sum_m e^m_i alpha^m_i
558 //    s.t.     \alpha^m_i <= C^m_i \forall m,i , \sum_m \alpha^m_i=0 \forall i
559 //
560 //  where e^m_i = 0 if y_i  = m,
561 //        e^m_i = 1 if y_i != m,
562 //  C^m_i = C if m  = y_i,
563 //  C^m_i = 0 if m != y_i,
564 //  and w_m(\alpha) = \sum_i \alpha^m_i x_i
565 //
566 // Given:
567 // x, y, C
568 // eps is the stopping tolerance
569 //
570 // solution will be put in w
571 //
572 // See Appendix of LIBLINEAR paper, Fan et al. (2008)
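//
// Minimal usage sketch (illustrative only; prob, nr_class, weighted_C, and param
// are assumed to be prepared by the caller):
//
//   double *w = new double[prob->n * nr_class];
//   Solver_MCSVM_CS solver(prob, nr_class, weighted_C, param->eps);
//   solver.Solve(w);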
573
574 #define GETI(i) ((int) prob->y[i])
575 // To support instance weights, define GETI(i) as (i) instead
576
577 class Solver_MCSVM_CS
578 {
579         public:
580                 Solver_MCSVM_CS(const problem *prob, int nr_class, double *C, double eps=0.1, int max_iter=100000);
581                 ~Solver_MCSVM_CS();
582                 void Solve(double *w);
583         private:
584                 void solve_sub_problem(double A_i, int yi, double C_yi, int active_i, double *alpha_new);
585                 bool be_shrunk(int i, int m, int yi, double alpha_i, double minG);
586                 double *B, *C, *G;
587                 int w_size, l;
588                 int nr_class;
589                 int max_iter;
590                 double eps;
591                 const problem *prob;
592 };
593
594 Solver_MCSVM_CS::Solver_MCSVM_CS(const problem *prob, int nr_class, double *weighted_C, double eps, int max_iter)
595 {
596         this->w_size = prob->n;
597         this->l = prob->l;
598         this->nr_class = nr_class;
599         this->eps = eps;
600         this->max_iter = max_iter;
601         this->prob = prob;
602         this->B = new double[nr_class];
603         this->G = new double[nr_class];
604         this->C = weighted_C;
605 }
606
607 Solver_MCSVM_CS::~Solver_MCSVM_CS()
608 {
609         delete[] B;
610         delete[] G;
611 }
612
613 int compare_double(const void *a, const void *b)
614 {
615         if(*(double *)a > *(double *)b)
616                 return -1;
617         if(*(double *)a < *(double *)b)
618                 return 1;
619         return 0;
620 }
621
622 void Solver_MCSVM_CS::solve_sub_problem(double A_i, int yi, double C_yi, int active_i, double *alpha_new)
623 {
624         int r;
625         double *D;
626
627         clone(D, B, active_i);
628         if(yi < active_i)
629                 D[yi] += A_i*C_yi;
630         qsort(D, active_i, sizeof(double), compare_double);
631
632         double beta = D[0] - A_i*C_yi;
633         for(r=1;r<active_i && beta<r*D[r];r++)
634                 beta += D[r];
635         beta /= r;
636
637         for(r=0;r<active_i;r++)
638         {
639                 if(r == yi)
640                         alpha_new[r] = min(C_yi, (beta-B[r])/A_i);
641                 else
642                         alpha_new[r] = min((double)0, (beta - B[r])/A_i);
643         }
644         delete[] D;
645 }
646
647 bool Solver_MCSVM_CS::be_shrunk(int i, int m, int yi, double alpha_i, double minG)
648 {
649         double bound = 0;
650         if(m == yi)
651                 bound = C[GETI(i)];
652         if(alpha_i == bound && G[m] < minG)
653                 return true;
654         return false;
655 }
656
657 void Solver_MCSVM_CS::Solve(double *w)
658 {
659         int i, m, s;
660         int iter = 0;
661         double *alpha =  new double[l*nr_class];
662         double *alpha_new = new double[nr_class];
663         int *index = new int[l];
664         double *QD = new double[l];
665         int *d_ind = new int[nr_class];
666         double *d_val = new double[nr_class];
667         int *alpha_index = new int[nr_class*l];
668         int *y_index = new int[l];
669         int active_size = l;
670         int *active_size_i = new int[l];
671         double eps_shrink = max(10.0*eps, 1.0); // stopping tolerance for shrinking
672         bool start_from_all = true;
673
674         // Initial alpha can be set here. Note that
675         // sum_m alpha[i*nr_class+m] = 0, for all i=1,...,l-1
676         // alpha[i*nr_class+m] <= C[GETI(i)] if prob->y[i] == m
677         // alpha[i*nr_class+m] <= 0 if prob->y[i] != m
678         // If initial alpha isn't zero, uncomment the for loop below to initialize w
679         for(i=0;i<l*nr_class;i++)
680                 alpha[i] = 0;
681
682         for(i=0;i<w_size*nr_class;i++)
683                 w[i] = 0;
684         for(i=0;i<l;i++)
685         {
686                 for(m=0;m<nr_class;m++)
687                         alpha_index[i*nr_class+m] = m;
688                 feature_node *xi = prob->x[i];
689                 QD[i] = 0;
690                 while(xi->index != -1)
691                 {
692                         double val = xi->value;
693                         QD[i] += val*val;
694
695                         // Uncomment the for loop if initial alpha isn't zero
696                         // for(m=0; m<nr_class; m++)
697                         //      w[(xi->index-1)*nr_class+m] += alpha[i*nr_class+m]*val;
698                         xi++;
699                 }
700                 active_size_i[i] = nr_class;
701                 y_index[i] = (int)prob->y[i];
702                 index[i] = i;
703         }
704
705         while(iter < max_iter)
706         {
707                 double stopping = -INF;
708                 for(i=0;i<active_size;i++)
709                 {
710                         int j = i+rand()%(active_size-i);
711                         swap(index[i], index[j]);
712                 }
713                 for(s=0;s<active_size;s++)
714                 {
715                         i = index[s];
716                         double Ai = QD[i];
717                         double *alpha_i = &alpha[i*nr_class];
718                         int *alpha_index_i = &alpha_index[i*nr_class];
719
720                         if(Ai > 0)
721                         {
722                                 for(m=0;m<active_size_i[i];m++)
723                                         G[m] = 1;
724                                 if(y_index[i] < active_size_i[i])
725                                         G[y_index[i]] = 0;
726
727                                 feature_node *xi = prob->x[i];
728                                 while(xi->index!= -1)
729                                 {
730                                         double *w_i = &w[(xi->index-1)*nr_class];
731                                         for(m=0;m<active_size_i[i];m++)
732                                                 G[m] += w_i[alpha_index_i[m]]*(xi->value);
733                                         xi++;
734                                 }
735
736                                 double minG = INF;
737                                 double maxG = -INF;
738                                 for(m=0;m<active_size_i[i];m++)
739                                 {
740                                         if(alpha_i[alpha_index_i[m]] < 0 && G[m] < minG)
741                                                 minG = G[m];
742                                         if(G[m] > maxG)
743                                                 maxG = G[m];
744                                 }
745                                 if(y_index[i] < active_size_i[i])
746                                         if(alpha_i[(int) prob->y[i]] < C[GETI(i)] && G[y_index[i]] < minG)
747                                                 minG = G[y_index[i]];
748
749                                 for(m=0;m<active_size_i[i];m++)
750                                 {
751                                         if(be_shrunk(i, m, y_index[i], alpha_i[alpha_index_i[m]], minG))
752                                         {
753                                                 active_size_i[i]--;
754                                                 while(active_size_i[i]>m)
755                                                 {
756                                                         if(!be_shrunk(i, active_size_i[i], y_index[i],
757                                                                                         alpha_i[alpha_index_i[active_size_i[i]]], minG))
758                                                         {
759                                                                 swap(alpha_index_i[m], alpha_index_i[active_size_i[i]]);
760                                                                 swap(G[m], G[active_size_i[i]]);
761                                                                 if(y_index[i] == active_size_i[i])
762                                                                         y_index[i] = m;
763                                                                 else if(y_index[i] == m)
764                                                                         y_index[i] = active_size_i[i];
765                                                                 break;
766                                                         }
767                                                         active_size_i[i]--;
768                                                 }
769                                         }
770                                 }
771
772                                 if(active_size_i[i] <= 1)
773                                 {
774                                         active_size--;
775                                         swap(index[s], index[active_size]);
776                                         s--;
777                                         continue;
778                                 }
779
780                                 if(maxG-minG <= 1e-12)
781                                         continue;
782                                 else
783                                         stopping = max(maxG - minG, stopping);
784
785                                 for(m=0;m<active_size_i[i];m++)
786                                         B[m] = G[m] - Ai*alpha_i[alpha_index_i[m]] ;
787
788                                 solve_sub_problem(Ai, y_index[i], C[GETI(i)], active_size_i[i], alpha_new);
789                                 int nz_d = 0;
790                                 for(m=0;m<active_size_i[i];m++)
791                                 {
792                                         double d = alpha_new[m] - alpha_i[alpha_index_i[m]];
793                                         alpha_i[alpha_index_i[m]] = alpha_new[m];
794                                         if(fabs(d) >= 1e-12)
795                                         {
796                                                 d_ind[nz_d] = alpha_index_i[m];
797                                                 d_val[nz_d] = d;
798                                                 nz_d++;
799                                         }
800                                 }
801
802                                 xi = prob->x[i];
803                                 while(xi->index != -1)
804                                 {
805                                         double *w_i = &w[(xi->index-1)*nr_class];
806                                         for(m=0;m<nz_d;m++)
807                                                 w_i[d_ind[m]] += d_val[m]*xi->value;
808                                         xi++;
809                                 }
810                         }
811                 }
812
813                 iter++;
814                 if(iter % 10 == 0)
815                 {
816                         info(".");
817                 }
818
819                 if(stopping < eps_shrink)
820                 {
821                         if(stopping < eps && start_from_all == true)
822                                 break;
823                         else
824                         {
825                                 active_size = l;
826                                 for(i=0;i<l;i++)
827                                         active_size_i[i] = nr_class;
828                                 info("*");
829                                 eps_shrink = max(eps_shrink/2, eps);
830                                 start_from_all = true;
831                         }
832                 }
833                 else
834                         start_from_all = false;
835         }
836
837         info("\noptimization finished, #iter = %d\n",iter);
838         if (iter >= max_iter)
839                 info("\nWARNING: reaching max number of iterations\n");
840
841         // calculate objective value
842         double v = 0;
843         int nSV = 0;
844         for(i=0;i<w_size*nr_class;i++)
845                 v += w[i]*w[i];
846         v = 0.5*v;
847         for(i=0;i<l*nr_class;i++)
848         {
849                 v += alpha[i];
850                 if(fabs(alpha[i]) > 0)
851                         nSV++;
852         }
853         for(i=0;i<l;i++)
854                 v -= alpha[i*nr_class+(int)prob->y[i]];
855         info("Objective value = %lf\n",v);
856         info("nSV = %d\n",nSV);
857
858         delete [] alpha;
859         delete [] alpha_new;
860         delete [] index;
861         delete [] QD;
862         delete [] d_ind;
863         delete [] d_val;
864         delete [] alpha_index;
865         delete [] y_index;
866         delete [] active_size_i;
867 }
868
869 // A coordinate descent algorithm for
870 // L1-loss and L2-loss SVM dual problems
871 //
872 //  min_\alpha  0.5(\alpha^T (Q + D)\alpha) - e^T \alpha,
873 //    s.t.      0 <= \alpha_i <= upper_bound_i,
874 //
875 //  where Qij = yi yj xi^T xj and
876 //  D is a diagonal matrix
877 //
878 // In L1-SVM case:
879 //              upper_bound_i = Cp if y_i = 1
880 //              upper_bound_i = Cn if y_i = -1
881 //              D_ii = 0
882 // In L2-SVM case:
883 //              upper_bound_i = INF
884 //              D_ii = 1/(2*Cp) if y_i = 1
885 //              D_ii = 1/(2*Cn) if y_i = -1
886 //
887 // Given:
888 // x, y, Cp, Cn
889 // eps is the stopping tolerance
890 //
891 // solution will be put in w
892 //
893 // See Algorithm 3 of Hsieh et al., ICML 2008
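//
// Minimal call sketch (illustrative only; prob, eps, Cp and Cn are assumed to be
// prepared by the caller):
//
//   double *w = new double[prob->n];
//   solve_l2r_l1l2_svc(prob, w, eps, Cp, Cn, L2R_L2LOSS_SVC_DUAL);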
894
895 #undef GETI
896 #define GETI(i) (y[i]+1)
897 // To support instance weights, define GETI(i) as (i) instead
898
899 static void solve_l2r_l1l2_svc(
900         const problem *prob, double *w, double eps,
901         double Cp, double Cn, int solver_type)
902 {
903         int l = prob->l;
904         int w_size = prob->n;
905         int i, s, iter = 0;
906         double C, d, G;
907         double *QD = new double[l];
908         int max_iter = 1000;
909         int *index = new int[l];
910         double *alpha = new double[l];
911         schar *y = new schar[l];
912         int active_size = l;
913
914         // PG: projected gradient, for shrinking and stopping
915         double PG;
916         double PGmax_old = INF;
917         double PGmin_old = -INF;
918         double PGmax_new, PGmin_new;
919
920         // default solver_type: L2R_L2LOSS_SVC_DUAL
921         double diag[3] = {0.5/Cn, 0, 0.5/Cp};
922         double upper_bound[3] = {INF, 0, INF};
923         if(solver_type == L2R_L1LOSS_SVC_DUAL)
924         {
925                 diag[0] = 0;
926                 diag[2] = 0;
927                 upper_bound[0] = Cn;
928                 upper_bound[2] = Cp;
929         }
930
931         for(i=0; i<l; i++)
932         {
933                 if(prob->y[i] > 0)
934                 {
935                         y[i] = +1;
936                 }
937                 else
938                 {
939                         y[i] = -1;
940                 }
941         }
942
943         // Initial alpha can be set here. Note that
944         // 0 <= alpha[i] <= upper_bound[GETI(i)]
945         for(i=0; i<l; i++)
946                 alpha[i] = 0;
947
948         for(i=0; i<w_size; i++)
949                 w[i] = 0;
950         for(i=0; i<l; i++)
951         {
952                 QD[i] = diag[GETI(i)];
953
954                 feature_node * const xi = prob->x[i];
955                 QD[i] += sparse_operator::nrm2_sq(xi);
956                 sparse_operator::axpy(y[i]*alpha[i], xi, w);
957
958                 index[i] = i;
959         }
960
961         while (iter < max_iter)
962         {
963                 PGmax_new = -INF;
964                 PGmin_new = INF;
965
966                 for (i=0; i<active_size; i++)
967                 {
968                         int j = i+rand()%(active_size-i);
969                         swap(index[i], index[j]);
970                 }
971
972                 for (s=0; s<active_size; s++)
973                 {
974                         i = index[s];
975                         const schar yi = y[i];
976                         feature_node * const xi = prob->x[i];
977
978                         G = yi*sparse_operator::dot(w, xi)-1;
979
980                         C = upper_bound[GETI(i)];
981                         G += alpha[i]*diag[GETI(i)];
982
983                         PG = 0;
984                         if (alpha[i] == 0)
985                         {
986                                 if (G > PGmax_old)
987                                 {
988                                         active_size--;
989                                         swap(index[s], index[active_size]);
990                                         s--;
991                                         continue;
992                                 }
993                                 else if (G < 0)
994                                         PG = G;
995                         }
996                         else if (alpha[i] == C)
997                         {
998                                 if (G < PGmin_old)
999                                 {
1000                                         active_size--;
1001                                         swap(index[s], index[active_size]);
1002                                         s--;
1003                                         continue;
1004                                 }
1005                                 else if (G > 0)
1006                                         PG = G;
1007                         }
1008                         else
1009                                 PG = G;
1010
1011                         PGmax_new = max(PGmax_new, PG);
1012                         PGmin_new = min(PGmin_new, PG);
1013
1014                         if(fabs(PG) > 1.0e-12)
1015                         {
1016                                 double alpha_old = alpha[i];
1017                                 alpha[i] = min(max(alpha[i] - G/QD[i], 0.0), C);
1018                                 d = (alpha[i] - alpha_old)*yi;
1019                                 sparse_operator::axpy(d, xi, w);
1020                         }
1021                 }
1022
1023                 iter++;
1024                 if(iter % 10 == 0)
1025                         info(".");
1026
1027                 if(PGmax_new - PGmin_new <= eps)
1028                 {
1029                         if(active_size == l)
1030                                 break;
1031                         else
1032                         {
1033                                 active_size = l;
1034                                 info("*");
1035                                 PGmax_old = INF;
1036                                 PGmin_old = -INF;
1037                                 continue;
1038                         }
1039                 }
1040                 PGmax_old = PGmax_new;
1041                 PGmin_old = PGmin_new;
1042                 if (PGmax_old <= 0)
1043                         PGmax_old = INF;
1044                 if (PGmin_old >= 0)
1045                         PGmin_old = -INF;
1046         }
1047
1048         info("\noptimization finished, #iter = %d\n",iter);
1049         if (iter >= max_iter)
1050                 info("\nWARNING: reaching max number of iterations\nUsing -s 2 may be faster (also see FAQ)\n\n");
1051
1052         // calculate objective value
1053
1054         double v = 0;
1055         int nSV = 0;
1056         for(i=0; i<w_size; i++)
1057                 v += w[i]*w[i];
1058         for(i=0; i<l; i++)
1059         {
1060                 v += alpha[i]*(alpha[i]*diag[GETI(i)] - 2);
1061                 if(alpha[i] > 0)
1062                         ++nSV;
1063         }
1064         info("Objective value = %lf\n",v/2);
1065         info("nSV = %d\n",nSV);
1066
1067         delete [] QD;
1068         delete [] alpha;
1069         delete [] y;
1070         delete [] index;
1071 }
1072
1073
1074 // A coordinate descent algorithm for
1075 // L1-loss and L2-loss epsilon-SVR dual problem
1076 //
1077 //  min_\beta  0.5\beta^T (Q + diag(lambda)) \beta + p \sum_{i=1}^l|\beta_i| - \sum_{i=1}^l yi\beta_i,
1078 //    s.t.      -upper_bound_i <= \beta_i <= upper_bound_i,
1079 //
1080 //  where Qij = xi^T xj and
1081 //  diag(lambda) is a diagonal matrix with entries lambda_i
1082 //
1083 // In L1-loss SVR case:
1084 //              upper_bound_i = C
1085 //              lambda_i = 0
1086 // In L2-loss SVR case:
1087 //              upper_bound_i = INF
1088 //              lambda_i = 1/(2*C)
1089 //
1090 // Given:
1091 // x, y, p, C
1092 // eps is the stopping tolerance
1093 //
1094 // solution will be put in w
1095 //
1096 // See Algorithm 4 of Ho and Lin, 2012
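//
// Minimal call sketch (illustrative only; prob and param are assumed to be
// prepared by the caller):
//
//   double *w = new double[prob->n];
//   solve_l2r_l1l2_svr(prob, w, param, L2R_L2LOSS_SVR_DUAL);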
1097
1098 #undef GETI
1099 #define GETI(i) (0)
1100 // To support instance weights, define GETI(i) as (i) instead
1101
1102 static void solve_l2r_l1l2_svr(
1103         const problem *prob, double *w, const parameter *param,
1104         int solver_type)
1105 {
1106         int l = prob->l;
1107         double C = param->C;
1108         double p = param->p;
1109         int w_size = prob->n;
1110         double eps = param->eps;
1111         int i, s, iter = 0;
1112         int max_iter = 1000;
1113         int active_size = l;
1114         int *index = new int[l];
1115
1116         double d, G, H;
1117         double Gmax_old = INF;
1118         double Gmax_new, Gnorm1_new;
1119         double Gnorm1_init = -1.0; // Gnorm1_init is initialized at the first iteration
1120         double *beta = new double[l];
1121         double *QD = new double[l];
1122         double *y = prob->y;
1123
1124         // L2R_L2LOSS_SVR_DUAL
1125         double lambda[1], upper_bound[1];
1126         lambda[0] = 0.5/C;
1127         upper_bound[0] = INF;
1128
1129         if(solver_type == L2R_L1LOSS_SVR_DUAL)
1130         {
1131                 lambda[0] = 0;
1132                 upper_bound[0] = C;
1133         }
1134
1135         // Initial beta can be set here. Note that
1136         // -upper_bound <= beta[i] <= upper_bound
1137         for(i=0; i<l; i++)
1138                 beta[i] = 0;
1139
1140         for(i=0; i<w_size; i++)
1141                 w[i] = 0;
1142         for(i=0; i<l; i++)
1143         {
1144                 feature_node * const xi = prob->x[i];
1145                 QD[i] = sparse_operator::nrm2_sq(xi);
1146                 sparse_operator::axpy(beta[i], xi, w);
1147
1148                 index[i] = i;
1149         }
1150
1151
1152         while(iter < max_iter)
1153         {
1154                 Gmax_new = 0;
1155                 Gnorm1_new = 0;
1156
1157                 for(i=0; i<active_size; i++)
1158                 {
1159                         int j = i+rand()%(active_size-i);
1160                         swap(index[i], index[j]);
1161                 }
1162
1163                 for(s=0; s<active_size; s++)
1164                 {
1165                         i = index[s];
1166                         G = -y[i] + lambda[GETI(i)]*beta[i];
1167                         H = QD[i] + lambda[GETI(i)];
1168
1169                         feature_node * const xi = prob->x[i];
1170                         G += sparse_operator::dot(w, xi);
1171
1172                         double Gp = G+p;
1173                         double Gn = G-p;
1174                         double violation = 0;
1175                         if(beta[i] == 0)
1176                         {
1177                                 if(Gp < 0)
1178                                         violation = -Gp;
1179                                 else if(Gn > 0)
1180                                         violation = Gn;
1181                                 else if(Gp>Gmax_old && Gn<-Gmax_old)
1182                                 {
1183                                         active_size--;
1184                                         swap(index[s], index[active_size]);
1185                                         s--;
1186                                         continue;
1187                                 }
1188                         }
1189                         else if(beta[i] >= upper_bound[GETI(i)])
1190                         {
1191                                 if(Gp > 0)
1192                                         violation = Gp;
1193                                 else if(Gp < -Gmax_old)
1194                                 {
1195                                         active_size--;
1196                                         swap(index[s], index[active_size]);
1197                                         s--;
1198                                         continue;
1199                                 }
1200                         }
1201                         else if(beta[i] <= -upper_bound[GETI(i)])
1202                         {
1203                                 if(Gn < 0)
1204                                         violation = -Gn;
1205                                 else if(Gn > Gmax_old)
1206                                 {
1207                                         active_size--;
1208                                         swap(index[s], index[active_size]);
1209                                         s--;
1210                                         continue;
1211                                 }
1212                         }
1213                         else if(beta[i] > 0)
1214                                 violation = fabs(Gp);
1215                         else
1216                                 violation = fabs(Gn);
1217
1218                         Gmax_new = max(Gmax_new, violation);
1219                         Gnorm1_new += violation;
1220
1221                         // obtain Newton direction d
1222                         if(Gp < H*beta[i])
1223                                 d = -Gp/H;
1224                         else if(Gn > H*beta[i])
1225                                 d = -Gn/H;
1226                         else
1227                                 d = -beta[i];
1228
1229                         if(fabs(d) < 1.0e-12)
1230                                 continue;
1231
1232                         double beta_old = beta[i];
1233                         beta[i] = min(max(beta[i]+d, -upper_bound[GETI(i)]), upper_bound[GETI(i)]);
1234                         d = beta[i]-beta_old;
1235
1236                         if(d != 0)
1237                                 sparse_operator::axpy(d, xi, w);
1238                 }
1239
1240                 if(iter == 0)
1241                         Gnorm1_init = Gnorm1_new;
1242                 iter++;
1243                 if(iter % 10 == 0)
1244                         info(".");
1245
1246                 if(Gnorm1_new <= eps*Gnorm1_init)
1247                 {
1248                         if(active_size == l)
1249                                 break;
1250                         else
1251                         {
1252                                 active_size = l;
1253                                 info("*");
1254                                 Gmax_old = INF;
1255                                 continue;
1256                         }
1257                 }
1258
1259                 Gmax_old = Gmax_new;
1260         }
1261
1262         info("\noptimization finished, #iter = %d\n", iter);
1263         if(iter >= max_iter)
1264                 info("\nWARNING: reaching max number of iterations\nUsing -s 11 may be faster\n\n");
1265
1266         // calculate objective value
1267         double v = 0;
1268         int nSV = 0;
1269         for(i=0; i<w_size; i++)
1270                 v += w[i]*w[i];
1271         v = 0.5*v;
1272         for(i=0; i<l; i++)
1273         {
1274                 v += p*fabs(beta[i]) - y[i]*beta[i] + 0.5*lambda[GETI(i)]*beta[i]*beta[i];
1275                 if(beta[i] != 0)
1276                         nSV++;
1277         }
1278
1279         info("Objective value = %lf\n", v);
1280         info("nSV = %d\n",nSV);
1281
1282         delete [] beta;
1283         delete [] QD;
1284         delete [] index;
1285 }
1286
1287
1288 // A coordinate descent algorithm for
1289 // the dual of L2-regularized logistic regression problems
1290 //
1291 //  min_\alpha  0.5(\alpha^T Q \alpha) + \sum \alpha_i log (\alpha_i) + (upper_bound_i - \alpha_i) log (upper_bound_i - \alpha_i),
1292 //    s.t.      0 <= \alpha_i <= upper_bound_i,
1293 //
1294 //  where Qij = yi yj xi^T xj and
1295 //  upper_bound_i = Cp if y_i = 1
1296 //  upper_bound_i = Cn if y_i = -1
1297 //
1298 // Given:
1299 // x, y, Cp, Cn
1300 // eps is the stopping tolerance
1301 //
1302 // solution will be put in w
1303 //
1304 // See Algorithm 5 of Yu et al., MLJ 2010
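//
// Minimal call sketch (illustrative only; prob, eps, Cp and Cn are assumed to be
// prepared by the caller):
//
//   double *w = new double[prob->n];
//   solve_l2r_lr_dual(prob, w, eps, Cp, Cn);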
1305
1306 #undef GETI
1307 #define GETI(i) (y[i]+1)
1308 // To support instance weights, define GETI(i) as (i) instead
1309
1310 void solve_l2r_lr_dual(const problem *prob, double *w, double eps, double Cp, double Cn)
1311 {
1312         int l = prob->l;
1313         int w_size = prob->n;
1314         int i, s, iter = 0;
1315         double *xTx = new double[l];
1316         int max_iter = 1000;
1317         int *index = new int[l];
1318         double *alpha = new double[2*l]; // store alpha and C - alpha
1319         schar *y = new schar[l];
1320         int max_inner_iter = 100; // for inner Newton
1321         double innereps = 1e-2;
1322         double innereps_min = min(1e-8, eps);
1323         double upper_bound[3] = {Cn, 0, Cp};
1324
1325         for(i=0; i<l; i++)
1326         {
1327                 if(prob->y[i] > 0)
1328                 {
1329                         y[i] = +1;
1330                 }
1331                 else
1332                 {
1333                         y[i] = -1;
1334                 }
1335         }
1336
1337         // Initial alpha can be set here. Note that
1338         // 0 < alpha[i] < upper_bound[GETI(i)]
1339         // alpha[2*i] + alpha[2*i+1] = upper_bound[GETI(i)]
1340         for(i=0; i<l; i++)
1341         {
1342                 alpha[2*i] = min(0.001*upper_bound[GETI(i)], 1e-8);
1343                 alpha[2*i+1] = upper_bound[GETI(i)] - alpha[2*i];
1344         }
1345
1346         for(i=0; i<w_size; i++)
1347                 w[i] = 0;
1348         for(i=0; i<l; i++)
1349         {
1350                 feature_node * const xi = prob->x[i];
1351                 xTx[i] = sparse_operator::nrm2_sq(xi);
1352                 sparse_operator::axpy(y[i]*alpha[2*i], xi, w);
1353                 index[i] = i;
1354         }
1355
1356         while (iter < max_iter)
1357         {
1358                 for (i=0; i<l; i++)
1359                 {
1360                         int j = i+rand()%(l-i);
1361                         swap(index[i], index[j]);
1362                 }
1363                 int newton_iter = 0;
1364                 double Gmax = 0;
1365                 for (s=0; s<l; s++)
1366                 {
1367                         i = index[s];
1368                         const schar yi = y[i];
1369                         double C = upper_bound[GETI(i)];
1370                         double ywTx = 0, xisq = xTx[i];
1371                         feature_node * const xi = prob->x[i];
1372                         ywTx = yi*sparse_operator::dot(w, xi);
1373                         double a = xisq, b = ywTx;
1374
1375                         // Decide to minimize g_1(z) or g_2(z)
1376                         int ind1 = 2*i, ind2 = 2*i+1, sign = 1;
1377                         if(0.5*a*(alpha[ind2]-alpha[ind1])+b < 0)
1378                         {
1379                                 ind1 = 2*i+1;
1380                                 ind2 = 2*i;
1381                                 sign = -1;
1382                         }
1383
1384                         //  g_t(z) = z*log(z) + (C-z)*log(C-z) + 0.5a(z-alpha_old)^2 + sign*b(z-alpha_old)
1385                         double alpha_old = alpha[ind1];
1386                         double z = alpha_old;
1387                         if(C - z < 0.5 * C)
1388                                 z = 0.1*z;
1389                         double gp = a*(z-alpha_old)+sign*b+log(z/(C-z));
1390                         Gmax = max(Gmax, fabs(gp));
1391
1392                         // Newton method on the sub-problem
1393                         const double eta = 0.1; // xi in the paper
1394                         int inner_iter = 0;
1395                         while (inner_iter <= max_inner_iter)
1396                         {
1397                                 if(fabs(gp) < innereps)
1398                                         break;
1399                                 double gpp = a + C/(C-z)/z;
1400                                 double tmpz = z - gp/gpp;
1401                                 if(tmpz <= 0)
1402                                         z *= eta;
1403                                 else // tmpz in (0, C)
1404                                         z = tmpz;
1405                                 gp = a*(z-alpha_old)+sign*b+log(z/(C-z));
1406                                 newton_iter++;
1407                                 inner_iter++;
1408                         }
1409
1410                         if(inner_iter > 0) // update w
1411                         {
1412                                 alpha[ind1] = z;
1413                                 alpha[ind2] = C-z;
1414                                 sparse_operator::axpy(sign*(z-alpha_old)*yi, xi, w);
1415                         }
1416                 }
1417
1418                 iter++;
1419                 if(iter % 10 == 0)
1420                         info(".");
1421
1422                 if(Gmax < eps)
1423                         break;
1424
1425                 if(newton_iter <= l/10)
1426                         innereps = max(innereps_min, 0.1*innereps);
1427
1428         }
1429
1430         info("\noptimization finished, #iter = %d\n",iter);
1431         if (iter >= max_iter)
1432                 info("\nWARNING: reaching max number of iterations\nUsing -s 0 may be faster (also see FAQ)\n\n");
1433
1434         // calculate objective value
1435
1436         double v = 0;
1437         for(i=0; i<w_size; i++)
1438                 v += w[i] * w[i];
1439         v *= 0.5;
1440         for(i=0; i<l; i++)
1441                 v += alpha[2*i] * log(alpha[2*i]) + alpha[2*i+1] * log(alpha[2*i+1])
1442                         - upper_bound[GETI(i)] * log(upper_bound[GETI(i)]);
1443         info("Objective value = %lf\n", v);
1444
1445         delete [] xTx;
1446         delete [] alpha;
1447         delete [] y;
1448         delete [] index;
1449 }
1450
1451 // A coordinate descent algorithm for
1452 // L1-regularized L2-loss support vector classification
1453 //
1454 //  min_w \sum |wj| + C \sum max(0, 1-yi w^T xi)^2,
1455 //
1456 // Given:
1457 // x, y, Cp, Cn
1458 // eps is the stopping tolerance
1459 //
1460 // solution will be put in w
1461 //
1462 // See Yuan et al. (2010) and appendix of LIBLINEAR paper, Fan et al. (2008)
1463 //
1464 // To not regularize the bias (i.e., regularize_bias = 0), a constant feature = 1
1465 // must have been added to the original data. (see -B and -R option)
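//
// Minimal call sketch (illustrative only). Unlike the dual solvers above, this
// routine iterates over features, so prob_col is assumed to hold the same data
// in column (feature-major) format:
//
//   double *w = new double[prob_col->n];
//   solve_l1r_l2_svc(prob_col, w, eps, Cp, Cn, param->regularize_bias);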
1466
1467 #undef GETI
1468 #define GETI(i) (y[i]+1)
1469 // To support instance weights, define GETI(i) as (i) instead
1470
1471 static void solve_l1r_l2_svc(
1472         problem *prob_col, double *w, double eps,
1473         double Cp, double Cn, int regularize_bias)
1474 {
1475         int l = prob_col->l;
1476         int w_size = prob_col->n;
1477         int j, s, iter = 0;
1478         int max_iter = 1000;
1479         int active_size = w_size;
1480         int max_num_linesearch = 20;
1481
1482         double sigma = 0.01;
1483         double d, G_loss, G, H;
1484         double Gmax_old = INF;
1485         double Gmax_new, Gnorm1_new;
1486         double Gnorm1_init = -1.0; // Gnorm1_init is initialized at the first iteration
1487         double d_old, d_diff;
1488         double loss_old = 0, loss_new;
1489         double appxcond, cond;
1490
1491         int *index = new int[w_size];
1492         schar *y = new schar[l];
1493         double *b = new double[l]; // b = 1-ywTx
1494         double *xj_sq = new double[w_size];
1495         feature_node *x;
1496
1497         double C[3] = {Cn,0,Cp};
1498
1499         // Initial w can be set here.
1500         for(j=0; j<w_size; j++)
1501                 w[j] = 0;
1502
1503         for(j=0; j<l; j++)
1504         {
1505                 b[j] = 1;
1506                 if(prob_col->y[j] > 0)
1507                         y[j] = 1;
1508                 else
1509                         y[j] = -1;
1510         }
1511         for(j=0; j<w_size; j++)
1512         {
1513                 index[j] = j;
1514                 xj_sq[j] = 0;
1515                 x = prob_col->x[j];
1516                 while(x->index != -1)
1517                 {
1518                         int ind = x->index-1;
1519                         x->value *= y[ind]; // x->value stores yi*xij
1520                         double val = x->value;
1521                         b[ind] -= w[j]*val;
1522                         xj_sq[j] += C[GETI(ind)]*val*val;
1523                         x++;
1524                 }
1525         }
1526
1527         while(iter < max_iter)
1528         {
1529                 Gmax_new = 0;
1530                 Gnorm1_new = 0;
1531
1532                 for(j=0; j<active_size; j++)
1533                 {
1534                         int i = j+rand()%(active_size-j);
1535                         swap(index[i], index[j]);
1536                 }
1537
1538                 for(s=0; s<active_size; s++)
1539                 {
1540                         j = index[s];
1541                         G_loss = 0;
1542                         H = 0;
1543
1544                         x = prob_col->x[j];
1545                         while(x->index != -1)
1546                         {
1547                                 int ind = x->index-1;
1548                                 if(b[ind] > 0)
1549                                 {
1550                                         double val = x->value;
1551                                         double tmp = C[GETI(ind)]*val;
1552                                         G_loss -= tmp*b[ind];
1553                                         H += tmp*val;
1554                                 }
1555                                 x++;
1556                         }
1557                         G_loss *= 2;
1558
1559                         G = G_loss;
1560                         H *= 2;
1561                         H = max(H, 1e-12);
1562
1563                         double violation = 0;
1564                         double Gp = 0, Gn = 0;
1565                         if(j == w_size-1 && regularize_bias == 0)
1566                                 violation = fabs(G);
1567                         else
1568                         {
1569                                 Gp = G+1;
1570                                 Gn = G-1;
1571                                 if(w[j] == 0)
1572                                 {
1573                                         if(Gp < 0)
1574                                                 violation = -Gp;
1575                                         else if(Gn > 0)
1576                                                 violation = Gn;
1577                                         else if(Gp>Gmax_old/l && Gn<-Gmax_old/l)
1578                                         {
1579                                                 active_size--;
1580                                                 swap(index[s], index[active_size]);
1581                                                 s--;
1582                                                 continue;
1583                                         }
1584                                 }
1585                                 else if(w[j] > 0)
1586                                         violation = fabs(Gp);
1587                                 else
1588                                         violation = fabs(Gn);
1589                         }
1590                         Gmax_new = max(Gmax_new, violation);
1591                         Gnorm1_new += violation;
1592
1593                         // obtain Newton direction d
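                        // Sketch of the step computed below: d minimizes the one-variable model
                        //     G*d + 0.5*H*d^2 + |w[j]+d| - |w[j]|
                        // (a soft-thresholded Newton step),
                        //     d = -(G+1)/H   if G+1 <  H*w[j]
                        //     d = -(G-1)/H   if G-1 >  H*w[j]
                        //     d = -w[j]      otherwise,
                        // while the unregularized bias coordinate takes the plain Newton step -G/H.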
1594                         if(j == w_size-1 && regularize_bias == 0)
1595                                 d = -G/H;
1596                         else
1597                         {
1598                                 if(Gp < H*w[j])
1599                                         d = -Gp/H;
1600                                 else if(Gn > H*w[j])
1601                                         d = -Gn/H;
1602                                 else
1603                                         d = -w[j];
1604                         }
1605
1606                         if(fabs(d) < 1.0e-12)
1607                                 continue;
1608
1609                         double delta;
1610                         if(j == w_size-1 && regularize_bias == 0)
1611                                 delta = G*d;
1612                         else
1613                                 delta = fabs(w[j]+d)-fabs(w[j]) + G*d;
1614                         d_old = 0;
1615                         int num_linesearch;
1616                         for(num_linesearch=0; num_linesearch < max_num_linesearch; num_linesearch++)
1617                         {
1618                                 d_diff = d_old - d;
1619                                 if(j == w_size-1 && regularize_bias == 0)
1620                                         cond = -sigma*delta;
1621                                 else
1622                                         cond = fabs(w[j]+d)-fabs(w[j]) - sigma*delta;
1623
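                                // Cheap surrogate test first: appxcond is built from xj_sq[j] and
                                // G_loss; if it is already non-positive, the step is accepted right
                                // away, b[] is updated over this feature's nonzeros, and the exact
                                // loss evaluation below is skipped.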
1624                                 appxcond = xj_sq[j]*d*d + G_loss*d + cond;
1625                                 if(appxcond <= 0)
1626                                 {
1627                                         x = prob_col->x[j];
1628                                         sparse_operator::axpy(d_diff, x, b);
1629                                         break;
1630                                 }
1631
1632                                 if(num_linesearch == 0)
1633                                 {
1634                                         loss_old = 0;
1635                                         loss_new = 0;
1636                                         x = prob_col->x[j];
1637                                         while(x->index != -1)
1638                                         {
1639                                                 int ind = x->index-1;
1640                                                 if(b[ind] > 0)
1641                                                         loss_old += C[GETI(ind)]*b[ind]*b[ind];
1642                                                 double b_new = b[ind] + d_diff*x->value;
1643                                                 b[ind] = b_new;
1644                                                 if(b_new > 0)
1645                                                         loss_new += C[GETI(ind)]*b_new*b_new;
1646                                                 x++;
1647                                         }
1648                                 }
1649                                 else
1650                                 {
1651                                         loss_new = 0;
1652                                         x = prob_col->x[j];
1653                                         while(x->index != -1)
1654                                         {
1655                                                 int ind = x->index-1;
1656                                                 double b_new = b[ind] + d_diff*x->value;
1657                                                 b[ind] = b_new;
1658                                                 if(b_new > 0)
1659                                                         loss_new += C[GETI(ind)]*b_new*b_new;
1660                                                 x++;
1661                                         }
1662                                 }
1663
1664                                 cond = cond + loss_new - loss_old;
1665                                 if(cond <= 0)
1666                                         break;
1667                                 else
1668                                 {
1669                                         d_old = d;
1670                                         d *= 0.5;
1671                                         delta *= 0.5;
1672                                 }
1673                         }
1674
1675                         w[j] += d;
1676
1677                         // recompute b[] if line search takes too many steps
1678                         if(num_linesearch >= max_num_linesearch)
1679                         {
1680                                 info("#");
1681                                 for(int i=0; i<l; i++)
1682                                         b[i] = 1;
1683
1684                                 for(int i=0; i<w_size; i++)
1685                                 {
1686                                         if(w[i]==0) continue;
1687                                         x = prob_col->x[i];
1688                                         sparse_operator::axpy(-w[i], x, b);
1689                                 }
1690                         }
1691                 }
1692
1693                 if(iter == 0)
1694                         Gnorm1_init = Gnorm1_new;
1695                 iter++;
1696                 if(iter % 10 == 0)
1697                         info(".");
1698
1699                 if(Gnorm1_new <= eps*Gnorm1_init)
1700                 {
1701                         if(active_size == w_size)
1702                                 break;
1703                         else
1704                         {
1705                                 active_size = w_size;
1706                                 info("*");
1707                                 Gmax_old = INF;
1708                                 continue;
1709                         }
1710                 }
1711
1712                 Gmax_old = Gmax_new;
1713         }
1714
1715         info("\noptimization finished, #iter = %d\n", iter);
1716         if(iter >= max_iter)
1717                 info("\nWARNING: reaching max number of iterations\n");
1718
1719         // calculate objective value
1720
1721         double v = 0;
1722         int nnz = 0;
1723         for(j=0; j<w_size; j++)
1724         {
1725                 x = prob_col->x[j];
1726                 while(x->index != -1)
1727                 {
1728                         x->value *= prob_col->y[x->index-1]; // restore x->value
1729                         x++;
1730                 }
1731                 if(w[j] != 0)
1732                 {
1733                         v += fabs(w[j]);
1734                         nnz++;
1735                 }
1736         }
1737         if (regularize_bias == 0)
1738                 v -= fabs(w[w_size-1]);
1739         for(j=0; j<l; j++)
1740                 if(b[j] > 0)
1741                         v += C[GETI(j)]*b[j]*b[j];
1742
1743         info("Objective value = %lf\n", v);
1744         info("#nonzeros/#features = %d/%d\n", nnz, w_size);
1745
1746         delete [] index;
1747         delete [] y;
1748         delete [] b;
1749         delete [] xj_sq;
1750 }
1751
1752 // A coordinate descent algorithm for
1753 // L1-regularized logistic regression problems
1754 //
1755 //  min_w \sum |wj| + C \sum log(1+exp(-yi w^T xi)),
1756 //
1757 // Given:
1758 // x, y, Cp, Cn
1759 // eps is the stopping tolerance
1760 //
1761 // solution will be put in w
1762 //
1763 // See Yuan et al. (2011) and the appendix of the LIBLINEAR paper, Fan et al. (2008)
1764 //
1765 // To not regularize the bias (i.e., regularize_bias = 0), a constant feature with value 1
1766 // must have been added to the original data (see the -B and -R options).
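//
// Rough outline of the method below (a brief reading of the code): each outer
// Newton iteration
//   1. forms Grad[j] and a diagonal Hessian approximation Hdiag[j] of the
//      logistic loss, shrinking coordinates by the outer-level rule;
//   2. approximately minimizes the resulting L1-regularized quadratic model by
//      cyclic coordinate descent over wpd (the inner QP loop), with its own
//      inner-level shrinking and tolerance inner_eps;
//   3. runs a backtracking line search between w and wpd, halving the step until
//      an Armijo-type sufficient-decrease condition on the true objective holds.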
1767
1768 #undef GETI
1769 #define GETI(i) (y[i]+1)
1770 // To support instance weights, define GETI(i) as (i)
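// Note: y[i] is stored as -1/+1, so GETI(i) = y[i]+1 indexes C[3] = {Cn, 0, Cp}
// defined below; negative instances get Cn, positive ones Cp, and the middle
// entry is unused.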
1771
1772 static void solve_l1r_lr(
1773         const problem *prob_col, double *w, double eps,
1774         double Cp, double Cn, int regularize_bias)
1775 {
1776         int l = prob_col->l;
1777         int w_size = prob_col->n;
1778         int j, s, newton_iter=0, iter=0;
1779         int max_newton_iter = 100;
1780         int max_iter = 1000;
1781         int max_num_linesearch = 20;
1782         int active_size;
1783         int QP_active_size;
1784
1785         double nu = 1e-12;
1786         double inner_eps = 1;
1787         double sigma = 0.01;
1788         double w_norm, w_norm_new;
1789         double z, G, H;
1790         double Gnorm1_init = -1.0; // Gnorm1_init is initialized at the first iteration
1791         double Gmax_old = INF;
1792         double Gmax_new, Gnorm1_new;
1793         double QP_Gmax_old = INF;
1794         double QP_Gmax_new, QP_Gnorm1_new;
1795         double delta, negsum_xTd, cond;
1796
1797         int *index = new int[w_size];
1798         schar *y = new schar[l];
1799         double *Hdiag = new double[w_size];
1800         double *Grad = new double[w_size];
1801         double *wpd = new double[w_size];
1802         double *xjneg_sum = new double[w_size];
1803         double *xTd = new double[l];
1804         double *exp_wTx = new double[l];
1805         double *exp_wTx_new = new double[l];
1806         double *tau = new double[l];
1807         double *D = new double[l];
1808         feature_node *x;
1809
1810         double C[3] = {Cn,0,Cp};
1811
1812         // Initial w can be set here.
1813         for(j=0; j<w_size; j++)
1814                 w[j] = 0;
1815
1816         for(j=0; j<l; j++)
1817         {
1818                 if(prob_col->y[j] > 0)
1819                         y[j] = 1;
1820                 else
1821                         y[j] = -1;
1822
1823                 exp_wTx[j] = 0;
1824         }
1825
1826         w_norm = 0;
1827         for(j=0; j<w_size; j++)
1828         {
1829                 w_norm += fabs(w[j]);
1830                 wpd[j] = w[j];
1831                 index[j] = j;
1832                 xjneg_sum[j] = 0;
1833                 x = prob_col->x[j];
1834                 while(x->index != -1)
1835                 {
1836                         int ind = x->index-1;
1837                         double val = x->value;
1838                         exp_wTx[ind] += w[j]*val;
1839                         if(y[ind] == -1)
1840                                 xjneg_sum[j] += C[GETI(ind)]*val;
1841                         x++;
1842                 }
1843         }
1844         if (regularize_bias == 0)
1845                 w_norm -= fabs(w[w_size-1]);
1846
1847         for(j=0; j<l; j++)
1848         {
1849                 exp_wTx[j] = exp(exp_wTx[j]);
1850                 double tau_tmp = 1/(1+exp_wTx[j]);
1851                 tau[j] = C[GETI(j)]*tau_tmp;
1852                 D[j] = C[GETI(j)]*exp_wTx[j]*tau_tmp*tau_tmp;
1853         }
1854
1855         while(newton_iter < max_newton_iter)
1856         {
1857                 Gmax_new = 0;
1858                 Gnorm1_new = 0;
1859                 active_size = w_size;
1860
1861                 for(s=0; s<active_size; s++)
1862                 {
1863                         j = index[s];
1864                         Hdiag[j] = nu;
1865                         Grad[j] = 0;
1866
1867                         double tmp = 0;
1868                         x = prob_col->x[j];
1869                         while(x->index != -1)
1870                         {
1871                                 int ind = x->index-1;
1872                                 Hdiag[j] += x->value*x->value*D[ind];
1873                                 tmp += x->value*tau[ind];
1874                                 x++;
1875                         }
1876                         Grad[j] = -tmp + xjneg_sum[j];
1877
1878                         double violation = 0;
1879                         if (j == w_size-1 && regularize_bias == 0)
1880                                 violation = fabs(Grad[j]);
1881                         else
1882                         {
1883                                 double Gp = Grad[j]+1;
1884                                 double Gn = Grad[j]-1;
1885                                 if(w[j] == 0)
1886                                 {
1887                                         if(Gp < 0)
1888                                                 violation = -Gp;
1889                                         else if(Gn > 0)
1890                                                 violation = Gn;
1891                                         //outer-level shrinking
1892                                         else if(Gp>Gmax_old/l && Gn<-Gmax_old/l)
1893                                         {
1894                                                 active_size--;
1895                                                 swap(index[s], index[active_size]);
1896                                                 s--;
1897                                                 continue;
1898                                         }
1899                                 }
1900                                 else if(w[j] > 0)
1901                                         violation = fabs(Gp);
1902                                 else
1903                                         violation = fabs(Gn);
1904                         }
1905                         Gmax_new = max(Gmax_new, violation);
1906                         Gnorm1_new += violation;
1907                 }
1908
1909                 if(newton_iter == 0)
1910                         Gnorm1_init = Gnorm1_new;
1911
1912                 if(Gnorm1_new <= eps*Gnorm1_init)
1913                         break;
1914
1915                 iter = 0;
1916                 QP_Gmax_old = INF;
1917                 QP_active_size = active_size;
1918
1919                 for(int i=0; i<l; i++)
1920                         xTd[i] = 0;
1921
1922                 // optimize QP over wpd
1923                 while(iter < max_iter)
1924                 {
1925                         QP_Gmax_new = 0;
1926                         QP_Gnorm1_new = 0;
1927
1928                         for(j=0; j<QP_active_size; j++)
1929                         {
1930                                 int i = j+rand()%(QP_active_size-j);
1931                                 swap(index[i], index[j]);
1932                         }
1933
1934                         for(s=0; s<QP_active_size; s++)
1935                         {
1936                                 j = index[s];
1937                                 H = Hdiag[j];
1938
1939                                 x = prob_col->x[j];
1940                                 G = Grad[j] + (wpd[j]-w[j])*nu;
1941                                 while(x->index != -1)
1942                                 {
1943                                         int ind = x->index-1;
1944                                         G += x->value*D[ind]*xTd[ind];
1945                                         x++;
1946                                 }
1947
1948                                 double violation = 0;
1949                                 if (j == w_size-1 && regularize_bias == 0)
1950                                 {
1951                                         // bias term not shrunken
1952                                         violation = fabs(G);
1953                                         z = -G/H;
1954                                 }
1955                                 else
1956                                 {
1957                                         double Gp = G+1;
1958                                         double Gn = G-1;
1959                                         if(wpd[j] == 0)
1960                                         {
1961                                                 if(Gp < 0)
1962                                                         violation = -Gp;
1963                                                 else if(Gn > 0)
1964                                                         violation = Gn;
1965                                                 //inner-level shrinking
1966                                                 else if(Gp>QP_Gmax_old/l && Gn<-QP_Gmax_old/l)
1967                                                 {
1968                                                         QP_active_size--;
1969                                                         swap(index[s], index[QP_active_size]);
1970                                                         s--;
1971                                                         continue;
1972                                                 }
1973                                         }
1974                                         else if(wpd[j] > 0)
1975                                                 violation = fabs(Gp);
1976                                         else
1977                                                 violation = fabs(Gn);
1978
1979                                         // obtain solution of one-variable problem
1980                                         if(Gp < H*wpd[j])
1981                                                 z = -Gp/H;
1982                                         else if(Gn > H*wpd[j])
1983                                                 z = -Gn/H;
1984                                         else
1985                                                 z = -wpd[j];
1986                                 }
1987                                 QP_Gmax_new = max(QP_Gmax_new, violation);
1988                                 QP_Gnorm1_new += violation;
1989
1990                                 if(fabs(z) < 1.0e-12)
1991                                         continue;
1992                                 z = min(max(z,-10.0),10.0);
1993
1994                                 wpd[j] += z;
1995
1996                                 x = prob_col->x[j];
1997                                 sparse_operator::axpy(z, x, xTd);
1998                         }
1999
2000                         iter++;
2001
2002                         if(QP_Gnorm1_new <= inner_eps*Gnorm1_init)
2003                         {
2004                                 //inner stopping
2005                                 if(QP_active_size == active_size)
2006                                         break;
2007                                 //active set reactivation
2008                                 else
2009                                 {
2010                                         QP_active_size = active_size;
2011                                         QP_Gmax_old = INF;
2012                                         continue;
2013                                 }
2014                         }
2015
2016                         QP_Gmax_old = QP_Gmax_new;
2017                 }
2018
2019                 if(iter >= max_iter)
2020                         info("WARNING: reaching max number of inner iterations\n");
2021
2022                 delta = 0;
2023                 w_norm_new = 0;
2024                 for(j=0; j<w_size; j++)
2025                 {
2026                         delta += Grad[j]*(wpd[j]-w[j]);
2027                         if(wpd[j] != 0)
2028                                 w_norm_new += fabs(wpd[j]);
2029                 }
2030                 if (regularize_bias == 0)
2031                         w_norm_new -= fabs(wpd[w_size-1]);
2032                 delta += (w_norm_new-w_norm);
2033
2034                 negsum_xTd = 0;
2035                 for(int i=0; i<l; i++)
2036                         if(y[i] == -1)
2037                                 negsum_xTd += C[GETI(i)]*xTd[i];
2038
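                // Backtracking (Armijo-type) line search on the step from w to wpd:
                // cond accumulates the objective change (L1-norm change plus the loss
                // change, the latter evaluated incrementally through exp_wTx and xTd)
                // minus sigma*delta; the step is accepted when cond <= 0, otherwise the
                // step is halved (wpd moves halfway back toward w, and xTd, delta,
                // negsum_xTd are scaled by 0.5).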
2039                 int num_linesearch;
2040                 for(num_linesearch=0; num_linesearch < max_num_linesearch; num_linesearch++)
2041                 {
2042                         cond = w_norm_new - w_norm + negsum_xTd - sigma*delta;
2043
2044                         for(int i=0; i<l; i++)
2045                         {
2046                                 double exp_xTd = exp(xTd[i]);
2047                                 exp_wTx_new[i] = exp_wTx[i]*exp_xTd;
2048                                 cond += C[GETI(i)]*log((1+exp_wTx_new[i])/(exp_xTd+exp_wTx_new[i]));
2049                         }
2050
2051                         if(cond <= 0)
2052                         {
2053                                 w_norm = w_norm_new;
2054                                 for(j=0; j<w_size; j++)
2055                                         w[j] = wpd[j];
2056                                 for(int i=0; i<l; i++)
2057                                 {
2058                                         exp_wTx[i] = exp_wTx_new[i];
2059                                         double tau_tmp = 1/(1+exp_wTx[i]);
2060                                         tau[i] = C[GETI(i)]*tau_tmp;
2061                                         D[i] = C[GETI(i)]*exp_wTx[i]*tau_tmp*tau_tmp;
2062                                 }
2063                                 break;
2064                         }
2065                         else
2066                         {
2067                                 w_norm_new = 0;
2068                                 for(j=0; j<w_size; j++)
2069                                 {
2070                                         wpd[j] = (w[j]+wpd[j])*0.5;
2071                                         if(wpd[j] != 0)
2072                                                 w_norm_new += fabs(wpd[j]);
2073                                 }
2074                                 if (regularize_bias == 0)
2075                                         w_norm_new -= fabs(wpd[w_size-1]);
2076                                 delta *= 0.5;
2077                                 negsum_xTd *= 0.5;
2078                                 for(int i=0; i<l; i++)
2079                                         xTd[i] *= 0.5;
2080                         }
2081                 }
2082
2083                 // Recompute exp_wTx[] from scratch if the line search took too many steps
2084                 if(num_linesearch >= max_num_linesearch)
2085                 {
2086                         for(int i=0; i<l; i++)
2087                                 exp_wTx[i] = 0;
2088
2089                         for(int i=0; i<w_size; i++)
2090                         {
2091                                 if(w[i]==0) continue;
2092                                 x = prob_col->x[i];
2093                                 sparse_operator::axpy(w[i], x, exp_wTx);
2094                         }
2095
2096                         for(int i=0; i<l; i++)
2097                                 exp_wTx[i] = exp(exp_wTx[i]);
2098                 }
2099
2100                 if(iter == 1)
2101                         inner_eps *= 0.25;
2102
2103                 newton_iter++;
2104                 Gmax_old = Gmax_new;
2105
2106                 info("iter %3d  #CD cycles %d\n", newton_iter, iter);
2107         }
2108
2109         info("=========================\n");
2110         info("optimization finished, #iter = %d\n", newton_iter);
2111         if(newton_iter >= max_newton_iter)
2112                 info("WARNING: reaching max number of iterations\n");
2113
2114         // calculate objective value
2115
2116         double v = 0;
2117         int nnz = 0;
2118         for(j=0; j<w_size; j++)
2119                 if(w[j] != 0)
2120                 {
2121                         v += fabs(w[j]);
2122                         nnz++;
2123                 }
2124         if (regularize_bias == 0)
2125                 v -= fabs(w[w_size-1]);
2126         for(j=0; j<l; j++)
2127                 if(y[j] == 1)
2128                         v += C[GETI(j)]*log(1+1/exp_wTx[j]);
2129                 else
2130                         v += C[GETI(j)]*log(1+exp_wTx[j]);
2131
2132         info("Objective value = %lf\n", v);
2133         info("#nonzeros/#features = %d/%d\n", nnz, w_size);
2134
2135         delete [] index;
2136         delete [] y;
2137         delete [] Hdiag;
2138         delete [] Grad;
2139         delete [] wpd;
2140         delete [] xjneg_sum;
2141         delete [] xTd;
2142         delete [] exp_wTx;
2143         delete [] exp_wTx_new;
2144         delete [] tau;
2145         delete [] D;
2146 }
2147
2148 struct heap {
2149         enum HEAP_TYPE { MIN, MAX };
2150         int _size;
2151         HEAP_TYPE _type;
2152         feature_node* a;
2153
2154         heap(int max_size, HEAP_TYPE type)
2155         {
2156                 _size = 0;
2157                 a = new feature_node[max_size];
2158                 _type = type;
2159         }
2160         ~heap()
2161         {
2162                 delete [] a;
2163         }
2164         bool cmp(const feature_node& left, const feature_node& right)
2165         {
2166                 if(_type == MIN)
2167                         return left.value > right.value;
2168                 else
2169                         return left.value < right.value;
2170         }
2171         int size()
2172         {
2173                 return _size;
2174         }
2175         void push(feature_node node)
2176         {
2177                 a[_size] = node;
2178                 _size++;
2179                 int i = _size-1;
2180                 while(i)
2181                 {
2182                         int p = (i-1)/2;
2183                         if(cmp(a[p], a[i]))
2184                         {
2185                                 swap(a[i], a[p]);
2186                                 i = p;
2187                         }
2188                         else
2189                                 break;
2190                 }
2191         }
2192         void pop()
2193         {
2194                 _size--;
2195                 a[0] = a[_size];
2196                 int i = 0;
2197                 while(i*2+1 < _size)
2198                 {
2199                         int l = i*2+1;
2200                         int r = i*2+2;
2201                         if(r < _size && cmp(a[l], a[r]))
2202                                 l = r;
2203                         if(cmp(a[i], a[l]))
2204                         {
2205                                 swap(a[i], a[l]);
2206                                 i = l;
2207                         }
2208                         else
2209                                 break;
2210                 }
2211         }
2212         feature_node top()
2213         {
2214                 return a[0];
2215         }
2216 };
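// Illustrative usage (a minimal sketch mirroring the solver below): a MIN heap of
// capacity k retains the k largest values seen so far,
//
//     heap min_heap(k, heap::MIN);
//     // for each candidate 'node':
//     if(min_heap.size() < k)
//             min_heap.push(node);
//     else if(min_heap.top().value < node.value)
//     {
//             min_heap.pop();
//             min_heap.push(node);
//     }
//
// and a MAX heap of capacity k symmetrically retains the k smallest values.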
2217
2218 // A two-level coordinate descent algorithm for
2219 // a scaled one-class SVM dual problem
2220 //
2221 //  min_\alpha  0.5(\alpha^T Q \alpha),
2222 //    s.t.      0 <= \alpha_i <= 1 and
2223 //              e^T \alpha = \nu l
2224 //
2225 //  where Qij = xi^T xj
2226 //
2227 // Given:
2228 // x, nu
2229 // eps is the stopping tolerance
2230 //
2231 // solution will be put in w and rho
2232 //
2233 // See Algorithm 7 in the supplementary materials of Chou et al., SDM 2020.
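//
// Sketch of the two-level scheme below: the outer level computes negGmax/negGmin
// over the active set and shrinks bounded alphas that cannot form a violating
// pair; two heaps then select up to max(active_size/10, 1) most-violating pairs
// (i, j); each pair gets a two-variable update delta = (Gi - Gj)/quad_coef,
// clipped so that both alphas stay in [0, 1] while alpha_i + alpha_j is preserved.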
2234
2235 static void solve_oneclass_svm(const problem *prob, double *w, double *rho, double eps, double nu)
2236 {
2237         int l = prob->l;
2238         int w_size = prob->n;
2239         int i, j, s, iter = 0;
2240         double Gi, Gj;
2241         double Qij, quad_coef, delta, sum;
2242         double old_alpha_i;
2243         double *QD = new double[l];
2244         double *G = new double[l];
2245         int *index = new int[l];
2246         double *alpha = new double[l];
2247         int max_inner_iter;
2248         int max_iter = 1000;
2249         int active_size = l;
2250
2251         double negGmax;                 // max { -grad(f)_i | alpha_i < 1 }
2252         double negGmin;                 // min { -grad(f)_i | alpha_i > 0 }
2253
2254         int *most_violating_i = new int[l];
2255         int *most_violating_j = new int[l];
2256
2257         int n = (int)(nu*l);            // # of alpha's at upper bound
2258         for(i=0; i<n; i++)
2259                 alpha[i] = 1;
2260         if (n<l)
2261                 alpha[i] = nu*l-n;      // here i == n, so alpha[n] takes the fractional remainder of nu*l
2262         for(i=n+1; i<l; i++)
2263                 alpha[i] = 0;
2264
2265         for(i=0; i<w_size; i++)
2266                 w[i] = 0;
2267         for(i=0; i<l; i++)
2268         {
2269                 feature_node * const xi = prob->x[i];
2270                 QD[i] = sparse_operator::nrm2_sq(xi);
2271                 sparse_operator::axpy(alpha[i], xi, w);
2272
2273                 index[i] = i;
2274         }
2275
2276         while (iter < max_iter)
2277         {
2278                 negGmax = -INF;
2279                 negGmin = INF;
2280
2281                 for (s=0; s<active_size; s++)
2282                 {
2283                         i = index[s];
2284                         feature_node * const xi = prob->x[i];
2285                         G[i] = sparse_operator::dot(w, xi);
2286                         if (alpha[i] < 1)
2287                                 negGmax = max(negGmax, -G[i]);
2288                         if (alpha[i] > 0)
2289                                 negGmin = min(negGmin, -G[i]);
2290                 }
2291
2292                 if (negGmax - negGmin < eps)
2293                 {
2294                         if (active_size == l)
2295                                 break;
2296                         else
2297                         {
2298                                 active_size = l;
2299                                 info("*");
2300                                 continue;
2301                         }
2302                 }
2303
2304                 for(s=0; s<active_size; s++)
2305                 {
2306                         i = index[s];
2307                         if ((alpha[i] == 1 && -G[i] > negGmax) ||
2308                             (alpha[i] == 0 && -G[i] < negGmin))
2309                         {
2310                                 active_size--;
2311                                 swap(index[s], index[active_size]);
2312                                 s--;
2313                         }
2314                 }
2315
2316                 max_inner_iter = max(active_size/10, 1);
2317                 struct heap min_heap = heap(max_inner_iter, heap::MIN);
2318                 struct heap max_heap = heap(max_inner_iter, heap::MAX);
2319                 struct feature_node node;
2320                 for(s=0; s<active_size; s++)
2321                 {
2322                         i = index[s];
2323                         node.index = i;
2324                         node.value = -G[i];
2325
2326                         if (alpha[i] < 1)
2327                         {
2328                                 if (min_heap.size() < max_inner_iter)
2329                                         min_heap.push(node);
2330                                 else if (min_heap.top().value < node.value)
2331                                 {
2332                                         min_heap.pop();
2333                                         min_heap.push(node);
2334                                 }
2335                         }
2336
2337                         if (alpha[i] > 0)
2338                         {
2339                                 if (max_heap.size() < max_inner_iter)
2340                                         max_heap.push(node);
2341                                 else if (max_heap.top().value > node.value)
2342                                 {
2343                                         max_heap.pop();
2344                                         max_heap.push(node);
2345                                 }
2346                         }
2347                 }
2348                 max_inner_iter = min(min_heap.size(), max_heap.size());
2349                 while (max_heap.size() > max_inner_iter)
2350                         max_heap.pop();
2351                 while (min_heap.size() > max_inner_iter)
2352                         min_heap.pop();
2353
2354                 for (s=max_inner_iter-1; s>=0; s--)
2355                 {
2356                         most_violating_i[s] = min_heap.top().index;
2357                         most_violating_j[s] = max_heap.top().index;
2358                         min_heap.pop();
2359                         max_heap.pop();
2360                 }
2361
2362                 for (s=0; s<max_inner_iter; s++)
2363                 {
2364                         i = most_violating_i[s];
2365                         j = most_violating_j[s];
2366
2367                         if ((alpha[i] == 0 && alpha[j] == 0) ||
2368                             (alpha[i] == 1 && alpha[j] == 1))
2369                                 continue;
2370
2371                         feature_node const * xi = prob->x[i];
2372                         feature_node const * xj = prob->x[j];
2373
2374                         Gi = sparse_operator::dot(w, xi);
2375                         Gj = sparse_operator::dot(w, xj);
2376
2377                         int violating_pair = 0;
2378                         if (alpha[i] < 1 && alpha[j] > 0 && -Gj + 1e-12 < -Gi)
2379                                 violating_pair = 1;
2380                         else
2381                                 if (alpha[i] > 0 && alpha[j] < 1 && -Gi + 1e-12 < -Gj)
2382                                         violating_pair = 1;
2383                         if (violating_pair == 0)
2384                                 continue;
2385
2386                         Qij = sparse_operator::sparse_dot(xi, xj);
2387                         quad_coef = QD[i] + QD[j] - 2*Qij;
2388                         if(quad_coef <= 0)
2389                                 quad_coef = 1e-12;
2390                         delta = (Gi - Gj) / quad_coef;
2391                         old_alpha_i = alpha[i];
2392                         sum = alpha[i] + alpha[j];
2393                         alpha[i] = alpha[i] - delta;
2394                         alpha[j] = alpha[j] + delta;
2395                         if (sum > 1)
2396                         {
2397                                 if (alpha[i] > 1)
2398                                 {
2399                                         alpha[i] = 1;
2400                                         alpha[j] = sum - 1;
2401                                 }
2402                         }
2403                         else
2404                         {
2405                                 if (alpha[j] < 0)
2406                                 {
2407                                         alpha[j] = 0;
2408                                         alpha[i] = sum;
2409                                 }
2410                         }
2411                         if (sum > 1)
2412                         {
2413                                 if (alpha[j] > 1)
2414                                 {
2415                                         alpha[j] = 1;
2416                                         alpha[i] = sum - 1;
2417                                 }
2418                         }
2419                         else
2420                         {
2421                                 if (alpha[i] < 0)
2422                                 {
2423                                         alpha[i] = 0;
2424                                         alpha[j] = sum;
2425                                 }
2426                         }
2427                         delta = alpha[i] - old_alpha_i;
2428                         sparse_operator::axpy(delta, xi, w);
2429                         sparse_operator::axpy(-delta, xj, w);
2430                 }
2431                 iter++;
2432                 if (iter % 10 == 0)
2433                         info(".");
2434         }
2435         info("\noptimization finished, #iter = %d\n",iter);
2436         if (iter >= max_iter)
2437                 info("\nWARNING: reaching max number of iterations\n\n");
2438
2439         // calculate objective value
2440         double v = 0;
2441         for(i=0; i<w_size; i++)
2442                 v += w[i]*w[i];
2443         int nSV = 0;
2444         for(i=0; i<l; i++)
2445         {
2446                 if (alpha[i] > 0)
2447                         ++nSV;
2448         }
2449         info("Objective value = %lf\n", v/2);
2450         info("nSV = %d\n", nSV);
2451
2452         // calculate rho
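        // rho is the average of G = w^T x_i over the free variables (0 < alpha_i < 1);
        // if none are free, the midpoint of the bounds obtained from the alphas at 0
        // and at 1 is used instead.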
2453         double nr_free = 0;
2454         double ub = INF, lb = -INF, sum_free = 0;
2455         for(i=0; i<l; i++)
2456         {
2457                 double G = sparse_operator::dot(w, prob->x[i]);
2458                 if (alpha[i] == 0)
2459                         lb = max(lb, G);
2460                 else if (alpha[i] == 1)
2461                         ub = min(ub, G);
2462                 else
2463                 {
2464                         ++nr_free;
2465                         sum_free += G;
2466                 }
2467         }
2468
2469         if (nr_free > 0)
2470                 *rho = sum_free/nr_free;
2471         else
2472                 *rho = (ub + lb)/2;
2473
2474         info("rho = %lf\n", *rho);
2475
2476         delete [] QD;
2477         delete [] G;
2478         delete [] index;
2479         delete [] alpha;
2480         delete [] most_violating_i;
2481         delete [] most_violating_j;
2482 }
2483
2484 // transpose matrix X from row format to column format
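// Sketch of the two-pass scheme below: the first pass counts the nonzeros of each
// column into col_ptr; a prefix sum (adding one extra slot per column for the
// index = -1 terminator) turns the counts into start offsets within x_space; the
// second pass copies each entry into its column while advancing col_ptr, which
// afterwards points at each column's terminator slot.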
2485 static void transpose(const problem *prob, feature_node **x_space_ret, problem *prob_col)
2486 {
2487         int i;
2488         int l = prob->l;
2489         int n = prob->n;
2490         size_t nnz = 0;
2491         size_t *col_ptr = new size_t [n+1];
2492         feature_node *x_space;
2493         prob_col->l = l;
2494         prob_col->n = n;
2495         prob_col->y = new double[l];
2496         prob_col->x = new feature_node*[n];
2497
2498         for(i=0; i<l; i++)
2499                 prob_col->y[i] = prob->y[i];
2500
2501         for(i=0; i<n+1; i++)
2502                 col_ptr[i] = 0;
2503         for(i=0; i<l; i++)
2504         {
2505                 feature_node *x = prob->x[i];
2506                 while(x->index != -1)
2507                 {
2508                         nnz++;
2509                         col_ptr[x->index]++;
2510                         x++;
2511                 }
2512         }
2513         for(i=1; i<n+1; i++)
2514                 col_ptr[i] += col_ptr[i-1] + 1;
2515
2516         x_space = new feature_node[nnz+n];
2517         for(i=0; i<n; i++)
2518                 prob_col->x[i] = &x_space[col_ptr[i]];
2519
2520         for(i=0; i<l; i++)
2521         {
2522                 feature_node *x = prob->x[i];
2523                 while(x->index != -1)
2524                 {
2525                         int ind = x->index-1;
2526                         x_space[col_ptr[ind]].index = i+1; // starts from 1
2527                         x_space[col_ptr[ind]].value = x->value;
2528                         col_ptr[ind]++;
2529                         x++;
2530                 }
2531         }
2532         for(i=0; i<n; i++)
2533                 x_space[col_ptr[i]].index = -1;
2534
2535         *x_space_ret = x_space;
2536
2537         delete [] col_ptr;
2538 }
2539
2540 // label: label names, start: beginning index of each class, count: #data of each class, perm: indices to the original data
2541 // perm, of length l, must be allocated before calling this subroutine
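// Example (illustrative): y = {3, 1, 3, 2} yields nr_class = 3, label = {3, 1, 2},
// count = {2, 1, 1}, start = {0, 2, 3}, and perm = {0, 2, 1, 3}, i.e. the indices
// grouped class by class in order of first appearance.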
2542 static void group_classes(const problem *prob, int *nr_class_ret, int **label_ret, int **start_ret, int **count_ret, int *perm)
2543 {
2544         int l = prob->l;
2545         int max_nr_class = 16;
2546         int nr_class = 0;
2547         int *label = Malloc(int,max_nr_class);
2548         int *count = Malloc(int,max_nr_class);
2549         int *data_label = Malloc(int,l);
2550         int i;
2551
2552         for(i=0;i<l;i++)
2553         {
2554                 int this_label = (int)prob->y[i];
2555                 int j;
2556                 for(j=0;j<nr_class;j++)
2557                 {
2558                         if(this_label == label[j])
2559                         {
2560                                 ++count[j];
2561                                 break;
2562                         }
2563                 }
2564                 data_label[i] = j;
2565                 if(j == nr_class)
2566                 {
2567                         if(nr_class == max_nr_class)
2568                         {
2569                                 max_nr_class *= 2;
2570                                 label = (int *)realloc(label,max_nr_class*sizeof(int));
2571                                 count = (int *)realloc(count,max_nr_class*sizeof(int));
2572                         }
2573                         label[nr_class] = this_label;
2574                         count[nr_class] = 1;
2575                         ++nr_class;
2576                 }
2577         }
2578
2579         //
2580         // Labels are ordered by their first occurrence in the training set.
2581         // However, for two-class sets with -1/+1 labels in which -1 appears first,
2582         // we swap the labels so that, internally, the binary SVM treats the +1 instances as the positive class.
2583         //
2584         if (nr_class == 2 && label[0] == -1 && label[1] == 1)
2585         {
2586                 swap(label[0],label[1]);
2587                 swap(count[0],count[1]);
2588                 for(i=0;i<l;i++)
2589                 {
2590                         if(data_label[i] == 0)
2591                                 data_label[i] = 1;
2592                         else
2593                                 data_label[i] = 0;
2594                 }
2595         }
2596
2597         int *start = Malloc(int,nr_class);
2598         start[0] = 0;
2599         for(i=1;i<nr_class;i++)
2600                 start[i] = start[i-1]+count[i-1];
2601         for(i=0;i<l;i++)
2602         {
2603                 perm[start[data_label[i]]] = i;
2604                 ++start[data_label[i]];
2605         }
2606         start[0] = 0;
2607         for(i=1;i<nr_class;i++)
2608                 start[i] = start[i-1]+count[i-1];
2609
2610         *nr_class_ret = nr_class;
2611         *label_ret = label;
2612         *start_ret = start;
2613         *count_ret = count;
2614         free(data_label);
2615 }
2616
2617 static void train_one(const problem *prob, const parameter *param, double *w, double Cp, double Cn)
2618 {
2619         double eps = param->eps;
2620
2621         int pos = 0;
2622         int neg = 0;
2623         for(int i=0;i<prob->l;i++)
2624                 if(prob->y[i] > 0)
2625                         pos++;
2626         neg = prob->l - pos;
2627         double primal_solver_tol = eps*max(min(pos,neg), 1)/prob->l;
2628
2629         function *fun_obj=NULL;
2630         switch(param->solver_type)
2631         {
2632                 case L2R_LR:
2633                 {
2634                         double *C = new double[prob->l];
2635                         for(int i = 0; i < prob->l; i++)
2636                         {
2637                                 if(prob->y[i] > 0)
2638                                         C[i] = Cp;
2639                                 else
2640                                         C[i] = Cn;
2641                         }
2642                         fun_obj=new l2r_lr_fun(prob, param, C);
2643                         NEWTON newton_obj(fun_obj, primal_solver_tol);
2644                         newton_obj.set_print_string(liblinear_print_string);
2645                         newton_obj.newton(w);
2646                         delete fun_obj;
2647                         delete[] C;
2648                         break;
2649                 }
2650                 case L2R_L2LOSS_SVC:
2651                 {
2652                         double *C = new double[prob->l];
2653                         for(int i = 0; i < prob->l; i++)
2654                         {
2655                                 if(prob->y[i] > 0)
2656                                         C[i] = Cp;
2657                                 else
2658                                         C[i] = Cn;
2659                         }
2660                         fun_obj=new l2r_l2_svc_fun(prob, param, C);
2661                         NEWTON newton_obj(fun_obj, primal_solver_tol);
2662                         newton_obj.set_print_string(liblinear_print_string);
2663                         newton_obj.newton(w);
2664                         delete fun_obj;
2665                         delete[] C;
2666                         break;
2667                 }
2668                 case L2R_L2LOSS_SVC_DUAL:
2669                         solve_l2r_l1l2_svc(prob, w, eps, Cp, Cn, L2R_L2LOSS_SVC_DUAL);
2670                         break;
2671                 case L2R_L1LOSS_SVC_DUAL:
2672                         solve_l2r_l1l2_svc(prob, w, eps, Cp, Cn, L2R_L1LOSS_SVC_DUAL);
2673                         break;
2674                 case L1R_L2LOSS_SVC:
2675                 {
2676                         problem prob_col;
2677                         feature_node *x_space = NULL;
2678                         transpose(prob, &x_space ,&prob_col);
2679                         solve_l1r_l2_svc(&prob_col, w, primal_solver_tol, Cp, Cn, param->regularize_bias);
2680                         delete [] prob_col.y;
2681                         delete [] prob_col.x;
2682                         delete [] x_space;
2683                         break;
2684                 }
2685                 case L1R_LR:
2686                 {
2687                         problem prob_col;
2688                         feature_node *x_space = NULL;
2689                         transpose(prob, &x_space ,&prob_col);
2690                         solve_l1r_lr(&prob_col, w, primal_solver_tol, Cp, Cn, param->regularize_bias);
2691                         delete [] prob_col.y;
2692                         delete [] prob_col.x;
2693                         delete [] x_space;
2694                         break;
2695                 }
2696                 case L2R_LR_DUAL:
2697                         solve_l2r_lr_dual(prob, w, eps, Cp, Cn);
2698                         break;
2699                 case L2R_L2LOSS_SVR:
2700                 {
2701                         double *C = new double[prob->l];
2702                         for(int i = 0; i < prob->l; i++)
2703                                 C[i] = param->C;
2704
2705                         fun_obj=new l2r_l2_svr_fun(prob, param, C);
2706                         NEWTON newton_obj(fun_obj, param->eps);
2707                         newton_obj.set_print_string(liblinear_print_string);
2708                         newton_obj.newton(w);
2709                         delete fun_obj;
2710                         delete[] C;
2711                         break;
2712
2713                 }
2714                 case L2R_L1LOSS_SVR_DUAL:
2715                         solve_l2r_l1l2_svr(prob, w, param, L2R_L1LOSS_SVR_DUAL);
2716                         break;
2717                 case L2R_L2LOSS_SVR_DUAL:
2718                         solve_l2r_l1l2_svr(prob, w, param, L2R_L2LOSS_SVR_DUAL);
2719                         break;
2720                 default:
2721                         fprintf(stderr, "ERROR: unknown solver_type\n");
2722                         break;
2723         }
2724 }
2725
2726 // Calculate the initial C for parameter selection
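// For example, for L2R_LR the value returned below is
//     2^floor(log2(1 / (l * max_i ||x_i||^2))),
// i.e. the heuristic starting C rounded down to a power of two.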
2727 static double calc_start_C(const problem *prob, const parameter *param)
2728 {
2729         int i;
2730         double xTx, max_xTx;
2731         max_xTx = 0;
2732         for(i=0; i<prob->l; i++)
2733         {
2734                 xTx = 0;
2735                 feature_node *xi=prob->x[i];
2736                 while(xi->index != -1)
2737                 {
2738                         double val = xi->value;
2739                         xTx += val*val;
2740                         xi++;
2741                 }
2742                 if(xTx > max_xTx)
2743                         max_xTx = xTx;
2744         }
2745
2746         double min_C = 1.0;
2747         if(param->solver_type == L2R_LR)
2748                 min_C = 1.0 / (prob->l * max_xTx);
2749         else if(param->solver_type == L2R_L2LOSS_SVC)
2750                 min_C = 1.0 / (2 * prob->l * max_xTx);
2751         else if(param->solver_type == L2R_L2LOSS_SVR)
2752         {
2753                 double sum_y, loss, y_abs;
2754                 double delta2 = 0.1;
2755                 sum_y = 0, loss = 0;
2756                 for(i=0; i<prob->l; i++)
2757                 {
2758                         y_abs = fabs(prob->y[i]);
2759                         sum_y += y_abs;
2760                         loss += max(y_abs - param->p, 0.0) * max(y_abs - param->p, 0.0);
2761                 }
2762                 if(loss > 0)
2763                         min_C = delta2 * delta2 * loss / (8 * sum_y * sum_y * max_xTx);
2764                 else
2765                         min_C = INF;
2766         }
2767
2768         return pow( 2, floor(log(min_C) / log(2.0)) );
2769 }
2770
2771 static double calc_max_p(const problem *prob, const parameter *param)
2772 {
2773         int i;
2774         double max_p = 0.0;
2775         for(i = 0; i < prob->l; i++)
2776                 max_p = max(max_p, fabs(prob->y[i]));
2777
2778         return max_p;
2779 }
2780
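// Cross-validation search over C = start_C, 2*start_C, ... up to max_C: each fold's
// model is warm-started from its solution at the previous C (prev_w), and the search
// stops early once the fold solutions stop changing (num_unchanged_w reaches 5) or
// max_C is exceeded.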
2781 static void find_parameter_C(const problem *prob, parameter *param_tmp, double start_C, double max_C, double *best_C, double *best_score, const int *fold_start, const int *perm, const problem *subprob, int nr_fold)
2782 {
2783         // variables for CV
2784         int i;
2785         double *target = Malloc(double, prob->l);
2786
2787         // variables for warm start
2788         double ratio = 2;
2789         double **prev_w = Malloc(double*, nr_fold);
2790         for(i = 0; i < nr_fold; i++)
2791                 prev_w[i] = NULL;
2792         int num_unchanged_w = 0;
2793         void (*default_print_string) (const char *) = liblinear_print_string;
2794
2795         if(param_tmp->solver_type == L2R_LR || param_tmp->solver_type == L2R_L2LOSS_SVC)
2796                 *best_score = 0.0;
2797         else if(param_tmp->solver_type == L2R_L2LOSS_SVR)
2798                 *best_score = INF;
2799         *best_C = start_C;
2800
2801         param_tmp->C = start_C;
2802         while(param_tmp->C <= max_C)
2803         {
2804                 // Output is disabled while running CV at a particular C
2805                 set_print_string_function(&print_null);
2806
2807                 for(i=0; i<nr_fold; i++)
2808                 {
2809                         int j;
2810                         int begin = fold_start[i];
2811                         int end = fold_start[i+1];
2812
2813                         param_tmp->init_sol = prev_w[i];
2814                         struct model *submodel = train(&subprob[i],param_tmp);
2815
2816                         int total_w_size;
2817                         if(submodel->nr_class == 2)
2818                                 total_w_size = subprob[i].n;
2819                         else
2820                                 total_w_size = subprob[i].n * submodel->nr_class;
2821
2822                         if(prev_w[i] == NULL)
2823                         {
2824                                 prev_w[i] = Malloc(double, total_w_size);
2825                                 for(j=0; j<total_w_size; j++)
2826                                         prev_w[i][j] = submodel->w[j];
2827                         }
2828                         else if(num_unchanged_w >= 0)
2829                         {
2830                                 double norm_w_diff = 0;
2831                                 for(j=0; j<total_w_size; j++)
2832                                 {
2833                                         norm_w_diff += (submodel->w[j] - prev_w[i][j])*(submodel->w[j] - prev_w[i][j]);
2834                                         prev_w[i][j] = submodel->w[j];
2835                                 }
2836                                 norm_w_diff = sqrt(norm_w_diff);
2837
2838                                 if(norm_w_diff > 1e-15)
2839                                         num_unchanged_w = -1;
2840                         }
2841                         else
2842                         {
2843                                 for(j=0; j<total_w_size; j++)
2844                                         prev_w[i][j] = submodel->w[j];
2845                         }
2846
2847                         for(j=begin; j<end; j++)
2848                                 target[perm[j]] = predict(submodel,prob->x[perm[j]]);
2849
2850                         free_and_destroy_model(&submodel);
2851                 }
2852                 set_print_string_function(default_print_string);
2853
2854                 if(param_tmp->solver_type == L2R_LR || param_tmp->solver_type == L2R_L2LOSS_SVC)
2855                 {
2856                         int total_correct = 0;
2857                         for(i=0; i<prob->l; i++)
2858                                 if(target[i] == prob->y[i])
2859                                         ++total_correct;
2860                         double current_rate = (double)total_correct/prob->l;
2861                         if(current_rate > *best_score)
2862                         {
2863                                 *best_C = param_tmp->C;
2864                                 *best_score = current_rate;
2865                         }
2866
2867                         info("log2c=%7.2f\trate=%g\n",log(param_tmp->C)/log(2.0),100.0*current_rate);
2868                 }
2869                 else if(param_tmp->solver_type == L2R_L2LOSS_SVR)
2870                 {
2871                         double total_error = 0.0;
2872                         for(i=0; i<prob->l; i++)
2873                         {
2874                                 double y = prob->y[i];
2875                                 double v = target[i];
2876                                 total_error += (v-y)*(v-y);
2877                         }
2878                         double current_error = total_error/prob->l;
2879                         if(current_error < *best_score)
2880                         {
2881                                 *best_C = param_tmp->C;
2882                                 *best_score = current_error;
2883                         }
2884
2885                         info("log2c=%7.2f\tp=%7.2f\tMean squared error=%g\n",log(param_tmp->C)/log(2.0),param_tmp->p,current_error);
2886                 }
2887
2888                 num_unchanged_w++;
2889                 if(num_unchanged_w == 5)
2890                         break;
2891                 param_tmp->C = param_tmp->C*ratio;
2892         }
2893
2894         if(param_tmp->C > max_C)
2895                 info("WARNING: maximum C reached.\n");
2896         free(target);
2897         for(i=0; i<nr_fold; i++)
2898                 free(prev_w[i]);
2899         free(prev_w);
2900 }
2901
2902
2903 //
2904 // Interface functions
2905 //
2906 model* train(const problem *prob, const parameter *param)
2907 {
2908         int i,j;
2909         int l = prob->l;
2910         int n = prob->n;
2911         int w_size = prob->n;
2912         model *model_ = Malloc(model,1);
2913
2914         if(prob->bias>=0)
2915                 model_->nr_feature=n-1;
2916         else
2917                 model_->nr_feature=n;
2918         model_->param = *param;
2919         model_->bias = prob->bias;
2920
2921         if(check_regression_model(model_))
2922         {
2923                 model_->w = Malloc(double, w_size);
2924
2925                 if(param->init_sol != NULL)
2926                         for(i=0;i<w_size;i++)
2927                                 model_->w[i] = param->init_sol[i];
2928                 else
2929                         for(i=0;i<w_size;i++)
2930                                 model_->w[i] = 0;
2931
2932                 model_->nr_class = 2;
2933                 model_->label = NULL;
2934                 train_one(prob, param, model_->w, 0, 0);
2935         }
2936         else if(check_oneclass_model(model_))
2937         {
2938                 model_->w = Malloc(double, w_size);
2939                 model_->nr_class = 2;
2940                 model_->label = NULL;
2941                 solve_oneclass_svm(prob, model_->w, &(model_->rho), param->eps, param->nu);
2942         }
2943         else
2944         {
2945                 int nr_class;
2946                 int *label = NULL;
2947                 int *start = NULL;
2948                 int *count = NULL;
2949                 int *perm = Malloc(int,l);
2950
2951                 // group training data of the same class
2952                 group_classes(prob,&nr_class,&label,&start,&count,perm);
2953
2954                 model_->nr_class=nr_class;
2955                 model_->label = Malloc(int,nr_class);
2956                 for(i=0;i<nr_class;i++)
2957                         model_->label[i] = label[i];
2958
2959                 // calculate weighted C
2960                 double *weighted_C = Malloc(double, nr_class);
2961                 for(i=0;i<nr_class;i++)
2962                         weighted_C[i] = param->C;
2963                 for(i=0;i<param->nr_weight;i++)
2964                 {
2965                         for(j=0;j<nr_class;j++)
2966                                 if(param->weight_label[i] == label[j])
2967                                         break;
2968                         if(j == nr_class)
2969                                 fprintf(stderr,"WARNING: class label %d specified in weight is not found\n", param->weight_label[i]);
2970                         else
2971                                 weighted_C[j] *= param->weight[i];
2972                 }
2973
2974                 // construct the subproblem
2975                 feature_node **x = Malloc(feature_node *,l);
2976                 for(i=0;i<l;i++)
2977                         x[i] = prob->x[perm[i]];
2978
2979                 int k;
2980                 problem sub_prob;
2981                 sub_prob.l = l;
2982                 sub_prob.n = n;
2983                 sub_prob.x = Malloc(feature_node *,sub_prob.l);
2984                 sub_prob.y = Malloc(double,sub_prob.l);
2985
2986                 for(k=0; k<sub_prob.l; k++)
2987                         sub_prob.x[k] = x[k];
2988
2989                 // multi-class SVM by Crammer and Singer
2990                 if(param->solver_type == MCSVM_CS)
2991                 {
2992                         model_->w=Malloc(double, n*nr_class);
2993                         for(i=0;i<nr_class;i++)
2994                                 for(j=start[i];j<start[i]+count[i];j++)
2995                                         sub_prob.y[j] = i;
2996                         Solver_MCSVM_CS Solver(&sub_prob, nr_class, weighted_C, param->eps);
2997                         Solver.Solve(model_->w);
2998                 }
2999                 else
3000                 {
3001                         if(nr_class == 2)
3002                         {
3003                                 model_->w=Malloc(double, w_size);
3004
3005                                 int e0 = start[0]+count[0];
3006                                 k=0;
3007                                 for(; k<e0; k++)
3008                                         sub_prob.y[k] = +1;
3009                                 for(; k<sub_prob.l; k++)
3010                                         sub_prob.y[k] = -1;
3011
3012                                 if(param->init_sol != NULL)
3013                                         for(i=0;i<w_size;i++)
3014                                                 model_->w[i] = param->init_sol[i];
3015                                 else
3016                                         for(i=0;i<w_size;i++)
3017                                                 model_->w[i] = 0;
3018
3019                                 train_one(&sub_prob, param, model_->w, weighted_C[0], weighted_C[1]);
3020                         }
3021                         else
3022                         {
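                                // one-vs-the-rest: for each class, train a binary classifier that
                                // separates that class (+1) from all remaining classes (-1)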
3023                                 model_->w=Malloc(double, w_size*nr_class);
3024                                 double *w=Malloc(double, w_size);
3025                                 for(i=0;i<nr_class;i++)
3026                                 {
3027                                         int si = start[i];
3028                                         int ei = si+count[i];
3029
3030                                         k=0;
3031                                         for(; k<si; k++)
3032                                                 sub_prob.y[k] = -1;
3033                                         for(; k<ei; k++)
3034                                                 sub_prob.y[k] = +1;
3035                                         for(; k<sub_prob.l; k++)
3036                                                 sub_prob.y[k] = -1;
3037
3038                                         if(param->init_sol != NULL)
3039                                                 for(j=0;j<w_size;j++)
3040                                                         w[j] = param->init_sol[j*nr_class+i];
3041                                         else
3042                                                 for(j=0;j<w_size;j++)
3043                                                         w[j] = 0;
3044
3045                                         train_one(&sub_prob, param, w, weighted_C[i], param->C);
3046
3047                                         for(j=0;j<w_size;j++)
3048                                                 model_->w[j*nr_class+i] = w[j];
3049                                 }
3050                                 free(w);
3051                         }
3052
3053                 }
3054
3055                 free(x);
3056                 free(label);
3057                 free(start);
3058                 free(count);
3059                 free(perm);
3060                 free(sub_prob.x);
3061                 free(sub_prob.y);
3062                 free(weighted_C);
3063         }
3064         return model_;
3065 }
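// Usage sketch (illustrative only, not part of the library): a minimal caller
// that builds a tiny two-class problem in memory and trains it.  The feature
// values below are made up; real programs normally read data from a file.
//
//   struct feature_node x1[] = {{1, 1.0}, {-1, 0.0}};   // index -1 terminates a row
//   struct feature_node x2[] = {{2, 1.0}, {-1, 0.0}};
//   struct feature_node *rows[] = {x1, x2};
//   double labels[] = {+1, -1};
//
//   struct problem prob;
//   prob.l = 2; prob.n = 2; prob.y = labels; prob.x = rows; prob.bias = -1;
//
//   struct parameter param;
//   param.solver_type = L2R_L2LOSS_SVC;
//   param.eps = 0.01; param.C = 1; param.p = 0.1; param.nu = 0.5;
//   param.nr_weight = 0; param.weight_label = NULL; param.weight = NULL;
//   param.init_sol = NULL; param.regularize_bias = 1;
//
//   if(check_parameter(&prob, &param) == NULL)
//   {
//           struct model *m = train(&prob, &param);
//           printf("prediction for x1: %g\n", predict(m, x1));   // expected: 1
//           free_and_destroy_model(&m);
//   }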
3066
3067 void cross_validation(const problem *prob, const parameter *param, int nr_fold, double *target)
3068 {
3069         int i;
3070         int *fold_start;
3071         int l = prob->l;
3072         int *perm = Malloc(int,l);
3073         if (nr_fold > l)
3074         {
3075                 nr_fold = l;
3076                 fprintf(stderr,"WARNING: # folds > # data. Will use # folds = # data instead (i.e., leave-one-out cross validation)\n");
3077         }
3078         fold_start = Malloc(int,nr_fold+1);
3079         for(i=0;i<l;i++) perm[i]=i;
3080         for(i=0;i<l;i++)
3081         {
3082                 int j = i+rand()%(l-i);
3083                 swap(perm[i],perm[j]);
3084         }
3085         for(i=0;i<=nr_fold;i++)
3086                 fold_start[i]=i*l/nr_fold;
3087
3088         for(i=0;i<nr_fold;i++)
3089         {
3090                 int begin = fold_start[i];
3091                 int end = fold_start[i+1];
3092                 int j,k;
3093                 struct problem subprob;
3094
3095                 subprob.bias = prob->bias;
3096                 subprob.n = prob->n;
3097                 subprob.l = l-(end-begin);
3098                 subprob.x = Malloc(struct feature_node*,subprob.l);
3099                 subprob.y = Malloc(double,subprob.l);
3100
3101                 k=0;
3102                 for(j=0;j<begin;j++)
3103                 {
3104                         subprob.x[k] = prob->x[perm[j]];
3105                         subprob.y[k] = prob->y[perm[j]];
3106                         ++k;
3107                 }
3108                 for(j=end;j<l;j++)
3109                 {
3110                         subprob.x[k] = prob->x[perm[j]];
3111                         subprob.y[k] = prob->y[perm[j]];
3112                         ++k;
3113                 }
3114                 struct model *submodel = train(&subprob,param);
3115                 for(j=begin;j<end;j++)
3116                         target[perm[j]] = predict(submodel,prob->x[perm[j]]);
3117                 free_and_destroy_model(&submodel);
3118                 free(subprob.x);
3119                 free(subprob.y);
3120         }
3121         free(fold_start);
3122         free(perm);
3123 }
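// Usage sketch (illustrative only): cross_validation() fills target[i] with the
// prediction obtained for instance i while it sat in the held-out fold, so the
// caller scores the result itself.  `prob` and `param` are assumed to be a
// prepared classification problem and parameter set, as in the sketch after
// train() above.
//
//   double *target = (double *)malloc(prob.l*sizeof(double));
//   cross_validation(&prob, &param, 5, target);
//   int correct = 0;
//   for(int i = 0; i < prob.l; i++)
//           if(target[i] == prob.y[i])
//                   ++correct;
//   printf("5-fold CV accuracy = %g%%\n", 100.0*correct/prob.l);
//   free(target);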
3124
3125
3126 void find_parameters(const problem *prob, const parameter *param, int nr_fold, double start_C, double start_p, double *best_C, double *best_p, double *best_score)
3127 {
3128         // prepare CV folds
3129
3130         int i;
3131         int *fold_start;
3132         int l = prob->l;
3133         int *perm = Malloc(int, l);
3134         struct problem *subprob = Malloc(problem,nr_fold);
3135
3136         if (nr_fold > l)
3137         {
3138                 nr_fold = l;
3139                 fprintf(stderr,"WARNING: # folds > # data. Will use # folds = # data instead (i.e., leave-one-out cross validation)\n");
3140         }
3141         fold_start = Malloc(int,nr_fold+1);
3142         for(i=0;i<l;i++) perm[i]=i;
3143         for(i=0;i<l;i++)
3144         {
3145                 int j = i+rand()%(l-i);
3146                 swap(perm[i],perm[j]);
3147         }
3148         for(i=0;i<=nr_fold;i++)
3149                 fold_start[i]=i*l/nr_fold;
3150
3151         for(i=0;i<nr_fold;i++)
3152         {
3153                 int begin = fold_start[i];
3154                 int end = fold_start[i+1];
3155                 int j,k;
3156
3157                 subprob[i].bias = prob->bias;
3158                 subprob[i].n = prob->n;
3159                 subprob[i].l = l-(end-begin);
3160                 subprob[i].x = Malloc(struct feature_node*,subprob[i].l);
3161                 subprob[i].y = Malloc(double,subprob[i].l);
3162
3163                 k=0;
3164                 for(j=0;j<begin;j++)
3165                 {
3166                         subprob[i].x[k] = prob->x[perm[j]];
3167                         subprob[i].y[k] = prob->y[perm[j]];
3168                         ++k;
3169                 }
3170                 for(j=end;j<l;j++)
3171                 {
3172                         subprob[i].x[k] = prob->x[perm[j]];
3173                         subprob[i].y[k] = prob->y[perm[j]];
3174                         ++k;
3175                 }
3176
3177         }
3178
3179         struct parameter param_tmp = *param;
3180         *best_p = -1;
3181         if(param->solver_type == L2R_LR || param->solver_type == L2R_L2LOSS_SVC)
3182         {
3183                 if(start_C <= 0)
3184                         start_C = calc_start_C(prob, &param_tmp);
3185                 double max_C = 1024;
3186                 start_C = min(start_C, max_C);
3187                 double best_C_tmp, best_score_tmp;
3188
3189                 find_parameter_C(prob, &param_tmp, start_C, max_C, &best_C_tmp, &best_score_tmp, fold_start, perm, subprob, nr_fold);
3190
3191                 *best_C = best_C_tmp;
3192                 *best_score = best_score_tmp;
3193         }
3194         else if(param->solver_type == L2R_L2LOSS_SVR)
3195         {
3196                 double max_p = calc_max_p(prob, &param_tmp);
3197                 int num_p_steps = 20;
3198                 double max_C = 1048576;
3199                 *best_score = INF;
3200
3201                 i = num_p_steps-1;
3202                 if(start_p > 0)
3203                         i = min((int)(start_p/(max_p/num_p_steps)), i);
3204                 for(; i >= 0; i--)
3205                 {
3206                         param_tmp.p = i*max_p/num_p_steps;
3207                         double start_C_tmp;
3208                         if(start_C <= 0)
3209                                 start_C_tmp = calc_start_C(prob, &param_tmp);
3210                         else
3211                                 start_C_tmp = start_C;
3212                         start_C_tmp = min(start_C_tmp, max_C);
3213                         double best_C_tmp, best_score_tmp;
3214
3215                         find_parameter_C(prob, &param_tmp, start_C_tmp, max_C, &best_C_tmp, &best_score_tmp, fold_start, perm, subprob, nr_fold);
3216
3217                         if(best_score_tmp < *best_score)
3218                         {
3219                                 *best_p = param_tmp.p;
3220                                 *best_C = best_C_tmp;
3221                                 *best_score = best_score_tmp;
3222                         }
3223                 }
3224         }
3225
3226         free(fold_start);
3227         free(perm);
3228         for(i=0; i<nr_fold; i++)
3229         {
3230                 free(subprob[i].x);
3231                 free(subprob[i].y);
3232         }
3233         free(subprob);
3234 }
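// Usage sketch (illustrative only): parameter search is implemented only for
// L2R_LR and L2R_L2LOSS_SVC (search over C) and L2R_L2LOSS_SVR (search over C
// and p).  Passing start_C <= 0 (and start_p <= 0 for SVR) lets the function
// pick its own starting points; for the classification solvers best_p is
// reported as -1 and best_score is the cross-validation accuracy rate.
//
//   double best_C, best_p, best_score;
//   find_parameters(&prob, &param, 5, -1, -1, &best_C, &best_p, &best_score);
//   printf("best C = %g (CV accuracy %g%%)\n", best_C, 100.0*best_score);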
3235
3236 double predict_values(const struct model *model_, const struct feature_node *x, double *dec_values)
3237 {
3238         int idx;
3239         int n;
3240         if(model_->bias>=0)
3241                 n=model_->nr_feature+1;
3242         else
3243                 n=model_->nr_feature;
3244         double *w=model_->w;
3245         int nr_class=model_->nr_class;
3246         int i;
3247         int nr_w;
3248         if(nr_class==2 && model_->param.solver_type != MCSVM_CS)
3249                 nr_w = 1;
3250         else
3251                 nr_w = nr_class;
3252
3253         const feature_node *lx=x;
3254         for(i=0;i<nr_w;i++)
3255                 dec_values[i] = 0;
3256         for(; (idx=lx->index)!=-1; lx++)
3257         {
3258                 // the dimension of testing data may exceed that of training
3259                 if(idx<=n)
3260                         for(i=0;i<nr_w;i++)
3261                                 dec_values[i] += w[(idx-1)*nr_w+i]*lx->value;
3262         }
3263         if(check_oneclass_model(model_))
3264                 dec_values[0] -= model_->rho;
3265
3266         if(nr_class==2)
3267         {
3268                 if(check_regression_model(model_))
3269                         return dec_values[0];
3270                 else if(check_oneclass_model(model_))
3271                         return (dec_values[0]>0)?1:-1;
3272                 else
3273                         return (dec_values[0]>0)?model_->label[0]:model_->label[1];
3274         }
3275         else
3276         {
3277                 int dec_max_idx = 0;
3278                 for(i=1;i<nr_class;i++)
3279                 {
3280                         if(dec_values[i] > dec_values[dec_max_idx])
3281                                 dec_max_idx = i;
3282                 }
3283                 return model_->label[dec_max_idx];
3284         }
3285 }
3286
3287 double predict(const model *model_, const feature_node *x)
3288 {
3289         double *dec_values = Malloc(double, model_->nr_class);
3290         double label=predict_values(model_, x, dec_values);
3291         free(dec_values);
3292         return label;
3293 }
3294
3295 double predict_probability(const struct model *model_, const struct feature_node *x, double* prob_estimates)
3296 {
3297         if(check_probability_model(model_))
3298         {
3299                 int i;
3300                 int nr_class=model_->nr_class;
3301                 int nr_w;
3302                 if(nr_class==2)
3303                         nr_w = 1;
3304                 else
3305                         nr_w = nr_class;
3306
3307                 double label=predict_values(model_, x, prob_estimates);
3308                 for(i=0;i<nr_w;i++)
3309                         prob_estimates[i]=1/(1+exp(-prob_estimates[i]));
3310
3311                 if(nr_class==2) // for binary classification
3312                         prob_estimates[1]=1.-prob_estimates[0];
3313                 else
3314                 {
3315                         double sum=0;
3316                         for(i=0; i<nr_class; i++)
3317                                 sum+=prob_estimates[i];
3318
3319                         for(i=0; i<nr_class; i++)
3320                                 prob_estimates[i]=prob_estimates[i]/sum;
3321                 }
3322
3323                 return label;
3324         }
3325         else
3326                 return 0;
3327 }
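// Usage sketch (illustrative only): probability estimates are returned in the
// order of model_->label, so get_labels() is needed to attach each estimate to
// its class; prob_estimates must hold nr_class entries.  Only the logistic
// regression solvers pass check_probability_model().  `m` and `x1` are the
// model and instance from the sketch after train() above.
//
//   if(check_probability_model(m))
//   {
//           int nr_class = get_nr_class(m);
//           int *labels = (int *)malloc(nr_class*sizeof(int));
//           double *probs = (double *)malloc(nr_class*sizeof(double));
//           get_labels(m, labels);
//           predict_probability(m, x1, probs);
//           for(int i = 0; i < nr_class; i++)
//                   printf("P(y=%d | x1) = %g\n", labels[i], probs[i]);
//           free(labels);
//           free(probs);
//   }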
3328
3329 static const char *solver_type_table[]=
3330 {
3331         "L2R_LR", "L2R_L2LOSS_SVC_DUAL", "L2R_L2LOSS_SVC", "L2R_L1LOSS_SVC_DUAL", "MCSVM_CS",
3332         "L1R_L2LOSS_SVC", "L1R_LR", "L2R_LR_DUAL",
3333         "", "", "",
3334         "L2R_L2LOSS_SVR", "L2R_L2LOSS_SVR_DUAL", "L2R_L1LOSS_SVR_DUAL",
3335         "", "", "", "", "", "", "",
3336         "ONECLASS_SVM", NULL
3337 };
3338
3339 int save_model(const char *model_file_name, const struct model *model_)
3340 {
3341         int i;
3342         int nr_feature=model_->nr_feature;
3343         int n;
3344         const parameter& param = model_->param;
3345
3346         if(model_->bias>=0)
3347                 n=nr_feature+1;
3348         else
3349                 n=nr_feature;
3350         int w_size = n;
3351         FILE *fp = fopen(model_file_name,"w");
3352         if(fp==NULL) return -1;
3353
3354         char *old_locale = setlocale(LC_ALL, NULL);
3355         if (old_locale)
3356         {
3357                 old_locale = strdup(old_locale);
3358         }
3359         setlocale(LC_ALL, "C");
3360
3361         int nr_w;
3362         if(model_->nr_class==2 && model_->param.solver_type != MCSVM_CS)
3363                 nr_w=1;
3364         else
3365                 nr_w=model_->nr_class;
3366
3367         fprintf(fp, "solver_type %s\n", solver_type_table[param.solver_type]);
3368         fprintf(fp, "nr_class %d\n", model_->nr_class);
3369
3370         if(model_->label)
3371         {
3372                 fprintf(fp, "label");
3373                 for(i=0; i<model_->nr_class; i++)
3374                         fprintf(fp, " %d", model_->label[i]);
3375                 fprintf(fp, "\n");
3376         }
3377
3378         fprintf(fp, "nr_feature %d\n", nr_feature);
3379
3380         fprintf(fp, "bias %.17g\n", model_->bias);
3381
3382         if(check_oneclass_model(model_))
3383                 fprintf(fp, "rho %.17g\n", model_->rho);
3384
3385         fprintf(fp, "w\n");
3386         for(i=0; i<w_size; i++)
3387         {
3388                 int j;
3389                 for(j=0; j<nr_w; j++)
3390                         fprintf(fp, "%.17g ", model_->w[i*nr_w+j]);
3391                 fprintf(fp, "\n");
3392         }
3393
3394         setlocale(LC_ALL, old_locale);
3395         free(old_locale);
3396
3397         if (ferror(fp) != 0 || fclose(fp) != 0) return -1;
3398         else return 0;
3399 }
3400
3401 //
3402 // FSCANF checks that fscanf read the expected item and aborts model
3403 // loading on failure.  Its do-while(0) wrapper makes the macro behave as
3404 // a single statement, so
3405 //    if (...) FSCANF(...);
3406 // works as expected.
3407 //
3408 #define FSCANF(_stream, _format, _var)do\
3409 {\
3410         if (fscanf(_stream, _format, _var) != 1)\
3411         {\
3412                 fprintf(stderr, "ERROR: fscanf failed to read the model\n");\
3413                 EXIT_LOAD_MODEL()\
3414         }\
3415 }while(0)
3416 // EXIT_LOAD_MODEL expands to a bare block, so calls to it are NOT followed by a semicolon.
3417 #define EXIT_LOAD_MODEL()\
3418 {\
3419         setlocale(LC_ALL, old_locale);\
3420         free(model_->label);\
3421         free(model_);\
3422         free(old_locale);\
3423         return NULL;\
3424 }
3425 struct model *load_model(const char *model_file_name)
3426 {
3427         FILE *fp = fopen(model_file_name,"r");
3428         if(fp==NULL) return NULL;
3429
3430         int i;
3431         int nr_feature;
3432         int n;
3433         int nr_class;
3434         double bias;
3435         double rho;
3436         model *model_ = Malloc(model,1);
3437         parameter& param = model_->param;
3438         // training-only parameters are not stored in the model file; the pointer members are set to NULL so destroy_param() remains safe to call
3439         param.nr_weight = 0;
3440         param.weight_label = NULL;
3441         param.weight = NULL;
3442         param.init_sol = NULL;
3443
3444         model_->label = NULL;
3445
3446         char *old_locale = setlocale(LC_ALL, NULL);
3447         if (old_locale)
3448         {
3449                 old_locale = strdup(old_locale);
3450         }
3451         setlocale(LC_ALL, "C");
3452
3453         char cmd[81];
3454         while(1)
3455         {
3456                 FSCANF(fp,"%80s",cmd);
3457                 if(strcmp(cmd,"solver_type")==0)
3458                 {
3459                         FSCANF(fp,"%80s",cmd);
3460                         int i;
3461                         for(i=0;solver_type_table[i];i++)
3462                         {
3463                                 if(strcmp(solver_type_table[i],cmd)==0)
3464                                 {
3465                                         param.solver_type=i;
3466                                         break;
3467                                 }
3468                         }
3469                         if(solver_type_table[i] == NULL)
3470                         {
3471                                 fprintf(stderr,"unknown solver type.\n");
3472                                 EXIT_LOAD_MODEL()
3473                         }
3474                 }
3475                 else if(strcmp(cmd,"nr_class")==0)
3476                 {
3477                         FSCANF(fp,"%d",&nr_class);
3478                         model_->nr_class=nr_class;
3479                 }
3480                 else if(strcmp(cmd,"nr_feature")==0)
3481                 {
3482                         FSCANF(fp,"%d",&nr_feature);
3483                         model_->nr_feature=nr_feature;
3484                 }
3485                 else if(strcmp(cmd,"bias")==0)
3486                 {
3487                         FSCANF(fp,"%lf",&bias);
3488                         model_->bias=bias;
3489                 }
3490                 else if(strcmp(cmd,"rho")==0)
3491                 {
3492                         FSCANF(fp,"%lf",&rho);
3493                         model_->rho=rho;
3494                 }
3495                 else if(strcmp(cmd,"w")==0)
3496                 {
3497                         break;
3498                 }
3499                 else if(strcmp(cmd,"label")==0)
3500                 {
3501                         int nr_class = model_->nr_class;
3502                         model_->label = Malloc(int,nr_class);
3503                         for(int i=0;i<nr_class;i++)
3504                                 FSCANF(fp,"%d",&model_->label[i]);
3505                 }
3506                 else
3507                 {
3508                         fprintf(stderr,"unknown text in model file: [%s]\n",cmd);
3509                         EXIT_LOAD_MODEL()
3510                 }
3511         }
3512
3513         nr_feature=model_->nr_feature;
3514         if(model_->bias>=0)
3515                 n=nr_feature+1;
3516         else
3517                 n=nr_feature;
3518         int w_size = n;
3519         int nr_w;
3520         if(nr_class==2 && param.solver_type != MCSVM_CS)
3521                 nr_w = 1;
3522         else
3523                 nr_w = nr_class;
3524
3525         model_->w=Malloc(double, w_size*nr_w);
3526         for(i=0; i<w_size; i++)
3527         {
3528                 int j;
3529                 for(j=0; j<nr_w; j++)
3530                         FSCANF(fp, "%lf ", &model_->w[i*nr_w+j]);
3531         }
3532
3533         setlocale(LC_ALL, old_locale);
3534         free(old_locale);
3535
3536         if (ferror(fp) != 0 || fclose(fp) != 0) return NULL;
3537
3538         return model_;
3539 }
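// Usage sketch (illustrative only): a save/load round trip.  The file name is
// made up; save_model() returns a nonzero value and load_model() returns NULL
// on failure, and both should be checked.
//
//   if(save_model("example.model", m) != 0)
//           fprintf(stderr, "can't save model\n");
//   struct model *loaded = load_model("example.model");
//   if(loaded == NULL)
//           fprintf(stderr, "can't load model\n");
//   else
//   {
//           printf("prediction from reloaded model: %g\n", predict(loaded, x1));
//           free_and_destroy_model(&loaded);
//   }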
3540
3541 int get_nr_feature(const model *model_)
3542 {
3543         return model_->nr_feature;
3544 }
3545
3546 int get_nr_class(const model *model_)
3547 {
3548         return model_->nr_class;
3549 }
3550
3551 void get_labels(const model *model_, int* label)
3552 {
3553         if (model_->label != NULL)
3554                 for(int i=0;i<model_->nr_class;i++)
3555                         label[i] = model_->label[i];
3556 }
3557
3558 // use inline here for better performance (around 20% faster than the non-inline version)
3559 static inline double get_w_value(const struct model *model_, int idx, int label_idx)
3560 {
3561         int nr_class = model_->nr_class;
3562         int solver_type = model_->param.solver_type;
3563         const double *w = model_->w;
3564
3565         if(idx < 0 || idx > model_->nr_feature)
3566                 return 0;
3567         if(check_regression_model(model_) || check_oneclass_model(model_))
3568                 return w[idx];
3569         else
3570         {
3571                 if(label_idx < 0 || label_idx >= nr_class)
3572                         return 0;
3573                 if(nr_class == 2 && solver_type != MCSVM_CS)
3574                 {
3575                         if(label_idx == 0)
3576                                 return w[idx];
3577                         else
3578                                 return -w[idx];
3579                 }
3580                 else
3581                         return w[idx*nr_class+label_idx];
3582         }
3583 }
3584
3585 // feat_idx: ranges from 1 to nr_feature
3586 // label_idx: ranges from 0 to nr_class-1 for classification models;
3587 //            for regression and one-class SVM models, label_idx is
3588 //            ignored.
3589 double get_decfun_coef(const struct model *model_, int feat_idx, int label_idx)
3590 {
3591         if(feat_idx > model_->nr_feature)
3592                 return 0;
3593         return get_w_value(model_, feat_idx-1, label_idx);
3594 }
3595
3596 double get_decfun_bias(const struct model *model_, int label_idx)
3597 {
3598         if(check_oneclass_model(model_))
3599         {
3600                 fprintf(stderr, "ERROR: get_decfun_bias can not be called for a one-class SVM model\n");
3601                 return 0;
3602         }
3603         int bias_idx = model_->nr_feature;
3604         double bias = model_->bias;
3605         if(bias <= 0)
3606                 return 0;
3607         else
3608                 return bias*get_w_value(model_, bias_idx, label_idx);
3609 }
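// Usage sketch (illustrative only): for a two-class model the decision value
// of an instance can be rebuilt from the accessors above; a positive value
// corresponds to model_->label[0].  `m` and `x1` are from the sketch after
// train(); if the model was trained with a bias term, x1 must carry the extra
// bias feature just as it would for predict().
//
//   double dec = get_decfun_bias(m, 0);
//   for(const struct feature_node *px = x1; px->index != -1; px++)
//           dec += get_decfun_coef(m, px->index, 0)*px->value;
//   printf("decision value for x1: %g\n", dec);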
3610
3611 double get_decfun_rho(const struct model *model_)
3612 {
3613         if(check_oneclass_model(model_))
3614                 return model_->rho;
3615         else
3616         {
3617                 fprintf(stderr, "ERROR: get_decfun_rho can be called only for a one-class SVM model\n");
3618                 return 0;
3619         }
3620 }
3621
3622 void free_model_content(struct model *model_ptr)
3623 {
3624         if(model_ptr->w != NULL)
3625                 free(model_ptr->w);
3626         if(model_ptr->label != NULL)
3627                 free(model_ptr->label);
3628 }
3629
3630 void free_and_destroy_model(struct model **model_ptr_ptr)
3631 {
3632         struct model *model_ptr = *model_ptr_ptr;
3633         if(model_ptr != NULL)
3634         {
3635                 free_model_content(model_ptr);
3636                 free(model_ptr);
3637         }
3638 }
3639
3640 void destroy_param(parameter* param)
3641 {
3642         if(param->weight_label != NULL)
3643                 free(param->weight_label);
3644         if(param->weight != NULL)
3645                 free(param->weight);
3646         if(param->init_sol != NULL)
3647                 free(param->init_sol);
3648 }
3649
3650 const char *check_parameter(const problem *prob, const parameter *param)
3651 {
3652         if(param->eps <= 0)
3653                 return "eps <= 0";
3654
3655         if(param->C <= 0)
3656                 return "C <= 0";
3657
3658         if(param->p < 0)
3659                 return "p < 0";
3660
3661         if(prob->bias >= 0 && param->solver_type == ONECLASS_SVM)
3662                 return "prob->bias >=0, but this is ignored in ONECLASS_SVM";
3663
3664         if(param->regularize_bias == 0)
3665         {
3666                 if(prob->bias != 1.0)
3667                         return "To not regularize bias, must specify -B 1 along with -R";
3668                 if(param->solver_type != L2R_LR
3669                         && param->solver_type != L2R_L2LOSS_SVC
3670                         && param->solver_type != L1R_L2LOSS_SVC
3671                         && param->solver_type != L1R_LR
3672                         && param->solver_type != L2R_L2LOSS_SVR)
3673                         return "-R option supported only for solvers L2R_LR, L2R_L2LOSS_SVC, L1R_L2LOSS_SVC, L1R_LR, and L2R_L2LOSS_SVR";
3674         }
3675
3676         if(param->solver_type != L2R_LR
3677                 && param->solver_type != L2R_L2LOSS_SVC_DUAL
3678                 && param->solver_type != L2R_L2LOSS_SVC
3679                 && param->solver_type != L2R_L1LOSS_SVC_DUAL
3680                 && param->solver_type != MCSVM_CS
3681                 && param->solver_type != L1R_L2LOSS_SVC
3682                 && param->solver_type != L1R_LR
3683                 && param->solver_type != L2R_LR_DUAL
3684                 && param->solver_type != L2R_L2LOSS_SVR
3685                 && param->solver_type != L2R_L2LOSS_SVR_DUAL
3686                 && param->solver_type != L2R_L1LOSS_SVR_DUAL
3687                 && param->solver_type != ONECLASS_SVM)
3688                 return "unknown solver type";
3689
3690         if(param->init_sol != NULL
3691                 && param->solver_type != L2R_LR
3692                 && param->solver_type != L2R_L2LOSS_SVC
3693                 && param->solver_type != L2R_L2LOSS_SVR)
3694                 return "Initial-solution specification supported only for solvers L2R_LR, L2R_L2LOSS_SVC, and L2R_L2LOSS_SVR";
3695
3696         return NULL;
3697 }
3698
3699 int check_probability_model(const struct model *model_)
3700 {
3701         return (model_->param.solver_type==L2R_LR ||
3702                         model_->param.solver_type==L2R_LR_DUAL ||
3703                         model_->param.solver_type==L1R_LR);
3704 }
3705
3706 int check_regression_model(const struct model *model_)
3707 {
3708         return (model_->param.solver_type==L2R_L2LOSS_SVR ||
3709                         model_->param.solver_type==L2R_L1LOSS_SVR_DUAL ||
3710                         model_->param.solver_type==L2R_L2LOSS_SVR_DUAL);
3711 }
3712
3713 int check_oneclass_model(const struct model *model_)
3714 {
3715         return model_->param.solver_type == ONECLASS_SVM;
3716 }
3717
3718 void set_print_string_function(void (*print_func)(const char*))
3719 {
3720         if (print_func == NULL)
3721                 liblinear_print_string = &print_string_stdout;
3722         else
3723                 liblinear_print_string = print_func;
3724 }
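// Usage sketch (illustrative only): redirect or silence the library's training
// output.  Passing NULL restores the default stdout printer.
//
//   static void quiet(const char *s) { (void)s; }   // discard all messages
//
//   set_print_string_function(&quiet);   // silence info() output
//   struct model *m = train(&prob, &param);
//   set_print_string_function(NULL);     // print to stdout again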
3725